diff options
author | Peter Maydell <peter.maydell@linaro.org> | 2017-02-13 18:49:26 +0000 |
---|---|---|
committer | Peter Maydell <peter.maydell@linaro.org> | 2017-02-13 18:49:26 +0000 |
commit | ec7a9bd5bb2c46c60cc0ec9b9d9f2ce404226ec0 (patch) | |
tree | ae717012129be5c9c27827e1eab623ff467ddb1c | |
parent | 305e6c8a2ff7a6e3f4942b50e853230f18eeb5a9 (diff) | |
parent | 982b78c5e37864c06fd7b5f156d80bf02628a855 (diff) |
Merge remote-tracking branch 'remotes/dgilbert/tags/pull-migration-20170213a' into staging
Migration
Amit: migration: remove myself as maintainer
MAINTAINERS: update my email address
Ashijeet: migrate: Introduce zero RAM checks to skip RAM migration
Pavel: Postcopy release RAM
Halil: consolidate VMStateField.start
Hailiang: COLO: fix setting checkpoint-delay not working properly
COLO: Shutdown related socket fd while do failover
COLO: Don't process failover request while loading VM's state
Me:
migration: Add VMSTATE_UNUSED_VARRAY_UINT32
migration: Add VMSTATE_WITH_TMP
tests/migration: Add test for VMSTATE_WITH_TMP
virtio-net VMState conversion and new VMSTATE macros
# gpg: Signature made Mon 13 Feb 2017 17:36:39 GMT
# gpg: using RSA key 0x0516331EBC5BFDE7
# gpg: Good signature from "Dr. David Alan Gilbert (RH2) <dgilbert@redhat.com>"
# gpg: WARNING: This key is not certified with a trusted signature!
# gpg: There is no indication that the signature belongs to the owner.
# Primary key fingerprint: 45F5 C71B 4A0C B7FB 977A 9FA9 0516 331E BC5B FDE7
* remotes/dgilbert/tags/pull-migration-20170213a:
virtio/migration: Migrate virtio-net to VMState
tests/migration: Add test for VMSTATE_WITH_TMP
migration: Add VMSTATE_WITH_TMP
migration: Add VMSTATE_UNUSED_VARRAY_UINT32
COLO: Don't process failover request while loading VM's state
COLO: Shutdown related socket fd while do failover
COLO: fix setting checkpoint-delay not working properly
migration: consolidate VMStateField.start
migrate: Introduce zero RAM checks to skip RAM migration
migration: discard non-dirty ram pages after the start of postcopy
add 'release-ram' migrate capability
migration: add MigrationState arg for ram_save_/compressed_/page()
MAINTAINERS: update my email address
migration: remove myself as maintainer
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
-rw-r--r-- | MAINTAINERS | 5 | ||||
-rw-r--r-- | hw/char/exynos4210_uart.c | 2 | ||||
-rw-r--r-- | hw/display/g364fb.c | 2 | ||||
-rw-r--r-- | hw/dma/pl330.c | 8 | ||||
-rw-r--r-- | hw/intc/exynos4210_gic.c | 2 | ||||
-rw-r--r-- | hw/ipmi/isa_ipmi_bt.c | 6 | ||||
-rw-r--r-- | hw/net/virtio-net.c | 316 | ||||
-rw-r--r-- | hw/net/vmxnet3.c | 2 | ||||
-rw-r--r-- | hw/nvram/mac_nvram.c | 2 | ||||
-rw-r--r-- | hw/nvram/spapr_nvram.c | 2 | ||||
-rw-r--r-- | hw/sd/sdhci.c | 2 | ||||
-rw-r--r-- | hw/timer/m48t59.c | 2 | ||||
-rw-r--r-- | include/hw/virtio/virtio-net.h | 4 | ||||
-rw-r--r-- | include/migration/colo.h | 2 | ||||
-rw-r--r-- | include/migration/migration.h | 10 | ||||
-rw-r--r-- | include/migration/qemu-file.h | 3 | ||||
-rw-r--r-- | include/migration/vmstate.h | 51 | ||||
-rw-r--r-- | migration/colo.c | 102 | ||||
-rw-r--r-- | migration/migration.c | 16 | ||||
-rw-r--r-- | migration/qemu-file.c | 59 | ||||
-rw-r--r-- | migration/ram.c | 78 | ||||
-rw-r--r-- | migration/savevm.c | 2 | ||||
-rw-r--r-- | migration/vmstate.c | 44 | ||||
-rw-r--r-- | qapi-schema.json | 9 | ||||
-rw-r--r-- | target/s390x/machine.c | 2 | ||||
-rw-r--r-- | tests/test-vmstate.c | 98 | ||||
-rw-r--r-- | util/fifo8.c | 2 |
27 files changed, 648 insertions, 185 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index 7afbadaa15..fb57d8eb45 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1034,7 +1034,7 @@ F: hw/input/virtio-input*.c F: include/hw/virtio/virtio-input.h virtio-serial -M: Amit Shah <amit.shah@redhat.com> +M: Amit Shah <amit@kernel.org> S: Supported F: hw/char/virtio-serial-bus.c F: hw/char/virtio-console.c @@ -1043,7 +1043,7 @@ F: tests/virtio-console-test.c F: tests/virtio-serial-test.c virtio-rng -M: Amit Shah <amit.shah@redhat.com> +M: Amit Shah <amit@kernel.org> S: Supported F: hw/virtio/virtio-rng.c F: include/hw/virtio/virtio-rng.h @@ -1431,7 +1431,6 @@ F: scripts/checkpatch.pl Migration M: Juan Quintela <quintela@redhat.com> -M: Amit Shah <amit.shah@redhat.com> M: Dr. David Alan Gilbert <dgilbert@redhat.com> S: Maintained F: include/migration/ diff --git a/hw/char/exynos4210_uart.c b/hw/char/exynos4210_uart.c index 7c16e894e2..b75f28d473 100644 --- a/hw/char/exynos4210_uart.c +++ b/hw/char/exynos4210_uart.c @@ -561,7 +561,7 @@ static const VMStateDescription vmstate_exynos4210_uart_fifo = { .fields = (VMStateField[]) { VMSTATE_UINT32(sp, Exynos4210UartFIFO), VMSTATE_UINT32(rp, Exynos4210UartFIFO), - VMSTATE_VBUFFER_UINT32(data, Exynos4210UartFIFO, 1, NULL, 0, size), + VMSTATE_VBUFFER_UINT32(data, Exynos4210UartFIFO, 1, NULL, size), VMSTATE_END_OF_LIST() } }; diff --git a/hw/display/g364fb.c b/hw/display/g364fb.c index 70ef2c7453..8cdc205dd9 100644 --- a/hw/display/g364fb.c +++ b/hw/display/g364fb.c @@ -464,7 +464,7 @@ static const VMStateDescription vmstate_g364fb = { .minimum_version_id = 1, .post_load = g364fb_post_load, .fields = (VMStateField[]) { - VMSTATE_VBUFFER_UINT32(vram, G364State, 1, NULL, 0, vram_size), + VMSTATE_VBUFFER_UINT32(vram, G364State, 1, NULL, vram_size), VMSTATE_BUFFER_UNSAFE(color_palette, G364State, 0, 256 * 3), VMSTATE_BUFFER_UNSAFE(cursor_palette, G364State, 0, 9), VMSTATE_UINT16_ARRAY(cursor, G364State, 512), diff --git a/hw/dma/pl330.c b/hw/dma/pl330.c index c0bd9fec30..32cf8399b8 100644 --- a/hw/dma/pl330.c +++ b/hw/dma/pl330.c @@ -173,8 +173,8 @@ static const VMStateDescription vmstate_pl330_fifo = { .version_id = 1, .minimum_version_id = 1, .fields = (VMStateField[]) { - VMSTATE_VBUFFER_UINT32(buf, PL330Fifo, 1, NULL, 0, buf_size), - VMSTATE_VBUFFER_UINT32(tag, PL330Fifo, 1, NULL, 0, buf_size), + VMSTATE_VBUFFER_UINT32(buf, PL330Fifo, 1, NULL, buf_size), + VMSTATE_VBUFFER_UINT32(tag, PL330Fifo, 1, NULL, buf_size), VMSTATE_UINT32(head, PL330Fifo), VMSTATE_UINT32(num, PL330Fifo), VMSTATE_UINT32(buf_size, PL330Fifo), @@ -282,8 +282,8 @@ static const VMStateDescription vmstate_pl330 = { VMSTATE_STRUCT(manager, PL330State, 0, vmstate_pl330_chan, PL330Chan), VMSTATE_STRUCT_VARRAY_UINT32(chan, PL330State, num_chnls, 0, vmstate_pl330_chan, PL330Chan), - VMSTATE_VBUFFER_UINT32(lo_seqn, PL330State, 1, NULL, 0, num_chnls), - VMSTATE_VBUFFER_UINT32(hi_seqn, PL330State, 1, NULL, 0, num_chnls), + VMSTATE_VBUFFER_UINT32(lo_seqn, PL330State, 1, NULL, num_chnls), + VMSTATE_VBUFFER_UINT32(hi_seqn, PL330State, 1, NULL, num_chnls), VMSTATE_STRUCT(fifo, PL330State, 0, vmstate_pl330_fifo, PL330Fifo), VMSTATE_STRUCT(read_queue, PL330State, 0, vmstate_pl330_queue, PL330Queue), diff --git a/hw/intc/exynos4210_gic.c b/hw/intc/exynos4210_gic.c index fd7a8f3058..2a55817b76 100644 --- a/hw/intc/exynos4210_gic.c +++ b/hw/intc/exynos4210_gic.c @@ -393,7 +393,7 @@ static const VMStateDescription vmstate_exynos4210_irq_gate = { .version_id = 2, .minimum_version_id = 2, .fields = (VMStateField[]) { - VMSTATE_VBUFFER_UINT32(level, Exynos4210IRQGateState, 1, NULL, 0, n_in), + VMSTATE_VBUFFER_UINT32(level, Exynos4210IRQGateState, 1, NULL, n_in), VMSTATE_END_OF_LIST() } }; diff --git a/hw/ipmi/isa_ipmi_bt.c b/hw/ipmi/isa_ipmi_bt.c index f03661715c..1c69cb33f8 100644 --- a/hw/ipmi/isa_ipmi_bt.c +++ b/hw/ipmi/isa_ipmi_bt.c @@ -471,10 +471,8 @@ static const VMStateDescription vmstate_ISAIPMIBTDevice = { VMSTATE_BOOL(bt.use_irq, ISAIPMIBTDevice), VMSTATE_BOOL(bt.irqs_enabled, ISAIPMIBTDevice), VMSTATE_UINT32(bt.outpos, ISAIPMIBTDevice), - VMSTATE_VBUFFER_UINT32(bt.outmsg, ISAIPMIBTDevice, 1, NULL, 0, - bt.outlen), - VMSTATE_VBUFFER_UINT32(bt.inmsg, ISAIPMIBTDevice, 1, NULL, 0, - bt.inlen), + VMSTATE_VBUFFER_UINT32(bt.outmsg, ISAIPMIBTDevice, 1, NULL, bt.outlen), + VMSTATE_VBUFFER_UINT32(bt.inmsg, ISAIPMIBTDevice, 1, NULL, bt.inlen), VMSTATE_UINT8(bt.control_reg, ISAIPMIBTDevice), VMSTATE_UINT8(bt.mask_reg, ISAIPMIBTDevice), VMSTATE_UINT8(bt.waiting_rsp, ISAIPMIBTDevice), diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index 7b3ad4a9f0..354a19eab8 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -1557,119 +1557,22 @@ static void virtio_net_set_multiqueue(VirtIONet *n, int multiqueue) virtio_net_set_queues(n); } -static void virtio_net_save_device(VirtIODevice *vdev, QEMUFile *f) +static int virtio_net_post_load_device(void *opaque, int version_id) { - VirtIONet *n = VIRTIO_NET(vdev); - int i; - - qemu_put_buffer(f, n->mac, ETH_ALEN); - qemu_put_be32(f, n->vqs[0].tx_waiting); - qemu_put_be32(f, n->mergeable_rx_bufs); - qemu_put_be16(f, n->status); - qemu_put_byte(f, n->promisc); - qemu_put_byte(f, n->allmulti); - qemu_put_be32(f, n->mac_table.in_use); - qemu_put_buffer(f, n->mac_table.macs, n->mac_table.in_use * ETH_ALEN); - qemu_put_buffer(f, (uint8_t *)n->vlans, MAX_VLAN >> 3); - qemu_put_be32(f, n->has_vnet_hdr); - qemu_put_byte(f, n->mac_table.multi_overflow); - qemu_put_byte(f, n->mac_table.uni_overflow); - qemu_put_byte(f, n->alluni); - qemu_put_byte(f, n->nomulti); - qemu_put_byte(f, n->nouni); - qemu_put_byte(f, n->nobcast); - qemu_put_byte(f, n->has_ufo); - if (n->max_queues > 1) { - qemu_put_be16(f, n->max_queues); - qemu_put_be16(f, n->curr_queues); - for (i = 1; i < n->curr_queues; i++) { - qemu_put_be32(f, n->vqs[i].tx_waiting); - } - } - - if (virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)) { - qemu_put_be64(f, n->curr_guest_offloads); - } -} - -static int virtio_net_load_device(VirtIODevice *vdev, QEMUFile *f, - int version_id) -{ - VirtIONet *n = VIRTIO_NET(vdev); + VirtIONet *n = opaque; + VirtIODevice *vdev = VIRTIO_DEVICE(n); int i, link_down; - qemu_get_buffer(f, n->mac, ETH_ALEN); - n->vqs[0].tx_waiting = qemu_get_be32(f); - - virtio_net_set_mrg_rx_bufs(n, qemu_get_be32(f), + virtio_net_set_mrg_rx_bufs(n, n->mergeable_rx_bufs, virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)); - n->status = qemu_get_be16(f); - - n->promisc = qemu_get_byte(f); - n->allmulti = qemu_get_byte(f); - - n->mac_table.in_use = qemu_get_be32(f); /* MAC_TABLE_ENTRIES may be different from the saved image */ - if (n->mac_table.in_use <= MAC_TABLE_ENTRIES) { - qemu_get_buffer(f, n->mac_table.macs, - n->mac_table.in_use * ETH_ALEN); - } else { - int64_t i; - - /* Overflow detected - can happen if source has a larger MAC table. - * We simply set overflow flag so there's no need to maintain the - * table of addresses, discard them all. - * Note: 64 bit math to avoid integer overflow. - */ - for (i = 0; i < (int64_t)n->mac_table.in_use * ETH_ALEN; ++i) { - qemu_get_byte(f); - } - n->mac_table.multi_overflow = n->mac_table.uni_overflow = 1; + if (n->mac_table.in_use > MAC_TABLE_ENTRIES) { n->mac_table.in_use = 0; } - - qemu_get_buffer(f, (uint8_t *)n->vlans, MAX_VLAN >> 3); - - if (qemu_get_be32(f) && !peer_has_vnet_hdr(n)) { - error_report("virtio-net: saved image requires vnet_hdr=on"); - return -1; - } - - n->mac_table.multi_overflow = qemu_get_byte(f); - n->mac_table.uni_overflow = qemu_get_byte(f); - - n->alluni = qemu_get_byte(f); - n->nomulti = qemu_get_byte(f); - n->nouni = qemu_get_byte(f); - n->nobcast = qemu_get_byte(f); - - if (qemu_get_byte(f) && !peer_has_ufo(n)) { - error_report("virtio-net: saved image requires TUN_F_UFO support"); - return -1; - } - if (n->max_queues > 1) { - if (n->max_queues != qemu_get_be16(f)) { - error_report("virtio-net: different max_queues "); - return -1; - } - - n->curr_queues = qemu_get_be16(f); - if (n->curr_queues > n->max_queues) { - error_report("virtio-net: curr_queues %x > max_queues %x", - n->curr_queues, n->max_queues); - return -1; - } - for (i = 1; i < n->curr_queues; i++) { - n->vqs[i].tx_waiting = qemu_get_be32(f); - } - } - - if (virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)) { - n->curr_guest_offloads = qemu_get_be64(f); - } else { + if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)) { n->curr_guest_offloads = virtio_net_supported_guest_offloads(n); } @@ -1703,6 +1606,210 @@ static int virtio_net_load_device(VirtIODevice *vdev, QEMUFile *f, return 0; } +/* tx_waiting field of a VirtIONetQueue */ +static const VMStateDescription vmstate_virtio_net_queue_tx_waiting = { + .name = "virtio-net-queue-tx_waiting", + .fields = (VMStateField[]) { + VMSTATE_UINT32(tx_waiting, VirtIONetQueue), + VMSTATE_END_OF_LIST() + }, +}; + +static bool max_queues_gt_1(void *opaque, int version_id) +{ + return VIRTIO_NET(opaque)->max_queues > 1; +} + +static bool has_ctrl_guest_offloads(void *opaque, int version_id) +{ + return virtio_vdev_has_feature(VIRTIO_DEVICE(opaque), + VIRTIO_NET_F_CTRL_GUEST_OFFLOADS); +} + +static bool mac_table_fits(void *opaque, int version_id) +{ + return VIRTIO_NET(opaque)->mac_table.in_use <= MAC_TABLE_ENTRIES; +} + +static bool mac_table_doesnt_fit(void *opaque, int version_id) +{ + return !mac_table_fits(opaque, version_id); +} + +/* This temporary type is shared by all the WITH_TMP methods + * although only some fields are used by each. + */ +struct VirtIONetMigTmp { + VirtIONet *parent; + VirtIONetQueue *vqs_1; + uint16_t curr_queues_1; + uint8_t has_ufo; + uint32_t has_vnet_hdr; +}; + +/* The 2nd and subsequent tx_waiting flags are loaded later than + * the 1st entry in the queues and only if there's more than one + * entry. We use the tmp mechanism to calculate a temporary + * pointer and count and also validate the count. + */ + +static void virtio_net_tx_waiting_pre_save(void *opaque) +{ + struct VirtIONetMigTmp *tmp = opaque; + + tmp->vqs_1 = tmp->parent->vqs + 1; + tmp->curr_queues_1 = tmp->parent->curr_queues - 1; + if (tmp->parent->curr_queues == 0) { + tmp->curr_queues_1 = 0; + } +} + +static int virtio_net_tx_waiting_pre_load(void *opaque) +{ + struct VirtIONetMigTmp *tmp = opaque; + + /* Reuse the pointer setup from save */ + virtio_net_tx_waiting_pre_save(opaque); + + if (tmp->parent->curr_queues > tmp->parent->max_queues) { + error_report("virtio-net: curr_queues %x > max_queues %x", + tmp->parent->curr_queues, tmp->parent->max_queues); + + return -EINVAL; + } + + return 0; /* all good */ +} + +static const VMStateDescription vmstate_virtio_net_tx_waiting = { + .name = "virtio-net-tx_waiting", + .pre_load = virtio_net_tx_waiting_pre_load, + .pre_save = virtio_net_tx_waiting_pre_save, + .fields = (VMStateField[]) { + VMSTATE_STRUCT_VARRAY_POINTER_UINT16(vqs_1, struct VirtIONetMigTmp, + curr_queues_1, + vmstate_virtio_net_queue_tx_waiting, + struct VirtIONetQueue), + VMSTATE_END_OF_LIST() + }, +}; + +/* the 'has_ufo' flag is just tested; if the incoming stream has the + * flag set we need to check that we have it + */ +static int virtio_net_ufo_post_load(void *opaque, int version_id) +{ + struct VirtIONetMigTmp *tmp = opaque; + + if (tmp->has_ufo && !peer_has_ufo(tmp->parent)) { + error_report("virtio-net: saved image requires TUN_F_UFO support"); + return -EINVAL; + } + + return 0; +} + +static void virtio_net_ufo_pre_save(void *opaque) +{ + struct VirtIONetMigTmp *tmp = opaque; + + tmp->has_ufo = tmp->parent->has_ufo; +} + +static const VMStateDescription vmstate_virtio_net_has_ufo = { + .name = "virtio-net-ufo", + .post_load = virtio_net_ufo_post_load, + .pre_save = virtio_net_ufo_pre_save, + .fields = (VMStateField[]) { + VMSTATE_UINT8(has_ufo, struct VirtIONetMigTmp), + VMSTATE_END_OF_LIST() + }, +}; + +/* the 'has_vnet_hdr' flag is just tested; if the incoming stream has the + * flag set we need to check that we have it + */ +static int virtio_net_vnet_post_load(void *opaque, int version_id) +{ + struct VirtIONetMigTmp *tmp = opaque; + + if (tmp->has_vnet_hdr && !peer_has_vnet_hdr(tmp->parent)) { + error_report("virtio-net: saved image requires vnet_hdr=on"); + return -EINVAL; + } + + return 0; +} + +static void virtio_net_vnet_pre_save(void *opaque) +{ + struct VirtIONetMigTmp *tmp = opaque; + + tmp->has_vnet_hdr = tmp->parent->has_vnet_hdr; +} + +static const VMStateDescription vmstate_virtio_net_has_vnet = { + .name = "virtio-net-vnet", + .post_load = virtio_net_vnet_post_load, + .pre_save = virtio_net_vnet_pre_save, + .fields = (VMStateField[]) { + VMSTATE_UINT32(has_vnet_hdr, struct VirtIONetMigTmp), + VMSTATE_END_OF_LIST() + }, +}; + +static const VMStateDescription vmstate_virtio_net_device = { + .name = "virtio-net-device", + .version_id = VIRTIO_NET_VM_VERSION, + .minimum_version_id = VIRTIO_NET_VM_VERSION, + .post_load = virtio_net_post_load_device, + .fields = (VMStateField[]) { + VMSTATE_UINT8_ARRAY(mac, VirtIONet, ETH_ALEN), + VMSTATE_STRUCT_POINTER(vqs, VirtIONet, + vmstate_virtio_net_queue_tx_waiting, + VirtIONetQueue), + VMSTATE_UINT32(mergeable_rx_bufs, VirtIONet), + VMSTATE_UINT16(status, VirtIONet), + VMSTATE_UINT8(promisc, VirtIONet), + VMSTATE_UINT8(allmulti, VirtIONet), + VMSTATE_UINT32(mac_table.in_use, VirtIONet), + + /* Guarded pair: If it fits we load it, else we throw it away + * - can happen if source has a larger MAC table.; post-load + * sets flags in this case. + */ + VMSTATE_VBUFFER_MULTIPLY(mac_table.macs, VirtIONet, + 0, mac_table_fits, mac_table.in_use, + ETH_ALEN), + VMSTATE_UNUSED_VARRAY_UINT32(VirtIONet, mac_table_doesnt_fit, 0, + mac_table.in_use, ETH_ALEN), + + /* Note: This is an array of uint32's that's always been saved as a + * buffer; hold onto your endiannesses; it's actually used as a bitmap + * but based on the uint. + */ + VMSTATE_BUFFER_POINTER_UNSAFE(vlans, VirtIONet, 0, MAX_VLAN >> 3), + VMSTATE_WITH_TMP(VirtIONet, struct VirtIONetMigTmp, + vmstate_virtio_net_has_vnet), + VMSTATE_UINT8(mac_table.multi_overflow, VirtIONet), + VMSTATE_UINT8(mac_table.uni_overflow, VirtIONet), + VMSTATE_UINT8(alluni, VirtIONet), + VMSTATE_UINT8(nomulti, VirtIONet), + VMSTATE_UINT8(nouni, VirtIONet), + VMSTATE_UINT8(nobcast, VirtIONet), + VMSTATE_WITH_TMP(VirtIONet, struct VirtIONetMigTmp, + vmstate_virtio_net_has_ufo), + VMSTATE_SINGLE_TEST(max_queues, VirtIONet, max_queues_gt_1, 0, + vmstate_info_uint16_equal, uint16_t), + VMSTATE_UINT16_TEST(curr_queues, VirtIONet, max_queues_gt_1), + VMSTATE_WITH_TMP(VirtIONet, struct VirtIONetMigTmp, + vmstate_virtio_net_tx_waiting), + VMSTATE_UINT64_TEST(curr_guest_offloads, VirtIONet, + has_ctrl_guest_offloads), + VMSTATE_END_OF_LIST() + }, +}; + static NetClientInfo net_virtio_info = { .type = NET_CLIENT_DRIVER_NIC, .size = sizeof(NICState), @@ -1989,9 +2096,8 @@ static void virtio_net_class_init(ObjectClass *klass, void *data) vdc->set_status = virtio_net_set_status; vdc->guest_notifier_mask = virtio_net_guest_notifier_mask; vdc->guest_notifier_pending = virtio_net_guest_notifier_pending; - vdc->load = virtio_net_load_device; - vdc->save = virtio_net_save_device; vdc->legacy_features |= (0x1 << VIRTIO_NET_F_GSO); + vdc->vmsd = &vmstate_virtio_net_device; } static const TypeInfo virtio_net_info = { diff --git a/hw/net/vmxnet3.c b/hw/net/vmxnet3.c index 7dd456551c..e13a798b3b 100644 --- a/hw/net/vmxnet3.c +++ b/hw/net/vmxnet3.c @@ -2397,7 +2397,7 @@ static const VMStateDescription vmxstate_vmxnet3_mcast_list = { .pre_load = vmxnet3_mcast_list_pre_load, .needed = vmxnet3_mc_list_needed, .fields = (VMStateField[]) { - VMSTATE_VBUFFER_UINT32(mcast_list, VMXNET3State, 0, NULL, 0, + VMSTATE_VBUFFER_UINT32(mcast_list, VMXNET3State, 0, NULL, mcast_list_buff_size), VMSTATE_END_OF_LIST() } diff --git a/hw/nvram/mac_nvram.c b/hw/nvram/mac_nvram.c index 63f9ed1d82..aef80e64df 100644 --- a/hw/nvram/mac_nvram.c +++ b/hw/nvram/mac_nvram.c @@ -82,7 +82,7 @@ static const VMStateDescription vmstate_macio_nvram = { .version_id = 1, .minimum_version_id = 1, .fields = (VMStateField[]) { - VMSTATE_VBUFFER_UINT32(data, MacIONVRAMState, 0, NULL, 0, size), + VMSTATE_VBUFFER_UINT32(data, MacIONVRAMState, 0, NULL, size), VMSTATE_END_OF_LIST() } }; diff --git a/hw/nvram/spapr_nvram.c b/hw/nvram/spapr_nvram.c index eb42ea323f..65ba188555 100644 --- a/hw/nvram/spapr_nvram.c +++ b/hw/nvram/spapr_nvram.c @@ -224,7 +224,7 @@ static const VMStateDescription vmstate_spapr_nvram = { .post_load = spapr_nvram_post_load, .fields = (VMStateField[]) { VMSTATE_UINT32(size, sPAPRNVRAM), - VMSTATE_VBUFFER_ALLOC_UINT32(buf, sPAPRNVRAM, 1, NULL, 0, size), + VMSTATE_VBUFFER_ALLOC_UINT32(buf, sPAPRNVRAM, 1, NULL, size), VMSTATE_END_OF_LIST() }, }; diff --git a/hw/sd/sdhci.c b/hw/sd/sdhci.c index 5bd5ab6319..da32b5f709 100644 --- a/hw/sd/sdhci.c +++ b/hw/sd/sdhci.c @@ -1253,7 +1253,7 @@ const VMStateDescription sdhci_vmstate = { VMSTATE_UINT16(data_count, SDHCIState), VMSTATE_UINT64(admasysaddr, SDHCIState), VMSTATE_UINT8(stopped_state, SDHCIState), - VMSTATE_VBUFFER_UINT32(fifo_buffer, SDHCIState, 1, NULL, 0, buf_maxsz), + VMSTATE_VBUFFER_UINT32(fifo_buffer, SDHCIState, 1, NULL, buf_maxsz), VMSTATE_TIMER_PTR(insert_timer, SDHCIState), VMSTATE_TIMER_PTR(transfer_timer, SDHCIState), VMSTATE_END_OF_LIST() diff --git a/hw/timer/m48t59.c b/hw/timer/m48t59.c index 015797732f..474981a6ac 100644 --- a/hw/timer/m48t59.c +++ b/hw/timer/m48t59.c @@ -563,7 +563,7 @@ static const VMStateDescription vmstate_m48t59 = { .fields = (VMStateField[]) { VMSTATE_UINT8(lock, M48t59State), VMSTATE_UINT16(addr, M48t59State), - VMSTATE_VBUFFER_UINT32(buffer, M48t59State, 0, NULL, 0, size), + VMSTATE_VBUFFER_UINT32(buffer, M48t59State, 0, NULL, size), VMSTATE_END_OF_LIST() } }; diff --git a/include/hw/virtio/virtio-net.h b/include/hw/virtio/virtio-net.h index 8ea56a8f60..1eec9a2da3 100644 --- a/include/hw/virtio/virtio-net.h +++ b/include/hw/virtio/virtio-net.h @@ -47,7 +47,7 @@ typedef struct VirtIONetQueue { VirtQueue *tx_vq; QEMUTimer *tx_timer; QEMUBH *tx_bh; - int tx_waiting; + uint32_t tx_waiting; struct { VirtQueueElement *elem; } async_tx; @@ -68,7 +68,7 @@ typedef struct VirtIONet { size_t guest_hdr_len; uint32_t host_features; uint8_t has_ufo; - int mergeable_rx_bufs; + uint32_t mergeable_rx_bufs; uint8_t promisc; uint8_t allmulti; uint8_t alluni; diff --git a/include/migration/colo.h b/include/migration/colo.h index e32eef4763..2bbff9e6c2 100644 --- a/include/migration/colo.h +++ b/include/migration/colo.h @@ -35,4 +35,6 @@ COLOMode get_colo_mode(void); /* failover */ void colo_do_failover(MigrationState *s); + +void colo_checkpoint_notify(void *opaque); #endif diff --git a/include/migration/migration.h b/include/migration/migration.h index 7528cc2fbc..1735d66512 100644 --- a/include/migration/migration.h +++ b/include/migration/migration.h @@ -116,6 +116,7 @@ struct MigrationIncomingState { QemuThread colo_incoming_thread; /* The coroutine we should enter (back) after failover */ Coroutine *migration_incoming_co; + QemuSemaphore colo_incoming_sem; /* See savevm.c */ LoadStateEntry_Head loadvm_handlers; @@ -187,6 +188,13 @@ struct MigrationState QSIMPLEQ_HEAD(src_page_requests, MigrationSrcPageRequest) src_page_requests; /* The RAMBlock used in the last src_page_request */ RAMBlock *last_req_rb; + /* The semaphore is used to notify COLO thread that failover is finished */ + QemuSemaphore colo_exit_sem; + + /* The semaphore is used to notify COLO thread to do checkpoint */ + QemuSemaphore colo_checkpoint_sem; + int64_t colo_checkpoint_time; + QEMUTimer *colo_delay_timer; /* The last error that occurred */ Error *error; @@ -285,6 +293,7 @@ int ram_postcopy_send_discard_bitmap(MigrationState *ms); int ram_discard_range(MigrationIncomingState *mis, const char *block_name, uint64_t start, size_t length); int ram_postcopy_incoming_init(MigrationIncomingState *mis); +void ram_postcopy_migrated_memory_release(MigrationState *ms); /** * @migrate_add_blocker - prevent migration from proceeding @@ -304,6 +313,7 @@ int migrate_add_blocker(Error *reason, Error **errp); */ void migrate_del_blocker(Error *reason); +bool migrate_release_ram(void); bool migrate_postcopy_ram(void); bool migrate_zero_blocks(void); diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h index abedd466c9..0cd648a733 100644 --- a/include/migration/qemu-file.h +++ b/include/migration/qemu-file.h @@ -132,7 +132,8 @@ void qemu_put_byte(QEMUFile *f, int v); * put_buffer without copying the buffer. * The buffer should be available till it is sent asynchronously. */ -void qemu_put_buffer_async(QEMUFile *f, const uint8_t *buf, size_t size); +void qemu_put_buffer_async(QEMUFile *f, const uint8_t *buf, size_t size, + bool may_free); bool qemu_file_mode_is_not_valid(const char *mode); bool qemu_file_is_writable(QEMUFile *f); diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h index 6233fe2e5b..63e7b02e05 100644 --- a/include/migration/vmstate.h +++ b/include/migration/vmstate.h @@ -259,6 +259,7 @@ extern const VMStateInfo vmstate_info_cpudouble; extern const VMStateInfo vmstate_info_timer; extern const VMStateInfo vmstate_info_buffer; extern const VMStateInfo vmstate_info_unused_buffer; +extern const VMStateInfo vmstate_info_tmp; extern const VMStateInfo vmstate_info_bitmap; extern const VMStateInfo vmstate_info_qtailq; @@ -587,7 +588,8 @@ extern const VMStateInfo vmstate_info_qtailq; .offset = vmstate_offset_buffer(_state, _field) + _start, \ } -#define VMSTATE_VBUFFER_MULTIPLY(_field, _state, _version, _test, _start, _field_size, _multiply) { \ +#define VMSTATE_VBUFFER_MULTIPLY(_field, _state, _version, _test, \ + _field_size, _multiply) { \ .name = (stringify(_field)), \ .version_id = (_version), \ .field_exists = (_test), \ @@ -596,10 +598,9 @@ extern const VMStateInfo vmstate_info_qtailq; .info = &vmstate_info_buffer, \ .flags = VMS_VBUFFER|VMS_POINTER|VMS_MULTIPLY, \ .offset = offsetof(_state, _field), \ - .start = (_start), \ } -#define VMSTATE_VBUFFER(_field, _state, _version, _test, _start, _field_size) { \ +#define VMSTATE_VBUFFER(_field, _state, _version, _test, _field_size) { \ .name = (stringify(_field)), \ .version_id = (_version), \ .field_exists = (_test), \ @@ -607,10 +608,9 @@ extern const VMStateInfo vmstate_info_qtailq; .info = &vmstate_info_buffer, \ .flags = VMS_VBUFFER|VMS_POINTER, \ .offset = offsetof(_state, _field), \ - .start = (_start), \ } -#define VMSTATE_VBUFFER_UINT32(_field, _state, _version, _test, _start, _field_size) { \ +#define VMSTATE_VBUFFER_UINT32(_field, _state, _version, _test, _field_size) { \ .name = (stringify(_field)), \ .version_id = (_version), \ .field_exists = (_test), \ @@ -618,10 +618,10 @@ extern const VMStateInfo vmstate_info_qtailq; .info = &vmstate_info_buffer, \ .flags = VMS_VBUFFER|VMS_POINTER, \ .offset = offsetof(_state, _field), \ - .start = (_start), \ } -#define VMSTATE_VBUFFER_ALLOC_UINT32(_field, _state, _version, _test, _start, _field_size) { \ +#define VMSTATE_VBUFFER_ALLOC_UINT32(_field, _state, _version, \ + _test, _field_size) { \ .name = (stringify(_field)), \ .version_id = (_version), \ .field_exists = (_test), \ @@ -629,7 +629,6 @@ extern const VMStateInfo vmstate_info_qtailq; .info = &vmstate_info_buffer, \ .flags = VMS_VBUFFER|VMS_POINTER|VMS_ALLOC, \ .offset = offsetof(_state, _field), \ - .start = (_start), \ } #define VMSTATE_BUFFER_UNSAFE_INFO_TEST(_field, _state, _test, _version, _info, _size) { \ @@ -651,6 +650,24 @@ extern const VMStateInfo vmstate_info_qtailq; .offset = offsetof(_state, _field), \ } +/* Allocate a temporary of type 'tmp_type', set tmp->parent to _state + * and execute the vmsd on the temporary. Note that we're working with + * the whole of _state here, not a field within it. + * We compile time check that: + * That _tmp_type contains a 'parent' member that's a pointer to the + * '_state' type + * That the pointer is right at the start of _tmp_type. + */ +#define VMSTATE_WITH_TMP(_state, _tmp_type, _vmsd) { \ + .name = "tmp", \ + .size = sizeof(_tmp_type) + \ + QEMU_BUILD_BUG_ON_ZERO(offsetof(_tmp_type, parent) != 0) + \ + type_check_pointer(_state, \ + typeof_field(_tmp_type, parent)), \ + .vmsd = &(_vmsd), \ + .info = &vmstate_info_tmp, \ +} + #define VMSTATE_UNUSED_BUFFER(_test, _version, _size) { \ .name = "unused", \ .field_exists = (_test), \ @@ -660,6 +677,17 @@ extern const VMStateInfo vmstate_info_qtailq; .flags = VMS_BUFFER, \ } +/* Discard size * field_num bytes, where field_num is a uint32 member */ +#define VMSTATE_UNUSED_VARRAY_UINT32(_state, _test, _version, _field_num, _size) {\ + .name = "unused", \ + .field_exists = (_test), \ + .num_offset = vmstate_offset_value(_state, _field_num, uint32_t),\ + .version_id = (_version), \ + .size = (_size), \ + .info = &vmstate_info_unused_buffer, \ + .flags = VMS_VARRAY_UINT32 | VMS_BUFFER, \ +} + /* _field_size should be a int32_t field in the _state struct giving the * size of the bitmap _field in bits. */ @@ -948,13 +976,10 @@ extern const VMStateInfo vmstate_info_qtailq; VMSTATE_BUFFER_START_MIDDLE_V(_f, _s, _start, 0) #define VMSTATE_PARTIAL_VBUFFER(_f, _s, _size) \ - VMSTATE_VBUFFER(_f, _s, 0, NULL, 0, _size) + VMSTATE_VBUFFER(_f, _s, 0, NULL, _size) #define VMSTATE_PARTIAL_VBUFFER_UINT32(_f, _s, _size) \ - VMSTATE_VBUFFER_UINT32(_f, _s, 0, NULL, 0, _size) - -#define VMSTATE_SUB_VBUFFER(_f, _s, _start, _size) \ - VMSTATE_VBUFFER(_f, _s, 0, NULL, _start, _size) + VMSTATE_VBUFFER_UINT32(_f, _s, 0, NULL, _size) #define VMSTATE_BUFFER_TEST(_f, _s, _test) \ VMSTATE_STATIC_BUFFER(_f, _s, 0, _test, 0, sizeof(typeof_field(_s, _f))) diff --git a/migration/colo.c b/migration/colo.c index 93c85c538b..712308ed5e 100644 --- a/migration/colo.c +++ b/migration/colo.c @@ -20,6 +20,8 @@ #include "qapi/error.h" #include "migration/failover.h" +static bool vmstate_loading; + #define COLO_BUFFER_BASE_SIZE (4 * 1024 * 1024) bool colo_supported(void) @@ -51,6 +53,19 @@ static void secondary_vm_do_failover(void) int old_state; MigrationIncomingState *mis = migration_incoming_get_current(); + /* Can not do failover during the process of VM's loading VMstate, Or + * it will break the secondary VM. + */ + if (vmstate_loading) { + old_state = failover_set_state(FAILOVER_STATUS_ACTIVE, + FAILOVER_STATUS_RELAUNCH); + if (old_state != FAILOVER_STATUS_ACTIVE) { + error_report("Unknown error while do failover for secondary VM," + "old_state: %s", FailoverStatus_lookup[old_state]); + } + return; + } + migrate_set_state(&mis->state, MIGRATION_STATUS_COLO, MIGRATION_STATUS_COMPLETED); @@ -59,6 +74,18 @@ static void secondary_vm_do_failover(void) /* recover runstate to normal migration finish state */ autostart = true; } + /* + * Make sure COLO incoming thread not block in recv or send, + * If mis->from_src_file and mis->to_src_file use the same fd, + * The second shutdown() will return -1, we ignore this value, + * It is harmless. + */ + if (mis->from_src_file) { + qemu_file_shutdown(mis->from_src_file); + } + if (mis->to_src_file) { + qemu_file_shutdown(mis->to_src_file); + } old_state = failover_set_state(FAILOVER_STATUS_ACTIVE, FAILOVER_STATUS_COMPLETED); @@ -67,6 +94,8 @@ static void secondary_vm_do_failover(void) "secondary VM", FailoverStatus_lookup[old_state]); return; } + /* Notify COLO incoming thread that failover work is finished */ + qemu_sem_post(&mis->colo_incoming_sem); /* For Secondary VM, jump to incoming co */ if (mis->migration_incoming_co) { qemu_coroutine_enter(mis->migration_incoming_co); @@ -81,6 +110,18 @@ static void primary_vm_do_failover(void) migrate_set_state(&s->state, MIGRATION_STATUS_COLO, MIGRATION_STATUS_COMPLETED); + /* + * Wake up COLO thread which may blocked in recv() or send(), + * The s->rp_state.from_dst_file and s->to_dst_file may use the + * same fd, but we still shutdown the fd for twice, it is harmless. + */ + if (s->to_dst_file) { + qemu_file_shutdown(s->to_dst_file); + } + if (s->rp_state.from_dst_file) { + qemu_file_shutdown(s->rp_state.from_dst_file); + } + old_state = failover_set_state(FAILOVER_STATUS_ACTIVE, FAILOVER_STATUS_COMPLETED); if (old_state != FAILOVER_STATUS_ACTIVE) { @@ -88,6 +129,8 @@ static void primary_vm_do_failover(void) FailoverStatus_lookup[old_state]); return; } + /* Notify COLO thread that failover work is finished */ + qemu_sem_post(&s->colo_exit_sem); } void colo_do_failover(MigrationState *s) @@ -302,7 +345,7 @@ static void colo_process_checkpoint(MigrationState *s) { QIOChannelBuffer *bioc; QEMUFile *fb = NULL; - int64_t current_time, checkpoint_time = qemu_clock_get_ms(QEMU_CLOCK_HOST); + int64_t current_time = qemu_clock_get_ms(QEMU_CLOCK_HOST); Error *local_err = NULL; int ret; @@ -332,26 +375,21 @@ static void colo_process_checkpoint(MigrationState *s) qemu_mutex_unlock_iothread(); trace_colo_vm_state_change("stop", "run"); + timer_mod(s->colo_delay_timer, + current_time + s->parameters.x_checkpoint_delay); + while (s->state == MIGRATION_STATUS_COLO) { if (failover_get_state() != FAILOVER_STATUS_NONE) { error_report("failover request"); goto out; } - current_time = qemu_clock_get_ms(QEMU_CLOCK_HOST); - if (current_time - checkpoint_time < - s->parameters.x_checkpoint_delay) { - int64_t delay_ms; + qemu_sem_wait(&s->colo_checkpoint_sem); - delay_ms = s->parameters.x_checkpoint_delay - - (current_time - checkpoint_time); - g_usleep(delay_ms * 1000); - } ret = colo_do_checkpoint_transaction(s, bioc, fb); if (ret < 0) { goto out; } - checkpoint_time = qemu_clock_get_ms(QEMU_CLOCK_HOST); } out: @@ -364,14 +402,41 @@ out: qemu_fclose(fb); } + timer_del(s->colo_delay_timer); + + /* Hope this not to be too long to wait here */ + qemu_sem_wait(&s->colo_exit_sem); + qemu_sem_destroy(&s->colo_exit_sem); + /* + * Must be called after failover BH is completed, + * Or the failover BH may shutdown the wrong fd that + * re-used by other threads after we release here. + */ if (s->rp_state.from_dst_file) { qemu_fclose(s->rp_state.from_dst_file); } } +void colo_checkpoint_notify(void *opaque) +{ + MigrationState *s = opaque; + int64_t next_notify_time; + + qemu_sem_post(&s->colo_checkpoint_sem); + s->colo_checkpoint_time = qemu_clock_get_ms(QEMU_CLOCK_HOST); + next_notify_time = s->colo_checkpoint_time + + s->parameters.x_checkpoint_delay; + timer_mod(s->colo_delay_timer, next_notify_time); +} + void migrate_start_colo_process(MigrationState *s) { qemu_mutex_unlock_iothread(); + qemu_sem_init(&s->colo_checkpoint_sem, 0); + s->colo_delay_timer = timer_new_ms(QEMU_CLOCK_HOST, + colo_checkpoint_notify, s); + + qemu_sem_init(&s->colo_exit_sem, 0); migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE, MIGRATION_STATUS_COLO); colo_process_checkpoint(s); @@ -410,6 +475,8 @@ void *colo_process_incoming_thread(void *opaque) uint64_t value; Error *local_err = NULL; + qemu_sem_init(&mis->colo_incoming_sem, 0); + migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE, MIGRATION_STATUS_COLO); @@ -496,13 +563,23 @@ void *colo_process_incoming_thread(void *opaque) qemu_mutex_lock_iothread(); qemu_system_reset(VMRESET_SILENT); + vmstate_loading = true; if (qemu_loadvm_state(fb) < 0) { error_report("COLO: loadvm failed"); qemu_mutex_unlock_iothread(); goto out; } + + vmstate_loading = false; qemu_mutex_unlock_iothread(); + if (failover_get_state() == FAILOVER_STATUS_RELAUNCH) { + failover_set_state(FAILOVER_STATUS_RELAUNCH, + FAILOVER_STATUS_NONE); + failover_request_active(NULL); + goto out; + } + colo_send_message(mis->to_src_file, COLO_MESSAGE_VMSTATE_LOADED, &local_err); if (local_err) { @@ -511,6 +588,7 @@ void *colo_process_incoming_thread(void *opaque) } out: + vmstate_loading = false; /* Throw the unreported error message after exited from loop */ if (local_err) { error_report_err(local_err); @@ -520,6 +598,10 @@ out: qemu_fclose(fb); } + /* Hope this not to be too long to loop here */ + qemu_sem_wait(&mis->colo_incoming_sem); + qemu_sem_destroy(&mis->colo_incoming_sem); + /* Must be called after failover BH is completed */ if (mis->to_src_file) { qemu_fclose(mis->to_src_file); } diff --git a/migration/migration.c b/migration/migration.c index 2b179c69fa..c6ae69d371 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -891,6 +891,9 @@ void qmp_migrate_set_parameters(MigrationParameters *params, Error **errp) if (params->has_x_checkpoint_delay) { s->parameters.x_checkpoint_delay = params->x_checkpoint_delay; + if (migration_in_colo_state()) { + colo_checkpoint_notify(s); + } } } @@ -1297,6 +1300,15 @@ void qmp_migrate_set_downtime(double value, Error **errp) qmp_migrate_set_parameters(&p, errp); } +bool migrate_release_ram(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->enabled_capabilities[MIGRATION_CAPABILITY_RELEASE_RAM]; +} + bool migrate_postcopy_ram(void) { MigrationState *s; @@ -1713,6 +1725,10 @@ static int postcopy_start(MigrationState *ms, bool *old_vm_running) */ qemu_savevm_send_ping(ms->to_dst_file, 4); + if (migrate_release_ram()) { + ram_postcopy_migrated_memory_release(ms); + } + ret = qemu_file_get_error(ms->to_dst_file); if (ret) { error_report("postcopy_start: Migration stream errored"); diff --git a/migration/qemu-file.c b/migration/qemu-file.c index e9fae31158..195fa94fcf 100644 --- a/migration/qemu-file.c +++ b/migration/qemu-file.c @@ -49,6 +49,7 @@ struct QEMUFile { int buf_size; /* 0 when writing */ uint8_t buf[IO_BUF_SIZE]; + DECLARE_BITMAP(may_free, MAX_IOV_SIZE); struct iovec iov[MAX_IOV_SIZE]; unsigned int iovcnt; @@ -132,6 +133,41 @@ bool qemu_file_is_writable(QEMUFile *f) return f->ops->writev_buffer; } +static void qemu_iovec_release_ram(QEMUFile *f) +{ + struct iovec iov; + unsigned long idx; + + /* Find and release all the contiguous memory ranges marked as may_free. */ + idx = find_next_bit(f->may_free, f->iovcnt, 0); + if (idx >= f->iovcnt) { + return; + } + iov = f->iov[idx]; + + /* The madvise() in the loop is called for iov within a continuous range and + * then reinitialize the iov. And in the end, madvise() is called for the + * last iov. + */ + while ((idx = find_next_bit(f->may_free, f->iovcnt, idx + 1)) < f->iovcnt) { + /* check for adjacent buffer and coalesce them */ + if (iov.iov_base + iov.iov_len == f->iov[idx].iov_base) { + iov.iov_len += f->iov[idx].iov_len; + continue; + } + if (qemu_madvise(iov.iov_base, iov.iov_len, QEMU_MADV_DONTNEED) < 0) { + error_report("migrate: madvise DONTNEED failed %p %zd: %s", + iov.iov_base, iov.iov_len, strerror(errno)); + } + iov = f->iov[idx]; + } + if (qemu_madvise(iov.iov_base, iov.iov_len, QEMU_MADV_DONTNEED) < 0) { + error_report("migrate: madvise DONTNEED failed %p %zd: %s", + iov.iov_base, iov.iov_len, strerror(errno)); + } + memset(f->may_free, 0, sizeof(f->may_free)); +} + /** * Flushes QEMUFile buffer * @@ -151,6 +187,8 @@ void qemu_fflush(QEMUFile *f) if (f->iovcnt > 0) { expect = iov_size(f->iov, f->iovcnt); ret = f->ops->writev_buffer(f->opaque, f->iov, f->iovcnt, f->pos); + + qemu_iovec_release_ram(f); } if (ret >= 0) { @@ -304,13 +342,19 @@ int qemu_fclose(QEMUFile *f) return ret; } -static void add_to_iovec(QEMUFile *f, const uint8_t *buf, size_t size) +static void add_to_iovec(QEMUFile *f, const uint8_t *buf, size_t size, + bool may_free) { /* check for adjacent buffer and coalesce them */ if (f->iovcnt > 0 && buf == f->iov[f->iovcnt - 1].iov_base + - f->iov[f->iovcnt - 1].iov_len) { + f->iov[f->iovcnt - 1].iov_len && + may_free == test_bit(f->iovcnt - 1, f->may_free)) + { f->iov[f->iovcnt - 1].iov_len += size; } else { + if (may_free) { + set_bit(f->iovcnt, f->may_free); + } f->iov[f->iovcnt].iov_base = (uint8_t *)buf; f->iov[f->iovcnt++].iov_len = size; } @@ -320,14 +364,15 @@ static void add_to_iovec(QEMUFile *f, const uint8_t *buf, size_t size) } } -void qemu_put_buffer_async(QEMUFile *f, const uint8_t *buf, size_t size) +void qemu_put_buffer_async(QEMUFile *f, const uint8_t *buf, size_t size, + bool may_free) { if (f->last_error) { return; } f->bytes_xfer += size; - add_to_iovec(f, buf, size); + add_to_iovec(f, buf, size, may_free); } void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, size_t size) @@ -345,7 +390,7 @@ void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, size_t size) } memcpy(f->buf + f->buf_index, buf, l); f->bytes_xfer += l; - add_to_iovec(f, f->buf + f->buf_index, l); + add_to_iovec(f, f->buf + f->buf_index, l, false); f->buf_index += l; if (f->buf_index == IO_BUF_SIZE) { qemu_fflush(f); @@ -366,7 +411,7 @@ void qemu_put_byte(QEMUFile *f, int v) f->buf[f->buf_index] = v; f->bytes_xfer++; - add_to_iovec(f, f->buf + f->buf_index, 1); + add_to_iovec(f, f->buf + f->buf_index, 1, false); f->buf_index++; if (f->buf_index == IO_BUF_SIZE) { qemu_fflush(f); @@ -647,7 +692,7 @@ ssize_t qemu_put_compression_data(QEMUFile *f, const uint8_t *p, size_t size, } qemu_put_be32(f, blen); if (f->ops->writev_buffer) { - add_to_iovec(f, f->buf + f->buf_index, blen); + add_to_iovec(f, f->buf + f->buf_index, blen, false); } f->buf_index += blen; if (f->buf_index == IO_BUF_SIZE) { diff --git a/migration/ram.c b/migration/ram.c index ef8fadfe69..f289fcddd5 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -705,6 +705,16 @@ static int save_zero_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset, return pages; } +static void ram_release_pages(MigrationState *ms, const char *block_name, + uint64_t offset, int pages) +{ + if (!migrate_release_ram() || !migration_in_postcopy(ms)) { + return; + } + + ram_discard_range(NULL, block_name, offset, pages << TARGET_PAGE_BITS); +} + /** * ram_save_page: Send the given page to the stream * @@ -713,13 +723,14 @@ static int save_zero_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset, * >=0 - Number of pages written - this might legally be 0 * if xbzrle noticed the page was the same. * + * @ms: The current migration state. * @f: QEMUFile where to send the data * @block: block that contains the page we want to send * @offset: offset inside the block for the page * @last_stage: if we are at the completion stage * @bytes_transferred: increase it with the number of transferred bytes */ -static int ram_save_page(QEMUFile *f, PageSearchStatus *pss, +static int ram_save_page(MigrationState *ms, QEMUFile *f, PageSearchStatus *pss, bool last_stage, uint64_t *bytes_transferred) { int pages = -1; @@ -764,9 +775,9 @@ static int ram_save_page(QEMUFile *f, PageSearchStatus *pss, * page would be stale */ xbzrle_cache_zero_page(current_addr); + ram_release_pages(ms, block->idstr, pss->offset, pages); } else if (!ram_bulk_stage && - !migration_in_postcopy(migrate_get_current()) && - migrate_use_xbzrle()) { + !migration_in_postcopy(ms) && migrate_use_xbzrle()) { pages = save_xbzrle_page(f, &p, current_addr, block, offset, last_stage, bytes_transferred); if (!last_stage) { @@ -783,7 +794,9 @@ static int ram_save_page(QEMUFile *f, PageSearchStatus *pss, *bytes_transferred += save_page_header(f, block, offset | RAM_SAVE_FLAG_PAGE); if (send_async) { - qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE); + qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE, + migrate_release_ram() & + migration_in_postcopy(ms)); } else { qemu_put_buffer(f, p, TARGET_PAGE_SIZE); } @@ -813,6 +826,8 @@ static int do_compress_ram_page(QEMUFile *f, RAMBlock *block, error_report("compressed data failed!"); } else { bytes_sent += blen; + ram_release_pages(migrate_get_current(), block->idstr, + offset & TARGET_PAGE_MASK, 1); } return bytes_sent; @@ -893,14 +908,15 @@ static int compress_page_with_multi_thread(QEMUFile *f, RAMBlock *block, * * Returns: Number of pages written. * + * @ms: The current migration state. * @f: QEMUFile where to send the data * @block: block that contains the page we want to send * @offset: offset inside the block for the page * @last_stage: if we are at the completion stage * @bytes_transferred: increase it with the number of transferred bytes */ -static int ram_save_compressed_page(QEMUFile *f, PageSearchStatus *pss, - bool last_stage, +static int ram_save_compressed_page(MigrationState *ms, QEMUFile *f, + PageSearchStatus *pss, bool last_stage, uint64_t *bytes_transferred) { int pages = -1; @@ -951,12 +967,17 @@ static int ram_save_compressed_page(QEMUFile *f, PageSearchStatus *pss, error_report("compressed data failed!"); } } + if (pages > 0) { + ram_release_pages(ms, block->idstr, pss->offset, pages); + } } else { offset |= RAM_SAVE_FLAG_CONTINUE; pages = save_zero_page(f, block, offset, p, bytes_transferred); if (pages == -1) { pages = compress_page_with_multi_thread(f, block, offset, bytes_transferred); + } else { + ram_release_pages(ms, block->idstr, pss->offset, pages); } } } @@ -1231,11 +1252,11 @@ static int ram_save_target_page(MigrationState *ms, QEMUFile *f, if (migration_bitmap_clear_dirty(dirty_ram_abs)) { unsigned long *unsentmap; if (compression_switch && migrate_use_compression()) { - res = ram_save_compressed_page(f, pss, + res = ram_save_compressed_page(ms, f, pss, last_stage, bytes_transferred); } else { - res = ram_save_page(f, pss, last_stage, + res = ram_save_page(ms, f, pss, last_stage, bytes_transferred); } @@ -1325,6 +1346,11 @@ static int ram_find_and_save_block(QEMUFile *f, bool last_stage, ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in ram_addr_t space */ + /* No dirty page as there is zero RAM */ + if (!ram_bytes_total()) { + return pages; + } + pss.block = last_seen_block; pss.offset = last_offset; pss.complete_round = false; @@ -1516,6 +1542,25 @@ void ram_debug_dump_bitmap(unsigned long *todump, bool expected) /* **** functions for postcopy ***** */ +void ram_postcopy_migrated_memory_release(MigrationState *ms) +{ + struct RAMBlock *block; + unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap; + + QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { + unsigned long first = block->offset >> TARGET_PAGE_BITS; + unsigned long range = first + (block->used_length >> TARGET_PAGE_BITS); + unsigned long run_start = find_next_zero_bit(bitmap, range, first); + + while (run_start < range) { + unsigned long run_end = find_next_bit(bitmap, range, run_start + 1); + ram_discard_range(NULL, block->idstr, run_start << TARGET_PAGE_BITS, + (run_end - run_start) << TARGET_PAGE_BITS); + run_start = find_next_zero_bit(bitmap, range, run_end + 1); + } + } +} + /* * Callback from postcopy_each_ram_send_discard for each RAMBlock * Note: At this point the 'unsentmap' is the processed bitmap combined @@ -1912,14 +1957,17 @@ static int ram_save_init_globals(void) bytes_transferred = 0; reset_ram_globals(); - ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS; migration_bitmap_rcu = g_new0(struct BitmapRcu, 1); - migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages); - bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages); - - if (migrate_postcopy_ram()) { - migration_bitmap_rcu->unsentmap = bitmap_new(ram_bitmap_pages); - bitmap_set(migration_bitmap_rcu->unsentmap, 0, ram_bitmap_pages); + /* Skip setting bitmap if there is no RAM */ + if (ram_bytes_total()) { + ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS; + migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages); + bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages); + + if (migrate_postcopy_ram()) { + migration_bitmap_rcu->unsentmap = bitmap_new(ram_bitmap_pages); + bitmap_set(migration_bitmap_rcu->unsentmap, 0, ram_bitmap_pages); + } } /* diff --git a/migration/savevm.c b/migration/savevm.c index 01997687c4..5ecd264134 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -356,7 +356,7 @@ static const VMStateDescription vmstate_configuration = { .pre_save = configuration_pre_save, .fields = (VMStateField[]) { VMSTATE_UINT32(len, SaveState), - VMSTATE_VBUFFER_ALLOC_UINT32(name, SaveState, 0, NULL, 0, len), + VMSTATE_VBUFFER_ALLOC_UINT32(name, SaveState, 0, NULL, len), VMSTATE_END_OF_LIST() }, .subsections = (const VMStateDescription*[]) { diff --git a/migration/vmstate.c b/migration/vmstate.c index 2b2b3a58e6..b4d8ae982a 100644 --- a/migration/vmstate.c +++ b/migration/vmstate.c @@ -68,10 +68,10 @@ static void *vmstate_base_addr(void *opaque, VMStateField *field, bool alloc) } } if (size) { - *((void **)base_addr + field->start) = g_malloc(size); + *(void **)base_addr = g_malloc(size); } } - base_addr = *(void **)base_addr + field->start; + base_addr = *(void **)base_addr; } return base_addr; @@ -935,6 +935,46 @@ const VMStateInfo vmstate_info_unused_buffer = { .put = put_unused_buffer, }; +/* vmstate_info_tmp, see VMSTATE_WITH_TMP, the idea is that we allocate + * a temporary buffer and the pre_load/pre_save methods in the child vmsd + * copy stuff from the parent into the child and do calculations to fill + * in fields that don't really exist in the parent but need to be in the + * stream. + */ +static int get_tmp(QEMUFile *f, void *pv, size_t size, VMStateField *field) +{ + int ret; + const VMStateDescription *vmsd = field->vmsd; + int version_id = field->version_id; + void *tmp = g_malloc(size); + + /* Writes the parent field which is at the start of the tmp */ + *(void **)tmp = pv; + ret = vmstate_load_state(f, vmsd, tmp, version_id); + g_free(tmp); + return ret; +} + +static int put_tmp(QEMUFile *f, void *pv, size_t size, VMStateField *field, + QJSON *vmdesc) +{ + const VMStateDescription *vmsd = field->vmsd; + void *tmp = g_malloc(size); + + /* Writes the parent field which is at the start of the tmp */ + *(void **)tmp = pv; + vmstate_save_state(f, vmsd, tmp, vmdesc); + g_free(tmp); + + return 0; +} + +const VMStateInfo vmstate_info_tmp = { + .name = "tmp", + .get = get_tmp, + .put = put_tmp, +}; + /* bitmaps (as defined by bitmap.h). Note that size here is the size * of the bitmap in bits. The on-the-wire format of a bitmap is 64 * bit words with the bits in big endian order. The in-memory format diff --git a/qapi-schema.json b/qapi-schema.json index 61151f34d0..5edb08d621 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -865,11 +865,14 @@ # side, this process is called COarse-Grain LOck Stepping (COLO) for # Non-stop Service. (since 2.8) # +# @release-ram: if enabled, qemu will free the migrated ram pages on the source +# during postcopy-ram migration. (since 2.9) +# # Since: 1.2 ## { 'enum': 'MigrationCapability', 'data': ['xbzrle', 'rdma-pin-all', 'auto-converge', 'zero-blocks', - 'compress', 'events', 'postcopy-ram', 'x-colo'] } + 'compress', 'events', 'postcopy-ram', 'x-colo', 'release-ram'] } ## # @MigrationCapabilityStatus: @@ -1190,10 +1193,12 @@ # # @completed: finish the process of failover # +# @relaunch: restart the failover process, from 'none' -> 'completed' (Since 2.9) +# # Since: 2.8 ## { 'enum': 'FailoverStatus', - 'data': [ 'none', 'require', 'active', 'completed'] } + 'data': [ 'none', 'require', 'active', 'completed', 'relaunch' ] } ## # @x-colo-lost-heartbeat: diff --git a/target/s390x/machine.c b/target/s390x/machine.c index edc3a4717b..8503fa1c8d 100644 --- a/target/s390x/machine.c +++ b/target/s390x/machine.c @@ -180,7 +180,7 @@ const VMStateDescription vmstate_s390_cpu = { VMSTATE_UINT8(env.cpu_state, S390CPU), VMSTATE_UINT8(env.sigp_order, S390CPU), VMSTATE_UINT32_V(irqstate_saved_size, S390CPU, 4), - VMSTATE_VBUFFER_UINT32(irqstate, S390CPU, 4, NULL, 0, + VMSTATE_VBUFFER_UINT32(irqstate, S390CPU, 4, NULL, irqstate_saved_size), VMSTATE_END_OF_LIST() }, diff --git a/tests/test-vmstate.c b/tests/test-vmstate.c index 9d87faf12b..d0dd390006 100644 --- a/tests/test-vmstate.c +++ b/tests/test-vmstate.c @@ -90,7 +90,7 @@ static void save_buffer(const uint8_t *buf, size_t buf_size) qemu_fclose(fsave); } -static void compare_vmstate(uint8_t *wire, size_t size) +static void compare_vmstate(const uint8_t *wire, size_t size) { QEMUFile *f = open_test_file(false); uint8_t result[size]; @@ -113,7 +113,7 @@ static void compare_vmstate(uint8_t *wire, size_t size) } static int load_vmstate_one(const VMStateDescription *desc, void *obj, - int version, uint8_t *wire, size_t size) + int version, const uint8_t *wire, size_t size) { QEMUFile *f; int ret; @@ -137,7 +137,7 @@ static int load_vmstate_one(const VMStateDescription *desc, void *obj, static int load_vmstate(const VMStateDescription *desc, void *obj, void *obj_clone, void (*obj_copy)(void *, void*), - int version, uint8_t *wire, size_t size) + int version, const uint8_t *wire, size_t size) { /* We test with zero size */ obj_copy(obj_clone, obj); @@ -289,7 +289,6 @@ static void test_simple_primitive(void) FIELD_EQUAL(i64_1); FIELD_EQUAL(i64_2); } -#undef FIELD_EQUAL typedef struct TestStruct { uint32_t a, b, c, e; @@ -474,7 +473,6 @@ static void test_load_skip(void) qemu_fclose(loading); } - typedef struct { int32_t i; } TestStructTriv; @@ -688,6 +686,94 @@ static void test_load_q(void) qemu_fclose(fload); } +typedef struct TmpTestStruct { + TestStruct *parent; + int64_t diff; +} TmpTestStruct; + +static void tmp_child_pre_save(void *opaque) +{ + struct TmpTestStruct *tts = opaque; + + tts->diff = tts->parent->b - tts->parent->a; +} + +static int tmp_child_post_load(void *opaque, int version_id) +{ + struct TmpTestStruct *tts = opaque; + + tts->parent->b = tts->parent->a + tts->diff; + + return 0; +} + +static const VMStateDescription vmstate_tmp_back_to_parent = { + .name = "test/tmp_child_parent", + .fields = (VMStateField[]) { + VMSTATE_UINT64(f, TestStruct), + VMSTATE_END_OF_LIST() + } +}; + +static const VMStateDescription vmstate_tmp_child = { + .name = "test/tmp_child", + .pre_save = tmp_child_pre_save, + .post_load = tmp_child_post_load, + .fields = (VMStateField[]) { + VMSTATE_INT64(diff, TmpTestStruct), + VMSTATE_STRUCT_POINTER(parent, TmpTestStruct, + vmstate_tmp_back_to_parent, TestStruct), + VMSTATE_END_OF_LIST() + } +}; + +static const VMStateDescription vmstate_with_tmp = { + .name = "test/with_tmp", + .version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_UINT32(a, TestStruct), + VMSTATE_UINT64(d, TestStruct), + VMSTATE_WITH_TMP(TestStruct, TmpTestStruct, vmstate_tmp_child), + VMSTATE_END_OF_LIST() + } +}; + +static void obj_tmp_copy(void *target, void *source) +{ + memcpy(target, source, sizeof(TestStruct)); +} + +static void test_tmp_struct(void) +{ + TestStruct obj, obj_clone; + + uint8_t const wire_with_tmp[] = { + /* u32 a */ 0x00, 0x00, 0x00, 0x02, + /* u64 d */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, + /* diff */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, + /* u64 f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, + QEMU_VM_EOF, /* just to ensure we won't get EOF reported prematurely */ + }; + + memset(&obj, 0, sizeof(obj)); + obj.a = 2; + obj.b = 4; + obj.d = 1; + obj.f = 8; + save_vmstate(&vmstate_with_tmp, &obj); + + compare_vmstate(wire_with_tmp, sizeof(wire_with_tmp)); + + memset(&obj, 0, sizeof(obj)); + SUCCESS(load_vmstate(&vmstate_with_tmp, &obj, &obj_clone, + obj_tmp_copy, 1, wire_with_tmp, + sizeof(wire_with_tmp))); + g_assert_cmpint(obj.a, ==, 2); /* From top level vmsd */ + g_assert_cmpint(obj.b, ==, 4); /* from the post_load */ + g_assert_cmpint(obj.d, ==, 1); /* From top level vmsd */ + g_assert_cmpint(obj.f, ==, 8); /* From the child->parent */ +} + int main(int argc, char **argv) { temp_fd = mkstemp(temp_file); @@ -708,7 +794,7 @@ int main(int argc, char **argv) test_arr_ptr_str_no0_load); g_test_add_func("/vmstate/qtailq/save/saveq", test_save_q); g_test_add_func("/vmstate/qtailq/load/loadq", test_load_q); - + g_test_add_func("/vmstate/tmp_struct", test_tmp_struct); g_test_run(); close(temp_fd); diff --git a/util/fifo8.c b/util/fifo8.c index 5c64101b33..d38b3bdaa5 100644 --- a/util/fifo8.c +++ b/util/fifo8.c @@ -118,7 +118,7 @@ const VMStateDescription vmstate_fifo8 = { .version_id = 1, .minimum_version_id = 1, .fields = (VMStateField[]) { - VMSTATE_VBUFFER_UINT32(data, Fifo8, 1, NULL, 0, capacity), + VMSTATE_VBUFFER_UINT32(data, Fifo8, 1, NULL, capacity), VMSTATE_UINT32(head, Fifo8), VMSTATE_UINT32(num, Fifo8), VMSTATE_END_OF_LIST() |