diff options
-rw-r--r-- | arch_init.c | 64 | ||||
-rwxr-xr-x | configure | 10 | ||||
-rw-r--r-- | docs/rdma.txt | 24 | ||||
-rw-r--r-- | hw/audio/hda-codec.c | 60 | ||||
-rw-r--r-- | hw/net/vhost_net.c | 6 | ||||
-rw-r--r-- | hw/net/virtio-net.c | 16 | ||||
-rw-r--r-- | hw/net/vmxnet3.c | 22 | ||||
-rw-r--r-- | include/migration/page_cache.h | 2 | ||||
-rw-r--r-- | include/net/net.h | 19 | ||||
-rw-r--r-- | include/net/tap.h | 6 | ||||
-rw-r--r-- | migration-rdma.c | 2 | ||||
-rw-r--r-- | migration.c | 6 | ||||
-rw-r--r-- | net/net.c | 55 | ||||
-rw-r--r-- | net/netmap.c | 123 | ||||
-rw-r--r-- | net/tap-win32.c | 92 | ||||
-rw-r--r-- | net/tap.c | 20 | ||||
-rw-r--r-- | page_cache.c | 2 | ||||
-rw-r--r-- | qapi-schema.json | 7 | ||||
-rw-r--r-- | qemu-file.c | 9 | ||||
-rw-r--r-- | vmstate.c | 15 |
20 files changed, 360 insertions, 200 deletions
diff --git a/arch_init.c b/arch_init.c index 80574a090c..fe1727922c 100644 --- a/arch_init.c +++ b/arch_init.c @@ -122,7 +122,6 @@ static void check_guest_throttling(void); #define RAM_SAVE_FLAG_XBZRLE 0x40 /* 0x80 is reserved in migration.h start with 0x100 next */ - static struct defconfig_file { const char *filename; /* Indicates it is an user config file (disabled by -no-user-config) */ @@ -133,6 +132,7 @@ static struct defconfig_file { { NULL }, /* end of list */ }; +static const uint8_t ZERO_TARGET_PAGE[TARGET_PAGE_SIZE]; int qemu_read_default_config_files(bool userconfig) { @@ -273,6 +273,34 @@ static size_t save_block_hdr(QEMUFile *f, RAMBlock *block, ram_addr_t offset, return size; } +/* This is the last block that we have visited serching for dirty pages + */ +static RAMBlock *last_seen_block; +/* This is the last block from where we have sent data */ +static RAMBlock *last_sent_block; +static ram_addr_t last_offset; +static unsigned long *migration_bitmap; +static uint64_t migration_dirty_pages; +static uint32_t last_version; +static bool ram_bulk_stage; + +/* Update the xbzrle cache to reflect a page that's been sent as all 0. + * The important thing is that a stale (not-yet-0'd) page be replaced + * by the new data. 
+ * As a bonus, if the page wasn't in the cache it gets added so that + * when a small write is made into the 0'd page it gets XBZRLE sent + */ +static void xbzrle_cache_zero_page(ram_addr_t current_addr) +{ + if (ram_bulk_stage || !migrate_use_xbzrle()) { + return; + } + + /* We don't care if this fails to allocate a new cache page + * as long as it updated an old one */ + cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE); +} + #define ENCODING_FLAG_XBZRLE 0x1 static int save_xbzrle_page(QEMUFile *f, uint8_t *current_data, @@ -329,18 +357,6 @@ static int save_xbzrle_page(QEMUFile *f, uint8_t *current_data, return bytes_sent; } - -/* This is the last block that we have visited serching for dirty pages - */ -static RAMBlock *last_seen_block; -/* This is the last block from where we have sent data */ -static RAMBlock *last_sent_block; -static ram_addr_t last_offset; -static unsigned long *migration_bitmap; -static uint64_t migration_dirty_pages; -static uint32_t last_version; -static bool ram_bulk_stage; - static inline ram_addr_t migration_bitmap_find_and_reset_dirty(MemoryRegion *mr, ram_addr_t start) @@ -512,6 +528,7 @@ static int ram_save_block(QEMUFile *f, bool last_stage) } else { int ret; uint8_t *p; + bool send_async = true; int cont = (block == last_sent_block) ? 
RAM_SAVE_FLAG_CONTINUE : 0; @@ -522,6 +539,7 @@ static int ram_save_block(QEMUFile *f, bool last_stage) ret = ram_control_save_page(f, block->offset, offset, TARGET_PAGE_SIZE, &bytes_sent); + current_addr = block->offset + offset; if (ret != RAM_SAVE_CONTROL_NOT_SUPP) { if (ret != RAM_SAVE_CONTROL_DELAYED) { if (bytes_sent > 0) { @@ -536,19 +554,35 @@ static int ram_save_block(QEMUFile *f, bool last_stage) RAM_SAVE_FLAG_COMPRESS); qemu_put_byte(f, 0); bytes_sent++; + /* Must let xbzrle know, otherwise a previous (now 0'd) cached + * page would be stale + */ + xbzrle_cache_zero_page(current_addr); } else if (!ram_bulk_stage && migrate_use_xbzrle()) { - current_addr = block->offset + offset; bytes_sent = save_xbzrle_page(f, p, current_addr, block, offset, cont, last_stage); if (!last_stage) { + /* We must send exactly what's in the xbzrle cache + * even if the page wasn't xbzrle compressed, so that + * it's right next time. + */ p = get_cached_data(XBZRLE.cache, current_addr); + + /* Can't send this cached data async, since the cache page + * might get updated before it gets to the wire + */ + send_async = false; } } /* XBZRLE overflow or normal page */ if (bytes_sent == -1) { bytes_sent = save_block_hdr(f, block, offset, cont, RAM_SAVE_FLAG_PAGE); - qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE); + if (send_async) { + qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE); + } else { + qemu_put_buffer(f, p, TARGET_PAGE_SIZE); + } bytes_sent += TARGET_PAGE_SIZE; acct_info.norm_pages++; } @@ -2249,13 +2249,21 @@ EOF fi ########################################## -# netmap headers probe +# netmap support probe +# Apart from looking for netmap headers, we make sure that the host API version +# supports the netmap backend (>=11). The upper bound (15) is meant to simulate +# a minor/major version number. 
Minor new features will be marked with values up +# to 15, and if something happens that requires a change to the backend we will +# move above 15, submit the backend fixes and modify this two bounds. if test "$netmap" != "no" ; then cat > $TMPC << EOF #include <inttypes.h> #include <net/if.h> #include <net/netmap.h> #include <net/netmap_user.h> +#if (NETMAP_API < 11) || (NETMAP_API > 15) +#error +#endif int main(void) { return 0; } EOF if compile_prog "" "" ; then diff --git a/docs/rdma.txt b/docs/rdma.txt index 2aca63bd72..1f5d9e9fe4 100644 --- a/docs/rdma.txt +++ b/docs/rdma.txt @@ -66,7 +66,7 @@ bulk-phase round of the migration and can be enabled for extremely high-performance RDMA hardware using the following command: QEMU Monitor Command: -$ migrate_set_capability x-rdma-pin-all on # disabled by default +$ migrate_set_capability rdma-pin-all on # disabled by default Performing this action will cause all 8GB to be pinned, so if that's not what you want, then please ignore this step altogether. @@ -93,12 +93,12 @@ $ migrate_set_speed 40g # or whatever is the MAX of your RDMA device Next, on the destination machine, add the following to the QEMU command line: -qemu ..... -incoming x-rdma:host:port +qemu ..... -incoming rdma:host:port Finally, perform the actual migration on the source machine: QEMU Monitor Command: -$ migrate -d x-rdma:host:port +$ migrate -d rdma:host:port PERFORMANCE =========== @@ -120,8 +120,8 @@ For example, in the same 8GB RAM example with all 8GB of memory in active use and the VM itself is completely idle using the same 40 gbps infiniband link: -1. x-rdma-pin-all disabled total time: approximately 7.5 seconds @ 9.5 Gbps -2. x-rdma-pin-all enabled total time: approximately 4 seconds @ 26 Gbps +1. rdma-pin-all disabled total time: approximately 7.5 seconds @ 9.5 Gbps +2. rdma-pin-all enabled total time: approximately 4 seconds @ 26 Gbps These numbers would of course scale up to whatever size virtual machine you have to migrate using RDMA. 
@@ -407,18 +407,14 @@ socket is broken during a non-RDMA based migration. TODO: ===== -1. 'migrate x-rdma:host:port' and '-incoming x-rdma' options will be - renamed to 'rdma' after the experimental phase of this work has - completed upstream. -2. Currently, 'ulimit -l' mlock() limits as well as cgroups swap limits +1. Currently, 'ulimit -l' mlock() limits as well as cgroups swap limits are not compatible with infinband memory pinning and will result in an aborted migration (but with the source VM left unaffected). -3. Use of the recent /proc/<pid>/pagemap would likely speed up +2. Use of the recent /proc/<pid>/pagemap would likely speed up the use of KSM and ballooning while using RDMA. -4. Also, some form of balloon-device usage tracking would also +3. Also, some form of balloon-device usage tracking would also help alleviate some issues. -5. Move UNREGISTER requests to a separate thread. -6. Use LRU to provide more fine-grained direction of UNREGISTER +4. Use LRU to provide more fine-grained direction of UNREGISTER requests for unpinning memory in an overcommitted environment. -7. Expose UNREGISTER support to the user by way of workload-specific +5. Expose UNREGISTER support to the user by way of workload-specific hints about application behavior. 
diff --git a/hw/audio/hda-codec.c b/hw/audio/hda-codec.c index 986f2a9c92..a67ca91ca7 100644 --- a/hw/audio/hda-codec.c +++ b/hw/audio/hda-codec.c @@ -157,6 +157,9 @@ struct HDAAudioStream { uint32_t bpos; }; +#define TYPE_HDA_AUDIO "hda-audio" +#define HDA_AUDIO(obj) OBJECT_CHECK(HDAAudioState, (obj), TYPE_HDA_AUDIO) + struct HDAAudioState { HDACodecDevice hda; const char *name; @@ -288,7 +291,7 @@ static void hda_audio_setup(HDAAudioStream *st) static void hda_audio_command(HDACodecDevice *hda, uint32_t nid, uint32_t data) { - HDAAudioState *a = DO_UPCAST(HDAAudioState, hda, hda); + HDAAudioState *a = HDA_AUDIO(hda); HDAAudioStream *st; const desc_node *node = NULL; const desc_param *param; @@ -448,7 +451,7 @@ fail: static void hda_audio_stream(HDACodecDevice *hda, uint32_t stnr, bool running, bool output) { - HDAAudioState *a = DO_UPCAST(HDAAudioState, hda, hda); + HDAAudioState *a = HDA_AUDIO(hda); int s; a->running_compat[stnr] = running; @@ -469,7 +472,7 @@ static void hda_audio_stream(HDACodecDevice *hda, uint32_t stnr, bool running, b static int hda_audio_init(HDACodecDevice *hda, const struct desc_codec *desc) { - HDAAudioState *a = DO_UPCAST(HDAAudioState, hda, hda); + HDAAudioState *a = HDA_AUDIO(hda); HDAAudioStream *st; const desc_node *node; const desc_param *param; @@ -514,7 +517,7 @@ static int hda_audio_init(HDACodecDevice *hda, const struct desc_codec *desc) static int hda_audio_exit(HDACodecDevice *hda) { - HDAAudioState *a = DO_UPCAST(HDAAudioState, hda, hda); + HDAAudioState *a = HDA_AUDIO(hda); HDAAudioStream *st; int i; @@ -561,7 +564,7 @@ static int hda_audio_post_load(void *opaque, int version) static void hda_audio_reset(DeviceState *dev) { - HDAAudioState *a = DO_UPCAST(HDAAudioState, hda.qdev, dev); + HDAAudioState *a = HDA_AUDIO(dev); HDAAudioStream *st; int i; @@ -613,7 +616,7 @@ static Property hda_audio_properties[] = { static int hda_audio_init_output(HDACodecDevice *hda) { - HDAAudioState *a = DO_UPCAST(HDAAudioState, hda, hda); + 
HDAAudioState *a = HDA_AUDIO(hda); if (!a->mixer) { return hda_audio_init(hda, &output_nomixemu); @@ -624,7 +627,7 @@ static int hda_audio_init_output(HDACodecDevice *hda) static int hda_audio_init_duplex(HDACodecDevice *hda) { - HDAAudioState *a = DO_UPCAST(HDAAudioState, hda, hda); + HDAAudioState *a = HDA_AUDIO(hda); if (!a->mixer) { return hda_audio_init(hda, &duplex_nomixemu); @@ -635,7 +638,7 @@ static int hda_audio_init_duplex(HDACodecDevice *hda) static int hda_audio_init_micro(HDACodecDevice *hda) { - HDAAudioState *a = DO_UPCAST(HDAAudioState, hda, hda); + HDAAudioState *a = HDA_AUDIO(hda); if (!a->mixer) { return hda_audio_init(hda, &micro_nomixemu); @@ -644,25 +647,39 @@ static int hda_audio_init_micro(HDACodecDevice *hda) } } -static void hda_audio_output_class_init(ObjectClass *klass, void *data) +static void hda_audio_base_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); HDACodecDeviceClass *k = HDA_CODEC_DEVICE_CLASS(klass); - k->init = hda_audio_init_output; k->exit = hda_audio_exit; k->command = hda_audio_command; k->stream = hda_audio_stream; set_bit(DEVICE_CATEGORY_SOUND, dc->categories); - dc->desc = "HDA Audio Codec, output-only (line-out)"; dc->reset = hda_audio_reset; dc->vmsd = &vmstate_hda_audio; dc->props = hda_audio_properties; } +static const TypeInfo hda_audio_info = { + .name = TYPE_HDA_AUDIO, + .parent = TYPE_HDA_CODEC_DEVICE, + .class_init = hda_audio_base_class_init, + .abstract = true, +}; + +static void hda_audio_output_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + HDACodecDeviceClass *k = HDA_CODEC_DEVICE_CLASS(klass); + + k->init = hda_audio_init_output; + dc->desc = "HDA Audio Codec, output-only (line-out)"; +} + static const TypeInfo hda_audio_output_info = { .name = "hda-output", - .parent = TYPE_HDA_CODEC_DEVICE, + .parent = TYPE_HDA_AUDIO, .instance_size = sizeof(HDAAudioState), .class_init = hda_audio_output_class_init, }; @@ -673,19 +690,12 @@
static void hda_audio_duplex_class_init(ObjectClass *klass, void *data) HDACodecDeviceClass *k = HDA_CODEC_DEVICE_CLASS(klass); k->init = hda_audio_init_duplex; - k->exit = hda_audio_exit; - k->command = hda_audio_command; - k->stream = hda_audio_stream; - set_bit(DEVICE_CATEGORY_SOUND, dc->categories); dc->desc = "HDA Audio Codec, duplex (line-out, line-in)"; - dc->reset = hda_audio_reset; - dc->vmsd = &vmstate_hda_audio; - dc->props = hda_audio_properties; } static const TypeInfo hda_audio_duplex_info = { .name = "hda-duplex", - .parent = TYPE_HDA_CODEC_DEVICE, + .parent = TYPE_HDA_AUDIO, .instance_size = sizeof(HDAAudioState), .class_init = hda_audio_duplex_class_init, }; @@ -696,25 +706,19 @@ static void hda_audio_micro_class_init(ObjectClass *klass, void *data) HDACodecDeviceClass *k = HDA_CODEC_DEVICE_CLASS(klass); k->init = hda_audio_init_micro; - k->exit = hda_audio_exit; - k->command = hda_audio_command; - k->stream = hda_audio_stream; - set_bit(DEVICE_CATEGORY_SOUND, dc->categories); dc->desc = "HDA Audio Codec, duplex (speaker, microphone)"; - dc->reset = hda_audio_reset; - dc->vmsd = &vmstate_hda_audio; - dc->props = hda_audio_properties; } static const TypeInfo hda_audio_micro_info = { .name = "hda-micro", - .parent = TYPE_HDA_CODEC_DEVICE, + .parent = TYPE_HDA_AUDIO, .instance_size = sizeof(HDAAudioState), .class_init = hda_audio_micro_class_init, }; static void hda_audio_register_types(void) { + type_register_static(&hda_audio_info); type_register_static(&hda_audio_output_info); type_register_static(&hda_audio_duplex_info); type_register_static(&hda_audio_micro_info); diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c index 854997d9ba..a1de2f43a0 100644 --- a/hw/net/vhost_net.c +++ b/hw/net/vhost_net.c @@ -106,7 +106,7 @@ struct vhost_net *vhost_net_init(NetClientState *backend, int devfd, goto fail; } net->nc = backend; - net->dev.backend_features = tap_has_vnet_hdr(backend) ? 0 : + net->dev.backend_features = qemu_has_vnet_hdr(backend) ? 
0 : (1 << VHOST_NET_F_VIRTIO_NET_HDR); net->backend = r; @@ -117,8 +117,8 @@ struct vhost_net *vhost_net_init(NetClientState *backend, int devfd, if (r < 0) { goto fail; } - if (!tap_has_vnet_hdr_len(backend, - sizeof(struct virtio_net_hdr_mrg_rxbuf))) { + if (!qemu_has_vnet_hdr_len(backend, + sizeof(struct virtio_net_hdr_mrg_rxbuf))) { net->dev.features &= ~(1 << VIRTIO_NET_F_MRG_RXBUF); } if (~net->dev.features & net->dev.backend_features) { diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index 36266083b2..3c0342e17a 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -325,11 +325,7 @@ static void peer_test_vnet_hdr(VirtIONet *n) return; } - if (nc->peer->info->type != NET_CLIENT_OPTIONS_KIND_TAP) { - return; - } - - n->has_vnet_hdr = tap_has_vnet_hdr(nc->peer); + n->has_vnet_hdr = qemu_has_vnet_hdr(nc->peer); } static int peer_has_vnet_hdr(VirtIONet *n) @@ -342,7 +338,7 @@ static int peer_has_ufo(VirtIONet *n) if (!peer_has_vnet_hdr(n)) return 0; - n->has_ufo = tap_has_ufo(qemu_get_queue(n->nic)->peer); + n->has_ufo = qemu_has_ufo(qemu_get_queue(n->nic)->peer); return n->has_ufo; } @@ -361,8 +357,8 @@ static void virtio_net_set_mrg_rx_bufs(VirtIONet *n, int mergeable_rx_bufs) nc = qemu_get_subqueue(n->nic, i); if (peer_has_vnet_hdr(n) && - tap_has_vnet_hdr_len(nc->peer, n->guest_hdr_len)) { - tap_set_vnet_hdr_len(nc->peer, n->guest_hdr_len); + qemu_has_vnet_hdr_len(nc->peer, n->guest_hdr_len)) { + qemu_set_vnet_hdr_len(nc->peer, n->guest_hdr_len); n->host_hdr_len = n->guest_hdr_len; } } @@ -463,7 +459,7 @@ static uint32_t virtio_net_bad_features(VirtIODevice *vdev) static void virtio_net_apply_guest_offloads(VirtIONet *n) { - tap_set_offload(qemu_get_subqueue(n->nic, 0)->peer, + qemu_set_offload(qemu_get_queue(n->nic)->peer, !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_CSUM)), !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_TSO4)), !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_TSO6)), @@ -1544,7 +1540,7 @@ static 
void virtio_net_device_realize(DeviceState *dev, Error **errp) peer_test_vnet_hdr(n); if (peer_has_vnet_hdr(n)) { for (i = 0; i < n->max_queues; i++) { - tap_using_vnet_hdr(qemu_get_subqueue(n->nic, i)->peer, true); + qemu_using_vnet_hdr(qemu_get_subqueue(n->nic, i)->peer, true); } n->host_hdr_len = sizeof(struct virtio_net_hdr); } else { diff --git a/hw/net/vmxnet3.c b/hw/net/vmxnet3.c index 19687aa03c..5be807ce82 100644 --- a/hw/net/vmxnet3.c +++ b/hw/net/vmxnet3.c @@ -1290,12 +1290,12 @@ static void vmxnet3_update_features(VMXNET3State *s) s->lro_supported, rxcso_supported, s->rx_vlan_stripping); if (s->peer_has_vhdr) { - tap_set_offload(qemu_get_queue(s->nic)->peer, - rxcso_supported, - s->lro_supported, - s->lro_supported, - 0, - 0); + qemu_set_offload(qemu_get_queue(s->nic)->peer, + rxcso_supported, + s->lro_supported, + s->lro_supported, + 0, + 0); } } @@ -1883,11 +1883,9 @@ static NetClientInfo net_vmxnet3_info = { static bool vmxnet3_peer_has_vnet_hdr(VMXNET3State *s) { - NetClientState *peer = qemu_get_queue(s->nic)->peer; + NetClientState *nc = qemu_get_queue(s->nic); - if ((NULL != peer) && - (peer->info->type == NET_CLIENT_OPTIONS_KIND_TAP) && - tap_has_vnet_hdr(peer)) { + if (qemu_has_vnet_hdr(nc->peer)) { return true; } @@ -1935,10 +1933,10 @@ static void vmxnet3_net_init(VMXNET3State *s) s->lro_supported = false; if (s->peer_has_vhdr) { - tap_set_vnet_hdr_len(qemu_get_queue(s->nic)->peer, + qemu_set_vnet_hdr_len(qemu_get_queue(s->nic)->peer, sizeof(struct virtio_net_hdr)); - tap_using_vnet_hdr(qemu_get_queue(s->nic)->peer, 1); + qemu_using_vnet_hdr(qemu_get_queue(s->nic)->peer, 1); } qemu_format_nic_info_str(qemu_get_queue(s->nic), s->conf.macaddr.a); diff --git a/include/migration/page_cache.h b/include/migration/page_cache.h index d156f0d398..2d5ce2dd7a 100644 --- a/include/migration/page_cache.h +++ b/include/migration/page_cache.h @@ -66,7 +66,7 @@ uint8_t *get_cached_data(const PageCache *cache, uint64_t addr); * @addr: page address * @pdata: 
pointer to the page */ -int cache_insert(PageCache *cache, uint64_t addr, uint8_t *pdata); +int cache_insert(PageCache *cache, uint64_t addr, const uint8_t *pdata); /** * cache_resize: resize the page cache. In case of size reduction the extra diff --git a/include/net/net.h b/include/net/net.h index 11e146888b..8166345a13 100644 --- a/include/net/net.h +++ b/include/net/net.h @@ -50,6 +50,12 @@ typedef void (NetCleanup) (NetClientState *); typedef void (LinkStatusChanged)(NetClientState *); typedef void (NetClientDestructor)(NetClientState *); typedef RxFilterInfo *(QueryRxFilter)(NetClientState *); +typedef bool (HasUfo)(NetClientState *); +typedef bool (HasVnetHdr)(NetClientState *); +typedef bool (HasVnetHdrLen)(NetClientState *, int); +typedef void (UsingVnetHdr)(NetClientState *, bool); +typedef void (SetOffload)(NetClientState *, int, int, int, int, int); +typedef void (SetVnetHdrLen)(NetClientState *, int); typedef struct NetClientInfo { NetClientOptionsKind type; @@ -62,6 +68,12 @@ typedef struct NetClientInfo { LinkStatusChanged *link_status_changed; QueryRxFilter *query_rx_filter; NetPoll *poll; + HasUfo *has_ufo; + HasVnetHdr *has_vnet_hdr; + HasVnetHdrLen *has_vnet_hdr_len; + UsingVnetHdr *using_vnet_hdr; + SetOffload *set_offload; + SetVnetHdrLen *set_vnet_hdr_len; } NetClientInfo; struct NetClientState { @@ -120,6 +132,13 @@ ssize_t qemu_send_packet_async(NetClientState *nc, const uint8_t *buf, void qemu_purge_queued_packets(NetClientState *nc); void qemu_flush_queued_packets(NetClientState *nc); void qemu_format_nic_info_str(NetClientState *nc, uint8_t macaddr[6]); +bool qemu_has_ufo(NetClientState *nc); +bool qemu_has_vnet_hdr(NetClientState *nc); +bool qemu_has_vnet_hdr_len(NetClientState *nc, int len); +void qemu_using_vnet_hdr(NetClientState *nc, bool enable); +void qemu_set_offload(NetClientState *nc, int csum, int tso4, int tso6, + int ecn, int ufo); +void qemu_set_vnet_hdr_len(NetClientState *nc, int len); void 
qemu_macaddr_default_if_unset(MACAddr *macaddr); int qemu_show_nic_models(const char *arg, const char *const *models); void qemu_check_nic_model(NICInfo *nd, const char *model); diff --git a/include/net/tap.h b/include/net/tap.h index a994f20447..6daeb42b0f 100644 --- a/include/net/tap.h +++ b/include/net/tap.h @@ -29,12 +29,6 @@ #include "qemu-common.h" #include "qapi-types.h" -bool tap_has_ufo(NetClientState *nc); -int tap_has_vnet_hdr(NetClientState *nc); -int tap_has_vnet_hdr_len(NetClientState *nc, int len); -void tap_using_vnet_hdr(NetClientState *nc, bool using_vnet_hdr); -void tap_set_offload(NetClientState *nc, int csum, int tso4, int tso6, int ecn, int ufo); -void tap_set_vnet_hdr_len(NetClientState *nc, int len); int tap_enable(NetClientState *nc); int tap_disable(NetClientState *nc); diff --git a/migration-rdma.c b/migration-rdma.c index f94f3b4e3a..eeb4302215 100644 --- a/migration-rdma.c +++ b/migration-rdma.c @@ -3412,7 +3412,7 @@ void rdma_start_outgoing_migration(void *opaque, } ret = qemu_rdma_source_init(rdma, &local_err, - s->enabled_capabilities[MIGRATION_CAPABILITY_X_RDMA_PIN_ALL]); + s->enabled_capabilities[MIGRATION_CAPABILITY_RDMA_PIN_ALL]); if (ret) { goto err; diff --git a/migration.c b/migration.c index 25add6f9e2..14235b280a 100644 --- a/migration.c +++ b/migration.c @@ -82,7 +82,7 @@ void qemu_start_incoming_migration(const char *uri, Error **errp) if (strstart(uri, "tcp:", &p)) tcp_start_incoming_migration(p, errp); #ifdef CONFIG_RDMA - else if (strstart(uri, "x-rdma:", &p)) + else if (strstart(uri, "rdma:", &p)) rdma_start_incoming_migration(p, errp); #endif #if !defined(WIN32) @@ -438,7 +438,7 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk, if (strstart(uri, "tcp:", &p)) { tcp_start_outgoing_migration(s, p, &local_err); #ifdef CONFIG_RDMA - } else if (strstart(uri, "x-rdma:", &p)) { + } else if (strstart(uri, "rdma:", &p)) { rdma_start_outgoing_migration(s, p, &local_err); #endif #if !defined(WIN32) @@ -532,7 +532,7 @@ 
bool migrate_rdma_pin_all(void) s = migrate_get_current(); - return s->enabled_capabilities[MIGRATION_CAPABILITY_X_RDMA_PIN_ALL]; + return s->enabled_capabilities[MIGRATION_CAPABILITY_RDMA_PIN_ALL]; } bool migrate_auto_converge(void) @@ -378,6 +378,61 @@ void qemu_foreach_nic(qemu_nic_foreach func, void *opaque) } } +bool qemu_has_ufo(NetClientState *nc) +{ + if (!nc || !nc->info->has_ufo) { + return false; + } + + return nc->info->has_ufo(nc); +} + +bool qemu_has_vnet_hdr(NetClientState *nc) +{ + if (!nc || !nc->info->has_vnet_hdr) { + return false; + } + + return nc->info->has_vnet_hdr(nc); +} + +bool qemu_has_vnet_hdr_len(NetClientState *nc, int len) +{ + if (!nc || !nc->info->has_vnet_hdr_len) { + return false; + } + + return nc->info->has_vnet_hdr_len(nc, len); +} + +void qemu_using_vnet_hdr(NetClientState *nc, bool enable) +{ + if (!nc || !nc->info->using_vnet_hdr) { + return; + } + + nc->info->using_vnet_hdr(nc, enable); +} + +void qemu_set_offload(NetClientState *nc, int csum, int tso4, int tso6, + int ecn, int ufo) +{ + if (!nc || !nc->info->set_offload) { + return; + } + + nc->info->set_offload(nc, csum, tso4, tso6, ecn, ufo); +} + +void qemu_set_vnet_hdr_len(NetClientState *nc, int len) +{ + if (!nc || !nc->info->set_vnet_hdr_len) { + return; + } + + nc->info->set_vnet_hdr_len(nc, len); +} + int qemu_can_send_packet(NetClientState *sender) { if (!sender->peer) { diff --git a/net/netmap.c b/net/netmap.c index 0ccc4976b5..8213304a5b 100644 --- a/net/netmap.c +++ b/net/netmap.c @@ -27,10 +27,13 @@ #include <net/if.h> #include <sys/mman.h> #include <stdint.h> +#include <stdio.h> +#define NETMAP_WITH_LIBS #include <net/netmap.h> #include <net/netmap_user.h> #include "net/net.h" +#include "net/tap.h" #include "clients.h" #include "sysemu/sysemu.h" #include "qemu/error-report.h" @@ -54,33 +57,9 @@ typedef struct NetmapState { bool read_poll; bool write_poll; struct iovec iov[IOV_MAX]; + int vnet_hdr_len; /* Current virtio-net header length. 
*/ } NetmapState; -#define D(format, ...) \ - do { \ - struct timeval __xxts; \ - gettimeofday(&__xxts, NULL); \ - printf("%03d.%06d %s [%d] " format "\n", \ - (int)__xxts.tv_sec % 1000, (int)__xxts.tv_usec, \ - __func__, __LINE__, ##__VA_ARGS__); \ - } while (0) - -/* Rate limited version of "D", lps indicates how many per second */ -#define RD(lps, format, ...) \ - do { \ - static int t0, __cnt; \ - struct timeval __xxts; \ - gettimeofday(&__xxts, NULL); \ - if (t0 != __xxts.tv_sec) { \ - t0 = __xxts.tv_sec; \ - __cnt = 0; \ - } \ - if (__cnt++ < lps) { \ - D(format, ##__VA_ARGS__); \ - } \ - } while (0) - - #ifndef __FreeBSD__ #define pkt_copy bcopy #else @@ -237,7 +216,7 @@ static ssize_t netmap_receive(NetClientState *nc, return size; } - if (ring->avail == 0) { + if (nm_ring_empty(ring)) { /* No available slots in the netmap TX ring. */ netmap_write_poll(s, true); return 0; @@ -250,8 +229,7 @@ static ssize_t netmap_receive(NetClientState *nc, ring->slot[i].len = size; ring->slot[i].flags = 0; pkt_copy(buf, dst, size); - ring->cur = NETMAP_RING_NEXT(ring, i); - ring->avail--; + ring->cur = ring->head = nm_ring_next(ring, i); ioctl(s->me.fd, NIOCTXSYNC, NULL); return size; @@ -267,17 +245,15 @@ static ssize_t netmap_receive_iov(NetClientState *nc, uint8_t *dst; int j; uint32_t i; - uint32_t avail; if (unlikely(!ring)) { /* Drop the packet. */ return iov_size(iov, iovcnt); } - i = ring->cur; - avail = ring->avail; + last = i = ring->cur; - if (avail < iovcnt) { + if (nm_ring_space(ring) < iovcnt) { /* Not enough netmap slots. */ netmap_write_poll(s, true); return 0; @@ -293,7 +269,7 @@ static ssize_t netmap_receive_iov(NetClientState *nc, while (iov_frag_size) { nm_frag_size = MIN(iov_frag_size, ring->nr_buf_size); - if (unlikely(avail == 0)) { + if (unlikely(nm_ring_empty(ring))) { /* We run out of netmap slots while splitting the iovec fragments. 
*/ netmap_write_poll(s, true); @@ -308,8 +284,7 @@ static ssize_t netmap_receive_iov(NetClientState *nc, pkt_copy(iov[j].iov_base + offset, dst, nm_frag_size); last = i; - i = NETMAP_RING_NEXT(ring, i); - avail--; + i = nm_ring_next(ring, i); offset += nm_frag_size; iov_frag_size -= nm_frag_size; @@ -318,9 +293,8 @@ static ssize_t netmap_receive_iov(NetClientState *nc, /* The last slot must not have NS_MOREFRAG set. */ ring->slot[last].flags &= ~NS_MOREFRAG; - /* Now update ring->cur and ring->avail. */ - ring->cur = i; - ring->avail = avail; + /* Now update ring->cur and ring->head. */ + ring->cur = ring->head = i; ioctl(s->me.fd, NIOCTXSYNC, NULL); @@ -343,7 +317,7 @@ static void netmap_send(void *opaque) /* Keep sending while there are available packets into the netmap RX ring and the forwarding path towards the peer is open. */ - while (ring->avail > 0 && qemu_can_send_packet(&s->nc)) { + while (!nm_ring_empty(ring) && qemu_can_send_packet(&s->nc)) { uint32_t i; uint32_t idx; bool morefrag; @@ -358,11 +332,10 @@ static void netmap_send(void *opaque) s->iov[iovcnt].iov_len = ring->slot[i].len; iovcnt++; - ring->cur = NETMAP_RING_NEXT(ring, i); - ring->avail--; - } while (ring->avail && morefrag); + ring->cur = ring->head = nm_ring_next(ring, i); + } while (!nm_ring_empty(ring) && morefrag); - if (unlikely(!ring->avail && morefrag)) { + if (unlikely(nm_ring_empty(ring) && morefrag)) { RD(5, "[netmap_send] ran out of slots, with a pending" "incomplete packet\n"); } @@ -394,6 +367,63 @@ static void netmap_cleanup(NetClientState *nc) s->me.fd = -1; } +/* Offloading manipulation support callbacks. 
*/ +static bool netmap_has_ufo(NetClientState *nc) +{ + return true; +} + +static bool netmap_has_vnet_hdr(NetClientState *nc) +{ + return true; +} + +static bool netmap_has_vnet_hdr_len(NetClientState *nc, int len) +{ + return len == 0 || len == sizeof(struct virtio_net_hdr) || + len == sizeof(struct virtio_net_hdr_mrg_rxbuf); +} + +static void netmap_using_vnet_hdr(NetClientState *nc, bool enable) +{ +} + +static void netmap_set_vnet_hdr_len(NetClientState *nc, int len) +{ + NetmapState *s = DO_UPCAST(NetmapState, nc, nc); + int err; + struct nmreq req; + + /* Issue a NETMAP_BDG_VNET_HDR command to change the virtio-net header + * length for the netmap adapter associated to 'me->ifname'. + */ + memset(&req, 0, sizeof(req)); + pstrcpy(req.nr_name, sizeof(req.nr_name), s->me.ifname); + req.nr_version = NETMAP_API; + req.nr_cmd = NETMAP_BDG_VNET_HDR; + req.nr_arg1 = len; + err = ioctl(s->me.fd, NIOCREGIF, &req); + if (err) { + error_report("Unable to execute NETMAP_BDG_VNET_HDR on %s: %s", + s->me.ifname, strerror(errno)); + } else { + /* Keep track of the current length. */ + s->vnet_hdr_len = len; + } +} + +static void netmap_set_offload(NetClientState *nc, int csum, int tso4, int tso6, + int ecn, int ufo) +{ + NetmapState *s = DO_UPCAST(NetmapState, nc, nc); + + /* Setting a virtio-net header length greater than zero automatically + * enables the offloadings. 
+ */ + if (!s->vnet_hdr_len) { + netmap_set_vnet_hdr_len(nc, sizeof(struct virtio_net_hdr)); + } +} /* NetClientInfo methods */ static NetClientInfo net_netmap_info = { @@ -403,6 +433,12 @@ static NetClientInfo net_netmap_info = { .receive_iov = netmap_receive_iov, .poll = netmap_poll, .cleanup = netmap_cleanup, + .has_ufo = netmap_has_ufo, + .has_vnet_hdr = netmap_has_vnet_hdr, + .has_vnet_hdr_len = netmap_has_vnet_hdr_len, + .using_vnet_hdr = netmap_using_vnet_hdr, + .set_offload = netmap_set_offload, + .set_vnet_hdr_len = netmap_set_vnet_hdr_len, }; /* The exported init function @@ -428,6 +464,7 @@ int net_init_netmap(const NetClientOptions *opts, nc = qemu_new_net_client(&net_netmap_info, peer, "netmap", name); s = DO_UPCAST(NetmapState, nc, nc); s->me = me; + s->vnet_hdr_len = 0; netmap_read_poll(s, true); /* Initially only poll for reads. */ return 0; diff --git a/net/tap-win32.c b/net/tap-win32.c index 91e9e844a0..8aee611f7d 100644 --- a/net/tap-win32.c +++ b/net/tap-win32.c @@ -669,11 +669,60 @@ static void tap_win32_send(void *opaque) } } +static bool tap_has_ufo(NetClientState *nc) +{ + return false; +} + +static bool tap_has_vnet_hdr(NetClientState *nc) +{ + return false; +} + +int tap_probe_vnet_hdr_len(int fd, int len) +{ + return 0; +} + +void tap_fd_set_vnet_hdr_len(int fd, int len) +{ +} + +static void tap_using_vnet_hdr(NetClientState *nc, bool using_vnet_hdr) +{ +} + +static void tap_set_offload(NetClientState *nc, int csum, int tso4, + int tso6, int ecn, int ufo) +{ +} + +struct vhost_net *tap_get_vhost_net(NetClientState *nc) +{ + return NULL; +} + +static bool tap_has_vnet_hdr_len(NetClientState *nc, int len) +{ + return false; +} + +static void tap_set_vnet_hdr_len(NetClientState *nc, int len) +{ + abort(); +} + static NetClientInfo net_tap_win32_info = { .type = NET_CLIENT_OPTIONS_KIND_TAP, .size = sizeof(TAPState), .receive = tap_receive, .cleanup = tap_cleanup, + .has_ufo = tap_has_ufo, + .has_vnet_hdr = tap_has_vnet_hdr, + 
.has_vnet_hdr_len = tap_has_vnet_hdr_len, + .using_vnet_hdr = tap_using_vnet_hdr, + .set_offload = tap_set_offload, + .set_vnet_hdr_len = tap_set_vnet_hdr_len, }; static int tap_win32_init(NetClientState *peer, const char *model, @@ -722,49 +771,6 @@ int net_init_tap(const NetClientOptions *opts, const char *name, return 0; } -bool tap_has_ufo(NetClientState *nc) -{ - return false; -} - -int tap_has_vnet_hdr(NetClientState *nc) -{ - return 0; -} - -int tap_probe_vnet_hdr_len(int fd, int len) -{ - return 0; -} - -void tap_fd_set_vnet_hdr_len(int fd, int len) -{ -} - -void tap_using_vnet_hdr(NetClientState *nc, bool using_vnet_hdr) -{ -} - -void tap_set_offload(NetClientState *nc, int csum, int tso4, - int tso6, int ecn, int ufo) -{ -} - -struct vhost_net *tap_get_vhost_net(NetClientState *nc) -{ - return NULL; -} - -int tap_has_vnet_hdr_len(NetClientState *nc, int len) -{ - return 0; -} - -void tap_set_vnet_hdr_len(NetClientState *nc, int len) -{ - abort(); -} - int tap_enable(NetClientState *nc) { abort(); @@ -210,7 +210,7 @@ static void tap_send(void *opaque) } while (size > 0 && qemu_can_send_packet(&s->nc)); } -bool tap_has_ufo(NetClientState *nc) +static bool tap_has_ufo(NetClientState *nc) { TAPState *s = DO_UPCAST(TAPState, nc, nc); @@ -219,7 +219,7 @@ bool tap_has_ufo(NetClientState *nc) return s->has_ufo; } -int tap_has_vnet_hdr(NetClientState *nc) +static bool tap_has_vnet_hdr(NetClientState *nc) { TAPState *s = DO_UPCAST(TAPState, nc, nc); @@ -228,16 +228,16 @@ int tap_has_vnet_hdr(NetClientState *nc) return !!s->host_vnet_hdr_len; } -int tap_has_vnet_hdr_len(NetClientState *nc, int len) +static bool tap_has_vnet_hdr_len(NetClientState *nc, int len) { TAPState *s = DO_UPCAST(TAPState, nc, nc); assert(nc->info->type == NET_CLIENT_OPTIONS_KIND_TAP); - return tap_probe_vnet_hdr_len(s->fd, len); + return !!tap_probe_vnet_hdr_len(s->fd, len); } -void tap_set_vnet_hdr_len(NetClientState *nc, int len) +static void tap_set_vnet_hdr_len(NetClientState *nc, int 
len) { TAPState *s = DO_UPCAST(TAPState, nc, nc); @@ -249,7 +249,7 @@ void tap_set_vnet_hdr_len(NetClientState *nc, int len) s->host_vnet_hdr_len = len; } -void tap_using_vnet_hdr(NetClientState *nc, bool using_vnet_hdr) +static void tap_using_vnet_hdr(NetClientState *nc, bool using_vnet_hdr) { TAPState *s = DO_UPCAST(TAPState, nc, nc); @@ -259,7 +259,7 @@ void tap_using_vnet_hdr(NetClientState *nc, bool using_vnet_hdr) s->using_vnet_hdr = using_vnet_hdr; } -void tap_set_offload(NetClientState *nc, int csum, int tso4, +static void tap_set_offload(NetClientState *nc, int csum, int tso4, int tso6, int ecn, int ufo) { TAPState *s = DO_UPCAST(TAPState, nc, nc); @@ -314,6 +314,12 @@ static NetClientInfo net_tap_info = { .receive_iov = tap_receive_iov, .poll = tap_poll, .cleanup = tap_cleanup, + .has_ufo = tap_has_ufo, + .has_vnet_hdr = tap_has_vnet_hdr, + .has_vnet_hdr_len = tap_has_vnet_hdr_len, + .using_vnet_hdr = tap_using_vnet_hdr, + .set_offload = tap_set_offload, + .set_vnet_hdr_len = tap_set_vnet_hdr_len, }; static TAPState *net_tap_fd_init(NetClientState *peer, diff --git a/page_cache.c b/page_cache.c index 3ef6ee7ad2..b033681a93 100644 --- a/page_cache.c +++ b/page_cache.c @@ -150,7 +150,7 @@ uint8_t *get_cached_data(const PageCache *cache, uint64_t addr) return cache_get_by_addr(cache, addr)->it_data; } -int cache_insert(PageCache *cache, uint64_t addr, uint8_t *pdata) +int cache_insert(PageCache *cache, uint64_t addr, const uint8_t *pdata) { CacheItem *it = NULL; diff --git a/qapi-schema.json b/qapi-schema.json index fcb2280053..ac8ad24966 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -751,10 +751,9 @@ # This feature allows us to minimize migration traffic for certain work # loads, by sending compressed difference of the pages # -# @x-rdma-pin-all: Controls whether or not the entire VM memory footprint is +# @rdma-pin-all: Controls whether or not the entire VM memory footprint is # mlock()'d on demand or all at once. 
Refer to docs/rdma.txt for usage. -# Disabled by default. Experimental: may (or may not) be renamed after -# further testing is complete. (since 1.6) +# Disabled by default. (since 2.0) # # @zero-blocks: During storage migration encode blocks of zeroes efficiently. This # essentially saves 1MB of zeroes per block on the wire. Enabling requires @@ -768,7 +767,7 @@ # Since: 1.2 ## { 'enum': 'MigrationCapability', - 'data': ['xbzrle', 'x-rdma-pin-all', 'auto-converge', 'zero-blocks'] } + 'data': ['xbzrle', 'rdma-pin-all', 'auto-converge', 'zero-blocks'] } ## # @MigrationCapabilityStatus diff --git a/qemu-file.c b/qemu-file.c index 9473b674ba..f074af15c3 100644 --- a/qemu-file.c +++ b/qemu-file.c @@ -100,7 +100,14 @@ static int stdio_put_buffer(void *opaque, const uint8_t *buf, int64_t pos, int size) { QEMUFileStdio *s = opaque; - return fwrite(buf, 1, size, s->stdio_file); + int res; + + res = fwrite(buf, 1, size, s->stdio_file); + + if (res != size) { + return -EIO; /* fake errno value */ + } + return res; } static int stdio_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size) @@ -321,23 +321,24 @@ const VMStateInfo vmstate_info_int32_equal = { .put = put_int32, }; -/* 32 bit int. See that the received value is the less or the same - than the one in the field */ +/* 32 bit int. Check that the received value is less than or equal to + the one in the field */ static int get_int32_le(QEMUFile *f, void *pv, size_t size) { - int32_t *old = pv; - int32_t new; - qemu_get_sbe32s(f, &new); + int32_t *cur = pv; + int32_t loaded; + qemu_get_sbe32s(f, &loaded); - if (*old <= new) { + if (loaded <= *cur) { + *cur = loaded; return 0; } return -EINVAL; } const VMStateInfo vmstate_info_int32_le = { - .name = "int32 equal", + .name = "int32 le", .get = get_int32_le, .put = put_int32, }; |