about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r--  arch_init.c                     64
-rwxr-xr-x  configure                       10
-rw-r--r--  docs/rdma.txt                   24
-rw-r--r--  hw/audio/hda-codec.c            60
-rw-r--r--  hw/net/vhost_net.c               6
-rw-r--r--  hw/net/virtio-net.c             16
-rw-r--r--  hw/net/vmxnet3.c                22
-rw-r--r--  include/migration/page_cache.h   2
-rw-r--r--  include/net/net.h               19
-rw-r--r--  include/net/tap.h                6
-rw-r--r--  migration-rdma.c                 2
-rw-r--r--  migration.c                      6
-rw-r--r--  net/net.c                       55
-rw-r--r--  net/netmap.c                   123
-rw-r--r--  net/tap-win32.c                 92
-rw-r--r--  net/tap.c                       20
-rw-r--r--  page_cache.c                     2
-rw-r--r--  qapi-schema.json                 7
-rw-r--r--  qemu-file.c                      9
-rw-r--r--  vmstate.c                       15
20 files changed, 360 insertions, 200 deletions
diff --git a/arch_init.c b/arch_init.c
index 80574a090c..fe1727922c 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -122,7 +122,6 @@ static void check_guest_throttling(void);
#define RAM_SAVE_FLAG_XBZRLE 0x40
/* 0x80 is reserved in migration.h start with 0x100 next */
-
static struct defconfig_file {
const char *filename;
/* Indicates it is an user config file (disabled by -no-user-config) */
@@ -133,6 +132,7 @@ static struct defconfig_file {
{ NULL }, /* end of list */
};
+static const uint8_t ZERO_TARGET_PAGE[TARGET_PAGE_SIZE];
int qemu_read_default_config_files(bool userconfig)
{
@@ -273,6 +273,34 @@ static size_t save_block_hdr(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
return size;
}
+/* This is the last block that we have visited searching for dirty pages
+ */
+static RAMBlock *last_seen_block;
+/* This is the last block from where we have sent data */
+static RAMBlock *last_sent_block;
+static ram_addr_t last_offset;
+static unsigned long *migration_bitmap;
+static uint64_t migration_dirty_pages;
+static uint32_t last_version;
+static bool ram_bulk_stage;
+
+/* Update the xbzrle cache to reflect a page that's been sent as all 0.
+ * The important thing is that a stale (not-yet-0'd) page be replaced
+ * by the new data.
+ * As a bonus, if the page wasn't in the cache it gets added so that
+ * when a small write is made into the 0'd page it gets XBZRLE sent
+ */
+static void xbzrle_cache_zero_page(ram_addr_t current_addr)
+{
+ if (ram_bulk_stage || !migrate_use_xbzrle()) {
+ return;
+ }
+
+ /* We don't care if this fails to allocate a new cache page
+ * as long as it updated an old one */
+ cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE);
+}
+
#define ENCODING_FLAG_XBZRLE 0x1
static int save_xbzrle_page(QEMUFile *f, uint8_t *current_data,
@@ -329,18 +357,6 @@ static int save_xbzrle_page(QEMUFile *f, uint8_t *current_data,
return bytes_sent;
}
-
-/* This is the last block that we have visited serching for dirty pages
- */
-static RAMBlock *last_seen_block;
-/* This is the last block from where we have sent data */
-static RAMBlock *last_sent_block;
-static ram_addr_t last_offset;
-static unsigned long *migration_bitmap;
-static uint64_t migration_dirty_pages;
-static uint32_t last_version;
-static bool ram_bulk_stage;
-
static inline
ram_addr_t migration_bitmap_find_and_reset_dirty(MemoryRegion *mr,
ram_addr_t start)
@@ -512,6 +528,7 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
} else {
int ret;
uint8_t *p;
+ bool send_async = true;
int cont = (block == last_sent_block) ?
RAM_SAVE_FLAG_CONTINUE : 0;
@@ -522,6 +539,7 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
ret = ram_control_save_page(f, block->offset,
offset, TARGET_PAGE_SIZE, &bytes_sent);
+ current_addr = block->offset + offset;
if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
if (ret != RAM_SAVE_CONTROL_DELAYED) {
if (bytes_sent > 0) {
@@ -536,19 +554,35 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
RAM_SAVE_FLAG_COMPRESS);
qemu_put_byte(f, 0);
bytes_sent++;
+ /* Must let xbzrle know, otherwise a previous (now 0'd) cached
+ * page would be stale
+ */
+ xbzrle_cache_zero_page(current_addr);
} else if (!ram_bulk_stage && migrate_use_xbzrle()) {
- current_addr = block->offset + offset;
bytes_sent = save_xbzrle_page(f, p, current_addr, block,
offset, cont, last_stage);
if (!last_stage) {
+ /* We must send exactly what's in the xbzrle cache
+ * even if the page wasn't xbzrle compressed, so that
+ * it's right next time.
+ */
p = get_cached_data(XBZRLE.cache, current_addr);
+
+ /* Can't send this cached data async, since the cache page
+ * might get updated before it gets to the wire
+ */
+ send_async = false;
}
}
/* XBZRLE overflow or normal page */
if (bytes_sent == -1) {
bytes_sent = save_block_hdr(f, block, offset, cont, RAM_SAVE_FLAG_PAGE);
- qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE);
+ if (send_async) {
+ qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE);
+ } else {
+ qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
+ }
bytes_sent += TARGET_PAGE_SIZE;
acct_info.norm_pages++;
}
diff --git a/configure b/configure
index 030da86868..8ad03ea17d 100755
--- a/configure
+++ b/configure
@@ -2249,13 +2249,21 @@ EOF
fi
##########################################
-# netmap headers probe
+# netmap support probe
+# Apart from looking for netmap headers, we make sure that the host API version
+# supports the netmap backend (>=11). The upper bound (15) is meant to simulate
+# a minor/major version number. Minor new features will be marked with values up
+# to 15, and if something happens that requires a change to the backend we will
+# move above 15, submit the backend fixes and modify these two bounds.
if test "$netmap" != "no" ; then
cat > $TMPC << EOF
#include <inttypes.h>
#include <net/if.h>
#include <net/netmap.h>
#include <net/netmap_user.h>
+#if (NETMAP_API < 11) || (NETMAP_API > 15)
+#error
+#endif
int main(void) { return 0; }
EOF
if compile_prog "" "" ; then
diff --git a/docs/rdma.txt b/docs/rdma.txt
index 2aca63bd72..1f5d9e9fe4 100644
--- a/docs/rdma.txt
+++ b/docs/rdma.txt
@@ -66,7 +66,7 @@ bulk-phase round of the migration and can be enabled for extremely
high-performance RDMA hardware using the following command:
QEMU Monitor Command:
-$ migrate_set_capability x-rdma-pin-all on # disabled by default
+$ migrate_set_capability rdma-pin-all on # disabled by default
Performing this action will cause all 8GB to be pinned, so if that's
not what you want, then please ignore this step altogether.
@@ -93,12 +93,12 @@ $ migrate_set_speed 40g # or whatever is the MAX of your RDMA device
Next, on the destination machine, add the following to the QEMU command line:
-qemu ..... -incoming x-rdma:host:port
+qemu ..... -incoming rdma:host:port
Finally, perform the actual migration on the source machine:
QEMU Monitor Command:
-$ migrate -d x-rdma:host:port
+$ migrate -d rdma:host:port
PERFORMANCE
===========
@@ -120,8 +120,8 @@ For example, in the same 8GB RAM example with all 8GB of memory in
active use and the VM itself is completely idle using the same 40 gbps
infiniband link:
-1. x-rdma-pin-all disabled total time: approximately 7.5 seconds @ 9.5 Gbps
-2. x-rdma-pin-all enabled total time: approximately 4 seconds @ 26 Gbps
+1. rdma-pin-all disabled total time: approximately 7.5 seconds @ 9.5 Gbps
+2. rdma-pin-all enabled total time: approximately 4 seconds @ 26 Gbps
These numbers would of course scale up to whatever size virtual machine
you have to migrate using RDMA.
@@ -407,18 +407,14 @@ socket is broken during a non-RDMA based migration.
TODO:
=====
-1. 'migrate x-rdma:host:port' and '-incoming x-rdma' options will be
- renamed to 'rdma' after the experimental phase of this work has
- completed upstream.
-2. Currently, 'ulimit -l' mlock() limits as well as cgroups swap limits
+1. Currently, 'ulimit -l' mlock() limits as well as cgroups swap limits
are not compatible with infinband memory pinning and will result in
an aborted migration (but with the source VM left unaffected).
-3. Use of the recent /proc/<pid>/pagemap would likely speed up
+2. Use of the recent /proc/<pid>/pagemap would likely speed up
the use of KSM and ballooning while using RDMA.
-4. Also, some form of balloon-device usage tracking would also
+3. Also, some form of balloon-device usage tracking would also
help alleviate some issues.
-5. Move UNREGISTER requests to a separate thread.
-6. Use LRU to provide more fine-grained direction of UNREGISTER
+4. Use LRU to provide more fine-grained direction of UNREGISTER
requests for unpinning memory in an overcommitted environment.
-7. Expose UNREGISTER support to the user by way of workload-specific
+5. Expose UNREGISTER support to the user by way of workload-specific
hints about application behavior.
diff --git a/hw/audio/hda-codec.c b/hw/audio/hda-codec.c
index 986f2a9c92..a67ca91ca7 100644
--- a/hw/audio/hda-codec.c
+++ b/hw/audio/hda-codec.c
@@ -157,6 +157,9 @@ struct HDAAudioStream {
uint32_t bpos;
};
+#define TYPE_HDA_AUDIO "hda-audio"
+#define HDA_AUDIO(obj) OBJECT_CHECK(HDAAudioState, (obj), TYPE_HDA_AUDIO)
+
struct HDAAudioState {
HDACodecDevice hda;
const char *name;
@@ -288,7 +291,7 @@ static void hda_audio_setup(HDAAudioStream *st)
static void hda_audio_command(HDACodecDevice *hda, uint32_t nid, uint32_t data)
{
- HDAAudioState *a = DO_UPCAST(HDAAudioState, hda, hda);
+ HDAAudioState *a = HDA_AUDIO(hda);
HDAAudioStream *st;
const desc_node *node = NULL;
const desc_param *param;
@@ -448,7 +451,7 @@ fail:
static void hda_audio_stream(HDACodecDevice *hda, uint32_t stnr, bool running, bool output)
{
- HDAAudioState *a = DO_UPCAST(HDAAudioState, hda, hda);
+ HDAAudioState *a = HDA_AUDIO(hda);
int s;
a->running_compat[stnr] = running;
@@ -469,7 +472,7 @@ static void hda_audio_stream(HDACodecDevice *hda, uint32_t stnr, bool running, b
static int hda_audio_init(HDACodecDevice *hda, const struct desc_codec *desc)
{
- HDAAudioState *a = DO_UPCAST(HDAAudioState, hda, hda);
+ HDAAudioState *a = HDA_AUDIO(hda);
HDAAudioStream *st;
const desc_node *node;
const desc_param *param;
@@ -514,7 +517,7 @@ static int hda_audio_init(HDACodecDevice *hda, const struct desc_codec *desc)
static int hda_audio_exit(HDACodecDevice *hda)
{
- HDAAudioState *a = DO_UPCAST(HDAAudioState, hda, hda);
+ HDAAudioState *a = HDA_AUDIO(hda);
HDAAudioStream *st;
int i;
@@ -561,7 +564,7 @@ static int hda_audio_post_load(void *opaque, int version)
static void hda_audio_reset(DeviceState *dev)
{
- HDAAudioState *a = DO_UPCAST(HDAAudioState, hda.qdev, dev);
+ HDAAudioState *a = HDA_AUDIO(dev);
HDAAudioStream *st;
int i;
@@ -613,7 +616,7 @@ static Property hda_audio_properties[] = {
static int hda_audio_init_output(HDACodecDevice *hda)
{
- HDAAudioState *a = DO_UPCAST(HDAAudioState, hda, hda);
+ HDAAudioState *a = HDA_AUDIO(hda);
if (!a->mixer) {
return hda_audio_init(hda, &output_nomixemu);
@@ -624,7 +627,7 @@ static int hda_audio_init_output(HDACodecDevice *hda)
static int hda_audio_init_duplex(HDACodecDevice *hda)
{
- HDAAudioState *a = DO_UPCAST(HDAAudioState, hda, hda);
+ HDAAudioState *a = HDA_AUDIO(hda);
if (!a->mixer) {
return hda_audio_init(hda, &duplex_nomixemu);
@@ -635,7 +638,7 @@ static int hda_audio_init_duplex(HDACodecDevice *hda)
static int hda_audio_init_micro(HDACodecDevice *hda)
{
- HDAAudioState *a = DO_UPCAST(HDAAudioState, hda, hda);
+ HDAAudioState *a = HDA_AUDIO(hda);
if (!a->mixer) {
return hda_audio_init(hda, &micro_nomixemu);
@@ -644,25 +647,39 @@ static int hda_audio_init_micro(HDACodecDevice *hda)
}
}
-static void hda_audio_output_class_init(ObjectClass *klass, void *data)
+static void hda_audio_base_class_init(ObjectClass *klass, void *data)
{
DeviceClass *dc = DEVICE_CLASS(klass);
HDACodecDeviceClass *k = HDA_CODEC_DEVICE_CLASS(klass);
- k->init = hda_audio_init_output;
k->exit = hda_audio_exit;
k->command = hda_audio_command;
k->stream = hda_audio_stream;
set_bit(DEVICE_CATEGORY_SOUND, dc->categories);
- dc->desc = "HDA Audio Codec, output-only (line-out)";
dc->reset = hda_audio_reset;
dc->vmsd = &vmstate_hda_audio;
dc->props = hda_audio_properties;
}
+static const TypeInfo hda_audio_info = {
+ .name = TYPE_HDA_AUDIO,
+ .parent = TYPE_HDA_CODEC_DEVICE,
+ .class_init = hda_audio_base_class_init,
+ .abstract = true,
+};
+
+static void hda_audio_output_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+ HDACodecDeviceClass *k = HDA_CODEC_DEVICE_CLASS(klass);
+
+ k->init = hda_audio_init_output;
+ dc->desc = "HDA Audio Codec, output-only (line-out)";
+}
+
static const TypeInfo hda_audio_output_info = {
.name = "hda-output",
- .parent = TYPE_HDA_CODEC_DEVICE,
+ .parent = TYPE_HDA_AUDIO,
.instance_size = sizeof(HDAAudioState),
.class_init = hda_audio_output_class_init,
};
@@ -673,19 +690,12 @@ static void hda_audio_duplex_class_init(ObjectClass *klass, void *data)
HDACodecDeviceClass *k = HDA_CODEC_DEVICE_CLASS(klass);
k->init = hda_audio_init_duplex;
- k->exit = hda_audio_exit;
- k->command = hda_audio_command;
- k->stream = hda_audio_stream;
- set_bit(DEVICE_CATEGORY_SOUND, dc->categories);
dc->desc = "HDA Audio Codec, duplex (line-out, line-in)";
- dc->reset = hda_audio_reset;
- dc->vmsd = &vmstate_hda_audio;
- dc->props = hda_audio_properties;
}
static const TypeInfo hda_audio_duplex_info = {
.name = "hda-duplex",
- .parent = TYPE_HDA_CODEC_DEVICE,
+ .parent = TYPE_HDA_AUDIO,
.instance_size = sizeof(HDAAudioState),
.class_init = hda_audio_duplex_class_init,
};
@@ -696,25 +706,19 @@ static void hda_audio_micro_class_init(ObjectClass *klass, void *data)
HDACodecDeviceClass *k = HDA_CODEC_DEVICE_CLASS(klass);
k->init = hda_audio_init_micro;
- k->exit = hda_audio_exit;
- k->command = hda_audio_command;
- k->stream = hda_audio_stream;
- set_bit(DEVICE_CATEGORY_SOUND, dc->categories);
dc->desc = "HDA Audio Codec, duplex (speaker, microphone)";
- dc->reset = hda_audio_reset;
- dc->vmsd = &vmstate_hda_audio;
- dc->props = hda_audio_properties;
}
static const TypeInfo hda_audio_micro_info = {
.name = "hda-micro",
- .parent = TYPE_HDA_CODEC_DEVICE,
+ .parent = TYPE_HDA_AUDIO,
.instance_size = sizeof(HDAAudioState),
.class_init = hda_audio_micro_class_init,
};
static void hda_audio_register_types(void)
{
+ type_register_static(&hda_audio_info);
type_register_static(&hda_audio_output_info);
type_register_static(&hda_audio_duplex_info);
type_register_static(&hda_audio_micro_info);
diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c
index 854997d9ba..a1de2f43a0 100644
--- a/hw/net/vhost_net.c
+++ b/hw/net/vhost_net.c
@@ -106,7 +106,7 @@ struct vhost_net *vhost_net_init(NetClientState *backend, int devfd,
goto fail;
}
net->nc = backend;
- net->dev.backend_features = tap_has_vnet_hdr(backend) ? 0 :
+ net->dev.backend_features = qemu_has_vnet_hdr(backend) ? 0 :
(1 << VHOST_NET_F_VIRTIO_NET_HDR);
net->backend = r;
@@ -117,8 +117,8 @@ struct vhost_net *vhost_net_init(NetClientState *backend, int devfd,
if (r < 0) {
goto fail;
}
- if (!tap_has_vnet_hdr_len(backend,
- sizeof(struct virtio_net_hdr_mrg_rxbuf))) {
+ if (!qemu_has_vnet_hdr_len(backend,
+ sizeof(struct virtio_net_hdr_mrg_rxbuf))) {
net->dev.features &= ~(1 << VIRTIO_NET_F_MRG_RXBUF);
}
if (~net->dev.features & net->dev.backend_features) {
diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index 36266083b2..3c0342e17a 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -325,11 +325,7 @@ static void peer_test_vnet_hdr(VirtIONet *n)
return;
}
- if (nc->peer->info->type != NET_CLIENT_OPTIONS_KIND_TAP) {
- return;
- }
-
- n->has_vnet_hdr = tap_has_vnet_hdr(nc->peer);
+ n->has_vnet_hdr = qemu_has_vnet_hdr(nc->peer);
}
static int peer_has_vnet_hdr(VirtIONet *n)
@@ -342,7 +338,7 @@ static int peer_has_ufo(VirtIONet *n)
if (!peer_has_vnet_hdr(n))
return 0;
- n->has_ufo = tap_has_ufo(qemu_get_queue(n->nic)->peer);
+ n->has_ufo = qemu_has_ufo(qemu_get_queue(n->nic)->peer);
return n->has_ufo;
}
@@ -361,8 +357,8 @@ static void virtio_net_set_mrg_rx_bufs(VirtIONet *n, int mergeable_rx_bufs)
nc = qemu_get_subqueue(n->nic, i);
if (peer_has_vnet_hdr(n) &&
- tap_has_vnet_hdr_len(nc->peer, n->guest_hdr_len)) {
- tap_set_vnet_hdr_len(nc->peer, n->guest_hdr_len);
+ qemu_has_vnet_hdr_len(nc->peer, n->guest_hdr_len)) {
+ qemu_set_vnet_hdr_len(nc->peer, n->guest_hdr_len);
n->host_hdr_len = n->guest_hdr_len;
}
}
@@ -463,7 +459,7 @@ static uint32_t virtio_net_bad_features(VirtIODevice *vdev)
static void virtio_net_apply_guest_offloads(VirtIONet *n)
{
- tap_set_offload(qemu_get_subqueue(n->nic, 0)->peer,
+ qemu_set_offload(qemu_get_queue(n->nic)->peer,
!!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_CSUM)),
!!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_TSO4)),
!!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_TSO6)),
@@ -1544,7 +1540,7 @@ static void virtio_net_device_realize(DeviceState *dev, Error **errp)
peer_test_vnet_hdr(n);
if (peer_has_vnet_hdr(n)) {
for (i = 0; i < n->max_queues; i++) {
- tap_using_vnet_hdr(qemu_get_subqueue(n->nic, i)->peer, true);
+ qemu_using_vnet_hdr(qemu_get_subqueue(n->nic, i)->peer, true);
}
n->host_hdr_len = sizeof(struct virtio_net_hdr);
} else {
diff --git a/hw/net/vmxnet3.c b/hw/net/vmxnet3.c
index 19687aa03c..5be807ce82 100644
--- a/hw/net/vmxnet3.c
+++ b/hw/net/vmxnet3.c
@@ -1290,12 +1290,12 @@ static void vmxnet3_update_features(VMXNET3State *s)
s->lro_supported, rxcso_supported,
s->rx_vlan_stripping);
if (s->peer_has_vhdr) {
- tap_set_offload(qemu_get_queue(s->nic)->peer,
- rxcso_supported,
- s->lro_supported,
- s->lro_supported,
- 0,
- 0);
+ qemu_set_offload(qemu_get_queue(s->nic)->peer,
+ rxcso_supported,
+ s->lro_supported,
+ s->lro_supported,
+ 0,
+ 0);
}
}
@@ -1883,11 +1883,9 @@ static NetClientInfo net_vmxnet3_info = {
static bool vmxnet3_peer_has_vnet_hdr(VMXNET3State *s)
{
- NetClientState *peer = qemu_get_queue(s->nic)->peer;
+ NetClientState *nc = qemu_get_queue(s->nic);
- if ((NULL != peer) &&
- (peer->info->type == NET_CLIENT_OPTIONS_KIND_TAP) &&
- tap_has_vnet_hdr(peer)) {
+ if (qemu_has_vnet_hdr(nc->peer)) {
return true;
}
@@ -1935,10 +1933,10 @@ static void vmxnet3_net_init(VMXNET3State *s)
s->lro_supported = false;
if (s->peer_has_vhdr) {
- tap_set_vnet_hdr_len(qemu_get_queue(s->nic)->peer,
+ qemu_set_vnet_hdr_len(qemu_get_queue(s->nic)->peer,
sizeof(struct virtio_net_hdr));
- tap_using_vnet_hdr(qemu_get_queue(s->nic)->peer, 1);
+ qemu_using_vnet_hdr(qemu_get_queue(s->nic)->peer, 1);
}
qemu_format_nic_info_str(qemu_get_queue(s->nic), s->conf.macaddr.a);
diff --git a/include/migration/page_cache.h b/include/migration/page_cache.h
index d156f0d398..2d5ce2dd7a 100644
--- a/include/migration/page_cache.h
+++ b/include/migration/page_cache.h
@@ -66,7 +66,7 @@ uint8_t *get_cached_data(const PageCache *cache, uint64_t addr);
* @addr: page address
* @pdata: pointer to the page
*/
-int cache_insert(PageCache *cache, uint64_t addr, uint8_t *pdata);
+int cache_insert(PageCache *cache, uint64_t addr, const uint8_t *pdata);
/**
* cache_resize: resize the page cache. In case of size reduction the extra
diff --git a/include/net/net.h b/include/net/net.h
index 11e146888b..8166345a13 100644
--- a/include/net/net.h
+++ b/include/net/net.h
@@ -50,6 +50,12 @@ typedef void (NetCleanup) (NetClientState *);
typedef void (LinkStatusChanged)(NetClientState *);
typedef void (NetClientDestructor)(NetClientState *);
typedef RxFilterInfo *(QueryRxFilter)(NetClientState *);
+typedef bool (HasUfo)(NetClientState *);
+typedef bool (HasVnetHdr)(NetClientState *);
+typedef bool (HasVnetHdrLen)(NetClientState *, int);
+typedef void (UsingVnetHdr)(NetClientState *, bool);
+typedef void (SetOffload)(NetClientState *, int, int, int, int, int);
+typedef void (SetVnetHdrLen)(NetClientState *, int);
typedef struct NetClientInfo {
NetClientOptionsKind type;
@@ -62,6 +68,12 @@ typedef struct NetClientInfo {
LinkStatusChanged *link_status_changed;
QueryRxFilter *query_rx_filter;
NetPoll *poll;
+ HasUfo *has_ufo;
+ HasVnetHdr *has_vnet_hdr;
+ HasVnetHdrLen *has_vnet_hdr_len;
+ UsingVnetHdr *using_vnet_hdr;
+ SetOffload *set_offload;
+ SetVnetHdrLen *set_vnet_hdr_len;
} NetClientInfo;
struct NetClientState {
@@ -120,6 +132,13 @@ ssize_t qemu_send_packet_async(NetClientState *nc, const uint8_t *buf,
void qemu_purge_queued_packets(NetClientState *nc);
void qemu_flush_queued_packets(NetClientState *nc);
void qemu_format_nic_info_str(NetClientState *nc, uint8_t macaddr[6]);
+bool qemu_has_ufo(NetClientState *nc);
+bool qemu_has_vnet_hdr(NetClientState *nc);
+bool qemu_has_vnet_hdr_len(NetClientState *nc, int len);
+void qemu_using_vnet_hdr(NetClientState *nc, bool enable);
+void qemu_set_offload(NetClientState *nc, int csum, int tso4, int tso6,
+ int ecn, int ufo);
+void qemu_set_vnet_hdr_len(NetClientState *nc, int len);
void qemu_macaddr_default_if_unset(MACAddr *macaddr);
int qemu_show_nic_models(const char *arg, const char *const *models);
void qemu_check_nic_model(NICInfo *nd, const char *model);
diff --git a/include/net/tap.h b/include/net/tap.h
index a994f20447..6daeb42b0f 100644
--- a/include/net/tap.h
+++ b/include/net/tap.h
@@ -29,12 +29,6 @@
#include "qemu-common.h"
#include "qapi-types.h"
-bool tap_has_ufo(NetClientState *nc);
-int tap_has_vnet_hdr(NetClientState *nc);
-int tap_has_vnet_hdr_len(NetClientState *nc, int len);
-void tap_using_vnet_hdr(NetClientState *nc, bool using_vnet_hdr);
-void tap_set_offload(NetClientState *nc, int csum, int tso4, int tso6, int ecn, int ufo);
-void tap_set_vnet_hdr_len(NetClientState *nc, int len);
int tap_enable(NetClientState *nc);
int tap_disable(NetClientState *nc);
diff --git a/migration-rdma.c b/migration-rdma.c
index f94f3b4e3a..eeb4302215 100644
--- a/migration-rdma.c
+++ b/migration-rdma.c
@@ -3412,7 +3412,7 @@ void rdma_start_outgoing_migration(void *opaque,
}
ret = qemu_rdma_source_init(rdma, &local_err,
- s->enabled_capabilities[MIGRATION_CAPABILITY_X_RDMA_PIN_ALL]);
+ s->enabled_capabilities[MIGRATION_CAPABILITY_RDMA_PIN_ALL]);
if (ret) {
goto err;
diff --git a/migration.c b/migration.c
index 25add6f9e2..14235b280a 100644
--- a/migration.c
+++ b/migration.c
@@ -82,7 +82,7 @@ void qemu_start_incoming_migration(const char *uri, Error **errp)
if (strstart(uri, "tcp:", &p))
tcp_start_incoming_migration(p, errp);
#ifdef CONFIG_RDMA
- else if (strstart(uri, "x-rdma:", &p))
+ else if (strstart(uri, "rdma:", &p))
rdma_start_incoming_migration(p, errp);
#endif
#if !defined(WIN32)
@@ -438,7 +438,7 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk,
if (strstart(uri, "tcp:", &p)) {
tcp_start_outgoing_migration(s, p, &local_err);
#ifdef CONFIG_RDMA
- } else if (strstart(uri, "x-rdma:", &p)) {
+ } else if (strstart(uri, "rdma:", &p)) {
rdma_start_outgoing_migration(s, p, &local_err);
#endif
#if !defined(WIN32)
@@ -532,7 +532,7 @@ bool migrate_rdma_pin_all(void)
s = migrate_get_current();
- return s->enabled_capabilities[MIGRATION_CAPABILITY_X_RDMA_PIN_ALL];
+ return s->enabled_capabilities[MIGRATION_CAPABILITY_RDMA_PIN_ALL];
}
bool migrate_auto_converge(void)
diff --git a/net/net.c b/net/net.c
index 41b38830ea..e3ef1e4f1d 100644
--- a/net/net.c
+++ b/net/net.c
@@ -378,6 +378,61 @@ void qemu_foreach_nic(qemu_nic_foreach func, void *opaque)
}
}
+bool qemu_has_ufo(NetClientState *nc)
+{
+ if (!nc || !nc->info->has_ufo) {
+ return false;
+ }
+
+ return nc->info->has_ufo(nc);
+}
+
+bool qemu_has_vnet_hdr(NetClientState *nc)
+{
+ if (!nc || !nc->info->has_vnet_hdr) {
+ return false;
+ }
+
+ return nc->info->has_vnet_hdr(nc);
+}
+
+bool qemu_has_vnet_hdr_len(NetClientState *nc, int len)
+{
+ if (!nc || !nc->info->has_vnet_hdr_len) {
+ return false;
+ }
+
+ return nc->info->has_vnet_hdr_len(nc, len);
+}
+
+void qemu_using_vnet_hdr(NetClientState *nc, bool enable)
+{
+ if (!nc || !nc->info->using_vnet_hdr) {
+ return;
+ }
+
+ nc->info->using_vnet_hdr(nc, enable);
+}
+
+void qemu_set_offload(NetClientState *nc, int csum, int tso4, int tso6,
+ int ecn, int ufo)
+{
+ if (!nc || !nc->info->set_offload) {
+ return;
+ }
+
+ nc->info->set_offload(nc, csum, tso4, tso6, ecn, ufo);
+}
+
+void qemu_set_vnet_hdr_len(NetClientState *nc, int len)
+{
+ if (!nc || !nc->info->set_vnet_hdr_len) {
+ return;
+ }
+
+ nc->info->set_vnet_hdr_len(nc, len);
+}
+
int qemu_can_send_packet(NetClientState *sender)
{
if (!sender->peer) {
diff --git a/net/netmap.c b/net/netmap.c
index 0ccc4976b5..8213304a5b 100644
--- a/net/netmap.c
+++ b/net/netmap.c
@@ -27,10 +27,13 @@
#include <net/if.h>
#include <sys/mman.h>
#include <stdint.h>
+#include <stdio.h>
+#define NETMAP_WITH_LIBS
#include <net/netmap.h>
#include <net/netmap_user.h>
#include "net/net.h"
+#include "net/tap.h"
#include "clients.h"
#include "sysemu/sysemu.h"
#include "qemu/error-report.h"
@@ -54,33 +57,9 @@ typedef struct NetmapState {
bool read_poll;
bool write_poll;
struct iovec iov[IOV_MAX];
+ int vnet_hdr_len; /* Current virtio-net header length. */
} NetmapState;
-#define D(format, ...) \
- do { \
- struct timeval __xxts; \
- gettimeofday(&__xxts, NULL); \
- printf("%03d.%06d %s [%d] " format "\n", \
- (int)__xxts.tv_sec % 1000, (int)__xxts.tv_usec, \
- __func__, __LINE__, ##__VA_ARGS__); \
- } while (0)
-
-/* Rate limited version of "D", lps indicates how many per second */
-#define RD(lps, format, ...) \
- do { \
- static int t0, __cnt; \
- struct timeval __xxts; \
- gettimeofday(&__xxts, NULL); \
- if (t0 != __xxts.tv_sec) { \
- t0 = __xxts.tv_sec; \
- __cnt = 0; \
- } \
- if (__cnt++ < lps) { \
- D(format, ##__VA_ARGS__); \
- } \
- } while (0)
-
-
#ifndef __FreeBSD__
#define pkt_copy bcopy
#else
@@ -237,7 +216,7 @@ static ssize_t netmap_receive(NetClientState *nc,
return size;
}
- if (ring->avail == 0) {
+ if (nm_ring_empty(ring)) {
/* No available slots in the netmap TX ring. */
netmap_write_poll(s, true);
return 0;
@@ -250,8 +229,7 @@ static ssize_t netmap_receive(NetClientState *nc,
ring->slot[i].len = size;
ring->slot[i].flags = 0;
pkt_copy(buf, dst, size);
- ring->cur = NETMAP_RING_NEXT(ring, i);
- ring->avail--;
+ ring->cur = ring->head = nm_ring_next(ring, i);
ioctl(s->me.fd, NIOCTXSYNC, NULL);
return size;
@@ -267,17 +245,15 @@ static ssize_t netmap_receive_iov(NetClientState *nc,
uint8_t *dst;
int j;
uint32_t i;
- uint32_t avail;
if (unlikely(!ring)) {
/* Drop the packet. */
return iov_size(iov, iovcnt);
}
- i = ring->cur;
- avail = ring->avail;
+ last = i = ring->cur;
- if (avail < iovcnt) {
+ if (nm_ring_space(ring) < iovcnt) {
/* Not enough netmap slots. */
netmap_write_poll(s, true);
return 0;
@@ -293,7 +269,7 @@ static ssize_t netmap_receive_iov(NetClientState *nc,
while (iov_frag_size) {
nm_frag_size = MIN(iov_frag_size, ring->nr_buf_size);
- if (unlikely(avail == 0)) {
+ if (unlikely(nm_ring_empty(ring))) {
/* We run out of netmap slots while splitting the
iovec fragments. */
netmap_write_poll(s, true);
@@ -308,8 +284,7 @@ static ssize_t netmap_receive_iov(NetClientState *nc,
pkt_copy(iov[j].iov_base + offset, dst, nm_frag_size);
last = i;
- i = NETMAP_RING_NEXT(ring, i);
- avail--;
+ i = nm_ring_next(ring, i);
offset += nm_frag_size;
iov_frag_size -= nm_frag_size;
@@ -318,9 +293,8 @@ static ssize_t netmap_receive_iov(NetClientState *nc,
/* The last slot must not have NS_MOREFRAG set. */
ring->slot[last].flags &= ~NS_MOREFRAG;
- /* Now update ring->cur and ring->avail. */
- ring->cur = i;
- ring->avail = avail;
+ /* Now update ring->cur and ring->head. */
+ ring->cur = ring->head = i;
ioctl(s->me.fd, NIOCTXSYNC, NULL);
@@ -343,7 +317,7 @@ static void netmap_send(void *opaque)
/* Keep sending while there are available packets into the netmap
RX ring and the forwarding path towards the peer is open. */
- while (ring->avail > 0 && qemu_can_send_packet(&s->nc)) {
+ while (!nm_ring_empty(ring) && qemu_can_send_packet(&s->nc)) {
uint32_t i;
uint32_t idx;
bool morefrag;
@@ -358,11 +332,10 @@ static void netmap_send(void *opaque)
s->iov[iovcnt].iov_len = ring->slot[i].len;
iovcnt++;
- ring->cur = NETMAP_RING_NEXT(ring, i);
- ring->avail--;
- } while (ring->avail && morefrag);
+ ring->cur = ring->head = nm_ring_next(ring, i);
+ } while (!nm_ring_empty(ring) && morefrag);
- if (unlikely(!ring->avail && morefrag)) {
+ if (unlikely(nm_ring_empty(ring) && morefrag)) {
RD(5, "[netmap_send] ran out of slots, with a pending"
"incomplete packet\n");
}
@@ -394,6 +367,63 @@ static void netmap_cleanup(NetClientState *nc)
s->me.fd = -1;
}
+/* Offloading manipulation support callbacks. */
+static bool netmap_has_ufo(NetClientState *nc)
+{
+ return true;
+}
+
+static bool netmap_has_vnet_hdr(NetClientState *nc)
+{
+ return true;
+}
+
+static bool netmap_has_vnet_hdr_len(NetClientState *nc, int len)
+{
+ return len == 0 || len == sizeof(struct virtio_net_hdr) ||
+ len == sizeof(struct virtio_net_hdr_mrg_rxbuf);
+}
+
+static void netmap_using_vnet_hdr(NetClientState *nc, bool enable)
+{
+}
+
+static void netmap_set_vnet_hdr_len(NetClientState *nc, int len)
+{
+ NetmapState *s = DO_UPCAST(NetmapState, nc, nc);
+ int err;
+ struct nmreq req;
+
+ /* Issue a NETMAP_BDG_VNET_HDR command to change the virtio-net header
+ * length for the netmap adapter associated to 'me->ifname'.
+ */
+ memset(&req, 0, sizeof(req));
+ pstrcpy(req.nr_name, sizeof(req.nr_name), s->me.ifname);
+ req.nr_version = NETMAP_API;
+ req.nr_cmd = NETMAP_BDG_VNET_HDR;
+ req.nr_arg1 = len;
+ err = ioctl(s->me.fd, NIOCREGIF, &req);
+ if (err) {
+ error_report("Unable to execute NETMAP_BDG_VNET_HDR on %s: %s",
+ s->me.ifname, strerror(errno));
+ } else {
+ /* Keep track of the current length. */
+ s->vnet_hdr_len = len;
+ }
+}
+
+static void netmap_set_offload(NetClientState *nc, int csum, int tso4, int tso6,
+ int ecn, int ufo)
+{
+ NetmapState *s = DO_UPCAST(NetmapState, nc, nc);
+
+ /* Setting a virtio-net header length greater than zero automatically
+ * enables the offloadings.
+ */
+ if (!s->vnet_hdr_len) {
+ netmap_set_vnet_hdr_len(nc, sizeof(struct virtio_net_hdr));
+ }
+}
/* NetClientInfo methods */
static NetClientInfo net_netmap_info = {
@@ -403,6 +433,12 @@ static NetClientInfo net_netmap_info = {
.receive_iov = netmap_receive_iov,
.poll = netmap_poll,
.cleanup = netmap_cleanup,
+ .has_ufo = netmap_has_ufo,
+ .has_vnet_hdr = netmap_has_vnet_hdr,
+ .has_vnet_hdr_len = netmap_has_vnet_hdr_len,
+ .using_vnet_hdr = netmap_using_vnet_hdr,
+ .set_offload = netmap_set_offload,
+ .set_vnet_hdr_len = netmap_set_vnet_hdr_len,
};
/* The exported init function
@@ -428,6 +464,7 @@ int net_init_netmap(const NetClientOptions *opts,
nc = qemu_new_net_client(&net_netmap_info, peer, "netmap", name);
s = DO_UPCAST(NetmapState, nc, nc);
s->me = me;
+ s->vnet_hdr_len = 0;
netmap_read_poll(s, true); /* Initially only poll for reads. */
return 0;
diff --git a/net/tap-win32.c b/net/tap-win32.c
index 91e9e844a0..8aee611f7d 100644
--- a/net/tap-win32.c
+++ b/net/tap-win32.c
@@ -669,11 +669,60 @@ static void tap_win32_send(void *opaque)
}
}
+static bool tap_has_ufo(NetClientState *nc)
+{
+ return false;
+}
+
+static bool tap_has_vnet_hdr(NetClientState *nc)
+{
+ return false;
+}
+
+int tap_probe_vnet_hdr_len(int fd, int len)
+{
+ return 0;
+}
+
+void tap_fd_set_vnet_hdr_len(int fd, int len)
+{
+}
+
+static void tap_using_vnet_hdr(NetClientState *nc, bool using_vnet_hdr)
+{
+}
+
+static void tap_set_offload(NetClientState *nc, int csum, int tso4,
+ int tso6, int ecn, int ufo)
+{
+}
+
+struct vhost_net *tap_get_vhost_net(NetClientState *nc)
+{
+ return NULL;
+}
+
+static bool tap_has_vnet_hdr_len(NetClientState *nc, int len)
+{
+ return false;
+}
+
+static void tap_set_vnet_hdr_len(NetClientState *nc, int len)
+{
+ abort();
+}
+
static NetClientInfo net_tap_win32_info = {
.type = NET_CLIENT_OPTIONS_KIND_TAP,
.size = sizeof(TAPState),
.receive = tap_receive,
.cleanup = tap_cleanup,
+ .has_ufo = tap_has_ufo,
+ .has_vnet_hdr = tap_has_vnet_hdr,
+ .has_vnet_hdr_len = tap_has_vnet_hdr_len,
+ .using_vnet_hdr = tap_using_vnet_hdr,
+ .set_offload = tap_set_offload,
+ .set_vnet_hdr_len = tap_set_vnet_hdr_len,
};
static int tap_win32_init(NetClientState *peer, const char *model,
@@ -722,49 +771,6 @@ int net_init_tap(const NetClientOptions *opts, const char *name,
return 0;
}
-bool tap_has_ufo(NetClientState *nc)
-{
- return false;
-}
-
-int tap_has_vnet_hdr(NetClientState *nc)
-{
- return 0;
-}
-
-int tap_probe_vnet_hdr_len(int fd, int len)
-{
- return 0;
-}
-
-void tap_fd_set_vnet_hdr_len(int fd, int len)
-{
-}
-
-void tap_using_vnet_hdr(NetClientState *nc, bool using_vnet_hdr)
-{
-}
-
-void tap_set_offload(NetClientState *nc, int csum, int tso4,
- int tso6, int ecn, int ufo)
-{
-}
-
-struct vhost_net *tap_get_vhost_net(NetClientState *nc)
-{
- return NULL;
-}
-
-int tap_has_vnet_hdr_len(NetClientState *nc, int len)
-{
- return 0;
-}
-
-void tap_set_vnet_hdr_len(NetClientState *nc, int len)
-{
- abort();
-}
-
int tap_enable(NetClientState *nc)
{
abort();
diff --git a/net/tap.c b/net/tap.c
index 39c1cda3e4..2d5099b9be 100644
--- a/net/tap.c
+++ b/net/tap.c
@@ -210,7 +210,7 @@ static void tap_send(void *opaque)
} while (size > 0 && qemu_can_send_packet(&s->nc));
}
-bool tap_has_ufo(NetClientState *nc)
+static bool tap_has_ufo(NetClientState *nc)
{
TAPState *s = DO_UPCAST(TAPState, nc, nc);
@@ -219,7 +219,7 @@ bool tap_has_ufo(NetClientState *nc)
return s->has_ufo;
}
-int tap_has_vnet_hdr(NetClientState *nc)
+static bool tap_has_vnet_hdr(NetClientState *nc)
{
TAPState *s = DO_UPCAST(TAPState, nc, nc);
@@ -228,16 +228,16 @@ int tap_has_vnet_hdr(NetClientState *nc)
return !!s->host_vnet_hdr_len;
}
-int tap_has_vnet_hdr_len(NetClientState *nc, int len)
+static bool tap_has_vnet_hdr_len(NetClientState *nc, int len)
{
TAPState *s = DO_UPCAST(TAPState, nc, nc);
assert(nc->info->type == NET_CLIENT_OPTIONS_KIND_TAP);
- return tap_probe_vnet_hdr_len(s->fd, len);
+ return !!tap_probe_vnet_hdr_len(s->fd, len); /* int probe result -> 0/1 */
}
-void tap_set_vnet_hdr_len(NetClientState *nc, int len)
+static void tap_set_vnet_hdr_len(NetClientState *nc, int len)
{
TAPState *s = DO_UPCAST(TAPState, nc, nc);
@@ -249,7 +249,7 @@ void tap_set_vnet_hdr_len(NetClientState *nc, int len)
s->host_vnet_hdr_len = len;
}
-void tap_using_vnet_hdr(NetClientState *nc, bool using_vnet_hdr)
+static void tap_using_vnet_hdr(NetClientState *nc, bool using_vnet_hdr)
{
TAPState *s = DO_UPCAST(TAPState, nc, nc);
@@ -259,7 +259,7 @@ void tap_using_vnet_hdr(NetClientState *nc, bool using_vnet_hdr)
s->using_vnet_hdr = using_vnet_hdr;
}
-void tap_set_offload(NetClientState *nc, int csum, int tso4,
+static void tap_set_offload(NetClientState *nc, int csum, int tso4,
int tso6, int ecn, int ufo)
{
TAPState *s = DO_UPCAST(TAPState, nc, nc);
@@ -314,6 +314,12 @@ static NetClientInfo net_tap_info = {
.receive_iov = tap_receive_iov,
.poll = tap_poll,
.cleanup = tap_cleanup,
+ .has_ufo = tap_has_ufo,
+ .has_vnet_hdr = tap_has_vnet_hdr,
+ .has_vnet_hdr_len = tap_has_vnet_hdr_len,
+ .using_vnet_hdr = tap_using_vnet_hdr,
+ .set_offload = tap_set_offload,
+ .set_vnet_hdr_len = tap_set_vnet_hdr_len,
};
static TAPState *net_tap_fd_init(NetClientState *peer,
diff --git a/page_cache.c b/page_cache.c
index 3ef6ee7ad2..b033681a93 100644
--- a/page_cache.c
+++ b/page_cache.c
@@ -150,7 +150,7 @@ uint8_t *get_cached_data(const PageCache *cache, uint64_t addr)
return cache_get_by_addr(cache, addr)->it_data;
}
-int cache_insert(PageCache *cache, uint64_t addr, uint8_t *pdata)
+int cache_insert(PageCache *cache, uint64_t addr, const uint8_t *pdata)
{
CacheItem *it = NULL;
diff --git a/qapi-schema.json b/qapi-schema.json
index fcb2280053..ac8ad24966 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -751,10 +751,9 @@
# This feature allows us to minimize migration traffic for certain work
# loads, by sending compressed difference of the pages
#
-# @x-rdma-pin-all: Controls whether or not the entire VM memory footprint is
+# @rdma-pin-all: Controls whether or not the entire VM memory footprint is
# mlock()'d on demand or all at once. Refer to docs/rdma.txt for usage.
-# Disabled by default. Experimental: may (or may not) be renamed after
-# further testing is complete. (since 1.6)
+# Disabled by default. (since 2.0)
#
# @zero-blocks: During storage migration encode blocks of zeroes efficiently. This
# essentially saves 1MB of zeroes per block on the wire. Enabling requires
@@ -768,7 +767,7 @@
# Since: 1.2
##
{ 'enum': 'MigrationCapability',
- 'data': ['xbzrle', 'x-rdma-pin-all', 'auto-converge', 'zero-blocks'] }
+ 'data': ['xbzrle', 'rdma-pin-all', 'auto-converge', 'zero-blocks'] }
##
# @MigrationCapabilityStatus
diff --git a/qemu-file.c b/qemu-file.c
index 9473b674ba..f074af15c3 100644
--- a/qemu-file.c
+++ b/qemu-file.c
@@ -100,7 +100,14 @@ static int stdio_put_buffer(void *opaque, const uint8_t *buf, int64_t pos,
int size)
{
QEMUFileStdio *s = opaque;
- return fwrite(buf, 1, size, s->stdio_file);
+ int res;
+
+ res = fwrite(buf, 1, size, s->stdio_file); /* res = bytes written */
+
+ if (res != size) { /* short write: fewer than size bytes went out */
+ return -EIO; /* fake errno value */
+ }
+ return res;
}
static int stdio_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size)
diff --git a/vmstate.c b/vmstate.c
index 284b080f46..d1f5eb0e6a 100644
--- a/vmstate.c
+++ b/vmstate.c
@@ -321,23 +321,24 @@ const VMStateInfo vmstate_info_int32_equal = {
.put = put_int32,
};
-/* 32 bit int. See that the received value is the less or the same
- than the one in the field */
+/* 32 bit int. Check that the received value is less than or equal to
+ the one in the field */
static int get_int32_le(QEMUFile *f, void *pv, size_t size)
{
- int32_t *old = pv;
- int32_t new;
- qemu_get_sbe32s(f, &new);
+ int32_t *cur = pv; /* field currently held by the destination */
+ int32_t loaded;
+ qemu_get_sbe32s(f, &loaded); /* read the incoming value from f */
- if (*old <= new) {
+ if (loaded <= *cur) {
+ *cur = loaded; /* accepted: the field now holds the received value */
return 0;
}
return -EINVAL;
}
const VMStateInfo vmstate_info_int32_le = {
- .name = "int32 equal",
+ .name = "int32 le",
.get = get_int32_le,
.put = put_int32,
};