about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Stefan Hajnoczi <stefanha@redhat.com> 2023-09-19 13:21:49 -0400
committer Stefan Hajnoczi <stefanha@redhat.com> 2023-09-19 13:21:49 -0400
commit dd0c84983dd5c3fefaa29f15ed1b4c5c7be9775d (patch)
tree 2de4d420f503fd0c2cbb061fd966d399365b0279
parent d7754940d78a7d5bfb13531afa9a67f8c57e987e (diff)
parent 6d7a53e9f16d2b18d94f9fce1e4eea34570286ef (diff)
Merge tag 'net-pull-request' of https://github.com/jasowang/qemu into staging
# -----BEGIN PGP SIGNATURE-----
# Version: GnuPG v1
#
# iQEcBAABAgAGBQJlB/SLAAoJEO8Ells5jWIR7EQH/1kAbxHcSGJXDOgQAXJ/rOZi
# UKn3ugJzD0Hxd4Xz8cvdVLM+9/JoEEOK1uB+NIG7Ask/gA5D7eUYzaLtp1OJ8VNO
# mamfKmn3EIBWJoLSHH19TKzfW2tGMJHQ0Nj+sbDQRkK5f2c7hwLTRXa1EmlJd4dB
# VoVzX4OiJtrQyv4OVmpP/PSETXJDvYYX/DNcRl9/3ccKtQW/wVDI3YzrMzXrsgyc
# w9ItJi8k+19mVH6RgQwciqRvTbVMdzkOxqvU//LY0TxnjsHfbyHr+KlNAa2WTY2N
# QgpAlMZhHqUG6/XXAs0o2VEtA66zmw932Xfy/CZUEcdGWfkG/9CEVfbuT4CKGY4=
# =tF7K
# -----END PGP SIGNATURE-----
# gpg: Signature made Mon 18 Sep 2023 02:56:11 EDT
# gpg: using RSA key EF04965B398D6211
# gpg: Good signature from "Jason Wang (Jason Wang on RedHat) <jasowang@redhat.com>" [full]
# Primary key fingerprint: 215D 46F4 8246 689E C77F 3562 EF04 965B 398D 6211

* tag 'net-pull-request' of https://github.com/jasowang/qemu:
  net/tap: Avoid variable-length array
  net/dump: Avoid variable length array
  hw/net/rocker: Avoid variable length array
  hw/net/fsl_etsec/rings.c: Avoid variable length array
  net: add initial support for AF_XDP network backend
  tests: bump libvirt-ci for libasan and libxdp
  e1000e: rename e1000e_ba_state and e1000e_write_hdr_to_rx_buffers
  igb: packet-split descriptors support
  igb: add IPv6 extended headers traffic detection
  igb: RX payload guest writting refactoring
  igb: RX descriptors guest writting refactoring
  igb: rename E1000E_RingInfo_st
  igb: remove TCP ACK detection
  virtio-net: Add support for USO features
  virtio-net: Add USO flags to vhost support.
  tap: Add check for USO features
  tap: Add USO support to tap device.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-rw-r--r-- MAINTAINERS 4
-rw-r--r-- hmp-commands.hx 3
-rw-r--r-- hw/core/machine.c 4
-rw-r--r-- hw/net/e1000e_core.c 80
-rw-r--r-- hw/net/fsl_etsec/rings.c 12
-rw-r--r-- hw/net/igb_core.c 732
-rw-r--r-- hw/net/igb_regs.h 20
-rw-r--r-- hw/net/rocker/rocker_of_dpa.c 2
-rw-r--r-- hw/net/trace-events 6
-rw-r--r-- hw/net/vhost_net.c 3
-rw-r--r-- hw/net/virtio-net.c 35
-rw-r--r-- hw/net/vmxnet3.c 2
-rw-r--r-- include/net/net.h 7
-rw-r--r-- meson.build 9
-rw-r--r-- meson_options.txt 2
-rw-r--r-- net/af-xdp.c 526
-rw-r--r-- net/clients.h 5
-rw-r--r-- net/dump.c 2
-rw-r--r-- net/meson.build 3
-rw-r--r-- net/net.c 19
-rw-r--r-- net/netmap.c 2
-rw-r--r-- net/tap-bsd.c 7
-rw-r--r-- net/tap-linux.c 27
-rw-r--r-- net/tap-linux.h 2
-rw-r--r-- net/tap-solaris.c 7
-rw-r--r-- net/tap-stub.c 7
-rw-r--r-- net/tap-win32.c 2
-rw-r--r-- net/tap.c 21
-rw-r--r-- net/tap_int.h 4
-rw-r--r-- net/vhost-vdpa.c 3
-rw-r--r-- qapi/net.json 58
-rw-r--r-- qemu-options.hx 70
-rwxr-xr-x scripts/ci/org.centos/stream/8/x86_64/configure 1
-rw-r--r-- scripts/meson-buildoptions.sh 3
-rw-r--r-- tests/docker/dockerfiles/alpine.docker 1
-rw-r--r-- tests/docker/dockerfiles/centos8.docker 1
-rw-r--r-- tests/docker/dockerfiles/debian-amd64-cross.docker 2
-rw-r--r-- tests/docker/dockerfiles/debian-amd64.docker 2
-rw-r--r-- tests/docker/dockerfiles/debian-arm64-cross.docker 2
-rw-r--r-- tests/docker/dockerfiles/debian-armel-cross.docker 2
-rw-r--r-- tests/docker/dockerfiles/debian-armhf-cross.docker 2
-rw-r--r-- tests/docker/dockerfiles/debian-ppc64el-cross.docker 2
-rw-r--r-- tests/docker/dockerfiles/debian-s390x-cross.docker 2
-rw-r--r-- tests/docker/dockerfiles/fedora.docker 1
-rw-r--r-- tests/docker/dockerfiles/opensuse-leap.docker 2
-rw-r--r-- tests/docker/dockerfiles/ubuntu2004.docker 2
-rw-r--r-- tests/docker/dockerfiles/ubuntu2204.docker 2
m--------- tests/lcitool/libvirt-ci 0
-rw-r--r-- tests/lcitool/projects/qemu.yml 1
-rw-r--r-- tests/qtest/libqos/igb.c 5
50 files changed, 1435 insertions, 284 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index 00562f924f..67cefaa6f2 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2957,6 +2957,10 @@ W: http://info.iet.unipi.it/~luigi/netmap/
S: Maintained
F: net/netmap.c
+AF_XDP network backend
+R: Ilya Maximets <i.maximets@ovn.org>
+F: net/af-xdp.c
+
Host Memory Backends
M: David Hildenbrand <david@redhat.com>
M: Igor Mammedov <imammedo@redhat.com>
diff --git a/hmp-commands.hx b/hmp-commands.hx
index 2cbd0f77a0..63eac22734 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -1296,6 +1296,9 @@ ERST
.name = "netdev_add",
.args_type = "netdev:O",
.params = "[user|tap|socket|stream|dgram|vde|bridge|hubport|netmap|vhost-user"
+#ifdef CONFIG_AF_XDP
+ "|af-xdp"
+#endif
#ifdef CONFIG_VMNET
"|vmnet-host|vmnet-shared|vmnet-bridged"
#endif
diff --git a/hw/core/machine.c b/hw/core/machine.c
index da699cf4e1..230aab819c 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -38,6 +38,7 @@
#include "exec/confidential-guest-support.h"
#include "hw/virtio/virtio.h"
#include "hw/virtio/virtio-pci.h"
+#include "hw/virtio/virtio-net.h"
GlobalProperty hw_compat_8_1[] = {};
const size_t hw_compat_8_1_len = G_N_ELEMENTS(hw_compat_8_1);
@@ -45,6 +46,9 @@ const size_t hw_compat_8_1_len = G_N_ELEMENTS(hw_compat_8_1);
GlobalProperty hw_compat_8_0[] = {
{ "migration", "multifd-flush-after-each-section", "on"},
{ TYPE_PCI_DEVICE, "x-pcie-ari-nextfn-1", "on" },
+ { TYPE_VIRTIO_NET, "host_uso", "off"},
+ { TYPE_VIRTIO_NET, "guest_uso4", "off"},
+ { TYPE_VIRTIO_NET, "guest_uso6", "off"},
};
const size_t hw_compat_8_0_len = G_N_ELEMENTS(hw_compat_8_0);
diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c
index f8aeafa16b..e324c02dd5 100644
--- a/hw/net/e1000e_core.c
+++ b/hw/net/e1000e_core.c
@@ -810,24 +810,24 @@ e1000e_txdesc_writeback(E1000ECore *core, dma_addr_t base,
return e1000e_tx_wb_interrupt_cause(core, queue_idx);
}
-typedef struct E1000E_RingInfo_st {
+typedef struct E1000ERingInfo {
int dbah;
int dbal;
int dlen;
int dh;
int dt;
int idx;
-} E1000E_RingInfo;
+} E1000ERingInfo;
static inline bool
-e1000e_ring_empty(E1000ECore *core, const E1000E_RingInfo *r)
+e1000e_ring_empty(E1000ECore *core, const E1000ERingInfo *r)
{
return core->mac[r->dh] == core->mac[r->dt] ||
core->mac[r->dt] >= core->mac[r->dlen] / E1000_RING_DESC_LEN;
}
static inline uint64_t
-e1000e_ring_base(E1000ECore *core, const E1000E_RingInfo *r)
+e1000e_ring_base(E1000ECore *core, const E1000ERingInfo *r)
{
uint64_t bah = core->mac[r->dbah];
uint64_t bal = core->mac[r->dbal];
@@ -836,13 +836,13 @@ e1000e_ring_base(E1000ECore *core, const E1000E_RingInfo *r)
}
static inline uint64_t
-e1000e_ring_head_descr(E1000ECore *core, const E1000E_RingInfo *r)
+e1000e_ring_head_descr(E1000ECore *core, const E1000ERingInfo *r)
{
return e1000e_ring_base(core, r) + E1000_RING_DESC_LEN * core->mac[r->dh];
}
static inline void
-e1000e_ring_advance(E1000ECore *core, const E1000E_RingInfo *r, uint32_t count)
+e1000e_ring_advance(E1000ECore *core, const E1000ERingInfo *r, uint32_t count)
{
core->mac[r->dh] += count;
@@ -852,7 +852,7 @@ e1000e_ring_advance(E1000ECore *core, const E1000E_RingInfo *r, uint32_t count)
}
static inline uint32_t
-e1000e_ring_free_descr_num(E1000ECore *core, const E1000E_RingInfo *r)
+e1000e_ring_free_descr_num(E1000ECore *core, const E1000ERingInfo *r)
{
trace_e1000e_ring_free_space(r->idx, core->mac[r->dlen],
core->mac[r->dh], core->mac[r->dt]);
@@ -871,19 +871,19 @@ e1000e_ring_free_descr_num(E1000ECore *core, const E1000E_RingInfo *r)
}
static inline bool
-e1000e_ring_enabled(E1000ECore *core, const E1000E_RingInfo *r)
+e1000e_ring_enabled(E1000ECore *core, const E1000ERingInfo *r)
{
return core->mac[r->dlen] > 0;
}
static inline uint32_t
-e1000e_ring_len(E1000ECore *core, const E1000E_RingInfo *r)
+e1000e_ring_len(E1000ECore *core, const E1000ERingInfo *r)
{
return core->mac[r->dlen];
}
typedef struct E1000E_TxRing_st {
- const E1000E_RingInfo *i;
+ const E1000ERingInfo *i;
struct e1000e_tx *tx;
} E1000E_TxRing;
@@ -896,7 +896,7 @@ e1000e_mq_queue_idx(int base_reg_idx, int reg_idx)
static inline void
e1000e_tx_ring_init(E1000ECore *core, E1000E_TxRing *txr, int idx)
{
- static const E1000E_RingInfo i[E1000E_NUM_QUEUES] = {
+ static const E1000ERingInfo i[E1000E_NUM_QUEUES] = {
{ TDBAH, TDBAL, TDLEN, TDH, TDT, 0 },
{ TDBAH1, TDBAL1, TDLEN1, TDH1, TDT1, 1 }
};
@@ -908,13 +908,13 @@ e1000e_tx_ring_init(E1000ECore *core, E1000E_TxRing *txr, int idx)
}
typedef struct E1000E_RxRing_st {
- const E1000E_RingInfo *i;
+ const E1000ERingInfo *i;
} E1000E_RxRing;
static inline void
e1000e_rx_ring_init(E1000ECore *core, E1000E_RxRing *rxr, int idx)
{
- static const E1000E_RingInfo i[E1000E_NUM_QUEUES] = {
+ static const E1000ERingInfo i[E1000E_NUM_QUEUES] = {
{ RDBAH0, RDBAL0, RDLEN0, RDH0, RDT0, 0 },
{ RDBAH1, RDBAL1, RDLEN1, RDH1, RDT1, 1 }
};
@@ -930,7 +930,7 @@ e1000e_start_xmit(E1000ECore *core, const E1000E_TxRing *txr)
dma_addr_t base;
struct e1000_tx_desc desc;
bool ide = false;
- const E1000E_RingInfo *txi = txr->i;
+ const E1000ERingInfo *txi = txr->i;
uint32_t cause = E1000_ICS_TXQE;
if (!(core->mac[TCTL] & E1000_TCTL_EN)) {
@@ -960,7 +960,7 @@ e1000e_start_xmit(E1000ECore *core, const E1000E_TxRing *txr)
}
static bool
-e1000e_has_rxbufs(E1000ECore *core, const E1000E_RingInfo *r,
+e1000e_has_rxbufs(E1000ECore *core, const E1000ERingInfo *r,
size_t total_size)
{
uint32_t bufs = e1000e_ring_free_descr_num(core, r);
@@ -1397,17 +1397,17 @@ e1000e_pci_dma_write_rx_desc(E1000ECore *core, dma_addr_t addr,
}
}
-typedef struct e1000e_ba_state_st {
+typedef struct E1000EBAState {
uint16_t written[MAX_PS_BUFFERS];
uint8_t cur_idx;
-} e1000e_ba_state;
+} E1000EBAState;
static inline void
-e1000e_write_hdr_to_rx_buffers(E1000ECore *core,
- hwaddr ba[MAX_PS_BUFFERS],
- e1000e_ba_state *bastate,
- const char *data,
- dma_addr_t data_len)
+e1000e_write_hdr_frag_to_rx_buffers(E1000ECore *core,
+ hwaddr ba[MAX_PS_BUFFERS],
+ E1000EBAState *bastate,
+ const char *data,
+ dma_addr_t data_len)
{
assert(data_len <= core->rxbuf_sizes[0] - bastate->written[0]);
@@ -1418,11 +1418,11 @@ e1000e_write_hdr_to_rx_buffers(E1000ECore *core,
}
static void
-e1000e_write_to_rx_buffers(E1000ECore *core,
- hwaddr ba[MAX_PS_BUFFERS],
- e1000e_ba_state *bastate,
- const char *data,
- dma_addr_t data_len)
+e1000e_write_payload_frag_to_rx_buffers(E1000ECore *core,
+ hwaddr ba[MAX_PS_BUFFERS],
+ E1000EBAState *bastate,
+ const char *data,
+ dma_addr_t data_len)
{
while (data_len > 0) {
uint32_t cur_buf_len = core->rxbuf_sizes[bastate->cur_idx];
@@ -1460,7 +1460,7 @@ e1000e_update_rx_stats(E1000ECore *core, size_t pkt_size, size_t pkt_fcs_size)
}
static inline bool
-e1000e_rx_descr_threshold_hit(E1000ECore *core, const E1000E_RingInfo *rxi)
+e1000e_rx_descr_threshold_hit(E1000ECore *core, const E1000ERingInfo *rxi)
{
return e1000e_ring_free_descr_num(core, rxi) ==
e1000e_ring_len(core, rxi) >> core->rxbuf_min_shift;
@@ -1521,7 +1521,7 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt,
struct iovec *iov = net_rx_pkt_get_iovec(pkt);
size_t size = net_rx_pkt_get_total_len(pkt);
size_t total_size = size + e1000x_fcs_len(core->mac);
- const E1000E_RingInfo *rxi;
+ const E1000ERingInfo *rxi;
size_t ps_hdr_len = 0;
bool do_ps = e1000e_do_ps(core, pkt, &ps_hdr_len);
bool is_first = true;
@@ -1530,7 +1530,7 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt,
do {
hwaddr ba[MAX_PS_BUFFERS];
- e1000e_ba_state bastate = { { 0 } };
+ E1000EBAState bastate = { { 0 } };
bool is_last = false;
desc_size = total_size - desc_offset;
@@ -1568,8 +1568,10 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt,
iov_copy = MIN(ps_hdr_len - ps_hdr_copied,
iov->iov_len - iov_ofs);
- e1000e_write_hdr_to_rx_buffers(core, ba, &bastate,
- iov->iov_base, iov_copy);
+ e1000e_write_hdr_frag_to_rx_buffers(core, ba,
+ &bastate,
+ iov->iov_base,
+ iov_copy);
copy_size -= iov_copy;
ps_hdr_copied += iov_copy;
@@ -1585,8 +1587,8 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt,
} else {
/* Leave buffer 0 of each descriptor except first */
/* empty as per spec 7.1.5.1 */
- e1000e_write_hdr_to_rx_buffers(core, ba, &bastate,
- NULL, 0);
+ e1000e_write_hdr_frag_to_rx_buffers(core, ba, &bastate,
+ NULL, 0);
}
}
@@ -1594,8 +1596,10 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt,
while (copy_size) {
iov_copy = MIN(copy_size, iov->iov_len - iov_ofs);
- e1000e_write_to_rx_buffers(core, ba, &bastate,
- iov->iov_base + iov_ofs, iov_copy);
+ e1000e_write_payload_frag_to_rx_buffers(core, ba, &bastate,
+ iov->iov_base +
+ iov_ofs,
+ iov_copy);
copy_size -= iov_copy;
iov_ofs += iov_copy;
@@ -1607,7 +1611,7 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt,
if (desc_offset + desc_size >= total_size) {
/* Simulate FCS checksum presence in the last descriptor */
- e1000e_write_to_rx_buffers(core, ba, &bastate,
+ e1000e_write_payload_frag_to_rx_buffers(core, ba, &bastate,
(const char *) &fcs_pad, e1000x_fcs_len(core->mac));
}
}
@@ -2852,7 +2856,7 @@ e1000e_update_rx_offloads(E1000ECore *core)
if (core->has_vnet) {
qemu_set_offload(qemu_get_queue(core->owner_nic)->peer,
- cso_state, 0, 0, 0, 0);
+ cso_state, 0, 0, 0, 0, 0, 0);
}
}
diff --git a/hw/net/fsl_etsec/rings.c b/hw/net/fsl_etsec/rings.c
index 788463f1b6..2f2f359f7a 100644
--- a/hw/net/fsl_etsec/rings.c
+++ b/hw/net/fsl_etsec/rings.c
@@ -372,6 +372,12 @@ void etsec_walk_tx_ring(eTSEC *etsec, int ring_nbr)
etsec->regs[TSTAT].value |= 1 << (31 - ring_nbr);
}
+/*
+ * rx_init_frame() ensures we never do more padding than this
+ * (checksum plus minimum data packet size)
+ */
+#define MAX_RX_PADDING 64
+
static void fill_rx_bd(eTSEC *etsec,
eTSEC_rxtx_bd *bd,
const uint8_t **buf,
@@ -380,9 +386,11 @@ static void fill_rx_bd(eTSEC *etsec,
uint16_t to_write;
hwaddr bufptr = bd->bufptr +
((hwaddr)(etsec->regs[TBDBPH].value & 0xF) << 32);
- uint8_t padd[etsec->rx_padding];
+ uint8_t padd[MAX_RX_PADDING];
uint8_t rem;
+ assert(etsec->rx_padding <= MAX_RX_PADDING);
+
RING_DEBUG("eTSEC fill Rx buffer @ 0x%016" HWADDR_PRIx
" size:%zu(padding + crc:%u) + fcb:%u\n",
bufptr, *size, etsec->rx_padding, etsec->rx_fcb_size);
@@ -426,7 +434,7 @@ static void fill_rx_bd(eTSEC *etsec,
rem = MIN(etsec->regs[MRBLR].value - bd->length, etsec->rx_padding);
if (rem > 0) {
- memset(padd, 0x0, sizeof(padd));
+ memset(padd, 0x0, rem);
etsec->rx_padding -= rem;
*size -= rem;
bd->length += rem;
diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c
index 8b6b75c522..f6a5e2327b 100644
--- a/hw/net/igb_core.c
+++ b/hw/net/igb_core.c
@@ -267,6 +267,29 @@ igb_rx_use_legacy_descriptor(IGBCore *core)
return false;
}
+typedef struct E1000ERingInfo {
+ int dbah;
+ int dbal;
+ int dlen;
+ int dh;
+ int dt;
+ int idx;
+} E1000ERingInfo;
+
+static uint32_t
+igb_rx_queue_desctyp_get(IGBCore *core, const E1000ERingInfo *r)
+{
+ return core->mac[E1000_SRRCTL(r->idx) >> 2] & E1000_SRRCTL_DESCTYPE_MASK;
+}
+
+static bool
+igb_rx_use_ps_descriptor(IGBCore *core, const E1000ERingInfo *r)
+{
+ uint32_t desctyp = igb_rx_queue_desctyp_get(core, r);
+ return desctyp == E1000_SRRCTL_DESCTYPE_HDR_SPLIT ||
+ desctyp == E1000_SRRCTL_DESCTYPE_HDR_SPLIT_ALWAYS;
+}
+
static inline bool
igb_rss_enabled(IGBCore *core)
{
@@ -694,24 +717,15 @@ static uint32_t igb_rx_wb_eic(IGBCore *core, int queue_idx)
return (ent & E1000_IVAR_VALID) ? BIT(ent & 0x1f) : 0;
}
-typedef struct E1000E_RingInfo_st {
- int dbah;
- int dbal;
- int dlen;
- int dh;
- int dt;
- int idx;
-} E1000E_RingInfo;
-
static inline bool
-igb_ring_empty(IGBCore *core, const E1000E_RingInfo *r)
+igb_ring_empty(IGBCore *core, const E1000ERingInfo *r)
{
return core->mac[r->dh] == core->mac[r->dt] ||
core->mac[r->dt] >= core->mac[r->dlen] / E1000_RING_DESC_LEN;
}
static inline uint64_t
-igb_ring_base(IGBCore *core, const E1000E_RingInfo *r)
+igb_ring_base(IGBCore *core, const E1000ERingInfo *r)
{
uint64_t bah = core->mac[r->dbah];
uint64_t bal = core->mac[r->dbal];
@@ -720,13 +734,13 @@ igb_ring_base(IGBCore *core, const E1000E_RingInfo *r)
}
static inline uint64_t
-igb_ring_head_descr(IGBCore *core, const E1000E_RingInfo *r)
+igb_ring_head_descr(IGBCore *core, const E1000ERingInfo *r)
{
return igb_ring_base(core, r) + E1000_RING_DESC_LEN * core->mac[r->dh];
}
static inline void
-igb_ring_advance(IGBCore *core, const E1000E_RingInfo *r, uint32_t count)
+igb_ring_advance(IGBCore *core, const E1000ERingInfo *r, uint32_t count)
{
core->mac[r->dh] += count;
@@ -736,7 +750,7 @@ igb_ring_advance(IGBCore *core, const E1000E_RingInfo *r, uint32_t count)
}
static inline uint32_t
-igb_ring_free_descr_num(IGBCore *core, const E1000E_RingInfo *r)
+igb_ring_free_descr_num(IGBCore *core, const E1000ERingInfo *r)
{
trace_e1000e_ring_free_space(r->idx, core->mac[r->dlen],
core->mac[r->dh], core->mac[r->dt]);
@@ -755,13 +769,13 @@ igb_ring_free_descr_num(IGBCore *core, const E1000E_RingInfo *r)
}
static inline bool
-igb_ring_enabled(IGBCore *core, const E1000E_RingInfo *r)
+igb_ring_enabled(IGBCore *core, const E1000ERingInfo *r)
{
return core->mac[r->dlen] > 0;
}
typedef struct IGB_TxRing_st {
- const E1000E_RingInfo *i;
+ const E1000ERingInfo *i;
struct igb_tx *tx;
} IGB_TxRing;
@@ -774,7 +788,7 @@ igb_mq_queue_idx(int base_reg_idx, int reg_idx)
static inline void
igb_tx_ring_init(IGBCore *core, IGB_TxRing *txr, int idx)
{
- static const E1000E_RingInfo i[IGB_NUM_QUEUES] = {
+ static const E1000ERingInfo i[IGB_NUM_QUEUES] = {
{ TDBAH0, TDBAL0, TDLEN0, TDH0, TDT0, 0 },
{ TDBAH1, TDBAL1, TDLEN1, TDH1, TDT1, 1 },
{ TDBAH2, TDBAL2, TDLEN2, TDH2, TDT2, 2 },
@@ -800,13 +814,13 @@ igb_tx_ring_init(IGBCore *core, IGB_TxRing *txr, int idx)
}
typedef struct E1000E_RxRing_st {
- const E1000E_RingInfo *i;
+ const E1000ERingInfo *i;
} E1000E_RxRing;
static inline void
igb_rx_ring_init(IGBCore *core, E1000E_RxRing *rxr, int idx)
{
- static const E1000E_RingInfo i[IGB_NUM_QUEUES] = {
+ static const E1000ERingInfo i[IGB_NUM_QUEUES] = {
{ RDBAH0, RDBAL0, RDLEN0, RDH0, RDT0, 0 },
{ RDBAH1, RDBAL1, RDLEN1, RDH1, RDT1, 1 },
{ RDBAH2, RDBAL2, RDLEN2, RDH2, RDT2, 2 },
@@ -833,7 +847,7 @@ igb_rx_ring_init(IGBCore *core, E1000E_RxRing *rxr, int idx)
static uint32_t
igb_txdesc_writeback(IGBCore *core, dma_addr_t base,
union e1000_adv_tx_desc *tx_desc,
- const E1000E_RingInfo *txi)
+ const E1000ERingInfo *txi)
{
PCIDevice *d;
uint32_t cmd_type_len = le32_to_cpu(tx_desc->read.cmd_type_len);
@@ -866,7 +880,7 @@ igb_txdesc_writeback(IGBCore *core, dma_addr_t base,
}
static inline bool
-igb_tx_enabled(IGBCore *core, const E1000E_RingInfo *txi)
+igb_tx_enabled(IGBCore *core, const E1000ERingInfo *txi)
{
bool vmdq = core->mac[MRQC] & 1;
uint16_t qn = txi->idx;
@@ -883,7 +897,7 @@ igb_start_xmit(IGBCore *core, const IGB_TxRing *txr)
PCIDevice *d;
dma_addr_t base;
union e1000_adv_tx_desc desc;
- const E1000E_RingInfo *txi = txr->i;
+ const E1000ERingInfo *txi = txr->i;
uint32_t eic = 0;
if (!igb_tx_enabled(core, txi)) {
@@ -918,7 +932,7 @@ igb_start_xmit(IGBCore *core, const IGB_TxRing *txr)
}
static uint32_t
-igb_rxbufsize(IGBCore *core, const E1000E_RingInfo *r)
+igb_rxbufsize(IGBCore *core, const E1000ERingInfo *r)
{
uint32_t srrctl = core->mac[E1000_SRRCTL(r->idx) >> 2];
uint32_t bsizepkt = srrctl & E1000_SRRCTL_BSIZEPKT_MASK;
@@ -930,7 +944,7 @@ igb_rxbufsize(IGBCore *core, const E1000E_RingInfo *r)
}
static bool
-igb_has_rxbufs(IGBCore *core, const E1000E_RingInfo *r, size_t total_size)
+igb_has_rxbufs(IGBCore *core, const E1000ERingInfo *r, size_t total_size)
{
uint32_t bufs = igb_ring_free_descr_num(core, r);
uint32_t bufsize = igb_rxbufsize(core, r);
@@ -941,6 +955,14 @@ igb_has_rxbufs(IGBCore *core, const E1000E_RingInfo *r, size_t total_size)
bufsize;
}
+static uint32_t
+igb_rxhdrbufsize(IGBCore *core, const E1000ERingInfo *r)
+{
+ uint32_t srrctl = core->mac[E1000_SRRCTL(r->idx) >> 2];
+ return (srrctl & E1000_SRRCTL_BSIZEHDRSIZE_MASK) >>
+ E1000_SRRCTL_BSIZEHDRSIZE_SHIFT;
+}
+
void
igb_start_recv(IGBCore *core)
{
@@ -1225,21 +1247,77 @@ igb_read_lgcy_rx_descr(IGBCore *core, struct e1000_rx_desc *desc,
}
static inline void
-igb_read_adv_rx_descr(IGBCore *core, union e1000_adv_rx_desc *desc,
- hwaddr *buff_addr)
+igb_read_adv_rx_single_buf_descr(IGBCore *core, union e1000_adv_rx_desc *desc,
+ hwaddr *buff_addr)
{
*buff_addr = le64_to_cpu(desc->read.pkt_addr);
}
static inline void
-igb_read_rx_descr(IGBCore *core, union e1000_rx_desc_union *desc,
- hwaddr *buff_addr)
+igb_read_adv_rx_split_buf_descr(IGBCore *core, union e1000_adv_rx_desc *desc,
+ hwaddr *buff_addr)
{
+ buff_addr[0] = le64_to_cpu(desc->read.hdr_addr);
+ buff_addr[1] = le64_to_cpu(desc->read.pkt_addr);
+}
+
+typedef struct IGBBAState {
+ uint16_t written[IGB_MAX_PS_BUFFERS];
+ uint8_t cur_idx;
+} IGBBAState;
+
+typedef struct IGBSplitDescriptorData {
+ bool sph;
+ bool hbo;
+ size_t hdr_len;
+} IGBSplitDescriptorData;
+
+typedef struct IGBPacketRxDMAState {
+ size_t size;
+ size_t total_size;
+ size_t ps_hdr_len;
+ size_t desc_size;
+ size_t desc_offset;
+ uint32_t rx_desc_packet_buf_size;
+ uint32_t rx_desc_header_buf_size;
+ struct iovec *iov;
+ size_t iov_ofs;
+ bool do_ps;
+ bool is_first;
+ IGBBAState bastate;
+ hwaddr ba[IGB_MAX_PS_BUFFERS];
+ IGBSplitDescriptorData ps_desc_data;
+} IGBPacketRxDMAState;
+
+static inline void
+igb_read_rx_descr(IGBCore *core,
+ union e1000_rx_desc_union *desc,
+ IGBPacketRxDMAState *pdma_st,
+ const E1000ERingInfo *r)
+{
+ uint32_t desc_type;
+
if (igb_rx_use_legacy_descriptor(core)) {
- igb_read_lgcy_rx_descr(core, &desc->legacy, buff_addr);
- } else {
- igb_read_adv_rx_descr(core, &desc->adv, buff_addr);
+ igb_read_lgcy_rx_descr(core, &desc->legacy, &pdma_st->ba[1]);
+ pdma_st->ba[0] = 0;
+ return;
}
+
+ /* advanced header split descriptor */
+ if (igb_rx_use_ps_descriptor(core, r)) {
+ igb_read_adv_rx_split_buf_descr(core, &desc->adv, &pdma_st->ba[0]);
+ return;
+ }
+
+ /* descriptor replication modes not supported */
+ desc_type = igb_rx_queue_desctyp_get(core, r);
+ if (desc_type != E1000_SRRCTL_DESCTYPE_ADV_ONEBUF) {
+ trace_igb_wrn_rx_desc_modes_not_supp(desc_type);
+ }
+
+ /* advanced single buffer descriptor */
+ igb_read_adv_rx_single_buf_descr(core, &desc->adv, &pdma_st->ba[1]);
+ pdma_st->ba[0] = 0;
}
static void
@@ -1281,15 +1359,11 @@ igb_verify_csum_in_sw(IGBCore *core,
}
static void
-igb_build_rx_metadata(IGBCore *core,
- struct NetRxPkt *pkt,
- bool is_eop,
- const E1000E_RSSInfo *rss_info, uint16_t etqf, bool ts,
- uint16_t *pkt_info, uint16_t *hdr_info,
- uint32_t *rss,
- uint32_t *status_flags,
- uint16_t *ip_id,
- uint16_t *vlan_tag)
+igb_build_rx_metadata_common(IGBCore *core,
+ struct NetRxPkt *pkt,
+ bool is_eop,
+ uint32_t *status_flags,
+ uint16_t *vlan_tag)
{
struct virtio_net_hdr *vhdr;
bool hasip4, hasip6, csum_valid;
@@ -1298,7 +1372,6 @@ igb_build_rx_metadata(IGBCore *core,
*status_flags = E1000_RXD_STAT_DD;
/* No additional metadata needed for non-EOP descriptors */
- /* TODO: EOP apply only to status so don't skip whole function. */
if (!is_eop) {
goto func_exit;
}
@@ -1315,64 +1388,6 @@ igb_build_rx_metadata(IGBCore *core,
trace_e1000e_rx_metadata_vlan(*vlan_tag);
}
- /* Packet parsing results */
- if ((core->mac[RXCSUM] & E1000_RXCSUM_PCSD) != 0) {
- if (rss_info->enabled) {
- *rss = cpu_to_le32(rss_info->hash);
- trace_igb_rx_metadata_rss(*rss);
- }
- } else if (hasip4) {
- *status_flags |= E1000_RXD_STAT_IPIDV;
- *ip_id = cpu_to_le16(net_rx_pkt_get_ip_id(pkt));
- trace_e1000e_rx_metadata_ip_id(*ip_id);
- }
-
- if (l4hdr_proto == ETH_L4_HDR_PROTO_TCP && net_rx_pkt_is_tcp_ack(pkt)) {
- *status_flags |= E1000_RXD_STAT_ACK;
- trace_e1000e_rx_metadata_ack();
- }
-
- if (pkt_info) {
- *pkt_info = rss_info->enabled ? rss_info->type : 0;
-
- if (etqf < 8) {
- *pkt_info |= (BIT(11) | etqf) << 4;
- } else {
- if (hasip4) {
- *pkt_info |= E1000_ADVRXD_PKT_IP4;
- }
-
- if (hasip6) {
- *pkt_info |= E1000_ADVRXD_PKT_IP6;
- }
-
- switch (l4hdr_proto) {
- case ETH_L4_HDR_PROTO_TCP:
- *pkt_info |= E1000_ADVRXD_PKT_TCP;
- break;
-
- case ETH_L4_HDR_PROTO_UDP:
- *pkt_info |= E1000_ADVRXD_PKT_UDP;
- break;
-
- case ETH_L4_HDR_PROTO_SCTP:
- *pkt_info |= E1000_ADVRXD_PKT_SCTP;
- break;
-
- default:
- break;
- }
- }
- }
-
- if (hdr_info) {
- *hdr_info = 0;
- }
-
- if (ts) {
- *status_flags |= BIT(16);
- }
-
/* RX CSO information */
if (hasip6 && (core->mac[RFCTL] & E1000_RFCTL_IPV6_XSUM_DIS)) {
trace_e1000e_rx_metadata_ipv6_sum_disabled();
@@ -1428,56 +1443,168 @@ func_exit:
static inline void
igb_write_lgcy_rx_descr(IGBCore *core, struct e1000_rx_desc *desc,
struct NetRxPkt *pkt,
- const E1000E_RSSInfo *rss_info, uint16_t etqf, bool ts,
+ const E1000E_RSSInfo *rss_info,
uint16_t length)
{
- uint32_t status_flags, rss;
- uint16_t ip_id;
+ uint32_t status_flags;
assert(!rss_info->enabled);
+
+ memset(desc, 0, sizeof(*desc));
desc->length = cpu_to_le16(length);
- desc->csum = 0;
+ igb_build_rx_metadata_common(core, pkt, pkt != NULL,
+ &status_flags,
+ &desc->special);
- igb_build_rx_metadata(core, pkt, pkt != NULL,
- rss_info, etqf, ts,
- NULL, NULL, &rss,
- &status_flags, &ip_id,
- &desc->special);
desc->errors = (uint8_t) (le32_to_cpu(status_flags) >> 24);
desc->status = (uint8_t) le32_to_cpu(status_flags);
}
+static bool
+igb_rx_ps_descriptor_split_always(IGBCore *core, const E1000ERingInfo *r)
+{
+ uint32_t desctyp = igb_rx_queue_desctyp_get(core, r);
+ return desctyp == E1000_SRRCTL_DESCTYPE_HDR_SPLIT_ALWAYS;
+}
+
+static uint16_t
+igb_rx_desc_get_packet_type(IGBCore *core, struct NetRxPkt *pkt, uint16_t etqf)
+{
+ uint16_t pkt_type;
+ bool hasip4, hasip6;
+ EthL4HdrProto l4hdr_proto;
+
+ if (etqf < 8) {
+ pkt_type = BIT(11) | etqf;
+ return pkt_type;
+ }
+
+ net_rx_pkt_get_protocols(pkt, &hasip4, &hasip6, &l4hdr_proto);
+
+ if (hasip6 && !(core->mac[RFCTL] & E1000_RFCTL_IPV6_DIS)) {
+ eth_ip6_hdr_info *ip6hdr_info = net_rx_pkt_get_ip6_info(pkt);
+ pkt_type = ip6hdr_info->has_ext_hdrs ? E1000_ADVRXD_PKT_IP6E :
+ E1000_ADVRXD_PKT_IP6;
+ } else if (hasip4) {
+ pkt_type = E1000_ADVRXD_PKT_IP4;
+ } else {
+ pkt_type = 0;
+ }
+
+ switch (l4hdr_proto) {
+ case ETH_L4_HDR_PROTO_TCP:
+ pkt_type |= E1000_ADVRXD_PKT_TCP;
+ break;
+ case ETH_L4_HDR_PROTO_UDP:
+ pkt_type |= E1000_ADVRXD_PKT_UDP;
+ break;
+ case ETH_L4_HDR_PROTO_SCTP:
+ pkt_type |= E1000_ADVRXD_PKT_SCTP;
+ break;
+ default:
+ break;
+ }
+
+ return pkt_type;
+}
+
static inline void
igb_write_adv_rx_descr(IGBCore *core, union e1000_adv_rx_desc *desc,
struct NetRxPkt *pkt,
const E1000E_RSSInfo *rss_info, uint16_t etqf, bool ts,
uint16_t length)
{
+ bool hasip4, hasip6;
+ EthL4HdrProto l4hdr_proto;
+ uint16_t rss_type = 0, pkt_type;
+ bool eop = (pkt != NULL);
+ uint32_t adv_desc_status_error = 0;
memset(&desc->wb, 0, sizeof(desc->wb));
desc->wb.upper.length = cpu_to_le16(length);
+ igb_build_rx_metadata_common(core, pkt, eop,
+ &desc->wb.upper.status_error,
+ &desc->wb.upper.vlan);
+
+ if (!eop) {
+ return;
+ }
+
+ net_rx_pkt_get_protocols(pkt, &hasip4, &hasip6, &l4hdr_proto);
+
+ if ((core->mac[RXCSUM] & E1000_RXCSUM_PCSD) != 0) {
+ if (rss_info->enabled) {
+ desc->wb.lower.hi_dword.rss = cpu_to_le32(rss_info->hash);
+ rss_type = rss_info->type;
+ trace_igb_rx_metadata_rss(desc->wb.lower.hi_dword.rss, rss_type);
+ }
+ } else if (hasip4) {
+ adv_desc_status_error |= E1000_RXD_STAT_IPIDV;
+ desc->wb.lower.hi_dword.csum_ip.ip_id =
+ cpu_to_le16(net_rx_pkt_get_ip_id(pkt));
+ trace_e1000e_rx_metadata_ip_id(
+ desc->wb.lower.hi_dword.csum_ip.ip_id);
+ }
+
+ if (ts) {
+ adv_desc_status_error |= BIT(16);
+ }
+
+ pkt_type = igb_rx_desc_get_packet_type(core, pkt, etqf);
+ trace_e1000e_rx_metadata_pkt_type(pkt_type);
+ desc->wb.lower.lo_dword.pkt_info = cpu_to_le16(rss_type | (pkt_type << 4));
+ desc->wb.upper.status_error |= cpu_to_le32(adv_desc_status_error);
+}
+
+static inline void
+igb_write_adv_ps_rx_descr(IGBCore *core,
+ union e1000_adv_rx_desc *desc,
+ struct NetRxPkt *pkt,
+ const E1000E_RSSInfo *rss_info,
+ const E1000ERingInfo *r,
+ uint16_t etqf,
+ bool ts,
+ IGBPacketRxDMAState *pdma_st)
+{
+ size_t pkt_len;
+ uint16_t hdr_info = 0;
- igb_build_rx_metadata(core, pkt, pkt != NULL,
- rss_info, etqf, ts,
- &desc->wb.lower.lo_dword.pkt_info,
- &desc->wb.lower.lo_dword.hdr_info,
- &desc->wb.lower.hi_dword.rss,
- &desc->wb.upper.status_error,
- &desc->wb.lower.hi_dword.csum_ip.ip_id,
- &desc->wb.upper.vlan);
+ if (pdma_st->do_ps) {
+ pkt_len = pdma_st->bastate.written[1];
+ } else {
+ pkt_len = pdma_st->bastate.written[0] + pdma_st->bastate.written[1];
+ }
+
+ igb_write_adv_rx_descr(core, desc, pkt, rss_info, etqf, ts, pkt_len);
+
+ hdr_info = (pdma_st->ps_desc_data.hdr_len << E1000_ADVRXD_HDR_LEN_OFFSET) &
+ E1000_ADVRXD_ADV_HDR_LEN_MASK;
+ hdr_info |= pdma_st->ps_desc_data.sph ? E1000_ADVRXD_HDR_SPH : 0;
+ desc->wb.lower.lo_dword.hdr_info = cpu_to_le16(hdr_info);
+
+ desc->wb.upper.status_error |= cpu_to_le32(
+ pdma_st->ps_desc_data.hbo ? E1000_ADVRXD_ST_ERR_HBO_OFFSET : 0);
}
static inline void
-igb_write_rx_descr(IGBCore *core, union e1000_rx_desc_union *desc,
- struct NetRxPkt *pkt, const E1000E_RSSInfo *rss_info,
- uint16_t etqf, bool ts, uint16_t length)
+igb_write_rx_descr(IGBCore *core,
+ union e1000_rx_desc_union *desc,
+ struct NetRxPkt *pkt,
+ const E1000E_RSSInfo *rss_info,
+ uint16_t etqf,
+ bool ts,
+ IGBPacketRxDMAState *pdma_st,
+ const E1000ERingInfo *r)
{
if (igb_rx_use_legacy_descriptor(core)) {
igb_write_lgcy_rx_descr(core, &desc->legacy, pkt, rss_info,
- etqf, ts, length);
+ pdma_st->bastate.written[1]);
+ } else if (igb_rx_use_ps_descriptor(core, r)) {
+ igb_write_adv_ps_rx_descr(core, &desc->adv, pkt, rss_info, r, etqf, ts,
+ pdma_st);
} else {
igb_write_adv_rx_descr(core, &desc->adv, pkt, rss_info,
- etqf, ts, length);
+ etqf, ts, pdma_st->bastate.written[1]);
}
}
@@ -1514,20 +1641,7 @@ igb_pci_dma_write_rx_desc(IGBCore *core, PCIDevice *dev, dma_addr_t addr,
}
static void
-igb_write_to_rx_buffers(IGBCore *core,
- PCIDevice *d,
- hwaddr ba,
- uint16_t *written,
- const char *data,
- dma_addr_t data_len)
-{
- trace_igb_rx_desc_buff_write(ba, *written, data, data_len);
- pci_dma_write(d, ba + *written, data, data_len);
- *written += data_len;
-}
-
-static void
-igb_update_rx_stats(IGBCore *core, const E1000E_RingInfo *rxi,
+igb_update_rx_stats(IGBCore *core, const E1000ERingInfo *rxi,
size_t pkt_size, size_t pkt_fcs_size)
{
eth_pkt_types_e pkt_type = net_rx_pkt_get_packet_type(core->rx_pkt);
@@ -1545,12 +1659,256 @@ igb_update_rx_stats(IGBCore *core, const E1000E_RingInfo *rxi,
}
static inline bool
-igb_rx_descr_threshold_hit(IGBCore *core, const E1000E_RingInfo *rxi)
+igb_rx_descr_threshold_hit(IGBCore *core, const E1000ERingInfo *rxi)
{
return igb_ring_free_descr_num(core, rxi) ==
((core->mac[E1000_SRRCTL(rxi->idx) >> 2] >> 20) & 31) * 16;
}
+/*
+ * Decide whether the received packet is delivered with header split and
+ * record the split parameters in @pdma_st.
+ *
+ * On the paths that split, pdma_st->ps_hdr_len receives the number of bytes
+ * destined for the header buffer. pdma_st->ps_desc_data (hdr_len, sph, hbo)
+ * receives the values later reported in the write-back descriptor.
+ *
+ * Returns true when the header buffer is used for this packet, false
+ * otherwise (including when the ring does not use packet-split descriptors).
+ */
+static bool
+igb_do_ps(IGBCore *core,
+          const E1000ERingInfo *r,
+          struct NetRxPkt *pkt,
+          IGBPacketRxDMAState *pdma_st)
+{
+    bool hasip4, hasip6;
+    EthL4HdrProto l4hdr_proto;
+    bool header_handled = false;
+    bool always_split;
+    size_t hdr_buf_size;
+    size_t pkt_len;
+
+    if (!igb_rx_use_ps_descriptor(core, r)) {
+        return false;
+    }
+
+    pkt_len = net_rx_pkt_get_total_len(pkt);
+    hdr_buf_size = igb_rxhdrbufsize(core, r);
+    always_split = igb_rx_ps_descriptor_split_always(core, r);
+
+    /* Split-always: a packet that fits the header buffer goes there whole. */
+    if (always_split && pkt_len <= hdr_buf_size) {
+        pdma_st->ps_hdr_len = pkt_len;
+        pdma_st->ps_desc_data.hdr_len = pkt_len;
+        return true;
+    }
+
+    net_rx_pkt_get_protocols(pkt, &hasip4, &hasip6, &l4hdr_proto);
+
+    if (!hasip4 && !hasip6) {
+        /* Neither IPv4 nor IPv6: no header boundary is looked up. */
+        pdma_st->ps_desc_data.hdr_len = hdr_buf_size;
+    } else {
+        bool fragment = hasip4 ? net_rx_pkt_get_ip4_info(pkt)->fragment
+                               : net_rx_pkt_get_ip6_info(pkt)->fragment;
+
+        if (fragment && (core->mac[RFCTL] & E1000_RFCTL_IPFRSP_DIS)) {
+            /* Splitting of IP fragments is disabled through RFCTL. */
+            pdma_st->ps_desc_data.hdr_len = hdr_buf_size;
+        } else {
+            /*
+             * Unfragmented TCP/UDP splits at the L5 offset; everything else
+             * (SCTP included) splits at the L4 offset.
+             */
+            bool split_at_l5 = !fragment &&
+                               (l4hdr_proto == ETH_L4_HDR_PROTO_UDP ||
+                                l4hdr_proto == ETH_L4_HDR_PROTO_TCP);
+
+            pdma_st->ps_hdr_len = split_at_l5
+                                  ? net_rx_pkt_get_l5_hdr_offset(pkt)
+                                  : net_rx_pkt_get_l4_hdr_offset(pkt);
+
+            pdma_st->ps_desc_data.sph = true;
+            pdma_st->ps_desc_data.hdr_len = pdma_st->ps_hdr_len;
+
+            if (pdma_st->ps_hdr_len > hdr_buf_size) {
+                /* Header does not fit the header buffer: flag overflow. */
+                pdma_st->ps_desc_data.hbo = true;
+            } else {
+                header_handled = true;
+            }
+        }
+    }
+
+    if (header_handled) {
+        return true;
+    }
+
+    /* Header was not handled: split-always still fills the header buffer. */
+    if (always_split) {
+        pdma_st->ps_hdr_len = hdr_buf_size;
+        return true;
+    }
+
+    return false;
+}
+
+/*
+ * Clamp *@size to the number of bytes one descriptor can take: the packet
+ * buffer size, plus the split header length when this is the first
+ * descriptor of a header-split packet.
+ */
+static void
+igb_truncate_to_descriptor_size(IGBPacketRxDMAState *pdma_st, size_t *size)
+{
+    size_t limit = pdma_st->rx_desc_packet_buf_size;
+
+    if (pdma_st->do_ps && pdma_st->is_first) {
+        limit += pdma_st->ps_hdr_len;
+    }
+
+    if (*size > limit) {
+        *size = limit;
+    }
+}
+
+/*
+ * DMA @data_len bytes of header data into buffer 0 of the current
+ * descriptor, appending after what was already written there, and make
+ * buffer 1 the current buffer for the data that follows.
+ *
+ * Asserts that the fragment fits in the remaining header buffer space.
+ */
+static inline void
+igb_write_hdr_frag_to_rx_buffers(IGBCore *core,
+                                 PCIDevice *d,
+                                 IGBPacketRxDMAState *pdma_st,
+                                 const char *data,
+                                 dma_addr_t data_len)
+{
+    dma_addr_t dst;
+
+    assert(data_len <= pdma_st->rx_desc_header_buf_size -
+                       pdma_st->bastate.written[0]);
+
+    dst = pdma_st->ba[0] + pdma_st->bastate.written[0];
+    pci_dma_write(d, dst, data, data_len);
+
+    pdma_st->bastate.written[0] += data_len;
+    pdma_st->bastate.cur_idx = 1;
+}
+
+/*
+ * Copy the split header (pdma_st->ps_hdr_len bytes) from the packet iovec
+ * into the header buffer of the first descriptor.
+ *
+ * For any descriptor but the first, nothing is copied and buffer 1 is simply
+ * selected. *@copy_size is reduced by the number of header bytes written, and
+ * the iovec cursor (iov / iov_ofs) is advanced past them.
+ */
+static void
+igb_write_header_to_rx_buffers(IGBCore *core,
+                               struct NetRxPkt *pkt,
+                               PCIDevice *d,
+                               IGBPacketRxDMAState *pdma_st,
+                               size_t *copy_size)
+{
+    size_t hdr_done = 0;
+
+    if (!pdma_st->is_first) {
+        /* Buffer 0 stays empty in every descriptor except the first. */
+        pdma_st->bastate.cur_idx = 1;
+        return;
+    }
+
+    do {
+        size_t hdr_left = pdma_st->ps_hdr_len - hdr_done;
+        size_t iov_left = pdma_st->iov->iov_len - pdma_st->iov_ofs;
+        size_t chunk = MIN(hdr_left, iov_left);
+
+        /* The fragment is read from the start of the current iovec entry. */
+        igb_write_hdr_frag_to_rx_buffers(core, d, pdma_st,
+                                         pdma_st->iov->iov_base,
+                                         chunk);
+
+        hdr_done += chunk;
+        *copy_size -= chunk;
+        pdma_st->iov_ofs += chunk;
+
+        /* Step to the next iovec entry once this one is used up. */
+        if (pdma_st->iov_ofs == pdma_st->iov->iov_len) {
+            pdma_st->iov++;
+            pdma_st->iov_ofs = 0;
+        }
+    } while (hdr_done < pdma_st->ps_hdr_len);
+
+    pdma_st->is_first = false;
+}
+
+/*
+ * DMA @data_len bytes of payload into the descriptor's buffers, starting at
+ * the current buffer (bastate.cur_idx) and moving on to the next buffer each
+ * time one reaches rx_desc_packet_buf_size bytes.
+ *
+ * Asserts that the current buffer index stays below IGB_MAX_PS_BUFFERS.
+ */
+static void
+igb_write_payload_frag_to_rx_buffers(IGBCore *core,
+                                     PCIDevice *d,
+                                     IGBPacketRxDMAState *pdma_st,
+                                     const char *data,
+                                     dma_addr_t data_len)
+{
+    while (data_len > 0) {
+        uint32_t room;
+        uint32_t chunk;
+
+        assert(pdma_st->bastate.cur_idx < IGB_MAX_PS_BUFFERS);
+
+        /* Space left in the current buffer bounds this write. */
+        room = pdma_st->rx_desc_packet_buf_size -
+               pdma_st->bastate.written[pdma_st->bastate.cur_idx];
+        chunk = MIN(data_len, room);
+
+        trace_igb_rx_desc_buff_write(
+            pdma_st->bastate.cur_idx,
+            pdma_st->ba[pdma_st->bastate.cur_idx],
+            pdma_st->bastate.written[pdma_st->bastate.cur_idx],
+            data,
+            chunk);
+
+        pci_dma_write(d,
+                      pdma_st->ba[pdma_st->bastate.cur_idx] +
+                      pdma_st->bastate.written[pdma_st->bastate.cur_idx],
+                      data, chunk);
+
+        pdma_st->bastate.written[pdma_st->bastate.cur_idx] += chunk;
+        data += chunk;
+        data_len -= chunk;
+
+        /* Buffer full: later bytes go to the next buffer. */
+        if (pdma_st->bastate.written[pdma_st->bastate.cur_idx] ==
+            pdma_st->rx_desc_packet_buf_size) {
+            pdma_st->bastate.cur_idx++;
+        }
+    }
+}
+
+/*
+ * Copy *@copy_size bytes of packet data from the iovec cursor into the
+ * descriptor's buffers, then append a zeroed FCS placeholder when this
+ * descriptor is the one that completes the packet.
+ *
+ * *@copy_size is counted down to zero and the iovec cursor (iov / iov_ofs)
+ * is advanced accordingly.
+ */
+static void
+igb_write_payload_to_rx_buffers(IGBCore *core,
+                                struct NetRxPkt *pkt,
+                                PCIDevice *d,
+                                IGBPacketRxDMAState *pdma_st,
+                                size_t *copy_size)
+{
+    static const uint32_t fcs_pad;
+
+    /* Copy packet payload */
+    while (*copy_size) {
+        size_t iov_left = pdma_st->iov->iov_len - pdma_st->iov_ofs;
+        size_t chunk = MIN(*copy_size, iov_left);
+
+        igb_write_payload_frag_to_rx_buffers(core, d, pdma_st,
+                                             pdma_st->iov->iov_base +
+                                             pdma_st->iov_ofs,
+                                             chunk);
+
+        pdma_st->iov_ofs += chunk;
+        *copy_size -= chunk;
+
+        /* Step to the next iovec entry once this one is used up. */
+        if (pdma_st->iov_ofs == pdma_st->iov->iov_len) {
+            pdma_st->iov++;
+            pdma_st->iov_ofs = 0;
+        }
+    }
+
+    if (pdma_st->desc_offset + pdma_st->desc_size >= pdma_st->total_size) {
+        /* Simulate FCS checksum presence in the last descriptor */
+        igb_write_payload_frag_to_rx_buffers(core, d, pdma_st,
+                                             (const char *) &fcs_pad,
+                                             e1000x_fcs_len(core->mac));
+    }
+}
+
+/*
+ * Write the part of the packet that belongs to the current descriptor into
+ * the guest buffers described by @pdma_st.
+ *
+ * Nothing is written, and pdma_st->desc_size is left untouched, when a
+ * required buffer address is null or when all packet bytes have already been
+ * written. Otherwise desc_size is set to the number of bytes (FCS included)
+ * attributed to this descriptor.
+ */
+static void
+igb_write_to_rx_buffers(IGBCore *core,
+                        struct NetRxPkt *pkt,
+                        PCIDevice *d,
+                        IGBPacketRxDMAState *pdma_st)
+{
+    size_t to_copy;
+    bool pkt_buf_missing = !pdma_st->ba[1];
+    bool hdr_buf_missing = pdma_st->do_ps && !pdma_st->ba[0];
+
+    if (pkt_buf_missing || hdr_buf_missing) {
+        /* as per intel docs; skip descriptors with null buf addr */
+        trace_e1000e_rx_null_descriptor();
+        return;
+    }
+
+    if (pdma_st->desc_offset >= pdma_st->size) {
+        return;
+    }
+
+    /* desc_size counts the FCS, the copy size counts packet bytes only. */
+    pdma_st->desc_size = pdma_st->total_size - pdma_st->desc_offset;
+    igb_truncate_to_descriptor_size(pdma_st, &pdma_st->desc_size);
+    to_copy = pdma_st->size - pdma_st->desc_offset;
+    igb_truncate_to_descriptor_size(pdma_st, &to_copy);
+
+    if (!pdma_st->do_ps) {
+        pdma_st->bastate.cur_idx = 1;
+    } else {
+        /* For PS mode copy the packet header first */
+        igb_write_header_to_rx_buffers(core, pkt, d, pdma_st, &to_copy);
+    }
+
+    igb_write_payload_to_rx_buffers(core, pkt, d, pdma_st, &to_copy);
+}
+
static void
igb_write_packet_to_guest(IGBCore *core, struct NetRxPkt *pkt,
const E1000E_RxRing *rxr,
@@ -1560,95 +1918,61 @@ igb_write_packet_to_guest(IGBCore *core, struct NetRxPkt *pkt,
PCIDevice *d;
dma_addr_t base;
union e1000_rx_desc_union desc;
- size_t desc_size;
- size_t desc_offset = 0;
- size_t iov_ofs = 0;
-
- struct iovec *iov = net_rx_pkt_get_iovec(pkt);
- size_t size = net_rx_pkt_get_total_len(pkt);
- size_t total_size = size + e1000x_fcs_len(core->mac);
- const E1000E_RingInfo *rxi = rxr->i;
- size_t bufsize = igb_rxbufsize(core, rxi);
-
+ const E1000ERingInfo *rxi;
+ size_t rx_desc_len;
+
+ IGBPacketRxDMAState pdma_st = {0};
+ pdma_st.is_first = true;
+ pdma_st.size = net_rx_pkt_get_total_len(pkt);
+ pdma_st.total_size = pdma_st.size + e1000x_fcs_len(core->mac);
+
+ rxi = rxr->i;
+ rx_desc_len = core->rx_desc_len;
+ pdma_st.rx_desc_packet_buf_size = igb_rxbufsize(core, rxi);
+ pdma_st.rx_desc_header_buf_size = igb_rxhdrbufsize(core, rxi);
+ pdma_st.iov = net_rx_pkt_get_iovec(pkt);
d = pcie_sriov_get_vf_at_index(core->owner, rxi->idx % 8);
if (!d) {
d = core->owner;
}
+ pdma_st.do_ps = igb_do_ps(core, rxi, pkt, &pdma_st);
+
do {
- hwaddr ba;
- uint16_t written = 0;
+ memset(&pdma_st.bastate, 0, sizeof(IGBBAState));
bool is_last = false;
- desc_size = total_size - desc_offset;
-
- if (desc_size > bufsize) {
- desc_size = bufsize;
- }
-
if (igb_ring_empty(core, rxi)) {
return;
}
base = igb_ring_head_descr(core, rxi);
+ pci_dma_read(d, base, &desc, rx_desc_len);
+ trace_e1000e_rx_descr(rxi->idx, base, rx_desc_len);
- pci_dma_read(d, base, &desc, core->rx_desc_len);
-
- trace_e1000e_rx_descr(rxi->idx, base, core->rx_desc_len);
-
- igb_read_rx_descr(core, &desc, &ba);
-
- if (ba) {
- if (desc_offset < size) {
- static const uint32_t fcs_pad;
- size_t iov_copy;
- size_t copy_size = size - desc_offset;
- if (copy_size > bufsize) {
- copy_size = bufsize;
- }
-
- /* Copy packet payload */
- while (copy_size) {
- iov_copy = MIN(copy_size, iov->iov_len - iov_ofs);
+ igb_read_rx_descr(core, &desc, &pdma_st, rxi);
- igb_write_to_rx_buffers(core, d, ba, &written,
- iov->iov_base + iov_ofs, iov_copy);
-
- copy_size -= iov_copy;
- iov_ofs += iov_copy;
- if (iov_ofs == iov->iov_len) {
- iov++;
- iov_ofs = 0;
- }
- }
-
- if (desc_offset + desc_size >= total_size) {
- /* Simulate FCS checksum presence in the last descriptor */
- igb_write_to_rx_buffers(core, d, ba, &written,
- (const char *) &fcs_pad, e1000x_fcs_len(core->mac));
- }
- }
- } else { /* as per intel docs; skip descriptors with null buf addr */
- trace_e1000e_rx_null_descriptor();
- }
- desc_offset += desc_size;
- if (desc_offset >= total_size) {
+ igb_write_to_rx_buffers(core, pkt, d, &pdma_st);
+ pdma_st.desc_offset += pdma_st.desc_size;
+ if (pdma_st.desc_offset >= pdma_st.total_size) {
is_last = true;
}
- igb_write_rx_descr(core, &desc, is_last ? core->rx_pkt : NULL,
- rss_info, etqf, ts, written);
- igb_pci_dma_write_rx_desc(core, d, base, &desc, core->rx_desc_len);
-
- igb_ring_advance(core, rxi, core->rx_desc_len / E1000_MIN_RX_DESC_LEN);
-
- } while (desc_offset < total_size);
+ igb_write_rx_descr(core, &desc,
+ is_last ? pkt : NULL,
+ rss_info,
+ etqf, ts,
+ &pdma_st,
+ rxi);
+ igb_pci_dma_write_rx_desc(core, d, base, &desc, rx_desc_len);
+ igb_ring_advance(core, rxi, rx_desc_len / E1000_MIN_RX_DESC_LEN);
+ } while (pdma_st.desc_offset < pdma_st.total_size);
- igb_update_rx_stats(core, rxi, size, total_size);
+ igb_update_rx_stats(core, rxi, pdma_st.size, pdma_st.total_size);
}
static bool
-igb_rx_strip_vlan(IGBCore *core, const E1000E_RingInfo *rxi)
+igb_rx_strip_vlan(IGBCore *core, const E1000ERingInfo *rxi)
{
if (core->mac[MRQC] & 1) {
uint16_t pool = rxi->idx % IGB_NUM_VM_POOLS;
@@ -2753,7 +3077,7 @@ igb_update_rx_offloads(IGBCore *core)
if (core->has_vnet) {
qemu_set_offload(qemu_get_queue(core->owner_nic)->peer,
- cso_state, 0, 0, 0, 0);
+ cso_state, 0, 0, 0, 0, 0, 0);
}
}
diff --git a/hw/net/igb_regs.h b/hw/net/igb_regs.h
index 82ff195dfc..ed7427b8fe 100644
--- a/hw/net/igb_regs.h
+++ b/hw/net/igb_regs.h
@@ -452,6 +452,7 @@ union e1000_adv_rx_desc {
#define E1000_SRRCTL_BSIZEHDRSIZE_MASK 0x00000F00
#define E1000_SRRCTL_BSIZEHDRSIZE_SHIFT 2 /* Shift _left_ */
#define E1000_SRRCTL_DESCTYPE_ADV_ONEBUF 0x02000000
+#define E1000_SRRCTL_DESCTYPE_HDR_SPLIT 0x04000000
#define E1000_SRRCTL_DESCTYPE_HDR_SPLIT_ALWAYS 0x0A000000
#define E1000_SRRCTL_DESCTYPE_MASK 0x0E000000
#define E1000_SRRCTL_DROP_EN 0x80000000
@@ -692,11 +693,20 @@ union e1000_adv_rx_desc {
#define E1000_STATUS_NUM_VFS_SHIFT 14
-#define E1000_ADVRXD_PKT_IP4 BIT(4)
-#define E1000_ADVRXD_PKT_IP6 BIT(6)
-#define E1000_ADVRXD_PKT_TCP BIT(8)
-#define E1000_ADVRXD_PKT_UDP BIT(9)
-#define E1000_ADVRXD_PKT_SCTP BIT(10)
+#define E1000_ADVRXD_PKT_IP4 BIT(0)
+#define E1000_ADVRXD_PKT_IP6 BIT(2)
+#define E1000_ADVRXD_PKT_IP6E BIT(3)
+#define E1000_ADVRXD_PKT_TCP BIT(4)
+#define E1000_ADVRXD_PKT_UDP BIT(5)
+#define E1000_ADVRXD_PKT_SCTP BIT(6)
+
+#define IGB_MAX_PS_BUFFERS 2
+
+#define E1000_ADVRXD_HDR_LEN_OFFSET (21 - 16)
+#define E1000_ADVRXD_ADV_HDR_LEN_MASK ((BIT(10) - 1) << \
+ E1000_ADVRXD_HDR_LEN_OFFSET)
+#define E1000_ADVRXD_HDR_SPH BIT(15)
+#define E1000_ADVRXD_ST_ERR_HBO_OFFSET BIT(3 + 20)
static inline uint8_t igb_ivar_entry_rx(uint8_t i)
{
diff --git a/hw/net/rocker/rocker_of_dpa.c b/hw/net/rocker/rocker_of_dpa.c
index dfe4754469..5e16056be6 100644
--- a/hw/net/rocker/rocker_of_dpa.c
+++ b/hw/net/rocker/rocker_of_dpa.c
@@ -1043,7 +1043,7 @@ static void of_dpa_flow_ig_tbl(OfDpaFlowContext *fc, uint32_t tbl_id)
static ssize_t of_dpa_ig(World *world, uint32_t pport,
const struct iovec *iov, int iovcnt)
{
- struct iovec iov_copy[iovcnt + 2];
+ g_autofree struct iovec *iov_copy = g_new(struct iovec, iovcnt + 2);
OfDpaFlowContext fc = {
.of_dpa = world_private(world),
.in_pport = pport,
diff --git a/hw/net/trace-events b/hw/net/trace-events
index 6b5ba669a2..3abfd65e5b 100644
--- a/hw/net/trace-events
+++ b/hw/net/trace-events
@@ -278,9 +278,9 @@ igb_core_mdic_write_unhandled(uint32_t addr) "MDIC WRITE: PHY[%u] UNHANDLED"
igb_link_set_ext_params(bool asd_check, bool speed_select_bypass, bool pfrstd) "Set extended link params: ASD check: %d, Speed select bypass: %d, PF reset done: %d"
igb_rx_desc_buff_size(uint32_t b) "buffer size: %u"
-igb_rx_desc_buff_write(uint64_t addr, uint16_t offset, const void* source, uint32_t len) "addr: 0x%"PRIx64", offset: %u, from: %p, length: %u"
+igb_rx_desc_buff_write(uint8_t idx, uint64_t addr, uint16_t offset, const void* source, uint32_t len) "buffer %u, addr: 0x%"PRIx64", offset: %u, from: %p, length: %u"
-igb_rx_metadata_rss(uint32_t rss) "RSS data: 0x%X"
+igb_rx_metadata_rss(uint32_t rss, uint16_t rss_pkt_type) "RSS data: rss: 0x%X, rss_pkt_type: 0x%X"
igb_irq_icr_clear_gpie_nsicr(void) "Clearing ICR on read due to GPIE.NSICR enabled"
igb_irq_set_iam(uint32_t icr) "Update IAM: 0x%x"
@@ -295,6 +295,8 @@ igb_irq_eitr_set(uint32_t eitr_num, uint32_t val) "EITR[%u] = 0x%x"
igb_set_pfmailbox(uint32_t vf_num, uint32_t val) "PFMailbox[%d]: 0x%x"
igb_set_vfmailbox(uint32_t vf_num, uint32_t val) "VFMailbox[%d]: 0x%x"
+igb_wrn_rx_desc_modes_not_supp(int desc_type) "Not supported descriptor type: %d"
+
# igbvf.c
igbvf_wrn_io_addr_unknown(uint64_t addr) "IO unknown register 0x%"PRIx64
diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c
index 6b958d6363..57427a3997 100644
--- a/hw/net/vhost_net.c
+++ b/hw/net/vhost_net.c
@@ -78,6 +78,9 @@ static const int user_feature_bits[] = {
VIRTIO_F_RING_RESET,
VIRTIO_NET_F_RSS,
VIRTIO_NET_F_HASH_REPORT,
+ VIRTIO_NET_F_GUEST_USO4,
+ VIRTIO_NET_F_GUEST_USO6,
+ VIRTIO_NET_F_HOST_USO,
/* This bit implies RARP isn't sent by QEMU out of band */
VIRTIO_NET_F_GUEST_ANNOUNCE,
diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index 7102ec4817..bd0ead94fe 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -659,6 +659,15 @@ static int peer_has_ufo(VirtIONet *n)
return n->has_ufo;
}
+static int peer_has_uso(VirtIONet *n)
+{
+ if (!peer_has_vnet_hdr(n)) {
+ return 0;
+ }
+
+ return qemu_has_uso(qemu_get_queue(n->nic)->peer);
+}
+
static void virtio_net_set_mrg_rx_bufs(VirtIONet *n, int mergeable_rx_bufs,
int version_1, int hash_report)
{
@@ -796,6 +805,10 @@ static uint64_t virtio_net_get_features(VirtIODevice *vdev, uint64_t features,
virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_TSO6);
virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_ECN);
+ virtio_clear_feature(&features, VIRTIO_NET_F_HOST_USO);
+ virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO4);
+ virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO6);
+
virtio_clear_feature(&features, VIRTIO_NET_F_HASH_REPORT);
}
@@ -804,6 +817,12 @@ static uint64_t virtio_net_get_features(VirtIODevice *vdev, uint64_t features,
virtio_clear_feature(&features, VIRTIO_NET_F_HOST_UFO);
}
+ if (!peer_has_uso(n)) {
+ virtio_clear_feature(&features, VIRTIO_NET_F_HOST_USO);
+ virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO4);
+ virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO6);
+ }
+
if (!get_vhost_net(nc->peer)) {
return features;
}
@@ -859,17 +878,21 @@ static void virtio_net_apply_guest_offloads(VirtIONet *n)
!!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_TSO4)),
!!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_TSO6)),
!!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_ECN)),
- !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_UFO)));
+ !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_UFO)),
+ !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_USO4)),
+ !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_USO6)));
}
-static uint64_t virtio_net_guest_offloads_by_features(uint32_t features)
+static uint64_t virtio_net_guest_offloads_by_features(uint64_t features)
{
static const uint64_t guest_offloads_mask =
(1ULL << VIRTIO_NET_F_GUEST_CSUM) |
(1ULL << VIRTIO_NET_F_GUEST_TSO4) |
(1ULL << VIRTIO_NET_F_GUEST_TSO6) |
(1ULL << VIRTIO_NET_F_GUEST_ECN) |
- (1ULL << VIRTIO_NET_F_GUEST_UFO);
+ (1ULL << VIRTIO_NET_F_GUEST_UFO) |
+ (1ULL << VIRTIO_NET_F_GUEST_USO4) |
+ (1ULL << VIRTIO_NET_F_GUEST_USO6);
return guest_offloads_mask & features;
}
@@ -3922,6 +3945,12 @@ static Property virtio_net_properties[] = {
DEFINE_PROP_INT32("speed", VirtIONet, net_conf.speed, SPEED_UNKNOWN),
DEFINE_PROP_STRING("duplex", VirtIONet, net_conf.duplex_str),
DEFINE_PROP_BOOL("failover", VirtIONet, failover, false),
+ DEFINE_PROP_BIT64("guest_uso4", VirtIONet, host_features,
+ VIRTIO_NET_F_GUEST_USO4, true),
+ DEFINE_PROP_BIT64("guest_uso6", VirtIONet, host_features,
+ VIRTIO_NET_F_GUEST_USO6, true),
+ DEFINE_PROP_BIT64("host_uso", VirtIONet, host_features,
+ VIRTIO_NET_F_HOST_USO, true),
DEFINE_PROP_END_OF_LIST(),
};
diff --git a/hw/net/vmxnet3.c b/hw/net/vmxnet3.c
index 3fb108751a..226c0777f0 100644
--- a/hw/net/vmxnet3.c
+++ b/hw/net/vmxnet3.c
@@ -1341,6 +1341,8 @@ static void vmxnet3_update_features(VMXNET3State *s)
s->lro_supported,
s->lro_supported,
0,
+ 0,
+ 0,
0);
}
}
diff --git a/include/net/net.h b/include/net/net.h
index 1448d00afb..330d285930 100644
--- a/include/net/net.h
+++ b/include/net/net.h
@@ -54,11 +54,12 @@ typedef void (LinkStatusChanged)(NetClientState *);
typedef void (NetClientDestructor)(NetClientState *);
typedef RxFilterInfo *(QueryRxFilter)(NetClientState *);
typedef bool (HasUfo)(NetClientState *);
+typedef bool (HasUso)(NetClientState *);
typedef bool (HasVnetHdr)(NetClientState *);
typedef bool (HasVnetHdrLen)(NetClientState *, int);
typedef bool (GetUsingVnetHdr)(NetClientState *);
typedef void (UsingVnetHdr)(NetClientState *, bool);
-typedef void (SetOffload)(NetClientState *, int, int, int, int, int);
+typedef void (SetOffload)(NetClientState *, int, int, int, int, int, int, int);
typedef int (GetVnetHdrLen)(NetClientState *);
typedef void (SetVnetHdrLen)(NetClientState *, int);
typedef int (SetVnetLE)(NetClientState *, bool);
@@ -84,6 +85,7 @@ typedef struct NetClientInfo {
QueryRxFilter *query_rx_filter;
NetPoll *poll;
HasUfo *has_ufo;
+ HasUso *has_uso;
HasVnetHdr *has_vnet_hdr;
HasVnetHdrLen *has_vnet_hdr_len;
GetUsingVnetHdr *get_using_vnet_hdr;
@@ -187,12 +189,13 @@ void qemu_set_info_str(NetClientState *nc,
const char *fmt, ...) G_GNUC_PRINTF(2, 3);
void qemu_format_nic_info_str(NetClientState *nc, uint8_t macaddr[6]);
bool qemu_has_ufo(NetClientState *nc);
+bool qemu_has_uso(NetClientState *nc);
bool qemu_has_vnet_hdr(NetClientState *nc);
bool qemu_has_vnet_hdr_len(NetClientState *nc, int len);
bool qemu_get_using_vnet_hdr(NetClientState *nc);
void qemu_using_vnet_hdr(NetClientState *nc, bool enable);
void qemu_set_offload(NetClientState *nc, int csum, int tso4, int tso6,
- int ecn, int ufo);
+ int ecn, int ufo, int uso4, int uso6);
int qemu_get_vnet_hdr_len(NetClientState *nc);
void qemu_set_vnet_hdr_len(NetClientState *nc, int len);
int qemu_set_vnet_le(NetClientState *nc, bool is_le);
diff --git a/meson.build b/meson.build
index 5150a74831..f426861d90 100644
--- a/meson.build
+++ b/meson.build
@@ -1873,6 +1873,13 @@ if libbpf.found() and not cc.links('''
endif
endif
+# libxdp
+libxdp = not_found
+if not get_option('af_xdp').auto() or have_system
+ libxdp = dependency('libxdp', required: get_option('af_xdp'),
+ version: '>=1.4.0', method: 'pkg-config')
+endif
+
# libdw
libdw = not_found
if not get_option('libdw').auto() or \
@@ -2099,6 +2106,7 @@ config_host_data.set('CONFIG_HEXAGON_IDEF_PARSER', get_option('hexagon_idef_pars
config_host_data.set('CONFIG_LIBATTR', have_old_libattr)
config_host_data.set('CONFIG_LIBCAP_NG', libcap_ng.found())
config_host_data.set('CONFIG_EBPF', libbpf.found())
+config_host_data.set('CONFIG_AF_XDP', libxdp.found())
config_host_data.set('CONFIG_LIBDAXCTL', libdaxctl.found())
config_host_data.set('CONFIG_LIBISCSI', libiscsi.found())
config_host_data.set('CONFIG_LIBNFS', libnfs.found())
@@ -4270,6 +4278,7 @@ summary_info = {}
if targetos == 'darwin'
summary_info += {'vmnet.framework support': vmnet}
endif
+summary_info += {'AF_XDP support': libxdp}
summary_info += {'slirp support': slirp}
summary_info += {'vde support': vde}
summary_info += {'netmap support': have_netmap}
diff --git a/meson_options.txt b/meson_options.txt
index f82d88b7c6..2ca40f22e9 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -122,6 +122,8 @@ option('avx512bw', type: 'feature', value: 'auto',
option('keyring', type: 'feature', value: 'auto',
description: 'Linux keyring support')
+option('af_xdp', type : 'feature', value : 'auto',
+ description: 'AF_XDP network backend support')
option('attr', type : 'feature', value : 'auto',
description: 'attr/xattr support')
option('auth_pam', type : 'feature', value : 'auto',
diff --git a/net/af-xdp.c b/net/af-xdp.c
new file mode 100644
index 0000000000..6c65028fb0
--- /dev/null
+++ b/net/af-xdp.c
@@ -0,0 +1,526 @@
+/*
+ * AF_XDP network backend.
+ *
+ * Copyright (c) 2023 Red Hat, Inc.
+ *
+ * Authors:
+ * Ilya Maximets <i.maximets@ovn.org>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+
+#include "qemu/osdep.h"
+#include <bpf/bpf.h>
+#include <inttypes.h>
+#include <linux/if_link.h>
+#include <linux/if_xdp.h>
+#include <net/if.h>
+#include <xdp/xsk.h>
+
+#include "clients.h"
+#include "monitor/monitor.h"
+#include "net/net.h"
+#include "qapi/error.h"
+#include "qemu/cutils.h"
+#include "qemu/error-report.h"
+#include "qemu/iov.h"
+#include "qemu/main-loop.h"
+#include "qemu/memalign.h"
+
+
+typedef struct AFXDPState { /* State of one af-xdp net client (one per queue). */
+ NetClientState nc; /* embedded client; DO_UPCAST() recovers AFXDPState from it */
+
+ struct xsk_socket *xsk; /* AF_XDP socket, created in af_xdp_socket_create() */
+ struct xsk_ring_cons rx; /* Rx ring: packets from the interface */
+ struct xsk_ring_prod tx; /* Tx ring: packets queued by af_xdp_receive() */
+ struct xsk_ring_cons cq; /* completion ring: frames whose Tx has finished */
+ struct xsk_ring_prod fq; /* fill ring: free frames handed over for Rx */
+
+ char ifname[IFNAMSIZ]; /* copy of the 'ifname' option */
+ int ifindex; /* index from if_nametoindex(ifname) */
+ bool read_poll; /* af_xdp_send is installed as fd read handler */
+ bool write_poll; /* af_xdp_writable is installed as fd write handler */
+ uint32_t outstanding_tx; /* frames submitted to Tx and not yet completed */
+
+ uint64_t *pool; /* stack (LIFO) of free umem frame addresses */
+ uint32_t n_pool; /* number of valid entries in pool */
+ char *buffer; /* page-aligned memory backing the umem */
+ struct xsk_umem *umem; /* umem registered over buffer */
+
+ uint32_t n_queues; /* queue count; cleanup uses it to find the last queue */
+ uint32_t xdp_flags; /* XDP flags the socket was created with */
+ bool inhibit; /* 'inhibit' option: XDP program loading is inhibited */
+} AFXDPState;
+
+#define AF_XDP_BATCH_SIZE 64
+
+static void af_xdp_send(void *opaque);
+static void af_xdp_writable(void *opaque);
+
+/*
+ * Register the socket fd callbacks that match the current poll flags:
+ * af_xdp_send for reads when read_poll is set, af_xdp_writable for writes
+ * when write_poll is set, NULL otherwise.
+ */
+static void af_xdp_update_fd_handler(AFXDPState *s)
+{
+    void (*on_read)(void *) = NULL;
+    void (*on_write)(void *) = NULL;
+
+    if (s->read_poll) {
+        on_read = af_xdp_send;
+    }
+    if (s->write_poll) {
+        on_write = af_xdp_writable;
+    }
+
+    qemu_set_fd_handler(xsk_socket__fd(s->xsk), on_read, on_write, s);
+}
+
+/* Turn read polling on or off; handlers are re-registered only on change. */
+static void af_xdp_read_poll(AFXDPState *s, bool enable)
+{
+    if (s->read_poll == enable) {
+        return;
+    }
+
+    s->read_poll = enable;
+    af_xdp_update_fd_handler(s);
+}
+
+/* Turn write polling on or off; handlers are re-registered only on change. */
+static void af_xdp_write_poll(AFXDPState *s, bool enable)
+{
+    if (s->write_poll == enable) {
+        return;
+    }
+
+    s->write_poll = enable;
+    af_xdp_update_fd_handler(s);
+}
+
+/*
+ * NetClientInfo.poll callback: set read and write polling to @enable
+ * together, re-registering the fd handlers only if either flag changes.
+ */
+static void af_xdp_poll(NetClientState *nc, bool enable)
+{
+    AFXDPState *s = DO_UPCAST(AFXDPState, nc, nc);
+
+    if (s->read_poll == enable && s->write_poll == enable) {
+        return;
+    }
+
+    s->read_poll = enable;
+    s->write_poll = enable;
+    af_xdp_update_fd_handler(s);
+}
+
+/*
+ * Reap the completion ring: push the address of every completed Tx frame
+ * back onto the free pool and lower outstanding_tx by the same count.
+ */
+static void af_xdp_complete_tx(AFXDPState *s)
+{
+    uint32_t first = 0;
+    uint32_t n_done, k;
+
+    n_done = xsk_ring_cons__peek(&s->cq, XSK_RING_CONS__DEFAULT_NUM_DESCS,
+                                 &first);
+    if (!n_done) {
+        return;
+    }
+
+    for (k = 0; k < n_done; k++) {
+        s->pool[s->n_pool++] = *xsk_ring_cons__comp_addr(&s->cq, first + k);
+    }
+    s->outstanding_tx -= n_done;
+
+    xsk_ring_cons__release(&s->cq, n_done);
+}
+
+/*
+ * fd write callback, installed while write_poll is set.
+ *
+ * Reclaims completed Tx frames, drops write polling unless frames are still
+ * in flight and the Tx ring asks for a wakeup, then flushes the packets
+ * queued for this client.
+ */
+static void af_xdp_writable(void *opaque)
+{
+    AFXDPState *s = opaque;
+    bool keep_polling;
+
+    /* Try to recover buffers that are already sent. */
+    af_xdp_complete_tx(s);
+
+    /* Stay registered only while Tx is pending and the kernel needs a kick. */
+    keep_polling = s->outstanding_tx && xsk_ring_prod__needs_wakeup(&s->tx);
+    if (!keep_polling) {
+        af_xdp_write_poll(s, false);
+    }
+
+    /* Flush any buffered packets. */
+    qemu_flush_queued_packets(&s->nc);
+}
+
+/*
+ * NetClientInfo.receive callback: copy one packet into a free umem frame and
+ * submit it on the Tx ring.
+ *
+ * Returns @size when the packet was submitted, and also when it is larger
+ * than a umem frame (the packet is then not transmitted). Returns 0 when no
+ * free frame or Tx slot is available; write polling is enabled in that case.
+ */
+static ssize_t af_xdp_receive(NetClientState *nc,
+                              const uint8_t *buf, size_t size)
+{
+    AFXDPState *s = DO_UPCAST(AFXDPState, nc, nc);
+    struct xdp_desc *tx_desc;
+    uint32_t slot;
+
+    /* Try to recover buffers that are already sent. */
+    af_xdp_complete_tx(s);
+
+    /* A packet that does not fit one frame cannot be transmitted. */
+    if (size > XSK_UMEM__DEFAULT_FRAME_SIZE) {
+        return size;
+    }
+
+    if (!s->n_pool || !xsk_ring_prod__reserve(&s->tx, 1, &slot)) {
+        /*
+         * No free frame or no room in the Tx ring: poll for writability.
+         * This also kicks Tx in case it was waiting on the completion ring.
+         */
+        af_xdp_write_poll(s, true);
+        return 0;
+    }
+
+    tx_desc = xsk_ring_prod__tx_desc(&s->tx, slot);
+    tx_desc->addr = s->pool[--s->n_pool];
+    tx_desc->len = size;
+    memcpy(xsk_umem__get_data(s->buffer, tx_desc->addr), buf, size);
+
+    xsk_ring_prod__submit(&s->tx, 1);
+    s->outstanding_tx++;
+
+    if (xsk_ring_prod__needs_wakeup(&s->tx)) {
+        af_xdp_write_poll(s, true);
+    }
+
+    return size;
+}
+
+/*
+ * Completion callback passed to qemu_sendv_packet_async(): the queued packet
+ * has been delivered, so reading from the socket is enabled again.
+ * @len is not used.
+ */
+static void af_xdp_send_completed(NetClientState *nc, ssize_t len)
+{
+    AFXDPState *xdp = DO_UPCAST(AFXDPState, nc, nc);
+
+    af_xdp_read_poll(xdp, true);
+}
+
+/*
+ * Move up to @n free frames from the pool onto the fill ring.
+ *
+ * If the pool holds @n frames or fewer, the request is clamped to the pool
+ * size; otherwise at least one frame stays in the pool for Tx. Nothing is
+ * moved when the pool is empty or the ring cannot reserve the slots.
+ */
+static void af_xdp_fq_refill(AFXDPState *s, uint32_t n)
+{
+    uint32_t first = 0;
+    uint32_t k;
+
+    if (s->n_pool < n + 1) {
+        n = s->n_pool;
+    }
+
+    if (!n) {
+        return;
+    }
+    if (!xsk_ring_prod__reserve(&s->fq, n, &first)) {
+        return;
+    }
+
+    for (k = 0; k < n; k++) {
+        *xsk_ring_prod__fill_addr(&s->fq, first + k) = s->pool[--s->n_pool];
+    }
+    xsk_ring_prod__submit(&s->fq, n);
+
+    if (xsk_ring_prod__needs_wakeup(&s->fq)) {
+        /* Receive was blocked by not having enough buffers. Wake it up. */
+        af_xdp_read_poll(s, true);
+    }
+}
+
+/*
+ * fd read callback: hand up to AF_XDP_BATCH_SIZE received packets to the
+ * peer, return their frames to the pool and refill the fill ring.
+ *
+ * If the peer queues a packet instead of taking it, read polling is turned
+ * off until af_xdp_send_completed() runs, and the descriptors not yet
+ * processed are given back to the Rx ring.
+ */
+static void af_xdp_send(void *opaque)
+{
+    AFXDPState *s = opaque;
+    uint32_t first = 0;
+    uint32_t peeked;
+    uint32_t consumed = 0;
+
+    peeked = xsk_ring_cons__peek(&s->rx, AF_XDP_BATCH_SIZE, &first);
+    if (!peeked) {
+        return;
+    }
+
+    while (consumed < peeked) {
+        const struct xdp_desc *desc =
+            xsk_ring_cons__rx_desc(&s->rx, first + consumed);
+        struct iovec iov = {
+            .iov_base = xsk_umem__get_data(s->buffer, desc->addr),
+            .iov_len = desc->len,
+        };
+
+        /* The frame goes back to the pool before the packet is sent. */
+        s->pool[s->n_pool++] = desc->addr;
+        consumed++;
+
+        if (!qemu_sendv_packet_async(&s->nc, &iov, 1,
+                                     af_xdp_send_completed)) {
+            /*
+             * The peer does not receive anymore. Packet is queued, stop
+             * reading from the backend until af_xdp_send_completed().
+             */
+            af_xdp_read_poll(s, false);
+
+            /* Return unused descriptors to not break the ring cache. */
+            xsk_ring_cons__cancel(&s->rx, peeked - consumed);
+            break;
+        }
+    }
+
+    /* Release actually sent descriptors and try to re-fill. */
+    xsk_ring_cons__release(&s->rx, consumed);
+    af_xdp_fq_refill(s, AF_XDP_BATCH_SIZE);
+}
+
+/*
+ * NetClientInfo.cleanup callback: drop queued packets, stop polling and
+ * release the socket, frame pool, umem and packet memory of this queue.
+ *
+ * The XDP program is detached from the interface only by the last queue,
+ * only when program loading was not inhibited, and only if the socket had
+ * been created (xdp_flags is non-zero).
+ */
+static void af_xdp_cleanup(NetClientState *nc)
+{
+    AFXDPState *s = DO_UPCAST(AFXDPState, nc, nc);
+    bool last_queue;
+
+    qemu_purge_queued_packets(nc);
+
+    af_xdp_poll(nc, false);
+
+    xsk_socket__delete(s->xsk);
+    s->xsk = NULL;
+    g_free(s->pool);
+    s->pool = NULL;
+    xsk_umem__delete(s->umem);
+    s->umem = NULL;
+    qemu_vfree(s->buffer);
+    s->buffer = NULL;
+
+    /* Remove the program if it's the last open queue. */
+    last_queue = nc->queue_index == s->n_queues - 1;
+    if (s->inhibit || !last_queue || !s->xdp_flags) {
+        return;
+    }
+
+    if (bpf_xdp_detach(s->ifindex, s->xdp_flags, NULL) != 0) {
+        fprintf(stderr,
+                "af-xdp: unable to remove XDP program from '%s', ifindex: %d\n",
+                s->ifname, s->ifindex);
+    }
+}
+
+/*
+ * Allocate the packet memory of one queue, register it as a umem and build
+ * the pool of free frames.
+ *
+ * @sock_fd: a socket fd to create the umem with, or a negative value to let
+ *           xsk_umem__create() open one.
+ *
+ * On success s->buffer, s->umem, s->pool and s->n_pool are initialized and
+ * the fill ring is pre-filled. On failure s->buffer is freed and reset to
+ * NULL so that af_xdp_cleanup() does not free it a second time.
+ *
+ * Returns 0 on success, -1 with @errp set on failure.
+ */
+static int af_xdp_umem_create(AFXDPState *s, int sock_fd, Error **errp)
+{
+    struct xsk_umem_config config = {
+        .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
+        .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
+        .frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE,
+        .frame_headroom = 0,
+    };
+    uint64_t n_descs;
+    uint64_t size;
+    int64_t i;
+    int ret;
+
+    /* Number of descriptors if all 4 queues (rx, tx, cq, fq) are full. */
+    n_descs = (XSK_RING_PROD__DEFAULT_NUM_DESCS
+               + XSK_RING_CONS__DEFAULT_NUM_DESCS) * 2;
+    size = n_descs * XSK_UMEM__DEFAULT_FRAME_SIZE;
+
+    s->buffer = qemu_memalign(qemu_real_host_page_size(), size);
+    memset(s->buffer, 0, size);
+
+    if (sock_fd < 0) {
+        ret = xsk_umem__create(&s->umem, s->buffer, size,
+                               &s->fq, &s->cq, &config);
+    } else {
+        ret = xsk_umem__create_with_fd(&s->umem, sock_fd, s->buffer, size,
+                                       &s->fq, &s->cq, &config);
+    }
+
+    if (ret) {
+        /* Read errno before freeing so the report cannot be clobbered. */
+        int saved_errno = errno;
+
+        qemu_vfree(s->buffer);
+        /* af_xdp_cleanup() frees s->buffer too; do not leave it dangling. */
+        s->buffer = NULL;
+        error_setg_errno(errp, saved_errno,
+                         "failed to create umem for %s queue_index: %d",
+                         s->ifname, s->nc.queue_index);
+        return -1;
+    }
+
+    s->pool = g_new(uint64_t, n_descs);
+    /*
+     * Fill the pool in the opposite order, because it's a LIFO queue.
+     * Valid indices are 0 .. n_descs - 1.
+     */
+    for (i = n_descs - 1; i >= 0; i--) {
+        s->pool[i] = i * XSK_UMEM__DEFAULT_FRAME_SIZE;
+    }
+    s->n_pool = n_descs;
+
+    af_xdp_fq_refill(s, XSK_RING_PROD__DEFAULT_NUM_DESCS);
+
+    return 0;
+}
+
+/*
+ * Create the AF_XDP socket of one queue on top of the already created umem.
+ *
+ * The queue id is the client's queue index, offset by the 'start-queue'
+ * option when that is given and positive. If the 'mode' option is given,
+ * only that mode (native or skb) is tried; otherwise native mode is tried
+ * first and skb mode is the fallback.
+ *
+ * On success the XDP flags in use are stored in s->xdp_flags.
+ * Returns 0 on success, -1 with @errp set on failure.
+ */
+static int af_xdp_socket_create(AFXDPState *s,
+                                const NetdevAFXDPOptions *opts, Error **errp)
+{
+    struct xsk_socket_config cfg = {
+        .rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
+        .tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
+        .libxdp_flags = 0,
+        .bind_flags = XDP_USE_NEED_WAKEUP,
+        .xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST,
+    };
+    int queue_id = s->nc.queue_index;
+    int error;
+    bool failed;
+
+    s->inhibit = opts->has_inhibit && opts->inhibit;
+    if (s->inhibit) {
+        cfg.libxdp_flags |= XSK_LIBXDP_FLAGS__INHIBIT_PROG_LOAD;
+    }
+
+    if (opts->has_force_copy && opts->force_copy) {
+        cfg.bind_flags |= XDP_COPY;
+    }
+
+    if (opts->has_start_queue && opts->start_queue > 0) {
+        queue_id += opts->start_queue;
+    }
+
+    if (opts->has_mode) {
+        /* Specific mode requested. */
+        if (opts->mode == AFXDP_MODE_NATIVE) {
+            cfg.xdp_flags |= XDP_FLAGS_DRV_MODE;
+        } else {
+            cfg.xdp_flags |= XDP_FLAGS_SKB_MODE;
+        }
+        failed = xsk_socket__create(&s->xsk, s->ifname, queue_id,
+                                    s->umem, &s->rx, &s->tx, &cfg) != 0;
+    } else {
+        /* No mode requested, try native first. */
+        cfg.xdp_flags |= XDP_FLAGS_DRV_MODE;
+        failed = xsk_socket__create(&s->xsk, s->ifname, queue_id,
+                                    s->umem, &s->rx, &s->tx, &cfg) != 0;
+        if (failed) {
+            /* Can't use native mode, try skb. */
+            cfg.xdp_flags &= ~XDP_FLAGS_DRV_MODE;
+            cfg.xdp_flags |= XDP_FLAGS_SKB_MODE;
+            failed = xsk_socket__create(&s->xsk, s->ifname, queue_id,
+                                        s->umem, &s->rx, &s->tx, &cfg) != 0;
+        }
+    }
+
+    /* Only a non-zero errno after the last failed attempt is an error. */
+    error = failed ? errno : 0;
+    if (error) {
+        error_setg_errno(errp, error,
+                         "failed to create AF_XDP socket for %s queue_id: %d",
+                         s->ifname, queue_id);
+        return -1;
+    }
+
+    s->xdp_flags = cfg.xdp_flags;
+
+    return 0;
+}
+
+/* NetClientInfo of the af-xdp backend: client size and callbacks. */
+static NetClientInfo net_af_xdp_info = {
+ .type = NET_CLIENT_DRIVER_AF_XDP,
+ .size = sizeof(AFXDPState), /* each client is upcast to AFXDPState */
+ .receive = af_xdp_receive, /* copies a packet into a frame, queues it on Tx */
+ .poll = af_xdp_poll, /* switches both fd handlers on or off */
+ .cleanup = af_xdp_cleanup, /* releases socket, pool, umem and buffer */
+};
+
+/*
+ * Parse a ':'-separated list of socket fds.
+ *
+ * Each element is resolved with monitor_fd_param(). The list must contain
+ * exactly @n_expected elements.
+ *
+ * Returns a newly allocated array of @n_expected fds that the caller frees
+ * with g_free(), or NULL with @errp set on a count mismatch or when an
+ * element cannot be resolved.
+ */
+static int *parse_socket_fds(const char *sock_fds_str,
+                             int64_t n_expected, Error **errp)
+{
+    gchar **parts = g_strsplit(sock_fds_str, ":", -1);
+    int64_t count = g_strv_length(parts);
+    int *fds = NULL;
+    int64_t k;
+
+    if (count != n_expected) {
+        error_setg(errp, "expected %"PRIi64" socket fds, got %"PRIi64,
+                   n_expected, count);
+    } else {
+        fds = g_new(int, count);
+
+        for (k = 0; k < count; k++) {
+            fds[k] = monitor_fd_param(monitor_cur(), parts[k], errp);
+            if (fds[k] < 0) {
+                /* One bad element invalidates the whole list. */
+                g_free(fds);
+                fds = NULL;
+                break;
+            }
+        }
+    }
+
+    g_strfreev(parts);
+    return fds;
+}
+
+/*
+ * The exported init function.
+ *
+ *     ... -netdev af-xdp,ifname="..."
+ *
+ * Validates the options, then creates one net client per requested queue,
+ * each with its own umem and AF_XDP socket. Afterwards it checks that an XDP
+ * program is loaded on the interface and enables read polling.
+ *
+ * Returns 0 on success. On failure returns -1 with @errp set; clients
+ * created so far are deleted.
+ */
+int net_init_af_xdp(const Netdev *netdev,
+                    const char *name, NetClientState *peer, Error **errp)
+{
+    const NetdevAFXDPOptions *opts = &netdev->u.af_xdp;
+    NetClientState *nc, *nc0 = NULL;
+    unsigned int ifindex;
+    uint32_t prog_id = 0;
+    int *sock_fds = NULL;
+    int64_t i, queues;
+    AFXDPState *s;
+
+    ifindex = if_nametoindex(opts->ifname);
+    if (!ifindex) {
+        error_setg_errno(errp, errno, "failed to get ifindex for '%s'",
+                         opts->ifname);
+        return -1;
+    }
+
+    queues = opts->has_queues ? opts->queues : 1;
+    if (queues < 1) {
+        error_setg(errp, "invalid number of queues (%" PRIi64 ") for '%s'",
+                   queues, opts->ifname);
+        return -1;
+    }
+
+    /* 'inhibit' and 'sock-fds' must be given together or not at all. */
+    if ((opts->has_inhibit && opts->inhibit) != !!opts->sock_fds) {
+        error_setg(errp, "'inhibit=on' requires 'sock-fds' and vice versa");
+        return -1;
+    }
+
+    if (opts->sock_fds) {
+        sock_fds = parse_socket_fds(opts->sock_fds, queues, errp);
+        if (!sock_fds) {
+            return -1;
+        }
+    }
+
+    for (i = 0; i < queues; i++) {
+        nc = qemu_new_net_client(&net_af_xdp_info, peer, "af-xdp", name);
+        qemu_set_info_str(nc, "af-xdp%"PRIi64" to %s", i, opts->ifname);
+        nc->queue_index = i;
+
+        if (!nc0) {
+            nc0 = nc;
+        }
+
+        s = DO_UPCAST(AFXDPState, nc, nc);
+
+        pstrcpy(s->ifname, sizeof(s->ifname), opts->ifname);
+        s->ifindex = ifindex;
+        s->n_queues = queues;
+
+        /* Both helpers report their failure through errp themselves. */
+        if (af_xdp_umem_create(s, sock_fds ? sock_fds[i] : -1, errp)
+            || af_xdp_socket_create(s, opts, errp)) {
+            /* Make sure the XDP program will be removed. */
+            s->n_queues = i;
+            goto err;
+        }
+    }
+
+    if (nc0) {
+        s = DO_UPCAST(AFXDPState, nc, nc0);
+        if (bpf_xdp_query_id(s->ifindex, s->xdp_flags, &prog_id) || !prog_id) {
+            error_setg_errno(errp, errno,
+                             "no XDP program loaded on '%s', ifindex: %d",
+                             s->ifname, s->ifindex);
+            goto err;
+        }
+    }
+
+    af_xdp_read_poll(s, true); /* Initially only poll for reads. */
+
+    /* The fds were passed on by value; the array is not needed any more. */
+    g_free(sock_fds);
+
+    return 0;
+
+err:
+    g_free(sock_fds);
+    if (nc0) {
+        qemu_del_net_client(nc0);
+    }
+
+    return -1;
+}
diff --git a/net/clients.h b/net/clients.h
index ed8bdfff1e..be53794582 100644
--- a/net/clients.h
+++ b/net/clients.h
@@ -64,6 +64,11 @@ int net_init_netmap(const Netdev *netdev, const char *name,
NetClientState *peer, Error **errp);
#endif
+#ifdef CONFIG_AF_XDP
+int net_init_af_xdp(const Netdev *netdev, const char *name,
+ NetClientState *peer, Error **errp);
+#endif
+
int net_init_vhost_user(const Netdev *netdev, const char *name,
NetClientState *peer, Error **errp);
diff --git a/net/dump.c b/net/dump.c
index 7d05f16ca7..16073f2458 100644
--- a/net/dump.c
+++ b/net/dump.c
@@ -68,7 +68,7 @@ static ssize_t dump_receive_iov(DumpState *s, const struct iovec *iov, int cnt,
int64_t ts;
int caplen;
size_t size = iov_size(iov, cnt) - offset;
- struct iovec dumpiov[cnt + 1];
+ g_autofree struct iovec *dumpiov = g_new(struct iovec, cnt + 1);
/* Early return in case of previous error. */
if (s->fd < 0) {
diff --git a/net/meson.build b/net/meson.build
index 51caa42c9d..ce99bd4447 100644
--- a/net/meson.build
+++ b/net/meson.build
@@ -36,6 +36,9 @@ system_ss.add(when: vde, if_true: files('vde.c'))
if have_netmap
system_ss.add(files('netmap.c'))
endif
+
+system_ss.add(when: libxdp, if_true: files('af-xdp.c'))
+
if have_vhost_net_user
system_ss.add(when: 'CONFIG_VIRTIO_NET', if_true: files('vhost-user.c'), if_false: files('vhost-user-stub.c'))
system_ss.add(when: 'CONFIG_ALL', if_true: files('vhost-user-stub.c'))
diff --git a/net/net.c b/net/net.c
index 6492ad530e..1c0bfdaa6c 100644
--- a/net/net.c
+++ b/net/net.c
@@ -495,6 +495,15 @@ bool qemu_has_ufo(NetClientState *nc)
return nc->info->has_ufo(nc);
}
+bool qemu_has_uso(NetClientState *nc)
+{
+ if (!nc || !nc->info->has_uso) {
+ return false;
+ }
+
+ return nc->info->has_uso(nc);
+}
+
bool qemu_has_vnet_hdr(NetClientState *nc)
{
if (!nc || !nc->info->has_vnet_hdr) {
@@ -532,13 +541,13 @@ void qemu_using_vnet_hdr(NetClientState *nc, bool enable)
}
void qemu_set_offload(NetClientState *nc, int csum, int tso4, int tso6,
- int ecn, int ufo)
+ int ecn, int ufo, int uso4, int uso6)
{
if (!nc || !nc->info->set_offload) {
return;
}
- nc->info->set_offload(nc, csum, tso4, tso6, ecn, ufo);
+ nc->info->set_offload(nc, csum, tso4, tso6, ecn, ufo, uso4, uso6);
}
int qemu_get_vnet_hdr_len(NetClientState *nc)
@@ -1082,6 +1091,9 @@ static int (* const net_client_init_fun[NET_CLIENT_DRIVER__MAX])(
#ifdef CONFIG_NETMAP
[NET_CLIENT_DRIVER_NETMAP] = net_init_netmap,
#endif
+#ifdef CONFIG_AF_XDP
+ [NET_CLIENT_DRIVER_AF_XDP] = net_init_af_xdp,
+#endif
#ifdef CONFIG_NET_BRIDGE
[NET_CLIENT_DRIVER_BRIDGE] = net_init_bridge,
#endif
@@ -1186,6 +1198,9 @@ void show_netdevs(void)
#ifdef CONFIG_NETMAP
"netmap",
#endif
+#ifdef CONFIG_AF_XDP
+ "af-xdp",
+#endif
#ifdef CONFIG_POSIX
"vhost-user",
#endif
diff --git a/net/netmap.c b/net/netmap.c
index 9e0cec58d3..241b27c8e9 100644
--- a/net/netmap.c
+++ b/net/netmap.c
@@ -371,7 +371,7 @@ static void netmap_set_vnet_hdr_len(NetClientState *nc, int len)
}
static void netmap_set_offload(NetClientState *nc, int csum, int tso4, int tso6,
- int ecn, int ufo)
+ int ecn, int ufo, int uso4, int uso6)
{
NetmapState *s = DO_UPCAST(NetmapState, nc, nc);
diff --git a/net/tap-bsd.c b/net/tap-bsd.c
index 4c98fdd337..274ea7bd2c 100644
--- a/net/tap-bsd.c
+++ b/net/tap-bsd.c
@@ -212,6 +212,11 @@ int tap_probe_has_ufo(int fd)
return 0;
}
+int tap_probe_has_uso(int fd)
+{
+ return 0;
+}
+
int tap_probe_vnet_hdr_len(int fd, int len)
{
return 0;
@@ -232,7 +237,7 @@ int tap_fd_set_vnet_be(int fd, int is_be)
}
void tap_fd_set_offload(int fd, int csum, int tso4,
- int tso6, int ecn, int ufo)
+ int tso6, int ecn, int ufo, int uso4, int uso6)
{
}
diff --git a/net/tap-linux.c b/net/tap-linux.c
index f54f308d35..c7e514ecb0 100644
--- a/net/tap-linux.c
+++ b/net/tap-linux.c
@@ -173,6 +173,18 @@ int tap_probe_has_ufo(int fd)
return 1;
}
+int tap_probe_has_uso(int fd)
+{
+ unsigned offload;
+
+ offload = TUN_F_CSUM | TUN_F_USO4 | TUN_F_USO6;
+
+ if (ioctl(fd, TUNSETOFFLOAD, offload) < 0) {
+ return 0;
+ }
+ return 1;
+}
+
/* Verify that we can assign given length */
int tap_probe_vnet_hdr_len(int fd, int len)
{
@@ -237,7 +249,7 @@ int tap_fd_set_vnet_be(int fd, int is_be)
}
void tap_fd_set_offload(int fd, int csum, int tso4,
- int tso6, int ecn, int ufo)
+ int tso6, int ecn, int ufo, int uso4, int uso6)
{
unsigned int offload = 0;
@@ -256,13 +268,22 @@ void tap_fd_set_offload(int fd, int csum, int tso4,
offload |= TUN_F_TSO_ECN;
if (ufo)
offload |= TUN_F_UFO;
+ if (uso4) {
+ offload |= TUN_F_USO4;
+ }
+ if (uso6) {
+ offload |= TUN_F_USO6;
+ }
}
if (ioctl(fd, TUNSETOFFLOAD, offload) != 0) {
- offload &= ~TUN_F_UFO;
+ offload &= ~(TUN_F_USO4 | TUN_F_USO6);
if (ioctl(fd, TUNSETOFFLOAD, offload) != 0) {
- fprintf(stderr, "TUNSETOFFLOAD ioctl() failed: %s\n",
+ offload &= ~TUN_F_UFO;
+ if (ioctl(fd, TUNSETOFFLOAD, offload) != 0) {
+ fprintf(stderr, "TUNSETOFFLOAD ioctl() failed: %s\n",
strerror(errno));
+ }
}
}
}
diff --git a/net/tap-linux.h b/net/tap-linux.h
index bbbb62c2a7..9a58cecb7f 100644
--- a/net/tap-linux.h
+++ b/net/tap-linux.h
@@ -50,5 +50,7 @@
#define TUN_F_TSO6 0x04 /* I can handle TSO for IPv6 packets */
#define TUN_F_TSO_ECN 0x08 /* I can handle TSO with ECN bits. */
#define TUN_F_UFO 0x10 /* I can handle UFO packets */
+#define TUN_F_USO4 0x20 /* I can handle USO for IPv4 packets */
+#define TUN_F_USO6 0x40 /* I can handle USO for IPv6 packets */
#endif /* QEMU_TAP_LINUX_H */
diff --git a/net/tap-solaris.c b/net/tap-solaris.c
index 38e15028bf..08b13af512 100644
--- a/net/tap-solaris.c
+++ b/net/tap-solaris.c
@@ -216,6 +216,11 @@ int tap_probe_has_ufo(int fd)
return 0;
}
+int tap_probe_has_uso(int fd)
+{
+ return 0;
+}
+
int tap_probe_vnet_hdr_len(int fd, int len)
{
return 0;
@@ -236,7 +241,7 @@ int tap_fd_set_vnet_be(int fd, int is_be)
}
void tap_fd_set_offload(int fd, int csum, int tso4,
- int tso6, int ecn, int ufo)
+ int tso6, int ecn, int ufo, int uso4, int uso6)
{
}
diff --git a/net/tap-stub.c b/net/tap-stub.c
index a0fa25804b..4b24f61e3a 100644
--- a/net/tap-stub.c
+++ b/net/tap-stub.c
@@ -47,6 +47,11 @@ int tap_probe_has_ufo(int fd)
return 0;
}
+int tap_probe_has_uso(int fd)
+{
+ return 0;
+}
+
int tap_probe_vnet_hdr_len(int fd, int len)
{
return 0;
@@ -67,7 +72,7 @@ int tap_fd_set_vnet_be(int fd, int is_be)
}
void tap_fd_set_offload(int fd, int csum, int tso4,
- int tso6, int ecn, int ufo)
+ int tso6, int ecn, int ufo, int uso4, int uso6)
{
}
diff --git a/net/tap-win32.c b/net/tap-win32.c
index f327d62ab0..7b8b4be02c 100644
--- a/net/tap-win32.c
+++ b/net/tap-win32.c
@@ -741,7 +741,7 @@ static void tap_using_vnet_hdr(NetClientState *nc, bool using_vnet_hdr)
}
static void tap_set_offload(NetClientState *nc, int csum, int tso4,
- int tso6, int ecn, int ufo)
+ int tso6, int ecn, int ufo, int uso4, int uso6)
{
}
diff --git a/net/tap.c b/net/tap.c
index 1bf085d422..c23d0323c2 100644
--- a/net/tap.c
+++ b/net/tap.c
@@ -57,6 +57,7 @@ typedef struct TAPState {
bool write_poll;
bool using_vnet_hdr;
bool has_ufo;
+ bool has_uso;
bool enabled;
VHostNetState *vhost_net;
unsigned host_vnet_hdr_len;
@@ -117,10 +118,11 @@ static ssize_t tap_receive_iov(NetClientState *nc, const struct iovec *iov,
{
TAPState *s = DO_UPCAST(TAPState, nc, nc);
const struct iovec *iovp = iov;
- struct iovec iov_copy[iovcnt + 1];
+ g_autofree struct iovec *iov_copy = NULL;
struct virtio_net_hdr_mrg_rxbuf hdr = { };
if (s->host_vnet_hdr_len && !s->using_vnet_hdr) {
+ iov_copy = g_new(struct iovec, iovcnt + 1);
iov_copy[0].iov_base = &hdr;
iov_copy[0].iov_len = s->host_vnet_hdr_len;
memcpy(&iov_copy[1], iov, iovcnt * sizeof(*iov));
@@ -237,6 +239,15 @@ static bool tap_has_ufo(NetClientState *nc)
return s->has_ufo;
}
+static bool tap_has_uso(NetClientState *nc)
+{
+ TAPState *s = DO_UPCAST(TAPState, nc, nc);
+
+ assert(nc->info->type == NET_CLIENT_DRIVER_TAP);
+
+ return s->has_uso;
+}
+
static bool tap_has_vnet_hdr(NetClientState *nc)
{
TAPState *s = DO_UPCAST(TAPState, nc, nc);
@@ -307,14 +318,14 @@ static int tap_set_vnet_be(NetClientState *nc, bool is_be)
}
static void tap_set_offload(NetClientState *nc, int csum, int tso4,
- int tso6, int ecn, int ufo)
+ int tso6, int ecn, int ufo, int uso4, int uso6)
{
TAPState *s = DO_UPCAST(TAPState, nc, nc);
if (s->fd < 0) {
return;
}
- tap_fd_set_offload(s->fd, csum, tso4, tso6, ecn, ufo);
+ tap_fd_set_offload(s->fd, csum, tso4, tso6, ecn, ufo, uso4, uso6);
}
static void tap_exit_notify(Notifier *notifier, void *data)
@@ -384,6 +395,7 @@ static NetClientInfo net_tap_info = {
.poll = tap_poll,
.cleanup = tap_cleanup,
.has_ufo = tap_has_ufo,
+ .has_uso = tap_has_uso,
.has_vnet_hdr = tap_has_vnet_hdr,
.has_vnet_hdr_len = tap_has_vnet_hdr_len,
.get_using_vnet_hdr = tap_get_using_vnet_hdr,
@@ -413,8 +425,9 @@ static TAPState *net_tap_fd_init(NetClientState *peer,
s->host_vnet_hdr_len = vnet_hdr ? sizeof(struct virtio_net_hdr) : 0;
s->using_vnet_hdr = false;
s->has_ufo = tap_probe_has_ufo(s->fd);
+ s->has_uso = tap_probe_has_uso(s->fd);
s->enabled = true;
- tap_set_offload(&s->nc, 0, 0, 0, 0, 0);
+ tap_set_offload(&s->nc, 0, 0, 0, 0, 0, 0, 0);
/*
* Make sure host header length is set correctly in tap:
* it might have been modified by another instance of qemu.
diff --git a/net/tap_int.h b/net/tap_int.h
index 547f8a5a28..9a2175655b 100644
--- a/net/tap_int.h
+++ b/net/tap_int.h
@@ -37,7 +37,9 @@ void tap_set_sndbuf(int fd, const NetdevTapOptions *tap, Error **errp);
int tap_probe_vnet_hdr(int fd, Error **errp);
int tap_probe_vnet_hdr_len(int fd, int len);
int tap_probe_has_ufo(int fd);
-void tap_fd_set_offload(int fd, int csum, int tso4, int tso6, int ecn, int ufo);
+int tap_probe_has_uso(int fd);
+void tap_fd_set_offload(int fd, int csum, int tso4, int tso6, int ecn, int ufo,
+ int uso4, int uso6);
void tap_fd_set_vnet_hdr_len(int fd, int len);
int tap_fd_set_vnet_le(int fd, int vnet_is_le);
int tap_fd_set_vnet_be(int fd, int vnet_is_be);
diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
index 34202ca009..4e94c50bc7 100644
--- a/net/vhost-vdpa.c
+++ b/net/vhost-vdpa.c
@@ -75,11 +75,14 @@ const int vdpa_feature_bits[] = {
VIRTIO_NET_F_GUEST_TSO4,
VIRTIO_NET_F_GUEST_TSO6,
VIRTIO_NET_F_GUEST_UFO,
+ VIRTIO_NET_F_GUEST_USO4,
+ VIRTIO_NET_F_GUEST_USO6,
VIRTIO_NET_F_HASH_REPORT,
VIRTIO_NET_F_HOST_ECN,
VIRTIO_NET_F_HOST_TSO4,
VIRTIO_NET_F_HOST_TSO6,
VIRTIO_NET_F_HOST_UFO,
+ VIRTIO_NET_F_HOST_USO,
VIRTIO_NET_F_MQ,
VIRTIO_NET_F_MRG_RXBUF,
VIRTIO_NET_F_MTU,
diff --git a/qapi/net.json b/qapi/net.json
index 313c8a606e..8095b68fa8 100644
--- a/qapi/net.json
+++ b/qapi/net.json
@@ -409,6 +409,60 @@
'*devname': 'str' } }
##
+# @AFXDPMode:
+#
+# Attach mode for a default XDP program
+#
+# @skb: generic mode, no driver support necessary
+#
+# @native: DRV mode, program is attached to a driver, packets are passed to
+# the socket without allocation of skb.
+#
+# Since: 8.2
+##
+{ 'enum': 'AFXDPMode',
+ 'data': [ 'native', 'skb' ],
+ 'if': 'CONFIG_AF_XDP' }
+
+##
+# @NetdevAFXDPOptions:
+#
+# AF_XDP network backend
+#
+# @ifname: The name of an existing network interface.
+#
+# @mode: Attach mode for a default XDP program. If not specified, then
+# 'native' will be tried first, then 'skb'.
+#
+# @force-copy: Force XDP copy mode even if device supports zero-copy.
+# (default: false)
+#
+# @queues: number of queues to be used for multiqueue interfaces (default: 1).
+#
+# @start-queue: Use @queues starting from this queue number (default: 0).
+#
+# @inhibit: Don't load a default XDP program, use one already loaded to
+# the interface (default: false). Requires @sock-fds.
+#
+# @sock-fds: A colon (:) separated list of file descriptors for already open
+# but not bound AF_XDP sockets in the queue order. One fd per queue.
+# These descriptors should already be added into XDP socket map for
+# corresponding queues. Requires @inhibit.
+#
+# Since: 8.2
+##
+{ 'struct': 'NetdevAFXDPOptions',
+ 'data': {
+ 'ifname': 'str',
+ '*mode': 'AFXDPMode',
+ '*force-copy': 'bool',
+ '*queues': 'int',
+ '*start-queue': 'int',
+ '*inhibit': 'bool',
+ '*sock-fds': 'str' },
+ 'if': 'CONFIG_AF_XDP' }
+
+##
# @NetdevVhostUserOptions:
#
# Vhost-user network backend
@@ -642,6 +696,7 @@
# @vmnet-bridged: since 7.1
# @stream: since 7.2
# @dgram: since 7.2
+# @af-xdp: since 8.2
#
# Since: 2.7
##
@@ -649,6 +704,7 @@
'data': [ 'none', 'nic', 'user', 'tap', 'l2tpv3', 'socket', 'stream',
'dgram', 'vde', 'bridge', 'hubport', 'netmap', 'vhost-user',
'vhost-vdpa',
+ { 'name': 'af-xdp', 'if': 'CONFIG_AF_XDP' },
{ 'name': 'vmnet-host', 'if': 'CONFIG_VMNET' },
{ 'name': 'vmnet-shared', 'if': 'CONFIG_VMNET' },
{ 'name': 'vmnet-bridged', 'if': 'CONFIG_VMNET' }] }
@@ -679,6 +735,8 @@
'bridge': 'NetdevBridgeOptions',
'hubport': 'NetdevHubPortOptions',
'netmap': 'NetdevNetmapOptions',
+ 'af-xdp': { 'type': 'NetdevAFXDPOptions',
+ 'if': 'CONFIG_AF_XDP' },
'vhost-user': 'NetdevVhostUserOptions',
'vhost-vdpa': 'NetdevVhostVDPAOptions',
'vmnet-host': { 'type': 'NetdevVmnetHostOptions',
diff --git a/qemu-options.hx b/qemu-options.hx
index 6be621c232..2bcf7e4e97 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -2882,6 +2882,19 @@ DEF("netdev", HAS_ARG, QEMU_OPTION_netdev,
" VALE port (created on the fly) called 'name' ('nmname' is name of the \n"
" netmap device, defaults to '/dev/netmap')\n"
#endif
+#ifdef CONFIG_AF_XDP
+ "-netdev af-xdp,id=str,ifname=name[,mode=native|skb][,force-copy=on|off]\n"
+ " [,queues=n][,start-queue=m][,inhibit=on|off][,sock-fds=x:y:...:z]\n"
+ " attach to the existing network interface 'name' with AF_XDP socket\n"
+ " use 'mode=MODE' to specify an XDP program attach mode\n"
+ " use 'force-copy=on|off' to force XDP copy mode even if device supports zero-copy (default: off)\n"
+ " use 'inhibit=on|off' to inhibit loading of a default XDP program (default: off)\n"
+ " with inhibit=on,\n"
+ " use 'sock-fds' to provide file descriptors for already open AF_XDP sockets\n"
+ " added to a socket map in XDP program. One socket per queue.\n"
+ " use 'queues=n' to specify how many queues of a multiqueue interface should be used\n"
+ " use 'start-queue=m' to specify the first queue that should be used\n"
+#endif
#ifdef CONFIG_POSIX
"-netdev vhost-user,id=str,chardev=dev[,vhostforce=on|off]\n"
" configure a vhost-user network, backed by a chardev 'dev'\n"
@@ -2927,6 +2940,9 @@ DEF("nic", HAS_ARG, QEMU_OPTION_nic,
#ifdef CONFIG_NETMAP
"netmap|"
#endif
+#ifdef CONFIG_AF_XDP
+ "af-xdp|"
+#endif
#ifdef CONFIG_POSIX
"vhost-user|"
#endif
@@ -2955,6 +2971,9 @@ DEF("net", HAS_ARG, QEMU_OPTION_net,
#ifdef CONFIG_NETMAP
"netmap|"
#endif
+#ifdef CONFIG_AF_XDP
+ "af-xdp|"
+#endif
#ifdef CONFIG_VMNET
"vmnet-host|vmnet-shared|vmnet-bridged|"
#endif
@@ -2962,7 +2981,7 @@ DEF("net", HAS_ARG, QEMU_OPTION_net,
" old way to initialize a host network interface\n"
" (use the -netdev option if possible instead)\n", QEMU_ARCH_ALL)
SRST
-``-nic [tap|bridge|user|l2tpv3|vde|netmap|vhost-user|socket][,...][,mac=macaddr][,model=mn]``
+``-nic [tap|bridge|user|l2tpv3|vde|netmap|af-xdp|vhost-user|socket][,...][,mac=macaddr][,model=mn]``
This option is a shortcut for configuring both the on-board
(default) guest NIC hardware and the host network backend in one go.
The host backend options are the same as with the corresponding
@@ -3376,6 +3395,55 @@ SRST
# launch QEMU instance
|qemu_system| linux.img -nic vde,sock=/tmp/myswitch
+``-netdev af-xdp,id=str,ifname=name[,mode=native|skb][,force-copy=on|off][,queues=n][,start-queue=m][,inhibit=on|off][,sock-fds=x:y:...:z]``
+ Configure AF_XDP backend to connect to a network interface 'name'
+ using AF_XDP socket. A specific program attach mode for a default
+ XDP program can be forced with 'mode', defaults to best-effort,
+ where the likely most performant mode will be in use. Number of queues
+ 'n' should generally match the number of queues in the interface,
+ defaults to 1. Traffic arriving on non-configured device queues will
+ not be delivered to the network backend.
+
+ .. parsed-literal::
+
+ # set number of queues to 4
+ ethtool -L eth0 combined 4
+ # launch QEMU instance
+ |qemu_system| linux.img -device virtio-net-pci,netdev=n1 \\
+ -netdev af-xdp,id=n1,ifname=eth0,queues=4
+
+ 'start-queue' option can be specified if a particular range of queues
+ [m, m + n) should be in use. For example, this may be necessary in
+ order to use certain NICs in native mode. The kernel allows the driver to
+ create a separate set of XDP queues on top of regular ones, and only
+ these queues can be used for AF_XDP sockets. NICs that work this way
+ may also require an additional traffic redirection with ethtool to these
+ special queues.
+
+ .. parsed-literal::
+
+ # set number of queues to 1
+ ethtool -L eth0 combined 1
+ # redirect all the traffic to the second queue (id: 1)
+ # note: drivers may require non-empty key/mask pair.
+ ethtool -N eth0 flow-type ether \\
+ dst 00:00:00:00:00:00 m FF:FF:FF:FF:FF:FE action 1
+ ethtool -N eth0 flow-type ether \\
+ dst 00:00:00:00:00:01 m FF:FF:FF:FF:FF:FE action 1
+ # launch QEMU instance
+ |qemu_system| linux.img -device virtio-net-pci,netdev=n1 \\
+ -netdev af-xdp,id=n1,ifname=eth0,queues=1,start-queue=1
+
+ XDP program can also be loaded externally. In this case 'inhibit' option
+ should be set to 'on' and 'sock-fds' provided with file descriptors for
+ already open but not bound XDP sockets already added to a socket map for
+ corresponding queues. One socket per queue.
+
+ .. parsed-literal::
+
+ |qemu_system| linux.img -device virtio-net-pci,netdev=n1 \\
+ -netdev af-xdp,id=n1,ifname=eth0,queues=3,inhibit=on,sock-fds=15:16:17
+
``-netdev vhost-user,chardev=id[,vhostforce=on|off][,queues=n]``
Establish a vhost-user netdev, backed by a chardev id. The chardev
should be a unix domain socket backed one. The vhost-user uses a
diff --git a/scripts/ci/org.centos/stream/8/x86_64/configure b/scripts/ci/org.centos/stream/8/x86_64/configure
index 131f8ee5f3..76781f17f4 100755
--- a/scripts/ci/org.centos/stream/8/x86_64/configure
+++ b/scripts/ci/org.centos/stream/8/x86_64/configure
@@ -35,6 +35,7 @@
--block-drv-ro-whitelist="vmdk,vhdx,vpc,https,ssh" \
--with-coroutine=ucontext \
--tls-priority=@QEMU,SYSTEM \
+--disable-af-xdp \
--disable-attr \
--disable-auth-pam \
--disable-avx2 \
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
index e1d178370c..230119346a 100644
--- a/scripts/meson-buildoptions.sh
+++ b/scripts/meson-buildoptions.sh
@@ -76,6 +76,7 @@ meson_options_help() {
printf "%s\n" 'disabled with --disable-FEATURE, default is enabled if available'
printf "%s\n" '(unless built with --without-default-features):'
printf "%s\n" ''
+ printf "%s\n" ' af-xdp AF_XDP network backend support'
printf "%s\n" ' alsa ALSA sound support'
printf "%s\n" ' attr attr/xattr support'
printf "%s\n" ' auth-pam PAM access control'
@@ -208,6 +209,8 @@ meson_options_help() {
}
_meson_option_parse() {
case $1 in
+ --enable-af-xdp) printf "%s" -Daf_xdp=enabled ;;
+ --disable-af-xdp) printf "%s" -Daf_xdp=disabled ;;
--enable-alsa) printf "%s" -Dalsa=enabled ;;
--disable-alsa) printf "%s" -Dalsa=disabled ;;
--enable-attr) printf "%s" -Dattr=enabled ;;
diff --git a/tests/docker/dockerfiles/alpine.docker b/tests/docker/dockerfiles/alpine.docker
index fa455f1474..d25649cb4f 100644
--- a/tests/docker/dockerfiles/alpine.docker
+++ b/tests/docker/dockerfiles/alpine.docker
@@ -59,6 +59,7 @@ RUN apk update && \
libtasn1-dev \
liburing-dev \
libusb-dev \
+ libxdp-dev \
linux-pam-dev \
llvm \
lttng-ust-dev \
diff --git a/tests/docker/dockerfiles/centos8.docker b/tests/docker/dockerfiles/centos8.docker
index fc1830966f..68bfe606f5 100644
--- a/tests/docker/dockerfiles/centos8.docker
+++ b/tests/docker/dockerfiles/centos8.docker
@@ -75,6 +75,7 @@ RUN dnf distro-sync -y && \
libubsan \
liburing-devel \
libusbx-devel \
+ libxdp-devel \
libzstd-devel \
llvm \
lttng-ust-devel \
diff --git a/tests/docker/dockerfiles/debian-amd64-cross.docker b/tests/docker/dockerfiles/debian-amd64-cross.docker
index b66b9cc191..0cf3ba6d60 100644
--- a/tests/docker/dockerfiles/debian-amd64-cross.docker
+++ b/tests/docker/dockerfiles/debian-amd64-cross.docker
@@ -84,7 +84,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
g++-x86-64-linux-gnu \
gcc-x86-64-linux-gnu \
libaio-dev:amd64 \
- libasan5:amd64 \
+ libasan6:amd64 \
libasound2-dev:amd64 \
libattr1-dev:amd64 \
libbpf-dev:amd64 \
diff --git a/tests/docker/dockerfiles/debian-amd64.docker b/tests/docker/dockerfiles/debian-amd64.docker
index 02262bc70e..e3e1de25dd 100644
--- a/tests/docker/dockerfiles/debian-amd64.docker
+++ b/tests/docker/dockerfiles/debian-amd64.docker
@@ -32,7 +32,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
git \
hostname \
libaio-dev \
- libasan5 \
+ libasan6 \
libasound2-dev \
libattr1-dev \
libbpf-dev \
diff --git a/tests/docker/dockerfiles/debian-arm64-cross.docker b/tests/docker/dockerfiles/debian-arm64-cross.docker
index a0a968b8c6..d8cd4f87b6 100644
--- a/tests/docker/dockerfiles/debian-arm64-cross.docker
+++ b/tests/docker/dockerfiles/debian-arm64-cross.docker
@@ -84,7 +84,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
g++-aarch64-linux-gnu \
gcc-aarch64-linux-gnu \
libaio-dev:arm64 \
- libasan5:arm64 \
+ libasan6:arm64 \
libasound2-dev:arm64 \
libattr1-dev:arm64 \
libbpf-dev:arm64 \
diff --git a/tests/docker/dockerfiles/debian-armel-cross.docker b/tests/docker/dockerfiles/debian-armel-cross.docker
index f1fc34a28a..75342c09b0 100644
--- a/tests/docker/dockerfiles/debian-armel-cross.docker
+++ b/tests/docker/dockerfiles/debian-armel-cross.docker
@@ -84,7 +84,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
g++-arm-linux-gnueabi \
gcc-arm-linux-gnueabi \
libaio-dev:armel \
- libasan5:armel \
+ libasan6:armel \
libasound2-dev:armel \
libattr1-dev:armel \
libbpf-dev:armel \
diff --git a/tests/docker/dockerfiles/debian-armhf-cross.docker b/tests/docker/dockerfiles/debian-armhf-cross.docker
index a278578211..f45cfedd3f 100644
--- a/tests/docker/dockerfiles/debian-armhf-cross.docker
+++ b/tests/docker/dockerfiles/debian-armhf-cross.docker
@@ -84,7 +84,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
g++-arm-linux-gnueabihf \
gcc-arm-linux-gnueabihf \
libaio-dev:armhf \
- libasan5:armhf \
+ libasan6:armhf \
libasound2-dev:armhf \
libattr1-dev:armhf \
libbpf-dev:armhf \
diff --git a/tests/docker/dockerfiles/debian-ppc64el-cross.docker b/tests/docker/dockerfiles/debian-ppc64el-cross.docker
index 30e5efa986..52f8c34814 100644
--- a/tests/docker/dockerfiles/debian-ppc64el-cross.docker
+++ b/tests/docker/dockerfiles/debian-ppc64el-cross.docker
@@ -84,7 +84,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
g++-powerpc64le-linux-gnu \
gcc-powerpc64le-linux-gnu \
libaio-dev:ppc64el \
- libasan5:ppc64el \
+ libasan6:ppc64el \
libasound2-dev:ppc64el \
libattr1-dev:ppc64el \
libbpf-dev:ppc64el \
diff --git a/tests/docker/dockerfiles/debian-s390x-cross.docker b/tests/docker/dockerfiles/debian-s390x-cross.docker
index ee6db7b526..208e57bcf2 100644
--- a/tests/docker/dockerfiles/debian-s390x-cross.docker
+++ b/tests/docker/dockerfiles/debian-s390x-cross.docker
@@ -84,7 +84,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
g++-s390x-linux-gnu \
gcc-s390x-linux-gnu \
libaio-dev:s390x \
- libasan5:s390x \
+ libasan6:s390x \
libasound2-dev:s390x \
libattr1-dev:s390x \
libbpf-dev:s390x \
diff --git a/tests/docker/dockerfiles/fedora.docker b/tests/docker/dockerfiles/fedora.docker
index c5b6c96943..f00e9e267c 100644
--- a/tests/docker/dockerfiles/fedora.docker
+++ b/tests/docker/dockerfiles/fedora.docker
@@ -82,6 +82,7 @@ exec "$@"\n' > /usr/bin/nosync && \
libubsan \
liburing-devel \
libusbx-devel \
+ libxdp-devel \
libzstd-devel \
llvm \
lttng-ust-devel \
diff --git a/tests/docker/dockerfiles/opensuse-leap.docker b/tests/docker/dockerfiles/opensuse-leap.docker
index fef8d5a2e4..ed04b4d6da 100644
--- a/tests/docker/dockerfiles/opensuse-leap.docker
+++ b/tests/docker/dockerfiles/opensuse-leap.docker
@@ -40,7 +40,7 @@ RUN zypper update -y && \
libSDL2-devel \
libSDL2_image-devel \
libaio-devel \
- libasan6 \
+ libasan8 \
libattr-devel \
libbpf-devel \
libbz2-devel \
diff --git a/tests/docker/dockerfiles/ubuntu2004.docker b/tests/docker/dockerfiles/ubuntu2004.docker
index 4180cd8674..d3e212060c 100644
--- a/tests/docker/dockerfiles/ubuntu2004.docker
+++ b/tests/docker/dockerfiles/ubuntu2004.docker
@@ -32,7 +32,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
git \
hostname \
libaio-dev \
- libasan5 \
+ libasan6 \
libasound2-dev \
libattr1-dev \
libbrlapi-dev \
diff --git a/tests/docker/dockerfiles/ubuntu2204.docker b/tests/docker/dockerfiles/ubuntu2204.docker
index 88493f00f6..94c2c16118 100644
--- a/tests/docker/dockerfiles/ubuntu2204.docker
+++ b/tests/docker/dockerfiles/ubuntu2204.docker
@@ -32,7 +32,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
git \
hostname \
libaio-dev \
- libasan5 \
+ libasan6 \
libasound2-dev \
libattr1-dev \
libbpf-dev \
diff --git a/tests/lcitool/libvirt-ci b/tests/lcitool/libvirt-ci
-Subproject bbd55b4d18cce8f89b5167675e434a694131563
+Subproject 5f84a21881577a5fb56cc956f6fe4e2abd6fcff
diff --git a/tests/lcitool/projects/qemu.yml b/tests/lcitool/projects/qemu.yml
index 584f78cb7f..6f0885170d 100644
--- a/tests/lcitool/projects/qemu.yml
+++ b/tests/lcitool/projects/qemu.yml
@@ -69,6 +69,7 @@ packages:
- liburing
- libusbx
- libvdeplug
+ - libxdp
- libzstd
- llvm
- lttng-ust
diff --git a/tests/qtest/libqos/igb.c b/tests/qtest/libqos/igb.c
index a603468beb..f40c4ec4cd 100644
--- a/tests/qtest/libqos/igb.c
+++ b/tests/qtest/libqos/igb.c
@@ -109,6 +109,11 @@ static void igb_pci_start_hw(QOSGraphObject *obj)
E1000_RAH_AV | E1000_RAH_POOL_1 |
le16_to_cpu(*(uint16_t *)(address + 4)));
+ /* Set supported receive descriptor mode */
+ e1000e_macreg_write(&d->e1000e,
+ E1000_SRRCTL(0),
+ E1000_SRRCTL_DESCTYPE_ADV_ONEBUF);
+
/* Enable receive */
e1000e_macreg_write(&d->e1000e, E1000_RFCTL, E1000_RFCTL_EXTEN);
e1000e_macreg_write(&d->e1000e, E1000_RCTL, E1000_RCTL_EN);