author | Stefan Hajnoczi <stefanha@redhat.com> | 2023-09-19 13:21:49 -0400
committer | Stefan Hajnoczi <stefanha@redhat.com> | 2023-09-19 13:21:49 -0400
commit | dd0c84983dd5c3fefaa29f15ed1b4c5c7be9775d (patch)
tree | 2de4d420f503fd0c2cbb061fd966d399365b0279
parent | d7754940d78a7d5bfb13531afa9a67f8c57e987e (diff)
parent | 6d7a53e9f16d2b18d94f9fce1e4eea34570286ef (diff)
Merge tag 'net-pull-request' of https://github.com/jasowang/qemu into staging
# -----BEGIN PGP SIGNATURE-----
# Version: GnuPG v1
#
# iQEcBAABAgAGBQJlB/SLAAoJEO8Ells5jWIR7EQH/1kAbxHcSGJXDOgQAXJ/rOZi
# UKn3ugJzD0Hxd4Xz8cvdVLM+9/JoEEOK1uB+NIG7Ask/gA5D7eUYzaLtp1OJ8VNO
# mamfKmn3EIBWJoLSHH19TKzfW2tGMJHQ0Nj+sbDQRkK5f2c7hwLTRXa1EmlJd4dB
# VoVzX4OiJtrQyv4OVmpP/PSETXJDvYYX/DNcRl9/3ccKtQW/wVDI3YzrMzXrsgyc
# w9ItJi8k+19mVH6RgQwciqRvTbVMdzkOxqvU//LY0TxnjsHfbyHr+KlNAa2WTY2N
# QgpAlMZhHqUG6/XXAs0o2VEtA66zmw932Xfy/CZUEcdGWfkG/9CEVfbuT4CKGY4=
# =tF7K
# -----END PGP SIGNATURE-----
# gpg: Signature made Mon 18 Sep 2023 02:56:11 EDT
# gpg: using RSA key EF04965B398D6211
# gpg: Good signature from "Jason Wang (Jason Wang on RedHat) <jasowang@redhat.com>" [full]
# Primary key fingerprint: 215D 46F4 8246 689E C77F 3562 EF04 965B 398D 6211
* tag 'net-pull-request' of https://github.com/jasowang/qemu:
net/tap: Avoid variable-length array
net/dump: Avoid variable length array
hw/net/rocker: Avoid variable length array
hw/net/fsl_etsec/rings.c: Avoid variable length array
net: add initial support for AF_XDP network backend
tests: bump libvirt-ci for libasan and libxdp
e1000e: rename e1000e_ba_state and e1000e_write_hdr_to_rx_buffers
igb: packet-split descriptors support
igb: add IPv6 extended headers traffic detection
      igb: RX payload guest writing refactoring
      igb: RX descriptors guest writing refactoring
igb: rename E1000E_RingInfo_st
igb: remove TCP ACK detection
virtio-net: Add support for USO features
virtio-net: Add USO flags to vhost support.
tap: Add check for USO features
tap: Add USO support to tap device.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
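
The headline feature of this pull request is the AF_XDP network backend (net/af-xdp.c in the diff below). Going by the options handled in net_init_af_xdp() and af_xdp_socket_create() (ifname, queues, mode, force-copy, inhibit, sock-fds), a minimal invocation could look like the sketch below; the interface name and NIC model are placeholders, and the authoritative option spelling lives in the QAPI schema added elsewhere in the series:

    qemu-system-x86_64 ... \
        -device virtio-net-pci,netdev=net0 \
        -netdev af-xdp,id=net0,ifname=eth0,queues=1,mode=native

If mode is omitted, af_xdp_socket_create() first tries native (driver) mode and falls back to skb mode; native mode additionally requires XDP support in the NIC driver and sufficient privileges to attach an XDP program.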
50 files changed, 1435 insertions, 284 deletions
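
Four of the patches above ("Avoid variable-length array") share one mechanical pattern, visible in the rocker and net/dump hunks below: a stack VLA whose size depends on a runtime value is replaced by a heap allocation that is released automatically when it goes out of scope. A minimal sketch of the pattern, assuming GLib as the QEMU code does (the function and its use of the iovec are illustrative, not the exact patch):

    #include <glib.h>
    #include <string.h>
    #include <sys/uio.h>

    static void forward_iov(const struct iovec *iov, int iovcnt)
    {
        /* Before: "struct iovec iov_copy[iovcnt + 2];" -- a VLA whose size
         * is unbounded at compile time. After: a heap allocation that
         * g_autofree releases automatically when the scope is left. */
        g_autofree struct iovec *iov_copy = g_new(struct iovec, iovcnt + 2);

        /* Copy the caller's fragments after two reserved header slots. */
        memcpy(iov_copy + 2, iov, sizeof(*iov) * iovcnt);
        /* ... populate iov_copy[0] and iov_copy[1], then transmit ... */
    }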
diff --git a/MAINTAINERS b/MAINTAINERS
index 00562f924f..67cefaa6f2 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2957,6 +2957,10 @@ W: http://info.iet.unipi.it/~luigi/netmap/
 S: Maintained
 F: net/netmap.c
 
+AF_XDP network backend
+R: Ilya Maximets <i.maximets@ovn.org>
+F: net/af-xdp.c
+
 Host Memory Backends
 M: David Hildenbrand <david@redhat.com>
 M: Igor Mammedov <imammedo@redhat.com>
diff --git a/hmp-commands.hx b/hmp-commands.hx
index 2cbd0f77a0..63eac22734 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -1296,6 +1296,9 @@ ERST
         .name      = "netdev_add",
         .args_type = "netdev:O",
         .params    = "[user|tap|socket|stream|dgram|vde|bridge|hubport|netmap|vhost-user"
+#ifdef CONFIG_AF_XDP
+                     "|af-xdp"
+#endif
 #ifdef CONFIG_VMNET
                      "|vmnet-host|vmnet-shared|vmnet-bridged"
 #endif
diff --git a/hw/core/machine.c b/hw/core/machine.c
index da699cf4e1..230aab819c 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -38,6 +38,7 @@
 #include "exec/confidential-guest-support.h"
 #include "hw/virtio/virtio.h"
 #include "hw/virtio/virtio-pci.h"
+#include "hw/virtio/virtio-net.h"
 
 GlobalProperty hw_compat_8_1[] = {};
 const size_t hw_compat_8_1_len = G_N_ELEMENTS(hw_compat_8_1);
@@ -45,6 +46,9 @@ const size_t hw_compat_8_1_len = G_N_ELEMENTS(hw_compat_8_1);
 GlobalProperty hw_compat_8_0[] = {
     { "migration", "multifd-flush-after-each-section", "on"},
     { TYPE_PCI_DEVICE, "x-pcie-ari-nextfn-1", "on" },
+    { TYPE_VIRTIO_NET, "host_uso", "off"},
+    { TYPE_VIRTIO_NET, "guest_uso4", "off"},
+    { TYPE_VIRTIO_NET, "guest_uso6", "off"},
 };
 const size_t hw_compat_8_0_len = G_N_ELEMENTS(hw_compat_8_0);
diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c
index f8aeafa16b..e324c02dd5 100644
--- a/hw/net/e1000e_core.c
+++ b/hw/net/e1000e_core.c
@@ -810,24 +810,24 @@ e1000e_txdesc_writeback(E1000ECore *core, dma_addr_t base,
     return e1000e_tx_wb_interrupt_cause(core, queue_idx);
 }
 
-typedef struct E1000E_RingInfo_st {
+typedef struct E1000ERingInfo {
     int dbah;
     int dbal;
     int dlen;
     int dh;
     int dt;
     int idx;
-} E1000E_RingInfo;
+} E1000ERingInfo;
 
 static inline bool
-e1000e_ring_empty(E1000ECore *core, const E1000E_RingInfo *r)
+e1000e_ring_empty(E1000ECore *core, const E1000ERingInfo *r)
 {
     return core->mac[r->dh] == core->mac[r->dt] ||
            core->mac[r->dt] >= core->mac[r->dlen] / E1000_RING_DESC_LEN;
 }
 
 static inline uint64_t
-e1000e_ring_base(E1000ECore *core, const E1000E_RingInfo *r)
+e1000e_ring_base(E1000ECore *core, const E1000ERingInfo *r)
 {
     uint64_t bah = core->mac[r->dbah];
     uint64_t bal = core->mac[r->dbal];
@@ -836,13 +836,13 @@ e1000e_ring_base(E1000ECore *core, const E1000E_RingInfo *r)
 }
 
 static inline uint64_t
-e1000e_ring_head_descr(E1000ECore *core, const E1000E_RingInfo *r)
+e1000e_ring_head_descr(E1000ECore *core, const E1000ERingInfo *r)
 {
     return e1000e_ring_base(core, r) + E1000_RING_DESC_LEN * core->mac[r->dh];
 }
 
 static inline void
-e1000e_ring_advance(E1000ECore *core, const E1000E_RingInfo *r, uint32_t count)
+e1000e_ring_advance(E1000ECore *core, const E1000ERingInfo *r, uint32_t count)
 {
     core->mac[r->dh] += count;
@@ -852,7 +852,7 @@ e1000e_ring_advance(E1000ECore *core, const E1000E_RingInfo *r, uint32_t count)
 }
 
 static inline uint32_t
-e1000e_ring_free_descr_num(E1000ECore *core, const E1000E_RingInfo *r)
+e1000e_ring_free_descr_num(E1000ECore *core, const E1000ERingInfo *r)
 {
     trace_e1000e_ring_free_space(r->idx, core->mac[r->dlen],
                                  core->mac[r->dh], core->mac[r->dt]);
@@ -871,19 +871,19 @@ e1000e_ring_free_descr_num(E1000ECore *core, const E1000E_RingInfo *r)
 }
 
 static inline bool
-e1000e_ring_enabled(E1000ECore *core, const E1000E_RingInfo *r)
+e1000e_ring_enabled(E1000ECore *core, const E1000ERingInfo *r)
 {
     return core->mac[r->dlen] > 0;
 }
 
 static inline uint32_t
-e1000e_ring_len(E1000ECore *core, const E1000E_RingInfo *r)
+e1000e_ring_len(E1000ECore *core, const E1000ERingInfo *r)
 {
     return core->mac[r->dlen];
 }
 
 typedef struct E1000E_TxRing_st {
-    const E1000E_RingInfo *i;
+    const E1000ERingInfo *i;
     struct e1000e_tx *tx;
 } E1000E_TxRing;
 
@@ -896,7 +896,7 @@ e1000e_mq_queue_idx(int base_reg_idx, int reg_idx)
 static inline void
 e1000e_tx_ring_init(E1000ECore *core, E1000E_TxRing *txr, int idx)
 {
-    static const E1000E_RingInfo i[E1000E_NUM_QUEUES] = {
+    static const E1000ERingInfo i[E1000E_NUM_QUEUES] = {
         { TDBAH, TDBAL, TDLEN, TDH, TDT, 0 },
         { TDBAH1, TDBAL1, TDLEN1, TDH1, TDT1, 1 }
     };
@@ -908,13 +908,13 @@ e1000e_tx_ring_init(E1000ECore *core, E1000E_TxRing *txr, int idx)
 }
 
 typedef struct E1000E_RxRing_st {
-    const E1000E_RingInfo *i;
+    const E1000ERingInfo *i;
 } E1000E_RxRing;
 
 static inline void
 e1000e_rx_ring_init(E1000ECore *core, E1000E_RxRing *rxr, int idx)
 {
-    static const E1000E_RingInfo i[E1000E_NUM_QUEUES] = {
+    static const E1000ERingInfo i[E1000E_NUM_QUEUES] = {
         { RDBAH0, RDBAL0, RDLEN0, RDH0, RDT0, 0 },
         { RDBAH1, RDBAL1, RDLEN1, RDH1, RDT1, 1 }
     };
@@ -930,7 +930,7 @@ e1000e_start_xmit(E1000ECore *core, const E1000E_TxRing *txr)
     dma_addr_t base;
     struct e1000_tx_desc desc;
     bool ide = false;
-    const E1000E_RingInfo *txi = txr->i;
+    const E1000ERingInfo *txi = txr->i;
     uint32_t cause = E1000_ICS_TXQE;
 
     if (!(core->mac[TCTL] & E1000_TCTL_EN)) {
@@ -960,7 +960,7 @@ e1000e_start_xmit(E1000ECore *core, const E1000E_TxRing *txr)
 }
 
 static bool
-e1000e_has_rxbufs(E1000ECore *core, const E1000E_RingInfo *r,
+e1000e_has_rxbufs(E1000ECore *core, const E1000ERingInfo *r,
                   size_t total_size)
 {
     uint32_t bufs = e1000e_ring_free_descr_num(core, r);
@@ -1397,17 +1397,17 @@ e1000e_pci_dma_write_rx_desc(E1000ECore *core, dma_addr_t addr,
     }
 }
 
-typedef struct e1000e_ba_state_st {
+typedef struct E1000EBAState {
     uint16_t written[MAX_PS_BUFFERS];
     uint8_t cur_idx;
-} e1000e_ba_state;
+} E1000EBAState;
 
 static inline void
-e1000e_write_hdr_to_rx_buffers(E1000ECore *core,
-                               hwaddr ba[MAX_PS_BUFFERS],
-                               e1000e_ba_state *bastate,
-                               const char *data,
-                               dma_addr_t data_len)
+e1000e_write_hdr_frag_to_rx_buffers(E1000ECore *core,
+                                    hwaddr ba[MAX_PS_BUFFERS],
+                                    E1000EBAState *bastate,
+                                    const char *data,
+                                    dma_addr_t data_len)
 {
     assert(data_len <= core->rxbuf_sizes[0] - bastate->written[0]);
@@ -1418,11 +1418,11 @@ e1000e_write_hdr_to_rx_buffers(E1000ECore *core,
 }
 
 static void
-e1000e_write_to_rx_buffers(E1000ECore *core,
-                           hwaddr ba[MAX_PS_BUFFERS],
-                           e1000e_ba_state *bastate,
-                           const char *data,
-                           dma_addr_t data_len)
+e1000e_write_payload_frag_to_rx_buffers(E1000ECore *core,
+                                        hwaddr ba[MAX_PS_BUFFERS],
+                                        E1000EBAState *bastate,
+                                        const char *data,
+                                        dma_addr_t data_len)
 {
     while (data_len > 0) {
         uint32_t cur_buf_len = core->rxbuf_sizes[bastate->cur_idx];
@@ -1460,7 +1460,7 @@ e1000e_update_rx_stats(E1000ECore *core, size_t pkt_size, size_t pkt_fcs_size)
 }
 
 static inline bool
-e1000e_rx_descr_threshold_hit(E1000ECore *core, const E1000E_RingInfo *rxi)
+e1000e_rx_descr_threshold_hit(E1000ECore *core, const E1000ERingInfo *rxi)
 {
     return e1000e_ring_free_descr_num(core, rxi) ==
            e1000e_ring_len(core, rxi) >> core->rxbuf_min_shift;
@@ -1521,7 +1521,7 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt,
     struct iovec *iov = net_rx_pkt_get_iovec(pkt);
     size_t size = net_rx_pkt_get_total_len(pkt);
     size_t total_size = size + e1000x_fcs_len(core->mac);
-    const E1000E_RingInfo *rxi;
+    const E1000ERingInfo *rxi;
     size_t ps_hdr_len = 0;
     bool do_ps = e1000e_do_ps(core, pkt, &ps_hdr_len);
     bool is_first = true;
@@ -1530,7 +1530,7 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt,
 
     do {
         hwaddr ba[MAX_PS_BUFFERS];
-        e1000e_ba_state bastate = { { 0 } };
+        E1000EBAState bastate = { { 0 } };
         bool is_last = false;
 
         desc_size = total_size - desc_offset;
@@ -1568,8 +1568,10 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt,
                     iov_copy = MIN(ps_hdr_len - ps_hdr_copied,
                                    iov->iov_len - iov_ofs);
 
-                    e1000e_write_hdr_to_rx_buffers(core, ba, &bastate,
-                                                   iov->iov_base, iov_copy);
+                    e1000e_write_hdr_frag_to_rx_buffers(core, ba,
+                                                        &bastate,
+                                                        iov->iov_base,
+                                                        iov_copy);
 
                     copy_size -= iov_copy;
                     ps_hdr_copied += iov_copy;
@@ -1585,8 +1587,8 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt,
             } else {
                 /* Leave buffer 0 of each descriptor except first */
                 /* empty as per spec 7.1.5.1                       */
-                e1000e_write_hdr_to_rx_buffers(core, ba, &bastate,
-                                               NULL, 0);
+                e1000e_write_hdr_frag_to_rx_buffers(core, ba, &bastate,
+                                                    NULL, 0);
             }
         }
 
@@ -1594,8 +1596,10 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt,
             while (copy_size) {
                 iov_copy = MIN(copy_size, iov->iov_len - iov_ofs);
 
-                e1000e_write_to_rx_buffers(core, ba, &bastate,
-                                           iov->iov_base + iov_ofs, iov_copy);
+                e1000e_write_payload_frag_to_rx_buffers(core, ba, &bastate,
+                                                        iov->iov_base +
+                                                        iov_ofs,
+                                                        iov_copy);
 
                 copy_size -= iov_copy;
                 iov_ofs += iov_copy;
@@ -1607,7 +1611,7 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt,
 
             if (desc_offset + desc_size >= total_size) {
                 /* Simulate FCS checksum presence in the last descriptor */
-                e1000e_write_to_rx_buffers(core, ba, &bastate,
+                e1000e_write_payload_frag_to_rx_buffers(core, ba, &bastate,
                       (const char *) &fcs_pad, e1000x_fcs_len(core->mac));
             }
         }
@@ -2852,7 +2856,7 @@ e1000e_update_rx_offloads(E1000ECore *core)
 
     if (core->has_vnet) {
         qemu_set_offload(qemu_get_queue(core->owner_nic)->peer,
-                         cso_state, 0, 0, 0, 0);
+                         cso_state, 0, 0, 0, 0, 0, 0);
     }
 }
diff --git a/hw/net/fsl_etsec/rings.c b/hw/net/fsl_etsec/rings.c
index 788463f1b6..2f2f359f7a 100644
--- a/hw/net/fsl_etsec/rings.c
+++ b/hw/net/fsl_etsec/rings.c
@@ -372,6 +372,12 @@ void etsec_walk_tx_ring(eTSEC *etsec, int ring_nbr)
     etsec->regs[TSTAT].value |= 1 << (31 - ring_nbr);
 }
 
+/*
+ * rx_init_frame() ensures we never do more padding than this
+ * (checksum plus minimum data packet size)
+ */
+#define MAX_RX_PADDING 64
+
 static void fill_rx_bd(eTSEC *etsec,
                        eTSEC_rxtx_bd *bd,
                        const uint8_t **buf,
@@ -380,9 +386,11 @@ static void fill_rx_bd(eTSEC *etsec,
     uint16_t to_write;
     hwaddr bufptr = bd->bufptr +
         ((hwaddr)(etsec->regs[TBDBPH].value & 0xF) << 32);
-    uint8_t padd[etsec->rx_padding];
+    uint8_t padd[MAX_RX_PADDING];
     uint8_t rem;
 
+    assert(etsec->rx_padding <= MAX_RX_PADDING);
+
     RING_DEBUG("eTSEC fill Rx buffer @ 0x%016" HWADDR_PRIx
                " size:%zu(padding + crc:%u) + fcb:%u\n",
                bufptr, *size, etsec->rx_padding, etsec->rx_fcb_size);
@@ -426,7 +434,7 @@ static void fill_rx_bd(eTSEC *etsec,
         rem = MIN(etsec->regs[MRBLR].value - bd->length, etsec->rx_padding);
 
         if (rem > 0) {
-            memset(padd, 0x0, sizeof(padd));
+            memset(padd, 0x0, rem);
             etsec->rx_padding -= rem;
             *size -= rem;
             bd->length += rem;
diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c
index 8b6b75c522..f6a5e2327b 100644
--- a/hw/net/igb_core.c
+++ b/hw/net/igb_core.c
@@ -267,6 +267,29 @@ igb_rx_use_legacy_descriptor(IGBCore *core)
     return false;
 }
 
+typedef struct E1000ERingInfo {
+    int dbah;
+    int dbal;
+    int dlen;
+    int dh;
+    int dt;
+    int idx;
+} E1000ERingInfo;
+
+static uint32_t
+igb_rx_queue_desctyp_get(IGBCore *core, const E1000ERingInfo *r)
+{
+    return core->mac[E1000_SRRCTL(r->idx) >> 2] & E1000_SRRCTL_DESCTYPE_MASK;
+}
+
+static bool
+igb_rx_use_ps_descriptor(IGBCore *core, const E1000ERingInfo *r)
+{
+    uint32_t desctyp = igb_rx_queue_desctyp_get(core, r);
+    return desctyp == E1000_SRRCTL_DESCTYPE_HDR_SPLIT ||
+           desctyp == E1000_SRRCTL_DESCTYPE_HDR_SPLIT_ALWAYS;
+}
+
 static inline bool
 igb_rss_enabled(IGBCore *core)
 {
@@ -694,24 +717,15 @@ static uint32_t igb_rx_wb_eic(IGBCore *core, int queue_idx)
     return (ent & E1000_IVAR_VALID) ? BIT(ent & 0x1f) : 0;
 }
 
-typedef struct E1000E_RingInfo_st {
-    int dbah;
-    int dbal;
-    int dlen;
-    int dh;
-    int dt;
-    int idx;
-} E1000E_RingInfo;
-
 static inline bool
-igb_ring_empty(IGBCore *core, const E1000E_RingInfo *r)
+igb_ring_empty(IGBCore *core, const E1000ERingInfo *r)
 {
     return core->mac[r->dh] == core->mac[r->dt] ||
            core->mac[r->dt] >= core->mac[r->dlen] / E1000_RING_DESC_LEN;
 }
 
 static inline uint64_t
-igb_ring_base(IGBCore *core, const E1000E_RingInfo *r)
+igb_ring_base(IGBCore *core, const E1000ERingInfo *r)
 {
     uint64_t bah = core->mac[r->dbah];
     uint64_t bal = core->mac[r->dbal];
@@ -720,13 +734,13 @@ igb_ring_base(IGBCore *core, const E1000E_RingInfo *r)
 }
 
 static inline uint64_t
-igb_ring_head_descr(IGBCore *core, const E1000E_RingInfo *r)
+igb_ring_head_descr(IGBCore *core, const E1000ERingInfo *r)
 {
     return igb_ring_base(core, r) + E1000_RING_DESC_LEN * core->mac[r->dh];
 }
 
 static inline void
-igb_ring_advance(IGBCore *core, const E1000E_RingInfo *r, uint32_t count)
+igb_ring_advance(IGBCore *core, const E1000ERingInfo *r, uint32_t count)
 {
     core->mac[r->dh] += count;
@@ -736,7 +750,7 @@ igb_ring_advance(IGBCore *core, const E1000E_RingInfo *r, uint32_t count)
 }
 
 static inline uint32_t
-igb_ring_free_descr_num(IGBCore *core, const E1000E_RingInfo *r)
+igb_ring_free_descr_num(IGBCore *core, const E1000ERingInfo *r)
 {
     trace_e1000e_ring_free_space(r->idx, core->mac[r->dlen],
                                  core->mac[r->dh], core->mac[r->dt]);
@@ -755,13 +769,13 @@ igb_ring_free_descr_num(IGBCore *core, const E1000E_RingInfo *r)
 }
 
 static inline bool
-igb_ring_enabled(IGBCore *core, const E1000E_RingInfo *r)
+igb_ring_enabled(IGBCore *core, const E1000ERingInfo *r)
 {
     return core->mac[r->dlen] > 0;
 }
 
 typedef struct IGB_TxRing_st {
-    const E1000E_RingInfo *i;
+    const E1000ERingInfo *i;
     struct igb_tx *tx;
 } IGB_TxRing;
 
@@ -774,7 +788,7 @@ igb_mq_queue_idx(int base_reg_idx, int reg_idx)
 static inline void
 igb_tx_ring_init(IGBCore *core, IGB_TxRing *txr, int idx)
 {
-    static const E1000E_RingInfo i[IGB_NUM_QUEUES] = {
+    static const E1000ERingInfo i[IGB_NUM_QUEUES] = {
         { TDBAH0, TDBAL0, TDLEN0, TDH0, TDT0, 0 },
         { TDBAH1, TDBAL1, TDLEN1, TDH1, TDT1, 1 },
         { TDBAH2, TDBAL2, TDLEN2, TDH2, TDT2, 2 },
@@ -800,13 +814,13 @@ igb_tx_ring_init(IGBCore *core, IGB_TxRing *txr, int idx)
 }
 
 typedef struct E1000E_RxRing_st {
-    const E1000E_RingInfo *i;
+    const E1000ERingInfo *i;
 } E1000E_RxRing;
 
 static inline void
 igb_rx_ring_init(IGBCore *core, E1000E_RxRing *rxr, int idx)
 {
-    static const E1000E_RingInfo i[IGB_NUM_QUEUES] = {
+    static const E1000ERingInfo i[IGB_NUM_QUEUES] = {
         { RDBAH0, RDBAL0, RDLEN0, RDH0, RDT0, 0 },
         { RDBAH1, RDBAL1, RDLEN1, RDH1, RDT1, 1 },
         { RDBAH2, RDBAL2, RDLEN2, RDH2, RDT2, 2 },
@@ -833,7 +847,7 @@ igb_rx_ring_init(IGBCore *core, E1000E_RxRing *rxr, int idx)
 
 static uint32_t
 igb_txdesc_writeback(IGBCore *core, dma_addr_t base,
                      union e1000_adv_tx_desc *tx_desc,
-                     const E1000E_RingInfo *txi)
+                     const E1000ERingInfo *txi)
 {
     PCIDevice *d;
     uint32_t cmd_type_len = le32_to_cpu(tx_desc->read.cmd_type_len);
@@ -866,7 +880,7 @@ igb_txdesc_writeback(IGBCore *core, dma_addr_t base,
 }
 
 static inline bool
-igb_tx_enabled(IGBCore *core, const E1000E_RingInfo *txi)
+igb_tx_enabled(IGBCore *core, const E1000ERingInfo *txi)
 {
     bool vmdq = core->mac[MRQC] & 1;
     uint16_t qn = txi->idx;
@@ -883,7 +897,7 @@ igb_start_xmit(IGBCore *core, const IGB_TxRing *txr)
     PCIDevice *d;
     dma_addr_t base;
     union e1000_adv_tx_desc desc;
-    const E1000E_RingInfo *txi = txr->i;
+    const E1000ERingInfo *txi = txr->i;
     uint32_t eic = 0;
 
     if (!igb_tx_enabled(core, txi)) {
@@ -918,7 +932,7 @@ igb_start_xmit(IGBCore *core, const IGB_TxRing *txr)
 }
 
 static uint32_t
-igb_rxbufsize(IGBCore *core, const E1000E_RingInfo *r)
+igb_rxbufsize(IGBCore *core, const E1000ERingInfo *r)
 {
     uint32_t srrctl = core->mac[E1000_SRRCTL(r->idx) >> 2];
     uint32_t bsizepkt = srrctl & E1000_SRRCTL_BSIZEPKT_MASK;
@@ -930,7 +944,7 @@ igb_rxbufsize(IGBCore *core, const E1000E_RingInfo *r)
 }
 
 static bool
-igb_has_rxbufs(IGBCore *core, const E1000E_RingInfo *r, size_t total_size)
+igb_has_rxbufs(IGBCore *core, const E1000ERingInfo *r, size_t total_size)
 {
     uint32_t bufs = igb_ring_free_descr_num(core, r);
     uint32_t bufsize = igb_rxbufsize(core, r);
@@ -941,6 +955,14 @@ igb_has_rxbufs(IGBCore *core, const E1000E_RingInfo *r, size_t total_size)
            bufsize;
 }
 
+static uint32_t
+igb_rxhdrbufsize(IGBCore *core, const E1000ERingInfo *r)
+{
+    uint32_t srrctl = core->mac[E1000_SRRCTL(r->idx) >> 2];
+    return (srrctl & E1000_SRRCTL_BSIZEHDRSIZE_MASK) >>
+           E1000_SRRCTL_BSIZEHDRSIZE_SHIFT;
+}
+
 void
 igb_start_recv(IGBCore *core)
 {
@@ -1225,21 +1247,77 @@ igb_read_lgcy_rx_descr(IGBCore *core, struct e1000_rx_desc *desc,
 }
 
 static inline void
-igb_read_adv_rx_descr(IGBCore *core, union e1000_adv_rx_desc *desc,
-                      hwaddr *buff_addr)
+igb_read_adv_rx_single_buf_descr(IGBCore *core, union e1000_adv_rx_desc *desc,
+                                 hwaddr *buff_addr)
 {
     *buff_addr = le64_to_cpu(desc->read.pkt_addr);
 }
 
 static inline void
-igb_read_rx_descr(IGBCore *core, union e1000_rx_desc_union *desc,
-                  hwaddr *buff_addr)
+igb_read_adv_rx_split_buf_descr(IGBCore *core, union e1000_adv_rx_desc *desc,
+                                hwaddr *buff_addr)
 {
+    buff_addr[0] = le64_to_cpu(desc->read.hdr_addr);
+    buff_addr[1] = le64_to_cpu(desc->read.pkt_addr);
+}
+
+typedef struct IGBBAState {
+    uint16_t written[IGB_MAX_PS_BUFFERS];
+    uint8_t cur_idx;
+} IGBBAState;
+
+typedef struct IGBSplitDescriptorData {
+    bool sph;
+    bool hbo;
+    size_t hdr_len;
+} IGBSplitDescriptorData;
+
+typedef struct IGBPacketRxDMAState {
+    size_t size;
+    size_t total_size;
+    size_t ps_hdr_len;
+    size_t desc_size;
+    size_t desc_offset;
+    uint32_t rx_desc_packet_buf_size;
+    uint32_t rx_desc_header_buf_size;
+    struct iovec *iov;
+    size_t iov_ofs;
+    bool do_ps;
+    bool is_first;
+    IGBBAState bastate;
+    hwaddr ba[IGB_MAX_PS_BUFFERS];
+    IGBSplitDescriptorData ps_desc_data;
+} IGBPacketRxDMAState;
+
+static inline void
+igb_read_rx_descr(IGBCore *core,
+                  union e1000_rx_desc_union *desc,
+                  IGBPacketRxDMAState *pdma_st,
+                  const E1000ERingInfo *r)
+{
+    uint32_t desc_type;
+
     if (igb_rx_use_legacy_descriptor(core)) {
-        igb_read_lgcy_rx_descr(core, &desc->legacy, buff_addr);
-    } else {
-        igb_read_adv_rx_descr(core, &desc->adv, buff_addr);
+        igb_read_lgcy_rx_descr(core, &desc->legacy, &pdma_st->ba[1]);
+        pdma_st->ba[0] = 0;
+        return;
     }
+
+    /* advanced header split descriptor */
+    if (igb_rx_use_ps_descriptor(core, r)) {
+        igb_read_adv_rx_split_buf_descr(core, &desc->adv, &pdma_st->ba[0]);
+        return;
+    }
+
+    /* descriptor replication modes not supported */
+    desc_type = igb_rx_queue_desctyp_get(core, r);
+    if (desc_type != E1000_SRRCTL_DESCTYPE_ADV_ONEBUF) {
+        trace_igb_wrn_rx_desc_modes_not_supp(desc_type);
+    }
+
+    /* advanced single buffer descriptor */
+    igb_read_adv_rx_single_buf_descr(core, &desc->adv, &pdma_st->ba[1]);
+    pdma_st->ba[0] = 0;
 }
 
 static void
@@ -1281,15 +1359,11 @@ igb_verify_csum_in_sw(IGBCore *core,
 }
 
 static void
-igb_build_rx_metadata(IGBCore *core,
-                      struct NetRxPkt *pkt,
-                      bool is_eop,
-                      const E1000E_RSSInfo *rss_info, uint16_t etqf, bool ts,
-                      uint16_t *pkt_info, uint16_t *hdr_info,
-                      uint32_t *rss,
-                      uint32_t *status_flags,
-                      uint16_t *ip_id,
-                      uint16_t *vlan_tag)
+igb_build_rx_metadata_common(IGBCore *core,
+                             struct NetRxPkt *pkt,
+                             bool is_eop,
+                             uint32_t *status_flags,
+                             uint16_t *vlan_tag)
 {
     struct virtio_net_hdr *vhdr;
     bool hasip4, hasip6, csum_valid;
@@ -1298,7 +1372,6 @@ igb_build_rx_metadata(IGBCore *core,
     *status_flags = E1000_RXD_STAT_DD;
 
     /* No additional metadata needed for non-EOP descriptors */
-    /* TODO: EOP apply only to status so don't skip whole function. */
     if (!is_eop) {
         goto func_exit;
     }
@@ -1315,64 +1388,6 @@ igb_build_rx_metadata(IGBCore *core,
         trace_e1000e_rx_metadata_vlan(*vlan_tag);
     }
 
-    /* Packet parsing results */
-    if ((core->mac[RXCSUM] & E1000_RXCSUM_PCSD) != 0) {
-        if (rss_info->enabled) {
-            *rss = cpu_to_le32(rss_info->hash);
-            trace_igb_rx_metadata_rss(*rss);
-        }
-    } else if (hasip4) {
-        *status_flags |= E1000_RXD_STAT_IPIDV;
-        *ip_id = cpu_to_le16(net_rx_pkt_get_ip_id(pkt));
-        trace_e1000e_rx_metadata_ip_id(*ip_id);
-    }
-
-    if (l4hdr_proto == ETH_L4_HDR_PROTO_TCP && net_rx_pkt_is_tcp_ack(pkt)) {
-        *status_flags |= E1000_RXD_STAT_ACK;
-        trace_e1000e_rx_metadata_ack();
-    }
-
-    if (pkt_info) {
-        *pkt_info = rss_info->enabled ? rss_info->type : 0;
-
-        if (etqf < 8) {
-            *pkt_info |= (BIT(11) | etqf) << 4;
-        } else {
-            if (hasip4) {
-                *pkt_info |= E1000_ADVRXD_PKT_IP4;
-            }
-
-            if (hasip6) {
-                *pkt_info |= E1000_ADVRXD_PKT_IP6;
-            }
-
-            switch (l4hdr_proto) {
-            case ETH_L4_HDR_PROTO_TCP:
-                *pkt_info |= E1000_ADVRXD_PKT_TCP;
-                break;
-
-            case ETH_L4_HDR_PROTO_UDP:
-                *pkt_info |= E1000_ADVRXD_PKT_UDP;
-                break;
-
-            case ETH_L4_HDR_PROTO_SCTP:
-                *pkt_info |= E1000_ADVRXD_PKT_SCTP;
-                break;
-
-            default:
-                break;
-            }
-        }
-    }
-
-    if (hdr_info) {
-        *hdr_info = 0;
-    }
-
-    if (ts) {
-        *status_flags |= BIT(16);
-    }
-
     /* RX CSO information */
     if (hasip6 && (core->mac[RFCTL] & E1000_RFCTL_IPV6_XSUM_DIS)) {
         trace_e1000e_rx_metadata_ipv6_sum_disabled();
@@ -1428,56 +1443,168 @@ func_exit:
 
 static inline void
 igb_write_lgcy_rx_descr(IGBCore *core, struct e1000_rx_desc *desc,
                         struct NetRxPkt *pkt,
-                        const E1000E_RSSInfo *rss_info, uint16_t etqf, bool ts,
+                        const E1000E_RSSInfo *rss_info,
                         uint16_t length)
 {
-    uint32_t status_flags, rss;
-    uint16_t ip_id;
+    uint32_t status_flags;
 
     assert(!rss_info->enabled);
+
+    memset(desc, 0, sizeof(*desc));
     desc->length = cpu_to_le16(length);
-    desc->csum = 0;
+    igb_build_rx_metadata_common(core, pkt, pkt != NULL,
+                                 &status_flags,
+                                 &desc->special);
 
-    igb_build_rx_metadata(core, pkt, pkt != NULL,
-                          rss_info, etqf, ts,
-                          NULL, NULL, &rss,
-                          &status_flags, &ip_id,
-                          &desc->special);
     desc->errors = (uint8_t) (le32_to_cpu(status_flags) >> 24);
     desc->status = (uint8_t) le32_to_cpu(status_flags);
 }
 
+static bool
+igb_rx_ps_descriptor_split_always(IGBCore *core, const E1000ERingInfo *r)
+{
+    uint32_t desctyp = igb_rx_queue_desctyp_get(core, r);
+    return desctyp == E1000_SRRCTL_DESCTYPE_HDR_SPLIT_ALWAYS;
+}
+
+static uint16_t
+igb_rx_desc_get_packet_type(IGBCore *core, struct NetRxPkt *pkt, uint16_t etqf)
+{
+    uint16_t pkt_type;
+    bool hasip4, hasip6;
+    EthL4HdrProto l4hdr_proto;
+
+    if (etqf < 8) {
+        pkt_type = BIT(11) | etqf;
+        return pkt_type;
+    }
+
+    net_rx_pkt_get_protocols(pkt, &hasip4, &hasip6, &l4hdr_proto);
+
+    if (hasip6 && !(core->mac[RFCTL] & E1000_RFCTL_IPV6_DIS)) {
+        eth_ip6_hdr_info *ip6hdr_info = net_rx_pkt_get_ip6_info(pkt);
+        pkt_type = ip6hdr_info->has_ext_hdrs ? E1000_ADVRXD_PKT_IP6E :
+                                               E1000_ADVRXD_PKT_IP6;
+    } else if (hasip4) {
+        pkt_type = E1000_ADVRXD_PKT_IP4;
+    } else {
+        pkt_type = 0;
+    }
+
+    switch (l4hdr_proto) {
+    case ETH_L4_HDR_PROTO_TCP:
+        pkt_type |= E1000_ADVRXD_PKT_TCP;
+        break;
+    case ETH_L4_HDR_PROTO_UDP:
+        pkt_type |= E1000_ADVRXD_PKT_UDP;
+        break;
+    case ETH_L4_HDR_PROTO_SCTP:
+        pkt_type |= E1000_ADVRXD_PKT_SCTP;
+        break;
+    default:
+        break;
+    }
+
+    return pkt_type;
+}
+
 static inline void
 igb_write_adv_rx_descr(IGBCore *core, union e1000_adv_rx_desc *desc,
                        struct NetRxPkt *pkt,
                        const E1000E_RSSInfo *rss_info, uint16_t etqf, bool ts,
                        uint16_t length)
 {
+    bool hasip4, hasip6;
+    EthL4HdrProto l4hdr_proto;
+    uint16_t rss_type = 0, pkt_type;
+    bool eop = (pkt != NULL);
+    uint32_t adv_desc_status_error = 0;
     memset(&desc->wb, 0, sizeof(desc->wb));
 
     desc->wb.upper.length = cpu_to_le16(length);
+    igb_build_rx_metadata_common(core, pkt, eop,
+                                 &desc->wb.upper.status_error,
+                                 &desc->wb.upper.vlan);
+
+    if (!eop) {
+        return;
+    }
+
+    net_rx_pkt_get_protocols(pkt, &hasip4, &hasip6, &l4hdr_proto);
+
+    if ((core->mac[RXCSUM] & E1000_RXCSUM_PCSD) != 0) {
+        if (rss_info->enabled) {
+            desc->wb.lower.hi_dword.rss = cpu_to_le32(rss_info->hash);
+            rss_type = rss_info->type;
+            trace_igb_rx_metadata_rss(desc->wb.lower.hi_dword.rss, rss_type);
+        }
+    } else if (hasip4) {
+        adv_desc_status_error |= E1000_RXD_STAT_IPIDV;
+        desc->wb.lower.hi_dword.csum_ip.ip_id =
+            cpu_to_le16(net_rx_pkt_get_ip_id(pkt));
+        trace_e1000e_rx_metadata_ip_id(
+            desc->wb.lower.hi_dword.csum_ip.ip_id);
+    }
+
+    if (ts) {
+        adv_desc_status_error |= BIT(16);
+    }
+
+    pkt_type = igb_rx_desc_get_packet_type(core, pkt, etqf);
+    trace_e1000e_rx_metadata_pkt_type(pkt_type);
+    desc->wb.lower.lo_dword.pkt_info = cpu_to_le16(rss_type | (pkt_type << 4));
+    desc->wb.upper.status_error |= cpu_to_le32(adv_desc_status_error);
+}
+
+static inline void
+igb_write_adv_ps_rx_descr(IGBCore *core,
+                          union e1000_adv_rx_desc *desc,
+                          struct NetRxPkt *pkt,
+                          const E1000E_RSSInfo *rss_info,
+                          const E1000ERingInfo *r,
+                          uint16_t etqf,
+                          bool ts,
+                          IGBPacketRxDMAState *pdma_st)
+{
+    size_t pkt_len;
+    uint16_t hdr_info = 0;
 
-    igb_build_rx_metadata(core, pkt, pkt != NULL,
-                          rss_info, etqf, ts,
-                          &desc->wb.lower.lo_dword.pkt_info,
-                          &desc->wb.lower.lo_dword.hdr_info,
-                          &desc->wb.lower.hi_dword.rss,
-                          &desc->wb.upper.status_error,
-                          &desc->wb.lower.hi_dword.csum_ip.ip_id,
-                          &desc->wb.upper.vlan);
+    if (pdma_st->do_ps) {
+        pkt_len = pdma_st->bastate.written[1];
+    } else {
+        pkt_len = pdma_st->bastate.written[0] + pdma_st->bastate.written[1];
+    }
+
+    igb_write_adv_rx_descr(core, desc, pkt, rss_info, etqf, ts, pkt_len);
+
+    hdr_info = (pdma_st->ps_desc_data.hdr_len << E1000_ADVRXD_HDR_LEN_OFFSET) &
+               E1000_ADVRXD_ADV_HDR_LEN_MASK;
+    hdr_info |= pdma_st->ps_desc_data.sph ? E1000_ADVRXD_HDR_SPH : 0;
+    desc->wb.lower.lo_dword.hdr_info = cpu_to_le16(hdr_info);
+
+    desc->wb.upper.status_error |= cpu_to_le32(
+        pdma_st->ps_desc_data.hbo ? E1000_ADVRXD_ST_ERR_HBO_OFFSET : 0);
 }
 
 static inline void
-igb_write_rx_descr(IGBCore *core, union e1000_rx_desc_union *desc,
-                   struct NetRxPkt *pkt, const E1000E_RSSInfo *rss_info,
-                   uint16_t etqf, bool ts, uint16_t length)
+igb_write_rx_descr(IGBCore *core,
+                   union e1000_rx_desc_union *desc,
+                   struct NetRxPkt *pkt,
+                   const E1000E_RSSInfo *rss_info,
+                   uint16_t etqf,
+                   bool ts,
+                   IGBPacketRxDMAState *pdma_st,
+                   const E1000ERingInfo *r)
 {
     if (igb_rx_use_legacy_descriptor(core)) {
         igb_write_lgcy_rx_descr(core, &desc->legacy, pkt, rss_info,
-                                etqf, ts, length);
+                                pdma_st->bastate.written[1]);
+    } else if (igb_rx_use_ps_descriptor(core, r)) {
+        igb_write_adv_ps_rx_descr(core, &desc->adv, pkt, rss_info, r, etqf, ts,
+                                  pdma_st);
     } else {
         igb_write_adv_rx_descr(core, &desc->adv, pkt, rss_info,
-                               etqf, ts, length);
+                               etqf, ts, pdma_st->bastate.written[1]);
     }
 }
 
@@ -1514,20 +1641,7 @@ igb_pci_dma_write_rx_desc(IGBCore *core, PCIDevice *dev, dma_addr_t addr,
 }
 
 static void
-igb_write_to_rx_buffers(IGBCore *core,
-                        PCIDevice *d,
-                        hwaddr ba,
-                        uint16_t *written,
-                        const char *data,
-                        dma_addr_t data_len)
-{
-    trace_igb_rx_desc_buff_write(ba, *written, data, data_len);
-    pci_dma_write(d, ba + *written, data, data_len);
-    *written += data_len;
-}
-
-static void
-igb_update_rx_stats(IGBCore *core, const E1000E_RingInfo *rxi,
+igb_update_rx_stats(IGBCore *core, const E1000ERingInfo *rxi,
                     size_t pkt_size, size_t pkt_fcs_size)
 {
     eth_pkt_types_e pkt_type = net_rx_pkt_get_packet_type(core->rx_pkt);
@@ -1545,12 +1659,256 @@ igb_update_rx_stats(IGBCore *core, const E1000E_RingInfo *rxi,
 }
 
 static inline bool
-igb_rx_descr_threshold_hit(IGBCore *core, const E1000E_RingInfo *rxi)
+igb_rx_descr_threshold_hit(IGBCore *core, const E1000ERingInfo *rxi)
 {
     return igb_ring_free_descr_num(core, rxi) ==
            ((core->mac[E1000_SRRCTL(rxi->idx) >> 2] >> 20) & 31) * 16;
 }
 
+static bool
+igb_do_ps(IGBCore *core,
+          const E1000ERingInfo *r,
+          struct NetRxPkt *pkt,
+          IGBPacketRxDMAState *pdma_st)
+{
+    bool hasip4, hasip6;
+    EthL4HdrProto l4hdr_proto;
+    bool fragment;
+    bool split_always;
+    size_t bheader_size;
+    size_t total_pkt_len;
+
+    if (!igb_rx_use_ps_descriptor(core, r)) {
+        return false;
+    }
+
+    total_pkt_len = net_rx_pkt_get_total_len(pkt);
+    bheader_size = igb_rxhdrbufsize(core, r);
+    split_always = igb_rx_ps_descriptor_split_always(core, r);
+    if (split_always && total_pkt_len <= bheader_size) {
+        pdma_st->ps_hdr_len = total_pkt_len;
+        pdma_st->ps_desc_data.hdr_len = total_pkt_len;
+        return true;
+    }
+
+    net_rx_pkt_get_protocols(pkt, &hasip4, &hasip6, &l4hdr_proto);
+
+    if (hasip4) {
+        fragment = net_rx_pkt_get_ip4_info(pkt)->fragment;
+    } else if (hasip6) {
+        fragment = net_rx_pkt_get_ip6_info(pkt)->fragment;
+    } else {
+        pdma_st->ps_desc_data.hdr_len = bheader_size;
+        goto header_not_handled;
+    }
+
+    if (fragment && (core->mac[RFCTL] & E1000_RFCTL_IPFRSP_DIS)) {
+        pdma_st->ps_desc_data.hdr_len = bheader_size;
+        goto header_not_handled;
+    }
+
+    /* no header splitting for SCTP */
+    if (!fragment && (l4hdr_proto == ETH_L4_HDR_PROTO_UDP ||
+                      l4hdr_proto == ETH_L4_HDR_PROTO_TCP)) {
+        pdma_st->ps_hdr_len = net_rx_pkt_get_l5_hdr_offset(pkt);
+    } else {
+        pdma_st->ps_hdr_len = net_rx_pkt_get_l4_hdr_offset(pkt);
+    }
+
+    pdma_st->ps_desc_data.sph = true;
+    pdma_st->ps_desc_data.hdr_len = pdma_st->ps_hdr_len;
+
+    if (pdma_st->ps_hdr_len > bheader_size) {
+        pdma_st->ps_desc_data.hbo = true;
+        goto header_not_handled;
+    }
+
+    return true;
+
+header_not_handled:
+    if (split_always) {
+        pdma_st->ps_hdr_len = bheader_size;
+        return true;
+    }
+
+    return false;
+}
+
+static void
+igb_truncate_to_descriptor_size(IGBPacketRxDMAState *pdma_st, size_t *size)
+{
+    if (pdma_st->do_ps && pdma_st->is_first) {
+        if (*size > pdma_st->rx_desc_packet_buf_size + pdma_st->ps_hdr_len) {
+            *size = pdma_st->rx_desc_packet_buf_size + pdma_st->ps_hdr_len;
+        }
+    } else {
+        if (*size > pdma_st->rx_desc_packet_buf_size) {
+            *size = pdma_st->rx_desc_packet_buf_size;
+        }
+    }
+}
+
+static inline void
+igb_write_hdr_frag_to_rx_buffers(IGBCore *core,
+                                 PCIDevice *d,
+                                 IGBPacketRxDMAState *pdma_st,
+                                 const char *data,
+                                 dma_addr_t data_len)
+{
+    assert(data_len <= pdma_st->rx_desc_header_buf_size -
+                       pdma_st->bastate.written[0]);
+    pci_dma_write(d,
+                  pdma_st->ba[0] + pdma_st->bastate.written[0],
+                  data, data_len);
+    pdma_st->bastate.written[0] += data_len;
+    pdma_st->bastate.cur_idx = 1;
+}
+
+static void
+igb_write_header_to_rx_buffers(IGBCore *core,
+                               struct NetRxPkt *pkt,
+                               PCIDevice *d,
+                               IGBPacketRxDMAState *pdma_st,
+                               size_t *copy_size)
+{
+    size_t iov_copy;
+    size_t ps_hdr_copied = 0;
+
+    if (!pdma_st->is_first) {
+        /* Leave buffer 0 of each descriptor except first */
+        /* empty                                          */
+        pdma_st->bastate.cur_idx = 1;
+        return;
+    }
+
+    do {
+        iov_copy = MIN(pdma_st->ps_hdr_len - ps_hdr_copied,
+                       pdma_st->iov->iov_len - pdma_st->iov_ofs);
+
+        igb_write_hdr_frag_to_rx_buffers(core, d, pdma_st,
+                                         pdma_st->iov->iov_base,
+                                         iov_copy);
+
+        *copy_size -= iov_copy;
+        ps_hdr_copied += iov_copy;
+
+        pdma_st->iov_ofs += iov_copy;
+        if (pdma_st->iov_ofs == pdma_st->iov->iov_len) {
+            pdma_st->iov++;
+            pdma_st->iov_ofs = 0;
+        }
+    } while (ps_hdr_copied < pdma_st->ps_hdr_len);
+
+    pdma_st->is_first = false;
+}
+
+static void
+igb_write_payload_frag_to_rx_buffers(IGBCore *core,
+                                     PCIDevice *d,
+                                     IGBPacketRxDMAState *pdma_st,
+                                     const char *data,
+                                     dma_addr_t data_len)
+{
+    while (data_len > 0) {
+        assert(pdma_st->bastate.cur_idx < IGB_MAX_PS_BUFFERS);
+
+        uint32_t cur_buf_bytes_left =
+            pdma_st->rx_desc_packet_buf_size -
+            pdma_st->bastate.written[pdma_st->bastate.cur_idx];
+        uint32_t bytes_to_write = MIN(data_len, cur_buf_bytes_left);
+
+        trace_igb_rx_desc_buff_write(
+            pdma_st->bastate.cur_idx,
+            pdma_st->ba[pdma_st->bastate.cur_idx],
+            pdma_st->bastate.written[pdma_st->bastate.cur_idx],
+            data,
+            bytes_to_write);
+
+        pci_dma_write(d,
+                      pdma_st->ba[pdma_st->bastate.cur_idx] +
+                      pdma_st->bastate.written[pdma_st->bastate.cur_idx],
+                      data, bytes_to_write);
+
+        pdma_st->bastate.written[pdma_st->bastate.cur_idx] += bytes_to_write;
+        data += bytes_to_write;
+        data_len -= bytes_to_write;
+
+        if (pdma_st->bastate.written[pdma_st->bastate.cur_idx] ==
+            pdma_st->rx_desc_packet_buf_size) {
+            pdma_st->bastate.cur_idx++;
+        }
+    }
+}
+
+static void
+igb_write_payload_to_rx_buffers(IGBCore *core,
+                                struct NetRxPkt *pkt,
+                                PCIDevice *d,
+                                IGBPacketRxDMAState *pdma_st,
+                                size_t *copy_size)
+{
+    static const uint32_t fcs_pad;
+    size_t iov_copy;
+
+    /* Copy packet payload */
+    while (*copy_size) {
+        iov_copy = MIN(*copy_size, pdma_st->iov->iov_len - pdma_st->iov_ofs);
+        igb_write_payload_frag_to_rx_buffers(core, d,
+                                             pdma_st,
+                                             pdma_st->iov->iov_base +
+                                             pdma_st->iov_ofs,
+                                             iov_copy);
+
+        *copy_size -= iov_copy;
+        pdma_st->iov_ofs += iov_copy;
+        if (pdma_st->iov_ofs == pdma_st->iov->iov_len) {
+            pdma_st->iov++;
+            pdma_st->iov_ofs = 0;
+        }
+    }
+
+    if (pdma_st->desc_offset + pdma_st->desc_size >= pdma_st->total_size) {
+        /* Simulate FCS checksum presence in the last descriptor */
+        igb_write_payload_frag_to_rx_buffers(core, d,
+                                             pdma_st,
+                                             (const char *) &fcs_pad,
+                                             e1000x_fcs_len(core->mac));
+    }
+}
+
+static void
+igb_write_to_rx_buffers(IGBCore *core,
+                        struct NetRxPkt *pkt,
+                        PCIDevice *d,
+                        IGBPacketRxDMAState *pdma_st)
+{
+    size_t copy_size;
+
+    if (!(pdma_st->ba)[1] || (pdma_st->do_ps && !(pdma_st->ba[0]))) {
+        /* as per intel docs; skip descriptors with null buf addr */
+        trace_e1000e_rx_null_descriptor();
+        return;
+    }
+
+    if (pdma_st->desc_offset >= pdma_st->size) {
+        return;
+    }
+
+    pdma_st->desc_size = pdma_st->total_size - pdma_st->desc_offset;
+    igb_truncate_to_descriptor_size(pdma_st, &pdma_st->desc_size);
+    copy_size = pdma_st->size - pdma_st->desc_offset;
+    igb_truncate_to_descriptor_size(pdma_st, &copy_size);
+
+    /* For PS mode copy the packet header first */
+    if (pdma_st->do_ps) {
+        igb_write_header_to_rx_buffers(core, pkt, d, pdma_st, &copy_size);
+    } else {
+        pdma_st->bastate.cur_idx = 1;
+    }
+
+    igb_write_payload_to_rx_buffers(core, pkt, d, pdma_st, &copy_size);
+}
+
 static void
 igb_write_packet_to_guest(IGBCore *core, struct NetRxPkt *pkt,
                           const E1000E_RxRing *rxr,
@@ -1560,95 +1918,61 @@ igb_write_packet_to_guest(IGBCore *core, struct NetRxPkt *pkt,
     PCIDevice *d;
     dma_addr_t base;
     union e1000_rx_desc_union desc;
-    size_t desc_size;
-    size_t desc_offset = 0;
-    size_t iov_ofs = 0;
-
-    struct iovec *iov = net_rx_pkt_get_iovec(pkt);
-    size_t size = net_rx_pkt_get_total_len(pkt);
-    size_t total_size = size + e1000x_fcs_len(core->mac);
-    const E1000E_RingInfo *rxi = rxr->i;
-    size_t bufsize = igb_rxbufsize(core, rxi);
-
+    const E1000ERingInfo *rxi;
+    size_t rx_desc_len;
+
+    IGBPacketRxDMAState pdma_st = {0};
+    pdma_st.is_first = true;
+    pdma_st.size = net_rx_pkt_get_total_len(pkt);
+    pdma_st.total_size = pdma_st.size + e1000x_fcs_len(core->mac);
+
+    rxi = rxr->i;
+    rx_desc_len = core->rx_desc_len;
+    pdma_st.rx_desc_packet_buf_size = igb_rxbufsize(core, rxi);
+    pdma_st.rx_desc_header_buf_size = igb_rxhdrbufsize(core, rxi);
+    pdma_st.iov = net_rx_pkt_get_iovec(pkt);
     d = pcie_sriov_get_vf_at_index(core->owner, rxi->idx % 8);
     if (!d) {
         d = core->owner;
     }
 
+    pdma_st.do_ps = igb_do_ps(core, rxi, pkt, &pdma_st);
+
     do {
-        hwaddr ba;
-        uint16_t written = 0;
+        memset(&pdma_st.bastate, 0, sizeof(IGBBAState));
         bool is_last = false;
 
-        desc_size = total_size - desc_offset;
-
-        if (desc_size > bufsize) {
-            desc_size = bufsize;
-        }
-
         if (igb_ring_empty(core, rxi)) {
             return;
         }
 
         base = igb_ring_head_descr(core, rxi);
+        pci_dma_read(d, base, &desc, rx_desc_len);
+        trace_e1000e_rx_descr(rxi->idx, base, rx_desc_len);
 
-        pci_dma_read(d, base, &desc, core->rx_desc_len);
-
-        trace_e1000e_rx_descr(rxi->idx, base, core->rx_desc_len);
-
-        igb_read_rx_descr(core, &desc, &ba);
-
-        if (ba) {
-            if (desc_offset < size) {
-                static const uint32_t fcs_pad;
-                size_t iov_copy;
-                size_t copy_size = size - desc_offset;
-                if (copy_size > bufsize) {
-                    copy_size = bufsize;
-                }
-
-                /* Copy packet payload */
-                while (copy_size) {
-                    iov_copy = MIN(copy_size, iov->iov_len - iov_ofs);
-
-                    igb_write_to_rx_buffers(core, d, ba, &written,
-                                            iov->iov_base + iov_ofs, iov_copy);
-
-                    copy_size -= iov_copy;
-                    iov_ofs += iov_copy;
-                    if (iov_ofs == iov->iov_len) {
-                        iov++;
-                        iov_ofs = 0;
-                    }
-                }
-
-                if (desc_offset + desc_size >= total_size) {
-                    /* Simulate FCS checksum presence in the last descriptor */
-                    igb_write_to_rx_buffers(core, d, ba, &written,
-                          (const char *) &fcs_pad, e1000x_fcs_len(core->mac));
-                }
-            }
-        } else { /* as per intel docs; skip descriptors with null buf addr */
-            trace_e1000e_rx_null_descriptor();
-        }
-        desc_offset += desc_size;
-        if (desc_offset >= total_size) {
+        igb_read_rx_descr(core, &desc, &pdma_st, rxi);
+
+        igb_write_to_rx_buffers(core, pkt, d, &pdma_st);
+        pdma_st.desc_offset += pdma_st.desc_size;
+        if (pdma_st.desc_offset >= pdma_st.total_size) {
            is_last = true;
         }
 
-        igb_write_rx_descr(core, &desc, is_last ? core->rx_pkt : NULL,
-                           rss_info, etqf, ts, written);
-        igb_pci_dma_write_rx_desc(core, d, base, &desc, core->rx_desc_len);
-
-        igb_ring_advance(core, rxi, core->rx_desc_len / E1000_MIN_RX_DESC_LEN);
-
-    } while (desc_offset < total_size);
+        igb_write_rx_descr(core, &desc,
+                           is_last ? pkt : NULL,
+                           rss_info,
+                           etqf, ts,
+                           &pdma_st,
+                           rxi);
+        igb_pci_dma_write_rx_desc(core, d, base, &desc, rx_desc_len);
+        igb_ring_advance(core, rxi, rx_desc_len / E1000_MIN_RX_DESC_LEN);
+    } while (pdma_st.desc_offset < pdma_st.total_size);
 
-    igb_update_rx_stats(core, rxi, size, total_size);
+    igb_update_rx_stats(core, rxi, pdma_st.size, pdma_st.total_size);
 }
 
 static bool
-igb_rx_strip_vlan(IGBCore *core, const E1000E_RingInfo *rxi)
+igb_rx_strip_vlan(IGBCore *core, const E1000ERingInfo *rxi)
 {
     if (core->mac[MRQC] & 1) {
         uint16_t pool = rxi->idx % IGB_NUM_VM_POOLS;
@@ -2753,7 +3077,7 @@ igb_update_rx_offloads(IGBCore *core)
 
     if (core->has_vnet) {
         qemu_set_offload(qemu_get_queue(core->owner_nic)->peer,
-                         cso_state, 0, 0, 0, 0);
+                         cso_state, 0, 0, 0, 0, 0, 0);
     }
 }
diff --git a/hw/net/igb_regs.h b/hw/net/igb_regs.h
index 82ff195dfc..ed7427b8fe 100644
--- a/hw/net/igb_regs.h
+++ b/hw/net/igb_regs.h
@@ -452,6 +452,7 @@ union e1000_adv_rx_desc {
 #define E1000_SRRCTL_BSIZEHDRSIZE_MASK 0x00000F00
 #define E1000_SRRCTL_BSIZEHDRSIZE_SHIFT 2 /* Shift _left_ */
 #define E1000_SRRCTL_DESCTYPE_ADV_ONEBUF 0x02000000
+#define E1000_SRRCTL_DESCTYPE_HDR_SPLIT 0x04000000
 #define E1000_SRRCTL_DESCTYPE_HDR_SPLIT_ALWAYS 0x0A000000
 #define E1000_SRRCTL_DESCTYPE_MASK 0x0E000000
 #define E1000_SRRCTL_DROP_EN 0x80000000
@@ -692,11 +693,20 @@ union e1000_adv_rx_desc {
 
 #define E1000_STATUS_NUM_VFS_SHIFT 14
 
-#define E1000_ADVRXD_PKT_IP4 BIT(4)
-#define E1000_ADVRXD_PKT_IP6 BIT(6)
-#define E1000_ADVRXD_PKT_TCP BIT(8)
-#define E1000_ADVRXD_PKT_UDP BIT(9)
-#define E1000_ADVRXD_PKT_SCTP BIT(10)
+#define E1000_ADVRXD_PKT_IP4 BIT(0)
+#define E1000_ADVRXD_PKT_IP6 BIT(2)
+#define E1000_ADVRXD_PKT_IP6E BIT(3)
+#define E1000_ADVRXD_PKT_TCP BIT(4)
+#define E1000_ADVRXD_PKT_UDP BIT(5)
+#define E1000_ADVRXD_PKT_SCTP BIT(6)
+
+#define IGB_MAX_PS_BUFFERS 2
+
+#define E1000_ADVRXD_HDR_LEN_OFFSET (21 - 16)
+#define E1000_ADVRXD_ADV_HDR_LEN_MASK ((BIT(10) - 1) << \
+                                       E1000_ADVRXD_HDR_LEN_OFFSET)
+#define E1000_ADVRXD_HDR_SPH BIT(15)
+#define E1000_ADVRXD_ST_ERR_HBO_OFFSET BIT(3 + 20)
 
 static inline uint8_t igb_ivar_entry_rx(uint8_t i)
 {
diff --git a/hw/net/rocker/rocker_of_dpa.c b/hw/net/rocker/rocker_of_dpa.c
index dfe4754469..5e16056be6 100644
--- a/hw/net/rocker/rocker_of_dpa.c
+++ b/hw/net/rocker/rocker_of_dpa.c
@@ -1043,7 +1043,7 @@ static void of_dpa_flow_ig_tbl(OfDpaFlowContext *fc, uint32_t tbl_id)
 static ssize_t of_dpa_ig(World *world, uint32_t pport,
                          const struct iovec *iov, int iovcnt)
 {
-    struct iovec iov_copy[iovcnt + 2];
+    g_autofree struct iovec *iov_copy = g_new(struct iovec, iovcnt + 2);
     OfDpaFlowContext fc = {
         .of_dpa = world_private(world),
         .in_pport = pport,
diff --git a/hw/net/trace-events b/hw/net/trace-events
index 6b5ba669a2..3abfd65e5b 100644
--- a/hw/net/trace-events
+++ b/hw/net/trace-events
@@ -278,9 +278,9 @@ igb_core_mdic_write_unhandled(uint32_t addr) "MDIC WRITE: PHY[%u] UNHANDLED"
 igb_link_set_ext_params(bool asd_check, bool speed_select_bypass, bool pfrstd) "Set extended link params: ASD check: %d, Speed select bypass: %d, PF reset done: %d"
 
 igb_rx_desc_buff_size(uint32_t b) "buffer size: %u"
-igb_rx_desc_buff_write(uint64_t addr, uint16_t offset, const void* source, uint32_t len) "addr: 0x%"PRIx64", offset: %u, from: %p, length: %u"
+igb_rx_desc_buff_write(uint8_t idx, uint64_t addr, uint16_t offset, const void* source, uint32_t len) "buffer %u, addr: 0x%"PRIx64", offset: %u, from: %p, length: %u"
 
-igb_rx_metadata_rss(uint32_t rss) "RSS data: 0x%X"
+igb_rx_metadata_rss(uint32_t rss, uint16_t rss_pkt_type) "RSS data: rss: 0x%X, rss_pkt_type: 0x%X"
 
 igb_irq_icr_clear_gpie_nsicr(void) "Clearing ICR on read due to GPIE.NSICR enabled"
 igb_irq_set_iam(uint32_t icr) "Update IAM: 0x%x"
@@ -295,6 +295,8 @@ igb_irq_eitr_set(uint32_t eitr_num, uint32_t val) "EITR[%u] = 0x%x"
 igb_set_pfmailbox(uint32_t vf_num, uint32_t val) "PFMailbox[%d]: 0x%x"
 igb_set_vfmailbox(uint32_t vf_num, uint32_t val) "VFMailbox[%d]: 0x%x"
 
+igb_wrn_rx_desc_modes_not_supp(int desc_type) "Not supported descriptor type: %d"
+
 # igbvf.c
 igbvf_wrn_io_addr_unknown(uint64_t addr) "IO unknown register 0x%"PRIx64
diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c
index 6b958d6363..57427a3997 100644
--- a/hw/net/vhost_net.c
+++ b/hw/net/vhost_net.c
@@ -78,6 +78,9 @@ static const int user_feature_bits[] = {
     VIRTIO_F_RING_RESET,
     VIRTIO_NET_F_RSS,
     VIRTIO_NET_F_HASH_REPORT,
+    VIRTIO_NET_F_GUEST_USO4,
+    VIRTIO_NET_F_GUEST_USO6,
+    VIRTIO_NET_F_HOST_USO,
 
     /* This bit implies RARP isn't sent by QEMU out of band */
     VIRTIO_NET_F_GUEST_ANNOUNCE,
diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index 7102ec4817..bd0ead94fe 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -659,6 +659,15 @@ static int peer_has_ufo(VirtIONet *n)
     return n->has_ufo;
 }
 
+static int peer_has_uso(VirtIONet *n)
+{
+    if (!peer_has_vnet_hdr(n)) {
+        return 0;
+    }
+
+    return qemu_has_uso(qemu_get_queue(n->nic)->peer);
+}
+
 static void virtio_net_set_mrg_rx_bufs(VirtIONet *n, int mergeable_rx_bufs,
                                        int version_1, int hash_report)
 {
@@ -796,6 +805,10 @@ static uint64_t virtio_net_get_features(VirtIODevice *vdev, uint64_t features,
         virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_TSO6);
         virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_ECN);
 
+        virtio_clear_feature(&features, VIRTIO_NET_F_HOST_USO);
+        virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO4);
+        virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO6);
+
         virtio_clear_feature(&features, VIRTIO_NET_F_HASH_REPORT);
     }
 
@@ -804,6 +817,12 @@ static uint64_t virtio_net_get_features(VirtIODevice *vdev, uint64_t features,
         virtio_clear_feature(&features, VIRTIO_NET_F_HOST_UFO);
     }
 
+    if (!peer_has_uso(n)) {
+        virtio_clear_feature(&features, VIRTIO_NET_F_HOST_USO);
+        virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO4);
+        virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO6);
+    }
+
     if (!get_vhost_net(nc->peer)) {
         return features;
     }
@@ -859,17 +878,21 @@ static void virtio_net_apply_guest_offloads(VirtIONet *n)
         !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_TSO4)),
         !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_TSO6)),
         !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_ECN)),
-        !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_UFO)));
+        !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_UFO)),
+        !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_USO4)),
+        !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_USO6)));
 }
 
-static uint64_t virtio_net_guest_offloads_by_features(uint32_t features)
+static uint64_t virtio_net_guest_offloads_by_features(uint64_t features)
 {
     static const uint64_t guest_offloads_mask =
         (1ULL << VIRTIO_NET_F_GUEST_CSUM) |
         (1ULL << VIRTIO_NET_F_GUEST_TSO4) |
         (1ULL << VIRTIO_NET_F_GUEST_TSO6) |
         (1ULL << VIRTIO_NET_F_GUEST_ECN)  |
-        (1ULL << VIRTIO_NET_F_GUEST_UFO);
+        (1ULL << VIRTIO_NET_F_GUEST_UFO)  |
+        (1ULL << VIRTIO_NET_F_GUEST_USO4) |
+        (1ULL << VIRTIO_NET_F_GUEST_USO6);
 
     return guest_offloads_mask & features;
 }
@@ -3922,6 +3945,12 @@ static Property virtio_net_properties[] = {
     DEFINE_PROP_INT32("speed", VirtIONet, net_conf.speed, SPEED_UNKNOWN),
     DEFINE_PROP_STRING("duplex", VirtIONet, net_conf.duplex_str),
     DEFINE_PROP_BOOL("failover", VirtIONet, failover, false),
+    DEFINE_PROP_BIT64("guest_uso4", VirtIONet, host_features,
+                      VIRTIO_NET_F_GUEST_USO4, true),
+    DEFINE_PROP_BIT64("guest_uso6", VirtIONet, host_features,
+                      VIRTIO_NET_F_GUEST_USO6, true),
+    DEFINE_PROP_BIT64("host_uso", VirtIONet, host_features,
+                      VIRTIO_NET_F_HOST_USO, true),
     DEFINE_PROP_END_OF_LIST(),
 };
diff --git a/hw/net/vmxnet3.c b/hw/net/vmxnet3.c
index 3fb108751a..226c0777f0 100644
--- a/hw/net/vmxnet3.c
+++ b/hw/net/vmxnet3.c
@@ -1341,6 +1341,8 @@ static void vmxnet3_update_features(VMXNET3State *s)
                          s->lro_supported,
                          s->lro_supported,
                          0,
+                         0,
+                         0,
                          0);
     }
 }
diff --git a/include/net/net.h b/include/net/net.h
index 1448d00afb..330d285930 100644
--- a/include/net/net.h
+++ b/include/net/net.h
@@ -54,11 +54,12 @@ typedef void (LinkStatusChanged)(NetClientState *);
 typedef void (NetClientDestructor)(NetClientState *);
 typedef RxFilterInfo *(QueryRxFilter)(NetClientState *);
 typedef bool (HasUfo)(NetClientState *);
+typedef bool (HasUso)(NetClientState *);
 typedef bool (HasVnetHdr)(NetClientState *);
 typedef bool (HasVnetHdrLen)(NetClientState *, int);
 typedef bool (GetUsingVnetHdr)(NetClientState *);
 typedef void (UsingVnetHdr)(NetClientState *, bool);
-typedef void (SetOffload)(NetClientState *, int, int, int, int, int);
+typedef void (SetOffload)(NetClientState *, int, int, int, int, int, int, int);
 typedef int (GetVnetHdrLen)(NetClientState *);
 typedef void (SetVnetHdrLen)(NetClientState *, int);
 typedef int (SetVnetLE)(NetClientState *, bool);
@@ -84,6 +85,7 @@ typedef struct NetClientInfo {
     QueryRxFilter *query_rx_filter;
     NetPoll *poll;
     HasUfo *has_ufo;
+    HasUso *has_uso;
     HasVnetHdr *has_vnet_hdr;
     HasVnetHdrLen *has_vnet_hdr_len;
     GetUsingVnetHdr *get_using_vnet_hdr;
@@ -187,12 +189,13 @@ void qemu_set_info_str(NetClientState *nc, const char *fmt, ...) G_GNUC_PRINTF(2, 3);
 void qemu_format_nic_info_str(NetClientState *nc, uint8_t macaddr[6]);
 bool qemu_has_ufo(NetClientState *nc);
+bool qemu_has_uso(NetClientState *nc);
 bool qemu_has_vnet_hdr(NetClientState *nc);
 bool qemu_has_vnet_hdr_len(NetClientState *nc, int len);
 bool qemu_get_using_vnet_hdr(NetClientState *nc);
 void qemu_using_vnet_hdr(NetClientState *nc, bool enable);
 void qemu_set_offload(NetClientState *nc, int csum, int tso4, int tso6,
-                      int ecn, int ufo);
+                      int ecn, int ufo, int uso4, int uso6);
 int qemu_get_vnet_hdr_len(NetClientState *nc);
 void qemu_set_vnet_hdr_len(NetClientState *nc, int len);
 int qemu_set_vnet_le(NetClientState *nc, bool is_le);
diff --git a/meson.build b/meson.build
index 5150a74831..f426861d90 100644
--- a/meson.build
+++ b/meson.build
@@ -1873,6 +1873,13 @@ if libbpf.found() and not cc.links('''
   endif
 endif
 
+# libxdp
+libxdp = not_found
+if not get_option('af_xdp').auto() or have_system
+  libxdp = dependency('libxdp', required: get_option('af_xdp'),
+                      version: '>=1.4.0', method: 'pkg-config')
+endif
+
 # libdw
 libdw = not_found
 if not get_option('libdw').auto() or \
@@ -2099,6 +2106,7 @@ config_host_data.set('CONFIG_HEXAGON_IDEF_PARSER', get_option('hexagon_idef_pars
 config_host_data.set('CONFIG_LIBATTR', have_old_libattr)
 config_host_data.set('CONFIG_LIBCAP_NG', libcap_ng.found())
 config_host_data.set('CONFIG_EBPF', libbpf.found())
+config_host_data.set('CONFIG_AF_XDP', libxdp.found())
 config_host_data.set('CONFIG_LIBDAXCTL', libdaxctl.found())
 config_host_data.set('CONFIG_LIBISCSI', libiscsi.found())
 config_host_data.set('CONFIG_LIBNFS', libnfs.found())
@@ -4270,6 +4278,7 @@ summary_info = {}
 if targetos == 'darwin'
   summary_info += {'vmnet.framework support': vmnet}
 endif
+summary_info += {'AF_XDP support': libxdp}
 summary_info += {'slirp support': slirp}
 summary_info += {'vde support': vde}
 summary_info += {'netmap support': have_netmap}
diff --git a/meson_options.txt b/meson_options.txt
index f82d88b7c6..2ca40f22e9 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -122,6 +122,8 @@ option('avx512bw', type: 'feature', value: 'auto',
 option('keyring', type: 'feature', value: 'auto',
        description: 'Linux keyring support')
+option('af_xdp', type : 'feature', value : 'auto',
+       description: 'AF_XDP network backend support')
 option('attr', type : 'feature', value : 'auto',
        description: 'attr/xattr support')
 option('auth_pam', type : 'feature', value : 'auto',
diff --git a/net/af-xdp.c b/net/af-xdp.c
new file mode 100644
index 0000000000..6c65028fb0
--- /dev/null
+++ b/net/af-xdp.c
@@ -0,0 +1,526 @@
+/*
+ * AF_XDP network backend.
+ *
+ * Copyright (c) 2023 Red Hat, Inc.
+ *
+ * Authors:
+ *  Ilya Maximets <i.maximets@ovn.org>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+
+#include "qemu/osdep.h"
+#include <bpf/bpf.h>
+#include <inttypes.h>
+#include <linux/if_link.h>
+#include <linux/if_xdp.h>
+#include <net/if.h>
+#include <xdp/xsk.h>
+
+#include "clients.h"
+#include "monitor/monitor.h"
+#include "net/net.h"
+#include "qapi/error.h"
+#include "qemu/cutils.h"
+#include "qemu/error-report.h"
+#include "qemu/iov.h"
+#include "qemu/main-loop.h"
+#include "qemu/memalign.h"
+
+
+typedef struct AFXDPState {
+    NetClientState nc;
+
+    struct xsk_socket *xsk;
+    struct xsk_ring_cons rx;
+    struct xsk_ring_prod tx;
+    struct xsk_ring_cons cq;
+    struct xsk_ring_prod fq;
+
+    char ifname[IFNAMSIZ];
+    int ifindex;
+    bool read_poll;
+    bool write_poll;
+    uint32_t outstanding_tx;
+
+    uint64_t *pool;
+    uint32_t n_pool;
+    char *buffer;
+    struct xsk_umem *umem;
+
+    uint32_t n_queues;
+    uint32_t xdp_flags;
+    bool inhibit;
+} AFXDPState;
+
+#define AF_XDP_BATCH_SIZE 64
+
+static void af_xdp_send(void *opaque);
+static void af_xdp_writable(void *opaque);
+
+/* Set the event-loop handlers for the af-xdp backend. */
+static void af_xdp_update_fd_handler(AFXDPState *s)
+{
+    qemu_set_fd_handler(xsk_socket__fd(s->xsk),
+                        s->read_poll ? af_xdp_send : NULL,
+                        s->write_poll ? af_xdp_writable : NULL,
+                        s);
+}
+
+/* Update the read handler. */
+static void af_xdp_read_poll(AFXDPState *s, bool enable)
+{
+    if (s->read_poll != enable) {
+        s->read_poll = enable;
+        af_xdp_update_fd_handler(s);
+    }
+}
+
+/* Update the write handler. */
+static void af_xdp_write_poll(AFXDPState *s, bool enable)
+{
+    if (s->write_poll != enable) {
+        s->write_poll = enable;
+        af_xdp_update_fd_handler(s);
+    }
+}
+
+static void af_xdp_poll(NetClientState *nc, bool enable)
+{
+    AFXDPState *s = DO_UPCAST(AFXDPState, nc, nc);
+
+    if (s->read_poll != enable || s->write_poll != enable) {
+        s->write_poll = enable;
+        s->read_poll = enable;
+        af_xdp_update_fd_handler(s);
+    }
+}
+
+static void af_xdp_complete_tx(AFXDPState *s)
+{
+    uint32_t idx = 0;
+    uint32_t done, i;
+    uint64_t *addr;
+
+    done = xsk_ring_cons__peek(&s->cq, XSK_RING_CONS__DEFAULT_NUM_DESCS, &idx);
+
+    for (i = 0; i < done; i++) {
+        addr = (void *) xsk_ring_cons__comp_addr(&s->cq, idx++);
+        s->pool[s->n_pool++] = *addr;
+        s->outstanding_tx--;
+    }
+
+    if (done) {
+        xsk_ring_cons__release(&s->cq, done);
+    }
+}
+
+/*
+ * The fd_write() callback, invoked if the fd is marked as writable
+ * after a poll.
+ */
+static void af_xdp_writable(void *opaque)
+{
+    AFXDPState *s = opaque;
+
+    /* Try to recover buffers that are already sent. */
+    af_xdp_complete_tx(s);
+
+    /*
+     * Unregister the handler, unless we still have packets to transmit
+     * and kernel needs a wake up.
+     */
+    if (!s->outstanding_tx || !xsk_ring_prod__needs_wakeup(&s->tx)) {
+        af_xdp_write_poll(s, false);
+    }
+
+    /* Flush any buffered packets. */
+    qemu_flush_queued_packets(&s->nc);
+}
+
+static ssize_t af_xdp_receive(NetClientState *nc,
+                              const uint8_t *buf, size_t size)
+{
+    AFXDPState *s = DO_UPCAST(AFXDPState, nc, nc);
+    struct xdp_desc *desc;
+    uint32_t idx;
+    void *data;
+
+    /* Try to recover buffers that are already sent. */
+    af_xdp_complete_tx(s);
+
+    if (size > XSK_UMEM__DEFAULT_FRAME_SIZE) {
+        /* We can't transmit packet this size... */
+        return size;
+    }
+
+    if (!s->n_pool || !xsk_ring_prod__reserve(&s->tx, 1, &idx)) {
+        /*
+         * Out of buffers or space in tx ring. Poll until we can write.
+         * This will also kick the Tx, if it was waiting on CQ.
+         */
+        af_xdp_write_poll(s, true);
+        return 0;
+    }
+
+    desc = xsk_ring_prod__tx_desc(&s->tx, idx);
+    desc->addr = s->pool[--s->n_pool];
+    desc->len = size;
+
+    data = xsk_umem__get_data(s->buffer, desc->addr);
+    memcpy(data, buf, size);
+
+    xsk_ring_prod__submit(&s->tx, 1);
+    s->outstanding_tx++;
+
+    if (xsk_ring_prod__needs_wakeup(&s->tx)) {
+        af_xdp_write_poll(s, true);
+    }
+
+    return size;
+}
+
+/*
+ * Complete a previous send (backend --> guest) and enable the
+ * fd_read callback.
+ */
+static void af_xdp_send_completed(NetClientState *nc, ssize_t len)
+{
+    AFXDPState *s = DO_UPCAST(AFXDPState, nc, nc);
+
+    af_xdp_read_poll(s, true);
+}
+
+static void af_xdp_fq_refill(AFXDPState *s, uint32_t n)
+{
+    uint32_t i, idx = 0;
+
+    /* Leave one packet for Tx, just in case. */
+    if (s->n_pool < n + 1) {
+        n = s->n_pool;
+    }
+
+    if (!n || !xsk_ring_prod__reserve(&s->fq, n, &idx)) {
+        return;
+    }
+
+    for (i = 0; i < n; i++) {
+        *xsk_ring_prod__fill_addr(&s->fq, idx++) = s->pool[--s->n_pool];
+    }
+    xsk_ring_prod__submit(&s->fq, n);
+
+    if (xsk_ring_prod__needs_wakeup(&s->fq)) {
+        /* Receive was blocked by not having enough buffers. Wake it up. */
+        af_xdp_read_poll(s, true);
+    }
+}
+
+static void af_xdp_send(void *opaque)
+{
+    uint32_t i, n_rx, idx = 0;
+    AFXDPState *s = opaque;
+
+    n_rx = xsk_ring_cons__peek(&s->rx, AF_XDP_BATCH_SIZE, &idx);
+    if (!n_rx) {
+        return;
+    }
+
+    for (i = 0; i < n_rx; i++) {
+        const struct xdp_desc *desc;
+        struct iovec iov;
+
+        desc = xsk_ring_cons__rx_desc(&s->rx, idx++);
+
+        iov.iov_base = xsk_umem__get_data(s->buffer, desc->addr);
+        iov.iov_len = desc->len;
+
+        s->pool[s->n_pool++] = desc->addr;
+
+        if (!qemu_sendv_packet_async(&s->nc, &iov, 1,
+                                     af_xdp_send_completed)) {
+            /*
+             * The peer does not receive anymore. Packet is queued, stop
+             * reading from the backend until af_xdp_send_completed().
+             */
+            af_xdp_read_poll(s, false);
+
+            /* Return unused descriptors to not break the ring cache. */
+            xsk_ring_cons__cancel(&s->rx, n_rx - i - 1);
+            n_rx = i + 1;
+            break;
+        }
+    }
+
+    /* Release actually sent descriptors and try to re-fill. */
+    xsk_ring_cons__release(&s->rx, n_rx);
+    af_xdp_fq_refill(s, AF_XDP_BATCH_SIZE);
+}
+
+/* Flush and close. */
+static void af_xdp_cleanup(NetClientState *nc)
+{
+    AFXDPState *s = DO_UPCAST(AFXDPState, nc, nc);
+
+    qemu_purge_queued_packets(nc);
+
+    af_xdp_poll(nc, false);
+
+    xsk_socket__delete(s->xsk);
+    s->xsk = NULL;
+    g_free(s->pool);
+    s->pool = NULL;
+    xsk_umem__delete(s->umem);
+    s->umem = NULL;
+    qemu_vfree(s->buffer);
+    s->buffer = NULL;
+
+    /* Remove the program if it's the last open queue. */
+    if (!s->inhibit && nc->queue_index == s->n_queues - 1 && s->xdp_flags
+        && bpf_xdp_detach(s->ifindex, s->xdp_flags, NULL) != 0) {
+        fprintf(stderr,
+                "af-xdp: unable to remove XDP program from '%s', ifindex: %d\n",
+                s->ifname, s->ifindex);
+    }
+}
+
+static int af_xdp_umem_create(AFXDPState *s, int sock_fd, Error **errp)
+{
+    struct xsk_umem_config config = {
+        .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
+        .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
+        .frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE,
+        .frame_headroom = 0,
+    };
+    uint64_t n_descs;
+    uint64_t size;
+    int64_t i;
+    int ret;
+
+    /* Number of descriptors if all 4 queues (rx, tx, cq, fq) are full. */
+    n_descs = (XSK_RING_PROD__DEFAULT_NUM_DESCS
+               + XSK_RING_CONS__DEFAULT_NUM_DESCS) * 2;
+    size = n_descs * XSK_UMEM__DEFAULT_FRAME_SIZE;
+
+    s->buffer = qemu_memalign(qemu_real_host_page_size(), size);
+    memset(s->buffer, 0, size);
+
+    if (sock_fd < 0) {
+        ret = xsk_umem__create(&s->umem, s->buffer, size,
+                               &s->fq, &s->cq, &config);
+    } else {
+        ret = xsk_umem__create_with_fd(&s->umem, sock_fd, s->buffer, size,
+                                       &s->fq, &s->cq, &config);
+    }
+
+    if (ret) {
+        qemu_vfree(s->buffer);
+        error_setg_errno(errp, errno,
+                         "failed to create umem for %s queue_index: %d",
+                         s->ifname, s->nc.queue_index);
+        return -1;
+    }
+
+    s->pool = g_new(uint64_t, n_descs);
+    /* Fill the pool in the opposite order, because it's a LIFO queue. */
+    for (i = n_descs; i >= 0; i--) {
+        s->pool[i] = i * XSK_UMEM__DEFAULT_FRAME_SIZE;
+    }
+    s->n_pool = n_descs;
+
+    af_xdp_fq_refill(s, XSK_RING_PROD__DEFAULT_NUM_DESCS);
+
+    return 0;
+}
+
+static int af_xdp_socket_create(AFXDPState *s,
+                                const NetdevAFXDPOptions *opts, Error **errp)
+{
+    struct xsk_socket_config cfg = {
+        .rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
+        .tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
+        .libxdp_flags = 0,
+        .bind_flags = XDP_USE_NEED_WAKEUP,
+        .xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST,
+    };
+    int queue_id, error = 0;
+
+    s->inhibit = opts->has_inhibit && opts->inhibit;
+    if (s->inhibit) {
+        cfg.libxdp_flags |= XSK_LIBXDP_FLAGS__INHIBIT_PROG_LOAD;
+    }
+
+    if (opts->has_force_copy && opts->force_copy) {
+        cfg.bind_flags |= XDP_COPY;
+    }
+
+    queue_id = s->nc.queue_index;
+    if (opts->has_start_queue && opts->start_queue > 0) {
+        queue_id += opts->start_queue;
+    }
+
+    if (opts->has_mode) {
+        /* Specific mode requested. */
+        cfg.xdp_flags |= (opts->mode == AFXDP_MODE_NATIVE)
+                         ? XDP_FLAGS_DRV_MODE : XDP_FLAGS_SKB_MODE;
+        if (xsk_socket__create(&s->xsk, s->ifname, queue_id,
+                               s->umem, &s->rx, &s->tx, &cfg)) {
+            error = errno;
+        }
+    } else {
+        /* No mode requested, try native first. */
+        cfg.xdp_flags |= XDP_FLAGS_DRV_MODE;
+
+        if (xsk_socket__create(&s->xsk, s->ifname, queue_id,
+                               s->umem, &s->rx, &s->tx, &cfg)) {
+            /* Can't use native mode, try skb. */
+            cfg.xdp_flags &= ~XDP_FLAGS_DRV_MODE;
+            cfg.xdp_flags |= XDP_FLAGS_SKB_MODE;
+
+            if (xsk_socket__create(&s->xsk, s->ifname, queue_id,
+                                   s->umem, &s->rx, &s->tx, &cfg)) {
+                error = errno;
+            }
+        }
+    }
+
+    if (error) {
+        error_setg_errno(errp, error,
+                         "failed to create AF_XDP socket for %s queue_id: %d",
+                         s->ifname, queue_id);
+        return -1;
+    }
+
+    s->xdp_flags = cfg.xdp_flags;
+
+    return 0;
+}
+
+/* NetClientInfo methods. */
+static NetClientInfo net_af_xdp_info = {
+    .type = NET_CLIENT_DRIVER_AF_XDP,
+    .size = sizeof(AFXDPState),
+    .receive = af_xdp_receive,
+    .poll = af_xdp_poll,
+    .cleanup = af_xdp_cleanup,
+};
+
+static int *parse_socket_fds(const char *sock_fds_str,
+                             int64_t n_expected, Error **errp)
+{
+    gchar **substrings = g_strsplit(sock_fds_str, ":", -1);
+    int64_t i, n_sock_fds = g_strv_length(substrings);
+    int *sock_fds = NULL;
+
+    if (n_sock_fds != n_expected) {
+        error_setg(errp, "expected %"PRIi64" socket fds, got %"PRIi64,
+                   n_expected, n_sock_fds);
+        goto exit;
+    }
+
+    sock_fds = g_new(int, n_sock_fds);
+
+    for (i = 0; i < n_sock_fds; i++) {
+        sock_fds[i] = monitor_fd_param(monitor_cur(), substrings[i], errp);
+        if (sock_fds[i] < 0) {
+            g_free(sock_fds);
+            sock_fds = NULL;
+            goto exit;
+        }
+    }
+
+exit:
+    g_strfreev(substrings);
+    return sock_fds;
+}
+
+/*
+ * The exported init function.
+ *
+ * ... -netdev af-xdp,ifname="..."
+/*
+ * The exported init function.
+ *
+ * ... -netdev af-xdp,ifname="..."
+ */
+int net_init_af_xdp(const Netdev *netdev,
+                    const char *name, NetClientState *peer, Error **errp)
+{
+    const NetdevAFXDPOptions *opts = &netdev->u.af_xdp;
+    NetClientState *nc, *nc0 = NULL;
+    unsigned int ifindex;
+    uint32_t prog_id = 0;
+    int *sock_fds = NULL;
+    int64_t i, queues;
+    AFXDPState *s;
+
+    ifindex = if_nametoindex(opts->ifname);
+    if (!ifindex) {
+        error_setg_errno(errp, errno, "failed to get ifindex for '%s'",
+                         opts->ifname);
+        return -1;
+    }
+
+    queues = opts->has_queues ? opts->queues : 1;
+    if (queues < 1) {
+        error_setg(errp, "invalid number of queues (%" PRIi64 ") for '%s'",
+                   queues, opts->ifname);
+        return -1;
+    }
+
+    if ((opts->has_inhibit && opts->inhibit) != !!opts->sock_fds) {
+        error_setg(errp, "'inhibit=on' requires 'sock-fds' and vice versa");
+        return -1;
+    }
+
+    if (opts->sock_fds) {
+        sock_fds = parse_socket_fds(opts->sock_fds, queues, errp);
+        if (!sock_fds) {
+            return -1;
+        }
+    }
+
+    for (i = 0; i < queues; i++) {
+        nc = qemu_new_net_client(&net_af_xdp_info, peer, "af-xdp", name);
+        qemu_set_info_str(nc, "af-xdp%"PRIi64" to %s", i, opts->ifname);
+        nc->queue_index = i;
+
+        if (!nc0) {
+            nc0 = nc;
+        }
+
+        s = DO_UPCAST(AFXDPState, nc, nc);
+
+        pstrcpy(s->ifname, sizeof(s->ifname), opts->ifname);
+        s->ifindex = ifindex;
+        s->n_queues = queues;
+
+        if (af_xdp_umem_create(s, sock_fds ? sock_fds[i] : -1, errp)
+            || af_xdp_socket_create(s, opts, errp)) {
+            /* Make sure the XDP program will be removed. */
+            s->n_queues = i;
+            goto err;
+        }
+    }
+
+    if (nc0) {
+        s = DO_UPCAST(AFXDPState, nc, nc0);
+        if (bpf_xdp_query_id(s->ifindex, s->xdp_flags, &prog_id) || !prog_id) {
+            error_setg_errno(errp, errno,
+                             "no XDP program loaded on '%s', ifindex: %d",
+                             s->ifname, s->ifindex);
+            goto err;
+        }
+    }
+
+    af_xdp_read_poll(s, true); /* Initially only poll for reads. */
+
+    return 0;
+
+err:
+    g_free(sock_fds);
+    if (nc0) {
+        qemu_del_net_client(nc0);
+    }
+
+    return -1;
+}
diff --git a/net/clients.h b/net/clients.h
index ed8bdfff1e..be53794582 100644
--- a/net/clients.h
+++ b/net/clients.h
@@ -64,6 +64,11 @@ int net_init_netmap(const Netdev *netdev, const char *name,
                     NetClientState *peer, Error **errp);
 #endif
 
+#ifdef CONFIG_AF_XDP
+int net_init_af_xdp(const Netdev *netdev, const char *name,
+                    NetClientState *peer, Error **errp);
+#endif
+
 int net_init_vhost_user(const Netdev *netdev, const char *name,
                         NetClientState *peer, Error **errp);
 
diff --git a/net/dump.c b/net/dump.c
index 7d05f16ca7..16073f2458 100644
--- a/net/dump.c
+++ b/net/dump.c
@@ -68,7 +68,7 @@ static ssize_t dump_receive_iov(DumpState *s, const struct iovec *iov, int cnt,
     int64_t ts;
     int caplen;
     size_t size = iov_size(iov, cnt) - offset;
-    struct iovec dumpiov[cnt + 1];
+    g_autofree struct iovec *dumpiov = g_new(struct iovec, cnt + 1);
 
     /* Early return in case of previous error.
*/ if (s->fd < 0) { diff --git a/net/meson.build b/net/meson.build index 51caa42c9d..ce99bd4447 100644 --- a/net/meson.build +++ b/net/meson.build @@ -36,6 +36,9 @@ system_ss.add(when: vde, if_true: files('vde.c')) if have_netmap system_ss.add(files('netmap.c')) endif + +system_ss.add(when: libxdp, if_true: files('af-xdp.c')) + if have_vhost_net_user system_ss.add(when: 'CONFIG_VIRTIO_NET', if_true: files('vhost-user.c'), if_false: files('vhost-user-stub.c')) system_ss.add(when: 'CONFIG_ALL', if_true: files('vhost-user-stub.c')) @@ -495,6 +495,15 @@ bool qemu_has_ufo(NetClientState *nc) return nc->info->has_ufo(nc); } +bool qemu_has_uso(NetClientState *nc) +{ + if (!nc || !nc->info->has_uso) { + return false; + } + + return nc->info->has_uso(nc); +} + bool qemu_has_vnet_hdr(NetClientState *nc) { if (!nc || !nc->info->has_vnet_hdr) { @@ -532,13 +541,13 @@ void qemu_using_vnet_hdr(NetClientState *nc, bool enable) } void qemu_set_offload(NetClientState *nc, int csum, int tso4, int tso6, - int ecn, int ufo) + int ecn, int ufo, int uso4, int uso6) { if (!nc || !nc->info->set_offload) { return; } - nc->info->set_offload(nc, csum, tso4, tso6, ecn, ufo); + nc->info->set_offload(nc, csum, tso4, tso6, ecn, ufo, uso4, uso6); } int qemu_get_vnet_hdr_len(NetClientState *nc) @@ -1082,6 +1091,9 @@ static int (* const net_client_init_fun[NET_CLIENT_DRIVER__MAX])( #ifdef CONFIG_NETMAP [NET_CLIENT_DRIVER_NETMAP] = net_init_netmap, #endif +#ifdef CONFIG_AF_XDP + [NET_CLIENT_DRIVER_AF_XDP] = net_init_af_xdp, +#endif #ifdef CONFIG_NET_BRIDGE [NET_CLIENT_DRIVER_BRIDGE] = net_init_bridge, #endif @@ -1186,6 +1198,9 @@ void show_netdevs(void) #ifdef CONFIG_NETMAP "netmap", #endif +#ifdef CONFIG_AF_XDP + "af-xdp", +#endif #ifdef CONFIG_POSIX "vhost-user", #endif diff --git a/net/netmap.c b/net/netmap.c index 9e0cec58d3..241b27c8e9 100644 --- a/net/netmap.c +++ b/net/netmap.c @@ -371,7 +371,7 @@ static void netmap_set_vnet_hdr_len(NetClientState *nc, int len) } static void netmap_set_offload(NetClientState *nc, int csum, int tso4, int tso6, - int ecn, int ufo) + int ecn, int ufo, int uso4, int uso6) { NetmapState *s = DO_UPCAST(NetmapState, nc, nc); diff --git a/net/tap-bsd.c b/net/tap-bsd.c index 4c98fdd337..274ea7bd2c 100644 --- a/net/tap-bsd.c +++ b/net/tap-bsd.c @@ -212,6 +212,11 @@ int tap_probe_has_ufo(int fd) return 0; } +int tap_probe_has_uso(int fd) +{ + return 0; +} + int tap_probe_vnet_hdr_len(int fd, int len) { return 0; @@ -232,7 +237,7 @@ int tap_fd_set_vnet_be(int fd, int is_be) } void tap_fd_set_offload(int fd, int csum, int tso4, - int tso6, int ecn, int ufo) + int tso6, int ecn, int ufo, int uso4, int uso6) { } diff --git a/net/tap-linux.c b/net/tap-linux.c index f54f308d35..c7e514ecb0 100644 --- a/net/tap-linux.c +++ b/net/tap-linux.c @@ -173,6 +173,18 @@ int tap_probe_has_ufo(int fd) return 1; } +int tap_probe_has_uso(int fd) +{ + unsigned offload; + + offload = TUN_F_CSUM | TUN_F_USO4 | TUN_F_USO6; + + if (ioctl(fd, TUNSETOFFLOAD, offload) < 0) { + return 0; + } + return 1; +} + /* Verify that we can assign given length */ int tap_probe_vnet_hdr_len(int fd, int len) { @@ -237,7 +249,7 @@ int tap_fd_set_vnet_be(int fd, int is_be) } void tap_fd_set_offload(int fd, int csum, int tso4, - int tso6, int ecn, int ufo) + int tso6, int ecn, int ufo, int uso4, int uso6) { unsigned int offload = 0; @@ -256,13 +268,22 @@ void tap_fd_set_offload(int fd, int csum, int tso4, offload |= TUN_F_TSO_ECN; if (ufo) offload |= TUN_F_UFO; + if (uso4) { + offload |= TUN_F_USO4; + } + if (uso6) { + offload |= 
TUN_F_USO6; + } } if (ioctl(fd, TUNSETOFFLOAD, offload) != 0) { - offload &= ~TUN_F_UFO; + offload &= ~(TUN_F_USO4 | TUN_F_USO6); if (ioctl(fd, TUNSETOFFLOAD, offload) != 0) { - fprintf(stderr, "TUNSETOFFLOAD ioctl() failed: %s\n", + offload &= ~TUN_F_UFO; + if (ioctl(fd, TUNSETOFFLOAD, offload) != 0) { + fprintf(stderr, "TUNSETOFFLOAD ioctl() failed: %s\n", strerror(errno)); + } } } } diff --git a/net/tap-linux.h b/net/tap-linux.h index bbbb62c2a7..9a58cecb7f 100644 --- a/net/tap-linux.h +++ b/net/tap-linux.h @@ -50,5 +50,7 @@ #define TUN_F_TSO6 0x04 /* I can handle TSO for IPv6 packets */ #define TUN_F_TSO_ECN 0x08 /* I can handle TSO with ECN bits. */ #define TUN_F_UFO 0x10 /* I can handle UFO packets */ +#define TUN_F_USO4 0x20 /* I can handle USO for IPv4 packets */ +#define TUN_F_USO6 0x40 /* I can handle USO for IPv6 packets */ #endif /* QEMU_TAP_LINUX_H */ diff --git a/net/tap-solaris.c b/net/tap-solaris.c index 38e15028bf..08b13af512 100644 --- a/net/tap-solaris.c +++ b/net/tap-solaris.c @@ -216,6 +216,11 @@ int tap_probe_has_ufo(int fd) return 0; } +int tap_probe_has_uso(int fd) +{ + return 0; +} + int tap_probe_vnet_hdr_len(int fd, int len) { return 0; @@ -236,7 +241,7 @@ int tap_fd_set_vnet_be(int fd, int is_be) } void tap_fd_set_offload(int fd, int csum, int tso4, - int tso6, int ecn, int ufo) + int tso6, int ecn, int ufo, int uso4, int uso6) { } diff --git a/net/tap-stub.c b/net/tap-stub.c index a0fa25804b..4b24f61e3a 100644 --- a/net/tap-stub.c +++ b/net/tap-stub.c @@ -47,6 +47,11 @@ int tap_probe_has_ufo(int fd) return 0; } +int tap_probe_has_uso(int fd) +{ + return 0; +} + int tap_probe_vnet_hdr_len(int fd, int len) { return 0; @@ -67,7 +72,7 @@ int tap_fd_set_vnet_be(int fd, int is_be) } void tap_fd_set_offload(int fd, int csum, int tso4, - int tso6, int ecn, int ufo) + int tso6, int ecn, int ufo, int uso4, int uso6) { } diff --git a/net/tap-win32.c b/net/tap-win32.c index f327d62ab0..7b8b4be02c 100644 --- a/net/tap-win32.c +++ b/net/tap-win32.c @@ -741,7 +741,7 @@ static void tap_using_vnet_hdr(NetClientState *nc, bool using_vnet_hdr) } static void tap_set_offload(NetClientState *nc, int csum, int tso4, - int tso6, int ecn, int ufo) + int tso6, int ecn, int ufo, int uso4, int uso6) { } @@ -57,6 +57,7 @@ typedef struct TAPState { bool write_poll; bool using_vnet_hdr; bool has_ufo; + bool has_uso; bool enabled; VHostNetState *vhost_net; unsigned host_vnet_hdr_len; @@ -117,10 +118,11 @@ static ssize_t tap_receive_iov(NetClientState *nc, const struct iovec *iov, { TAPState *s = DO_UPCAST(TAPState, nc, nc); const struct iovec *iovp = iov; - struct iovec iov_copy[iovcnt + 1]; + g_autofree struct iovec *iov_copy = NULL; struct virtio_net_hdr_mrg_rxbuf hdr = { }; if (s->host_vnet_hdr_len && !s->using_vnet_hdr) { + iov_copy = g_new(struct iovec, iovcnt + 1); iov_copy[0].iov_base = &hdr; iov_copy[0].iov_len = s->host_vnet_hdr_len; memcpy(&iov_copy[1], iov, iovcnt * sizeof(*iov)); @@ -237,6 +239,15 @@ static bool tap_has_ufo(NetClientState *nc) return s->has_ufo; } +static bool tap_has_uso(NetClientState *nc) +{ + TAPState *s = DO_UPCAST(TAPState, nc, nc); + + assert(nc->info->type == NET_CLIENT_DRIVER_TAP); + + return s->has_uso; +} + static bool tap_has_vnet_hdr(NetClientState *nc) { TAPState *s = DO_UPCAST(TAPState, nc, nc); @@ -307,14 +318,14 @@ static int tap_set_vnet_be(NetClientState *nc, bool is_be) } static void tap_set_offload(NetClientState *nc, int csum, int tso4, - int tso6, int ecn, int ufo) + int tso6, int ecn, int ufo, int uso4, int uso6) { TAPState *s = 
DO_UPCAST(TAPState, nc, nc); if (s->fd < 0) { return; } - tap_fd_set_offload(s->fd, csum, tso4, tso6, ecn, ufo); + tap_fd_set_offload(s->fd, csum, tso4, tso6, ecn, ufo, uso4, uso6); } static void tap_exit_notify(Notifier *notifier, void *data) @@ -384,6 +395,7 @@ static NetClientInfo net_tap_info = { .poll = tap_poll, .cleanup = tap_cleanup, .has_ufo = tap_has_ufo, + .has_uso = tap_has_uso, .has_vnet_hdr = tap_has_vnet_hdr, .has_vnet_hdr_len = tap_has_vnet_hdr_len, .get_using_vnet_hdr = tap_get_using_vnet_hdr, @@ -413,8 +425,9 @@ static TAPState *net_tap_fd_init(NetClientState *peer, s->host_vnet_hdr_len = vnet_hdr ? sizeof(struct virtio_net_hdr) : 0; s->using_vnet_hdr = false; s->has_ufo = tap_probe_has_ufo(s->fd); + s->has_uso = tap_probe_has_uso(s->fd); s->enabled = true; - tap_set_offload(&s->nc, 0, 0, 0, 0, 0); + tap_set_offload(&s->nc, 0, 0, 0, 0, 0, 0, 0); /* * Make sure host header length is set correctly in tap: * it might have been modified by another instance of qemu. diff --git a/net/tap_int.h b/net/tap_int.h index 547f8a5a28..9a2175655b 100644 --- a/net/tap_int.h +++ b/net/tap_int.h @@ -37,7 +37,9 @@ void tap_set_sndbuf(int fd, const NetdevTapOptions *tap, Error **errp); int tap_probe_vnet_hdr(int fd, Error **errp); int tap_probe_vnet_hdr_len(int fd, int len); int tap_probe_has_ufo(int fd); -void tap_fd_set_offload(int fd, int csum, int tso4, int tso6, int ecn, int ufo); +int tap_probe_has_uso(int fd); +void tap_fd_set_offload(int fd, int csum, int tso4, int tso6, int ecn, int ufo, + int uso4, int uso6); void tap_fd_set_vnet_hdr_len(int fd, int len); int tap_fd_set_vnet_le(int fd, int vnet_is_le); int tap_fd_set_vnet_be(int fd, int vnet_is_be); diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c index 34202ca009..4e94c50bc7 100644 --- a/net/vhost-vdpa.c +++ b/net/vhost-vdpa.c @@ -75,11 +75,14 @@ const int vdpa_feature_bits[] = { VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, VIRTIO_NET_F_GUEST_UFO, + VIRTIO_NET_F_GUEST_USO4, + VIRTIO_NET_F_GUEST_USO6, VIRTIO_NET_F_HASH_REPORT, VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_TSO6, VIRTIO_NET_F_HOST_UFO, + VIRTIO_NET_F_HOST_USO, VIRTIO_NET_F_MQ, VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_MTU, diff --git a/qapi/net.json b/qapi/net.json index 313c8a606e..8095b68fa8 100644 --- a/qapi/net.json +++ b/qapi/net.json @@ -409,6 +409,60 @@ '*devname': 'str' } } ## +# @AFXDPMode: +# +# Attach mode for a default XDP program +# +# @skb: generic mode, no driver support necessary +# +# @native: DRV mode, program is attached to a driver, packets are passed to +# the socket without allocation of skb. +# +# Since: 8.2 +## +{ 'enum': 'AFXDPMode', + 'data': [ 'native', 'skb' ], + 'if': 'CONFIG_AF_XDP' } + +## +# @NetdevAFXDPOptions: +# +# AF_XDP network backend +# +# @ifname: The name of an existing network interface. +# +# @mode: Attach mode for a default XDP program. If not specified, then +# 'native' will be tried first, then 'skb'. +# +# @force-copy: Force XDP copy mode even if device supports zero-copy. +# (default: false) +# +# @queues: number of queues to be used for multiqueue interfaces (default: 1). +# +# @start-queue: Use @queues starting from this queue number (default: 0). +# +# @inhibit: Don't load a default XDP program, use one already loaded to +# the interface (default: false). Requires @sock-fds. +# +# @sock-fds: A colon (:) separated list of file descriptors for already open +# but not bound AF_XDP sockets in the queue order. One fd per queue. 
+# These descriptors should already be added to the XDP socket map for
+# the corresponding queues. Requires @inhibit.
+#
+# Since: 8.2
+##
+{ 'struct': 'NetdevAFXDPOptions',
+  'data': {
+    'ifname':       'str',
+    '*mode':        'AFXDPMode',
+    '*force-copy':  'bool',
+    '*queues':      'int',
+    '*start-queue': 'int',
+    '*inhibit':     'bool',
+    '*sock-fds':    'str' },
+  'if': 'CONFIG_AF_XDP' }
+
+##
 # @NetdevVhostUserOptions:
 #
 # Vhost-user network backend
@@ -642,6 +696,7 @@
 # @vmnet-bridged: since 7.1
 # @stream: since 7.2
 # @dgram: since 7.2
+# @af-xdp: since 8.2
 #
 # Since: 2.7
 ##
@@ -649,6 +704,7 @@
   'data': [ 'none', 'nic', 'user', 'tap', 'l2tpv3', 'socket', 'stream',
             'dgram', 'vde', 'bridge', 'hubport', 'netmap', 'vhost-user',
             'vhost-vdpa',
+            { 'name': 'af-xdp', 'if': 'CONFIG_AF_XDP' },
             { 'name': 'vmnet-host', 'if': 'CONFIG_VMNET' },
             { 'name': 'vmnet-shared', 'if': 'CONFIG_VMNET' },
             { 'name': 'vmnet-bridged', 'if': 'CONFIG_VMNET' }] }
@@ -679,6 +735,8 @@
         'bridge':     'NetdevBridgeOptions',
         'hubport':    'NetdevHubPortOptions',
         'netmap':     'NetdevNetmapOptions',
+        'af-xdp':     { 'type': 'NetdevAFXDPOptions',
+                        'if': 'CONFIG_AF_XDP' },
         'vhost-user': 'NetdevVhostUserOptions',
         'vhost-vdpa': 'NetdevVhostVDPAOptions',
         'vmnet-host': { 'type': 'NetdevVmnetHostOptions',
diff --git a/qemu-options.hx b/qemu-options.hx
index 6be621c232..2bcf7e4e97 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -2882,6 +2882,19 @@ DEF("netdev", HAS_ARG, QEMU_OPTION_netdev,
     "         VALE port (created on the fly) called 'name' ('nmname' is name of the \n"
     "         netmap device, defaults to '/dev/netmap')\n"
 #endif
+#ifdef CONFIG_AF_XDP
+    "-netdev af-xdp,id=str,ifname=name[,mode=native|skb][,force-copy=on|off]\n"
+    "         [,queues=n][,start-queue=m][,inhibit=on|off][,sock-fds=x:y:...:z]\n"
+    "                attach to the existing network interface 'name' with AF_XDP socket\n"
+    "                use 'mode=MODE' to specify an XDP program attach mode\n"
+    "                use 'force-copy=on|off' to force XDP copy mode even if device supports zero-copy (default: off)\n"
+    "                use 'inhibit=on|off' to inhibit loading of a default XDP program (default: off)\n"
+    "                with inhibit=on,\n"
+    "                  use 'sock-fds' to provide file descriptors for already open AF_XDP sockets\n"
+    "                  added to a socket map in the XDP program. One socket per queue.\n"
+    "                use 'queues=n' to specify how many queues of a multiqueue interface should be used\n"
+    "                use 'start-queue=m' to specify the first queue that should be used\n"
+#endif
 #ifdef CONFIG_POSIX
     "-netdev vhost-user,id=str,chardev=dev[,vhostforce=on|off]\n"
     "                configure a vhost-user network, backed by a chardev 'dev'\n"
@@ -2927,6 +2940,9 @@ DEF("nic", HAS_ARG, QEMU_OPTION_nic,
 #ifdef CONFIG_NETMAP
     "netmap|"
 #endif
+#ifdef CONFIG_AF_XDP
+    "af-xdp|"
+#endif
 #ifdef CONFIG_POSIX
     "vhost-user|"
 #endif
@@ -2955,6 +2971,9 @@ DEF("net", HAS_ARG, QEMU_OPTION_net,
 #ifdef CONFIG_NETMAP
     "netmap|"
 #endif
+#ifdef CONFIG_AF_XDP
+    "af-xdp|"
+#endif
 #ifdef CONFIG_VMNET
     "vmnet-host|vmnet-shared|vmnet-bridged|"
 #endif
@@ -2962,7 +2981,7 @@ DEF("net", HAS_ARG, QEMU_OPTION_net,
     "                old way to initialize a host network interface\n"
     "                (use the -netdev option if possible instead)\n", QEMU_ARCH_ALL)
 SRST
-``-nic [tap|bridge|user|l2tpv3|vde|netmap|vhost-user|socket][,...][,mac=macaddr][,model=mn]``
+``-nic [tap|bridge|user|l2tpv3|vde|netmap|af-xdp|vhost-user|socket][,...][,mac=macaddr][,model=mn]``
     This option is a shortcut for configuring both the on-board
     (default) guest NIC hardware and the host network backend in
     one go.
    The host backend options are the same as with the corresponding
@@ -3376,6 +3395,55 @@ SRST
 
         # launch QEMU instance
         |qemu_system| linux.img -nic vde,sock=/tmp/myswitch
 
+``-netdev af-xdp,id=str,ifname=name[,mode=native|skb][,force-copy=on|off][,queues=n][,start-queue=m][,inhibit=on|off][,sock-fds=x:y:...:z]``
+    Configure an AF_XDP backend bound to the existing network interface
+    'name' via an AF_XDP socket. A specific attach mode for the default
+    XDP program can be forced with 'mode'; the default is best-effort,
+    where the likely most performant mode is used. The number of queues
+    'n' should generally match the number of queues in the interface,
+    and defaults to 1. Traffic arriving on non-configured device queues
+    will not be delivered to the network backend.
+
+    .. parsed-literal::
+
+        # set number of queues to 4
+        ethtool -L eth0 combined 4
+        # launch QEMU instance
+        |qemu_system| linux.img -device virtio-net-pci,netdev=n1 \\
+            -netdev af-xdp,id=n1,ifname=eth0,queues=4
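Because the default attach mode is negotiated at startup (native first, then skb), it can be useful to pin the mode explicitly, for instance on NICs without native XDP support. A minimal sketch using only the options documented above (the interface name is illustrative):

    .. parsed-literal::

        # force the generic (skb) attach mode instead of the fallback probe
        |qemu_system| linux.img -device virtio-net-pci,netdev=n1 \\
            -netdev af-xdp,id=n1,ifname=eth0,mode=skb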
+    The 'start-queue' option can be specified if a particular range of
+    queues [m, m + n) should be in use. For example, this may be
+    necessary in order to use certain NICs in native mode. The kernel
+    allows the driver to create a separate set of XDP queues on top of
+    the regular ones, and only these queues can be used for AF_XDP
+    sockets. NICs that work this way may also require additional
+    traffic redirection with ethtool to these special queues.
+
+    .. parsed-literal::
+
+        # set number of queues to 1
+        ethtool -L eth0 combined 1
+        # redirect all the traffic to the second queue (id: 1)
+        # note: drivers may require non-empty key/mask pair.
+        ethtool -N eth0 flow-type ether \\
+            dst 00:00:00:00:00:00 m FF:FF:FF:FF:FF:FE action 1
+        ethtool -N eth0 flow-type ether \\
+            dst 00:00:00:00:00:01 m FF:FF:FF:FF:FF:FE action 1
+        # launch QEMU instance
+        |qemu_system| linux.img -device virtio-net-pci,netdev=n1 \\
+            -netdev af-xdp,id=n1,ifname=eth0,queues=1,start-queue=1
+
+    The XDP program can also be loaded externally. In this case the
+    'inhibit' option should be set to 'on', and 'sock-fds' should provide
+    file descriptors for AF_XDP sockets that are already open, not yet
+    bound, and already inserted into the socket map for the corresponding
+    queues. One socket per queue.
+
+    .. parsed-literal::
+
+        |qemu_system| linux.img -device virtio-net-pci,netdev=n1 \\
+            -netdev af-xdp,id=n1,ifname=eth0,queues=3,inhibit=on,sock-fds=15:16:17
+
 ``-netdev vhost-user,chardev=id[,vhostforce=on|off][,queues=n]``
     Establish a vhost-user netdev, backed by a chardev id. The chardev
     should be a unix domain socket backed one. The vhost-user uses a
The vhost-user uses a diff --git a/scripts/ci/org.centos/stream/8/x86_64/configure b/scripts/ci/org.centos/stream/8/x86_64/configure index 131f8ee5f3..76781f17f4 100755 --- a/scripts/ci/org.centos/stream/8/x86_64/configure +++ b/scripts/ci/org.centos/stream/8/x86_64/configure @@ -35,6 +35,7 @@ --block-drv-ro-whitelist="vmdk,vhdx,vpc,https,ssh" \ --with-coroutine=ucontext \ --tls-priority=@QEMU,SYSTEM \ +--disable-af-xdp \ --disable-attr \ --disable-auth-pam \ --disable-avx2 \ diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh index e1d178370c..230119346a 100644 --- a/scripts/meson-buildoptions.sh +++ b/scripts/meson-buildoptions.sh @@ -76,6 +76,7 @@ meson_options_help() { printf "%s\n" 'disabled with --disable-FEATURE, default is enabled if available' printf "%s\n" '(unless built with --without-default-features):' printf "%s\n" '' + printf "%s\n" ' af-xdp AF_XDP network backend support' printf "%s\n" ' alsa ALSA sound support' printf "%s\n" ' attr attr/xattr support' printf "%s\n" ' auth-pam PAM access control' @@ -208,6 +209,8 @@ meson_options_help() { } _meson_option_parse() { case $1 in + --enable-af-xdp) printf "%s" -Daf_xdp=enabled ;; + --disable-af-xdp) printf "%s" -Daf_xdp=disabled ;; --enable-alsa) printf "%s" -Dalsa=enabled ;; --disable-alsa) printf "%s" -Dalsa=disabled ;; --enable-attr) printf "%s" -Dattr=enabled ;; diff --git a/tests/docker/dockerfiles/alpine.docker b/tests/docker/dockerfiles/alpine.docker index fa455f1474..d25649cb4f 100644 --- a/tests/docker/dockerfiles/alpine.docker +++ b/tests/docker/dockerfiles/alpine.docker @@ -59,6 +59,7 @@ RUN apk update && \ libtasn1-dev \ liburing-dev \ libusb-dev \ + libxdp-dev \ linux-pam-dev \ llvm \ lttng-ust-dev \ diff --git a/tests/docker/dockerfiles/centos8.docker b/tests/docker/dockerfiles/centos8.docker index fc1830966f..68bfe606f5 100644 --- a/tests/docker/dockerfiles/centos8.docker +++ b/tests/docker/dockerfiles/centos8.docker @@ -75,6 +75,7 @@ RUN dnf distro-sync -y && \ libubsan \ liburing-devel \ libusbx-devel \ + libxdp-devel \ libzstd-devel \ llvm \ lttng-ust-devel \ diff --git a/tests/docker/dockerfiles/debian-amd64-cross.docker b/tests/docker/dockerfiles/debian-amd64-cross.docker index b66b9cc191..0cf3ba6d60 100644 --- a/tests/docker/dockerfiles/debian-amd64-cross.docker +++ b/tests/docker/dockerfiles/debian-amd64-cross.docker @@ -84,7 +84,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \ g++-x86-64-linux-gnu \ gcc-x86-64-linux-gnu \ libaio-dev:amd64 \ - libasan5:amd64 \ + libasan6:amd64 \ libasound2-dev:amd64 \ libattr1-dev:amd64 \ libbpf-dev:amd64 \ diff --git a/tests/docker/dockerfiles/debian-amd64.docker b/tests/docker/dockerfiles/debian-amd64.docker index 02262bc70e..e3e1de25dd 100644 --- a/tests/docker/dockerfiles/debian-amd64.docker +++ b/tests/docker/dockerfiles/debian-amd64.docker @@ -32,7 +32,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \ git \ hostname \ libaio-dev \ - libasan5 \ + libasan6 \ libasound2-dev \ libattr1-dev \ libbpf-dev \ diff --git a/tests/docker/dockerfiles/debian-arm64-cross.docker b/tests/docker/dockerfiles/debian-arm64-cross.docker index a0a968b8c6..d8cd4f87b6 100644 --- a/tests/docker/dockerfiles/debian-arm64-cross.docker +++ b/tests/docker/dockerfiles/debian-arm64-cross.docker @@ -84,7 +84,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \ g++-aarch64-linux-gnu \ gcc-aarch64-linux-gnu \ libaio-dev:arm64 \ - libasan5:arm64 \ + libasan6:arm64 \ libasound2-dev:arm64 \ libattr1-dev:arm64 \ libbpf-dev:arm64 \ diff --git 
a/tests/docker/dockerfiles/debian-armel-cross.docker b/tests/docker/dockerfiles/debian-armel-cross.docker index f1fc34a28a..75342c09b0 100644 --- a/tests/docker/dockerfiles/debian-armel-cross.docker +++ b/tests/docker/dockerfiles/debian-armel-cross.docker @@ -84,7 +84,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \ g++-arm-linux-gnueabi \ gcc-arm-linux-gnueabi \ libaio-dev:armel \ - libasan5:armel \ + libasan6:armel \ libasound2-dev:armel \ libattr1-dev:armel \ libbpf-dev:armel \ diff --git a/tests/docker/dockerfiles/debian-armhf-cross.docker b/tests/docker/dockerfiles/debian-armhf-cross.docker index a278578211..f45cfedd3f 100644 --- a/tests/docker/dockerfiles/debian-armhf-cross.docker +++ b/tests/docker/dockerfiles/debian-armhf-cross.docker @@ -84,7 +84,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \ g++-arm-linux-gnueabihf \ gcc-arm-linux-gnueabihf \ libaio-dev:armhf \ - libasan5:armhf \ + libasan6:armhf \ libasound2-dev:armhf \ libattr1-dev:armhf \ libbpf-dev:armhf \ diff --git a/tests/docker/dockerfiles/debian-ppc64el-cross.docker b/tests/docker/dockerfiles/debian-ppc64el-cross.docker index 30e5efa986..52f8c34814 100644 --- a/tests/docker/dockerfiles/debian-ppc64el-cross.docker +++ b/tests/docker/dockerfiles/debian-ppc64el-cross.docker @@ -84,7 +84,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \ g++-powerpc64le-linux-gnu \ gcc-powerpc64le-linux-gnu \ libaio-dev:ppc64el \ - libasan5:ppc64el \ + libasan6:ppc64el \ libasound2-dev:ppc64el \ libattr1-dev:ppc64el \ libbpf-dev:ppc64el \ diff --git a/tests/docker/dockerfiles/debian-s390x-cross.docker b/tests/docker/dockerfiles/debian-s390x-cross.docker index ee6db7b526..208e57bcf2 100644 --- a/tests/docker/dockerfiles/debian-s390x-cross.docker +++ b/tests/docker/dockerfiles/debian-s390x-cross.docker @@ -84,7 +84,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \ g++-s390x-linux-gnu \ gcc-s390x-linux-gnu \ libaio-dev:s390x \ - libasan5:s390x \ + libasan6:s390x \ libasound2-dev:s390x \ libattr1-dev:s390x \ libbpf-dev:s390x \ diff --git a/tests/docker/dockerfiles/fedora.docker b/tests/docker/dockerfiles/fedora.docker index c5b6c96943..f00e9e267c 100644 --- a/tests/docker/dockerfiles/fedora.docker +++ b/tests/docker/dockerfiles/fedora.docker @@ -82,6 +82,7 @@ exec "$@"\n' > /usr/bin/nosync && \ libubsan \ liburing-devel \ libusbx-devel \ + libxdp-devel \ libzstd-devel \ llvm \ lttng-ust-devel \ diff --git a/tests/docker/dockerfiles/opensuse-leap.docker b/tests/docker/dockerfiles/opensuse-leap.docker index fef8d5a2e4..ed04b4d6da 100644 --- a/tests/docker/dockerfiles/opensuse-leap.docker +++ b/tests/docker/dockerfiles/opensuse-leap.docker @@ -40,7 +40,7 @@ RUN zypper update -y && \ libSDL2-devel \ libSDL2_image-devel \ libaio-devel \ - libasan6 \ + libasan8 \ libattr-devel \ libbpf-devel \ libbz2-devel \ diff --git a/tests/docker/dockerfiles/ubuntu2004.docker b/tests/docker/dockerfiles/ubuntu2004.docker index 4180cd8674..d3e212060c 100644 --- a/tests/docker/dockerfiles/ubuntu2004.docker +++ b/tests/docker/dockerfiles/ubuntu2004.docker @@ -32,7 +32,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \ git \ hostname \ libaio-dev \ - libasan5 \ + libasan6 \ libasound2-dev \ libattr1-dev \ libbrlapi-dev \ diff --git a/tests/docker/dockerfiles/ubuntu2204.docker b/tests/docker/dockerfiles/ubuntu2204.docker index 88493f00f6..94c2c16118 100644 --- a/tests/docker/dockerfiles/ubuntu2204.docker +++ b/tests/docker/dockerfiles/ubuntu2204.docker @@ -32,7 +32,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \ git \ hostname \ libaio-dev \ - libasan5 \ 
+      libasan6 \
       libasound2-dev \
       libattr1-dev \
       libbpf-dev \
diff --git a/tests/lcitool/libvirt-ci b/tests/lcitool/libvirt-ci
-Subproject bbd55b4d18cce8f89b5167675e434a694131563
+Subproject 5f84a21881577a5fb56cc956f6fe4e2abd6fcff
diff --git a/tests/lcitool/projects/qemu.yml b/tests/lcitool/projects/qemu.yml
index 584f78cb7f..6f0885170d 100644
--- a/tests/lcitool/projects/qemu.yml
+++ b/tests/lcitool/projects/qemu.yml
@@ -69,6 +69,7 @@ packages:
  - liburing
  - libusbx
  - libvdeplug
+ - libxdp
  - libzstd
  - llvm
  - lttng-ust
diff --git a/tests/qtest/libqos/igb.c b/tests/qtest/libqos/igb.c
index a603468beb..f40c4ec4cd 100644
--- a/tests/qtest/libqos/igb.c
+++ b/tests/qtest/libqos/igb.c
@@ -109,6 +109,11 @@ static void igb_pci_start_hw(QOSGraphObject *obj)
                         E1000_RAH_AV | E1000_RAH_POOL_1 |
                         le16_to_cpu(*(uint16_t *)(address + 4)));
 
+    /* Set supported receive descriptor mode */
+    e1000e_macreg_write(&d->e1000e,
+                        E1000_SRRCTL(0),
+                        E1000_SRRCTL_DESCTYPE_ADV_ONEBUF);
+
     /* Enable receive */
     e1000e_macreg_write(&d->e1000e, E1000_RFCTL, E1000_RFCTL_EXTEN);
     e1000e_macreg_write(&d->e1000e, E1000_RCTL, E1000_RCTL_EN);
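The SRRCTL write above selects the one-buffer advanced receive descriptor format before receive is enabled. A hypothetical follow-up assertion, sketched with the libqos register helpers used in this file (the check itself is not part of this series, and assumes e1000e_macreg_read() reads back the written value):

    /* Hypothetical check, not part of this series. */
    uint32_t srrctl = e1000e_macreg_read(&d->e1000e, E1000_SRRCTL(0));
    g_assert(srrctl & E1000_SRRCTL_DESCTYPE_ADV_ONEBUF);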