diff options
author | Anthony Liguori <aliguori@us.ibm.com> | 2012-10-29 14:31:47 -0500 |
---|---|---|
committer | Anthony Liguori <aliguori@us.ibm.com> | 2012-10-29 14:31:47 -0500 |
commit | 233926fafa6c4a0fb666e1469524d66dd3b47ddd (patch) | |
tree | 2dbe1506ee13fbc7d2d56b317f596114d0f849bb | |
parent | b308c82cbda44e138ef990af64d44a5613c16092 (diff) | |
parent | 523a59f596a3e62f5a28eb171adba35e71310040 (diff) |
Merge remote-tracking branch 'mst/tags/for_anthony' into staging
virtio,pci infrastructure
This includes infrastructure patches that don't do much by themselves
but should help vfio and q35 make progress.
Also included is rework of virtio-net to use iovec APIs
for vector access - helpful to make it more secure
and in preparation for a new feature that will allow
arbitrary s/g layout for guests.
Also included is a pci bridge bugfix by Avi.
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
* mst/tags/for_anthony: (25 commits)
pci: avoid destroying bridge address space windows in a transaction
virtio-net: enable mrg buf header in tap on linux
virtio-net: test peer header support at init time
virtio-net: minor code simplification
virtio-net: simplify rx code
virtio-net: switch tx to safe iov functions
virtio-net: first s/g is always at start of buf
virtio-net: refactor receive_hdr
virtio-net: use safe iov operations for rx
virtio-net: avoid sg copy
iov: add iov_cpy
virtio-net: track host/guest header length
pcie: Convert PCIExpressHost to use the QOM.
pcie: pass pcie window size to pcie_host_mmcfg_update()
pci: Add class 0xc05 as 'SMBus'
pci: introduce pci_swizzle_map_irq_fn() for standardized interrupt pin swizzle
pci_ids: add intel 82801BA pci-to-pci bridge id
pci: pci capability must be in PCI space
pci: make each capability DWORD aligned
qemu: enable PV EOI for qemu 1.3
...
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
-rw-r--r-- | hw/kvm/pci-assign.c | 8 | ||||
-rw-r--r-- | hw/msi.c | 45 | ||||
-rw-r--r-- | hw/msi.h | 1 | ||||
-rw-r--r-- | hw/pc_piix.c | 9 | ||||
-rw-r--r-- | hw/pci.c | 42 | ||||
-rw-r--r-- | hw/pci.h | 3 | ||||
-rw-r--r-- | hw/pci_ids.h | 2 | ||||
-rw-r--r-- | hw/pcie_host.c | 35 | ||||
-rw-r--r-- | hw/pcie_host.h | 11 | ||||
-rw-r--r-- | hw/vhost_net.c | 13 | ||||
-rw-r--r-- | hw/virtio-net.c | 176 | ||||
-rw-r--r-- | iov.c | 23 | ||||
-rw-r--r-- | iov.h | 9 | ||||
-rw-r--r-- | target-i386/cpu.c | 33 | ||||
-rw-r--r-- | target-i386/cpu.h | 2 |
15 files changed, 266 insertions, 146 deletions
diff --git a/hw/kvm/pci-assign.c b/hw/kvm/pci-assign.c index bfffbab1b3..e80dad009c 100644 --- a/hw/kvm/pci-assign.c +++ b/hw/kvm/pci-assign.c @@ -882,8 +882,7 @@ static int assign_intx(AssignedDevice *dev) intx_route = pci_device_route_intx_to_irq(&dev->dev, dev->intpin); assert(intx_route.mode != PCI_INTX_INVERTED); - if (dev->intx_route.mode == intx_route.mode && - dev->intx_route.irq == intx_route.irq) { + if (!pci_intx_route_changed(&dev->intx_route, &intx_route)) { return 0; } @@ -997,12 +996,9 @@ static void assigned_dev_update_msi(PCIDevice *pci_dev) } if (ctrl_byte & PCI_MSI_FLAGS_ENABLE) { - uint8_t *pos = pci_dev->config + pci_dev->msi_cap; - MSIMessage msg; + MSIMessage msg = msi_get_message(pci_dev, 0); int virq; - msg.address = pci_get_long(pos + PCI_MSI_ADDRESS_LO); - msg.data = pci_get_word(pos + PCI_MSI_DATA_32); virq = kvm_irqchip_add_msi_route(kvm_state, msg); if (virq < 0) { perror("assigned_dev_update_msi: kvm_irqchip_add_msi_route"); @@ -122,6 +122,31 @@ void msi_set_message(PCIDevice *dev, MSIMessage msg) pci_set_word(dev->config + msi_data_off(dev, msi64bit), msg.data); } +MSIMessage msi_get_message(PCIDevice *dev, unsigned int vector) +{ + uint16_t flags = pci_get_word(dev->config + msi_flags_off(dev)); + bool msi64bit = flags & PCI_MSI_FLAGS_64BIT; + unsigned int nr_vectors = msi_nr_vectors(flags); + MSIMessage msg; + + assert(vector < nr_vectors); + + if (msi64bit) { + msg.address = pci_get_quad(dev->config + msi_address_lo_off(dev)); + } else { + msg.address = pci_get_long(dev->config + msi_address_lo_off(dev)); + } + + /* upper bit 31:16 is zero */ + msg.data = pci_get_word(dev->config + msi_data_off(dev, msi64bit)); + if (nr_vectors > 1) { + msg.data &= ~(nr_vectors - 1); + msg.data |= vector; + } + + return msg; +} + bool msi_enabled(const PCIDevice *dev) { return msi_present(dev) && @@ -249,8 +274,7 @@ void msi_notify(PCIDevice *dev, unsigned int vector) uint16_t flags = pci_get_word(dev->config + msi_flags_off(dev)); bool msi64bit = flags & PCI_MSI_FLAGS_64BIT; unsigned int nr_vectors = msi_nr_vectors(flags); - uint64_t address; - uint32_t data; + MSIMessage msg; assert(vector < nr_vectors); if (msi_is_masked(dev, vector)) { @@ -261,24 +285,13 @@ void msi_notify(PCIDevice *dev, unsigned int vector) return; } - if (msi64bit) { - address = pci_get_quad(dev->config + msi_address_lo_off(dev)); - } else { - address = pci_get_long(dev->config + msi_address_lo_off(dev)); - } - - /* upper bit 31:16 is zero */ - data = pci_get_word(dev->config + msi_data_off(dev, msi64bit)); - if (nr_vectors > 1) { - data &= ~(nr_vectors - 1); - data |= vector; - } + msg = msi_get_message(dev, vector); MSI_DEV_PRINTF(dev, "notify vector 0x%x" " address: 0x%"PRIx64" data: 0x%"PRIx32"\n", - vector, address, data); - stl_le_phys(address, data); + vector, msg.address, msg.data); + stl_le_phys(msg.address, msg.data); } /* Normally called by pci_default_write_config(). */ @@ -32,6 +32,7 @@ struct MSIMessage { extern bool msi_supported; void msi_set_message(PCIDevice *dev, MSIMessage msg); +MSIMessage msi_get_message(PCIDevice *dev, unsigned int vector); bool msi_enabled(const PCIDevice *dev); int msi_init(struct PCIDevice *dev, uint8_t offset, unsigned int nr_vectors, bool msi64bit, bool msi_per_vector_mask); diff --git a/hw/pc_piix.c b/hw/pc_piix.c index c7dd75b0c0..85529b2cea 100644 --- a/hw/pc_piix.c +++ b/hw/pc_piix.c @@ -43,6 +43,7 @@ #include "xen.h" #include "memory.h" #include "exec-memory.h" +#include "cpu.h" #ifdef CONFIG_XEN # include <xen/hvm/hvm_info_table.h> #endif @@ -302,6 +303,12 @@ static void pc_init_pci(QEMUMachineInitArgs *args) initrd_filename, cpu_model, 1, 1); } +static void pc_init_pci_1_3(QEMUMachineInitArgs *args) +{ + enable_kvm_pv_eoi(); + pc_init_pci(args); +} + static void pc_init_pci_no_kvmclock(QEMUMachineInitArgs *args) { ram_addr_t ram_size = args->ram_size; @@ -349,7 +356,7 @@ static QEMUMachine pc_machine_v1_3 = { .name = "pc-1.3", .alias = "pc", .desc = "Standard PC", - .init = pc_init_pci, + .init = pc_init_pci_1_3, .max_cpus = 255, .is_default = 1, }; @@ -1117,10 +1117,21 @@ PCIINTxRoute pci_device_route_intx_to_irq(PCIDevice *dev, int pin) pin = bus->map_irq(dev, pin); dev = bus->parent_dev; } while (dev); - assert(bus->route_intx_to_irq); + + if (!bus->route_intx_to_irq) { + error_report("PCI: Bug - unimplemented PCI INTx routing (%s)\n", + object_get_typename(OBJECT(bus->qbus.parent))); + return (PCIINTxRoute) { PCI_INTX_DISABLED, -1 }; + } + return bus->route_intx_to_irq(bus->irq_opaque, pin); } +bool pci_intx_route_changed(PCIINTxRoute *old, PCIINTxRoute *new) +{ + return old->mode != new->mode || old->irq != new->irq; +} + void pci_bus_fire_intx_routing_notifier(PCIBus *bus) { PCIDevice *dev; @@ -1144,6 +1155,24 @@ void pci_device_set_intx_routing_notifier(PCIDevice *dev, dev->intx_routing_notifier = notifier; } +/* + * PCI-to-PCI bridge specification + * 9.1: Interrupt routing. Table 9-1 + * + * the PCI Express Base Specification, Revision 2.1 + * 2.2.8.1: INTx interrutp signaling - Rules + * the Implementation Note + * Table 2-20 + */ +/* + * 0 <= pin <= 3 0 = INTA, 1 = INTB, 2 = INTC, 3 = INTD + * 0-origin unlike PCI interrupt pin register. + */ +int pci_swizzle_map_irq_fn(PCIDevice *pci_dev, int pin) +{ + return (pin + PCI_SLOT(pci_dev->devfn)) % PCI_NUM_PINS; +} + /***********************************************************/ /* monitor info on PCI */ @@ -1208,6 +1237,7 @@ static const pci_class_desc pci_class_descriptions[] = { 0x0c02, "SSA controller", "ssa"}, { 0x0c03, "USB controller", "usb"}, { 0x0c04, "Fibre channel controller", "fibre-channel"}, + { 0x0c05, "SMBus"}, { 0, NULL} }; @@ -1667,16 +1697,16 @@ PCIDevice *pci_create_simple(PCIBus *bus, int devfn, const char *name) return pci_create_simple_multifunction(bus, devfn, false, name); } -static int pci_find_space(PCIDevice *pdev, uint8_t size) +static uint8_t pci_find_space(PCIDevice *pdev, uint8_t size) { - int config_size = pci_config_size(pdev); int offset = PCI_CONFIG_HEADER_SIZE; int i; - for (i = PCI_CONFIG_HEADER_SIZE; i < config_size; ++i) + for (i = PCI_CONFIG_HEADER_SIZE; i < PCI_CONFIG_SPACE_SIZE; ++i) { if (pdev->used[i]) offset = i + 1; else if (i - offset + 1 == size) return offset; + } return 0; } @@ -1895,7 +1925,7 @@ int pci_add_capability(PCIDevice *pdev, uint8_t cap_id, config[PCI_CAP_LIST_NEXT] = pdev->config[PCI_CAPABILITY_LIST]; pdev->config[PCI_CAPABILITY_LIST] = offset; pdev->config[PCI_STATUS] |= PCI_STATUS_CAP_LIST; - memset(pdev->used + offset, 0xFF, size); + memset(pdev->used + offset, 0xFF, QEMU_ALIGN_UP(size, 4)); /* Make capability read-only by default */ memset(pdev->wmask + offset, 0, size); /* Check capability by default */ @@ -1915,7 +1945,7 @@ void pci_del_capability(PCIDevice *pdev, uint8_t cap_id, uint8_t size) memset(pdev->w1cmask + offset, 0, size); /* Clear cmask as device-specific registers can't be checked */ memset(pdev->cmask + offset, 0, size); - memset(pdev->used + offset, 0, size); + memset(pdev->used + offset, 0, QEMU_ALIGN_UP(size, 4)); if (!pdev->config[PCI_CAPABILITY_LIST]) pdev->config[PCI_STATUS] &= ~PCI_STATUS_CAP_LIST; @@ -318,6 +318,8 @@ void pci_bus_irqs(PCIBus *bus, pci_set_irq_fn set_irq, pci_map_irq_fn map_irq, void *irq_opaque, int nirq); int pci_bus_get_irq_level(PCIBus *bus, int irq_num); void pci_bus_hotplug(PCIBus *bus, pci_hotplug_fn hotplug, DeviceState *dev); +/* 0 <= pin <= 3 0 = INTA, 1 = INTB, 2 = INTC, 3 = INTD */ +int pci_swizzle_map_irq_fn(PCIDevice *pci_dev, int pin); PCIBus *pci_register_bus(DeviceState *parent, const char *name, pci_set_irq_fn set_irq, pci_map_irq_fn map_irq, void *irq_opaque, @@ -326,6 +328,7 @@ PCIBus *pci_register_bus(DeviceState *parent, const char *name, uint8_t devfn_min, int nirq); void pci_bus_set_route_irq_fn(PCIBus *, pci_route_irq_fn); PCIINTxRoute pci_device_route_intx_to_irq(PCIDevice *dev, int pin); +bool pci_intx_route_changed(PCIINTxRoute *old, PCIINTxRoute *new); void pci_bus_fire_intx_routing_notifier(PCIBus *bus); void pci_device_set_intx_routing_notifier(PCIDevice *dev, PCIINTxRoutingNotifier notifier); diff --git a/hw/pci_ids.h b/hw/pci_ids.h index c017a79998..41f3570fb9 100644 --- a/hw/pci_ids.h +++ b/hw/pci_ids.h @@ -31,6 +31,7 @@ #define PCI_CLASS_SYSTEM_OTHER 0x0880 #define PCI_CLASS_SERIAL_USB 0x0c03 +#define PCI_CLASS_SERIAL_SMBUS 0x0c05 #define PCI_CLASS_BRIDGE_HOST 0x0600 #define PCI_CLASS_BRIDGE_ISA 0x0601 @@ -105,6 +106,7 @@ #define PCI_DEVICE_ID_INTEL_82378 0x0484 #define PCI_DEVICE_ID_INTEL_82441 0x1237 #define PCI_DEVICE_ID_INTEL_82801AA_5 0x2415 +#define PCI_DEVICE_ID_INTEL_82801BA_11 0x244e #define PCI_DEVICE_ID_INTEL_82801D 0x24CD #define PCI_DEVICE_ID_INTEL_ESB_9 0x25ab #define PCI_DEVICE_ID_INTEL_82371SB_0 0x7000 diff --git a/hw/pcie_host.c b/hw/pcie_host.c index 9f7f3d3b1e..c257fb43ca 100644 --- a/hw/pcie_host.c +++ b/hw/pcie_host.c @@ -107,14 +107,9 @@ static const MemoryRegionOps pcie_mmcfg_ops = { /* pcie_host::base_addr == PCIE_BASE_ADDR_UNMAPPED when it isn't mapped. */ #define PCIE_BASE_ADDR_UNMAPPED ((hwaddr)-1ULL) -int pcie_host_init(PCIExpressHost *e, uint32_t size) +int pcie_host_init(PCIExpressHost *e) { - assert(!(size & (size - 1))); /* power of 2 */ - assert(size >= PCIE_MMCFG_SIZE_MIN); - assert(size <= PCIE_MMCFG_SIZE_MAX); e->base_addr = PCIE_BASE_ADDR_UNMAPPED; - e->size = size; - memory_region_init_io(&e->mmio, &pcie_mmcfg_ops, e, "pcie-mmcfg", e->size); return 0; } @@ -123,22 +118,44 @@ void pcie_host_mmcfg_unmap(PCIExpressHost *e) { if (e->base_addr != PCIE_BASE_ADDR_UNMAPPED) { memory_region_del_subregion(get_system_memory(), &e->mmio); + memory_region_destroy(&e->mmio); e->base_addr = PCIE_BASE_ADDR_UNMAPPED; } } -void pcie_host_mmcfg_map(PCIExpressHost *e, hwaddr addr) +void pcie_host_mmcfg_map(PCIExpressHost *e, hwaddr addr, + uint32_t size) { + assert(!(size & (size - 1))); /* power of 2 */ + assert(size >= PCIE_MMCFG_SIZE_MIN); + assert(size <= PCIE_MMCFG_SIZE_MAX); + e->size = size; + memory_region_init_io(&e->mmio, &pcie_mmcfg_ops, e, "pcie-mmcfg", e->size); e->base_addr = addr; memory_region_add_subregion(get_system_memory(), e->base_addr, &e->mmio); } void pcie_host_mmcfg_update(PCIExpressHost *e, int enable, - hwaddr addr) + hwaddr addr, + uint32_t size) { pcie_host_mmcfg_unmap(e); if (enable) { - pcie_host_mmcfg_map(e, addr); + pcie_host_mmcfg_map(e, addr, size); } } + +static const TypeInfo pcie_host_type_info = { + .name = TYPE_PCIE_HOST_BRIDGE, + .parent = TYPE_PCI_HOST_BRIDGE, + .abstract = true, + .instance_size = sizeof(PCIExpressHost), +}; + +static void pcie_host_register_types(void) +{ + type_register_static(&pcie_host_type_info); +} + +type_init(pcie_host_register_types) diff --git a/hw/pcie_host.h b/hw/pcie_host.h index 9978b9f2f1..392193530d 100644 --- a/hw/pcie_host.h +++ b/hw/pcie_host.h @@ -24,6 +24,10 @@ #include "pci_host.h" #include "memory.h" +#define TYPE_PCIE_HOST_BRIDGE "pcie-host-bridge" +#define PCIE_HOST_BRIDGE(obj) \ + OBJECT_CHECK(PCIExpressHost, (obj), TYPE_PCIE_HOST_BRIDGE) + struct PCIExpressHost { PCIHostState pci; @@ -39,11 +43,12 @@ struct PCIExpressHost { MemoryRegion mmio; }; -int pcie_host_init(PCIExpressHost *e, uint32_t size); +int pcie_host_init(PCIExpressHost *e); void pcie_host_mmcfg_unmap(PCIExpressHost *e); -void pcie_host_mmcfg_map(PCIExpressHost *e, hwaddr addr); +void pcie_host_mmcfg_map(PCIExpressHost *e, hwaddr addr, uint32_t size); void pcie_host_mmcfg_update(PCIExpressHost *e, int enable, - hwaddr addr); + hwaddr addr, + uint32_t size); #endif /* PCIE_HOST_H */ diff --git a/hw/vhost_net.c b/hw/vhost_net.c index df2c4a30a2..8241601539 100644 --- a/hw/vhost_net.c +++ b/hw/vhost_net.c @@ -150,10 +150,6 @@ int vhost_net_start(struct vhost_net *net, if (r < 0) { goto fail_notifiers; } - if (net->dev.acked_features & (1 << VIRTIO_NET_F_MRG_RXBUF)) { - tap_set_vnet_hdr_len(net->nc, - sizeof(struct virtio_net_hdr_mrg_rxbuf)); - } r = vhost_dev_start(&net->dev, dev); if (r < 0) { @@ -179,9 +175,6 @@ fail: } net->nc->info->poll(net->nc, true); vhost_dev_stop(&net->dev, dev); - if (net->dev.acked_features & (1 << VIRTIO_NET_F_MRG_RXBUF)) { - tap_set_vnet_hdr_len(net->nc, sizeof(struct virtio_net_hdr)); - } fail_start: vhost_dev_disable_notifiers(&net->dev, dev); fail_notifiers: @@ -199,18 +192,12 @@ void vhost_net_stop(struct vhost_net *net, } net->nc->info->poll(net->nc, true); vhost_dev_stop(&net->dev, dev); - if (net->dev.acked_features & (1 << VIRTIO_NET_F_MRG_RXBUF)) { - tap_set_vnet_hdr_len(net->nc, sizeof(struct virtio_net_hdr)); - } vhost_dev_disable_notifiers(&net->dev, dev); } void vhost_net_cleanup(struct vhost_net *net) { vhost_dev_cleanup(&net->dev); - if (net->dev.acked_features & (1 << VIRTIO_NET_F_MRG_RXBUF)) { - tap_set_vnet_hdr_len(net->nc, sizeof(struct virtio_net_hdr)); - } g_free(net); } #else diff --git a/hw/virtio-net.c b/hw/virtio-net.c index 50ba728c02..108ce07cfc 100644 --- a/hw/virtio-net.c +++ b/hw/virtio-net.c @@ -41,6 +41,8 @@ typedef struct VirtIONet int32_t tx_burst; int tx_waiting; uint32_t has_vnet_hdr; + size_t host_hdr_len; + size_t guest_hdr_len; uint8_t has_ufo; struct { VirtQueueElement elem; @@ -200,16 +202,19 @@ static void virtio_net_reset(VirtIODevice *vdev) memset(n->vlans, 0, MAX_VLAN >> 3); } -static int peer_has_vnet_hdr(VirtIONet *n) +static void peer_test_vnet_hdr(VirtIONet *n) { if (!n->nic->nc.peer) - return 0; + return; if (n->nic->nc.peer->info->type != NET_CLIENT_OPTIONS_KIND_TAP) - return 0; + return; n->has_vnet_hdr = tap_has_vnet_hdr(n->nic->nc.peer); +} +static int peer_has_vnet_hdr(VirtIONet *n) +{ return n->has_vnet_hdr; } @@ -223,15 +228,27 @@ static int peer_has_ufo(VirtIONet *n) return n->has_ufo; } +static void virtio_net_set_mrg_rx_bufs(VirtIONet *n, int mergeable_rx_bufs) +{ + n->mergeable_rx_bufs = mergeable_rx_bufs; + + n->guest_hdr_len = n->mergeable_rx_bufs ? + sizeof(struct virtio_net_hdr_mrg_rxbuf) : sizeof(struct virtio_net_hdr); + + if (peer_has_vnet_hdr(n) && + tap_has_vnet_hdr_len(n->nic->nc.peer, n->guest_hdr_len)) { + tap_set_vnet_hdr_len(n->nic->nc.peer, n->guest_hdr_len); + n->host_hdr_len = n->guest_hdr_len; + } +} + static uint32_t virtio_net_get_features(VirtIODevice *vdev, uint32_t features) { VirtIONet *n = to_virtio_net(vdev); features |= (1 << VIRTIO_NET_F_MAC); - if (peer_has_vnet_hdr(n)) { - tap_using_vnet_hdr(n->nic->nc.peer, 1); - } else { + if (!peer_has_vnet_hdr(n)) { features &= ~(0x1 << VIRTIO_NET_F_CSUM); features &= ~(0x1 << VIRTIO_NET_F_HOST_TSO4); features &= ~(0x1 << VIRTIO_NET_F_HOST_TSO6); @@ -277,7 +294,7 @@ static void virtio_net_set_features(VirtIODevice *vdev, uint32_t features) { VirtIONet *n = to_virtio_net(vdev); - n->mergeable_rx_bufs = !!(features & (1 << VIRTIO_NET_F_MRG_RXBUF)); + virtio_net_set_mrg_rx_bufs(n, !!(features & (1 << VIRTIO_NET_F_MRG_RXBUF))); if (n->has_vnet_hdr) { tap_set_offload(n->nic->nc.peer, @@ -499,41 +516,34 @@ static int virtio_net_has_buffers(VirtIONet *n, int bufsize) * cache. */ static void work_around_broken_dhclient(struct virtio_net_hdr *hdr, - const uint8_t *buf, size_t size) + uint8_t *buf, size_t size) { if ((hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && /* missing csum */ (size > 27 && size < 1500) && /* normal sized MTU */ (buf[12] == 0x08 && buf[13] == 0x00) && /* ethertype == IPv4 */ (buf[23] == 17) && /* ip.protocol == UDP */ (buf[34] == 0 && buf[35] == 67)) { /* udp.srcport == bootps */ - /* FIXME this cast is evil */ - net_checksum_calculate((uint8_t *)buf, size); + net_checksum_calculate(buf, size); hdr->flags &= ~VIRTIO_NET_HDR_F_NEEDS_CSUM; } } -static int receive_header(VirtIONet *n, struct iovec *iov, int iovcnt, - const void *buf, size_t size, size_t hdr_len) +static void receive_header(VirtIONet *n, const struct iovec *iov, int iov_cnt, + const void *buf, size_t size) { - struct virtio_net_hdr *hdr = (struct virtio_net_hdr *)iov[0].iov_base; - int offset = 0; - - hdr->flags = 0; - hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE; - if (n->has_vnet_hdr) { - memcpy(hdr, buf, sizeof(*hdr)); - offset = sizeof(*hdr); - work_around_broken_dhclient(hdr, buf + offset, size - offset); + /* FIXME this cast is evil */ + void *wbuf = (void *)buf; + work_around_broken_dhclient(wbuf, wbuf + n->host_hdr_len, + size - n->host_hdr_len); + iov_from_buf(iov, iov_cnt, 0, buf, sizeof(struct virtio_net_hdr)); + } else { + struct virtio_net_hdr hdr = { + .flags = 0, + .gso_type = VIRTIO_NET_HDR_GSO_NONE + }; + iov_from_buf(iov, iov_cnt, 0, &hdr, sizeof hdr); } - - /* We only ever receive a struct virtio_net_hdr from the tapfd, - * but we may be passing along a larger header to the guest. - */ - iov[0].iov_base += hdr_len; - iov[0].iov_len -= hdr_len; - - return offset; } static int receive_filter(VirtIONet *n, const uint8_t *buf, int size) @@ -546,9 +556,7 @@ static int receive_filter(VirtIONet *n, const uint8_t *buf, int size) if (n->promisc) return 1; - if (n->has_vnet_hdr) { - ptr += sizeof(struct virtio_net_hdr); - } + ptr += n->host_hdr_len; if (!memcmp(&ptr[12], vlan, sizeof(vlan))) { int vid = be16_to_cpup((uint16_t *)(ptr + 14)) & 0xfff; @@ -592,19 +600,16 @@ static int receive_filter(VirtIONet *n, const uint8_t *buf, int size) static ssize_t virtio_net_receive(NetClientState *nc, const uint8_t *buf, size_t size) { VirtIONet *n = DO_UPCAST(NICState, nc, nc)->opaque; - struct virtio_net_hdr_mrg_rxbuf *mhdr = NULL; - size_t guest_hdr_len, offset, i, host_hdr_len; + struct iovec mhdr_sg[VIRTQUEUE_MAX_SIZE]; + struct virtio_net_hdr_mrg_rxbuf mhdr; + unsigned mhdr_cnt = 0; + size_t offset, i, guest_offset; if (!virtio_net_can_receive(&n->nic->nc)) return -1; /* hdr_len refers to the header we supply to the guest */ - guest_hdr_len = n->mergeable_rx_bufs ? - sizeof(struct virtio_net_hdr_mrg_rxbuf) : sizeof(struct virtio_net_hdr); - - - host_hdr_len = n->has_vnet_hdr ? sizeof(struct virtio_net_hdr) : 0; - if (!virtio_net_has_buffers(n, size + guest_hdr_len - host_hdr_len)) + if (!virtio_net_has_buffers(n, size + n->guest_hdr_len - n->host_hdr_len)) return 0; if (!receive_filter(n, buf, size)) @@ -615,7 +620,7 @@ static ssize_t virtio_net_receive(NetClientState *nc, const uint8_t *buf, size_t while (offset < size) { VirtQueueElement elem; int len, total; - struct iovec sg[VIRTQUEUE_MAX_SIZE]; + const struct iovec *sg = elem.in_sg; total = 0; @@ -626,7 +631,7 @@ static ssize_t virtio_net_receive(NetClientState *nc, const uint8_t *buf, size_t "i %zd mergeable %d offset %zd, size %zd, " "guest hdr len %zd, host hdr len %zd guest features 0x%x", i, n->mergeable_rx_bufs, offset, size, - guest_hdr_len, host_hdr_len, n->vdev.guest_features); + n->guest_hdr_len, n->host_hdr_len, n->vdev.guest_features); exit(1); } @@ -635,24 +640,25 @@ static ssize_t virtio_net_receive(NetClientState *nc, const uint8_t *buf, size_t exit(1); } - if (!n->mergeable_rx_bufs && elem.in_sg[0].iov_len != guest_hdr_len) { - error_report("virtio-net header not in first element"); - exit(1); - } - - memcpy(&sg, &elem.in_sg[0], sizeof(sg[0]) * elem.in_num); - if (i == 0) { - if (n->mergeable_rx_bufs) - mhdr = (struct virtio_net_hdr_mrg_rxbuf *)sg[0].iov_base; + assert(offset == 0); + if (n->mergeable_rx_bufs) { + mhdr_cnt = iov_copy(mhdr_sg, ARRAY_SIZE(mhdr_sg), + sg, elem.in_num, + offsetof(typeof(mhdr), num_buffers), + sizeof(mhdr.num_buffers)); + } - offset += receive_header(n, sg, elem.in_num, - buf + offset, size - offset, guest_hdr_len); - total += guest_hdr_len; + receive_header(n, sg, elem.in_num, buf, size); + offset = n->host_hdr_len; + total += n->guest_hdr_len; + guest_offset = n->guest_hdr_len; + } else { + guest_offset = 0; } /* copy in packet. ugh */ - len = iov_from_buf(sg, elem.in_num, 0, + len = iov_from_buf(sg, elem.in_num, guest_offset, buf + offset, size - offset); total += len; offset += len; @@ -665,7 +671,7 @@ static ssize_t virtio_net_receive(NetClientState *nc, const uint8_t *buf, size_t "i %zd mergeable %d offset %zd, size %zd, " "guest hdr len %zd, host hdr len %zd", i, n->mergeable_rx_bufs, - offset, size, guest_hdr_len, host_hdr_len); + offset, size, n->guest_hdr_len, n->host_hdr_len); #endif return size; } @@ -674,8 +680,11 @@ static ssize_t virtio_net_receive(NetClientState *nc, const uint8_t *buf, size_t virtqueue_fill(n->rx_vq, &elem, total, i++); } - if (mhdr) { - stw_p(&mhdr->num_buffers, i); + if (mhdr_cnt) { + stw_p(&mhdr.num_buffers, i); + iov_from_buf(mhdr_sg, mhdr_cnt, + 0, + &mhdr.num_buffers, sizeof mhdr.num_buffers); } virtqueue_flush(n->rx_vq, i); @@ -716,33 +725,35 @@ static int32_t virtio_net_flush_tx(VirtIONet *n, VirtQueue *vq) } while (virtqueue_pop(vq, &elem)) { - ssize_t ret, len = 0; + ssize_t ret, len; unsigned int out_num = elem.out_num; struct iovec *out_sg = &elem.out_sg[0]; - unsigned hdr_len; - - /* hdr_len refers to the header received from the guest */ - hdr_len = n->mergeable_rx_bufs ? - sizeof(struct virtio_net_hdr_mrg_rxbuf) : - sizeof(struct virtio_net_hdr); + struct iovec sg[VIRTQUEUE_MAX_SIZE]; - if (out_num < 1 || out_sg->iov_len != hdr_len) { + if (out_num < 1) { error_report("virtio-net header not in first element"); exit(1); } - /* ignore the header if GSO is not supported */ - if (!n->has_vnet_hdr) { - out_num--; - out_sg++; - len += hdr_len; - } else if (n->mergeable_rx_bufs) { - /* tapfd expects a struct virtio_net_hdr */ - hdr_len -= sizeof(struct virtio_net_hdr); - out_sg->iov_len -= hdr_len; - len += hdr_len; + /* + * If host wants to see the guest header as is, we can + * pass it on unchanged. Otherwise, copy just the parts + * that host is interested in. + */ + assert(n->host_hdr_len <= n->guest_hdr_len); + if (n->host_hdr_len != n->guest_hdr_len) { + unsigned sg_num = iov_copy(sg, ARRAY_SIZE(sg), + out_sg, out_num, + 0, n->host_hdr_len); + sg_num += iov_copy(sg + sg_num, ARRAY_SIZE(sg) - sg_num, + out_sg, out_num, + n->guest_hdr_len, -1); + out_num = sg_num; + out_sg = sg; } + len = n->guest_hdr_len; + ret = qemu_sendv_packet_async(&n->nic->nc, out_sg, out_num, virtio_net_tx_complete); if (ret == 0) { @@ -899,7 +910,8 @@ static int virtio_net_load(QEMUFile *f, void *opaque, int version_id) qemu_get_buffer(f, n->mac, ETH_ALEN); n->tx_waiting = qemu_get_be32(f); - n->mergeable_rx_bufs = qemu_get_be32(f); + + virtio_net_set_mrg_rx_bufs(n, qemu_get_be32(f)); if (version_id >= 3) n->status = qemu_get_be16(f); @@ -939,7 +951,6 @@ static int virtio_net_load(QEMUFile *f, void *opaque, int version_id) } if (n->has_vnet_hdr) { - tap_using_vnet_hdr(n->nic->nc.peer, 1); tap_set_offload(n->nic->nc.peer, (n->vdev.guest_features >> VIRTIO_NET_F_GUEST_CSUM) & 1, (n->vdev.guest_features >> VIRTIO_NET_F_GUEST_TSO4) & 1, @@ -1038,12 +1049,19 @@ VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf, n->status = VIRTIO_NET_S_LINK_UP; n->nic = qemu_new_nic(&net_virtio_info, conf, object_get_typename(OBJECT(dev)), dev->id, n); + peer_test_vnet_hdr(n); + if (peer_has_vnet_hdr(n)) { + tap_using_vnet_hdr(n->nic->nc.peer, 1); + n->host_hdr_len = sizeof(struct virtio_net_hdr); + } else { + n->host_hdr_len = 0; + } qemu_format_nic_info_str(&n->nic->nc, conf->macaddr.a); n->tx_waiting = 0; n->tx_burst = net->txburst; - n->mergeable_rx_bufs = 0; + virtio_net_set_mrg_rx_bufs(n, 0); n->promisc = 1; /* for compatibility */ n->mac_table.macs = g_malloc0(MAC_TABLE_ENTRIES * ETH_ALEN); @@ -228,3 +228,26 @@ void iov_hexdump(const struct iovec *iov, const unsigned int iov_cnt, fprintf(fp, "\n"); } } + +unsigned iov_copy(struct iovec *dst_iov, unsigned int dst_iov_cnt, + const struct iovec *iov, unsigned int iov_cnt, + size_t offset, size_t bytes) +{ + size_t len; + unsigned int i, j; + for (i = 0, j = 0; i < iov_cnt && j < dst_iov_cnt && bytes; i++) { + if (offset >= iov[i].iov_len) { + offset -= iov[i].iov_len; + continue; + } + len = MIN(bytes, iov[i].iov_len - offset); + + dst_iov[j].iov_base = iov[i].iov_base + offset; + dst_iov[j].iov_len = len; + j++; + bytes -= len; + offset = 0; + } + assert(offset == 0); + return j; +} @@ -86,3 +86,12 @@ ssize_t iov_send_recv(int sockfd, struct iovec *iov, unsigned iov_cnt, */ void iov_hexdump(const struct iovec *iov, const unsigned int iov_cnt, FILE *fp, const char *prefix, size_t limit); + +/* + * Partial copy of vector from iov to dst_iov (data is not copied). + * dst_iov overlaps iov at a specified offset. + * size of dst_iov is at most bytes. dst vector count is returned. + */ +unsigned iov_copy(struct iovec *dst_iov, unsigned int dst_iov_cnt, + const struct iovec *iov, unsigned int iov_cnt, + size_t offset, size_t bytes); diff --git a/target-i386/cpu.c b/target-i386/cpu.c index 6411042b79..d4f2e65cd9 100644 --- a/target-i386/cpu.c +++ b/target-i386/cpu.c @@ -125,6 +125,25 @@ typedef struct model_features_t { int check_cpuid = 0; int enforce_cpuid = 0; +#if defined(CONFIG_KVM) +static uint32_t kvm_default_features = (1 << KVM_FEATURE_CLOCKSOURCE) | + (1 << KVM_FEATURE_NOP_IO_DELAY) | + (1 << KVM_FEATURE_MMU_OP) | + (1 << KVM_FEATURE_CLOCKSOURCE2) | + (1 << KVM_FEATURE_ASYNC_PF) | + (1 << KVM_FEATURE_STEAL_TIME) | + (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); +static const uint32_t kvm_pv_eoi_features = (0x1 << KVM_FEATURE_PV_EOI); +#else +static uint32_t kvm_default_features = 0; +static const uint32_t kvm_pv_eoi_features = 0; +#endif + +void enable_kvm_pv_eoi(void) +{ + kvm_default_features |= kvm_pv_eoi_features; +} + void host_cpuid(uint32_t function, uint32_t count, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) { @@ -1108,7 +1127,7 @@ static int cpu_x86_find_by_name(x86_def_t *x86_cpu_def, const char *cpu_model) /* Features to be added*/ uint32_t plus_features = 0, plus_ext_features = 0; uint32_t plus_ext2_features = 0, plus_ext3_features = 0; - uint32_t plus_kvm_features = 0, plus_svm_features = 0; + uint32_t plus_kvm_features = kvm_default_features, plus_svm_features = 0; uint32_t plus_7_0_ebx_features = 0; /* Features to be removed */ uint32_t minus_features = 0, minus_ext_features = 0; @@ -1128,18 +1147,6 @@ static int cpu_x86_find_by_name(x86_def_t *x86_cpu_def, const char *cpu_model) memcpy(x86_cpu_def, def, sizeof(*def)); } -#if defined(CONFIG_KVM) - plus_kvm_features = (1 << KVM_FEATURE_CLOCKSOURCE) | - (1 << KVM_FEATURE_NOP_IO_DELAY) | - (1 << KVM_FEATURE_MMU_OP) | - (1 << KVM_FEATURE_CLOCKSOURCE2) | - (1 << KVM_FEATURE_ASYNC_PF) | - (1 << KVM_FEATURE_STEAL_TIME) | - (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); -#else - plus_kvm_features = 0; -#endif - add_flagname_to_bitmaps("hypervisor", &plus_features, &plus_ext_features, &plus_ext2_features, &plus_ext3_features, &plus_kvm_features, &plus_svm_features, &plus_7_0_ebx_features); diff --git a/target-i386/cpu.h b/target-i386/cpu.h index 871c270116..de33303dea 100644 --- a/target-i386/cpu.h +++ b/target-i386/cpu.h @@ -1188,4 +1188,6 @@ void do_smm_enter(CPUX86State *env1); void cpu_report_tpr_access(CPUX86State *env, TPRAccess access); +void enable_kvm_pv_eoi(void); + #endif /* CPU_I386_H */ |