Merge remote-tracking branch 'mst/tags/for_anthony' into staging

virtio,pci infrastructure This includes infrastructure patches that don't do much by themselves but should help vfio and q35 make progress. Also included is rework of virtio-net to use iovec APIs for vector access - helpful to make it more secure and in preparation for a new feature that will allow arbitrary s/g layout for guests. Also included is a pci bridge bugfix by Avi. Signed-off-by: Michael S. Tsirkin <mst@redhat.com> * mst/tags/for_anthony: (25 commits) pci: avoid destroying bridge address space windows in a transaction virtio-net: enable mrg buf header in tap on linux virtio-net: test peer header support at init time virtio-net: minor code simplification virtio-net: simplify rx code virtio-net: switch tx to safe iov functions virtio-net: first s/g is always at start of buf virtio-net: refactor receive_hdr virtio-net: use safe iov operations for rx virtio-net: avoid sg copy iov: add iov_cpy virtio-net: track host/guest header length pcie: Convert PCIExpressHost to use the QOM. pcie: pass pcie window size to pcie_host_mmcfg_update() pci: Add class 0xc05 as 'SMBus' pci: introduce pci_swizzle_map_irq_fn() for standardized interrupt pin swizzle pci_ids: add intel 82801BA pci-to-pci bridge id pci: pci capability must be in PCI space pci: make each capability DWORD aligned qemu: enable PV EOI for qemu 1.3 ... Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
author: Anthony Liguori <aliguori@us.ibm.com> 2012-10-29 14:31:47 -0500
committer: Anthony Liguori <aliguori@us.ibm.com> 2012-10-29 14:31:47 -0500
commit: 233926fafa6c4a0fb666e1469524d66dd3b47ddd (patch)
tree: 2dbe1506ee13fbc7d2d56b317f596114d0f849bb
parent: b308c82cbda44e138ef990af64d44a5613c16092 (diff)
parent: 523a59f596a3e62f5a28eb171adba35e71310040 (diff)
15 files changed, 266 insertions, 146 deletions
diff --git a/hw/kvm/pci-assign.c b/hw/kvm/pci-assign.c
index bfffbab1b3..e80dad009c 100644
--- a/hw/kvm/pci-assign.c
+++ b/hw/kvm/pci-assign.c
@@ -882,8 +882,7 @@ static int assign_intx(AssignedDevice *dev)
     intx_route = pci_device_route_intx_to_irq(&dev->dev, dev->intpin);
     assert(intx_route.mode != PCI_INTX_INVERTED);
 
-    if (dev->intx_route.mode == intx_route.mode &&
-        dev->intx_route.irq == intx_route.irq) {
+    if (!pci_intx_route_changed(&dev->intx_route, &intx_route)) {
         return 0;
     }
 
@@ -997,12 +996,9 @@ static void assigned_dev_update_msi(PCIDevice *pci_dev)
     }
 
     if (ctrl_byte & PCI_MSI_FLAGS_ENABLE) {
-        uint8_t *pos = pci_dev->config + pci_dev->msi_cap;
-        MSIMessage msg;
+        MSIMessage msg = msi_get_message(pci_dev, 0);
         int virq;
 
-        msg.address = pci_get_long(pos + PCI_MSI_ADDRESS_LO);
-        msg.data = pci_get_word(pos + PCI_MSI_DATA_32);
         virq = kvm_irqchip_add_msi_route(kvm_state, msg);
         if (virq < 0) {
             perror("assigned_dev_update_msi: kvm_irqchip_add_msi_route");
diff --git a/hw/msi.c b/hw/msi.c
index e2273a09ae..33037a80e9 100644
--- a/hw/msi.c
+++ b/hw/msi.c
@@ -122,6 +122,31 @@ void msi_set_message(PCIDevice *dev, MSIMessage msg)
     pci_set_word(dev->config + msi_data_off(dev, msi64bit), msg.data);
 }
 
+MSIMessage msi_get_message(PCIDevice *dev, unsigned int vector)
+{
+    uint16_t flags = pci_get_word(dev->config + msi_flags_off(dev));
+    bool msi64bit = flags & PCI_MSI_FLAGS_64BIT;
+    unsigned int nr_vectors = msi_nr_vectors(flags);
+    MSIMessage msg;
+
+    assert(vector < nr_vectors);
+
+    if (msi64bit) {
+        msg.address = pci_get_quad(dev->config + msi_address_lo_off(dev));
+    } else {
+        msg.address = pci_get_long(dev->config + msi_address_lo_off(dev));
+    }
+
+    /* upper bit 31:16 is zero */
+    msg.data = pci_get_word(dev->config + msi_data_off(dev, msi64bit));
+    if (nr_vectors > 1) {
+        msg.data &= ~(nr_vectors - 1);
+        msg.data |= vector;
+    }
+
+    return msg;
+}
+
 bool msi_enabled(const PCIDevice *dev)
 {
     return msi_present(dev) &&
@@ -249,8 +274,7 @@ void msi_notify(PCIDevice *dev, unsigned int vector)
     uint16_t flags = pci_get_word(dev->config + msi_flags_off(dev));
     bool msi64bit = flags & PCI_MSI_FLAGS_64BIT;
     unsigned int nr_vectors = msi_nr_vectors(flags);
-    uint64_t address;
-    uint32_t data;
+    MSIMessage msg;
 
     assert(vector < nr_vectors);
     if (msi_is_masked(dev, vector)) {
@@ -261,24 +285,13 @@ void msi_notify(PCIDevice *dev, unsigned int vector)
         return;
     }
 
-    if (msi64bit) {
-        address = pci_get_quad(dev->config + msi_address_lo_off(dev));
-    } else {
-        address = pci_get_long(dev->config + msi_address_lo_off(dev));
-    }
-
-    /* upper bit 31:16 is zero */
-    data = pci_get_word(dev->config + msi_data_off(dev, msi64bit));
-    if (nr_vectors > 1) {
-        data &= ~(nr_vectors - 1);
-        data |= vector;
-    }
+    msg = msi_get_message(dev, vector);
 
     MSI_DEV_PRINTF(dev,
                    "notify vector 0x%x"
                    " address: 0x%"PRIx64" data: 0x%"PRIx32"\n",
-                   vector, address, data);
-    stl_le_phys(address, data);
+                   vector, msg.address, msg.data);
+    stl_le_phys(msg.address, msg.data);
 }
 
 /* Normally called by pci_default_write_config(). */
diff --git a/hw/msi.h b/hw/msi.h
index 6ec1f99f80..150b09a19d 100644
--- a/hw/msi.h
+++ b/hw/msi.h
@@ -32,6 +32,7 @@ struct MSIMessage {
 extern bool msi_supported;
 
 void msi_set_message(PCIDevice *dev, MSIMessage msg);
+MSIMessage msi_get_message(PCIDevice *dev, unsigned int vector);
 bool msi_enabled(const PCIDevice *dev);
 int msi_init(struct PCIDevice *dev, uint8_t offset,
              unsigned int nr_vectors, bool msi64bit, bool msi_per_vector_mask);
diff --git a/hw/pc_piix.c b/hw/pc_piix.c
index c7dd75b0c0..85529b2cea 100644
--- a/hw/pc_piix.c
+++ b/hw/pc_piix.c
@@ -43,6 +43,7 @@
 #include "xen.h"
 #include "memory.h"
 #include "exec-memory.h"
+#include "cpu.h"
 #ifdef CONFIG_XEN
 #  include <xen/hvm/hvm_info_table.h>
 #endif
@@ -302,6 +303,12 @@ static void pc_init_pci(QEMUMachineInitArgs *args)
              initrd_filename, cpu_model, 1, 1);
 }
 
+static void pc_init_pci_1_3(QEMUMachineInitArgs *args)
+{
+    enable_kvm_pv_eoi();
+    pc_init_pci(args);
+}
+
 static void pc_init_pci_no_kvmclock(QEMUMachineInitArgs *args)
 {
     ram_addr_t ram_size = args->ram_size;
@@ -349,7 +356,7 @@ static QEMUMachine pc_machine_v1_3 = {
     .name = "pc-1.3",
     .alias = "pc",
     .desc = "Standard PC",
-    .init = pc_init_pci,
+    .init = pc_init_pci_1_3,
     .max_cpus = 255,
     .is_default = 1,
 };
diff --git a/hw/pci.c b/hw/pci.c
index d44fd0e10a..dceda0bdc5 100644
--- a/hw/pci.c
+++ b/hw/pci.c
@@ -1117,10 +1117,21 @@ PCIINTxRoute pci_device_route_intx_to_irq(PCIDevice *dev, int pin)
          pin = bus->map_irq(dev, pin);
          dev = bus->parent_dev;
     } while (dev);
-    assert(bus->route_intx_to_irq);
+
+    if (!bus->route_intx_to_irq) {
+        error_report("PCI: Bug - unimplemented PCI INTx routing (%s)\n",
+                     object_get_typename(OBJECT(bus->qbus.parent)));
+        return (PCIINTxRoute) { PCI_INTX_DISABLED, -1 };
+    }
+
     return bus->route_intx_to_irq(bus->irq_opaque, pin);
 }
 
+bool pci_intx_route_changed(PCIINTxRoute *old, PCIINTxRoute *new)
+{
+    return old->mode != new->mode || old->irq != new->irq;
+}
+
 void pci_bus_fire_intx_routing_notifier(PCIBus *bus)
 {
     PCIDevice *dev;
@@ -1144,6 +1155,24 @@ void pci_device_set_intx_routing_notifier(PCIDevice *dev,
     dev->intx_routing_notifier = notifier;
 }
 
+/*
+ * PCI-to-PCI bridge specification
+ * 9.1: Interrupt routing. Table 9-1
+ *
+ * the PCI Express Base Specification, Revision 2.1
+ * 2.2.8.1: INTx interrutp signaling - Rules
+ *          the Implementation Note
+ *          Table 2-20
+ */
+/*
+ * 0 <= pin <= 3 0 = INTA, 1 = INTB, 2 = INTC, 3 = INTD
+ * 0-origin unlike PCI interrupt pin register.
+ */
+int pci_swizzle_map_irq_fn(PCIDevice *pci_dev, int pin)
+{
+    return (pin + PCI_SLOT(pci_dev->devfn)) % PCI_NUM_PINS;
+}
+
 /***********************************************************/
 /* monitor info on PCI */
 
@@ -1208,6 +1237,7 @@ static const pci_class_desc pci_class_descriptions[] =
     { 0x0c02, "SSA controller", "ssa"},
     { 0x0c03, "USB controller", "usb"},
     { 0x0c04, "Fibre channel controller", "fibre-channel"},
+    { 0x0c05, "SMBus"},
     { 0, NULL}
 };
 
@@ -1667,16 +1697,16 @@ PCIDevice *pci_create_simple(PCIBus *bus, int devfn, const char *name)
     return pci_create_simple_multifunction(bus, devfn, false, name);
 }
 
-static int pci_find_space(PCIDevice *pdev, uint8_t size)
+static uint8_t pci_find_space(PCIDevice *pdev, uint8_t size)
 {
-    int config_size = pci_config_size(pdev);
     int offset = PCI_CONFIG_HEADER_SIZE;
     int i;
-    for (i = PCI_CONFIG_HEADER_SIZE; i < config_size; ++i)
+    for (i = PCI_CONFIG_HEADER_SIZE; i < PCI_CONFIG_SPACE_SIZE; ++i) {
         if (pdev->used[i])
             offset = i + 1;
         else if (i - offset + 1 == size)
             return offset;
+    }
     return 0;
 }
 
@@ -1895,7 +1925,7 @@ int pci_add_capability(PCIDevice *pdev, uint8_t cap_id,
     config[PCI_CAP_LIST_NEXT] = pdev->config[PCI_CAPABILITY_LIST];
     pdev->config[PCI_CAPABILITY_LIST] = offset;
     pdev->config[PCI_STATUS] |= PCI_STATUS_CAP_LIST;
-    memset(pdev->used + offset, 0xFF, size);
+    memset(pdev->used + offset, 0xFF, QEMU_ALIGN_UP(size, 4));
     /* Make capability read-only by default */
     memset(pdev->wmask + offset, 0, size);
     /* Check capability by default */
@@ -1915,7 +1945,7 @@ void pci_del_capability(PCIDevice *pdev, uint8_t cap_id, uint8_t size)
     memset(pdev->w1cmask + offset, 0, size);
     /* Clear cmask as device-specific registers can't be checked */
     memset(pdev->cmask + offset, 0, size);
-    memset(pdev->used + offset, 0, size);
+    memset(pdev->used + offset, 0, QEMU_ALIGN_UP(size, 4));
 
     if (!pdev->config[PCI_CAPABILITY_LIST])
         pdev->config[PCI_STATUS] &= ~PCI_STATUS_CAP_LIST;
diff --git a/hw/pci.h b/hw/pci.h
index 1f902f5b59..241c1d8905 100644
--- a/hw/pci.h
+++ b/hw/pci.h
@@ -318,6 +318,8 @@ void pci_bus_irqs(PCIBus *bus, pci_set_irq_fn set_irq, pci_map_irq_fn map_irq,
                   void *irq_opaque, int nirq);
 int pci_bus_get_irq_level(PCIBus *bus, int irq_num);
 void pci_bus_hotplug(PCIBus *bus, pci_hotplug_fn hotplug, DeviceState *dev);
+/* 0 <= pin <= 3 0 = INTA, 1 = INTB, 2 = INTC, 3 = INTD */
+int pci_swizzle_map_irq_fn(PCIDevice *pci_dev, int pin);
 PCIBus *pci_register_bus(DeviceState *parent, const char *name,
                          pci_set_irq_fn set_irq, pci_map_irq_fn map_irq,
                          void *irq_opaque,
@@ -326,6 +328,7 @@ PCIBus *pci_register_bus(DeviceState *parent, const char *name,
                          uint8_t devfn_min, int nirq);
 void pci_bus_set_route_irq_fn(PCIBus *, pci_route_irq_fn);
 PCIINTxRoute pci_device_route_intx_to_irq(PCIDevice *dev, int pin);
+bool pci_intx_route_changed(PCIINTxRoute *old, PCIINTxRoute *new);
 void pci_bus_fire_intx_routing_notifier(PCIBus *bus);
 void pci_device_set_intx_routing_notifier(PCIDevice *dev,
                                           PCIINTxRoutingNotifier notifier);
diff --git a/hw/pci_ids.h b/hw/pci_ids.h
index c017a79998..41f3570fb9 100644
--- a/hw/pci_ids.h
+++ b/hw/pci_ids.h
@@ -31,6 +31,7 @@
 #define PCI_CLASS_SYSTEM_OTHER           0x0880
 
 #define PCI_CLASS_SERIAL_USB             0x0c03
+#define PCI_CLASS_SERIAL_SMBUS           0x0c05
 
 #define PCI_CLASS_BRIDGE_HOST            0x0600
 #define PCI_CLASS_BRIDGE_ISA             0x0601
@@ -105,6 +106,7 @@
 #define PCI_DEVICE_ID_INTEL_82378        0x0484
 #define PCI_DEVICE_ID_INTEL_82441        0x1237
 #define PCI_DEVICE_ID_INTEL_82801AA_5    0x2415
+#define PCI_DEVICE_ID_INTEL_82801BA_11   0x244e
 #define PCI_DEVICE_ID_INTEL_82801D       0x24CD
 #define PCI_DEVICE_ID_INTEL_ESB_9        0x25ab
 #define PCI_DEVICE_ID_INTEL_82371SB_0    0x7000
diff --git a/hw/pcie_host.c b/hw/pcie_host.c
index 9f7f3d3b1e..c257fb43ca 100644
--- a/hw/pcie_host.c
+++ b/hw/pcie_host.c
@@ -107,14 +107,9 @@ static const MemoryRegionOps pcie_mmcfg_ops = {
 /* pcie_host::base_addr == PCIE_BASE_ADDR_UNMAPPED when it isn't mapped. */
 #define PCIE_BASE_ADDR_UNMAPPED  ((hwaddr)-1ULL)
 
-int pcie_host_init(PCIExpressHost *e, uint32_t size)
+int pcie_host_init(PCIExpressHost *e)
 {
-    assert(!(size & (size - 1)));       /* power of 2 */
-    assert(size >= PCIE_MMCFG_SIZE_MIN);
-    assert(size <= PCIE_MMCFG_SIZE_MAX);
     e->base_addr = PCIE_BASE_ADDR_UNMAPPED;
-    e->size = size;
-    memory_region_init_io(&e->mmio, &pcie_mmcfg_ops, e, "pcie-mmcfg", e->size);
 
     return 0;
 }
@@ -123,22 +118,44 @@ void pcie_host_mmcfg_unmap(PCIExpressHost *e)
 {
     if (e->base_addr != PCIE_BASE_ADDR_UNMAPPED) {
         memory_region_del_subregion(get_system_memory(), &e->mmio);
+        memory_region_destroy(&e->mmio);
         e->base_addr = PCIE_BASE_ADDR_UNMAPPED;
     }
 }
 
-void pcie_host_mmcfg_map(PCIExpressHost *e, hwaddr addr)
+void pcie_host_mmcfg_map(PCIExpressHost *e, hwaddr addr,
+                         uint32_t size)
 {
+    assert(!(size & (size - 1)));       /* power of 2 */
+    assert(size >= PCIE_MMCFG_SIZE_MIN);
+    assert(size <= PCIE_MMCFG_SIZE_MAX);
+    e->size = size;
+    memory_region_init_io(&e->mmio, &pcie_mmcfg_ops, e, "pcie-mmcfg", e->size);
     e->base_addr = addr;
     memory_region_add_subregion(get_system_memory(), e->base_addr, &e->mmio);
 }
 
 void pcie_host_mmcfg_update(PCIExpressHost *e,
                             int enable,
-                            hwaddr addr)
+                            hwaddr addr,
+                            uint32_t size)
 {
     pcie_host_mmcfg_unmap(e);
     if (enable) {
-        pcie_host_mmcfg_map(e, addr);
+        pcie_host_mmcfg_map(e, addr, size);
     }
 }
+
+static const TypeInfo pcie_host_type_info = {
+    .name = TYPE_PCIE_HOST_BRIDGE,
+    .parent = TYPE_PCI_HOST_BRIDGE,
+    .abstract = true,
+    .instance_size = sizeof(PCIExpressHost),
+};
+
+static void pcie_host_register_types(void)
+{
+    type_register_static(&pcie_host_type_info);
+}
+
+type_init(pcie_host_register_types)
diff --git a/hw/pcie_host.h b/hw/pcie_host.h
index 9978b9f2f1..392193530d 100644
--- a/hw/pcie_host.h
+++ b/hw/pcie_host.h
@@ -24,6 +24,10 @@
 #include "pci_host.h"
 #include "memory.h"
 
+#define TYPE_PCIE_HOST_BRIDGE "pcie-host-bridge"
+#define PCIE_HOST_BRIDGE(obj) \
+    OBJECT_CHECK(PCIExpressHost, (obj), TYPE_PCIE_HOST_BRIDGE)
+
 struct PCIExpressHost {
     PCIHostState pci;
 
@@ -39,11 +43,12 @@ struct PCIExpressHost {
     MemoryRegion mmio;
 };
 
-int pcie_host_init(PCIExpressHost *e, uint32_t size);
+int pcie_host_init(PCIExpressHost *e);
 void pcie_host_mmcfg_unmap(PCIExpressHost *e);
-void pcie_host_mmcfg_map(PCIExpressHost *e, hwaddr addr);
+void pcie_host_mmcfg_map(PCIExpressHost *e, hwaddr addr, uint32_t size);
 void pcie_host_mmcfg_update(PCIExpressHost *e,
                             int enable,
-                            hwaddr addr);
+                            hwaddr addr,
+                            uint32_t size);
 
 #endif /* PCIE_HOST_H */
diff --git a/hw/vhost_net.c b/hw/vhost_net.c
index df2c4a30a2..8241601539 100644
--- a/hw/vhost_net.c
+++ b/hw/vhost_net.c
@@ -150,10 +150,6 @@ int vhost_net_start(struct vhost_net *net,
     if (r < 0) {
         goto fail_notifiers;
     }
-    if (net->dev.acked_features & (1 << VIRTIO_NET_F_MRG_RXBUF)) {
-        tap_set_vnet_hdr_len(net->nc,
-                             sizeof(struct virtio_net_hdr_mrg_rxbuf));
-    }
 
     r = vhost_dev_start(&net->dev, dev);
     if (r < 0) {
@@ -179,9 +175,6 @@ fail:
     }
     net->nc->info->poll(net->nc, true);
     vhost_dev_stop(&net->dev, dev);
-    if (net->dev.acked_features & (1 << VIRTIO_NET_F_MRG_RXBUF)) {
-        tap_set_vnet_hdr_len(net->nc, sizeof(struct virtio_net_hdr));
-    }
 fail_start:
     vhost_dev_disable_notifiers(&net->dev, dev);
 fail_notifiers:
@@ -199,18 +192,12 @@ void vhost_net_stop(struct vhost_net *net,
     }
     net->nc->info->poll(net->nc, true);
     vhost_dev_stop(&net->dev, dev);
-    if (net->dev.acked_features & (1 << VIRTIO_NET_F_MRG_RXBUF)) {
-        tap_set_vnet_hdr_len(net->nc, sizeof(struct virtio_net_hdr));
-    }
     vhost_dev_disable_notifiers(&net->dev, dev);
 }
 
 void vhost_net_cleanup(struct vhost_net *net)
 {
     vhost_dev_cleanup(&net->dev);
-    if (net->dev.acked_features & (1 << VIRTIO_NET_F_MRG_RXBUF)) {
-        tap_set_vnet_hdr_len(net->nc, sizeof(struct virtio_net_hdr));
-    }
     g_free(net);
 }
 #else
diff --git a/hw/virtio-net.c b/hw/virtio-net.c
index 50ba728c02..108ce07cfc 100644
--- a/hw/virtio-net.c
+++ b/hw/virtio-net.c
@@ -41,6 +41,8 @@ typedef struct VirtIONet
     int32_t tx_burst;
     int tx_waiting;
     uint32_t has_vnet_hdr;
+    size_t host_hdr_len;
+    size_t guest_hdr_len;
     uint8_t has_ufo;
     struct {
         VirtQueueElement elem;
@@ -200,16 +202,19 @@ static void virtio_net_reset(VirtIODevice *vdev)
     memset(n->vlans, 0, MAX_VLAN >> 3);
 }
 
-static int peer_has_vnet_hdr(VirtIONet *n)
+static void peer_test_vnet_hdr(VirtIONet *n)
 {
     if (!n->nic->nc.peer)
-        return 0;
+        return;
 
     if (n->nic->nc.peer->info->type != NET_CLIENT_OPTIONS_KIND_TAP)
-        return 0;
+        return;
 
     n->has_vnet_hdr = tap_has_vnet_hdr(n->nic->nc.peer);
+}
 
+static int peer_has_vnet_hdr(VirtIONet *n)
+{
     return n->has_vnet_hdr;
 }
 
@@ -223,15 +228,27 @@ static int peer_has_ufo(VirtIONet *n)
     return n->has_ufo;
 }
 
+static void virtio_net_set_mrg_rx_bufs(VirtIONet *n, int mergeable_rx_bufs)
+{
+    n->mergeable_rx_bufs = mergeable_rx_bufs;
+
+    n->guest_hdr_len = n->mergeable_rx_bufs ?
+        sizeof(struct virtio_net_hdr_mrg_rxbuf) : sizeof(struct virtio_net_hdr);
+
+    if (peer_has_vnet_hdr(n) &&
+        tap_has_vnet_hdr_len(n->nic->nc.peer, n->guest_hdr_len)) {
+        tap_set_vnet_hdr_len(n->nic->nc.peer, n->guest_hdr_len);
+        n->host_hdr_len = n->guest_hdr_len;
+    }
+}
+
 static uint32_t virtio_net_get_features(VirtIODevice *vdev, uint32_t features)
 {
     VirtIONet *n = to_virtio_net(vdev);
 
     features |= (1 << VIRTIO_NET_F_MAC);
 
-    if (peer_has_vnet_hdr(n)) {
-        tap_using_vnet_hdr(n->nic->nc.peer, 1);
-    } else {
+    if (!peer_has_vnet_hdr(n)) {
         features &= ~(0x1 << VIRTIO_NET_F_CSUM);
         features &= ~(0x1 << VIRTIO_NET_F_HOST_TSO4);
         features &= ~(0x1 << VIRTIO_NET_F_HOST_TSO6);
@@ -277,7 +294,7 @@ static void virtio_net_set_features(VirtIODevice *vdev, uint32_t features)
 {
     VirtIONet *n = to_virtio_net(vdev);
 
-    n->mergeable_rx_bufs = !!(features & (1 << VIRTIO_NET_F_MRG_RXBUF));
+    virtio_net_set_mrg_rx_bufs(n, !!(features & (1 << VIRTIO_NET_F_MRG_RXBUF)));
 
     if (n->has_vnet_hdr) {
         tap_set_offload(n->nic->nc.peer,
@@ -499,41 +516,34 @@ static int virtio_net_has_buffers(VirtIONet *n, int bufsize)
  * cache.
  */
 static void work_around_broken_dhclient(struct virtio_net_hdr *hdr,
-                                        const uint8_t *buf, size_t size)
+                                        uint8_t *buf, size_t size)
 {
     if ((hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && /* missing csum */
         (size > 27 && size < 1500) && /* normal sized MTU */
         (buf[12] == 0x08 && buf[13] == 0x00) && /* ethertype == IPv4 */
         (buf[23] == 17) && /* ip.protocol == UDP */
         (buf[34] == 0 && buf[35] == 67)) { /* udp.srcport == bootps */
-        /* FIXME this cast is evil */
-        net_checksum_calculate((uint8_t *)buf, size);
+        net_checksum_calculate(buf, size);
         hdr->flags &= ~VIRTIO_NET_HDR_F_NEEDS_CSUM;
     }
 }
 
-static int receive_header(VirtIONet *n, struct iovec *iov, int iovcnt,
-                          const void *buf, size_t size, size_t hdr_len)
+static void receive_header(VirtIONet *n, const struct iovec *iov, int iov_cnt,
+                           const void *buf, size_t size)
 {
-    struct virtio_net_hdr *hdr = (struct virtio_net_hdr *)iov[0].iov_base;
-    int offset = 0;
-
-    hdr->flags = 0;
-    hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE;
-
     if (n->has_vnet_hdr) {
-        memcpy(hdr, buf, sizeof(*hdr));
-        offset = sizeof(*hdr);
-        work_around_broken_dhclient(hdr, buf + offset, size - offset);
+        /* FIXME this cast is evil */
+        void *wbuf = (void *)buf;
+        work_around_broken_dhclient(wbuf, wbuf + n->host_hdr_len,
+                                    size - n->host_hdr_len);
+        iov_from_buf(iov, iov_cnt, 0, buf, sizeof(struct virtio_net_hdr));
+    } else {
+        struct virtio_net_hdr hdr = {
+            .flags = 0,
+            .gso_type = VIRTIO_NET_HDR_GSO_NONE
+        };
+        iov_from_buf(iov, iov_cnt, 0, &hdr, sizeof hdr);
     }
-
-    /* We only ever receive a struct virtio_net_hdr from the tapfd,
-     * but we may be passing along a larger header to the guest.
-     */
-    iov[0].iov_base += hdr_len;
-    iov[0].iov_len  -= hdr_len;
-
-    return offset;
 }
 
 static int receive_filter(VirtIONet *n, const uint8_t *buf, int size)
@@ -546,9 +556,7 @@ static int receive_filter(VirtIONet *n, const uint8_t *buf, int size)
     if (n->promisc)
         return 1;
 
-    if (n->has_vnet_hdr) {
-        ptr += sizeof(struct virtio_net_hdr);
-    }
+    ptr += n->host_hdr_len;
 
     if (!memcmp(&ptr[12], vlan, sizeof(vlan))) {
         int vid = be16_to_cpup((uint16_t *)(ptr + 14)) & 0xfff;
@@ -592,19 +600,16 @@ static int receive_filter(VirtIONet *n, const uint8_t *buf, int size)
 static ssize_t virtio_net_receive(NetClientState *nc, const uint8_t *buf, size_t size)
 {
     VirtIONet *n = DO_UPCAST(NICState, nc, nc)->opaque;
-    struct virtio_net_hdr_mrg_rxbuf *mhdr = NULL;
-    size_t guest_hdr_len, offset, i, host_hdr_len;
+    struct iovec mhdr_sg[VIRTQUEUE_MAX_SIZE];
+    struct virtio_net_hdr_mrg_rxbuf mhdr;
+    unsigned mhdr_cnt = 0;
+    size_t offset, i, guest_offset;
 
     if (!virtio_net_can_receive(&n->nic->nc))
         return -1;
 
     /* hdr_len refers to the header we supply to the guest */
-    guest_hdr_len = n->mergeable_rx_bufs ?
-        sizeof(struct virtio_net_hdr_mrg_rxbuf) : sizeof(struct virtio_net_hdr);
-
-
-    host_hdr_len = n->has_vnet_hdr ? sizeof(struct virtio_net_hdr) : 0;
-    if (!virtio_net_has_buffers(n, size + guest_hdr_len - host_hdr_len))
+    if (!virtio_net_has_buffers(n, size + n->guest_hdr_len - n->host_hdr_len))
         return 0;
 
     if (!receive_filter(n, buf, size))
@@ -615,7 +620,7 @@ static ssize_t virtio_net_receive(NetClientState *nc, const uint8_t *buf, size_t
     while (offset < size) {
         VirtQueueElement elem;
         int len, total;
-        struct iovec sg[VIRTQUEUE_MAX_SIZE];
+        const struct iovec *sg = elem.in_sg;
 
         total = 0;
 
@@ -626,7 +631,7 @@ static ssize_t virtio_net_receive(NetClientState *nc, const uint8_t *buf, size_t
                     "i %zd mergeable %d offset %zd, size %zd, "
                     "guest hdr len %zd, host hdr len %zd guest features 0x%x",
                     i, n->mergeable_rx_bufs, offset, size,
-                    guest_hdr_len, host_hdr_len, n->vdev.guest_features);
+                    n->guest_hdr_len, n->host_hdr_len, n->vdev.guest_features);
             exit(1);
         }
 
@@ -635,24 +640,25 @@ static ssize_t virtio_net_receive(NetClientState *nc, const uint8_t *buf, size_t
             exit(1);
         }
 
-        if (!n->mergeable_rx_bufs && elem.in_sg[0].iov_len != guest_hdr_len) {
-            error_report("virtio-net header not in first element");
-            exit(1);
-        }
-
-        memcpy(&sg, &elem.in_sg[0], sizeof(sg[0]) * elem.in_num);
-
         if (i == 0) {
-            if (n->mergeable_rx_bufs)
-                mhdr = (struct virtio_net_hdr_mrg_rxbuf *)sg[0].iov_base;
+            assert(offset == 0);
+            if (n->mergeable_rx_bufs) {
+                mhdr_cnt = iov_copy(mhdr_sg, ARRAY_SIZE(mhdr_sg),
+                                    sg, elem.in_num,
+                                    offsetof(typeof(mhdr), num_buffers),
+                                    sizeof(mhdr.num_buffers));
+            }
 
-            offset += receive_header(n, sg, elem.in_num,
-                                     buf + offset, size - offset, guest_hdr_len);
-            total += guest_hdr_len;
+            receive_header(n, sg, elem.in_num, buf, size);
+            offset = n->host_hdr_len;
+            total += n->guest_hdr_len;
+            guest_offset = n->guest_hdr_len;
+        } else {
+            guest_offset = 0;
         }
 
         /* copy in packet.  ugh */
-        len = iov_from_buf(sg, elem.in_num, 0,
+        len = iov_from_buf(sg, elem.in_num, guest_offset,
                            buf + offset, size - offset);
         total += len;
         offset += len;
@@ -665,7 +671,7 @@ static ssize_t virtio_net_receive(NetClientState *nc, const uint8_t *buf, size_t
                          "i %zd mergeable %d offset %zd, size %zd, "
                          "guest hdr len %zd, host hdr len %zd",
                          i, n->mergeable_rx_bufs,
-                         offset, size, guest_hdr_len, host_hdr_len);
+                         offset, size, n->guest_hdr_len, n->host_hdr_len);
 #endif
             return size;
         }
@@ -674,8 +680,11 @@ static ssize_t virtio_net_receive(NetClientState *nc, const uint8_t *buf, size_t
         virtqueue_fill(n->rx_vq, &elem, total, i++);
     }
 
-    if (mhdr) {
-        stw_p(&mhdr->num_buffers, i);
+    if (mhdr_cnt) {
+        stw_p(&mhdr.num_buffers, i);
+        iov_from_buf(mhdr_sg, mhdr_cnt,
+                     0,
+                     &mhdr.num_buffers, sizeof mhdr.num_buffers);
     }
 
     virtqueue_flush(n->rx_vq, i);
@@ -716,33 +725,35 @@ static int32_t virtio_net_flush_tx(VirtIONet *n, VirtQueue *vq)
     }
 
     while (virtqueue_pop(vq, &elem)) {
-        ssize_t ret, len = 0;
+        ssize_t ret, len;
         unsigned int out_num = elem.out_num;
         struct iovec *out_sg = &elem.out_sg[0];
-        unsigned hdr_len;
-
-        /* hdr_len refers to the header received from the guest */
-        hdr_len = n->mergeable_rx_bufs ?
-            sizeof(struct virtio_net_hdr_mrg_rxbuf) :
-            sizeof(struct virtio_net_hdr);
+        struct iovec sg[VIRTQUEUE_MAX_SIZE];
 
-        if (out_num < 1 || out_sg->iov_len != hdr_len) {
+        if (out_num < 1) {
             error_report("virtio-net header not in first element");
             exit(1);
         }
 
-        /* ignore the header if GSO is not supported */
-        if (!n->has_vnet_hdr) {
-            out_num--;
-            out_sg++;
-            len += hdr_len;
-        } else if (n->mergeable_rx_bufs) {
-            /* tapfd expects a struct virtio_net_hdr */
-            hdr_len -= sizeof(struct virtio_net_hdr);
-            out_sg->iov_len -= hdr_len;
-            len += hdr_len;
+        /*
+         * If host wants to see the guest header as is, we can
+         * pass it on unchanged. Otherwise, copy just the parts
+         * that host is interested in.
+         */
+        assert(n->host_hdr_len <= n->guest_hdr_len);
+        if (n->host_hdr_len != n->guest_hdr_len) {
+            unsigned sg_num = iov_copy(sg, ARRAY_SIZE(sg),
+                                       out_sg, out_num,
+                                       0, n->host_hdr_len);
+            sg_num += iov_copy(sg + sg_num, ARRAY_SIZE(sg) - sg_num,
+                             out_sg, out_num,
+                             n->guest_hdr_len, -1);
+            out_num = sg_num;
+            out_sg = sg;
         }
 
+        len = n->guest_hdr_len;
+
         ret = qemu_sendv_packet_async(&n->nic->nc, out_sg, out_num,
                                       virtio_net_tx_complete);
         if (ret == 0) {
@@ -899,7 +910,8 @@ static int virtio_net_load(QEMUFile *f, void *opaque, int version_id)
 
     qemu_get_buffer(f, n->mac, ETH_ALEN);
     n->tx_waiting = qemu_get_be32(f);
-    n->mergeable_rx_bufs = qemu_get_be32(f);
+
+    virtio_net_set_mrg_rx_bufs(n, qemu_get_be32(f));
 
     if (version_id >= 3)
         n->status = qemu_get_be16(f);
@@ -939,7 +951,6 @@ static int virtio_net_load(QEMUFile *f, void *opaque, int version_id)
         }
 
         if (n->has_vnet_hdr) {
-            tap_using_vnet_hdr(n->nic->nc.peer, 1);
             tap_set_offload(n->nic->nc.peer,
                     (n->vdev.guest_features >> VIRTIO_NET_F_GUEST_CSUM) & 1,
                     (n->vdev.guest_features >> VIRTIO_NET_F_GUEST_TSO4) & 1,
@@ -1038,12 +1049,19 @@ VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf,
     n->status = VIRTIO_NET_S_LINK_UP;
 
     n->nic = qemu_new_nic(&net_virtio_info, conf, object_get_typename(OBJECT(dev)), dev->id, n);
+    peer_test_vnet_hdr(n);
+    if (peer_has_vnet_hdr(n)) {
+        tap_using_vnet_hdr(n->nic->nc.peer, 1);
+        n->host_hdr_len = sizeof(struct virtio_net_hdr);
+    } else {
+        n->host_hdr_len = 0;
+    }
 
     qemu_format_nic_info_str(&n->nic->nc, conf->macaddr.a);
 
     n->tx_waiting = 0;
     n->tx_burst = net->txburst;
-    n->mergeable_rx_bufs = 0;
+    virtio_net_set_mrg_rx_bufs(n, 0);
     n->promisc = 1; /* for compatibility */
 
     n->mac_table.macs = g_malloc0(MAC_TABLE_ENTRIES * ETH_ALEN);
diff --git a/iov.c b/iov.c
index c6a66f0afe..b7378bf7ce 100644
--- a/iov.c
+++ b/iov.c
@@ -228,3 +228,26 @@ void iov_hexdump(const struct iovec *iov, const unsigned int iov_cnt,
         fprintf(fp, "\n");
     }
 }
+
+unsigned iov_copy(struct iovec *dst_iov, unsigned int dst_iov_cnt,
+                 const struct iovec *iov, unsigned int iov_cnt,
+                 size_t offset, size_t bytes)
+{
+    size_t len;
+    unsigned int i, j;
+    for (i = 0, j = 0; i < iov_cnt && j < dst_iov_cnt && bytes; i++) {
+        if (offset >= iov[i].iov_len) {
+            offset -= iov[i].iov_len;
+            continue;
+        }
+        len = MIN(bytes, iov[i].iov_len - offset);
+
+        dst_iov[j].iov_base = iov[i].iov_base + offset;
+        dst_iov[j].iov_len = len;
+        j++;
+        bytes -= len;
+        offset = 0;
+    }
+    assert(offset == 0);
+    return j;
+}
diff --git a/iov.h b/iov.h
index a73569f94e..34c8ec9faa 100644
--- a/iov.h
+++ b/iov.h
@@ -86,3 +86,12 @@ ssize_t iov_send_recv(int sockfd, struct iovec *iov, unsigned iov_cnt,
  */
 void iov_hexdump(const struct iovec *iov, const unsigned int iov_cnt,
                  FILE *fp, const char *prefix, size_t limit);
+
+/*
+ * Partial copy of vector from iov to dst_iov (data is not copied).
+ * dst_iov overlaps iov at a specified offset.
+ * size of dst_iov is at most bytes. dst vector count is returned.
+ */
+unsigned iov_copy(struct iovec *dst_iov, unsigned int dst_iov_cnt,
+                 const struct iovec *iov, unsigned int iov_cnt,
+                 size_t offset, size_t bytes);
diff --git a/target-i386/cpu.c b/target-i386/cpu.c
index 6411042b79..d4f2e65cd9 100644
--- a/target-i386/cpu.c
+++ b/target-i386/cpu.c
@@ -125,6 +125,25 @@ typedef struct model_features_t {
 int check_cpuid = 0;
 int enforce_cpuid = 0;
 
+#if defined(CONFIG_KVM)
+static uint32_t kvm_default_features = (1 << KVM_FEATURE_CLOCKSOURCE) |
+        (1 << KVM_FEATURE_NOP_IO_DELAY) |
+        (1 << KVM_FEATURE_MMU_OP) |
+        (1 << KVM_FEATURE_CLOCKSOURCE2) |
+        (1 << KVM_FEATURE_ASYNC_PF) |
+        (1 << KVM_FEATURE_STEAL_TIME) |
+        (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
+static const uint32_t kvm_pv_eoi_features = (0x1 << KVM_FEATURE_PV_EOI);
+#else
+static uint32_t kvm_default_features = 0;
+static const uint32_t kvm_pv_eoi_features = 0;
+#endif
+
+void enable_kvm_pv_eoi(void)
+{
+    kvm_default_features |= kvm_pv_eoi_features;
+}
+
 void host_cpuid(uint32_t function, uint32_t count,
                 uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
 {
@@ -1108,7 +1127,7 @@ static int cpu_x86_find_by_name(x86_def_t *x86_cpu_def, const char *cpu_model)
     /* Features to be added*/
     uint32_t plus_features = 0, plus_ext_features = 0;
     uint32_t plus_ext2_features = 0, plus_ext3_features = 0;
-    uint32_t plus_kvm_features = 0, plus_svm_features = 0;
+    uint32_t plus_kvm_features = kvm_default_features, plus_svm_features = 0;
     uint32_t plus_7_0_ebx_features = 0;
     /* Features to be removed */
     uint32_t minus_features = 0, minus_ext_features = 0;
@@ -1128,18 +1147,6 @@ static int cpu_x86_find_by_name(x86_def_t *x86_cpu_def, const char *cpu_model)
         memcpy(x86_cpu_def, def, sizeof(*def));
     }
 
-#if defined(CONFIG_KVM)
-    plus_kvm_features = (1 << KVM_FEATURE_CLOCKSOURCE) |
-        (1 << KVM_FEATURE_NOP_IO_DELAY) | 
-        (1 << KVM_FEATURE_MMU_OP) |
-        (1 << KVM_FEATURE_CLOCKSOURCE2) |
-        (1 << KVM_FEATURE_ASYNC_PF) | 
-        (1 << KVM_FEATURE_STEAL_TIME) |
-        (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
-#else
-    plus_kvm_features = 0;
-#endif
-
     add_flagname_to_bitmaps("hypervisor", &plus_features,
             &plus_ext_features, &plus_ext2_features, &plus_ext3_features,
             &plus_kvm_features, &plus_svm_features,  &plus_7_0_ebx_features);
diff --git a/target-i386/cpu.h b/target-i386/cpu.h
index 871c270116..de33303dea 100644
--- a/target-i386/cpu.h
+++ b/target-i386/cpu.h
@@ -1188,4 +1188,6 @@ void do_smm_enter(CPUX86State *env1);
 
 void cpu_report_tpr_access(CPUX86State *env, TPRAccess access);
 
+void enable_kvm_pv_eoi(void);
+
 #endif /* CPU_I386_H */
author	Anthony Liguori <aliguori@us.ibm.com>	2012-10-29 14:31:47 -0500
committer	Anthony Liguori <aliguori@us.ibm.com>	2012-10-29 14:31:47 -0500
commit	233926fafa6c4a0fb666e1469524d66dd3b47ddd (patch)
tree	2dbe1506ee13fbc7d2d56b317f596114d0f849bb
parent	b308c82cbda44e138ef990af64d44a5613c16092 (diff)
parent	523a59f596a3e62f5a28eb171adba35e71310040 (diff)