Merge remote branch 'mst/for_anthony' into staging

author: Anthony Liguori <aliguori@us.ibm.com> 2011-01-17 09:49:38 -0600
committer: Anthony Liguori <aliguori@us.ibm.com> 2011-01-17 09:49:38 -0600
commit: 9e8a69cfd6f0fe2585528fc7a85110fc25c05d0b (patch)
tree: 6025ebe5f73460a803577e62c894a0e985f91fd2
parent: 51f9b84e759c692575542627dd8d39ae216ac521 (diff)
parent: b36e391441906c36ed0856b69de84001860402bf (diff)
23 files changed, 359 insertions, 55 deletions
diff --git a/docs/qdev-device-use.txt b/docs/qdev-device-use.txt
index f252c8e3bc..f2f9b757a5 100644
--- a/docs/qdev-device-use.txt
+++ b/docs/qdev-device-use.txt
@@ -97,10 +97,13 @@ The -device argument differs in detail for each kind of drive:
 
 * if=virtio
 
-  -device virtio-blk-pci,drive=DRIVE-ID,class=C,vectors=V
+  -device virtio-blk-pci,drive=DRIVE-ID,class=C,vectors=V,ioeventfd=IOEVENTFD
 
   This lets you control PCI device class and MSI-X vectors.
 
+  IOEVENTFD controls whether or not ioeventfd is used for virtqueue notify.  It
+  can be set to on (default) or off.
+
   As for all PCI devices, you can add bus=PCI-BUS,addr=DEVFN to
   control the PCI device address.
 
@@ -240,6 +243,9 @@ For PCI devices, you can add bus=PCI-BUS,addr=DEVFN to control the PCI
 device address, as usual.  The old -net nic provides parameter addr
 for that, it is silently ignored when the NIC is not a PCI device.
 
+For virtio-net-pci, you can control whether or not ioeventfd is used for
+virtqueue notify by setting ioeventfd= to on or off (default).
+
 -net nic accepts vectors=V for all models, but it's silently ignored
 except for virtio-net-pci (model=virtio).  With -device, only devices
 that support it accept it.
diff --git a/hw/acpi_piix4.c b/hw/acpi_piix4.c
index 173d78148d..273097d480 100644
--- a/hw/acpi_piix4.c
+++ b/hw/acpi_piix4.c
@@ -428,6 +428,8 @@ static PCIDeviceInfo piix4_pm_info = {
     .qdev.desc          = "PM",
     .qdev.size          = sizeof(PIIX4PMState),
     .qdev.vmsd          = &vmstate_acpi,
+    .qdev.no_user       = 1,
+    .no_hotplug         = 1,
     .init               = piix4_pm_initfn,
     .config_write       = pm_write_config,
     .qdev.props         = (Property[]) {
diff --git a/hw/cirrus_vga.c b/hw/cirrus_vga.c
index 75d1cc6f57..5f45b5dee7 100644
--- a/hw/cirrus_vga.c
+++ b/hw/cirrus_vga.c
@@ -3140,6 +3140,7 @@ static PCIDeviceInfo cirrus_vga_info = {
     .qdev.desc    = "Cirrus CLGD 54xx VGA",
     .qdev.size    = sizeof(PCICirrusVGAState),
     .qdev.vmsd    = &vmstate_pci_cirrus_vga,
+    .no_hotplug   = 1,
     .init         = pci_cirrus_vga_initfn,
     .romfile      = VGABIOS_CIRRUS_FILENAME,
     .config_write = pci_cirrus_write_config,
diff --git a/hw/ide/piix.c b/hw/ide/piix.c
index 1cad9066a0..d4289af9c4 100644
--- a/hw/ide/piix.c
+++ b/hw/ide/piix.c
@@ -194,11 +194,13 @@ static PCIDeviceInfo piix_ide_info[] = {
         .qdev.name    = "piix3-ide",
         .qdev.size    = sizeof(PCIIDEState),
         .qdev.no_user = 1,
+        .no_hotplug   = 1,
         .init         = pci_piix3_ide_initfn,
     },{
         .qdev.name    = "piix4-ide",
         .qdev.size    = sizeof(PCIIDEState),
         .qdev.no_user = 1,
+        .no_hotplug   = 1,
         .init         = pci_piix4_ide_initfn,
     },{
         /* end of list */
diff --git a/hw/pci.c b/hw/pci.c
index d0b51b80bd..8d0e3df2e5 100644
--- a/hw/pci.c
+++ b/hw/pci.c
@@ -1624,6 +1624,11 @@ static int pci_qdev_init(DeviceState *qdev, DeviceInfo *base)
                                      info->is_bridge);
     if (pci_dev == NULL)
         return -1;
+    if (qdev->hotplugged && info->no_hotplug) {
+        qerror_report(QERR_DEVICE_NO_HOTPLUG, info->qdev.name);
+        do_pci_unregister_device(pci_dev);
+        return -1;
+    }
     rc = info->init(pci_dev);
     if (rc != 0) {
         do_pci_unregister_device(pci_dev);
@@ -1656,7 +1661,12 @@ static int pci_qdev_init(DeviceState *qdev, DeviceInfo *base)
 static int pci_unplug_device(DeviceState *qdev)
 {
     PCIDevice *dev = DO_UPCAST(PCIDevice, qdev, qdev);
+    PCIDeviceInfo *info = container_of(qdev->info, PCIDeviceInfo, qdev);
 
+    if (info->no_hotplug) {
+        qerror_report(QERR_DEVICE_NO_HOTPLUG, info->qdev.name);
+        return -1;
+    }
     return dev->bus->hotplug(dev->bus->hotplug_qdev, dev,
                              PCI_HOTPLUG_DISABLED);
 }
diff --git a/hw/pci.h b/hw/pci.h
index 052960e3ea..bc8d5bb3c7 100644
--- a/hw/pci.h
+++ b/hw/pci.h
@@ -436,6 +436,9 @@ typedef struct {
     /* pcie stuff */
     int is_express;   /* is this device pci express? */
 
+    /* device isn't hot-pluggable */
+    int no_hotplug;
+
     /* rom bar */
     const char *romfile;
 } PCIDeviceInfo;
diff --git a/hw/piix4.c b/hw/piix4.c
index 5489386d68..72073cd0a0 100644
--- a/hw/piix4.c
+++ b/hw/piix4.c
@@ -113,6 +113,7 @@ static PCIDeviceInfo piix4_info[] = {
         .qdev.desc    = "ISA bridge",
         .qdev.size    = sizeof(PCIDevice),
         .qdev.no_user = 1,
+        .no_hotplug   = 1,
         .init         = piix4_initfn,
     },{
         /* end of list */
diff --git a/hw/piix_pci.c b/hw/piix_pci.c
index 38f9d9eea4..358da58a80 100644
--- a/hw/piix_pci.c
+++ b/hw/piix_pci.c
@@ -348,6 +348,7 @@ static PCIDeviceInfo i440fx_info[] = {
         .qdev.size    = sizeof(PCII440FXState),
         .qdev.vmsd    = &vmstate_i440fx,
         .qdev.no_user = 1,
+        .no_hotplug   = 1,
         .init         = i440fx_initfn,
         .config_write = i440fx_write_config,
     },{
@@ -356,6 +357,7 @@ static PCIDeviceInfo i440fx_info[] = {
         .qdev.size    = sizeof(PIIX3State),
         .qdev.vmsd    = &vmstate_piix3,
         .qdev.no_user = 1,
+        .no_hotplug   = 1,
         .init         = piix3_initfn,
     },{
         /* end of list */
diff --git a/hw/qdev.c b/hw/qdev.c
index 31eb464f23..5b8d3742ec 100644
--- a/hw/qdev.c
+++ b/hw/qdev.c
@@ -32,6 +32,8 @@
 #include "blockdev.h"
 
 static int qdev_hotplug = 0;
+static bool qdev_hot_added = false;
+static bool qdev_hot_removed = false;
 
 /* This is a nasty hack to allow passing a NULL bus to qdev_create.  */
 static BusState *main_system_bus;
@@ -93,6 +95,7 @@ static DeviceState *qdev_create_from_info(BusState *bus, DeviceInfo *info)
     if (qdev_hotplug) {
         assert(bus->allow_hotplug);
         dev->hotplugged = 1;
+        qdev_hot_added = true;
     }
     dev->instance_id_alias = -1;
     dev->state = DEV_STATE_CREATED;
@@ -294,6 +297,8 @@ int qdev_unplug(DeviceState *dev)
     }
     assert(dev->info->unplug != NULL);
 
+    qdev_hot_removed = true;
+
     return dev->info->unplug(dev);
 }
 
@@ -395,6 +400,11 @@ void qdev_machine_creation_done(void)
     qdev_hotplug = 1;
 }
 
+bool qdev_machine_modified(void)
+{
+    return qdev_hot_added || qdev_hot_removed;
+}
+
 /* Get a character (serial) device interface.  */
 CharDriverState *qdev_init_chardev(DeviceState *dev)
 {
diff --git a/hw/qdev.h b/hw/qdev.h
index 2be775f9e8..e520aaa786 100644
--- a/hw/qdev.h
+++ b/hw/qdev.h
@@ -132,6 +132,7 @@ int qdev_unplug(DeviceState *dev);
 void qdev_free(DeviceState *dev);
 int qdev_simple_unplug_cb(DeviceState *dev);
 void qdev_machine_creation_done(void);
+bool qdev_machine_modified(void);
 
 qemu_irq qdev_get_gpio_in(DeviceState *dev, int n);
 void qdev_connect_gpio_out(DeviceState *dev, int n, qemu_irq pin);
diff --git a/hw/qxl.c b/hw/qxl.c
index 207aa63f90..bd71e5810f 100644
--- a/hw/qxl.c
+++ b/hw/qxl.c
@@ -1546,6 +1546,7 @@ static PCIDeviceInfo qxl_info_primary = {
     .qdev.size    = sizeof(PCIQXLDevice),
     .qdev.reset   = qxl_reset_handler,
     .qdev.vmsd    = &qxl_vmstate,
+    .no_hotplug   = 1,
     .init         = qxl_init_primary,
     .config_write = qxl_write_config,
     .romfile      = "vgabios-qxl.bin",
diff --git a/hw/rtl8139.c b/hw/rtl8139.c
index a8aed89074..a22530cf89 100644
--- a/hw/rtl8139.c
+++ b/hw/rtl8139.c
@@ -495,6 +495,8 @@ typedef struct RTL8139State {
     QEMUTimer *timer;
     int64_t TimerExpire;
 
+    /* Support migration to/from old versions */
+    int rtl8139_mmio_io_addr_dummy;
 } RTL8139State;
 
 static void rtl8139_set_next_tctr_time(RTL8139State *s, int64_t current_time);
@@ -3162,6 +3164,21 @@ static int rtl8139_post_load(void *opaque, int version_id)
     return 0;
 }
 
+static bool rtl8139_hotplug_ready_needed(void *opaque)
+{
+    return qdev_machine_modified();
+}
+
+static const VMStateDescription vmstate_rtl8139_hotplug_ready ={
+    .name = "rtl8139/hotplug_ready",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .minimum_version_id_old = 1,
+    .fields      = (VMStateField []) {
+        VMSTATE_END_OF_LIST()
+    }
+};
+
 static void rtl8139_pre_save(void *opaque)
 {
     RTL8139State* s = opaque;
@@ -3171,6 +3188,7 @@ static void rtl8139_pre_save(void *opaque)
     rtl8139_set_next_tctr_time(s, current_time);
     s->TCTR = muldiv64(current_time - s->TCTR_base, PCI_FREQUENCY,
                        get_ticks_per_sec());
+    s->rtl8139_mmio_io_addr_dummy = s->rtl8139_mmio_io_addr;
 }
 
 static const VMStateDescription vmstate_rtl8139 = {
@@ -3223,7 +3241,7 @@ static const VMStateDescription vmstate_rtl8139 = {
 
         VMSTATE_UNUSED(4),
         VMSTATE_MACADDR(conf.macaddr, RTL8139State),
-        VMSTATE_INT32(rtl8139_mmio_io_addr, RTL8139State),
+        VMSTATE_INT32(rtl8139_mmio_io_addr_dummy, RTL8139State),
 
         VMSTATE_UINT32(currTxDesc, RTL8139State),
         VMSTATE_UINT32(currCPlusRxDesc, RTL8139State),
@@ -3252,6 +3270,14 @@ static const VMStateDescription vmstate_rtl8139 = {
 
         VMSTATE_UINT32_V(cplus_enabled, RTL8139State, 4),
         VMSTATE_END_OF_LIST()
+    },
+    .subsections = (VMStateSubsection []) {
+        {
+            .vmsd = &vmstate_rtl8139_hotplug_ready,
+            .needed = rtl8139_hotplug_ready_needed,
+        }, {
+            /* empty */
+        }
     }
 };
 
diff --git a/hw/vga-pci.c b/hw/vga-pci.c
index 791ca22763..ce9ec45777 100644
--- a/hw/vga-pci.c
+++ b/hw/vga-pci.c
@@ -110,6 +110,7 @@ static PCIDeviceInfo vga_info = {
     .qdev.name    = "VGA",
     .qdev.size    = sizeof(PCIVGAState),
     .qdev.vmsd    = &vmstate_vga_pci,
+    .no_hotplug   = 1,
     .init         = pci_vga_initfn,
     .config_write = pci_vga_write_config,
     .romfile      = "vgabios-stdvga.bin",
diff --git a/hw/virtio-net.c b/hw/virtio-net.c
index ec1bf8dda7..ccb3e632a4 100644
--- a/hw/virtio-net.c
+++ b/hw/virtio-net.c
@@ -54,8 +54,6 @@ typedef struct VirtIONet
     uint8_t nouni;
     uint8_t nobcast;
     uint8_t vhost_started;
-    bool vm_running;
-    VMChangeStateEntry *vmstate;
     struct {
         int in_use;
         int first_multi;
@@ -102,7 +100,7 @@ static void virtio_net_set_config(VirtIODevice *vdev, const uint8_t *config)
 static bool virtio_net_started(VirtIONet *n, uint8_t status)
 {
     return (status & VIRTIO_CONFIG_S_DRIVER_OK) &&
-        (n->status & VIRTIO_NET_S_LINK_UP) && n->vm_running;
+        (n->status & VIRTIO_NET_S_LINK_UP) && n->vdev.vm_running;
 }
 
 static void virtio_net_vhost_status(VirtIONet *n, uint8_t status)
@@ -453,7 +451,7 @@ static void virtio_net_handle_rx(VirtIODevice *vdev, VirtQueue *vq)
 static int virtio_net_can_receive(VLANClientState *nc)
 {
     VirtIONet *n = DO_UPCAST(NICState, nc, nc)->opaque;
-    if (!n->vm_running) {
+    if (!n->vdev.vm_running) {
         return 0;
     }
 
@@ -708,7 +706,7 @@ static int32_t virtio_net_flush_tx(VirtIONet *n, VirtQueue *vq)
         return num_packets;
     }
 
-    assert(n->vm_running);
+    assert(n->vdev.vm_running);
 
     if (n->async_tx.elem.out_num) {
         virtio_queue_set_notification(n->tx_vq, 0);
@@ -769,7 +767,7 @@ static void virtio_net_handle_tx_timer(VirtIODevice *vdev, VirtQueue *vq)
     VirtIONet *n = to_virtio_net(vdev);
 
     /* This happens when device was stopped but VCPU wasn't. */
-    if (!n->vm_running) {
+    if (!n->vdev.vm_running) {
         n->tx_waiting = 1;
         return;
     }
@@ -796,7 +794,7 @@ static void virtio_net_handle_tx_bh(VirtIODevice *vdev, VirtQueue *vq)
     }
     n->tx_waiting = 1;
     /* This happens when device was stopped but VCPU wasn't. */
-    if (!n->vm_running) {
+    if (!n->vdev.vm_running) {
         return;
     }
     virtio_queue_set_notification(vq, 0);
@@ -806,7 +804,7 @@ static void virtio_net_handle_tx_bh(VirtIODevice *vdev, VirtQueue *vq)
 static void virtio_net_tx_timer(void *opaque)
 {
     VirtIONet *n = opaque;
-    assert(n->vm_running);
+    assert(n->vdev.vm_running);
 
     n->tx_waiting = 0;
 
@@ -823,7 +821,7 @@ static void virtio_net_tx_bh(void *opaque)
     VirtIONet *n = opaque;
     int32_t ret;
 
-    assert(n->vm_running);
+    assert(n->vdev.vm_running);
 
     n->tx_waiting = 0;
 
@@ -988,16 +986,6 @@ static NetClientInfo net_virtio_info = {
     .link_status_changed = virtio_net_set_link_status,
 };
 
-static void virtio_net_vmstate_change(void *opaque, int running, int reason)
-{
-    VirtIONet *n = opaque;
-    n->vm_running = running;
-    /* This is called when vm is started/stopped,
-     * it will start/stop vhost backend if appropriate
-     * e.g. after migration. */
-    virtio_net_set_status(&n->vdev, n->vdev.status);
-}
-
 VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf,
                               virtio_net_conf *net)
 {
@@ -1052,7 +1040,6 @@ VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf,
     n->qdev = dev;
     register_savevm(dev, "virtio-net", -1, VIRTIO_NET_VM_VERSION,
                     virtio_net_save, virtio_net_load, n);
-    n->vmstate = qemu_add_vm_change_state_handler(virtio_net_vmstate_change, n);
 
     add_boot_device_path(conf->bootindex, dev, "/ethernet-phy@0");
 
@@ -1062,7 +1049,6 @@ VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf,
 void virtio_net_exit(VirtIODevice *vdev)
 {
     VirtIONet *n = DO_UPCAST(VirtIONet, vdev, vdev);
-    qemu_del_vm_change_state_handler(n->vmstate);
 
     /* This will stop vhost backend if appropriate. */
     virtio_net_set_status(vdev, 0);
diff --git a/hw/virtio-pci.c b/hw/virtio-pci.c
index 6186142b2b..d07ff976be 100644
--- a/hw/virtio-pci.c
+++ b/hw/virtio-pci.c
@@ -80,9 +80,13 @@
  * 12 is historical, and due to x86 page size. */
 #define VIRTIO_PCI_QUEUE_ADDR_SHIFT    12
 
-/* We can catch some guest bugs inside here so we continue supporting older
-   guests. */
-#define VIRTIO_PCI_BUG_BUS_MASTER	(1 << 0)
+/* Flags track per-device state like workarounds for quirks in older guests. */
+#define VIRTIO_PCI_FLAG_BUS_MASTER_BUG  (1 << 0)
+
+/* Performance improves when virtqueue kick processing is decoupled from the
+ * vcpu thread using ioeventfd for some devices. */
+#define VIRTIO_PCI_FLAG_USE_IOEVENTFD_BIT 1
+#define VIRTIO_PCI_FLAG_USE_IOEVENTFD   (1 << VIRTIO_PCI_FLAG_USE_IOEVENTFD_BIT)
 
 /* QEMU doesn't strictly need write barriers since everything runs in
  * lock-step.  We'll leave the calls to wmb() in though to make it obvious for
@@ -95,7 +99,7 @@
 typedef struct {
     PCIDevice pci_dev;
     VirtIODevice *vdev;
-    uint32_t bugs;
+    uint32_t flags;
     uint32_t addr;
     uint32_t class_code;
     uint32_t nvectors;
@@ -108,6 +112,8 @@ typedef struct {
     /* Max. number of ports we can have for a the virtio-serial device */
     uint32_t max_virtserial_ports;
     virtio_net_conf net;
+    bool ioeventfd_disabled;
+    bool ioeventfd_started;
 } VirtIOPCIProxy;
 
 /* virtio device */
@@ -159,7 +165,7 @@ static int virtio_pci_load_config(void * opaque, QEMUFile *f)
        in ready state. Then we have a buggy guest OS. */
     if ((proxy->vdev->status & VIRTIO_CONFIG_S_DRIVER_OK) &&
         !(proxy->pci_dev.config[PCI_COMMAND] & PCI_COMMAND_MASTER)) {
-        proxy->bugs |= VIRTIO_PCI_BUG_BUS_MASTER;
+        proxy->flags |= VIRTIO_PCI_FLAG_BUS_MASTER_BUG;
     }
     return 0;
 }
@@ -180,12 +186,139 @@ static int virtio_pci_load_queue(void * opaque, int n, QEMUFile *f)
     return 0;
 }
 
+static int virtio_pci_set_host_notifier_internal(VirtIOPCIProxy *proxy,
+                                                 int n, bool assign)
+{
+    VirtQueue *vq = virtio_get_queue(proxy->vdev, n);
+    EventNotifier *notifier = virtio_queue_get_host_notifier(vq);
+    int r;
+    if (assign) {
+        r = event_notifier_init(notifier, 1);
+        if (r < 0) {
+            error_report("%s: unable to init event notifier: %d",
+                         __func__, r);
+            return r;
+        }
+        r = kvm_set_ioeventfd_pio_word(event_notifier_get_fd(notifier),
+                                       proxy->addr + VIRTIO_PCI_QUEUE_NOTIFY,
+                                       n, assign);
+        if (r < 0) {
+            error_report("%s: unable to map ioeventfd: %d",
+                         __func__, r);
+            event_notifier_cleanup(notifier);
+        }
+    } else {
+        r = kvm_set_ioeventfd_pio_word(event_notifier_get_fd(notifier),
+                                       proxy->addr + VIRTIO_PCI_QUEUE_NOTIFY,
+                                       n, assign);
+        if (r < 0) {
+            error_report("%s: unable to unmap ioeventfd: %d",
+                         __func__, r);
+            return r;
+        }
+
+        /* Handle the race condition where the guest kicked and we deassigned
+         * before we got around to handling the kick.
+         */
+        if (event_notifier_test_and_clear(notifier)) {
+            virtio_queue_notify_vq(vq);
+        }
+
+        event_notifier_cleanup(notifier);
+    }
+    return r;
+}
+
+static void virtio_pci_host_notifier_read(void *opaque)
+{
+    VirtQueue *vq = opaque;
+    EventNotifier *n = virtio_queue_get_host_notifier(vq);
+    if (event_notifier_test_and_clear(n)) {
+        virtio_queue_notify_vq(vq);
+    }
+}
+
+static void virtio_pci_set_host_notifier_fd_handler(VirtIOPCIProxy *proxy,
+                                                    int n, bool assign)
+{
+    VirtQueue *vq = virtio_get_queue(proxy->vdev, n);
+    EventNotifier *notifier = virtio_queue_get_host_notifier(vq);
+    if (assign) {
+        qemu_set_fd_handler(event_notifier_get_fd(notifier),
+                            virtio_pci_host_notifier_read, NULL, vq);
+    } else {
+        qemu_set_fd_handler(event_notifier_get_fd(notifier),
+                            NULL, NULL, NULL);
+    }
+}
+
+static void virtio_pci_start_ioeventfd(VirtIOPCIProxy *proxy)
+{
+    int n, r;
+
+    if (!(proxy->flags & VIRTIO_PCI_FLAG_USE_IOEVENTFD) ||
+        proxy->ioeventfd_disabled ||
+        proxy->ioeventfd_started) {
+        return;
+    }
+
+    for (n = 0; n < VIRTIO_PCI_QUEUE_MAX; n++) {
+        if (!virtio_queue_get_num(proxy->vdev, n)) {
+            continue;
+        }
+
+        r = virtio_pci_set_host_notifier_internal(proxy, n, true);
+        if (r < 0) {
+            goto assign_error;
+        }
+
+        virtio_pci_set_host_notifier_fd_handler(proxy, n, true);
+    }
+    proxy->ioeventfd_started = true;
+    return;
+
+assign_error:
+    while (--n >= 0) {
+        if (!virtio_queue_get_num(proxy->vdev, n)) {
+            continue;
+        }
+
+        virtio_pci_set_host_notifier_fd_handler(proxy, n, false);
+        r = virtio_pci_set_host_notifier_internal(proxy, n, false);
+        assert(r >= 0);
+    }
+    proxy->ioeventfd_started = false;
+    error_report("%s: failed. Fallback to a userspace (slower).", __func__);
+}
+
+static void virtio_pci_stop_ioeventfd(VirtIOPCIProxy *proxy)
+{
+    int r;
+    int n;
+
+    if (!proxy->ioeventfd_started) {
+        return;
+    }
+
+    for (n = 0; n < VIRTIO_PCI_QUEUE_MAX; n++) {
+        if (!virtio_queue_get_num(proxy->vdev, n)) {
+            continue;
+        }
+
+        virtio_pci_set_host_notifier_fd_handler(proxy, n, false);
+        r = virtio_pci_set_host_notifier_internal(proxy, n, false);
+        assert(r >= 0);
+    }
+    proxy->ioeventfd_started = false;
+}
+
 static void virtio_pci_reset(DeviceState *d)
 {
     VirtIOPCIProxy *proxy = container_of(d, VirtIOPCIProxy, pci_dev.qdev);
+    virtio_pci_stop_ioeventfd(proxy);
     virtio_reset(proxy->vdev);
     msix_reset(&proxy->pci_dev);
-    proxy->bugs = 0;
+    proxy->flags &= ~VIRTIO_PCI_FLAG_BUS_MASTER_BUG;
 }
 
 static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val)
@@ -210,6 +343,7 @@ static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val)
     case VIRTIO_PCI_QUEUE_PFN:
         pa = (target_phys_addr_t)val << VIRTIO_PCI_QUEUE_ADDR_SHIFT;
         if (pa == 0) {
+            virtio_pci_stop_ioeventfd(proxy);
             virtio_reset(proxy->vdev);
             msix_unuse_all_vectors(&proxy->pci_dev);
         }
@@ -224,7 +358,16 @@ static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val)
         virtio_queue_notify(vdev, val);
         break;
     case VIRTIO_PCI_STATUS:
+        if (!(val & VIRTIO_CONFIG_S_DRIVER_OK)) {
+            virtio_pci_stop_ioeventfd(proxy);
+        }
+
         virtio_set_status(vdev, val & 0xFF);
+
+        if (val & VIRTIO_CONFIG_S_DRIVER_OK) {
+            virtio_pci_start_ioeventfd(proxy);
+        }
+
         if (vdev->status == 0) {
             virtio_reset(proxy->vdev);
             msix_unuse_all_vectors(&proxy->pci_dev);
@@ -235,7 +378,7 @@ static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val)
            some safety checks. */
         if ((val & VIRTIO_CONFIG_S_DRIVER_OK) &&
             !(proxy->pci_dev.config[PCI_COMMAND] & PCI_COMMAND_MASTER)) {
-            proxy->bugs |= VIRTIO_PCI_BUG_BUS_MASTER;
+            proxy->flags |= VIRTIO_PCI_FLAG_BUS_MASTER_BUG;
         }
         break;
     case VIRTIO_MSI_CONFIG_VECTOR:
@@ -403,7 +546,8 @@ static void virtio_write_config(PCIDevice *pci_dev, uint32_t address,
 
     if (PCI_COMMAND == address) {
         if (!(val & PCI_COMMAND_MASTER)) {
-            if (!(proxy->bugs & VIRTIO_PCI_BUG_BUS_MASTER)) {
+            if (!(proxy->flags & VIRTIO_PCI_FLAG_BUS_MASTER_BUG)) {
+                virtio_pci_stop_ioeventfd(proxy);
                 virtio_set_status(proxy->vdev,
                                   proxy->vdev->status & ~VIRTIO_CONFIG_S_DRIVER_OK);
             }
@@ -481,30 +625,30 @@ assign_error:
 static int virtio_pci_set_host_notifier(void *opaque, int n, bool assign)
 {
     VirtIOPCIProxy *proxy = opaque;
-    VirtQueue *vq = virtio_get_queue(proxy->vdev, n);
-    EventNotifier *notifier = virtio_queue_get_host_notifier(vq);
-    int r;
+
+    /* Stop using ioeventfd for virtqueue kick if the device starts using host
+     * notifiers.  This makes it easy to avoid stepping on each other's toes.
+     */
+    proxy->ioeventfd_disabled = assign;
     if (assign) {
-        r = event_notifier_init(notifier, 1);
-        if (r < 0) {
-            return r;
-        }
-        r = kvm_set_ioeventfd_pio_word(event_notifier_get_fd(notifier),
-                                       proxy->addr + VIRTIO_PCI_QUEUE_NOTIFY,
-                                       n, assign);
-        if (r < 0) {
-            event_notifier_cleanup(notifier);
-        }
+        virtio_pci_stop_ioeventfd(proxy);
+    }
+    /* We don't need to start ioeventfd here: the backend currently only
+     * stops on a status change away from ok, on reset, on vmstop and such.
+     * If we do add code to start here, it will need to check the vmstate,
+     * the device state, etc. */
+    return virtio_pci_set_host_notifier_internal(proxy, n, assign);
+}
+
+static void virtio_pci_vmstate_change(void *opaque, bool running)
+{
+    VirtIOPCIProxy *proxy = opaque;
+
+    if (running) {
+        virtio_pci_start_ioeventfd(proxy);
     } else {
-        r = kvm_set_ioeventfd_pio_word(event_notifier_get_fd(notifier),
-                                       proxy->addr + VIRTIO_PCI_QUEUE_NOTIFY,
-                                       n, assign);
-        if (r < 0) {
-            return r;
-        }
-        event_notifier_cleanup(notifier);
+        virtio_pci_stop_ioeventfd(proxy);
     }
-    return r;
 }
 
 static const VirtIOBindings virtio_pci_bindings = {
@@ -516,6 +660,7 @@ static const VirtIOBindings virtio_pci_bindings = {
     .get_features = virtio_pci_get_features,
     .set_host_notifier = virtio_pci_set_host_notifier,
     .set_guest_notifiers = virtio_pci_set_guest_notifiers,
+    .vmstate_change = virtio_pci_vmstate_change,
 };
 
 static void virtio_init_pci(VirtIOPCIProxy *proxy, VirtIODevice *vdev,
@@ -560,6 +705,10 @@ static void virtio_init_pci(VirtIOPCIProxy *proxy, VirtIODevice *vdev,
     pci_register_bar(&proxy->pci_dev, 0, size, PCI_BASE_ADDRESS_SPACE_IO,
                            virtio_map);
 
+    if (!kvm_has_many_ioeventfds()) {
+        proxy->flags &= ~VIRTIO_PCI_FLAG_USE_IOEVENTFD;
+    }
+
     virtio_bind_device(vdev, &virtio_pci_bindings, proxy);
     proxy->host_features |= 0x1 << VIRTIO_F_NOTIFY_ON_EMPTY;
     proxy->host_features |= 0x1 << VIRTIO_F_BAD_FEATURE;
@@ -598,6 +747,7 @@ static int virtio_blk_exit_pci(PCIDevice *pci_dev)
 {
     VirtIOPCIProxy *proxy = DO_UPCAST(VirtIOPCIProxy, pci_dev, pci_dev);
 
+    virtio_pci_stop_ioeventfd(proxy);
     virtio_blk_exit(proxy->vdev);
     blockdev_mark_auto_del(proxy->block.bs);
     return virtio_exit_pci(pci_dev);
@@ -659,6 +809,7 @@ static int virtio_net_exit_pci(PCIDevice *pci_dev)
 {
     VirtIOPCIProxy *proxy = DO_UPCAST(VirtIOPCIProxy, pci_dev, pci_dev);
 
+    virtio_pci_stop_ioeventfd(proxy);
     virtio_net_exit(proxy->vdev);
     return virtio_exit_pci(pci_dev);
 }
@@ -706,6 +857,8 @@ static PCIDeviceInfo virtio_info[] = {
         .qdev.props = (Property[]) {
             DEFINE_PROP_HEX32("class", VirtIOPCIProxy, class_code, 0),
             DEFINE_BLOCK_PROPERTIES(VirtIOPCIProxy, block),
+            DEFINE_PROP_BIT("ioeventfd", VirtIOPCIProxy, flags,
+                            VIRTIO_PCI_FLAG_USE_IOEVENTFD_BIT, true),
             DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors, 2),
             DEFINE_VIRTIO_BLK_FEATURES(VirtIOPCIProxy, host_features),
             DEFINE_PROP_END_OF_LIST(),
@@ -718,6 +871,8 @@ static PCIDeviceInfo virtio_info[] = {
         .exit       = virtio_net_exit_pci,
         .romfile    = "pxe-virtio.bin",
         .qdev.props = (Property[]) {
+            DEFINE_PROP_BIT("ioeventfd", VirtIOPCIProxy, flags,
+                            VIRTIO_PCI_FLAG_USE_IOEVENTFD_BIT, false),
             DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors, 3),
             DEFINE_VIRTIO_NET_FEATURES(VirtIOPCIProxy, host_features),
             DEFINE_NIC_PROPERTIES(VirtIOPCIProxy, nic),
diff --git a/hw/virtio.c b/hw/virtio.c
index 07dbf868fd..31bd9e32dc 100644
--- a/hw/virtio.c
+++ b/hw/virtio.c
@@ -575,11 +575,19 @@ int virtio_queue_get_num(VirtIODevice *vdev, int n)
     return vdev->vq[n].vring.num;
 }
 
+void virtio_queue_notify_vq(VirtQueue *vq)
+{
+    if (vq->vring.desc) {
+        VirtIODevice *vdev = vq->vdev;
+        trace_virtio_queue_notify(vdev, vq - vdev->vq, vq);
+        vq->handle_output(vdev, vq);
+    }
+}
+
 void virtio_queue_notify(VirtIODevice *vdev, int n)
 {
-    if (n < VIRTIO_PCI_QUEUE_MAX && vdev->vq[n].vring.desc) {
-        trace_virtio_queue_notify(vdev, n, &vdev->vq[n]);
-        vdev->vq[n].handle_output(vdev, &vdev->vq[n]);
+    if (n < VIRTIO_PCI_QUEUE_MAX) {
+        virtio_queue_notify_vq(&vdev->vq[n]);
     }
 }
 
@@ -743,11 +751,31 @@ int virtio_load(VirtIODevice *vdev, QEMUFile *f)
 
 void virtio_cleanup(VirtIODevice *vdev)
 {
+    qemu_del_vm_change_state_handler(vdev->vmstate);
     if (vdev->config)
         qemu_free(vdev->config);
     qemu_free(vdev->vq);
 }
 
+static void virtio_vmstate_change(void *opaque, int running, int reason)
+{
+    VirtIODevice *vdev = opaque;
+    bool backend_run = running && (vdev->status & VIRTIO_CONFIG_S_DRIVER_OK);
+    vdev->vm_running = running;
+
+    if (backend_run) {
+        virtio_set_status(vdev, vdev->status);
+    }
+
+    if (vdev->binding->vmstate_change) {
+        vdev->binding->vmstate_change(vdev->binding_opaque, backend_run);
+    }
+
+    if (!backend_run) {
+        virtio_set_status(vdev, vdev->status);
+    }
+}
+
 VirtIODevice *virtio_common_init(const char *name, uint16_t device_id,
                                  size_t config_size, size_t struct_size)
 {
@@ -774,6 +802,8 @@ VirtIODevice *virtio_common_init(const char *name, uint16_t device_id,
     else
         vdev->config = NULL;
 
+    vdev->vmstate = qemu_add_vm_change_state_handler(virtio_vmstate_change, vdev);
+
     return vdev;
 }
 
diff --git a/hw/virtio.h b/hw/virtio.h
index 02fa312d3e..d8546d5b30 100644
--- a/hw/virtio.h
+++ b/hw/virtio.h
@@ -95,6 +95,7 @@ typedef struct {
     unsigned (*get_features)(void * opaque);
     int (*set_guest_notifiers)(void * opaque, bool assigned);
     int (*set_host_notifier)(void * opaque, int n, bool assigned);
+    void (*vmstate_change)(void * opaque, bool running);
 } VirtIOBindings;
 
 #define VIRTIO_PCI_QUEUE_MAX 64
@@ -123,6 +124,8 @@ struct VirtIODevice
     const VirtIOBindings *binding;
     void *binding_opaque;
     uint16_t device_id;
+    bool vm_running;
+    VMChangeStateEntry *vmstate;
 };
 
 static inline void virtio_set_status(VirtIODevice *vdev, uint8_t val)
@@ -219,5 +222,6 @@ void virtio_queue_set_last_avail_idx(VirtIODevice *vdev, int n, uint16_t idx);
 VirtQueue *virtio_get_queue(VirtIODevice *vdev, int n);
 EventNotifier *virtio_queue_get_guest_notifier(VirtQueue *vq);
 EventNotifier *virtio_queue_get_host_notifier(VirtQueue *vq);
+void virtio_queue_notify_vq(VirtQueue *vq);
 void virtio_irq(VirtQueue *vq);
 #endif
diff --git a/hw/vmware_vga.c b/hw/vmware_vga.c
index d9dd52fc60..6c59053308 100644
--- a/hw/vmware_vga.c
+++ b/hw/vmware_vga.c
@@ -1318,6 +1318,7 @@ static PCIDeviceInfo vmsvga_info = {
     .qdev.name    = "vmware-svga",
     .qdev.size    = sizeof(struct pci_vmsvga_state_s),
     .qdev.vmsd    = &vmstate_vmware_vga,
+    .no_hotplug   = 1,
     .init         = pci_vmsvga_initfn,
     .romfile      = "vgabios-vmware.bin",
 };
diff --git a/kvm-all.c b/kvm-all.c
index cae24bb87c..255b6fad9c 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -28,6 +28,11 @@
 #include "kvm.h"
 #include "bswap.h"
 
+/* This check must be after config-host.h is included */
+#ifdef CONFIG_EVENTFD
+#include <sys/eventfd.h>
+#endif
+
 /* KVM uses PAGE_SIZE in it's definition of COALESCED_MMIO_MAX */
 #define PAGE_SIZE TARGET_PAGE_SIZE
 
@@ -72,6 +77,7 @@ struct KVMState
     int irqchip_in_kernel;
     int pit_in_kernel;
     int xsave, xcrs;
+    int many_ioeventfds;
 };
 
 static KVMState *kvm_state;
@@ -441,6 +447,39 @@ int kvm_check_extension(KVMState *s, unsigned int extension)
     return ret;
 }
 
+static int kvm_check_many_ioeventfds(void)
+{
+    /* Older kernels have a 6 device limit on the KVM io bus.  Find out so we
+     * can avoid creating too many ioeventfds.
+     */
+#ifdef CONFIG_EVENTFD
+    int ioeventfds[7];
+    int i, ret = 0;
+    for (i = 0; i < ARRAY_SIZE(ioeventfds); i++) {
+        ioeventfds[i] = eventfd(0, EFD_CLOEXEC);
+        if (ioeventfds[i] < 0) {
+            break;
+        }
+        ret = kvm_set_ioeventfd_pio_word(ioeventfds[i], 0, i, true);
+        if (ret < 0) {
+            close(ioeventfds[i]);
+            break;
+        }
+    }
+
+    /* Decide whether many devices are supported or not */
+    ret = i == ARRAY_SIZE(ioeventfds);
+
+    while (i-- > 0) {
+        kvm_set_ioeventfd_pio_word(ioeventfds[i], 0, i, false);
+        close(ioeventfds[i]);
+    }
+    return ret;
+#else
+    return 0;
+#endif
+}
+
 static void kvm_set_phys_mem(target_phys_addr_t start_addr,
 			     ram_addr_t size,
 			     ram_addr_t phys_offset)
@@ -717,6 +756,8 @@ int kvm_init(int smp_cpus)
     kvm_state = s;
     cpu_register_phys_memory_client(&kvm_cpu_phys_memory_client);
 
+    s->many_ioeventfds = kvm_check_many_ioeventfds();
+
     return 0;
 
 err:
@@ -1046,6 +1087,14 @@ int kvm_has_xcrs(void)
     return kvm_state->xcrs;
 }
 
+int kvm_has_many_ioeventfds(void)
+{
+    if (!kvm_enabled()) {
+        return 0;
+    }
+    return kvm_state->many_ioeventfds;
+}
+
 void kvm_setup_guest_memory(void *start, size_t size)
 {
     if (!kvm_has_sync_mmu()) {
diff --git a/kvm-stub.c b/kvm-stub.c
index 5384a4b9a4..33d4476fa3 100644
--- a/kvm-stub.c
+++ b/kvm-stub.c
@@ -99,6 +99,11 @@ int kvm_has_robust_singlestep(void)
     return 0;
 }
 
+int kvm_has_many_ioeventfds(void)
+{
+    return 0;
+}
+
 void kvm_setup_guest_memory(void *start, size_t size)
 {
 }
diff --git a/kvm.h b/kvm.h
index 60a9b425c8..ce08d42756 100644
--- a/kvm.h
+++ b/kvm.h
@@ -42,6 +42,7 @@ int kvm_has_robust_singlestep(void);
 int kvm_has_debugregs(void);
 int kvm_has_xsave(void);
 int kvm_has_xcrs(void);
+int kvm_has_many_ioeventfds(void);
 
 #ifdef NEED_CPU_H
 int kvm_init_vcpu(CPUState *env);
diff --git a/qerror.c b/qerror.c
index ac2cdafa65..9d0cdeb45c 100644
--- a/qerror.c
+++ b/qerror.c
@@ -101,6 +101,10 @@ static const QErrorStringTable qerror_table[] = {
         .desc      = "Device '%(device)' has no child bus",
     },
     {
+        .error_fmt = QERR_DEVICE_NO_HOTPLUG,
+        .desc      = "Device '%(device)' does not support hotplugging",
+    },
+    {
         .error_fmt = QERR_DUPLICATE_ID,
         .desc      = "Duplicate ID '%(id)' for %(object)",
     },
diff --git a/qerror.h b/qerror.h
index 943a24b4e5..b0f69dabe5 100644
--- a/qerror.h
+++ b/qerror.h
@@ -90,6 +90,9 @@ QError *qobject_to_qerror(const QObject *obj);
 #define QERR_DEVICE_NO_BUS \
     "{ 'class': 'DeviceNoBus', 'data': { 'device': %s } }"
 
+#define QERR_DEVICE_NO_HOTPLUG \
+    "{ 'class': 'DeviceNoHotplug', 'data': { 'device': %s } }"
+
 #define QERR_DUPLICATE_ID \
     "{ 'class': 'DuplicateId', 'data': { 'id': %s, 'object': %s } }"
author	Anthony Liguori <aliguori@us.ibm.com>	2011-01-17 09:49:38 -0600
committer	Anthony Liguori <aliguori@us.ibm.com>	2011-01-17 09:49:38 -0600
commit	9e8a69cfd6f0fe2585528fc7a85110fc25c05d0b (patch)
tree	6025ebe5f73460a803577e62c894a0e985f91fd2
parent	51f9b84e759c692575542627dd8d39ae216ac521 (diff)
parent	b36e391441906c36ed0856b69de84001860402bf (diff)