From 4cb47d281a995cb49e4652cb26bafb3ab2d9bd28 Mon Sep 17 00:00:00 2001
From: Alex Williamson
Date: Fri, 30 May 2014 12:43:50 -0600
Subject: vfio-pci: Quirk RTL8168 NIC

This device is ridiculous. It has two MMIO BARs, BAR4 and BAR2. BAR4
hosts the MSI-X table, so obviously it would be too easy to access it
directly; instead, the device provides a window register in BAR2 that,
among other things, gives access to the MSI-X table. This means MSI-X
doesn't work in the guest because the driver actually manages to program
the physical table. When interrupt remapping is present, the device MSI
will be blocked. The Linux driver doesn't use this window, so apparently
it isn't required in order to use MSI-X. This quirk makes the device work
with the Windows driver, which does use the window for MSI-X, but I
certainly cannot recommend this device for assignment (the Windows 7
driver also constantly pokes PCI config space).

Signed-off-by: Alex Williamson
---
 hw/misc/vfio.c | 144 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 144 insertions(+)

(limited to 'hw')

diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
index 9cf5b84045..76715cc323 100644
--- a/hw/misc/vfio.c
+++ b/hw/misc/vfio.c
@@ -1668,6 +1668,149 @@ static void vfio_probe_ati_bar4_window_quirk(VFIODevice *vdev, int nr)
             vdev->host.function);
 }
 
+#define PCI_VENDOR_ID_REALTEK 0x10ec
+
+/*
+ * RTL8168 devices have a backdoor that can access the MSI-X table. At BAR2
+ * offset 0x70 there is a dword data register, offset 0x74 is a dword address
+ * register. According to the Linux r8169 driver, the MSI-X table is addressed
+ * when the "type" portion of the address register is set to 0x1. This appears
+ * to be bits 16:30. Bit 31 is both a write indicator and some sort of
+ * "address latched" indicator. Bits 12:15 are a mask field, which we can
+ * ignore because the MSI-X table should always be accessed as a dword (full
+ * mask). Bits 0:11 are the offset within the type.
+ * + * Example trace: + * + * Read from MSI-X table offset 0 + * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x1f000, 4) // store read addr + * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x8001f000 // latch + * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x70, 4) = 0xfee00398 // read data + * + * Write 0xfee00000 to MSI-X table offset 0 + * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x70, 0xfee00000, 4) // write data + * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x8001f000, 4) // do write + * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x1f000 // complete + */ + +static uint64_t vfio_rtl8168_window_quirk_read(void *opaque, + hwaddr addr, unsigned size) +{ + VFIOQuirk *quirk = opaque; + VFIODevice *vdev = quirk->vdev; + + switch (addr) { + case 4: /* address */ + if (quirk->data.flags) { + DPRINTF("%s fake read(%04x:%02x:%02x.%d)\n", + memory_region_name(&quirk->mem), vdev->host.domain, + vdev->host.bus, vdev->host.slot, vdev->host.function); + + return quirk->data.address_match ^ 0x10000000U; + } + break; + case 0: /* data */ + if (quirk->data.flags) { + uint64_t val; + + DPRINTF("%s MSI-X table read(%04x:%02x:%02x.%d)\n", + memory_region_name(&quirk->mem), vdev->host.domain, + vdev->host.bus, vdev->host.slot, vdev->host.function); + + if (!(vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX)) { + return 0; + } + + io_mem_read(&vdev->pdev.msix_table_mmio, + (hwaddr)(quirk->data.address_match & 0xfff), + &val, size); + return val; + } + } + + DPRINTF("%s direct read(%04x:%02x:%02x.%d)\n", + memory_region_name(&quirk->mem), vdev->host.domain, + vdev->host.bus, vdev->host.slot, vdev->host.function); + + return vfio_bar_read(&vdev->bars[quirk->data.bar], addr + 0x70, size); +} + +static void vfio_rtl8168_window_quirk_write(void *opaque, hwaddr addr, + uint64_t data, unsigned size) +{ + VFIOQuirk *quirk = opaque; + VFIODevice *vdev = quirk->vdev; + + switch (addr) { + case 4: /* address */ + if ((data & 0x7fff0000) == 0x10000) { + if (data & 0x10000000U && + vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX) { + + DPRINTF("%s MSI-X table write(%04x:%02x:%02x.%d)\n", + memory_region_name(&quirk->mem), vdev->host.domain, + vdev->host.bus, vdev->host.slot, vdev->host.function); + + io_mem_write(&vdev->pdev.msix_table_mmio, + (hwaddr)(quirk->data.address_match & 0xfff), + data, size); + } + + quirk->data.flags = 1; + quirk->data.address_match = data; + + return; + } + quirk->data.flags = 0; + break; + case 0: /* data */ + quirk->data.address_mask = data; + break; + } + + DPRINTF("%s direct write(%04x:%02x:%02x.%d)\n", + memory_region_name(&quirk->mem), vdev->host.domain, + vdev->host.bus, vdev->host.slot, vdev->host.function); + + vfio_bar_write(&vdev->bars[quirk->data.bar], addr + 0x70, data, size); +} + +static const MemoryRegionOps vfio_rtl8168_window_quirk = { + .read = vfio_rtl8168_window_quirk_read, + .write = vfio_rtl8168_window_quirk_write, + .valid = { + .min_access_size = 4, + .max_access_size = 4, + .unaligned = false, + }, + .endianness = DEVICE_LITTLE_ENDIAN, +}; + +static void vfio_probe_rtl8168_bar2_window_quirk(VFIODevice *vdev, int nr) +{ + PCIDevice *pdev = &vdev->pdev; + VFIOQuirk *quirk; + + if (pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_REALTEK || + pci_get_word(pdev->config + PCI_DEVICE_ID) != 0x8168 || nr != 2) { + return; + } + + quirk = g_malloc0(sizeof(*quirk)); + quirk->vdev = vdev; + quirk->data.bar = nr; + + memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_rtl8168_window_quirk, + quirk, "vfio-rtl8168-window-quirk", 8); + 
memory_region_add_subregion_overlap(&vdev->bars[nr].mem, + 0x70, &quirk->mem, 1); + + QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); + + DPRINTF("Enabled RTL8168 BAR2 window quirk for device %04x:%02x:%02x.%x\n", + vdev->host.domain, vdev->host.bus, vdev->host.slot, + vdev->host.function); +} /* * Trap the BAR2 MMIO window to config space as well. */ @@ -2071,6 +2214,7 @@ static void vfio_bar_quirk_setup(VFIODevice *vdev, int nr) vfio_probe_nvidia_bar5_window_quirk(vdev, nr); vfio_probe_nvidia_bar0_88000_quirk(vdev, nr); vfio_probe_nvidia_bar0_1800_quirk(vdev, nr); + vfio_probe_rtl8168_bar2_window_quirk(vdev, nr); } static void vfio_bar_quirk_teardown(VFIODevice *vdev, int nr) -- cgit v1.2.3 From 7532d3cbf148f72c9476484223b94a188ab59706 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 30 May 2014 13:02:02 -0600 Subject: vfio: Fix 128 bit handling Upcoming VFIO on SPAPR PPC64 support will initialize the IOMMU memory region with UINT64_MAX (2^64 bytes) size so int128_get64() will assert. The patch takes care of this check. The existing type1 IOMMU code is not expected to map all 64 bits of RAM so the patch does not touch that part. Signed-off-by: Alexey Kardashevskiy Reviewed-by: Paolo Bonzini Signed-off-by: Alex Williamson --- hw/misc/vfio.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'hw') diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c index 76715cc323..0d4594ed7d 100644 --- a/hw/misc/vfio.c +++ b/hw/misc/vfio.c @@ -2392,6 +2392,7 @@ static void vfio_listener_region_add(MemoryListener *listener, VFIOContainer *container = container_of(listener, VFIOContainer, iommu_data.type1.listener); hwaddr iova, end; + Int128 llend; void *vaddr; int ret; @@ -2412,13 +2413,15 @@ static void vfio_listener_region_add(MemoryListener *listener, } iova = TARGET_PAGE_ALIGN(section->offset_within_address_space); - end = (section->offset_within_address_space + int128_get64(section->size)) & - TARGET_PAGE_MASK; + llend = int128_make64(section->offset_within_address_space); + llend = int128_add(llend, section->size); + llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK)); - if (iova >= end) { + if (int128_ge(int128_make64(iova), llend)) { return; } + end = int128_get64(llend); vaddr = memory_region_get_ram_ptr(section->mr) + section->offset_within_region + (iova - section->offset_within_address_space); -- cgit v1.2.3 From 279a35ab4a8515d1eac06a76547d4b65a6cde519 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 30 May 2014 13:03:21 -0600 Subject: vfio: Rework to have error paths This reworks vfio_connect_container() and vfio_get_group() to have common exit path at the end of the function bodies. 
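Returning briefly to the RTL8168 window quirk above: the access protocol the quirk emulates follows from the register comment and the example trace. The helpers below are only an illustrative sketch of how a guest driver is presumed to drive the BAR2 window; the offsets and bit layout come from the comment, mmio_read32()/mmio_write32() are hypothetical stand-ins for the guest's BAR2 accessors, and the completion polling is an assumption based on the latch bit rather than observed driver behaviour.

/*
 * Hypothetical guest-side view of the RTL8168 BAR2 window: 0x70 is the data
 * register, 0x74 the address register, type 0x1 in bits 16:30 selects the
 * MSI-X table, bits 12:15 are the byte mask, bits 0:11 the offset, and
 * bit 31 is the write/"latched" flag.
 */
#include <stdint.h>

#define RTL_WIN_DATA    0x70
#define RTL_WIN_ADDR    0x74
#define RTL_TYPE_MSIX   (0x1U << 16)
#define RTL_DWORD_MASK  (0xfU << 12)
#define RTL_WRITE_FLAG  (1U << 31)

/* Assumed BAR2 accessors provided by the surrounding driver. */
extern uint32_t mmio_read32(unsigned int offset);
extern void mmio_write32(unsigned int offset, uint32_t val);

static uint32_t rtl8168_msix_read(unsigned int offset)
{
    /* Store the address, wait for bit 31 to report it latched, read data. */
    mmio_write32(RTL_WIN_ADDR,
                 RTL_TYPE_MSIX | RTL_DWORD_MASK | (offset & 0xfff));
    while (!(mmio_read32(RTL_WIN_ADDR) & RTL_WRITE_FLAG)) {
        /* busy-wait for the latch bit */
    }
    return mmio_read32(RTL_WIN_DATA);
}

static void rtl8168_msix_write(unsigned int offset, uint32_t val)
{
    /* Stage the data, trigger the write, wait for bit 31 to clear. */
    mmio_write32(RTL_WIN_DATA, val);
    mmio_write32(RTL_WIN_ADDR,
                 RTL_WRITE_FLAG | RTL_TYPE_MSIX | RTL_DWORD_MASK |
                 (offset & 0xfff));
    while (mmio_read32(RTL_WIN_ADDR) & RTL_WRITE_FLAG) {
        /* busy-wait for completion */
    }
}

Read against the trace, 0x1f000 is type 0x1 with a full dword mask at offset 0, and 0x8001f000 is the same address with the write/latch bit set.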
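The 128-bit handling fix above is easiest to see with numbers. The old computation fed section->size through int128_get64(), which aborts for the 2^64-byte regions SPAPR will register, and plain 64-bit arithmetic would wrap offset + size to zero anyway, so the patch keeps the end address in an Int128 until after the range check. A minimal sketch of the overflow, using the compiler's unsigned __int128 (GCC/Clang on a 64-bit host) in place of QEMU's Int128 helpers:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* A hypothetical section covering the whole 64-bit space. */
    uint64_t offset = 0;
    unsigned __int128 size = (unsigned __int128)1 << 64;   /* 2^64 bytes */

    /* 64-bit arithmetic wraps, so an "iova >= end" check would misfire. */
    uint64_t end64 = (uint64_t)(offset + size);

    /* 128-bit arithmetic keeps the carry, as int128_add() does in the patch. */
    unsigned __int128 llend = (unsigned __int128)offset + size;

    printf("64-bit end  = 0x%llx (wrapped)\n", (unsigned long long)end64);
    printf("128-bit end keeps the carry: %s\n",
           llend > UINT64_MAX ? "yes" : "no");
    return 0;
}

The page alignment is likewise applied with int128_and() so the masking happens on the 128-bit value before anything is narrowed back to 64 bits.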
Signed-off-by: Alexey Kardashevskiy Signed-off-by: Alex Williamson --- hw/misc/vfio.c | 58 ++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 32 insertions(+), 26 deletions(-) (limited to 'hw') diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c index 0d4594ed7d..cba0ac1e77 100644 --- a/hw/misc/vfio.c +++ b/hw/misc/vfio.c @@ -3448,8 +3448,8 @@ static int vfio_connect_container(VFIOGroup *group) if (ret != VFIO_API_VERSION) { error_report("vfio: supported vfio version: %d, " "reported version: %d", VFIO_API_VERSION, ret); - close(fd); - return -EINVAL; + ret = -EINVAL; + goto close_fd_exit; } container = g_malloc0(sizeof(*container)); @@ -3459,17 +3459,15 @@ static int vfio_connect_container(VFIOGroup *group) ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd); if (ret) { error_report("vfio: failed to set group container: %m"); - g_free(container); - close(fd); - return -errno; + ret = -errno; + goto free_container_exit; } ret = ioctl(fd, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU); if (ret) { error_report("vfio: failed to set iommu for container: %m"); - g_free(container); - close(fd); - return -errno; + ret = -errno; + goto free_container_exit; } container->iommu_data.type1.listener = vfio_memory_listener; @@ -3480,20 +3478,16 @@ static int vfio_connect_container(VFIOGroup *group) if (container->iommu_data.type1.error) { ret = container->iommu_data.type1.error; - vfio_listener_release(container); - g_free(container); - close(fd); error_report("vfio: memory listener initialization failed for container"); - return ret; + goto listener_release_exit; } container->iommu_data.type1.initialized = true; } else { error_report("vfio: No available IOMMU models"); - g_free(container); - close(fd); - return -EINVAL; + ret = -EINVAL; + goto free_container_exit; } QLIST_INIT(&container->group_list); @@ -3503,6 +3497,17 @@ static int vfio_connect_container(VFIOGroup *group) QLIST_INSERT_HEAD(&container->group_list, group, container_next); return 0; + +listener_release_exit: + vfio_listener_release(container); + +free_container_exit: + g_free(container); + +close_fd_exit: + close(fd); + + return ret; } static void vfio_disconnect_container(VFIOGroup *group) @@ -3546,24 +3551,19 @@ static VFIOGroup *vfio_get_group(int groupid) group->fd = qemu_open(path, O_RDWR); if (group->fd < 0) { error_report("vfio: error opening %s: %m", path); - g_free(group); - return NULL; + goto free_group_exit; } if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) { error_report("vfio: error getting group status: %m"); - close(group->fd); - g_free(group); - return NULL; + goto close_fd_exit; } if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) { error_report("vfio: error, group %d is not viable, please ensure " "all devices within the iommu_group are bound to their " "vfio bus driver.", groupid); - close(group->fd); - g_free(group); - return NULL; + goto close_fd_exit; } group->groupid = groupid; @@ -3571,9 +3571,7 @@ static VFIOGroup *vfio_get_group(int groupid) if (vfio_connect_container(group)) { error_report("vfio: failed to setup container for group %d", groupid); - close(group->fd); - g_free(group); - return NULL; + goto close_fd_exit; } if (QLIST_EMPTY(&group_list)) { @@ -3585,6 +3583,14 @@ static VFIOGroup *vfio_get_group(int groupid) vfio_kvm_device_add_group(group); return group; + +close_fd_exit: + close(group->fd); + +free_group_exit: + g_free(group); + + return NULL; } static void vfio_put_group(VFIOGroup *group) -- cgit v1.2.3 From 3df3e0a5872cbc8fcc55a0413416352eec68132e Mon Sep 17 00:00:00 2001 From: David 
Gibson Date: Fri, 30 May 2014 13:05:19 -0600 Subject: vfio: Introduce VFIO address spaces The only model so far supported for VFIO passthrough devices is the model usually used on x86, where all of the guest's RAM is mapped into the (host) IOMMU and there is no IOMMU visible in the guest. This patch begins to relax this model, introducing the notion of a VFIOAddressSpace. This represents a logical DMA address space which will be visible to one or more VFIO devices by appropriate mapping in the (host) IOMMU. Thus the currently global list of containers becomes local to a VFIOAddressSpace, and we verify that we don't attempt to add a VFIO group to multiple address spaces. For now, only one VFIOAddressSpace is created and used, corresponding to main system memory, that will change in future patches. Signed-off-by: David Gibson Signed-off-by: Alexey Kardashevskiy Signed-off-by: Alex Williamson --- hw/misc/vfio.c | 51 ++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 38 insertions(+), 13 deletions(-) (limited to 'hw') diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c index cba0ac1e77..c764322ac2 100644 --- a/hw/misc/vfio.c +++ b/hw/misc/vfio.c @@ -133,6 +133,20 @@ enum { VFIO_INT_MSIX = 3, }; +typedef struct VFIOAddressSpace { + AddressSpace *as; + QLIST_HEAD(, VFIOContainer) containers; + QLIST_ENTRY(VFIOAddressSpace) list; +} VFIOAddressSpace; + +static VFIOAddressSpace vfio_address_space_memory; + +static void vfio_address_space_init(VFIOAddressSpace *space, AddressSpace *as) +{ + space->as = as; + QLIST_INIT(&space->containers); +} + struct VFIOGroup; typedef struct VFIOType1 { @@ -142,6 +156,7 @@ typedef struct VFIOType1 { } VFIOType1; typedef struct VFIOContainer { + VFIOAddressSpace *space; int fd; /* /dev/vfio/vfio, empowered by the attached groups */ struct { /* enable abstraction to support various iommu backends */ @@ -234,9 +249,6 @@ static const VFIORomBlacklistEntry romblacklist[] = { #define MSIX_CAP_LENGTH 12 -static QLIST_HEAD(, VFIOContainer) - container_list = QLIST_HEAD_INITIALIZER(container_list); - static QLIST_HEAD(, VFIOGroup) group_list = QLIST_HEAD_INITIALIZER(group_list); @@ -3421,16 +3433,15 @@ static void vfio_kvm_device_del_group(VFIOGroup *group) #endif } -static int vfio_connect_container(VFIOGroup *group) +static int vfio_connect_container(VFIOGroup *group, AddressSpace *as) { VFIOContainer *container; int ret, fd; + VFIOAddressSpace *space; - if (group->container) { - return 0; - } + space = &vfio_address_space_memory; - QLIST_FOREACH(container, &container_list, next) { + QLIST_FOREACH(container, &space->containers, next) { if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) { group->container = container; QLIST_INSERT_HEAD(&container->group_list, group, container_next); @@ -3453,6 +3464,7 @@ static int vfio_connect_container(VFIOGroup *group) } container = g_malloc0(sizeof(*container)); + container->space = space; container->fd = fd; if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) { @@ -3491,7 +3503,7 @@ static int vfio_connect_container(VFIOGroup *group) } QLIST_INIT(&container->group_list); - QLIST_INSERT_HEAD(&container_list, container, next); + QLIST_INSERT_HEAD(&space->containers, container, next); group->container = container; QLIST_INSERT_HEAD(&container->group_list, group, container_next); @@ -3533,7 +3545,7 @@ static void vfio_disconnect_container(VFIOGroup *group) } } -static VFIOGroup *vfio_get_group(int groupid) +static VFIOGroup *vfio_get_group(int groupid, AddressSpace *as) { VFIOGroup *group; char path[32]; @@ 
-3541,7 +3553,14 @@ static VFIOGroup *vfio_get_group(int groupid) QLIST_FOREACH(group, &group_list, next) { if (group->groupid == groupid) { - return group; + /* Found it. Now is it already in the right context? */ + if (group->container->space->as == as) { + return group; + } else { + error_report("vfio: group %d used in multiple address spaces", + group->groupid); + return NULL; + } } } @@ -3569,7 +3588,7 @@ static VFIOGroup *vfio_get_group(int groupid) group->groupid = groupid; QLIST_INIT(&group->device_list); - if (vfio_connect_container(group)) { + if (vfio_connect_container(group, as)) { error_report("vfio: failed to setup container for group %d", groupid); goto close_fd_exit; } @@ -3921,7 +3940,12 @@ static int vfio_initfn(PCIDevice *pdev) DPRINTF("%s(%04x:%02x:%02x.%x) group %d\n", __func__, vdev->host.domain, vdev->host.bus, vdev->host.slot, vdev->host.function, groupid); - group = vfio_get_group(groupid); + if (pci_device_iommu_address_space(pdev) != &address_space_memory) { + error_report("vfio: DMA address space must be system memory"); + return -EINVAL; + } + + group = vfio_get_group(groupid, &address_space_memory); if (!group) { error_report("vfio: failed to get group %d", groupid); return -ENOENT; @@ -4135,6 +4159,7 @@ static const TypeInfo vfio_pci_dev_info = { static void register_vfio_pci_dev_type(void) { + vfio_address_space_init(&vfio_address_space_memory, &address_space_memory); type_register_static(&vfio_pci_dev_info); } -- cgit v1.2.3 From 0688448b71d22912601380b7e7eb66a38dc996b8 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Fri, 30 May 2014 13:09:14 -0600 Subject: vfio: Create VFIOAddressSpace objects as needed So far, VFIO has a notion of different logical DMA address spaces, but only ever uses one (system memory). This patch extends this, creating new VFIOAddressSpace objects as necessary, according to the AddressSpace reported by the PCI subsystem for this device's DMAs. This isn't enough yet to support guest side IOMMUs with VFIO, but it does mean we could now support VFIO devices on, for example, a guest side PCI host bridge which maps system memory at somewhere other than 0 in PCI space. 
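The single-exit style introduced by the error-path rework above, and extended with a put_space_exit label in the diff below, is the usual goto-cleanup idiom: acquire resources in order and, on failure, branch to a label that unwinds only what has been acquired so far. A minimal self-contained sketch of the idiom, with hypothetical resource names rather than the VFIO ones:

#include <errno.h>
#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>

/* Hypothetical registration step, stubbed so the sketch stands alone. */
static int register_thing(int fd, void *state)
{
    (void)fd;
    (void)state;
    return 0;
}

static int setup_thing(const char *path)
{
    void *state;
    int ret, fd;

    fd = open(path, O_RDWR);
    if (fd < 0) {
        return -errno;
    }

    state = calloc(1, 4096);
    if (!state) {
        ret = -ENOMEM;
        goto close_fd_exit;
    }

    ret = register_thing(fd, state);
    if (ret) {
        goto free_state_exit;
    }

    return 0;                   /* success: keep fd and state */

free_state_exit:
    free(state);

close_fd_exit:
    close(fd);

    return ret;
}

Each label releases exactly what was set up before the corresponding failure, which is what listener_release_exit, free_container_exit, close_fd_exit and put_space_exit do in vfio_connect_container().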
Signed-off-by: David Gibson Signed-off-by: Alexey Kardashevskiy Signed-off-by: Alex Williamson --- hw/misc/vfio.c | 57 +++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 41 insertions(+), 16 deletions(-) (limited to 'hw') diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c index c764322ac2..b170fd245d 100644 --- a/hw/misc/vfio.c +++ b/hw/misc/vfio.c @@ -139,13 +139,8 @@ typedef struct VFIOAddressSpace { QLIST_ENTRY(VFIOAddressSpace) list; } VFIOAddressSpace; -static VFIOAddressSpace vfio_address_space_memory; - -static void vfio_address_space_init(VFIOAddressSpace *space, AddressSpace *as) -{ - space->as = as; - QLIST_INIT(&space->containers); -} +static QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces = + QLIST_HEAD_INITIALIZER(vfio_address_spaces); struct VFIOGroup; @@ -3433,13 +3428,41 @@ static void vfio_kvm_device_del_group(VFIOGroup *group) #endif } +static VFIOAddressSpace *vfio_get_address_space(AddressSpace *as) +{ + VFIOAddressSpace *space; + + QLIST_FOREACH(space, &vfio_address_spaces, list) { + if (space->as == as) { + return space; + } + } + + /* No suitable VFIOAddressSpace, create a new one */ + space = g_malloc0(sizeof(*space)); + space->as = as; + QLIST_INIT(&space->containers); + + QLIST_INSERT_HEAD(&vfio_address_spaces, space, list); + + return space; +} + +static void vfio_put_address_space(VFIOAddressSpace *space) +{ + if (QLIST_EMPTY(&space->containers)) { + QLIST_REMOVE(space, list); + g_free(space); + } +} + static int vfio_connect_container(VFIOGroup *group, AddressSpace *as) { VFIOContainer *container; int ret, fd; VFIOAddressSpace *space; - space = &vfio_address_space_memory; + space = vfio_get_address_space(as); QLIST_FOREACH(container, &space->containers, next) { if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) { @@ -3452,7 +3475,8 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as) fd = qemu_open("/dev/vfio/vfio", O_RDWR); if (fd < 0) { error_report("vfio: failed to open /dev/vfio/vfio: %m"); - return -errno; + ret = -errno; + goto put_space_exit; } ret = ioctl(fd, VFIO_GET_API_VERSION); @@ -3519,6 +3543,9 @@ free_container_exit: close_fd_exit: close(fd); +put_space_exit: + vfio_put_address_space(space); + return ret; } @@ -3535,6 +3562,8 @@ static void vfio_disconnect_container(VFIOGroup *group) group->container = NULL; if (QLIST_EMPTY(&container->group_list)) { + VFIOAddressSpace *space = container->space; + if (container->iommu_data.release) { container->iommu_data.release(container); } @@ -3542,6 +3571,8 @@ static void vfio_disconnect_container(VFIOGroup *group) DPRINTF("vfio_disconnect_container: close container->fd\n"); close(container->fd); g_free(container); + + vfio_put_address_space(space); } } @@ -3940,12 +3971,7 @@ static int vfio_initfn(PCIDevice *pdev) DPRINTF("%s(%04x:%02x:%02x.%x) group %d\n", __func__, vdev->host.domain, vdev->host.bus, vdev->host.slot, vdev->host.function, groupid); - if (pci_device_iommu_address_space(pdev) != &address_space_memory) { - error_report("vfio: DMA address space must be system memory"); - return -EINVAL; - } - - group = vfio_get_group(groupid, &address_space_memory); + group = vfio_get_group(groupid, pci_device_iommu_address_space(pdev)); if (!group) { error_report("vfio: failed to get group %d", groupid); return -ENOENT; @@ -4159,7 +4185,6 @@ static const TypeInfo vfio_pci_dev_info = { static void register_vfio_pci_dev_type(void) { - vfio_address_space_init(&vfio_address_space_memory, &address_space_memory); type_register_static(&vfio_pci_dev_info); } -- 
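The vfio_get_address_space()/vfio_put_address_space() pair added above amounts to a small lookup-or-create registry keyed by AddressSpace, with an entry released once its last container disconnects. A simplified standalone model of that lifecycle (plain pointers and a user count stand in for QEMU's QLIST of containers; all names here are hypothetical):

#include <stdlib.h>

typedef struct Space {
    const void *key;            /* stands in for an AddressSpace pointer */
    int users;                  /* stands in for the list of containers */
    struct Space *next;         /* next space in the global registry */
} Space;

static Space *spaces;           /* global registry, like vfio_address_spaces */

static Space *get_space(const void *key)
{
    Space *s;

    for (s = spaces; s; s = s->next) {
        if (s->key == key) {
            s->users++;
            return s;           /* reuse the existing entry */
        }
    }

    /* No suitable entry: create one, as vfio_get_address_space() does. */
    s = calloc(1, sizeof(*s));
    if (!s) {
        return NULL;
    }
    s->key = key;
    s->users = 1;
    s->next = spaces;
    spaces = s;
    return s;
}

static void put_space(Space *s)
{
    Space **p;

    if (--s->users) {
        return;                 /* other containers still reference it */
    }
    for (p = &spaces; *p; p = &(*p)->next) {
        if (*p == s) {
            *p = s->next;
            break;
        }
    }
    free(s);
}

In the real code the "reference" is membership of a container in space->containers, and vfio_put_address_space() only frees the space once that list is empty.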
cgit v1.2.3 From 5e70018b00dbe59a96b80933a0dd308ee1ec4488 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Fri, 30 May 2014 13:10:07 -0600 Subject: vfio: Add guest side IOMMU support This patch uses the new IOMMU notifiers to allow VFIO pass through devices to work with guest side IOMMUs, as long as the host-side VFIO iommu has sufficient capability and granularity to match the guest side. This works by tracking all map and unmap operations on the guest IOMMU using the notifiers, and mirroring them into VFIO. There are a number of FIXMEs, and the scheme involves rather more notifier structures than I'd like, but it should make for a reasonable proof of concept. Signed-off-by: David Gibson Signed-off-by: Alexey Kardashevskiy Signed-off-by: Alex Williamson --- hw/misc/vfio.c | 139 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 134 insertions(+), 5 deletions(-) (limited to 'hw') diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c index b170fd245d..7437c2e3c3 100644 --- a/hw/misc/vfio.c +++ b/hw/misc/vfio.c @@ -160,10 +160,18 @@ typedef struct VFIOContainer { }; void (*release)(struct VFIOContainer *); } iommu_data; + QLIST_HEAD(, VFIOGuestIOMMU) giommu_list; QLIST_HEAD(, VFIOGroup) group_list; QLIST_ENTRY(VFIOContainer) next; } VFIOContainer; +typedef struct VFIOGuestIOMMU { + VFIOContainer *container; + MemoryRegion *iommu; + Notifier n; + QLIST_ENTRY(VFIOGuestIOMMU) giommu_next; +} VFIOGuestIOMMU; + /* Cache of MSI-X setup plus extra mmap and memory region for split BAR map */ typedef struct VFIOMSIXInfo { uint8_t table_bar; @@ -2383,7 +2391,8 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova, static bool vfio_listener_skipped_section(MemoryRegionSection *section) { - return !memory_region_is_ram(section->mr) || + return (!memory_region_is_ram(section->mr) && + !memory_region_is_iommu(section->mr)) || /* * Sizing an enabled 64-bit BAR can cause spurious mappings to * addresses in the upper part of the 64-bit address space. These @@ -2393,6 +2402,65 @@ static bool vfio_listener_skipped_section(MemoryRegionSection *section) section->offset_within_address_space & (1ULL << 63); } +static void vfio_iommu_map_notify(Notifier *n, void *data) +{ + VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); + VFIOContainer *container = giommu->container; + IOMMUTLBEntry *iotlb = data; + MemoryRegion *mr; + hwaddr xlat; + hwaddr len = iotlb->addr_mask + 1; + void *vaddr; + int ret; + + DPRINTF("iommu map @ %"HWADDR_PRIx" - %"HWADDR_PRIx"\n", + iotlb->iova, iotlb->iova + iotlb->addr_mask); + + /* + * The IOMMU TLB entry we have just covers translation through + * this IOMMU to its immediate target. We need to translate + * it the rest of the way through to memory. + */ + mr = address_space_translate(&address_space_memory, + iotlb->translated_addr, + &xlat, &len, iotlb->perm & IOMMU_WO); + if (!memory_region_is_ram(mr)) { + DPRINTF("iommu map to non memory area %"HWADDR_PRIx"\n", + xlat); + return; + } + /* + * Translation truncates length to the IOMMU page size, + * check that it did not truncate too much. 
+ */ + if (len & iotlb->addr_mask) { + DPRINTF("iommu has granularity incompatible with target AS\n"); + return; + } + + if (iotlb->perm != IOMMU_NONE) { + vaddr = memory_region_get_ram_ptr(mr) + xlat; + + ret = vfio_dma_map(container, iotlb->iova, + iotlb->addr_mask + 1, vaddr, + !(iotlb->perm & IOMMU_WO) || mr->readonly); + if (ret) { + error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", " + "0x%"HWADDR_PRIx", %p) = %d (%m)", + container, iotlb->iova, + iotlb->addr_mask + 1, vaddr, ret); + } + } else { + ret = vfio_dma_unmap(container, iotlb->iova, iotlb->addr_mask + 1); + if (ret) { + error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " + "0x%"HWADDR_PRIx") = %d (%m)", + container, iotlb->iova, + iotlb->addr_mask + 1, ret); + } + } +} + static void vfio_listener_region_add(MemoryListener *listener, MemoryRegionSection *section) { @@ -2403,8 +2471,6 @@ static void vfio_listener_region_add(MemoryListener *listener, void *vaddr; int ret; - assert(!memory_region_is_iommu(section->mr)); - if (vfio_listener_skipped_section(section)) { DPRINTF("SKIPPING region_add %"HWADDR_PRIx" - %"PRIx64"\n", section->offset_within_address_space, @@ -2428,15 +2494,57 @@ static void vfio_listener_region_add(MemoryListener *listener, return; } + memory_region_ref(section->mr); + + if (memory_region_is_iommu(section->mr)) { + VFIOGuestIOMMU *giommu; + + DPRINTF("region_add [iommu] %"HWADDR_PRIx" - %"HWADDR_PRIx"\n", + iova, int128_get64(int128_sub(llend, int128_one()))); + /* + * FIXME: We should do some checking to see if the + * capabilities of the host VFIO IOMMU are adequate to model + * the guest IOMMU + * + * FIXME: For VFIO iommu types which have KVM acceleration to + * avoid bouncing all map/unmaps through qemu this way, this + * would be the right place to wire that up (tell the KVM + * device emulation the VFIO iommu handles to use). + */ + /* + * This assumes that the guest IOMMU is empty of + * mappings at this point. + * + * One way of doing this is: + * 1. Avoid sharing IOMMUs between emulated devices or different + * IOMMU groups. + * 2. Implement VFIO_IOMMU_ENABLE in the host kernel to fail if + * there are some mappings in IOMMU. + * + * VFIO on SPAPR does that. Other IOMMU models may do that different, + * they must make sure there are no existing mappings or + * loop through existing mappings to map them into VFIO. 
+ */ + giommu = g_malloc0(sizeof(*giommu)); + giommu->iommu = section->mr; + giommu->container = container; + giommu->n.notify = vfio_iommu_map_notify; + QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next); + memory_region_register_iommu_notifier(giommu->iommu, &giommu->n); + + return; + } + + /* Here we assume that memory_region_is_ram(section->mr)==true */ + end = int128_get64(llend); vaddr = memory_region_get_ram_ptr(section->mr) + section->offset_within_region + (iova - section->offset_within_address_space); - DPRINTF("region_add %"HWADDR_PRIx" - %"HWADDR_PRIx" [%p]\n", + DPRINTF("region_add [ram] %"HWADDR_PRIx" - %"HWADDR_PRIx" [%p]\n", iova, end - 1, vaddr); - memory_region_ref(section->mr); ret = vfio_dma_map(container, iova, end - iova, vaddr, section->readonly); if (ret) { error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", " @@ -2480,6 +2588,27 @@ static void vfio_listener_region_del(MemoryListener *listener, return; } + if (memory_region_is_iommu(section->mr)) { + VFIOGuestIOMMU *giommu; + + QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) { + if (giommu->iommu == section->mr) { + memory_region_unregister_iommu_notifier(&giommu->n); + QLIST_REMOVE(giommu, giommu_next); + g_free(giommu); + break; + } + } + + /* + * FIXME: We assume the one big unmap below is adequate to + * remove any individual page mappings in the IOMMU which + * might have been copied into VFIO. This works for a page table + * based IOMMU where a big unmap flattens a large range of IO-PTEs. + * That may not be true for all IOMMU types. + */ + } + iova = TARGET_PAGE_ALIGN(section->offset_within_address_space); end = (section->offset_within_address_space + int128_get64(section->size)) & TARGET_PAGE_MASK; -- cgit v1.2.3
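One detail in vfio_iommu_map_notify() above that is easy to miss is the granularity check: after address_space_translate() has possibly shortened the length, any remainder against the IOTLB entry's addr_mask means the guest entry cannot be mirrored into VFIO as a single mapping, so it is dropped with a DPRINTF. A small standalone illustration of that arithmetic, with made-up sizes:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* Hypothetical guest IOMMU entry covering 64 KiB. */
    uint64_t addr_mask = 0x10000 - 1;       /* size - 1, as in IOMMUTLBEntry */
    uint64_t full_len  = addr_mask + 1;

    /* Suppose translation through host memory truncated the length to 4 KiB. */
    uint64_t truncated_len = 0x1000;

    /* The notifier's test: a non-zero remainder means the entry cannot be
     * mapped in one piece. */
    bool full_ok      = (full_len & addr_mask) == 0;
    bool truncated_ok = (truncated_len & addr_mask) == 0;

    printf("64 KiB translation acceptable: %s\n", full_ok ? "yes" : "no");
    printf("4 KiB translation acceptable:  %s\n", truncated_ok ? "yes" : "no");
    return 0;
}

As the DPRINTF in the notifier says, this rejects guest IOMMU entries whose granularity is incompatible with the target address space.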