spapr_iommu: Do not replay mappings from just created DMA window

On sPAPR vfio_listener_region_add() is called in 2 situations: 1. a new listener is registered from vfio_connect_container(); 2. a new IOMMU Memory Region is added from rtas_ibm_create_pe_dma_window(). In both cases vfio_listener_region_add() calls memory_region_iommu_replay() to notify newly registered IOMMU notifiers about existing mappings which is totally desirable for case 1. However for case 2 it is nothing but noop as the window has just been created and has no valid mappings so replaying those does not do anything. It is barely noticeable with usual guests but if the window happens to be really big, such no-op replay might take minutes and trigger RCU stall warnings in the guest. For example, a upcoming GPU RAM memory region mapped at 64TiB (right after SPAPR_PCI_LIMIT) causes a 64bit DMA window to be at least 128TiB which is (128<<40)/0x10000=2.147.483.648 TCEs to replay. This mitigates the problem by adding an "skipping_replay" flag to sPAPRTCETable and defining sPAPR own IOMMU MR replay() hook which does exactly the same thing as the generic one except it returns early if @skipping_replay==true. Another way of fixing this would be delaying replay till the very first H_PUT_TCE but this does not work if in-kernel H_PUT_TCE handler is enabled (a likely case). When "ibm,create-pe-dma-window" is complete, the guest will map only required regions of the huge DMA window. Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> Message-Id: <20190307050518.64968-2-aik@ozlabs.ru> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
author: Alexey Kardashevskiy <aik@ozlabs.ru> 2019-03-07 16:05:16 +1100
committer: David Gibson <david@gibson.dropbear.id.au> 2019-03-12 14:33:04 +1100
commit: 5f3666672255cbd00ce96fb5688dfca25a4c4d55 (patch)
tree: 01c707680b3b06bcda74416d9f34b33877b21329 /hw
parent: f7eb6a0a9b9326d3daae4bf0d9c664c8e79bea39 (diff)
2 files changed, 41 insertions, 0 deletions
diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
index 37e98f9321..8f231799b2 100644
--- a/hw/ppc/spapr_iommu.c
+++ b/hw/ppc/spapr_iommu.c
@@ -141,6 +141,36 @@ static IOMMUTLBEntry spapr_tce_translate_iommu(IOMMUMemoryRegion *iommu,
     return ret;
 }
 
+static void spapr_tce_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n)
+{
+    MemoryRegion *mr = MEMORY_REGION(iommu_mr);
+    IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_GET_CLASS(iommu_mr);
+    hwaddr addr, granularity;
+    IOMMUTLBEntry iotlb;
+    sPAPRTCETable *tcet = container_of(iommu_mr, sPAPRTCETable, iommu);
+
+    if (tcet->skipping_replay) {
+        return;
+    }
+
+    granularity = memory_region_iommu_get_min_page_size(iommu_mr);
+
+    for (addr = 0; addr < memory_region_size(mr); addr += granularity) {
+        iotlb = imrc->translate(iommu_mr, addr, IOMMU_NONE, n->iommu_idx);
+        if (iotlb.perm != IOMMU_NONE) {
+            n->notify(n, &iotlb);
+        }
+
+        /*
+         * if (2^64 - MR size) < granularity, it's possible to get an
+         * infinite loop here.  This should catch such a wraparound.
+         */
+        if ((addr + granularity) < addr) {
+            break;
+        }
+    }
+}
+
 static int spapr_tce_table_pre_save(void *opaque)
 {
     sPAPRTCETable *tcet = SPAPR_TCE_TABLE(opaque);
@@ -659,6 +689,7 @@ static void spapr_iommu_memory_region_class_init(ObjectClass *klass, void *data)
     IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
 
     imrc->translate = spapr_tce_translate_iommu;
+    imrc->replay = spapr_tce_replay;
     imrc->get_min_page_size = spapr_tce_get_min_page_size;
     imrc->notify_flag_changed = spapr_tce_notify_flag_changed;
     imrc->get_attr = spapr_tce_get_attr;
diff --git a/hw/ppc/spapr_rtas_ddw.c b/hw/ppc/spapr_rtas_ddw.c
index cb8a410359..cc9d1f5c1c 100644
--- a/hw/ppc/spapr_rtas_ddw.c
+++ b/hw/ppc/spapr_rtas_ddw.c
@@ -171,8 +171,18 @@ static void rtas_ibm_create_pe_dma_window(PowerPCCPU *cpu,
     }
 
     win_addr = (windows == 0) ? sphb->dma_win_addr : sphb->dma64_win_addr;
+    /*
+     * We have just created a window, we know for the fact that it is empty,
+     * use a hack to avoid iterating over the table as it is quite possible
+     * to have billions of TCEs, all empty.
+     * Note that we cannot delay this to the first H_PUT_TCE as this hcall is
+     * mostly likely to be handled in KVM so QEMU just does not know if it
+     * happened.
+     */
+    tcet->skipping_replay = true;
     spapr_tce_table_enable(tcet, page_shift, win_addr,
                            1ULL << (window_shift - page_shift));
+    tcet->skipping_replay = false;
     if (!tcet->nb_table) {
         goto hw_error_exit;
     }
author	Alexey Kardashevskiy <aik@ozlabs.ru>	2019-03-07 16:05:16 +1100
committer	David Gibson <david@gibson.dropbear.id.au>	2019-03-12 14:33:04 +1100
commit	5f3666672255cbd00ce96fb5688dfca25a4c4d55 (patch)
tree	01c707680b3b06bcda74416d9f34b33877b21329 /hw
parent	f7eb6a0a9b9326d3daae4bf0d9c664c8e79bea39 (diff)