From bf55b7afce53718ef96f4e6616da62c0ccac37dd Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 19 May 2017 11:19:40 +0800 Subject: memory: tune last param of iommu_ops.translate() This patch converts the old "is_write" bool into IOMMUAccessFlags. The difference is that "is_write" can only express either read/write, but sometimes what we really want is "none" here (neither read nor write). Replay is an good example - during replay, we should not check any RW permission bits since thats not an actual IO at all. CC: Paolo Bonzini CC: David Gibson Reviewed-by: David Gibson Acked-by: David Gibson Acked-by: Paolo Bonzini Signed-off-by: Peter Xu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Jason Wang --- hw/alpha/typhoon.c | 2 +- hw/dma/rc4030.c | 2 +- hw/i386/amd_iommu.c | 4 ++-- hw/i386/intel_iommu.c | 4 ++-- hw/pci-host/apb.c | 2 +- hw/ppc/spapr_iommu.c | 2 +- hw/s390x/s390-pci-bus.c | 2 +- hw/s390x/s390-pci-inst.c | 2 +- 8 files changed, 10 insertions(+), 10 deletions(-) (limited to 'hw') diff --git a/hw/alpha/typhoon.c b/hw/alpha/typhoon.c index f50f5cf186..c1cf7802a4 100644 --- a/hw/alpha/typhoon.c +++ b/hw/alpha/typhoon.c @@ -664,7 +664,7 @@ static bool window_translate(TyphoonWindow *win, hwaddr addr, /* TODO: A translation failure here ought to set PCI error codes on the Pchip and generate a machine check interrupt. */ static IOMMUTLBEntry typhoon_translate_iommu(MemoryRegion *iommu, hwaddr addr, - bool is_write) + IOMMUAccessFlags flag) { TyphoonPchip *pchip = container_of(iommu, TyphoonPchip, iommu); IOMMUTLBEntry ret; diff --git a/hw/dma/rc4030.c b/hw/dma/rc4030.c index 0080141905..edf9432051 100644 --- a/hw/dma/rc4030.c +++ b/hw/dma/rc4030.c @@ -489,7 +489,7 @@ static const MemoryRegionOps jazzio_ops = { }; static IOMMUTLBEntry rc4030_dma_translate(MemoryRegion *iommu, hwaddr addr, - bool is_write) + IOMMUAccessFlags flag) { rc4030State *s = container_of(iommu, rc4030State, dma_mr); IOMMUTLBEntry ret = { diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c index 329058dac8..7b6d4ea3f3 100644 --- a/hw/i386/amd_iommu.c +++ b/hw/i386/amd_iommu.c @@ -988,7 +988,7 @@ static inline bool amdvi_is_interrupt_addr(hwaddr addr) } static IOMMUTLBEntry amdvi_translate(MemoryRegion *iommu, hwaddr addr, - bool is_write) + IOMMUAccessFlags flag) { AMDVIAddressSpace *as = container_of(iommu, AMDVIAddressSpace, iommu); AMDVIState *s = as->iommu_state; @@ -1017,7 +1017,7 @@ static IOMMUTLBEntry amdvi_translate(MemoryRegion *iommu, hwaddr addr, return ret; } - amdvi_do_translate(as, addr, is_write, &ret); + amdvi_do_translate(as, addr, flag & IOMMU_WO, &ret); trace_amdvi_translation_result(as->bus_num, PCI_SLOT(as->devfn), PCI_FUNC(as->devfn), addr, ret.translated_addr); return ret; diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 9ba2162cd9..4a51df8561 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -2221,7 +2221,7 @@ static void vtd_mem_write(void *opaque, hwaddr addr, } static IOMMUTLBEntry vtd_iommu_translate(MemoryRegion *iommu, hwaddr addr, - bool is_write) + IOMMUAccessFlags flag) { VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu); IntelIOMMUState *s = vtd_as->iommu_state; @@ -2243,7 +2243,7 @@ static IOMMUTLBEntry vtd_iommu_translate(MemoryRegion *iommu, hwaddr addr, } vtd_do_iommu_translate(vtd_as, vtd_as->bus, vtd_as->devfn, addr, - is_write, &ret); + flag & IOMMU_WO, &ret); VTD_DPRINTF(MMU, "bus %"PRIu8 " slot %"PRIu8 " func %"PRIu8 " devfn %"PRIu8 " iova 0x%"PRIx64 " hpa 0x%"PRIx64, 
pci_bus_num(vtd_as->bus), diff --git a/hw/pci-host/apb.c b/hw/pci-host/apb.c index edc88f4c65..2a80f687a6 100644 --- a/hw/pci-host/apb.c +++ b/hw/pci-host/apb.c @@ -209,7 +209,7 @@ static AddressSpace *pbm_pci_dma_iommu(PCIBus *bus, void *opaque, int devfn) /* Called from RCU critical section */ static IOMMUTLBEntry pbm_translate_iommu(MemoryRegion *iommu, hwaddr addr, - bool is_write) + IOMMUAccessFlags flag) { IOMMUState *is = container_of(iommu, IOMMUState, iommu); hwaddr baseaddr, offset; diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c index 29c80bb3c8..0341bc069d 100644 --- a/hw/ppc/spapr_iommu.c +++ b/hw/ppc/spapr_iommu.c @@ -111,7 +111,7 @@ static void spapr_tce_free_table(uint64_t *table, int fd, uint32_t nb_table) /* Called from RCU critical section */ static IOMMUTLBEntry spapr_tce_translate_iommu(MemoryRegion *iommu, hwaddr addr, - bool is_write) + IOMMUAccessFlags flag) { sPAPRTCETable *tcet = container_of(iommu, sPAPRTCETable, iommu); uint64_t tce; diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c index 66a6fbeb8c..5651483781 100644 --- a/hw/s390x/s390-pci-bus.c +++ b/hw/s390x/s390-pci-bus.c @@ -357,7 +357,7 @@ out: } static IOMMUTLBEntry s390_translate_iommu(MemoryRegion *mr, hwaddr addr, - bool is_write) + IOMMUAccessFlags flag) { uint64_t pte; uint32_t flags; diff --git a/hw/s390x/s390-pci-inst.c b/hw/s390x/s390-pci-inst.c index 314a9cbad4..8bc7c98682 100644 --- a/hw/s390x/s390-pci-inst.c +++ b/hw/s390x/s390-pci-inst.c @@ -624,7 +624,7 @@ int rpcit_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2) mr = &iommu->iommu_mr; while (start < end) { - entry = mr->iommu_ops->translate(mr, start, 0); + entry = mr->iommu_ops->translate(mr, start, IOMMU_NONE); if (!entry.translated_addr) { pbdev->state = ZPCI_FS_ERROR; -- cgit v1.2.3 From ad523590f62cf5d44e97388de370d27b95b25aff Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 19 May 2017 11:19:41 +0800 Subject: memory: remove the last param in memory_region_iommu_replay() We were always passing in that one as "false" to assume that's an read operation, and we also assume that IOMMU translation would always have that read permission. A better permission would be IOMMU_NONE since the replay is after all not a real read operation, but just a page table rebuilding process. CC: David Gibson CC: Paolo Bonzini Reviewed-by: David Gibson Acked-by: Paolo Bonzini Signed-off-by: Peter Xu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Jason Wang --- hw/vfio/common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'hw') diff --git a/hw/vfio/common.c b/hw/vfio/common.c index a8f12eeb35..b9abe77f5a 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -502,7 +502,7 @@ static void vfio_listener_region_add(MemoryListener *listener, QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next); memory_region_register_iommu_notifier(giommu->iommu, &giommu->n); - memory_region_iommu_replay(giommu->iommu, &giommu->n, false); + memory_region_iommu_replay(giommu->iommu, &giommu->n); return; } -- cgit v1.2.3 From 0b77d30a43bf21c7bf6af43cd546d8270bec7b99 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 19 May 2017 11:19:42 +0800 Subject: x86-iommu: use DeviceClass properties No reason to keep tens of lines if we can do it actually far shorter. Signed-off-by: Peter Xu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
Tsirkin Reviewed-by: Jason Wang --- hw/i386/x86-iommu.c | 47 +++++++---------------------------------------- 1 file changed, 7 insertions(+), 40 deletions(-) (limited to 'hw') diff --git a/hw/i386/x86-iommu.c b/hw/i386/x86-iommu.c index 23dcd3f039..02b88258a4 100644 --- a/hw/i386/x86-iommu.c +++ b/hw/i386/x86-iommu.c @@ -88,55 +88,22 @@ static void x86_iommu_realize(DeviceState *dev, Error **errp) x86_iommu_set_default(X86_IOMMU_DEVICE(dev)); } +static Property x86_iommu_properties[] = { + DEFINE_PROP_BOOL("intremap", X86IOMMUState, intr_supported, false), + DEFINE_PROP_BOOL("device-iotlb", X86IOMMUState, dt_supported, false), + DEFINE_PROP_END_OF_LIST(), +}; + static void x86_iommu_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); dc->realize = x86_iommu_realize; -} - -static bool x86_iommu_intremap_prop_get(Object *o, Error **errp) -{ - X86IOMMUState *s = X86_IOMMU_DEVICE(o); - return s->intr_supported; -} - -static void x86_iommu_intremap_prop_set(Object *o, bool value, Error **errp) -{ - X86IOMMUState *s = X86_IOMMU_DEVICE(o); - s->intr_supported = value; -} - -static bool x86_iommu_device_iotlb_prop_get(Object *o, Error **errp) -{ - X86IOMMUState *s = X86_IOMMU_DEVICE(o); - return s->dt_supported; -} - -static void x86_iommu_device_iotlb_prop_set(Object *o, bool value, Error **errp) -{ - X86IOMMUState *s = X86_IOMMU_DEVICE(o); - s->dt_supported = value; -} - -static void x86_iommu_instance_init(Object *o) -{ - X86IOMMUState *s = X86_IOMMU_DEVICE(o); - - /* By default, do not support IR */ - s->intr_supported = false; - object_property_add_bool(o, "intremap", x86_iommu_intremap_prop_get, - x86_iommu_intremap_prop_set, NULL); - s->dt_supported = false; - object_property_add_bool(o, "device-iotlb", - x86_iommu_device_iotlb_prop_get, - x86_iommu_device_iotlb_prop_set, - NULL); + dc->props = x86_iommu_properties; } static const TypeInfo x86_iommu_info = { .name = TYPE_X86_IOMMU_DEVICE, .parent = TYPE_SYS_BUS_DEVICE, - .instance_init = x86_iommu_instance_init, .instance_size = sizeof(X86IOMMUState), .class_init = x86_iommu_class_init, .class_size = sizeof(X86IOMMUClass), -- cgit v1.2.3 From 8f7d7161dd78142e77b96d7035682451154ed0d2 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 19 May 2017 11:19:43 +0800 Subject: intel_iommu: renaming context entry helpers The old names are too long and less ordered. Let's start to use vtd_ce_*() as a pattern. Signed-off-by: Peter Xu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
Tsirkin Reviewed-by: Jason Wang --- hw/i386/intel_iommu.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'hw') diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 4a51df8561..f06055f053 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -512,7 +512,7 @@ static int vtd_get_root_entry(IntelIOMMUState *s, uint8_t index, return 0; } -static inline bool vtd_context_entry_present(VTDContextEntry *context) +static inline bool vtd_ce_present(VTDContextEntry *context) { return context->lo & VTD_CONTEXT_ENTRY_P; } @@ -533,7 +533,7 @@ static int vtd_get_context_entry_from_root(VTDRootEntry *root, uint8_t index, return 0; } -static inline dma_addr_t vtd_get_slpt_base_from_context(VTDContextEntry *ce) +static inline dma_addr_t vtd_ce_get_slpt_base(VTDContextEntry *ce) { return ce->lo & VTD_CONTEXT_ENTRY_SLPTPTR; } @@ -585,19 +585,19 @@ static inline bool vtd_is_level_supported(IntelIOMMUState *s, uint32_t level) /* Get the page-table level that hardware should use for the second-level * page-table walk from the Address Width field of context-entry. */ -static inline uint32_t vtd_get_level_from_context_entry(VTDContextEntry *ce) +static inline uint32_t vtd_ce_get_level(VTDContextEntry *ce) { return 2 + (ce->hi & VTD_CONTEXT_ENTRY_AW); } -static inline uint32_t vtd_get_agaw_from_context_entry(VTDContextEntry *ce) +static inline uint32_t vtd_ce_get_agaw(VTDContextEntry *ce) { return 30 + (ce->hi & VTD_CONTEXT_ENTRY_AW) * 9; } static inline uint64_t vtd_iova_limit(VTDContextEntry *ce) { - uint32_t ce_agaw = vtd_get_agaw_from_context_entry(ce); + uint32_t ce_agaw = vtd_ce_get_agaw(ce); return 1ULL << MIN(ce_agaw, VTD_MGAW); } @@ -642,8 +642,8 @@ static int vtd_iova_to_slpte(VTDContextEntry *ce, uint64_t iova, bool is_write, uint64_t *slptep, uint32_t *slpte_level, bool *reads, bool *writes) { - dma_addr_t addr = vtd_get_slpt_base_from_context(ce); - uint32_t level = vtd_get_level_from_context_entry(ce); + dma_addr_t addr = vtd_ce_get_slpt_base(ce); + uint32_t level = vtd_ce_get_level(ce); uint32_t offset; uint64_t slpte; uint64_t access_right_check; @@ -664,7 +664,7 @@ static int vtd_iova_to_slpte(VTDContextEntry *ce, uint64_t iova, bool is_write, VTD_DPRINTF(GENERAL, "error: fail to access second-level paging " "entry at level %"PRIu32 " for iova 0x%"PRIx64, level, iova); - if (level == vtd_get_level_from_context_entry(ce)) { + if (level == vtd_ce_get_level(ce)) { /* Invalid programming of context-entry */ return -VTD_FR_CONTEXT_ENTRY_INV; } else { @@ -809,8 +809,8 @@ static int vtd_page_walk(VTDContextEntry *ce, uint64_t start, uint64_t end, vtd_page_walk_hook hook_fn, void *private, bool notify_unmap) { - dma_addr_t addr = vtd_get_slpt_base_from_context(ce); - uint32_t level = vtd_get_level_from_context_entry(ce); + dma_addr_t addr = vtd_ce_get_slpt_base(ce); + uint32_t level = vtd_ce_get_level(ce); if (!vtd_iova_range_check(start, ce)) { return -VTD_FR_ADDR_BEYOND_MGAW; @@ -851,7 +851,7 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, return ret_fr; } - if (!vtd_context_entry_present(ce)) { + if (!vtd_ce_present(ce)) { /* Not error - it's okay we don't have context entry. 
*/ trace_vtd_ce_not_present(bus_num, devfn); return -VTD_FR_CONTEXT_ENTRY_P; @@ -861,7 +861,7 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, return -VTD_FR_CONTEXT_ENTRY_RSVD; } /* Check if the programming of context-entry is valid */ - if (!vtd_is_level_supported(s, vtd_get_level_from_context_entry(ce))) { + if (!vtd_is_level_supported(s, vtd_ce_get_level(ce))) { trace_vtd_ce_invalid(ce->hi, ce->lo); return -VTD_FR_CONTEXT_ENTRY_INV; } else { -- cgit v1.2.3 From 127ff5c356e1c2c1328ed2fbb582eb14e412b160 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 19 May 2017 11:19:44 +0800 Subject: intel_iommu: provide vtd_ce_get_type() Helper to fetch VT-d context entry type. Signed-off-by: Peter Xu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Jason Wang --- hw/i386/intel_iommu.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'hw') diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index f06055f053..b4771430e0 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -595,6 +595,11 @@ static inline uint32_t vtd_ce_get_agaw(VTDContextEntry *ce) return 30 + (ce->hi & VTD_CONTEXT_ENTRY_AW) * 9; } +static inline uint32_t vtd_ce_get_type(VTDContextEntry *ce) +{ + return ce->lo & VTD_CONTEXT_ENTRY_TT; +} + static inline uint64_t vtd_iova_limit(VTDContextEntry *ce) { uint32_t ce_agaw = vtd_ce_get_agaw(ce); @@ -865,7 +870,7 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, trace_vtd_ce_invalid(ce->hi, ce->lo); return -VTD_FR_CONTEXT_ENTRY_INV; } else { - switch (ce->lo & VTD_CONTEXT_ENTRY_TT) { + switch (vtd_ce_get_type(ce)) { case VTD_CONTEXT_TT_MULTI_LEVEL: /* fall through */ case VTD_CONTEXT_TT_DEV_IOTLB: -- cgit v1.2.3 From 5a38cb5940f71254e7dcc4bce7ffca8c2102f0ec Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 19 May 2017 11:19:45 +0800 Subject: intel_iommu: use IOMMU_ACCESS_FLAG() We have that now, so why not use it. Signed-off-by: Peter Xu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Jason Wang --- hw/i386/intel_iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'hw') diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index b4771430e0..3240e5de37 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -1010,7 +1010,7 @@ out: entry->iova = addr & page_mask; entry->translated_addr = vtd_get_slpte_addr(slpte) & page_mask; entry->addr_mask = ~page_mask; - entry->perm = (writes ? 2 : 0) + (reads ? 1 : 0); + entry->perm = IOMMU_ACCESS_FLAG(reads, writes); } static void vtd_root_table_setup(IntelIOMMUState *s) -- cgit v1.2.3 From f80c98740e8da9fa0e4056f174ca66a3afb1d15b Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 19 May 2017 11:19:46 +0800 Subject: intel_iommu: allow dev-iotlb context entry conditionally When device-iotlb is not specified, we should fail this check. A new function vtd_ce_type_check() is introduced. While I'm at it, clean up the vtd_dev_to_context_entry() a bit - replace many "else if" usage into direct if check. That'll make the logic more clear. Signed-off-by: Peter Xu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
Tsirkin Reviewed-by: Jason Wang --- hw/i386/intel_iommu.c | 49 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 36 insertions(+), 13 deletions(-) (limited to 'hw') diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 3240e5de37..aac2cc7483 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -600,6 +600,26 @@ static inline uint32_t vtd_ce_get_type(VTDContextEntry *ce) return ce->lo & VTD_CONTEXT_ENTRY_TT; } +/* Return true if check passed, otherwise false */ +static inline bool vtd_ce_type_check(X86IOMMUState *x86_iommu, + VTDContextEntry *ce) +{ + switch (vtd_ce_get_type(ce)) { + case VTD_CONTEXT_TT_MULTI_LEVEL: + /* Always supported */ + break; + case VTD_CONTEXT_TT_DEV_IOTLB: + if (!x86_iommu->dt_supported) { + return false; + } + break; + default: + /* Unknwon type */ + return false; + } + return true; +} + static inline uint64_t vtd_iova_limit(VTDContextEntry *ce) { uint32_t ce_agaw = vtd_ce_get_agaw(ce); @@ -836,6 +856,7 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, { VTDRootEntry re; int ret_fr; + X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); ret_fr = vtd_get_root_entry(s, bus_num, &re); if (ret_fr) { @@ -846,7 +867,9 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, /* Not error - it's okay we don't have root entry. */ trace_vtd_re_not_present(bus_num); return -VTD_FR_ROOT_ENTRY_P; - } else if (re.rsvd || (re.val & VTD_ROOT_ENTRY_RSVD)) { + } + + if (re.rsvd || (re.val & VTD_ROOT_ENTRY_RSVD)) { trace_vtd_re_invalid(re.rsvd, re.val); return -VTD_FR_ROOT_ENTRY_RSVD; } @@ -860,26 +883,26 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, /* Not error - it's okay we don't have context entry. */ trace_vtd_ce_not_present(bus_num, devfn); return -VTD_FR_CONTEXT_ENTRY_P; - } else if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) || - (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) { + } + + if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) || + (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) { trace_vtd_ce_invalid(ce->hi, ce->lo); return -VTD_FR_CONTEXT_ENTRY_RSVD; } + /* Check if the programming of context-entry is valid */ if (!vtd_is_level_supported(s, vtd_ce_get_level(ce))) { trace_vtd_ce_invalid(ce->hi, ce->lo); return -VTD_FR_CONTEXT_ENTRY_INV; - } else { - switch (vtd_ce_get_type(ce)) { - case VTD_CONTEXT_TT_MULTI_LEVEL: - /* fall through */ - case VTD_CONTEXT_TT_DEV_IOTLB: - break; - default: - trace_vtd_ce_invalid(ce->hi, ce->lo); - return -VTD_FR_CONTEXT_ENTRY_INV; - } } + + /* Do translation type check */ + if (!vtd_ce_type_check(x86_iommu, ce)) { + trace_vtd_ce_invalid(ce->hi, ce->lo); + return -VTD_FR_CONTEXT_ENTRY_INV; + } + return 0; } -- cgit v1.2.3 From dbaabb25f441264d9029dc53e84a156269ecd275 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 19 May 2017 11:19:47 +0800 Subject: intel_iommu: support passthrough (PT) Hardware support for VT-d device passthrough. Although current Linux can live with iommu=pt even without this, but this is faster than when using software passthrough. Signed-off-by: Peter Xu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
Tsirkin Reviewed-by: Liu, Yi L Reviewed-by: Jason Wang --- hw/i386/intel_iommu.c | 231 ++++++++++++++++++++++++++++++----------- hw/i386/intel_iommu_internal.h | 1 + hw/i386/trace-events | 2 + hw/i386/x86-iommu.c | 1 + 4 files changed, 176 insertions(+), 59 deletions(-) (limited to 'hw') diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index aac2cc7483..15610b9de8 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -613,6 +613,11 @@ static inline bool vtd_ce_type_check(X86IOMMUState *x86_iommu, return false; } break; + case VTD_CONTEXT_TT_PASS_THROUGH: + if (!x86_iommu->pt_supported) { + return false; + } + break; default: /* Unknwon type */ return false; @@ -660,6 +665,29 @@ static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, uint32_t level) } } +/* Find the VTD address space associated with a given bus number */ +static VTDBus *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num) +{ + VTDBus *vtd_bus = s->vtd_as_by_bus_num[bus_num]; + if (!vtd_bus) { + /* + * Iterate over the registered buses to find the one which + * currently hold this bus number, and update the bus_num + * lookup table: + */ + GHashTableIter iter; + + g_hash_table_iter_init(&iter, s->vtd_as_by_busptr); + while (g_hash_table_iter_next(&iter, NULL, (void **)&vtd_bus)) { + if (pci_bus_num(vtd_bus->bus) == bus_num) { + s->vtd_as_by_bus_num[bus_num] = vtd_bus; + return vtd_bus; + } + } + } + return vtd_bus; +} + /* Given the @iova, get relevant @slptep. @slpte_level will be the last level * of the translation, can be used for deciding the size of large page. */ @@ -906,6 +934,91 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, return 0; } +/* + * Fetch translation type for specific device. Returns <0 if error + * happens, otherwise return the shifted type to check against + * VTD_CONTEXT_TT_*. + */ +static int vtd_dev_get_trans_type(VTDAddressSpace *as) +{ + IntelIOMMUState *s; + VTDContextEntry ce; + int ret; + + s = as->iommu_state; + + ret = vtd_dev_to_context_entry(s, pci_bus_num(as->bus), + as->devfn, &ce); + if (ret) { + return ret; + } + + return vtd_ce_get_type(&ce); +} + +static bool vtd_dev_pt_enabled(VTDAddressSpace *as) +{ + int ret; + + assert(as); + + ret = vtd_dev_get_trans_type(as); + if (ret < 0) { + /* + * Possibly failed to parse the context entry for some reason + * (e.g., during init, or any guest configuration errors on + * context entries). We should assume PT not enabled for + * safety. + */ + return false; + } + + return ret == VTD_CONTEXT_TT_PASS_THROUGH; +} + +/* Return whether the device is using IOMMU translation. 
*/ +static bool vtd_switch_address_space(VTDAddressSpace *as) +{ + bool use_iommu; + + assert(as); + + use_iommu = as->iommu_state->dmar_enabled & !vtd_dev_pt_enabled(as); + + trace_vtd_switch_address_space(pci_bus_num(as->bus), + VTD_PCI_SLOT(as->devfn), + VTD_PCI_FUNC(as->devfn), + use_iommu); + + /* Turn off first then on the other */ + if (use_iommu) { + memory_region_set_enabled(&as->sys_alias, false); + memory_region_set_enabled(&as->iommu, true); + } else { + memory_region_set_enabled(&as->iommu, false); + memory_region_set_enabled(&as->sys_alias, true); + } + + return use_iommu; +} + +static void vtd_switch_address_space_all(IntelIOMMUState *s) +{ + GHashTableIter iter; + VTDBus *vtd_bus; + int i; + + g_hash_table_iter_init(&iter, s->vtd_as_by_busptr); + while (g_hash_table_iter_next(&iter, NULL, (void **)&vtd_bus)) { + for (i = 0; i < X86_IOMMU_PCI_DEVFN_MAX; i++) { + if (!vtd_bus->dev_as[i]) { + continue; + } + vtd_switch_address_space(vtd_bus->dev_as[i]); + } + } +} + static inline uint16_t vtd_make_source_id(uint8_t bus_num, uint8_t devfn) { return ((bus_num & 0xffUL) << 8) | (devfn & 0xffUL); @@ -943,6 +1056,31 @@ static inline bool vtd_is_interrupt_addr(hwaddr addr) return VTD_INTERRUPT_ADDR_FIRST <= addr && addr <= VTD_INTERRUPT_ADDR_LAST; } +static void vtd_pt_enable_fast_path(IntelIOMMUState *s, uint16_t source_id) +{ + VTDBus *vtd_bus; + VTDAddressSpace *vtd_as; + bool success = false; + + vtd_bus = vtd_find_as_from_bus_num(s, VTD_SID_TO_BUS(source_id)); + if (!vtd_bus) { + goto out; + } + + vtd_as = vtd_bus->dev_as[VTD_SID_TO_DEVFN(source_id)]; + if (!vtd_as) { + goto out; + } + + if (vtd_switch_address_space(vtd_as) == false) { + /* We switched off IOMMU region successfully. */ + success = true; + } + +out: + trace_vtd_pt_enable_fast_path(source_id, success); +} + /* Map dev to context-entry then do a paging-structures walk to do a iommu * translation. * @@ -1014,6 +1152,30 @@ static void vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, cc_entry->context_cache_gen = s->context_cache_gen; } + /* + * We don't need to translate for pass-through context entries. + * Also, let's ignore IOTLB caching as well for PT devices. + */ + if (vtd_ce_get_type(&ce) == VTD_CONTEXT_TT_PASS_THROUGH) { + entry->translated_addr = entry->iova; + entry->addr_mask = VTD_PAGE_SIZE - 1; + entry->perm = IOMMU_RW; + trace_vtd_translate_pt(source_id, entry->iova); + + /* + * When this happens, it means firstly caching-mode is not + * enabled, and this is the first passthrough translation for + * the device. Let's enable the fast path for passthrough. + * + * When passthrough is disabled again for the device, we can + * capture it via the context entry invalidation, then the + * IOMMU region can be swapped back. 
+ */ + vtd_pt_enable_fast_path(s, source_id); + + return; + } + ret_fr = vtd_iova_to_slpte(&ce, addr, is_write, &slpte, &level, &reads, &writes); if (ret_fr) { @@ -1083,6 +1245,7 @@ static void vtd_context_global_invalidate(IntelIOMMUState *s) if (s->context_cache_gen == VTD_CONTEXT_CACHE_GEN_MAX) { vtd_reset_context_cache(s); } + vtd_switch_address_space_all(s); /* * From VT-d spec 6.5.2.1, a global context entry invalidation * should be followed by a IOTLB global invalidation, so we should @@ -1093,29 +1256,6 @@ static void vtd_context_global_invalidate(IntelIOMMUState *s) vtd_iommu_replay_all(s); } - -/* Find the VTD address space currently associated with a given bus number, - */ -static VTDBus *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num) -{ - VTDBus *vtd_bus = s->vtd_as_by_bus_num[bus_num]; - if (!vtd_bus) { - /* Iterate over the registered buses to find the one - * which currently hold this bus number, and update the bus_num lookup table: - */ - GHashTableIter iter; - - g_hash_table_iter_init(&iter, s->vtd_as_by_busptr); - while (g_hash_table_iter_next (&iter, NULL, (void**)&vtd_bus)) { - if (pci_bus_num(vtd_bus->bus) == bus_num) { - s->vtd_as_by_bus_num[bus_num] = vtd_bus; - return vtd_bus; - } - } - } - return vtd_bus; -} - /* Do a context-cache device-selective invalidation. * @func_mask: FM field after shifting */ @@ -1157,6 +1297,11 @@ static void vtd_context_device_invalidate(IntelIOMMUState *s, trace_vtd_inv_desc_cc_device(bus_n, VTD_PCI_SLOT(devfn_it), VTD_PCI_FUNC(devfn_it)); vtd_as->context_cache_entry.context_cache_gen = 0; + /* + * Do switch address space when needed, in case if the + * device passthrough bit is switched. + */ + vtd_switch_address_space(vtd_as); /* * So a device is moving out of (or moving into) a * domain, a replay() suites here to notify all the @@ -1389,42 +1534,6 @@ static void vtd_handle_gcmd_sirtp(IntelIOMMUState *s) vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRTPS); } -static void vtd_switch_address_space(VTDAddressSpace *as) -{ - assert(as); - - trace_vtd_switch_address_space(pci_bus_num(as->bus), - VTD_PCI_SLOT(as->devfn), - VTD_PCI_FUNC(as->devfn), - as->iommu_state->dmar_enabled); - - /* Turn off first then on the other */ - if (as->iommu_state->dmar_enabled) { - memory_region_set_enabled(&as->sys_alias, false); - memory_region_set_enabled(&as->iommu, true); - } else { - memory_region_set_enabled(&as->iommu, false); - memory_region_set_enabled(&as->sys_alias, true); - } -} - -static void vtd_switch_address_space_all(IntelIOMMUState *s) -{ - GHashTableIter iter; - VTDBus *vtd_bus; - int i; - - g_hash_table_iter_init(&iter, s->vtd_as_by_busptr); - while (g_hash_table_iter_next(&iter, NULL, (void **)&vtd_bus)) { - for (i = 0; i < X86_IOMMU_PCI_DEVFN_MAX; i++) { - if (!vtd_bus->dev_as[i]) { - continue; - } - vtd_switch_address_space(vtd_bus->dev_as[i]); - } - } -} - /* Handle Translation Enable/Disable */ static void vtd_handle_gcmd_te(IntelIOMMUState *s, bool en) { @@ -2872,6 +2981,10 @@ static void vtd_init(IntelIOMMUState *s) s->ecap |= VTD_ECAP_DT; } + if (x86_iommu->pt_supported) { + s->ecap |= VTD_ECAP_PT; + } + if (s->caching_mode) { s->cap |= VTD_CAP_CM; } diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h index 29d67075f4..0e73a65bf2 100644 --- a/hw/i386/intel_iommu_internal.h +++ b/hw/i386/intel_iommu_internal.h @@ -187,6 +187,7 @@ /* Interrupt Remapping support */ #define VTD_ECAP_IR (1ULL << 3) #define VTD_ECAP_EIM (1ULL << 4) +#define VTD_ECAP_PT (1ULL << 6) #define VTD_ECAP_MHMV 
(15ULL << 20) /* CAP_REG */ diff --git a/hw/i386/trace-events b/hw/i386/trace-events index 04a6980800..72556dad48 100644 --- a/hw/i386/trace-events +++ b/hw/i386/trace-events @@ -38,6 +38,8 @@ vtd_page_walk_skip_perm(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"P vtd_page_walk_skip_reserve(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to rsrv set" vtd_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device %02x:%02x.%x switching address space (iommu enabled=%d)" vtd_as_unmap_whole(uint8_t bus, uint8_t slot, uint8_t fn, uint64_t iova, uint64_t size) "Device %02x:%02x.%x start 0x%"PRIx64" size 0x%"PRIx64 +vtd_translate_pt(uint16_t sid, uint64_t addr) "source id 0x%"PRIu16", iova 0x%"PRIx64 +vtd_pt_enable_fast_path(uint16_t sid, bool success) "sid 0x%"PRIu16" %d" # hw/i386/amd_iommu.c amdvi_evntlog_fail(uint64_t addr, uint32_t head) "error: fail to write at addr 0x%"PRIx64" + offset 0x%"PRIx32 diff --git a/hw/i386/x86-iommu.c b/hw/i386/x86-iommu.c index 02b88258a4..293caf83ef 100644 --- a/hw/i386/x86-iommu.c +++ b/hw/i386/x86-iommu.c @@ -91,6 +91,7 @@ static void x86_iommu_realize(DeviceState *dev, Error **errp) static Property x86_iommu_properties[] = { DEFINE_PROP_BOOL("intremap", X86IOMMUState, intr_supported, false), DEFINE_PROP_BOOL("device-iotlb", X86IOMMUState, dt_supported, false), + DEFINE_PROP_BOOL("pt", X86IOMMUState, pt_supported, true), DEFINE_PROP_END_OF_LIST(), }; -- cgit v1.2.3 From 75ebec11afe49539f71cc1c494e3010f91c86adb Mon Sep 17 00:00:00 2001 From: Maxime Coquelin Date: Tue, 23 May 2017 14:31:19 +0200 Subject: virtio_net: Bypass backends for MTU feature negotiation This patch adds a new internal "x-mtu-bypass-backend" property to bypass backends for MTU feature negotiation. When this property is set, the MTU feature is negotiated as soon as supported by the guest and a MTU value is set via the host_mtu parameter. In case the backend advertises the feature (e.g. DPDK's vhost-user backend), the feature negotiation is propagated down to the backend. When this property is not set, the backend has to support the MTU feature for its negotiation to succeed. For compatibility purpose, this property is disabled for machine types v2.9 and older. Cc: Aaron Conole Suggested-by: Michael S. Tsirkin Signed-off-by: Maxime Coquelin Reviewed-by: Vlad Yasevich Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
Tsirkin --- hw/net/virtio-net.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'hw') diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index 98bd683f31..9a3d769aa2 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -589,7 +589,15 @@ static uint64_t virtio_net_get_features(VirtIODevice *vdev, uint64_t features, if (!get_vhost_net(nc->peer)) { return features; } - return vhost_net_get_features(get_vhost_net(nc->peer), features); + features = vhost_net_get_features(get_vhost_net(nc->peer), features); + vdev->backend_features = features; + + if (n->mtu_bypass_backend && + (n->host_features & 1ULL << VIRTIO_NET_F_MTU)) { + features |= (1ULL << VIRTIO_NET_F_MTU); + } + + return features; } static uint64_t virtio_net_bad_features(VirtIODevice *vdev) @@ -640,6 +648,11 @@ static void virtio_net_set_features(VirtIODevice *vdev, uint64_t features) VirtIONet *n = VIRTIO_NET(vdev); int i; + if (n->mtu_bypass_backend && + !virtio_has_feature(vdev->backend_features, VIRTIO_NET_F_MTU)) { + features &= ~(1ULL << VIRTIO_NET_F_MTU); + } + virtio_net_set_multiqueue(n, virtio_has_feature(features, VIRTIO_NET_F_MQ)); @@ -2093,6 +2106,8 @@ static Property virtio_net_properties[] = { DEFINE_PROP_UINT16("rx_queue_size", VirtIONet, net_conf.rx_queue_size, VIRTIO_NET_RX_QUEUE_DEFAULT_SIZE), DEFINE_PROP_UINT16("host_mtu", VirtIONet, net_conf.mtu, 0), + DEFINE_PROP_BOOL("x-mtu-bypass-backend", VirtIONet, mtu_bypass_backend, + true), DEFINE_PROP_END_OF_LIST(), }; -- cgit v1.2.3 From 3cf7daf8c3b0a1c417490741227d910b253796ff Mon Sep 17 00:00:00 2001 From: Maxime Coquelin Date: Wed, 24 May 2017 11:05:20 +0200 Subject: vhost-user: pass message as a pointer to process_message_reply() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit process_message_reply() was recently updated to get full message content instead of only its request field. There is no need to copy all the struct content into the stack, so just pass its pointer as const. Reviewed-by: Jens Freimann Reviewed-by: Zhiyong Yang Signed-off-by: Maxime Coquelin Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Marc-André Lureau --- hw/virtio/vhost-user.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'hw') diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index b87a176770..dde094abb4 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -162,11 +162,11 @@ fail: } static int process_message_reply(struct vhost_dev *dev, - VhostUserMsg msg) + const VhostUserMsg *msg) { VhostUserMsg msg_reply; - if ((msg.flags & VHOST_USER_NEED_REPLY_MASK) == 0) { + if ((msg->flags & VHOST_USER_NEED_REPLY_MASK) == 0) { return 0; } @@ -174,10 +174,10 @@ static int process_message_reply(struct vhost_dev *dev, return -1; } - if (msg_reply.request != msg.request) { + if (msg_reply.request != msg->request) { error_report("Received unexpected msg type." 
"Expected %d received %d", - msg.request, msg_reply.request); + msg->request, msg_reply.request); return -1; } @@ -324,7 +324,7 @@ static int vhost_user_set_mem_table(struct vhost_dev *dev, } if (reply_supported) { - return process_message_reply(dev, msg); + return process_message_reply(dev, &msg); } return 0; @@ -716,7 +716,7 @@ static int vhost_user_net_set_mtu(struct vhost_dev *dev, uint16_t mtu) /* If reply_ack supported, slave has to ack specified MTU is valid */ if (reply_supported) { - return process_message_reply(dev, msg); + return process_message_reply(dev, &msg); } return 0; -- cgit v1.2.3 From ede24a026411c260b5471348f431ec2f3c5e8f2b Mon Sep 17 00:00:00 2001 From: Ladi Prosek Date: Thu, 25 May 2017 09:07:47 +0200 Subject: pc: ACPI BIOS: use highest NUMA node for hotplug mem hole SRAT entry For reasons unknown, Windows won't online all memory, both at command line and hot-plugged later, unless the hotplug mem hole SRAT entry specifies a node greater than or equal to the ones where memory is added. Using the highest node on the machine makes recent versions of Windows happy. With this example command line: ... \ -m 1024,slots=4,maxmem=32G \ -numa node,nodeid=0 \ -numa node,nodeid=1 \ -numa node,nodeid=2 \ -numa node,nodeid=3 \ -object memory-backend-ram,size=1G,id=mem-mem1 \ -device pc-dimm,id=dimm-mem1,memdev=mem-mem1,node=1 Windows reports a total of 1G of RAM without this commit and the expected 2G with this commit. Signed-off-by: Ladi Prosek Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Igor Mammedov Acked-by: Laszlo Ersek --- hw/i386/acpi-build.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'hw') diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index afcadacd2e..82bd44f38e 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -2404,14 +2404,17 @@ build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine) } /* - * Entry is required for Windows to enable memory hotplug in OS. + * Entry is required for Windows to enable memory hotplug in OS + * and for Linux to enable SWIOTLB when booted with less than + * 4G of RAM. Windows works better if the entry sets proximity + * to the highest NUMA node in the machine. * Memory devices may override proximity set by this entry, * providing _PXM method if necessary. */ if (hotplugabble_address_space_size) { numamem = acpi_data_push(table_data, sizeof *numamem); build_srat_memory(numamem, pcms->hotplug_memory.base, - hotplugabble_address_space_size, 0, + hotplugabble_address_space_size, pcms->numa_nodes - 1, MEM_AFFINITY_HOTPLUGGABLE | MEM_AFFINITY_ENABLED); } -- cgit v1.2.3