diff options
author | Stefan Hajnoczi <stefanha@redhat.com> | 2017-05-30 14:15:04 +0100 |
---|---|---|
committer | Stefan Hajnoczi <stefanha@redhat.com> | 2017-05-30 14:15:04 +0100 |
commit | a3203e7dd375d4f619b72b854fcb6d3f2d14bfef (patch) | |
tree | b1367cde8d39f37b34cebc6a9375897983608bca | |
parent | 08f44282c1f6afe1ffdc31e45dae870ddbb32ed3 (diff) | |
parent | 811bf15114bff4d46735acedbd72b906616ce365 (diff) |
Merge remote-tracking branch 'mst/tags/for_upstream' into staging
pci, virtio, vhost: fixes
A bunch of fixes all over the place. Most notably this fixes
the new MTU feature when using vhost.
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
# gpg: Signature made Mon 29 May 2017 01:10:24 AM BST
# gpg: using RSA key 0x281F0DB8D28D5469
# gpg: Good signature from "Michael S. Tsirkin <mst@kernel.org>"
# gpg: aka "Michael S. Tsirkin <mst@redhat.com>"
# Primary key fingerprint: 0270 606B 6F3C DF3D 0B17 0970 C350 3912 AFBE 8E67
# Subkey fingerprint: 5D09 FD08 71C8 F85B 94CA 8A0D 281F 0DB8 D28D 5469
* mst/tags/for_upstream:
acpi-test: update expected files
pc: ACPI BIOS: use highest NUMA node for hotplug mem hole SRAT entry
vhost-user: pass message as a pointer to process_message_reply()
virtio_net: Bypass backends for MTU feature negotiation
intel_iommu: turn off pt before 2.9
intel_iommu: support passthrough (PT)
intel_iommu: allow dev-iotlb context entry conditionally
intel_iommu: use IOMMU_ACCESS_FLAG()
intel_iommu: provide vtd_ce_get_type()
intel_iommu: renaming context entry helpers
x86-iommu: use DeviceClass properties
memory: remove the last param in memory_region_iommu_replay()
memory: tune last param of iommu_ops.translate()
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-rw-r--r-- | exec.c | 3 | ||||
-rw-r--r-- | hw/alpha/typhoon.c | 2 | ||||
-rw-r--r-- | hw/dma/rc4030.c | 2 | ||||
-rw-r--r-- | hw/i386/acpi-build.c | 7 | ||||
-rw-r--r-- | hw/i386/amd_iommu.c | 4 | ||||
-rw-r--r-- | hw/i386/intel_iommu.c | 313 | ||||
-rw-r--r-- | hw/i386/intel_iommu_internal.h | 1 | ||||
-rw-r--r-- | hw/i386/trace-events | 2 | ||||
-rw-r--r-- | hw/i386/x86-iommu.c | 48 | ||||
-rw-r--r-- | hw/net/virtio-net.c | 17 | ||||
-rw-r--r-- | hw/pci-host/apb.c | 2 | ||||
-rw-r--r-- | hw/ppc/spapr_iommu.c | 2 | ||||
-rw-r--r-- | hw/s390x/s390-pci-bus.c | 2 | ||||
-rw-r--r-- | hw/s390x/s390-pci-inst.c | 2 | ||||
-rw-r--r-- | hw/vfio/common.c | 2 | ||||
-rw-r--r-- | hw/virtio/vhost-user.c | 12 | ||||
-rw-r--r-- | include/exec/memory.h | 15 | ||||
-rw-r--r-- | include/hw/compat.h | 8 | ||||
-rw-r--r-- | include/hw/i386/x86-iommu.h | 1 | ||||
-rw-r--r-- | include/hw/virtio/virtio-net.h | 1 | ||||
-rw-r--r-- | include/hw/virtio/virtio.h | 1 | ||||
-rw-r--r-- | memory.c | 7 | ||||
-rw-r--r-- | tests/acpi-test-data/pc/SRAT.memhp | bin | 264 -> 264 bytes | |||
-rw-r--r-- | tests/acpi-test-data/q35/SRAT.memhp | bin | 264 -> 264 bytes |
24 files changed, 299 insertions, 155 deletions
@@ -486,7 +486,8 @@ static MemoryRegionSection address_space_do_translate(AddressSpace *as, break; } - iotlb = mr->iommu_ops->translate(mr, addr, is_write); + iotlb = mr->iommu_ops->translate(mr, addr, is_write ? + IOMMU_WO : IOMMU_RO); addr = ((iotlb.translated_addr & ~iotlb.addr_mask) | (addr & iotlb.addr_mask)); *plen = MIN(*plen, (addr | iotlb.addr_mask) - addr + 1); diff --git a/hw/alpha/typhoon.c b/hw/alpha/typhoon.c index f50f5cf186..c1cf7802a4 100644 --- a/hw/alpha/typhoon.c +++ b/hw/alpha/typhoon.c @@ -664,7 +664,7 @@ static bool window_translate(TyphoonWindow *win, hwaddr addr, /* TODO: A translation failure here ought to set PCI error codes on the Pchip and generate a machine check interrupt. */ static IOMMUTLBEntry typhoon_translate_iommu(MemoryRegion *iommu, hwaddr addr, - bool is_write) + IOMMUAccessFlags flag) { TyphoonPchip *pchip = container_of(iommu, TyphoonPchip, iommu); IOMMUTLBEntry ret; diff --git a/hw/dma/rc4030.c b/hw/dma/rc4030.c index 0080141905..edf9432051 100644 --- a/hw/dma/rc4030.c +++ b/hw/dma/rc4030.c @@ -489,7 +489,7 @@ static const MemoryRegionOps jazzio_ops = { }; static IOMMUTLBEntry rc4030_dma_translate(MemoryRegion *iommu, hwaddr addr, - bool is_write) + IOMMUAccessFlags flag) { rc4030State *s = container_of(iommu, rc4030State, dma_mr); IOMMUTLBEntry ret = { diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index afcadacd2e..82bd44f38e 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -2404,14 +2404,17 @@ build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine) } /* - * Entry is required for Windows to enable memory hotplug in OS. + * Entry is required for Windows to enable memory hotplug in OS + * and for Linux to enable SWIOTLB when booted with less than + * 4G of RAM. Windows works better if the entry sets proximity + * to the highest NUMA node in the machine. * Memory devices may override proximity set by this entry, * providing _PXM method if necessary. */ if (hotplugabble_address_space_size) { numamem = acpi_data_push(table_data, sizeof *numamem); build_srat_memory(numamem, pcms->hotplug_memory.base, - hotplugabble_address_space_size, 0, + hotplugabble_address_space_size, pcms->numa_nodes - 1, MEM_AFFINITY_HOTPLUGGABLE | MEM_AFFINITY_ENABLED); } diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c index 329058dac8..7b6d4ea3f3 100644 --- a/hw/i386/amd_iommu.c +++ b/hw/i386/amd_iommu.c @@ -988,7 +988,7 @@ static inline bool amdvi_is_interrupt_addr(hwaddr addr) } static IOMMUTLBEntry amdvi_translate(MemoryRegion *iommu, hwaddr addr, - bool is_write) + IOMMUAccessFlags flag) { AMDVIAddressSpace *as = container_of(iommu, AMDVIAddressSpace, iommu); AMDVIState *s = as->iommu_state; @@ -1017,7 +1017,7 @@ static IOMMUTLBEntry amdvi_translate(MemoryRegion *iommu, hwaddr addr, return ret; } - amdvi_do_translate(as, addr, is_write, &ret); + amdvi_do_translate(as, addr, flag & IOMMU_WO, &ret); trace_amdvi_translation_result(as->bus_num, PCI_SLOT(as->devfn), PCI_FUNC(as->devfn), addr, ret.translated_addr); return ret; diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 9ba2162cd9..15610b9de8 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -512,7 +512,7 @@ static int vtd_get_root_entry(IntelIOMMUState *s, uint8_t index, return 0; } -static inline bool vtd_context_entry_present(VTDContextEntry *context) +static inline bool vtd_ce_present(VTDContextEntry *context) { return context->lo & VTD_CONTEXT_ENTRY_P; } @@ -533,7 +533,7 @@ static int vtd_get_context_entry_from_root(VTDRootEntry *root, uint8_t index, return 0; } -static inline dma_addr_t vtd_get_slpt_base_from_context(VTDContextEntry *ce) +static inline dma_addr_t vtd_ce_get_slpt_base(VTDContextEntry *ce) { return ce->lo & VTD_CONTEXT_ENTRY_SLPTPTR; } @@ -585,19 +585,49 @@ static inline bool vtd_is_level_supported(IntelIOMMUState *s, uint32_t level) /* Get the page-table level that hardware should use for the second-level * page-table walk from the Address Width field of context-entry. */ -static inline uint32_t vtd_get_level_from_context_entry(VTDContextEntry *ce) +static inline uint32_t vtd_ce_get_level(VTDContextEntry *ce) { return 2 + (ce->hi & VTD_CONTEXT_ENTRY_AW); } -static inline uint32_t vtd_get_agaw_from_context_entry(VTDContextEntry *ce) +static inline uint32_t vtd_ce_get_agaw(VTDContextEntry *ce) { return 30 + (ce->hi & VTD_CONTEXT_ENTRY_AW) * 9; } +static inline uint32_t vtd_ce_get_type(VTDContextEntry *ce) +{ + return ce->lo & VTD_CONTEXT_ENTRY_TT; +} + +/* Return true if check passed, otherwise false */ +static inline bool vtd_ce_type_check(X86IOMMUState *x86_iommu, + VTDContextEntry *ce) +{ + switch (vtd_ce_get_type(ce)) { + case VTD_CONTEXT_TT_MULTI_LEVEL: + /* Always supported */ + break; + case VTD_CONTEXT_TT_DEV_IOTLB: + if (!x86_iommu->dt_supported) { + return false; + } + break; + case VTD_CONTEXT_TT_PASS_THROUGH: + if (!x86_iommu->pt_supported) { + return false; + } + break; + default: + /* Unknwon type */ + return false; + } + return true; +} + static inline uint64_t vtd_iova_limit(VTDContextEntry *ce) { - uint32_t ce_agaw = vtd_get_agaw_from_context_entry(ce); + uint32_t ce_agaw = vtd_ce_get_agaw(ce); return 1ULL << MIN(ce_agaw, VTD_MGAW); } @@ -635,6 +665,29 @@ static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, uint32_t level) } } +/* Find the VTD address space associated with a given bus number */ +static VTDBus *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num) +{ + VTDBus *vtd_bus = s->vtd_as_by_bus_num[bus_num]; + if (!vtd_bus) { + /* + * Iterate over the registered buses to find the one which + * currently hold this bus number, and update the bus_num + * lookup table: + */ + GHashTableIter iter; + + g_hash_table_iter_init(&iter, s->vtd_as_by_busptr); + while (g_hash_table_iter_next(&iter, NULL, (void **)&vtd_bus)) { + if (pci_bus_num(vtd_bus->bus) == bus_num) { + s->vtd_as_by_bus_num[bus_num] = vtd_bus; + return vtd_bus; + } + } + } + return vtd_bus; +} + /* Given the @iova, get relevant @slptep. @slpte_level will be the last level * of the translation, can be used for deciding the size of large page. */ @@ -642,8 +695,8 @@ static int vtd_iova_to_slpte(VTDContextEntry *ce, uint64_t iova, bool is_write, uint64_t *slptep, uint32_t *slpte_level, bool *reads, bool *writes) { - dma_addr_t addr = vtd_get_slpt_base_from_context(ce); - uint32_t level = vtd_get_level_from_context_entry(ce); + dma_addr_t addr = vtd_ce_get_slpt_base(ce); + uint32_t level = vtd_ce_get_level(ce); uint32_t offset; uint64_t slpte; uint64_t access_right_check; @@ -664,7 +717,7 @@ static int vtd_iova_to_slpte(VTDContextEntry *ce, uint64_t iova, bool is_write, VTD_DPRINTF(GENERAL, "error: fail to access second-level paging " "entry at level %"PRIu32 " for iova 0x%"PRIx64, level, iova); - if (level == vtd_get_level_from_context_entry(ce)) { + if (level == vtd_ce_get_level(ce)) { /* Invalid programming of context-entry */ return -VTD_FR_CONTEXT_ENTRY_INV; } else { @@ -809,8 +862,8 @@ static int vtd_page_walk(VTDContextEntry *ce, uint64_t start, uint64_t end, vtd_page_walk_hook hook_fn, void *private, bool notify_unmap) { - dma_addr_t addr = vtd_get_slpt_base_from_context(ce); - uint32_t level = vtd_get_level_from_context_entry(ce); + dma_addr_t addr = vtd_ce_get_slpt_base(ce); + uint32_t level = vtd_ce_get_level(ce); if (!vtd_iova_range_check(start, ce)) { return -VTD_FR_ADDR_BEYOND_MGAW; @@ -831,6 +884,7 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, { VTDRootEntry re; int ret_fr; + X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); ret_fr = vtd_get_root_entry(s, bus_num, &re); if (ret_fr) { @@ -841,7 +895,9 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, /* Not error - it's okay we don't have root entry. */ trace_vtd_re_not_present(bus_num); return -VTD_FR_ROOT_ENTRY_P; - } else if (re.rsvd || (re.val & VTD_ROOT_ENTRY_RSVD)) { + } + + if (re.rsvd || (re.val & VTD_ROOT_ENTRY_RSVD)) { trace_vtd_re_invalid(re.rsvd, re.val); return -VTD_FR_ROOT_ENTRY_RSVD; } @@ -851,31 +907,116 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, return ret_fr; } - if (!vtd_context_entry_present(ce)) { + if (!vtd_ce_present(ce)) { /* Not error - it's okay we don't have context entry. */ trace_vtd_ce_not_present(bus_num, devfn); return -VTD_FR_CONTEXT_ENTRY_P; - } else if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) || - (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) { + } + + if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) || + (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) { trace_vtd_ce_invalid(ce->hi, ce->lo); return -VTD_FR_CONTEXT_ENTRY_RSVD; } + /* Check if the programming of context-entry is valid */ - if (!vtd_is_level_supported(s, vtd_get_level_from_context_entry(ce))) { + if (!vtd_is_level_supported(s, vtd_ce_get_level(ce))) { + trace_vtd_ce_invalid(ce->hi, ce->lo); + return -VTD_FR_CONTEXT_ENTRY_INV; + } + + /* Do translation type check */ + if (!vtd_ce_type_check(x86_iommu, ce)) { trace_vtd_ce_invalid(ce->hi, ce->lo); return -VTD_FR_CONTEXT_ENTRY_INV; + } + + return 0; +} + +/* + * Fetch translation type for specific device. Returns <0 if error + * happens, otherwise return the shifted type to check against + * VTD_CONTEXT_TT_*. + */ +static int vtd_dev_get_trans_type(VTDAddressSpace *as) +{ + IntelIOMMUState *s; + VTDContextEntry ce; + int ret; + + s = as->iommu_state; + + ret = vtd_dev_to_context_entry(s, pci_bus_num(as->bus), + as->devfn, &ce); + if (ret) { + return ret; + } + + return vtd_ce_get_type(&ce); +} + +static bool vtd_dev_pt_enabled(VTDAddressSpace *as) +{ + int ret; + + assert(as); + + ret = vtd_dev_get_trans_type(as); + if (ret < 0) { + /* + * Possibly failed to parse the context entry for some reason + * (e.g., during init, or any guest configuration errors on + * context entries). We should assume PT not enabled for + * safety. + */ + return false; + } + + return ret == VTD_CONTEXT_TT_PASS_THROUGH; +} + +/* Return whether the device is using IOMMU translation. */ +static bool vtd_switch_address_space(VTDAddressSpace *as) +{ + bool use_iommu; + + assert(as); + + use_iommu = as->iommu_state->dmar_enabled & !vtd_dev_pt_enabled(as); + + trace_vtd_switch_address_space(pci_bus_num(as->bus), + VTD_PCI_SLOT(as->devfn), + VTD_PCI_FUNC(as->devfn), + use_iommu); + + /* Turn off first then on the other */ + if (use_iommu) { + memory_region_set_enabled(&as->sys_alias, false); + memory_region_set_enabled(&as->iommu, true); } else { - switch (ce->lo & VTD_CONTEXT_ENTRY_TT) { - case VTD_CONTEXT_TT_MULTI_LEVEL: - /* fall through */ - case VTD_CONTEXT_TT_DEV_IOTLB: - break; - default: - trace_vtd_ce_invalid(ce->hi, ce->lo); - return -VTD_FR_CONTEXT_ENTRY_INV; + memory_region_set_enabled(&as->iommu, false); + memory_region_set_enabled(&as->sys_alias, true); + } + + return use_iommu; +} + +static void vtd_switch_address_space_all(IntelIOMMUState *s) +{ + GHashTableIter iter; + VTDBus *vtd_bus; + int i; + + g_hash_table_iter_init(&iter, s->vtd_as_by_busptr); + while (g_hash_table_iter_next(&iter, NULL, (void **)&vtd_bus)) { + for (i = 0; i < X86_IOMMU_PCI_DEVFN_MAX; i++) { + if (!vtd_bus->dev_as[i]) { + continue; + } + vtd_switch_address_space(vtd_bus->dev_as[i]); } } - return 0; } static inline uint16_t vtd_make_source_id(uint8_t bus_num, uint8_t devfn) @@ -915,6 +1056,31 @@ static inline bool vtd_is_interrupt_addr(hwaddr addr) return VTD_INTERRUPT_ADDR_FIRST <= addr && addr <= VTD_INTERRUPT_ADDR_LAST; } +static void vtd_pt_enable_fast_path(IntelIOMMUState *s, uint16_t source_id) +{ + VTDBus *vtd_bus; + VTDAddressSpace *vtd_as; + bool success = false; + + vtd_bus = vtd_find_as_from_bus_num(s, VTD_SID_TO_BUS(source_id)); + if (!vtd_bus) { + goto out; + } + + vtd_as = vtd_bus->dev_as[VTD_SID_TO_DEVFN(source_id)]; + if (!vtd_as) { + goto out; + } + + if (vtd_switch_address_space(vtd_as) == false) { + /* We switched off IOMMU region successfully. */ + success = true; + } + +out: + trace_vtd_pt_enable_fast_path(source_id, success); +} + /* Map dev to context-entry then do a paging-structures walk to do a iommu * translation. * @@ -986,6 +1152,30 @@ static void vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, cc_entry->context_cache_gen = s->context_cache_gen; } + /* + * We don't need to translate for pass-through context entries. + * Also, let's ignore IOTLB caching as well for PT devices. + */ + if (vtd_ce_get_type(&ce) == VTD_CONTEXT_TT_PASS_THROUGH) { + entry->translated_addr = entry->iova; + entry->addr_mask = VTD_PAGE_SIZE - 1; + entry->perm = IOMMU_RW; + trace_vtd_translate_pt(source_id, entry->iova); + + /* + * When this happens, it means firstly caching-mode is not + * enabled, and this is the first passthrough translation for + * the device. Let's enable the fast path for passthrough. + * + * When passthrough is disabled again for the device, we can + * capture it via the context entry invalidation, then the + * IOMMU region can be swapped back. + */ + vtd_pt_enable_fast_path(s, source_id); + + return; + } + ret_fr = vtd_iova_to_slpte(&ce, addr, is_write, &slpte, &level, &reads, &writes); if (ret_fr) { @@ -1005,7 +1195,7 @@ out: entry->iova = addr & page_mask; entry->translated_addr = vtd_get_slpte_addr(slpte) & page_mask; entry->addr_mask = ~page_mask; - entry->perm = (writes ? 2 : 0) + (reads ? 1 : 0); + entry->perm = IOMMU_ACCESS_FLAG(reads, writes); } static void vtd_root_table_setup(IntelIOMMUState *s) @@ -1055,6 +1245,7 @@ static void vtd_context_global_invalidate(IntelIOMMUState *s) if (s->context_cache_gen == VTD_CONTEXT_CACHE_GEN_MAX) { vtd_reset_context_cache(s); } + vtd_switch_address_space_all(s); /* * From VT-d spec 6.5.2.1, a global context entry invalidation * should be followed by a IOTLB global invalidation, so we should @@ -1065,29 +1256,6 @@ static void vtd_context_global_invalidate(IntelIOMMUState *s) vtd_iommu_replay_all(s); } - -/* Find the VTD address space currently associated with a given bus number, - */ -static VTDBus *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num) -{ - VTDBus *vtd_bus = s->vtd_as_by_bus_num[bus_num]; - if (!vtd_bus) { - /* Iterate over the registered buses to find the one - * which currently hold this bus number, and update the bus_num lookup table: - */ - GHashTableIter iter; - - g_hash_table_iter_init(&iter, s->vtd_as_by_busptr); - while (g_hash_table_iter_next (&iter, NULL, (void**)&vtd_bus)) { - if (pci_bus_num(vtd_bus->bus) == bus_num) { - s->vtd_as_by_bus_num[bus_num] = vtd_bus; - return vtd_bus; - } - } - } - return vtd_bus; -} - /* Do a context-cache device-selective invalidation. * @func_mask: FM field after shifting */ @@ -1130,6 +1298,11 @@ static void vtd_context_device_invalidate(IntelIOMMUState *s, VTD_PCI_FUNC(devfn_it)); vtd_as->context_cache_entry.context_cache_gen = 0; /* + * Do switch address space when needed, in case if the + * device passthrough bit is switched. + */ + vtd_switch_address_space(vtd_as); + /* * So a device is moving out of (or moving into) a * domain, a replay() suites here to notify all the * IOMMU_NOTIFIER_MAP registers about this change. @@ -1361,42 +1534,6 @@ static void vtd_handle_gcmd_sirtp(IntelIOMMUState *s) vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRTPS); } -static void vtd_switch_address_space(VTDAddressSpace *as) -{ - assert(as); - - trace_vtd_switch_address_space(pci_bus_num(as->bus), - VTD_PCI_SLOT(as->devfn), - VTD_PCI_FUNC(as->devfn), - as->iommu_state->dmar_enabled); - - /* Turn off first then on the other */ - if (as->iommu_state->dmar_enabled) { - memory_region_set_enabled(&as->sys_alias, false); - memory_region_set_enabled(&as->iommu, true); - } else { - memory_region_set_enabled(&as->iommu, false); - memory_region_set_enabled(&as->sys_alias, true); - } -} - -static void vtd_switch_address_space_all(IntelIOMMUState *s) -{ - GHashTableIter iter; - VTDBus *vtd_bus; - int i; - - g_hash_table_iter_init(&iter, s->vtd_as_by_busptr); - while (g_hash_table_iter_next(&iter, NULL, (void **)&vtd_bus)) { - for (i = 0; i < X86_IOMMU_PCI_DEVFN_MAX; i++) { - if (!vtd_bus->dev_as[i]) { - continue; - } - vtd_switch_address_space(vtd_bus->dev_as[i]); - } - } -} - /* Handle Translation Enable/Disable */ static void vtd_handle_gcmd_te(IntelIOMMUState *s, bool en) { @@ -2221,7 +2358,7 @@ static void vtd_mem_write(void *opaque, hwaddr addr, } static IOMMUTLBEntry vtd_iommu_translate(MemoryRegion *iommu, hwaddr addr, - bool is_write) + IOMMUAccessFlags flag) { VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu); IntelIOMMUState *s = vtd_as->iommu_state; @@ -2243,7 +2380,7 @@ static IOMMUTLBEntry vtd_iommu_translate(MemoryRegion *iommu, hwaddr addr, } vtd_do_iommu_translate(vtd_as, vtd_as->bus, vtd_as->devfn, addr, - is_write, &ret); + flag & IOMMU_WO, &ret); VTD_DPRINTF(MMU, "bus %"PRIu8 " slot %"PRIu8 " func %"PRIu8 " devfn %"PRIu8 " iova 0x%"PRIx64 " hpa 0x%"PRIx64, pci_bus_num(vtd_as->bus), @@ -2844,6 +2981,10 @@ static void vtd_init(IntelIOMMUState *s) s->ecap |= VTD_ECAP_DT; } + if (x86_iommu->pt_supported) { + s->ecap |= VTD_ECAP_PT; + } + if (s->caching_mode) { s->cap |= VTD_CAP_CM; } diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h index 29d67075f4..0e73a65bf2 100644 --- a/hw/i386/intel_iommu_internal.h +++ b/hw/i386/intel_iommu_internal.h @@ -187,6 +187,7 @@ /* Interrupt Remapping support */ #define VTD_ECAP_IR (1ULL << 3) #define VTD_ECAP_EIM (1ULL << 4) +#define VTD_ECAP_PT (1ULL << 6) #define VTD_ECAP_MHMV (15ULL << 20) /* CAP_REG */ diff --git a/hw/i386/trace-events b/hw/i386/trace-events index 04a6980800..72556dad48 100644 --- a/hw/i386/trace-events +++ b/hw/i386/trace-events @@ -38,6 +38,8 @@ vtd_page_walk_skip_perm(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"P vtd_page_walk_skip_reserve(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to rsrv set" vtd_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device %02x:%02x.%x switching address space (iommu enabled=%d)" vtd_as_unmap_whole(uint8_t bus, uint8_t slot, uint8_t fn, uint64_t iova, uint64_t size) "Device %02x:%02x.%x start 0x%"PRIx64" size 0x%"PRIx64 +vtd_translate_pt(uint16_t sid, uint64_t addr) "source id 0x%"PRIu16", iova 0x%"PRIx64 +vtd_pt_enable_fast_path(uint16_t sid, bool success) "sid 0x%"PRIu16" %d" # hw/i386/amd_iommu.c amdvi_evntlog_fail(uint64_t addr, uint32_t head) "error: fail to write at addr 0x%"PRIx64" + offset 0x%"PRIx32 diff --git a/hw/i386/x86-iommu.c b/hw/i386/x86-iommu.c index 23dcd3f039..293caf83ef 100644 --- a/hw/i386/x86-iommu.c +++ b/hw/i386/x86-iommu.c @@ -88,55 +88,23 @@ static void x86_iommu_realize(DeviceState *dev, Error **errp) x86_iommu_set_default(X86_IOMMU_DEVICE(dev)); } +static Property x86_iommu_properties[] = { + DEFINE_PROP_BOOL("intremap", X86IOMMUState, intr_supported, false), + DEFINE_PROP_BOOL("device-iotlb", X86IOMMUState, dt_supported, false), + DEFINE_PROP_BOOL("pt", X86IOMMUState, pt_supported, true), + DEFINE_PROP_END_OF_LIST(), +}; + static void x86_iommu_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); dc->realize = x86_iommu_realize; -} - -static bool x86_iommu_intremap_prop_get(Object *o, Error **errp) -{ - X86IOMMUState *s = X86_IOMMU_DEVICE(o); - return s->intr_supported; -} - -static void x86_iommu_intremap_prop_set(Object *o, bool value, Error **errp) -{ - X86IOMMUState *s = X86_IOMMU_DEVICE(o); - s->intr_supported = value; -} - -static bool x86_iommu_device_iotlb_prop_get(Object *o, Error **errp) -{ - X86IOMMUState *s = X86_IOMMU_DEVICE(o); - return s->dt_supported; -} - -static void x86_iommu_device_iotlb_prop_set(Object *o, bool value, Error **errp) -{ - X86IOMMUState *s = X86_IOMMU_DEVICE(o); - s->dt_supported = value; -} - -static void x86_iommu_instance_init(Object *o) -{ - X86IOMMUState *s = X86_IOMMU_DEVICE(o); - - /* By default, do not support IR */ - s->intr_supported = false; - object_property_add_bool(o, "intremap", x86_iommu_intremap_prop_get, - x86_iommu_intremap_prop_set, NULL); - s->dt_supported = false; - object_property_add_bool(o, "device-iotlb", - x86_iommu_device_iotlb_prop_get, - x86_iommu_device_iotlb_prop_set, - NULL); + dc->props = x86_iommu_properties; } static const TypeInfo x86_iommu_info = { .name = TYPE_X86_IOMMU_DEVICE, .parent = TYPE_SYS_BUS_DEVICE, - .instance_init = x86_iommu_instance_init, .instance_size = sizeof(X86IOMMUState), .class_init = x86_iommu_class_init, .class_size = sizeof(X86IOMMUClass), diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index 98bd683f31..9a3d769aa2 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -589,7 +589,15 @@ static uint64_t virtio_net_get_features(VirtIODevice *vdev, uint64_t features, if (!get_vhost_net(nc->peer)) { return features; } - return vhost_net_get_features(get_vhost_net(nc->peer), features); + features = vhost_net_get_features(get_vhost_net(nc->peer), features); + vdev->backend_features = features; + + if (n->mtu_bypass_backend && + (n->host_features & 1ULL << VIRTIO_NET_F_MTU)) { + features |= (1ULL << VIRTIO_NET_F_MTU); + } + + return features; } static uint64_t virtio_net_bad_features(VirtIODevice *vdev) @@ -640,6 +648,11 @@ static void virtio_net_set_features(VirtIODevice *vdev, uint64_t features) VirtIONet *n = VIRTIO_NET(vdev); int i; + if (n->mtu_bypass_backend && + !virtio_has_feature(vdev->backend_features, VIRTIO_NET_F_MTU)) { + features &= ~(1ULL << VIRTIO_NET_F_MTU); + } + virtio_net_set_multiqueue(n, virtio_has_feature(features, VIRTIO_NET_F_MQ)); @@ -2093,6 +2106,8 @@ static Property virtio_net_properties[] = { DEFINE_PROP_UINT16("rx_queue_size", VirtIONet, net_conf.rx_queue_size, VIRTIO_NET_RX_QUEUE_DEFAULT_SIZE), DEFINE_PROP_UINT16("host_mtu", VirtIONet, net_conf.mtu, 0), + DEFINE_PROP_BOOL("x-mtu-bypass-backend", VirtIONet, mtu_bypass_backend, + true), DEFINE_PROP_END_OF_LIST(), }; diff --git a/hw/pci-host/apb.c b/hw/pci-host/apb.c index f04104cdf2..326f5ef024 100644 --- a/hw/pci-host/apb.c +++ b/hw/pci-host/apb.c @@ -209,7 +209,7 @@ static AddressSpace *pbm_pci_dma_iommu(PCIBus *bus, void *opaque, int devfn) /* Called from RCU critical section */ static IOMMUTLBEntry pbm_translate_iommu(MemoryRegion *iommu, hwaddr addr, - bool is_write) + IOMMUAccessFlags flag) { IOMMUState *is = container_of(iommu, IOMMUState, iommu); hwaddr baseaddr, offset; diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c index 29c80bb3c8..0341bc069d 100644 --- a/hw/ppc/spapr_iommu.c +++ b/hw/ppc/spapr_iommu.c @@ -111,7 +111,7 @@ static void spapr_tce_free_table(uint64_t *table, int fd, uint32_t nb_table) /* Called from RCU critical section */ static IOMMUTLBEntry spapr_tce_translate_iommu(MemoryRegion *iommu, hwaddr addr, - bool is_write) + IOMMUAccessFlags flag) { sPAPRTCETable *tcet = container_of(iommu, sPAPRTCETable, iommu); uint64_t tce; diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c index 66a6fbeb8c..5651483781 100644 --- a/hw/s390x/s390-pci-bus.c +++ b/hw/s390x/s390-pci-bus.c @@ -357,7 +357,7 @@ out: } static IOMMUTLBEntry s390_translate_iommu(MemoryRegion *mr, hwaddr addr, - bool is_write) + IOMMUAccessFlags flag) { uint64_t pte; uint32_t flags; diff --git a/hw/s390x/s390-pci-inst.c b/hw/s390x/s390-pci-inst.c index 314a9cbad4..8bc7c98682 100644 --- a/hw/s390x/s390-pci-inst.c +++ b/hw/s390x/s390-pci-inst.c @@ -624,7 +624,7 @@ int rpcit_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2) mr = &iommu->iommu_mr; while (start < end) { - entry = mr->iommu_ops->translate(mr, start, 0); + entry = mr->iommu_ops->translate(mr, start, IOMMU_NONE); if (!entry.translated_addr) { pbdev->state = ZPCI_FS_ERROR; diff --git a/hw/vfio/common.c b/hw/vfio/common.c index a8f12eeb35..b9abe77f5a 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -502,7 +502,7 @@ static void vfio_listener_region_add(MemoryListener *listener, QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next); memory_region_register_iommu_notifier(giommu->iommu, &giommu->n); - memory_region_iommu_replay(giommu->iommu, &giommu->n, false); + memory_region_iommu_replay(giommu->iommu, &giommu->n); return; } diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index b87a176770..dde094abb4 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -162,11 +162,11 @@ fail: } static int process_message_reply(struct vhost_dev *dev, - VhostUserMsg msg) + const VhostUserMsg *msg) { VhostUserMsg msg_reply; - if ((msg.flags & VHOST_USER_NEED_REPLY_MASK) == 0) { + if ((msg->flags & VHOST_USER_NEED_REPLY_MASK) == 0) { return 0; } @@ -174,10 +174,10 @@ static int process_message_reply(struct vhost_dev *dev, return -1; } - if (msg_reply.request != msg.request) { + if (msg_reply.request != msg->request) { error_report("Received unexpected msg type." "Expected %d received %d", - msg.request, msg_reply.request); + msg->request, msg_reply.request); return -1; } @@ -324,7 +324,7 @@ static int vhost_user_set_mem_table(struct vhost_dev *dev, } if (reply_supported) { - return process_message_reply(dev, msg); + return process_message_reply(dev, &msg); } return 0; @@ -716,7 +716,7 @@ static int vhost_user_net_set_mtu(struct vhost_dev *dev, uint16_t mtu) /* If reply_ack supported, slave has to ack specified MTU is valid */ if (reply_supported) { - return process_message_reply(dev, msg); + return process_message_reply(dev, &msg); } return 0; diff --git a/include/exec/memory.h b/include/exec/memory.h index 99e0f54d86..bfdc685f24 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -185,8 +185,14 @@ struct MemoryRegionOps { typedef struct MemoryRegionIOMMUOps MemoryRegionIOMMUOps; struct MemoryRegionIOMMUOps { - /* Return a TLB entry that contains a given address. */ - IOMMUTLBEntry (*translate)(MemoryRegion *iommu, hwaddr addr, bool is_write); + /* + * Return a TLB entry that contains a given address. Flag should + * be the access permission of this translation operation. We can + * set flag to IOMMU_NONE to mean that we don't need any + * read/write permission checks, like, when for region replay. + */ + IOMMUTLBEntry (*translate)(MemoryRegion *iommu, hwaddr addr, + IOMMUAccessFlags flag); /* Returns minimum supported page size */ uint64_t (*get_min_page_size)(MemoryRegion *iommu); /* Called when IOMMU Notifier flag changed */ @@ -725,11 +731,8 @@ void memory_region_register_iommu_notifier(MemoryRegion *mr, * * @mr: the memory region to observe * @n: the notifier to which to replay iommu mappings - * @is_write: Whether to treat the replay as a translate "write" - * through the iommu */ -void memory_region_iommu_replay(MemoryRegion *mr, IOMMUNotifier *n, - bool is_write); +void memory_region_iommu_replay(MemoryRegion *mr, IOMMUNotifier *n); /** * memory_region_iommu_replay_all: replay existing IOMMU translations diff --git a/include/hw/compat.h b/include/hw/compat.h index 55b176507a..400c64b318 100644 --- a/include/hw/compat.h +++ b/include/hw/compat.h @@ -6,6 +6,14 @@ .driver = "pci-bridge",\ .property = "shpc",\ .value = "off",\ + },{\ + .driver = "intel-iommu",\ + .property = "pt",\ + .value = "off",\ + },{\ + .driver = "virtio-net-device",\ + .property = "x-mtu-bypass-backend",\ + .value = "off",\ }, #define HW_COMPAT_2_8 \ diff --git a/include/hw/i386/x86-iommu.h b/include/hw/i386/x86-iommu.h index 361c07cdc6..ef89c0c646 100644 --- a/include/hw/i386/x86-iommu.h +++ b/include/hw/i386/x86-iommu.h @@ -74,6 +74,7 @@ struct X86IOMMUState { SysBusDevice busdev; bool intr_supported; /* Whether vIOMMU supports IR */ bool dt_supported; /* Whether vIOMMU supports DT */ + bool pt_supported; /* Whether vIOMMU supports pass-through */ IommuType type; /* IOMMU type - AMD/Intel */ QLIST_HEAD(, IEC_Notifier) iec_notifiers; /* IEC notify list */ }; diff --git a/include/hw/virtio/virtio-net.h b/include/hw/virtio/virtio-net.h index 1eec9a2da3..602b4868d4 100644 --- a/include/hw/virtio/virtio-net.h +++ b/include/hw/virtio/virtio-net.h @@ -97,6 +97,7 @@ typedef struct VirtIONet { QEMUTimer *announce_timer; int announce_counter; bool needs_vnet_hdr_swap; + bool mtu_bypass_backend; } VirtIONet; void virtio_net_set_netclient_name(VirtIONet *n, const char *name, diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h index 7b6edbafd7..80c45c321e 100644 --- a/include/hw/virtio/virtio.h +++ b/include/hw/virtio/virtio.h @@ -79,6 +79,7 @@ struct VirtIODevice uint16_t queue_sel; uint64_t guest_features; uint64_t host_features; + uint64_t backend_features; size_t config_len; void *config; uint16_t config_vector; @@ -1620,8 +1620,7 @@ uint64_t memory_region_iommu_get_min_page_size(MemoryRegion *mr) return TARGET_PAGE_SIZE; } -void memory_region_iommu_replay(MemoryRegion *mr, IOMMUNotifier *n, - bool is_write) +void memory_region_iommu_replay(MemoryRegion *mr, IOMMUNotifier *n) { hwaddr addr, granularity; IOMMUTLBEntry iotlb; @@ -1635,7 +1634,7 @@ void memory_region_iommu_replay(MemoryRegion *mr, IOMMUNotifier *n, granularity = memory_region_iommu_get_min_page_size(mr); for (addr = 0; addr < memory_region_size(mr); addr += granularity) { - iotlb = mr->iommu_ops->translate(mr, addr, is_write); + iotlb = mr->iommu_ops->translate(mr, addr, IOMMU_NONE); if (iotlb.perm != IOMMU_NONE) { n->notify(n, &iotlb); } @@ -1653,7 +1652,7 @@ void memory_region_iommu_replay_all(MemoryRegion *mr) IOMMUNotifier *notifier; IOMMU_NOTIFIER_FOREACH(notifier, mr) { - memory_region_iommu_replay(mr, notifier, false); + memory_region_iommu_replay(mr, notifier); } } diff --git a/tests/acpi-test-data/pc/SRAT.memhp b/tests/acpi-test-data/pc/SRAT.memhp Binary files differindex a7dddf7760..e508b4ae3c 100644 --- a/tests/acpi-test-data/pc/SRAT.memhp +++ b/tests/acpi-test-data/pc/SRAT.memhp diff --git a/tests/acpi-test-data/q35/SRAT.memhp b/tests/acpi-test-data/q35/SRAT.memhp Binary files differindex a7dddf7760..e508b4ae3c 100644 --- a/tests/acpi-test-data/q35/SRAT.memhp +++ b/tests/acpi-test-data/q35/SRAT.memhp |