author     Peter Maydell <peter.maydell@linaro.org>    2017-02-20 09:53:59 +0000
committer  Peter Maydell <peter.maydell@linaro.org>    2017-02-20 09:53:59 +0000
commit     d514cfd763b271b4e97a9fc6adaabc8fd50084ab (patch)
tree       146f1c9b310813894b79976e28a092b5cbf2ed12
parent     ad584d37f2a86b392c25f3f00cc1f1532676c2d1 (diff)
parent     7e58326ad7e79b8c5dbcc6f24e9dc1523d84c11b (diff)
Merge remote-tracking branch 'remotes/mst/tags/for_upstream' into staging
virtio, pci: fixes, features
- virtio now uses region caches for performance
- IOMMU support for IOTLBs
- misc fixes
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
# gpg: Signature made Fri 17 Feb 2017 19:53:02 GMT
# gpg: using RSA key 0x281F0DB8D28D5469
# gpg: Good signature from "Michael S. Tsirkin <mst@kernel.org>"
# gpg: aka "Michael S. Tsirkin <mst@redhat.com>"
# Primary key fingerprint: 0270 606B 6F3C DF3D 0B17 0970 C350 3912 AFBE 8E67
# Subkey fingerprint: 5D09 FD08 71C8 F85B 94CA 8A0D 281F 0DB8 D28D 5469
* remotes/mst/tags/for_upstream: (23 commits)
intel_iommu: vtd_slpt_level_shift check level
intel_iommu: convert dbg macros to trace for trans
intel_iommu: convert dbg macros to traces for inv
intel_iommu: renaming gpa to iova where proper
intel_iommu: simplify irq region translation
intel_iommu: add "caching-mode" option
vfio: allow to notify unmap for very large region
vfio: introduce vfio_get_vaddr()
vfio: trace map/unmap for notify as well
pcie: simplify pcie_add_capability()
virtio: Fix no interrupt when not creating msi controller
virtio: use VRingMemoryRegionCaches for avail and used rings
virtio: check for vring setup in virtio_queue_update_used_idx
virtio: use VRingMemoryRegionCaches for descriptor ring
virtio: add MemoryListener to cache ring translations
virtio: use MemoryRegionCache to access descriptors
exec: make address_space_cache_destroy idempotent
virtio: use address_space_map/unmap to access descriptors
virtio: add virtio_*_phys_cached
memory: make memory_listener_unregister idempotent
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
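
For context, the "caching-mode" knob this series adds is a property of the
emulated intel-iommu device (see the DEFINE_PROP_BOOL hunk below). A minimal
sketch of a command line exercising it together with a VFIO device follows;
the q35/irqchip/intremap settings and the host BDF are illustrative
assumptions, not taken from this log:

    qemu-system-x86_64 -machine q35,accel=kvm,kernel-irqchip=split \
        -device intel-iommu,intremap=on,caching-mode=on \
        -device vfio-pci,host=01:00.0 \
        ...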
-rw-r--r-- | docs/nvdimm.txt                   | 124
-rw-r--r-- | exec.c                            |   1
-rw-r--r-- | hw/block/dataplane/virtio-blk.c   |   4
-rw-r--r-- | hw/block/virtio-blk.c             |  12
-rw-r--r-- | hw/i386/intel_iommu.c             | 238
-rw-r--r-- | hw/i386/intel_iommu_internal.h    |   1
-rw-r--r-- | hw/i386/trace-events              |  28
-rw-r--r-- | hw/net/virtio-net.c               |  14
-rw-r--r-- | hw/pci/pcie.c                     |  23
-rw-r--r-- | hw/scsi/virtio-scsi-dataplane.c   |  14
-rw-r--r-- | hw/scsi/virtio-scsi.c             |  14
-rw-r--r-- | hw/vfio/common.c                  |  65
-rw-r--r-- | hw/vfio/trace-events              |   2
-rw-r--r-- | hw/virtio/virtio.c                | 364
-rw-r--r-- | include/exec/memory.h             |   2
-rw-r--r-- | include/hw/i386/intel_iommu.h     |   2
-rw-r--r-- | include/hw/virtio/virtio-access.h |  52
-rw-r--r-- | include/hw/virtio/virtio-blk.h    |   2
-rw-r--r-- | include/hw/virtio/virtio-scsi.h   |   6
-rw-r--r-- | include/hw/virtio/virtio.h        |   5
-rw-r--r-- | memory.c                          |   5
21 files changed, 702 insertions, 276 deletions
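
Most of the virtio churn below replaces per-access address_space_read/write
calls with MemoryRegionCache windows that are republished under RCU whenever
the rings move. A condensed sketch of that pattern, using only calls that
appear in this diff (the function itself and its arguments are illustrative,
not a real QEMU entry point):

    /* Illustrative sketch: set up and read through a cached window of the
     * used ring.  "n" is the queue index; error handling is omitted. */
    static void ring_cache_sketch(VirtIODevice *vdev, VirtQueue *vq, int n)
    {
        VRingMemoryRegionCaches *new = g_new0(VRingMemoryRegionCaches, 1);
        VRingMemoryRegionCaches *old = vq->vring.caches;
        VRingMemoryRegionCaches *caches;
        uint16_t used_idx;

        /* Translate the used ring once, not on every guest-memory access. */
        address_space_cache_init(&new->used, vdev->dma_as, vq->vring.used,
                                 virtio_queue_get_used_size(vdev, n), true);

        /* Publish; readers still holding the old caches stay safe until the
         * RCU grace period ends, then the old mapping is torn down. */
        atomic_rcu_set(&vq->vring.caches, new);
        if (old) {
            call_rcu(old, virtio_free_region_cache, rcu);
        }

        /* Readers pin the caches inside an RCU read-side section. */
        rcu_read_lock();
        caches = atomic_rcu_read(&vq->vring.caches);
        used_idx = virtio_lduw_phys_cached(vdev, &caches->used,
                                           offsetof(VRingUsed, idx));
        rcu_read_unlock();
        (void)used_idx;
    }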
diff --git a/docs/nvdimm.txt b/docs/nvdimm.txt
new file mode 100644
index 0000000000..2d9f8c0e8c
--- /dev/null
+++ b/docs/nvdimm.txt
@@ -0,0 +1,124 @@
+QEMU Virtual NVDIMM
+===================
+
+This document explains the usage of virtual NVDIMM (vNVDIMM) feature
+which is available since QEMU v2.6.0.
+
+The current QEMU only implements the persistent memory mode of vNVDIMM
+device and not the block window mode.
+
+Basic Usage
+-----------
+
+The storage of a vNVDIMM device in QEMU is provided by the memory
+backend (i.e. memory-backend-file and memory-backend-ram). A simple
+way to create a vNVDIMM device at startup time is done via the
+following command line options:
+
+ -machine pc,nvdimm
+ -m $RAM_SIZE,slots=$N,maxmem=$MAX_SIZE
+ -object memory-backend-file,id=mem1,share=on,mem-path=$PATH,size=$NVDIMM_SIZE
+ -device nvdimm,id=nvdimm1,memdev=mem1
+
+Where,
+
+ - the "nvdimm" machine option enables vNVDIMM feature.
+
+ - "slots=$N" should be equal to or larger than the total amount of
+   normal RAM devices and vNVDIMM devices, e.g. $N should be >= 2 here.
+
+ - "maxmem=$MAX_SIZE" should be equal to or larger than the total size
+   of normal RAM devices and vNVDIMM devices, e.g. $MAX_SIZE should be
+   >= $RAM_SIZE + $NVDIMM_SIZE here.
+
+ - "object memory-backend-file,id=mem1,share=on,mem-path=$PATH,size=$NVDIMM_SIZE"
+   creates a backend storage of size $NVDIMM_SIZE on a file $PATH. All
+   accesses to the virtual NVDIMM device go to the file $PATH.
+
+   "share=on/off" controls the visibility of guest writes. If
+   "share=on", then guest writes will be applied to the backend
+   file. If another guest uses the same backend file with option
+   "share=on", then above writes will be visible to it as well. If
+   "share=off", then guest writes won't be applied to the backend
+   file and thus will be invisible to other guests.
+
+ - "device nvdimm,id=nvdimm1,memdev=mem1" creates a virtual NVDIMM
+   device whose storage is provided by above memory backend device.
+
+Multiple vNVDIMM devices can be created if multiple pairs of "-object"
+and "-device" are provided.
+
+For above command line options, if the guest OS has the proper NVDIMM
+driver, it should be able to detect a NVDIMM device which is in the
+persistent memory mode and whose size is $NVDIMM_SIZE.
+
+Note:
+
+1. Prior to QEMU v2.8.0, if memory-backend-file is used and the actual
+   backend file size is not equal to the size given by "size" option,
+   QEMU will truncate the backend file by ftruncate(2), which will
+   corrupt the existing data in the backend file, especially for the
+   shrink case.
+
+   QEMU v2.8.0 and later check the backend file size and the "size"
+   option. If they do not match, QEMU will report errors and abort in
+   order to avoid the data corruption.
+
+2. QEMU v2.6.0 only puts a basic alignment requirement on the "size"
+   option of memory-backend-file, e.g. 4KB alignment on x86. However,
+   QEMU v.2.7.0 puts an additional alignment requirement, which may
+   require a larger value than the basic one, e.g. 2MB on x86. This
+   change breaks the usage of memory-backend-file that only satisfies
+   the basic alignment.
+
+   QEMU v2.8.0 and later remove the additional alignment on non-s390x
+   architectures, so the broken memory-backend-file can work again.
+
+Label
+-----
+
+QEMU v2.7.0 and later implement the label support for vNVDIMM devices.
+To enable label on vNVDIMM devices, users can simply add
+"label-size=$SZ" option to "-device nvdimm", e.g.
+
+ -device nvdimm,id=nvdimm1,memdev=mem1,label-size=128K
+
+Note:
+
+1. The minimal label size is 128KB.
+
+2. QEMU v2.7.0 and later store labels at the end of backend storage.
+   If a memory backend file, which was previously used as the backend
+   of a vNVDIMM device without labels, is now used for a vNVDIMM
+   device with label, the data in the label area at the end of file
+   will be inaccessible to the guest. If any useful data (e.g. the
+   meta-data of the file system) was stored there, the latter usage
+   may result guest data corruption (e.g. breakage of guest file
+   system).
+
+Hotplug
+-------
+
+QEMU v2.8.0 and later implement the hotplug support for vNVDIMM
+devices. Similarly to the RAM hotplug, the vNVDIMM hotplug is
+accomplished by two monitor commands "object_add" and "device_add".
+
+For example, the following commands add another 4GB vNVDIMM device to
+the guest:
+
+ (qemu) object_add memory-backend-file,id=mem2,share=on,mem-path=new_nvdimm.img,size=4G
+ (qemu) device_add nvdimm,id=nvdimm2,memdev=mem2
+
+Note:
+
+1. Each hotplugged vNVDIMM device consumes one memory slot. Users
+   should always ensure the memory option "-m ...,slots=N" specifies
+   enough number of slots, i.e.
+     N >= number of RAM devices +
+          number of statically plugged vNVDIMM devices +
+          number of hotplugged vNVDIMM devices
+
+2. The similar is required for the memory option "-m ...,maxmem=M", i.e.
+     M >= size of RAM devices +
+          size of statically plugged vNVDIMM devices +
+          size of hotplugged vNVDIMM devices
diff --git a/exec.c b/exec.c
@@ -3166,6 +3166,7 @@ void address_space_cache_destroy(MemoryRegionCache *cache)
         xen_invalidate_map_cache_entry(cache->ptr);
     }
     memory_region_unref(cache->mr);
+    cache->mr = NULL;
 }
 
 /* Called from RCU critical section.  This function has the same
diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
index d1f9f63eaf..5556f0e64e 100644
--- a/hw/block/dataplane/virtio-blk.c
+++ b/hw/block/dataplane/virtio-blk.c
@@ -147,7 +147,7 @@ void virtio_blk_data_plane_destroy(VirtIOBlockDataPlane *s)
     g_free(s);
 }
 
-static void virtio_blk_data_plane_handle_output(VirtIODevice *vdev,
+static bool virtio_blk_data_plane_handle_output(VirtIODevice *vdev,
                                                 VirtQueue *vq)
 {
     VirtIOBlock *s = (VirtIOBlock *)vdev;
@@ -155,7 +155,7 @@ static void virtio_blk_data_plane_handle_output(VirtIODevice *vdev,
     assert(s->dataplane);
     assert(s->dataplane_started);
 
-    virtio_blk_handle_vq(s, vq);
+    return virtio_blk_handle_vq(s, vq);
 }
 
 /* Context: QEMU global mutex held */
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index 702eda863e..baaa19593f 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -581,10 +581,11 @@ static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
     return 0;
 }
 
-void virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq)
+bool virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq)
 {
     VirtIOBlockReq *req;
     MultiReqBuffer mrb = {};
+    bool progress = false;
 
     blk_io_plug(s->blk);
 
@@ -592,6 +593,7 @@ void virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq)
         virtio_queue_set_notification(vq, 0);
 
         while ((req = virtio_blk_get_request(s, vq))) {
+            progress = true;
             if (virtio_blk_handle_request(req, &mrb)) {
                 virtqueue_detach_element(req->vq, &req->elem, 0);
                 virtio_blk_free_request(req);
@@ -607,6 +609,12 @@ void virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq)
     }
 
     blk_io_unplug(s->blk);
+    return progress;
+}
+
+static void virtio_blk_handle_output_do(VirtIOBlock *s, VirtQueue *vq)
+{
+    virtio_blk_handle_vq(s, vq);
 }
 
 static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq)
@@ -622,7 +630,7 @@ static void
virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq) return; } } - virtio_blk_handle_vq(s, vq); + virtio_blk_handle_output_do(s, vq); } static void virtio_blk_dma_restart_bh(void *opaque) diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 3270fb9162..22d8226e43 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -35,6 +35,7 @@ #include "sysemu/kvm.h" #include "hw/i386/apic_internal.h" #include "kvm_i386.h" +#include "trace.h" /*#define DEBUG_INTEL_IOMMU*/ #ifdef DEBUG_INTEL_IOMMU @@ -167,6 +168,7 @@ static gboolean vtd_hash_remove_by_domain(gpointer key, gpointer value, /* The shift of an addr for a certain level of paging structure */ static inline uint32_t vtd_slpt_level_shift(uint32_t level) { + assert(level != 0); return VTD_PAGE_SHIFT_4K + (level - 1) * VTD_SL_LEVEL_BITS; } @@ -259,11 +261,9 @@ static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id, uint64_t *key = g_malloc(sizeof(*key)); uint64_t gfn = vtd_get_iotlb_gfn(addr, level); - VTD_DPRINTF(CACHE, "update iotlb sid 0x%"PRIx16 " gpa 0x%"PRIx64 - " slpte 0x%"PRIx64 " did 0x%"PRIx16, source_id, addr, slpte, - domain_id); + trace_vtd_iotlb_page_update(source_id, addr, slpte, domain_id); if (g_hash_table_size(s->iotlb) >= VTD_IOTLB_MAX_SIZE) { - VTD_DPRINTF(CACHE, "iotlb exceeds size limit, forced to reset"); + trace_vtd_iotlb_reset("iotlb exceeds size limit"); vtd_reset_iotlb(s); } @@ -474,22 +474,19 @@ static void vtd_handle_inv_queue_error(IntelIOMMUState *s) /* Set the IWC field and try to generate an invalidation completion interrupt */ static void vtd_generate_completion_event(IntelIOMMUState *s) { - VTD_DPRINTF(INV, "completes an invalidation wait command with " - "Interrupt Flag"); if (vtd_get_long_raw(s, DMAR_ICS_REG) & VTD_ICS_IWC) { - VTD_DPRINTF(INV, "there is a previous interrupt condition to be " - "serviced by software, " - "new invalidation event is not generated"); + trace_vtd_inv_desc_wait_irq("One pending, skip current"); return; } vtd_set_clear_mask_long(s, DMAR_ICS_REG, 0, VTD_ICS_IWC); vtd_set_clear_mask_long(s, DMAR_IECTL_REG, 0, VTD_IECTL_IP); if (vtd_get_long_raw(s, DMAR_IECTL_REG) & VTD_IECTL_IM) { - VTD_DPRINTF(INV, "IM filed in IECTL_REG is set, new invalidation " - "event is not generated"); + trace_vtd_inv_desc_wait_irq("IM in IECTL_REG is set, " + "new event not generated"); return; } else { /* Generate the interrupt event */ + trace_vtd_inv_desc_wait_irq("Generating complete event"); vtd_generate_interrupt(s, DMAR_IEADDR_REG, DMAR_IEDATA_REG); vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0); } @@ -507,8 +504,7 @@ static int vtd_get_root_entry(IntelIOMMUState *s, uint8_t index, addr = s->root + index * sizeof(*re); if (dma_memory_read(&address_space_memory, addr, re, sizeof(*re))) { - VTD_DPRINTF(GENERAL, "error: fail to access root-entry at 0x%"PRIx64 - " + %"PRIu8, s->root, index); + trace_vtd_re_invalid(re->rsvd, re->val); re->val = 0; return -VTD_FR_ROOT_TABLE_INV; } @@ -526,15 +522,10 @@ static int vtd_get_context_entry_from_root(VTDRootEntry *root, uint8_t index, { dma_addr_t addr; - if (!vtd_root_entry_present(root)) { - VTD_DPRINTF(GENERAL, "error: root-entry is not present"); - return -VTD_FR_ROOT_ENTRY_P; - } + /* we have checked that root entry is present */ addr = (root->val & VTD_ROOT_ENTRY_CTP) + index * sizeof(*ce); if (dma_memory_read(&address_space_memory, addr, ce, sizeof(*ce))) { - VTD_DPRINTF(GENERAL, "error: fail to access context-entry at 0x%"PRIx64 - " + %"PRIu8, - (uint64_t)(root->val & VTD_ROOT_ENTRY_CTP), index); + 
trace_vtd_re_invalid(root->rsvd, root->val); return -VTD_FR_CONTEXT_TABLE_INV; } ce->lo = le64_to_cpu(ce->lo); @@ -575,12 +566,12 @@ static uint64_t vtd_get_slpte(dma_addr_t base_addr, uint32_t index) return slpte; } -/* Given a gpa and the level of paging structure, return the offset of current - * level. +/* Given an iova and the level of paging structure, return the offset + * of current level. */ -static inline uint32_t vtd_gpa_level_offset(uint64_t gpa, uint32_t level) +static inline uint32_t vtd_iova_level_offset(uint64_t iova, uint32_t level) { - return (gpa >> vtd_slpt_level_shift(level)) & + return (iova >> vtd_slpt_level_shift(level)) & ((1ULL << VTD_SL_LEVEL_BITS) - 1); } @@ -628,12 +619,12 @@ static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, uint32_t level) } } -/* Given the @gpa, get relevant @slptep. @slpte_level will be the last level +/* Given the @iova, get relevant @slptep. @slpte_level will be the last level * of the translation, can be used for deciding the size of large page. */ -static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t gpa, bool is_write, - uint64_t *slptep, uint32_t *slpte_level, - bool *reads, bool *writes) +static int vtd_iova_to_slpte(VTDContextEntry *ce, uint64_t iova, bool is_write, + uint64_t *slptep, uint32_t *slpte_level, + bool *reads, bool *writes) { dma_addr_t addr = vtd_get_slpt_base_from_context(ce); uint32_t level = vtd_get_level_from_context_entry(ce); @@ -642,11 +633,11 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t gpa, bool is_write, uint32_t ce_agaw = vtd_get_agaw_from_context_entry(ce); uint64_t access_right_check; - /* Check if @gpa is above 2^X-1, where X is the minimum of MGAW in CAP_REG - * and AW in context-entry. + /* Check if @iova is above 2^X-1, where X is the minimum of MGAW + * in CAP_REG and AW in context-entry. */ - if (gpa & ~((1ULL << MIN(ce_agaw, VTD_MGAW)) - 1)) { - VTD_DPRINTF(GENERAL, "error: gpa 0x%"PRIx64 " exceeds limits", gpa); + if (iova & ~((1ULL << MIN(ce_agaw, VTD_MGAW)) - 1)) { + VTD_DPRINTF(GENERAL, "error: iova 0x%"PRIx64 " exceeds limits", iova); return -VTD_FR_ADDR_BEYOND_MGAW; } @@ -654,13 +645,13 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t gpa, bool is_write, access_right_check = is_write ? VTD_SL_W : VTD_SL_R; while (true) { - offset = vtd_gpa_level_offset(gpa, level); + offset = vtd_iova_level_offset(iova, level); slpte = vtd_get_slpte(addr, offset); if (slpte == (uint64_t)-1) { VTD_DPRINTF(GENERAL, "error: fail to access second-level paging " - "entry at level %"PRIu32 " for gpa 0x%"PRIx64, - level, gpa); + "entry at level %"PRIu32 " for iova 0x%"PRIx64, + level, iova); if (level == vtd_get_level_from_context_entry(ce)) { /* Invalid programming of context-entry */ return -VTD_FR_CONTEXT_ENTRY_INV; @@ -672,8 +663,8 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t gpa, bool is_write, *writes = (*writes) && (slpte & VTD_SL_W); if (!(slpte & access_right_check)) { VTD_DPRINTF(GENERAL, "error: lack of %s permission for " - "gpa 0x%"PRIx64 " slpte 0x%"PRIx64, - (is_write ? "write" : "read"), gpa, slpte); + "iova 0x%"PRIx64 " slpte 0x%"PRIx64, + (is_write ? "write" : "read"), iova, slpte); return is_write ? -VTD_FR_WRITE : -VTD_FR_READ; } if (vtd_slpte_nonzero_rsvd(slpte, level)) { @@ -706,12 +697,11 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, } if (!vtd_root_entry_present(&re)) { - VTD_DPRINTF(GENERAL, "error: root-entry #%"PRIu8 " is not present", - bus_num); + /* Not error - it's okay we don't have root entry. 
*/ + trace_vtd_re_not_present(bus_num); return -VTD_FR_ROOT_ENTRY_P; } else if (re.rsvd || (re.val & VTD_ROOT_ENTRY_RSVD)) { - VTD_DPRINTF(GENERAL, "error: non-zero reserved field in root-entry " - "hi 0x%"PRIx64 " lo 0x%"PRIx64, re.rsvd, re.val); + trace_vtd_re_invalid(re.rsvd, re.val); return -VTD_FR_ROOT_ENTRY_RSVD; } @@ -721,22 +711,17 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, } if (!vtd_context_entry_present(ce)) { - VTD_DPRINTF(GENERAL, - "error: context-entry #%"PRIu8 "(bus #%"PRIu8 ") " - "is not present", devfn, bus_num); + /* Not error - it's okay we don't have context entry. */ + trace_vtd_ce_not_present(bus_num, devfn); return -VTD_FR_CONTEXT_ENTRY_P; } else if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) || (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) { - VTD_DPRINTF(GENERAL, - "error: non-zero reserved field in context-entry " - "hi 0x%"PRIx64 " lo 0x%"PRIx64, ce->hi, ce->lo); + trace_vtd_ce_invalid(ce->hi, ce->lo); return -VTD_FR_CONTEXT_ENTRY_RSVD; } /* Check if the programming of context-entry is valid */ if (!vtd_is_level_supported(s, vtd_get_level_from_context_entry(ce))) { - VTD_DPRINTF(GENERAL, "error: unsupported Address Width value in " - "context-entry hi 0x%"PRIx64 " lo 0x%"PRIx64, - ce->hi, ce->lo); + trace_vtd_ce_invalid(ce->hi, ce->lo); return -VTD_FR_CONTEXT_ENTRY_INV; } else { switch (ce->lo & VTD_CONTEXT_ENTRY_TT) { @@ -745,9 +730,7 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, case VTD_CONTEXT_TT_DEV_IOTLB: break; default: - VTD_DPRINTF(GENERAL, "error: unsupported Translation Type in " - "context-entry hi 0x%"PRIx64 " lo 0x%"PRIx64, - ce->hi, ce->lo); + trace_vtd_ce_invalid(ce->hi, ce->lo); return -VTD_FR_CONTEXT_ENTRY_INV; } } @@ -818,34 +801,17 @@ static void vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, bool writes = true; VTDIOTLBEntry *iotlb_entry; - /* Check if the request is in interrupt address range */ - if (vtd_is_interrupt_addr(addr)) { - if (is_write) { - /* FIXME: since we don't know the length of the access here, we - * treat Non-DWORD length write requests without PASID as - * interrupt requests, too. Withoud interrupt remapping support, - * we just use 1:1 mapping. - */ - VTD_DPRINTF(MMU, "write request to interrupt address " - "gpa 0x%"PRIx64, addr); - entry->iova = addr & VTD_PAGE_MASK_4K; - entry->translated_addr = addr & VTD_PAGE_MASK_4K; - entry->addr_mask = ~VTD_PAGE_MASK_4K; - entry->perm = IOMMU_WO; - return; - } else { - VTD_DPRINTF(GENERAL, "error: read request from interrupt address " - "gpa 0x%"PRIx64, addr); - vtd_report_dmar_fault(s, source_id, addr, VTD_FR_READ, is_write); - return; - } - } + /* + * We have standalone memory region for interrupt addresses, we + * should never receive translation requests in this region. 
+ */ + assert(!vtd_is_interrupt_addr(addr)); + /* Try to fetch slpte form IOTLB */ iotlb_entry = vtd_lookup_iotlb(s, source_id, addr); if (iotlb_entry) { - VTD_DPRINTF(CACHE, "hit iotlb sid 0x%"PRIx16 " gpa 0x%"PRIx64 - " slpte 0x%"PRIx64 " did 0x%"PRIx16, source_id, addr, - iotlb_entry->slpte, iotlb_entry->domain_id); + trace_vtd_iotlb_page_hit(source_id, addr, iotlb_entry->slpte, + iotlb_entry->domain_id); slpte = iotlb_entry->slpte; reads = iotlb_entry->read_flags; writes = iotlb_entry->write_flags; @@ -854,10 +820,9 @@ static void vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, } /* Try to fetch context-entry from cache first */ if (cc_entry->context_cache_gen == s->context_cache_gen) { - VTD_DPRINTF(CACHE, "hit context-cache bus %d devfn %d " - "(hi %"PRIx64 " lo %"PRIx64 " gen %"PRIu32 ")", - bus_num, devfn, cc_entry->context_entry.hi, - cc_entry->context_entry.lo, cc_entry->context_cache_gen); + trace_vtd_iotlb_cc_hit(bus_num, devfn, cc_entry->context_entry.hi, + cc_entry->context_entry.lo, + cc_entry->context_cache_gen); ce = cc_entry->context_entry; is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD; } else { @@ -866,30 +831,26 @@ static void vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, if (ret_fr) { ret_fr = -ret_fr; if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) { - VTD_DPRINTF(FLOG, "fault processing is disabled for DMA " - "requests through this context-entry " - "(with FPD Set)"); + trace_vtd_fault_disabled(); } else { vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write); } return; } /* Update context-cache */ - VTD_DPRINTF(CACHE, "update context-cache bus %d devfn %d " - "(hi %"PRIx64 " lo %"PRIx64 " gen %"PRIu32 "->%"PRIu32 ")", - bus_num, devfn, ce.hi, ce.lo, - cc_entry->context_cache_gen, s->context_cache_gen); + trace_vtd_iotlb_cc_update(bus_num, devfn, ce.hi, ce.lo, + cc_entry->context_cache_gen, + s->context_cache_gen); cc_entry->context_entry = ce; cc_entry->context_cache_gen = s->context_cache_gen; } - ret_fr = vtd_gpa_to_slpte(&ce, addr, is_write, &slpte, &level, - &reads, &writes); + ret_fr = vtd_iova_to_slpte(&ce, addr, is_write, &slpte, &level, + &reads, &writes); if (ret_fr) { ret_fr = -ret_fr; if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) { - VTD_DPRINTF(FLOG, "fault processing is disabled for DMA requests " - "through this context-entry (with FPD Set)"); + trace_vtd_fault_disabled(); } else { vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write); } @@ -939,6 +900,7 @@ static void vtd_interrupt_remap_table_setup(IntelIOMMUState *s) static void vtd_context_global_invalidate(IntelIOMMUState *s) { + trace_vtd_inv_desc_cc_global(); s->context_cache_gen++; if (s->context_cache_gen == VTD_CONTEXT_CACHE_GEN_MAX) { vtd_reset_context_cache(s); @@ -978,9 +940,11 @@ static void vtd_context_device_invalidate(IntelIOMMUState *s, uint16_t mask; VTDBus *vtd_bus; VTDAddressSpace *vtd_as; - uint16_t devfn; + uint8_t bus_n, devfn; uint16_t devfn_it; + trace_vtd_inv_desc_cc_devices(source_id, func_mask); + switch (func_mask & 3) { case 0: mask = 0; /* No bits in the SID field masked */ @@ -996,16 +960,16 @@ static void vtd_context_device_invalidate(IntelIOMMUState *s, break; } mask = ~mask; - VTD_DPRINTF(INV, "device-selective invalidation source 0x%"PRIx16 - " mask %"PRIu16, source_id, mask); - vtd_bus = vtd_find_as_from_bus_num(s, VTD_SID_TO_BUS(source_id)); + + bus_n = VTD_SID_TO_BUS(source_id); + vtd_bus = vtd_find_as_from_bus_num(s, bus_n); if (vtd_bus) { devfn = VTD_SID_TO_DEVFN(source_id); for (devfn_it = 0; devfn_it < 
X86_IOMMU_PCI_DEVFN_MAX; ++devfn_it) { vtd_as = vtd_bus->dev_as[devfn_it]; if (vtd_as && ((devfn_it & mask) == (devfn & mask))) { - VTD_DPRINTF(INV, "invalidate context-cahce of devfn 0x%"PRIx16, - devfn_it); + trace_vtd_inv_desc_cc_device(bus_n, VTD_PCI_SLOT(devfn_it), + VTD_PCI_FUNC(devfn_it)); vtd_as->context_cache_entry.context_cache_gen = 0; } } @@ -1046,6 +1010,7 @@ static uint64_t vtd_context_cache_invalidate(IntelIOMMUState *s, uint64_t val) static void vtd_iotlb_global_invalidate(IntelIOMMUState *s) { + trace_vtd_iotlb_reset("global invalidation recved"); vtd_reset_iotlb(s); } @@ -1318,9 +1283,7 @@ static bool vtd_process_wait_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc) { if ((inv_desc->hi & VTD_INV_DESC_WAIT_RSVD_HI) || (inv_desc->lo & VTD_INV_DESC_WAIT_RSVD_LO)) { - VTD_DPRINTF(GENERAL, "error: non-zero reserved field in Invalidation " - "Wait Descriptor hi 0x%"PRIx64 " lo 0x%"PRIx64, - inv_desc->hi, inv_desc->lo); + trace_vtd_inv_desc_wait_invalid(inv_desc->hi, inv_desc->lo); return false; } if (inv_desc->lo & VTD_INV_DESC_WAIT_SW) { @@ -1332,21 +1295,18 @@ static bool vtd_process_wait_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc) /* FIXME: need to be masked with HAW? */ dma_addr_t status_addr = inv_desc->hi; - VTD_DPRINTF(INV, "status data 0x%x, status addr 0x%"PRIx64, - status_data, status_addr); + trace_vtd_inv_desc_wait_sw(status_addr, status_data); status_data = cpu_to_le32(status_data); if (dma_memory_write(&address_space_memory, status_addr, &status_data, sizeof(status_data))) { - VTD_DPRINTF(GENERAL, "error: fail to perform a coherent write"); + trace_vtd_inv_desc_wait_write_fail(inv_desc->hi, inv_desc->lo); return false; } } else if (inv_desc->lo & VTD_INV_DESC_WAIT_IF) { /* Interrupt flag */ - VTD_DPRINTF(INV, "Invalidation Wait Descriptor interrupt completion"); vtd_generate_completion_event(s); } else { - VTD_DPRINTF(GENERAL, "error: invalid Invalidation Wait Descriptor: " - "hi 0x%"PRIx64 " lo 0x%"PRIx64, inv_desc->hi, inv_desc->lo); + trace_vtd_inv_desc_wait_invalid(inv_desc->hi, inv_desc->lo); return false; } return true; @@ -1355,30 +1315,29 @@ static bool vtd_process_wait_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc) static bool vtd_process_context_cache_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc) { + uint16_t sid, fmask; + if ((inv_desc->lo & VTD_INV_DESC_CC_RSVD) || inv_desc->hi) { - VTD_DPRINTF(GENERAL, "error: non-zero reserved field in Context-cache " - "Invalidate Descriptor"); + trace_vtd_inv_desc_cc_invalid(inv_desc->hi, inv_desc->lo); return false; } switch (inv_desc->lo & VTD_INV_DESC_CC_G) { case VTD_INV_DESC_CC_DOMAIN: - VTD_DPRINTF(INV, "domain-selective invalidation domain 0x%"PRIx16, - (uint16_t)VTD_INV_DESC_CC_DID(inv_desc->lo)); + trace_vtd_inv_desc_cc_domain( + (uint16_t)VTD_INV_DESC_CC_DID(inv_desc->lo)); /* Fall through */ case VTD_INV_DESC_CC_GLOBAL: - VTD_DPRINTF(INV, "global invalidation"); vtd_context_global_invalidate(s); break; case VTD_INV_DESC_CC_DEVICE: - vtd_context_device_invalidate(s, VTD_INV_DESC_CC_SID(inv_desc->lo), - VTD_INV_DESC_CC_FM(inv_desc->lo)); + sid = VTD_INV_DESC_CC_SID(inv_desc->lo); + fmask = VTD_INV_DESC_CC_FM(inv_desc->lo); + vtd_context_device_invalidate(s, sid, fmask); break; default: - VTD_DPRINTF(GENERAL, "error: invalid granularity in Context-cache " - "Invalidate Descriptor hi 0x%"PRIx64 " lo 0x%"PRIx64, - inv_desc->hi, inv_desc->lo); + trace_vtd_inv_desc_cc_invalid(inv_desc->hi, inv_desc->lo); return false; } return true; @@ -1392,22 +1351,19 @@ static bool vtd_process_iotlb_desc(IntelIOMMUState 
*s, VTDInvDesc *inv_desc) if ((inv_desc->lo & VTD_INV_DESC_IOTLB_RSVD_LO) || (inv_desc->hi & VTD_INV_DESC_IOTLB_RSVD_HI)) { - VTD_DPRINTF(GENERAL, "error: non-zero reserved field in IOTLB " - "Invalidate Descriptor hi 0x%"PRIx64 " lo 0x%"PRIx64, - inv_desc->hi, inv_desc->lo); + trace_vtd_inv_desc_iotlb_invalid(inv_desc->hi, inv_desc->lo); return false; } switch (inv_desc->lo & VTD_INV_DESC_IOTLB_G) { case VTD_INV_DESC_IOTLB_GLOBAL: - VTD_DPRINTF(INV, "global invalidation"); + trace_vtd_inv_desc_iotlb_global(); vtd_iotlb_global_invalidate(s); break; case VTD_INV_DESC_IOTLB_DOMAIN: domain_id = VTD_INV_DESC_IOTLB_DID(inv_desc->lo); - VTD_DPRINTF(INV, "domain-selective invalidation domain 0x%"PRIx16, - domain_id); + trace_vtd_inv_desc_iotlb_domain(domain_id); vtd_iotlb_domain_invalidate(s, domain_id); break; @@ -1415,20 +1371,16 @@ static bool vtd_process_iotlb_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc) domain_id = VTD_INV_DESC_IOTLB_DID(inv_desc->lo); addr = VTD_INV_DESC_IOTLB_ADDR(inv_desc->hi); am = VTD_INV_DESC_IOTLB_AM(inv_desc->hi); - VTD_DPRINTF(INV, "page-selective invalidation domain 0x%"PRIx16 - " addr 0x%"PRIx64 " mask %"PRIu8, domain_id, addr, am); + trace_vtd_inv_desc_iotlb_pages(domain_id, addr, am); if (am > VTD_MAMV) { - VTD_DPRINTF(GENERAL, "error: supported max address mask value is " - "%"PRIu8, (uint8_t)VTD_MAMV); + trace_vtd_inv_desc_iotlb_invalid(inv_desc->hi, inv_desc->lo); return false; } vtd_iotlb_page_invalidate(s, domain_id, addr, am); break; default: - VTD_DPRINTF(GENERAL, "error: invalid granularity in IOTLB Invalidate " - "Descriptor hi 0x%"PRIx64 " lo 0x%"PRIx64, - inv_desc->hi, inv_desc->lo); + trace_vtd_inv_desc_iotlb_invalid(inv_desc->hi, inv_desc->lo); return false; } return true; @@ -1527,33 +1479,28 @@ static bool vtd_process_inv_desc(IntelIOMMUState *s) switch (desc_type) { case VTD_INV_DESC_CC: - VTD_DPRINTF(INV, "Context-cache Invalidate Descriptor hi 0x%"PRIx64 - " lo 0x%"PRIx64, inv_desc.hi, inv_desc.lo); + trace_vtd_inv_desc("context-cache", inv_desc.hi, inv_desc.lo); if (!vtd_process_context_cache_desc(s, &inv_desc)) { return false; } break; case VTD_INV_DESC_IOTLB: - VTD_DPRINTF(INV, "IOTLB Invalidate Descriptor hi 0x%"PRIx64 - " lo 0x%"PRIx64, inv_desc.hi, inv_desc.lo); + trace_vtd_inv_desc("iotlb", inv_desc.hi, inv_desc.lo); if (!vtd_process_iotlb_desc(s, &inv_desc)) { return false; } break; case VTD_INV_DESC_WAIT: - VTD_DPRINTF(INV, "Invalidation Wait Descriptor hi 0x%"PRIx64 - " lo 0x%"PRIx64, inv_desc.hi, inv_desc.lo); + trace_vtd_inv_desc("wait", inv_desc.hi, inv_desc.lo); if (!vtd_process_wait_desc(s, &inv_desc)) { return false; } break; case VTD_INV_DESC_IEC: - VTD_DPRINTF(INV, "Invalidation Interrupt Entry Cache " - "Descriptor hi 0x%"PRIx64 " lo 0x%"PRIx64, - inv_desc.hi, inv_desc.lo); + trace_vtd_inv_desc("iec", inv_desc.hi, inv_desc.lo); if (!vtd_process_inv_iec_desc(s, &inv_desc)) { return false; } @@ -1568,9 +1515,7 @@ static bool vtd_process_inv_desc(IntelIOMMUState *s) break; default: - VTD_DPRINTF(GENERAL, "error: unkonw Invalidation Descriptor type " - "hi 0x%"PRIx64 " lo 0x%"PRIx64 " type %"PRIu8, - inv_desc.hi, inv_desc.lo, desc_type); + trace_vtd_inv_desc_invalid(inv_desc.hi, inv_desc.lo); return false; } s->iq_head++; @@ -2049,7 +1994,7 @@ static IOMMUTLBEntry vtd_iommu_translate(MemoryRegion *iommu, hwaddr addr, is_write, &ret); VTD_DPRINTF(MMU, "bus %"PRIu8 " slot %"PRIu8 " func %"PRIu8 " devfn %"PRIu8 - " gpa 0x%"PRIx64 " hpa 0x%"PRIx64, pci_bus_num(vtd_as->bus), + " iova 0x%"PRIx64 " hpa 0x%"PRIx64, 
pci_bus_num(vtd_as->bus), VTD_PCI_SLOT(vtd_as->devfn), VTD_PCI_FUNC(vtd_as->devfn), vtd_as->devfn, addr, ret.translated_addr); return ret; @@ -2115,6 +2060,7 @@ static Property vtd_properties[] = { DEFINE_PROP_ON_OFF_AUTO("eim", IntelIOMMUState, intr_eim, ON_OFF_AUTO_AUTO), DEFINE_PROP_BOOL("x-buggy-eim", IntelIOMMUState, buggy_eim, false), + DEFINE_PROP_BOOL("caching-mode", IntelIOMMUState, caching_mode, FALSE), DEFINE_PROP_END_OF_LIST(), }; @@ -2496,6 +2442,10 @@ static void vtd_init(IntelIOMMUState *s) s->ecap |= VTD_ECAP_DT; } + if (s->caching_mode) { + s->cap |= VTD_CAP_CM; + } + vtd_reset_context_cache(s); vtd_reset_iotlb(s); diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h index 356f188b73..41041219ba 100644 --- a/hw/i386/intel_iommu_internal.h +++ b/hw/i386/intel_iommu_internal.h @@ -202,6 +202,7 @@ #define VTD_CAP_MAMV (VTD_MAMV << 48) #define VTD_CAP_PSI (1ULL << 39) #define VTD_CAP_SLLPS ((1ULL << 34) | (1ULL << 35)) +#define VTD_CAP_CM (1ULL << 7) /* Supported Adjusted Guest Address Widths */ #define VTD_CAP_SAGAW_SHIFT 8 diff --git a/hw/i386/trace-events b/hw/i386/trace-events index 1cc4a10a07..88ad5e4c43 100644 --- a/hw/i386/trace-events +++ b/hw/i386/trace-events @@ -3,6 +3,34 @@ # hw/i386/x86-iommu.c x86_iommu_iec_notify(bool global, uint32_t index, uint32_t mask) "Notify IEC invalidation: global=%d index=%" PRIu32 " mask=%" PRIu32 +# hw/i386/intel_iommu.c +vtd_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device %02x:%02x.%x switching address space (iommu enabled=%d)" +vtd_inv_desc(const char *type, uint64_t hi, uint64_t lo) "invalidate desc type %s high 0x%"PRIx64" low 0x%"PRIx64 +vtd_inv_desc_invalid(uint64_t hi, uint64_t lo) "invalid inv desc hi 0x%"PRIx64" lo 0x%"PRIx64 +vtd_inv_desc_cc_domain(uint16_t domain) "context invalidate domain 0x%"PRIx16 +vtd_inv_desc_cc_global(void) "context invalidate globally" +vtd_inv_desc_cc_device(uint8_t bus, uint8_t dev, uint8_t fn) "context invalidate device %02"PRIx8":%02"PRIx8".%02"PRIx8 +vtd_inv_desc_cc_devices(uint16_t sid, uint16_t fmask) "context invalidate devices sid 0x%"PRIx16" fmask 0x%"PRIx16 +vtd_inv_desc_cc_invalid(uint64_t hi, uint64_t lo) "invalid context-cache desc hi 0x%"PRIx64" lo 0x%"PRIx64 +vtd_inv_desc_iotlb_global(void) "iotlb invalidate global" +vtd_inv_desc_iotlb_domain(uint16_t domain) "iotlb invalidate whole domain 0x%"PRIx16 +vtd_inv_desc_iotlb_pages(uint16_t domain, uint64_t addr, uint8_t mask) "iotlb invalidate domain 0x%"PRIx16" addr 0x%"PRIx64" mask 0x%"PRIx8 +vtd_inv_desc_iotlb_invalid(uint64_t hi, uint64_t lo) "invalid iotlb desc hi 0x%"PRIx64" lo 0x%"PRIx64 +vtd_inv_desc_wait_sw(uint64_t addr, uint32_t data) "wait invalidate status write addr 0x%"PRIx64" data 0x%"PRIx32 +vtd_inv_desc_wait_irq(const char *msg) "%s" +vtd_inv_desc_wait_invalid(uint64_t hi, uint64_t lo) "invalid wait desc hi 0x%"PRIx64" lo 0x%"PRIx64 +vtd_inv_desc_wait_write_fail(uint64_t hi, uint64_t lo) "write fail for wait desc hi 0x%"PRIx64" lo 0x%"PRIx64 +vtd_re_not_present(uint8_t bus) "Root entry bus %"PRIu8" not present" +vtd_re_invalid(uint64_t hi, uint64_t lo) "invalid root entry hi 0x%"PRIx64" lo 0x%"PRIx64 +vtd_ce_not_present(uint8_t bus, uint8_t devfn) "Context entry bus %"PRIu8" devfn %"PRIu8" not present" +vtd_ce_invalid(uint64_t hi, uint64_t lo) "invalid context entry hi 0x%"PRIx64" lo 0x%"PRIx64 +vtd_iotlb_page_hit(uint16_t sid, uint64_t addr, uint64_t slpte, uint16_t domain) "IOTLB page hit sid 0x%"PRIx16" iova 0x%"PRIx64" slpte 0x%"PRIx64" domain 0x%"PRIx16 
+vtd_iotlb_page_update(uint16_t sid, uint64_t addr, uint64_t slpte, uint16_t domain) "IOTLB page update sid 0x%"PRIx16" iova 0x%"PRIx64" slpte 0x%"PRIx64" domain 0x%"PRIx16 +vtd_iotlb_cc_hit(uint8_t bus, uint8_t devfn, uint64_t high, uint64_t low, uint32_t gen) "IOTLB context hit bus 0x%"PRIx8" devfn 0x%"PRIx8" high 0x%"PRIx64" low 0x%"PRIx64" gen %"PRIu32 +vtd_iotlb_cc_update(uint8_t bus, uint8_t devfn, uint64_t high, uint64_t low, uint32_t gen1, uint32_t gen2) "IOTLB context update bus 0x%"PRIx8" devfn 0x%"PRIx8" high 0x%"PRIx64" low 0x%"PRIx64" gen %"PRIu32" -> gen %"PRIu32 +vtd_iotlb_reset(const char *reason) "IOTLB reset (reason: %s)" +vtd_fault_disabled(void) "Fault processing disabled for context entry" + # hw/i386/amd_iommu.c amdvi_evntlog_fail(uint64_t addr, uint32_t head) "error: fail to write at addr 0x%"PRIx64" + offset 0x%"PRIx32 amdvi_cache_update(uint16_t domid, uint8_t bus, uint8_t slot, uint8_t func, uint64_t gpa, uint64_t txaddr) " update iotlb domid 0x%"PRIx16" devid: %02x:%02x.%x gpa 0x%"PRIx64" hpa 0x%"PRIx64 diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index 354a19eab8..c32168077a 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -1130,7 +1130,8 @@ static int receive_filter(VirtIONet *n, const uint8_t *buf, int size) return 0; } -static ssize_t virtio_net_receive(NetClientState *nc, const uint8_t *buf, size_t size) +static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf, + size_t size) { VirtIONet *n = qemu_get_nic_opaque(nc); VirtIONetQueue *q = virtio_net_get_subqueue(nc); @@ -1233,6 +1234,17 @@ static ssize_t virtio_net_receive(NetClientState *nc, const uint8_t *buf, size_t return size; } +static ssize_t virtio_net_receive(NetClientState *nc, const uint8_t *buf, + size_t size) +{ + ssize_t r; + + rcu_read_lock(); + r = virtio_net_receive_rcu(nc, buf, size); + rcu_read_unlock(); + return r; +} + static int32_t virtio_net_flush_tx(VirtIONetQueue *q); static void virtio_net_tx_complete(NetClientState *nc, ssize_t len) diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c index cbd4bb4f8c..fc54bfd53d 100644 --- a/hw/pci/pcie.c +++ b/hw/pci/pcie.c @@ -610,7 +610,8 @@ bool pcie_cap_is_arifwd_enabled(const PCIDevice *dev) * uint16_t ext_cap_size */ -static uint16_t pcie_find_capability_list(PCIDevice *dev, uint16_t cap_id, +/* Passing a cap_id value > 0xffff will return 0 and put end of list in prev */ +static uint16_t pcie_find_capability_list(PCIDevice *dev, uint32_t cap_id, uint16_t *prev_p) { uint16_t prev = 0; @@ -664,30 +665,24 @@ void pcie_add_capability(PCIDevice *dev, uint16_t cap_id, uint8_t cap_ver, uint16_t offset, uint16_t size) { - uint32_t header; - uint16_t next; - assert(offset >= PCI_CONFIG_SPACE_SIZE); assert(offset < offset + size); assert(offset + size <= PCIE_CONFIG_SPACE_SIZE); assert(size >= 8); assert(pci_is_express(dev)); - if (offset == PCI_CONFIG_SPACE_SIZE) { - header = pci_get_long(dev->config + offset); - next = PCI_EXT_CAP_NEXT(header); - } else { + if (offset != PCI_CONFIG_SPACE_SIZE) { uint16_t prev; - /* 0 is reserved cap id. use internally to find the last capability - in the linked list */ - next = pcie_find_capability_list(dev, 0, &prev); - + /* + * 0xffffffff is not a valid cap id (it's a 16 bit field). use + * internally to find the last capability in the linked list. 
+ */ + pcie_find_capability_list(dev, 0xffffffff, &prev); assert(prev >= PCI_CONFIG_SPACE_SIZE); - assert(next == 0); pcie_ext_cap_set_next(dev, prev, offset); } - pci_set_long(dev->config + offset, PCI_EXT_CAP(cap_id, cap_ver, next)); + pci_set_long(dev->config + offset, PCI_EXT_CAP(cap_id, cap_ver, 0)); /* Make capability read-only by default */ memset(dev->wmask + offset, 0, size); diff --git a/hw/scsi/virtio-scsi-dataplane.c b/hw/scsi/virtio-scsi-dataplane.c index 6b8d0f0024..74c95e0e60 100644 --- a/hw/scsi/virtio-scsi-dataplane.c +++ b/hw/scsi/virtio-scsi-dataplane.c @@ -49,35 +49,35 @@ void virtio_scsi_dataplane_setup(VirtIOSCSI *s, Error **errp) } } -static void virtio_scsi_data_plane_handle_cmd(VirtIODevice *vdev, +static bool virtio_scsi_data_plane_handle_cmd(VirtIODevice *vdev, VirtQueue *vq) { VirtIOSCSI *s = (VirtIOSCSI *)vdev; assert(s->ctx && s->dataplane_started); - virtio_scsi_handle_cmd_vq(s, vq); + return virtio_scsi_handle_cmd_vq(s, vq); } -static void virtio_scsi_data_plane_handle_ctrl(VirtIODevice *vdev, +static bool virtio_scsi_data_plane_handle_ctrl(VirtIODevice *vdev, VirtQueue *vq) { VirtIOSCSI *s = VIRTIO_SCSI(vdev); assert(s->ctx && s->dataplane_started); - virtio_scsi_handle_ctrl_vq(s, vq); + return virtio_scsi_handle_ctrl_vq(s, vq); } -static void virtio_scsi_data_plane_handle_event(VirtIODevice *vdev, +static bool virtio_scsi_data_plane_handle_event(VirtIODevice *vdev, VirtQueue *vq) { VirtIOSCSI *s = VIRTIO_SCSI(vdev); assert(s->ctx && s->dataplane_started); - virtio_scsi_handle_event_vq(s, vq); + return virtio_scsi_handle_event_vq(s, vq); } static int virtio_scsi_vring_init(VirtIOSCSI *s, VirtQueue *vq, int n, - void (*fn)(VirtIODevice *vdev, VirtQueue *vq)) + VirtIOHandleAIOOutput fn) { BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(s))); int rc; diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c index ce19efffc8..b01030b745 100644 --- a/hw/scsi/virtio-scsi.c +++ b/hw/scsi/virtio-scsi.c @@ -436,13 +436,16 @@ static inline void virtio_scsi_release(VirtIOSCSI *s) } } -void virtio_scsi_handle_ctrl_vq(VirtIOSCSI *s, VirtQueue *vq) +bool virtio_scsi_handle_ctrl_vq(VirtIOSCSI *s, VirtQueue *vq) { VirtIOSCSIReq *req; + bool progress = false; while ((req = virtio_scsi_pop_req(s, vq))) { + progress = true; virtio_scsi_handle_ctrl_req(s, req); } + return progress; } static void virtio_scsi_handle_ctrl(VirtIODevice *vdev, VirtQueue *vq) @@ -591,10 +594,11 @@ static void virtio_scsi_handle_cmd_req_submit(VirtIOSCSI *s, VirtIOSCSIReq *req) scsi_req_unref(sreq); } -void virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq) +bool virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq) { VirtIOSCSIReq *req, *next; int ret = 0; + bool progress = false; QTAILQ_HEAD(, VirtIOSCSIReq) reqs = QTAILQ_HEAD_INITIALIZER(reqs); @@ -602,6 +606,7 @@ void virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq) virtio_queue_set_notification(vq, 0); while ((req = virtio_scsi_pop_req(s, vq))) { + progress = true; ret = virtio_scsi_handle_cmd_req_prepare(s, req); if (!ret) { QTAILQ_INSERT_TAIL(&reqs, req, next); @@ -624,6 +629,7 @@ void virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq) QTAILQ_FOREACH_SAFE(req, &reqs, next, next) { virtio_scsi_handle_cmd_req_submit(s, req); } + return progress; } static void virtio_scsi_handle_cmd(VirtIODevice *vdev, VirtQueue *vq) @@ -752,11 +758,13 @@ out: virtio_scsi_release(s); } -void virtio_scsi_handle_event_vq(VirtIOSCSI *s, VirtQueue *vq) +bool virtio_scsi_handle_event_vq(VirtIOSCSI *s, VirtQueue *vq) { if (s->events_dropped) { 
virtio_scsi_push_event(s, NULL, VIRTIO_SCSI_T_NO_EVENT, 0); + return true; } + return false; } static void virtio_scsi_handle_event(VirtIODevice *vdev, VirtQueue *vq) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 801578b4b9..f3ba9b9007 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -294,53 +294,78 @@ static bool vfio_listener_skipped_section(MemoryRegionSection *section) section->offset_within_address_space & (1ULL << 63); } -static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) +/* Called with rcu_read_lock held. */ +static bool vfio_get_vaddr(IOMMUTLBEntry *iotlb, void **vaddr, + bool *read_only) { - VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); - VFIOContainer *container = giommu->container; - hwaddr iova = iotlb->iova + giommu->iommu_offset; MemoryRegion *mr; hwaddr xlat; hwaddr len = iotlb->addr_mask + 1; - void *vaddr; - int ret; - - trace_vfio_iommu_map_notify(iova, iova + iotlb->addr_mask); - - if (iotlb->target_as != &address_space_memory) { - error_report("Wrong target AS \"%s\", only system memory is allowed", - iotlb->target_as->name ? iotlb->target_as->name : "none"); - return; - } + bool writable = iotlb->perm & IOMMU_WO; /* * The IOMMU TLB entry we have just covers translation through * this IOMMU to its immediate target. We need to translate * it the rest of the way through to memory. */ - rcu_read_lock(); mr = address_space_translate(&address_space_memory, iotlb->translated_addr, - &xlat, &len, iotlb->perm & IOMMU_WO); + &xlat, &len, writable); if (!memory_region_is_ram(mr)) { error_report("iommu map to non memory area %"HWADDR_PRIx"", xlat); - goto out; + return false; } + /* * Translation truncates length to the IOMMU page size, * check that it did not truncate too much. */ if (len & iotlb->addr_mask) { error_report("iommu has granularity incompatible with target AS"); - goto out; + return false; } + *vaddr = memory_region_get_ram_ptr(mr) + xlat; + *read_only = !writable || mr->readonly; + + return true; +} + +static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) +{ + VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); + VFIOContainer *container = giommu->container; + hwaddr iova = iotlb->iova + giommu->iommu_offset; + bool read_only; + void *vaddr; + int ret; + + trace_vfio_iommu_map_notify(iotlb->perm == IOMMU_NONE ? "UNMAP" : "MAP", + iova, iova + iotlb->addr_mask); + + if (iotlb->target_as != &address_space_memory) { + error_report("Wrong target AS \"%s\", only system memory is allowed", + iotlb->target_as->name ? iotlb->target_as->name : "none"); + return; + } + + rcu_read_lock(); + if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) { - vaddr = memory_region_get_ram_ptr(mr) + xlat; + if (!vfio_get_vaddr(iotlb, &vaddr, &read_only)) { + goto out; + } + /* + * vaddr is only valid until rcu_read_unlock(). But after + * vfio_dma_map has set up the mapping the pages will be + * pinned by the kernel. This makes sure that the RAM backend + * of vaddr will always be there, even if the memory object is + * destroyed and its backing memory munmap-ed. 
+ */ ret = vfio_dma_map(container, iova, iotlb->addr_mask + 1, vaddr, - !(iotlb->perm & IOMMU_WO) || mr->readonly); + read_only); if (ret) { error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx", %p) = %d (%m)", diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index 8de8281357..2561c6d31a 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -84,7 +84,7 @@ vfio_pci_igd_lpc_bridge_enabled(const char *name) "%s" # hw/vfio/common.c vfio_region_write(const char *name, int index, uint64_t addr, uint64_t data, unsigned size) " (%s:region%d+0x%"PRIx64", 0x%"PRIx64 ", %d)" vfio_region_read(char *name, int index, uint64_t addr, unsigned size, uint64_t data) " (%s:region%d+0x%"PRIx64", %d) = 0x%"PRIx64 -vfio_iommu_map_notify(uint64_t iova_start, uint64_t iova_end) "iommu map @ %"PRIx64" - %"PRIx64 +vfio_iommu_map_notify(const char *op, uint64_t iova_start, uint64_t iova_end) "iommu %s @ %"PRIx64" - %"PRIx64 vfio_listener_region_add_skip(uint64_t start, uint64_t end) "SKIPPING region_add %"PRIx64" - %"PRIx64 vfio_listener_region_add_iommu(uint64_t start, uint64_t end) "region_add [iommu] %"PRIx64" - %"PRIx64 vfio_listener_region_add_ram(uint64_t iova_start, uint64_t iova_end, void *vaddr) "region_add [ram] %"PRIx64" - %"PRIx64" [%p]" diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 63657066e7..23483c752f 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -60,6 +60,13 @@ typedef struct VRingUsed VRingUsedElem ring[0]; } VRingUsed; +typedef struct VRingMemoryRegionCaches { + struct rcu_head rcu; + MemoryRegionCache desc; + MemoryRegionCache avail; + MemoryRegionCache used; +} VRingMemoryRegionCaches; + typedef struct VRing { unsigned int num; @@ -68,6 +75,7 @@ typedef struct VRing hwaddr desc; hwaddr avail; hwaddr used; + VRingMemoryRegionCaches *caches; } VRing; struct VirtQueue @@ -97,13 +105,58 @@ struct VirtQueue uint16_t vector; VirtIOHandleOutput handle_output; - VirtIOHandleOutput handle_aio_output; + VirtIOHandleAIOOutput handle_aio_output; VirtIODevice *vdev; EventNotifier guest_notifier; EventNotifier host_notifier; QLIST_ENTRY(VirtQueue) node; }; +static void virtio_free_region_cache(VRingMemoryRegionCaches *caches) +{ + if (!caches) { + return; + } + + address_space_cache_destroy(&caches->desc); + address_space_cache_destroy(&caches->avail); + address_space_cache_destroy(&caches->used); + g_free(caches); +} + +static void virtio_init_region_cache(VirtIODevice *vdev, int n) +{ + VirtQueue *vq = &vdev->vq[n]; + VRingMemoryRegionCaches *old = vq->vring.caches; + VRingMemoryRegionCaches *new; + hwaddr addr, size; + int event_size; + + event_size = virtio_vdev_has_feature(vq->vdev, VIRTIO_RING_F_EVENT_IDX) ? 
2 : 0; + + addr = vq->vring.desc; + if (!addr) { + return; + } + new = g_new0(VRingMemoryRegionCaches, 1); + size = virtio_queue_get_desc_size(vdev, n); + address_space_cache_init(&new->desc, vdev->dma_as, + addr, size, false); + + size = virtio_queue_get_used_size(vdev, n) + event_size; + address_space_cache_init(&new->used, vdev->dma_as, + vq->vring.used, size, true); + + size = virtio_queue_get_avail_size(vdev, n) + event_size; + address_space_cache_init(&new->avail, vdev->dma_as, + vq->vring.avail, size, false); + + atomic_rcu_set(&vq->vring.caches, new); + if (old) { + call_rcu(old, virtio_free_region_cache, rcu); + } +} + /* virt queue functions */ void virtio_queue_update_rings(VirtIODevice *vdev, int n) { @@ -117,101 +170,125 @@ void virtio_queue_update_rings(VirtIODevice *vdev, int n) vring->used = vring_align(vring->avail + offsetof(VRingAvail, ring[vring->num]), vring->align); + virtio_init_region_cache(vdev, n); } +/* Called within rcu_read_lock(). */ static void vring_desc_read(VirtIODevice *vdev, VRingDesc *desc, - hwaddr desc_pa, int i) + MemoryRegionCache *cache, int i) { - address_space_read(vdev->dma_as, desc_pa + i * sizeof(VRingDesc), - MEMTXATTRS_UNSPECIFIED, (void *)desc, sizeof(VRingDesc)); + address_space_read_cached(cache, i * sizeof(VRingDesc), + desc, sizeof(VRingDesc)); virtio_tswap64s(vdev, &desc->addr); virtio_tswap32s(vdev, &desc->len); virtio_tswap16s(vdev, &desc->flags); virtio_tswap16s(vdev, &desc->next); } +/* Called within rcu_read_lock(). */ static inline uint16_t vring_avail_flags(VirtQueue *vq) { - hwaddr pa; - pa = vq->vring.avail + offsetof(VRingAvail, flags); - return virtio_lduw_phys(vq->vdev, pa); + VRingMemoryRegionCaches *caches = atomic_rcu_read(&vq->vring.caches); + hwaddr pa = offsetof(VRingAvail, flags); + return virtio_lduw_phys_cached(vq->vdev, &caches->avail, pa); } +/* Called within rcu_read_lock(). */ static inline uint16_t vring_avail_idx(VirtQueue *vq) { - hwaddr pa; - pa = vq->vring.avail + offsetof(VRingAvail, idx); - vq->shadow_avail_idx = virtio_lduw_phys(vq->vdev, pa); + VRingMemoryRegionCaches *caches = atomic_rcu_read(&vq->vring.caches); + hwaddr pa = offsetof(VRingAvail, idx); + vq->shadow_avail_idx = virtio_lduw_phys_cached(vq->vdev, &caches->avail, pa); return vq->shadow_avail_idx; } +/* Called within rcu_read_lock(). */ static inline uint16_t vring_avail_ring(VirtQueue *vq, int i) { - hwaddr pa; - pa = vq->vring.avail + offsetof(VRingAvail, ring[i]); - return virtio_lduw_phys(vq->vdev, pa); + VRingMemoryRegionCaches *caches = atomic_rcu_read(&vq->vring.caches); + hwaddr pa = offsetof(VRingAvail, ring[i]); + return virtio_lduw_phys_cached(vq->vdev, &caches->avail, pa); } +/* Called within rcu_read_lock(). */ static inline uint16_t vring_get_used_event(VirtQueue *vq) { return vring_avail_ring(vq, vq->vring.num); } +/* Called within rcu_read_lock(). */ static inline void vring_used_write(VirtQueue *vq, VRingUsedElem *uelem, int i) { - hwaddr pa; + VRingMemoryRegionCaches *caches = atomic_rcu_read(&vq->vring.caches); + hwaddr pa = offsetof(VRingUsed, ring[i]); virtio_tswap32s(vq->vdev, &uelem->id); virtio_tswap32s(vq->vdev, &uelem->len); - pa = vq->vring.used + offsetof(VRingUsed, ring[i]); - address_space_write(vq->vdev->dma_as, pa, MEMTXATTRS_UNSPECIFIED, - (void *)uelem, sizeof(VRingUsedElem)); + address_space_write_cached(&caches->used, pa, uelem, sizeof(VRingUsedElem)); + address_space_cache_invalidate(&caches->used, pa, sizeof(VRingUsedElem)); } +/* Called within rcu_read_lock(). 
*/ static uint16_t vring_used_idx(VirtQueue *vq) { - hwaddr pa; - pa = vq->vring.used + offsetof(VRingUsed, idx); - return virtio_lduw_phys(vq->vdev, pa); + VRingMemoryRegionCaches *caches = atomic_rcu_read(&vq->vring.caches); + hwaddr pa = offsetof(VRingUsed, idx); + return virtio_lduw_phys_cached(vq->vdev, &caches->used, pa); } +/* Called within rcu_read_lock(). */ static inline void vring_used_idx_set(VirtQueue *vq, uint16_t val) { - hwaddr pa; - pa = vq->vring.used + offsetof(VRingUsed, idx); - virtio_stw_phys(vq->vdev, pa, val); + VRingMemoryRegionCaches *caches = atomic_rcu_read(&vq->vring.caches); + hwaddr pa = offsetof(VRingUsed, idx); + virtio_stw_phys_cached(vq->vdev, &caches->used, pa, val); + address_space_cache_invalidate(&caches->used, pa, sizeof(val)); vq->used_idx = val; } +/* Called within rcu_read_lock(). */ static inline void vring_used_flags_set_bit(VirtQueue *vq, int mask) { + VRingMemoryRegionCaches *caches = atomic_rcu_read(&vq->vring.caches); VirtIODevice *vdev = vq->vdev; - hwaddr pa; - pa = vq->vring.used + offsetof(VRingUsed, flags); - virtio_stw_phys(vdev, pa, virtio_lduw_phys(vdev, pa) | mask); + hwaddr pa = offsetof(VRingUsed, flags); + uint16_t flags = virtio_lduw_phys_cached(vq->vdev, &caches->used, pa); + + virtio_stw_phys_cached(vdev, &caches->used, pa, flags | mask); + address_space_cache_invalidate(&caches->used, pa, sizeof(flags)); } +/* Called within rcu_read_lock(). */ static inline void vring_used_flags_unset_bit(VirtQueue *vq, int mask) { + VRingMemoryRegionCaches *caches = atomic_rcu_read(&vq->vring.caches); VirtIODevice *vdev = vq->vdev; - hwaddr pa; - pa = vq->vring.used + offsetof(VRingUsed, flags); - virtio_stw_phys(vdev, pa, virtio_lduw_phys(vdev, pa) & ~mask); + hwaddr pa = offsetof(VRingUsed, flags); + uint16_t flags = virtio_lduw_phys_cached(vq->vdev, &caches->used, pa); + + virtio_stw_phys_cached(vdev, &caches->used, pa, flags & ~mask); + address_space_cache_invalidate(&caches->used, pa, sizeof(flags)); } +/* Called within rcu_read_lock(). */ static inline void vring_set_avail_event(VirtQueue *vq, uint16_t val) { + VRingMemoryRegionCaches *caches; hwaddr pa; if (!vq->notification) { return; } - pa = vq->vring.used + offsetof(VRingUsed, ring[vq->vring.num]); - virtio_stw_phys(vq->vdev, pa, val); + + caches = atomic_rcu_read(&vq->vring.caches); + pa = offsetof(VRingUsed, ring[vq->vring.num]); + virtio_stw_phys_cached(vq->vdev, &caches->used, pa, val); } void virtio_queue_set_notification(VirtQueue *vq, int enable) { vq->notification = enable; + + rcu_read_lock(); if (virtio_vdev_has_feature(vq->vdev, VIRTIO_RING_F_EVENT_IDX)) { vring_set_avail_event(vq, vring_avail_idx(vq)); } else if (enable) { @@ -223,6 +300,7 @@ void virtio_queue_set_notification(VirtQueue *vq, int enable) /* Expose avail event/used flags before caller checks the avail idx. */ smp_mb(); } + rcu_read_unlock(); } int virtio_queue_ready(VirtQueue *vq) @@ -231,8 +309,9 @@ int virtio_queue_ready(VirtQueue *vq) } /* Fetch avail_idx from VQ memory only when we really need to know if - * guest has added some buffers. */ -int virtio_queue_empty(VirtQueue *vq) + * guest has added some buffers. + * Called within rcu_read_lock(). 
*/ +static int virtio_queue_empty_rcu(VirtQueue *vq) { if (vq->shadow_avail_idx != vq->last_avail_idx) { return 0; @@ -241,6 +320,20 @@ int virtio_queue_empty(VirtQueue *vq) return vring_avail_idx(vq) == vq->last_avail_idx; } +int virtio_queue_empty(VirtQueue *vq) +{ + bool empty; + + if (vq->shadow_avail_idx != vq->last_avail_idx) { + return 0; + } + + rcu_read_lock(); + empty = vring_avail_idx(vq) == vq->last_avail_idx; + rcu_read_unlock(); + return empty; +} + static void virtqueue_unmap_sg(VirtQueue *vq, const VirtQueueElement *elem, unsigned int len) { @@ -319,6 +412,7 @@ bool virtqueue_rewind(VirtQueue *vq, unsigned int num) return true; } +/* Called within rcu_read_lock(). */ void virtqueue_fill(VirtQueue *vq, const VirtQueueElement *elem, unsigned int len, unsigned int idx) { @@ -339,6 +433,7 @@ void virtqueue_fill(VirtQueue *vq, const VirtQueueElement *elem, vring_used_write(vq, &uelem, idx); } +/* Called within rcu_read_lock(). */ void virtqueue_flush(VirtQueue *vq, unsigned int count) { uint16_t old, new; @@ -362,10 +457,13 @@ void virtqueue_flush(VirtQueue *vq, unsigned int count) void virtqueue_push(VirtQueue *vq, const VirtQueueElement *elem, unsigned int len) { + rcu_read_lock(); virtqueue_fill(vq, elem, len, 0); virtqueue_flush(vq, 1); + rcu_read_unlock(); } +/* Called within rcu_read_lock(). */ static int virtqueue_num_heads(VirtQueue *vq, unsigned int idx) { uint16_t num_heads = vring_avail_idx(vq) - idx; @@ -385,6 +483,7 @@ static int virtqueue_num_heads(VirtQueue *vq, unsigned int idx) return num_heads; } +/* Called within rcu_read_lock(). */ static bool virtqueue_get_head(VirtQueue *vq, unsigned int idx, unsigned int *head) { @@ -408,7 +507,7 @@ enum { }; static int virtqueue_read_next_desc(VirtIODevice *vdev, VRingDesc *desc, - hwaddr desc_pa, unsigned int max, + MemoryRegionCache *desc_cache, unsigned int max, unsigned int *next) { /* If this descriptor says it doesn't chain, we're done. 
*/ @@ -426,7 +525,7 @@ static int virtqueue_read_next_desc(VirtIODevice *vdev, VRingDesc *desc, return VIRTQUEUE_READ_DESC_ERROR; } - vring_desc_read(vdev, desc, desc_pa, *next); + vring_desc_read(vdev, desc, desc_cache, *next); return VIRTQUEUE_READ_DESC_MORE; } @@ -434,29 +533,38 @@ void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes, unsigned int *out_bytes, unsigned max_in_bytes, unsigned max_out_bytes) { - unsigned int idx; + VirtIODevice *vdev = vq->vdev; + unsigned int max, idx; unsigned int total_bufs, in_total, out_total; + VRingMemoryRegionCaches *caches; + MemoryRegionCache indirect_desc_cache = MEMORY_REGION_CACHE_INVALID; + int64_t len = 0; int rc; + rcu_read_lock(); idx = vq->last_avail_idx; - total_bufs = in_total = out_total = 0; + + max = vq->vring.num; + caches = atomic_rcu_read(&vq->vring.caches); + if (caches->desc.len < max * sizeof(VRingDesc)) { + virtio_error(vdev, "Cannot map descriptor ring"); + goto err; + } + while ((rc = virtqueue_num_heads(vq, idx)) > 0) { - VirtIODevice *vdev = vq->vdev; - unsigned int max, num_bufs, indirect = 0; + MemoryRegionCache *desc_cache = &caches->desc; + unsigned int num_bufs; VRingDesc desc; - hwaddr desc_pa; unsigned int i; - max = vq->vring.num; num_bufs = total_bufs; if (!virtqueue_get_head(vq, idx++, &i)) { goto err; } - desc_pa = vq->vring.desc; - vring_desc_read(vdev, &desc, desc_pa, i); + vring_desc_read(vdev, &desc, desc_cache, i); if (desc.flags & VRING_DESC_F_INDIRECT) { if (desc.len % sizeof(VRingDesc)) { @@ -471,11 +579,18 @@ void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes, } /* loop over the indirect descriptor table */ - indirect = 1; + len = address_space_cache_init(&indirect_desc_cache, + vdev->dma_as, + desc.addr, desc.len, false); + desc_cache = &indirect_desc_cache; + if (len < desc.len) { + virtio_error(vdev, "Cannot map indirect buffer"); + goto err; + } + max = desc.len / sizeof(VRingDesc); - desc_pa = desc.addr; num_bufs = i = 0; - vring_desc_read(vdev, &desc, desc_pa, i); + vring_desc_read(vdev, &desc, desc_cache, i); } do { @@ -494,17 +609,19 @@ void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes, goto done; } - rc = virtqueue_read_next_desc(vdev, &desc, desc_pa, max, &i); + rc = virtqueue_read_next_desc(vdev, &desc, desc_cache, max, &i); } while (rc == VIRTQUEUE_READ_DESC_MORE); if (rc == VIRTQUEUE_READ_DESC_ERROR) { goto err; } - if (!indirect) - total_bufs = num_bufs; - else + if (desc_cache == &indirect_desc_cache) { + address_space_cache_destroy(&indirect_desc_cache); total_bufs++; + } else { + total_bufs = num_bufs; + } } if (rc < 0) { @@ -512,12 +629,14 @@ void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes, } done: + address_space_cache_destroy(&indirect_desc_cache); if (in_bytes) { *in_bytes = in_total; } if (out_bytes) { *out_bytes = out_total; } + rcu_read_unlock(); return; err: @@ -651,9 +770,12 @@ static void *virtqueue_alloc_element(size_t sz, unsigned out_num, unsigned in_nu void *virtqueue_pop(VirtQueue *vq, size_t sz) { unsigned int i, head, max; - hwaddr desc_pa = vq->vring.desc; + VRingMemoryRegionCaches *caches; + MemoryRegionCache indirect_desc_cache = MEMORY_REGION_CACHE_INVALID; + MemoryRegionCache *desc_cache; + int64_t len; VirtIODevice *vdev = vq->vdev; - VirtQueueElement *elem; + VirtQueueElement *elem = NULL; unsigned out_num, in_num; hwaddr addr[VIRTQUEUE_MAX_SIZE]; struct iovec iov[VIRTQUEUE_MAX_SIZE]; @@ -663,8 +785,9 @@ void *virtqueue_pop(VirtQueue *vq, size_t sz) if (unlikely(vdev->broken)) { return 
@@ -651,9 +770,12 @@ static void *virtqueue_alloc_element(size_t sz, unsigned out_num, unsigned in_num
 void *virtqueue_pop(VirtQueue *vq, size_t sz)
 {
     unsigned int i, head, max;
-    hwaddr desc_pa = vq->vring.desc;
+    VRingMemoryRegionCaches *caches;
+    MemoryRegionCache indirect_desc_cache = MEMORY_REGION_CACHE_INVALID;
+    MemoryRegionCache *desc_cache;
+    int64_t len;
     VirtIODevice *vdev = vq->vdev;
-    VirtQueueElement *elem;
+    VirtQueueElement *elem = NULL;
     unsigned out_num, in_num;
     hwaddr addr[VIRTQUEUE_MAX_SIZE];
     struct iovec iov[VIRTQUEUE_MAX_SIZE];
@@ -663,8 +785,9 @@ void *virtqueue_pop(VirtQueue *vq, size_t sz)
     if (unlikely(vdev->broken)) {
         return NULL;
     }
-    if (virtio_queue_empty(vq)) {
-        return NULL;
+    rcu_read_lock();
+    if (virtio_queue_empty_rcu(vq)) {
+        goto done;
     }
     /* Needed after virtio_queue_empty(), see comment in
      * virtqueue_num_heads(). */
@@ -677,11 +800,11 @@ void *virtqueue_pop(VirtQueue *vq, size_t sz)
 
     if (vq->inuse >= vq->vring.num) {
         virtio_error(vdev, "Virtqueue size exceeded");
-        return NULL;
+        goto done;
     }
 
     if (!virtqueue_get_head(vq, vq->last_avail_idx++, &head)) {
-        return NULL;
+        goto done;
     }
 
     if (virtio_vdev_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX)) {
@@ -689,18 +812,33 @@ void *virtqueue_pop(VirtQueue *vq, size_t sz)
     }
 
     i = head;
-    vring_desc_read(vdev, &desc, desc_pa, i);
+
+    caches = atomic_rcu_read(&vq->vring.caches);
+    if (caches->desc.len < max * sizeof(VRingDesc)) {
+        virtio_error(vdev, "Cannot map descriptor ring");
+        goto done;
+    }
+
+    desc_cache = &caches->desc;
+    vring_desc_read(vdev, &desc, desc_cache, i);
     if (desc.flags & VRING_DESC_F_INDIRECT) {
         if (desc.len % sizeof(VRingDesc)) {
             virtio_error(vdev, "Invalid size for indirect buffer table");
-            return NULL;
+            goto done;
         }
 
         /* loop over the indirect descriptor table */
+        len = address_space_cache_init(&indirect_desc_cache, vdev->dma_as,
+                                       desc.addr, desc.len, false);
+        desc_cache = &indirect_desc_cache;
+        if (len < desc.len) {
+            virtio_error(vdev, "Cannot map indirect buffer");
+            goto done;
+        }
+
         max = desc.len / sizeof(VRingDesc);
-        desc_pa = desc.addr;
         i = 0;
-        vring_desc_read(vdev, &desc, desc_pa, i);
+        vring_desc_read(vdev, &desc, desc_cache, i);
     }
 
     /* Collect all the descriptors */
@@ -731,7 +869,7 @@ void *virtqueue_pop(VirtQueue *vq, size_t sz)
             goto err_undo_map;
         }
 
-        rc = virtqueue_read_next_desc(vdev, &desc, desc_pa, max, &i);
+        rc = virtqueue_read_next_desc(vdev, &desc, desc_cache, max, &i);
     } while (rc == VIRTQUEUE_READ_DESC_MORE);
 
     if (rc == VIRTQUEUE_READ_DESC_ERROR) {
@@ -753,11 +891,15 @@ void *virtqueue_pop(VirtQueue *vq, size_t sz)
     vq->inuse++;
 
     trace_virtqueue_pop(vq, elem, elem->in_num, elem->out_num);
+done:
+    address_space_cache_destroy(&indirect_desc_cache);
+    rcu_read_unlock();
+
     return elem;
 
 err_undo_map:
     virtqueue_undo_map_desc(out_num, in_num, iov);
-    return NULL;
+    goto done;
 }
 
 /* virtqueue_drop_all:
@@ -1219,6 +1361,7 @@ void virtio_queue_set_rings(VirtIODevice *vdev, int n, hwaddr desc,
     vdev->vq[n].vring.desc = desc;
     vdev->vq[n].vring.avail = avail;
     vdev->vq[n].vring.used = used;
+    virtio_init_region_cache(vdev, n);
 }
 
 void virtio_queue_set_num(VirtIODevice *vdev, int n, int num)
@@ -1287,14 +1430,16 @@ void virtio_queue_set_align(VirtIODevice *vdev, int n, int align)
     virtio_queue_update_rings(vdev, n);
 }
 
-static void virtio_queue_notify_aio_vq(VirtQueue *vq)
+static bool virtio_queue_notify_aio_vq(VirtQueue *vq)
 {
     if (vq->vring.desc && vq->handle_aio_output) {
         VirtIODevice *vdev = vq->vdev;
 
         trace_virtio_queue_notify(vdev, vq - vdev->vq, vq);
-        vq->handle_aio_output(vdev, vq);
+        return vq->handle_aio_output(vdev, vq);
     }
+
+    return false;
 }
 
 static void virtio_queue_notify_vq(VirtQueue *vq)
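Note: virtqueue_pop() above converts every early "return NULL" into "goto
done" so that the indirect descriptor cache and the RCU read lock are released
on all paths, including the err_undo_map path. Below is a minimal stand-alone
model of that single-exit pattern; the 'cache' type and CACHE_INVALID macro
are made up here, standing in for MemoryRegionCache and
MEMORY_REGION_CACHE_INVALID:

    #include <stdio.h>
    #include <stdlib.h>

    struct cache { void *p; };
    #define CACHE_INVALID ((struct cache) { .p = NULL })

    static void cache_destroy(struct cache *c)
    {
        free(c->p);      /* free(NULL) is a no-op, so this is idempotent */
        c->p = NULL;
    }

    static int pop(int fail_early)
    {
        struct cache indirect = CACHE_INVALID; /* safe to destroy as-is */
        int ret = -1;

        if (fail_early) {
            goto done;          /* nothing mapped yet; destroy is a no-op */
        }
        indirect.p = malloc(64); /* stands in for address_space_cache_init */
        if (!indirect.p) {
            goto done;
        }
        ret = 0;                /* success path falls through to cleanup */
    done:
        cache_destroy(&indirect); /* single exit: every path releases it */
        return ret;
    }

    int main(void)
    {
        printf("%d\n", pop(1)); /* -1 */
        printf("%d\n", pop(0)); /* 0 */
        return 0;
    }

Initializing the cache to an invalid value up front is what makes the
unconditional destroy at "done:" safe even when the error is taken before
anything was mapped.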
@@ -1383,6 +1528,7 @@ static void virtio_set_isr(VirtIODevice *vdev, int value)
     }
 }
 
+/* Called within rcu_read_lock(). */
 static bool virtio_should_notify(VirtIODevice *vdev, VirtQueue *vq)
 {
     uint16_t old, new;
@@ -1408,7 +1554,12 @@ static bool virtio_should_notify(VirtIODevice *vdev, VirtQueue *vq)
 
 void virtio_notify_irqfd(VirtIODevice *vdev, VirtQueue *vq)
 {
-    if (!virtio_should_notify(vdev, vq)) {
+    bool should_notify;
+    rcu_read_lock();
+    should_notify = virtio_should_notify(vdev, vq);
+    rcu_read_unlock();
+
+    if (!should_notify) {
         return;
     }
 
@@ -1433,15 +1584,25 @@ void virtio_notify_irqfd(VirtIODevice *vdev, VirtQueue *vq)
     event_notifier_set(&vq->guest_notifier);
 }
 
+static void virtio_irq(VirtQueue *vq)
+{
+    virtio_set_isr(vq->vdev, 0x1);
+    virtio_notify_vector(vq->vdev, vq->vector);
+}
+
 void virtio_notify(VirtIODevice *vdev, VirtQueue *vq)
 {
-    if (!virtio_should_notify(vdev, vq)) {
+    bool should_notify;
+    rcu_read_lock();
+    should_notify = virtio_should_notify(vdev, vq);
+    rcu_read_unlock();
+
+    if (!should_notify) {
         return;
     }
 
     trace_virtio_notify(vdev, vq);
-    virtio_set_isr(vq->vdev, 0x1);
-    virtio_notify_vector(vdev, vq->vector);
+    virtio_irq(vq);
 }
 
 void virtio_notify_config(VirtIODevice *vdev)
@@ -1896,6 +2057,7 @@ int virtio_load(VirtIODevice *vdev, QEMUFile *f, int version_id)
         }
     }
 
+    rcu_read_lock();
     for (i = 0; i < num; i++) {
         if (vdev->vq[i].vring.desc) {
             uint16_t nheads;
@@ -1930,6 +2092,7 @@ int virtio_load(VirtIODevice *vdev, QEMUFile *f, int version_id)
             }
         }
     }
+    rcu_read_unlock();
 
     return 0;
 }
@@ -1937,9 +2100,6 @@ int virtio_load(VirtIODevice *vdev, QEMUFile *f, int version_id)
 void virtio_cleanup(VirtIODevice *vdev)
 {
     qemu_del_vm_change_state_handler(vdev->vmstate);
-    g_free(vdev->config);
-    g_free(vdev->vq);
-    g_free(vdev->vector_queues);
 }
 
 static void virtio_vmstate_change(void *opaque, int running, RunState state)
@@ -2059,7 +2219,11 @@ void virtio_queue_set_last_avail_idx(VirtIODevice *vdev, int n, uint16_t idx)
 
 void virtio_queue_update_used_idx(VirtIODevice *vdev, int n)
 {
-    vdev->vq[n].used_idx = vring_used_idx(&vdev->vq[n]);
+    rcu_read_lock();
+    if (vdev->vq[n].vring.desc) {
+        vdev->vq[n].used_idx = vring_used_idx(&vdev->vq[n]);
+    }
+    rcu_read_unlock();
 }
 
 void virtio_queue_invalidate_signalled_used(VirtIODevice *vdev, int n)
@@ -2081,7 +2245,7 @@ static void virtio_queue_guest_notifier_read(EventNotifier *n)
 {
     VirtQueue *vq = container_of(n, VirtQueue, guest_notifier);
     if (event_notifier_test_and_clear(n)) {
-        virtio_notify_vector(vq->vdev, vq->vector);
+        virtio_irq(vq);
     }
 }
 
@@ -2125,16 +2289,17 @@ static bool virtio_queue_host_notifier_aio_poll(void *opaque)
 {
     EventNotifier *n = opaque;
     VirtQueue *vq = container_of(n, VirtQueue, host_notifier);
+    bool progress;
 
     if (virtio_queue_empty(vq)) {
         return false;
     }
 
-    virtio_queue_notify_aio_vq(vq);
+    progress = virtio_queue_notify_aio_vq(vq);
 
     /* In case the handler function re-enabled notifications */
     virtio_queue_set_notification(vq, 0);
-    return true;
+    return progress;
 }
 
 static void virtio_queue_host_notifier_aio_poll_end(EventNotifier *n)
@@ -2146,7 +2311,7 @@ static void virtio_queue_host_notifier_aio_poll_end(EventNotifier *n)
     }
 }
 
 void virtio_queue_aio_set_host_notifier_handler(VirtQueue *vq, AioContext *ctx,
-                                                VirtIOHandleOutput handle_output)
+                                                VirtIOHandleAIOOutput handle_output)
 {
     if (handle_output) {
         vq->handle_aio_output = handle_output;
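Note: virtio_notify() and virtio_notify_irqfd() above evaluate
virtio_should_notify() under rcu_read_lock() but raise the interrupt only
after dropping it, keeping the read-side critical section minimal. A
stand-alone model of that "decide inside the read section, act outside"
shape follows; it is illustrative only, the index variables are invented, and
a pthread rwlock is a loose stand-in for RCU (real RCU readers do not block
writers):

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_rwlock_t ring_lock = PTHREAD_RWLOCK_INITIALIZER;
    static unsigned short used_idx, last_notified_idx;

    static bool should_notify_locked(void)
    {
        /* stand-in for the event-index test in virtio_should_notify() */
        return used_idx != last_notified_idx;
    }

    static void notify(void)
    {
        bool should;

        pthread_rwlock_rdlock(&ring_lock);  /* rcu_read_lock() in QEMU */
        should = should_notify_locked();
        pthread_rwlock_unlock(&ring_lock);  /* rcu_read_unlock() */

        if (!should) {
            return;
        }
        last_notified_idx = used_idx;
        puts("irq raised");                 /* virtio_irq(vq) in the hunk */
    }

    int main(void)
    {
        notify();      /* quiet: nothing new on the ring */
        used_idx++;    /* single-threaded demo; unlocked write is fine */
        notify();      /* irq raised */
        return 0;
    }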
@@ -2200,6 +2365,19 @@ void GCC_FMT_ATTR(2, 3) virtio_error(VirtIODevice *vdev, const char *fmt, ...)
     }
 }
 
+static void virtio_memory_listener_commit(MemoryListener *listener)
+{
+    VirtIODevice *vdev = container_of(listener, VirtIODevice, listener);
+    int i;
+
+    for (i = 0; i < VIRTIO_QUEUE_MAX; i++) {
+        if (vdev->vq[i].vring.num == 0) {
+            break;
+        }
+        virtio_init_region_cache(vdev, i);
+    }
+}
+
 static void virtio_device_realize(DeviceState *dev, Error **errp)
 {
     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
@@ -2222,6 +2400,9 @@ static void virtio_device_realize(DeviceState *dev, Error **errp)
         error_propagate(errp, err);
         return;
     }
+
+    vdev->listener.commit = virtio_memory_listener_commit;
+    memory_listener_register(&vdev->listener, vdev->dma_as);
 }
 
 static void virtio_device_unrealize(DeviceState *dev, Error **errp)
@@ -2244,6 +2425,36 @@ static void virtio_device_unrealize(DeviceState *dev, Error **errp)
     vdev->bus_name = NULL;
 }
 
+static void virtio_device_free_virtqueues(VirtIODevice *vdev)
+{
+    int i;
+    if (!vdev->vq) {
+        return;
+    }
+
+    for (i = 0; i < VIRTIO_QUEUE_MAX; i++) {
+        VRingMemoryRegionCaches *caches;
+        if (vdev->vq[i].vring.num == 0) {
+            break;
+        }
+        caches = atomic_read(&vdev->vq[i].vring.caches);
+        atomic_set(&vdev->vq[i].vring.caches, NULL);
+        virtio_free_region_cache(caches);
+    }
+    g_free(vdev->vq);
+}
+
+static void virtio_device_instance_finalize(Object *obj)
+{
+    VirtIODevice *vdev = VIRTIO_DEVICE(obj);
+
+    memory_listener_unregister(&vdev->listener);
+    virtio_device_free_virtqueues(vdev);
+
+    g_free(vdev->config);
+    g_free(vdev->vector_queues);
+}
+
 static Property virtio_properties[] = {
     DEFINE_VIRTIO_COMMON_FEATURES(VirtIODevice, host_features),
     DEFINE_PROP_END_OF_LIST(),
@@ -2370,6 +2581,7 @@ static const TypeInfo virtio_device_info = {
     .parent = TYPE_DEVICE,
     .instance_size = sizeof(VirtIODevice),
     .class_init = virtio_device_class_init,
+    .instance_finalize = virtio_device_instance_finalize,
     .abstract = true,
     .class_size = sizeof(VirtioDeviceClass),
 };
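Note: virtio_device_free_virtqueues() above detaches vring.caches with an
atomic read followed by an atomic clear before freeing, so a concurrent
reader sees either the old caches or NULL, never a dangling pointer. The
stand-alone model below collapses that pair into atomic_exchange; the struct
and variable names are invented, and the direct free() is only safe here
because this toy has no concurrent readers -- a real RCU user (as in the
listener-commit path) would defer the free past a grace period:

    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct caches { int dummy; };

    static _Atomic(struct caches *) vring_caches;

    static void free_caches(void)
    {
        /* atomic_exchange combines the atomic_read + atomic_set pair */
        struct caches *old = atomic_exchange(&vring_caches, NULL);
        free(old);    /* free(NULL) is a no-op, so repeat calls are safe */
    }

    int main(void)
    {
        atomic_store(&vring_caches, malloc(sizeof(struct caches)));
        free_caches();
        free_caches();   /* idempotent: second call frees NULL */
        puts("ok");
        return 0;
    }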
diff --git a/include/exec/memory.h b/include/exec/memory.h
index 987f9251c6..691102317c 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -1426,6 +1426,8 @@ struct MemoryRegionCache {
     bool is_write;
 };
 
+#define MEMORY_REGION_CACHE_INVALID ((MemoryRegionCache) { .mr = NULL })
+
 /* address_space_cache_init: prepare for repeated access to a physical
  * memory region
  *
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index 405c9d122e..fe645aa93a 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -257,6 +257,8 @@ struct IntelIOMMUState {
     uint8_t womask[DMAR_REG_SIZE];  /* WO (write only - read returns 0) */
     uint32_t version;
 
+    bool caching_mode;              /* RO - is cap CM enabled? */
+
     dma_addr_t root;                /* Current root table pointer */
     bool root_extended;             /* Type of root table (extended or not) */
     bool dmar_enabled;              /* Set if DMA remapping is enabled */
diff --git a/include/hw/virtio/virtio-access.h b/include/hw/virtio/virtio-access.h
index 91ae14d254..2e92074bd1 100644
--- a/include/hw/virtio/virtio-access.h
+++ b/include/hw/virtio/virtio-access.h
@@ -156,6 +156,58 @@ static inline uint16_t virtio_tswap16(VirtIODevice *vdev, uint16_t s)
 #endif
 }
 
+static inline uint16_t virtio_lduw_phys_cached(VirtIODevice *vdev,
+                                               MemoryRegionCache *cache,
+                                               hwaddr pa)
+{
+    if (virtio_access_is_big_endian(vdev)) {
+        return lduw_be_phys_cached(cache, pa);
+    }
+    return lduw_le_phys_cached(cache, pa);
+}
+
+static inline uint32_t virtio_ldl_phys_cached(VirtIODevice *vdev,
+                                              MemoryRegionCache *cache,
+                                              hwaddr pa)
+{
+    if (virtio_access_is_big_endian(vdev)) {
+        return ldl_be_phys_cached(cache, pa);
+    }
+    return ldl_le_phys_cached(cache, pa);
+}
+
+static inline uint64_t virtio_ldq_phys_cached(VirtIODevice *vdev,
+                                              MemoryRegionCache *cache,
+                                              hwaddr pa)
+{
+    if (virtio_access_is_big_endian(vdev)) {
+        return ldq_be_phys_cached(cache, pa);
+    }
+    return ldq_le_phys_cached(cache, pa);
+}
+
+static inline void virtio_stw_phys_cached(VirtIODevice *vdev,
+                                          MemoryRegionCache *cache,
+                                          hwaddr pa, uint16_t value)
+{
+    if (virtio_access_is_big_endian(vdev)) {
+        stw_be_phys_cached(cache, pa, value);
+    } else {
+        stw_le_phys_cached(cache, pa, value);
+    }
+}
+
+static inline void virtio_stl_phys_cached(VirtIODevice *vdev,
+                                          MemoryRegionCache *cache,
+                                          hwaddr pa, uint32_t value)
+{
+    if (virtio_access_is_big_endian(vdev)) {
+        stl_be_phys_cached(cache, pa, value);
+    } else {
+        stl_le_phys_cached(cache, pa, value);
+    }
+}
+
 static inline void virtio_tswap16s(VirtIODevice *vdev, uint16_t *s)
 {
     *s = virtio_tswap16(vdev, *s);
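Note: the virtio_*_phys_cached() helpers above pick the big- or little-endian
cached accessor with a single branch on device endianness, mirroring the
existing uncached virtio_*_phys() family. The stand-alone model below keeps
only that dispatch shape, with byte-assembled loads standing in for the
ld*_phys_cached() family; the function names and sample buffer are invented
for the example:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    static uint16_t lduw_le(const uint8_t *p)
    {
        return (uint16_t)(p[0] | p[1] << 8);   /* little-endian 16-bit load */
    }

    static uint16_t lduw_be(const uint8_t *p)
    {
        return (uint16_t)(p[1] | p[0] << 8);   /* big-endian 16-bit load */
    }

    /* mirrors virtio_lduw_phys_cached(): one branch on device endianness */
    static uint16_t virtio_lduw(bool big_endian, const uint8_t *cache,
                                size_t off)
    {
        if (big_endian) {
            return lduw_be(cache + off);
        }
        return lduw_le(cache + off);
    }

    int main(void)
    {
        uint8_t ring[2] = { 0x34, 0x12 };
        printf("le=0x%04x be=0x%04x\n",
               virtio_lduw(false, ring, 0),    /* 0x1234 */
               virtio_lduw(true, ring, 0));    /* 0x3412 */
        return 0;
    }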
diff --git a/include/hw/virtio/virtio-blk.h b/include/hw/virtio/virtio-blk.h
index 9734b4c446..d3c8a6fa8c 100644
--- a/include/hw/virtio/virtio-blk.h
+++ b/include/hw/virtio/virtio-blk.h
@@ -80,6 +80,6 @@ typedef struct MultiReqBuffer {
     bool is_write;
 } MultiReqBuffer;
 
-void virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq);
+bool virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq);
 
 #endif
diff --git a/include/hw/virtio/virtio-scsi.h b/include/hw/virtio/virtio-scsi.h
index 73751969ba..f536f77e68 100644
--- a/include/hw/virtio/virtio-scsi.h
+++ b/include/hw/virtio/virtio-scsi.h
@@ -126,9 +126,9 @@ void virtio_scsi_common_realize(DeviceState *dev, Error **errp,
                                 VirtIOHandleOutput cmd);
 
 void virtio_scsi_common_unrealize(DeviceState *dev, Error **errp);
-void virtio_scsi_handle_event_vq(VirtIOSCSI *s, VirtQueue *vq);
-void virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq);
-void virtio_scsi_handle_ctrl_vq(VirtIOSCSI *s, VirtQueue *vq);
+bool virtio_scsi_handle_event_vq(VirtIOSCSI *s, VirtQueue *vq);
+bool virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq);
+bool virtio_scsi_handle_ctrl_vq(VirtIOSCSI *s, VirtQueue *vq);
 void virtio_scsi_init_req(VirtIOSCSI *s, VirtQueue *vq, VirtIOSCSIReq *req);
 void virtio_scsi_free_req(VirtIOSCSIReq *req);
 void virtio_scsi_push_event(VirtIOSCSI *s, SCSIDevice *dev,
diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index 525da24222..15efcf2057 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -85,6 +85,7 @@ struct VirtIODevice
     uint32_t generation;
     int nvectors;
     VirtQueue *vq;
+    MemoryListener listener;
     uint16_t device_id;
     bool vm_running;
     bool broken; /* device in invalid state, needs reset */
@@ -154,6 +155,7 @@ void virtio_error(VirtIODevice *vdev, const char *fmt, ...) GCC_FMT_ATTR(2, 3);
 void virtio_device_set_child_bus_name(VirtIODevice *vdev, char *bus_name);
 
 typedef void (*VirtIOHandleOutput)(VirtIODevice *, VirtQueue *);
+typedef bool (*VirtIOHandleAIOOutput)(VirtIODevice *, VirtQueue *);
 
 VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size,
                             VirtIOHandleOutput handle_output);
@@ -284,8 +286,7 @@ bool virtio_device_ioeventfd_enabled(VirtIODevice *vdev);
 EventNotifier *virtio_queue_get_host_notifier(VirtQueue *vq);
 void virtio_queue_host_notifier_read(EventNotifier *n);
 void virtio_queue_aio_set_host_notifier_handler(VirtQueue *vq, AioContext *ctx,
-                                                void (*fn)(VirtIODevice *,
-                                                           VirtQueue *));
+                                                VirtIOHandleAIOOutput handle_output);
 VirtQueue *virtio_vector_first_queue(VirtIODevice *vdev, uint16_t vector);
 VirtQueue *virtio_vector_next_queue(VirtQueue *vq);
 
diff --git a/memory.c b/memory.c
--- a/memory.c
+++ b/memory.c
@@ -2371,8 +2371,13 @@ void memory_listener_register(MemoryListener *listener, AddressSpace *as)
 
 void memory_listener_unregister(MemoryListener *listener)
 {
+    if (!listener->address_space) {
+        return;
+    }
+
     QTAILQ_REMOVE(&memory_listeners, listener, link);
     QTAILQ_REMOVE(&listener->address_space->listeners, listener, link_as);
+    listener->address_space = NULL;
 }
 
 void address_space_init(AddressSpace *as, MemoryRegion *root, const char *name)
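Note: the memory.c hunk above makes memory_listener_unregister() idempotent by
treating listener->address_space as the "registered" flag -- checked on entry,
cleared on removal -- which is what lets virtio_device_instance_finalize()
call it unconditionally. The same pattern, stand-alone, with a singly linked
list in place of the QTAILQ machinery (the type and function names below are
invented for the example):

    #include <stdio.h>

    struct listener;
    struct space { struct listener *head; };
    struct listener {
        struct space *as;        /* NULL means "not registered" */
        struct listener *next;
    };

    static void reg(struct listener *l, struct space *as)
    {
        l->as = as;
        l->next = as->head;
        as->head = l;
    }

    static void unreg(struct listener *l)
    {
        struct listener **p;

        if (!l->as) {
            return;              /* second call: already removed, no-op */
        }
        for (p = &l->as->head; *p; p = &(*p)->next) {
            if (*p == l) {
                *p = l->next;    /* unlink from the space's list */
                break;
            }
        }
        l->as = NULL;            /* mark unregistered so a retry is safe */
    }

    int main(void)
    {
        struct space as = { 0 };
        struct listener l = { 0 };

        reg(&l, &as);
        unreg(&l);
        unreg(&l);               /* idempotent */
        printf("head=%p\n", (void *)as.head); /* prints a null pointer */
        return 0;
    }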