diff options
Diffstat (limited to 'hw')
-rw-r--r-- | hw/s390x/meson.build | 1 | ||||
-rw-r--r-- | hw/s390x/s390-pci-bus.c | 91 | ||||
-rw-r--r-- | hw/s390x/s390-pci-bus.h | 372 | ||||
-rw-r--r-- | hw/s390x/s390-pci-inst.c | 78 | ||||
-rw-r--r-- | hw/s390x/s390-pci-inst.h | 312 | ||||
-rw-r--r-- | hw/s390x/s390-pci-vfio.c | 276 | ||||
-rw-r--r-- | hw/s390x/s390-virtio-ccw.c | 2 | ||||
-rw-r--r-- | hw/s390x/trace-events | 5 | ||||
-rw-r--r-- | hw/vfio/common.c | 508 | ||||
-rw-r--r-- | hw/vfio/meson.build | 1 | ||||
-rw-r--r-- | hw/vfio/migration.c | 933 | ||||
-rw-r--r-- | hw/vfio/pci.c | 87 | ||||
-rw-r--r-- | hw/vfio/pci.h | 1 | ||||
-rw-r--r-- | hw/vfio/platform.c | 7 | ||||
-rw-r--r-- | hw/vfio/trace-events | 21 |
15 files changed, 1922 insertions, 773 deletions
diff --git a/hw/s390x/meson.build b/hw/s390x/meson.build index 948ceae7a7..f4663a8355 100644 --- a/hw/s390x/meson.build +++ b/hw/s390x/meson.build @@ -27,6 +27,7 @@ s390x_ss.add(when: 'CONFIG_KVM', if_true: files( )) s390x_ss.add(when: 'CONFIG_S390_CCW_VIRTIO', if_true: files('s390-virtio-ccw.c')) s390x_ss.add(when: 'CONFIG_TERMINAL3270', if_true: files('3270-ccw.c')) +s390x_ss.add(when: 'CONFIG_LINUX', if_true: files('s390-pci-vfio.c')) virtio_ss = ss.source_set() virtio_ss.add(files('virtio-ccw.c')) diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c index fb4cee87a4..48a3be802f 100644 --- a/hw/s390x/s390-pci-bus.c +++ b/hw/s390x/s390-pci-bus.c @@ -15,8 +15,9 @@ #include "qapi/error.h" #include "qapi/visitor.h" #include "cpu.h" -#include "s390-pci-bus.h" -#include "s390-pci-inst.h" +#include "hw/s390x/s390-pci-bus.h" +#include "hw/s390x/s390-pci-inst.h" +#include "hw/s390x/s390-pci-vfio.h" #include "hw/pci/pci_bus.h" #include "hw/qdev-properties.h" #include "hw/pci/pci_bridge.h" @@ -737,6 +738,57 @@ static void s390_pci_iommu_free(S390pciState *s, PCIBus *bus, int32_t devfn) object_unref(OBJECT(iommu)); } +S390PCIGroup *s390_group_create(int id) +{ + S390PCIGroup *group; + S390pciState *s = s390_get_phb(); + + group = g_new0(S390PCIGroup, 1); + group->id = id; + QTAILQ_INSERT_TAIL(&s->zpci_groups, group, link); + return group; +} + +S390PCIGroup *s390_group_find(int id) +{ + S390PCIGroup *group; + S390pciState *s = s390_get_phb(); + + QTAILQ_FOREACH(group, &s->zpci_groups, link) { + if (group->id == id) { + return group; + } + } + return NULL; +} + +static void s390_pci_init_default_group(void) +{ + S390PCIGroup *group; + ClpRspQueryPciGrp *resgrp; + + group = s390_group_create(ZPCI_DEFAULT_FN_GRP); + resgrp = &group->zpci_group; + resgrp->fr = 1; + stq_p(&resgrp->dasm, 0); + stq_p(&resgrp->msia, ZPCI_MSI_ADDR); + stw_p(&resgrp->mui, DEFAULT_MUI); + stw_p(&resgrp->i, 128); + stw_p(&resgrp->maxstbl, 128); + resgrp->version = 0; +} + +static void set_pbdev_info(S390PCIBusDevice *pbdev) +{ + pbdev->zpci_fn.sdma = ZPCI_SDMA_ADDR; + pbdev->zpci_fn.edma = ZPCI_EDMA_ADDR; + pbdev->zpci_fn.pchid = 0; + pbdev->zpci_fn.pfgid = ZPCI_DEFAULT_FN_GRP; + pbdev->zpci_fn.fid = pbdev->fid; + pbdev->zpci_fn.uid = pbdev->uid; + pbdev->pci_group = s390_group_find(ZPCI_DEFAULT_FN_GRP); +} + static void s390_pcihost_realize(DeviceState *dev, Error **errp) { PCIBus *b; @@ -764,11 +816,25 @@ static void s390_pcihost_realize(DeviceState *dev, Error **errp) s->bus_no = 0; QTAILQ_INIT(&s->pending_sei); QTAILQ_INIT(&s->zpci_devs); + QTAILQ_INIT(&s->zpci_dma_limit); + QTAILQ_INIT(&s->zpci_groups); + s390_pci_init_default_group(); css_register_io_adapters(CSS_IO_ADAPTER_PCI, true, false, S390_ADAPTER_SUPPRESSIBLE, errp); } +static void s390_pcihost_unrealize(DeviceState *dev) +{ + S390PCIGroup *group; + S390pciState *s = S390_PCI_HOST_BRIDGE(dev); + + while (!QTAILQ_EMPTY(&s->zpci_groups)) { + group = QTAILQ_FIRST(&s->zpci_groups); + QTAILQ_REMOVE(&s->zpci_groups, group, link); + } +} + static int s390_pci_msix_init(S390PCIBusDevice *pbdev) { char *name; @@ -797,7 +863,8 @@ static int s390_pci_msix_init(S390PCIBusDevice *pbdev) name = g_strdup_printf("msix-s390-%04x", pbdev->uid); memory_region_init_io(&pbdev->msix_notify_mr, OBJECT(pbdev), &s390_msi_ctrl_ops, pbdev, name, PAGE_SIZE); - memory_region_add_subregion(&pbdev->iommu->mr, ZPCI_MSI_ADDR, + memory_region_add_subregion(&pbdev->iommu->mr, + pbdev->pci_group->zpci_group.msia, &pbdev->msix_notify_mr); g_free(name); @@ -941,17 +1008,21 @@ static void s390_pcihost_plug(HotplugHandler *hotplug_dev, DeviceState *dev, } } + pbdev->pdev = pdev; + pbdev->iommu = s390_pci_get_iommu(s, pci_get_bus(pdev), pdev->devfn); + pbdev->iommu->pbdev = pbdev; + pbdev->state = ZPCI_FS_DISABLED; + set_pbdev_info(pbdev); + if (object_dynamic_cast(OBJECT(dev), "vfio-pci")) { pbdev->fh |= FH_SHM_VFIO; + pbdev->iommu->dma_limit = s390_pci_start_dma_count(s, pbdev); + /* Fill in CLP information passed via the vfio region */ + s390_pci_get_clp_info(pbdev); } else { pbdev->fh |= FH_SHM_EMUL; } - pbdev->pdev = pdev; - pbdev->iommu = s390_pci_get_iommu(s, pci_get_bus(pdev), pdev->devfn); - pbdev->iommu->pbdev = pbdev; - pbdev->state = ZPCI_FS_DISABLED; - if (s390_pci_msix_init(pbdev)) { error_setg(errp, "MSI-X support is mandatory " "in the S390 architecture"); @@ -1004,6 +1075,9 @@ static void s390_pcihost_unplug(HotplugHandler *hotplug_dev, DeviceState *dev, pbdev->fid = 0; QTAILQ_REMOVE(&s->zpci_devs, pbdev, link); g_hash_table_remove(s->zpci_table, &pbdev->idx); + if (pbdev->iommu->dma_limit) { + s390_pci_end_dma_count(s, pbdev->iommu->dma_limit); + } qdev_unrealize(dev); } } @@ -1123,6 +1197,7 @@ static void s390_pcihost_class_init(ObjectClass *klass, void *data) dc->reset = s390_pcihost_reset; dc->realize = s390_pcihost_realize; + dc->unrealize = s390_pcihost_unrealize; hc->pre_plug = s390_pcihost_pre_plug; hc->plug = s390_pcihost_plug; hc->unplug_request = s390_pcihost_unplug_request; diff --git a/hw/s390x/s390-pci-bus.h b/hw/s390x/s390-pci-bus.h deleted file mode 100644 index 97464d0ad3..0000000000 --- a/hw/s390x/s390-pci-bus.h +++ /dev/null @@ -1,372 +0,0 @@ -/* - * s390 PCI BUS definitions - * - * Copyright 2014 IBM Corp. - * Author(s): Frank Blaschka <frank.blaschka@de.ibm.com> - * Hong Bo Li <lihbbj@cn.ibm.com> - * Yi Min Zhao <zyimin@cn.ibm.com> - * - * This work is licensed under the terms of the GNU GPL, version 2 or (at - * your option) any later version. See the COPYING file in the top-level - * directory. - */ - -#ifndef HW_S390_PCI_BUS_H -#define HW_S390_PCI_BUS_H - -#include "hw/pci/pci.h" -#include "hw/pci/pci_host.h" -#include "hw/s390x/sclp.h" -#include "hw/s390x/s390_flic.h" -#include "hw/s390x/css.h" -#include "qom/object.h" - -#define TYPE_S390_PCI_HOST_BRIDGE "s390-pcihost" -#define TYPE_S390_PCI_BUS "s390-pcibus" -#define TYPE_S390_PCI_DEVICE "zpci" -#define TYPE_S390_PCI_IOMMU "s390-pci-iommu" -#define TYPE_S390_IOMMU_MEMORY_REGION "s390-iommu-memory-region" -#define FH_MASK_ENABLE 0x80000000 -#define FH_MASK_INSTANCE 0x7f000000 -#define FH_MASK_SHM 0x00ff0000 -#define FH_MASK_INDEX 0x0000ffff -#define FH_SHM_VFIO 0x00010000 -#define FH_SHM_EMUL 0x00020000 -#define ZPCI_MAX_FID 0xffffffff -#define ZPCI_MAX_UID 0xffff -#define UID_UNDEFINED 0 -#define UID_CHECKING_ENABLED 0x01 - -OBJECT_DECLARE_SIMPLE_TYPE(S390pciState, S390_PCI_HOST_BRIDGE) -OBJECT_DECLARE_SIMPLE_TYPE(S390PCIBus, S390_PCI_BUS) -OBJECT_DECLARE_SIMPLE_TYPE(S390PCIBusDevice, S390_PCI_DEVICE) -OBJECT_DECLARE_SIMPLE_TYPE(S390PCIIOMMU, S390_PCI_IOMMU) - -#define HP_EVENT_TO_CONFIGURED 0x0301 -#define HP_EVENT_RESERVED_TO_STANDBY 0x0302 -#define HP_EVENT_DECONFIGURE_REQUEST 0x0303 -#define HP_EVENT_CONFIGURED_TO_STBRES 0x0304 -#define HP_EVENT_STANDBY_TO_RESERVED 0x0308 - -#define ERR_EVENT_INVALAS 0x1 -#define ERR_EVENT_OORANGE 0x2 -#define ERR_EVENT_INVALTF 0x3 -#define ERR_EVENT_TPROTE 0x4 -#define ERR_EVENT_APROTE 0x5 -#define ERR_EVENT_KEYE 0x6 -#define ERR_EVENT_INVALTE 0x7 -#define ERR_EVENT_INVALTL 0x8 -#define ERR_EVENT_TT 0x9 -#define ERR_EVENT_INVALMS 0xa -#define ERR_EVENT_SERR 0xb -#define ERR_EVENT_NOMSI 0x10 -#define ERR_EVENT_INVALBV 0x11 -#define ERR_EVENT_AIBV 0x12 -#define ERR_EVENT_AIRERR 0x13 -#define ERR_EVENT_FMBA 0x2a -#define ERR_EVENT_FMBUP 0x2b -#define ERR_EVENT_FMBPRO 0x2c -#define ERR_EVENT_CCONF 0x30 -#define ERR_EVENT_SERVAC 0x3a -#define ERR_EVENT_PERMERR 0x3b - -#define ERR_EVENT_Q_BIT 0x2 -#define ERR_EVENT_MVN_OFFSET 16 - -#define ZPCI_MSI_VEC_BITS 11 -#define ZPCI_MSI_VEC_MASK 0x7ff - -#define ZPCI_MSI_ADDR 0xfe00000000000000ULL -#define ZPCI_SDMA_ADDR 0x100000000ULL -#define ZPCI_EDMA_ADDR 0x1ffffffffffffffULL - -#define PAGE_SHIFT 12 -#define PAGE_SIZE (1 << PAGE_SHIFT) -#define PAGE_MASK (~(PAGE_SIZE-1)) -#define PAGE_DEFAULT_ACC 0 -#define PAGE_DEFAULT_KEY (PAGE_DEFAULT_ACC << 4) - -/* I/O Translation Anchor (IOTA) */ -enum ZpciIoatDtype { - ZPCI_IOTA_STO = 0, - ZPCI_IOTA_RTTO = 1, - ZPCI_IOTA_RSTO = 2, - ZPCI_IOTA_RFTO = 3, - ZPCI_IOTA_PFAA = 4, - ZPCI_IOTA_IOPFAA = 5, - ZPCI_IOTA_IOPTO = 7 -}; - -#define ZPCI_IOTA_IOT_ENABLED 0x800ULL -#define ZPCI_IOTA_DT_ST (ZPCI_IOTA_STO << 2) -#define ZPCI_IOTA_DT_RT (ZPCI_IOTA_RTTO << 2) -#define ZPCI_IOTA_DT_RS (ZPCI_IOTA_RSTO << 2) -#define ZPCI_IOTA_DT_RF (ZPCI_IOTA_RFTO << 2) -#define ZPCI_IOTA_DT_PF (ZPCI_IOTA_PFAA << 2) -#define ZPCI_IOTA_FS_4K 0 -#define ZPCI_IOTA_FS_1M 1 -#define ZPCI_IOTA_FS_2G 2 -#define ZPCI_KEY (PAGE_DEFAULT_KEY << 5) - -#define ZPCI_IOTA_STO_FLAG (ZPCI_IOTA_IOT_ENABLED | ZPCI_KEY | ZPCI_IOTA_DT_ST) -#define ZPCI_IOTA_RTTO_FLAG (ZPCI_IOTA_IOT_ENABLED | ZPCI_KEY | ZPCI_IOTA_DT_RT) -#define ZPCI_IOTA_RSTO_FLAG (ZPCI_IOTA_IOT_ENABLED | ZPCI_KEY | ZPCI_IOTA_DT_RS) -#define ZPCI_IOTA_RFTO_FLAG (ZPCI_IOTA_IOT_ENABLED | ZPCI_KEY | ZPCI_IOTA_DT_RF) -#define ZPCI_IOTA_RFAA_FLAG (ZPCI_IOTA_IOT_ENABLED | ZPCI_KEY |\ - ZPCI_IOTA_DT_PF | ZPCI_IOTA_FS_2G) - -/* I/O Region and segment tables */ -#define ZPCI_INDEX_MASK 0x7ffULL - -#define ZPCI_TABLE_TYPE_MASK 0xc -#define ZPCI_TABLE_TYPE_RFX 0xc -#define ZPCI_TABLE_TYPE_RSX 0x8 -#define ZPCI_TABLE_TYPE_RTX 0x4 -#define ZPCI_TABLE_TYPE_SX 0x0 - -#define ZPCI_TABLE_LEN_RFX 0x3 -#define ZPCI_TABLE_LEN_RSX 0x3 -#define ZPCI_TABLE_LEN_RTX 0x3 - -#define ZPCI_TABLE_OFFSET_MASK 0xc0 -#define ZPCI_TABLE_SIZE 0x4000 -#define ZPCI_TABLE_ALIGN ZPCI_TABLE_SIZE -#define ZPCI_TABLE_ENTRY_SIZE (sizeof(unsigned long)) -#define ZPCI_TABLE_ENTRIES (ZPCI_TABLE_SIZE / ZPCI_TABLE_ENTRY_SIZE) - -#define ZPCI_TABLE_BITS 11 -#define ZPCI_PT_BITS 8 -#define ZPCI_ST_SHIFT (ZPCI_PT_BITS + PAGE_SHIFT) -#define ZPCI_RT_SHIFT (ZPCI_ST_SHIFT + ZPCI_TABLE_BITS) - -#define ZPCI_RTE_FLAG_MASK 0x3fffULL -#define ZPCI_RTE_ADDR_MASK (~ZPCI_RTE_FLAG_MASK) -#define ZPCI_STE_FLAG_MASK 0x7ffULL -#define ZPCI_STE_ADDR_MASK (~ZPCI_STE_FLAG_MASK) - -#define ZPCI_SFAA_MASK (~((1ULL << 20) - 1)) - -/* I/O Page tables */ -#define ZPCI_PTE_VALID_MASK 0x400 -#define ZPCI_PTE_INVALID 0x400 -#define ZPCI_PTE_VALID 0x000 -#define ZPCI_PT_SIZE 0x800 -#define ZPCI_PT_ALIGN ZPCI_PT_SIZE -#define ZPCI_PT_ENTRIES (ZPCI_PT_SIZE / ZPCI_TABLE_ENTRY_SIZE) -#define ZPCI_PT_MASK (ZPCI_PT_ENTRIES - 1) - -#define ZPCI_PTE_FLAG_MASK 0xfffULL -#define ZPCI_PTE_ADDR_MASK (~ZPCI_PTE_FLAG_MASK) - -/* Shared bits */ -#define ZPCI_TABLE_VALID 0x00 -#define ZPCI_TABLE_INVALID 0x20 -#define ZPCI_TABLE_PROTECTED 0x200 -#define ZPCI_TABLE_UNPROTECTED 0x000 -#define ZPCI_TABLE_FC 0x400 - -#define ZPCI_TABLE_VALID_MASK 0x20 -#define ZPCI_TABLE_PROT_MASK 0x200 - -#define ZPCI_ETT_RT 1 -#define ZPCI_ETT_ST 0 -#define ZPCI_ETT_PT -1 - -/* PCI Function States - * - * reserved: default; device has just been plugged or is in progress of being - * unplugged - * standby: device is present but not configured; transition from any - * configured state/to this state via sclp configure/deconfigure - * - * The following states make up the "configured" meta-state: - * disabled: device is configured but not enabled; transition between this - * state and enabled via clp enable/disable - * enbaled: device is ready for use; transition to disabled via clp disable; - * may enter an error state - * blocked: ignore all DMA and interrupts; transition back to enabled or from - * error state via mpcifc - * error: an error occurred; transition back to enabled via mpcifc - * permanent error: an unrecoverable error occurred; transition to standby via - * sclp deconfigure - */ -typedef enum { - ZPCI_FS_RESERVED, - ZPCI_FS_STANDBY, - ZPCI_FS_DISABLED, - ZPCI_FS_ENABLED, - ZPCI_FS_BLOCKED, - ZPCI_FS_ERROR, - ZPCI_FS_PERMANENT_ERROR, -} ZpciState; - -typedef struct SeiContainer { - QTAILQ_ENTRY(SeiContainer) link; - uint32_t fid; - uint32_t fh; - uint8_t cc; - uint16_t pec; - uint64_t faddr; - uint32_t e; -} SeiContainer; - -typedef struct PciCcdfErr { - uint32_t reserved1; - uint32_t fh; - uint32_t fid; - uint32_t e; - uint64_t faddr; - uint32_t reserved3; - uint16_t reserved4; - uint16_t pec; -} QEMU_PACKED PciCcdfErr; - -typedef struct PciCcdfAvail { - uint32_t reserved1; - uint32_t fh; - uint32_t fid; - uint32_t reserved2; - uint32_t reserved3; - uint32_t reserved4; - uint32_t reserved5; - uint16_t reserved6; - uint16_t pec; -} QEMU_PACKED PciCcdfAvail; - -typedef struct ChscSeiNt2Res { - uint16_t length; - uint16_t code; - uint16_t reserved1; - uint8_t reserved2; - uint8_t nt; - uint8_t flags; - uint8_t reserved3; - uint8_t reserved4; - uint8_t cc; - uint32_t reserved5[13]; - uint8_t ccdf[4016]; -} QEMU_PACKED ChscSeiNt2Res; - -typedef struct S390MsixInfo { - uint8_t table_bar; - uint8_t pba_bar; - uint16_t entries; - uint32_t table_offset; - uint32_t pba_offset; -} S390MsixInfo; - -typedef struct S390IOTLBEntry { - uint64_t iova; - uint64_t translated_addr; - uint64_t len; - uint64_t perm; -} S390IOTLBEntry; - -struct S390PCIIOMMU { - Object parent_obj; - S390PCIBusDevice *pbdev; - AddressSpace as; - MemoryRegion mr; - IOMMUMemoryRegion iommu_mr; - bool enabled; - uint64_t g_iota; - uint64_t pba; - uint64_t pal; - GHashTable *iotlb; -}; - -typedef struct S390PCIIOMMUTable { - uint64_t key; - S390PCIIOMMU *iommu[PCI_SLOT_MAX]; -} S390PCIIOMMUTable; - -/* Function Measurement Block */ -#define DEFAULT_MUI 4000 -#define UPDATE_U_BIT 0x1ULL -#define FMBK_MASK 0xfULL - -typedef struct ZpciFmbFmt0 { - uint64_t dma_rbytes; - uint64_t dma_wbytes; -} ZpciFmbFmt0; - -#define ZPCI_FMB_CNT_LD 0 -#define ZPCI_FMB_CNT_ST 1 -#define ZPCI_FMB_CNT_STB 2 -#define ZPCI_FMB_CNT_RPCIT 3 -#define ZPCI_FMB_CNT_MAX 4 - -#define ZPCI_FMB_FORMAT 0 - -typedef struct ZpciFmb { - uint32_t format; - uint32_t sample; - uint64_t last_update; - uint64_t counter[ZPCI_FMB_CNT_MAX]; - ZpciFmbFmt0 fmt0; -} ZpciFmb; -QEMU_BUILD_BUG_MSG(offsetof(ZpciFmb, fmt0) != 48, "padding in ZpciFmb"); - -struct S390PCIBusDevice { - DeviceState qdev; - PCIDevice *pdev; - ZpciState state; - char *target; - uint16_t uid; - uint32_t idx; - uint32_t fh; - uint32_t fid; - bool fid_defined; - uint64_t fmb_addr; - ZpciFmb fmb; - QEMUTimer *fmb_timer; - uint8_t isc; - uint16_t noi; - uint16_t maxstbl; - uint8_t sum; - S390MsixInfo msix; - AdapterRoutes routes; - S390PCIIOMMU *iommu; - MemoryRegion msix_notify_mr; - IndAddr *summary_ind; - IndAddr *indicator; - bool pci_unplug_request_processed; - bool unplug_requested; - QTAILQ_ENTRY(S390PCIBusDevice) link; -}; - -struct S390PCIBus { - BusState qbus; -}; - -struct S390pciState { - PCIHostState parent_obj; - uint32_t next_idx; - int bus_no; - S390PCIBus *bus; - GHashTable *iommu_table; - GHashTable *zpci_table; - QTAILQ_HEAD(, SeiContainer) pending_sei; - QTAILQ_HEAD(, S390PCIBusDevice) zpci_devs; -}; - -S390pciState *s390_get_phb(void); -int pci_chsc_sei_nt2_get_event(void *res); -int pci_chsc_sei_nt2_have_event(void); -void s390_pci_sclp_configure(SCCB *sccb); -void s390_pci_sclp_deconfigure(SCCB *sccb); -void s390_pci_iommu_enable(S390PCIIOMMU *iommu); -void s390_pci_iommu_disable(S390PCIIOMMU *iommu); -void s390_pci_generate_error_event(uint16_t pec, uint32_t fh, uint32_t fid, - uint64_t faddr, uint32_t e); -uint16_t s390_guest_io_table_walk(uint64_t g_iota, hwaddr addr, - S390IOTLBEntry *entry); -S390PCIBusDevice *s390_pci_find_dev_by_idx(S390pciState *s, uint32_t idx); -S390PCIBusDevice *s390_pci_find_dev_by_fh(S390pciState *s, uint32_t fh); -S390PCIBusDevice *s390_pci_find_dev_by_fid(S390pciState *s, uint32_t fid); -S390PCIBusDevice *s390_pci_find_dev_by_target(S390pciState *s, - const char *target); -S390PCIBusDevice *s390_pci_find_next_avail_dev(S390pciState *s, - S390PCIBusDevice *pbdev); - -#endif diff --git a/hw/s390x/s390-pci-inst.c b/hw/s390x/s390-pci-inst.c index 2f7a7d7bd1..58cd041d17 100644 --- a/hw/s390x/s390-pci-inst.c +++ b/hw/s390x/s390-pci-inst.c @@ -13,12 +13,12 @@ #include "qemu/osdep.h" #include "cpu.h" -#include "s390-pci-inst.h" -#include "s390-pci-bus.h" #include "exec/memop.h" #include "exec/memory-internal.h" #include "qemu/error-report.h" #include "sysemu/hw_accel.h" +#include "hw/s390x/s390-pci-inst.h" +#include "hw/s390x/s390-pci-bus.h" #include "hw/s390x/tod.h" #ifndef DEBUG_S390PCI_INST @@ -32,6 +32,20 @@ } \ } while (0) +static inline void inc_dma_avail(S390PCIIOMMU *iommu) +{ + if (iommu->dma_limit) { + iommu->dma_limit->avail++; + } +} + +static inline void dec_dma_avail(S390PCIIOMMU *iommu) +{ + if (iommu->dma_limit) { + iommu->dma_limit->avail--; + } +} + static void s390_set_status_code(CPUS390XState *env, uint8_t r, uint64_t status_code) { @@ -267,6 +281,8 @@ int clp_service_call(S390CPU *cpu, uint8_t r2, uintptr_t ra) goto out; } + memcpy(resquery, &pbdev->zpci_fn, sizeof(*resquery)); + for (i = 0; i < PCI_BAR_COUNT; i++) { uint32_t data = pci_get_long(pbdev->pdev->config + PCI_BASE_ADDRESS_0 + (i * 4)); @@ -280,25 +296,23 @@ int clp_service_call(S390CPU *cpu, uint8_t r2, uintptr_t ra) resquery->bar_size[i]); } - stq_p(&resquery->sdma, ZPCI_SDMA_ADDR); - stq_p(&resquery->edma, ZPCI_EDMA_ADDR); - stl_p(&resquery->fid, pbdev->fid); - stw_p(&resquery->pchid, 0); - stw_p(&resquery->ug, 1); - stl_p(&resquery->uid, pbdev->uid); stw_p(&resquery->hdr.rsp, CLP_RC_OK); break; } case CLP_QUERY_PCI_FNGRP: { ClpRspQueryPciGrp *resgrp = (ClpRspQueryPciGrp *)resh; - resgrp->fr = 1; - stq_p(&resgrp->dasm, 0); - stq_p(&resgrp->msia, ZPCI_MSI_ADDR); - stw_p(&resgrp->mui, DEFAULT_MUI); - stw_p(&resgrp->i, 128); - stw_p(&resgrp->maxstbl, 128); - resgrp->version = 0; + ClpReqQueryPciGrp *reqgrp = (ClpReqQueryPciGrp *)reqh; + S390PCIGroup *group; + + group = s390_group_find(reqgrp->g); + if (!group) { + /* We do not allow access to unknown groups */ + /* The group must have been obtained with a vfio device */ + stw_p(&resgrp->hdr.rsp, CLP_RC_QUERYPCIFG_PFGID); + goto out; + } + memcpy(resgrp, &group->zpci_group, sizeof(ClpRspQueryPciGrp)); stw_p(&resgrp->hdr.rsp, CLP_RC_OK); break; } @@ -572,7 +586,8 @@ int pcistg_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra) return 0; } -static void s390_pci_update_iotlb(S390PCIIOMMU *iommu, S390IOTLBEntry *entry) +static uint32_t s390_pci_update_iotlb(S390PCIIOMMU *iommu, + S390IOTLBEntry *entry) { S390IOTLBEntry *cache = g_hash_table_lookup(iommu->iotlb, &entry->iova); IOMMUTLBEntry notify = { @@ -585,14 +600,15 @@ static void s390_pci_update_iotlb(S390PCIIOMMU *iommu, S390IOTLBEntry *entry) if (entry->perm == IOMMU_NONE) { if (!cache) { - return; + goto out; } g_hash_table_remove(iommu->iotlb, &entry->iova); + inc_dma_avail(iommu); } else { if (cache) { if (cache->perm == entry->perm && cache->translated_addr == entry->translated_addr) { - return; + goto out; } notify.perm = IOMMU_NONE; @@ -606,9 +622,13 @@ static void s390_pci_update_iotlb(S390PCIIOMMU *iommu, S390IOTLBEntry *entry) cache->len = PAGE_SIZE; cache->perm = entry->perm; g_hash_table_replace(iommu->iotlb, &cache->iova, cache); + dec_dma_avail(iommu); } memory_region_notify_iommu(&iommu->iommu_mr, 0, notify); + +out: + return iommu->dma_limit ? iommu->dma_limit->avail : 1; } int rpcit_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra) @@ -620,6 +640,7 @@ int rpcit_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra) S390PCIIOMMU *iommu; S390IOTLBEntry entry; hwaddr start, end; + uint32_t dma_avail; if (env->psw.mask & PSW_MASK_PSTATE) { s390_program_interrupt(env, PGM_PRIVILEGED, ra); @@ -658,6 +679,11 @@ int rpcit_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra) } iommu = pbdev->iommu; + if (iommu->dma_limit) { + dma_avail = iommu->dma_limit->avail; + } else { + dma_avail = 1; + } if (!iommu->g_iota) { error = ERR_EVENT_INVALAS; goto err; @@ -675,8 +701,9 @@ int rpcit_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra) } start += entry.len; - while (entry.iova < start && entry.iova < end) { - s390_pci_update_iotlb(iommu, &entry); + while (entry.iova < start && entry.iova < end && + (dma_avail > 0 || entry.perm == IOMMU_NONE)) { + dma_avail = s390_pci_update_iotlb(iommu, &entry); entry.iova += PAGE_SIZE; entry.translated_addr += PAGE_SIZE; } @@ -689,7 +716,13 @@ err: s390_pci_generate_error_event(error, pbdev->fh, pbdev->fid, start, 0); } else { pbdev->fmb.counter[ZPCI_FMB_CNT_RPCIT]++; - setcc(cpu, ZPCI_PCI_LS_OK); + if (dma_avail > 0) { + setcc(cpu, ZPCI_PCI_LS_OK); + } else { + /* vfio DMA mappings are exhausted, trigger a RPCIT */ + setcc(cpu, ZPCI_PCI_LS_ERR); + s390_set_status_code(env, r1, ZPCI_RPCIT_ST_INSUFF_RES); + } } return 0; } @@ -754,7 +787,8 @@ int pcistb_service_call(S390CPU *cpu, uint8_t r1, uint8_t r3, uint64_t gaddr, } /* Length must be greater than 8, a multiple of 8 */ /* and not greater than maxstbl */ - if ((len <= 8) || (len % 8) || (len > pbdev->maxstbl)) { + if ((len <= 8) || (len % 8) || + (len > pbdev->pci_group->zpci_group.maxstbl)) { goto specification_error; } /* Do not cross a 4K-byte boundary */ diff --git a/hw/s390x/s390-pci-inst.h b/hw/s390x/s390-pci-inst.h deleted file mode 100644 index fa3bf8b5aa..0000000000 --- a/hw/s390x/s390-pci-inst.h +++ /dev/null @@ -1,312 +0,0 @@ -/* - * s390 PCI instruction definitions - * - * Copyright 2014 IBM Corp. - * Author(s): Frank Blaschka <frank.blaschka@de.ibm.com> - * Hong Bo Li <lihbbj@cn.ibm.com> - * Yi Min Zhao <zyimin@cn.ibm.com> - * - * This work is licensed under the terms of the GNU GPL, version 2 or (at - * your option) any later version. See the COPYING file in the top-level - * directory. - */ - -#ifndef HW_S390_PCI_INST_H -#define HW_S390_PCI_INST_H - -#include "s390-pci-bus.h" -#include "sysemu/dma.h" - -/* CLP common request & response block size */ -#define CLP_BLK_SIZE 4096 -#define PCI_BAR_COUNT 6 -#define PCI_MAX_FUNCTIONS 4096 - -typedef struct ClpReqHdr { - uint16_t len; - uint16_t cmd; -} QEMU_PACKED ClpReqHdr; - -typedef struct ClpRspHdr { - uint16_t len; - uint16_t rsp; -} QEMU_PACKED ClpRspHdr; - -/* CLP Response Codes */ -#define CLP_RC_OK 0x0010 /* Command request successfully */ -#define CLP_RC_CMD 0x0020 /* Command code not recognized */ -#define CLP_RC_PERM 0x0030 /* Command not authorized */ -#define CLP_RC_FMT 0x0040 /* Invalid command request format */ -#define CLP_RC_LEN 0x0050 /* Invalid command request length */ -#define CLP_RC_8K 0x0060 /* Command requires 8K LPCB */ -#define CLP_RC_RESNOT0 0x0070 /* Reserved field not zero */ -#define CLP_RC_NODATA 0x0080 /* No data available */ -#define CLP_RC_FC_UNKNOWN 0x0100 /* Function code not recognized */ - -/* - * Call Logical Processor - Command Codes - */ -#define CLP_LIST_PCI 0x0002 -#define CLP_QUERY_PCI_FN 0x0003 -#define CLP_QUERY_PCI_FNGRP 0x0004 -#define CLP_SET_PCI_FN 0x0005 - -/* PCI function handle list entry */ -typedef struct ClpFhListEntry { - uint16_t device_id; - uint16_t vendor_id; -#define CLP_FHLIST_MASK_CONFIG 0x80000000 - uint32_t config; - uint32_t fid; - uint32_t fh; -} QEMU_PACKED ClpFhListEntry; - -#define CLP_RC_SETPCIFN_FH 0x0101 /* Invalid PCI fn handle */ -#define CLP_RC_SETPCIFN_FHOP 0x0102 /* Fn handle not valid for op */ -#define CLP_RC_SETPCIFN_DMAAS 0x0103 /* Invalid DMA addr space */ -#define CLP_RC_SETPCIFN_RES 0x0104 /* Insufficient resources */ -#define CLP_RC_SETPCIFN_ALRDY 0x0105 /* Fn already in requested state */ -#define CLP_RC_SETPCIFN_ERR 0x0106 /* Fn in permanent error state */ -#define CLP_RC_SETPCIFN_RECPND 0x0107 /* Error recovery pending */ -#define CLP_RC_SETPCIFN_BUSY 0x0108 /* Fn busy */ -#define CLP_RC_LISTPCI_BADRT 0x010a /* Resume token not recognized */ -#define CLP_RC_QUERYPCIFG_PFGID 0x010b /* Unrecognized PFGID */ - -/* request or response block header length */ -#define LIST_PCI_HDR_LEN 32 - -/* Number of function handles fitting in response block */ -#define CLP_FH_LIST_NR_ENTRIES \ - ((CLP_BLK_SIZE - 2 * LIST_PCI_HDR_LEN) \ - / sizeof(ClpFhListEntry)) - -#define CLP_SET_ENABLE_PCI_FN 0 /* Yes, 0 enables it */ -#define CLP_SET_DISABLE_PCI_FN 1 /* Yes, 1 disables it */ - -#define CLP_UTIL_STR_LEN 64 - -#define CLP_MASK_FMT 0xf0000000 - -/* List PCI functions request */ -typedef struct ClpReqListPci { - ClpReqHdr hdr; - uint32_t fmt; - uint64_t reserved1; - uint64_t resume_token; - uint64_t reserved2; -} QEMU_PACKED ClpReqListPci; - -/* List PCI functions response */ -typedef struct ClpRspListPci { - ClpRspHdr hdr; - uint32_t fmt; - uint64_t reserved1; - uint64_t resume_token; - uint32_t mdd; - uint16_t max_fn; - uint8_t flags; - uint8_t entry_size; - ClpFhListEntry fh_list[CLP_FH_LIST_NR_ENTRIES]; -} QEMU_PACKED ClpRspListPci; - -/* Query PCI function request */ -typedef struct ClpReqQueryPci { - ClpReqHdr hdr; - uint32_t fmt; - uint64_t reserved1; - uint32_t fh; /* function handle */ - uint32_t reserved2; - uint64_t reserved3; -} QEMU_PACKED ClpReqQueryPci; - -/* Query PCI function response */ -typedef struct ClpRspQueryPci { - ClpRspHdr hdr; - uint32_t fmt; - uint64_t reserved1; - uint16_t vfn; /* virtual fn number */ -#define CLP_RSP_QPCI_MASK_UTIL 0x100 -#define CLP_RSP_QPCI_MASK_PFGID 0xff - uint16_t ug; - uint32_t fid; /* pci function id */ - uint8_t bar_size[PCI_BAR_COUNT]; - uint16_t pchid; - uint32_t bar[PCI_BAR_COUNT]; - uint64_t reserved2; - uint64_t sdma; /* start dma as */ - uint64_t edma; /* end dma as */ - uint32_t reserved3[11]; - uint32_t uid; - uint8_t util_str[CLP_UTIL_STR_LEN]; /* utility string */ -} QEMU_PACKED ClpRspQueryPci; - -/* Query PCI function group request */ -typedef struct ClpReqQueryPciGrp { - ClpReqHdr hdr; - uint32_t fmt; - uint64_t reserved1; -#define CLP_REQ_QPCIG_MASK_PFGID 0xff - uint32_t g; - uint32_t reserved2; - uint64_t reserved3; -} QEMU_PACKED ClpReqQueryPciGrp; - -/* Query PCI function group response */ -typedef struct ClpRspQueryPciGrp { - ClpRspHdr hdr; - uint32_t fmt; - uint64_t reserved1; -#define CLP_RSP_QPCIG_MASK_NOI 0xfff - uint16_t i; - uint8_t version; -#define CLP_RSP_QPCIG_MASK_FRAME 0x2 -#define CLP_RSP_QPCIG_MASK_REFRESH 0x1 - uint8_t fr; - uint16_t maxstbl; - uint16_t mui; - uint64_t reserved3; - uint64_t dasm; /* dma address space mask */ - uint64_t msia; /* MSI address */ - uint64_t reserved4; - uint64_t reserved5; -} QEMU_PACKED ClpRspQueryPciGrp; - -/* Set PCI function request */ -typedef struct ClpReqSetPci { - ClpReqHdr hdr; - uint32_t fmt; - uint64_t reserved1; - uint32_t fh; /* function handle */ - uint16_t reserved2; - uint8_t oc; /* operation controls */ - uint8_t ndas; /* number of dma spaces */ - uint64_t reserved3; -} QEMU_PACKED ClpReqSetPci; - -/* Set PCI function response */ -typedef struct ClpRspSetPci { - ClpRspHdr hdr; - uint32_t fmt; - uint64_t reserved1; - uint32_t fh; /* function handle */ - uint32_t reserved3; - uint64_t reserved4; -} QEMU_PACKED ClpRspSetPci; - -typedef struct ClpReqRspListPci { - ClpReqListPci request; - ClpRspListPci response; -} QEMU_PACKED ClpReqRspListPci; - -typedef struct ClpReqRspSetPci { - ClpReqSetPci request; - ClpRspSetPci response; -} QEMU_PACKED ClpReqRspSetPci; - -typedef struct ClpReqRspQueryPci { - ClpReqQueryPci request; - ClpRspQueryPci response; -} QEMU_PACKED ClpReqRspQueryPci; - -typedef struct ClpReqRspQueryPciGrp { - ClpReqQueryPciGrp request; - ClpRspQueryPciGrp response; -} QEMU_PACKED ClpReqRspQueryPciGrp; - -/* Load/Store status codes */ -#define ZPCI_PCI_ST_FUNC_NOT_ENABLED 4 -#define ZPCI_PCI_ST_FUNC_IN_ERR 8 -#define ZPCI_PCI_ST_BLOCKED 12 -#define ZPCI_PCI_ST_INSUF_RES 16 -#define ZPCI_PCI_ST_INVAL_AS 20 -#define ZPCI_PCI_ST_FUNC_ALREADY_ENABLED 24 -#define ZPCI_PCI_ST_DMA_AS_NOT_ENABLED 28 -#define ZPCI_PCI_ST_2ND_OP_IN_INV_AS 36 -#define ZPCI_PCI_ST_FUNC_NOT_AVAIL 40 -#define ZPCI_PCI_ST_ALREADY_IN_RQ_STATE 44 - -/* Load/Store return codes */ -#define ZPCI_PCI_LS_OK 0 -#define ZPCI_PCI_LS_ERR 1 -#define ZPCI_PCI_LS_BUSY 2 -#define ZPCI_PCI_LS_INVAL_HANDLE 3 - -/* Modify PCI status codes */ -#define ZPCI_MOD_ST_RES_NOT_AVAIL 4 -#define ZPCI_MOD_ST_INSUF_RES 16 -#define ZPCI_MOD_ST_SEQUENCE 24 -#define ZPCI_MOD_ST_DMAAS_INVAL 28 -#define ZPCI_MOD_ST_FRAME_INVAL 32 -#define ZPCI_MOD_ST_ERROR_RECOVER 40 - -/* Modify PCI Function Controls */ -#define ZPCI_MOD_FC_REG_INT 2 -#define ZPCI_MOD_FC_DEREG_INT 3 -#define ZPCI_MOD_FC_REG_IOAT 4 -#define ZPCI_MOD_FC_DEREG_IOAT 5 -#define ZPCI_MOD_FC_REREG_IOAT 6 -#define ZPCI_MOD_FC_RESET_ERROR 7 -#define ZPCI_MOD_FC_RESET_BLOCK 9 -#define ZPCI_MOD_FC_SET_MEASURE 10 - -/* Store PCI Function Controls status codes */ -#define ZPCI_STPCIFC_ST_PERM_ERROR 8 -#define ZPCI_STPCIFC_ST_INVAL_DMAAS 28 -#define ZPCI_STPCIFC_ST_ERROR_RECOVER 40 - -/* FIB function controls */ -#define ZPCI_FIB_FC_ENABLED 0x80 -#define ZPCI_FIB_FC_ERROR 0x40 -#define ZPCI_FIB_FC_LS_BLOCKED 0x20 -#define ZPCI_FIB_FC_DMAAS_REG 0x10 - -/* FIB function controls */ -#define ZPCI_FIB_FC_ENABLED 0x80 -#define ZPCI_FIB_FC_ERROR 0x40 -#define ZPCI_FIB_FC_LS_BLOCKED 0x20 -#define ZPCI_FIB_FC_DMAAS_REG 0x10 - -/* Function Information Block */ -typedef struct ZpciFib { - uint8_t fmt; /* format */ - uint8_t reserved1[7]; - uint8_t fc; /* function controls */ - uint8_t reserved2; - uint16_t reserved3; - uint32_t reserved4; - uint64_t pba; /* PCI base address */ - uint64_t pal; /* PCI address limit */ - uint64_t iota; /* I/O Translation Anchor */ -#define FIB_DATA_ISC(x) (((x) >> 28) & 0x7) -#define FIB_DATA_NOI(x) (((x) >> 16) & 0xfff) -#define FIB_DATA_AIBVO(x) (((x) >> 8) & 0x3f) -#define FIB_DATA_SUM(x) (((x) >> 7) & 0x1) -#define FIB_DATA_AISBO(x) ((x) & 0x3f) - uint32_t data; - uint32_t reserved5; - uint64_t aibv; /* Adapter int bit vector address */ - uint64_t aisb; /* Adapter int summary bit address */ - uint64_t fmb_addr; /* Function measurement address and key */ - uint32_t reserved6; - uint32_t gd; -} QEMU_PACKED ZpciFib; - -int pci_dereg_irqs(S390PCIBusDevice *pbdev); -void pci_dereg_ioat(S390PCIIOMMU *iommu); -int clp_service_call(S390CPU *cpu, uint8_t r2, uintptr_t ra); -int pcilg_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra); -int pcistg_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra); -int rpcit_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra); -int pcistb_service_call(S390CPU *cpu, uint8_t r1, uint8_t r3, uint64_t gaddr, - uint8_t ar, uintptr_t ra); -int mpcifc_service_call(S390CPU *cpu, uint8_t r1, uint64_t fiba, uint8_t ar, - uintptr_t ra); -int stpcifc_service_call(S390CPU *cpu, uint8_t r1, uint64_t fiba, uint8_t ar, - uintptr_t ra); -void fmb_timer_free(S390PCIBusDevice *pbdev); - -#define ZPCI_IO_BAR_MIN 0 -#define ZPCI_IO_BAR_MAX 5 -#define ZPCI_CONFIG_BAR 15 - -#endif diff --git a/hw/s390x/s390-pci-vfio.c b/hw/s390x/s390-pci-vfio.c new file mode 100644 index 0000000000..d5c78063b5 --- /dev/null +++ b/hw/s390x/s390-pci-vfio.c @@ -0,0 +1,276 @@ +/* + * s390 vfio-pci interfaces + * + * Copyright 2020 IBM Corp. + * Author(s): Matthew Rosato <mjrosato@linux.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or (at + * your option) any later version. See the COPYING file in the top-level + * directory. + */ + +#include <sys/ioctl.h> +#include <linux/vfio.h> +#include <linux/vfio_zdev.h> + +#include "qemu/osdep.h" +#include "trace.h" +#include "hw/s390x/s390-pci-bus.h" +#include "hw/s390x/s390-pci-clp.h" +#include "hw/s390x/s390-pci-vfio.h" +#include "hw/vfio/pci.h" +#include "hw/vfio/vfio-common.h" + +/* + * Get the current DMA available count from vfio. Returns true if vfio is + * limiting DMA requests, false otherwise. The current available count read + * from vfio is returned in avail. + */ +bool s390_pci_update_dma_avail(int fd, unsigned int *avail) +{ + g_autofree struct vfio_iommu_type1_info *info; + uint32_t argsz; + + assert(avail); + + argsz = sizeof(struct vfio_iommu_type1_info); + info = g_malloc0(argsz); + + /* + * If the specified argsz is not large enough to contain all capabilities + * it will be updated upon return from the ioctl. Retry until we have + * a big enough buffer to hold the entire capability chain. + */ +retry: + info->argsz = argsz; + + if (ioctl(fd, VFIO_IOMMU_GET_INFO, info)) { + return false; + } + + if (info->argsz > argsz) { + argsz = info->argsz; + info = g_realloc(info, argsz); + goto retry; + } + + /* If the capability exists, update with the current value */ + return vfio_get_info_dma_avail(info, avail); +} + +S390PCIDMACount *s390_pci_start_dma_count(S390pciState *s, + S390PCIBusDevice *pbdev) +{ + S390PCIDMACount *cnt; + uint32_t avail; + VFIOPCIDevice *vpdev = container_of(pbdev->pdev, VFIOPCIDevice, pdev); + int id; + + assert(vpdev); + + id = vpdev->vbasedev.group->container->fd; + + if (!s390_pci_update_dma_avail(id, &avail)) { + return NULL; + } + + QTAILQ_FOREACH(cnt, &s->zpci_dma_limit, link) { + if (cnt->id == id) { + cnt->users++; + return cnt; + } + } + + cnt = g_new0(S390PCIDMACount, 1); + cnt->id = id; + cnt->users = 1; + cnt->avail = avail; + QTAILQ_INSERT_TAIL(&s->zpci_dma_limit, cnt, link); + return cnt; +} + +void s390_pci_end_dma_count(S390pciState *s, S390PCIDMACount *cnt) +{ + assert(cnt); + + cnt->users--; + if (cnt->users == 0) { + QTAILQ_REMOVE(&s->zpci_dma_limit, cnt, link); + } +} + +static void s390_pci_read_base(S390PCIBusDevice *pbdev, + struct vfio_device_info *info) +{ + struct vfio_info_cap_header *hdr; + struct vfio_device_info_cap_zpci_base *cap; + VFIOPCIDevice *vpci = container_of(pbdev->pdev, VFIOPCIDevice, pdev); + + hdr = vfio_get_device_info_cap(info, VFIO_DEVICE_INFO_CAP_ZPCI_BASE); + + /* If capability not provided, just leave the defaults in place */ + if (hdr == NULL) { + trace_s390_pci_clp_cap(vpci->vbasedev.name, + VFIO_DEVICE_INFO_CAP_ZPCI_BASE); + return; + } + cap = (void *) hdr; + + pbdev->zpci_fn.sdma = cap->start_dma; + pbdev->zpci_fn.edma = cap->end_dma; + pbdev->zpci_fn.pchid = cap->pchid; + pbdev->zpci_fn.vfn = cap->vfn; + pbdev->zpci_fn.pfgid = cap->gid; + /* The following values remain 0 until we support other FMB formats */ + pbdev->zpci_fn.fmbl = 0; + pbdev->zpci_fn.pft = 0; +} + +static void s390_pci_read_group(S390PCIBusDevice *pbdev, + struct vfio_device_info *info) +{ + struct vfio_info_cap_header *hdr; + struct vfio_device_info_cap_zpci_group *cap; + ClpRspQueryPciGrp *resgrp; + VFIOPCIDevice *vpci = container_of(pbdev->pdev, VFIOPCIDevice, pdev); + + hdr = vfio_get_device_info_cap(info, VFIO_DEVICE_INFO_CAP_ZPCI_GROUP); + + /* If capability not provided, just use the default group */ + if (hdr == NULL) { + trace_s390_pci_clp_cap(vpci->vbasedev.name, + VFIO_DEVICE_INFO_CAP_ZPCI_GROUP); + pbdev->zpci_fn.pfgid = ZPCI_DEFAULT_FN_GRP; + pbdev->pci_group = s390_group_find(ZPCI_DEFAULT_FN_GRP); + return; + } + cap = (void *) hdr; + + /* See if the PCI group is already defined, create if not */ + pbdev->pci_group = s390_group_find(pbdev->zpci_fn.pfgid); + + if (!pbdev->pci_group) { + pbdev->pci_group = s390_group_create(pbdev->zpci_fn.pfgid); + + resgrp = &pbdev->pci_group->zpci_group; + if (cap->flags & VFIO_DEVICE_INFO_ZPCI_FLAG_REFRESH) { + resgrp->fr = 1; + } + stq_p(&resgrp->dasm, cap->dasm); + stq_p(&resgrp->msia, cap->msi_addr); + stw_p(&resgrp->mui, cap->mui); + stw_p(&resgrp->i, cap->noi); + stw_p(&resgrp->maxstbl, cap->maxstbl); + stb_p(&resgrp->version, cap->version); + } +} + +static void s390_pci_read_util(S390PCIBusDevice *pbdev, + struct vfio_device_info *info) +{ + struct vfio_info_cap_header *hdr; + struct vfio_device_info_cap_zpci_util *cap; + VFIOPCIDevice *vpci = container_of(pbdev->pdev, VFIOPCIDevice, pdev); + + hdr = vfio_get_device_info_cap(info, VFIO_DEVICE_INFO_CAP_ZPCI_UTIL); + + /* If capability not provided, just leave the defaults in place */ + if (hdr == NULL) { + trace_s390_pci_clp_cap(vpci->vbasedev.name, + VFIO_DEVICE_INFO_CAP_ZPCI_UTIL); + return; + } + cap = (void *) hdr; + + if (cap->size > CLP_UTIL_STR_LEN) { + trace_s390_pci_clp_cap_size(vpci->vbasedev.name, cap->size, + VFIO_DEVICE_INFO_CAP_ZPCI_UTIL); + return; + } + + pbdev->zpci_fn.flags |= CLP_RSP_QPCI_MASK_UTIL; + memcpy(pbdev->zpci_fn.util_str, cap->util_str, CLP_UTIL_STR_LEN); +} + +static void s390_pci_read_pfip(S390PCIBusDevice *pbdev, + struct vfio_device_info *info) +{ + struct vfio_info_cap_header *hdr; + struct vfio_device_info_cap_zpci_pfip *cap; + VFIOPCIDevice *vpci = container_of(pbdev->pdev, VFIOPCIDevice, pdev); + + hdr = vfio_get_device_info_cap(info, VFIO_DEVICE_INFO_CAP_ZPCI_PFIP); + + /* If capability not provided, just leave the defaults in place */ + if (hdr == NULL) { + trace_s390_pci_clp_cap(vpci->vbasedev.name, + VFIO_DEVICE_INFO_CAP_ZPCI_PFIP); + return; + } + cap = (void *) hdr; + + if (cap->size > CLP_PFIP_NR_SEGMENTS) { + trace_s390_pci_clp_cap_size(vpci->vbasedev.name, cap->size, + VFIO_DEVICE_INFO_CAP_ZPCI_PFIP); + return; + } + + memcpy(pbdev->zpci_fn.pfip, cap->pfip, CLP_PFIP_NR_SEGMENTS); +} + +/* + * This function will issue the VFIO_DEVICE_GET_INFO ioctl and look for + * capabilities that contain information about CLP features provided by the + * underlying host. + * On entry, defaults have already been placed into the guest CLP response + * buffers. On exit, defaults will have been overwritten for any CLP features + * found in the capability chain; defaults will remain for any CLP features not + * found in the chain. + */ +void s390_pci_get_clp_info(S390PCIBusDevice *pbdev) +{ + g_autofree struct vfio_device_info *info; + VFIOPCIDevice *vfio_pci; + uint32_t argsz; + int fd; + + argsz = sizeof(*info); + info = g_malloc0(argsz); + + vfio_pci = container_of(pbdev->pdev, VFIOPCIDevice, pdev); + fd = vfio_pci->vbasedev.fd; + + /* + * If the specified argsz is not large enough to contain all capabilities + * it will be updated upon return from the ioctl. Retry until we have + * a big enough buffer to hold the entire capability chain. On error, + * just exit and rely on CLP defaults. + */ +retry: + info->argsz = argsz; + + if (ioctl(fd, VFIO_DEVICE_GET_INFO, info)) { + trace_s390_pci_clp_dev_info(vfio_pci->vbasedev.name); + return; + } + + if (info->argsz > argsz) { + argsz = info->argsz; + info = g_realloc(info, argsz); + goto retry; + } + + /* + * Find the CLP features provided and fill in the guest CLP responses. + * Always call s390_pci_read_base first as information from this could + * determine which function group is used in s390_pci_read_group. + * For any feature not found, the default values will remain in the CLP + * response. + */ + s390_pci_read_base(pbdev, info); + s390_pci_read_group(pbdev, info); + s390_pci_read_util(pbdev, info); + s390_pci_read_pfip(pbdev, info); + + return; +} diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c index 2e900335ea..22222c4fd5 100644 --- a/hw/s390x/s390-virtio-ccw.c +++ b/hw/s390x/s390-virtio-ccw.c @@ -28,7 +28,7 @@ #include "qemu/error-report.h" #include "qemu/option.h" #include "qemu/qemu-print.h" -#include "s390-pci-bus.h" +#include "hw/s390x/s390-pci-bus.h" #include "sysemu/reset.h" #include "hw/s390x/storage-keys.h" #include "hw/s390x/storage-attributes.h" diff --git a/hw/s390x/trace-events b/hw/s390x/trace-events index 0dc5b818c4..8156693749 100644 --- a/hw/s390x/trace-events +++ b/hw/s390x/trace-events @@ -14,3 +14,8 @@ css_do_sic(uint16_t mode, uint8_t isc) "CSS: set interruption mode 0x%x on isc 0 virtio_ccw_interpret_ccw(int cssid, int ssid, int schid, int cmd_code) "VIRTIO-CCW: %x.%x.%04x: interpret command 0x%x" virtio_ccw_new_device(int cssid, int ssid, int schid, int devno, const char *devno_mode) "VIRTIO-CCW: add subchannel %x.%x.%04x, devno 0x%04x (%s)" virtio_ccw_set_ind(uint64_t ind_loc, uint8_t ind_old, uint8_t ind_new) "VIRTIO-CCW: indicator at %" PRIu64 ": 0x%x->0x%x" + +# s390-pci-vfio.c +s390_pci_clp_cap(const char *id, uint32_t cap) "PCI: %s: missing expected CLP capability %u" +s390_pci_clp_cap_size(const char *id, uint32_t size, uint32_t cap) "PCI: %s: bad size (%u) for CLP capability %u" +s390_pci_clp_dev_info(const char *id) "PCI: %s: cannot read vfio device info" diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 13471ae294..e18ea2cf91 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -29,6 +29,7 @@ #include "hw/vfio/vfio.h" #include "exec/address-spaces.h" #include "exec/memory.h" +#include "exec/ram_addr.h" #include "hw/hw.h" #include "qemu/error-report.h" #include "qemu/main-loop.h" @@ -37,6 +38,7 @@ #include "sysemu/reset.h" #include "trace.h" #include "qapi/error.h" +#include "migration/migration.h" VFIOGroupList vfio_group_list = QLIST_HEAD_INITIALIZER(vfio_group_list); @@ -203,7 +205,7 @@ void vfio_region_write(void *opaque, hwaddr addr, buf.qword = cpu_to_le64(data); break; default: - hw_error("vfio: unsupported write size, %d bytes", size); + hw_error("vfio: unsupported write size, %u bytes", size); break; } @@ -260,7 +262,7 @@ uint64_t vfio_region_read(void *opaque, data = le64_to_cpu(buf.qword); break; default: - hw_error("vfio: unsupported read size, %d bytes", size); + hw_error("vfio: unsupported read size, %u bytes", size); break; } @@ -287,10 +289,146 @@ const MemoryRegionOps vfio_region_ops = { }; /* + * Device state interfaces + */ + +bool vfio_mig_active(void) +{ + VFIOGroup *group; + VFIODevice *vbasedev; + + if (QLIST_EMPTY(&vfio_group_list)) { + return false; + } + + QLIST_FOREACH(group, &vfio_group_list, next) { + QLIST_FOREACH(vbasedev, &group->device_list, next) { + if (vbasedev->migration_blocker) { + return false; + } + } + } + return true; +} + +static bool vfio_devices_all_stopped_and_saving(VFIOContainer *container) +{ + VFIOGroup *group; + VFIODevice *vbasedev; + MigrationState *ms = migrate_get_current(); + + if (!migration_is_setup_or_active(ms->state)) { + return false; + } + + QLIST_FOREACH(group, &container->group_list, container_next) { + QLIST_FOREACH(vbasedev, &group->device_list, next) { + VFIOMigration *migration = vbasedev->migration; + + if (!migration) { + return false; + } + + if ((migration->device_state & VFIO_DEVICE_STATE_SAVING) && + !(migration->device_state & VFIO_DEVICE_STATE_RUNNING)) { + continue; + } else { + return false; + } + } + } + return true; +} + +static bool vfio_devices_all_running_and_saving(VFIOContainer *container) +{ + VFIOGroup *group; + VFIODevice *vbasedev; + MigrationState *ms = migrate_get_current(); + + if (!migration_is_setup_or_active(ms->state)) { + return false; + } + + QLIST_FOREACH(group, &container->group_list, container_next) { + QLIST_FOREACH(vbasedev, &group->device_list, next) { + VFIOMigration *migration = vbasedev->migration; + + if (!migration) { + return false; + } + + if ((migration->device_state & VFIO_DEVICE_STATE_SAVING) && + (migration->device_state & VFIO_DEVICE_STATE_RUNNING)) { + continue; + } else { + return false; + } + } + } + return true; +} + +static int vfio_dma_unmap_bitmap(VFIOContainer *container, + hwaddr iova, ram_addr_t size, + IOMMUTLBEntry *iotlb) +{ + struct vfio_iommu_type1_dma_unmap *unmap; + struct vfio_bitmap *bitmap; + uint64_t pages = TARGET_PAGE_ALIGN(size) >> TARGET_PAGE_BITS; + int ret; + + unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap)); + + unmap->argsz = sizeof(*unmap) + sizeof(*bitmap); + unmap->iova = iova; + unmap->size = size; + unmap->flags |= VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP; + bitmap = (struct vfio_bitmap *)&unmap->data; + + /* + * cpu_physical_memory_set_dirty_lebitmap() expects pages in bitmap of + * TARGET_PAGE_SIZE to mark those dirty. Hence set bitmap_pgsize to + * TARGET_PAGE_SIZE. + */ + + bitmap->pgsize = TARGET_PAGE_SIZE; + bitmap->size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) / + BITS_PER_BYTE; + + if (bitmap->size > container->max_dirty_bitmap_size) { + error_report("UNMAP: Size of bitmap too big 0x%"PRIx64, + (uint64_t)bitmap->size); + ret = -E2BIG; + goto unmap_exit; + } + + bitmap->data = g_try_malloc0(bitmap->size); + if (!bitmap->data) { + ret = -ENOMEM; + goto unmap_exit; + } + + ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap); + if (!ret) { + cpu_physical_memory_set_dirty_lebitmap((unsigned long *)bitmap->data, + iotlb->translated_addr, pages); + } else { + error_report("VFIO_UNMAP_DMA with DIRTY_BITMAP : %m"); + } + + g_free(bitmap->data); +unmap_exit: + g_free(unmap); + return ret; +} + +/* * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86 */ static int vfio_dma_unmap(VFIOContainer *container, - hwaddr iova, ram_addr_t size) + hwaddr iova, ram_addr_t size, + IOMMUTLBEntry *iotlb) { struct vfio_iommu_type1_dma_unmap unmap = { .argsz = sizeof(unmap), @@ -299,6 +437,11 @@ static int vfio_dma_unmap(VFIOContainer *container, .size = size, }; + if (iotlb && container->dirty_pages_supported && + vfio_devices_all_running_and_saving(container)) { + return vfio_dma_unmap_bitmap(container, iova, size, iotlb); + } + while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) { /* * The type1 backend has an off-by-one bug in the kernel (71a7d3d78e3c @@ -346,7 +489,7 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova, * the VGA ROM space. */ if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 || - (errno == EBUSY && vfio_dma_unmap(container, iova, size) == 0 && + (errno == EBUSY && vfio_dma_unmap(container, iova, size, NULL) == 0 && ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) { return 0; } @@ -407,8 +550,8 @@ static bool vfio_listener_skipped_section(MemoryRegionSection *section) } /* Called with rcu_read_lock held. */ -static bool vfio_get_vaddr(IOMMUTLBEntry *iotlb, void **vaddr, - bool *read_only) +static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, + ram_addr_t *ram_addr, bool *read_only) { MemoryRegion *mr; hwaddr xlat; @@ -439,8 +582,17 @@ static bool vfio_get_vaddr(IOMMUTLBEntry *iotlb, void **vaddr, return false; } - *vaddr = memory_region_get_ram_ptr(mr) + xlat; - *read_only = !writable || mr->readonly; + if (vaddr) { + *vaddr = memory_region_get_ram_ptr(mr) + xlat; + } + + if (ram_addr) { + *ram_addr = memory_region_get_ram_addr(mr) + xlat; + } + + if (read_only) { + *read_only = !writable || mr->readonly; + } return true; } @@ -450,7 +602,6 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); VFIOContainer *container = giommu->container; hwaddr iova = iotlb->iova + giommu->iommu_offset; - bool read_only; void *vaddr; int ret; @@ -466,7 +617,9 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) rcu_read_lock(); if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) { - if (!vfio_get_vaddr(iotlb, &vaddr, &read_only)) { + bool read_only; + + if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only)) { goto out; } /* @@ -486,7 +639,7 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) iotlb->addr_mask + 1, vaddr, ret); } } else { - ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1); + ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1, iotlb); if (ret) { error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx") = %d (%m)", @@ -789,7 +942,7 @@ static void vfio_listener_region_del(MemoryListener *listener, } if (try_unmap) { - ret = vfio_dma_unmap(container, iova, int128_get64(llsize)); + ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL); if (ret) { error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx") = %d (%m)", @@ -812,9 +965,156 @@ static void vfio_listener_region_del(MemoryListener *listener, } } +static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, + uint64_t size, ram_addr_t ram_addr) +{ + struct vfio_iommu_type1_dirty_bitmap *dbitmap; + struct vfio_iommu_type1_dirty_bitmap_get *range; + uint64_t pages; + int ret; + + dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range)); + + dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range); + dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP; + range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data; + range->iova = iova; + range->size = size; + + /* + * cpu_physical_memory_set_dirty_lebitmap() expects pages in bitmap of + * TARGET_PAGE_SIZE to mark those dirty. Hence set bitmap's pgsize to + * TARGET_PAGE_SIZE. + */ + range->bitmap.pgsize = TARGET_PAGE_SIZE; + + pages = TARGET_PAGE_ALIGN(range->size) >> TARGET_PAGE_BITS; + range->bitmap.size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) / + BITS_PER_BYTE; + range->bitmap.data = g_try_malloc0(range->bitmap.size); + if (!range->bitmap.data) { + ret = -ENOMEM; + goto err_out; + } + + ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap); + if (ret) { + error_report("Failed to get dirty bitmap for iova: 0x%"PRIx64 + " size: 0x%"PRIx64" err: %d", (uint64_t)range->iova, + (uint64_t)range->size, errno); + goto err_out; + } + + cpu_physical_memory_set_dirty_lebitmap((unsigned long *)range->bitmap.data, + ram_addr, pages); + + trace_vfio_get_dirty_bitmap(container->fd, range->iova, range->size, + range->bitmap.size, ram_addr); +err_out: + g_free(range->bitmap.data); + g_free(dbitmap); + + return ret; +} + +typedef struct { + IOMMUNotifier n; + VFIOGuestIOMMU *giommu; +} vfio_giommu_dirty_notifier; + +static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) +{ + vfio_giommu_dirty_notifier *gdn = container_of(n, + vfio_giommu_dirty_notifier, n); + VFIOGuestIOMMU *giommu = gdn->giommu; + VFIOContainer *container = giommu->container; + hwaddr iova = iotlb->iova + giommu->iommu_offset; + ram_addr_t translated_addr; + + trace_vfio_iommu_map_dirty_notify(iova, iova + iotlb->addr_mask); + + if (iotlb->target_as != &address_space_memory) { + error_report("Wrong target AS \"%s\", only system memory is allowed", + iotlb->target_as->name ? iotlb->target_as->name : "none"); + return; + } + + rcu_read_lock(); + if (vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL)) { + int ret; + + ret = vfio_get_dirty_bitmap(container, iova, iotlb->addr_mask + 1, + translated_addr); + if (ret) { + error_report("vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", " + "0x%"HWADDR_PRIx") = %d (%m)", + container, iova, + iotlb->addr_mask + 1, ret); + } + } + rcu_read_unlock(); +} + +static int vfio_sync_dirty_bitmap(VFIOContainer *container, + MemoryRegionSection *section) +{ + ram_addr_t ram_addr; + + if (memory_region_is_iommu(section->mr)) { + VFIOGuestIOMMU *giommu; + + QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) { + if (MEMORY_REGION(giommu->iommu) == section->mr && + giommu->n.start == section->offset_within_region) { + Int128 llend; + vfio_giommu_dirty_notifier gdn = { .giommu = giommu }; + int idx = memory_region_iommu_attrs_to_index(giommu->iommu, + MEMTXATTRS_UNSPECIFIED); + + llend = int128_add(int128_make64(section->offset_within_region), + section->size); + llend = int128_sub(llend, int128_one()); + + iommu_notifier_init(&gdn.n, + vfio_iommu_map_dirty_notify, + IOMMU_NOTIFIER_MAP, + section->offset_within_region, + int128_get64(llend), + idx); + memory_region_iommu_replay(giommu->iommu, &gdn.n); + break; + } + } + return 0; + } + + ram_addr = memory_region_get_ram_addr(section->mr) + + section->offset_within_region; + + return vfio_get_dirty_bitmap(container, + TARGET_PAGE_ALIGN(section->offset_within_address_space), + int128_get64(section->size), ram_addr); +} + +static void vfio_listerner_log_sync(MemoryListener *listener, + MemoryRegionSection *section) +{ + VFIOContainer *container = container_of(listener, VFIOContainer, listener); + + if (vfio_listener_skipped_section(section) || + !container->dirty_pages_supported) { + return; + } + + if (vfio_devices_all_stopped_and_saving(container)) { + vfio_sync_dirty_bitmap(container, section); + } +} + static const MemoryListener vfio_memory_listener = { .region_add = vfio_listener_region_add, .region_del = vfio_listener_region_del, + .log_sync = vfio_listerner_log_sync, }; static void vfio_listener_release(VFIOContainer *container) @@ -825,17 +1125,12 @@ static void vfio_listener_release(VFIOContainer *container) } } -struct vfio_info_cap_header * -vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id) +static struct vfio_info_cap_header * +vfio_get_cap(void *ptr, uint32_t cap_offset, uint16_t id) { struct vfio_info_cap_header *hdr; - void *ptr = info; - if (!(info->flags & VFIO_REGION_INFO_FLAG_CAPS)) { - return NULL; - } - - for (hdr = ptr + info->cap_offset; hdr != ptr; hdr = ptr + hdr->next) { + for (hdr = ptr + cap_offset; hdr != ptr; hdr = ptr + hdr->next) { if (hdr->id == id) { return hdr; } @@ -844,6 +1139,57 @@ vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id) return NULL; } +struct vfio_info_cap_header * +vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id) +{ + if (!(info->flags & VFIO_REGION_INFO_FLAG_CAPS)) { + return NULL; + } + + return vfio_get_cap((void *)info, info->cap_offset, id); +} + +static struct vfio_info_cap_header * +vfio_get_iommu_type1_info_cap(struct vfio_iommu_type1_info *info, uint16_t id) +{ + if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) { + return NULL; + } + + return vfio_get_cap((void *)info, info->cap_offset, id); +} + +struct vfio_info_cap_header * +vfio_get_device_info_cap(struct vfio_device_info *info, uint16_t id) +{ + if (!(info->flags & VFIO_DEVICE_FLAGS_CAPS)) { + return NULL; + } + + return vfio_get_cap((void *)info, info->cap_offset, id); +} + +bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info, + unsigned int *avail) +{ + struct vfio_info_cap_header *hdr; + struct vfio_iommu_type1_info_dma_avail *cap; + + /* If the capability cannot be found, assume no DMA limiting */ + hdr = vfio_get_iommu_type1_info_cap(info, + VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL); + if (hdr == NULL) { + return false; + } + + if (avail != NULL) { + cap = (void *) hdr; + *avail = cap->avail; + } + + return true; +} + static int vfio_setup_region_sparse_mmaps(VFIORegion *region, struct vfio_region_info *info) { @@ -924,6 +1270,18 @@ int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region, return 0; } +static void vfio_subregion_unmap(VFIORegion *region, int index) +{ + trace_vfio_region_unmap(memory_region_name(®ion->mmaps[index].mem), + region->mmaps[index].offset, + region->mmaps[index].offset + + region->mmaps[index].size - 1); + memory_region_del_subregion(region->mem, ®ion->mmaps[index].mem); + munmap(region->mmaps[index].mmap, region->mmaps[index].size); + object_unparent(OBJECT(®ion->mmaps[index].mem)); + region->mmaps[index].mmap = NULL; +} + int vfio_region_mmap(VFIORegion *region) { int i, prot = 0; @@ -954,10 +1312,7 @@ int vfio_region_mmap(VFIORegion *region) region->mmaps[i].mmap = NULL; for (i--; i >= 0; i--) { - memory_region_del_subregion(region->mem, ®ion->mmaps[i].mem); - munmap(region->mmaps[i].mmap, region->mmaps[i].size); - object_unparent(OBJECT(®ion->mmaps[i].mem)); - region->mmaps[i].mmap = NULL; + vfio_subregion_unmap(region, i); } return ret; @@ -982,6 +1337,21 @@ int vfio_region_mmap(VFIORegion *region) return 0; } +void vfio_region_unmap(VFIORegion *region) +{ + int i; + + if (!region->mem) { + return; + } + + for (i = 0; i < region->nr_mmaps; i++) { + if (region->mmaps[i].mmap) { + vfio_subregion_unmap(region, i); + } + } +} + void vfio_region_exit(VFIORegion *region) { int i; @@ -1204,6 +1574,75 @@ static int vfio_init_container(VFIOContainer *container, int group_fd, return 0; } +static int vfio_get_iommu_info(VFIOContainer *container, + struct vfio_iommu_type1_info **info) +{ + + size_t argsz = sizeof(struct vfio_iommu_type1_info); + + *info = g_new0(struct vfio_iommu_type1_info, 1); +again: + (*info)->argsz = argsz; + + if (ioctl(container->fd, VFIO_IOMMU_GET_INFO, *info)) { + g_free(*info); + *info = NULL; + return -errno; + } + + if (((*info)->argsz > argsz)) { + argsz = (*info)->argsz; + *info = g_realloc(*info, argsz); + goto again; + } + + return 0; +} + +static struct vfio_info_cap_header * +vfio_get_iommu_info_cap(struct vfio_iommu_type1_info *info, uint16_t id) +{ + struct vfio_info_cap_header *hdr; + void *ptr = info; + + if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) { + return NULL; + } + + for (hdr = ptr + info->cap_offset; hdr != ptr; hdr = ptr + hdr->next) { + if (hdr->id == id) { + return hdr; + } + } + + return NULL; +} + +static void vfio_get_iommu_info_migration(VFIOContainer *container, + struct vfio_iommu_type1_info *info) +{ + struct vfio_info_cap_header *hdr; + struct vfio_iommu_type1_info_cap_migration *cap_mig; + + hdr = vfio_get_iommu_info_cap(info, VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION); + if (!hdr) { + return; + } + + cap_mig = container_of(hdr, struct vfio_iommu_type1_info_cap_migration, + header); + + /* + * cpu_physical_memory_set_dirty_lebitmap() expects pages in bitmap of + * TARGET_PAGE_SIZE to mark those dirty. + */ + if (cap_mig->pgsize_bitmap & TARGET_PAGE_SIZE) { + container->dirty_pages_supported = true; + container->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size; + container->dirty_pgsizes = cap_mig->pgsize_bitmap; + } +} + static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, Error **errp) { @@ -1273,6 +1712,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, container->space = space; container->fd = fd; container->error = NULL; + container->dirty_pages_supported = false; QLIST_INIT(&container->giommu_list); QLIST_INIT(&container->hostwin_list); @@ -1285,7 +1725,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, case VFIO_TYPE1v2_IOMMU: case VFIO_TYPE1_IOMMU: { - struct vfio_iommu_type1_info info; + struct vfio_iommu_type1_info *info; /* * FIXME: This assumes that a Type1 IOMMU can map any 64-bit @@ -1294,15 +1734,19 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, * existing Type1 IOMMUs generally support any IOVA we're * going to actually try in practice. */ - info.argsz = sizeof(info); - ret = ioctl(fd, VFIO_IOMMU_GET_INFO, &info); - /* Ignore errors */ - if (ret || !(info.flags & VFIO_IOMMU_INFO_PGSIZES)) { + ret = vfio_get_iommu_info(container, &info); + + if (ret || !(info->flags & VFIO_IOMMU_INFO_PGSIZES)) { /* Assume 4k IOVA page size */ - info.iova_pgsizes = 4096; + info->iova_pgsizes = 4096; + } + vfio_host_win_add(container, 0, (hwaddr)-1, info->iova_pgsizes); + container->pgsizes = info->iova_pgsizes; + + if (!ret) { + vfio_get_iommu_info_migration(container, info); } - vfio_host_win_add(container, 0, (hwaddr)-1, info.iova_pgsizes); - container->pgsizes = info.iova_pgsizes; + g_free(info); break; } case VFIO_SPAPR_TCE_v2_IOMMU: diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build index 37efa74018..da9af297a0 100644 --- a/hw/vfio/meson.build +++ b/hw/vfio/meson.build @@ -2,6 +2,7 @@ vfio_ss = ss.source_set() vfio_ss.add(files( 'common.c', 'spapr.c', + 'migration.c', )) vfio_ss.add(when: 'CONFIG_VFIO_PCI', if_true: files( 'display.c', diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c new file mode 100644 index 0000000000..3ce285ea39 --- /dev/null +++ b/hw/vfio/migration.c @@ -0,0 +1,933 @@ +/* + * Migration support for VFIO devices + * + * Copyright NVIDIA, Inc. 2020 + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qemu/main-loop.h" +#include "qemu/cutils.h" +#include <linux/vfio.h> +#include <sys/ioctl.h> + +#include "sysemu/runstate.h" +#include "hw/vfio/vfio-common.h" +#include "cpu.h" +#include "migration/migration.h" +#include "migration/vmstate.h" +#include "migration/qemu-file.h" +#include "migration/register.h" +#include "migration/blocker.h" +#include "migration/misc.h" +#include "qapi/error.h" +#include "exec/ramlist.h" +#include "exec/ram_addr.h" +#include "pci.h" +#include "trace.h" +#include "hw/hw.h" + +/* + * Flags to be used as unique delimiters for VFIO devices in the migration + * stream. These flags are composed as: + * 0xffffffff => MSB 32-bit all 1s + * 0xef10 => Magic ID, represents emulated (virtual) function IO + * 0x0000 => 16-bits reserved for flags + * + * The beginning of state information is marked by _DEV_CONFIG_STATE, + * _DEV_SETUP_STATE, or _DEV_DATA_STATE, respectively. The end of a + * certain state information is marked by _END_OF_STATE. + */ +#define VFIO_MIG_FLAG_END_OF_STATE (0xffffffffef100001ULL) +#define VFIO_MIG_FLAG_DEV_CONFIG_STATE (0xffffffffef100002ULL) +#define VFIO_MIG_FLAG_DEV_SETUP_STATE (0xffffffffef100003ULL) +#define VFIO_MIG_FLAG_DEV_DATA_STATE (0xffffffffef100004ULL) + +static int64_t bytes_transferred; + +static inline int vfio_mig_access(VFIODevice *vbasedev, void *val, int count, + off_t off, bool iswrite) +{ + int ret; + + ret = iswrite ? pwrite(vbasedev->fd, val, count, off) : + pread(vbasedev->fd, val, count, off); + if (ret < count) { + error_report("vfio_mig_%s %d byte %s: failed at offset 0x%" + HWADDR_PRIx", err: %s", iswrite ? "write" : "read", count, + vbasedev->name, off, strerror(errno)); + return (ret < 0) ? ret : -EINVAL; + } + return 0; +} + +static int vfio_mig_rw(VFIODevice *vbasedev, __u8 *buf, size_t count, + off_t off, bool iswrite) +{ + int ret, done = 0; + __u8 *tbuf = buf; + + while (count) { + int bytes = 0; + + if (count >= 8 && !(off % 8)) { + bytes = 8; + } else if (count >= 4 && !(off % 4)) { + bytes = 4; + } else if (count >= 2 && !(off % 2)) { + bytes = 2; + } else { + bytes = 1; + } + + ret = vfio_mig_access(vbasedev, tbuf, bytes, off, iswrite); + if (ret) { + return ret; + } + + count -= bytes; + done += bytes; + off += bytes; + tbuf += bytes; + } + return done; +} + +#define vfio_mig_read(f, v, c, o) vfio_mig_rw(f, (__u8 *)v, c, o, false) +#define vfio_mig_write(f, v, c, o) vfio_mig_rw(f, (__u8 *)v, c, o, true) + +#define VFIO_MIG_STRUCT_OFFSET(f) \ + offsetof(struct vfio_device_migration_info, f) +/* + * Change the device_state register for device @vbasedev. Bits set in @mask + * are preserved, bits set in @value are set, and bits not set in either @mask + * or @value are cleared in device_state. If the register cannot be accessed, + * the resulting state would be invalid, or the device enters an error state, + * an error is returned. + */ + +static int vfio_migration_set_state(VFIODevice *vbasedev, uint32_t mask, + uint32_t value) +{ + VFIOMigration *migration = vbasedev->migration; + VFIORegion *region = &migration->region; + off_t dev_state_off = region->fd_offset + + VFIO_MIG_STRUCT_OFFSET(device_state); + uint32_t device_state; + int ret; + + ret = vfio_mig_read(vbasedev, &device_state, sizeof(device_state), + dev_state_off); + if (ret < 0) { + return ret; + } + + device_state = (device_state & mask) | value; + + if (!VFIO_DEVICE_STATE_VALID(device_state)) { + return -EINVAL; + } + + ret = vfio_mig_write(vbasedev, &device_state, sizeof(device_state), + dev_state_off); + if (ret < 0) { + int rret; + + rret = vfio_mig_read(vbasedev, &device_state, sizeof(device_state), + dev_state_off); + + if ((rret < 0) || (VFIO_DEVICE_STATE_IS_ERROR(device_state))) { + hw_error("%s: Device in error state 0x%x", vbasedev->name, + device_state); + return rret ? rret : -EIO; + } + return ret; + } + + migration->device_state = device_state; + trace_vfio_migration_set_state(vbasedev->name, device_state); + return 0; +} + +static void *get_data_section_size(VFIORegion *region, uint64_t data_offset, + uint64_t data_size, uint64_t *size) +{ + void *ptr = NULL; + uint64_t limit = 0; + int i; + + if (!region->mmaps) { + if (size) { + *size = MIN(data_size, region->size - data_offset); + } + return ptr; + } + + for (i = 0; i < region->nr_mmaps; i++) { + VFIOMmap *map = region->mmaps + i; + + if ((data_offset >= map->offset) && + (data_offset < map->offset + map->size)) { + + /* check if data_offset is within sparse mmap areas */ + ptr = map->mmap + data_offset - map->offset; + if (size) { + *size = MIN(data_size, map->offset + map->size - data_offset); + } + break; + } else if ((data_offset < map->offset) && + (!limit || limit > map->offset)) { + /* + * data_offset is not within sparse mmap areas, find size of + * non-mapped area. Check through all list since region->mmaps list + * is not sorted. + */ + limit = map->offset; + } + } + + if (!ptr && size) { + *size = limit ? MIN(data_size, limit - data_offset) : data_size; + } + return ptr; +} + +static int vfio_save_buffer(QEMUFile *f, VFIODevice *vbasedev, uint64_t *size) +{ + VFIOMigration *migration = vbasedev->migration; + VFIORegion *region = &migration->region; + uint64_t data_offset = 0, data_size = 0, sz; + int ret; + + ret = vfio_mig_read(vbasedev, &data_offset, sizeof(data_offset), + region->fd_offset + VFIO_MIG_STRUCT_OFFSET(data_offset)); + if (ret < 0) { + return ret; + } + + ret = vfio_mig_read(vbasedev, &data_size, sizeof(data_size), + region->fd_offset + VFIO_MIG_STRUCT_OFFSET(data_size)); + if (ret < 0) { + return ret; + } + + trace_vfio_save_buffer(vbasedev->name, data_offset, data_size, + migration->pending_bytes); + + qemu_put_be64(f, data_size); + sz = data_size; + + while (sz) { + void *buf; + uint64_t sec_size; + bool buf_allocated = false; + + buf = get_data_section_size(region, data_offset, sz, &sec_size); + + if (!buf) { + buf = g_try_malloc(sec_size); + if (!buf) { + error_report("%s: Error allocating buffer ", __func__); + return -ENOMEM; + } + buf_allocated = true; + + ret = vfio_mig_read(vbasedev, buf, sec_size, + region->fd_offset + data_offset); + if (ret < 0) { + g_free(buf); + return ret; + } + } + + qemu_put_buffer(f, buf, sec_size); + + if (buf_allocated) { + g_free(buf); + } + sz -= sec_size; + data_offset += sec_size; + } + + ret = qemu_file_get_error(f); + + if (!ret && size) { + *size = data_size; + } + + bytes_transferred += data_size; + return ret; +} + +static int vfio_load_buffer(QEMUFile *f, VFIODevice *vbasedev, + uint64_t data_size) +{ + VFIORegion *region = &vbasedev->migration->region; + uint64_t data_offset = 0, size, report_size; + int ret; + + do { + ret = vfio_mig_read(vbasedev, &data_offset, sizeof(data_offset), + region->fd_offset + VFIO_MIG_STRUCT_OFFSET(data_offset)); + if (ret < 0) { + return ret; + } + + if (data_offset + data_size > region->size) { + /* + * If data_size is greater than the data section of migration region + * then iterate the write buffer operation. This case can occur if + * size of migration region at destination is smaller than size of + * migration region at source. + */ + report_size = size = region->size - data_offset; + data_size -= size; + } else { + report_size = size = data_size; + data_size = 0; + } + + trace_vfio_load_state_device_data(vbasedev->name, data_offset, size); + + while (size) { + void *buf; + uint64_t sec_size; + bool buf_alloc = false; + + buf = get_data_section_size(region, data_offset, size, &sec_size); + + if (!buf) { + buf = g_try_malloc(sec_size); + if (!buf) { + error_report("%s: Error allocating buffer ", __func__); + return -ENOMEM; + } + buf_alloc = true; + } + + qemu_get_buffer(f, buf, sec_size); + + if (buf_alloc) { + ret = vfio_mig_write(vbasedev, buf, sec_size, + region->fd_offset + data_offset); + g_free(buf); + + if (ret < 0) { + return ret; + } + } + size -= sec_size; + data_offset += sec_size; + } + + ret = vfio_mig_write(vbasedev, &report_size, sizeof(report_size), + region->fd_offset + VFIO_MIG_STRUCT_OFFSET(data_size)); + if (ret < 0) { + return ret; + } + } while (data_size); + + return 0; +} + +static int vfio_update_pending(VFIODevice *vbasedev) +{ + VFIOMigration *migration = vbasedev->migration; + VFIORegion *region = &migration->region; + uint64_t pending_bytes = 0; + int ret; + + ret = vfio_mig_read(vbasedev, &pending_bytes, sizeof(pending_bytes), + region->fd_offset + VFIO_MIG_STRUCT_OFFSET(pending_bytes)); + if (ret < 0) { + migration->pending_bytes = 0; + return ret; + } + + migration->pending_bytes = pending_bytes; + trace_vfio_update_pending(vbasedev->name, pending_bytes); + return 0; +} + +static int vfio_save_device_config_state(QEMUFile *f, void *opaque) +{ + VFIODevice *vbasedev = opaque; + + qemu_put_be64(f, VFIO_MIG_FLAG_DEV_CONFIG_STATE); + + if (vbasedev->ops && vbasedev->ops->vfio_save_config) { + vbasedev->ops->vfio_save_config(vbasedev, f); + } + + qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); + + trace_vfio_save_device_config_state(vbasedev->name); + + return qemu_file_get_error(f); +} + +static int vfio_load_device_config_state(QEMUFile *f, void *opaque) +{ + VFIODevice *vbasedev = opaque; + uint64_t data; + + if (vbasedev->ops && vbasedev->ops->vfio_load_config) { + int ret; + + ret = vbasedev->ops->vfio_load_config(vbasedev, f); + if (ret) { + error_report("%s: Failed to load device config space", + vbasedev->name); + return ret; + } + } + + data = qemu_get_be64(f); + if (data != VFIO_MIG_FLAG_END_OF_STATE) { + error_report("%s: Failed loading device config space, " + "end flag incorrect 0x%"PRIx64, vbasedev->name, data); + return -EINVAL; + } + + trace_vfio_load_device_config_state(vbasedev->name); + return qemu_file_get_error(f); +} + +static int vfio_set_dirty_page_tracking(VFIODevice *vbasedev, bool start) +{ + int ret; + VFIOMigration *migration = vbasedev->migration; + VFIOContainer *container = vbasedev->group->container; + struct vfio_iommu_type1_dirty_bitmap dirty = { + .argsz = sizeof(dirty), + }; + + if (start) { + if (migration->device_state & VFIO_DEVICE_STATE_SAVING) { + dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START; + } else { + return -EINVAL; + } + } else { + dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP; + } + + ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, &dirty); + if (ret) { + error_report("Failed to set dirty tracking flag 0x%x errno: %d", + dirty.flags, errno); + return -errno; + } + return ret; +} + +static void vfio_migration_cleanup(VFIODevice *vbasedev) +{ + VFIOMigration *migration = vbasedev->migration; + + vfio_set_dirty_page_tracking(vbasedev, false); + + if (migration->region.mmaps) { + vfio_region_unmap(&migration->region); + } +} + +/* ---------------------------------------------------------------------- */ + +static int vfio_save_setup(QEMUFile *f, void *opaque) +{ + VFIODevice *vbasedev = opaque; + VFIOMigration *migration = vbasedev->migration; + int ret; + + trace_vfio_save_setup(vbasedev->name); + + qemu_put_be64(f, VFIO_MIG_FLAG_DEV_SETUP_STATE); + + if (migration->region.mmaps) { + /* + * Calling vfio_region_mmap() from migration thread. Memory API called + * from this function require locking the iothread when called from + * outside the main loop thread. + */ + qemu_mutex_lock_iothread(); + ret = vfio_region_mmap(&migration->region); + qemu_mutex_unlock_iothread(); + if (ret) { + error_report("%s: Failed to mmap VFIO migration region: %s", + vbasedev->name, strerror(-ret)); + error_report("%s: Falling back to slow path", vbasedev->name); + } + } + + ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_MASK, + VFIO_DEVICE_STATE_SAVING); + if (ret) { + error_report("%s: Failed to set state SAVING", vbasedev->name); + return ret; + } + + ret = vfio_set_dirty_page_tracking(vbasedev, true); + if (ret) { + return ret; + } + + qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); + + ret = qemu_file_get_error(f); + if (ret) { + return ret; + } + + return 0; +} + +static void vfio_save_cleanup(void *opaque) +{ + VFIODevice *vbasedev = opaque; + + vfio_migration_cleanup(vbasedev); + trace_vfio_save_cleanup(vbasedev->name); +} + +static void vfio_save_pending(QEMUFile *f, void *opaque, + uint64_t threshold_size, + uint64_t *res_precopy_only, + uint64_t *res_compatible, + uint64_t *res_postcopy_only) +{ + VFIODevice *vbasedev = opaque; + VFIOMigration *migration = vbasedev->migration; + int ret; + + ret = vfio_update_pending(vbasedev); + if (ret) { + return; + } + + *res_precopy_only += migration->pending_bytes; + + trace_vfio_save_pending(vbasedev->name, *res_precopy_only, + *res_postcopy_only, *res_compatible); +} + +static int vfio_save_iterate(QEMUFile *f, void *opaque) +{ + VFIODevice *vbasedev = opaque; + VFIOMigration *migration = vbasedev->migration; + uint64_t data_size; + int ret; + + qemu_put_be64(f, VFIO_MIG_FLAG_DEV_DATA_STATE); + + if (migration->pending_bytes == 0) { + ret = vfio_update_pending(vbasedev); + if (ret) { + return ret; + } + + if (migration->pending_bytes == 0) { + qemu_put_be64(f, 0); + qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); + /* indicates data finished, goto complete phase */ + return 1; + } + } + + ret = vfio_save_buffer(f, vbasedev, &data_size); + if (ret) { + error_report("%s: vfio_save_buffer failed %s", vbasedev->name, + strerror(errno)); + return ret; + } + + qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); + + ret = qemu_file_get_error(f); + if (ret) { + return ret; + } + + /* + * Reset pending_bytes as .save_live_pending is not called during savevm or + * snapshot case, in such case vfio_update_pending() at the start of this + * function updates pending_bytes. + */ + migration->pending_bytes = 0; + trace_vfio_save_iterate(vbasedev->name, data_size); + return 0; +} + +static int vfio_save_complete_precopy(QEMUFile *f, void *opaque) +{ + VFIODevice *vbasedev = opaque; + VFIOMigration *migration = vbasedev->migration; + uint64_t data_size; + int ret; + + ret = vfio_migration_set_state(vbasedev, ~VFIO_DEVICE_STATE_RUNNING, + VFIO_DEVICE_STATE_SAVING); + if (ret) { + error_report("%s: Failed to set state STOP and SAVING", + vbasedev->name); + return ret; + } + + ret = vfio_save_device_config_state(f, opaque); + if (ret) { + return ret; + } + + ret = vfio_update_pending(vbasedev); + if (ret) { + return ret; + } + + while (migration->pending_bytes > 0) { + qemu_put_be64(f, VFIO_MIG_FLAG_DEV_DATA_STATE); + ret = vfio_save_buffer(f, vbasedev, &data_size); + if (ret < 0) { + error_report("%s: Failed to save buffer", vbasedev->name); + return ret; + } + + if (data_size == 0) { + break; + } + + ret = vfio_update_pending(vbasedev); + if (ret) { + return ret; + } + } + + qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); + + ret = qemu_file_get_error(f); + if (ret) { + return ret; + } + + ret = vfio_migration_set_state(vbasedev, ~VFIO_DEVICE_STATE_SAVING, 0); + if (ret) { + error_report("%s: Failed to set state STOPPED", vbasedev->name); + return ret; + } + + trace_vfio_save_complete_precopy(vbasedev->name); + return ret; +} + +static int vfio_load_setup(QEMUFile *f, void *opaque) +{ + VFIODevice *vbasedev = opaque; + VFIOMigration *migration = vbasedev->migration; + int ret = 0; + + if (migration->region.mmaps) { + ret = vfio_region_mmap(&migration->region); + if (ret) { + error_report("%s: Failed to mmap VFIO migration region %d: %s", + vbasedev->name, migration->region.nr, + strerror(-ret)); + error_report("%s: Falling back to slow path", vbasedev->name); + } + } + + ret = vfio_migration_set_state(vbasedev, ~VFIO_DEVICE_STATE_MASK, + VFIO_DEVICE_STATE_RESUMING); + if (ret) { + error_report("%s: Failed to set state RESUMING", vbasedev->name); + if (migration->region.mmaps) { + vfio_region_unmap(&migration->region); + } + } + return ret; +} + +static int vfio_load_cleanup(void *opaque) +{ + VFIODevice *vbasedev = opaque; + + vfio_migration_cleanup(vbasedev); + trace_vfio_load_cleanup(vbasedev->name); + return 0; +} + +static int vfio_load_state(QEMUFile *f, void *opaque, int version_id) +{ + VFIODevice *vbasedev = opaque; + int ret = 0; + uint64_t data; + + data = qemu_get_be64(f); + while (data != VFIO_MIG_FLAG_END_OF_STATE) { + + trace_vfio_load_state(vbasedev->name, data); + + switch (data) { + case VFIO_MIG_FLAG_DEV_CONFIG_STATE: + { + ret = vfio_load_device_config_state(f, opaque); + if (ret) { + return ret; + } + break; + } + case VFIO_MIG_FLAG_DEV_SETUP_STATE: + { + data = qemu_get_be64(f); + if (data == VFIO_MIG_FLAG_END_OF_STATE) { + return ret; + } else { + error_report("%s: SETUP STATE: EOS not found 0x%"PRIx64, + vbasedev->name, data); + return -EINVAL; + } + break; + } + case VFIO_MIG_FLAG_DEV_DATA_STATE: + { + uint64_t data_size = qemu_get_be64(f); + + if (data_size) { + ret = vfio_load_buffer(f, vbasedev, data_size); + if (ret < 0) { + return ret; + } + } + break; + } + default: + error_report("%s: Unknown tag 0x%"PRIx64, vbasedev->name, data); + return -EINVAL; + } + + data = qemu_get_be64(f); + ret = qemu_file_get_error(f); + if (ret) { + return ret; + } + } + return ret; +} + +static SaveVMHandlers savevm_vfio_handlers = { + .save_setup = vfio_save_setup, + .save_cleanup = vfio_save_cleanup, + .save_live_pending = vfio_save_pending, + .save_live_iterate = vfio_save_iterate, + .save_live_complete_precopy = vfio_save_complete_precopy, + .load_setup = vfio_load_setup, + .load_cleanup = vfio_load_cleanup, + .load_state = vfio_load_state, +}; + +/* ---------------------------------------------------------------------- */ + +static void vfio_vmstate_change(void *opaque, int running, RunState state) +{ + VFIODevice *vbasedev = opaque; + VFIOMigration *migration = vbasedev->migration; + uint32_t value, mask; + int ret; + + if (vbasedev->migration->vm_running == running) { + return; + } + + if (running) { + /* + * Here device state can have one of _SAVING, _RESUMING or _STOP bit. + * Transition from _SAVING to _RUNNING can happen if there is migration + * failure, in that case clear _SAVING bit. + * Transition from _RESUMING to _RUNNING occurs during resuming + * phase, in that case clear _RESUMING bit. + * In both the above cases, set _RUNNING bit. + */ + mask = ~VFIO_DEVICE_STATE_MASK; + value = VFIO_DEVICE_STATE_RUNNING; + } else { + /* + * Here device state could be either _RUNNING or _SAVING|_RUNNING. Reset + * _RUNNING bit + */ + mask = ~VFIO_DEVICE_STATE_RUNNING; + value = 0; + } + + ret = vfio_migration_set_state(vbasedev, mask, value); + if (ret) { + /* + * Migration should be aborted in this case, but vm_state_notify() + * currently does not support reporting failures. + */ + error_report("%s: Failed to set device state 0x%x", vbasedev->name, + (migration->device_state & mask) | value); + qemu_file_set_error(migrate_get_current()->to_dst_file, ret); + } + vbasedev->migration->vm_running = running; + trace_vfio_vmstate_change(vbasedev->name, running, RunState_str(state), + (migration->device_state & mask) | value); +} + +static void vfio_migration_state_notifier(Notifier *notifier, void *data) +{ + MigrationState *s = data; + VFIOMigration *migration = container_of(notifier, VFIOMigration, + migration_state); + VFIODevice *vbasedev = migration->vbasedev; + int ret; + + trace_vfio_migration_state_notifier(vbasedev->name, + MigrationStatus_str(s->state)); + + switch (s->state) { + case MIGRATION_STATUS_CANCELLING: + case MIGRATION_STATUS_CANCELLED: + case MIGRATION_STATUS_FAILED: + bytes_transferred = 0; + ret = vfio_migration_set_state(vbasedev, + ~(VFIO_DEVICE_STATE_SAVING | VFIO_DEVICE_STATE_RESUMING), + VFIO_DEVICE_STATE_RUNNING); + if (ret) { + error_report("%s: Failed to set state RUNNING", vbasedev->name); + } + } +} + +static void vfio_migration_exit(VFIODevice *vbasedev) +{ + VFIOMigration *migration = vbasedev->migration; + + vfio_region_exit(&migration->region); + vfio_region_finalize(&migration->region); + g_free(vbasedev->migration); + vbasedev->migration = NULL; +} + +static int vfio_migration_init(VFIODevice *vbasedev, + struct vfio_region_info *info) +{ + int ret; + Object *obj; + VFIOMigration *migration; + char id[256] = ""; + g_autofree char *path = NULL, *oid = NULL; + + if (!vbasedev->ops->vfio_get_object) { + return -EINVAL; + } + + obj = vbasedev->ops->vfio_get_object(vbasedev); + if (!obj) { + return -EINVAL; + } + + vbasedev->migration = g_new0(VFIOMigration, 1); + + ret = vfio_region_setup(obj, vbasedev, &vbasedev->migration->region, + info->index, "migration"); + if (ret) { + error_report("%s: Failed to setup VFIO migration region %d: %s", + vbasedev->name, info->index, strerror(-ret)); + goto err; + } + + if (!vbasedev->migration->region.size) { + error_report("%s: Invalid zero-sized VFIO migration region %d", + vbasedev->name, info->index); + ret = -EINVAL; + goto err; + } + + migration = vbasedev->migration; + migration->vbasedev = vbasedev; + + oid = vmstate_if_get_id(VMSTATE_IF(DEVICE(obj))); + if (oid) { + path = g_strdup_printf("%s/vfio", oid); + } else { + path = g_strdup("vfio"); + } + strpadcpy(id, sizeof(id), path, '\0'); + + register_savevm_live(id, VMSTATE_INSTANCE_ID_ANY, 1, &savevm_vfio_handlers, + vbasedev); + + migration->vm_state = qemu_add_vm_change_state_handler(vfio_vmstate_change, + vbasedev); + migration->migration_state.notify = vfio_migration_state_notifier; + add_migration_state_change_notifier(&migration->migration_state); + return 0; + +err: + vfio_migration_exit(vbasedev); + return ret; +} + +/* ---------------------------------------------------------------------- */ + +int64_t vfio_mig_bytes_transferred(void) +{ + return bytes_transferred; +} + +int vfio_migration_probe(VFIODevice *vbasedev, Error **errp) +{ + VFIOContainer *container = vbasedev->group->container; + struct vfio_region_info *info = NULL; + Error *local_err = NULL; + int ret = -ENOTSUP; + + if (!container->dirty_pages_supported) { + goto add_blocker; + } + + ret = vfio_get_dev_region_info(vbasedev, VFIO_REGION_TYPE_MIGRATION, + VFIO_REGION_SUBTYPE_MIGRATION, &info); + if (ret) { + goto add_blocker; + } + + ret = vfio_migration_init(vbasedev, info); + if (ret) { + goto add_blocker; + } + + g_free(info); + trace_vfio_migration_probe(vbasedev->name, info->index); + return 0; + +add_blocker: + error_setg(&vbasedev->migration_blocker, + "VFIO device doesn't support migration"); + g_free(info); + + ret = migrate_add_blocker(vbasedev->migration_blocker, &local_err); + if (local_err) { + error_propagate(errp, local_err); + error_free(vbasedev->migration_blocker); + vbasedev->migration_blocker = NULL; + } + return ret; +} + +void vfio_migration_finalize(VFIODevice *vbasedev) +{ + if (vbasedev->migration) { + VFIOMigration *migration = vbasedev->migration; + + remove_migration_state_change_notifier(&migration->migration_state); + qemu_del_vm_change_state_handler(migration->vm_state); + vfio_migration_exit(vbasedev); + } + + if (vbasedev->migration_blocker) { + migrate_del_blocker(vbasedev->migration_blocker); + error_free(vbasedev->migration_blocker); + vbasedev->migration_blocker = NULL; + } +} diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 0d83eb0e47..58c0ce8971 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -41,6 +41,7 @@ #include "trace.h" #include "qapi/error.h" #include "migration/blocker.h" +#include "migration/qemu-file.h" #define TYPE_VFIO_PCI_NOHOTPLUG "vfio-pci-nohotplug" @@ -2394,10 +2395,68 @@ static void vfio_pci_compute_needs_reset(VFIODevice *vbasedev) } } +static Object *vfio_pci_get_object(VFIODevice *vbasedev) +{ + VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); + + return OBJECT(vdev); +} + +static bool vfio_msix_present(void *opaque, int version_id) +{ + PCIDevice *pdev = opaque; + + return msix_present(pdev); +} + +const VMStateDescription vmstate_vfio_pci_config = { + .name = "VFIOPCIDevice", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_PCI_DEVICE(pdev, VFIOPCIDevice), + VMSTATE_MSIX_TEST(pdev, VFIOPCIDevice, vfio_msix_present), + VMSTATE_END_OF_LIST() + } +}; + +static void vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f) +{ + VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); + + vmstate_save_state(f, &vmstate_vfio_pci_config, vdev, NULL); +} + +static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f) +{ + VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); + PCIDevice *pdev = &vdev->pdev; + int ret; + + ret = vmstate_load_state(f, &vmstate_vfio_pci_config, vdev, 1); + if (ret) { + return ret; + } + + vfio_pci_write_config(pdev, PCI_COMMAND, + pci_get_word(pdev->config + PCI_COMMAND), 2); + + if (msi_enabled(pdev)) { + vfio_msi_enable(vdev); + } else if (msix_enabled(pdev)) { + vfio_msix_enable(vdev); + } + + return ret; +} + static VFIODeviceOps vfio_pci_ops = { .vfio_compute_needs_reset = vfio_pci_compute_needs_reset, .vfio_hot_reset_multi = vfio_pci_hot_reset_multi, .vfio_eoi = vfio_intx_eoi, + .vfio_get_object = vfio_pci_get_object, + .vfio_save_config = vfio_pci_save_config, + .vfio_load_config = vfio_pci_load_config, }; int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp) @@ -2732,17 +2791,6 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) return; } - if (!pdev->failover_pair_id) { - error_setg(&vdev->migration_blocker, - "VFIO device doesn't support migration"); - ret = migrate_add_blocker(vdev->migration_blocker, errp); - if (ret) { - error_free(vdev->migration_blocker); - vdev->migration_blocker = NULL; - return; - } - } - vdev->vbasedev.name = g_path_get_basename(vdev->vbasedev.sysfsdev); vdev->vbasedev.ops = &vfio_pci_ops; vdev->vbasedev.type = VFIO_DEVICE_TYPE_PCI; @@ -3010,6 +3058,13 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) } } + if (!pdev->failover_pair_id) { + ret = vfio_migration_probe(&vdev->vbasedev, errp); + if (ret) { + error_report("%s: Migration disabled", vdev->vbasedev.name); + } + } + vfio_register_err_notifier(vdev); vfio_register_req_notifier(vdev); vfio_setup_resetfn_quirk(vdev); @@ -3024,11 +3079,6 @@ out_teardown: vfio_bars_exit(vdev); error: error_prepend(errp, VFIO_MSG_PREFIX, vdev->vbasedev.name); - if (vdev->migration_blocker) { - migrate_del_blocker(vdev->migration_blocker); - error_free(vdev->migration_blocker); - vdev->migration_blocker = NULL; - } } static void vfio_instance_finalize(Object *obj) @@ -3040,10 +3090,6 @@ static void vfio_instance_finalize(Object *obj) vfio_bars_finalize(vdev); g_free(vdev->emulated_config_bits); g_free(vdev->rom); - if (vdev->migration_blocker) { - migrate_del_blocker(vdev->migration_blocker); - error_free(vdev->migration_blocker); - } /* * XXX Leaking igd_opregion is not an oversight, we can't remove the * fw_cfg entry therefore leaking this allocation seems like the safest @@ -3071,6 +3117,7 @@ static void vfio_exitfn(PCIDevice *pdev) } vfio_teardown_msi(vdev); vfio_bars_exit(vdev); + vfio_migration_finalize(&vdev->vbasedev); } static void vfio_pci_reset(DeviceState *dev) diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index bce71a9ac9..1574ef983f 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -172,7 +172,6 @@ struct VFIOPCIDevice { bool no_vfio_ioeventfd; bool enable_ramfb; VFIODisplay *dpy; - Error *migration_blocker; Notifier irqchip_change_notifier; }; diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c index 869ed2c39d..cc3f66f7e4 100644 --- a/hw/vfio/platform.c +++ b/hw/vfio/platform.c @@ -166,7 +166,7 @@ static void vfio_intp_mmap_enable(void *opaque) VFIOINTp *tmp; VFIOPlatformDevice *vdev = (VFIOPlatformDevice *)opaque; - qemu_mutex_lock(&vdev->intp_mutex); + QEMU_LOCK_GUARD(&vdev->intp_mutex); QLIST_FOREACH(tmp, &vdev->intp_list, next) { if (tmp->state == VFIO_IRQ_ACTIVE) { trace_vfio_platform_intp_mmap_enable(tmp->pin); @@ -174,12 +174,10 @@ static void vfio_intp_mmap_enable(void *opaque) timer_mod(vdev->mmap_timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->mmap_timeout); - qemu_mutex_unlock(&vdev->intp_mutex); return; } } vfio_mmap_set_enabled(vdev, true); - qemu_mutex_unlock(&vdev->intp_mutex); } /** @@ -289,7 +287,7 @@ static void vfio_platform_eoi(VFIODevice *vbasedev) VFIOPlatformDevice *vdev = container_of(vbasedev, VFIOPlatformDevice, vbasedev); - qemu_mutex_lock(&vdev->intp_mutex); + QEMU_LOCK_GUARD(&vdev->intp_mutex); QLIST_FOREACH(intp, &vdev->intp_list, next) { if (intp->state == VFIO_IRQ_ACTIVE) { trace_vfio_platform_eoi(intp->pin, @@ -314,7 +312,6 @@ static void vfio_platform_eoi(VFIODevice *vbasedev) vfio_intp_inject_pending_lockheld(intp); QSIMPLEQ_REMOVE_HEAD(&vdev->pending_intp_queue, pqnext); } - qemu_mutex_unlock(&vdev->intp_mutex); } /** diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index 93a0bc2522..c0e75f24b7 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -113,6 +113,7 @@ vfio_region_mmap(const char *name, unsigned long offset, unsigned long end) "Reg vfio_region_exit(const char *name, int index) "Device %s, region %d" vfio_region_finalize(const char *name, int index) "Device %s, region %d" vfio_region_mmaps_set_enabled(const char *name, bool enabled) "Region %s mmaps enabled: %d" +vfio_region_unmap(const char *name, unsigned long offset, unsigned long end) "Region %s unmap [0x%lx - 0x%lx]" vfio_region_sparse_mmap_header(const char *name, int index, int nr_areas) "Device %s region %d: %d sparse mmap entries" vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned long end) "sparse entry %d [0x%lx - 0x%lx]" vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%0x8" @@ -144,3 +145,23 @@ vfio_display_edid_link_up(void) "" vfio_display_edid_link_down(void) "" vfio_display_edid_update(uint32_t prefx, uint32_t prefy) "%ux%u" vfio_display_edid_write_error(void) "" + +# migration.c +vfio_migration_probe(const char *name, uint32_t index) " (%s) Region %d" +vfio_migration_set_state(const char *name, uint32_t state) " (%s) state %d" +vfio_vmstate_change(const char *name, int running, const char *reason, uint32_t dev_state) " (%s) running %d reason %s device state %d" +vfio_migration_state_notifier(const char *name, const char *state) " (%s) state %s" +vfio_save_setup(const char *name) " (%s)" +vfio_save_cleanup(const char *name) " (%s)" +vfio_save_buffer(const char *name, uint64_t data_offset, uint64_t data_size, uint64_t pending) " (%s) Offset 0x%"PRIx64" size 0x%"PRIx64" pending 0x%"PRIx64 +vfio_update_pending(const char *name, uint64_t pending) " (%s) pending 0x%"PRIx64 +vfio_save_device_config_state(const char *name) " (%s)" +vfio_save_pending(const char *name, uint64_t precopy, uint64_t postcopy, uint64_t compatible) " (%s) precopy 0x%"PRIx64" postcopy 0x%"PRIx64" compatible 0x%"PRIx64 +vfio_save_iterate(const char *name, int data_size) " (%s) data_size %d" +vfio_save_complete_precopy(const char *name) " (%s)" +vfio_load_device_config_state(const char *name) " (%s)" +vfio_load_state(const char *name, uint64_t data) " (%s) data 0x%"PRIx64 +vfio_load_state_device_data(const char *name, uint64_t data_offset, uint64_t data_size) " (%s) Offset 0x%"PRIx64" size 0x%"PRIx64 +vfio_load_cleanup(const char *name) " (%s)" +vfio_get_dirty_bitmap(int fd, uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start) "container fd=%d, iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64 +vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64 |