diff options
author | Peter Maydell <peter.maydell@linaro.org> | 2023-03-09 15:19:44 +0000 |
---|---|---|
committer | Peter Maydell <peter.maydell@linaro.org> | 2023-03-09 15:19:44 +0000 |
commit | 66a6aa8f9a56a6317e074b1f5e269fecdf4ad782 (patch) | |
tree | f4ebf90dbeecbbdee2bda81f2a38247b7ae153e8 /hw | |
parent | dea644928d7583d91170d013716bbbeb938cb938 (diff) | |
parent | 969dae5448eaa2914be5b974f9e0311b3f95ee2c (diff) |
Merge tag 'vfio-updates-20230307.1' of https://gitlab.com/alex.williamson/qemu into staging
VFIO updates for 8.0
* Device level dirty page tracking support for vfio migration, as well as
various cleanups and consolidations. (Avihai Horon, Joao Martins)
* Trivial cleanup of migration entry points. (Alex Williamson)
* Fix trace event typo. (Cédric Le Goater)
# -----BEGIN PGP SIGNATURE-----
#
# iQJPBAABCAA5FiEEQvbATlQL0amee4qQI5ubbjuwiyIFAmQHgCUbHGFsZXgud2ls
# bGlhbXNvbkByZWRoYXQuY29tAAoJECObm247sIsi4i0P/RwP3TJ4jDBEW9JNa52O
# 6Hu6tWDccjSZFX7W/pnUztFtIqYBG6Jcms5VLZhaqrSda2BKa3dVoY+iU2finHRn
# q4CNQ4EVbKBG0HvA9SEd7WchAKADBCVpjeUBAF6jVQHBCQECHnfWtA2Y0T5oEGgw
# H1dwuw3YX6Jwyh5RmT/m7wNtOo2ms/CpDAc7d5rfLg0cDQ0vXPCu/CVvqAXbBpVd
# g7NrMLw1wfhKLYN2eWYkiZ+pGwNX5uxsp0jOSA7leFcfkuLX2KzQ99JpCNhX1oRd
# H5bedA62ffFLGQdlM2zyiAi37CgmeElKSlnaJdBX91Y4DQ3HSdbHYWoiYtzl89rB
# 7QxYHG7XOMdYKssN7qz+oVUpI+ycB18wSW2D/h4fJCNkH92cSHMyJ/yEA3r39eX4
# 7rgu0j8cg2iwIiGlh/klguXfatMDJvbrazDHYixKUJD5vlDXQvTe9LVpwUaUhGGM
# Gh4g8wx9gmDE9H1FbQ0kQqut70sO1Hnw2Pj19qzfdwfL6LeYWk+5AfQZmyziYGFM
# CGRKz5RhlN/Ori9gTKfn00stuxdD09Md5fPllKyMq7a1tkQt58RxLSkUN8hygeki
# Uqnlx5KXBLQ/7ZtnQNoe8frn5FhKBBSLC3tA71PyL4kIbcuiHXLvxIOeE9oJpSPi
# Bt8sTr3eCnVF9mys1ZmGmaYY
# =nM9d
# -----END PGP SIGNATURE-----
# gpg: Signature made Tue 07 Mar 2023 18:19:17 GMT
# gpg: using RSA key 42F6C04E540BD1A99E7B8A90239B9B6E3BB08B22
# gpg: issuer "alex.williamson@redhat.com"
# gpg: Good signature from "Alex Williamson <alex.williamson@redhat.com>" [full]
# gpg: aka "Alex Williamson <alex@shazbot.org>" [full]
# gpg: aka "Alex Williamson <alwillia@redhat.com>" [full]
# gpg: aka "Alex Williamson <alex.l.williamson@gmail.com>" [full]
# Primary key fingerprint: 42F6 C04E 540B D1A9 9E7B 8A90 239B 9B6E 3BB0 8B22
* tag 'vfio-updates-20230307.1' of https://gitlab.com/alex.williamson/qemu:
vfio: Fix vfio_get_dev_region() trace event
vfio/migration: Rename entry points
docs/devel: Document VFIO device dirty page tracking
vfio/migration: Query device dirty page tracking support
vfio/migration: Block migration with vIOMMU
vfio/common: Add device dirty page bitmap sync
vfio/common: Extract code from vfio_get_dirty_bitmap() to new function
vfio/common: Add device dirty page tracking start/stop
vfio/common: Record DMA mapped IOVA ranges
vfio/common: Add helper to consolidate iova/end calculation
vfio/common: Consolidate skip/invalid section into helper
vfio/common: Use a single tracepoint for skipped sections
vfio/common: Add helper to validate iova/end against hostwin
vfio/common: Add VFIOBitmap and alloc function
vfio/common: Abort migration if dirty log start/stop/sync fails
vfio/common: Fix wrong %m usages
vfio/common: Fix error reporting in vfio_get_dirty_bitmap()
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Diffstat (limited to 'hw')
-rw-r--r-- | hw/vfio/common.c | 699 |
-rw-r--r-- | hw/vfio/migration.c | 28 |
-rw-r--r-- | hw/vfio/pci.c | 5 |
-rw-r--r-- | hw/vfio/trace-events | 7 |
4 files changed, 600 insertions, 139 deletions
diff --git a/hw/vfio/common.c b/hw/vfio/common.c index bab83c0e55..4d01ea3515 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -42,6 +42,7 @@ #include "migration/migration.h" #include "migration/misc.h" #include "migration/blocker.h" +#include "migration/qemu-file.h" #include "sysemu/tpm.h" VFIOGroupList vfio_group_list = @@ -319,6 +320,28 @@ const MemoryRegionOps vfio_region_ops = { * Device state interfaces */ +typedef struct { + unsigned long *bitmap; + hwaddr size; + hwaddr pages; +} VFIOBitmap; + +static int vfio_bitmap_alloc(VFIOBitmap *vbmap, hwaddr size) +{ + vbmap->pages = REAL_HOST_PAGE_ALIGN(size) / qemu_real_host_page_size(); + vbmap->size = ROUND_UP(vbmap->pages, sizeof(__u64) * BITS_PER_BYTE) / + BITS_PER_BYTE; + vbmap->bitmap = g_try_malloc0(vbmap->size); + if (!vbmap->bitmap) { + return -ENOMEM; + } + + return 0; +} + +static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, + uint64_t size, ram_addr_t ram_addr); + bool vfio_mig_active(void) { VFIOGroup *group; @@ -339,6 +362,7 @@ bool vfio_mig_active(void) } static Error *multiple_devices_migration_blocker; +static Error *giommu_migration_blocker; static unsigned int vfio_migratable_device_num(void) { @@ -390,6 +414,64 @@ void vfio_unblock_multiple_devices_migration(void) multiple_devices_migration_blocker = NULL; } +static bool vfio_viommu_preset(void) +{ + VFIOAddressSpace *space; + + QLIST_FOREACH(space, &vfio_address_spaces, list) { + if (space->as != &address_space_memory) { + return true; + } + } + + return false; +} + +int vfio_block_giommu_migration(Error **errp) +{ + int ret; + + if (giommu_migration_blocker || + !vfio_viommu_preset()) { + return 0; + } + + error_setg(&giommu_migration_blocker, + "Migration is currently not supported with vIOMMU enabled"); + ret = migrate_add_blocker(giommu_migration_blocker, errp); + if (ret < 0) { + error_free(giommu_migration_blocker); + giommu_migration_blocker = NULL; + } + + return ret; +} + +void vfio_migration_finalize(void) 
+{ + if (!giommu_migration_blocker || + vfio_viommu_preset()) { + return; + } + + migrate_del_blocker(giommu_migration_blocker); + error_free(giommu_migration_blocker); + giommu_migration_blocker = NULL; +} + +static void vfio_set_migration_error(int err) +{ + MigrationState *ms = migrate_get_current(); + + if (migration_is_setup_or_active(ms->state)) { + WITH_QEMU_LOCK_GUARD(&ms->qemu_file_lock) { + if (ms->to_dst_file) { + qemu_file_set_error(ms->to_dst_file, err); + } + } + } +} + static bool vfio_devices_all_dirty_tracking(VFIOContainer *container) { VFIOGroup *group; @@ -417,6 +499,22 @@ static bool vfio_devices_all_dirty_tracking(VFIOContainer *container) return true; } +static bool vfio_devices_all_device_dirty_tracking(VFIOContainer *container) +{ + VFIOGroup *group; + VFIODevice *vbasedev; + + QLIST_FOREACH(group, &container->group_list, container_next) { + QLIST_FOREACH(vbasedev, &group->device_list, next) { + if (!vbasedev->dirty_pages_supported) { + return false; + } + } + } + + return true; +} + /* * Check if all VFIO devices are running and migration is active, which is * essentially equivalent to the migration being in pre-copy phase. @@ -454,9 +552,14 @@ static int vfio_dma_unmap_bitmap(VFIOContainer *container, { struct vfio_iommu_type1_dma_unmap *unmap; struct vfio_bitmap *bitmap; - uint64_t pages = REAL_HOST_PAGE_ALIGN(size) / qemu_real_host_page_size(); + VFIOBitmap vbmap; int ret; + ret = vfio_bitmap_alloc(&vbmap, size); + if (ret) { + return ret; + } + unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap)); unmap->argsz = sizeof(*unmap) + sizeof(*bitmap); @@ -470,35 +573,28 @@ static int vfio_dma_unmap_bitmap(VFIOContainer *container, * qemu_real_host_page_size to mark those dirty. Hence set bitmap_pgsize * to qemu_real_host_page_size. 
*/ - bitmap->pgsize = qemu_real_host_page_size(); - bitmap->size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) / - BITS_PER_BYTE; + bitmap->size = vbmap.size; + bitmap->data = (__u64 *)vbmap.bitmap; - if (bitmap->size > container->max_dirty_bitmap_size) { - error_report("UNMAP: Size of bitmap too big 0x%"PRIx64, - (uint64_t)bitmap->size); + if (vbmap.size > container->max_dirty_bitmap_size) { + error_report("UNMAP: Size of bitmap too big 0x%"PRIx64, vbmap.size); ret = -E2BIG; goto unmap_exit; } - bitmap->data = g_try_malloc0(bitmap->size); - if (!bitmap->data) { - ret = -ENOMEM; - goto unmap_exit; - } - ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap); if (!ret) { - cpu_physical_memory_set_dirty_lebitmap((unsigned long *)bitmap->data, - iotlb->translated_addr, pages); + cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap, + iotlb->translated_addr, vbmap.pages); } else { error_report("VFIO_UNMAP_DMA with DIRTY_BITMAP : %m"); } - g_free(bitmap->data); unmap_exit: g_free(unmap); + g_free(vbmap.bitmap); + return ret; } @@ -515,10 +611,16 @@ static int vfio_dma_unmap(VFIOContainer *container, .iova = iova, .size = size, }; + bool need_dirty_sync = false; + int ret; - if (iotlb && container->dirty_pages_supported && - vfio_devices_all_running_and_mig_active(container)) { - return vfio_dma_unmap_bitmap(container, iova, size, iotlb); + if (iotlb && vfio_devices_all_running_and_mig_active(container)) { + if (!vfio_devices_all_device_dirty_tracking(container) && + container->dirty_pages_supported) { + return vfio_dma_unmap_bitmap(container, iova, size, iotlb); + } + + need_dirty_sync = true; } while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) { @@ -544,10 +646,12 @@ static int vfio_dma_unmap(VFIOContainer *container, return -errno; } - if (iotlb && vfio_devices_all_running_and_mig_active(container)) { - cpu_physical_memory_set_dirty_range(iotlb->translated_addr, size, - tcg_enabled() ? 
DIRTY_CLIENTS_ALL : - DIRTY_CLIENTS_NOCODE); + if (need_dirty_sync) { + ret = vfio_get_dirty_bitmap(container, iova, size, + iotlb->translated_addr); + if (ret) { + return ret; + } } return 0; @@ -680,6 +784,7 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) if (iotlb->target_as != &address_space_memory) { error_report("Wrong target AS \"%s\", only system memory is allowed", iotlb->target_as->name ? iotlb->target_as->name : "none"); + vfio_set_migration_error(-EINVAL); return; } @@ -703,17 +808,18 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) read_only); if (ret) { error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", " - "0x%"HWADDR_PRIx", %p) = %d (%m)", + "0x%"HWADDR_PRIx", %p) = %d (%s)", container, iova, - iotlb->addr_mask + 1, vaddr, ret); + iotlb->addr_mask + 1, vaddr, ret, strerror(-ret)); } } else { ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1, iotlb); if (ret) { error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " - "0x%"HWADDR_PRIx") = %d (%m)", + "0x%"HWADDR_PRIx") = %d (%s)", container, iova, - iotlb->addr_mask + 1, ret); + iotlb->addr_mask + 1, ret, strerror(-ret)); + vfio_set_migration_error(ret); } } out: @@ -868,6 +974,22 @@ static void vfio_unregister_ram_discard_listener(VFIOContainer *container, g_free(vrdl); } +static VFIOHostDMAWindow *vfio_find_hostwin(VFIOContainer *container, + hwaddr iova, hwaddr end) +{ + VFIOHostDMAWindow *hostwin; + bool hostwin_found = false; + + QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { + if (hostwin->min_iova <= iova && end <= hostwin->max_iova) { + hostwin_found = true; + break; + } + } + + return hostwin_found ? 
hostwin : NULL; +} + static bool vfio_known_safe_misalignment(MemoryRegionSection *section) { MemoryRegion *mr = section->mr; @@ -884,24 +1006,15 @@ static bool vfio_known_safe_misalignment(MemoryRegionSection *section) return true; } -static void vfio_listener_region_add(MemoryListener *listener, - MemoryRegionSection *section) +static bool vfio_listener_valid_section(MemoryRegionSection *section, + const char *name) { - VFIOContainer *container = container_of(listener, VFIOContainer, listener); - hwaddr iova, end; - Int128 llend, llsize; - void *vaddr; - int ret; - VFIOHostDMAWindow *hostwin; - bool hostwin_found; - Error *err = NULL; - if (vfio_listener_skipped_section(section)) { - trace_vfio_listener_region_add_skip( + trace_vfio_listener_region_skip(name, section->offset_within_address_space, section->offset_within_address_space + int128_get64(int128_sub(section->size, int128_one()))); - return; + return false; } if (unlikely((section->offset_within_address_space & @@ -916,15 +1029,53 @@ static void vfio_listener_region_add(MemoryListener *listener, section->offset_within_region, qemu_real_host_page_size()); } - return; + return false; } + return true; +} + +static bool vfio_get_section_iova_range(VFIOContainer *container, + MemoryRegionSection *section, + hwaddr *out_iova, hwaddr *out_end, + Int128 *out_llend) +{ + Int128 llend; + hwaddr iova; + iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space); llend = int128_make64(section->offset_within_address_space); llend = int128_add(llend, section->size); llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask())); if (int128_ge(int128_make64(iova), llend)) { + return false; + } + + *out_iova = iova; + *out_end = int128_get64(int128_sub(llend, int128_one())); + if (out_llend) { + *out_llend = llend; + } + return true; +} + +static void vfio_listener_region_add(MemoryListener *listener, + MemoryRegionSection *section) +{ + VFIOContainer *container = container_of(listener, VFIOContainer, 
listener); + hwaddr iova, end; + Int128 llend, llsize; + void *vaddr; + int ret; + VFIOHostDMAWindow *hostwin; + Error *err = NULL; + + if (!vfio_listener_valid_section(section, "region_add")) { + return; + } + + if (!vfio_get_section_iova_range(container, section, &iova, &end, &llend)) { if (memory_region_is_ram_device(section->mr)) { trace_vfio_listener_region_add_no_dma_map( memory_region_name(section->mr), @@ -934,7 +1085,6 @@ static void vfio_listener_region_add(MemoryListener *listener, } return; } - end = int128_get64(int128_sub(llend, int128_one())); if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { hwaddr pgsize = 0; @@ -994,15 +1144,8 @@ static void vfio_listener_region_add(MemoryListener *listener, #endif } - hostwin_found = false; - QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { - if (hostwin->min_iova <= iova && end <= hostwin->max_iova) { - hostwin_found = true; - break; - } - } - - if (!hostwin_found) { + hostwin = vfio_find_hostwin(container, iova, end); + if (!hostwin) { error_setg(&err, "Container %p can't map guest IOVA region" " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container, iova, end); goto fail; @@ -1095,8 +1238,9 @@ static void vfio_listener_region_add(MemoryListener *listener, vaddr, section->readonly); if (ret) { error_setg(&err, "vfio_dma_map(%p, 0x%"HWADDR_PRIx", " - "0x%"HWADDR_PRIx", %p) = %d (%m)", - container, iova, int128_get64(llsize), vaddr, ret); + "0x%"HWADDR_PRIx", %p) = %d (%s)", + container, iova, int128_get64(llsize), vaddr, ret, + strerror(-ret)); if (memory_region_is_ram_device(section->mr)) { /* Allow unexpected mappings not to be fatal for RAM devices */ error_report_err(err); @@ -1140,26 +1284,7 @@ static void vfio_listener_region_del(MemoryListener *listener, int ret; bool try_unmap = true; - if (vfio_listener_skipped_section(section)) { - trace_vfio_listener_region_del_skip( - section->offset_within_address_space, - section->offset_within_address_space + - int128_get64(int128_sub(section->size, 
int128_one()))); - return; - } - - if (unlikely((section->offset_within_address_space & - ~qemu_real_host_page_mask()) != - (section->offset_within_region & ~qemu_real_host_page_mask()))) { - if (!vfio_known_safe_misalignment(section)) { - error_report("%s received unaligned region %s iova=0x%"PRIx64 - " offset_within_region=0x%"PRIx64 - " qemu_real_host_page_size=0x%"PRIxPTR, - __func__, memory_region_name(section->mr), - section->offset_within_address_space, - section->offset_within_region, - qemu_real_host_page_size()); - } + if (!vfio_listener_valid_section(section, "region_del")) { return; } @@ -1186,15 +1311,9 @@ static void vfio_listener_region_del(MemoryListener *listener, */ } - iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space); - llend = int128_make64(section->offset_within_address_space); - llend = int128_add(llend, section->size); - llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask())); - - if (int128_ge(int128_make64(iova), llend)) { + if (!vfio_get_section_iova_range(container, section, &iova, &end, &llend)) { return; } - end = int128_get64(int128_sub(llend, int128_one())); llsize = int128_sub(llend, int128_make64(iova)); @@ -1203,15 +1322,9 @@ static void vfio_listener_region_del(MemoryListener *listener, if (memory_region_is_ram_device(section->mr)) { hwaddr pgmask; VFIOHostDMAWindow *hostwin; - bool hostwin_found = false; - QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { - if (hostwin->min_iova <= iova && end <= hostwin->max_iova) { - hostwin_found = true; - break; - } - } - assert(hostwin_found); /* or region_add() would have failed */ + hostwin = vfio_find_hostwin(container, iova, end); + assert(hostwin); /* or region_add() would have failed */ pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1; try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask)); @@ -1228,16 +1341,18 @@ static void vfio_listener_region_del(MemoryListener *listener, ret = vfio_dma_unmap(container, iova, 
int128_get64(llsize), NULL); if (ret) { error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " - "0x%"HWADDR_PRIx") = %d (%m)", - container, iova, int128_get64(llsize), ret); + "0x%"HWADDR_PRIx") = %d (%s)", + container, iova, int128_get64(llsize), ret, + strerror(-ret)); } iova += int128_get64(llsize); } ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL); if (ret) { error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " - "0x%"HWADDR_PRIx") = %d (%m)", - container, iova, int128_get64(llsize), ret); + "0x%"HWADDR_PRIx") = %d (%s)", + container, iova, int128_get64(llsize), ret, + strerror(-ret)); } } @@ -1256,7 +1371,7 @@ static void vfio_listener_region_del(MemoryListener *listener, } } -static void vfio_set_dirty_page_tracking(VFIOContainer *container, bool start) +static int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start) { int ret; struct vfio_iommu_type1_dirty_bitmap dirty = { @@ -1264,7 +1379,7 @@ static void vfio_set_dirty_page_tracking(VFIOContainer *container, bool start) }; if (!container->dirty_pages_supported) { - return; + return 0; } if (start) { @@ -1275,40 +1390,327 @@ static void vfio_set_dirty_page_tracking(VFIOContainer *container, bool start) ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, &dirty); if (ret) { + ret = -errno; error_report("Failed to set dirty tracking flag 0x%x errno: %d", dirty.flags, errno); } + + return ret; +} + +typedef struct VFIODirtyRanges { + hwaddr min32; + hwaddr max32; + hwaddr min64; + hwaddr max64; +} VFIODirtyRanges; + +typedef struct VFIODirtyRangesListener { + VFIOContainer *container; + VFIODirtyRanges ranges; + MemoryListener listener; +} VFIODirtyRangesListener; + +static void vfio_dirty_tracking_update(MemoryListener *listener, + MemoryRegionSection *section) +{ + VFIODirtyRangesListener *dirty = container_of(listener, + VFIODirtyRangesListener, + listener); + VFIODirtyRanges *range = &dirty->ranges; + hwaddr iova, end, *min, *max; + + if (!vfio_listener_valid_section(section, 
"tracking_update") || + !vfio_get_section_iova_range(dirty->container, section, + &iova, &end, NULL)) { + return; + } + + /* + * The address space passed to the dirty tracker is reduced to two ranges: + * one for 32-bit DMA ranges, and another one for 64-bit DMA ranges. + * The underlying reports of dirty will query a sub-interval of each of + * these ranges. + * + * The purpose of the dual range handling is to handle known cases of big + * holes in the address space, like the x86 AMD 1T hole. The alternative + * would be an IOVATree but that has a much bigger runtime overhead and + * unnecessary complexity. + */ + min = (end <= UINT32_MAX) ? &range->min32 : &range->min64; + max = (end <= UINT32_MAX) ? &range->max32 : &range->max64; + + if (*min > iova) { + *min = iova; + } + if (*max < end) { + *max = end; + } + + trace_vfio_device_dirty_tracking_update(iova, end, *min, *max); + return; +} + +static const MemoryListener vfio_dirty_tracking_listener = { + .name = "vfio-tracking", + .region_add = vfio_dirty_tracking_update, +}; + +static void vfio_dirty_tracking_init(VFIOContainer *container, + VFIODirtyRanges *ranges) +{ + VFIODirtyRangesListener dirty; + + memset(&dirty, 0, sizeof(dirty)); + dirty.ranges.min32 = UINT32_MAX; + dirty.ranges.min64 = UINT64_MAX; + dirty.listener = vfio_dirty_tracking_listener; + dirty.container = container; + + memory_listener_register(&dirty.listener, + container->space->as); + + *ranges = dirty.ranges; + + /* + * The memory listener is synchronous, and used to calculate the range + * to dirty tracking. Unregister it after we are done as we are not + * interested in any follow-up updates. 
+ */ + memory_listener_unregister(&dirty.listener); +} + +static void vfio_devices_dma_logging_stop(VFIOContainer *container) +{ + uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature), + sizeof(uint64_t))] = {}; + struct vfio_device_feature *feature = (struct vfio_device_feature *)buf; + VFIODevice *vbasedev; + VFIOGroup *group; + + feature->argsz = sizeof(buf); + feature->flags = VFIO_DEVICE_FEATURE_SET | + VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP; + + QLIST_FOREACH(group, &container->group_list, container_next) { + QLIST_FOREACH(vbasedev, &group->device_list, next) { + if (!vbasedev->dirty_tracking) { + continue; + } + + if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) { + warn_report("%s: Failed to stop DMA logging, err %d (%s)", + vbasedev->name, -errno, strerror(errno)); + } + vbasedev->dirty_tracking = false; + } + } +} + +static struct vfio_device_feature * +vfio_device_feature_dma_logging_start_create(VFIOContainer *container, + VFIODirtyRanges *tracking) +{ + struct vfio_device_feature *feature; + size_t feature_size; + struct vfio_device_feature_dma_logging_control *control; + struct vfio_device_feature_dma_logging_range *ranges; + + feature_size = sizeof(struct vfio_device_feature) + + sizeof(struct vfio_device_feature_dma_logging_control); + feature = g_try_malloc0(feature_size); + if (!feature) { + errno = ENOMEM; + return NULL; + } + feature->argsz = feature_size; + feature->flags = VFIO_DEVICE_FEATURE_SET | + VFIO_DEVICE_FEATURE_DMA_LOGGING_START; + + control = (struct vfio_device_feature_dma_logging_control *)feature->data; + control->page_size = qemu_real_host_page_size(); + + /* + * DMA logging uAPI guarantees to support at least a number of ranges that + * fits into a single host kernel base page. 
+ */ + control->num_ranges = !!tracking->max32 + !!tracking->max64; + ranges = g_try_new0(struct vfio_device_feature_dma_logging_range, + control->num_ranges); + if (!ranges) { + g_free(feature); + errno = ENOMEM; + + return NULL; + } + + control->ranges = (__u64)(uintptr_t)ranges; + if (tracking->max32) { + ranges->iova = tracking->min32; + ranges->length = (tracking->max32 - tracking->min32) + 1; + ranges++; + } + if (tracking->max64) { + ranges->iova = tracking->min64; + ranges->length = (tracking->max64 - tracking->min64) + 1; + } + + trace_vfio_device_dirty_tracking_start(control->num_ranges, + tracking->min32, tracking->max32, + tracking->min64, tracking->max64); + + return feature; +} + +static void vfio_device_feature_dma_logging_start_destroy( + struct vfio_device_feature *feature) +{ + struct vfio_device_feature_dma_logging_control *control = + (struct vfio_device_feature_dma_logging_control *)feature->data; + struct vfio_device_feature_dma_logging_range *ranges = + (struct vfio_device_feature_dma_logging_range *)(uintptr_t)control->ranges; + + g_free(ranges); + g_free(feature); +} + +static int vfio_devices_dma_logging_start(VFIOContainer *container) +{ + struct vfio_device_feature *feature; + VFIODirtyRanges ranges; + VFIODevice *vbasedev; + VFIOGroup *group; + int ret = 0; + + vfio_dirty_tracking_init(container, &ranges); + feature = vfio_device_feature_dma_logging_start_create(container, + &ranges); + if (!feature) { + return -errno; + } + + QLIST_FOREACH(group, &container->group_list, container_next) { + QLIST_FOREACH(vbasedev, &group->device_list, next) { + if (vbasedev->dirty_tracking) { + continue; + } + + ret = ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature); + if (ret) { + ret = -errno; + error_report("%s: Failed to start DMA logging, err %d (%s)", + vbasedev->name, ret, strerror(errno)); + goto out; + } + vbasedev->dirty_tracking = true; + } + } + +out: + if (ret) { + vfio_devices_dma_logging_stop(container); + } + + 
vfio_device_feature_dma_logging_start_destroy(feature); + + return ret; } static void vfio_listener_log_global_start(MemoryListener *listener) { VFIOContainer *container = container_of(listener, VFIOContainer, listener); + int ret; + + if (vfio_devices_all_device_dirty_tracking(container)) { + ret = vfio_devices_dma_logging_start(container); + } else { + ret = vfio_set_dirty_page_tracking(container, true); + } - vfio_set_dirty_page_tracking(container, true); + if (ret) { + error_report("vfio: Could not start dirty page tracking, err: %d (%s)", + ret, strerror(-ret)); + vfio_set_migration_error(ret); + } } static void vfio_listener_log_global_stop(MemoryListener *listener) { VFIOContainer *container = container_of(listener, VFIOContainer, listener); + int ret = 0; + + if (vfio_devices_all_device_dirty_tracking(container)) { + vfio_devices_dma_logging_stop(container); + } else { + ret = vfio_set_dirty_page_tracking(container, false); + } - vfio_set_dirty_page_tracking(container, false); + if (ret) { + error_report("vfio: Could not stop dirty page tracking, err: %d (%s)", + ret, strerror(-ret)); + vfio_set_migration_error(ret); + } } -static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, - uint64_t size, ram_addr_t ram_addr) +static int vfio_device_dma_logging_report(VFIODevice *vbasedev, hwaddr iova, + hwaddr size, void *bitmap) { - struct vfio_iommu_type1_dirty_bitmap *dbitmap; - struct vfio_iommu_type1_dirty_bitmap_get *range; - uint64_t pages; + uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) + + sizeof(struct vfio_device_feature_dma_logging_report), + sizeof(__u64))] = {}; + struct vfio_device_feature *feature = (struct vfio_device_feature *)buf; + struct vfio_device_feature_dma_logging_report *report = + (struct vfio_device_feature_dma_logging_report *)feature->data; + + report->iova = iova; + report->length = size; + report->page_size = qemu_real_host_page_size(); + report->bitmap = (__u64)(uintptr_t)bitmap; + + feature->argsz = 
sizeof(buf); + feature->flags = VFIO_DEVICE_FEATURE_GET | + VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT; + + if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) { + return -errno; + } + + return 0; +} + +static int vfio_devices_query_dirty_bitmap(VFIOContainer *container, + VFIOBitmap *vbmap, hwaddr iova, + hwaddr size) +{ + VFIODevice *vbasedev; + VFIOGroup *group; int ret; - if (!container->dirty_pages_supported) { - cpu_physical_memory_set_dirty_range(ram_addr, size, - tcg_enabled() ? DIRTY_CLIENTS_ALL : - DIRTY_CLIENTS_NOCODE); - return 0; + QLIST_FOREACH(group, &container->group_list, container_next) { + QLIST_FOREACH(vbasedev, &group->device_list, next) { + ret = vfio_device_dma_logging_report(vbasedev, iova, size, + vbmap->bitmap); + if (ret) { + error_report("%s: Failed to get DMA logging report, iova: " + "0x%" HWADDR_PRIx ", size: 0x%" HWADDR_PRIx + ", err: %d (%s)", + vbasedev->name, iova, size, ret, strerror(-ret)); + + return ret; + } + } } + return 0; +} + +static int vfio_query_dirty_bitmap(VFIOContainer *container, VFIOBitmap *vbmap, + hwaddr iova, hwaddr size) +{ + struct vfio_iommu_type1_dirty_bitmap *dbitmap; + struct vfio_iommu_type1_dirty_bitmap_get *range; + int ret; + dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range)); dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range); @@ -1323,36 +1725,63 @@ static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, * to qemu_real_host_page_size. 
*/ range->bitmap.pgsize = qemu_real_host_page_size(); - - pages = REAL_HOST_PAGE_ALIGN(range->size) / qemu_real_host_page_size(); - range->bitmap.size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) / - BITS_PER_BYTE; - range->bitmap.data = g_try_malloc0(range->bitmap.size); - if (!range->bitmap.data) { - ret = -ENOMEM; - goto err_out; - } + range->bitmap.size = vbmap->size; + range->bitmap.data = (__u64 *)vbmap->bitmap; ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap); if (ret) { + ret = -errno; error_report("Failed to get dirty bitmap for iova: 0x%"PRIx64 " size: 0x%"PRIx64" err: %d", (uint64_t)range->iova, (uint64_t)range->size, errno); - goto err_out; } - cpu_physical_memory_set_dirty_lebitmap((unsigned long *)range->bitmap.data, - ram_addr, pages); - - trace_vfio_get_dirty_bitmap(container->fd, range->iova, range->size, - range->bitmap.size, ram_addr); -err_out: - g_free(range->bitmap.data); g_free(dbitmap); return ret; } +static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, + uint64_t size, ram_addr_t ram_addr) +{ + bool all_device_dirty_tracking = + vfio_devices_all_device_dirty_tracking(container); + VFIOBitmap vbmap; + int ret; + + if (!container->dirty_pages_supported && !all_device_dirty_tracking) { + cpu_physical_memory_set_dirty_range(ram_addr, size, + tcg_enabled() ? 
DIRTY_CLIENTS_ALL : + DIRTY_CLIENTS_NOCODE); + return 0; + } + + ret = vfio_bitmap_alloc(&vbmap, size); + if (ret) { + return ret; + } + + if (all_device_dirty_tracking) { + ret = vfio_devices_query_dirty_bitmap(container, &vbmap, iova, size); + } else { + ret = vfio_query_dirty_bitmap(container, &vbmap, iova, size); + } + + if (ret) { + goto out; + } + + cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap, ram_addr, + vbmap.pages); + + trace_vfio_get_dirty_bitmap(container->fd, iova, size, vbmap.size, + ram_addr); +out: + g_free(vbmap.bitmap); + + return ret; +} + typedef struct { IOMMUNotifier n; VFIOGuestIOMMU *giommu; @@ -1366,29 +1795,33 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) VFIOContainer *container = giommu->container; hwaddr iova = iotlb->iova + giommu->iommu_offset; ram_addr_t translated_addr; + int ret = -EINVAL; trace_vfio_iommu_map_dirty_notify(iova, iova + iotlb->addr_mask); if (iotlb->target_as != &address_space_memory) { error_report("Wrong target AS \"%s\", only system memory is allowed", iotlb->target_as->name ? 
iotlb->target_as->name : "none"); - return; + goto out; } rcu_read_lock(); if (vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL)) { - int ret; - ret = vfio_get_dirty_bitmap(container, iova, iotlb->addr_mask + 1, translated_addr); if (ret) { error_report("vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", " - "0x%"HWADDR_PRIx") = %d (%m)", - container, iova, - iotlb->addr_mask + 1, ret); + "0x%"HWADDR_PRIx") = %d (%s)", + container, iova, iotlb->addr_mask + 1, ret, + strerror(-ret)); } } rcu_read_unlock(); + +out: + if (ret) { + vfio_set_migration_error(ret); + } } static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section, @@ -1481,13 +1914,19 @@ static void vfio_listener_log_sync(MemoryListener *listener, MemoryRegionSection *section) { VFIOContainer *container = container_of(listener, VFIOContainer, listener); + int ret; if (vfio_listener_skipped_section(section)) { return; } if (vfio_devices_all_dirty_tracking(container)) { - vfio_sync_dirty_bitmap(container, section); + ret = vfio_sync_dirty_bitmap(container, section); + if (ret) { + error_report("vfio: Failed to sync dirty bitmap, err: %d (%s)", ret, + strerror(-ret)); + vfio_set_migration_error(ret); + } } } diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index a2c3d9bade..1a1a8659c8 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -521,7 +521,7 @@ static void vfio_migration_state_notifier(Notifier *notifier, void *data) } } -static void vfio_migration_exit(VFIODevice *vbasedev) +static void vfio_migration_free(VFIODevice *vbasedev) { g_free(vbasedev->migration); vbasedev->migration = NULL; @@ -555,6 +555,19 @@ static int vfio_migration_query_flags(VFIODevice *vbasedev, uint64_t *mig_flags) return 0; } +static bool vfio_dma_logging_supported(VFIODevice *vbasedev) +{ + uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature), + sizeof(uint64_t))] = {}; + struct vfio_device_feature *feature = (struct vfio_device_feature *)buf; + + feature->argsz = sizeof(buf); + 
feature->flags = VFIO_DEVICE_FEATURE_PROBE | + VFIO_DEVICE_FEATURE_DMA_LOGGING_START; + + return !ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature); +} + static int vfio_migration_init(VFIODevice *vbasedev) { int ret; @@ -589,6 +602,8 @@ static int vfio_migration_init(VFIODevice *vbasedev) migration->device_state = VFIO_DEVICE_STATE_RUNNING; migration->data_fd = -1; + vbasedev->dirty_pages_supported = vfio_dma_logging_supported(vbasedev); + oid = vmstate_if_get_id(VMSTATE_IF(DEVICE(obj))); if (oid) { path = g_strdup_printf("%s/vfio", oid); @@ -616,7 +631,7 @@ int64_t vfio_mig_bytes_transferred(void) return bytes_transferred; } -int vfio_migration_probe(VFIODevice *vbasedev, Error **errp) +int vfio_migration_realize(VFIODevice *vbasedev, Error **errp) { int ret = -ENOTSUP; @@ -634,6 +649,11 @@ int vfio_migration_probe(VFIODevice *vbasedev, Error **errp) return ret; } + ret = vfio_block_giommu_migration(errp); + if (ret) { + return ret; + } + trace_vfio_migration_probe(vbasedev->name); return 0; @@ -649,7 +669,7 @@ add_blocker: return ret; } -void vfio_migration_finalize(VFIODevice *vbasedev) +void vfio_migration_exit(VFIODevice *vbasedev) { if (vbasedev->migration) { VFIOMigration *migration = vbasedev->migration; @@ -657,7 +677,7 @@ void vfio_migration_finalize(VFIODevice *vbasedev) remove_migration_state_change_notifier(&migration->migration_state); qemu_del_vm_change_state_handler(migration->vm_state); unregister_savevm(VMSTATE_IF(vbasedev->dev), "vfio", vbasedev); - vfio_migration_exit(vbasedev); + vfio_migration_free(vbasedev); vfio_unblock_multiple_devices_migration(); } diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 939dcc3d4a..ec9a854361 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -3145,7 +3145,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) } if (!pdev->failover_pair_id) { - ret = vfio_migration_probe(vbasedev, errp); + ret = vfio_migration_realize(vbasedev, errp); if (ret) { error_report("%s: Migration disabled", vbasedev->name); } @@ 
-3185,6 +3185,7 @@ static void vfio_instance_finalize(Object *obj) */ vfio_put_device(vdev); vfio_put_group(group); + vfio_migration_finalize(); } static void vfio_exitfn(PCIDevice *pdev) @@ -3203,7 +3204,7 @@ static void vfio_exitfn(PCIDevice *pdev) } vfio_teardown_msi(vdev); vfio_bars_exit(vdev); - vfio_migration_finalize(&vdev->vbasedev); + vfio_migration_exit(&vdev->vbasedev); } static void vfio_pci_reset(DeviceState *dev) diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index 669d9fe07c..646e42fd27 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -96,14 +96,15 @@ vfio_pci_igd_lpc_bridge_enabled(const char *name) "%s" vfio_region_write(const char *name, int index, uint64_t addr, uint64_t data, unsigned size) " (%s:region%d+0x%"PRIx64", 0x%"PRIx64 ", %d)" vfio_region_read(char *name, int index, uint64_t addr, unsigned size, uint64_t data) " (%s:region%d+0x%"PRIx64", %d) = 0x%"PRIx64 vfio_iommu_map_notify(const char *op, uint64_t iova_start, uint64_t iova_end) "iommu %s @ 0x%"PRIx64" - 0x%"PRIx64 -vfio_listener_region_add_skip(uint64_t start, uint64_t end) "SKIPPING region_add 0x%"PRIx64" - 0x%"PRIx64 +vfio_listener_region_skip(const char *name, uint64_t start, uint64_t end) "SKIPPING %s 0x%"PRIx64" - 0x%"PRIx64 vfio_spapr_group_attach(int groupfd, int tablefd) "Attached groupfd %d to liobn fd %d" vfio_listener_region_add_iommu(uint64_t start, uint64_t end) "region_add [iommu] 0x%"PRIx64" - 0x%"PRIx64 vfio_listener_region_add_ram(uint64_t iova_start, uint64_t iova_end, void *vaddr) "region_add [ram] 0x%"PRIx64" - 0x%"PRIx64" [%p]" vfio_known_safe_misalignment(const char *name, uint64_t iova, uint64_t offset_within_region, uintptr_t page_size) "Region \"%s\" iova=0x%"PRIx64" offset_within_region=0x%"PRIx64" qemu_real_host_page_size=0x%"PRIxPTR vfio_listener_region_add_no_dma_map(const char *name, uint64_t iova, uint64_t size, uint64_t page_size) "Region \"%s\" 0x%"PRIx64" size=0x%"PRIx64" is not aligned to 0x%"PRIx64" and cannot be mapped 
for DMA" -vfio_listener_region_del_skip(uint64_t start, uint64_t end) "SKIPPING region_del 0x%"PRIx64" - 0x%"PRIx64 vfio_listener_region_del(uint64_t start, uint64_t end) "region_del 0x%"PRIx64" - 0x%"PRIx64 +vfio_device_dirty_tracking_update(uint64_t start, uint64_t end, uint64_t min, uint64_t max) "section 0x%"PRIx64" - 0x%"PRIx64" -> update [0x%"PRIx64" - 0x%"PRIx64"]" +vfio_device_dirty_tracking_start(int nr_ranges, uint64_t min32, uint64_t max32, uint64_t min64, uint64_t max64) "nr_ranges %d 32:[0x%"PRIx64" - 0x%"PRIx64"], 64:[0x%"PRIx64" - 0x%"PRIx64"]" vfio_disconnect_container(int fd) "close container->fd=%d" vfio_put_group(int fd) "close group->fd=%d" vfio_get_device(const char * name, unsigned int flags, unsigned int num_regions, unsigned int num_irqs) "Device %s flags: %u, regions: %u, irqs: %u" @@ -117,7 +118,7 @@ vfio_region_mmaps_set_enabled(const char *name, bool enabled) "Region %s mmaps e vfio_region_unmap(const char *name, unsigned long offset, unsigned long end) "Region %s unmap [0x%lx - 0x%lx]" vfio_region_sparse_mmap_header(const char *name, int index, int nr_areas) "Device %s region %d: %d sparse mmap entries" vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned long end) "sparse entry %d [0x%lx - 0x%lx]" -vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%0x8" +vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%08x" vfio_dma_unmap_overflow_workaround(void) "" vfio_get_dirty_bitmap(int fd, uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start) "container fd=%d, iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64 vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64 |