about | summary | refs | log | tree | commit | diff
path: root/hw
diff options
context:
space:
mode:
author Peter Maydell <peter.maydell@linaro.org> 2023-03-09 15:19:44 +0000
committer Peter Maydell <peter.maydell@linaro.org> 2023-03-09 15:19:44 +0000
commit 66a6aa8f9a56a6317e074b1f5e269fecdf4ad782 (patch)
tree f4ebf90dbeecbbdee2bda81f2a38247b7ae153e8 /hw
parent dea644928d7583d91170d013716bbbeb938cb938 (diff)
parent 969dae5448eaa2914be5b974f9e0311b3f95ee2c (diff)
Merge tag 'vfio-updates-20230307.1' of https://gitlab.com/alex.williamson/qemu into staging
VFIO updates for 8.0
* Device level dirty page tracking support for vfio migration, as well as various cleanups and consolidations. (Avihai Horon, Joao Martins)
* Trivial cleanup of migration entry points. (Alex Williamson)
* Fix trace event typo. (Cédric Le Goater)
# -----BEGIN PGP SIGNATURE-----
#
# iQJPBAABCAA5FiEEQvbATlQL0amee4qQI5ubbjuwiyIFAmQHgCUbHGFsZXgud2ls
# bGlhbXNvbkByZWRoYXQuY29tAAoJECObm247sIsi4i0P/RwP3TJ4jDBEW9JNa52O
# 6Hu6tWDccjSZFX7W/pnUztFtIqYBG6Jcms5VLZhaqrSda2BKa3dVoY+iU2finHRn
# q4CNQ4EVbKBG0HvA9SEd7WchAKADBCVpjeUBAF6jVQHBCQECHnfWtA2Y0T5oEGgw
# H1dwuw3YX6Jwyh5RmT/m7wNtOo2ms/CpDAc7d5rfLg0cDQ0vXPCu/CVvqAXbBpVd
# g7NrMLw1wfhKLYN2eWYkiZ+pGwNX5uxsp0jOSA7leFcfkuLX2KzQ99JpCNhX1oRd
# H5bedA62ffFLGQdlM2zyiAi37CgmeElKSlnaJdBX91Y4DQ3HSdbHYWoiYtzl89rB
# 7QxYHG7XOMdYKssN7qz+oVUpI+ycB18wSW2D/h4fJCNkH92cSHMyJ/yEA3r39eX4
# 7rgu0j8cg2iwIiGlh/klguXfatMDJvbrazDHYixKUJD5vlDXQvTe9LVpwUaUhGGM
# Gh4g8wx9gmDE9H1FbQ0kQqut70sO1Hnw2Pj19qzfdwfL6LeYWk+5AfQZmyziYGFM
# CGRKz5RhlN/Ori9gTKfn00stuxdD09Md5fPllKyMq7a1tkQt58RxLSkUN8hygeki
# Uqnlx5KXBLQ/7ZtnQNoe8frn5FhKBBSLC3tA71PyL4kIbcuiHXLvxIOeE9oJpSPi
# Bt8sTr3eCnVF9mys1ZmGmaYY
# =nM9d
# -----END PGP SIGNATURE-----
# gpg: Signature made Tue 07 Mar 2023 18:19:17 GMT
# gpg: using RSA key 42F6C04E540BD1A99E7B8A90239B9B6E3BB08B22
# gpg: issuer "alex.williamson@redhat.com"
# gpg: Good signature from "Alex Williamson <alex.williamson@redhat.com>" [full]
# gpg: aka "Alex Williamson <alex@shazbot.org>" [full]
# gpg: aka "Alex Williamson <alwillia@redhat.com>" [full]
# gpg: aka "Alex Williamson <alex.l.williamson@gmail.com>" [full]
# Primary key fingerprint: 42F6 C04E 540B D1A9 9E7B 8A90 239B 9B6E 3BB0 8B22
* tag 'vfio-updates-20230307.1' of https://gitlab.com/alex.williamson/qemu:
vfio: Fix vfio_get_dev_region() trace event
vfio/migration: Rename entry points
docs/devel: Document VFIO device dirty page tracking
vfio/migration: Query device dirty page tracking support
vfio/migration: Block migration with vIOMMU
vfio/common: Add device dirty page bitmap sync
vfio/common: Extract code from vfio_get_dirty_bitmap() to new function
vfio/common: Add device dirty page tracking start/stop
vfio/common: Record DMA mapped IOVA ranges
vfio/common: Add helper to consolidate iova/end calculation
vfio/common: Consolidate skip/invalid section into helper
vfio/common: Use a single tracepoint for skipped sections
vfio/common: Add helper to validate iova/end against hostwin
vfio/common: Add VFIOBitmap and alloc function
vfio/common: Abort migration if dirty log start/stop/sync fails
vfio/common: Fix wrong %m usages
vfio/common: Fix error reporting in vfio_get_dirty_bitmap()
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Diffstat (limited to 'hw')
-rw-r--r-- hw/vfio/common.c 699
-rw-r--r-- hw/vfio/migration.c 28
-rw-r--r-- hw/vfio/pci.c 5
-rw-r--r-- hw/vfio/trace-events 7
4 files changed, 600 insertions, 139 deletions
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index bab83c0e55..4d01ea3515 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -42,6 +42,7 @@
#include "migration/migration.h"
#include "migration/misc.h"
#include "migration/blocker.h"
+#include "migration/qemu-file.h"
#include "sysemu/tpm.h"
VFIOGroupList vfio_group_list =
@@ -319,6 +320,28 @@ const MemoryRegionOps vfio_region_ops = {
* Device state interfaces
*/
+typedef struct {
+ unsigned long *bitmap;
+ hwaddr size;
+ hwaddr pages;
+} VFIOBitmap;
+
+static int vfio_bitmap_alloc(VFIOBitmap *vbmap, hwaddr size)
+{
+ vbmap->pages = REAL_HOST_PAGE_ALIGN(size) / qemu_real_host_page_size();
+ vbmap->size = ROUND_UP(vbmap->pages, sizeof(__u64) * BITS_PER_BYTE) /
+ BITS_PER_BYTE;
+ vbmap->bitmap = g_try_malloc0(vbmap->size);
+ if (!vbmap->bitmap) {
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
+ uint64_t size, ram_addr_t ram_addr);
+
bool vfio_mig_active(void)
{
VFIOGroup *group;
@@ -339,6 +362,7 @@ bool vfio_mig_active(void)
}
static Error *multiple_devices_migration_blocker;
+static Error *giommu_migration_blocker;
static unsigned int vfio_migratable_device_num(void)
{
@@ -390,6 +414,64 @@ void vfio_unblock_multiple_devices_migration(void)
multiple_devices_migration_blocker = NULL;
}
+static bool vfio_viommu_preset(void)
+{
+ VFIOAddressSpace *space;
+
+ QLIST_FOREACH(space, &vfio_address_spaces, list) {
+ if (space->as != &address_space_memory) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+int vfio_block_giommu_migration(Error **errp)
+{
+ int ret;
+
+ if (giommu_migration_blocker ||
+ !vfio_viommu_preset()) {
+ return 0;
+ }
+
+ error_setg(&giommu_migration_blocker,
+ "Migration is currently not supported with vIOMMU enabled");
+ ret = migrate_add_blocker(giommu_migration_blocker, errp);
+ if (ret < 0) {
+ error_free(giommu_migration_blocker);
+ giommu_migration_blocker = NULL;
+ }
+
+ return ret;
+}
+
+void vfio_migration_finalize(void)
+{
+ if (!giommu_migration_blocker ||
+ vfio_viommu_preset()) {
+ return;
+ }
+
+ migrate_del_blocker(giommu_migration_blocker);
+ error_free(giommu_migration_blocker);
+ giommu_migration_blocker = NULL;
+}
+
+static void vfio_set_migration_error(int err)
+{
+ MigrationState *ms = migrate_get_current();
+
+ if (migration_is_setup_or_active(ms->state)) {
+ WITH_QEMU_LOCK_GUARD(&ms->qemu_file_lock) {
+ if (ms->to_dst_file) {
+ qemu_file_set_error(ms->to_dst_file, err);
+ }
+ }
+ }
+}
+
static bool vfio_devices_all_dirty_tracking(VFIOContainer *container)
{
VFIOGroup *group;
@@ -417,6 +499,22 @@ static bool vfio_devices_all_dirty_tracking(VFIOContainer *container)
return true;
}
+static bool vfio_devices_all_device_dirty_tracking(VFIOContainer *container)
+{
+ VFIOGroup *group;
+ VFIODevice *vbasedev;
+
+ QLIST_FOREACH(group, &container->group_list, container_next) {
+ QLIST_FOREACH(vbasedev, &group->device_list, next) {
+ if (!vbasedev->dirty_pages_supported) {
+ return false;
+ }
+ }
+ }
+
+ return true;
+}
+
/*
* Check if all VFIO devices are running and migration is active, which is
* essentially equivalent to the migration being in pre-copy phase.
@@ -454,9 +552,14 @@ static int vfio_dma_unmap_bitmap(VFIOContainer *container,
{
struct vfio_iommu_type1_dma_unmap *unmap;
struct vfio_bitmap *bitmap;
- uint64_t pages = REAL_HOST_PAGE_ALIGN(size) / qemu_real_host_page_size();
+ VFIOBitmap vbmap;
int ret;
+ ret = vfio_bitmap_alloc(&vbmap, size);
+ if (ret) {
+ return ret;
+ }
+
unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap));
unmap->argsz = sizeof(*unmap) + sizeof(*bitmap);
@@ -470,35 +573,28 @@ static int vfio_dma_unmap_bitmap(VFIOContainer *container,
* qemu_real_host_page_size to mark those dirty. Hence set bitmap_pgsize
* to qemu_real_host_page_size.
*/
-
bitmap->pgsize = qemu_real_host_page_size();
- bitmap->size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) /
- BITS_PER_BYTE;
+ bitmap->size = vbmap.size;
+ bitmap->data = (__u64 *)vbmap.bitmap;
- if (bitmap->size > container->max_dirty_bitmap_size) {
- error_report("UNMAP: Size of bitmap too big 0x%"PRIx64,
- (uint64_t)bitmap->size);
+ if (vbmap.size > container->max_dirty_bitmap_size) {
+ error_report("UNMAP: Size of bitmap too big 0x%"PRIx64, vbmap.size);
ret = -E2BIG;
goto unmap_exit;
}
- bitmap->data = g_try_malloc0(bitmap->size);
- if (!bitmap->data) {
- ret = -ENOMEM;
- goto unmap_exit;
- }
-
ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap);
if (!ret) {
- cpu_physical_memory_set_dirty_lebitmap((unsigned long *)bitmap->data,
- iotlb->translated_addr, pages);
+ cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap,
+ iotlb->translated_addr, vbmap.pages);
} else {
error_report("VFIO_UNMAP_DMA with DIRTY_BITMAP : %m");
}
- g_free(bitmap->data);
unmap_exit:
g_free(unmap);
+ g_free(vbmap.bitmap);
+
return ret;
}
@@ -515,10 +611,16 @@ static int vfio_dma_unmap(VFIOContainer *container,
.iova = iova,
.size = size,
};
+ bool need_dirty_sync = false;
+ int ret;
- if (iotlb && container->dirty_pages_supported &&
- vfio_devices_all_running_and_mig_active(container)) {
- return vfio_dma_unmap_bitmap(container, iova, size, iotlb);
+ if (iotlb && vfio_devices_all_running_and_mig_active(container)) {
+ if (!vfio_devices_all_device_dirty_tracking(container) &&
+ container->dirty_pages_supported) {
+ return vfio_dma_unmap_bitmap(container, iova, size, iotlb);
+ }
+
+ need_dirty_sync = true;
}
while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
@@ -544,10 +646,12 @@ static int vfio_dma_unmap(VFIOContainer *container,
return -errno;
}
- if (iotlb && vfio_devices_all_running_and_mig_active(container)) {
- cpu_physical_memory_set_dirty_range(iotlb->translated_addr, size,
- tcg_enabled() ? DIRTY_CLIENTS_ALL :
- DIRTY_CLIENTS_NOCODE);
+ if (need_dirty_sync) {
+ ret = vfio_get_dirty_bitmap(container, iova, size,
+ iotlb->translated_addr);
+ if (ret) {
+ return ret;
+ }
}
return 0;
@@ -680,6 +784,7 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
if (iotlb->target_as != &address_space_memory) {
error_report("Wrong target AS \"%s\", only system memory is allowed",
iotlb->target_as->name ? iotlb->target_as->name : "none");
+ vfio_set_migration_error(-EINVAL);
return;
}
@@ -703,17 +808,18 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
read_only);
if (ret) {
error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
- "0x%"HWADDR_PRIx", %p) = %d (%m)",
+ "0x%"HWADDR_PRIx", %p) = %d (%s)",
container, iova,
- iotlb->addr_mask + 1, vaddr, ret);
+ iotlb->addr_mask + 1, vaddr, ret, strerror(-ret));
}
} else {
ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1, iotlb);
if (ret) {
error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
- "0x%"HWADDR_PRIx") = %d (%m)",
+ "0x%"HWADDR_PRIx") = %d (%s)",
container, iova,
- iotlb->addr_mask + 1, ret);
+ iotlb->addr_mask + 1, ret, strerror(-ret));
+ vfio_set_migration_error(ret);
}
}
out:
@@ -868,6 +974,22 @@ static void vfio_unregister_ram_discard_listener(VFIOContainer *container,
g_free(vrdl);
}
+static VFIOHostDMAWindow *vfio_find_hostwin(VFIOContainer *container,
+ hwaddr iova, hwaddr end)
+{
+ VFIOHostDMAWindow *hostwin;
+ bool hostwin_found = false;
+
+ QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
+ if (hostwin->min_iova <= iova && end <= hostwin->max_iova) {
+ hostwin_found = true;
+ break;
+ }
+ }
+
+ return hostwin_found ? hostwin : NULL;
+}
+
static bool vfio_known_safe_misalignment(MemoryRegionSection *section)
{
MemoryRegion *mr = section->mr;
@@ -884,24 +1006,15 @@ static bool vfio_known_safe_misalignment(MemoryRegionSection *section)
return true;
}
-static void vfio_listener_region_add(MemoryListener *listener,
- MemoryRegionSection *section)
+static bool vfio_listener_valid_section(MemoryRegionSection *section,
+ const char *name)
{
- VFIOContainer *container = container_of(listener, VFIOContainer, listener);
- hwaddr iova, end;
- Int128 llend, llsize;
- void *vaddr;
- int ret;
- VFIOHostDMAWindow *hostwin;
- bool hostwin_found;
- Error *err = NULL;
-
if (vfio_listener_skipped_section(section)) {
- trace_vfio_listener_region_add_skip(
+ trace_vfio_listener_region_skip(name,
section->offset_within_address_space,
section->offset_within_address_space +
int128_get64(int128_sub(section->size, int128_one())));
- return;
+ return false;
}
if (unlikely((section->offset_within_address_space &
@@ -916,15 +1029,53 @@ static void vfio_listener_region_add(MemoryListener *listener,
section->offset_within_region,
qemu_real_host_page_size());
}
- return;
+ return false;
}
+ return true;
+}
+
+static bool vfio_get_section_iova_range(VFIOContainer *container,
+ MemoryRegionSection *section,
+ hwaddr *out_iova, hwaddr *out_end,
+ Int128 *out_llend)
+{
+ Int128 llend;
+ hwaddr iova;
+
iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
llend = int128_make64(section->offset_within_address_space);
llend = int128_add(llend, section->size);
llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask()));
if (int128_ge(int128_make64(iova), llend)) {
+ return false;
+ }
+
+ *out_iova = iova;
+ *out_end = int128_get64(int128_sub(llend, int128_one()));
+ if (out_llend) {
+ *out_llend = llend;
+ }
+ return true;
+}
+
+static void vfio_listener_region_add(MemoryListener *listener,
+ MemoryRegionSection *section)
+{
+ VFIOContainer *container = container_of(listener, VFIOContainer, listener);
+ hwaddr iova, end;
+ Int128 llend, llsize;
+ void *vaddr;
+ int ret;
+ VFIOHostDMAWindow *hostwin;
+ Error *err = NULL;
+
+ if (!vfio_listener_valid_section(section, "region_add")) {
+ return;
+ }
+
+ if (!vfio_get_section_iova_range(container, section, &iova, &end, &llend)) {
if (memory_region_is_ram_device(section->mr)) {
trace_vfio_listener_region_add_no_dma_map(
memory_region_name(section->mr),
@@ -934,7 +1085,6 @@ static void vfio_listener_region_add(MemoryListener *listener,
}
return;
}
- end = int128_get64(int128_sub(llend, int128_one()));
if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
hwaddr pgsize = 0;
@@ -994,15 +1144,8 @@ static void vfio_listener_region_add(MemoryListener *listener,
#endif
}
- hostwin_found = false;
- QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
- if (hostwin->min_iova <= iova && end <= hostwin->max_iova) {
- hostwin_found = true;
- break;
- }
- }
-
- if (!hostwin_found) {
+ hostwin = vfio_find_hostwin(container, iova, end);
+ if (!hostwin) {
error_setg(&err, "Container %p can't map guest IOVA region"
" 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container, iova, end);
goto fail;
@@ -1095,8 +1238,9 @@ static void vfio_listener_region_add(MemoryListener *listener,
vaddr, section->readonly);
if (ret) {
error_setg(&err, "vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
- "0x%"HWADDR_PRIx", %p) = %d (%m)",
- container, iova, int128_get64(llsize), vaddr, ret);
+ "0x%"HWADDR_PRIx", %p) = %d (%s)",
+ container, iova, int128_get64(llsize), vaddr, ret,
+ strerror(-ret));
if (memory_region_is_ram_device(section->mr)) {
/* Allow unexpected mappings not to be fatal for RAM devices */
error_report_err(err);
@@ -1140,26 +1284,7 @@ static void vfio_listener_region_del(MemoryListener *listener,
int ret;
bool try_unmap = true;
- if (vfio_listener_skipped_section(section)) {
- trace_vfio_listener_region_del_skip(
- section->offset_within_address_space,
- section->offset_within_address_space +
- int128_get64(int128_sub(section->size, int128_one())));
- return;
- }
-
- if (unlikely((section->offset_within_address_space &
- ~qemu_real_host_page_mask()) !=
- (section->offset_within_region & ~qemu_real_host_page_mask()))) {
- if (!vfio_known_safe_misalignment(section)) {
- error_report("%s received unaligned region %s iova=0x%"PRIx64
- " offset_within_region=0x%"PRIx64
- " qemu_real_host_page_size=0x%"PRIxPTR,
- __func__, memory_region_name(section->mr),
- section->offset_within_address_space,
- section->offset_within_region,
- qemu_real_host_page_size());
- }
+ if (!vfio_listener_valid_section(section, "region_del")) {
return;
}
@@ -1186,15 +1311,9 @@ static void vfio_listener_region_del(MemoryListener *listener,
*/
}
- iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
- llend = int128_make64(section->offset_within_address_space);
- llend = int128_add(llend, section->size);
- llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask()));
-
- if (int128_ge(int128_make64(iova), llend)) {
+ if (!vfio_get_section_iova_range(container, section, &iova, &end, &llend)) {
return;
}
- end = int128_get64(int128_sub(llend, int128_one()));
llsize = int128_sub(llend, int128_make64(iova));
@@ -1203,15 +1322,9 @@ static void vfio_listener_region_del(MemoryListener *listener,
if (memory_region_is_ram_device(section->mr)) {
hwaddr pgmask;
VFIOHostDMAWindow *hostwin;
- bool hostwin_found = false;
- QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
- if (hostwin->min_iova <= iova && end <= hostwin->max_iova) {
- hostwin_found = true;
- break;
- }
- }
- assert(hostwin_found); /* or region_add() would have failed */
+ hostwin = vfio_find_hostwin(container, iova, end);
+ assert(hostwin); /* or region_add() would have failed */
pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;
try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask));
@@ -1228,16 +1341,18 @@ static void vfio_listener_region_del(MemoryListener *listener,
ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL);
if (ret) {
error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
- "0x%"HWADDR_PRIx") = %d (%m)",
- container, iova, int128_get64(llsize), ret);
+ "0x%"HWADDR_PRIx") = %d (%s)",
+ container, iova, int128_get64(llsize), ret,
+ strerror(-ret));
}
iova += int128_get64(llsize);
}
ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL);
if (ret) {
error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
- "0x%"HWADDR_PRIx") = %d (%m)",
- container, iova, int128_get64(llsize), ret);
+ "0x%"HWADDR_PRIx") = %d (%s)",
+ container, iova, int128_get64(llsize), ret,
+ strerror(-ret));
}
}
@@ -1256,7 +1371,7 @@ static void vfio_listener_region_del(MemoryListener *listener,
}
}
-static void vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
+static int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
{
int ret;
struct vfio_iommu_type1_dirty_bitmap dirty = {
@@ -1264,7 +1379,7 @@ static void vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
};
if (!container->dirty_pages_supported) {
- return;
+ return 0;
}
if (start) {
@@ -1275,40 +1390,327 @@ static void vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, &dirty);
if (ret) {
+ ret = -errno;
error_report("Failed to set dirty tracking flag 0x%x errno: %d",
dirty.flags, errno);
}
+
+ return ret;
+}
+
+typedef struct VFIODirtyRanges {
+ hwaddr min32;
+ hwaddr max32;
+ hwaddr min64;
+ hwaddr max64;
+} VFIODirtyRanges;
+
+typedef struct VFIODirtyRangesListener {
+ VFIOContainer *container;
+ VFIODirtyRanges ranges;
+ MemoryListener listener;
+} VFIODirtyRangesListener;
+
+static void vfio_dirty_tracking_update(MemoryListener *listener,
+ MemoryRegionSection *section)
+{
+ VFIODirtyRangesListener *dirty = container_of(listener,
+ VFIODirtyRangesListener,
+ listener);
+ VFIODirtyRanges *range = &dirty->ranges;
+ hwaddr iova, end, *min, *max;
+
+ if (!vfio_listener_valid_section(section, "tracking_update") ||
+ !vfio_get_section_iova_range(dirty->container, section,
+ &iova, &end, NULL)) {
+ return;
+ }
+
+ /*
+ * The address space passed to the dirty tracker is reduced to two ranges:
+ * one for 32-bit DMA ranges, and another one for 64-bit DMA ranges.
+ * The underlying reports of dirty will query a sub-interval of each of
+ * these ranges.
+ *
+ * The purpose of the dual range handling is to handle known cases of big
+ * holes in the address space, like the x86 AMD 1T hole. The alternative
+ * would be an IOVATree but that has a much bigger runtime overhead and
+ * unnecessary complexity.
+ */
+ min = (end <= UINT32_MAX) ? &range->min32 : &range->min64;
+ max = (end <= UINT32_MAX) ? &range->max32 : &range->max64;
+
+ if (*min > iova) {
+ *min = iova;
+ }
+ if (*max < end) {
+ *max = end;
+ }
+
+ trace_vfio_device_dirty_tracking_update(iova, end, *min, *max);
+ return;
+}
+
+static const MemoryListener vfio_dirty_tracking_listener = {
+ .name = "vfio-tracking",
+ .region_add = vfio_dirty_tracking_update,
+};
+
+static void vfio_dirty_tracking_init(VFIOContainer *container,
+ VFIODirtyRanges *ranges)
+{
+ VFIODirtyRangesListener dirty;
+
+ memset(&dirty, 0, sizeof(dirty));
+ dirty.ranges.min32 = UINT32_MAX;
+ dirty.ranges.min64 = UINT64_MAX;
+ dirty.listener = vfio_dirty_tracking_listener;
+ dirty.container = container;
+
+ memory_listener_register(&dirty.listener,
+ container->space->as);
+
+ *ranges = dirty.ranges;
+
+ /*
+ * The memory listener is synchronous, and used to calculate the range
+ * to dirty tracking. Unregister it after we are done as we are not
+ * interested in any follow-up updates.
+ */
+ memory_listener_unregister(&dirty.listener);
+}
+
+static void vfio_devices_dma_logging_stop(VFIOContainer *container)
+{
+ uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature),
+ sizeof(uint64_t))] = {};
+ struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
+ VFIODevice *vbasedev;
+ VFIOGroup *group;
+
+ feature->argsz = sizeof(buf);
+ feature->flags = VFIO_DEVICE_FEATURE_SET |
+ VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP;
+
+ QLIST_FOREACH(group, &container->group_list, container_next) {
+ QLIST_FOREACH(vbasedev, &group->device_list, next) {
+ if (!vbasedev->dirty_tracking) {
+ continue;
+ }
+
+ if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
+ warn_report("%s: Failed to stop DMA logging, err %d (%s)",
+ vbasedev->name, -errno, strerror(errno));
+ }
+ vbasedev->dirty_tracking = false;
+ }
+ }
+}
+
+static struct vfio_device_feature *
+vfio_device_feature_dma_logging_start_create(VFIOContainer *container,
+ VFIODirtyRanges *tracking)
+{
+ struct vfio_device_feature *feature;
+ size_t feature_size;
+ struct vfio_device_feature_dma_logging_control *control;
+ struct vfio_device_feature_dma_logging_range *ranges;
+
+ feature_size = sizeof(struct vfio_device_feature) +
+ sizeof(struct vfio_device_feature_dma_logging_control);
+ feature = g_try_malloc0(feature_size);
+ if (!feature) {
+ errno = ENOMEM;
+ return NULL;
+ }
+ feature->argsz = feature_size;
+ feature->flags = VFIO_DEVICE_FEATURE_SET |
+ VFIO_DEVICE_FEATURE_DMA_LOGGING_START;
+
+ control = (struct vfio_device_feature_dma_logging_control *)feature->data;
+ control->page_size = qemu_real_host_page_size();
+
+ /*
+ * DMA logging uAPI guarantees to support at least a number of ranges that
+ * fits into a single host kernel base page.
+ */
+ control->num_ranges = !!tracking->max32 + !!tracking->max64;
+ ranges = g_try_new0(struct vfio_device_feature_dma_logging_range,
+ control->num_ranges);
+ if (!ranges) {
+ g_free(feature);
+ errno = ENOMEM;
+
+ return NULL;
+ }
+
+ control->ranges = (__u64)(uintptr_t)ranges;
+ if (tracking->max32) {
+ ranges->iova = tracking->min32;
+ ranges->length = (tracking->max32 - tracking->min32) + 1;
+ ranges++;
+ }
+ if (tracking->max64) {
+ ranges->iova = tracking->min64;
+ ranges->length = (tracking->max64 - tracking->min64) + 1;
+ }
+
+ trace_vfio_device_dirty_tracking_start(control->num_ranges,
+ tracking->min32, tracking->max32,
+ tracking->min64, tracking->max64);
+
+ return feature;
+}
+
+static void vfio_device_feature_dma_logging_start_destroy(
+ struct vfio_device_feature *feature)
+{
+ struct vfio_device_feature_dma_logging_control *control =
+ (struct vfio_device_feature_dma_logging_control *)feature->data;
+ struct vfio_device_feature_dma_logging_range *ranges =
+ (struct vfio_device_feature_dma_logging_range *)(uintptr_t)control->ranges;
+
+ g_free(ranges);
+ g_free(feature);
+}
+
+static int vfio_devices_dma_logging_start(VFIOContainer *container)
+{
+ struct vfio_device_feature *feature;
+ VFIODirtyRanges ranges;
+ VFIODevice *vbasedev;
+ VFIOGroup *group;
+ int ret = 0;
+
+ vfio_dirty_tracking_init(container, &ranges);
+ feature = vfio_device_feature_dma_logging_start_create(container,
+ &ranges);
+ if (!feature) {
+ return -errno;
+ }
+
+ QLIST_FOREACH(group, &container->group_list, container_next) {
+ QLIST_FOREACH(vbasedev, &group->device_list, next) {
+ if (vbasedev->dirty_tracking) {
+ continue;
+ }
+
+ ret = ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature);
+ if (ret) {
+ ret = -errno;
+ error_report("%s: Failed to start DMA logging, err %d (%s)",
+ vbasedev->name, ret, strerror(errno));
+ goto out;
+ }
+ vbasedev->dirty_tracking = true;
+ }
+ }
+
+out:
+ if (ret) {
+ vfio_devices_dma_logging_stop(container);
+ }
+
+ vfio_device_feature_dma_logging_start_destroy(feature);
+
+ return ret;
}
static void vfio_listener_log_global_start(MemoryListener *listener)
{
VFIOContainer *container = container_of(listener, VFIOContainer, listener);
+ int ret;
+
+ if (vfio_devices_all_device_dirty_tracking(container)) {
+ ret = vfio_devices_dma_logging_start(container);
+ } else {
+ ret = vfio_set_dirty_page_tracking(container, true);
+ }
- vfio_set_dirty_page_tracking(container, true);
+ if (ret) {
+ error_report("vfio: Could not start dirty page tracking, err: %d (%s)",
+ ret, strerror(-ret));
+ vfio_set_migration_error(ret);
+ }
}
static void vfio_listener_log_global_stop(MemoryListener *listener)
{
VFIOContainer *container = container_of(listener, VFIOContainer, listener);
+ int ret = 0;
+
+ if (vfio_devices_all_device_dirty_tracking(container)) {
+ vfio_devices_dma_logging_stop(container);
+ } else {
+ ret = vfio_set_dirty_page_tracking(container, false);
+ }
- vfio_set_dirty_page_tracking(container, false);
+ if (ret) {
+ error_report("vfio: Could not stop dirty page tracking, err: %d (%s)",
+ ret, strerror(-ret));
+ vfio_set_migration_error(ret);
+ }
}
-static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
- uint64_t size, ram_addr_t ram_addr)
+static int vfio_device_dma_logging_report(VFIODevice *vbasedev, hwaddr iova,
+ hwaddr size, void *bitmap)
{
- struct vfio_iommu_type1_dirty_bitmap *dbitmap;
- struct vfio_iommu_type1_dirty_bitmap_get *range;
- uint64_t pages;
+ uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
+ sizeof(struct vfio_device_feature_dma_logging_report),
+ sizeof(__u64))] = {};
+ struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
+ struct vfio_device_feature_dma_logging_report *report =
+ (struct vfio_device_feature_dma_logging_report *)feature->data;
+
+ report->iova = iova;
+ report->length = size;
+ report->page_size = qemu_real_host_page_size();
+ report->bitmap = (__u64)(uintptr_t)bitmap;
+
+ feature->argsz = sizeof(buf);
+ feature->flags = VFIO_DEVICE_FEATURE_GET |
+ VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT;
+
+ if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
+ return -errno;
+ }
+
+ return 0;
+}
+
+static int vfio_devices_query_dirty_bitmap(VFIOContainer *container,
+ VFIOBitmap *vbmap, hwaddr iova,
+ hwaddr size)
+{
+ VFIODevice *vbasedev;
+ VFIOGroup *group;
int ret;
- if (!container->dirty_pages_supported) {
- cpu_physical_memory_set_dirty_range(ram_addr, size,
- tcg_enabled() ? DIRTY_CLIENTS_ALL :
- DIRTY_CLIENTS_NOCODE);
- return 0;
+ QLIST_FOREACH(group, &container->group_list, container_next) {
+ QLIST_FOREACH(vbasedev, &group->device_list, next) {
+ ret = vfio_device_dma_logging_report(vbasedev, iova, size,
+ vbmap->bitmap);
+ if (ret) {
+ error_report("%s: Failed to get DMA logging report, iova: "
+ "0x%" HWADDR_PRIx ", size: 0x%" HWADDR_PRIx
+ ", err: %d (%s)",
+ vbasedev->name, iova, size, ret, strerror(-ret));
+
+ return ret;
+ }
+ }
}
+ return 0;
+}
+
+static int vfio_query_dirty_bitmap(VFIOContainer *container, VFIOBitmap *vbmap,
+ hwaddr iova, hwaddr size)
+{
+ struct vfio_iommu_type1_dirty_bitmap *dbitmap;
+ struct vfio_iommu_type1_dirty_bitmap_get *range;
+ int ret;
+
dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range));
dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range);
@@ -1323,36 +1725,63 @@ static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
* to qemu_real_host_page_size.
*/
range->bitmap.pgsize = qemu_real_host_page_size();
-
- pages = REAL_HOST_PAGE_ALIGN(range->size) / qemu_real_host_page_size();
- range->bitmap.size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) /
- BITS_PER_BYTE;
- range->bitmap.data = g_try_malloc0(range->bitmap.size);
- if (!range->bitmap.data) {
- ret = -ENOMEM;
- goto err_out;
- }
+ range->bitmap.size = vbmap->size;
+ range->bitmap.data = (__u64 *)vbmap->bitmap;
ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap);
if (ret) {
+ ret = -errno;
error_report("Failed to get dirty bitmap for iova: 0x%"PRIx64
" size: 0x%"PRIx64" err: %d", (uint64_t)range->iova,
(uint64_t)range->size, errno);
- goto err_out;
}
- cpu_physical_memory_set_dirty_lebitmap((unsigned long *)range->bitmap.data,
- ram_addr, pages);
-
- trace_vfio_get_dirty_bitmap(container->fd, range->iova, range->size,
- range->bitmap.size, ram_addr);
-err_out:
- g_free(range->bitmap.data);
g_free(dbitmap);
return ret;
}
+static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
+ uint64_t size, ram_addr_t ram_addr)
+{
+ bool all_device_dirty_tracking =
+ vfio_devices_all_device_dirty_tracking(container);
+ VFIOBitmap vbmap;
+ int ret;
+
+ if (!container->dirty_pages_supported && !all_device_dirty_tracking) {
+ cpu_physical_memory_set_dirty_range(ram_addr, size,
+ tcg_enabled() ? DIRTY_CLIENTS_ALL :
+ DIRTY_CLIENTS_NOCODE);
+ return 0;
+ }
+
+ ret = vfio_bitmap_alloc(&vbmap, size);
+ if (ret) {
+ return ret;
+ }
+
+ if (all_device_dirty_tracking) {
+ ret = vfio_devices_query_dirty_bitmap(container, &vbmap, iova, size);
+ } else {
+ ret = vfio_query_dirty_bitmap(container, &vbmap, iova, size);
+ }
+
+ if (ret) {
+ goto out;
+ }
+
+ cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap, ram_addr,
+ vbmap.pages);
+
+ trace_vfio_get_dirty_bitmap(container->fd, iova, size, vbmap.size,
+ ram_addr);
+out:
+ g_free(vbmap.bitmap);
+
+ return ret;
+}
+
typedef struct {
IOMMUNotifier n;
VFIOGuestIOMMU *giommu;
@@ -1366,29 +1795,33 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
VFIOContainer *container = giommu->container;
hwaddr iova = iotlb->iova + giommu->iommu_offset;
ram_addr_t translated_addr;
+ int ret = -EINVAL;
trace_vfio_iommu_map_dirty_notify(iova, iova + iotlb->addr_mask);
if (iotlb->target_as != &address_space_memory) {
error_report("Wrong target AS \"%s\", only system memory is allowed",
iotlb->target_as->name ? iotlb->target_as->name : "none");
- return;
+ goto out;
}
rcu_read_lock();
if (vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL)) {
- int ret;
-
ret = vfio_get_dirty_bitmap(container, iova, iotlb->addr_mask + 1,
translated_addr);
if (ret) {
error_report("vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", "
- "0x%"HWADDR_PRIx") = %d (%m)",
- container, iova,
- iotlb->addr_mask + 1, ret);
+ "0x%"HWADDR_PRIx") = %d (%s)",
+ container, iova, iotlb->addr_mask + 1, ret,
+ strerror(-ret));
}
}
rcu_read_unlock();
+
+out:
+ if (ret) {
+ vfio_set_migration_error(ret);
+ }
}
static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section,
@@ -1481,13 +1914,19 @@ static void vfio_listener_log_sync(MemoryListener *listener,
MemoryRegionSection *section)
{
VFIOContainer *container = container_of(listener, VFIOContainer, listener);
+ int ret;
if (vfio_listener_skipped_section(section)) {
return;
}
if (vfio_devices_all_dirty_tracking(container)) {
- vfio_sync_dirty_bitmap(container, section);
+ ret = vfio_sync_dirty_bitmap(container, section);
+ if (ret) {
+ error_report("vfio: Failed to sync dirty bitmap, err: %d (%s)", ret,
+ strerror(-ret));
+ vfio_set_migration_error(ret);
+ }
}
}
diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index a2c3d9bade..1a1a8659c8 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -521,7 +521,7 @@ static void vfio_migration_state_notifier(Notifier *notifier, void *data)
}
}
-static void vfio_migration_exit(VFIODevice *vbasedev)
+static void vfio_migration_free(VFIODevice *vbasedev)
{
g_free(vbasedev->migration);
vbasedev->migration = NULL;
@@ -555,6 +555,19 @@ static int vfio_migration_query_flags(VFIODevice *vbasedev, uint64_t *mig_flags)
return 0;
}
+static bool vfio_dma_logging_supported(VFIODevice *vbasedev)
+{
+ uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature),
+ sizeof(uint64_t))] = {};
+ struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
+
+ feature->argsz = sizeof(buf);
+ feature->flags = VFIO_DEVICE_FEATURE_PROBE |
+ VFIO_DEVICE_FEATURE_DMA_LOGGING_START;
+
+ return !ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature);
+}
+
static int vfio_migration_init(VFIODevice *vbasedev)
{
int ret;
@@ -589,6 +602,8 @@ static int vfio_migration_init(VFIODevice *vbasedev)
migration->device_state = VFIO_DEVICE_STATE_RUNNING;
migration->data_fd = -1;
+ vbasedev->dirty_pages_supported = vfio_dma_logging_supported(vbasedev);
+
oid = vmstate_if_get_id(VMSTATE_IF(DEVICE(obj)));
if (oid) {
path = g_strdup_printf("%s/vfio", oid);
@@ -616,7 +631,7 @@ int64_t vfio_mig_bytes_transferred(void)
return bytes_transferred;
}
-int vfio_migration_probe(VFIODevice *vbasedev, Error **errp)
+int vfio_migration_realize(VFIODevice *vbasedev, Error **errp)
{
int ret = -ENOTSUP;
@@ -634,6 +649,11 @@ int vfio_migration_probe(VFIODevice *vbasedev, Error **errp)
return ret;
}
+ ret = vfio_block_giommu_migration(errp);
+ if (ret) {
+ return ret;
+ }
+
trace_vfio_migration_probe(vbasedev->name);
return 0;
@@ -649,7 +669,7 @@ add_blocker:
return ret;
}
-void vfio_migration_finalize(VFIODevice *vbasedev)
+void vfio_migration_exit(VFIODevice *vbasedev)
{
if (vbasedev->migration) {
VFIOMigration *migration = vbasedev->migration;
@@ -657,7 +677,7 @@ void vfio_migration_finalize(VFIODevice *vbasedev)
remove_migration_state_change_notifier(&migration->migration_state);
qemu_del_vm_change_state_handler(migration->vm_state);
unregister_savevm(VMSTATE_IF(vbasedev->dev), "vfio", vbasedev);
- vfio_migration_exit(vbasedev);
+ vfio_migration_free(vbasedev);
vfio_unblock_multiple_devices_migration();
}
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 939dcc3d4a..ec9a854361 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -3145,7 +3145,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
}
if (!pdev->failover_pair_id) {
- ret = vfio_migration_probe(vbasedev, errp);
+ ret = vfio_migration_realize(vbasedev, errp);
if (ret) {
error_report("%s: Migration disabled", vbasedev->name);
}
@@ -3185,6 +3185,7 @@ static void vfio_instance_finalize(Object *obj)
*/
vfio_put_device(vdev);
vfio_put_group(group);
+ vfio_migration_finalize();
}
static void vfio_exitfn(PCIDevice *pdev)
@@ -3203,7 +3204,7 @@ static void vfio_exitfn(PCIDevice *pdev)
}
vfio_teardown_msi(vdev);
vfio_bars_exit(vdev);
- vfio_migration_finalize(&vdev->vbasedev);
+ vfio_migration_exit(&vdev->vbasedev);
}
static void vfio_pci_reset(DeviceState *dev)
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index 669d9fe07c..646e42fd27 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -96,14 +96,15 @@ vfio_pci_igd_lpc_bridge_enabled(const char *name) "%s"
vfio_region_write(const char *name, int index, uint64_t addr, uint64_t data, unsigned size) " (%s:region%d+0x%"PRIx64", 0x%"PRIx64 ", %d)"
vfio_region_read(char *name, int index, uint64_t addr, unsigned size, uint64_t data) " (%s:region%d+0x%"PRIx64", %d) = 0x%"PRIx64
vfio_iommu_map_notify(const char *op, uint64_t iova_start, uint64_t iova_end) "iommu %s @ 0x%"PRIx64" - 0x%"PRIx64
-vfio_listener_region_add_skip(uint64_t start, uint64_t end) "SKIPPING region_add 0x%"PRIx64" - 0x%"PRIx64
+vfio_listener_region_skip(const char *name, uint64_t start, uint64_t end) "SKIPPING %s 0x%"PRIx64" - 0x%"PRIx64
vfio_spapr_group_attach(int groupfd, int tablefd) "Attached groupfd %d to liobn fd %d"
vfio_listener_region_add_iommu(uint64_t start, uint64_t end) "region_add [iommu] 0x%"PRIx64" - 0x%"PRIx64
vfio_listener_region_add_ram(uint64_t iova_start, uint64_t iova_end, void *vaddr) "region_add [ram] 0x%"PRIx64" - 0x%"PRIx64" [%p]"
vfio_known_safe_misalignment(const char *name, uint64_t iova, uint64_t offset_within_region, uintptr_t page_size) "Region \"%s\" iova=0x%"PRIx64" offset_within_region=0x%"PRIx64" qemu_real_host_page_size=0x%"PRIxPTR
vfio_listener_region_add_no_dma_map(const char *name, uint64_t iova, uint64_t size, uint64_t page_size) "Region \"%s\" 0x%"PRIx64" size=0x%"PRIx64" is not aligned to 0x%"PRIx64" and cannot be mapped for DMA"
-vfio_listener_region_del_skip(uint64_t start, uint64_t end) "SKIPPING region_del 0x%"PRIx64" - 0x%"PRIx64
vfio_listener_region_del(uint64_t start, uint64_t end) "region_del 0x%"PRIx64" - 0x%"PRIx64
+vfio_device_dirty_tracking_update(uint64_t start, uint64_t end, uint64_t min, uint64_t max) "section 0x%"PRIx64" - 0x%"PRIx64" -> update [0x%"PRIx64" - 0x%"PRIx64"]"
+vfio_device_dirty_tracking_start(int nr_ranges, uint64_t min32, uint64_t max32, uint64_t min64, uint64_t max64) "nr_ranges %d 32:[0x%"PRIx64" - 0x%"PRIx64"], 64:[0x%"PRIx64" - 0x%"PRIx64"]"
vfio_disconnect_container(int fd) "close container->fd=%d"
vfio_put_group(int fd) "close group->fd=%d"
vfio_get_device(const char * name, unsigned int flags, unsigned int num_regions, unsigned int num_irqs) "Device %s flags: %u, regions: %u, irqs: %u"
@@ -117,7 +118,7 @@ vfio_region_mmaps_set_enabled(const char *name, bool enabled) "Region %s mmaps e
vfio_region_unmap(const char *name, unsigned long offset, unsigned long end) "Region %s unmap [0x%lx - 0x%lx]"
vfio_region_sparse_mmap_header(const char *name, int index, int nr_areas) "Device %s region %d: %d sparse mmap entries"
vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned long end) "sparse entry %d [0x%lx - 0x%lx]"
-vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%0x8"
+vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%08x"
vfio_dma_unmap_overflow_workaround(void) ""
vfio_get_dirty_bitmap(int fd, uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start) "container fd=%d, iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64
vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64