diff options
-rw-r--r-- | block.c | 80 | ||||
-rw-r--r-- | block/io.c | 68 | ||||
-rw-r--r-- | include/block/block_int.h | 50 |
3 files changed, 195 insertions, 3 deletions
@@ -49,6 +49,8 @@
 #include "qemu/timer.h"
 #include "qemu/cutils.h"
 #include "qemu/id.h"
+#include "qemu/range.h"
+#include "qemu/rcu.h"
 #include "block/coroutines.h"
 
 #ifdef CONFIG_BSD
@@ -401,6 +403,9 @@ BlockDriverState *bdrv_new(void)
 
     qemu_co_queue_init(&bs->flush_queue);
 
+    qemu_co_mutex_init(&bs->bsc_modify_lock);
+    bs->block_status_cache = g_new0(BdrvBlockStatusCache, 1);
+
     for (i = 0; i < bdrv_drain_all_count; i++) {
         bdrv_drained_begin(bs);
     }
@@ -4694,6 +4699,8 @@ static void bdrv_close(BlockDriverState *bs)
     bs->explicit_options = NULL;
     qobject_unref(bs->full_open_options);
     bs->full_open_options = NULL;
+    g_free(bs->block_status_cache);
+    bs->block_status_cache = NULL;
 
     bdrv_release_named_dirty_bitmaps(bs);
     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
@@ -7684,3 +7691,76 @@ BlockDriverState *bdrv_backing_chain_next(BlockDriverState *bs)
 {
     return bdrv_skip_filters(bdrv_cow_bs(bdrv_skip_filters(bs)));
 }
+
+/**
+ * Check whether [offset, offset + bytes) overlaps with the cached
+ * block-status data region.
+ *
+ * If so, and @pnum is not NULL, set *pnum to `bsc.data_end - offset`,
+ * which is what bdrv_bsc_is_data()'s interface needs.
+ * Otherwise, *pnum is not touched.
+ */
+static bool bdrv_bsc_range_overlaps_locked(BlockDriverState *bs,
+                                           int64_t offset, int64_t bytes,
+                                           int64_t *pnum)
+{
+    BdrvBlockStatusCache *bsc = qatomic_rcu_read(&bs->block_status_cache);
+    bool overlaps;
+
+    overlaps =
+        qatomic_read(&bsc->valid) &&
+        ranges_overlap(offset, bytes, bsc->data_start,
+                       bsc->data_end - bsc->data_start);
+
+    if (overlaps && pnum) {
+        *pnum = bsc->data_end - offset;
+    }
+
+    return overlaps;
+}
+
+/**
+ * See block_int.h for this function's documentation.
+ */
+bool bdrv_bsc_is_data(BlockDriverState *bs, int64_t offset, int64_t *pnum)
+{
+    RCU_READ_LOCK_GUARD();
+
+    return bdrv_bsc_range_overlaps_locked(bs, offset, 1, pnum);
+}
+
+/**
+ * See block_int.h for this function's documentation.
+ */
+void bdrv_bsc_invalidate_range(BlockDriverState *bs,
+                               int64_t offset, int64_t bytes)
+{
+    RCU_READ_LOCK_GUARD();
+
+    if (bdrv_bsc_range_overlaps_locked(bs, offset, bytes, NULL)) {
+        qatomic_set(&bs->block_status_cache->valid, false);
+    }
+}
+
+/**
+ * See block_int.h for this function's documentation.
+ */
+void bdrv_bsc_fill(BlockDriverState *bs, int64_t offset, int64_t bytes)
+{
+    BdrvBlockStatusCache *new_bsc = g_new(BdrvBlockStatusCache, 1);
+    BdrvBlockStatusCache *old_bsc;
+
+    *new_bsc = (BdrvBlockStatusCache) {
+        .valid = true,
+        .data_start = offset,
+        .data_end = offset + bytes,
+    };
+
+    QEMU_LOCK_GUARD(&bs->bsc_modify_lock);
+
+    old_bsc = qatomic_rcu_read(&bs->block_status_cache);
+    qatomic_rcu_set(&bs->block_status_cache, new_bsc);
+    if (old_bsc) {
+        g_free_rcu(old_bsc, rcu);
+    }
+}
diff --git a/block/io.c b/block/io.c
index a19942718b..99ee182ca4 100644
--- a/block/io.c
+++ b/block/io.c
@@ -1883,6 +1883,9 @@ static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
         return -ENOTSUP;
     }
 
+    /* Invalidate the cached block-status data range if this write overlaps */
+    bdrv_bsc_invalidate_range(bs, offset, bytes);
+
     assert(alignment % bs->bl.request_alignment == 0);
     head = offset % alignment;
     tail = (offset + bytes) % alignment;
@@ -2447,9 +2450,65 @@ static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs,
     aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset;
 
     if (bs->drv->bdrv_co_block_status) {
-        ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset,
-                                            aligned_bytes, pnum, &local_map,
-                                            &local_file);
+        /*
+         * Use the block-status cache only for protocol nodes: Format
+         * drivers are generally quick to inquire the status, but protocol
+         * drivers often need to get information from outside of qemu, so
+         * we do not have control over the actual implementation. There
+         * have been cases where inquiring the status took an unreasonably
+         * long time, and we can do nothing in qemu to fix it.
+         * This is especially problematic for images with large data areas,
+         * because finding the few holes in them and giving them special
+         * treatment does not gain much performance. Therefore, we try to
+         * cache the last-identified data region.
+         *
+         * Second, limiting ourselves to protocol nodes allows us to assume
+         * the block status for data regions to be DATA | OFFSET_VALID, and
+         * that the host offset is the same as the guest offset.
+         *
+         * Note that it is possible that external writers zero parts of
+         * the cached regions without the cache being invalidated, and so
+         * we may report zeroes as data. This is not catastrophic,
+         * however, because reporting zeroes as data is fine.
+         */
+        if (QLIST_EMPTY(&bs->children) &&
+            bdrv_bsc_is_data(bs, aligned_offset, pnum))
+        {
+            ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
+            local_file = bs;
+            local_map = aligned_offset;
+        } else {
+            ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset,
+                                                aligned_bytes, pnum, &local_map,
+                                                &local_file);
+
+            /*
+             * Fill the cache only if want_zero is set; otherwise the driver may
+             * have reported zeroed areas as data, which must not be cached.
+             * QLIST_EMPTY(&bs->children) is already checked when the cache is
+             * queried above, so filling it for non-protocol nodes would merely
+             * be useless. However, filling the cache requires an RCU update,
+             * so double check here to avoid such an update if possible.
+             */
+            if (want_zero && QLIST_EMPTY(&bs->children) &&
+                ret == (BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID))
+            {
+                /*
+                 * When a protocol driver reports BLOCK_OFFSET_VALID, the
+                 * returned local_map value must be the same as the offset we
+                 * have passed (aligned_offset), and local_file must be the node
+                 * itself.
+                 * Assert this, because we follow this rule when reading from
+                 * the cache (see the `local_file = bs` and
+                 * `local_map = aligned_offset` assignments above), and the
+                 * result the cache delivers must be the same as the driver
+                 * would deliver.
+                 */
+                assert(local_file == bs);
+                assert(local_map == aligned_offset);
+                bdrv_bsc_fill(bs, aligned_offset, *pnum);
+            }
+        }
     } else {
         /* Default code for filters */
 
@@ -3002,6 +3061,9 @@ int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset,
         return 0;
     }
 
+    /* Invalidate the cached block-status data range if this discard overlaps */
+    bdrv_bsc_invalidate_range(bs, offset, bytes);
+
     /* Discard is advisory, but some devices track and coalesce
      * unaligned requests, so we must pass everything down rather than
      * round here. Still, most devices will just silently ignore
diff --git a/include/block/block_int.h b/include/block/block_int.h
index 12e5750fe8..437d746733 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -34,6 +34,7 @@
 #include "qemu/hbitmap.h"
 #include "block/snapshot.h"
 #include "qemu/throttle.h"
+#include "qemu/rcu.h"
 
 #define BLOCK_FLAG_LAZY_REFCOUNTS 8
 
@@ -839,6 +840,24 @@ struct BdrvChild {
     QLIST_ENTRY(BdrvChild) next_parent;
 };
 
+/*
+ * Allows bdrv_co_block_status() to cache one data region for a
+ * protocol node.
+ *
+ * @valid: Whether the cache is valid (should be accessed with atomic
+ *         functions so this can be reset by RCU readers)
+ * @data_start: Offset where the known (or strongly assumed) data begins
+ * @data_end: Offset where the data region ends (which is not necessarily
+ *            the start of a zeroed region)
+ */
+typedef struct BdrvBlockStatusCache {
+    struct rcu_head rcu;
+
+    bool valid;
+    int64_t data_start;
+    int64_t data_end;
+} BdrvBlockStatusCache;
+
 struct BlockDriverState {
     /* Protected by big QEMU lock or read-only after opening. No special
      * locking needed during I/O...
@@ -1004,6 +1023,11 @@ struct BlockDriverState {
 
     /* BdrvChild links to this node may never be frozen */
     bool never_freeze;
+
+    /* Lock for block-status cache RCU writers */
+    CoMutex bsc_modify_lock;
+    /* Always non-NULL, but must only be dereferenced under an RCU read guard */
+    BdrvBlockStatusCache *block_status_cache;
 };
 
 struct BlockBackendRootState {
@@ -1429,4 +1453,30 @@ static inline BlockDriverState *bdrv_primary_bs(BlockDriverState *bs)
  */
 void bdrv_drain_all_end_quiesce(BlockDriverState *bs);
 
+/**
+ * Check whether the given offset is in the cached block-status data
+ * region.
+ *
+ * If it is, and @pnum is not NULL, *pnum is set to
+ * `bsc.data_end - offset`, i.e. how many bytes, starting from
+ * @offset, are data (according to the cache).
+ * Otherwise, *pnum is not touched.
+ */
+bool bdrv_bsc_is_data(BlockDriverState *bs, int64_t offset, int64_t *pnum);
+
+/**
+ * If [offset, offset + bytes) overlaps with the currently cached
+ * block-status region, invalidate the cache.
+ *
+ * (To be used by I/O paths that cause data regions to be zero or
+ * holes.)
+ */
+void bdrv_bsc_invalidate_range(BlockDriverState *bs,
+                               int64_t offset, int64_t bytes);
+
+/**
+ * Mark the range [offset, offset + bytes) as a data region.
+ */
+void bdrv_bsc_fill(BlockDriverState *bs, int64_t offset, int64_t bytes);
+
 #endif /* BLOCK_INT_H */