-rw-r--r--  block.c                                  |  10
-rw-r--r--  block/accounting.c                       |  78
-rw-r--r--  block/block-backend.c                    |   6
-rw-r--r--  block/dirty-bitmap.c                     | 112
-rw-r--r--  block/io.c                               |  51
-rw-r--r--  block/mirror.c                           |  14
-rw-r--r--  block/nfs.c                              |   4
-rw-r--r--  block/qapi.c                             |   2
-rw-r--r--  block/sheepdog.c                         |   3
-rw-r--r--  block/throttle-groups.c                  |  91
-rw-r--r--  blockdev.c                               |  48
-rw-r--r--  include/block/accounting.h               |  11
-rw-r--r--  include/block/block.h                    |   5
-rw-r--r--  include/block/block_int.h                |  61
-rw-r--r--  include/block/dirty-bitmap.h             |  25
-rw-r--r--  include/qemu/stats64.h                   | 193
-rw-r--r--  include/sysemu/block-backend.h           |  10
-rw-r--r--  migration/block.c                        |  17
-rw-r--r--  tests/docker/Makefile.include            |   2
-rw-r--r--  tests/docker/dockerfiles/centos6.docker  |   2
-rw-r--r--  tests/docker/dockerfiles/fedora.docker   |   4
-rw-r--r--  util/Makefile.objs                       |   1
-rw-r--r--  util/stats64.c                           | 137
23 files changed, 698 insertions, 189 deletions
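
The patch body follows. Its recurring pattern is to turn plain integer fields of BlockDriverState (copy_on_read, quiesce_counter, io_plugged, write_gen, the in-flight counters) into atomically accessed counters, and to replace the open-coded wr_highest_offset maximum with a Stat64 updated by a compare-and-swap loop. The following standalone C11 sketch illustrates those two patterns with made-up names; it is an illustration, not QEMU code, which uses its own atomic_fetch_inc()/stat64_max() wrappers:

/*
 * Minimal standalone C11 sketch of the two lock-free patterns the hunks
 * below apply to BlockDriverState: nesting counters bumped with
 * fetch-and-add, and a "highest offset written" statistic maintained
 * with a compare-and-swap loop.  All names are illustrative.
 */
#include <assert.h>
#include <inttypes.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

typedef struct {
    atomic_uint copy_on_read;           /* nesting counter, like bs->copy_on_read */
    _Atomic uint64_t wr_highest_offset; /* like the new Stat64 field */
} FakeBDS;

static void enable_cor(FakeBDS *bs)
{
    atomic_fetch_add(&bs->copy_on_read, 1);
}

static void disable_cor(FakeBDS *bs)
{
    unsigned old = atomic_fetch_sub(&bs->copy_on_read, 1);
    assert(old >= 1);                   /* mirrors bdrv_disable_copy_on_read() */
}

static void record_write_end(FakeBDS *bs, uint64_t end_offset)
{
    uint64_t orig = atomic_load(&bs->wr_highest_offset);

    /* Retry until the stored value is already >= ours or our CAS wins.
     * A failed CAS reloads 'orig' with the current value. */
    while (orig < end_offset &&
           !atomic_compare_exchange_weak(&bs->wr_highest_offset,
                                         &orig, end_offset)) {
        /* loop */
    }
}

int main(void)
{
    FakeBDS bs = { 0 };

    enable_cor(&bs);
    record_write_end(&bs, 4096);
    record_write_end(&bs, 512);         /* smaller offset leaves the max alone */
    disable_cor(&bs);

    printf("wr_highest_offset = %" PRIu64 "\n",
           (uint64_t)atomic_load(&bs.wr_highest_offset));
    return 0;
}
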
@@ -320,6 +320,8 @@ BlockDriverState *bdrv_new(void) QLIST_INIT(&bs->op_blockers[i]); } notifier_with_return_list_init(&bs->before_write_notifiers); + qemu_co_mutex_init(&bs->reqs_lock); + qemu_mutex_init(&bs->dirty_bitmap_mutex); bs->refcnt = 1; bs->aio_context = qemu_get_aio_context(); @@ -1300,7 +1302,9 @@ static int bdrv_open_common(BlockDriverState *bs, BlockBackend *file, goto fail_opts; } - assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */ + /* bdrv_new() and bdrv_close() make it so */ + assert(atomic_read(&bs->copy_on_read) == 0); + if (bs->open_flags & BDRV_O_COPY_ON_READ) { if (!bs->read_only) { bdrv_enable_copy_on_read(bs); @@ -3063,7 +3067,7 @@ static void bdrv_close(BlockDriverState *bs) g_free(bs->opaque); bs->opaque = NULL; - bs->copy_on_read = 0; + atomic_set(&bs->copy_on_read, 0); bs->backing_file[0] = '\0'; bs->backing_format[0] = '\0'; bs->total_sectors = 0; @@ -3422,7 +3426,7 @@ int bdrv_truncate(BdrvChild *child, int64_t offset, Error **errp) ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS); bdrv_dirty_bitmap_truncate(bs); bdrv_parent_cb_resize(bs); - ++bs->write_gen; + atomic_inc(&bs->write_gen); } return ret; } diff --git a/block/accounting.c b/block/accounting.c index 3f457c4e73..87ef5bbfaa 100644 --- a/block/accounting.c +++ b/block/accounting.c @@ -32,23 +32,28 @@ static QEMUClockType clock_type = QEMU_CLOCK_REALTIME; static const int qtest_latency_ns = NANOSECONDS_PER_SECOND / 1000; -void block_acct_init(BlockAcctStats *stats, bool account_invalid, - bool account_failed) +void block_acct_init(BlockAcctStats *stats) { - stats->account_invalid = account_invalid; - stats->account_failed = account_failed; - + qemu_mutex_init(&stats->lock); if (qtest_enabled()) { clock_type = QEMU_CLOCK_VIRTUAL; } } +void block_acct_setup(BlockAcctStats *stats, bool account_invalid, + bool account_failed) +{ + stats->account_invalid = account_invalid; + stats->account_failed = account_failed; +} + void block_acct_cleanup(BlockAcctStats *stats) { BlockAcctTimedStats *s, *next; QSLIST_FOREACH_SAFE(s, &stats->intervals, entries, next) { g_free(s); } + qemu_mutex_destroy(&stats->lock); } void block_acct_add_interval(BlockAcctStats *stats, unsigned interval_length) @@ -58,12 +63,15 @@ void block_acct_add_interval(BlockAcctStats *stats, unsigned interval_length) s = g_new0(BlockAcctTimedStats, 1); s->interval_length = interval_length; + s->stats = stats; + qemu_mutex_lock(&stats->lock); QSLIST_INSERT_HEAD(&stats->intervals, s, entries); for (i = 0; i < BLOCK_MAX_IOTYPE; i++) { timed_average_init(&s->latency[i], clock_type, (uint64_t) interval_length * NANOSECONDS_PER_SECOND); } + qemu_mutex_unlock(&stats->lock); } BlockAcctTimedStats *block_acct_interval_next(BlockAcctStats *stats, @@ -86,7 +94,8 @@ void block_acct_start(BlockAcctStats *stats, BlockAcctCookie *cookie, cookie->type = type; } -void block_acct_done(BlockAcctStats *stats, BlockAcctCookie *cookie) +static void block_account_one_io(BlockAcctStats *stats, BlockAcctCookie *cookie, + bool failed) { BlockAcctTimedStats *s; int64_t time_ns = qemu_clock_get_ns(clock_type); @@ -98,31 +107,16 @@ void block_acct_done(BlockAcctStats *stats, BlockAcctCookie *cookie) assert(cookie->type < BLOCK_MAX_IOTYPE); - stats->nr_bytes[cookie->type] += cookie->bytes; - stats->nr_ops[cookie->type]++; - stats->total_time_ns[cookie->type] += latency_ns; - stats->last_access_time_ns = time_ns; + qemu_mutex_lock(&stats->lock); - QSLIST_FOREACH(s, &stats->intervals, entries) { - 
timed_average_account(&s->latency[cookie->type], latency_ns); + if (failed) { + stats->failed_ops[cookie->type]++; + } else { + stats->nr_bytes[cookie->type] += cookie->bytes; + stats->nr_ops[cookie->type]++; } -} - -void block_acct_failed(BlockAcctStats *stats, BlockAcctCookie *cookie) -{ - assert(cookie->type < BLOCK_MAX_IOTYPE); - - stats->failed_ops[cookie->type]++; - - if (stats->account_failed) { - BlockAcctTimedStats *s; - int64_t time_ns = qemu_clock_get_ns(clock_type); - int64_t latency_ns = time_ns - cookie->start_time_ns; - - if (qtest_enabled()) { - latency_ns = qtest_latency_ns; - } + if (!failed || stats->account_failed) { stats->total_time_ns[cookie->type] += latency_ns; stats->last_access_time_ns = time_ns; @@ -130,29 +124,45 @@ void block_acct_failed(BlockAcctStats *stats, BlockAcctCookie *cookie) timed_average_account(&s->latency[cookie->type], latency_ns); } } + + qemu_mutex_unlock(&stats->lock); +} + +void block_acct_done(BlockAcctStats *stats, BlockAcctCookie *cookie) +{ + block_account_one_io(stats, cookie, false); +} + +void block_acct_failed(BlockAcctStats *stats, BlockAcctCookie *cookie) +{ + block_account_one_io(stats, cookie, true); } void block_acct_invalid(BlockAcctStats *stats, enum BlockAcctType type) { assert(type < BLOCK_MAX_IOTYPE); - /* block_acct_done() and block_acct_failed() update - * total_time_ns[], but this one does not. The reason is that - * invalid requests are accounted during their submission, - * therefore there's no actual I/O involved. */ - + /* block_account_one_io() updates total_time_ns[], but this one does + * not. The reason is that invalid requests are accounted during their + * submission, therefore there's no actual I/O involved. + */ + qemu_mutex_lock(&stats->lock); stats->invalid_ops[type]++; if (stats->account_invalid) { stats->last_access_time_ns = qemu_clock_get_ns(clock_type); } + qemu_mutex_unlock(&stats->lock); } void block_acct_merge_done(BlockAcctStats *stats, enum BlockAcctType type, int num_requests) { assert(type < BLOCK_MAX_IOTYPE); + + qemu_mutex_lock(&stats->lock); stats->merged[type] += num_requests; + qemu_mutex_unlock(&stats->lock); } int64_t block_acct_idle_time_ns(BlockAcctStats *stats) @@ -167,7 +177,9 @@ double block_acct_queue_depth(BlockAcctTimedStats *stats, assert(type < BLOCK_MAX_IOTYPE); + qemu_mutex_lock(&stats->stats->lock); sum = timed_average_sum(&stats->latency[type], &elapsed); + qemu_mutex_unlock(&stats->stats->lock); return (double) sum / elapsed; } diff --git a/block/block-backend.c b/block/block-backend.c index 7d7f3697d1..a2bbae90b1 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -216,8 +216,10 @@ BlockBackend *blk_new(uint64_t perm, uint64_t shared_perm) blk->shared_perm = shared_perm; blk_set_enable_write_cache(blk, true); + qemu_co_mutex_init(&blk->public.throttled_reqs_lock); qemu_co_queue_init(&blk->public.throttled_reqs[0]); qemu_co_queue_init(&blk->public.throttled_reqs[1]); + block_acct_init(&blk->stats); notifier_list_init(&blk->remove_bs_notifiers); notifier_list_init(&blk->insert_bs_notifiers); @@ -1953,7 +1955,7 @@ static void blk_root_drained_begin(BdrvChild *child) /* Note that blk->root may not be accessible here yet if we are just * attaching to a BlockDriverState that is drained. Use child instead. 
*/ - if (blk->public.io_limits_disabled++ == 0) { + if (atomic_fetch_inc(&blk->public.io_limits_disabled) == 0) { throttle_group_restart_blk(blk); } } @@ -1964,7 +1966,7 @@ static void blk_root_drained_end(BdrvChild *child) assert(blk->quiesce_counter); assert(blk->public.io_limits_disabled); - --blk->public.io_limits_disabled; + atomic_dec(&blk->public.io_limits_disabled); if (--blk->quiesce_counter == 0) { if (blk->dev_ops && blk->dev_ops->drained_end) { diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c index 519737c8d3..a04c6e4154 100644 --- a/block/dirty-bitmap.c +++ b/block/dirty-bitmap.c @@ -37,6 +37,7 @@ * or enabled. A frozen bitmap can only abdicate() or reclaim(). */ struct BdrvDirtyBitmap { + QemuMutex *mutex; HBitmap *bitmap; /* Dirty sector bitmap implementation */ HBitmap *meta; /* Meta dirty bitmap */ BdrvDirtyBitmap *successor; /* Anonymous child; implies frozen status */ @@ -52,6 +53,27 @@ struct BdrvDirtyBitmapIter { BdrvDirtyBitmap *bitmap; }; +static inline void bdrv_dirty_bitmaps_lock(BlockDriverState *bs) +{ + qemu_mutex_lock(&bs->dirty_bitmap_mutex); +} + +static inline void bdrv_dirty_bitmaps_unlock(BlockDriverState *bs) +{ + qemu_mutex_unlock(&bs->dirty_bitmap_mutex); +} + +void bdrv_dirty_bitmap_lock(BdrvDirtyBitmap *bitmap) +{ + qemu_mutex_lock(bitmap->mutex); +} + +void bdrv_dirty_bitmap_unlock(BdrvDirtyBitmap *bitmap) +{ + qemu_mutex_unlock(bitmap->mutex); +} + +/* Called with BQL or dirty_bitmap lock taken. */ BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs, const char *name) { BdrvDirtyBitmap *bm; @@ -65,6 +87,7 @@ BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs, const char *name) return NULL; } +/* Called with BQL taken. */ void bdrv_dirty_bitmap_make_anon(BdrvDirtyBitmap *bitmap) { assert(!bdrv_dirty_bitmap_frozen(bitmap)); @@ -72,6 +95,7 @@ void bdrv_dirty_bitmap_make_anon(BdrvDirtyBitmap *bitmap) bitmap->name = NULL; } +/* Called with BQL taken. 
*/ BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, uint32_t granularity, const char *name, @@ -96,11 +120,14 @@ BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, return NULL; } bitmap = g_new0(BdrvDirtyBitmap, 1); + bitmap->mutex = &bs->dirty_bitmap_mutex; bitmap->bitmap = hbitmap_alloc(bitmap_size, ctz32(sector_granularity)); bitmap->size = bitmap_size; bitmap->name = g_strdup(name); bitmap->disabled = false; + bdrv_dirty_bitmaps_lock(bs); QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list); + bdrv_dirty_bitmaps_unlock(bs); return bitmap; } @@ -119,20 +146,24 @@ void bdrv_create_meta_dirty_bitmap(BdrvDirtyBitmap *bitmap, int chunk_size) { assert(!bitmap->meta); + qemu_mutex_lock(bitmap->mutex); bitmap->meta = hbitmap_create_meta(bitmap->bitmap, chunk_size * BITS_PER_BYTE); + qemu_mutex_unlock(bitmap->mutex); } void bdrv_release_meta_dirty_bitmap(BdrvDirtyBitmap *bitmap) { assert(bitmap->meta); + qemu_mutex_lock(bitmap->mutex); hbitmap_free_meta(bitmap->bitmap); bitmap->meta = NULL; + qemu_mutex_unlock(bitmap->mutex); } -int bdrv_dirty_bitmap_get_meta(BlockDriverState *bs, - BdrvDirtyBitmap *bitmap, int64_t sector, - int nb_sectors) +int bdrv_dirty_bitmap_get_meta_locked(BlockDriverState *bs, + BdrvDirtyBitmap *bitmap, int64_t sector, + int nb_sectors) { uint64_t i; int sectors_per_bit = 1 << hbitmap_granularity(bitmap->meta); @@ -147,11 +178,26 @@ int bdrv_dirty_bitmap_get_meta(BlockDriverState *bs, return false; } +int bdrv_dirty_bitmap_get_meta(BlockDriverState *bs, + BdrvDirtyBitmap *bitmap, int64_t sector, + int nb_sectors) +{ + bool dirty; + + qemu_mutex_lock(bitmap->mutex); + dirty = bdrv_dirty_bitmap_get_meta_locked(bs, bitmap, sector, nb_sectors); + qemu_mutex_unlock(bitmap->mutex); + + return dirty; +} + void bdrv_dirty_bitmap_reset_meta(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector, int nb_sectors) { + qemu_mutex_lock(bitmap->mutex); hbitmap_reset(bitmap->meta, sector, nb_sectors); + qemu_mutex_unlock(bitmap->mutex); } int64_t bdrv_dirty_bitmap_size(const BdrvDirtyBitmap *bitmap) @@ -164,16 +210,19 @@ const char *bdrv_dirty_bitmap_name(const BdrvDirtyBitmap *bitmap) return bitmap->name; } +/* Called with BQL taken. */ bool bdrv_dirty_bitmap_frozen(BdrvDirtyBitmap *bitmap) { return bitmap->successor; } +/* Called with BQL taken. */ bool bdrv_dirty_bitmap_enabled(BdrvDirtyBitmap *bitmap) { return !(bitmap->disabled || bitmap->successor); } +/* Called with BQL taken. */ DirtyBitmapStatus bdrv_dirty_bitmap_status(BdrvDirtyBitmap *bitmap) { if (bdrv_dirty_bitmap_frozen(bitmap)) { @@ -188,6 +237,7 @@ DirtyBitmapStatus bdrv_dirty_bitmap_status(BdrvDirtyBitmap *bitmap) /** * Create a successor bitmap destined to replace this bitmap after an operation. * Requires that the bitmap is not frozen and has no successor. + * Called with BQL taken. */ int bdrv_dirty_bitmap_create_successor(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, Error **errp) @@ -220,6 +270,7 @@ int bdrv_dirty_bitmap_create_successor(BlockDriverState *bs, /** * For a bitmap with a successor, yield our name to the successor, * delete the old bitmap, and return a handle to the new bitmap. + * Called with BQL taken. */ BdrvDirtyBitmap *bdrv_dirty_bitmap_abdicate(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, @@ -247,6 +298,7 @@ BdrvDirtyBitmap *bdrv_dirty_bitmap_abdicate(BlockDriverState *bs, * In cases of failure where we can no longer safely delete the parent, * we may wish to re-join the parent and child/successor. 
* The merged parent will be un-frozen, but not explicitly re-enabled. + * Called with BQL taken. */ BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *parent, @@ -271,25 +323,30 @@ BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap(BlockDriverState *bs, /** * Truncates _all_ bitmaps attached to a BDS. + * Called with BQL taken. */ void bdrv_dirty_bitmap_truncate(BlockDriverState *bs) { BdrvDirtyBitmap *bitmap; uint64_t size = bdrv_nb_sectors(bs); + bdrv_dirty_bitmaps_lock(bs); QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) { assert(!bdrv_dirty_bitmap_frozen(bitmap)); assert(!bitmap->active_iterators); hbitmap_truncate(bitmap->bitmap, size); bitmap->size = size; } + bdrv_dirty_bitmaps_unlock(bs); } +/* Called with BQL taken. */ static void bdrv_do_release_matching_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, bool only_named) { BdrvDirtyBitmap *bm, *next; + bdrv_dirty_bitmaps_lock(bs); QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) { if ((!bitmap || bm == bitmap) && (!only_named || bm->name)) { assert(!bm->active_iterators); @@ -301,15 +358,19 @@ static void bdrv_do_release_matching_dirty_bitmap(BlockDriverState *bs, g_free(bm); if (bitmap) { - return; + goto out; } } } if (bitmap) { abort(); } + +out: + bdrv_dirty_bitmaps_unlock(bs); } +/* Called with BQL taken. */ void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap) { bdrv_do_release_matching_dirty_bitmap(bs, bitmap, false); @@ -318,18 +379,21 @@ void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap) /** * Release all named dirty bitmaps attached to a BDS (for use in bdrv_close()). * There must not be any frozen bitmaps attached. + * Called with BQL taken. */ void bdrv_release_named_dirty_bitmaps(BlockDriverState *bs) { bdrv_do_release_matching_dirty_bitmap(bs, NULL, true); } +/* Called with BQL taken. */ void bdrv_disable_dirty_bitmap(BdrvDirtyBitmap *bitmap) { assert(!bdrv_dirty_bitmap_frozen(bitmap)); bitmap->disabled = true; } +/* Called with BQL taken. 
*/ void bdrv_enable_dirty_bitmap(BdrvDirtyBitmap *bitmap) { assert(!bdrv_dirty_bitmap_frozen(bitmap)); @@ -342,6 +406,7 @@ BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs) BlockDirtyInfoList *list = NULL; BlockDirtyInfoList **plist = &list; + bdrv_dirty_bitmaps_lock(bs); QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) { BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1); BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1); @@ -354,12 +419,14 @@ BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs) *plist = entry; plist = &entry->next; } + bdrv_dirty_bitmaps_unlock(bs); return list; } -int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, - int64_t sector) +/* Called within bdrv_dirty_bitmap_lock..unlock */ +int bdrv_get_dirty_locked(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, + int64_t sector) { if (bitmap) { return hbitmap_get(bitmap->bitmap, sector); @@ -432,23 +499,42 @@ int64_t bdrv_dirty_iter_next(BdrvDirtyBitmapIter *iter) return hbitmap_iter_next(&iter->hbi); } +/* Called within bdrv_dirty_bitmap_lock..unlock */ +void bdrv_set_dirty_bitmap_locked(BdrvDirtyBitmap *bitmap, + int64_t cur_sector, int64_t nr_sectors) +{ + assert(bdrv_dirty_bitmap_enabled(bitmap)); + hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors); +} + void bdrv_set_dirty_bitmap(BdrvDirtyBitmap *bitmap, int64_t cur_sector, int64_t nr_sectors) { + bdrv_dirty_bitmap_lock(bitmap); + bdrv_set_dirty_bitmap_locked(bitmap, cur_sector, nr_sectors); + bdrv_dirty_bitmap_unlock(bitmap); +} + +/* Called within bdrv_dirty_bitmap_lock..unlock */ +void bdrv_reset_dirty_bitmap_locked(BdrvDirtyBitmap *bitmap, + int64_t cur_sector, int64_t nr_sectors) +{ assert(bdrv_dirty_bitmap_enabled(bitmap)); - hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors); + hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors); } void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap, int64_t cur_sector, int64_t nr_sectors) { - assert(bdrv_dirty_bitmap_enabled(bitmap)); - hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors); + bdrv_dirty_bitmap_lock(bitmap); + bdrv_reset_dirty_bitmap_locked(bitmap, cur_sector, nr_sectors); + bdrv_dirty_bitmap_unlock(bitmap); } void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out) { assert(bdrv_dirty_bitmap_enabled(bitmap)); + bdrv_dirty_bitmap_lock(bitmap); if (!out) { hbitmap_reset_all(bitmap->bitmap); } else { @@ -457,6 +543,7 @@ void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out) hbitmap_granularity(backup)); *out = backup; } + bdrv_dirty_bitmap_unlock(bitmap); } void bdrv_undo_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *in) @@ -508,12 +595,19 @@ void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, int64_t nr_sectors) { BdrvDirtyBitmap *bitmap; + + if (QLIST_EMPTY(&bs->dirty_bitmaps)) { + return; + } + + bdrv_dirty_bitmaps_lock(bs); QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) { if (!bdrv_dirty_bitmap_enabled(bitmap)) { continue; } hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors); } + bdrv_dirty_bitmaps_unlock(bs); } /** diff --git a/block/io.c b/block/io.c index ed31810c0a..91611ffb2a 100644 --- a/block/io.c +++ b/block/io.c @@ -130,13 +130,13 @@ void bdrv_refresh_limits(BlockDriverState *bs, Error **errp) */ void bdrv_enable_copy_on_read(BlockDriverState *bs) { - bs->copy_on_read++; + atomic_inc(&bs->copy_on_read); } void bdrv_disable_copy_on_read(BlockDriverState *bs) { - assert(bs->copy_on_read > 0); - bs->copy_on_read--; + int old = atomic_fetch_dec(&bs->copy_on_read); + assert(old >= 1); } /* Check if any requests 
are in-flight (including throttled requests) */ @@ -241,7 +241,7 @@ void bdrv_drained_begin(BlockDriverState *bs) return; } - if (!bs->quiesce_counter++) { + if (atomic_fetch_inc(&bs->quiesce_counter) == 0) { aio_disable_external(bdrv_get_aio_context(bs)); bdrv_parent_drained_begin(bs); } @@ -252,7 +252,7 @@ void bdrv_drained_begin(BlockDriverState *bs) void bdrv_drained_end(BlockDriverState *bs) { assert(bs->quiesce_counter > 0); - if (--bs->quiesce_counter > 0) { + if (atomic_fetch_dec(&bs->quiesce_counter) > 1) { return; } @@ -375,11 +375,13 @@ void bdrv_drain_all(void) static void tracked_request_end(BdrvTrackedRequest *req) { if (req->serialising) { - req->bs->serialising_in_flight--; + atomic_dec(&req->bs->serialising_in_flight); } + qemu_co_mutex_lock(&req->bs->reqs_lock); QLIST_REMOVE(req, list); qemu_co_queue_restart_all(&req->wait_queue); + qemu_co_mutex_unlock(&req->bs->reqs_lock); } /** @@ -404,7 +406,9 @@ static void tracked_request_begin(BdrvTrackedRequest *req, qemu_co_queue_init(&req->wait_queue); + qemu_co_mutex_lock(&bs->reqs_lock); QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); + qemu_co_mutex_unlock(&bs->reqs_lock); } static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) @@ -414,7 +418,7 @@ static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) - overlap_offset; if (!req->serialising) { - req->bs->serialising_in_flight++; + atomic_inc(&req->bs->serialising_in_flight); req->serialising = true; } @@ -501,7 +505,8 @@ static void dummy_bh_cb(void *opaque) void bdrv_wakeup(BlockDriverState *bs) { - if (bs->wakeup) { + /* The barrier (or an atomic op) is in the caller. */ + if (atomic_read(&bs->wakeup)) { aio_bh_schedule_oneshot(qemu_get_aio_context(), dummy_bh_cb, NULL); } } @@ -519,12 +524,13 @@ static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self) bool retry; bool waited = false; - if (!bs->serialising_in_flight) { + if (!atomic_read(&bs->serialising_in_flight)) { return false; } do { retry = false; + qemu_co_mutex_lock(&bs->reqs_lock); QLIST_FOREACH(req, &bs->tracked_requests, list) { if (req == self || (!req->serialising && !self->serialising)) { continue; @@ -543,7 +549,7 @@ static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self) * (instead of producing a deadlock in the former case). 
*/ if (!req->waiting_for) { self->waiting_for = req; - qemu_co_queue_wait(&req->wait_queue, NULL); + qemu_co_queue_wait(&req->wait_queue, &bs->reqs_lock); self->waiting_for = NULL; retry = true; waited = true; @@ -551,6 +557,7 @@ static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self) } } } + qemu_co_mutex_unlock(&bs->reqs_lock); } while (retry); return waited; @@ -1144,7 +1151,7 @@ int coroutine_fn bdrv_co_preadv(BdrvChild *child, bdrv_inc_in_flight(bs); /* Don't do copy-on-read if we read data before write operation */ - if (bs->copy_on_read && !(flags & BDRV_REQ_NO_SERIALISING)) { + if (atomic_read(&bs->copy_on_read) && !(flags & BDRV_REQ_NO_SERIALISING)) { flags |= BDRV_REQ_COPY_ON_READ; } @@ -1401,12 +1408,10 @@ static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child, } bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE); - ++bs->write_gen; + atomic_inc(&bs->write_gen); bdrv_set_dirty(bs, start_sector, end_sector - start_sector); - if (bs->wr_highest_offset < offset + bytes) { - bs->wr_highest_offset = offset + bytes; - } + stat64_max(&bs->wr_highest_offset, offset + bytes); if (ret >= 0) { bs->total_sectors = MAX(bs->total_sectors, end_sector); @@ -2292,14 +2297,17 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs) goto early_exit; } - current_gen = bs->write_gen; + qemu_co_mutex_lock(&bs->reqs_lock); + current_gen = atomic_read(&bs->write_gen); /* Wait until any previous flushes are completed */ while (bs->active_flush_req) { - qemu_co_queue_wait(&bs->flush_queue, NULL); + qemu_co_queue_wait(&bs->flush_queue, &bs->reqs_lock); } + /* Flushes reach this point in nondecreasing current_gen order. */ bs->active_flush_req = true; + qemu_co_mutex_unlock(&bs->reqs_lock); /* Write back all layers by calling one driver function */ if (bs->drv->bdrv_co_flush) { @@ -2371,9 +2379,12 @@ out: if (ret == 0) { bs->flushed_gen = current_gen; } + + qemu_co_mutex_lock(&bs->reqs_lock); bs->active_flush_req = false; /* Return value is ignored - it's ok if wait queue is empty */ qemu_co_queue_next(&bs->flush_queue); + qemu_co_mutex_unlock(&bs->reqs_lock); early_exit: bdrv_dec_in_flight(bs); @@ -2517,7 +2528,7 @@ int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset, } ret = 0; out: - ++bs->write_gen; + atomic_inc(&bs->write_gen); bdrv_set_dirty(bs, req.offset >> BDRV_SECTOR_BITS, req.bytes >> BDRV_SECTOR_BITS); tracked_request_end(&req); @@ -2644,7 +2655,7 @@ void bdrv_io_plug(BlockDriverState *bs) bdrv_io_plug(child->bs); } - if (bs->io_plugged++ == 0) { + if (atomic_fetch_inc(&bs->io_plugged) == 0) { BlockDriver *drv = bs->drv; if (drv && drv->bdrv_io_plug) { drv->bdrv_io_plug(bs); @@ -2657,7 +2668,7 @@ void bdrv_io_unplug(BlockDriverState *bs) BdrvChild *child; assert(bs->io_plugged); - if (--bs->io_plugged == 0) { + if (atomic_fetch_dec(&bs->io_plugged) == 1) { BlockDriver *drv = bs->drv; if (drv && drv->bdrv_io_unplug) { drv->bdrv_io_unplug(bs); diff --git a/block/mirror.c b/block/mirror.c index a2a970301c..19afcc6f1a 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -342,6 +342,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) int max_io_sectors = MAX((s->buf_size >> BDRV_SECTOR_BITS) / MAX_IN_FLIGHT, MAX_IO_SECTORS); + bdrv_dirty_bitmap_lock(s->dirty_bitmap); sector_num = bdrv_dirty_iter_next(s->dbi); if (sector_num < 0) { bdrv_set_dirty_iter(s->dbi, 0); @@ -349,6 +350,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) trace_mirror_restart_iter(s, bdrv_get_dirty_count(s->dirty_bitmap)); assert(sector_num >= 0); } + 
bdrv_dirty_bitmap_unlock(s->dirty_bitmap); first_chunk = sector_num / sectors_per_chunk; while (test_bit(first_chunk, s->in_flight_bitmap)) { @@ -360,12 +362,13 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) /* Find the number of consective dirty chunks following the first dirty * one, and wait for in flight requests in them. */ + bdrv_dirty_bitmap_lock(s->dirty_bitmap); while (nb_chunks * sectors_per_chunk < (s->buf_size >> BDRV_SECTOR_BITS)) { int64_t next_dirty; int64_t next_sector = sector_num + nb_chunks * sectors_per_chunk; int64_t next_chunk = next_sector / sectors_per_chunk; if (next_sector >= end || - !bdrv_get_dirty(source, s->dirty_bitmap, next_sector)) { + !bdrv_get_dirty_locked(source, s->dirty_bitmap, next_sector)) { break; } if (test_bit(next_chunk, s->in_flight_bitmap)) { @@ -386,8 +389,10 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) * calling bdrv_get_block_status_above could yield - if some blocks are * marked dirty in this window, we need to know. */ - bdrv_reset_dirty_bitmap(s->dirty_bitmap, sector_num, - nb_chunks * sectors_per_chunk); + bdrv_reset_dirty_bitmap_locked(s->dirty_bitmap, sector_num, + nb_chunks * sectors_per_chunk); + bdrv_dirty_bitmap_unlock(s->dirty_bitmap); + bitmap_set(s->in_flight_bitmap, sector_num / sectors_per_chunk, nb_chunks); while (nb_chunks > 0 && sector_num < end) { int64_t ret; @@ -506,6 +511,8 @@ static void mirror_exit(BlockJob *job, void *opaque) BlockDriverState *mirror_top_bs = s->mirror_top_bs; Error *local_err = NULL; + bdrv_release_dirty_bitmap(src, s->dirty_bitmap); + /* Make sure that the source BDS doesn't go away before we called * block_job_completed(). */ bdrv_ref(src); @@ -904,7 +911,6 @@ immediate_exit: g_free(s->cow_bitmap); g_free(s->in_flight_bitmap); bdrv_dirty_iter_free(s->dbi); - bdrv_release_dirty_bitmap(bs, s->dirty_bitmap); data = g_malloc(sizeof(*data)); data->ret = ret; diff --git a/block/nfs.c b/block/nfs.c index 848b2c0bb0..18c87d2f25 100644 --- a/block/nfs.c +++ b/block/nfs.c @@ -730,7 +730,9 @@ nfs_get_allocated_file_size_cb(int ret, struct nfs_context *nfs, void *data, if (task->ret < 0) { error_report("NFS Error: %s", nfs_get_error(nfs)); } - task->complete = 1; + + /* Set task->complete before reading bs->wakeup. */ + atomic_mb_set(&task->complete, 1); bdrv_wakeup(task->bs); } diff --git a/block/qapi.c b/block/qapi.c index a40922ea26..14b60ae66c 100644 --- a/block/qapi.c +++ b/block/qapi.c @@ -441,7 +441,7 @@ static BlockStats *bdrv_query_bds_stats(const BlockDriverState *bs, s->node_name = g_strdup(bdrv_get_node_name(bs)); } - s->stats->wr_highest_offset = bs->wr_highest_offset; + s->stats->wr_highest_offset = stat64_get(&bs->wr_highest_offset); if (bs->file) { s->has_parent = true; diff --git a/block/sheepdog.c b/block/sheepdog.c index a18315a1ca..5ebf5d9fbb 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -698,7 +698,8 @@ out: srco->co = NULL; srco->ret = ret; - srco->finished = true; + /* Set srco->finished before reading bs->wakeup. 
*/ + atomic_mb_set(&srco->finished, true); if (srco->bs) { bdrv_wakeup(srco->bs); } diff --git a/block/throttle-groups.c b/block/throttle-groups.c index b73e7a800b..a181cb1dee 100644 --- a/block/throttle-groups.c +++ b/block/throttle-groups.c @@ -240,7 +240,7 @@ static bool throttle_group_schedule_timer(BlockBackend *blk, bool is_write) ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); bool must_wait; - if (blkp->io_limits_disabled) { + if (atomic_read(&blkp->io_limits_disabled)) { return false; } @@ -260,6 +260,25 @@ static bool throttle_group_schedule_timer(BlockBackend *blk, bool is_write) return must_wait; } +/* Start the next pending I/O request for a BlockBackend. Return whether + * any request was actually pending. + * + * @blk: the current BlockBackend + * @is_write: the type of operation (read/write) + */ +static bool coroutine_fn throttle_group_co_restart_queue(BlockBackend *blk, + bool is_write) +{ + BlockBackendPublic *blkp = blk_get_public(blk); + bool ret; + + qemu_co_mutex_lock(&blkp->throttled_reqs_lock); + ret = qemu_co_queue_next(&blkp->throttled_reqs[is_write]); + qemu_co_mutex_unlock(&blkp->throttled_reqs_lock); + + return ret; +} + /* Look for the next pending I/O request and schedule it. * * This assumes that tg->lock is held. @@ -287,12 +306,12 @@ static void schedule_next_request(BlockBackend *blk, bool is_write) if (!must_wait) { /* Give preference to requests from the current blk */ if (qemu_in_coroutine() && - qemu_co_queue_next(&blkp->throttled_reqs[is_write])) { + throttle_group_co_restart_queue(blk, is_write)) { token = blk; } else { ThrottleTimers *tt = &blk_get_public(token)->throttle_timers; int64_t now = qemu_clock_get_ns(tt->clock_type); - timer_mod(tt->timers[is_write], now + 1); + timer_mod(tt->timers[is_write], now); tg->any_timer_armed[is_write] = true; } tg->tokens[is_write] = token; @@ -326,7 +345,10 @@ void coroutine_fn throttle_group_co_io_limits_intercept(BlockBackend *blk, if (must_wait || blkp->pending_reqs[is_write]) { blkp->pending_reqs[is_write]++; qemu_mutex_unlock(&tg->lock); - qemu_co_queue_wait(&blkp->throttled_reqs[is_write], NULL); + qemu_co_mutex_lock(&blkp->throttled_reqs_lock); + qemu_co_queue_wait(&blkp->throttled_reqs[is_write], + &blkp->throttled_reqs_lock); + qemu_co_mutex_unlock(&blkp->throttled_reqs_lock); qemu_mutex_lock(&tg->lock); blkp->pending_reqs[is_write]--; } @@ -340,15 +362,50 @@ void coroutine_fn throttle_group_co_io_limits_intercept(BlockBackend *blk, qemu_mutex_unlock(&tg->lock); } +typedef struct { + BlockBackend *blk; + bool is_write; +} RestartData; + +static void coroutine_fn throttle_group_restart_queue_entry(void *opaque) +{ + RestartData *data = opaque; + BlockBackend *blk = data->blk; + bool is_write = data->is_write; + BlockBackendPublic *blkp = blk_get_public(blk); + ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, ts); + bool empty_queue; + + empty_queue = !throttle_group_co_restart_queue(blk, is_write); + + /* If the request queue was empty then we have to take care of + * scheduling the next one */ + if (empty_queue) { + qemu_mutex_lock(&tg->lock); + schedule_next_request(blk, is_write); + qemu_mutex_unlock(&tg->lock); + } +} + +static void throttle_group_restart_queue(BlockBackend *blk, bool is_write) +{ + Coroutine *co; + RestartData rd = { + .blk = blk, + .is_write = is_write + }; + + co = qemu_coroutine_create(throttle_group_restart_queue_entry, &rd); + aio_co_enter(blk_get_aio_context(blk), co); +} + void throttle_group_restart_blk(BlockBackend *blk) { 
BlockBackendPublic *blkp = blk_get_public(blk); - int i; - for (i = 0; i < 2; i++) { - while (qemu_co_enter_next(&blkp->throttled_reqs[i])) { - ; - } + if (blkp->throttle_state) { + throttle_group_restart_queue(blk, 0); + throttle_group_restart_queue(blk, 1); } } @@ -376,8 +433,7 @@ void throttle_group_config(BlockBackend *blk, ThrottleConfig *cfg) throttle_config(ts, tt, cfg); qemu_mutex_unlock(&tg->lock); - qemu_co_enter_next(&blkp->throttled_reqs[0]); - qemu_co_enter_next(&blkp->throttled_reqs[1]); + throttle_group_restart_blk(blk); } /* Get the throttle configuration from a particular group. Similar to @@ -408,7 +464,6 @@ static void timer_cb(BlockBackend *blk, bool is_write) BlockBackendPublic *blkp = blk_get_public(blk); ThrottleState *ts = blkp->throttle_state; ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); - bool empty_queue; /* The timer has just been fired, so we can update the flag */ qemu_mutex_lock(&tg->lock); @@ -416,17 +471,7 @@ static void timer_cb(BlockBackend *blk, bool is_write) qemu_mutex_unlock(&tg->lock); /* Run the request that was waiting for this timer */ - aio_context_acquire(blk_get_aio_context(blk)); - empty_queue = !qemu_co_enter_next(&blkp->throttled_reqs[is_write]); - aio_context_release(blk_get_aio_context(blk)); - - /* If the request queue was empty then we have to take care of - * scheduling the next one */ - if (empty_queue) { - qemu_mutex_lock(&tg->lock); - schedule_next_request(blk, is_write); - qemu_mutex_unlock(&tg->lock); - } + throttle_group_restart_queue(blk, is_write); } static void read_timer_cb(void *opaque) diff --git a/blockdev.c b/blockdev.c index 6472548186..39d6b9712b 100644 --- a/blockdev.c +++ b/blockdev.c @@ -595,7 +595,7 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts, autostart = 0; } - block_acct_init(blk_get_stats(blk), account_invalid, account_failed); + block_acct_setup(blk_get_stats(blk), account_invalid, account_failed); if (!parse_stats_intervals(blk_get_stats(blk), interval_list, errp)) { blk_unref(blk); @@ -1362,12 +1362,10 @@ out_aio_context: static BdrvDirtyBitmap *block_dirty_bitmap_lookup(const char *node, const char *name, BlockDriverState **pbs, - AioContext **paio, Error **errp) { BlockDriverState *bs; BdrvDirtyBitmap *bitmap; - AioContext *aio_context; if (!node) { error_setg(errp, "Node cannot be NULL"); @@ -1383,29 +1381,17 @@ static BdrvDirtyBitmap *block_dirty_bitmap_lookup(const char *node, return NULL; } - aio_context = bdrv_get_aio_context(bs); - aio_context_acquire(aio_context); - bitmap = bdrv_find_dirty_bitmap(bs, name); if (!bitmap) { error_setg(errp, "Dirty bitmap '%s' not found", name); - goto fail; + return NULL; } if (pbs) { *pbs = bs; } - if (paio) { - *paio = aio_context; - } else { - aio_context_release(aio_context); - } return bitmap; - - fail: - aio_context_release(aio_context); - return NULL; } /* New and old BlockDriverState structs for atomic group operations */ @@ -1791,7 +1777,7 @@ static void external_snapshot_commit(BlkActionState *common) /* We don't need (or want) to use the transactional * bdrv_reopen_multiple() across all the entries at once, because we * don't want to abort all of them if one of them fails the reopen */ - if (!state->old_bs->copy_on_read) { + if (!atomic_read(&state->old_bs->copy_on_read)) { bdrv_reopen(state->old_bs, state->old_bs->open_flags & ~BDRV_O_RDWR, NULL); } @@ -2025,7 +2011,6 @@ static void block_dirty_bitmap_clear_prepare(BlkActionState *common, state->bitmap = block_dirty_bitmap_lookup(action->node, action->name, &state->bs, 
- &state->aio_context, errp); if (!state->bitmap) { return; @@ -2733,7 +2718,6 @@ void qmp_block_dirty_bitmap_add(const char *node, const char *name, bool has_granularity, uint32_t granularity, Error **errp) { - AioContext *aio_context; BlockDriverState *bs; if (!name || name[0] == '\0') { @@ -2746,14 +2730,11 @@ void qmp_block_dirty_bitmap_add(const char *node, const char *name, return; } - aio_context = bdrv_get_aio_context(bs); - aio_context_acquire(aio_context); - if (has_granularity) { if (granularity < 512 || !is_power_of_2(granularity)) { error_setg(errp, "Granularity must be power of 2 " "and at least 512"); - goto out; + return; } } else { /* Default to cluster size, if available: */ @@ -2761,19 +2742,15 @@ void qmp_block_dirty_bitmap_add(const char *node, const char *name, } bdrv_create_dirty_bitmap(bs, granularity, name, errp); - - out: - aio_context_release(aio_context); } void qmp_block_dirty_bitmap_remove(const char *node, const char *name, Error **errp) { - AioContext *aio_context; BlockDriverState *bs; BdrvDirtyBitmap *bitmap; - bitmap = block_dirty_bitmap_lookup(node, name, &bs, &aio_context, errp); + bitmap = block_dirty_bitmap_lookup(node, name, &bs, errp); if (!bitmap || !bs) { return; } @@ -2782,13 +2759,10 @@ void qmp_block_dirty_bitmap_remove(const char *node, const char *name, error_setg(errp, "Bitmap '%s' is currently frozen and cannot be removed", name); - goto out; + return; } bdrv_dirty_bitmap_make_anon(bitmap); bdrv_release_dirty_bitmap(bs, bitmap); - - out: - aio_context_release(aio_context); } /** @@ -2798,11 +2772,10 @@ void qmp_block_dirty_bitmap_remove(const char *node, const char *name, void qmp_block_dirty_bitmap_clear(const char *node, const char *name, Error **errp) { - AioContext *aio_context; BdrvDirtyBitmap *bitmap; BlockDriverState *bs; - bitmap = block_dirty_bitmap_lookup(node, name, &bs, &aio_context, errp); + bitmap = block_dirty_bitmap_lookup(node, name, &bs, errp); if (!bitmap || !bs) { return; } @@ -2811,18 +2784,15 @@ void qmp_block_dirty_bitmap_clear(const char *node, const char *name, error_setg(errp, "Bitmap '%s' is currently frozen and cannot be modified", name); - goto out; + return; } else if (!bdrv_dirty_bitmap_enabled(bitmap)) { error_setg(errp, "Bitmap '%s' is currently disabled and cannot be cleared", name); - goto out; + return; } bdrv_clear_dirty_bitmap(bitmap, NULL); - - out: - aio_context_release(aio_context); } void hmp_drive_del(Monitor *mon, const QDict *qdict) diff --git a/include/block/accounting.h b/include/block/accounting.h index 20891639d5..b833d26d6c 100644 --- a/include/block/accounting.h +++ b/include/block/accounting.h @@ -26,8 +26,10 @@ #define BLOCK_ACCOUNTING_H #include "qemu/timed-average.h" +#include "qemu/thread.h" typedef struct BlockAcctTimedStats BlockAcctTimedStats; +typedef struct BlockAcctStats BlockAcctStats; enum BlockAcctType { BLOCK_ACCT_READ, @@ -37,12 +39,14 @@ enum BlockAcctType { }; struct BlockAcctTimedStats { + BlockAcctStats *stats; TimedAverage latency[BLOCK_MAX_IOTYPE]; unsigned interval_length; /* in seconds */ QSLIST_ENTRY(BlockAcctTimedStats) entries; }; -typedef struct BlockAcctStats { +struct BlockAcctStats { + QemuMutex lock; uint64_t nr_bytes[BLOCK_MAX_IOTYPE]; uint64_t nr_ops[BLOCK_MAX_IOTYPE]; uint64_t invalid_ops[BLOCK_MAX_IOTYPE]; @@ -53,7 +57,7 @@ typedef struct BlockAcctStats { QSLIST_HEAD(, BlockAcctTimedStats) intervals; bool account_invalid; bool account_failed; -} BlockAcctStats; +}; typedef struct BlockAcctCookie { int64_t bytes; @@ -61,7 +65,8 @@ typedef struct 
BlockAcctCookie { enum BlockAcctType type; } BlockAcctCookie; -void block_acct_init(BlockAcctStats *stats, bool account_invalid, +void block_acct_init(BlockAcctStats *stats); +void block_acct_setup(BlockAcctStats *stats, bool account_invalid, bool account_failed); void block_acct_cleanup(BlockAcctStats *stats); void block_acct_add_interval(BlockAcctStats *stats, unsigned interval_length); diff --git a/include/block/block.h b/include/block/block.h index 9b355e92d8..a4f09df95a 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -402,7 +402,8 @@ void bdrv_drain_all(void); * block_job_defer_to_main_loop for how to do it). \ */ \ assert(!bs_->wakeup); \ - bs_->wakeup = true; \ + /* Set bs->wakeup before evaluating cond. */ \ + atomic_mb_set(&bs_->wakeup, true); \ while (busy_) { \ if ((cond)) { \ waited_ = busy_ = true; \ @@ -414,7 +415,7 @@ void bdrv_drain_all(void); waited_ |= busy_; \ } \ } \ - bs_->wakeup = false; \ + atomic_set(&bs_->wakeup, false); \ } \ waited_; }) diff --git a/include/block/block_int.h b/include/block/block_int.h index cb78c4fa82..748970055e 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -29,6 +29,7 @@ #include "qemu/option.h" #include "qemu/queue.h" #include "qemu/coroutine.h" +#include "qemu/stats64.h" #include "qemu/timer.h" #include "qapi-types.h" #include "qemu/hbitmap.h" @@ -595,11 +596,6 @@ struct BlockDriverState { /* Protected by AioContext lock */ - /* If true, copy read backing sectors into image. Can be >1 if more - * than one client has requested copy-on-read. - */ - int copy_on_read; - /* If we are reading a disk image, give its size in sectors. * Generally read-only; it is written to by load_snapshot and * save_snaphost, but the block layer is quiescent during those. @@ -609,34 +605,57 @@ struct BlockDriverState { /* Callback before write request is processed */ NotifierWithReturnList before_write_notifiers; - /* number of in-flight requests; overall and serialising */ - unsigned int in_flight; - unsigned int serialising_in_flight; + /* threshold limit for writes, in bytes. "High water mark". */ + uint64_t write_threshold_offset; + NotifierWithReturn write_threshold_notifier; - bool wakeup; + /* Writing to the list requires the BQL _and_ the dirty_bitmap_mutex. + * Reading from the list can be done with either the BQL or the + * dirty_bitmap_mutex. Modifying a bitmap only requires + * dirty_bitmap_mutex. */ + QemuMutex dirty_bitmap_mutex; + QLIST_HEAD(, BdrvDirtyBitmap) dirty_bitmaps; /* Offset after the highest byte written to */ - uint64_t wr_highest_offset; + Stat64 wr_highest_offset; - /* threshold limit for writes, in bytes. "High water mark". */ - uint64_t write_threshold_offset; - NotifierWithReturn write_threshold_notifier; + /* If true, copy read backing sectors into image. Can be >1 if more + * than one client has requested copy-on-read. Accessed with atomic + * ops. + */ + int copy_on_read; - /* counter for nested bdrv_io_plug */ - unsigned io_plugged; + /* number of in-flight requests; overall and serialising. + * Accessed with atomic ops. + */ + unsigned int in_flight; + unsigned int serialising_in_flight; - QLIST_HEAD(, BdrvTrackedRequest) tracked_requests; - CoQueue flush_queue; /* Serializing flush queue */ - bool active_flush_req; /* Flush request in flight? */ - unsigned int write_gen; /* Current data generation */ - unsigned int flushed_gen; /* Flushed write generation */ + /* Internal to BDRV_POLL_WHILE and bdrv_wakeup. Accessed with atomic + * ops. 
+ */ + bool wakeup; - QLIST_HEAD(, BdrvDirtyBitmap) dirty_bitmaps; + /* counter for nested bdrv_io_plug. + * Accessed with atomic ops. + */ + unsigned io_plugged; /* do we need to tell the quest if we have a volatile write cache? */ int enable_write_cache; + /* Accessed with atomic ops. */ int quiesce_counter; + unsigned int write_gen; /* Current data generation */ + + /* Protected by reqs_lock. */ + CoMutex reqs_lock; + QLIST_HEAD(, BdrvTrackedRequest) tracked_requests; + CoQueue flush_queue; /* Serializing flush queue */ + bool active_flush_req; /* Flush request in flight? */ + + /* Only read/written by whoever has set active_flush_req to true. */ + unsigned int flushed_gen; /* Flushed write generation */ }; struct BlockBackendRootState { diff --git a/include/block/dirty-bitmap.h b/include/block/dirty-bitmap.h index 9dea14ba03..ad6558af56 100644 --- a/include/block/dirty-bitmap.h +++ b/include/block/dirty-bitmap.h @@ -36,8 +36,6 @@ bool bdrv_dirty_bitmap_frozen(BdrvDirtyBitmap *bitmap); const char *bdrv_dirty_bitmap_name(const BdrvDirtyBitmap *bitmap); int64_t bdrv_dirty_bitmap_size(const BdrvDirtyBitmap *bitmap); DirtyBitmapStatus bdrv_dirty_bitmap_status(BdrvDirtyBitmap *bitmap); -int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, - int64_t sector); void bdrv_set_dirty_bitmap(BdrvDirtyBitmap *bitmap, int64_t cur_sector, int64_t nr_sectors); void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap, @@ -45,6 +43,9 @@ void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap, int bdrv_dirty_bitmap_get_meta(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector, int nb_sectors); +int bdrv_dirty_bitmap_get_meta_locked(BlockDriverState *bs, + BdrvDirtyBitmap *bitmap, int64_t sector, + int nb_sectors); void bdrv_dirty_bitmap_reset_meta(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector, int nb_sectors); @@ -52,11 +53,6 @@ BdrvDirtyBitmapIter *bdrv_dirty_meta_iter_new(BdrvDirtyBitmap *bitmap); BdrvDirtyBitmapIter *bdrv_dirty_iter_new(BdrvDirtyBitmap *bitmap, uint64_t first_sector); void bdrv_dirty_iter_free(BdrvDirtyBitmapIter *iter); -int64_t bdrv_dirty_iter_next(BdrvDirtyBitmapIter *iter); -void bdrv_set_dirty_iter(BdrvDirtyBitmapIter *hbi, int64_t sector_num); -int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap); -int64_t bdrv_get_meta_dirty_count(BdrvDirtyBitmap *bitmap); -void bdrv_dirty_bitmap_truncate(BlockDriverState *bs); uint64_t bdrv_dirty_bitmap_serialization_size(const BdrvDirtyBitmap *bitmap, uint64_t start, uint64_t count); @@ -72,4 +68,19 @@ void bdrv_dirty_bitmap_deserialize_zeroes(BdrvDirtyBitmap *bitmap, bool finish); void bdrv_dirty_bitmap_deserialize_finish(BdrvDirtyBitmap *bitmap); +/* Functions that require manual locking. 
*/ +void bdrv_dirty_bitmap_lock(BdrvDirtyBitmap *bitmap); +void bdrv_dirty_bitmap_unlock(BdrvDirtyBitmap *bitmap); +int bdrv_get_dirty_locked(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, + int64_t sector); +void bdrv_set_dirty_bitmap_locked(BdrvDirtyBitmap *bitmap, + int64_t cur_sector, int64_t nr_sectors); +void bdrv_reset_dirty_bitmap_locked(BdrvDirtyBitmap *bitmap, + int64_t cur_sector, int64_t nr_sectors); +int64_t bdrv_dirty_iter_next(BdrvDirtyBitmapIter *iter); +void bdrv_set_dirty_iter(BdrvDirtyBitmapIter *hbi, int64_t sector_num); +int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap); +int64_t bdrv_get_meta_dirty_count(BdrvDirtyBitmap *bitmap); +void bdrv_dirty_bitmap_truncate(BlockDriverState *bs); + #endif diff --git a/include/qemu/stats64.h b/include/qemu/stats64.h new file mode 100644 index 0000000000..4a357b3e9d --- /dev/null +++ b/include/qemu/stats64.h @@ -0,0 +1,193 @@ +/* + * Atomic operations on 64-bit quantities. + * + * Copyright (C) 2017 Red Hat, Inc. + * + * Author: Paolo Bonzini <pbonzini@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#ifndef QEMU_STATS64_H +#define QEMU_STATS64_H 1 + +#include "qemu/atomic.h" + +/* This provides atomic operations on 64-bit type, using a reader-writer + * spinlock on architectures that do not have 64-bit accesses. Even on + * those architectures, it tries hard not to take the lock. + */ + +typedef struct Stat64 { +#ifdef CONFIG_ATOMIC64 + uint64_t value; +#else + uint32_t low, high; + uint32_t lock; +#endif +} Stat64; + +#ifdef CONFIG_ATOMIC64 +static inline void stat64_init(Stat64 *s, uint64_t value) +{ + /* This is not guaranteed to be atomic! */ + *s = (Stat64) { value }; +} + +static inline uint64_t stat64_get(const Stat64 *s) +{ + return atomic_read__nocheck(&s->value); +} + +static inline void stat64_add(Stat64 *s, uint64_t value) +{ + atomic_add(&s->value, value); +} + +static inline void stat64_min(Stat64 *s, uint64_t value) +{ + uint64_t orig = atomic_read__nocheck(&s->value); + while (orig > value) { + orig = atomic_cmpxchg__nocheck(&s->value, orig, value); + } +} + +static inline void stat64_max(Stat64 *s, uint64_t value) +{ + uint64_t orig = atomic_read__nocheck(&s->value); + while (orig < value) { + orig = atomic_cmpxchg__nocheck(&s->value, orig, value); + } +} +#else +uint64_t stat64_get(const Stat64 *s); +bool stat64_min_slow(Stat64 *s, uint64_t value); +bool stat64_max_slow(Stat64 *s, uint64_t value); +bool stat64_add32_carry(Stat64 *s, uint32_t low, uint32_t high); + +static inline void stat64_init(Stat64 *s, uint64_t value) +{ + /* This is not guaranteed to be atomic! */ + *s = (Stat64) { .low = value, .high = value >> 32, .lock = 0 }; +} + +static inline void stat64_add(Stat64 *s, uint64_t value) +{ + uint32_t low, high; + high = value >> 32; + low = (uint32_t) value; + if (!low) { + if (high) { + atomic_add(&s->high, high); + } + return; + } + + for (;;) { + uint32_t orig = s->low; + uint32_t result = orig + low; + uint32_t old; + + if (result < low || high) { + /* If the high part is affected, take the lock. */ + if (stat64_add32_carry(s, low, high)) { + return; + } + continue; + } + + /* No carry, try with a 32-bit cmpxchg. The result is independent of + * the high 32 bits, so it can race just fine with stat64_add32_carry + * and even stat64_get! 
+ */ + old = atomic_cmpxchg(&s->low, orig, result); + if (orig == old) { + return; + } + } +} + +static inline void stat64_min(Stat64 *s, uint64_t value) +{ + uint32_t low, high; + uint32_t orig_low, orig_high; + + high = value >> 32; + low = (uint32_t) value; + do { + orig_high = atomic_read(&s->high); + if (orig_high < high) { + return; + } + + if (orig_high == high) { + /* High 32 bits are equal. Read low after high, otherwise we + * can get a false positive (e.g. 0x1235,0x0000 changes to + * 0x1234,0x8000 and we read it as 0x1234,0x0000). Pairs with + * the write barrier in stat64_min_slow. + */ + smp_rmb(); + orig_low = atomic_read(&s->low); + if (orig_low <= low) { + return; + } + + /* See if we were lucky and a writer raced against us. The + * barrier is theoretically unnecessary, but if we remove it + * we may miss being lucky. + */ + smp_rmb(); + orig_high = atomic_read(&s->high); + if (orig_high < high) { + return; + } + } + + /* If the value changes in any way, we have to take the lock. */ + } while (!stat64_min_slow(s, value)); +} + +static inline void stat64_max(Stat64 *s, uint64_t value) +{ + uint32_t low, high; + uint32_t orig_low, orig_high; + + high = value >> 32; + low = (uint32_t) value; + do { + orig_high = atomic_read(&s->high); + if (orig_high > high) { + return; + } + + if (orig_high == high) { + /* High 32 bits are equal. Read low after high, otherwise we + * can get a false positive (e.g. 0x1234,0x8000 changes to + * 0x1235,0x0000 and we read it as 0x1235,0x8000). Pairs with + * the write barrier in stat64_max_slow. + */ + smp_rmb(); + orig_low = atomic_read(&s->low); + if (orig_low >= low) { + return; + } + + /* See if we were lucky and a writer raced against us. The + * barrier is theoretically unnecessary, but if we remove it + * we may miss being lucky. + */ + smp_rmb(); + orig_high = atomic_read(&s->high); + if (orig_high > high) { + return; + } + } + + /* If the value changes in any way, we have to take the lock. */ + } while (!stat64_max_slow(s, value)); +} + +#endif + +#endif diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h index 840ad6134c..999eb2333a 100644 --- a/include/sysemu/block-backend.h +++ b/include/sysemu/block-backend.h @@ -72,15 +72,13 @@ typedef struct BlockDevOps { * fields that must be public. This is in particular for QLIST_ENTRY() and * friends so that BlockBackends can be kept in lists outside block-backend.c */ typedef struct BlockBackendPublic { - /* I/O throttling has its own locking, but also some fields are - * protected by the AioContext lock. - */ - - /* Protected by AioContext lock. */ + /* throttled_reqs_lock protects the CoQueues for throttled requests. */ + CoMutex throttled_reqs_lock; CoQueue throttled_reqs[2]; /* Nonzero if the I/O limits are currently being ignored; generally - * it is zero. */ + * it is zero. Accessed with atomic operations. + */ unsigned int io_limits_disabled; /* The following fields are protected by the ThrottleGroup lock. 
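
The block-backend.h hunk above introduces throttled_reqs_lock, a CoMutex protecting the two throttled_reqs CoQueues, and block/throttle-groups.c earlier in the patch pairs every qemu_co_queue_wait() on those queues with that lock, so the mutex is released atomically while the coroutine sleeps and re-taken on wakeup. The pthread sketch below is only an analogy for why the wait primitive must take the lock; it is not QEMU's coroutine API:

/*
 * Pthread analogy for the CoMutex/CoQueue pairing: a waiter must drop the
 * queue's lock atomically while it sleeps, so that a waker holding the same
 * lock cannot signal between the check and the wait.  Condition variables
 * give threads the same guarantee.  Illustrative code only.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

typedef struct {
    pthread_mutex_t lock;    /* plays the role of throttled_reqs_lock */
    pthread_cond_t queue;    /* plays the role of throttled_reqs[is_write] */
    bool may_run;            /* "the throttle budget has room for us" */
} ThrottleQueue;

static void wait_for_slot(ThrottleQueue *q)
{
    pthread_mutex_lock(&q->lock);
    while (!q->may_run) {
        /* The lock is released while sleeping and re-taken on wakeup, so a
         * concurrent restart cannot be lost between the check and the wait. */
        pthread_cond_wait(&q->queue, &q->lock);
    }
    q->may_run = false;
    pthread_mutex_unlock(&q->lock);
}

static void restart_queue(ThrottleQueue *q)
{
    pthread_mutex_lock(&q->lock);
    q->may_run = true;
    pthread_cond_signal(&q->queue);
    pthread_mutex_unlock(&q->lock);
}

static void *throttled_request(void *opaque)
{
    wait_for_slot(opaque);
    printf("request restarted\n");
    return NULL;
}

int main(void)
{
    ThrottleQueue q = {
        .lock = PTHREAD_MUTEX_INITIALIZER,
        .queue = PTHREAD_COND_INITIALIZER,
        .may_run = false,
    };
    pthread_t t;

    pthread_create(&t, NULL, throttled_request, &q);
    restart_queue(&q);   /* roughly what throttle_group_restart_queue() triggers */
    pthread_join(t, NULL);
    return 0;
}
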
diff --git a/migration/block.c b/migration/block.c index 3aae5a375e..7674ae1078 100644 --- a/migration/block.c +++ b/migration/block.c @@ -341,10 +341,8 @@ static int set_dirty_tracking(void) int ret; QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { - aio_context_acquire(blk_get_aio_context(bmds->blk)); bmds->dirty_bitmap = bdrv_create_dirty_bitmap(blk_bs(bmds->blk), BLOCK_SIZE, NULL, NULL); - aio_context_release(blk_get_aio_context(bmds->blk)); if (!bmds->dirty_bitmap) { ret = -errno; goto fail; @@ -355,9 +353,7 @@ static int set_dirty_tracking(void) fail: QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { if (bmds->dirty_bitmap) { - aio_context_acquire(blk_get_aio_context(bmds->blk)); bdrv_release_dirty_bitmap(blk_bs(bmds->blk), bmds->dirty_bitmap); - aio_context_release(blk_get_aio_context(bmds->blk)); } } return ret; @@ -370,9 +366,7 @@ static void unset_dirty_tracking(void) BlkMigDevState *bmds; QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { - aio_context_acquire(blk_get_aio_context(bmds->blk)); bdrv_release_dirty_bitmap(blk_bs(bmds->blk), bmds->dirty_bitmap); - aio_context_release(blk_get_aio_context(bmds->blk)); } } @@ -531,13 +525,16 @@ static int mig_save_device_dirty(QEMUFile *f, BlkMigDevState *bmds, } else { blk_mig_unlock(); } - if (bdrv_get_dirty(bs, bmds->dirty_bitmap, sector)) { - + bdrv_dirty_bitmap_lock(bmds->dirty_bitmap); + if (bdrv_get_dirty_locked(bs, bmds->dirty_bitmap, sector)) { if (total_sectors - sector < BDRV_SECTORS_PER_DIRTY_CHUNK) { nr_sectors = total_sectors - sector; } else { nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK; } + bdrv_reset_dirty_bitmap_locked(bmds->dirty_bitmap, sector, nr_sectors); + bdrv_dirty_bitmap_unlock(bmds->dirty_bitmap); + blk = g_new(BlkMigBlock, 1); blk->buf = g_malloc(BLOCK_SIZE); blk->bmds = bmds; @@ -570,12 +567,12 @@ static int mig_save_device_dirty(QEMUFile *f, BlkMigDevState *bmds, g_free(blk); } - bdrv_reset_dirty_bitmap(bmds->dirty_bitmap, sector, nr_sectors); sector += nr_sectors; bmds->cur_dirty = sector; - break; } + + bdrv_dirty_bitmap_unlock(bmds->dirty_bitmap); sector += BDRV_SECTORS_PER_DIRTY_CHUNK; bmds->cur_dirty = sector; } diff --git a/tests/docker/Makefile.include b/tests/docker/Makefile.include index 03eda37bf4..0ed8c3d323 100644 --- a/tests/docker/Makefile.include +++ b/tests/docker/Makefile.include @@ -126,7 +126,7 @@ docker-run: docker-qemu-src " COPYING $(EXECUTABLE) to $(IMAGE)")) $(call quiet-command, \ $(SRC_PATH)/tests/docker/docker.py run \ - -t \ + $(if $(NOUSER),,-u $(shell id -u)) -t \ $(if $V,,--rm) \ $(if $(DEBUG),-i,--net=none) \ -e TARGET_LIST=$(TARGET_LIST) \ diff --git a/tests/docker/dockerfiles/centos6.docker b/tests/docker/dockerfiles/centos6.docker index 34e0d3b91e..17a4d24d54 100644 --- a/tests/docker/dockerfiles/centos6.docker +++ b/tests/docker/dockerfiles/centos6.docker @@ -1,7 +1,7 @@ FROM centos:6 RUN yum install -y epel-release ENV PACKAGES libfdt-devel ccache \ - tar git make gcc g++ \ + tar git make gcc g++ flex bison \ zlib-devel glib2-devel SDL-devel pixman-devel \ epel-release RUN yum install -y $PACKAGES diff --git a/tests/docker/dockerfiles/fedora.docker b/tests/docker/dockerfiles/fedora.docker index c4f80ad3d8..4eaa8ed2a5 100644 --- a/tests/docker/dockerfiles/fedora.docker +++ b/tests/docker/dockerfiles/fedora.docker @@ -1,8 +1,8 @@ FROM fedora:latest ENV PACKAGES \ - ccache git tar PyYAML sparse flex bison python2 \ + ccache git tar PyYAML sparse flex bison python2 bzip2 hostname \ glib2-devel pixman-devel zlib-devel SDL-devel libfdt-devel \ - gcc 
gcc-c++ clang make perl which bc findutils \ + gcc gcc-c++ clang make perl which bc findutils libaio-devel \ mingw32-pixman mingw32-glib2 mingw32-gmp mingw32-SDL mingw32-pkg-config \ mingw32-gtk2 mingw32-gtk3 mingw32-gnutls mingw32-nettle mingw32-libtasn1 \ mingw32-libjpeg-turbo mingw32-libpng mingw32-curl mingw32-libssh2 \ diff --git a/util/Makefile.objs b/util/Makefile.objs index c6205ebf86..8a333d3dd7 100644 --- a/util/Makefile.objs +++ b/util/Makefile.objs @@ -42,4 +42,5 @@ util-obj-y += log.o util-obj-y += qdist.o util-obj-y += qht.o util-obj-y += range.o +util-obj-y += stats64.o util-obj-y += systemd.o diff --git a/util/stats64.c b/util/stats64.c new file mode 100644 index 0000000000..9968fcceac --- /dev/null +++ b/util/stats64.c @@ -0,0 +1,137 @@ +/* + * Atomic operations on 64-bit quantities. + * + * Copyright (C) 2017 Red Hat, Inc. + * + * Author: Paolo Bonzini <pbonzini@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qemu/atomic.h" +#include "qemu/stats64.h" +#include "qemu/processor.h" + +#ifndef CONFIG_ATOMIC64 +static inline void stat64_rdlock(Stat64 *s) +{ + /* Keep out incoming writers to avoid them starving us. */ + atomic_add(&s->lock, 2); + + /* If there is a concurrent writer, wait for it. */ + while (atomic_read(&s->lock) & 1) { + cpu_relax(); + } +} + +static inline void stat64_rdunlock(Stat64 *s) +{ + atomic_sub(&s->lock, 2); +} + +static inline bool stat64_wrtrylock(Stat64 *s) +{ + return atomic_cmpxchg(&s->lock, 0, 1) == 0; +} + +static inline void stat64_wrunlock(Stat64 *s) +{ + atomic_dec(&s->lock); +} + +uint64_t stat64_get(const Stat64 *s) +{ + uint32_t high, low; + + stat64_rdlock((Stat64 *)s); + + /* 64-bit writes always take the lock, so we can read in + * any order. + */ + high = atomic_read(&s->high); + low = atomic_read(&s->low); + stat64_rdunlock((Stat64 *)s); + + return ((uint64_t)high << 32) | low; +} + +bool stat64_add32_carry(Stat64 *s, uint32_t low, uint32_t high) +{ + uint32_t old; + + if (!stat64_wrtrylock(s)) { + cpu_relax(); + return false; + } + + /* 64-bit reads always take the lock, so they don't care about the + * order of our update. By updating s->low first, we can check + * whether we have to carry into s->high. + */ + old = atomic_fetch_add(&s->low, low); + high += (old + low) < old; + atomic_add(&s->high, high); + stat64_wrunlock(s); + return true; +} + +bool stat64_min_slow(Stat64 *s, uint64_t value) +{ + uint32_t high, low; + uint64_t orig; + + if (!stat64_wrtrylock(s)) { + cpu_relax(); + return false; + } + + high = atomic_read(&s->high); + low = atomic_read(&s->low); + + orig = ((uint64_t)high << 32) | low; + if (orig < value) { + /* We have to set low before high, just like stat64_min reads + * high before low. The value may become higher temporarily, but + * stat64_get does not notice (it takes the lock) and the only ill + * effect on stat64_min is that the slow path may be triggered + * unnecessarily. 
+ */ + atomic_set(&s->low, (uint32_t)value); + smp_wmb(); + atomic_set(&s->high, value >> 32); + } + stat64_wrunlock(s); + return true; +} + +bool stat64_max_slow(Stat64 *s, uint64_t value) +{ + uint32_t high, low; + uint64_t orig; + + if (!stat64_wrtrylock(s)) { + cpu_relax(); + return false; + } + + high = atomic_read(&s->high); + low = atomic_read(&s->low); + + orig = ((uint64_t)high << 32) | low; + if (orig > value) { + /* We have to set low before high, just like stat64_max reads + * high before low. The value may become lower temporarily, but + * stat64_get does not notice (it takes the lock) and the only ill + * effect on stat64_max is that the slow path may be triggered + * unnecessarily. + */ + atomic_set(&s->low, (uint32_t)value); + smp_wmb(); + atomic_set(&s->high, value >> 32); + } + stat64_wrunlock(s); + return true; +} +#endif |
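
util/stats64.c above provides the fallback for hosts without CONFIG_ATOMIC64: the 64-bit value is split into two 32-bit halves guarded by a small reader/writer spinlock in which readers add 2 to the lock word and a writer claims it with a 0 -> 1 compare-and-swap. A standalone C11 model of that locking protocol, with illustrative names rather than the QEMU source, looks like this:

/*
 * Standalone model of the reader/writer spinlock used when 64-bit atomics
 * are unavailable: readers add 2 to the lock word, a writer claims it with
 * cmpxchg(0, 1), and readers spin while the writer bit is set.  The demo in
 * main() is made up; this is not the QEMU source.
 */
#include <inttypes.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef struct {
    atomic_uint lock;        /* bit 0: writer present; higher bits: 2 * readers */
    atomic_uint low, high;   /* the two halves of the 64-bit statistic */
} SplitStat64;

static void rdlock(SplitStat64 *s)
{
    atomic_fetch_add(&s->lock, 2);       /* announce the reader first... */
    while (atomic_load(&s->lock) & 1) {
        /* ...then wait out any writer that was already holding the lock */
    }
}

static void rdunlock(SplitStat64 *s)
{
    atomic_fetch_sub(&s->lock, 2);
}

static bool wrtrylock(SplitStat64 *s)
{
    unsigned expected = 0;
    /* A writer only gets in when no reader and no other writer is registered */
    return atomic_compare_exchange_strong(&s->lock, &expected, 1);
}

static void wrunlock(SplitStat64 *s)
{
    atomic_fetch_sub(&s->lock, 1);
}

static uint64_t split_get(SplitStat64 *s)
{
    rdlock(s);
    /* Writers always hold the lock, so the halves can be read in any order */
    uint64_t v = ((uint64_t)atomic_load(&s->high) << 32) | atomic_load(&s->low);
    rdunlock(s);
    return v;
}

int main(void)
{
    SplitStat64 s = { 0 };

    if (wrtrylock(&s)) {                 /* the real code retries on failure */
        atomic_store(&s.low, 0x89abcdefu);
        atomic_store(&s.high, 0x01234567u);
        wrunlock(&s);
    }
    printf("value = 0x%016" PRIx64 "\n", split_get(&s));
    return 0;
}
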