diff options
author | Peter Maydell <peter.maydell@linaro.org> | 2017-06-20 16:01:15 +0100 |
---|---|---|
committer | Peter Maydell <peter.maydell@linaro.org> | 2017-06-20 16:01:15 +0100 |
commit | 65a0e3e842df7f73ff2e4a61948f992a41e570a8 (patch) | |
tree | b1364b41d6f6b270c589433478297e2a91289af6 /block | |
parent | 7e56accdaf35234b69c33c85e4a44a5d56325e53 (diff) | |
parent | 5b50bf77ce6e773b04a303a2912876c9a1bfca43 (diff) |
Merge remote-tracking branch 'remotes/famz/tags/docker-and-block-pull-request' into staging
# gpg: Signature made Fri 16 Jun 2017 01:18:46 BST
# gpg: using RSA key 0xCA35624C6A9171C6
# gpg: Good signature from "Fam Zheng <famz@redhat.com>"
# gpg: WARNING: This key is not certified with sufficiently trusted signatures!
# gpg: It is not certain that the signature belongs to the owner.
# Primary key fingerprint: 5003 7CB7 9706 0F76 F021 AD56 CA35 624C 6A91 71C6
* remotes/famz/tags/docker-and-block-pull-request: (23 commits)
block: make accounting thread-safe
block: split BlockAcctStats creation and setup
block: introduce block_account_one_io
block: protect modification of dirty bitmaps with a mutex
migration/block: reset dirty bitmap before reading
block: introduce dirty_bitmap_mutex
block: protect tracked_requests and flush_queue with reqs_lock
block: access write_gen with atomics
block: use Stat64 for wr_highest_offset
util: add stats64 module
throttle-groups: protect throttled requests with a CoMutex
throttle-groups: do not use qemu_co_enter_next
throttle-groups: only start one coroutine from drained_begin
block: access io_plugged with atomic ops
block: access wakeup with atomic ops
block: access serialising_in_flight with atomic ops
block: access io_limits_disabled with atomic ops
block: access quiesce_counter with atomic ops
block: access copy_on_read with atomic ops
docker: Add flex and bison to centos6 image
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Diffstat (limited to 'block')
-rw-r--r-- | block/accounting.c | 78 | ||||
-rw-r--r-- | block/block-backend.c | 6 | ||||
-rw-r--r-- | block/dirty-bitmap.c | 112 | ||||
-rw-r--r-- | block/io.c | 51 | ||||
-rw-r--r-- | block/mirror.c | 14 | ||||
-rw-r--r-- | block/nfs.c | 4 | ||||
-rw-r--r-- | block/qapi.c | 2 | ||||
-rw-r--r-- | block/sheepdog.c | 3 | ||||
-rw-r--r-- | block/throttle-groups.c | 91 |
9 files changed, 267 insertions, 94 deletions
diff --git a/block/accounting.c b/block/accounting.c index 3f457c4e73..87ef5bbfaa 100644 --- a/block/accounting.c +++ b/block/accounting.c @@ -32,23 +32,28 @@ static QEMUClockType clock_type = QEMU_CLOCK_REALTIME; static const int qtest_latency_ns = NANOSECONDS_PER_SECOND / 1000; -void block_acct_init(BlockAcctStats *stats, bool account_invalid, - bool account_failed) +void block_acct_init(BlockAcctStats *stats) { - stats->account_invalid = account_invalid; - stats->account_failed = account_failed; - + qemu_mutex_init(&stats->lock); if (qtest_enabled()) { clock_type = QEMU_CLOCK_VIRTUAL; } } +void block_acct_setup(BlockAcctStats *stats, bool account_invalid, + bool account_failed) +{ + stats->account_invalid = account_invalid; + stats->account_failed = account_failed; +} + void block_acct_cleanup(BlockAcctStats *stats) { BlockAcctTimedStats *s, *next; QSLIST_FOREACH_SAFE(s, &stats->intervals, entries, next) { g_free(s); } + qemu_mutex_destroy(&stats->lock); } void block_acct_add_interval(BlockAcctStats *stats, unsigned interval_length) @@ -58,12 +63,15 @@ void block_acct_add_interval(BlockAcctStats *stats, unsigned interval_length) s = g_new0(BlockAcctTimedStats, 1); s->interval_length = interval_length; + s->stats = stats; + qemu_mutex_lock(&stats->lock); QSLIST_INSERT_HEAD(&stats->intervals, s, entries); for (i = 0; i < BLOCK_MAX_IOTYPE; i++) { timed_average_init(&s->latency[i], clock_type, (uint64_t) interval_length * NANOSECONDS_PER_SECOND); } + qemu_mutex_unlock(&stats->lock); } BlockAcctTimedStats *block_acct_interval_next(BlockAcctStats *stats, @@ -86,7 +94,8 @@ void block_acct_start(BlockAcctStats *stats, BlockAcctCookie *cookie, cookie->type = type; } -void block_acct_done(BlockAcctStats *stats, BlockAcctCookie *cookie) +static void block_account_one_io(BlockAcctStats *stats, BlockAcctCookie *cookie, + bool failed) { BlockAcctTimedStats *s; int64_t time_ns = qemu_clock_get_ns(clock_type); @@ -98,31 +107,16 @@ void block_acct_done(BlockAcctStats *stats, BlockAcctCookie *cookie) assert(cookie->type < BLOCK_MAX_IOTYPE); - stats->nr_bytes[cookie->type] += cookie->bytes; - stats->nr_ops[cookie->type]++; - stats->total_time_ns[cookie->type] += latency_ns; - stats->last_access_time_ns = time_ns; + qemu_mutex_lock(&stats->lock); - QSLIST_FOREACH(s, &stats->intervals, entries) { - timed_average_account(&s->latency[cookie->type], latency_ns); + if (failed) { + stats->failed_ops[cookie->type]++; + } else { + stats->nr_bytes[cookie->type] += cookie->bytes; + stats->nr_ops[cookie->type]++; } -} - -void block_acct_failed(BlockAcctStats *stats, BlockAcctCookie *cookie) -{ - assert(cookie->type < BLOCK_MAX_IOTYPE); - - stats->failed_ops[cookie->type]++; - - if (stats->account_failed) { - BlockAcctTimedStats *s; - int64_t time_ns = qemu_clock_get_ns(clock_type); - int64_t latency_ns = time_ns - cookie->start_time_ns; - - if (qtest_enabled()) { - latency_ns = qtest_latency_ns; - } + if (!failed || stats->account_failed) { stats->total_time_ns[cookie->type] += latency_ns; stats->last_access_time_ns = time_ns; @@ -130,29 +124,45 @@ void block_acct_failed(BlockAcctStats *stats, BlockAcctCookie *cookie) timed_average_account(&s->latency[cookie->type], latency_ns); } } + + qemu_mutex_unlock(&stats->lock); +} + +void block_acct_done(BlockAcctStats *stats, BlockAcctCookie *cookie) +{ + block_account_one_io(stats, cookie, false); +} + +void block_acct_failed(BlockAcctStats *stats, BlockAcctCookie *cookie) +{ + block_account_one_io(stats, cookie, true); } void block_acct_invalid(BlockAcctStats *stats, enum BlockAcctType type) { assert(type < BLOCK_MAX_IOTYPE); - /* block_acct_done() and block_acct_failed() update - * total_time_ns[], but this one does not. The reason is that - * invalid requests are accounted during their submission, - * therefore there's no actual I/O involved. */ - + /* block_account_one_io() updates total_time_ns[], but this one does + * not. The reason is that invalid requests are accounted during their + * submission, therefore there's no actual I/O involved. + */ + qemu_mutex_lock(&stats->lock); stats->invalid_ops[type]++; if (stats->account_invalid) { stats->last_access_time_ns = qemu_clock_get_ns(clock_type); } + qemu_mutex_unlock(&stats->lock); } void block_acct_merge_done(BlockAcctStats *stats, enum BlockAcctType type, int num_requests) { assert(type < BLOCK_MAX_IOTYPE); + + qemu_mutex_lock(&stats->lock); stats->merged[type] += num_requests; + qemu_mutex_unlock(&stats->lock); } int64_t block_acct_idle_time_ns(BlockAcctStats *stats) @@ -167,7 +177,9 @@ double block_acct_queue_depth(BlockAcctTimedStats *stats, assert(type < BLOCK_MAX_IOTYPE); + qemu_mutex_lock(&stats->stats->lock); sum = timed_average_sum(&stats->latency[type], &elapsed); + qemu_mutex_unlock(&stats->stats->lock); return (double) sum / elapsed; } diff --git a/block/block-backend.c b/block/block-backend.c index 7d7f3697d1..a2bbae90b1 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -216,8 +216,10 @@ BlockBackend *blk_new(uint64_t perm, uint64_t shared_perm) blk->shared_perm = shared_perm; blk_set_enable_write_cache(blk, true); + qemu_co_mutex_init(&blk->public.throttled_reqs_lock); qemu_co_queue_init(&blk->public.throttled_reqs[0]); qemu_co_queue_init(&blk->public.throttled_reqs[1]); + block_acct_init(&blk->stats); notifier_list_init(&blk->remove_bs_notifiers); notifier_list_init(&blk->insert_bs_notifiers); @@ -1953,7 +1955,7 @@ static void blk_root_drained_begin(BdrvChild *child) /* Note that blk->root may not be accessible here yet if we are just * attaching to a BlockDriverState that is drained. Use child instead. */ - if (blk->public.io_limits_disabled++ == 0) { + if (atomic_fetch_inc(&blk->public.io_limits_disabled) == 0) { throttle_group_restart_blk(blk); } } @@ -1964,7 +1966,7 @@ static void blk_root_drained_end(BdrvChild *child) assert(blk->quiesce_counter); assert(blk->public.io_limits_disabled); - --blk->public.io_limits_disabled; + atomic_dec(&blk->public.io_limits_disabled); if (--blk->quiesce_counter == 0) { if (blk->dev_ops && blk->dev_ops->drained_end) { diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c index 519737c8d3..a04c6e4154 100644 --- a/block/dirty-bitmap.c +++ b/block/dirty-bitmap.c @@ -37,6 +37,7 @@ * or enabled. A frozen bitmap can only abdicate() or reclaim(). */ struct BdrvDirtyBitmap { + QemuMutex *mutex; HBitmap *bitmap; /* Dirty sector bitmap implementation */ HBitmap *meta; /* Meta dirty bitmap */ BdrvDirtyBitmap *successor; /* Anonymous child; implies frozen status */ @@ -52,6 +53,27 @@ struct BdrvDirtyBitmapIter { BdrvDirtyBitmap *bitmap; }; +static inline void bdrv_dirty_bitmaps_lock(BlockDriverState *bs) +{ + qemu_mutex_lock(&bs->dirty_bitmap_mutex); +} + +static inline void bdrv_dirty_bitmaps_unlock(BlockDriverState *bs) +{ + qemu_mutex_unlock(&bs->dirty_bitmap_mutex); +} + +void bdrv_dirty_bitmap_lock(BdrvDirtyBitmap *bitmap) +{ + qemu_mutex_lock(bitmap->mutex); +} + +void bdrv_dirty_bitmap_unlock(BdrvDirtyBitmap *bitmap) +{ + qemu_mutex_unlock(bitmap->mutex); +} + +/* Called with BQL or dirty_bitmap lock taken. */ BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs, const char *name) { BdrvDirtyBitmap *bm; @@ -65,6 +87,7 @@ BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs, const char *name) return NULL; } +/* Called with BQL taken. */ void bdrv_dirty_bitmap_make_anon(BdrvDirtyBitmap *bitmap) { assert(!bdrv_dirty_bitmap_frozen(bitmap)); @@ -72,6 +95,7 @@ void bdrv_dirty_bitmap_make_anon(BdrvDirtyBitmap *bitmap) bitmap->name = NULL; } +/* Called with BQL taken. */ BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, uint32_t granularity, const char *name, @@ -96,11 +120,14 @@ BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, return NULL; } bitmap = g_new0(BdrvDirtyBitmap, 1); + bitmap->mutex = &bs->dirty_bitmap_mutex; bitmap->bitmap = hbitmap_alloc(bitmap_size, ctz32(sector_granularity)); bitmap->size = bitmap_size; bitmap->name = g_strdup(name); bitmap->disabled = false; + bdrv_dirty_bitmaps_lock(bs); QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list); + bdrv_dirty_bitmaps_unlock(bs); return bitmap; } @@ -119,20 +146,24 @@ void bdrv_create_meta_dirty_bitmap(BdrvDirtyBitmap *bitmap, int chunk_size) { assert(!bitmap->meta); + qemu_mutex_lock(bitmap->mutex); bitmap->meta = hbitmap_create_meta(bitmap->bitmap, chunk_size * BITS_PER_BYTE); + qemu_mutex_unlock(bitmap->mutex); } void bdrv_release_meta_dirty_bitmap(BdrvDirtyBitmap *bitmap) { assert(bitmap->meta); + qemu_mutex_lock(bitmap->mutex); hbitmap_free_meta(bitmap->bitmap); bitmap->meta = NULL; + qemu_mutex_unlock(bitmap->mutex); } -int bdrv_dirty_bitmap_get_meta(BlockDriverState *bs, - BdrvDirtyBitmap *bitmap, int64_t sector, - int nb_sectors) +int bdrv_dirty_bitmap_get_meta_locked(BlockDriverState *bs, + BdrvDirtyBitmap *bitmap, int64_t sector, + int nb_sectors) { uint64_t i; int sectors_per_bit = 1 << hbitmap_granularity(bitmap->meta); @@ -147,11 +178,26 @@ int bdrv_dirty_bitmap_get_meta(BlockDriverState *bs, return false; } +int bdrv_dirty_bitmap_get_meta(BlockDriverState *bs, + BdrvDirtyBitmap *bitmap, int64_t sector, + int nb_sectors) +{ + bool dirty; + + qemu_mutex_lock(bitmap->mutex); + dirty = bdrv_dirty_bitmap_get_meta_locked(bs, bitmap, sector, nb_sectors); + qemu_mutex_unlock(bitmap->mutex); + + return dirty; +} + void bdrv_dirty_bitmap_reset_meta(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector, int nb_sectors) { + qemu_mutex_lock(bitmap->mutex); hbitmap_reset(bitmap->meta, sector, nb_sectors); + qemu_mutex_unlock(bitmap->mutex); } int64_t bdrv_dirty_bitmap_size(const BdrvDirtyBitmap *bitmap) @@ -164,16 +210,19 @@ const char *bdrv_dirty_bitmap_name(const BdrvDirtyBitmap *bitmap) return bitmap->name; } +/* Called with BQL taken. */ bool bdrv_dirty_bitmap_frozen(BdrvDirtyBitmap *bitmap) { return bitmap->successor; } +/* Called with BQL taken. */ bool bdrv_dirty_bitmap_enabled(BdrvDirtyBitmap *bitmap) { return !(bitmap->disabled || bitmap->successor); } +/* Called with BQL taken. */ DirtyBitmapStatus bdrv_dirty_bitmap_status(BdrvDirtyBitmap *bitmap) { if (bdrv_dirty_bitmap_frozen(bitmap)) { @@ -188,6 +237,7 @@ DirtyBitmapStatus bdrv_dirty_bitmap_status(BdrvDirtyBitmap *bitmap) /** * Create a successor bitmap destined to replace this bitmap after an operation. * Requires that the bitmap is not frozen and has no successor. + * Called with BQL taken. */ int bdrv_dirty_bitmap_create_successor(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, Error **errp) @@ -220,6 +270,7 @@ int bdrv_dirty_bitmap_create_successor(BlockDriverState *bs, /** * For a bitmap with a successor, yield our name to the successor, * delete the old bitmap, and return a handle to the new bitmap. + * Called with BQL taken. */ BdrvDirtyBitmap *bdrv_dirty_bitmap_abdicate(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, @@ -247,6 +298,7 @@ BdrvDirtyBitmap *bdrv_dirty_bitmap_abdicate(BlockDriverState *bs, * In cases of failure where we can no longer safely delete the parent, * we may wish to re-join the parent and child/successor. * The merged parent will be un-frozen, but not explicitly re-enabled. + * Called with BQL taken. */ BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *parent, @@ -271,25 +323,30 @@ BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap(BlockDriverState *bs, /** * Truncates _all_ bitmaps attached to a BDS. + * Called with BQL taken. */ void bdrv_dirty_bitmap_truncate(BlockDriverState *bs) { BdrvDirtyBitmap *bitmap; uint64_t size = bdrv_nb_sectors(bs); + bdrv_dirty_bitmaps_lock(bs); QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) { assert(!bdrv_dirty_bitmap_frozen(bitmap)); assert(!bitmap->active_iterators); hbitmap_truncate(bitmap->bitmap, size); bitmap->size = size; } + bdrv_dirty_bitmaps_unlock(bs); } +/* Called with BQL taken. */ static void bdrv_do_release_matching_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, bool only_named) { BdrvDirtyBitmap *bm, *next; + bdrv_dirty_bitmaps_lock(bs); QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) { if ((!bitmap || bm == bitmap) && (!only_named || bm->name)) { assert(!bm->active_iterators); @@ -301,15 +358,19 @@ static void bdrv_do_release_matching_dirty_bitmap(BlockDriverState *bs, g_free(bm); if (bitmap) { - return; + goto out; } } } if (bitmap) { abort(); } + +out: + bdrv_dirty_bitmaps_unlock(bs); } +/* Called with BQL taken. */ void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap) { bdrv_do_release_matching_dirty_bitmap(bs, bitmap, false); @@ -318,18 +379,21 @@ void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap) /** * Release all named dirty bitmaps attached to a BDS (for use in bdrv_close()). * There must not be any frozen bitmaps attached. + * Called with BQL taken. */ void bdrv_release_named_dirty_bitmaps(BlockDriverState *bs) { bdrv_do_release_matching_dirty_bitmap(bs, NULL, true); } +/* Called with BQL taken. */ void bdrv_disable_dirty_bitmap(BdrvDirtyBitmap *bitmap) { assert(!bdrv_dirty_bitmap_frozen(bitmap)); bitmap->disabled = true; } +/* Called with BQL taken. */ void bdrv_enable_dirty_bitmap(BdrvDirtyBitmap *bitmap) { assert(!bdrv_dirty_bitmap_frozen(bitmap)); @@ -342,6 +406,7 @@ BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs) BlockDirtyInfoList *list = NULL; BlockDirtyInfoList **plist = &list; + bdrv_dirty_bitmaps_lock(bs); QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) { BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1); BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1); @@ -354,12 +419,14 @@ BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs) *plist = entry; plist = &entry->next; } + bdrv_dirty_bitmaps_unlock(bs); return list; } -int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, - int64_t sector) +/* Called within bdrv_dirty_bitmap_lock..unlock */ +int bdrv_get_dirty_locked(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, + int64_t sector) { if (bitmap) { return hbitmap_get(bitmap->bitmap, sector); @@ -432,23 +499,42 @@ int64_t bdrv_dirty_iter_next(BdrvDirtyBitmapIter *iter) return hbitmap_iter_next(&iter->hbi); } +/* Called within bdrv_dirty_bitmap_lock..unlock */ +void bdrv_set_dirty_bitmap_locked(BdrvDirtyBitmap *bitmap, + int64_t cur_sector, int64_t nr_sectors) +{ + assert(bdrv_dirty_bitmap_enabled(bitmap)); + hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors); +} + void bdrv_set_dirty_bitmap(BdrvDirtyBitmap *bitmap, int64_t cur_sector, int64_t nr_sectors) { + bdrv_dirty_bitmap_lock(bitmap); + bdrv_set_dirty_bitmap_locked(bitmap, cur_sector, nr_sectors); + bdrv_dirty_bitmap_unlock(bitmap); +} + +/* Called within bdrv_dirty_bitmap_lock..unlock */ +void bdrv_reset_dirty_bitmap_locked(BdrvDirtyBitmap *bitmap, + int64_t cur_sector, int64_t nr_sectors) +{ assert(bdrv_dirty_bitmap_enabled(bitmap)); - hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors); + hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors); } void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap, int64_t cur_sector, int64_t nr_sectors) { - assert(bdrv_dirty_bitmap_enabled(bitmap)); - hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors); + bdrv_dirty_bitmap_lock(bitmap); + bdrv_reset_dirty_bitmap_locked(bitmap, cur_sector, nr_sectors); + bdrv_dirty_bitmap_unlock(bitmap); } void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out) { assert(bdrv_dirty_bitmap_enabled(bitmap)); + bdrv_dirty_bitmap_lock(bitmap); if (!out) { hbitmap_reset_all(bitmap->bitmap); } else { @@ -457,6 +543,7 @@ void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out) hbitmap_granularity(backup)); *out = backup; } + bdrv_dirty_bitmap_unlock(bitmap); } void bdrv_undo_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *in) @@ -508,12 +595,19 @@ void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, int64_t nr_sectors) { BdrvDirtyBitmap *bitmap; + + if (QLIST_EMPTY(&bs->dirty_bitmaps)) { + return; + } + + bdrv_dirty_bitmaps_lock(bs); QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) { if (!bdrv_dirty_bitmap_enabled(bitmap)) { continue; } hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors); } + bdrv_dirty_bitmaps_unlock(bs); } /** diff --git a/block/io.c b/block/io.c index ed31810c0a..91611ffb2a 100644 --- a/block/io.c +++ b/block/io.c @@ -130,13 +130,13 @@ void bdrv_refresh_limits(BlockDriverState *bs, Error **errp) */ void bdrv_enable_copy_on_read(BlockDriverState *bs) { - bs->copy_on_read++; + atomic_inc(&bs->copy_on_read); } void bdrv_disable_copy_on_read(BlockDriverState *bs) { - assert(bs->copy_on_read > 0); - bs->copy_on_read--; + int old = atomic_fetch_dec(&bs->copy_on_read); + assert(old >= 1); } /* Check if any requests are in-flight (including throttled requests) */ @@ -241,7 +241,7 @@ void bdrv_drained_begin(BlockDriverState *bs) return; } - if (!bs->quiesce_counter++) { + if (atomic_fetch_inc(&bs->quiesce_counter) == 0) { aio_disable_external(bdrv_get_aio_context(bs)); bdrv_parent_drained_begin(bs); } @@ -252,7 +252,7 @@ void bdrv_drained_begin(BlockDriverState *bs) void bdrv_drained_end(BlockDriverState *bs) { assert(bs->quiesce_counter > 0); - if (--bs->quiesce_counter > 0) { + if (atomic_fetch_dec(&bs->quiesce_counter) > 1) { return; } @@ -375,11 +375,13 @@ void bdrv_drain_all(void) static void tracked_request_end(BdrvTrackedRequest *req) { if (req->serialising) { - req->bs->serialising_in_flight--; + atomic_dec(&req->bs->serialising_in_flight); } + qemu_co_mutex_lock(&req->bs->reqs_lock); QLIST_REMOVE(req, list); qemu_co_queue_restart_all(&req->wait_queue); + qemu_co_mutex_unlock(&req->bs->reqs_lock); } /** @@ -404,7 +406,9 @@ static void tracked_request_begin(BdrvTrackedRequest *req, qemu_co_queue_init(&req->wait_queue); + qemu_co_mutex_lock(&bs->reqs_lock); QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); + qemu_co_mutex_unlock(&bs->reqs_lock); } static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) @@ -414,7 +418,7 @@ static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) - overlap_offset; if (!req->serialising) { - req->bs->serialising_in_flight++; + atomic_inc(&req->bs->serialising_in_flight); req->serialising = true; } @@ -501,7 +505,8 @@ static void dummy_bh_cb(void *opaque) void bdrv_wakeup(BlockDriverState *bs) { - if (bs->wakeup) { + /* The barrier (or an atomic op) is in the caller. */ + if (atomic_read(&bs->wakeup)) { aio_bh_schedule_oneshot(qemu_get_aio_context(), dummy_bh_cb, NULL); } } @@ -519,12 +524,13 @@ static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self) bool retry; bool waited = false; - if (!bs->serialising_in_flight) { + if (!atomic_read(&bs->serialising_in_flight)) { return false; } do { retry = false; + qemu_co_mutex_lock(&bs->reqs_lock); QLIST_FOREACH(req, &bs->tracked_requests, list) { if (req == self || (!req->serialising && !self->serialising)) { continue; @@ -543,7 +549,7 @@ static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self) * (instead of producing a deadlock in the former case). */ if (!req->waiting_for) { self->waiting_for = req; - qemu_co_queue_wait(&req->wait_queue, NULL); + qemu_co_queue_wait(&req->wait_queue, &bs->reqs_lock); self->waiting_for = NULL; retry = true; waited = true; @@ -551,6 +557,7 @@ static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self) } } } + qemu_co_mutex_unlock(&bs->reqs_lock); } while (retry); return waited; @@ -1144,7 +1151,7 @@ int coroutine_fn bdrv_co_preadv(BdrvChild *child, bdrv_inc_in_flight(bs); /* Don't do copy-on-read if we read data before write operation */ - if (bs->copy_on_read && !(flags & BDRV_REQ_NO_SERIALISING)) { + if (atomic_read(&bs->copy_on_read) && !(flags & BDRV_REQ_NO_SERIALISING)) { flags |= BDRV_REQ_COPY_ON_READ; } @@ -1401,12 +1408,10 @@ static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child, } bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE); - ++bs->write_gen; + atomic_inc(&bs->write_gen); bdrv_set_dirty(bs, start_sector, end_sector - start_sector); - if (bs->wr_highest_offset < offset + bytes) { - bs->wr_highest_offset = offset + bytes; - } + stat64_max(&bs->wr_highest_offset, offset + bytes); if (ret >= 0) { bs->total_sectors = MAX(bs->total_sectors, end_sector); @@ -2292,14 +2297,17 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs) goto early_exit; } - current_gen = bs->write_gen; + qemu_co_mutex_lock(&bs->reqs_lock); + current_gen = atomic_read(&bs->write_gen); /* Wait until any previous flushes are completed */ while (bs->active_flush_req) { - qemu_co_queue_wait(&bs->flush_queue, NULL); + qemu_co_queue_wait(&bs->flush_queue, &bs->reqs_lock); } + /* Flushes reach this point in nondecreasing current_gen order. */ bs->active_flush_req = true; + qemu_co_mutex_unlock(&bs->reqs_lock); /* Write back all layers by calling one driver function */ if (bs->drv->bdrv_co_flush) { @@ -2371,9 +2379,12 @@ out: if (ret == 0) { bs->flushed_gen = current_gen; } + + qemu_co_mutex_lock(&bs->reqs_lock); bs->active_flush_req = false; /* Return value is ignored - it's ok if wait queue is empty */ qemu_co_queue_next(&bs->flush_queue); + qemu_co_mutex_unlock(&bs->reqs_lock); early_exit: bdrv_dec_in_flight(bs); @@ -2517,7 +2528,7 @@ int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset, } ret = 0; out: - ++bs->write_gen; + atomic_inc(&bs->write_gen); bdrv_set_dirty(bs, req.offset >> BDRV_SECTOR_BITS, req.bytes >> BDRV_SECTOR_BITS); tracked_request_end(&req); @@ -2644,7 +2655,7 @@ void bdrv_io_plug(BlockDriverState *bs) bdrv_io_plug(child->bs); } - if (bs->io_plugged++ == 0) { + if (atomic_fetch_inc(&bs->io_plugged) == 0) { BlockDriver *drv = bs->drv; if (drv && drv->bdrv_io_plug) { drv->bdrv_io_plug(bs); @@ -2657,7 +2668,7 @@ void bdrv_io_unplug(BlockDriverState *bs) BdrvChild *child; assert(bs->io_plugged); - if (--bs->io_plugged == 0) { + if (atomic_fetch_dec(&bs->io_plugged) == 1) { BlockDriver *drv = bs->drv; if (drv && drv->bdrv_io_unplug) { drv->bdrv_io_unplug(bs); diff --git a/block/mirror.c b/block/mirror.c index a2a970301c..19afcc6f1a 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -342,6 +342,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) int max_io_sectors = MAX((s->buf_size >> BDRV_SECTOR_BITS) / MAX_IN_FLIGHT, MAX_IO_SECTORS); + bdrv_dirty_bitmap_lock(s->dirty_bitmap); sector_num = bdrv_dirty_iter_next(s->dbi); if (sector_num < 0) { bdrv_set_dirty_iter(s->dbi, 0); @@ -349,6 +350,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) trace_mirror_restart_iter(s, bdrv_get_dirty_count(s->dirty_bitmap)); assert(sector_num >= 0); } + bdrv_dirty_bitmap_unlock(s->dirty_bitmap); first_chunk = sector_num / sectors_per_chunk; while (test_bit(first_chunk, s->in_flight_bitmap)) { @@ -360,12 +362,13 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) /* Find the number of consective dirty chunks following the first dirty * one, and wait for in flight requests in them. */ + bdrv_dirty_bitmap_lock(s->dirty_bitmap); while (nb_chunks * sectors_per_chunk < (s->buf_size >> BDRV_SECTOR_BITS)) { int64_t next_dirty; int64_t next_sector = sector_num + nb_chunks * sectors_per_chunk; int64_t next_chunk = next_sector / sectors_per_chunk; if (next_sector >= end || - !bdrv_get_dirty(source, s->dirty_bitmap, next_sector)) { + !bdrv_get_dirty_locked(source, s->dirty_bitmap, next_sector)) { break; } if (test_bit(next_chunk, s->in_flight_bitmap)) { @@ -386,8 +389,10 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) * calling bdrv_get_block_status_above could yield - if some blocks are * marked dirty in this window, we need to know. */ - bdrv_reset_dirty_bitmap(s->dirty_bitmap, sector_num, - nb_chunks * sectors_per_chunk); + bdrv_reset_dirty_bitmap_locked(s->dirty_bitmap, sector_num, + nb_chunks * sectors_per_chunk); + bdrv_dirty_bitmap_unlock(s->dirty_bitmap); + bitmap_set(s->in_flight_bitmap, sector_num / sectors_per_chunk, nb_chunks); while (nb_chunks > 0 && sector_num < end) { int64_t ret; @@ -506,6 +511,8 @@ static void mirror_exit(BlockJob *job, void *opaque) BlockDriverState *mirror_top_bs = s->mirror_top_bs; Error *local_err = NULL; + bdrv_release_dirty_bitmap(src, s->dirty_bitmap); + /* Make sure that the source BDS doesn't go away before we called * block_job_completed(). */ bdrv_ref(src); @@ -904,7 +911,6 @@ immediate_exit: g_free(s->cow_bitmap); g_free(s->in_flight_bitmap); bdrv_dirty_iter_free(s->dbi); - bdrv_release_dirty_bitmap(bs, s->dirty_bitmap); data = g_malloc(sizeof(*data)); data->ret = ret; diff --git a/block/nfs.c b/block/nfs.c index 848b2c0bb0..18c87d2f25 100644 --- a/block/nfs.c +++ b/block/nfs.c @@ -730,7 +730,9 @@ nfs_get_allocated_file_size_cb(int ret, struct nfs_context *nfs, void *data, if (task->ret < 0) { error_report("NFS Error: %s", nfs_get_error(nfs)); } - task->complete = 1; + + /* Set task->complete before reading bs->wakeup. */ + atomic_mb_set(&task->complete, 1); bdrv_wakeup(task->bs); } diff --git a/block/qapi.c b/block/qapi.c index a40922ea26..14b60ae66c 100644 --- a/block/qapi.c +++ b/block/qapi.c @@ -441,7 +441,7 @@ static BlockStats *bdrv_query_bds_stats(const BlockDriverState *bs, s->node_name = g_strdup(bdrv_get_node_name(bs)); } - s->stats->wr_highest_offset = bs->wr_highest_offset; + s->stats->wr_highest_offset = stat64_get(&bs->wr_highest_offset); if (bs->file) { s->has_parent = true; diff --git a/block/sheepdog.c b/block/sheepdog.c index a18315a1ca..5ebf5d9fbb 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -698,7 +698,8 @@ out: srco->co = NULL; srco->ret = ret; - srco->finished = true; + /* Set srco->finished before reading bs->wakeup. */ + atomic_mb_set(&srco->finished, true); if (srco->bs) { bdrv_wakeup(srco->bs); } diff --git a/block/throttle-groups.c b/block/throttle-groups.c index b73e7a800b..a181cb1dee 100644 --- a/block/throttle-groups.c +++ b/block/throttle-groups.c @@ -240,7 +240,7 @@ static bool throttle_group_schedule_timer(BlockBackend *blk, bool is_write) ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); bool must_wait; - if (blkp->io_limits_disabled) { + if (atomic_read(&blkp->io_limits_disabled)) { return false; } @@ -260,6 +260,25 @@ static bool throttle_group_schedule_timer(BlockBackend *blk, bool is_write) return must_wait; } +/* Start the next pending I/O request for a BlockBackend. Return whether + * any request was actually pending. + * + * @blk: the current BlockBackend + * @is_write: the type of operation (read/write) + */ +static bool coroutine_fn throttle_group_co_restart_queue(BlockBackend *blk, + bool is_write) +{ + BlockBackendPublic *blkp = blk_get_public(blk); + bool ret; + + qemu_co_mutex_lock(&blkp->throttled_reqs_lock); + ret = qemu_co_queue_next(&blkp->throttled_reqs[is_write]); + qemu_co_mutex_unlock(&blkp->throttled_reqs_lock); + + return ret; +} + /* Look for the next pending I/O request and schedule it. * * This assumes that tg->lock is held. @@ -287,12 +306,12 @@ static void schedule_next_request(BlockBackend *blk, bool is_write) if (!must_wait) { /* Give preference to requests from the current blk */ if (qemu_in_coroutine() && - qemu_co_queue_next(&blkp->throttled_reqs[is_write])) { + throttle_group_co_restart_queue(blk, is_write)) { token = blk; } else { ThrottleTimers *tt = &blk_get_public(token)->throttle_timers; int64_t now = qemu_clock_get_ns(tt->clock_type); - timer_mod(tt->timers[is_write], now + 1); + timer_mod(tt->timers[is_write], now); tg->any_timer_armed[is_write] = true; } tg->tokens[is_write] = token; @@ -326,7 +345,10 @@ void coroutine_fn throttle_group_co_io_limits_intercept(BlockBackend *blk, if (must_wait || blkp->pending_reqs[is_write]) { blkp->pending_reqs[is_write]++; qemu_mutex_unlock(&tg->lock); - qemu_co_queue_wait(&blkp->throttled_reqs[is_write], NULL); + qemu_co_mutex_lock(&blkp->throttled_reqs_lock); + qemu_co_queue_wait(&blkp->throttled_reqs[is_write], + &blkp->throttled_reqs_lock); + qemu_co_mutex_unlock(&blkp->throttled_reqs_lock); qemu_mutex_lock(&tg->lock); blkp->pending_reqs[is_write]--; } @@ -340,15 +362,50 @@ void coroutine_fn throttle_group_co_io_limits_intercept(BlockBackend *blk, qemu_mutex_unlock(&tg->lock); } +typedef struct { + BlockBackend *blk; + bool is_write; +} RestartData; + +static void coroutine_fn throttle_group_restart_queue_entry(void *opaque) +{ + RestartData *data = opaque; + BlockBackend *blk = data->blk; + bool is_write = data->is_write; + BlockBackendPublic *blkp = blk_get_public(blk); + ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, ts); + bool empty_queue; + + empty_queue = !throttle_group_co_restart_queue(blk, is_write); + + /* If the request queue was empty then we have to take care of + * scheduling the next one */ + if (empty_queue) { + qemu_mutex_lock(&tg->lock); + schedule_next_request(blk, is_write); + qemu_mutex_unlock(&tg->lock); + } +} + +static void throttle_group_restart_queue(BlockBackend *blk, bool is_write) +{ + Coroutine *co; + RestartData rd = { + .blk = blk, + .is_write = is_write + }; + + co = qemu_coroutine_create(throttle_group_restart_queue_entry, &rd); + aio_co_enter(blk_get_aio_context(blk), co); +} + void throttle_group_restart_blk(BlockBackend *blk) { BlockBackendPublic *blkp = blk_get_public(blk); - int i; - for (i = 0; i < 2; i++) { - while (qemu_co_enter_next(&blkp->throttled_reqs[i])) { - ; - } + if (blkp->throttle_state) { + throttle_group_restart_queue(blk, 0); + throttle_group_restart_queue(blk, 1); } } @@ -376,8 +433,7 @@ void throttle_group_config(BlockBackend *blk, ThrottleConfig *cfg) throttle_config(ts, tt, cfg); qemu_mutex_unlock(&tg->lock); - qemu_co_enter_next(&blkp->throttled_reqs[0]); - qemu_co_enter_next(&blkp->throttled_reqs[1]); + throttle_group_restart_blk(blk); } /* Get the throttle configuration from a particular group. Similar to @@ -408,7 +464,6 @@ static void timer_cb(BlockBackend *blk, bool is_write) BlockBackendPublic *blkp = blk_get_public(blk); ThrottleState *ts = blkp->throttle_state; ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); - bool empty_queue; /* The timer has just been fired, so we can update the flag */ qemu_mutex_lock(&tg->lock); @@ -416,17 +471,7 @@ static void timer_cb(BlockBackend *blk, bool is_write) qemu_mutex_unlock(&tg->lock); /* Run the request that was waiting for this timer */ - aio_context_acquire(blk_get_aio_context(blk)); - empty_queue = !qemu_co_enter_next(&blkp->throttled_reqs[is_write]); - aio_context_release(blk_get_aio_context(blk)); - - /* If the request queue was empty then we have to take care of - * scheduling the next one */ - if (empty_queue) { - qemu_mutex_lock(&tg->lock); - schedule_next_request(blk, is_write); - qemu_mutex_unlock(&tg->lock); - } + throttle_group_restart_queue(blk, is_write); } static void read_timer_cb(void *opaque) |