diff options
Diffstat (limited to 'block')
-rw-r--r-- | block/backup.c | 2 | ||||
-rw-r--r-- | block/block-backend.c | 5 | ||||
-rw-r--r-- | block/dirty-bitmap.c | 57 | ||||
-rw-r--r-- | block/io.c | 332 | ||||
-rw-r--r-- | block/mirror.c | 613 | ||||
-rw-r--r-- | block/vvfat.c | 1 |
6 files changed, 767 insertions, 243 deletions
diff --git a/block/backup.c b/block/backup.c index 5661435675..d18be40caf 100644 --- a/block/backup.c +++ b/block/backup.c @@ -354,7 +354,7 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job) HBitmapIter hbi; hbitmap_iter_init(&hbi, job->copy_bitmap, 0); - while ((cluster = hbitmap_iter_next(&hbi)) != -1) { + while ((cluster = hbitmap_iter_next(&hbi, true)) != -1) { do { if (yield_and_check(job)) { return 0; diff --git a/block/block-backend.c b/block/block-backend.c index 2d1a3463e8..6b75bca317 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -767,6 +767,11 @@ void blk_remove_bs(BlockBackend *blk) blk_update_root_state(blk); + /* bdrv_root_unref_child() will cause blk->root to become stale and may + * switch to a completion coroutine later on. Let's drain all I/O here + * to avoid that and a potential QEMU crash. + */ + blk_drain(blk); bdrv_root_unref_child(blk->root); blk->root = NULL; } diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c index 383d742cdb..db1782ec1f 100644 --- a/block/dirty-bitmap.c +++ b/block/dirty-bitmap.c @@ -519,7 +519,62 @@ void bdrv_dirty_iter_free(BdrvDirtyBitmapIter *iter) int64_t bdrv_dirty_iter_next(BdrvDirtyBitmapIter *iter) { - return hbitmap_iter_next(&iter->hbi); + return hbitmap_iter_next(&iter->hbi, true); +} + +/** + * Return the next consecutively dirty area in the dirty bitmap + * belonging to the given iterator @iter. + * + * @max_offset: Maximum value that may be returned for + * *offset + *bytes + * @offset: Will contain the start offset of the next dirty area + * @bytes: Will contain the length of the next dirty area + * + * Returns: True if a dirty area could be found before max_offset + * (which means that *offset and *bytes then contain valid + * values), false otherwise. + * + * Note that @iter is never advanced if false is returned. If an area + * is found (which means that true is returned), it will be advanced + * past that area. + */ +bool bdrv_dirty_iter_next_area(BdrvDirtyBitmapIter *iter, uint64_t max_offset, + uint64_t *offset, int *bytes) +{ + uint32_t granularity = bdrv_dirty_bitmap_granularity(iter->bitmap); + uint64_t gran_max_offset; + int64_t ret; + int size; + + if (max_offset == iter->bitmap->size) { + /* If max_offset points to the image end, round it up by the + * bitmap granularity */ + gran_max_offset = ROUND_UP(max_offset, granularity); + } else { + gran_max_offset = max_offset; + } + + ret = hbitmap_iter_next(&iter->hbi, false); + if (ret < 0 || ret + granularity > gran_max_offset) { + return false; + } + + *offset = ret; + size = 0; + + assert(granularity <= INT_MAX); + + do { + /* Advance iterator */ + ret = hbitmap_iter_next(&iter->hbi, true); + size += granularity; + } while (ret + granularity <= gran_max_offset && + hbitmap_iter_next(&iter->hbi, false) == ret + granularity && + size <= INT_MAX - granularity); + + *bytes = MIN(size, max_offset - *offset); + return true; } /* Called within bdrv_dirty_bitmap_lock..unlock */ diff --git a/block/io.c b/block/io.c index b7beaeeb9f..ef4fedd364 100644 --- a/block/io.c +++ b/block/io.c @@ -38,15 +38,18 @@ /* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */ #define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS) +static AioWait drain_all_aio_wait; + static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int bytes, BdrvRequestFlags flags); -void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore) +void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore, + bool ignore_bds_parents) { BdrvChild *c, *next; QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) { - if (c == ignore) { + if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) { continue; } if (c->role->drained_begin) { @@ -55,12 +58,13 @@ void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore) } } -void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore) +void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore, + bool ignore_bds_parents) { BdrvChild *c, *next; QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) { - if (c == ignore) { + if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) { continue; } if (c->role->drained_end) { @@ -69,6 +73,24 @@ void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore) } } +static bool bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore, + bool ignore_bds_parents) +{ + BdrvChild *c, *next; + bool busy = false; + + QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) { + if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) { + continue; + } + if (c->role->drained_poll) { + busy |= c->role->drained_poll(c); + } + } + + return busy; +} + static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src) { dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer); @@ -148,7 +170,9 @@ typedef struct { bool done; bool begin; bool recursive; + bool poll; BdrvChild *parent; + bool ignore_bds_parents; } BdrvCoDrainData; static void coroutine_fn bdrv_drain_invoke_entry(void *opaque) @@ -164,67 +188,83 @@ static void coroutine_fn bdrv_drain_invoke_entry(void *opaque) /* Set data->done before reading bs->wakeup. */ atomic_mb_set(&data->done, true); - bdrv_wakeup(bs); + bdrv_dec_in_flight(bs); + + if (data->begin) { + g_free(data); + } } /* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */ -static void bdrv_drain_invoke(BlockDriverState *bs, bool begin, bool recursive) +static void bdrv_drain_invoke(BlockDriverState *bs, bool begin) { - BdrvChild *child, *tmp; - BdrvCoDrainData data = { .bs = bs, .done = false, .begin = begin}; + BdrvCoDrainData *data; if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) || (!begin && !bs->drv->bdrv_co_drain_end)) { return; } - data.co = qemu_coroutine_create(bdrv_drain_invoke_entry, &data); - bdrv_coroutine_enter(bs, data.co); - BDRV_POLL_WHILE(bs, !data.done); + data = g_new(BdrvCoDrainData, 1); + *data = (BdrvCoDrainData) { + .bs = bs, + .done = false, + .begin = begin + }; - if (recursive) { - QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) { - bdrv_drain_invoke(child->bs, begin, true); - } + /* Make sure the driver callback completes during the polling phase for + * drain_begin. */ + bdrv_inc_in_flight(bs); + data->co = qemu_coroutine_create(bdrv_drain_invoke_entry, data); + aio_co_schedule(bdrv_get_aio_context(bs), data->co); + + if (!begin) { + BDRV_POLL_WHILE(bs, !data->done); + g_free(data); } } -static bool bdrv_drain_recurse(BlockDriverState *bs) +/* Returns true if BDRV_POLL_WHILE() should go into a blocking aio_poll() */ +bool bdrv_drain_poll(BlockDriverState *bs, bool recursive, + BdrvChild *ignore_parent, bool ignore_bds_parents) { - BdrvChild *child, *tmp; - bool waited; + BdrvChild *child, *next; - /* Wait for drained requests to finish */ - waited = BDRV_POLL_WHILE(bs, atomic_read(&bs->in_flight) > 0); - - QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) { - BlockDriverState *bs = child->bs; - bool in_main_loop = - qemu_get_current_aio_context() == qemu_get_aio_context(); - assert(bs->refcnt > 0); - if (in_main_loop) { - /* In case the recursive bdrv_drain_recurse processes a - * block_job_defer_to_main_loop BH and modifies the graph, - * let's hold a reference to bs until we are done. - * - * IOThread doesn't have such a BH, and it is not safe to call - * bdrv_unref without BQL, so skip doing it there. - */ - bdrv_ref(bs); - } - waited |= bdrv_drain_recurse(bs); - if (in_main_loop) { - bdrv_unref(bs); + if (bdrv_parent_drained_poll(bs, ignore_parent, ignore_bds_parents)) { + return true; + } + + if (atomic_read(&bs->in_flight)) { + return true; + } + + if (recursive) { + assert(!ignore_bds_parents); + QLIST_FOREACH_SAFE(child, &bs->children, next, next) { + if (bdrv_drain_poll(child->bs, recursive, child, false)) { + return true; + } } } - return waited; + return false; +} + +static bool bdrv_drain_poll_top_level(BlockDriverState *bs, bool recursive, + BdrvChild *ignore_parent) +{ + /* Execute pending BHs first and check everything else only after the BHs + * have executed. */ + while (aio_poll(bs->aio_context, false)); + + return bdrv_drain_poll(bs, recursive, ignore_parent, false); } static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive, - BdrvChild *parent); + BdrvChild *parent, bool ignore_bds_parents, + bool poll); static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive, - BdrvChild *parent); + BdrvChild *parent, bool ignore_bds_parents); static void bdrv_co_drain_bh_cb(void *opaque) { @@ -232,11 +272,18 @@ static void bdrv_co_drain_bh_cb(void *opaque) Coroutine *co = data->co; BlockDriverState *bs = data->bs; - bdrv_dec_in_flight(bs); - if (data->begin) { - bdrv_do_drained_begin(bs, data->recursive, data->parent); + if (bs) { + bdrv_dec_in_flight(bs); + if (data->begin) { + bdrv_do_drained_begin(bs, data->recursive, data->parent, + data->ignore_bds_parents, data->poll); + } else { + bdrv_do_drained_end(bs, data->recursive, data->parent, + data->ignore_bds_parents); + } } else { - bdrv_do_drained_end(bs, data->recursive, data->parent); + assert(data->begin); + bdrv_drain_all_begin(); } data->done = true; @@ -245,7 +292,9 @@ static void bdrv_co_drain_bh_cb(void *opaque) static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs, bool begin, bool recursive, - BdrvChild *parent) + BdrvChild *parent, + bool ignore_bds_parents, + bool poll) { BdrvCoDrainData data; @@ -260,8 +309,12 @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs, .begin = begin, .recursive = recursive, .parent = parent, + .ignore_bds_parents = ignore_bds_parents, + .poll = poll, }; - bdrv_inc_in_flight(bs); + if (bs) { + bdrv_inc_in_flight(bs); + } aio_bh_schedule_oneshot(bdrv_get_aio_context(bs), bdrv_co_drain_bh_cb, &data); @@ -271,79 +324,106 @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs, assert(data.done); } -void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive, - BdrvChild *parent) +void bdrv_do_drained_begin_quiesce(BlockDriverState *bs, + BdrvChild *parent, bool ignore_bds_parents) { - BdrvChild *child, *next; - - if (qemu_in_coroutine()) { - bdrv_co_yield_to_drain(bs, true, recursive, parent); - return; - } + assert(!qemu_in_coroutine()); /* Stop things in parent-to-child order */ if (atomic_fetch_inc(&bs->quiesce_counter) == 0) { aio_disable_external(bdrv_get_aio_context(bs)); } - bdrv_parent_drained_begin(bs, parent); - bdrv_drain_invoke(bs, true, false); - bdrv_drain_recurse(bs); + bdrv_parent_drained_begin(bs, parent, ignore_bds_parents); + bdrv_drain_invoke(bs, true); +} + +static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive, + BdrvChild *parent, bool ignore_bds_parents, + bool poll) +{ + BdrvChild *child, *next; + + if (qemu_in_coroutine()) { + bdrv_co_yield_to_drain(bs, true, recursive, parent, ignore_bds_parents, + poll); + return; + } + + bdrv_do_drained_begin_quiesce(bs, parent, ignore_bds_parents); if (recursive) { + assert(!ignore_bds_parents); bs->recursive_quiesce_counter++; QLIST_FOREACH_SAFE(child, &bs->children, next, next) { - bdrv_do_drained_begin(child->bs, true, child); + bdrv_do_drained_begin(child->bs, true, child, ignore_bds_parents, + false); } } + + /* + * Wait for drained requests to finish. + * + * Calling BDRV_POLL_WHILE() only once for the top-level node is okay: The + * call is needed so things in this AioContext can make progress even + * though we don't return to the main AioContext loop - this automatically + * includes other nodes in the same AioContext and therefore all child + * nodes. + */ + if (poll) { + assert(!ignore_bds_parents); + BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, recursive, parent)); + } } void bdrv_drained_begin(BlockDriverState *bs) { - bdrv_do_drained_begin(bs, false, NULL); + bdrv_do_drained_begin(bs, false, NULL, false, true); } void bdrv_subtree_drained_begin(BlockDriverState *bs) { - bdrv_do_drained_begin(bs, true, NULL); + bdrv_do_drained_begin(bs, true, NULL, false, true); } -void bdrv_do_drained_end(BlockDriverState *bs, bool recursive, - BdrvChild *parent) +static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive, + BdrvChild *parent, bool ignore_bds_parents) { BdrvChild *child, *next; int old_quiesce_counter; if (qemu_in_coroutine()) { - bdrv_co_yield_to_drain(bs, false, recursive, parent); + bdrv_co_yield_to_drain(bs, false, recursive, parent, ignore_bds_parents, + false); return; } assert(bs->quiesce_counter > 0); old_quiesce_counter = atomic_fetch_dec(&bs->quiesce_counter); /* Re-enable things in child-to-parent order */ - bdrv_drain_invoke(bs, false, false); - bdrv_parent_drained_end(bs, parent); + bdrv_drain_invoke(bs, false); + bdrv_parent_drained_end(bs, parent, ignore_bds_parents); if (old_quiesce_counter == 1) { aio_enable_external(bdrv_get_aio_context(bs)); } if (recursive) { + assert(!ignore_bds_parents); bs->recursive_quiesce_counter--; QLIST_FOREACH_SAFE(child, &bs->children, next, next) { - bdrv_do_drained_end(child->bs, true, child); + bdrv_do_drained_end(child->bs, true, child, ignore_bds_parents); } } } void bdrv_drained_end(BlockDriverState *bs) { - bdrv_do_drained_end(bs, false, NULL); + bdrv_do_drained_end(bs, false, NULL, false); } void bdrv_subtree_drained_end(BlockDriverState *bs) { - bdrv_do_drained_end(bs, true, NULL); + bdrv_do_drained_end(bs, true, NULL, false); } void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent) @@ -351,7 +431,7 @@ void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent) int i; for (i = 0; i < new_parent->recursive_quiesce_counter; i++) { - bdrv_do_drained_begin(child->bs, true, child); + bdrv_do_drained_begin(child->bs, true, child, false, true); } } @@ -360,7 +440,7 @@ void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent) int i; for (i = 0; i < old_parent->recursive_quiesce_counter; i++) { - bdrv_do_drained_end(child->bs, true, child); + bdrv_do_drained_end(child->bs, true, child, false); } } @@ -370,10 +450,6 @@ void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent) * * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState * AioContext. - * - * Only this BlockDriverState's AioContext is run, so in-flight requests must - * not depend on events in other AioContexts. In that case, use - * bdrv_drain_all() instead. */ void coroutine_fn bdrv_co_drain(BlockDriverState *bs) { @@ -388,6 +464,39 @@ void bdrv_drain(BlockDriverState *bs) bdrv_drained_end(bs); } +static void bdrv_drain_assert_idle(BlockDriverState *bs) +{ + BdrvChild *child, *next; + + assert(atomic_read(&bs->in_flight) == 0); + QLIST_FOREACH_SAFE(child, &bs->children, next, next) { + bdrv_drain_assert_idle(child->bs); + } +} + +unsigned int bdrv_drain_all_count = 0; + +static bool bdrv_drain_all_poll(void) +{ + BlockDriverState *bs = NULL; + bool result = false; + + /* Execute pending BHs first (may modify the graph) and check everything + * else only after the BHs have executed. */ + while (aio_poll(qemu_get_aio_context(), false)); + + /* bdrv_drain_poll() can't make changes to the graph and we are holding the + * main AioContext lock, so iterating bdrv_next_all_states() is safe. */ + while ((bs = bdrv_next_all_states(bs))) { + AioContext *aio_context = bdrv_get_aio_context(bs); + aio_context_acquire(aio_context); + result |= bdrv_drain_poll(bs, false, NULL, true); + aio_context_release(aio_context); + } + + return result; +} + /* * Wait for pending requests to complete across all BlockDriverStates * @@ -402,73 +511,51 @@ void bdrv_drain(BlockDriverState *bs) */ void bdrv_drain_all_begin(void) { - /* Always run first iteration so any pending completion BHs run */ - bool waited = true; - BlockDriverState *bs; - BdrvNextIterator it; - GSList *aio_ctxs = NULL, *ctx; + BlockDriverState *bs = NULL; + + if (qemu_in_coroutine()) { + bdrv_co_yield_to_drain(NULL, true, false, NULL, true, true); + return; + } - /* BDRV_POLL_WHILE() for a node can only be called from its own I/O thread - * or the main loop AioContext. We potentially use BDRV_POLL_WHILE() on - * nodes in several different AioContexts, so make sure we're in the main - * context. */ + /* AIO_WAIT_WHILE() with a NULL context can only be called from the main + * loop AioContext, so make sure we're in the main context. */ assert(qemu_get_current_aio_context() == qemu_get_aio_context()); + assert(bdrv_drain_all_count < INT_MAX); + bdrv_drain_all_count++; - for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { + /* Quiesce all nodes, without polling in-flight requests yet. The graph + * cannot change during this loop. */ + while ((bs = bdrv_next_all_states(bs))) { AioContext *aio_context = bdrv_get_aio_context(bs); - /* Stop things in parent-to-child order */ aio_context_acquire(aio_context); - aio_disable_external(aio_context); - bdrv_parent_drained_begin(bs, NULL); - bdrv_drain_invoke(bs, true, true); + bdrv_do_drained_begin(bs, false, NULL, true, false); aio_context_release(aio_context); - - if (!g_slist_find(aio_ctxs, aio_context)) { - aio_ctxs = g_slist_prepend(aio_ctxs, aio_context); - } } - /* Note that completion of an asynchronous I/O operation can trigger any - * number of other I/O operations on other devices---for example a - * coroutine can submit an I/O request to another device in response to - * request completion. Therefore we must keep looping until there was no - * more activity rather than simply draining each device independently. - */ - while (waited) { - waited = false; - - for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) { - AioContext *aio_context = ctx->data; + /* Now poll the in-flight requests */ + AIO_WAIT_WHILE(&drain_all_aio_wait, NULL, bdrv_drain_all_poll()); - aio_context_acquire(aio_context); - for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { - if (aio_context == bdrv_get_aio_context(bs)) { - waited |= bdrv_drain_recurse(bs); - } - } - aio_context_release(aio_context); - } + while ((bs = bdrv_next_all_states(bs))) { + bdrv_drain_assert_idle(bs); } - - g_slist_free(aio_ctxs); } void bdrv_drain_all_end(void) { - BlockDriverState *bs; - BdrvNextIterator it; + BlockDriverState *bs = NULL; - for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { + while ((bs = bdrv_next_all_states(bs))) { AioContext *aio_context = bdrv_get_aio_context(bs); - /* Re-enable things in child-to-parent order */ aio_context_acquire(aio_context); - bdrv_drain_invoke(bs, false, true); - bdrv_parent_drained_end(bs, NULL); - aio_enable_external(aio_context); + bdrv_do_drained_end(bs, false, NULL, true); aio_context_release(aio_context); } + + assert(bdrv_drain_all_count > 0); + bdrv_drain_all_count--; } void bdrv_drain_all(void) @@ -591,6 +678,7 @@ void bdrv_inc_in_flight(BlockDriverState *bs) void bdrv_wakeup(BlockDriverState *bs) { aio_wait_kick(bdrv_get_aio_wait(bs)); + aio_wait_kick(&drain_all_aio_wait); } void bdrv_dec_in_flight(BlockDriverState *bs) diff --git a/block/mirror.c b/block/mirror.c index 435268bbbf..61bd9f3cf1 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -13,6 +13,8 @@ #include "qemu/osdep.h" #include "qemu/cutils.h" +#include "qemu/coroutine.h" +#include "qemu/range.h" #include "trace.h" #include "block/blockjob_int.h" #include "block/block_int.h" @@ -33,11 +35,12 @@ typedef struct MirrorBuffer { QSIMPLEQ_ENTRY(MirrorBuffer) next; } MirrorBuffer; +typedef struct MirrorOp MirrorOp; + typedef struct MirrorBlockJob { BlockJob common; BlockBackend *target; BlockDriverState *mirror_top_bs; - BlockDriverState *source; BlockDriverState *base; /* The name of the graph node to replace */ @@ -48,8 +51,12 @@ typedef struct MirrorBlockJob { Error *replace_blocker; bool is_none_mode; BlockMirrorBackingMode backing_mode; + MirrorCopyMode copy_mode; BlockdevOnError on_source_error, on_target_error; bool synced; + /* Set when the target is synced (dirty bitmap is clean, nothing + * in flight) and the job is running in active mode */ + bool actively_synced; bool should_complete; int64_t granularity; size_t buf_size; @@ -65,25 +72,47 @@ typedef struct MirrorBlockJob { unsigned long *in_flight_bitmap; int in_flight; int64_t bytes_in_flight; + QTAILQ_HEAD(MirrorOpList, MirrorOp) ops_in_flight; int ret; bool unmap; - bool waiting_for_io; int target_cluster_size; int max_iov; bool initial_zeroing_ongoing; + int in_active_write_counter; } MirrorBlockJob; -typedef struct MirrorOp { +typedef struct MirrorBDSOpaque { + MirrorBlockJob *job; +} MirrorBDSOpaque; + +struct MirrorOp { MirrorBlockJob *s; QEMUIOVector qiov; int64_t offset; uint64_t bytes; -} MirrorOp; + + /* The pointee is set by mirror_co_read(), mirror_co_zero(), and + * mirror_co_discard() before yielding for the first time */ + int64_t *bytes_handled; + + bool is_pseudo_op; + bool is_active_write; + CoQueue waiting_requests; + + QTAILQ_ENTRY(MirrorOp) next; +}; + +typedef enum MirrorMethod { + MIRROR_METHOD_COPY, + MIRROR_METHOD_ZERO, + MIRROR_METHOD_DISCARD, +} MirrorMethod; static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read, int error) { s->synced = false; + s->actively_synced = false; if (read) { return block_job_error_action(&s->common, s->on_source_error, true, error); @@ -93,7 +122,42 @@ static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read, } } -static void mirror_iteration_done(MirrorOp *op, int ret) +static void coroutine_fn mirror_wait_on_conflicts(MirrorOp *self, + MirrorBlockJob *s, + uint64_t offset, + uint64_t bytes) +{ + uint64_t self_start_chunk = offset / s->granularity; + uint64_t self_end_chunk = DIV_ROUND_UP(offset + bytes, s->granularity); + uint64_t self_nb_chunks = self_end_chunk - self_start_chunk; + + while (find_next_bit(s->in_flight_bitmap, self_end_chunk, + self_start_chunk) < self_end_chunk && + s->ret >= 0) + { + MirrorOp *op; + + QTAILQ_FOREACH(op, &s->ops_in_flight, next) { + uint64_t op_start_chunk = op->offset / s->granularity; + uint64_t op_nb_chunks = DIV_ROUND_UP(op->offset + op->bytes, + s->granularity) - + op_start_chunk; + + if (op == self) { + continue; + } + + if (ranges_overlap(self_start_chunk, self_nb_chunks, + op_start_chunk, op_nb_chunks)) + { + qemu_co_queue_wait(&op->waiting_requests, NULL); + break; + } + } + } +} + +static void coroutine_fn mirror_iteration_done(MirrorOp *op, int ret) { MirrorBlockJob *s = op->s; struct iovec *iov; @@ -113,7 +177,9 @@ static void mirror_iteration_done(MirrorOp *op, int ret) chunk_num = op->offset / s->granularity; nb_chunks = DIV_ROUND_UP(op->bytes, s->granularity); + bitmap_clear(s->in_flight_bitmap, chunk_num, nb_chunks); + QTAILQ_REMOVE(&s->ops_in_flight, op, next); if (ret >= 0) { if (s->cow_bitmap) { bitmap_set(s->cow_bitmap, chunk_num, nb_chunks); @@ -123,16 +189,13 @@ static void mirror_iteration_done(MirrorOp *op, int ret) } } qemu_iovec_destroy(&op->qiov); - g_free(op); - if (s->waiting_for_io) { - qemu_coroutine_enter(s->common.job.co); - } + qemu_co_queue_restart_all(&op->waiting_requests); + g_free(op); } -static void mirror_write_complete(void *opaque, int ret) +static void coroutine_fn mirror_write_complete(MirrorOp *op, int ret) { - MirrorOp *op = opaque; MirrorBlockJob *s = op->s; aio_context_acquire(blk_get_aio_context(s->common.blk)); @@ -149,9 +212,8 @@ static void mirror_write_complete(void *opaque, int ret) aio_context_release(blk_get_aio_context(s->common.blk)); } -static void mirror_read_complete(void *opaque, int ret) +static void coroutine_fn mirror_read_complete(MirrorOp *op, int ret) { - MirrorOp *op = opaque; MirrorBlockJob *s = op->s; aio_context_acquire(blk_get_aio_context(s->common.blk)); @@ -166,8 +228,9 @@ static void mirror_read_complete(void *opaque, int ret) mirror_iteration_done(op, ret); } else { - blk_aio_pwritev(s->target, op->offset, &op->qiov, - 0, mirror_write_complete, op); + ret = blk_co_pwritev(s->target, op->offset, + op->qiov.size, &op->qiov, 0); + mirror_write_complete(op, ret); } aio_context_release(blk_get_aio_context(s->common.blk)); } @@ -216,68 +279,80 @@ static int mirror_cow_align(MirrorBlockJob *s, int64_t *offset, return ret; } -static inline void mirror_wait_for_io(MirrorBlockJob *s) +static inline void mirror_wait_for_any_operation(MirrorBlockJob *s, bool active) { - assert(!s->waiting_for_io); - s->waiting_for_io = true; - qemu_coroutine_yield(); - s->waiting_for_io = false; + MirrorOp *op; + + QTAILQ_FOREACH(op, &s->ops_in_flight, next) { + /* Do not wait on pseudo ops, because it may in turn wait on + * some other operation to start, which may in fact be the + * caller of this function. Since there is only one pseudo op + * at any given time, we will always find some real operation + * to wait on. */ + if (!op->is_pseudo_op && op->is_active_write == active) { + qemu_co_queue_wait(&op->waiting_requests, NULL); + return; + } + } + abort(); } -/* Submit async read while handling COW. - * Returns: The number of bytes copied after and including offset, - * excluding any bytes copied prior to offset due to alignment. - * This will be @bytes if no alignment is necessary, or - * (new_end - offset) if tail is rounded up or down due to - * alignment or buffer limit. +static inline void mirror_wait_for_free_in_flight_slot(MirrorBlockJob *s) +{ + /* Only non-active operations use up in-flight slots */ + mirror_wait_for_any_operation(s, false); +} + +/* Perform a mirror copy operation. + * + * *op->bytes_handled is set to the number of bytes copied after and + * including offset, excluding any bytes copied prior to offset due + * to alignment. This will be op->bytes if no alignment is necessary, + * or (new_end - op->offset) if the tail is rounded up or down due to + * alignment or buffer limit. */ -static uint64_t mirror_do_read(MirrorBlockJob *s, int64_t offset, - uint64_t bytes) +static void coroutine_fn mirror_co_read(void *opaque) { - BlockBackend *source = s->common.blk; + MirrorOp *op = opaque; + MirrorBlockJob *s = op->s; int nb_chunks; uint64_t ret; - MirrorOp *op; uint64_t max_bytes; max_bytes = s->granularity * s->max_iov; /* We can only handle as much as buf_size at a time. */ - bytes = MIN(s->buf_size, MIN(max_bytes, bytes)); - assert(bytes); - assert(bytes < BDRV_REQUEST_MAX_BYTES); - ret = bytes; + op->bytes = MIN(s->buf_size, MIN(max_bytes, op->bytes)); + assert(op->bytes); + assert(op->bytes < BDRV_REQUEST_MAX_BYTES); + *op->bytes_handled = op->bytes; if (s->cow_bitmap) { - ret += mirror_cow_align(s, &offset, &bytes); + *op->bytes_handled += mirror_cow_align(s, &op->offset, &op->bytes); } - assert(bytes <= s->buf_size); + /* Cannot exceed BDRV_REQUEST_MAX_BYTES + INT_MAX */ + assert(*op->bytes_handled <= UINT_MAX); + assert(op->bytes <= s->buf_size); /* The offset is granularity-aligned because: * 1) Caller passes in aligned values; * 2) mirror_cow_align is used only when target cluster is larger. */ - assert(QEMU_IS_ALIGNED(offset, s->granularity)); + assert(QEMU_IS_ALIGNED(op->offset, s->granularity)); /* The range is sector-aligned, since bdrv_getlength() rounds up. */ - assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE)); - nb_chunks = DIV_ROUND_UP(bytes, s->granularity); + assert(QEMU_IS_ALIGNED(op->bytes, BDRV_SECTOR_SIZE)); + nb_chunks = DIV_ROUND_UP(op->bytes, s->granularity); while (s->buf_free_count < nb_chunks) { - trace_mirror_yield_in_flight(s, offset, s->in_flight); - mirror_wait_for_io(s); + trace_mirror_yield_in_flight(s, op->offset, s->in_flight); + mirror_wait_for_free_in_flight_slot(s); } - /* Allocate a MirrorOp that is used as an AIO callback. */ - op = g_new(MirrorOp, 1); - op->s = s; - op->offset = offset; - op->bytes = bytes; - /* Now make a QEMUIOVector taking enough granularity-sized chunks * from s->buf_free. */ qemu_iovec_init(&op->qiov, nb_chunks); while (nb_chunks-- > 0) { MirrorBuffer *buf = QSIMPLEQ_FIRST(&s->buf_free); - size_t remaining = bytes - op->qiov.size; + size_t remaining = op->bytes - op->qiov.size; QSIMPLEQ_REMOVE_HEAD(&s->buf_free, next); s->buf_free_count--; @@ -286,44 +361,92 @@ static uint64_t mirror_do_read(MirrorBlockJob *s, int64_t offset, /* Copy the dirty cluster. */ s->in_flight++; - s->bytes_in_flight += bytes; - trace_mirror_one_iteration(s, offset, bytes); + s->bytes_in_flight += op->bytes; + trace_mirror_one_iteration(s, op->offset, op->bytes); - blk_aio_preadv(source, offset, &op->qiov, 0, mirror_read_complete, op); - return ret; + ret = bdrv_co_preadv(s->mirror_top_bs->backing, op->offset, op->bytes, + &op->qiov, 0); + mirror_read_complete(op, ret); } -static void mirror_do_zero_or_discard(MirrorBlockJob *s, - int64_t offset, - uint64_t bytes, - bool is_discard) +static void coroutine_fn mirror_co_zero(void *opaque) { - MirrorOp *op; + MirrorOp *op = opaque; + int ret; - /* Allocate a MirrorOp that is used as an AIO callback. The qiov is zeroed - * so the freeing in mirror_iteration_done is nop. */ - op = g_new0(MirrorOp, 1); - op->s = s; - op->offset = offset; - op->bytes = bytes; + op->s->in_flight++; + op->s->bytes_in_flight += op->bytes; + *op->bytes_handled = op->bytes; - s->in_flight++; - s->bytes_in_flight += bytes; - if (is_discard) { - blk_aio_pdiscard(s->target, offset, - op->bytes, mirror_write_complete, op); - } else { - blk_aio_pwrite_zeroes(s->target, offset, - op->bytes, s->unmap ? BDRV_REQ_MAY_UNMAP : 0, - mirror_write_complete, op); + ret = blk_co_pwrite_zeroes(op->s->target, op->offset, op->bytes, + op->s->unmap ? BDRV_REQ_MAY_UNMAP : 0); + mirror_write_complete(op, ret); +} + +static void coroutine_fn mirror_co_discard(void *opaque) +{ + MirrorOp *op = opaque; + int ret; + + op->s->in_flight++; + op->s->bytes_in_flight += op->bytes; + *op->bytes_handled = op->bytes; + + ret = blk_co_pdiscard(op->s->target, op->offset, op->bytes); + mirror_write_complete(op, ret); +} + +static unsigned mirror_perform(MirrorBlockJob *s, int64_t offset, + unsigned bytes, MirrorMethod mirror_method) +{ + MirrorOp *op; + Coroutine *co; + int64_t bytes_handled = -1; + + op = g_new(MirrorOp, 1); + *op = (MirrorOp){ + .s = s, + .offset = offset, + .bytes = bytes, + .bytes_handled = &bytes_handled, + }; + qemu_co_queue_init(&op->waiting_requests); + + switch (mirror_method) { + case MIRROR_METHOD_COPY: + co = qemu_coroutine_create(mirror_co_read, op); + break; + case MIRROR_METHOD_ZERO: + co = qemu_coroutine_create(mirror_co_zero, op); + break; + case MIRROR_METHOD_DISCARD: + co = qemu_coroutine_create(mirror_co_discard, op); + break; + default: + abort(); } + + QTAILQ_INSERT_TAIL(&s->ops_in_flight, op, next); + qemu_coroutine_enter(co); + /* At this point, ownership of op has been moved to the coroutine + * and the object may already be freed */ + + /* Assert that this value has been set */ + assert(bytes_handled >= 0); + + /* Same assertion as in mirror_co_read() (and for mirror_co_read() + * and mirror_co_discard(), bytes_handled == op->bytes, which + * is the @bytes parameter given to this function) */ + assert(bytes_handled <= UINT_MAX); + return bytes_handled; } static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) { - BlockDriverState *source = s->source; - int64_t offset, first_chunk; - uint64_t delay_ns = 0; + BlockDriverState *source = s->mirror_top_bs->backing->bs; + MirrorOp *pseudo_op; + int64_t offset; + uint64_t delay_ns = 0, ret = 0; /* At least the first dirty chunk is mirrored in one iteration. */ int nb_chunks = 1; bool write_zeroes_ok = bdrv_can_write_zeroes_with_unmap(blk_bs(s->target)); @@ -339,11 +462,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) } bdrv_dirty_bitmap_unlock(s->dirty_bitmap); - first_chunk = offset / s->granularity; - while (test_bit(first_chunk, s->in_flight_bitmap)) { - trace_mirror_yield_in_flight(s, offset, s->in_flight); - mirror_wait_for_io(s); - } + mirror_wait_on_conflicts(NULL, s, offset, 1); job_pause_point(&s->common.job); @@ -380,16 +499,27 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) nb_chunks * s->granularity); bdrv_dirty_bitmap_unlock(s->dirty_bitmap); + /* Before claiming an area in the in-flight bitmap, we have to + * create a MirrorOp for it so that conflicting requests can wait + * for it. mirror_perform() will create the real MirrorOps later, + * for now we just create a pseudo operation that will wake up all + * conflicting requests once all real operations have been + * launched. */ + pseudo_op = g_new(MirrorOp, 1); + *pseudo_op = (MirrorOp){ + .offset = offset, + .bytes = nb_chunks * s->granularity, + .is_pseudo_op = true, + }; + qemu_co_queue_init(&pseudo_op->waiting_requests); + QTAILQ_INSERT_TAIL(&s->ops_in_flight, pseudo_op, next); + bitmap_set(s->in_flight_bitmap, offset / s->granularity, nb_chunks); while (nb_chunks > 0 && offset < s->bdev_length) { int ret; int64_t io_bytes; int64_t io_bytes_acct; - enum MirrorMethod { - MIRROR_METHOD_COPY, - MIRROR_METHOD_ZERO, - MIRROR_METHOD_DISCARD - } mirror_method = MIRROR_METHOD_COPY; + MirrorMethod mirror_method = MIRROR_METHOD_COPY; assert(!(offset % s->granularity)); ret = bdrv_block_status_above(source, NULL, offset, @@ -419,37 +549,34 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) while (s->in_flight >= MAX_IN_FLIGHT) { trace_mirror_yield_in_flight(s, offset, s->in_flight); - mirror_wait_for_io(s); + mirror_wait_for_free_in_flight_slot(s); } if (s->ret < 0) { - return 0; + ret = 0; + goto fail; } io_bytes = mirror_clip_bytes(s, offset, io_bytes); - switch (mirror_method) { - case MIRROR_METHOD_COPY: - io_bytes = io_bytes_acct = mirror_do_read(s, offset, io_bytes); - break; - case MIRROR_METHOD_ZERO: - case MIRROR_METHOD_DISCARD: - mirror_do_zero_or_discard(s, offset, io_bytes, - mirror_method == MIRROR_METHOD_DISCARD); - if (write_zeroes_ok) { - io_bytes_acct = 0; - } else { - io_bytes_acct = io_bytes; - } - break; - default: - abort(); + io_bytes = mirror_perform(s, offset, io_bytes, mirror_method); + if (mirror_method != MIRROR_METHOD_COPY && write_zeroes_ok) { + io_bytes_acct = 0; + } else { + io_bytes_acct = io_bytes; } assert(io_bytes); offset += io_bytes; nb_chunks -= DIV_ROUND_UP(io_bytes, s->granularity); delay_ns = block_job_ratelimit_get_delay(&s->common, io_bytes_acct); } - return delay_ns; + + ret = delay_ns; +fail: + QTAILQ_REMOVE(&s->ops_in_flight, pseudo_op, next); + qemu_co_queue_restart_all(&pseudo_op->waiting_requests); + g_free(pseudo_op); + + return ret; } static void mirror_free_init(MirrorBlockJob *s) @@ -476,7 +603,7 @@ static void mirror_free_init(MirrorBlockJob *s) static void mirror_wait_for_all_io(MirrorBlockJob *s) { while (s->in_flight > 0) { - mirror_wait_for_io(s); + mirror_wait_for_free_in_flight_slot(s); } } @@ -489,8 +616,9 @@ static void mirror_exit(Job *job, void *opaque) MirrorBlockJob *s = container_of(job, MirrorBlockJob, common.job); BlockJob *bjob = &s->common; MirrorExitData *data = opaque; + MirrorBDSOpaque *bs_opaque = s->mirror_top_bs->opaque; AioContext *replace_aio_context = NULL; - BlockDriverState *src = s->source; + BlockDriverState *src = s->mirror_top_bs->backing->bs; BlockDriverState *target_bs = blk_bs(s->target); BlockDriverState *mirror_top_bs = s->mirror_top_bs; Error *local_err = NULL; @@ -581,6 +709,7 @@ static void mirror_exit(Job *job, void *opaque) blk_set_perm(bjob->blk, 0, BLK_PERM_ALL, &error_abort); blk_insert_bs(bjob->blk, mirror_top_bs, &error_abort); + bs_opaque->job = NULL; job_completed(job, data->ret, NULL); g_free(data); @@ -605,7 +734,7 @@ static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s) { int64_t offset; BlockDriverState *base = s->base; - BlockDriverState *bs = s->source; + BlockDriverState *bs = s->mirror_top_bs->backing->bs; BlockDriverState *target_bs = blk_bs(s->target); int ret; int64_t count; @@ -631,11 +760,11 @@ static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s) if (s->in_flight >= MAX_IN_FLIGHT) { trace_mirror_yield(s, UINT64_MAX, s->buf_free_count, s->in_flight); - mirror_wait_for_io(s); + mirror_wait_for_free_in_flight_slot(s); continue; } - mirror_do_zero_or_discard(s, offset, bytes, false); + mirror_perform(s, offset, bytes, MIRROR_METHOD_ZERO); offset += bytes; } @@ -687,7 +816,7 @@ static void coroutine_fn mirror_run(void *opaque) { MirrorBlockJob *s = opaque; MirrorExitData *data; - BlockDriverState *bs = s->source; + BlockDriverState *bs = s->mirror_top_bs->backing->bs; BlockDriverState *target_bs = blk_bs(s->target); bool need_drain = true; int64_t length; @@ -730,6 +859,7 @@ static void coroutine_fn mirror_run(void *opaque) /* Transition to the READY state and wait for complete. */ job_transition_to_ready(&s->common.job); s->synced = true; + s->actively_synced = true; while (!job_is_cancelled(&s->common.job) && !s->should_complete) { job_yield(&s->common.job); } @@ -781,6 +911,12 @@ static void coroutine_fn mirror_run(void *opaque) int64_t cnt, delta; bool should_complete; + /* Do not start passive operations while there are active + * writes in progress */ + while (s->in_active_write_counter) { + mirror_wait_for_any_operation(s, true); + } + if (s->ret < 0) { ret = s->ret; goto immediate_exit; @@ -804,7 +940,7 @@ static void coroutine_fn mirror_run(void *opaque) if (s->in_flight >= MAX_IN_FLIGHT || s->buf_free_count == 0 || (cnt == 0 && s->in_flight > 0)) { trace_mirror_yield(s, cnt, s->buf_free_count, s->in_flight); - mirror_wait_for_io(s); + mirror_wait_for_free_in_flight_slot(s); continue; } else if (cnt != 0) { delay_ns = mirror_iteration(s); @@ -826,6 +962,9 @@ static void coroutine_fn mirror_run(void *opaque) */ job_transition_to_ready(&s->common.job); s->synced = true; + if (s->copy_mode != MIRROR_COPY_MODE_BACKGROUND) { + s->actively_synced = true; + } } should_complete = s->should_complete || @@ -964,6 +1103,12 @@ static void mirror_pause(Job *job) mirror_wait_for_all_io(s); } +static bool mirror_drained_poll(BlockJob *job) +{ + MirrorBlockJob *s = container_of(job, MirrorBlockJob, common); + return !!s->in_flight; +} + static void mirror_attached_aio_context(BlockJob *job, AioContext *new_context) { MirrorBlockJob *s = container_of(job, MirrorBlockJob, common); @@ -997,6 +1142,7 @@ static const BlockJobDriver mirror_job_driver = { .pause = mirror_pause, .complete = mirror_complete, }, + .drained_poll = mirror_drained_poll, .attached_aio_context = mirror_attached_aio_context, .drain = mirror_drain, }; @@ -1012,20 +1158,237 @@ static const BlockJobDriver commit_active_job_driver = { .pause = mirror_pause, .complete = mirror_complete, }, + .drained_poll = mirror_drained_poll, .attached_aio_context = mirror_attached_aio_context, .drain = mirror_drain, }; +static void do_sync_target_write(MirrorBlockJob *job, MirrorMethod method, + uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) +{ + BdrvDirtyBitmapIter *iter; + QEMUIOVector target_qiov; + uint64_t dirty_offset; + int dirty_bytes; + + if (qiov) { + qemu_iovec_init(&target_qiov, qiov->niov); + } + + iter = bdrv_dirty_iter_new(job->dirty_bitmap); + bdrv_set_dirty_iter(iter, offset); + + while (true) { + bool valid_area; + int ret; + + bdrv_dirty_bitmap_lock(job->dirty_bitmap); + valid_area = bdrv_dirty_iter_next_area(iter, offset + bytes, + &dirty_offset, &dirty_bytes); + if (!valid_area) { + bdrv_dirty_bitmap_unlock(job->dirty_bitmap); + break; + } + + bdrv_reset_dirty_bitmap_locked(job->dirty_bitmap, + dirty_offset, dirty_bytes); + bdrv_dirty_bitmap_unlock(job->dirty_bitmap); + + job_progress_increase_remaining(&job->common.job, dirty_bytes); + + assert(dirty_offset - offset <= SIZE_MAX); + if (qiov) { + qemu_iovec_reset(&target_qiov); + qemu_iovec_concat(&target_qiov, qiov, + dirty_offset - offset, dirty_bytes); + } + + switch (method) { + case MIRROR_METHOD_COPY: + ret = blk_co_pwritev(job->target, dirty_offset, dirty_bytes, + qiov ? &target_qiov : NULL, flags); + break; + + case MIRROR_METHOD_ZERO: + assert(!qiov); + ret = blk_co_pwrite_zeroes(job->target, dirty_offset, dirty_bytes, + flags); + break; + + case MIRROR_METHOD_DISCARD: + assert(!qiov); + ret = blk_co_pdiscard(job->target, dirty_offset, dirty_bytes); + break; + + default: + abort(); + } + + if (ret >= 0) { + job_progress_update(&job->common.job, dirty_bytes); + } else { + BlockErrorAction action; + + bdrv_set_dirty_bitmap(job->dirty_bitmap, dirty_offset, dirty_bytes); + job->actively_synced = false; + + action = mirror_error_action(job, false, -ret); + if (action == BLOCK_ERROR_ACTION_REPORT) { + if (!job->ret) { + job->ret = ret; + } + break; + } + } + } + + bdrv_dirty_iter_free(iter); + if (qiov) { + qemu_iovec_destroy(&target_qiov); + } +} + +static MirrorOp *coroutine_fn active_write_prepare(MirrorBlockJob *s, + uint64_t offset, + uint64_t bytes) +{ + MirrorOp *op; + uint64_t start_chunk = offset / s->granularity; + uint64_t end_chunk = DIV_ROUND_UP(offset + bytes, s->granularity); + + op = g_new(MirrorOp, 1); + *op = (MirrorOp){ + .s = s, + .offset = offset, + .bytes = bytes, + .is_active_write = true, + }; + qemu_co_queue_init(&op->waiting_requests); + QTAILQ_INSERT_TAIL(&s->ops_in_flight, op, next); + + s->in_active_write_counter++; + + mirror_wait_on_conflicts(op, s, offset, bytes); + + bitmap_set(s->in_flight_bitmap, start_chunk, end_chunk - start_chunk); + + return op; +} + +static void coroutine_fn active_write_settle(MirrorOp *op) +{ + uint64_t start_chunk = op->offset / op->s->granularity; + uint64_t end_chunk = DIV_ROUND_UP(op->offset + op->bytes, + op->s->granularity); + + if (!--op->s->in_active_write_counter && op->s->actively_synced) { + BdrvChild *source = op->s->mirror_top_bs->backing; + + if (QLIST_FIRST(&source->bs->parents) == source && + QLIST_NEXT(source, next_parent) == NULL) + { + /* Assert that we are back in sync once all active write + * operations are settled. + * Note that we can only assert this if the mirror node + * is the source node's only parent. */ + assert(!bdrv_get_dirty_count(op->s->dirty_bitmap)); + } + } + bitmap_clear(op->s->in_flight_bitmap, start_chunk, end_chunk - start_chunk); + QTAILQ_REMOVE(&op->s->ops_in_flight, op, next); + qemu_co_queue_restart_all(&op->waiting_requests); + g_free(op); +} + static int coroutine_fn bdrv_mirror_top_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags) { return bdrv_co_preadv(bs->backing, offset, bytes, qiov, flags); } +static int coroutine_fn bdrv_mirror_top_do_write(BlockDriverState *bs, + MirrorMethod method, uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, + int flags) +{ + MirrorOp *op = NULL; + MirrorBDSOpaque *s = bs->opaque; + int ret = 0; + bool copy_to_target; + + copy_to_target = s->job->ret >= 0 && + s->job->copy_mode == MIRROR_COPY_MODE_WRITE_BLOCKING; + + if (copy_to_target) { + op = active_write_prepare(s->job, offset, bytes); + } + + switch (method) { + case MIRROR_METHOD_COPY: + ret = bdrv_co_pwritev(bs->backing, offset, bytes, qiov, flags); + break; + + case MIRROR_METHOD_ZERO: + ret = bdrv_co_pwrite_zeroes(bs->backing, offset, bytes, flags); + break; + + case MIRROR_METHOD_DISCARD: + ret = bdrv_co_pdiscard(bs->backing->bs, offset, bytes); + break; + + default: + abort(); + } + + if (ret < 0) { + goto out; + } + + if (copy_to_target) { + do_sync_target_write(s->job, method, offset, bytes, qiov, flags); + } + +out: + if (copy_to_target) { + active_write_settle(op); + } + return ret; +} + static int coroutine_fn bdrv_mirror_top_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags) { - return bdrv_co_pwritev(bs->backing, offset, bytes, qiov, flags); + MirrorBDSOpaque *s = bs->opaque; + QEMUIOVector bounce_qiov; + void *bounce_buf; + int ret = 0; + bool copy_to_target; + + copy_to_target = s->job->ret >= 0 && + s->job->copy_mode == MIRROR_COPY_MODE_WRITE_BLOCKING; + + if (copy_to_target) { + /* The guest might concurrently modify the data to write; but + * the data on source and destination must match, so we have + * to use a bounce buffer if we are going to write to the + * target now. */ + bounce_buf = qemu_blockalign(bs, bytes); + iov_to_buf_full(qiov->iov, qiov->niov, 0, bounce_buf, bytes); + + qemu_iovec_init(&bounce_qiov, 1); + qemu_iovec_add(&bounce_qiov, bounce_buf, bytes); + qiov = &bounce_qiov; + } + + ret = bdrv_mirror_top_do_write(bs, MIRROR_METHOD_COPY, offset, bytes, qiov, + flags); + + if (copy_to_target) { + qemu_iovec_destroy(&bounce_qiov); + qemu_vfree(bounce_buf); + } + + return ret; } static int coroutine_fn bdrv_mirror_top_flush(BlockDriverState *bs) @@ -1040,13 +1403,15 @@ static int coroutine_fn bdrv_mirror_top_flush(BlockDriverState *bs) static int coroutine_fn bdrv_mirror_top_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int bytes, BdrvRequestFlags flags) { - return bdrv_co_pwrite_zeroes(bs->backing, offset, bytes, flags); + return bdrv_mirror_top_do_write(bs, MIRROR_METHOD_ZERO, offset, bytes, NULL, + flags); } static int coroutine_fn bdrv_mirror_top_pdiscard(BlockDriverState *bs, int64_t offset, int bytes) { - return bdrv_co_pdiscard(bs->backing->bs, offset, bytes); + return bdrv_mirror_top_do_write(bs, MIRROR_METHOD_DISCARD, offset, bytes, + NULL, 0); } static void bdrv_mirror_top_refresh_filename(BlockDriverState *bs, QDict *opts) @@ -1108,10 +1473,11 @@ static void mirror_start_job(const char *job_id, BlockDriverState *bs, const BlockJobDriver *driver, bool is_none_mode, BlockDriverState *base, bool auto_complete, const char *filter_node_name, - bool is_mirror, + bool is_mirror, MirrorCopyMode copy_mode, Error **errp) { MirrorBlockJob *s; + MirrorBDSOpaque *bs_opaque; BlockDriverState *mirror_top_bs; bool target_graph_mod; bool target_is_backing; @@ -1147,6 +1513,8 @@ static void mirror_start_job(const char *job_id, BlockDriverState *bs, mirror_top_bs->total_sectors = bs->total_sectors; mirror_top_bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED; mirror_top_bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED; + bs_opaque = g_new0(MirrorBDSOpaque, 1); + mirror_top_bs->opaque = bs_opaque; bdrv_set_aio_context(mirror_top_bs, bdrv_get_aio_context(bs)); /* bdrv_append takes ownership of the mirror_top_bs reference, need to keep @@ -1171,10 +1539,11 @@ static void mirror_start_job(const char *job_id, BlockDriverState *bs, if (!s) { goto fail; } + bs_opaque->job = s; + /* The block job now has a reference to this node */ bdrv_unref(mirror_top_bs); - s->source = bs; s->mirror_top_bs = mirror_top_bs; /* No resize for the target either; while the mirror is still running, a @@ -1212,6 +1581,7 @@ static void mirror_start_job(const char *job_id, BlockDriverState *bs, s->on_target_error = on_target_error; s->is_none_mode = is_none_mode; s->backing_mode = backing_mode; + s->copy_mode = copy_mode; s->base = base; s->granularity = granularity; s->buf_size = ROUND_UP(buf_size, granularity); @@ -1247,6 +1617,8 @@ static void mirror_start_job(const char *job_id, BlockDriverState *bs, } } + QTAILQ_INIT(&s->ops_in_flight); + trace_mirror_start(bs, s, opaque); job_start(&s->common.job); return; @@ -1259,6 +1631,7 @@ fail: g_free(s->replaces); blk_unref(s->target); + bs_opaque->job = NULL; job_early_fail(&s->common.job); } @@ -1275,7 +1648,8 @@ void mirror_start(const char *job_id, BlockDriverState *bs, MirrorSyncMode mode, BlockMirrorBackingMode backing_mode, BlockdevOnError on_source_error, BlockdevOnError on_target_error, - bool unmap, const char *filter_node_name, Error **errp) + bool unmap, const char *filter_node_name, + MirrorCopyMode copy_mode, Error **errp) { bool is_none_mode; BlockDriverState *base; @@ -1290,7 +1664,7 @@ void mirror_start(const char *job_id, BlockDriverState *bs, speed, granularity, buf_size, backing_mode, on_source_error, on_target_error, unmap, NULL, NULL, &mirror_job_driver, is_none_mode, base, false, - filter_node_name, true, errp); + filter_node_name, true, copy_mode, errp); } void commit_active_start(const char *job_id, BlockDriverState *bs, @@ -1313,7 +1687,8 @@ void commit_active_start(const char *job_id, BlockDriverState *bs, MIRROR_LEAVE_BACKING_CHAIN, on_error, on_error, true, cb, opaque, &commit_active_job_driver, false, base, auto_complete, - filter_node_name, false, &local_err); + filter_node_name, false, MIRROR_COPY_MODE_BACKGROUND, + &local_err); if (local_err) { error_propagate(errp, local_err); goto error_restore_flags; diff --git a/block/vvfat.c b/block/vvfat.c index 4595f335b8..c7d2ed2d96 100644 --- a/block/vvfat.c +++ b/block/vvfat.c @@ -3134,6 +3134,7 @@ static void vvfat_qcow_options(int *child_flags, QDict *child_options, } static const BdrvChildRole child_vvfat_qcow = { + .parent_is_bds = true, .inherit_options = vvfat_qcow_options, }; |