diff options
45 files changed, 1574 insertions, 1127 deletions
@@ -93,8 +93,6 @@ static bool bdrv_recurse_has_child(BlockDriverState *bs, static void bdrv_replace_child_noperm(BdrvChild *child, BlockDriverState *new_bs); static void bdrv_remove_child(BdrvChild *child, Transaction *tran); -static void bdrv_remove_filter_or_cow_child(BlockDriverState *bs, - Transaction *tran); static int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue, @@ -528,65 +526,24 @@ typedef struct CreateCo { Error *err; } CreateCo; -static void coroutine_fn bdrv_create_co_entry(void *opaque) -{ - Error *local_err = NULL; - int ret; - - CreateCo *cco = opaque; - assert(cco->drv); - GLOBAL_STATE_CODE(); - - ret = cco->drv->bdrv_co_create_opts(cco->drv, - cco->filename, cco->opts, &local_err); - error_propagate(&cco->err, local_err); - cco->ret = ret; -} - -int bdrv_create(BlockDriver *drv, const char* filename, - QemuOpts *opts, Error **errp) +int coroutine_fn bdrv_co_create(BlockDriver *drv, const char *filename, + QemuOpts *opts, Error **errp) { int ret; - GLOBAL_STATE_CODE(); - - Coroutine *co; - CreateCo cco = { - .drv = drv, - .filename = g_strdup(filename), - .opts = opts, - .ret = NOT_DONE, - .err = NULL, - }; + ERRP_GUARD(); if (!drv->bdrv_co_create_opts) { - error_setg(errp, "Driver '%s' does not support image creation", drv->format_name); - ret = -ENOTSUP; - goto out; - } - - if (qemu_in_coroutine()) { - /* Fast-path if already in coroutine context */ - bdrv_create_co_entry(&cco); - } else { - co = qemu_coroutine_create(bdrv_create_co_entry, &cco); - qemu_coroutine_enter(co); - while (cco.ret == NOT_DONE) { - aio_poll(qemu_get_aio_context(), true); - } + error_setg(errp, "Driver '%s' does not support image creation", + drv->format_name); + return -ENOTSUP; } - ret = cco.ret; - if (ret < 0) { - if (cco.err) { - error_propagate(errp, cco.err); - } else { - error_setg_errno(errp, -ret, "Could not create image"); - } + ret = drv->bdrv_co_create_opts(drv, filename, opts, errp); + if (ret < 0 && !*errp) { + error_setg_errno(errp, -ret, "Could not create image"); } -out: - g_free(cco.filename); return ret; } @@ -725,7 +682,8 @@ out: return ret; } -int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp) +int coroutine_fn bdrv_co_create_file(const char *filename, QemuOpts *opts, + Error **errp) { QemuOpts *protocol_opts; BlockDriver *drv; @@ -766,7 +724,7 @@ int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp) goto out; } - ret = bdrv_create(drv, filename, protocol_opts, errp); + ret = bdrv_co_create(drv, filename, protocol_opts, errp); out: qemu_opts_del(protocol_opts); qobject_unref(qdict); @@ -1228,20 +1186,19 @@ static char *bdrv_child_get_parent_desc(BdrvChild *c) static void bdrv_child_cb_drained_begin(BdrvChild *child) { BlockDriverState *bs = child->opaque; - bdrv_do_drained_begin_quiesce(bs, NULL, false); + bdrv_do_drained_begin_quiesce(bs, NULL); } static bool bdrv_child_cb_drained_poll(BdrvChild *child) { BlockDriverState *bs = child->opaque; - return bdrv_drain_poll(bs, false, NULL, false); + return bdrv_drain_poll(bs, NULL, false); } -static void bdrv_child_cb_drained_end(BdrvChild *child, - int *drained_end_counter) +static void bdrv_child_cb_drained_end(BdrvChild *child) { BlockDriverState *bs = child->opaque; - bdrv_drained_end_no_poll(bs, drained_end_counter); + bdrv_drained_end(bs); } static int bdrv_child_cb_inactivate(BdrvChild *child) @@ -1445,11 +1402,11 @@ static void bdrv_inherited_options(BdrvChildRole role, bool parent_is_format, *child_flags = flags; } -static void bdrv_child_cb_attach(BdrvChild *child) +static void GRAPH_WRLOCK bdrv_child_cb_attach(BdrvChild *child) { BlockDriverState *bs = child->opaque; - assert_bdrv_graph_writable(bs); + assert_bdrv_graph_writable(); QLIST_INSERT_HEAD(&bs->children, child, next); if (bs->drv->is_filter || (child->role & BDRV_CHILD_FILTERED)) { /* @@ -1485,11 +1442,9 @@ static void bdrv_child_cb_attach(BdrvChild *child) assert(!bs->file); bs->file = child; } - - bdrv_apply_subtree_drain(child, bs); } -static void bdrv_child_cb_detach(BdrvChild *child) +static void GRAPH_WRLOCK bdrv_child_cb_detach(BdrvChild *child) { BlockDriverState *bs = child->opaque; @@ -1497,9 +1452,7 @@ static void bdrv_child_cb_detach(BdrvChild *child) bdrv_backing_detach(child); } - bdrv_unapply_subtree_drain(child, bs); - - assert_bdrv_graph_writable(bs); + assert_bdrv_graph_writable(); QLIST_REMOVE(child, next); if (child == bs->backing) { assert(child != bs->file); @@ -1715,8 +1668,8 @@ static int bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv, assert(is_power_of_2(bs->bl.request_alignment)); for (i = 0; i < bs->quiesce_counter; i++) { - if (drv->bdrv_co_drain_begin) { - drv->bdrv_co_drain_begin(bs); + if (drv->bdrv_drain_begin) { + drv->bdrv_drain_begin(bs); } } @@ -2414,6 +2367,20 @@ static void bdrv_replace_child_abort(void *opaque) GLOBAL_STATE_CODE(); /* old_bs reference is transparently moved from @s to @s->child */ + if (!s->child->bs) { + /* + * The parents were undrained when removing old_bs from the child. New + * requests can't have been made, though, because the child was empty. + * + * TODO Make bdrv_replace_child_noperm() transactionable to avoid + * undraining the parent in the first place. Once this is done, having + * new_bs drained when calling bdrv_replace_child_tran() is not a + * requirement any more. + */ + bdrv_parent_drained_begin_single(s->child); + assert(!bdrv_parent_drained_poll_single(s->child)); + } + assert(s->child->quiesced_parent); bdrv_replace_child_noperm(s->child, s->old_bs); bdrv_unref(new_bs); } @@ -2429,12 +2396,19 @@ static TransactionActionDrv bdrv_replace_child_drv = { * * Note: real unref of old_bs is done only on commit. * + * Both @child->bs and @new_bs (if non-NULL) must be drained. @new_bs must be + * kept drained until the transaction is completed. + * * The function doesn't update permissions, caller is responsible for this. */ static void bdrv_replace_child_tran(BdrvChild *child, BlockDriverState *new_bs, Transaction *tran) { BdrvReplaceChildState *s = g_new(BdrvReplaceChildState, 1); + + assert(child->quiesced_parent); + assert(!new_bs || new_bs->quiesce_counter); + *s = (BdrvReplaceChildState) { .child = child, .old_bs = child->bs, @@ -2523,8 +2497,12 @@ static int bdrv_node_refresh_perm(BlockDriverState *bs, BlockReopenQueue *q, return 0; } -static int bdrv_list_refresh_perms(GSList *list, BlockReopenQueue *q, - Transaction *tran, Error **errp) +/* + * @list is a product of bdrv_topological_dfs() (may be called several times) - + * a topologically sorted subgraph. + */ +static int bdrv_do_refresh_perms(GSList *list, BlockReopenQueue *q, + Transaction *tran, Error **errp) { int ret; BlockDriverState *bs; @@ -2546,6 +2524,24 @@ static int bdrv_list_refresh_perms(GSList *list, BlockReopenQueue *q, return 0; } +/* + * @list is any list of nodes. List is completed by all subtrees and + * topologically sorted. It's not a problem if some node occurs in the @list + * several times. + */ +static int bdrv_list_refresh_perms(GSList *list, BlockReopenQueue *q, + Transaction *tran, Error **errp) +{ + g_autoptr(GHashTable) found = g_hash_table_new(NULL, NULL); + g_autoptr(GSList) refresh_list = NULL; + + for ( ; list; list = list->next) { + refresh_list = bdrv_topological_dfs(refresh_list, found, list->data); + } + + return bdrv_do_refresh_perms(refresh_list, q, tran, errp); +} + void bdrv_get_cumulative_perm(BlockDriverState *bs, uint64_t *perm, uint64_t *shared_perm) { @@ -2593,15 +2589,24 @@ char *bdrv_perm_names(uint64_t perm) } -static int bdrv_refresh_perms(BlockDriverState *bs, Error **errp) +/* @tran is allowed to be NULL. In this case no rollback is possible */ +static int bdrv_refresh_perms(BlockDriverState *bs, Transaction *tran, + Error **errp) { int ret; - Transaction *tran = tran_new(); + Transaction *local_tran = NULL; g_autoptr(GSList) list = bdrv_topological_dfs(NULL, NULL, bs); GLOBAL_STATE_CODE(); - ret = bdrv_list_refresh_perms(list, NULL, tran, errp); - tran_finalize(tran, ret); + if (!tran) { + tran = local_tran = tran_new(); + } + + ret = bdrv_do_refresh_perms(list, NULL, tran, errp); + + if (local_tran) { + tran_finalize(local_tran, ret); + } return ret; } @@ -2617,7 +2622,7 @@ int bdrv_child_try_set_perm(BdrvChild *c, uint64_t perm, uint64_t shared, bdrv_child_set_perm(c, perm, shared, tran); - ret = bdrv_refresh_perms(c->bs, &local_err); + ret = bdrv_refresh_perms(c->bs, tran, &local_err); tran_finalize(tran, ret); @@ -2826,14 +2831,41 @@ uint64_t bdrv_qapi_perm_to_blk_perm(BlockPermission qapi_perm) return permissions[qapi_perm]; } +/* + * Replaces the node that a BdrvChild points to without updating permissions. + * + * If @new_bs is non-NULL, the parent of @child must already be drained through + * @child. + */ static void bdrv_replace_child_noperm(BdrvChild *child, BlockDriverState *new_bs) { BlockDriverState *old_bs = child->bs; int new_bs_quiesce_counter; - int drain_saldo; assert(!child->frozen); + + /* + * If we want to change the BdrvChild to point to a drained node as its new + * child->bs, we need to make sure that its new parent is drained, too. In + * other words, either child->quiesce_parent must already be true or we must + * be able to set it and keep the parent's quiesce_counter consistent with + * that, but without polling or starting new requests (this function + * guarantees that it doesn't poll, and starting new requests would be + * against the invariants of drain sections). + * + * To keep things simple, we pick the first option (child->quiesce_parent + * must already be true). We also generalise the rule a bit to make it + * easier to verify in callers and more likely to be covered in test cases: + * The parent must be quiesced through this child even if new_bs isn't + * currently drained. + * + * The only exception is for callers that always pass new_bs == NULL. In + * this case, we obviously never need to consider the case of a drained + * new_bs, so we can keep the callers simpler by allowing them not to drain + * the parent. + */ + assert(!new_bs || child->quiesced_parent); assert(old_bs != new_bs); GLOBAL_STATE_CODE(); @@ -2841,59 +2873,33 @@ static void bdrv_replace_child_noperm(BdrvChild *child, assert(bdrv_get_aio_context(old_bs) == bdrv_get_aio_context(new_bs)); } - new_bs_quiesce_counter = (new_bs ? new_bs->quiesce_counter : 0); - drain_saldo = new_bs_quiesce_counter - child->parent_quiesce_counter; - - /* - * If the new child node is drained but the old one was not, flush - * all outstanding requests to the old child node. - */ - while (drain_saldo > 0 && child->klass->drained_begin) { - bdrv_parent_drained_begin_single(child, true); - drain_saldo--; - } - + /* TODO Pull this up into the callers to avoid polling here */ + bdrv_graph_wrlock(); if (old_bs) { - /* Detach first so that the recursive drain sections coming from @child - * are already gone and we only end the drain sections that came from - * elsewhere. */ if (child->klass->detach) { child->klass->detach(child); } - assert_bdrv_graph_writable(old_bs); QLIST_REMOVE(child, next_parent); } child->bs = new_bs; if (new_bs) { - assert_bdrv_graph_writable(new_bs); QLIST_INSERT_HEAD(&new_bs->parents, child, next_parent); - - /* - * Detaching the old node may have led to the new node's - * quiesce_counter having been decreased. Not a problem, we - * just need to recognize this here and then invoke - * drained_end appropriately more often. - */ - assert(new_bs->quiesce_counter <= new_bs_quiesce_counter); - drain_saldo += new_bs->quiesce_counter - new_bs_quiesce_counter; - - /* Attach only after starting new drained sections, so that recursive - * drain sections coming from @child don't get an extra .drained_begin - * callback. */ if (child->klass->attach) { child->klass->attach(child); } } + bdrv_graph_wrunlock(); /* - * If the old child node was drained but the new one is not, allow - * requests to come in only after the new node has been attached. + * If the parent was drained through this BdrvChild previously, but new_bs + * is not drained, allow requests to come in only after the new node has + * been attached. */ - while (drain_saldo < 0 && child->klass->drained_end) { + new_bs_quiesce_counter = (new_bs ? new_bs->quiesce_counter : 0); + if (!new_bs_quiesce_counter && child->quiesced_parent) { bdrv_parent_drained_end_single(child); - drain_saldo++; } } @@ -3026,6 +3032,24 @@ static BdrvChild *bdrv_attach_child_common(BlockDriverState *child_bs, } bdrv_ref(child_bs); + /* + * Let every new BdrvChild start with a drained parent. Inserting the child + * in the graph with bdrv_replace_child_noperm() will undrain it if + * @child_bs is not drained. + * + * The child was only just created and is not yet visible in global state + * until bdrv_replace_child_noperm() inserts it into the graph, so nobody + * could have sent requests and polling is not necessary. + * + * Note that this means that the parent isn't fully drained yet, we only + * stop new requests from coming in. This is fine, we don't care about the + * old requests here, they are not for this child. If another place enters a + * drain section for the same parent, but wants it to be fully quiesced, it + * will not run most of the the code in .drained_begin() again (which is not + * a problem, we already did this), but it will still poll until the parent + * is fully quiesced, so it will not be negatively affected either. + */ + bdrv_parent_drained_begin_single(new_child); bdrv_replace_child_noperm(new_child, child_bs); BdrvAttachChildCommonState *s = g_new(BdrvAttachChildCommonState, 1); @@ -3070,30 +3094,6 @@ static BdrvChild *bdrv_attach_child_noperm(BlockDriverState *parent_bs, tran, errp); } -static void bdrv_detach_child(BdrvChild *child) -{ - BlockDriverState *old_bs = child->bs; - - GLOBAL_STATE_CODE(); - bdrv_replace_child_noperm(child, NULL); - bdrv_child_free(child); - - if (old_bs) { - /* - * Update permissions for old node. We're just taking a parent away, so - * we're loosening restrictions. Errors of permission update are not - * fatal in this case, ignore them. - */ - bdrv_refresh_perms(old_bs, NULL); - - /* - * When the parent requiring a non-default AioContext is removed, the - * node moves back to the main AioContext - */ - bdrv_try_change_aio_context(old_bs, qemu_get_aio_context(), NULL, NULL); - } -} - /* * This function steals the reference to child_bs from the caller. * That reference is later dropped by bdrv_root_unref_child(). @@ -3125,7 +3125,7 @@ BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs, goto out; } - ret = bdrv_refresh_perms(child_bs, errp); + ret = bdrv_refresh_perms(child_bs, tran, errp); out: tran_finalize(tran, ret); @@ -3166,7 +3166,7 @@ BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs, goto out; } - ret = bdrv_refresh_perms(parent_bs, errp); + ret = bdrv_refresh_perms(parent_bs, tran, errp); if (ret < 0) { goto out; } @@ -3182,12 +3182,28 @@ out: /* Callers must ensure that child->frozen is false. */ void bdrv_root_unref_child(BdrvChild *child) { - BlockDriverState *child_bs; + BlockDriverState *child_bs = child->bs; GLOBAL_STATE_CODE(); + bdrv_replace_child_noperm(child, NULL); + bdrv_child_free(child); + + if (child_bs) { + /* + * Update permissions for old node. We're just taking a parent away, so + * we're loosening restrictions. Errors of permission update are not + * fatal in this case, ignore them. + */ + bdrv_refresh_perms(child_bs, NULL, NULL); + + /* + * When the parent requiring a non-default AioContext is removed, the + * node moves back to the main AioContext + */ + bdrv_try_change_aio_context(child_bs, qemu_get_aio_context(), NULL, + NULL); + } - child_bs = child->bs; - bdrv_detach_child(child); bdrv_unref(child_bs); } @@ -3406,24 +3422,35 @@ static int bdrv_set_backing_noperm(BlockDriverState *bs, return bdrv_set_file_or_backing_noperm(bs, backing_hd, true, tran, errp); } -int bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd, - Error **errp) +int bdrv_set_backing_hd_drained(BlockDriverState *bs, + BlockDriverState *backing_hd, + Error **errp) { int ret; Transaction *tran = tran_new(); GLOBAL_STATE_CODE(); - bdrv_drained_begin(bs); + assert(bs->quiesce_counter > 0); ret = bdrv_set_backing_noperm(bs, backing_hd, tran, errp); if (ret < 0) { goto out; } - ret = bdrv_refresh_perms(bs, errp); + ret = bdrv_refresh_perms(bs, tran, errp); out: tran_finalize(tran, ret); + return ret; +} + +int bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd, + Error **errp) +{ + int ret; + GLOBAL_STATE_CODE(); + bdrv_drained_begin(bs); + ret = bdrv_set_backing_hd_drained(bs, backing_hd, errp); bdrv_drained_end(bs); return ret; @@ -4153,7 +4180,9 @@ static bool bdrv_recurse_has_child(BlockDriverState *bs, * returns a pointer to bs_queue, which is either the newly allocated * bs_queue, or the existing bs_queue being used. * - * bs must be drained between bdrv_reopen_queue() and bdrv_reopen_multiple(). + * bs is drained here and undrained by bdrv_reopen_queue_free(). + * + * To be called with bs->aio_context locked. */ static BlockReopenQueue *bdrv_reopen_queue_child(BlockReopenQueue *bs_queue, BlockDriverState *bs, @@ -4173,12 +4202,10 @@ static BlockReopenQueue *bdrv_reopen_queue_child(BlockReopenQueue *bs_queue, int flags; QemuOpts *opts; - /* Make sure that the caller remembered to use a drained section. This is - * important to avoid graph changes between the recursive queuing here and - * bdrv_reopen_multiple(). */ - assert(bs->quiesce_counter > 0); GLOBAL_STATE_CODE(); + bdrv_drained_begin(bs); + if (bs_queue == NULL) { bs_queue = g_new0(BlockReopenQueue, 1); QTAILQ_INIT(bs_queue); @@ -4312,6 +4339,7 @@ static BlockReopenQueue *bdrv_reopen_queue_child(BlockReopenQueue *bs_queue, return bs_queue; } +/* To be called with bs->aio_context locked */ BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue, BlockDriverState *bs, QDict *options, bool keep_old_opts) @@ -4328,6 +4356,12 @@ void bdrv_reopen_queue_free(BlockReopenQueue *bs_queue) if (bs_queue) { BlockReopenQueueEntry *bs_entry, *next; QTAILQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) { + AioContext *ctx = bdrv_get_aio_context(bs_entry->state.bs); + + aio_context_acquire(ctx); + bdrv_drained_end(bs_entry->state.bs); + aio_context_release(ctx); + qobject_unref(bs_entry->state.explicit_options); qobject_unref(bs_entry->state.options); g_free(bs_entry); @@ -4361,7 +4395,6 @@ int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp) BlockReopenQueueEntry *bs_entry, *next; AioContext *ctx; Transaction *tran = tran_new(); - g_autoptr(GHashTable) found = NULL; g_autoptr(GSList) refresh_list = NULL; assert(qemu_get_current_aio_context() == qemu_get_aio_context()); @@ -4391,18 +4424,15 @@ int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp) bs_entry->prepared = true; } - found = g_hash_table_new(NULL, NULL); QTAILQ_FOREACH(bs_entry, bs_queue, entry) { BDRVReopenState *state = &bs_entry->state; - refresh_list = bdrv_topological_dfs(refresh_list, found, state->bs); + refresh_list = g_slist_prepend(refresh_list, state->bs); if (state->old_backing_bs) { - refresh_list = bdrv_topological_dfs(refresh_list, found, - state->old_backing_bs); + refresh_list = g_slist_prepend(refresh_list, state->old_backing_bs); } if (state->old_file_bs) { - refresh_list = bdrv_topological_dfs(refresh_list, found, - state->old_file_bs); + refresh_list = g_slist_prepend(refresh_list, state->old_file_bs); } } @@ -4475,18 +4505,16 @@ int bdrv_reopen(BlockDriverState *bs, QDict *opts, bool keep_old_opts, GLOBAL_STATE_CODE(); - bdrv_subtree_drained_begin(bs); + queue = bdrv_reopen_queue(NULL, bs, opts, keep_old_opts); + if (ctx != qemu_get_aio_context()) { aio_context_release(ctx); } - - queue = bdrv_reopen_queue(NULL, bs, opts, keep_old_opts); ret = bdrv_reopen_multiple(queue, errp); if (ctx != qemu_get_aio_context()) { aio_context_acquire(ctx); } - bdrv_subtree_drained_end(bs); return ret; } @@ -5067,23 +5095,24 @@ static void bdrv_remove_child(BdrvChild *child, Transaction *tran) } if (child->bs) { + BlockDriverState *bs = child->bs; + bdrv_drained_begin(bs); bdrv_replace_child_tran(child, NULL, tran); + bdrv_drained_end(bs); } tran_add(tran, &bdrv_remove_child_drv, child); } -/* - * A function to remove backing-chain child of @bs if exists: cow child for - * format nodes (always .backing) and filter child for filters (may be .file or - * .backing) - */ -static void bdrv_remove_filter_or_cow_child(BlockDriverState *bs, - Transaction *tran) +static void undrain_on_clean_cb(void *opaque) { - bdrv_remove_child(bdrv_filter_or_cow_child(bs), tran); + bdrv_drained_end(opaque); } +static TransactionActionDrv undrain_on_clean = { + .clean = undrain_on_clean_cb, +}; + static int bdrv_replace_node_noperm(BlockDriverState *from, BlockDriverState *to, bool auto_skip, Transaction *tran, @@ -5093,6 +5122,11 @@ static int bdrv_replace_node_noperm(BlockDriverState *from, GLOBAL_STATE_CODE(); + bdrv_drained_begin(from); + bdrv_drained_begin(to); + tran_add(tran, &undrain_on_clean, from); + tran_add(tran, &undrain_on_clean, to); + QLIST_FOREACH_SAFE(c, &from->parents, next_parent, next) { assert(c->bs == from); if (!should_update_child(c, to)) { @@ -5130,7 +5164,6 @@ static int bdrv_replace_node_common(BlockDriverState *from, Error **errp) { Transaction *tran = tran_new(); - g_autoptr(GHashTable) found = NULL; g_autoptr(GSList) refresh_list = NULL; BlockDriverState *to_cow_parent = NULL; int ret; @@ -5168,13 +5201,11 @@ static int bdrv_replace_node_common(BlockDriverState *from, } if (detach_subchain) { - bdrv_remove_filter_or_cow_child(to_cow_parent, tran); + bdrv_remove_child(bdrv_filter_or_cow_child(to_cow_parent), tran); } - found = g_hash_table_new(NULL, NULL); - - refresh_list = bdrv_topological_dfs(refresh_list, found, to); - refresh_list = bdrv_topological_dfs(refresh_list, found, from); + refresh_list = g_slist_prepend(refresh_list, to); + refresh_list = g_slist_prepend(refresh_list, from); ret = bdrv_list_refresh_perms(refresh_list, NULL, tran, errp); if (ret < 0) { @@ -5244,7 +5275,7 @@ int bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top, goto out; } - ret = bdrv_refresh_perms(bs_new, errp); + ret = bdrv_refresh_perms(bs_new, tran, errp); out: tran_finalize(tran, ret); @@ -5259,7 +5290,6 @@ int bdrv_replace_child_bs(BdrvChild *child, BlockDriverState *new_bs, { int ret; Transaction *tran = tran_new(); - g_autoptr(GHashTable) found = NULL; g_autoptr(GSList) refresh_list = NULL; BlockDriverState *old_bs = child->bs; @@ -5271,9 +5301,8 @@ int bdrv_replace_child_bs(BdrvChild *child, BlockDriverState *new_bs, bdrv_replace_child_tran(child, new_bs, tran); - found = g_hash_table_new(NULL, NULL); - refresh_list = bdrv_topological_dfs(refresh_list, found, old_bs); - refresh_list = bdrv_topological_dfs(refresh_list, found, new_bs); + refresh_list = g_slist_prepend(refresh_list, old_bs); + refresh_list = g_slist_prepend(refresh_list, new_bs); ret = bdrv_list_refresh_perms(refresh_list, NULL, tran, errp); @@ -5373,6 +5402,7 @@ int coroutine_fn bdrv_co_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix) { IO_CODE(); + assert_bdrv_graph_readable(); if (bs->drv == NULL) { return -ENOMEDIUM; } @@ -5595,7 +5625,7 @@ int bdrv_drop_intermediate(BlockDriverState *top, BlockDriverState *base, GLOBAL_STATE_CODE(); bdrv_ref(top); - bdrv_subtree_drained_begin(top); + bdrv_drained_begin(base); if (!top->drv || !base->drv) { goto exit; @@ -5668,7 +5698,7 @@ int bdrv_drop_intermediate(BlockDriverState *top, BlockDriverState *base, ret = 0; exit: - bdrv_subtree_drained_end(top); + bdrv_drained_end(base); bdrv_unref(top); return ret; } @@ -6544,7 +6574,7 @@ int bdrv_activate(BlockDriverState *bs, Error **errp) */ if (bs->open_flags & BDRV_O_INACTIVE) { bs->open_flags &= ~BDRV_O_INACTIVE; - ret = bdrv_refresh_perms(bs, errp); + ret = bdrv_refresh_perms(bs, NULL, errp); if (ret < 0) { bs->open_flags |= BDRV_O_INACTIVE; return ret; @@ -6588,6 +6618,7 @@ int coroutine_fn bdrv_co_invalidate_cache(BlockDriverState *bs, Error **errp) IO_CODE(); assert(!(bs->open_flags & BDRV_O_INACTIVE)); + assert_bdrv_graph_readable(); if (bs->drv->bdrv_co_invalidate_cache) { bs->drv->bdrv_co_invalidate_cache(bs, &local_err); @@ -6689,7 +6720,7 @@ static int bdrv_inactivate_recurse(BlockDriverState *bs) * We only tried to loosen restrictions, so errors are not fatal, ignore * them. */ - bdrv_refresh_perms(bs, NULL); + bdrv_refresh_perms(bs, NULL, NULL); /* Recursively inactivate children */ QLIST_FOREACH(child, &bs->children, next) { @@ -6894,6 +6925,10 @@ bool bdrv_op_blocker_is_empty(BlockDriverState *bs) return true; } +/* + * Must not be called while holding the lock of an AioContext other than the + * current one. + */ void bdrv_img_create(const char *filename, const char *fmt, const char *base_filename, const char *base_fmt, char *options, uint64_t img_size, int flags, bool quiet, @@ -7181,7 +7216,6 @@ static void bdrv_detach_aio_context(BlockDriverState *bs) if (bs->quiesce_counter) { aio_enable_external(bs->aio_context); } - assert_bdrv_graph_writable(bs); bs->aio_context = NULL; } @@ -7195,7 +7229,6 @@ static void bdrv_attach_aio_context(BlockDriverState *bs, aio_disable_external(new_context); } - assert_bdrv_graph_writable(bs); bs->aio_context = new_context; if (bs->drv && bs->drv->bdrv_attach_aio_context) { @@ -7276,7 +7309,6 @@ static void bdrv_set_aio_context_commit(void *opaque) BlockDriverState *bs = (BlockDriverState *) state->bs; AioContext *new_context = state->new_ctx; AioContext *old_context = bdrv_get_aio_context(bs); - assert_bdrv_graph_writable(bs); /* * Take the old AioContex when detaching it from bs. diff --git a/block/block-backend.c b/block/block-backend.c index bf0ea3cfed..ba7bf1d6bc 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -129,7 +129,7 @@ static void blk_root_inherit_options(BdrvChildRole role, bool parent_is_format, } static void blk_root_drained_begin(BdrvChild *child); static bool blk_root_drained_poll(BdrvChild *child); -static void blk_root_drained_end(BdrvChild *child, int *drained_end_counter); +static void blk_root_drained_end(BdrvChild *child); static void blk_root_change_media(BdrvChild *child, bool load); static void blk_root_resize(BdrvChild *child); @@ -1424,6 +1424,27 @@ int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset, return blk_co_pwritev_part(blk, offset, bytes, qiov, 0, flags); } +int coroutine_fn blk_co_block_status_above(BlockBackend *blk, + BlockDriverState *base, + int64_t offset, int64_t bytes, + int64_t *pnum, int64_t *map, + BlockDriverState **file) +{ + IO_CODE(); + return bdrv_co_block_status_above(blk_bs(blk), base, offset, bytes, pnum, + map, file); +} + +int coroutine_fn blk_co_is_allocated_above(BlockBackend *blk, + BlockDriverState *base, + bool include_base, int64_t offset, + int64_t bytes, int64_t *pnum) +{ + IO_CODE(); + return bdrv_co_is_allocated_above(blk_bs(blk), base, include_base, offset, + bytes, pnum); +} + typedef struct BlkRwCo { BlockBackend *blk; int64_t offset; @@ -2556,7 +2577,7 @@ static bool blk_root_drained_poll(BdrvChild *child) return busy || !!blk->in_flight; } -static void blk_root_drained_end(BdrvChild *child, int *drained_end_counter) +static void blk_root_drained_end(BdrvChild *child) { BlockBackend *blk = child->opaque; assert(blk->quiesce_counter); diff --git a/block/block-copy.c b/block/block-copy.c index bb947afdda..5e59d6262f 100644 --- a/block/block-copy.c +++ b/block/block-copy.c @@ -577,8 +577,9 @@ static coroutine_fn int block_copy_task_entry(AioTask *task) return ret; } -static int block_copy_block_status(BlockCopyState *s, int64_t offset, - int64_t bytes, int64_t *pnum) +static coroutine_fn int block_copy_block_status(BlockCopyState *s, + int64_t offset, + int64_t bytes, int64_t *pnum) { int64_t num; BlockDriverState *base; @@ -590,8 +591,8 @@ static int block_copy_block_status(BlockCopyState *s, int64_t offset, base = NULL; } - ret = bdrv_block_status_above(s->source->bs, base, offset, bytes, &num, - NULL, NULL); + ret = bdrv_co_block_status_above(s->source->bs, base, offset, bytes, &num, + NULL, NULL); if (ret < 0 || num < s->cluster_size) { /* * On error or if failed to obtain large enough chunk just fallback to @@ -613,8 +614,9 @@ static int block_copy_block_status(BlockCopyState *s, int64_t offset, * Check if the cluster starting at offset is allocated or not. * return via pnum the number of contiguous clusters sharing this allocation. */ -static int block_copy_is_cluster_allocated(BlockCopyState *s, int64_t offset, - int64_t *pnum) +static int coroutine_fn block_copy_is_cluster_allocated(BlockCopyState *s, + int64_t offset, + int64_t *pnum) { BlockDriverState *bs = s->source->bs; int64_t count, total_count = 0; @@ -624,7 +626,7 @@ static int block_copy_is_cluster_allocated(BlockCopyState *s, int64_t offset, assert(QEMU_IS_ALIGNED(offset, s->cluster_size)); while (true) { - ret = bdrv_is_allocated(bs, offset, bytes, &count); + ret = bdrv_co_is_allocated(bs, offset, bytes, &count); if (ret < 0) { return ret; } @@ -669,8 +671,9 @@ void block_copy_reset(BlockCopyState *s, int64_t offset, int64_t bytes) * @return 0 when the cluster at @offset was unallocated, * 1 otherwise, and -ret on error. */ -int64_t block_copy_reset_unallocated(BlockCopyState *s, - int64_t offset, int64_t *count) +int64_t coroutine_fn block_copy_reset_unallocated(BlockCopyState *s, + int64_t offset, + int64_t *count) { int ret; int64_t clusters, bytes; diff --git a/block/block-gen.h b/block/block-gen.h index f80cf4897d..89b7daaa1f 100644 --- a/block/block-gen.h +++ b/block/block-gen.h @@ -30,20 +30,17 @@ /* Base structure for argument packing structures */ typedef struct BdrvPollCo { - BlockDriverState *bs; + AioContext *ctx; bool in_progress; - int ret; Coroutine *co; /* Keep pointer here for debugging */ } BdrvPollCo; -static inline int bdrv_poll_co(BdrvPollCo *s) +static inline void bdrv_poll_co(BdrvPollCo *s) { assert(!qemu_in_coroutine()); - bdrv_coroutine_enter(s->bs, s->co); - BDRV_POLL_WHILE(s->bs, s->in_progress); - - return s->ret; + aio_co_enter(s->ctx, s->co); + AIO_WAIT_WHILE(s->ctx, s->in_progress); } #endif /* BLOCK_BLOCK_GEN_H */ diff --git a/block/commit.c b/block/commit.c index 0029b31944..b346341767 100644 --- a/block/commit.c +++ b/block/commit.c @@ -155,8 +155,8 @@ static int coroutine_fn commit_run(Job *job, Error **errp) break; } /* Copy if allocated above the base */ - ret = bdrv_is_allocated_above(blk_bs(s->top), s->base_overlay, true, - offset, COMMIT_BUFFER_SIZE, &n); + ret = blk_co_is_allocated_above(s->top, s->base_overlay, true, + offset, COMMIT_BUFFER_SIZE, &n); copy = (ret > 0); trace_commit_one_iteration(s, offset, n, ret); if (copy) { diff --git a/block/coroutines.h b/block/coroutines.h index 3a2bad564f..2a1e0b3c9d 100644 --- a/block/coroutines.h +++ b/block/coroutines.h @@ -37,9 +37,11 @@ * the I/O API. */ -int coroutine_fn bdrv_co_check(BlockDriverState *bs, - BdrvCheckResult *res, BdrvCheckMode fix); -int coroutine_fn bdrv_co_invalidate_cache(BlockDriverState *bs, Error **errp); +int coroutine_fn GRAPH_RDLOCK +bdrv_co_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix); + +int coroutine_fn GRAPH_RDLOCK +bdrv_co_invalidate_cache(BlockDriverState *bs, Error **errp); int coroutine_fn bdrv_co_common_block_status_above(BlockDriverState *bs, @@ -53,10 +55,11 @@ bdrv_co_common_block_status_above(BlockDriverState *bs, BlockDriverState **file, int *depth); -int coroutine_fn bdrv_co_readv_vmstate(BlockDriverState *bs, - QEMUIOVector *qiov, int64_t pos); -int coroutine_fn bdrv_co_writev_vmstate(BlockDriverState *bs, - QEMUIOVector *qiov, int64_t pos); +int coroutine_fn GRAPH_RDLOCK +bdrv_co_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos); + +int coroutine_fn GRAPH_RDLOCK +bdrv_co_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos); int coroutine_fn nbd_co_do_establish_connection(BlockDriverState *bs, bool blocking, @@ -71,7 +74,7 @@ nbd_co_do_establish_connection(BlockDriverState *bs, bool blocking, * the "I/O or GS" API. */ -int generated_co_wrapper +int co_wrapper_mixed_bdrv_rdlock bdrv_common_block_status_above(BlockDriverState *bs, BlockDriverState *base, bool include_base, @@ -82,7 +85,7 @@ bdrv_common_block_status_above(BlockDriverState *bs, int64_t *map, BlockDriverState **file, int *depth); -int generated_co_wrapper +int co_wrapper_mixed nbd_do_establish_connection(BlockDriverState *bs, bool blocking, Error **errp); #endif /* BLOCK_COROUTINES_H */ diff --git a/block/crypto.c b/block/crypto.c index 2fb8add458..bbeb9f437c 100644 --- a/block/crypto.c +++ b/block/crypto.c @@ -703,7 +703,7 @@ static int coroutine_fn block_crypto_co_create_opts_luks(BlockDriver *drv, } /* Create protocol layer */ - ret = bdrv_create_file(filename, opts, errp); + ret = bdrv_co_create_file(filename, opts, errp); if (ret < 0) { goto fail; } diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c index 9c39550698..956feeb2ae 100644 --- a/block/dirty-bitmap.c +++ b/block/dirty-bitmap.c @@ -388,7 +388,7 @@ void bdrv_release_named_dirty_bitmaps(BlockDriverState *bs) * not fail. * This function doesn't release corresponding BdrvDirtyBitmap. */ -static int coroutine_fn +int coroutine_fn bdrv_co_remove_persistent_dirty_bitmap(BlockDriverState *bs, const char *name, Error **errp) { @@ -399,45 +399,6 @@ bdrv_co_remove_persistent_dirty_bitmap(BlockDriverState *bs, const char *name, return 0; } -typedef struct BdrvRemovePersistentDirtyBitmapCo { - BlockDriverState *bs; - const char *name; - Error **errp; - int ret; -} BdrvRemovePersistentDirtyBitmapCo; - -static void coroutine_fn -bdrv_co_remove_persistent_dirty_bitmap_entry(void *opaque) -{ - BdrvRemovePersistentDirtyBitmapCo *s = opaque; - - s->ret = bdrv_co_remove_persistent_dirty_bitmap(s->bs, s->name, s->errp); - aio_wait_kick(); -} - -int bdrv_remove_persistent_dirty_bitmap(BlockDriverState *bs, const char *name, - Error **errp) -{ - if (qemu_in_coroutine()) { - return bdrv_co_remove_persistent_dirty_bitmap(bs, name, errp); - } else { - Coroutine *co; - BdrvRemovePersistentDirtyBitmapCo s = { - .bs = bs, - .name = name, - .errp = errp, - .ret = -EINPROGRESS, - }; - - co = qemu_coroutine_create(bdrv_co_remove_persistent_dirty_bitmap_entry, - &s); - bdrv_coroutine_enter(bs, co); - BDRV_POLL_WHILE(bs, s.ret == -EINPROGRESS); - - return s.ret; - } -} - bool bdrv_supports_persistent_dirty_bitmap(BlockDriverState *bs) { @@ -447,7 +408,7 @@ bdrv_supports_persistent_dirty_bitmap(BlockDriverState *bs) return false; } -static bool coroutine_fn +bool coroutine_fn bdrv_co_can_store_new_dirty_bitmap(BlockDriverState *bs, const char *name, uint32_t granularity, Error **errp) { @@ -470,51 +431,6 @@ bdrv_co_can_store_new_dirty_bitmap(BlockDriverState *bs, const char *name, return drv->bdrv_co_can_store_new_dirty_bitmap(bs, name, granularity, errp); } -typedef struct BdrvCanStoreNewDirtyBitmapCo { - BlockDriverState *bs; - const char *name; - uint32_t granularity; - Error **errp; - bool ret; - - bool in_progress; -} BdrvCanStoreNewDirtyBitmapCo; - -static void coroutine_fn bdrv_co_can_store_new_dirty_bitmap_entry(void *opaque) -{ - BdrvCanStoreNewDirtyBitmapCo *s = opaque; - - s->ret = bdrv_co_can_store_new_dirty_bitmap(s->bs, s->name, s->granularity, - s->errp); - s->in_progress = false; - aio_wait_kick(); -} - -bool bdrv_can_store_new_dirty_bitmap(BlockDriverState *bs, const char *name, - uint32_t granularity, Error **errp) -{ - IO_CODE(); - if (qemu_in_coroutine()) { - return bdrv_co_can_store_new_dirty_bitmap(bs, name, granularity, errp); - } else { - Coroutine *co; - BdrvCanStoreNewDirtyBitmapCo s = { - .bs = bs, - .name = name, - .granularity = granularity, - .errp = errp, - .in_progress = true, - }; - - co = qemu_coroutine_create(bdrv_co_can_store_new_dirty_bitmap_entry, - &s); - bdrv_coroutine_enter(bs, co); - BDRV_POLL_WHILE(bs, s.in_progress); - - return s.ret; - } -} - void bdrv_disable_dirty_bitmap(BdrvDirtyBitmap *bitmap) { bdrv_dirty_bitmaps_lock(bitmap->bs); diff --git a/block/graph-lock.c b/block/graph-lock.c new file mode 100644 index 0000000000..454c31e691 --- /dev/null +++ b/block/graph-lock.c @@ -0,0 +1,275 @@ +/* + * Graph lock: rwlock to protect block layer graph manipulations (add/remove + * edges and nodes) + * + * Copyright (c) 2022 Red Hat + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "qemu/osdep.h" +#include "qemu/main-loop.h" +#include "block/graph-lock.h" +#include "block/block.h" +#include "block/block_int.h" + +/* Dummy lock object to use for Thread Safety Analysis (TSA) */ +BdrvGraphLock graph_lock; + +/* Protects the list of aiocontext and orphaned_reader_count */ +static QemuMutex aio_context_list_lock; + +/* Written and read with atomic operations. */ +static int has_writer; + +/* + * A reader coroutine could move from an AioContext to another. + * If this happens, there is no problem from the point of view of + * counters. The problem is that the total count becomes + * unbalanced if one of the two AioContexts gets deleted. + * The count of readers must remain correct, so the AioContext's + * balance is transferred to this glboal variable. + * Protected by aio_context_list_lock. + */ +static uint32_t orphaned_reader_count; + +/* Queue of readers waiting for the writer to finish */ +static CoQueue reader_queue; + +struct BdrvGraphRWlock { + /* How many readers are currently reading the graph. */ + uint32_t reader_count; + + /* + * List of BdrvGraphRWlock kept in graph-lock.c + * Protected by aio_context_list_lock + */ + QTAILQ_ENTRY(BdrvGraphRWlock) next_aio; +}; + +/* + * List of BdrvGraphRWlock. This list ensures that each BdrvGraphRWlock + * can safely modify only its own counter, avoid reading/writing + * others and thus improving performances by avoiding cacheline bounces. + */ +static QTAILQ_HEAD(, BdrvGraphRWlock) aio_context_list = + QTAILQ_HEAD_INITIALIZER(aio_context_list); + +static void __attribute__((__constructor__)) bdrv_init_graph_lock(void) +{ + qemu_mutex_init(&aio_context_list_lock); + qemu_co_queue_init(&reader_queue); +} + +void register_aiocontext(AioContext *ctx) +{ + ctx->bdrv_graph = g_new0(BdrvGraphRWlock, 1); + QEMU_LOCK_GUARD(&aio_context_list_lock); + assert(ctx->bdrv_graph->reader_count == 0); + QTAILQ_INSERT_TAIL(&aio_context_list, ctx->bdrv_graph, next_aio); +} + +void unregister_aiocontext(AioContext *ctx) +{ + QEMU_LOCK_GUARD(&aio_context_list_lock); + orphaned_reader_count += ctx->bdrv_graph->reader_count; + QTAILQ_REMOVE(&aio_context_list, ctx->bdrv_graph, next_aio); + g_free(ctx->bdrv_graph); +} + +static uint32_t reader_count(void) +{ + BdrvGraphRWlock *brdv_graph; + uint32_t rd; + + QEMU_LOCK_GUARD(&aio_context_list_lock); + + /* rd can temporarly be negative, but the total will *always* be >= 0 */ + rd = orphaned_reader_count; + QTAILQ_FOREACH(brdv_graph, &aio_context_list, next_aio) { + rd += qatomic_read(&brdv_graph->reader_count); + } + + /* shouldn't overflow unless there are 2^31 readers */ + assert((int32_t)rd >= 0); + return rd; +} + +void bdrv_graph_wrlock(void) +{ + GLOBAL_STATE_CODE(); + assert(!qatomic_read(&has_writer)); + + /* Make sure that constantly arriving new I/O doesn't cause starvation */ + bdrv_drain_all_begin_nopoll(); + + /* + * reader_count == 0: this means writer will read has_reader as 1 + * reader_count >= 1: we don't know if writer read has_writer == 0 or 1, + * but we need to wait. + * Wait by allowing other coroutine (and possible readers) to continue. + */ + do { + /* + * has_writer must be 0 while polling, otherwise we get a deadlock if + * any callback involved during AIO_WAIT_WHILE() tries to acquire the + * reader lock. + */ + qatomic_set(&has_writer, 0); + AIO_WAIT_WHILE(qemu_get_aio_context(), reader_count() >= 1); + qatomic_set(&has_writer, 1); + + /* + * We want to only check reader_count() after has_writer = 1 is visible + * to other threads. That way no more readers can sneak in after we've + * determined reader_count() == 0. + */ + smp_mb(); + } while (reader_count() >= 1); + + bdrv_drain_all_end(); +} + +void bdrv_graph_wrunlock(void) +{ + GLOBAL_STATE_CODE(); + QEMU_LOCK_GUARD(&aio_context_list_lock); + assert(qatomic_read(&has_writer)); + + /* + * No need for memory barriers, this works in pair with + * the slow path of rdlock() and both take the lock. + */ + qatomic_store_release(&has_writer, 0); + + /* Wake up all coroutine that are waiting to read the graph */ + qemu_co_enter_all(&reader_queue, &aio_context_list_lock); +} + +void coroutine_fn bdrv_graph_co_rdlock(void) +{ + BdrvGraphRWlock *bdrv_graph; + bdrv_graph = qemu_get_current_aio_context()->bdrv_graph; + + /* Do not lock if in main thread */ + if (qemu_in_main_thread()) { + return; + } + + for (;;) { + qatomic_set(&bdrv_graph->reader_count, + bdrv_graph->reader_count + 1); + /* make sure writer sees reader_count before we check has_writer */ + smp_mb(); + + /* + * has_writer == 0: this means writer will read reader_count as >= 1 + * has_writer == 1: we don't know if writer read reader_count == 0 + * or > 0, but we need to wait anyways because + * it will write. + */ + if (!qatomic_read(&has_writer)) { + break; + } + + /* + * Synchronize access with reader_count() in bdrv_graph_wrlock(). + * Case 1: + * If this critical section gets executed first, reader_count will + * decrease and the reader will go to sleep. + * Then the writer will read reader_count that does not take into + * account this reader, and if there's no other reader it will + * enter the write section. + * Case 2: + * If reader_count() critical section gets executed first, + * then writer will read reader_count >= 1. + * It will wait in AIO_WAIT_WHILE(), but once it releases the lock + * we will enter this critical section and call aio_wait_kick(). + */ + WITH_QEMU_LOCK_GUARD(&aio_context_list_lock) { + /* + * Additional check when we use the above lock to synchronize + * with bdrv_graph_wrunlock(). + * Case 1: + * If this gets executed first, has_writer is still 1, so we reduce + * reader_count and go to sleep. + * Then the writer will set has_writer to 0 and wake up all readers, + * us included. + * Case 2: + * If bdrv_graph_wrunlock() critical section gets executed first, + * then it will set has_writer to 0 and wake up all other readers. + * Then we execute this critical section, and therefore must check + * again for has_writer, otherwise we sleep without any writer + * actually running. + */ + if (!qatomic_read(&has_writer)) { + return; + } + + /* slow path where reader sleeps */ + bdrv_graph->reader_count--; + aio_wait_kick(); + qemu_co_queue_wait(&reader_queue, &aio_context_list_lock); + } + } +} + +void coroutine_fn bdrv_graph_co_rdunlock(void) +{ + BdrvGraphRWlock *bdrv_graph; + bdrv_graph = qemu_get_current_aio_context()->bdrv_graph; + + /* Do not lock if in main thread */ + if (qemu_in_main_thread()) { + return; + } + + qatomic_store_release(&bdrv_graph->reader_count, + bdrv_graph->reader_count - 1); + /* make sure writer sees reader_count before we check has_writer */ + smp_mb(); + + /* + * has_writer == 0: this means reader will read reader_count decreased + * has_writer == 1: we don't know if writer read reader_count old or + * new. Therefore, kick again so on next iteration + * writer will for sure read the updated value. + */ + if (qatomic_read(&has_writer)) { + aio_wait_kick(); + } +} + +void bdrv_graph_rdlock_main_loop(void) +{ + GLOBAL_STATE_CODE(); + assert(!qemu_in_coroutine()); +} + +void bdrv_graph_rdunlock_main_loop(void) +{ + GLOBAL_STATE_CODE(); + assert(!qemu_in_coroutine()); +} + +void assert_bdrv_graph_readable(void) +{ + assert(qemu_in_main_thread() || reader_count()); +} + +void assert_bdrv_graph_writable(void) +{ + assert(qemu_in_main_thread()); + assert(qatomic_read(&has_writer)); +} diff --git a/block/io.c b/block/io.c index b9424024f9..d87788dfbb 100644 --- a/block/io.c +++ b/block/io.c @@ -45,53 +45,43 @@ static void bdrv_parent_cb_resize(BlockDriverState *bs); static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int64_t bytes, BdrvRequestFlags flags); -static void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore, - bool ignore_bds_parents) +static void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore) { BdrvChild *c, *next; QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) { - if (c == ignore || (ignore_bds_parents && c->klass->parent_is_bds)) { + if (c == ignore) { continue; } - bdrv_parent_drained_begin_single(c, false); - } -} - -static void bdrv_parent_drained_end_single_no_poll(BdrvChild *c, - int *drained_end_counter) -{ - assert(c->parent_quiesce_counter > 0); - c->parent_quiesce_counter--; - if (c->klass->drained_end) { - c->klass->drained_end(c, drained_end_counter); + bdrv_parent_drained_begin_single(c); } } void bdrv_parent_drained_end_single(BdrvChild *c) { - int drained_end_counter = 0; - AioContext *ctx = bdrv_child_get_parent_aio_context(c); IO_OR_GS_CODE(); - bdrv_parent_drained_end_single_no_poll(c, &drained_end_counter); - AIO_WAIT_WHILE(ctx, qatomic_read(&drained_end_counter) > 0); + + assert(c->quiesced_parent); + c->quiesced_parent = false; + + if (c->klass->drained_end) { + c->klass->drained_end(c); + } } -static void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore, - bool ignore_bds_parents, - int *drained_end_counter) +static void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore) { BdrvChild *c; QLIST_FOREACH(c, &bs->parents, next_parent) { - if (c == ignore || (ignore_bds_parents && c->klass->parent_is_bds)) { + if (c == ignore) { continue; } - bdrv_parent_drained_end_single_no_poll(c, drained_end_counter); + bdrv_parent_drained_end_single(c); } } -static bool bdrv_parent_drained_poll_single(BdrvChild *c) +bool bdrv_parent_drained_poll_single(BdrvChild *c) { if (c->klass->drained_poll) { return c->klass->drained_poll(c); @@ -115,17 +105,16 @@ static bool bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore, return busy; } -void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll) +void bdrv_parent_drained_begin_single(BdrvChild *c) { - AioContext *ctx = bdrv_child_get_parent_aio_context(c); IO_OR_GS_CODE(); - c->parent_quiesce_counter++; + + assert(!c->quiesced_parent); + c->quiesced_parent = true; + if (c->klass->drained_begin) { c->klass->drained_begin(c); } - if (poll) { - AIO_WAIT_WHILE(ctx, bdrv_parent_drained_poll_single(c)); - } } static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src) @@ -245,69 +234,14 @@ typedef struct { BlockDriverState *bs; bool done; bool begin; - bool recursive; bool poll; BdrvChild *parent; - bool ignore_bds_parents; - int *drained_end_counter; } BdrvCoDrainData; -static void coroutine_fn bdrv_drain_invoke_entry(void *opaque) -{ - BdrvCoDrainData *data = opaque; - BlockDriverState *bs = data->bs; - - if (data->begin) { - bs->drv->bdrv_co_drain_begin(bs); - } else { - bs->drv->bdrv_co_drain_end(bs); - } - - /* Set data->done and decrement drained_end_counter before bdrv_wakeup() */ - qatomic_mb_set(&data->done, true); - if (!data->begin) { - qatomic_dec(data->drained_end_counter); - } - bdrv_dec_in_flight(bs); - - g_free(data); -} - -/* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */ -static void bdrv_drain_invoke(BlockDriverState *bs, bool begin, - int *drained_end_counter) -{ - BdrvCoDrainData *data; - - if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) || - (!begin && !bs->drv->bdrv_co_drain_end)) { - return; - } - - data = g_new(BdrvCoDrainData, 1); - *data = (BdrvCoDrainData) { - .bs = bs, - .done = false, - .begin = begin, - .drained_end_counter = drained_end_counter, - }; - - if (!begin) { - qatomic_inc(drained_end_counter); - } - - /* Make sure the driver callback completes during the polling phase for - * drain_begin. */ - bdrv_inc_in_flight(bs); - data->co = qemu_coroutine_create(bdrv_drain_invoke_entry, data); - aio_co_schedule(bdrv_get_aio_context(bs), data->co); -} - /* Returns true if BDRV_POLL_WHILE() should go into a blocking aio_poll() */ -bool bdrv_drain_poll(BlockDriverState *bs, bool recursive, - BdrvChild *ignore_parent, bool ignore_bds_parents) +bool bdrv_drain_poll(BlockDriverState *bs, BdrvChild *ignore_parent, + bool ignore_bds_parents) { - BdrvChild *child, *next; IO_OR_GS_CODE(); if (bdrv_parent_drained_poll(bs, ignore_parent, ignore_bds_parents)) { @@ -318,30 +252,18 @@ bool bdrv_drain_poll(BlockDriverState *bs, bool recursive, return true; } - if (recursive) { - assert(!ignore_bds_parents); - QLIST_FOREACH_SAFE(child, &bs->children, next, next) { - if (bdrv_drain_poll(child->bs, recursive, child, false)) { - return true; - } - } - } - return false; } -static bool bdrv_drain_poll_top_level(BlockDriverState *bs, bool recursive, +static bool bdrv_drain_poll_top_level(BlockDriverState *bs, BdrvChild *ignore_parent) { - return bdrv_drain_poll(bs, recursive, ignore_parent, false); + return bdrv_drain_poll(bs, ignore_parent, false); } -static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive, - BdrvChild *parent, bool ignore_bds_parents, +static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent, bool poll); -static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive, - BdrvChild *parent, bool ignore_bds_parents, - int *drained_end_counter); +static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent); static void bdrv_co_drain_bh_cb(void *opaque) { @@ -354,14 +276,10 @@ static void bdrv_co_drain_bh_cb(void *opaque) aio_context_acquire(ctx); bdrv_dec_in_flight(bs); if (data->begin) { - assert(!data->drained_end_counter); - bdrv_do_drained_begin(bs, data->recursive, data->parent, - data->ignore_bds_parents, data->poll); + bdrv_do_drained_begin(bs, data->parent, data->poll); } else { assert(!data->poll); - bdrv_do_drained_end(bs, data->recursive, data->parent, - data->ignore_bds_parents, - data->drained_end_counter); + bdrv_do_drained_end(bs, data->parent); } aio_context_release(ctx); } else { @@ -374,11 +292,9 @@ static void bdrv_co_drain_bh_cb(void *opaque) } static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs, - bool begin, bool recursive, + bool begin, BdrvChild *parent, - bool ignore_bds_parents, - bool poll, - int *drained_end_counter) + bool poll) { BdrvCoDrainData data; Coroutine *self = qemu_coroutine_self(); @@ -394,11 +310,8 @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs, .bs = bs, .done = false, .begin = begin, - .recursive = recursive, .parent = parent, - .ignore_bds_parents = ignore_bds_parents, .poll = poll, - .drained_end_counter = drained_end_counter, }; if (bs) { @@ -429,41 +342,22 @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs, } } -void bdrv_do_drained_begin_quiesce(BlockDriverState *bs, - BdrvChild *parent, bool ignore_bds_parents) -{ - IO_OR_GS_CODE(); - assert(!qemu_in_coroutine()); - - /* Stop things in parent-to-child order */ - if (qatomic_fetch_inc(&bs->quiesce_counter) == 0) { - aio_disable_external(bdrv_get_aio_context(bs)); - } - - bdrv_parent_drained_begin(bs, parent, ignore_bds_parents); - bdrv_drain_invoke(bs, true, NULL); -} - -static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive, - BdrvChild *parent, bool ignore_bds_parents, +static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent, bool poll) { - BdrvChild *child, *next; + IO_OR_GS_CODE(); if (qemu_in_coroutine()) { - bdrv_co_yield_to_drain(bs, true, recursive, parent, ignore_bds_parents, - poll, NULL); + bdrv_co_yield_to_drain(bs, true, parent, poll); return; } - bdrv_do_drained_begin_quiesce(bs, parent, ignore_bds_parents); - - if (recursive) { - assert(!ignore_bds_parents); - bs->recursive_quiesce_counter++; - QLIST_FOREACH_SAFE(child, &bs->children, next, next) { - bdrv_do_drained_begin(child->bs, true, child, ignore_bds_parents, - false); + /* Stop things in parent-to-child order */ + if (qatomic_fetch_inc(&bs->quiesce_counter) == 0) { + aio_disable_external(bdrv_get_aio_context(bs)); + bdrv_parent_drained_begin(bs, parent); + if (bs->drv && bs->drv->bdrv_drain_begin) { + bs->drv->bdrv_drain_begin(bs); } } @@ -477,117 +371,50 @@ static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive, * nodes. */ if (poll) { - assert(!ignore_bds_parents); - BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, recursive, parent)); + BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, parent)); } } -void bdrv_drained_begin(BlockDriverState *bs) +void bdrv_do_drained_begin_quiesce(BlockDriverState *bs, BdrvChild *parent) { - IO_OR_GS_CODE(); - bdrv_do_drained_begin(bs, false, NULL, false, true); + bdrv_do_drained_begin(bs, parent, false); } -void bdrv_subtree_drained_begin(BlockDriverState *bs) +void bdrv_drained_begin(BlockDriverState *bs) { IO_OR_GS_CODE(); - bdrv_do_drained_begin(bs, true, NULL, false, true); + bdrv_do_drained_begin(bs, NULL, true); } /** * This function does not poll, nor must any of its recursively called - * functions. The *drained_end_counter pointee will be incremented - * once for every background operation scheduled, and decremented once - * the operation settles. Therefore, the pointer must remain valid - * until the pointee reaches 0. That implies that whoever sets up the - * pointee has to poll until it is 0. - * - * We use atomic operations to access *drained_end_counter, because - * (1) when called from bdrv_set_aio_context_ignore(), the subgraph of - * @bs may contain nodes in different AioContexts, - * (2) bdrv_drain_all_end() uses the same counter for all nodes, - * regardless of which AioContext they are in. + * functions. */ -static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive, - BdrvChild *parent, bool ignore_bds_parents, - int *drained_end_counter) +static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent) { - BdrvChild *child; int old_quiesce_counter; - assert(drained_end_counter != NULL); - if (qemu_in_coroutine()) { - bdrv_co_yield_to_drain(bs, false, recursive, parent, ignore_bds_parents, - false, drained_end_counter); + bdrv_co_yield_to_drain(bs, false, parent, false); return; } assert(bs->quiesce_counter > 0); /* Re-enable things in child-to-parent order */ - bdrv_drain_invoke(bs, false, drained_end_counter); - bdrv_parent_drained_end(bs, parent, ignore_bds_parents, - drained_end_counter); - old_quiesce_counter = qatomic_fetch_dec(&bs->quiesce_counter); if (old_quiesce_counter == 1) { - aio_enable_external(bdrv_get_aio_context(bs)); - } - - if (recursive) { - assert(!ignore_bds_parents); - bs->recursive_quiesce_counter--; - QLIST_FOREACH(child, &bs->children, next) { - bdrv_do_drained_end(child->bs, true, child, ignore_bds_parents, - drained_end_counter); + if (bs->drv && bs->drv->bdrv_drain_end) { + bs->drv->bdrv_drain_end(bs); } + bdrv_parent_drained_end(bs, parent); + aio_enable_external(bdrv_get_aio_context(bs)); } } void bdrv_drained_end(BlockDriverState *bs) { - int drained_end_counter = 0; - IO_OR_GS_CODE(); - bdrv_do_drained_end(bs, false, NULL, false, &drained_end_counter); - BDRV_POLL_WHILE(bs, qatomic_read(&drained_end_counter) > 0); -} - -void bdrv_drained_end_no_poll(BlockDriverState *bs, int *drained_end_counter) -{ - IO_CODE(); - bdrv_do_drained_end(bs, false, NULL, false, drained_end_counter); -} - -void bdrv_subtree_drained_end(BlockDriverState *bs) -{ - int drained_end_counter = 0; - IO_OR_GS_CODE(); - bdrv_do_drained_end(bs, true, NULL, false, &drained_end_counter); - BDRV_POLL_WHILE(bs, qatomic_read(&drained_end_counter) > 0); -} - -void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent) -{ - int i; - IO_OR_GS_CODE(); - - for (i = 0; i < new_parent->recursive_quiesce_counter; i++) { - bdrv_do_drained_begin(child->bs, true, child, false, true); - } -} - -void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent) -{ - int drained_end_counter = 0; - int i; IO_OR_GS_CODE(); - - for (i = 0; i < old_parent->recursive_quiesce_counter; i++) { - bdrv_do_drained_end(child->bs, true, child, false, - &drained_end_counter); - } - - BDRV_POLL_WHILE(child->bs, qatomic_read(&drained_end_counter) > 0); + bdrv_do_drained_end(bs, NULL); } void bdrv_drain(BlockDriverState *bs) @@ -620,7 +447,7 @@ static bool bdrv_drain_all_poll(void) while ((bs = bdrv_next_all_states(bs))) { AioContext *aio_context = bdrv_get_aio_context(bs); aio_context_acquire(aio_context); - result |= bdrv_drain_poll(bs, false, NULL, true); + result |= bdrv_drain_poll(bs, NULL, true); aio_context_release(aio_context); } @@ -639,16 +466,11 @@ static bool bdrv_drain_all_poll(void) * NOTE: no new block jobs or BlockDriverStates can be created between * the bdrv_drain_all_begin() and bdrv_drain_all_end() calls. */ -void bdrv_drain_all_begin(void) +void bdrv_drain_all_begin_nopoll(void) { BlockDriverState *bs = NULL; GLOBAL_STATE_CODE(); - if (qemu_in_coroutine()) { - bdrv_co_yield_to_drain(NULL, true, false, NULL, true, true, NULL); - return; - } - /* * bdrv queue is managed by record/replay, * waiting for finishing the I/O requests may @@ -670,9 +492,21 @@ void bdrv_drain_all_begin(void) AioContext *aio_context = bdrv_get_aio_context(bs); aio_context_acquire(aio_context); - bdrv_do_drained_begin(bs, false, NULL, true, false); + bdrv_do_drained_begin(bs, NULL, false); aio_context_release(aio_context); } +} + +void bdrv_drain_all_begin(void) +{ + BlockDriverState *bs = NULL; + + if (qemu_in_coroutine()) { + bdrv_co_yield_to_drain(NULL, true, NULL, true); + return; + } + + bdrv_drain_all_begin_nopoll(); /* Now poll the in-flight requests */ AIO_WAIT_WHILE(NULL, bdrv_drain_all_poll()); @@ -684,22 +518,19 @@ void bdrv_drain_all_begin(void) void bdrv_drain_all_end_quiesce(BlockDriverState *bs) { - int drained_end_counter = 0; GLOBAL_STATE_CODE(); g_assert(bs->quiesce_counter > 0); g_assert(!bs->refcnt); while (bs->quiesce_counter) { - bdrv_do_drained_end(bs, false, NULL, true, &drained_end_counter); + bdrv_do_drained_end(bs, NULL); } - BDRV_POLL_WHILE(bs, qatomic_read(&drained_end_counter) > 0); } void bdrv_drain_all_end(void) { BlockDriverState *bs = NULL; - int drained_end_counter = 0; GLOBAL_STATE_CODE(); /* @@ -715,13 +546,11 @@ void bdrv_drain_all_end(void) AioContext *aio_context = bdrv_get_aio_context(bs); aio_context_acquire(aio_context); - bdrv_do_drained_end(bs, false, NULL, true, &drained_end_counter); + bdrv_do_drained_end(bs, NULL); aio_context_release(aio_context); } assert(qemu_get_current_aio_context() == qemu_get_aio_context()); - AIO_WAIT_WHILE(NULL, qatomic_read(&drained_end_counter) > 0); - assert(bdrv_drain_all_count > 0); bdrv_drain_all_count--; } @@ -2711,6 +2540,17 @@ bdrv_co_common_block_status_above(BlockDriverState *bs, return ret; } +int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs, + BlockDriverState *base, + int64_t offset, int64_t bytes, + int64_t *pnum, int64_t *map, + BlockDriverState **file) +{ + IO_CODE(); + return bdrv_co_common_block_status_above(bs, base, false, true, offset, + bytes, pnum, map, file, NULL); +} + int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base, int64_t offset, int64_t bytes, int64_t *pnum, int64_t *map, BlockDriverState **file) @@ -2756,6 +2596,22 @@ int coroutine_fn bdrv_co_is_zero_fast(BlockDriverState *bs, int64_t offset, return (pnum == bytes) && (ret & BDRV_BLOCK_ZERO); } +int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t offset, + int64_t bytes, int64_t *pnum) +{ + int ret; + int64_t dummy; + IO_CODE(); + + ret = bdrv_co_common_block_status_above(bs, bs, true, false, offset, + bytes, pnum ? pnum : &dummy, NULL, + NULL, NULL); + if (ret < 0) { + return ret; + } + return !!(ret & BDRV_BLOCK_ALLOCATED); +} + int bdrv_is_allocated(BlockDriverState *bs, int64_t offset, int64_t bytes, int64_t *pnum) { @@ -2772,6 +2628,29 @@ int bdrv_is_allocated(BlockDriverState *bs, int64_t offset, int64_t bytes, return !!(ret & BDRV_BLOCK_ALLOCATED); } +/* See bdrv_is_allocated_above for documentation */ +int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *top, + BlockDriverState *base, + bool include_base, int64_t offset, + int64_t bytes, int64_t *pnum) +{ + int depth; + int ret; + IO_CODE(); + + ret = bdrv_co_common_block_status_above(top, base, include_base, false, + offset, bytes, pnum, NULL, NULL, + &depth); + if (ret < 0) { + return ret; + } + + if (ret & BDRV_BLOCK_ALLOCATED) { + return depth; + } + return 0; +} + /* * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP] * @@ -2795,10 +2674,12 @@ int bdrv_is_allocated_above(BlockDriverState *top, int64_t bytes, int64_t *pnum) { int depth; - int ret = bdrv_common_block_status_above(top, base, include_base, false, - offset, bytes, pnum, NULL, NULL, - &depth); + int ret; IO_CODE(); + + ret = bdrv_common_block_status_above(top, base, include_base, false, + offset, bytes, pnum, NULL, NULL, + &depth); if (ret < 0) { return ret; } @@ -2816,6 +2697,7 @@ bdrv_co_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) BlockDriverState *child_bs = bdrv_primary_bs(bs); int ret; IO_CODE(); + assert_bdrv_graph_readable(); ret = bdrv_check_qiov_request(pos, qiov->size, qiov, 0, NULL); if (ret < 0) { @@ -2848,6 +2730,7 @@ bdrv_co_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) BlockDriverState *child_bs = bdrv_primary_bs(bs); int ret; IO_CODE(); + assert_bdrv_graph_readable(); ret = bdrv_check_qiov_request(pos, qiov->size, qiov, 0, NULL); if (ret < 0) { diff --git a/block/meson.build b/block/meson.build index b7c68b83a3..90011a2805 100644 --- a/block/meson.build +++ b/block/meson.build @@ -10,6 +10,7 @@ block_ss.add(files( 'blkverify.c', 'block-backend.c', 'block-copy.c', + 'graph-lock.c', 'commit.c', 'copy-on-read.c', 'preallocate.c', @@ -137,6 +138,7 @@ block_gen_c = custom_target('block-gen.c', output: 'block-gen.c', input: files( '../include/block/block-io.h', + '../include/block/dirty-bitmap.h', '../include/block/block-global-state.h', '../include/sysemu/block-backend-io.h', 'coroutines.h' diff --git a/block/parallels.c b/block/parallels.c index fa08c1104b..bbea2f2221 100644 --- a/block/parallels.c +++ b/block/parallels.c @@ -646,7 +646,7 @@ static int coroutine_fn parallels_co_create_opts(BlockDriver *drv, } /* Create and open the file (protocol layer) */ - ret = bdrv_create_file(filename, opts, errp); + ret = bdrv_co_create_file(filename, opts, errp); if (ret < 0) { goto done; } diff --git a/block/qcow.c b/block/qcow.c index 8bffc41531..5d99f00411 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -973,7 +973,7 @@ static int coroutine_fn qcow_co_create_opts(BlockDriver *drv, } /* Create and open the file (protocol layer) */ - ret = bdrv_create_file(filename, opts, errp); + ret = bdrv_co_create_file(filename, opts, errp); if (ret < 0) { goto fail; } diff --git a/block/qcow2.c b/block/qcow2.c index 941782a011..bafbd077b9 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -3871,7 +3871,7 @@ static int coroutine_fn qcow2_co_create_opts(BlockDriver *drv, } /* Create and open the file (protocol layer) */ - ret = bdrv_create_file(filename, opts, errp); + ret = bdrv_co_create_file(filename, opts, errp); if (ret < 0) { goto finish; } @@ -3886,7 +3886,7 @@ static int coroutine_fn qcow2_co_create_opts(BlockDriver *drv, /* Create and open an external data file (protocol layer) */ val = qdict_get_try_str(qdict, BLOCK_OPT_DATA_FILE); if (val) { - ret = bdrv_create_file(val, opts, errp); + ret = bdrv_co_create_file(val, opts, errp); if (ret < 0) { goto finish; } diff --git a/block/qed.c b/block/qed.c index 0ab159b468..faa606618e 100644 --- a/block/qed.c +++ b/block/qed.c @@ -262,7 +262,7 @@ static bool coroutine_fn qed_plug_allocating_write_reqs(BDRVQEDState *s) assert(!s->allocating_write_reqs_plugged); if (s->allocating_acb != NULL) { /* Another allocating write came concurrently. This cannot happen - * from bdrv_qed_co_drain_begin, but it can happen when the timer runs. + * from bdrv_qed_drain_begin, but it can happen when the timer runs. */ qemu_co_mutex_unlock(&s->table_lock); return false; @@ -282,9 +282,8 @@ static void coroutine_fn qed_unplug_allocating_write_reqs(BDRVQEDState *s) qemu_co_mutex_unlock(&s->table_lock); } -static void coroutine_fn qed_need_check_timer_entry(void *opaque) +static void coroutine_fn qed_need_check_timer(BDRVQEDState *s) { - BDRVQEDState *s = opaque; int ret; trace_qed_need_check_timer_cb(s); @@ -310,9 +309,20 @@ static void coroutine_fn qed_need_check_timer_entry(void *opaque) (void) ret; } +static void coroutine_fn qed_need_check_timer_entry(void *opaque) +{ + BDRVQEDState *s = opaque; + + qed_need_check_timer(opaque); + bdrv_dec_in_flight(s->bs); +} + static void qed_need_check_timer_cb(void *opaque) { + BDRVQEDState *s = opaque; Coroutine *co = qemu_coroutine_create(qed_need_check_timer_entry, opaque); + + bdrv_inc_in_flight(s->bs); qemu_coroutine_enter(co); } @@ -355,7 +365,7 @@ static void bdrv_qed_attach_aio_context(BlockDriverState *bs, } } -static void coroutine_fn bdrv_qed_co_drain_begin(BlockDriverState *bs) +static void bdrv_qed_drain_begin(BlockDriverState *bs) { BDRVQEDState *s = bs->opaque; @@ -363,8 +373,12 @@ static void coroutine_fn bdrv_qed_co_drain_begin(BlockDriverState *bs) * header is flushed. */ if (s->need_check_timer && timer_pending(s->need_check_timer)) { + Coroutine *co; + qed_cancel_need_check_timer(s); - qed_need_check_timer_entry(s); + co = qemu_coroutine_create(qed_need_check_timer_entry, s); + bdrv_inc_in_flight(bs); + aio_co_enter(bdrv_get_aio_context(bs), co); } } @@ -764,7 +778,7 @@ static int coroutine_fn bdrv_qed_co_create_opts(BlockDriver *drv, } /* Create and open the file (protocol layer) */ - ret = bdrv_create_file(filename, opts, errp); + ret = bdrv_co_create_file(filename, opts, errp); if (ret < 0) { goto fail; } @@ -1647,7 +1661,7 @@ static BlockDriver bdrv_qed = { .bdrv_co_check = bdrv_qed_co_check, .bdrv_detach_aio_context = bdrv_qed_detach_aio_context, .bdrv_attach_aio_context = bdrv_qed_attach_aio_context, - .bdrv_co_drain_begin = bdrv_qed_co_drain_begin, + .bdrv_drain_begin = bdrv_qed_drain_begin, }; static void bdrv_qed_init(void) diff --git a/block/raw-format.c b/block/raw-format.c index a68014ef0b..28905b09ee 100644 --- a/block/raw-format.c +++ b/block/raw-format.c @@ -433,7 +433,7 @@ static int coroutine_fn raw_co_create_opts(BlockDriver *drv, QemuOpts *opts, Error **errp) { - return bdrv_create_file(filename, opts, errp); + return bdrv_co_create_file(filename, opts, errp); } static int raw_open(BlockDriverState *bs, QDict *options, int flags, diff --git a/block/replication.c b/block/replication.c index f1eed25e43..c62f48a874 100644 --- a/block/replication.c +++ b/block/replication.c @@ -374,9 +374,6 @@ static void reopen_backing_file(BlockDriverState *bs, bool writable, s->orig_secondary_read_only = bdrv_is_read_only(secondary_disk->bs); } - bdrv_subtree_drained_begin(hidden_disk->bs); - bdrv_subtree_drained_begin(secondary_disk->bs); - if (s->orig_hidden_read_only) { QDict *opts = qdict_new(); qdict_put_bool(opts, BDRV_OPT_READ_ONLY, !writable); @@ -401,9 +398,6 @@ static void reopen_backing_file(BlockDriverState *bs, bool writable, aio_context_acquire(ctx); } } - - bdrv_subtree_drained_end(hidden_disk->bs); - bdrv_subtree_drained_end(secondary_disk->bs); } static void backup_job_cleanup(BlockDriverState *bs) diff --git a/block/stream.c b/block/stream.c index 694709bd25..8744ad103f 100644 --- a/block/stream.c +++ b/block/stream.c @@ -64,13 +64,16 @@ static int stream_prepare(Job *job) bdrv_cor_filter_drop(s->cor_filter_bs); s->cor_filter_bs = NULL; - bdrv_subtree_drained_begin(s->above_base); + /* + * bdrv_set_backing_hd() requires that unfiltered_bs is drained. Drain + * already here and use bdrv_set_backing_hd_drained() instead because + * the polling during drained_begin() might change the graph, and if we do + * this only later, we may end up working with the wrong base node (or it + * might even have gone away by the time we want to use it). + */ + bdrv_drained_begin(unfiltered_bs); base = bdrv_filter_or_cow_bs(s->above_base); - if (base) { - bdrv_ref(base); - } - unfiltered_base = bdrv_skip_filters(base); if (bdrv_cow_child(unfiltered_bs)) { @@ -82,7 +85,13 @@ static int stream_prepare(Job *job) } } - bdrv_set_backing_hd(unfiltered_bs, base, &local_err); + bdrv_set_backing_hd_drained(unfiltered_bs, base, &local_err); + + /* + * This call will do I/O, so the graph can change again from here on. + * We have already completed the graph change, so we are not in danger + * of operating on the wrong node any more if this happens. + */ ret = bdrv_change_backing_file(unfiltered_bs, base_id, base_fmt, false); if (local_err) { error_report_err(local_err); @@ -92,10 +101,7 @@ static int stream_prepare(Job *job) } out: - if (base) { - bdrv_unref(base); - } - bdrv_subtree_drained_end(s->above_base); + bdrv_drained_end(unfiltered_bs); return ret; } diff --git a/block/throttle.c b/block/throttle.c index 131eba3ab4..88851c84f4 100644 --- a/block/throttle.c +++ b/block/throttle.c @@ -214,7 +214,7 @@ static void throttle_reopen_abort(BDRVReopenState *reopen_state) reopen_state->opaque = NULL; } -static void coroutine_fn throttle_co_drain_begin(BlockDriverState *bs) +static void throttle_drain_begin(BlockDriverState *bs) { ThrottleGroupMember *tgm = bs->opaque; if (qatomic_fetch_inc(&tgm->io_limits_disabled) == 0) { @@ -222,7 +222,7 @@ static void coroutine_fn throttle_co_drain_begin(BlockDriverState *bs) } } -static void coroutine_fn throttle_co_drain_end(BlockDriverState *bs) +static void throttle_drain_end(BlockDriverState *bs) { ThrottleGroupMember *tgm = bs->opaque; assert(tgm->io_limits_disabled); @@ -261,8 +261,8 @@ static BlockDriver bdrv_throttle = { .bdrv_reopen_commit = throttle_reopen_commit, .bdrv_reopen_abort = throttle_reopen_abort, - .bdrv_co_drain_begin = throttle_co_drain_begin, - .bdrv_co_drain_end = throttle_co_drain_end, + .bdrv_drain_begin = throttle_drain_begin, + .bdrv_drain_end = throttle_drain_end, .is_filter = true, .strong_runtime_opts = throttle_strong_runtime_opts, diff --git a/block/vdi.c b/block/vdi.c index c0c111c4b9..479bcfe820 100644 --- a/block/vdi.c +++ b/block/vdi.c @@ -934,7 +934,7 @@ static int coroutine_fn vdi_co_create_opts(BlockDriver *drv, qdict = qemu_opts_to_qdict_filtered(opts, NULL, &vdi_create_opts, true); /* Create and open the file (protocol layer) */ - ret = bdrv_create_file(filename, opts, errp); + ret = bdrv_co_create_file(filename, opts, errp); if (ret < 0) { goto done; } diff --git a/block/vhdx.c b/block/vhdx.c index bad9ca691b..4c929800fe 100644 --- a/block/vhdx.c +++ b/block/vhdx.c @@ -2084,7 +2084,7 @@ static int coroutine_fn vhdx_co_create_opts(BlockDriver *drv, } /* Create and open the file (protocol layer) */ - ret = bdrv_create_file(filename, opts, errp); + ret = bdrv_co_create_file(filename, opts, errp); if (ret < 0) { goto fail; } diff --git a/block/vmdk.c b/block/vmdk.c index bac3d8db50..8894dac2d4 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -2285,15 +2285,16 @@ exit: return ret; } -static int vmdk_create_extent(const char *filename, int64_t filesize, - bool flat, bool compress, bool zeroed_grain, - BlockBackend **pbb, - QemuOpts *opts, Error **errp) +static int coroutine_fn vmdk_create_extent(const char *filename, + int64_t filesize, bool flat, + bool compress, bool zeroed_grain, + BlockBackend **pbb, + QemuOpts *opts, Error **errp) { int ret; BlockBackend *blk = NULL; - ret = bdrv_create_file(filename, opts, errp); + ret = bdrv_co_create_file(filename, opts, errp); if (ret < 0) { goto exit; } @@ -2366,14 +2367,14 @@ static int filename_decompose(const char *filename, char *path, char *prefix, * non-split format. * idx >= 1: get the n-th extent if in a split subformat */ -typedef BlockBackend *(*vmdk_create_extent_fn)(int64_t size, - int idx, - bool flat, - bool split, - bool compress, - bool zeroed_grain, - void *opaque, - Error **errp); +typedef BlockBackend * coroutine_fn (*vmdk_create_extent_fn)(int64_t size, + int idx, + bool flat, + bool split, + bool compress, + bool zeroed_grain, + void *opaque, + Error **errp); static void vmdk_desc_add_extent(GString *desc, const char *extent_line_fmt, @@ -2616,7 +2617,7 @@ typedef struct { QemuOpts *opts; } VMDKCreateOptsData; -static BlockBackend *vmdk_co_create_opts_cb(int64_t size, int idx, +static BlockBackend * coroutine_fn vmdk_co_create_opts_cb(int64_t size, int idx, bool flat, bool split, bool compress, bool zeroed_grain, void *opaque, Error **errp) @@ -2768,10 +2769,11 @@ exit: return ret; } -static BlockBackend *vmdk_co_create_cb(int64_t size, int idx, - bool flat, bool split, bool compress, - bool zeroed_grain, void *opaque, - Error **errp) +static BlockBackend * coroutine_fn vmdk_co_create_cb(int64_t size, int idx, + bool flat, bool split, + bool compress, + bool zeroed_grain, + void *opaque, Error **errp) { int ret; BlockDriverState *bs; diff --git a/block/vpc.c b/block/vpc.c index 95841f259a..6ee95dcb96 100644 --- a/block/vpc.c +++ b/block/vpc.c @@ -1111,7 +1111,7 @@ static int coroutine_fn vpc_co_create_opts(BlockDriver *drv, } /* Create and open the file (protocol layer) */ - ret = bdrv_create_file(filename, opts, errp); + ret = bdrv_co_create_file(filename, opts, errp); if (ret < 0) { goto fail; } diff --git a/blockdev.c b/blockdev.c index 75eef8535e..ebf952cd21 100644 --- a/blockdev.c +++ b/blockdev.c @@ -1507,10 +1507,14 @@ static void external_snapshot_prepare(BlkActionState *common, goto out; } bdrv_refresh_filename(state->old_bs); + + aio_context_release(aio_context); bdrv_img_create(new_image_file, format, state->old_bs->filename, state->old_bs->drv->format_name, NULL, size, flags, false, &local_err); + aio_context_acquire(aio_context); + if (local_err) { error_propagate(errp, local_err); goto out; @@ -3515,8 +3519,6 @@ fail: void qmp_blockdev_reopen(BlockdevOptionsList *reopen_list, Error **errp) { BlockReopenQueue *queue = NULL; - GSList *drained = NULL; - GSList *p; /* Add each one of the BDS that we want to reopen to the queue */ for (; reopen_list != NULL; reopen_list = reopen_list->next) { @@ -3553,9 +3555,7 @@ void qmp_blockdev_reopen(BlockdevOptionsList *reopen_list, Error **errp) ctx = bdrv_get_aio_context(bs); aio_context_acquire(ctx); - bdrv_subtree_drained_begin(bs); queue = bdrv_reopen_queue(queue, bs, qdict, false); - drained = g_slist_prepend(drained, bs); aio_context_release(ctx); } @@ -3566,15 +3566,6 @@ void qmp_blockdev_reopen(BlockdevOptionsList *reopen_list, Error **errp) fail: bdrv_reopen_queue_free(queue); - for (p = drained; p; p = p->next) { - BlockDriverState *bs = p->data; - AioContext *ctx = bdrv_get_aio_context(bs); - - aio_context_acquire(ctx); - bdrv_subtree_drained_end(bs); - aio_context_release(ctx); - } - g_slist_free(drained); } void qmp_blockdev_del(const char *node_name, Error **errp) diff --git a/blockjob.c b/blockjob.c index 3c8f3543a2..b7daf2a9f6 100644 --- a/blockjob.c +++ b/blockjob.c @@ -120,7 +120,7 @@ static bool child_job_drained_poll(BdrvChild *c) } } -static void child_job_drained_end(BdrvChild *c, int *drained_end_counter) +static void child_job_drained_end(BdrvChild *c) { BlockJob *job = c->opaque; job_resume(&job->job); diff --git a/docs/devel/block-coroutine-wrapper.rst b/docs/devel/block-coroutine-wrapper.rst index 412851986b..6dd2cdcab3 100644 --- a/docs/devel/block-coroutine-wrapper.rst +++ b/docs/devel/block-coroutine-wrapper.rst @@ -26,12 +26,12 @@ called ``bdrv_foo(<same args>)``. In this case the script can help. To trigger the generation: 1. You need ``bdrv_foo`` declaration somewhere (for example, in - ``block/coroutines.h``) with the ``generated_co_wrapper`` mark, + ``block/coroutines.h``) with the ``co_wrapper`` mark, like this: .. code-block:: c - int generated_co_wrapper bdrv_foo(<some args>); + int co_wrapper bdrv_foo(<some args>); 2. You need to feed this declaration to block-coroutine-wrapper script. For this, add the .h (or .c) file with the declaration to the @@ -46,7 +46,7 @@ Links 1. The script location is ``scripts/block-coroutine-wrapper.py``. -2. Generic place for private ``generated_co_wrapper`` declarations is +2. Generic place for private ``co_wrapper`` declarations is ``block/coroutines.h``, for public declarations: ``include/block/block.h`` diff --git a/include/block/aio.h b/include/block/aio.h index d128558f1d..0f65a3cc9e 100644 --- a/include/block/aio.h +++ b/include/block/aio.h @@ -22,6 +22,7 @@ #include "qemu/event_notifier.h" #include "qemu/thread.h" #include "qemu/timer.h" +#include "block/graph-lock.h" typedef struct BlockAIOCB BlockAIOCB; typedef void BlockCompletionFunc(void *opaque, int ret); @@ -127,6 +128,14 @@ struct AioContext { /* Used by AioContext users to protect from multi-threaded access. */ QemuRecMutex lock; + /* + * Keep track of readers and writers of the block layer graph. + * This is essential to avoid performing additions and removal + * of nodes and edges from block graph while some + * other thread is traversing it. + */ + BdrvGraphRWlock *bdrv_graph; + /* The list of registered AIO handlers. Protected by ctx->list_lock. */ AioHandlerList aio_handlers; diff --git a/include/block/block-common.h b/include/block/block-common.h index 297704c1e9..4749c46a5e 100644 --- a/include/block/block-common.h +++ b/include/block/block-common.h @@ -29,20 +29,35 @@ #include "qemu/iov.h" #include "qemu/coroutine.h" #include "block/accounting.h" -#include "block/dirty-bitmap.h" -#include "block/blockjob.h" #include "qemu/hbitmap.h" #include "qemu/transactions.h" /* - * generated_co_wrapper + * co_wrapper{*}: Function specifiers used by block-coroutine-wrapper.py * - * Function specifier, which does nothing but mark functions to be + * Function specifiers, which do nothing but mark functions to be * generated by scripts/block-coroutine-wrapper.py * - * Read more in docs/devel/block-coroutine-wrapper.rst + * Usage: read docs/devel/block-coroutine-wrapper.rst + * + * There are 4 kind of specifiers: + * - co_wrapper functions can be called by only non-coroutine context, because + * they always generate a new coroutine. + * - co_wrapper_mixed functions can be called by both coroutine and + * non-coroutine context. + * - co_wrapper_bdrv_rdlock are co_wrapper functions but automatically take and + * release the graph rdlock when creating a new coroutine + * - co_wrapper_mixed_bdrv_rdlock are co_wrapper_mixed functions but + * automatically take and release the graph rdlock when creating a new + * coroutine. */ -#define generated_co_wrapper +#define co_wrapper +#define co_wrapper_mixed +#define co_wrapper_bdrv_rdlock +#define co_wrapper_mixed_bdrv_rdlock + +#include "block/dirty-bitmap.h" +#include "block/blockjob.h" /* block.c */ typedef struct BlockDriver BlockDriver; diff --git a/include/block/block-copy.h b/include/block/block-copy.h index ba0b425d78..8cea4f9b90 100644 --- a/include/block/block-copy.h +++ b/include/block/block-copy.h @@ -36,8 +36,9 @@ void block_copy_set_progress_meter(BlockCopyState *s, ProgressMeter *pm); void block_copy_state_free(BlockCopyState *s); void block_copy_reset(BlockCopyState *s, int64_t offset, int64_t bytes); -int64_t block_copy_reset_unallocated(BlockCopyState *s, - int64_t offset, int64_t *count); +int64_t coroutine_fn block_copy_reset_unallocated(BlockCopyState *s, + int64_t offset, + int64_t *count); int coroutine_fn block_copy(BlockCopyState *s, int64_t offset, int64_t bytes, bool ignore_ratelimit, uint64_t timeout_ns, diff --git a/include/block/block-global-state.h b/include/block/block-global-state.h index c7bd4a2088..b0a3cfe6b8 100644 --- a/include/block/block-global-state.h +++ b/include/block/block-global-state.h @@ -55,9 +55,14 @@ BlockDriver *bdrv_find_protocol(const char *filename, bool allow_protocol_prefix, Error **errp); BlockDriver *bdrv_find_format(const char *format_name); -int bdrv_create(BlockDriver *drv, const char* filename, - QemuOpts *opts, Error **errp); -int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp); + +int coroutine_fn bdrv_co_create(BlockDriver *drv, const char *filename, + QemuOpts *opts, Error **errp); +int co_wrapper bdrv_create(BlockDriver *drv, const char *filename, + QemuOpts *opts, Error **errp); + +int coroutine_fn bdrv_co_create_file(const char *filename, QemuOpts *opts, + Error **errp); BlockDriverState *bdrv_new(void); int bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top, @@ -82,6 +87,9 @@ int bdrv_open_file_child(const char *filename, BlockDriverState *bdrv_open_blockdev_ref(BlockdevRef *ref, Error **errp); int bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd, Error **errp); +int bdrv_set_backing_hd_drained(BlockDriverState *bs, + BlockDriverState *backing_hd, + Error **errp); int bdrv_open_backing_file(BlockDriverState *bs, QDict *parent_options, const char *bdref_key, Error **errp); BlockDriverState *bdrv_open(const char *filename, const char *reference, @@ -144,6 +152,7 @@ int bdrv_inactivate_all(void); int bdrv_flush_all(void); void bdrv_close_all(void); void bdrv_drain_all_begin(void); +void bdrv_drain_all_begin_nopoll(void); void bdrv_drain_all_end(void); void bdrv_drain_all(void); diff --git a/include/block/block-io.h b/include/block/block-io.h index b099d7db45..2ed6214909 100644 --- a/include/block/block-io.h +++ b/include/block/block-io.h @@ -39,19 +39,24 @@ * to catch when they are accidentally called by the wrong API. */ -int generated_co_wrapper bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset, - int64_t bytes, - BdrvRequestFlags flags); +int co_wrapper_mixed_bdrv_rdlock +bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset, int64_t bytes, + BdrvRequestFlags flags); + int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags); -int generated_co_wrapper bdrv_pread(BdrvChild *child, int64_t offset, - int64_t bytes, void *buf, - BdrvRequestFlags flags); -int generated_co_wrapper bdrv_pwrite(BdrvChild *child, int64_t offset, - int64_t bytes, const void *buf, - BdrvRequestFlags flags); -int generated_co_wrapper bdrv_pwrite_sync(BdrvChild *child, int64_t offset, - int64_t bytes, const void *buf, - BdrvRequestFlags flags); + +int co_wrapper_mixed_bdrv_rdlock +bdrv_pread(BdrvChild *child, int64_t offset, int64_t bytes, void *buf, + BdrvRequestFlags flags); + +int co_wrapper_mixed_bdrv_rdlock +bdrv_pwrite(BdrvChild *child, int64_t offset,int64_t bytes, + const void *buf, BdrvRequestFlags flags); + +int co_wrapper_mixed_bdrv_rdlock +bdrv_pwrite_sync(BdrvChild *child, int64_t offset, int64_t bytes, + const void *buf, BdrvRequestFlags flags); + int coroutine_fn bdrv_co_pwrite_sync(BdrvChild *child, int64_t offset, int64_t bytes, const void *buf, BdrvRequestFlags flags); @@ -94,14 +99,29 @@ bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs); int bdrv_block_status(BlockDriverState *bs, int64_t offset, int64_t bytes, int64_t *pnum, int64_t *map, BlockDriverState **file); + +int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs, + BlockDriverState *base, + int64_t offset, int64_t bytes, + int64_t *pnum, int64_t *map, + BlockDriverState **file); int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base, int64_t offset, int64_t bytes, int64_t *pnum, int64_t *map, BlockDriverState **file); + +int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t offset, + int64_t bytes, int64_t *pnum); int bdrv_is_allocated(BlockDriverState *bs, int64_t offset, int64_t bytes, int64_t *pnum); + +int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *top, + BlockDriverState *base, + bool include_base, int64_t offset, + int64_t bytes, int64_t *pnum); int bdrv_is_allocated_above(BlockDriverState *top, BlockDriverState *base, bool include_base, int64_t offset, int64_t bytes, int64_t *pnum); + int coroutine_fn bdrv_co_is_zero_fast(BlockDriverState *bs, int64_t offset, int64_t bytes); @@ -200,8 +220,14 @@ AioContext *child_of_bds_get_parent_aio_context(BdrvChild *c); void bdrv_io_plug(BlockDriverState *bs); void bdrv_io_unplug(BlockDriverState *bs); -bool bdrv_can_store_new_dirty_bitmap(BlockDriverState *bs, const char *name, - uint32_t granularity, Error **errp); +bool coroutine_fn bdrv_co_can_store_new_dirty_bitmap(BlockDriverState *bs, + const char *name, + uint32_t granularity, + Error **errp); +bool co_wrapper bdrv_can_store_new_dirty_bitmap(BlockDriverState *bs, + const char *name, + uint32_t granularity, + Error **errp); /** * @@ -237,21 +263,6 @@ int coroutine_fn bdrv_co_copy_range(BdrvChild *src, int64_t src_offset, int64_t bytes, BdrvRequestFlags read_flags, BdrvRequestFlags write_flags); -/** - * bdrv_drained_end_no_poll: - * - * Same as bdrv_drained_end(), but do not poll for the subgraph to - * actually become unquiesced. Therefore, no graph changes will occur - * with this function. - * - * *drained_end_counter is incremented for every background operation - * that is scheduled, and will be decremented for every operation once - * it settles. The caller must poll until it reaches 0. The counter - * should be accessed using atomic operations only. - */ -void bdrv_drained_end_no_poll(BlockDriverState *bs, int *drained_end_counter); - - /* * "I/O or GS" API functions. These functions can run without * the BQL, but only in one specific iothread/main loop. @@ -281,47 +292,54 @@ void bdrv_drained_end_no_poll(BlockDriverState *bs, int *drained_end_counter); void bdrv_drain(BlockDriverState *bs); -int generated_co_wrapper +int co_wrapper_mixed_bdrv_rdlock bdrv_truncate(BdrvChild *child, int64_t offset, bool exact, PreallocMode prealloc, BdrvRequestFlags flags, Error **errp); -int generated_co_wrapper bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, - BdrvCheckMode fix); +int co_wrapper_mixed_bdrv_rdlock +bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix); /* Invalidate any cached metadata used by image formats */ -int generated_co_wrapper bdrv_invalidate_cache(BlockDriverState *bs, - Error **errp); -int generated_co_wrapper bdrv_flush(BlockDriverState *bs); -int generated_co_wrapper bdrv_pdiscard(BdrvChild *child, int64_t offset, - int64_t bytes); -int generated_co_wrapper +int co_wrapper_mixed_bdrv_rdlock +bdrv_invalidate_cache(BlockDriverState *bs, Error **errp); + +int co_wrapper_mixed_bdrv_rdlock bdrv_flush(BlockDriverState *bs); + +int co_wrapper_mixed_bdrv_rdlock +bdrv_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes); + +int co_wrapper_mixed_bdrv_rdlock bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos); -int generated_co_wrapper + +int co_wrapper_mixed_bdrv_rdlock bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos); /** * bdrv_parent_drained_begin_single: * - * Begin a quiesced section for the parent of @c. If @poll is true, wait for - * any pending activity to cease. + * Begin a quiesced section for the parent of @c. + */ +void bdrv_parent_drained_begin_single(BdrvChild *c); + +/** + * bdrv_parent_drained_poll_single: + * + * Returns true if there is any pending activity to cease before @c can be + * called quiesced, false otherwise. */ -void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll); +bool bdrv_parent_drained_poll_single(BdrvChild *c); /** * bdrv_parent_drained_end_single: * * End a quiesced section for the parent of @c. - * - * This polls @bs's AioContext until all scheduled sub-drained_ends - * have settled, which may result in graph changes. */ void bdrv_parent_drained_end_single(BdrvChild *c); /** * bdrv_drain_poll: * - * Poll for pending requests in @bs, its parents (except for @ignore_parent), - * and if @recursive is true its children as well (used for subtree drain). + * Poll for pending requests in @bs and its parents (except for @ignore_parent). * * If @ignore_bds_parents is true, parents that are BlockDriverStates must * ignore the drain request because they will be drained separately (used for @@ -329,8 +347,8 @@ void bdrv_parent_drained_end_single(BdrvChild *c); * * This is part of bdrv_drained_begin. */ -bool bdrv_drain_poll(BlockDriverState *bs, bool recursive, - BdrvChild *ignore_parent, bool ignore_bds_parents); +bool bdrv_drain_poll(BlockDriverState *bs, BdrvChild *ignore_parent, + bool ignore_bds_parents); /** * bdrv_drained_begin: @@ -348,31 +366,13 @@ void bdrv_drained_begin(BlockDriverState *bs); * Quiesces a BDS like bdrv_drained_begin(), but does not wait for already * running requests to complete. */ -void bdrv_do_drained_begin_quiesce(BlockDriverState *bs, - BdrvChild *parent, bool ignore_bds_parents); - -/** - * Like bdrv_drained_begin, but recursively begins a quiesced section for - * exclusive access to all child nodes as well. - */ -void bdrv_subtree_drained_begin(BlockDriverState *bs); +void bdrv_do_drained_begin_quiesce(BlockDriverState *bs, BdrvChild *parent); /** * bdrv_drained_end: * * End a quiescent section started by bdrv_drained_begin(). - * - * This polls @bs's AioContext until all scheduled sub-drained_ends - * have settled. On one hand, that may result in graph changes. On - * the other, this requires that the caller either runs in the main - * loop; or that all involved nodes (@bs and all of its parents) are - * in the caller's AioContext. */ void bdrv_drained_end(BlockDriverState *bs); -/** - * End a quiescent section started by bdrv_subtree_drained_begin(). - */ -void bdrv_subtree_drained_end(BlockDriverState *bs); - #endif /* BLOCK_IO_H */ diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h index 31ae91e56e..c34c525fa6 100644 --- a/include/block/block_int-common.h +++ b/include/block/block_int-common.h @@ -641,8 +641,8 @@ struct BlockDriver { /* * Invalidate any cached meta-data. */ - void coroutine_fn (*bdrv_co_invalidate_cache)(BlockDriverState *bs, - Error **errp); + void coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_invalidate_cache)( + BlockDriverState *bs, Error **errp); /* * Flushes all data for all layers by calling bdrv_co_flush for underlying @@ -701,12 +701,11 @@ struct BlockDriver { Error **errp); BlockStatsSpecific *(*bdrv_get_specific_stats)(BlockDriverState *bs); - int coroutine_fn (*bdrv_save_vmstate)(BlockDriverState *bs, - QEMUIOVector *qiov, - int64_t pos); - int coroutine_fn (*bdrv_load_vmstate)(BlockDriverState *bs, - QEMUIOVector *qiov, - int64_t pos); + int coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_save_vmstate)( + BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos); + + int coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_load_vmstate)( + BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos); /* removable device specific */ bool (*bdrv_is_inserted)(BlockDriverState *bs); @@ -724,9 +723,8 @@ struct BlockDriver { * Returns 0 for completed check, -errno for internal errors. * The check results are stored in result. */ - int coroutine_fn (*bdrv_co_check)(BlockDriverState *bs, - BdrvCheckResult *result, - BdrvCheckMode fix); + int coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_check)( + BlockDriverState *bs, BdrvCheckResult *result, BdrvCheckMode fix); void (*bdrv_debug_event)(BlockDriverState *bs, BlkdebugEvent event); @@ -735,17 +733,19 @@ struct BlockDriver { void (*bdrv_io_unplug)(BlockDriverState *bs); /** - * bdrv_co_drain_begin is called if implemented in the beginning of a + * bdrv_drain_begin is called if implemented in the beginning of a * drain operation to drain and stop any internal sources of requests in * the driver. - * bdrv_co_drain_end is called if implemented at the end of the drain. + * bdrv_drain_end is called if implemented at the end of the drain. * * They should be used by the driver to e.g. manage scheduled I/O * requests, or toggle an internal state. After the end of the drain new * requests will continue normally. + * + * Implementations of both functions must not call aio_poll(). */ - void coroutine_fn (*bdrv_co_drain_begin)(BlockDriverState *bs); - void coroutine_fn (*bdrv_co_drain_end)(BlockDriverState *bs); + void (*bdrv_drain_begin)(BlockDriverState *bs); + void (*bdrv_drain_end)(BlockDriverState *bs); bool (*bdrv_supports_persistent_dirty_bitmap)(BlockDriverState *bs); bool coroutine_fn (*bdrv_co_can_store_new_dirty_bitmap)( @@ -896,8 +896,8 @@ struct BdrvChildClass { void (*activate)(BdrvChild *child, Error **errp); int (*inactivate)(BdrvChild *child); - void (*attach)(BdrvChild *child); - void (*detach)(BdrvChild *child); + void GRAPH_WRLOCK_PTR (*attach)(BdrvChild *child); + void GRAPH_WRLOCK_PTR (*detach)(BdrvChild *child); /* * Notifies the parent that the filename of its child has changed (e.g. @@ -937,15 +937,11 @@ struct BdrvChildClass { * These functions must not change the graph (and therefore also must not * call aio_poll(), which could change the graph indirectly). * - * If drained_end() schedules background operations, it must atomically - * increment *drained_end_counter for each such operation and atomically - * decrement it once the operation has settled. - * * Note that this can be nested. If drained_begin() was called twice, new * I/O is allowed only after drained_end() was called twice, too. */ void (*drained_begin)(BdrvChild *child); - void (*drained_end)(BdrvChild *child, int *drained_end_counter); + void (*drained_end)(BdrvChild *child); /* * Returns whether the parent has pending requests for the child. This @@ -982,13 +978,13 @@ struct BdrvChild { bool frozen; /* - * How many times the parent of this child has been drained + * True if the parent of this child has been drained by this BdrvChild * (through klass->drained_*). - * Usually, this is equal to bs->quiesce_counter (potentially - * reduced by bdrv_drain_all_count). It may differ while the + * + * It is generally true if bs->quiesce_counter > 0. It may differ while the * child is entering or leaving a drained section. */ - int parent_quiesce_counter; + bool quiesced_parent; QLIST_ENTRY(BdrvChild) next; QLIST_ENTRY(BdrvChild) next_parent; @@ -1186,7 +1182,6 @@ struct BlockDriverState { /* Accessed with atomic ops. */ int quiesce_counter; - int recursive_quiesce_counter; unsigned int write_gen; /* Current data generation */ diff --git a/include/block/block_int-global-state.h b/include/block/block_int-global-state.h index b49f4eb35b..2f0993f6e9 100644 --- a/include/block/block_int-global-state.h +++ b/include/block/block_int-global-state.h @@ -310,21 +310,4 @@ void bdrv_remove_aio_context_notifier(BlockDriverState *bs, */ void bdrv_drain_all_end_quiesce(BlockDriverState *bs); -/** - * Make sure that the function is running under both drain and BQL. - * The latter protects from concurrent writings - * from the GS API, while the former prevents concurrent reads - * from I/O. - */ -static inline void assert_bdrv_graph_writable(BlockDriverState *bs) -{ - /* - * TODO: this function is incomplete. Because the users of this - * assert lack the necessary drains, check only for BQL. - * Once the necessary drains are added, - * assert also for qatomic_read(&bs->quiesce_counter) > 0 - */ - assert(qemu_in_main_thread()); -} - #endif /* BLOCK_INT_GLOBAL_STATE_H */ diff --git a/include/block/block_int-io.h b/include/block/block_int-io.h index 4b0b3e17ef..8bc061ebb8 100644 --- a/include/block/block_int-io.h +++ b/include/block/block_int-io.h @@ -179,16 +179,4 @@ void bdrv_bsc_invalidate_range(BlockDriverState *bs, */ void bdrv_bsc_fill(BlockDriverState *bs, int64_t offset, int64_t bytes); - -/* - * "I/O or GS" API functions. These functions can run without - * the BQL, but only in one specific iothread/main loop. - * - * See include/block/block-io.h for more information about - * the "I/O or GS" API. - */ - -void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent); -void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent); - #endif /* BLOCK_INT_IO_H */ diff --git a/include/block/block_int.h b/include/block/block_int.h index 7d50b6bbd1..b35b0138ed 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -26,6 +26,7 @@ #include "block_int-global-state.h" #include "block_int-io.h" +#include "block/graph-lock.h" /* DO NOT ADD ANYTHING IN HERE. USE ONE OF THE HEADERS INCLUDED ABOVE */ diff --git a/include/block/dirty-bitmap.h b/include/block/dirty-bitmap.h index 6528336c4c..c3700cec76 100644 --- a/include/block/dirty-bitmap.h +++ b/include/block/dirty-bitmap.h @@ -34,8 +34,14 @@ int bdrv_dirty_bitmap_check(const BdrvDirtyBitmap *bitmap, uint32_t flags, Error **errp); void bdrv_release_dirty_bitmap(BdrvDirtyBitmap *bitmap); void bdrv_release_named_dirty_bitmaps(BlockDriverState *bs); -int bdrv_remove_persistent_dirty_bitmap(BlockDriverState *bs, const char *name, - Error **errp); + +int coroutine_fn bdrv_co_remove_persistent_dirty_bitmap(BlockDriverState *bs, + const char *name, + Error **errp); +int co_wrapper bdrv_remove_persistent_dirty_bitmap(BlockDriverState *bs, + const char *name, + Error **errp); + void bdrv_disable_dirty_bitmap(BdrvDirtyBitmap *bitmap); void bdrv_enable_dirty_bitmap(BdrvDirtyBitmap *bitmap); void bdrv_enable_dirty_bitmap_locked(BdrvDirtyBitmap *bitmap); diff --git a/include/block/graph-lock.h b/include/block/graph-lock.h new file mode 100644 index 0000000000..4c92cd8edf --- /dev/null +++ b/include/block/graph-lock.h @@ -0,0 +1,280 @@ +/* + * Graph lock: rwlock to protect block layer graph manipulations (add/remove + * edges and nodes) + * + * Copyright (c) 2022 Red Hat + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ +#ifndef GRAPH_LOCK_H +#define GRAPH_LOCK_H + +#include "qemu/osdep.h" +#include "qemu/clang-tsa.h" + +#include "qemu/coroutine.h" + +/** + * Graph Lock API + * This API provides a rwlock used to protect block layer + * graph modifications like edge (BdrvChild) and node (BlockDriverState) + * addition and removal. + * Currently we have 1 writer only, the Main loop, and many + * readers, mostly coroutines running in other AioContext thus other threads. + * + * We distinguish between writer (main loop, under BQL) that modifies the + * graph, and readers (all other coroutines running in various AioContext), + * that go through the graph edges, reading + * BlockDriverState ->parents and->children. + * + * The writer (main loop) has an "exclusive" access, so it first waits for + * current read to finish, and then prevents incoming ones from + * entering while it has the exclusive access. + * + * The readers (coroutines in multiple AioContext) are free to + * access the graph as long the writer is not modifying the graph. + * In case it is, they go in a CoQueue and sleep until the writer + * is done. + * + * If a coroutine changes AioContext, the counter in the original and new + * AioContext are left intact, since the writer does not care where is the + * reader, but only if there is one. + * As a result, some AioContexts might have a negative reader count, to + * balance the positive count of the AioContext that took the lock. + * This also means that when an AioContext is deleted it may have a nonzero + * reader count. In that case we transfer the count to a global shared counter + * so that the writer is always aware of all readers. + */ +typedef struct BdrvGraphRWlock BdrvGraphRWlock; + +/* Dummy lock object to use for Thread Safety Analysis (TSA) */ +typedef struct TSA_CAPABILITY("mutex") BdrvGraphLock { +} BdrvGraphLock; + +extern BdrvGraphLock graph_lock; + +/* + * clang doesn't check consistency in locking annotations between forward + * declarations and the function definition. Having the annotation on the + * definition, but not the declaration in a header file, may give the reader + * a false sense of security because the condition actually remains unchecked + * for callers in other source files. + * + * Therefore, as a convention, for public functions, GRAPH_RDLOCK and + * GRAPH_WRLOCK annotations should be present only in the header file. + */ +#define GRAPH_WRLOCK TSA_REQUIRES(graph_lock) +#define GRAPH_RDLOCK TSA_REQUIRES_SHARED(graph_lock) + +/* + * TSA annotations are not part of function types, so checks are defeated when + * using a function pointer. As a workaround, annotate function pointers with + * this macro that will require that the lock is at least taken while reading + * the pointer. In most cases this is equivalent to actually protecting the + * function call. + */ +#define GRAPH_RDLOCK_PTR TSA_GUARDED_BY(graph_lock) +#define GRAPH_WRLOCK_PTR TSA_GUARDED_BY(graph_lock) + +/* + * register_aiocontext: + * Add AioContext @ctx to the list of AioContext. + * This list is used to obtain the total number of readers + * currently running the graph. + */ +void register_aiocontext(AioContext *ctx); + +/* + * unregister_aiocontext: + * Removes AioContext @ctx to the list of AioContext. + */ +void unregister_aiocontext(AioContext *ctx); + +/* + * bdrv_graph_wrlock: + * Start an exclusive write operation to modify the graph. This means we are + * adding or removing an edge or a node in the block layer graph. Nobody else + * is allowed to access the graph. + * + * Must only be called from outside bdrv_graph_co_rdlock. + * + * The wrlock can only be taken from the main loop, with BQL held, as only the + * main loop is allowed to modify the graph. + * + * This function polls. Callers must not hold the lock of any AioContext other + * than the current one. + */ +void bdrv_graph_wrlock(void) TSA_ACQUIRE(graph_lock) TSA_NO_TSA; + +/* + * bdrv_graph_wrunlock: + * Write finished, reset global has_writer to 0 and restart + * all readers that are waiting. + */ +void bdrv_graph_wrunlock(void) TSA_RELEASE(graph_lock) TSA_NO_TSA; + +/* + * bdrv_graph_co_rdlock: + * Read the bs graph. This usually means traversing all nodes in + * the graph, therefore it can't happen while another thread is + * modifying it. + * Increases the reader counter of the current aiocontext, + * and if has_writer is set, it means that the writer is modifying + * the graph, therefore wait in a coroutine queue. + * The writer will then wake this coroutine once it is done. + * + * This lock should be taken from Iothreads (IO_CODE() class of functions) + * because it signals the writer that there are some + * readers currently running, or waits until the current + * write is finished before continuing. + * Calling this function from the Main Loop with BQL held + * is not necessary, since the Main Loop itself is the only + * writer, thus won't be able to read and write at the same time. + * The only exception to that is when we can't take the lock in the + * function/coroutine itself, and need to delegate the caller (usually main + * loop) to take it and wait that the coroutine ends, so that + * we always signal that a reader is running. + */ +void coroutine_fn TSA_ACQUIRE_SHARED(graph_lock) TSA_NO_TSA +bdrv_graph_co_rdlock(void); + +/* + * bdrv_graph_rdunlock: + * Read terminated, decrease the count of readers in the current aiocontext. + * If the writer is waiting for reads to finish (has_writer == 1), signal + * the writer that we are done via aio_wait_kick() to let it continue. + */ +void coroutine_fn TSA_RELEASE_SHARED(graph_lock) TSA_NO_TSA +bdrv_graph_co_rdunlock(void); + +/* + * bdrv_graph_rd{un}lock_main_loop: + * Just a placeholder to mark where the graph rdlock should be taken + * in the main loop. It is just asserting that we are not + * in a coroutine and in GLOBAL_STATE_CODE. + */ +void TSA_ACQUIRE_SHARED(graph_lock) TSA_NO_TSA +bdrv_graph_rdlock_main_loop(void); + +void TSA_RELEASE_SHARED(graph_lock) TSA_NO_TSA +bdrv_graph_rdunlock_main_loop(void); + +/* + * assert_bdrv_graph_readable: + * Make sure that the reader is either the main loop, + * or there is at least a reader helding the rdlock. + * In this way an incoming writer is aware of the read and waits. + */ +void GRAPH_RDLOCK assert_bdrv_graph_readable(void); + +/* + * assert_bdrv_graph_writable: + * Make sure that the writer is the main loop and has set @has_writer, + * so that incoming readers will pause. + */ +void GRAPH_WRLOCK assert_bdrv_graph_writable(void); + +/* + * Calling this function tells TSA that we know that the lock is effectively + * taken even though we cannot prove it (yet) with GRAPH_RDLOCK. This can be + * useful in intermediate stages of a conversion to using the GRAPH_RDLOCK + * macro. + */ +static inline void TSA_ASSERT_SHARED(graph_lock) TSA_NO_TSA +assume_graph_lock(void) +{ +} + +typedef struct GraphLockable { } GraphLockable; + +/* + * In C, compound literals have the lifetime of an automatic variable. + * In C++ it would be different, but then C++ wouldn't need QemuLockable + * either... + */ +#define GML_OBJ_() (&(GraphLockable) { }) + +/* + * This is not marked as TSA_ACQUIRE() because TSA doesn't understand the + * cleanup attribute and would therefore complain that the graph is never + * unlocked. TSA_ASSERT() makes sure that the following calls know that we + * hold the lock while unlocking is left unchecked. + */ +static inline GraphLockable * TSA_ASSERT(graph_lock) TSA_NO_TSA +graph_lockable_auto_lock(GraphLockable *x) +{ + bdrv_graph_co_rdlock(); + return x; +} + +static inline void TSA_NO_TSA +graph_lockable_auto_unlock(GraphLockable *x) +{ + bdrv_graph_co_rdunlock(); +} + +G_DEFINE_AUTOPTR_CLEANUP_FUNC(GraphLockable, graph_lockable_auto_unlock) + +#define WITH_GRAPH_RDLOCK_GUARD_(var) \ + for (g_autoptr(GraphLockable) var = graph_lockable_auto_lock(GML_OBJ_()); \ + var; \ + graph_lockable_auto_unlock(var), var = NULL) + +#define WITH_GRAPH_RDLOCK_GUARD() \ + WITH_GRAPH_RDLOCK_GUARD_(glue(graph_lockable_auto, __COUNTER__)) + +#define GRAPH_RDLOCK_GUARD(x) \ + g_autoptr(GraphLockable) \ + glue(graph_lockable_auto, __COUNTER__) G_GNUC_UNUSED = \ + graph_lockable_auto_lock(GML_OBJ_()) + + +typedef struct GraphLockableMainloop { } GraphLockableMainloop; + +/* + * In C, compound literals have the lifetime of an automatic variable. + * In C++ it would be different, but then C++ wouldn't need QemuLockable + * either... + */ +#define GMLML_OBJ_() (&(GraphLockableMainloop) { }) + +/* + * This is not marked as TSA_ACQUIRE() because TSA doesn't understand the + * cleanup attribute and would therefore complain that the graph is never + * unlocked. TSA_ASSERT() makes sure that the following calls know that we + * hold the lock while unlocking is left unchecked. + */ +static inline GraphLockableMainloop * TSA_ASSERT(graph_lock) TSA_NO_TSA +graph_lockable_auto_lock_mainloop(GraphLockableMainloop *x) +{ + bdrv_graph_rdlock_main_loop(); + return x; +} + +static inline void TSA_NO_TSA +graph_lockable_auto_unlock_mainloop(GraphLockableMainloop *x) +{ + bdrv_graph_rdunlock_main_loop(); +} + +G_DEFINE_AUTOPTR_CLEANUP_FUNC(GraphLockableMainloop, + graph_lockable_auto_unlock_mainloop) + +#define GRAPH_RDLOCK_GUARD_MAINLOOP(x) \ + g_autoptr(GraphLockableMainloop) \ + glue(graph_lockable_auto, __COUNTER__) G_GNUC_UNUSED = \ + graph_lockable_auto_lock_mainloop(GMLML_OBJ_()) + +#endif /* GRAPH_LOCK_H */ + diff --git a/include/qemu/clang-tsa.h b/include/qemu/clang-tsa.h new file mode 100644 index 0000000000..ba06fb8c92 --- /dev/null +++ b/include/qemu/clang-tsa.h @@ -0,0 +1,114 @@ +#ifndef CLANG_TSA_H +#define CLANG_TSA_H + +/* + * Copyright 2018 Jarkko Hietaniemi <jhi@iki.fi> + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without + * limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/* http://clang.llvm.org/docs/ThreadSafetyAnalysis.html + * + * TSA is available since clang 3.6-ish. + */ +#ifdef __clang__ +# define TSA(x) __attribute__((x)) +#else +# define TSA(x) /* No TSA, make TSA attributes no-ops. */ +#endif + +/* TSA_CAPABILITY() is used to annotate typedefs: + * + * typedef pthread_mutex_t TSA_CAPABILITY("mutex") tsa_mutex; + */ +#define TSA_CAPABILITY(x) TSA(capability(x)) + +/* TSA_GUARDED_BY() is used to annotate global variables, + * the data is guarded: + * + * Foo foo TSA_GUARDED_BY(mutex); + */ +#define TSA_GUARDED_BY(x) TSA(guarded_by(x)) + +/* TSA_PT_GUARDED_BY() is used to annotate global pointers, the data + * behind the pointer is guarded. + * + * Foo* ptr TSA_PT_GUARDED_BY(mutex); + */ +#define TSA_PT_GUARDED_BY(x) TSA(pt_guarded_by(x)) + +/* The TSA_REQUIRES() is used to annotate functions: the caller of the + * function MUST hold the resource, the function will NOT release it. + * + * More than one mutex may be specified, comma-separated. + * + * void Foo(void) TSA_REQUIRES(mutex); + */ +#define TSA_REQUIRES(...) TSA(requires_capability(__VA_ARGS__)) +#define TSA_REQUIRES_SHARED(...) TSA(requires_shared_capability(__VA_ARGS__)) + +/* TSA_EXCLUDES() is used to annotate functions: the caller of the + * function MUST NOT hold resource, the function first acquires the + * resource, and then releases it. + * + * More than one mutex may be specified, comma-separated. + * + * void Foo(void) TSA_EXCLUDES(mutex); + */ +#define TSA_EXCLUDES(...) TSA(locks_excluded(__VA_ARGS__)) + +/* TSA_ACQUIRE() is used to annotate functions: the caller of the + * function MUST NOT hold the resource, the function will acquire the + * resource, but NOT release it. + * + * More than one mutex may be specified, comma-separated. + * + * void Foo(void) TSA_ACQUIRE(mutex); + */ +#define TSA_ACQUIRE(...) TSA(acquire_capability(__VA_ARGS__)) +#define TSA_ACQUIRE_SHARED(...) TSA(acquire_shared_capability(__VA_ARGS__)) + +/* TSA_RELEASE() is used to annotate functions: the caller of the + * function MUST hold the resource, but the function will then release it. + * + * More than one mutex may be specified, comma-separated. + * + * void Foo(void) TSA_RELEASE(mutex); + */ +#define TSA_RELEASE(...) TSA(release_capability(__VA_ARGS__)) +#define TSA_RELEASE_SHARED(...) TSA(release_shared_capability(__VA_ARGS__)) + +/* TSA_NO_TSA is used to annotate functions. Use only when you need to. + * + * void Foo(void) TSA_NO_TSA; + */ +#define TSA_NO_TSA TSA(no_thread_safety_analysis) + +/* + * TSA_ASSERT() is used to annotate functions: This function will assert that + * the lock is held. When it returns, the caller of the function is assumed to + * already hold the resource. + * + * More than one mutex may be specified, comma-separated. + */ +#define TSA_ASSERT(...) TSA(assert_capability(__VA_ARGS__)) +#define TSA_ASSERT_SHARED(...) TSA(assert_shared_capability(__VA_ARGS__)) + +#endif /* #ifndef CLANG_TSA_H */ diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h index 50f5aa2e07..7ec6d978d4 100644 --- a/include/sysemu/block-backend-io.h +++ b/include/sysemu/block-backend-io.h @@ -92,6 +92,15 @@ int coroutine_fn blk_co_copy_range(BlockBackend *blk_in, int64_t off_in, int64_t bytes, BdrvRequestFlags read_flags, BdrvRequestFlags write_flags); +int coroutine_fn blk_co_block_status_above(BlockBackend *blk, + BlockDriverState *base, + int64_t offset, int64_t bytes, + int64_t *pnum, int64_t *map, + BlockDriverState **file); +int coroutine_fn blk_co_is_allocated_above(BlockBackend *blk, + BlockDriverState *base, + bool include_base, int64_t offset, + int64_t bytes, int64_t *pnum); /* * "I/O or GS" API functions. These functions can run without @@ -101,77 +110,77 @@ int coroutine_fn blk_co_copy_range(BlockBackend *blk_in, int64_t off_in, * the "I/O or GS" API. */ -int generated_co_wrapper blk_pread(BlockBackend *blk, int64_t offset, - int64_t bytes, void *buf, - BdrvRequestFlags flags); +int co_wrapper_mixed blk_pread(BlockBackend *blk, int64_t offset, + int64_t bytes, void *buf, + BdrvRequestFlags flags); int coroutine_fn blk_co_pread(BlockBackend *blk, int64_t offset, int64_t bytes, void *buf, BdrvRequestFlags flags); -int generated_co_wrapper blk_preadv(BlockBackend *blk, int64_t offset, - int64_t bytes, QEMUIOVector *qiov, - BdrvRequestFlags flags); +int co_wrapper_mixed blk_preadv(BlockBackend *blk, int64_t offset, + int64_t bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags); int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset, int64_t bytes, QEMUIOVector *qiov, BdrvRequestFlags flags); -int generated_co_wrapper blk_preadv_part(BlockBackend *blk, int64_t offset, - int64_t bytes, QEMUIOVector *qiov, - size_t qiov_offset, - BdrvRequestFlags flags); +int co_wrapper_mixed blk_preadv_part(BlockBackend *blk, int64_t offset, + int64_t bytes, QEMUIOVector *qiov, + size_t qiov_offset, + BdrvRequestFlags flags); int coroutine_fn blk_co_preadv_part(BlockBackend *blk, int64_t offset, int64_t bytes, QEMUIOVector *qiov, size_t qiov_offset, BdrvRequestFlags flags); -int generated_co_wrapper blk_pwrite(BlockBackend *blk, int64_t offset, - int64_t bytes, const void *buf, - BdrvRequestFlags flags); +int co_wrapper_mixed blk_pwrite(BlockBackend *blk, int64_t offset, + int64_t bytes, const void *buf, + BdrvRequestFlags flags); int coroutine_fn blk_co_pwrite(BlockBackend *blk, int64_t offset, int64_t bytes, const void *buf, BdrvRequestFlags flags); -int generated_co_wrapper blk_pwritev(BlockBackend *blk, int64_t offset, - int64_t bytes, QEMUIOVector *qiov, - BdrvRequestFlags flags); +int co_wrapper_mixed blk_pwritev(BlockBackend *blk, int64_t offset, + int64_t bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags); int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset, int64_t bytes, QEMUIOVector *qiov, BdrvRequestFlags flags); -int generated_co_wrapper blk_pwritev_part(BlockBackend *blk, int64_t offset, - int64_t bytes, QEMUIOVector *qiov, - size_t qiov_offset, - BdrvRequestFlags flags); +int co_wrapper_mixed blk_pwritev_part(BlockBackend *blk, int64_t offset, + int64_t bytes, QEMUIOVector *qiov, + size_t qiov_offset, + BdrvRequestFlags flags); int coroutine_fn blk_co_pwritev_part(BlockBackend *blk, int64_t offset, int64_t bytes, QEMUIOVector *qiov, size_t qiov_offset, BdrvRequestFlags flags); -int generated_co_wrapper blk_pwrite_compressed(BlockBackend *blk, - int64_t offset, int64_t bytes, - const void *buf); +int co_wrapper_mixed blk_pwrite_compressed(BlockBackend *blk, + int64_t offset, int64_t bytes, + const void *buf); int coroutine_fn blk_co_pwrite_compressed(BlockBackend *blk, int64_t offset, int64_t bytes, const void *buf); -int generated_co_wrapper blk_pwrite_zeroes(BlockBackend *blk, int64_t offset, - int64_t bytes, - BdrvRequestFlags flags); +int co_wrapper_mixed blk_pwrite_zeroes(BlockBackend *blk, int64_t offset, + int64_t bytes, + BdrvRequestFlags flags); int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset, int64_t bytes, BdrvRequestFlags flags); -int generated_co_wrapper blk_pdiscard(BlockBackend *blk, int64_t offset, - int64_t bytes); +int co_wrapper_mixed blk_pdiscard(BlockBackend *blk, int64_t offset, + int64_t bytes); int coroutine_fn blk_co_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes); -int generated_co_wrapper blk_flush(BlockBackend *blk); +int co_wrapper_mixed blk_flush(BlockBackend *blk); int coroutine_fn blk_co_flush(BlockBackend *blk); -int generated_co_wrapper blk_ioctl(BlockBackend *blk, unsigned long int req, - void *buf); +int co_wrapper_mixed blk_ioctl(BlockBackend *blk, unsigned long int req, + void *buf); int coroutine_fn blk_co_ioctl(BlockBackend *blk, unsigned long int req, void *buf); -int generated_co_wrapper blk_truncate(BlockBackend *blk, int64_t offset, - bool exact, PreallocMode prealloc, - BdrvRequestFlags flags, Error **errp); +int co_wrapper_mixed blk_truncate(BlockBackend *blk, int64_t offset, + bool exact, PreallocMode prealloc, + BdrvRequestFlags flags, Error **errp); int coroutine_fn blk_co_truncate(BlockBackend *blk, int64_t offset, bool exact, PreallocMode prealloc, BdrvRequestFlags flags, Error **errp); diff --git a/nbd/server.c b/nbd/server.c index 0570596312..67ed333578 100644 --- a/nbd/server.c +++ b/nbd/server.c @@ -1988,7 +1988,7 @@ static int coroutine_fn nbd_co_send_structured_error(NBDClient *client, } /* Do a sparse read and send the structured reply to the client. - * Returns -errno if sending fails. bdrv_block_status_above() failure is + * Returns -errno if sending fails. blk_co_block_status_above() failure is * reported to the client, at which point this function succeeds. */ static int coroutine_fn nbd_co_send_sparse_read(NBDClient *client, @@ -2004,10 +2004,10 @@ static int coroutine_fn nbd_co_send_sparse_read(NBDClient *client, while (progress < size) { int64_t pnum; - int status = bdrv_block_status_above(blk_bs(exp->common.blk), NULL, - offset + progress, - size - progress, &pnum, NULL, - NULL); + int status = blk_co_block_status_above(exp->common.blk, NULL, + offset + progress, + size - progress, &pnum, NULL, + NULL); bool final; if (status < 0) { @@ -2138,14 +2138,15 @@ static int nbd_extent_array_add(NBDExtentArray *ea, return 0; } -static int blockstatus_to_extents(BlockDriverState *bs, uint64_t offset, - uint64_t bytes, NBDExtentArray *ea) +static int coroutine_fn blockstatus_to_extents(BlockBackend *blk, + uint64_t offset, uint64_t bytes, + NBDExtentArray *ea) { while (bytes) { uint32_t flags; int64_t num; - int ret = bdrv_block_status_above(bs, NULL, offset, bytes, &num, - NULL, NULL); + int ret = blk_co_block_status_above(blk, NULL, offset, bytes, &num, + NULL, NULL); if (ret < 0) { return ret; @@ -2165,13 +2166,14 @@ static int blockstatus_to_extents(BlockDriverState *bs, uint64_t offset, return 0; } -static int blockalloc_to_extents(BlockDriverState *bs, uint64_t offset, - uint64_t bytes, NBDExtentArray *ea) +static int coroutine_fn blockalloc_to_extents(BlockBackend *blk, + uint64_t offset, uint64_t bytes, + NBDExtentArray *ea) { while (bytes) { int64_t num; - int ret = bdrv_is_allocated_above(bs, NULL, false, offset, bytes, - &num); + int ret = blk_co_is_allocated_above(blk, NULL, false, offset, bytes, + &num); if (ret < 0) { return ret; @@ -2217,20 +2219,21 @@ static int nbd_co_send_extents(NBDClient *client, uint64_t handle, } /* Get block status from the exported device and send it to the client */ -static int nbd_co_send_block_status(NBDClient *client, uint64_t handle, - BlockDriverState *bs, uint64_t offset, - uint32_t length, bool dont_fragment, - bool last, uint32_t context_id, - Error **errp) +static int +coroutine_fn nbd_co_send_block_status(NBDClient *client, uint64_t handle, + BlockBackend *blk, uint64_t offset, + uint32_t length, bool dont_fragment, + bool last, uint32_t context_id, + Error **errp) { int ret; unsigned int nb_extents = dont_fragment ? 1 : NBD_MAX_BLOCK_STATUS_EXTENTS; g_autoptr(NBDExtentArray) ea = nbd_extent_array_new(nb_extents); if (context_id == NBD_META_ID_BASE_ALLOCATION) { - ret = blockstatus_to_extents(bs, offset, length, ea); + ret = blockstatus_to_extents(blk, offset, length, ea); } else { - ret = blockalloc_to_extents(bs, offset, length, ea); + ret = blockalloc_to_extents(blk, offset, length, ea); } if (ret < 0) { return nbd_co_send_structured_error( @@ -2557,7 +2560,7 @@ static coroutine_fn int nbd_handle_request(NBDClient *client, if (client->export_meta.base_allocation) { ret = nbd_co_send_block_status(client, request->handle, - blk_bs(exp->common.blk), + exp->common.blk, request->from, request->len, dont_fragment, !--contexts_remaining, @@ -2570,7 +2573,7 @@ static coroutine_fn int nbd_handle_request(NBDClient *client, if (client->export_meta.allocation_depth) { ret = nbd_co_send_block_status(client, request->handle, - blk_bs(exp->common.blk), + exp->common.blk, request->from, request->len, dont_fragment, !--contexts_remaining, diff --git a/scripts/block-coroutine-wrapper.py b/scripts/block-coroutine-wrapper.py index 08be813407..6e087fa0b7 100644 --- a/scripts/block-coroutine-wrapper.py +++ b/scripts/block-coroutine-wrapper.py @@ -2,7 +2,7 @@ """Generate coroutine wrappers for block subsystem. The program parses one or several concatenated c files from stdin, -searches for functions with the 'generated_co_wrapper' specifier +searches for functions with the 'co_wrapper' specifier and generates corresponding wrappers on stdout. Usage: block-coroutine-wrapper.py generated-file.c FILE.[ch]... @@ -62,10 +62,28 @@ class ParamDecl: class FuncDecl: - def __init__(self, return_type: str, name: str, args: str) -> None: + def __init__(self, return_type: str, name: str, args: str, + variant: str) -> None: self.return_type = return_type.strip() self.name = name.strip() + self.struct_name = snake_to_camel(self.name) self.args = [ParamDecl(arg.strip()) for arg in args.split(',')] + self.create_only_co = 'mixed' not in variant + self.graph_rdlock = 'bdrv_rdlock' in variant + + subsystem, subname = self.name.split('_', 1) + self.co_name = f'{subsystem}_co_{subname}' + + t = self.args[0].type + if t == 'BlockDriverState *': + ctx = 'bdrv_get_aio_context(bs)' + elif t == 'BdrvChild *': + ctx = 'bdrv_get_aio_context(child->bs)' + elif t == 'BlockBackend *': + ctx = 'blk_get_aio_context(blk)' + else: + ctx = 'qemu_get_aio_context()' + self.ctx = ctx def gen_list(self, format: str) -> str: return ', '.join(format.format_map(arg.__dict__) for arg in self.args) @@ -74,17 +92,20 @@ class FuncDecl: return '\n'.join(format.format_map(arg.__dict__) for arg in self.args) -# Match wrappers declared with a generated_co_wrapper mark -func_decl_re = re.compile(r'^int\s*generated_co_wrapper\s*' +# Match wrappers declared with a co_wrapper mark +func_decl_re = re.compile(r'^(?P<return_type>[a-zA-Z][a-zA-Z0-9_]* [\*]?)' + r'\s*co_wrapper' + r'(?P<variant>(_[a-z][a-z0-9_]*)?)\s*' r'(?P<wrapper_name>[a-z][a-z0-9_]*)' r'\((?P<args>[^)]*)\);$', re.MULTILINE) def func_decl_iter(text: str) -> Iterator: for m in func_decl_re.finditer(text): - yield FuncDecl(return_type='int', + yield FuncDecl(return_type=m.group('return_type'), name=m.group('wrapper_name'), - args=m.group('args')) + args=m.group('args'), + variant=m.group('variant')) def snake_to_camel(func_name: str) -> str: @@ -97,24 +118,75 @@ def snake_to_camel(func_name: str) -> str: return ''.join(words) +def create_mixed_wrapper(func: FuncDecl) -> str: + """ + Checks if we are already in coroutine + """ + name = func.co_name + struct_name = func.struct_name + graph_assume_lock = 'assume_graph_lock();' if func.graph_rdlock else '' + + return f"""\ +{func.return_type} {func.name}({ func.gen_list('{decl}') }) +{{ + if (qemu_in_coroutine()) {{ + {graph_assume_lock} + return {name}({ func.gen_list('{name}') }); + }} else {{ + {struct_name} s = {{ + .poll_state.ctx = {func.ctx}, + .poll_state.in_progress = true, + +{ func.gen_block(' .{name} = {name},') } + }}; + + s.poll_state.co = qemu_coroutine_create({name}_entry, &s); + + bdrv_poll_co(&s.poll_state); + return s.ret; + }} +}}""" + + +def create_co_wrapper(func: FuncDecl) -> str: + """ + Assumes we are not in coroutine, and creates one + """ + name = func.co_name + struct_name = func.struct_name + return f"""\ +{func.return_type} {func.name}({ func.gen_list('{decl}') }) +{{ + {struct_name} s = {{ + .poll_state.ctx = {func.ctx}, + .poll_state.in_progress = true, + +{ func.gen_block(' .{name} = {name},') } + }}; + assert(!qemu_in_coroutine()); + + s.poll_state.co = qemu_coroutine_create({name}_entry, &s); + + bdrv_poll_co(&s.poll_state); + return s.ret; +}}""" + + def gen_wrapper(func: FuncDecl) -> str: assert not '_co_' in func.name - assert func.return_type == 'int' - assert func.args[0].type in ['BlockDriverState *', 'BdrvChild *', - 'BlockBackend *'] - subsystem, subname = func.name.split('_', 1) + name = func.co_name + struct_name = func.struct_name - name = f'{subsystem}_co_{subname}' + graph_lock='' + graph_unlock='' + if func.graph_rdlock: + graph_lock=' bdrv_graph_co_rdlock();' + graph_unlock=' bdrv_graph_co_rdunlock();' - t = func.args[0].type - if t == 'BlockDriverState *': - bs = 'bs' - elif t == 'BdrvChild *': - bs = 'child->bs' - else: - bs = 'blk_bs(blk)' - struct_name = snake_to_camel(name) + creation_function = create_mixed_wrapper + if func.create_only_co: + creation_function = create_co_wrapper return f"""\ /* @@ -123,6 +195,7 @@ def gen_wrapper(func: FuncDecl) -> str: typedef struct {struct_name} {{ BdrvPollCo poll_state; + {func.return_type} ret; { func.gen_block(' {decl};') } }} {struct_name}; @@ -130,29 +203,15 @@ static void coroutine_fn {name}_entry(void *opaque) {{ {struct_name} *s = opaque; - s->poll_state.ret = {name}({ func.gen_list('s->{name}') }); +{graph_lock} + s->ret = {name}({ func.gen_list('s->{name}') }); +{graph_unlock} s->poll_state.in_progress = false; aio_wait_kick(); }} -int {func.name}({ func.gen_list('{decl}') }) -{{ - if (qemu_in_coroutine()) {{ - return {name}({ func.gen_list('{name}') }); - }} else {{ - {struct_name} s = {{ - .poll_state.bs = {bs}, - .poll_state.in_progress = true, - -{ func.gen_block(' .{name} = {name},') } - }}; - - s.poll_state.co = qemu_coroutine_create({name}_entry, &s); - - return bdrv_poll_co(&s.poll_state); - }} -}}""" +{creation_function(func)}""" def gen_wrappers(input_code: str) -> str: diff --git a/stubs/graph-lock.c b/stubs/graph-lock.c new file mode 100644 index 0000000000..177aa0a8ba --- /dev/null +++ b/stubs/graph-lock.c @@ -0,0 +1,10 @@ +#include "qemu/osdep.h" +#include "block/graph-lock.h" + +void register_aiocontext(AioContext *ctx) +{ +} + +void unregister_aiocontext(AioContext *ctx) +{ +} diff --git a/stubs/meson.build b/stubs/meson.build index c96a74f095..981585cbdf 100644 --- a/stubs/meson.build +++ b/stubs/meson.build @@ -13,6 +13,7 @@ stub_ss.add(files('error-printf.c')) stub_ss.add(files('fdset.c')) stub_ss.add(files('gdbstub.c')) stub_ss.add(files('get-vm-name.c')) +stub_ss.add(files('graph-lock.c')) if linux_io_uring.found() stub_ss.add(files('io_uring.c')) endif diff --git a/tests/unit/test-bdrv-drain.c b/tests/unit/test-bdrv-drain.c index 09dc4a4891..8cedea4959 100644 --- a/tests/unit/test-bdrv-drain.c +++ b/tests/unit/test-bdrv-drain.c @@ -38,16 +38,26 @@ typedef struct BDRVTestState { bool sleep_in_drain_begin; } BDRVTestState; -static void coroutine_fn bdrv_test_co_drain_begin(BlockDriverState *bs) +static void coroutine_fn sleep_in_drain_begin(void *opaque) +{ + BlockDriverState *bs = opaque; + + qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 100000); + bdrv_dec_in_flight(bs); +} + +static void bdrv_test_drain_begin(BlockDriverState *bs) { BDRVTestState *s = bs->opaque; s->drain_count++; if (s->sleep_in_drain_begin) { - qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 100000); + Coroutine *co = qemu_coroutine_create(sleep_in_drain_begin, bs); + bdrv_inc_in_flight(bs); + aio_co_enter(bdrv_get_aio_context(bs), co); } } -static void coroutine_fn bdrv_test_co_drain_end(BlockDriverState *bs) +static void bdrv_test_drain_end(BlockDriverState *bs) { BDRVTestState *s = bs->opaque; s->drain_count--; @@ -101,8 +111,8 @@ static BlockDriver bdrv_test = { .bdrv_close = bdrv_test_close, .bdrv_co_preadv = bdrv_test_co_preadv, - .bdrv_co_drain_begin = bdrv_test_co_drain_begin, - .bdrv_co_drain_end = bdrv_test_co_drain_end, + .bdrv_drain_begin = bdrv_test_drain_begin, + .bdrv_drain_end = bdrv_test_drain_end, .bdrv_child_perm = bdrv_default_perms, @@ -146,7 +156,6 @@ static void call_in_coroutine(void (*entry)(void)) enum drain_type { BDRV_DRAIN_ALL, BDRV_DRAIN, - BDRV_SUBTREE_DRAIN, DRAIN_TYPE_MAX, }; @@ -155,7 +164,6 @@ static void do_drain_begin(enum drain_type drain_type, BlockDriverState *bs) switch (drain_type) { case BDRV_DRAIN_ALL: bdrv_drain_all_begin(); break; case BDRV_DRAIN: bdrv_drained_begin(bs); break; - case BDRV_SUBTREE_DRAIN: bdrv_subtree_drained_begin(bs); break; default: g_assert_not_reached(); } } @@ -165,7 +173,6 @@ static void do_drain_end(enum drain_type drain_type, BlockDriverState *bs) switch (drain_type) { case BDRV_DRAIN_ALL: bdrv_drain_all_end(); break; case BDRV_DRAIN: bdrv_drained_end(bs); break; - case BDRV_SUBTREE_DRAIN: bdrv_subtree_drained_end(bs); break; default: g_assert_not_reached(); } } @@ -261,11 +268,6 @@ static void test_drv_cb_drain(void) test_drv_cb_common(BDRV_DRAIN, false); } -static void test_drv_cb_drain_subtree(void) -{ - test_drv_cb_common(BDRV_SUBTREE_DRAIN, true); -} - static void test_drv_cb_co_drain_all(void) { call_in_coroutine(test_drv_cb_drain_all); @@ -276,11 +278,6 @@ static void test_drv_cb_co_drain(void) call_in_coroutine(test_drv_cb_drain); } -static void test_drv_cb_co_drain_subtree(void) -{ - call_in_coroutine(test_drv_cb_drain_subtree); -} - static void test_quiesce_common(enum drain_type drain_type, bool recursive) { BlockBackend *blk; @@ -299,7 +296,11 @@ static void test_quiesce_common(enum drain_type drain_type, bool recursive) do_drain_begin(drain_type, bs); - g_assert_cmpint(bs->quiesce_counter, ==, 1); + if (drain_type == BDRV_DRAIN_ALL) { + g_assert_cmpint(bs->quiesce_counter, ==, 2); + } else { + g_assert_cmpint(bs->quiesce_counter, ==, 1); + } g_assert_cmpint(backing->quiesce_counter, ==, !!recursive); do_drain_end(drain_type, bs); @@ -322,11 +323,6 @@ static void test_quiesce_drain(void) test_quiesce_common(BDRV_DRAIN, false); } -static void test_quiesce_drain_subtree(void) -{ - test_quiesce_common(BDRV_SUBTREE_DRAIN, true); -} - static void test_quiesce_co_drain_all(void) { call_in_coroutine(test_quiesce_drain_all); @@ -337,11 +333,6 @@ static void test_quiesce_co_drain(void) call_in_coroutine(test_quiesce_drain); } -static void test_quiesce_co_drain_subtree(void) -{ - call_in_coroutine(test_quiesce_drain_subtree); -} - static void test_nested(void) { BlockBackend *blk; @@ -361,8 +352,8 @@ static void test_nested(void) for (outer = 0; outer < DRAIN_TYPE_MAX; outer++) { for (inner = 0; inner < DRAIN_TYPE_MAX; inner++) { - int backing_quiesce = (outer != BDRV_DRAIN) + - (inner != BDRV_DRAIN); + int backing_quiesce = (outer == BDRV_DRAIN_ALL) + + (inner == BDRV_DRAIN_ALL); g_assert_cmpint(bs->quiesce_counter, ==, 0); g_assert_cmpint(backing->quiesce_counter, ==, 0); @@ -372,10 +363,10 @@ static void test_nested(void) do_drain_begin(outer, bs); do_drain_begin(inner, bs); - g_assert_cmpint(bs->quiesce_counter, ==, 2); + g_assert_cmpint(bs->quiesce_counter, ==, 2 + !!backing_quiesce); g_assert_cmpint(backing->quiesce_counter, ==, backing_quiesce); - g_assert_cmpint(s->drain_count, ==, 2); - g_assert_cmpint(backing_s->drain_count, ==, backing_quiesce); + g_assert_cmpint(s->drain_count, ==, 1); + g_assert_cmpint(backing_s->drain_count, ==, !!backing_quiesce); do_drain_end(inner, bs); do_drain_end(outer, bs); @@ -392,158 +383,6 @@ static void test_nested(void) blk_unref(blk); } -static void test_multiparent(void) -{ - BlockBackend *blk_a, *blk_b; - BlockDriverState *bs_a, *bs_b, *backing; - BDRVTestState *a_s, *b_s, *backing_s; - - blk_a = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL); - bs_a = bdrv_new_open_driver(&bdrv_test, "test-node-a", BDRV_O_RDWR, - &error_abort); - a_s = bs_a->opaque; - blk_insert_bs(blk_a, bs_a, &error_abort); - - blk_b = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL); - bs_b = bdrv_new_open_driver(&bdrv_test, "test-node-b", BDRV_O_RDWR, - &error_abort); - b_s = bs_b->opaque; - blk_insert_bs(blk_b, bs_b, &error_abort); - - backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort); - backing_s = backing->opaque; - bdrv_set_backing_hd(bs_a, backing, &error_abort); - bdrv_set_backing_hd(bs_b, backing, &error_abort); - - g_assert_cmpint(bs_a->quiesce_counter, ==, 0); - g_assert_cmpint(bs_b->quiesce_counter, ==, 0); - g_assert_cmpint(backing->quiesce_counter, ==, 0); - g_assert_cmpint(a_s->drain_count, ==, 0); - g_assert_cmpint(b_s->drain_count, ==, 0); - g_assert_cmpint(backing_s->drain_count, ==, 0); - - do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a); - - g_assert_cmpint(bs_a->quiesce_counter, ==, 1); - g_assert_cmpint(bs_b->quiesce_counter, ==, 1); - g_assert_cmpint(backing->quiesce_counter, ==, 1); - g_assert_cmpint(a_s->drain_count, ==, 1); - g_assert_cmpint(b_s->drain_count, ==, 1); - g_assert_cmpint(backing_s->drain_count, ==, 1); - - do_drain_begin(BDRV_SUBTREE_DRAIN, bs_b); - - g_assert_cmpint(bs_a->quiesce_counter, ==, 2); - g_assert_cmpint(bs_b->quiesce_counter, ==, 2); - g_assert_cmpint(backing->quiesce_counter, ==, 2); - g_assert_cmpint(a_s->drain_count, ==, 2); - g_assert_cmpint(b_s->drain_count, ==, 2); - g_assert_cmpint(backing_s->drain_count, ==, 2); - - do_drain_end(BDRV_SUBTREE_DRAIN, bs_b); - - g_assert_cmpint(bs_a->quiesce_counter, ==, 1); - g_assert_cmpint(bs_b->quiesce_counter, ==, 1); - g_assert_cmpint(backing->quiesce_counter, ==, 1); - g_assert_cmpint(a_s->drain_count, ==, 1); - g_assert_cmpint(b_s->drain_count, ==, 1); - g_assert_cmpint(backing_s->drain_count, ==, 1); - - do_drain_end(BDRV_SUBTREE_DRAIN, bs_a); - - g_assert_cmpint(bs_a->quiesce_counter, ==, 0); - g_assert_cmpint(bs_b->quiesce_counter, ==, 0); - g_assert_cmpint(backing->quiesce_counter, ==, 0); - g_assert_cmpint(a_s->drain_count, ==, 0); - g_assert_cmpint(b_s->drain_count, ==, 0); - g_assert_cmpint(backing_s->drain_count, ==, 0); - - bdrv_unref(backing); - bdrv_unref(bs_a); - bdrv_unref(bs_b); - blk_unref(blk_a); - blk_unref(blk_b); -} - -static void test_graph_change_drain_subtree(void) -{ - BlockBackend *blk_a, *blk_b; - BlockDriverState *bs_a, *bs_b, *backing; - BDRVTestState *a_s, *b_s, *backing_s; - - blk_a = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL); - bs_a = bdrv_new_open_driver(&bdrv_test, "test-node-a", BDRV_O_RDWR, - &error_abort); - a_s = bs_a->opaque; - blk_insert_bs(blk_a, bs_a, &error_abort); - - blk_b = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL); - bs_b = bdrv_new_open_driver(&bdrv_test, "test-node-b", BDRV_O_RDWR, - &error_abort); - b_s = bs_b->opaque; - blk_insert_bs(blk_b, bs_b, &error_abort); - - backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort); - backing_s = backing->opaque; - bdrv_set_backing_hd(bs_a, backing, &error_abort); - - g_assert_cmpint(bs_a->quiesce_counter, ==, 0); - g_assert_cmpint(bs_b->quiesce_counter, ==, 0); - g_assert_cmpint(backing->quiesce_counter, ==, 0); - g_assert_cmpint(a_s->drain_count, ==, 0); - g_assert_cmpint(b_s->drain_count, ==, 0); - g_assert_cmpint(backing_s->drain_count, ==, 0); - - do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a); - do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a); - do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a); - do_drain_begin(BDRV_SUBTREE_DRAIN, bs_b); - do_drain_begin(BDRV_SUBTREE_DRAIN, bs_b); - - bdrv_set_backing_hd(bs_b, backing, &error_abort); - g_assert_cmpint(bs_a->quiesce_counter, ==, 5); - g_assert_cmpint(bs_b->quiesce_counter, ==, 5); - g_assert_cmpint(backing->quiesce_counter, ==, 5); - g_assert_cmpint(a_s->drain_count, ==, 5); - g_assert_cmpint(b_s->drain_count, ==, 5); - g_assert_cmpint(backing_s->drain_count, ==, 5); - - bdrv_set_backing_hd(bs_b, NULL, &error_abort); - g_assert_cmpint(bs_a->quiesce_counter, ==, 3); - g_assert_cmpint(bs_b->quiesce_counter, ==, 2); - g_assert_cmpint(backing->quiesce_counter, ==, 3); - g_assert_cmpint(a_s->drain_count, ==, 3); - g_assert_cmpint(b_s->drain_count, ==, 2); - g_assert_cmpint(backing_s->drain_count, ==, 3); - - bdrv_set_backing_hd(bs_b, backing, &error_abort); - g_assert_cmpint(bs_a->quiesce_counter, ==, 5); - g_assert_cmpint(bs_b->quiesce_counter, ==, 5); - g_assert_cmpint(backing->quiesce_counter, ==, 5); - g_assert_cmpint(a_s->drain_count, ==, 5); - g_assert_cmpint(b_s->drain_count, ==, 5); - g_assert_cmpint(backing_s->drain_count, ==, 5); - - do_drain_end(BDRV_SUBTREE_DRAIN, bs_b); - do_drain_end(BDRV_SUBTREE_DRAIN, bs_b); - do_drain_end(BDRV_SUBTREE_DRAIN, bs_a); - do_drain_end(BDRV_SUBTREE_DRAIN, bs_a); - do_drain_end(BDRV_SUBTREE_DRAIN, bs_a); - - g_assert_cmpint(bs_a->quiesce_counter, ==, 0); - g_assert_cmpint(bs_b->quiesce_counter, ==, 0); - g_assert_cmpint(backing->quiesce_counter, ==, 0); - g_assert_cmpint(a_s->drain_count, ==, 0); - g_assert_cmpint(b_s->drain_count, ==, 0); - g_assert_cmpint(backing_s->drain_count, ==, 0); - - bdrv_unref(backing); - bdrv_unref(bs_a); - bdrv_unref(bs_b); - blk_unref(blk_a); - blk_unref(blk_b); -} - static void test_graph_change_drain_all(void) { BlockBackend *blk_a, *blk_b; @@ -763,12 +602,6 @@ static void test_iothread_drain(void) test_iothread_common(BDRV_DRAIN, 1); } -static void test_iothread_drain_subtree(void) -{ - test_iothread_common(BDRV_SUBTREE_DRAIN, 0); - test_iothread_common(BDRV_SUBTREE_DRAIN, 1); -} - typedef struct TestBlockJob { BlockJob common; @@ -853,7 +686,6 @@ enum test_job_result { enum test_job_drain_node { TEST_JOB_DRAIN_SRC, TEST_JOB_DRAIN_SRC_CHILD, - TEST_JOB_DRAIN_SRC_PARENT, }; static void test_blockjob_common_drain_node(enum drain_type drain_type, @@ -891,9 +723,6 @@ static void test_blockjob_common_drain_node(enum drain_type drain_type, case TEST_JOB_DRAIN_SRC_CHILD: drain_bs = src_backing; break; - case TEST_JOB_DRAIN_SRC_PARENT: - drain_bs = src_overlay; - break; default: g_assert_not_reached(); } @@ -1045,10 +874,6 @@ static void test_blockjob_common(enum drain_type drain_type, bool use_iothread, TEST_JOB_DRAIN_SRC); test_blockjob_common_drain_node(drain_type, use_iothread, result, TEST_JOB_DRAIN_SRC_CHILD); - if (drain_type == BDRV_SUBTREE_DRAIN) { - test_blockjob_common_drain_node(drain_type, use_iothread, result, - TEST_JOB_DRAIN_SRC_PARENT); - } } static void test_blockjob_drain_all(void) @@ -1061,11 +886,6 @@ static void test_blockjob_drain(void) test_blockjob_common(BDRV_DRAIN, false, TEST_JOB_SUCCESS); } -static void test_blockjob_drain_subtree(void) -{ - test_blockjob_common(BDRV_SUBTREE_DRAIN, false, TEST_JOB_SUCCESS); -} - static void test_blockjob_error_drain_all(void) { test_blockjob_common(BDRV_DRAIN_ALL, false, TEST_JOB_FAIL_RUN); @@ -1078,12 +898,6 @@ static void test_blockjob_error_drain(void) test_blockjob_common(BDRV_DRAIN, false, TEST_JOB_FAIL_PREPARE); } -static void test_blockjob_error_drain_subtree(void) -{ - test_blockjob_common(BDRV_SUBTREE_DRAIN, false, TEST_JOB_FAIL_RUN); - test_blockjob_common(BDRV_SUBTREE_DRAIN, false, TEST_JOB_FAIL_PREPARE); -} - static void test_blockjob_iothread_drain_all(void) { test_blockjob_common(BDRV_DRAIN_ALL, true, TEST_JOB_SUCCESS); @@ -1094,11 +908,6 @@ static void test_blockjob_iothread_drain(void) test_blockjob_common(BDRV_DRAIN, true, TEST_JOB_SUCCESS); } -static void test_blockjob_iothread_drain_subtree(void) -{ - test_blockjob_common(BDRV_SUBTREE_DRAIN, true, TEST_JOB_SUCCESS); -} - static void test_blockjob_iothread_error_drain_all(void) { test_blockjob_common(BDRV_DRAIN_ALL, true, TEST_JOB_FAIL_RUN); @@ -1111,12 +920,6 @@ static void test_blockjob_iothread_error_drain(void) test_blockjob_common(BDRV_DRAIN, true, TEST_JOB_FAIL_PREPARE); } -static void test_blockjob_iothread_error_drain_subtree(void) -{ - test_blockjob_common(BDRV_SUBTREE_DRAIN, true, TEST_JOB_FAIL_RUN); - test_blockjob_common(BDRV_SUBTREE_DRAIN, true, TEST_JOB_FAIL_PREPARE); -} - typedef struct BDRVTestTopState { BdrvChild *wait_child; @@ -1263,14 +1066,6 @@ static void do_test_delete_by_drain(bool detach_instead_of_delete, bdrv_drain(child_bs); bdrv_unref(child_bs); break; - case BDRV_SUBTREE_DRAIN: - /* Would have to ref/unref bs here for !detach_instead_of_delete, but - * then the whole test becomes pointless because the graph changes - * don't occur during the drain any more. */ - assert(detach_instead_of_delete); - bdrv_subtree_drained_begin(bs); - bdrv_subtree_drained_end(bs); - break; case BDRV_DRAIN_ALL: bdrv_drain_all_begin(); bdrv_drain_all_end(); @@ -1305,11 +1100,6 @@ static void test_detach_by_drain(void) do_test_delete_by_drain(true, BDRV_DRAIN); } -static void test_detach_by_drain_subtree(void) -{ - do_test_delete_by_drain(true, BDRV_SUBTREE_DRAIN); -} - struct detach_by_parent_data { BlockDriverState *parent_b; @@ -1317,6 +1107,7 @@ struct detach_by_parent_data { BlockDriverState *c; BdrvChild *child_c; bool by_parent_cb; + bool detach_on_drain; }; static struct detach_by_parent_data detach_by_parent_data; @@ -1324,6 +1115,7 @@ static void detach_indirect_bh(void *opaque) { struct detach_by_parent_data *data = opaque; + bdrv_dec_in_flight(data->child_b->bs); bdrv_unref_child(data->parent_b, data->child_b); bdrv_ref(data->c); @@ -1338,12 +1130,21 @@ static void detach_by_parent_aio_cb(void *opaque, int ret) g_assert_cmpint(ret, ==, 0); if (data->by_parent_cb) { + bdrv_inc_in_flight(data->child_b->bs); detach_indirect_bh(data); } } static void detach_by_driver_cb_drained_begin(BdrvChild *child) { + struct detach_by_parent_data *data = &detach_by_parent_data; + + if (!data->detach_on_drain) { + return; + } + data->detach_on_drain = false; + + bdrv_inc_in_flight(data->child_b->bs); aio_bh_schedule_oneshot(qemu_get_current_aio_context(), detach_indirect_bh, &detach_by_parent_data); child_of_bds.drained_begin(child); @@ -1384,8 +1185,14 @@ static void test_detach_indirect(bool by_parent_cb) detach_by_driver_cb_class = child_of_bds; detach_by_driver_cb_class.drained_begin = detach_by_driver_cb_drained_begin; + detach_by_driver_cb_class.drained_end = NULL; + detach_by_driver_cb_class.drained_poll = NULL; } + detach_by_parent_data = (struct detach_by_parent_data) { + .detach_on_drain = false, + }; + /* Create all involved nodes */ parent_a = bdrv_new_open_driver(&bdrv_test, "parent-a", BDRV_O_RDWR, &error_abort); @@ -1437,12 +1244,16 @@ static void test_detach_indirect(bool by_parent_cb) .child_b = child_b, .c = c, .by_parent_cb = by_parent_cb, + .detach_on_drain = true, }; acb = blk_aio_preadv(blk, 0, &qiov, 0, detach_by_parent_aio_cb, NULL); g_assert(acb != NULL); /* Drain and check the expected result */ - bdrv_subtree_drained_begin(parent_b); + bdrv_drained_begin(parent_b); + bdrv_drained_begin(a); + bdrv_drained_begin(b); + bdrv_drained_begin(c); g_assert(detach_by_parent_data.child_c != NULL); @@ -1457,12 +1268,15 @@ static void test_detach_indirect(bool by_parent_cb) g_assert(QLIST_NEXT(child_a, next) == NULL); g_assert_cmpint(parent_a->quiesce_counter, ==, 1); - g_assert_cmpint(parent_b->quiesce_counter, ==, 1); + g_assert_cmpint(parent_b->quiesce_counter, ==, 3); g_assert_cmpint(a->quiesce_counter, ==, 1); - g_assert_cmpint(b->quiesce_counter, ==, 0); + g_assert_cmpint(b->quiesce_counter, ==, 1); g_assert_cmpint(c->quiesce_counter, ==, 1); - bdrv_subtree_drained_end(parent_b); + bdrv_drained_end(parent_b); + bdrv_drained_end(a); + bdrv_drained_end(b); + bdrv_drained_end(c); bdrv_unref(parent_b); blk_unref(blk); @@ -1693,6 +1507,7 @@ static void test_blockjob_commit_by_drained_end(void) bdrv_drained_begin(bs_child); g_assert(!job_has_completed); bdrv_drained_end(bs_child); + aio_poll(qemu_get_aio_context(), false); g_assert(job_has_completed); bdrv_unref(bs_parents[0]); @@ -1848,6 +1663,7 @@ static void test_drop_intermediate_poll(void) g_assert(!job_has_completed); ret = bdrv_drop_intermediate(chain[1], chain[0], NULL); + aio_poll(qemu_get_aio_context(), false); g_assert(ret == 0); g_assert(job_has_completed); @@ -1856,6 +1672,7 @@ static void test_drop_intermediate_poll(void) typedef struct BDRVReplaceTestState { + bool setup_completed; bool was_drained; bool was_undrained; bool has_read; @@ -1916,52 +1733,78 @@ static int coroutine_fn bdrv_replace_test_co_preadv(BlockDriverState *bs, return 0; } +static void coroutine_fn bdrv_replace_test_drain_co(void *opaque) +{ + BlockDriverState *bs = opaque; + BDRVReplaceTestState *s = bs->opaque; + + /* Keep waking io_co up until it is done */ + while (s->io_co) { + aio_co_wake(s->io_co); + s->io_co = NULL; + qemu_coroutine_yield(); + } + s->drain_co = NULL; + bdrv_dec_in_flight(bs); +} + /** * If .drain_count is 0, wake up .io_co if there is one; and set * .was_drained. * Increment .drain_count. */ -static void coroutine_fn bdrv_replace_test_co_drain_begin(BlockDriverState *bs) +static void bdrv_replace_test_drain_begin(BlockDriverState *bs) { BDRVReplaceTestState *s = bs->opaque; - if (!s->drain_count) { - /* Keep waking io_co up until it is done */ - s->drain_co = qemu_coroutine_self(); - while (s->io_co) { - aio_co_wake(s->io_co); - s->io_co = NULL; - qemu_coroutine_yield(); - } - s->drain_co = NULL; + if (!s->setup_completed) { + return; + } + if (!s->drain_count) { + s->drain_co = qemu_coroutine_create(bdrv_replace_test_drain_co, bs); + bdrv_inc_in_flight(bs); + aio_co_enter(bdrv_get_aio_context(bs), s->drain_co); s->was_drained = true; } s->drain_count++; } +static void coroutine_fn bdrv_replace_test_read_entry(void *opaque) +{ + BlockDriverState *bs = opaque; + char data; + QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, &data, 1); + int ret; + + /* Queue a read request post-drain */ + ret = bdrv_replace_test_co_preadv(bs, 0, 1, &qiov, 0); + g_assert(ret >= 0); + bdrv_dec_in_flight(bs); +} + /** * Reduce .drain_count, set .was_undrained once it reaches 0. * If .drain_count reaches 0 and the node has a backing file, issue a * read request. */ -static void coroutine_fn bdrv_replace_test_co_drain_end(BlockDriverState *bs) +static void bdrv_replace_test_drain_end(BlockDriverState *bs) { BDRVReplaceTestState *s = bs->opaque; + if (!s->setup_completed) { + return; + } + g_assert(s->drain_count > 0); if (!--s->drain_count) { - int ret; - s->was_undrained = true; if (bs->backing) { - char data; - QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, &data, 1); - - /* Queue a read request post-drain */ - ret = bdrv_replace_test_co_preadv(bs, 0, 1, &qiov, 0); - g_assert(ret >= 0); + Coroutine *co = qemu_coroutine_create(bdrv_replace_test_read_entry, + bs); + bdrv_inc_in_flight(bs); + aio_co_enter(bdrv_get_aio_context(bs), co); } } } @@ -1974,8 +1817,8 @@ static BlockDriver bdrv_replace_test = { .bdrv_close = bdrv_replace_test_close, .bdrv_co_preadv = bdrv_replace_test_co_preadv, - .bdrv_co_drain_begin = bdrv_replace_test_co_drain_begin, - .bdrv_co_drain_end = bdrv_replace_test_co_drain_end, + .bdrv_drain_begin = bdrv_replace_test_drain_begin, + .bdrv_drain_end = bdrv_replace_test_drain_end, .bdrv_child_perm = bdrv_default_perms, }; @@ -2051,6 +1894,7 @@ static void do_test_replace_child_mid_drain(int old_drain_count, bdrv_ref(old_child_bs); bdrv_attach_child(parent_bs, old_child_bs, "child", &child_of_bds, BDRV_CHILD_COW, &error_abort); + parent_s->setup_completed = true; for (i = 0; i < old_drain_count; i++) { bdrv_drained_begin(old_child_bs); @@ -2172,70 +2016,47 @@ int main(int argc, char **argv) g_test_add_func("/bdrv-drain/driver-cb/drain_all", test_drv_cb_drain_all); g_test_add_func("/bdrv-drain/driver-cb/drain", test_drv_cb_drain); - g_test_add_func("/bdrv-drain/driver-cb/drain_subtree", - test_drv_cb_drain_subtree); g_test_add_func("/bdrv-drain/driver-cb/co/drain_all", test_drv_cb_co_drain_all); g_test_add_func("/bdrv-drain/driver-cb/co/drain", test_drv_cb_co_drain); - g_test_add_func("/bdrv-drain/driver-cb/co/drain_subtree", - test_drv_cb_co_drain_subtree); - g_test_add_func("/bdrv-drain/quiesce/drain_all", test_quiesce_drain_all); g_test_add_func("/bdrv-drain/quiesce/drain", test_quiesce_drain); - g_test_add_func("/bdrv-drain/quiesce/drain_subtree", - test_quiesce_drain_subtree); g_test_add_func("/bdrv-drain/quiesce/co/drain_all", test_quiesce_co_drain_all); g_test_add_func("/bdrv-drain/quiesce/co/drain", test_quiesce_co_drain); - g_test_add_func("/bdrv-drain/quiesce/co/drain_subtree", - test_quiesce_co_drain_subtree); g_test_add_func("/bdrv-drain/nested", test_nested); - g_test_add_func("/bdrv-drain/multiparent", test_multiparent); - g_test_add_func("/bdrv-drain/graph-change/drain_subtree", - test_graph_change_drain_subtree); g_test_add_func("/bdrv-drain/graph-change/drain_all", test_graph_change_drain_all); g_test_add_func("/bdrv-drain/iothread/drain_all", test_iothread_drain_all); g_test_add_func("/bdrv-drain/iothread/drain", test_iothread_drain); - g_test_add_func("/bdrv-drain/iothread/drain_subtree", - test_iothread_drain_subtree); g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all); g_test_add_func("/bdrv-drain/blockjob/drain", test_blockjob_drain); - g_test_add_func("/bdrv-drain/blockjob/drain_subtree", - test_blockjob_drain_subtree); g_test_add_func("/bdrv-drain/blockjob/error/drain_all", test_blockjob_error_drain_all); g_test_add_func("/bdrv-drain/blockjob/error/drain", test_blockjob_error_drain); - g_test_add_func("/bdrv-drain/blockjob/error/drain_subtree", - test_blockjob_error_drain_subtree); g_test_add_func("/bdrv-drain/blockjob/iothread/drain_all", test_blockjob_iothread_drain_all); g_test_add_func("/bdrv-drain/blockjob/iothread/drain", test_blockjob_iothread_drain); - g_test_add_func("/bdrv-drain/blockjob/iothread/drain_subtree", - test_blockjob_iothread_drain_subtree); g_test_add_func("/bdrv-drain/blockjob/iothread/error/drain_all", test_blockjob_iothread_error_drain_all); g_test_add_func("/bdrv-drain/blockjob/iothread/error/drain", test_blockjob_iothread_error_drain); - g_test_add_func("/bdrv-drain/blockjob/iothread/error/drain_subtree", - test_blockjob_iothread_error_drain_subtree); g_test_add_func("/bdrv-drain/deletion/drain", test_delete_by_drain); g_test_add_func("/bdrv-drain/detach/drain_all", test_detach_by_drain_all); g_test_add_func("/bdrv-drain/detach/drain", test_detach_by_drain); - g_test_add_func("/bdrv-drain/detach/drain_subtree", test_detach_by_drain_subtree); g_test_add_func("/bdrv-drain/detach/parent_cb", test_detach_by_parent_cb); g_test_add_func("/bdrv-drain/detach/driver_cb", test_detach_by_driver_cb); diff --git a/util/async.c b/util/async.c index 63434ddae4..14d63b3091 100644 --- a/util/async.c +++ b/util/async.c @@ -27,6 +27,7 @@ #include "qapi/error.h" #include "block/aio.h" #include "block/thread-pool.h" +#include "block/graph-lock.h" #include "qemu/main-loop.h" #include "qemu/atomic.h" #include "qemu/rcu_queue.h" @@ -376,6 +377,7 @@ aio_ctx_finalize(GSource *source) qemu_rec_mutex_destroy(&ctx->lock); qemu_lockcnt_destroy(&ctx->list_lock); timerlistgroup_deinit(&ctx->tlg); + unregister_aiocontext(ctx); aio_context_destroy(ctx); } @@ -574,6 +576,8 @@ AioContext *aio_context_new(Error **errp) ctx->thread_pool_min = 0; ctx->thread_pool_max = THREAD_POOL_MAX_THREADS_DEFAULT; + register_aiocontext(ctx); + return ctx; fail: g_source_destroy(&ctx->source); |