aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--block.c404
-rw-r--r--block/block-backend.c25
-rw-r--r--block/block-copy.c21
-rw-r--r--block/block-gen.h11
-rw-r--r--block/commit.c4
-rw-r--r--block/coroutines.h21
-rw-r--r--block/crypto.c2
-rw-r--r--block/dirty-bitmap.c88
-rw-r--r--block/graph-lock.c275
-rw-r--r--block/io.c367
-rw-r--r--block/meson.build2
-rw-r--r--block/parallels.c2
-rw-r--r--block/qcow.c2
-rw-r--r--block/qcow2.c4
-rw-r--r--block/qed.c28
-rw-r--r--block/raw-format.c2
-rw-r--r--block/replication.c6
-rw-r--r--block/stream.c26
-rw-r--r--block/throttle.c8
-rw-r--r--block/vdi.c2
-rw-r--r--block/vhdx.c2
-rw-r--r--block/vmdk.c38
-rw-r--r--block/vpc.c2
-rw-r--r--blockdev.c17
-rw-r--r--blockjob.c2
-rw-r--r--docs/devel/block-coroutine-wrapper.rst6
-rw-r--r--include/block/aio.h9
-rw-r--r--include/block/block-common.h27
-rw-r--r--include/block/block-copy.h5
-rw-r--r--include/block/block-global-state.h15
-rw-r--r--include/block/block-io.h136
-rw-r--r--include/block/block_int-common.h49
-rw-r--r--include/block/block_int-global-state.h17
-rw-r--r--include/block/block_int-io.h12
-rw-r--r--include/block/block_int.h1
-rw-r--r--include/block/dirty-bitmap.h10
-rw-r--r--include/block/graph-lock.h280
-rw-r--r--include/qemu/clang-tsa.h114
-rw-r--r--include/sysemu/block-backend-io.h77
-rw-r--r--nbd/server.c47
-rw-r--r--scripts/block-coroutine-wrapper.py133
-rw-r--r--stubs/graph-lock.c10
-rw-r--r--stubs/meson.build1
-rw-r--r--tests/unit/test-bdrv-drain.c387
-rw-r--r--util/async.c4
45 files changed, 1574 insertions, 1127 deletions
diff --git a/block.c b/block.c
index a18f052374..9c2ac757e4 100644
--- a/block.c
+++ b/block.c
@@ -93,8 +93,6 @@ static bool bdrv_recurse_has_child(BlockDriverState *bs,
static void bdrv_replace_child_noperm(BdrvChild *child,
BlockDriverState *new_bs);
static void bdrv_remove_child(BdrvChild *child, Transaction *tran);
-static void bdrv_remove_filter_or_cow_child(BlockDriverState *bs,
- Transaction *tran);
static int bdrv_reopen_prepare(BDRVReopenState *reopen_state,
BlockReopenQueue *queue,
@@ -528,65 +526,24 @@ typedef struct CreateCo {
Error *err;
} CreateCo;
-static void coroutine_fn bdrv_create_co_entry(void *opaque)
-{
- Error *local_err = NULL;
- int ret;
-
- CreateCo *cco = opaque;
- assert(cco->drv);
- GLOBAL_STATE_CODE();
-
- ret = cco->drv->bdrv_co_create_opts(cco->drv,
- cco->filename, cco->opts, &local_err);
- error_propagate(&cco->err, local_err);
- cco->ret = ret;
-}
-
-int bdrv_create(BlockDriver *drv, const char* filename,
- QemuOpts *opts, Error **errp)
+int coroutine_fn bdrv_co_create(BlockDriver *drv, const char *filename,
+ QemuOpts *opts, Error **errp)
{
int ret;
-
GLOBAL_STATE_CODE();
-
- Coroutine *co;
- CreateCo cco = {
- .drv = drv,
- .filename = g_strdup(filename),
- .opts = opts,
- .ret = NOT_DONE,
- .err = NULL,
- };
+ ERRP_GUARD();
if (!drv->bdrv_co_create_opts) {
- error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
- ret = -ENOTSUP;
- goto out;
- }
-
- if (qemu_in_coroutine()) {
- /* Fast-path if already in coroutine context */
- bdrv_create_co_entry(&cco);
- } else {
- co = qemu_coroutine_create(bdrv_create_co_entry, &cco);
- qemu_coroutine_enter(co);
- while (cco.ret == NOT_DONE) {
- aio_poll(qemu_get_aio_context(), true);
- }
+ error_setg(errp, "Driver '%s' does not support image creation",
+ drv->format_name);
+ return -ENOTSUP;
}
- ret = cco.ret;
- if (ret < 0) {
- if (cco.err) {
- error_propagate(errp, cco.err);
- } else {
- error_setg_errno(errp, -ret, "Could not create image");
- }
+ ret = drv->bdrv_co_create_opts(drv, filename, opts, errp);
+ if (ret < 0 && !*errp) {
+ error_setg_errno(errp, -ret, "Could not create image");
}
-out:
- g_free(cco.filename);
return ret;
}
@@ -725,7 +682,8 @@ out:
return ret;
}
-int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp)
+int coroutine_fn bdrv_co_create_file(const char *filename, QemuOpts *opts,
+ Error **errp)
{
QemuOpts *protocol_opts;
BlockDriver *drv;
@@ -766,7 +724,7 @@ int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp)
goto out;
}
- ret = bdrv_create(drv, filename, protocol_opts, errp);
+ ret = bdrv_co_create(drv, filename, protocol_opts, errp);
out:
qemu_opts_del(protocol_opts);
qobject_unref(qdict);
@@ -1228,20 +1186,19 @@ static char *bdrv_child_get_parent_desc(BdrvChild *c)
static void bdrv_child_cb_drained_begin(BdrvChild *child)
{
BlockDriverState *bs = child->opaque;
- bdrv_do_drained_begin_quiesce(bs, NULL, false);
+ bdrv_do_drained_begin_quiesce(bs, NULL);
}
static bool bdrv_child_cb_drained_poll(BdrvChild *child)
{
BlockDriverState *bs = child->opaque;
- return bdrv_drain_poll(bs, false, NULL, false);
+ return bdrv_drain_poll(bs, NULL, false);
}
-static void bdrv_child_cb_drained_end(BdrvChild *child,
- int *drained_end_counter)
+static void bdrv_child_cb_drained_end(BdrvChild *child)
{
BlockDriverState *bs = child->opaque;
- bdrv_drained_end_no_poll(bs, drained_end_counter);
+ bdrv_drained_end(bs);
}
static int bdrv_child_cb_inactivate(BdrvChild *child)
@@ -1445,11 +1402,11 @@ static void bdrv_inherited_options(BdrvChildRole role, bool parent_is_format,
*child_flags = flags;
}
-static void bdrv_child_cb_attach(BdrvChild *child)
+static void GRAPH_WRLOCK bdrv_child_cb_attach(BdrvChild *child)
{
BlockDriverState *bs = child->opaque;
- assert_bdrv_graph_writable(bs);
+ assert_bdrv_graph_writable();
QLIST_INSERT_HEAD(&bs->children, child, next);
if (bs->drv->is_filter || (child->role & BDRV_CHILD_FILTERED)) {
/*
@@ -1485,11 +1442,9 @@ static void bdrv_child_cb_attach(BdrvChild *child)
assert(!bs->file);
bs->file = child;
}
-
- bdrv_apply_subtree_drain(child, bs);
}
-static void bdrv_child_cb_detach(BdrvChild *child)
+static void GRAPH_WRLOCK bdrv_child_cb_detach(BdrvChild *child)
{
BlockDriverState *bs = child->opaque;
@@ -1497,9 +1452,7 @@ static void bdrv_child_cb_detach(BdrvChild *child)
bdrv_backing_detach(child);
}
- bdrv_unapply_subtree_drain(child, bs);
-
- assert_bdrv_graph_writable(bs);
+ assert_bdrv_graph_writable();
QLIST_REMOVE(child, next);
if (child == bs->backing) {
assert(child != bs->file);
@@ -1715,8 +1668,8 @@ static int bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv,
assert(is_power_of_2(bs->bl.request_alignment));
for (i = 0; i < bs->quiesce_counter; i++) {
- if (drv->bdrv_co_drain_begin) {
- drv->bdrv_co_drain_begin(bs);
+ if (drv->bdrv_drain_begin) {
+ drv->bdrv_drain_begin(bs);
}
}
@@ -2414,6 +2367,20 @@ static void bdrv_replace_child_abort(void *opaque)
GLOBAL_STATE_CODE();
/* old_bs reference is transparently moved from @s to @s->child */
+ if (!s->child->bs) {
+ /*
+ * The parents were undrained when removing old_bs from the child. New
+ * requests can't have been made, though, because the child was empty.
+ *
+ * TODO Make bdrv_replace_child_noperm() transactionable to avoid
+ * undraining the parent in the first place. Once this is done, having
+ * new_bs drained when calling bdrv_replace_child_tran() is not a
+ * requirement any more.
+ */
+ bdrv_parent_drained_begin_single(s->child);
+ assert(!bdrv_parent_drained_poll_single(s->child));
+ }
+ assert(s->child->quiesced_parent);
bdrv_replace_child_noperm(s->child, s->old_bs);
bdrv_unref(new_bs);
}
@@ -2429,12 +2396,19 @@ static TransactionActionDrv bdrv_replace_child_drv = {
*
* Note: real unref of old_bs is done only on commit.
*
+ * Both @child->bs and @new_bs (if non-NULL) must be drained. @new_bs must be
+ * kept drained until the transaction is completed.
+ *
* The function doesn't update permissions, caller is responsible for this.
*/
static void bdrv_replace_child_tran(BdrvChild *child, BlockDriverState *new_bs,
Transaction *tran)
{
BdrvReplaceChildState *s = g_new(BdrvReplaceChildState, 1);
+
+ assert(child->quiesced_parent);
+ assert(!new_bs || new_bs->quiesce_counter);
+
*s = (BdrvReplaceChildState) {
.child = child,
.old_bs = child->bs,
@@ -2523,8 +2497,12 @@ static int bdrv_node_refresh_perm(BlockDriverState *bs, BlockReopenQueue *q,
return 0;
}
-static int bdrv_list_refresh_perms(GSList *list, BlockReopenQueue *q,
- Transaction *tran, Error **errp)
+/*
+ * @list is a product of bdrv_topological_dfs() (may be called several times) -
+ * a topologically sorted subgraph.
+ */
+static int bdrv_do_refresh_perms(GSList *list, BlockReopenQueue *q,
+ Transaction *tran, Error **errp)
{
int ret;
BlockDriverState *bs;
@@ -2546,6 +2524,24 @@ static int bdrv_list_refresh_perms(GSList *list, BlockReopenQueue *q,
return 0;
}
+/*
+ * @list is any list of nodes. List is completed by all subtrees and
+ * topologically sorted. It's not a problem if some node occurs in the @list
+ * several times.
+ */
+static int bdrv_list_refresh_perms(GSList *list, BlockReopenQueue *q,
+ Transaction *tran, Error **errp)
+{
+ g_autoptr(GHashTable) found = g_hash_table_new(NULL, NULL);
+ g_autoptr(GSList) refresh_list = NULL;
+
+ for ( ; list; list = list->next) {
+ refresh_list = bdrv_topological_dfs(refresh_list, found, list->data);
+ }
+
+ return bdrv_do_refresh_perms(refresh_list, q, tran, errp);
+}
+
void bdrv_get_cumulative_perm(BlockDriverState *bs, uint64_t *perm,
uint64_t *shared_perm)
{
@@ -2593,15 +2589,24 @@ char *bdrv_perm_names(uint64_t perm)
}
-static int bdrv_refresh_perms(BlockDriverState *bs, Error **errp)
+/* @tran is allowed to be NULL. In this case no rollback is possible */
+static int bdrv_refresh_perms(BlockDriverState *bs, Transaction *tran,
+ Error **errp)
{
int ret;
- Transaction *tran = tran_new();
+ Transaction *local_tran = NULL;
g_autoptr(GSList) list = bdrv_topological_dfs(NULL, NULL, bs);
GLOBAL_STATE_CODE();
- ret = bdrv_list_refresh_perms(list, NULL, tran, errp);
- tran_finalize(tran, ret);
+ if (!tran) {
+ tran = local_tran = tran_new();
+ }
+
+ ret = bdrv_do_refresh_perms(list, NULL, tran, errp);
+
+ if (local_tran) {
+ tran_finalize(local_tran, ret);
+ }
return ret;
}
@@ -2617,7 +2622,7 @@ int bdrv_child_try_set_perm(BdrvChild *c, uint64_t perm, uint64_t shared,
bdrv_child_set_perm(c, perm, shared, tran);
- ret = bdrv_refresh_perms(c->bs, &local_err);
+ ret = bdrv_refresh_perms(c->bs, tran, &local_err);
tran_finalize(tran, ret);
@@ -2826,14 +2831,41 @@ uint64_t bdrv_qapi_perm_to_blk_perm(BlockPermission qapi_perm)
return permissions[qapi_perm];
}
+/*
+ * Replaces the node that a BdrvChild points to without updating permissions.
+ *
+ * If @new_bs is non-NULL, the parent of @child must already be drained through
+ * @child.
+ */
static void bdrv_replace_child_noperm(BdrvChild *child,
BlockDriverState *new_bs)
{
BlockDriverState *old_bs = child->bs;
int new_bs_quiesce_counter;
- int drain_saldo;
assert(!child->frozen);
+
+ /*
+ * If we want to change the BdrvChild to point to a drained node as its new
+ * child->bs, we need to make sure that its new parent is drained, too. In
+ * other words, either child->quiesce_parent must already be true or we must
+ * be able to set it and keep the parent's quiesce_counter consistent with
+ * that, but without polling or starting new requests (this function
+ * guarantees that it doesn't poll, and starting new requests would be
+ * against the invariants of drain sections).
+ *
+ * To keep things simple, we pick the first option (child->quiesce_parent
+ * must already be true). We also generalise the rule a bit to make it
+ * easier to verify in callers and more likely to be covered in test cases:
+ * The parent must be quiesced through this child even if new_bs isn't
+ * currently drained.
+ *
+ * The only exception is for callers that always pass new_bs == NULL. In
+ * this case, we obviously never need to consider the case of a drained
+ * new_bs, so we can keep the callers simpler by allowing them not to drain
+ * the parent.
+ */
+ assert(!new_bs || child->quiesced_parent);
assert(old_bs != new_bs);
GLOBAL_STATE_CODE();
@@ -2841,59 +2873,33 @@ static void bdrv_replace_child_noperm(BdrvChild *child,
assert(bdrv_get_aio_context(old_bs) == bdrv_get_aio_context(new_bs));
}
- new_bs_quiesce_counter = (new_bs ? new_bs->quiesce_counter : 0);
- drain_saldo = new_bs_quiesce_counter - child->parent_quiesce_counter;
-
- /*
- * If the new child node is drained but the old one was not, flush
- * all outstanding requests to the old child node.
- */
- while (drain_saldo > 0 && child->klass->drained_begin) {
- bdrv_parent_drained_begin_single(child, true);
- drain_saldo--;
- }
-
+ /* TODO Pull this up into the callers to avoid polling here */
+ bdrv_graph_wrlock();
if (old_bs) {
- /* Detach first so that the recursive drain sections coming from @child
- * are already gone and we only end the drain sections that came from
- * elsewhere. */
if (child->klass->detach) {
child->klass->detach(child);
}
- assert_bdrv_graph_writable(old_bs);
QLIST_REMOVE(child, next_parent);
}
child->bs = new_bs;
if (new_bs) {
- assert_bdrv_graph_writable(new_bs);
QLIST_INSERT_HEAD(&new_bs->parents, child, next_parent);
-
- /*
- * Detaching the old node may have led to the new node's
- * quiesce_counter having been decreased. Not a problem, we
- * just need to recognize this here and then invoke
- * drained_end appropriately more often.
- */
- assert(new_bs->quiesce_counter <= new_bs_quiesce_counter);
- drain_saldo += new_bs->quiesce_counter - new_bs_quiesce_counter;
-
- /* Attach only after starting new drained sections, so that recursive
- * drain sections coming from @child don't get an extra .drained_begin
- * callback. */
if (child->klass->attach) {
child->klass->attach(child);
}
}
+ bdrv_graph_wrunlock();
/*
- * If the old child node was drained but the new one is not, allow
- * requests to come in only after the new node has been attached.
+ * If the parent was drained through this BdrvChild previously, but new_bs
+ * is not drained, allow requests to come in only after the new node has
+ * been attached.
*/
- while (drain_saldo < 0 && child->klass->drained_end) {
+ new_bs_quiesce_counter = (new_bs ? new_bs->quiesce_counter : 0);
+ if (!new_bs_quiesce_counter && child->quiesced_parent) {
bdrv_parent_drained_end_single(child);
- drain_saldo++;
}
}
@@ -3026,6 +3032,24 @@ static BdrvChild *bdrv_attach_child_common(BlockDriverState *child_bs,
}
bdrv_ref(child_bs);
+ /*
+ * Let every new BdrvChild start with a drained parent. Inserting the child
+ * in the graph with bdrv_replace_child_noperm() will undrain it if
+ * @child_bs is not drained.
+ *
+ * The child was only just created and is not yet visible in global state
+ * until bdrv_replace_child_noperm() inserts it into the graph, so nobody
+ * could have sent requests and polling is not necessary.
+ *
+ * Note that this means that the parent isn't fully drained yet, we only
+ * stop new requests from coming in. This is fine, we don't care about the
+ * old requests here, they are not for this child. If another place enters a
+ * drain section for the same parent, but wants it to be fully quiesced, it
+ * will not run most of the the code in .drained_begin() again (which is not
+ * a problem, we already did this), but it will still poll until the parent
+ * is fully quiesced, so it will not be negatively affected either.
+ */
+ bdrv_parent_drained_begin_single(new_child);
bdrv_replace_child_noperm(new_child, child_bs);
BdrvAttachChildCommonState *s = g_new(BdrvAttachChildCommonState, 1);
@@ -3070,30 +3094,6 @@ static BdrvChild *bdrv_attach_child_noperm(BlockDriverState *parent_bs,
tran, errp);
}
-static void bdrv_detach_child(BdrvChild *child)
-{
- BlockDriverState *old_bs = child->bs;
-
- GLOBAL_STATE_CODE();
- bdrv_replace_child_noperm(child, NULL);
- bdrv_child_free(child);
-
- if (old_bs) {
- /*
- * Update permissions for old node. We're just taking a parent away, so
- * we're loosening restrictions. Errors of permission update are not
- * fatal in this case, ignore them.
- */
- bdrv_refresh_perms(old_bs, NULL);
-
- /*
- * When the parent requiring a non-default AioContext is removed, the
- * node moves back to the main AioContext
- */
- bdrv_try_change_aio_context(old_bs, qemu_get_aio_context(), NULL, NULL);
- }
-}
-
/*
* This function steals the reference to child_bs from the caller.
* That reference is later dropped by bdrv_root_unref_child().
@@ -3125,7 +3125,7 @@ BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs,
goto out;
}
- ret = bdrv_refresh_perms(child_bs, errp);
+ ret = bdrv_refresh_perms(child_bs, tran, errp);
out:
tran_finalize(tran, ret);
@@ -3166,7 +3166,7 @@ BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs,
goto out;
}
- ret = bdrv_refresh_perms(parent_bs, errp);
+ ret = bdrv_refresh_perms(parent_bs, tran, errp);
if (ret < 0) {
goto out;
}
@@ -3182,12 +3182,28 @@ out:
/* Callers must ensure that child->frozen is false. */
void bdrv_root_unref_child(BdrvChild *child)
{
- BlockDriverState *child_bs;
+ BlockDriverState *child_bs = child->bs;
GLOBAL_STATE_CODE();
+ bdrv_replace_child_noperm(child, NULL);
+ bdrv_child_free(child);
+
+ if (child_bs) {
+ /*
+ * Update permissions for old node. We're just taking a parent away, so
+ * we're loosening restrictions. Errors of permission update are not
+ * fatal in this case, ignore them.
+ */
+ bdrv_refresh_perms(child_bs, NULL, NULL);
+
+ /*
+ * When the parent requiring a non-default AioContext is removed, the
+ * node moves back to the main AioContext
+ */
+ bdrv_try_change_aio_context(child_bs, qemu_get_aio_context(), NULL,
+ NULL);
+ }
- child_bs = child->bs;
- bdrv_detach_child(child);
bdrv_unref(child_bs);
}
@@ -3406,24 +3422,35 @@ static int bdrv_set_backing_noperm(BlockDriverState *bs,
return bdrv_set_file_or_backing_noperm(bs, backing_hd, true, tran, errp);
}
-int bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd,
- Error **errp)
+int bdrv_set_backing_hd_drained(BlockDriverState *bs,
+ BlockDriverState *backing_hd,
+ Error **errp)
{
int ret;
Transaction *tran = tran_new();
GLOBAL_STATE_CODE();
- bdrv_drained_begin(bs);
+ assert(bs->quiesce_counter > 0);
ret = bdrv_set_backing_noperm(bs, backing_hd, tran, errp);
if (ret < 0) {
goto out;
}
- ret = bdrv_refresh_perms(bs, errp);
+ ret = bdrv_refresh_perms(bs, tran, errp);
out:
tran_finalize(tran, ret);
+ return ret;
+}
+
+int bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd,
+ Error **errp)
+{
+ int ret;
+ GLOBAL_STATE_CODE();
+ bdrv_drained_begin(bs);
+ ret = bdrv_set_backing_hd_drained(bs, backing_hd, errp);
bdrv_drained_end(bs);
return ret;
@@ -4153,7 +4180,9 @@ static bool bdrv_recurse_has_child(BlockDriverState *bs,
* returns a pointer to bs_queue, which is either the newly allocated
* bs_queue, or the existing bs_queue being used.
*
- * bs must be drained between bdrv_reopen_queue() and bdrv_reopen_multiple().
+ * bs is drained here and undrained by bdrv_reopen_queue_free().
+ *
+ * To be called with bs->aio_context locked.
*/
static BlockReopenQueue *bdrv_reopen_queue_child(BlockReopenQueue *bs_queue,
BlockDriverState *bs,
@@ -4173,12 +4202,10 @@ static BlockReopenQueue *bdrv_reopen_queue_child(BlockReopenQueue *bs_queue,
int flags;
QemuOpts *opts;
- /* Make sure that the caller remembered to use a drained section. This is
- * important to avoid graph changes between the recursive queuing here and
- * bdrv_reopen_multiple(). */
- assert(bs->quiesce_counter > 0);
GLOBAL_STATE_CODE();
+ bdrv_drained_begin(bs);
+
if (bs_queue == NULL) {
bs_queue = g_new0(BlockReopenQueue, 1);
QTAILQ_INIT(bs_queue);
@@ -4312,6 +4339,7 @@ static BlockReopenQueue *bdrv_reopen_queue_child(BlockReopenQueue *bs_queue,
return bs_queue;
}
+/* To be called with bs->aio_context locked */
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
BlockDriverState *bs,
QDict *options, bool keep_old_opts)
@@ -4328,6 +4356,12 @@ void bdrv_reopen_queue_free(BlockReopenQueue *bs_queue)
if (bs_queue) {
BlockReopenQueueEntry *bs_entry, *next;
QTAILQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
+ AioContext *ctx = bdrv_get_aio_context(bs_entry->state.bs);
+
+ aio_context_acquire(ctx);
+ bdrv_drained_end(bs_entry->state.bs);
+ aio_context_release(ctx);
+
qobject_unref(bs_entry->state.explicit_options);
qobject_unref(bs_entry->state.options);
g_free(bs_entry);
@@ -4361,7 +4395,6 @@ int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
BlockReopenQueueEntry *bs_entry, *next;
AioContext *ctx;
Transaction *tran = tran_new();
- g_autoptr(GHashTable) found = NULL;
g_autoptr(GSList) refresh_list = NULL;
assert(qemu_get_current_aio_context() == qemu_get_aio_context());
@@ -4391,18 +4424,15 @@ int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
bs_entry->prepared = true;
}
- found = g_hash_table_new(NULL, NULL);
QTAILQ_FOREACH(bs_entry, bs_queue, entry) {
BDRVReopenState *state = &bs_entry->state;
- refresh_list = bdrv_topological_dfs(refresh_list, found, state->bs);
+ refresh_list = g_slist_prepend(refresh_list, state->bs);
if (state->old_backing_bs) {
- refresh_list = bdrv_topological_dfs(refresh_list, found,
- state->old_backing_bs);
+ refresh_list = g_slist_prepend(refresh_list, state->old_backing_bs);
}
if (state->old_file_bs) {
- refresh_list = bdrv_topological_dfs(refresh_list, found,
- state->old_file_bs);
+ refresh_list = g_slist_prepend(refresh_list, state->old_file_bs);
}
}
@@ -4475,18 +4505,16 @@ int bdrv_reopen(BlockDriverState *bs, QDict *opts, bool keep_old_opts,
GLOBAL_STATE_CODE();
- bdrv_subtree_drained_begin(bs);
+ queue = bdrv_reopen_queue(NULL, bs, opts, keep_old_opts);
+
if (ctx != qemu_get_aio_context()) {
aio_context_release(ctx);
}
-
- queue = bdrv_reopen_queue(NULL, bs, opts, keep_old_opts);
ret = bdrv_reopen_multiple(queue, errp);
if (ctx != qemu_get_aio_context()) {
aio_context_acquire(ctx);
}
- bdrv_subtree_drained_end(bs);
return ret;
}
@@ -5067,23 +5095,24 @@ static void bdrv_remove_child(BdrvChild *child, Transaction *tran)
}
if (child->bs) {
+ BlockDriverState *bs = child->bs;
+ bdrv_drained_begin(bs);
bdrv_replace_child_tran(child, NULL, tran);
+ bdrv_drained_end(bs);
}
tran_add(tran, &bdrv_remove_child_drv, child);
}
-/*
- * A function to remove backing-chain child of @bs if exists: cow child for
- * format nodes (always .backing) and filter child for filters (may be .file or
- * .backing)
- */
-static void bdrv_remove_filter_or_cow_child(BlockDriverState *bs,
- Transaction *tran)
+static void undrain_on_clean_cb(void *opaque)
{
- bdrv_remove_child(bdrv_filter_or_cow_child(bs), tran);
+ bdrv_drained_end(opaque);
}
+static TransactionActionDrv undrain_on_clean = {
+ .clean = undrain_on_clean_cb,
+};
+
static int bdrv_replace_node_noperm(BlockDriverState *from,
BlockDriverState *to,
bool auto_skip, Transaction *tran,
@@ -5093,6 +5122,11 @@ static int bdrv_replace_node_noperm(BlockDriverState *from,
GLOBAL_STATE_CODE();
+ bdrv_drained_begin(from);
+ bdrv_drained_begin(to);
+ tran_add(tran, &undrain_on_clean, from);
+ tran_add(tran, &undrain_on_clean, to);
+
QLIST_FOREACH_SAFE(c, &from->parents, next_parent, next) {
assert(c->bs == from);
if (!should_update_child(c, to)) {
@@ -5130,7 +5164,6 @@ static int bdrv_replace_node_common(BlockDriverState *from,
Error **errp)
{
Transaction *tran = tran_new();
- g_autoptr(GHashTable) found = NULL;
g_autoptr(GSList) refresh_list = NULL;
BlockDriverState *to_cow_parent = NULL;
int ret;
@@ -5168,13 +5201,11 @@ static int bdrv_replace_node_common(BlockDriverState *from,
}
if (detach_subchain) {
- bdrv_remove_filter_or_cow_child(to_cow_parent, tran);
+ bdrv_remove_child(bdrv_filter_or_cow_child(to_cow_parent), tran);
}
- found = g_hash_table_new(NULL, NULL);
-
- refresh_list = bdrv_topological_dfs(refresh_list, found, to);
- refresh_list = bdrv_topological_dfs(refresh_list, found, from);
+ refresh_list = g_slist_prepend(refresh_list, to);
+ refresh_list = g_slist_prepend(refresh_list, from);
ret = bdrv_list_refresh_perms(refresh_list, NULL, tran, errp);
if (ret < 0) {
@@ -5244,7 +5275,7 @@ int bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top,
goto out;
}
- ret = bdrv_refresh_perms(bs_new, errp);
+ ret = bdrv_refresh_perms(bs_new, tran, errp);
out:
tran_finalize(tran, ret);
@@ -5259,7 +5290,6 @@ int bdrv_replace_child_bs(BdrvChild *child, BlockDriverState *new_bs,
{
int ret;
Transaction *tran = tran_new();
- g_autoptr(GHashTable) found = NULL;
g_autoptr(GSList) refresh_list = NULL;
BlockDriverState *old_bs = child->bs;
@@ -5271,9 +5301,8 @@ int bdrv_replace_child_bs(BdrvChild *child, BlockDriverState *new_bs,
bdrv_replace_child_tran(child, new_bs, tran);
- found = g_hash_table_new(NULL, NULL);
- refresh_list = bdrv_topological_dfs(refresh_list, found, old_bs);
- refresh_list = bdrv_topological_dfs(refresh_list, found, new_bs);
+ refresh_list = g_slist_prepend(refresh_list, old_bs);
+ refresh_list = g_slist_prepend(refresh_list, new_bs);
ret = bdrv_list_refresh_perms(refresh_list, NULL, tran, errp);
@@ -5373,6 +5402,7 @@ int coroutine_fn bdrv_co_check(BlockDriverState *bs,
BdrvCheckResult *res, BdrvCheckMode fix)
{
IO_CODE();
+ assert_bdrv_graph_readable();
if (bs->drv == NULL) {
return -ENOMEDIUM;
}
@@ -5595,7 +5625,7 @@ int bdrv_drop_intermediate(BlockDriverState *top, BlockDriverState *base,
GLOBAL_STATE_CODE();
bdrv_ref(top);
- bdrv_subtree_drained_begin(top);
+ bdrv_drained_begin(base);
if (!top->drv || !base->drv) {
goto exit;
@@ -5668,7 +5698,7 @@ int bdrv_drop_intermediate(BlockDriverState *top, BlockDriverState *base,
ret = 0;
exit:
- bdrv_subtree_drained_end(top);
+ bdrv_drained_end(base);
bdrv_unref(top);
return ret;
}
@@ -6544,7 +6574,7 @@ int bdrv_activate(BlockDriverState *bs, Error **errp)
*/
if (bs->open_flags & BDRV_O_INACTIVE) {
bs->open_flags &= ~BDRV_O_INACTIVE;
- ret = bdrv_refresh_perms(bs, errp);
+ ret = bdrv_refresh_perms(bs, NULL, errp);
if (ret < 0) {
bs->open_flags |= BDRV_O_INACTIVE;
return ret;
@@ -6588,6 +6618,7 @@ int coroutine_fn bdrv_co_invalidate_cache(BlockDriverState *bs, Error **errp)
IO_CODE();
assert(!(bs->open_flags & BDRV_O_INACTIVE));
+ assert_bdrv_graph_readable();
if (bs->drv->bdrv_co_invalidate_cache) {
bs->drv->bdrv_co_invalidate_cache(bs, &local_err);
@@ -6689,7 +6720,7 @@ static int bdrv_inactivate_recurse(BlockDriverState *bs)
* We only tried to loosen restrictions, so errors are not fatal, ignore
* them.
*/
- bdrv_refresh_perms(bs, NULL);
+ bdrv_refresh_perms(bs, NULL, NULL);
/* Recursively inactivate children */
QLIST_FOREACH(child, &bs->children, next) {
@@ -6894,6 +6925,10 @@ bool bdrv_op_blocker_is_empty(BlockDriverState *bs)
return true;
}
+/*
+ * Must not be called while holding the lock of an AioContext other than the
+ * current one.
+ */
void bdrv_img_create(const char *filename, const char *fmt,
const char *base_filename, const char *base_fmt,
char *options, uint64_t img_size, int flags, bool quiet,
@@ -7181,7 +7216,6 @@ static void bdrv_detach_aio_context(BlockDriverState *bs)
if (bs->quiesce_counter) {
aio_enable_external(bs->aio_context);
}
- assert_bdrv_graph_writable(bs);
bs->aio_context = NULL;
}
@@ -7195,7 +7229,6 @@ static void bdrv_attach_aio_context(BlockDriverState *bs,
aio_disable_external(new_context);
}
- assert_bdrv_graph_writable(bs);
bs->aio_context = new_context;
if (bs->drv && bs->drv->bdrv_attach_aio_context) {
@@ -7276,7 +7309,6 @@ static void bdrv_set_aio_context_commit(void *opaque)
BlockDriverState *bs = (BlockDriverState *) state->bs;
AioContext *new_context = state->new_ctx;
AioContext *old_context = bdrv_get_aio_context(bs);
- assert_bdrv_graph_writable(bs);
/*
* Take the old AioContex when detaching it from bs.
diff --git a/block/block-backend.c b/block/block-backend.c
index bf0ea3cfed..ba7bf1d6bc 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -129,7 +129,7 @@ static void blk_root_inherit_options(BdrvChildRole role, bool parent_is_format,
}
static void blk_root_drained_begin(BdrvChild *child);
static bool blk_root_drained_poll(BdrvChild *child);
-static void blk_root_drained_end(BdrvChild *child, int *drained_end_counter);
+static void blk_root_drained_end(BdrvChild *child);
static void blk_root_change_media(BdrvChild *child, bool load);
static void blk_root_resize(BdrvChild *child);
@@ -1424,6 +1424,27 @@ int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
return blk_co_pwritev_part(blk, offset, bytes, qiov, 0, flags);
}
+int coroutine_fn blk_co_block_status_above(BlockBackend *blk,
+ BlockDriverState *base,
+ int64_t offset, int64_t bytes,
+ int64_t *pnum, int64_t *map,
+ BlockDriverState **file)
+{
+ IO_CODE();
+ return bdrv_co_block_status_above(blk_bs(blk), base, offset, bytes, pnum,
+ map, file);
+}
+
+int coroutine_fn blk_co_is_allocated_above(BlockBackend *blk,
+ BlockDriverState *base,
+ bool include_base, int64_t offset,
+ int64_t bytes, int64_t *pnum)
+{
+ IO_CODE();
+ return bdrv_co_is_allocated_above(blk_bs(blk), base, include_base, offset,
+ bytes, pnum);
+}
+
typedef struct BlkRwCo {
BlockBackend *blk;
int64_t offset;
@@ -2556,7 +2577,7 @@ static bool blk_root_drained_poll(BdrvChild *child)
return busy || !!blk->in_flight;
}
-static void blk_root_drained_end(BdrvChild *child, int *drained_end_counter)
+static void blk_root_drained_end(BdrvChild *child)
{
BlockBackend *blk = child->opaque;
assert(blk->quiesce_counter);
diff --git a/block/block-copy.c b/block/block-copy.c
index bb947afdda..5e59d6262f 100644
--- a/block/block-copy.c
+++ b/block/block-copy.c
@@ -577,8 +577,9 @@ static coroutine_fn int block_copy_task_entry(AioTask *task)
return ret;
}
-static int block_copy_block_status(BlockCopyState *s, int64_t offset,
- int64_t bytes, int64_t *pnum)
+static coroutine_fn int block_copy_block_status(BlockCopyState *s,
+ int64_t offset,
+ int64_t bytes, int64_t *pnum)
{
int64_t num;
BlockDriverState *base;
@@ -590,8 +591,8 @@ static int block_copy_block_status(BlockCopyState *s, int64_t offset,
base = NULL;
}
- ret = bdrv_block_status_above(s->source->bs, base, offset, bytes, &num,
- NULL, NULL);
+ ret = bdrv_co_block_status_above(s->source->bs, base, offset, bytes, &num,
+ NULL, NULL);
if (ret < 0 || num < s->cluster_size) {
/*
* On error or if failed to obtain large enough chunk just fallback to
@@ -613,8 +614,9 @@ static int block_copy_block_status(BlockCopyState *s, int64_t offset,
* Check if the cluster starting at offset is allocated or not.
* return via pnum the number of contiguous clusters sharing this allocation.
*/
-static int block_copy_is_cluster_allocated(BlockCopyState *s, int64_t offset,
- int64_t *pnum)
+static int coroutine_fn block_copy_is_cluster_allocated(BlockCopyState *s,
+ int64_t offset,
+ int64_t *pnum)
{
BlockDriverState *bs = s->source->bs;
int64_t count, total_count = 0;
@@ -624,7 +626,7 @@ static int block_copy_is_cluster_allocated(BlockCopyState *s, int64_t offset,
assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
while (true) {
- ret = bdrv_is_allocated(bs, offset, bytes, &count);
+ ret = bdrv_co_is_allocated(bs, offset, bytes, &count);
if (ret < 0) {
return ret;
}
@@ -669,8 +671,9 @@ void block_copy_reset(BlockCopyState *s, int64_t offset, int64_t bytes)
* @return 0 when the cluster at @offset was unallocated,
* 1 otherwise, and -ret on error.
*/
-int64_t block_copy_reset_unallocated(BlockCopyState *s,
- int64_t offset, int64_t *count)
+int64_t coroutine_fn block_copy_reset_unallocated(BlockCopyState *s,
+ int64_t offset,
+ int64_t *count)
{
int ret;
int64_t clusters, bytes;
diff --git a/block/block-gen.h b/block/block-gen.h
index f80cf4897d..89b7daaa1f 100644
--- a/block/block-gen.h
+++ b/block/block-gen.h
@@ -30,20 +30,17 @@
/* Base structure for argument packing structures */
typedef struct BdrvPollCo {
- BlockDriverState *bs;
+ AioContext *ctx;
bool in_progress;
- int ret;
Coroutine *co; /* Keep pointer here for debugging */
} BdrvPollCo;
-static inline int bdrv_poll_co(BdrvPollCo *s)
+static inline void bdrv_poll_co(BdrvPollCo *s)
{
assert(!qemu_in_coroutine());
- bdrv_coroutine_enter(s->bs, s->co);
- BDRV_POLL_WHILE(s->bs, s->in_progress);
-
- return s->ret;
+ aio_co_enter(s->ctx, s->co);
+ AIO_WAIT_WHILE(s->ctx, s->in_progress);
}
#endif /* BLOCK_BLOCK_GEN_H */
diff --git a/block/commit.c b/block/commit.c
index 0029b31944..b346341767 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -155,8 +155,8 @@ static int coroutine_fn commit_run(Job *job, Error **errp)
break;
}
/* Copy if allocated above the base */
- ret = bdrv_is_allocated_above(blk_bs(s->top), s->base_overlay, true,
- offset, COMMIT_BUFFER_SIZE, &n);
+ ret = blk_co_is_allocated_above(s->top, s->base_overlay, true,
+ offset, COMMIT_BUFFER_SIZE, &n);
copy = (ret > 0);
trace_commit_one_iteration(s, offset, n, ret);
if (copy) {
diff --git a/block/coroutines.h b/block/coroutines.h
index 3a2bad564f..2a1e0b3c9d 100644
--- a/block/coroutines.h
+++ b/block/coroutines.h
@@ -37,9 +37,11 @@
* the I/O API.
*/
-int coroutine_fn bdrv_co_check(BlockDriverState *bs,
- BdrvCheckResult *res, BdrvCheckMode fix);
-int coroutine_fn bdrv_co_invalidate_cache(BlockDriverState *bs, Error **errp);
+int coroutine_fn GRAPH_RDLOCK
+bdrv_co_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix);
+
+int coroutine_fn GRAPH_RDLOCK
+bdrv_co_invalidate_cache(BlockDriverState *bs, Error **errp);
int coroutine_fn
bdrv_co_common_block_status_above(BlockDriverState *bs,
@@ -53,10 +55,11 @@ bdrv_co_common_block_status_above(BlockDriverState *bs,
BlockDriverState **file,
int *depth);
-int coroutine_fn bdrv_co_readv_vmstate(BlockDriverState *bs,
- QEMUIOVector *qiov, int64_t pos);
-int coroutine_fn bdrv_co_writev_vmstate(BlockDriverState *bs,
- QEMUIOVector *qiov, int64_t pos);
+int coroutine_fn GRAPH_RDLOCK
+bdrv_co_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
+
+int coroutine_fn GRAPH_RDLOCK
+bdrv_co_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
int coroutine_fn
nbd_co_do_establish_connection(BlockDriverState *bs, bool blocking,
@@ -71,7 +74,7 @@ nbd_co_do_establish_connection(BlockDriverState *bs, bool blocking,
* the "I/O or GS" API.
*/
-int generated_co_wrapper
+int co_wrapper_mixed_bdrv_rdlock
bdrv_common_block_status_above(BlockDriverState *bs,
BlockDriverState *base,
bool include_base,
@@ -82,7 +85,7 @@ bdrv_common_block_status_above(BlockDriverState *bs,
int64_t *map,
BlockDriverState **file,
int *depth);
-int generated_co_wrapper
+int co_wrapper_mixed
nbd_do_establish_connection(BlockDriverState *bs, bool blocking, Error **errp);
#endif /* BLOCK_COROUTINES_H */
diff --git a/block/crypto.c b/block/crypto.c
index 2fb8add458..bbeb9f437c 100644
--- a/block/crypto.c
+++ b/block/crypto.c
@@ -703,7 +703,7 @@ static int coroutine_fn block_crypto_co_create_opts_luks(BlockDriver *drv,
}
/* Create protocol layer */
- ret = bdrv_create_file(filename, opts, errp);
+ ret = bdrv_co_create_file(filename, opts, errp);
if (ret < 0) {
goto fail;
}
diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c
index 9c39550698..956feeb2ae 100644
--- a/block/dirty-bitmap.c
+++ b/block/dirty-bitmap.c
@@ -388,7 +388,7 @@ void bdrv_release_named_dirty_bitmaps(BlockDriverState *bs)
* not fail.
* This function doesn't release corresponding BdrvDirtyBitmap.
*/
-static int coroutine_fn
+int coroutine_fn
bdrv_co_remove_persistent_dirty_bitmap(BlockDriverState *bs, const char *name,
Error **errp)
{
@@ -399,45 +399,6 @@ bdrv_co_remove_persistent_dirty_bitmap(BlockDriverState *bs, const char *name,
return 0;
}
-typedef struct BdrvRemovePersistentDirtyBitmapCo {
- BlockDriverState *bs;
- const char *name;
- Error **errp;
- int ret;
-} BdrvRemovePersistentDirtyBitmapCo;
-
-static void coroutine_fn
-bdrv_co_remove_persistent_dirty_bitmap_entry(void *opaque)
-{
- BdrvRemovePersistentDirtyBitmapCo *s = opaque;
-
- s->ret = bdrv_co_remove_persistent_dirty_bitmap(s->bs, s->name, s->errp);
- aio_wait_kick();
-}
-
-int bdrv_remove_persistent_dirty_bitmap(BlockDriverState *bs, const char *name,
- Error **errp)
-{
- if (qemu_in_coroutine()) {
- return bdrv_co_remove_persistent_dirty_bitmap(bs, name, errp);
- } else {
- Coroutine *co;
- BdrvRemovePersistentDirtyBitmapCo s = {
- .bs = bs,
- .name = name,
- .errp = errp,
- .ret = -EINPROGRESS,
- };
-
- co = qemu_coroutine_create(bdrv_co_remove_persistent_dirty_bitmap_entry,
- &s);
- bdrv_coroutine_enter(bs, co);
- BDRV_POLL_WHILE(bs, s.ret == -EINPROGRESS);
-
- return s.ret;
- }
-}
-
bool
bdrv_supports_persistent_dirty_bitmap(BlockDriverState *bs)
{
@@ -447,7 +408,7 @@ bdrv_supports_persistent_dirty_bitmap(BlockDriverState *bs)
return false;
}
-static bool coroutine_fn
+bool coroutine_fn
bdrv_co_can_store_new_dirty_bitmap(BlockDriverState *bs, const char *name,
uint32_t granularity, Error **errp)
{
@@ -470,51 +431,6 @@ bdrv_co_can_store_new_dirty_bitmap(BlockDriverState *bs, const char *name,
return drv->bdrv_co_can_store_new_dirty_bitmap(bs, name, granularity, errp);
}
-typedef struct BdrvCanStoreNewDirtyBitmapCo {
- BlockDriverState *bs;
- const char *name;
- uint32_t granularity;
- Error **errp;
- bool ret;
-
- bool in_progress;
-} BdrvCanStoreNewDirtyBitmapCo;
-
-static void coroutine_fn bdrv_co_can_store_new_dirty_bitmap_entry(void *opaque)
-{
- BdrvCanStoreNewDirtyBitmapCo *s = opaque;
-
- s->ret = bdrv_co_can_store_new_dirty_bitmap(s->bs, s->name, s->granularity,
- s->errp);
- s->in_progress = false;
- aio_wait_kick();
-}
-
-bool bdrv_can_store_new_dirty_bitmap(BlockDriverState *bs, const char *name,
- uint32_t granularity, Error **errp)
-{
- IO_CODE();
- if (qemu_in_coroutine()) {
- return bdrv_co_can_store_new_dirty_bitmap(bs, name, granularity, errp);
- } else {
- Coroutine *co;
- BdrvCanStoreNewDirtyBitmapCo s = {
- .bs = bs,
- .name = name,
- .granularity = granularity,
- .errp = errp,
- .in_progress = true,
- };
-
- co = qemu_coroutine_create(bdrv_co_can_store_new_dirty_bitmap_entry,
- &s);
- bdrv_coroutine_enter(bs, co);
- BDRV_POLL_WHILE(bs, s.in_progress);
-
- return s.ret;
- }
-}
-
void bdrv_disable_dirty_bitmap(BdrvDirtyBitmap *bitmap)
{
bdrv_dirty_bitmaps_lock(bitmap->bs);
diff --git a/block/graph-lock.c b/block/graph-lock.c
new file mode 100644
index 0000000000..454c31e691
--- /dev/null
+++ b/block/graph-lock.c
@@ -0,0 +1,275 @@
+/*
+ * Graph lock: rwlock to protect block layer graph manipulations (add/remove
+ * edges and nodes)
+ *
+ * Copyright (c) 2022 Red Hat
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/main-loop.h"
+#include "block/graph-lock.h"
+#include "block/block.h"
+#include "block/block_int.h"
+
+/* Dummy lock object to use for Thread Safety Analysis (TSA) */
+BdrvGraphLock graph_lock;
+
+/* Protects the list of aiocontext and orphaned_reader_count */
+static QemuMutex aio_context_list_lock;
+
+/* Written and read with atomic operations. */
+static int has_writer;
+
+/*
+ * A reader coroutine could move from an AioContext to another.
+ * If this happens, there is no problem from the point of view of
+ * counters. The problem is that the total count becomes
+ * unbalanced if one of the two AioContexts gets deleted.
+ * The count of readers must remain correct, so the AioContext's
+ * balance is transferred to this glboal variable.
+ * Protected by aio_context_list_lock.
+ */
+static uint32_t orphaned_reader_count;
+
+/* Queue of readers waiting for the writer to finish */
+static CoQueue reader_queue;
+
+struct BdrvGraphRWlock {
+ /* How many readers are currently reading the graph. */
+ uint32_t reader_count;
+
+ /*
+ * List of BdrvGraphRWlock kept in graph-lock.c
+ * Protected by aio_context_list_lock
+ */
+ QTAILQ_ENTRY(BdrvGraphRWlock) next_aio;
+};
+
+/*
+ * List of BdrvGraphRWlock. This list ensures that each BdrvGraphRWlock
+ * can safely modify only its own counter, avoid reading/writing
+ * others and thus improving performances by avoiding cacheline bounces.
+ */
+static QTAILQ_HEAD(, BdrvGraphRWlock) aio_context_list =
+ QTAILQ_HEAD_INITIALIZER(aio_context_list);
+
+static void __attribute__((__constructor__)) bdrv_init_graph_lock(void)
+{
+ qemu_mutex_init(&aio_context_list_lock);
+ qemu_co_queue_init(&reader_queue);
+}
+
+void register_aiocontext(AioContext *ctx)
+{
+ ctx->bdrv_graph = g_new0(BdrvGraphRWlock, 1);
+ QEMU_LOCK_GUARD(&aio_context_list_lock);
+ assert(ctx->bdrv_graph->reader_count == 0);
+ QTAILQ_INSERT_TAIL(&aio_context_list, ctx->bdrv_graph, next_aio);
+}
+
+void unregister_aiocontext(AioContext *ctx)
+{
+ QEMU_LOCK_GUARD(&aio_context_list_lock);
+ orphaned_reader_count += ctx->bdrv_graph->reader_count;
+ QTAILQ_REMOVE(&aio_context_list, ctx->bdrv_graph, next_aio);
+ g_free(ctx->bdrv_graph);
+}
+
+static uint32_t reader_count(void)
+{
+ BdrvGraphRWlock *brdv_graph;
+ uint32_t rd;
+
+ QEMU_LOCK_GUARD(&aio_context_list_lock);
+
+ /* rd can temporarly be negative, but the total will *always* be >= 0 */
+ rd = orphaned_reader_count;
+ QTAILQ_FOREACH(brdv_graph, &aio_context_list, next_aio) {
+ rd += qatomic_read(&brdv_graph->reader_count);
+ }
+
+ /* shouldn't overflow unless there are 2^31 readers */
+ assert((int32_t)rd >= 0);
+ return rd;
+}
+
+void bdrv_graph_wrlock(void)
+{
+ GLOBAL_STATE_CODE();
+ assert(!qatomic_read(&has_writer));
+
+ /* Make sure that constantly arriving new I/O doesn't cause starvation */
+ bdrv_drain_all_begin_nopoll();
+
+ /*
+ * reader_count == 0: this means writer will read has_reader as 1
+ * reader_count >= 1: we don't know if writer read has_writer == 0 or 1,
+ * but we need to wait.
+ * Wait by allowing other coroutine (and possible readers) to continue.
+ */
+ do {
+ /*
+ * has_writer must be 0 while polling, otherwise we get a deadlock if
+ * any callback involved during AIO_WAIT_WHILE() tries to acquire the
+ * reader lock.
+ */
+ qatomic_set(&has_writer, 0);
+ AIO_WAIT_WHILE(qemu_get_aio_context(), reader_count() >= 1);
+ qatomic_set(&has_writer, 1);
+
+ /*
+ * We want to only check reader_count() after has_writer = 1 is visible
+ * to other threads. That way no more readers can sneak in after we've
+ * determined reader_count() == 0.
+ */
+ smp_mb();
+ } while (reader_count() >= 1);
+
+ bdrv_drain_all_end();
+}
+
+void bdrv_graph_wrunlock(void)
+{
+ GLOBAL_STATE_CODE();
+ QEMU_LOCK_GUARD(&aio_context_list_lock);
+ assert(qatomic_read(&has_writer));
+
+ /*
+ * No need for memory barriers, this works in pair with
+ * the slow path of rdlock() and both take the lock.
+ */
+ qatomic_store_release(&has_writer, 0);
+
+ /* Wake up all coroutine that are waiting to read the graph */
+ qemu_co_enter_all(&reader_queue, &aio_context_list_lock);
+}
+
+void coroutine_fn bdrv_graph_co_rdlock(void)
+{
+ BdrvGraphRWlock *bdrv_graph;
+ bdrv_graph = qemu_get_current_aio_context()->bdrv_graph;
+
+ /* Do not lock if in main thread */
+ if (qemu_in_main_thread()) {
+ return;
+ }
+
+ for (;;) {
+ qatomic_set(&bdrv_graph->reader_count,
+ bdrv_graph->reader_count + 1);
+ /* make sure writer sees reader_count before we check has_writer */
+ smp_mb();
+
+ /*
+ * has_writer == 0: this means writer will read reader_count as >= 1
+ * has_writer == 1: we don't know if writer read reader_count == 0
+ * or > 0, but we need to wait anyways because
+ * it will write.
+ */
+ if (!qatomic_read(&has_writer)) {
+ break;
+ }
+
+ /*
+ * Synchronize access with reader_count() in bdrv_graph_wrlock().
+ * Case 1:
+ * If this critical section gets executed first, reader_count will
+ * decrease and the reader will go to sleep.
+ * Then the writer will read reader_count that does not take into
+ * account this reader, and if there's no other reader it will
+ * enter the write section.
+ * Case 2:
+ * If reader_count() critical section gets executed first,
+ * then writer will read reader_count >= 1.
+ * It will wait in AIO_WAIT_WHILE(), but once it releases the lock
+ * we will enter this critical section and call aio_wait_kick().
+ */
+ WITH_QEMU_LOCK_GUARD(&aio_context_list_lock) {
+ /*
+ * Additional check when we use the above lock to synchronize
+ * with bdrv_graph_wrunlock().
+ * Case 1:
+ * If this gets executed first, has_writer is still 1, so we reduce
+ * reader_count and go to sleep.
+ * Then the writer will set has_writer to 0 and wake up all readers,
+ * us included.
+ * Case 2:
+ * If bdrv_graph_wrunlock() critical section gets executed first,
+ * then it will set has_writer to 0 and wake up all other readers.
+ * Then we execute this critical section, and therefore must check
+ * again for has_writer, otherwise we sleep without any writer
+ * actually running.
+ */
+ if (!qatomic_read(&has_writer)) {
+ return;
+ }
+
+ /* slow path where reader sleeps */
+ bdrv_graph->reader_count--;
+ aio_wait_kick();
+ qemu_co_queue_wait(&reader_queue, &aio_context_list_lock);
+ }
+ }
+}
+
+void coroutine_fn bdrv_graph_co_rdunlock(void)
+{
+ BdrvGraphRWlock *bdrv_graph;
+ bdrv_graph = qemu_get_current_aio_context()->bdrv_graph;
+
+ /* Do not lock if in main thread */
+ if (qemu_in_main_thread()) {
+ return;
+ }
+
+ qatomic_store_release(&bdrv_graph->reader_count,
+ bdrv_graph->reader_count - 1);
+ /* make sure writer sees reader_count before we check has_writer */
+ smp_mb();
+
+ /*
+ * has_writer == 0: this means reader will read reader_count decreased
+ * has_writer == 1: we don't know if writer read reader_count old or
+ * new. Therefore, kick again so on next iteration
+ * writer will for sure read the updated value.
+ */
+ if (qatomic_read(&has_writer)) {
+ aio_wait_kick();
+ }
+}
+
+void bdrv_graph_rdlock_main_loop(void)
+{
+ GLOBAL_STATE_CODE();
+ assert(!qemu_in_coroutine());
+}
+
+void bdrv_graph_rdunlock_main_loop(void)
+{
+ GLOBAL_STATE_CODE();
+ assert(!qemu_in_coroutine());
+}
+
+void assert_bdrv_graph_readable(void)
+{
+ assert(qemu_in_main_thread() || reader_count());
+}
+
+void assert_bdrv_graph_writable(void)
+{
+ assert(qemu_in_main_thread());
+ assert(qatomic_read(&has_writer));
+}
diff --git a/block/io.c b/block/io.c
index b9424024f9..d87788dfbb 100644
--- a/block/io.c
+++ b/block/io.c
@@ -45,53 +45,43 @@ static void bdrv_parent_cb_resize(BlockDriverState *bs);
static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
int64_t offset, int64_t bytes, BdrvRequestFlags flags);
-static void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore,
- bool ignore_bds_parents)
+static void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore)
{
BdrvChild *c, *next;
QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
- if (c == ignore || (ignore_bds_parents && c->klass->parent_is_bds)) {
+ if (c == ignore) {
continue;
}
- bdrv_parent_drained_begin_single(c, false);
- }
-}
-
-static void bdrv_parent_drained_end_single_no_poll(BdrvChild *c,
- int *drained_end_counter)
-{
- assert(c->parent_quiesce_counter > 0);
- c->parent_quiesce_counter--;
- if (c->klass->drained_end) {
- c->klass->drained_end(c, drained_end_counter);
+ bdrv_parent_drained_begin_single(c);
}
}
void bdrv_parent_drained_end_single(BdrvChild *c)
{
- int drained_end_counter = 0;
- AioContext *ctx = bdrv_child_get_parent_aio_context(c);
IO_OR_GS_CODE();
- bdrv_parent_drained_end_single_no_poll(c, &drained_end_counter);
- AIO_WAIT_WHILE(ctx, qatomic_read(&drained_end_counter) > 0);
+
+ assert(c->quiesced_parent);
+ c->quiesced_parent = false;
+
+ if (c->klass->drained_end) {
+ c->klass->drained_end(c);
+ }
}
-static void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore,
- bool ignore_bds_parents,
- int *drained_end_counter)
+static void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore)
{
BdrvChild *c;
QLIST_FOREACH(c, &bs->parents, next_parent) {
- if (c == ignore || (ignore_bds_parents && c->klass->parent_is_bds)) {
+ if (c == ignore) {
continue;
}
- bdrv_parent_drained_end_single_no_poll(c, drained_end_counter);
+ bdrv_parent_drained_end_single(c);
}
}
-static bool bdrv_parent_drained_poll_single(BdrvChild *c)
+bool bdrv_parent_drained_poll_single(BdrvChild *c)
{
if (c->klass->drained_poll) {
return c->klass->drained_poll(c);
@@ -115,17 +105,16 @@ static bool bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore,
return busy;
}
-void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll)
+void bdrv_parent_drained_begin_single(BdrvChild *c)
{
- AioContext *ctx = bdrv_child_get_parent_aio_context(c);
IO_OR_GS_CODE();
- c->parent_quiesce_counter++;
+
+ assert(!c->quiesced_parent);
+ c->quiesced_parent = true;
+
if (c->klass->drained_begin) {
c->klass->drained_begin(c);
}
- if (poll) {
- AIO_WAIT_WHILE(ctx, bdrv_parent_drained_poll_single(c));
- }
}
static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src)
@@ -245,69 +234,14 @@ typedef struct {
BlockDriverState *bs;
bool done;
bool begin;
- bool recursive;
bool poll;
BdrvChild *parent;
- bool ignore_bds_parents;
- int *drained_end_counter;
} BdrvCoDrainData;
-static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
-{
- BdrvCoDrainData *data = opaque;
- BlockDriverState *bs = data->bs;
-
- if (data->begin) {
- bs->drv->bdrv_co_drain_begin(bs);
- } else {
- bs->drv->bdrv_co_drain_end(bs);
- }
-
- /* Set data->done and decrement drained_end_counter before bdrv_wakeup() */
- qatomic_mb_set(&data->done, true);
- if (!data->begin) {
- qatomic_dec(data->drained_end_counter);
- }
- bdrv_dec_in_flight(bs);
-
- g_free(data);
-}
-
-/* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */
-static void bdrv_drain_invoke(BlockDriverState *bs, bool begin,
- int *drained_end_counter)
-{
- BdrvCoDrainData *data;
-
- if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) ||
- (!begin && !bs->drv->bdrv_co_drain_end)) {
- return;
- }
-
- data = g_new(BdrvCoDrainData, 1);
- *data = (BdrvCoDrainData) {
- .bs = bs,
- .done = false,
- .begin = begin,
- .drained_end_counter = drained_end_counter,
- };
-
- if (!begin) {
- qatomic_inc(drained_end_counter);
- }
-
- /* Make sure the driver callback completes during the polling phase for
- * drain_begin. */
- bdrv_inc_in_flight(bs);
- data->co = qemu_coroutine_create(bdrv_drain_invoke_entry, data);
- aio_co_schedule(bdrv_get_aio_context(bs), data->co);
-}
-
/* Returns true if BDRV_POLL_WHILE() should go into a blocking aio_poll() */
-bool bdrv_drain_poll(BlockDriverState *bs, bool recursive,
- BdrvChild *ignore_parent, bool ignore_bds_parents)
+bool bdrv_drain_poll(BlockDriverState *bs, BdrvChild *ignore_parent,
+ bool ignore_bds_parents)
{
- BdrvChild *child, *next;
IO_OR_GS_CODE();
if (bdrv_parent_drained_poll(bs, ignore_parent, ignore_bds_parents)) {
@@ -318,30 +252,18 @@ bool bdrv_drain_poll(BlockDriverState *bs, bool recursive,
return true;
}
- if (recursive) {
- assert(!ignore_bds_parents);
- QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
- if (bdrv_drain_poll(child->bs, recursive, child, false)) {
- return true;
- }
- }
- }
-
return false;
}
-static bool bdrv_drain_poll_top_level(BlockDriverState *bs, bool recursive,
+static bool bdrv_drain_poll_top_level(BlockDriverState *bs,
BdrvChild *ignore_parent)
{
- return bdrv_drain_poll(bs, recursive, ignore_parent, false);
+ return bdrv_drain_poll(bs, ignore_parent, false);
}
-static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
- BdrvChild *parent, bool ignore_bds_parents,
+static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent,
bool poll);
-static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
- BdrvChild *parent, bool ignore_bds_parents,
- int *drained_end_counter);
+static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent);
static void bdrv_co_drain_bh_cb(void *opaque)
{
@@ -354,14 +276,10 @@ static void bdrv_co_drain_bh_cb(void *opaque)
aio_context_acquire(ctx);
bdrv_dec_in_flight(bs);
if (data->begin) {
- assert(!data->drained_end_counter);
- bdrv_do_drained_begin(bs, data->recursive, data->parent,
- data->ignore_bds_parents, data->poll);
+ bdrv_do_drained_begin(bs, data->parent, data->poll);
} else {
assert(!data->poll);
- bdrv_do_drained_end(bs, data->recursive, data->parent,
- data->ignore_bds_parents,
- data->drained_end_counter);
+ bdrv_do_drained_end(bs, data->parent);
}
aio_context_release(ctx);
} else {
@@ -374,11 +292,9 @@ static void bdrv_co_drain_bh_cb(void *opaque)
}
static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
- bool begin, bool recursive,
+ bool begin,
BdrvChild *parent,
- bool ignore_bds_parents,
- bool poll,
- int *drained_end_counter)
+ bool poll)
{
BdrvCoDrainData data;
Coroutine *self = qemu_coroutine_self();
@@ -394,11 +310,8 @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
.bs = bs,
.done = false,
.begin = begin,
- .recursive = recursive,
.parent = parent,
- .ignore_bds_parents = ignore_bds_parents,
.poll = poll,
- .drained_end_counter = drained_end_counter,
};
if (bs) {
@@ -429,41 +342,22 @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
}
}
-void bdrv_do_drained_begin_quiesce(BlockDriverState *bs,
- BdrvChild *parent, bool ignore_bds_parents)
-{
- IO_OR_GS_CODE();
- assert(!qemu_in_coroutine());
-
- /* Stop things in parent-to-child order */
- if (qatomic_fetch_inc(&bs->quiesce_counter) == 0) {
- aio_disable_external(bdrv_get_aio_context(bs));
- }
-
- bdrv_parent_drained_begin(bs, parent, ignore_bds_parents);
- bdrv_drain_invoke(bs, true, NULL);
-}
-
-static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
- BdrvChild *parent, bool ignore_bds_parents,
+static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent,
bool poll)
{
- BdrvChild *child, *next;
+ IO_OR_GS_CODE();
if (qemu_in_coroutine()) {
- bdrv_co_yield_to_drain(bs, true, recursive, parent, ignore_bds_parents,
- poll, NULL);
+ bdrv_co_yield_to_drain(bs, true, parent, poll);
return;
}
- bdrv_do_drained_begin_quiesce(bs, parent, ignore_bds_parents);
-
- if (recursive) {
- assert(!ignore_bds_parents);
- bs->recursive_quiesce_counter++;
- QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
- bdrv_do_drained_begin(child->bs, true, child, ignore_bds_parents,
- false);
+ /* Stop things in parent-to-child order */
+ if (qatomic_fetch_inc(&bs->quiesce_counter) == 0) {
+ aio_disable_external(bdrv_get_aio_context(bs));
+ bdrv_parent_drained_begin(bs, parent);
+ if (bs->drv && bs->drv->bdrv_drain_begin) {
+ bs->drv->bdrv_drain_begin(bs);
}
}
@@ -477,117 +371,50 @@ static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
* nodes.
*/
if (poll) {
- assert(!ignore_bds_parents);
- BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, recursive, parent));
+ BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, parent));
}
}
-void bdrv_drained_begin(BlockDriverState *bs)
+void bdrv_do_drained_begin_quiesce(BlockDriverState *bs, BdrvChild *parent)
{
- IO_OR_GS_CODE();
- bdrv_do_drained_begin(bs, false, NULL, false, true);
+ bdrv_do_drained_begin(bs, parent, false);
}
-void bdrv_subtree_drained_begin(BlockDriverState *bs)
+void bdrv_drained_begin(BlockDriverState *bs)
{
IO_OR_GS_CODE();
- bdrv_do_drained_begin(bs, true, NULL, false, true);
+ bdrv_do_drained_begin(bs, NULL, true);
}
/**
* This function does not poll, nor must any of its recursively called
- * functions. The *drained_end_counter pointee will be incremented
- * once for every background operation scheduled, and decremented once
- * the operation settles. Therefore, the pointer must remain valid
- * until the pointee reaches 0. That implies that whoever sets up the
- * pointee has to poll until it is 0.
- *
- * We use atomic operations to access *drained_end_counter, because
- * (1) when called from bdrv_set_aio_context_ignore(), the subgraph of
- * @bs may contain nodes in different AioContexts,
- * (2) bdrv_drain_all_end() uses the same counter for all nodes,
- * regardless of which AioContext they are in.
+ * functions.
*/
-static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
- BdrvChild *parent, bool ignore_bds_parents,
- int *drained_end_counter)
+static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent)
{
- BdrvChild *child;
int old_quiesce_counter;
- assert(drained_end_counter != NULL);
-
if (qemu_in_coroutine()) {
- bdrv_co_yield_to_drain(bs, false, recursive, parent, ignore_bds_parents,
- false, drained_end_counter);
+ bdrv_co_yield_to_drain(bs, false, parent, false);
return;
}
assert(bs->quiesce_counter > 0);
/* Re-enable things in child-to-parent order */
- bdrv_drain_invoke(bs, false, drained_end_counter);
- bdrv_parent_drained_end(bs, parent, ignore_bds_parents,
- drained_end_counter);
-
old_quiesce_counter = qatomic_fetch_dec(&bs->quiesce_counter);
if (old_quiesce_counter == 1) {
- aio_enable_external(bdrv_get_aio_context(bs));
- }
-
- if (recursive) {
- assert(!ignore_bds_parents);
- bs->recursive_quiesce_counter--;
- QLIST_FOREACH(child, &bs->children, next) {
- bdrv_do_drained_end(child->bs, true, child, ignore_bds_parents,
- drained_end_counter);
+ if (bs->drv && bs->drv->bdrv_drain_end) {
+ bs->drv->bdrv_drain_end(bs);
}
+ bdrv_parent_drained_end(bs, parent);
+ aio_enable_external(bdrv_get_aio_context(bs));
}
}
void bdrv_drained_end(BlockDriverState *bs)
{
- int drained_end_counter = 0;
- IO_OR_GS_CODE();
- bdrv_do_drained_end(bs, false, NULL, false, &drained_end_counter);
- BDRV_POLL_WHILE(bs, qatomic_read(&drained_end_counter) > 0);
-}
-
-void bdrv_drained_end_no_poll(BlockDriverState *bs, int *drained_end_counter)
-{
- IO_CODE();
- bdrv_do_drained_end(bs, false, NULL, false, drained_end_counter);
-}
-
-void bdrv_subtree_drained_end(BlockDriverState *bs)
-{
- int drained_end_counter = 0;
- IO_OR_GS_CODE();
- bdrv_do_drained_end(bs, true, NULL, false, &drained_end_counter);
- BDRV_POLL_WHILE(bs, qatomic_read(&drained_end_counter) > 0);
-}
-
-void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent)
-{
- int i;
- IO_OR_GS_CODE();
-
- for (i = 0; i < new_parent->recursive_quiesce_counter; i++) {
- bdrv_do_drained_begin(child->bs, true, child, false, true);
- }
-}
-
-void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent)
-{
- int drained_end_counter = 0;
- int i;
IO_OR_GS_CODE();
-
- for (i = 0; i < old_parent->recursive_quiesce_counter; i++) {
- bdrv_do_drained_end(child->bs, true, child, false,
- &drained_end_counter);
- }
-
- BDRV_POLL_WHILE(child->bs, qatomic_read(&drained_end_counter) > 0);
+ bdrv_do_drained_end(bs, NULL);
}
void bdrv_drain(BlockDriverState *bs)
@@ -620,7 +447,7 @@ static bool bdrv_drain_all_poll(void)
while ((bs = bdrv_next_all_states(bs))) {
AioContext *aio_context = bdrv_get_aio_context(bs);
aio_context_acquire(aio_context);
- result |= bdrv_drain_poll(bs, false, NULL, true);
+ result |= bdrv_drain_poll(bs, NULL, true);
aio_context_release(aio_context);
}
@@ -639,16 +466,11 @@ static bool bdrv_drain_all_poll(void)
* NOTE: no new block jobs or BlockDriverStates can be created between
* the bdrv_drain_all_begin() and bdrv_drain_all_end() calls.
*/
-void bdrv_drain_all_begin(void)
+void bdrv_drain_all_begin_nopoll(void)
{
BlockDriverState *bs = NULL;
GLOBAL_STATE_CODE();
- if (qemu_in_coroutine()) {
- bdrv_co_yield_to_drain(NULL, true, false, NULL, true, true, NULL);
- return;
- }
-
/*
* bdrv queue is managed by record/replay,
* waiting for finishing the I/O requests may
@@ -670,9 +492,21 @@ void bdrv_drain_all_begin(void)
AioContext *aio_context = bdrv_get_aio_context(bs);
aio_context_acquire(aio_context);
- bdrv_do_drained_begin(bs, false, NULL, true, false);
+ bdrv_do_drained_begin(bs, NULL, false);
aio_context_release(aio_context);
}
+}
+
+void bdrv_drain_all_begin(void)
+{
+ BlockDriverState *bs = NULL;
+
+ if (qemu_in_coroutine()) {
+ bdrv_co_yield_to_drain(NULL, true, NULL, true);
+ return;
+ }
+
+ bdrv_drain_all_begin_nopoll();
/* Now poll the in-flight requests */
AIO_WAIT_WHILE(NULL, bdrv_drain_all_poll());
@@ -684,22 +518,19 @@ void bdrv_drain_all_begin(void)
void bdrv_drain_all_end_quiesce(BlockDriverState *bs)
{
- int drained_end_counter = 0;
GLOBAL_STATE_CODE();
g_assert(bs->quiesce_counter > 0);
g_assert(!bs->refcnt);
while (bs->quiesce_counter) {
- bdrv_do_drained_end(bs, false, NULL, true, &drained_end_counter);
+ bdrv_do_drained_end(bs, NULL);
}
- BDRV_POLL_WHILE(bs, qatomic_read(&drained_end_counter) > 0);
}
void bdrv_drain_all_end(void)
{
BlockDriverState *bs = NULL;
- int drained_end_counter = 0;
GLOBAL_STATE_CODE();
/*
@@ -715,13 +546,11 @@ void bdrv_drain_all_end(void)
AioContext *aio_context = bdrv_get_aio_context(bs);
aio_context_acquire(aio_context);
- bdrv_do_drained_end(bs, false, NULL, true, &drained_end_counter);
+ bdrv_do_drained_end(bs, NULL);
aio_context_release(aio_context);
}
assert(qemu_get_current_aio_context() == qemu_get_aio_context());
- AIO_WAIT_WHILE(NULL, qatomic_read(&drained_end_counter) > 0);
-
assert(bdrv_drain_all_count > 0);
bdrv_drain_all_count--;
}
@@ -2711,6 +2540,17 @@ bdrv_co_common_block_status_above(BlockDriverState *bs,
return ret;
}
+int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs,
+ BlockDriverState *base,
+ int64_t offset, int64_t bytes,
+ int64_t *pnum, int64_t *map,
+ BlockDriverState **file)
+{
+ IO_CODE();
+ return bdrv_co_common_block_status_above(bs, base, false, true, offset,
+ bytes, pnum, map, file, NULL);
+}
+
int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base,
int64_t offset, int64_t bytes, int64_t *pnum,
int64_t *map, BlockDriverState **file)
@@ -2756,6 +2596,22 @@ int coroutine_fn bdrv_co_is_zero_fast(BlockDriverState *bs, int64_t offset,
return (pnum == bytes) && (ret & BDRV_BLOCK_ZERO);
}
+int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t offset,
+ int64_t bytes, int64_t *pnum)
+{
+ int ret;
+ int64_t dummy;
+ IO_CODE();
+
+ ret = bdrv_co_common_block_status_above(bs, bs, true, false, offset,
+ bytes, pnum ? pnum : &dummy, NULL,
+ NULL, NULL);
+ if (ret < 0) {
+ return ret;
+ }
+ return !!(ret & BDRV_BLOCK_ALLOCATED);
+}
+
int bdrv_is_allocated(BlockDriverState *bs, int64_t offset, int64_t bytes,
int64_t *pnum)
{
@@ -2772,6 +2628,29 @@ int bdrv_is_allocated(BlockDriverState *bs, int64_t offset, int64_t bytes,
return !!(ret & BDRV_BLOCK_ALLOCATED);
}
+/* See bdrv_is_allocated_above for documentation */
+int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *top,
+ BlockDriverState *base,
+ bool include_base, int64_t offset,
+ int64_t bytes, int64_t *pnum)
+{
+ int depth;
+ int ret;
+ IO_CODE();
+
+ ret = bdrv_co_common_block_status_above(top, base, include_base, false,
+ offset, bytes, pnum, NULL, NULL,
+ &depth);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (ret & BDRV_BLOCK_ALLOCATED) {
+ return depth;
+ }
+ return 0;
+}
+
/*
* Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
*
@@ -2795,10 +2674,12 @@ int bdrv_is_allocated_above(BlockDriverState *top,
int64_t bytes, int64_t *pnum)
{
int depth;
- int ret = bdrv_common_block_status_above(top, base, include_base, false,
- offset, bytes, pnum, NULL, NULL,
- &depth);
+ int ret;
IO_CODE();
+
+ ret = bdrv_common_block_status_above(top, base, include_base, false,
+ offset, bytes, pnum, NULL, NULL,
+ &depth);
if (ret < 0) {
return ret;
}
@@ -2816,6 +2697,7 @@ bdrv_co_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
BlockDriverState *child_bs = bdrv_primary_bs(bs);
int ret;
IO_CODE();
+ assert_bdrv_graph_readable();
ret = bdrv_check_qiov_request(pos, qiov->size, qiov, 0, NULL);
if (ret < 0) {
@@ -2848,6 +2730,7 @@ bdrv_co_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
BlockDriverState *child_bs = bdrv_primary_bs(bs);
int ret;
IO_CODE();
+ assert_bdrv_graph_readable();
ret = bdrv_check_qiov_request(pos, qiov->size, qiov, 0, NULL);
if (ret < 0) {
diff --git a/block/meson.build b/block/meson.build
index b7c68b83a3..90011a2805 100644
--- a/block/meson.build
+++ b/block/meson.build
@@ -10,6 +10,7 @@ block_ss.add(files(
'blkverify.c',
'block-backend.c',
'block-copy.c',
+ 'graph-lock.c',
'commit.c',
'copy-on-read.c',
'preallocate.c',
@@ -137,6 +138,7 @@ block_gen_c = custom_target('block-gen.c',
output: 'block-gen.c',
input: files(
'../include/block/block-io.h',
+ '../include/block/dirty-bitmap.h',
'../include/block/block-global-state.h',
'../include/sysemu/block-backend-io.h',
'coroutines.h'
diff --git a/block/parallels.c b/block/parallels.c
index fa08c1104b..bbea2f2221 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -646,7 +646,7 @@ static int coroutine_fn parallels_co_create_opts(BlockDriver *drv,
}
/* Create and open the file (protocol layer) */
- ret = bdrv_create_file(filename, opts, errp);
+ ret = bdrv_co_create_file(filename, opts, errp);
if (ret < 0) {
goto done;
}
diff --git a/block/qcow.c b/block/qcow.c
index 8bffc41531..5d99f00411 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -973,7 +973,7 @@ static int coroutine_fn qcow_co_create_opts(BlockDriver *drv,
}
/* Create and open the file (protocol layer) */
- ret = bdrv_create_file(filename, opts, errp);
+ ret = bdrv_co_create_file(filename, opts, errp);
if (ret < 0) {
goto fail;
}
diff --git a/block/qcow2.c b/block/qcow2.c
index 941782a011..bafbd077b9 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -3871,7 +3871,7 @@ static int coroutine_fn qcow2_co_create_opts(BlockDriver *drv,
}
/* Create and open the file (protocol layer) */
- ret = bdrv_create_file(filename, opts, errp);
+ ret = bdrv_co_create_file(filename, opts, errp);
if (ret < 0) {
goto finish;
}
@@ -3886,7 +3886,7 @@ static int coroutine_fn qcow2_co_create_opts(BlockDriver *drv,
/* Create and open an external data file (protocol layer) */
val = qdict_get_try_str(qdict, BLOCK_OPT_DATA_FILE);
if (val) {
- ret = bdrv_create_file(val, opts, errp);
+ ret = bdrv_co_create_file(val, opts, errp);
if (ret < 0) {
goto finish;
}
diff --git a/block/qed.c b/block/qed.c
index 0ab159b468..faa606618e 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -262,7 +262,7 @@ static bool coroutine_fn qed_plug_allocating_write_reqs(BDRVQEDState *s)
assert(!s->allocating_write_reqs_plugged);
if (s->allocating_acb != NULL) {
/* Another allocating write came concurrently. This cannot happen
- * from bdrv_qed_co_drain_begin, but it can happen when the timer runs.
+ * from bdrv_qed_drain_begin, but it can happen when the timer runs.
*/
qemu_co_mutex_unlock(&s->table_lock);
return false;
@@ -282,9 +282,8 @@ static void coroutine_fn qed_unplug_allocating_write_reqs(BDRVQEDState *s)
qemu_co_mutex_unlock(&s->table_lock);
}
-static void coroutine_fn qed_need_check_timer_entry(void *opaque)
+static void coroutine_fn qed_need_check_timer(BDRVQEDState *s)
{
- BDRVQEDState *s = opaque;
int ret;
trace_qed_need_check_timer_cb(s);
@@ -310,9 +309,20 @@ static void coroutine_fn qed_need_check_timer_entry(void *opaque)
(void) ret;
}
+static void coroutine_fn qed_need_check_timer_entry(void *opaque)
+{
+ BDRVQEDState *s = opaque;
+
+ qed_need_check_timer(opaque);
+ bdrv_dec_in_flight(s->bs);
+}
+
static void qed_need_check_timer_cb(void *opaque)
{
+ BDRVQEDState *s = opaque;
Coroutine *co = qemu_coroutine_create(qed_need_check_timer_entry, opaque);
+
+ bdrv_inc_in_flight(s->bs);
qemu_coroutine_enter(co);
}
@@ -355,7 +365,7 @@ static void bdrv_qed_attach_aio_context(BlockDriverState *bs,
}
}
-static void coroutine_fn bdrv_qed_co_drain_begin(BlockDriverState *bs)
+static void bdrv_qed_drain_begin(BlockDriverState *bs)
{
BDRVQEDState *s = bs->opaque;
@@ -363,8 +373,12 @@ static void coroutine_fn bdrv_qed_co_drain_begin(BlockDriverState *bs)
* header is flushed.
*/
if (s->need_check_timer && timer_pending(s->need_check_timer)) {
+ Coroutine *co;
+
qed_cancel_need_check_timer(s);
- qed_need_check_timer_entry(s);
+ co = qemu_coroutine_create(qed_need_check_timer_entry, s);
+ bdrv_inc_in_flight(bs);
+ aio_co_enter(bdrv_get_aio_context(bs), co);
}
}
@@ -764,7 +778,7 @@ static int coroutine_fn bdrv_qed_co_create_opts(BlockDriver *drv,
}
/* Create and open the file (protocol layer) */
- ret = bdrv_create_file(filename, opts, errp);
+ ret = bdrv_co_create_file(filename, opts, errp);
if (ret < 0) {
goto fail;
}
@@ -1647,7 +1661,7 @@ static BlockDriver bdrv_qed = {
.bdrv_co_check = bdrv_qed_co_check,
.bdrv_detach_aio_context = bdrv_qed_detach_aio_context,
.bdrv_attach_aio_context = bdrv_qed_attach_aio_context,
- .bdrv_co_drain_begin = bdrv_qed_co_drain_begin,
+ .bdrv_drain_begin = bdrv_qed_drain_begin,
};
static void bdrv_qed_init(void)
diff --git a/block/raw-format.c b/block/raw-format.c
index a68014ef0b..28905b09ee 100644
--- a/block/raw-format.c
+++ b/block/raw-format.c
@@ -433,7 +433,7 @@ static int coroutine_fn raw_co_create_opts(BlockDriver *drv,
QemuOpts *opts,
Error **errp)
{
- return bdrv_create_file(filename, opts, errp);
+ return bdrv_co_create_file(filename, opts, errp);
}
static int raw_open(BlockDriverState *bs, QDict *options, int flags,
diff --git a/block/replication.c b/block/replication.c
index f1eed25e43..c62f48a874 100644
--- a/block/replication.c
+++ b/block/replication.c
@@ -374,9 +374,6 @@ static void reopen_backing_file(BlockDriverState *bs, bool writable,
s->orig_secondary_read_only = bdrv_is_read_only(secondary_disk->bs);
}
- bdrv_subtree_drained_begin(hidden_disk->bs);
- bdrv_subtree_drained_begin(secondary_disk->bs);
-
if (s->orig_hidden_read_only) {
QDict *opts = qdict_new();
qdict_put_bool(opts, BDRV_OPT_READ_ONLY, !writable);
@@ -401,9 +398,6 @@ static void reopen_backing_file(BlockDriverState *bs, bool writable,
aio_context_acquire(ctx);
}
}
-
- bdrv_subtree_drained_end(hidden_disk->bs);
- bdrv_subtree_drained_end(secondary_disk->bs);
}
static void backup_job_cleanup(BlockDriverState *bs)
diff --git a/block/stream.c b/block/stream.c
index 694709bd25..8744ad103f 100644
--- a/block/stream.c
+++ b/block/stream.c
@@ -64,13 +64,16 @@ static int stream_prepare(Job *job)
bdrv_cor_filter_drop(s->cor_filter_bs);
s->cor_filter_bs = NULL;
- bdrv_subtree_drained_begin(s->above_base);
+ /*
+ * bdrv_set_backing_hd() requires that unfiltered_bs is drained. Drain
+ * already here and use bdrv_set_backing_hd_drained() instead because
+ * the polling during drained_begin() might change the graph, and if we do
+ * this only later, we may end up working with the wrong base node (or it
+ * might even have gone away by the time we want to use it).
+ */
+ bdrv_drained_begin(unfiltered_bs);
base = bdrv_filter_or_cow_bs(s->above_base);
- if (base) {
- bdrv_ref(base);
- }
-
unfiltered_base = bdrv_skip_filters(base);
if (bdrv_cow_child(unfiltered_bs)) {
@@ -82,7 +85,13 @@ static int stream_prepare(Job *job)
}
}
- bdrv_set_backing_hd(unfiltered_bs, base, &local_err);
+ bdrv_set_backing_hd_drained(unfiltered_bs, base, &local_err);
+
+ /*
+ * This call will do I/O, so the graph can change again from here on.
+ * We have already completed the graph change, so we are not in danger
+ * of operating on the wrong node any more if this happens.
+ */
ret = bdrv_change_backing_file(unfiltered_bs, base_id, base_fmt, false);
if (local_err) {
error_report_err(local_err);
@@ -92,10 +101,7 @@ static int stream_prepare(Job *job)
}
out:
- if (base) {
- bdrv_unref(base);
- }
- bdrv_subtree_drained_end(s->above_base);
+ bdrv_drained_end(unfiltered_bs);
return ret;
}
diff --git a/block/throttle.c b/block/throttle.c
index 131eba3ab4..88851c84f4 100644
--- a/block/throttle.c
+++ b/block/throttle.c
@@ -214,7 +214,7 @@ static void throttle_reopen_abort(BDRVReopenState *reopen_state)
reopen_state->opaque = NULL;
}
-static void coroutine_fn throttle_co_drain_begin(BlockDriverState *bs)
+static void throttle_drain_begin(BlockDriverState *bs)
{
ThrottleGroupMember *tgm = bs->opaque;
if (qatomic_fetch_inc(&tgm->io_limits_disabled) == 0) {
@@ -222,7 +222,7 @@ static void coroutine_fn throttle_co_drain_begin(BlockDriverState *bs)
}
}
-static void coroutine_fn throttle_co_drain_end(BlockDriverState *bs)
+static void throttle_drain_end(BlockDriverState *bs)
{
ThrottleGroupMember *tgm = bs->opaque;
assert(tgm->io_limits_disabled);
@@ -261,8 +261,8 @@ static BlockDriver bdrv_throttle = {
.bdrv_reopen_commit = throttle_reopen_commit,
.bdrv_reopen_abort = throttle_reopen_abort,
- .bdrv_co_drain_begin = throttle_co_drain_begin,
- .bdrv_co_drain_end = throttle_co_drain_end,
+ .bdrv_drain_begin = throttle_drain_begin,
+ .bdrv_drain_end = throttle_drain_end,
.is_filter = true,
.strong_runtime_opts = throttle_strong_runtime_opts,
diff --git a/block/vdi.c b/block/vdi.c
index c0c111c4b9..479bcfe820 100644
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -934,7 +934,7 @@ static int coroutine_fn vdi_co_create_opts(BlockDriver *drv,
qdict = qemu_opts_to_qdict_filtered(opts, NULL, &vdi_create_opts, true);
/* Create and open the file (protocol layer) */
- ret = bdrv_create_file(filename, opts, errp);
+ ret = bdrv_co_create_file(filename, opts, errp);
if (ret < 0) {
goto done;
}
diff --git a/block/vhdx.c b/block/vhdx.c
index bad9ca691b..4c929800fe 100644
--- a/block/vhdx.c
+++ b/block/vhdx.c
@@ -2084,7 +2084,7 @@ static int coroutine_fn vhdx_co_create_opts(BlockDriver *drv,
}
/* Create and open the file (protocol layer) */
- ret = bdrv_create_file(filename, opts, errp);
+ ret = bdrv_co_create_file(filename, opts, errp);
if (ret < 0) {
goto fail;
}
diff --git a/block/vmdk.c b/block/vmdk.c
index bac3d8db50..8894dac2d4 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -2285,15 +2285,16 @@ exit:
return ret;
}
-static int vmdk_create_extent(const char *filename, int64_t filesize,
- bool flat, bool compress, bool zeroed_grain,
- BlockBackend **pbb,
- QemuOpts *opts, Error **errp)
+static int coroutine_fn vmdk_create_extent(const char *filename,
+ int64_t filesize, bool flat,
+ bool compress, bool zeroed_grain,
+ BlockBackend **pbb,
+ QemuOpts *opts, Error **errp)
{
int ret;
BlockBackend *blk = NULL;
- ret = bdrv_create_file(filename, opts, errp);
+ ret = bdrv_co_create_file(filename, opts, errp);
if (ret < 0) {
goto exit;
}
@@ -2366,14 +2367,14 @@ static int filename_decompose(const char *filename, char *path, char *prefix,
* non-split format.
* idx >= 1: get the n-th extent if in a split subformat
*/
-typedef BlockBackend *(*vmdk_create_extent_fn)(int64_t size,
- int idx,
- bool flat,
- bool split,
- bool compress,
- bool zeroed_grain,
- void *opaque,
- Error **errp);
+typedef BlockBackend * coroutine_fn (*vmdk_create_extent_fn)(int64_t size,
+ int idx,
+ bool flat,
+ bool split,
+ bool compress,
+ bool zeroed_grain,
+ void *opaque,
+ Error **errp);
static void vmdk_desc_add_extent(GString *desc,
const char *extent_line_fmt,
@@ -2616,7 +2617,7 @@ typedef struct {
QemuOpts *opts;
} VMDKCreateOptsData;
-static BlockBackend *vmdk_co_create_opts_cb(int64_t size, int idx,
+static BlockBackend * coroutine_fn vmdk_co_create_opts_cb(int64_t size, int idx,
bool flat, bool split, bool compress,
bool zeroed_grain, void *opaque,
Error **errp)
@@ -2768,10 +2769,11 @@ exit:
return ret;
}
-static BlockBackend *vmdk_co_create_cb(int64_t size, int idx,
- bool flat, bool split, bool compress,
- bool zeroed_grain, void *opaque,
- Error **errp)
+static BlockBackend * coroutine_fn vmdk_co_create_cb(int64_t size, int idx,
+ bool flat, bool split,
+ bool compress,
+ bool zeroed_grain,
+ void *opaque, Error **errp)
{
int ret;
BlockDriverState *bs;
diff --git a/block/vpc.c b/block/vpc.c
index 95841f259a..6ee95dcb96 100644
--- a/block/vpc.c
+++ b/block/vpc.c
@@ -1111,7 +1111,7 @@ static int coroutine_fn vpc_co_create_opts(BlockDriver *drv,
}
/* Create and open the file (protocol layer) */
- ret = bdrv_create_file(filename, opts, errp);
+ ret = bdrv_co_create_file(filename, opts, errp);
if (ret < 0) {
goto fail;
}
diff --git a/blockdev.c b/blockdev.c
index 75eef8535e..ebf952cd21 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -1507,10 +1507,14 @@ static void external_snapshot_prepare(BlkActionState *common,
goto out;
}
bdrv_refresh_filename(state->old_bs);
+
+ aio_context_release(aio_context);
bdrv_img_create(new_image_file, format,
state->old_bs->filename,
state->old_bs->drv->format_name,
NULL, size, flags, false, &local_err);
+ aio_context_acquire(aio_context);
+
if (local_err) {
error_propagate(errp, local_err);
goto out;
@@ -3515,8 +3519,6 @@ fail:
void qmp_blockdev_reopen(BlockdevOptionsList *reopen_list, Error **errp)
{
BlockReopenQueue *queue = NULL;
- GSList *drained = NULL;
- GSList *p;
/* Add each one of the BDS that we want to reopen to the queue */
for (; reopen_list != NULL; reopen_list = reopen_list->next) {
@@ -3553,9 +3555,7 @@ void qmp_blockdev_reopen(BlockdevOptionsList *reopen_list, Error **errp)
ctx = bdrv_get_aio_context(bs);
aio_context_acquire(ctx);
- bdrv_subtree_drained_begin(bs);
queue = bdrv_reopen_queue(queue, bs, qdict, false);
- drained = g_slist_prepend(drained, bs);
aio_context_release(ctx);
}
@@ -3566,15 +3566,6 @@ void qmp_blockdev_reopen(BlockdevOptionsList *reopen_list, Error **errp)
fail:
bdrv_reopen_queue_free(queue);
- for (p = drained; p; p = p->next) {
- BlockDriverState *bs = p->data;
- AioContext *ctx = bdrv_get_aio_context(bs);
-
- aio_context_acquire(ctx);
- bdrv_subtree_drained_end(bs);
- aio_context_release(ctx);
- }
- g_slist_free(drained);
}
void qmp_blockdev_del(const char *node_name, Error **errp)
diff --git a/blockjob.c b/blockjob.c
index 3c8f3543a2..b7daf2a9f6 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -120,7 +120,7 @@ static bool child_job_drained_poll(BdrvChild *c)
}
}
-static void child_job_drained_end(BdrvChild *c, int *drained_end_counter)
+static void child_job_drained_end(BdrvChild *c)
{
BlockJob *job = c->opaque;
job_resume(&job->job);
diff --git a/docs/devel/block-coroutine-wrapper.rst b/docs/devel/block-coroutine-wrapper.rst
index 412851986b..6dd2cdcab3 100644
--- a/docs/devel/block-coroutine-wrapper.rst
+++ b/docs/devel/block-coroutine-wrapper.rst
@@ -26,12 +26,12 @@ called ``bdrv_foo(<same args>)``. In this case the script can help. To
trigger the generation:
1. You need ``bdrv_foo`` declaration somewhere (for example, in
- ``block/coroutines.h``) with the ``generated_co_wrapper`` mark,
+ ``block/coroutines.h``) with the ``co_wrapper`` mark,
like this:
.. code-block:: c
- int generated_co_wrapper bdrv_foo(<some args>);
+ int co_wrapper bdrv_foo(<some args>);
2. You need to feed this declaration to block-coroutine-wrapper script.
For this, add the .h (or .c) file with the declaration to the
@@ -46,7 +46,7 @@ Links
1. The script location is ``scripts/block-coroutine-wrapper.py``.
-2. Generic place for private ``generated_co_wrapper`` declarations is
+2. Generic place for private ``co_wrapper`` declarations is
``block/coroutines.h``, for public declarations:
``include/block/block.h``
diff --git a/include/block/aio.h b/include/block/aio.h
index d128558f1d..0f65a3cc9e 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -22,6 +22,7 @@
#include "qemu/event_notifier.h"
#include "qemu/thread.h"
#include "qemu/timer.h"
+#include "block/graph-lock.h"
typedef struct BlockAIOCB BlockAIOCB;
typedef void BlockCompletionFunc(void *opaque, int ret);
@@ -127,6 +128,14 @@ struct AioContext {
/* Used by AioContext users to protect from multi-threaded access. */
QemuRecMutex lock;
+ /*
+ * Keep track of readers and writers of the block layer graph.
+ * This is essential to avoid performing additions and removal
+ * of nodes and edges from block graph while some
+ * other thread is traversing it.
+ */
+ BdrvGraphRWlock *bdrv_graph;
+
/* The list of registered AIO handlers. Protected by ctx->list_lock. */
AioHandlerList aio_handlers;
diff --git a/include/block/block-common.h b/include/block/block-common.h
index 297704c1e9..4749c46a5e 100644
--- a/include/block/block-common.h
+++ b/include/block/block-common.h
@@ -29,20 +29,35 @@
#include "qemu/iov.h"
#include "qemu/coroutine.h"
#include "block/accounting.h"
-#include "block/dirty-bitmap.h"
-#include "block/blockjob.h"
#include "qemu/hbitmap.h"
#include "qemu/transactions.h"
/*
- * generated_co_wrapper
+ * co_wrapper{*}: Function specifiers used by block-coroutine-wrapper.py
*
- * Function specifier, which does nothing but mark functions to be
+ * Function specifiers, which do nothing but mark functions to be
* generated by scripts/block-coroutine-wrapper.py
*
- * Read more in docs/devel/block-coroutine-wrapper.rst
+ * Usage: read docs/devel/block-coroutine-wrapper.rst
+ *
+ * There are 4 kind of specifiers:
+ * - co_wrapper functions can be called by only non-coroutine context, because
+ * they always generate a new coroutine.
+ * - co_wrapper_mixed functions can be called by both coroutine and
+ * non-coroutine context.
+ * - co_wrapper_bdrv_rdlock are co_wrapper functions but automatically take and
+ * release the graph rdlock when creating a new coroutine
+ * - co_wrapper_mixed_bdrv_rdlock are co_wrapper_mixed functions but
+ * automatically take and release the graph rdlock when creating a new
+ * coroutine.
*/
-#define generated_co_wrapper
+#define co_wrapper
+#define co_wrapper_mixed
+#define co_wrapper_bdrv_rdlock
+#define co_wrapper_mixed_bdrv_rdlock
+
+#include "block/dirty-bitmap.h"
+#include "block/blockjob.h"
/* block.c */
typedef struct BlockDriver BlockDriver;
diff --git a/include/block/block-copy.h b/include/block/block-copy.h
index ba0b425d78..8cea4f9b90 100644
--- a/include/block/block-copy.h
+++ b/include/block/block-copy.h
@@ -36,8 +36,9 @@ void block_copy_set_progress_meter(BlockCopyState *s, ProgressMeter *pm);
void block_copy_state_free(BlockCopyState *s);
void block_copy_reset(BlockCopyState *s, int64_t offset, int64_t bytes);
-int64_t block_copy_reset_unallocated(BlockCopyState *s,
- int64_t offset, int64_t *count);
+int64_t coroutine_fn block_copy_reset_unallocated(BlockCopyState *s,
+ int64_t offset,
+ int64_t *count);
int coroutine_fn block_copy(BlockCopyState *s, int64_t offset, int64_t bytes,
bool ignore_ratelimit, uint64_t timeout_ns,
diff --git a/include/block/block-global-state.h b/include/block/block-global-state.h
index c7bd4a2088..b0a3cfe6b8 100644
--- a/include/block/block-global-state.h
+++ b/include/block/block-global-state.h
@@ -55,9 +55,14 @@ BlockDriver *bdrv_find_protocol(const char *filename,
bool allow_protocol_prefix,
Error **errp);
BlockDriver *bdrv_find_format(const char *format_name);
-int bdrv_create(BlockDriver *drv, const char* filename,
- QemuOpts *opts, Error **errp);
-int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp);
+
+int coroutine_fn bdrv_co_create(BlockDriver *drv, const char *filename,
+ QemuOpts *opts, Error **errp);
+int co_wrapper bdrv_create(BlockDriver *drv, const char *filename,
+ QemuOpts *opts, Error **errp);
+
+int coroutine_fn bdrv_co_create_file(const char *filename, QemuOpts *opts,
+ Error **errp);
BlockDriverState *bdrv_new(void);
int bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top,
@@ -82,6 +87,9 @@ int bdrv_open_file_child(const char *filename,
BlockDriverState *bdrv_open_blockdev_ref(BlockdevRef *ref, Error **errp);
int bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd,
Error **errp);
+int bdrv_set_backing_hd_drained(BlockDriverState *bs,
+ BlockDriverState *backing_hd,
+ Error **errp);
int bdrv_open_backing_file(BlockDriverState *bs, QDict *parent_options,
const char *bdref_key, Error **errp);
BlockDriverState *bdrv_open(const char *filename, const char *reference,
@@ -144,6 +152,7 @@ int bdrv_inactivate_all(void);
int bdrv_flush_all(void);
void bdrv_close_all(void);
void bdrv_drain_all_begin(void);
+void bdrv_drain_all_begin_nopoll(void);
void bdrv_drain_all_end(void);
void bdrv_drain_all(void);
diff --git a/include/block/block-io.h b/include/block/block-io.h
index b099d7db45..2ed6214909 100644
--- a/include/block/block-io.h
+++ b/include/block/block-io.h
@@ -39,19 +39,24 @@
* to catch when they are accidentally called by the wrong API.
*/
-int generated_co_wrapper bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset,
- int64_t bytes,
- BdrvRequestFlags flags);
+int co_wrapper_mixed_bdrv_rdlock
+bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset, int64_t bytes,
+ BdrvRequestFlags flags);
+
int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags);
-int generated_co_wrapper bdrv_pread(BdrvChild *child, int64_t offset,
- int64_t bytes, void *buf,
- BdrvRequestFlags flags);
-int generated_co_wrapper bdrv_pwrite(BdrvChild *child, int64_t offset,
- int64_t bytes, const void *buf,
- BdrvRequestFlags flags);
-int generated_co_wrapper bdrv_pwrite_sync(BdrvChild *child, int64_t offset,
- int64_t bytes, const void *buf,
- BdrvRequestFlags flags);
+
+int co_wrapper_mixed_bdrv_rdlock
+bdrv_pread(BdrvChild *child, int64_t offset, int64_t bytes, void *buf,
+ BdrvRequestFlags flags);
+
+int co_wrapper_mixed_bdrv_rdlock
+bdrv_pwrite(BdrvChild *child, int64_t offset,int64_t bytes,
+ const void *buf, BdrvRequestFlags flags);
+
+int co_wrapper_mixed_bdrv_rdlock
+bdrv_pwrite_sync(BdrvChild *child, int64_t offset, int64_t bytes,
+ const void *buf, BdrvRequestFlags flags);
+
int coroutine_fn bdrv_co_pwrite_sync(BdrvChild *child, int64_t offset,
int64_t bytes, const void *buf,
BdrvRequestFlags flags);
@@ -94,14 +99,29 @@ bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs);
int bdrv_block_status(BlockDriverState *bs, int64_t offset,
int64_t bytes, int64_t *pnum, int64_t *map,
BlockDriverState **file);
+
+int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs,
+ BlockDriverState *base,
+ int64_t offset, int64_t bytes,
+ int64_t *pnum, int64_t *map,
+ BlockDriverState **file);
int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base,
int64_t offset, int64_t bytes, int64_t *pnum,
int64_t *map, BlockDriverState **file);
+
+int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t offset,
+ int64_t bytes, int64_t *pnum);
int bdrv_is_allocated(BlockDriverState *bs, int64_t offset, int64_t bytes,
int64_t *pnum);
+
+int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *top,
+ BlockDriverState *base,
+ bool include_base, int64_t offset,
+ int64_t bytes, int64_t *pnum);
int bdrv_is_allocated_above(BlockDriverState *top, BlockDriverState *base,
bool include_base, int64_t offset, int64_t bytes,
int64_t *pnum);
+
int coroutine_fn bdrv_co_is_zero_fast(BlockDriverState *bs, int64_t offset,
int64_t bytes);
@@ -200,8 +220,14 @@ AioContext *child_of_bds_get_parent_aio_context(BdrvChild *c);
void bdrv_io_plug(BlockDriverState *bs);
void bdrv_io_unplug(BlockDriverState *bs);
-bool bdrv_can_store_new_dirty_bitmap(BlockDriverState *bs, const char *name,
- uint32_t granularity, Error **errp);
+bool coroutine_fn bdrv_co_can_store_new_dirty_bitmap(BlockDriverState *bs,
+ const char *name,
+ uint32_t granularity,
+ Error **errp);
+bool co_wrapper bdrv_can_store_new_dirty_bitmap(BlockDriverState *bs,
+ const char *name,
+ uint32_t granularity,
+ Error **errp);
/**
*
@@ -237,21 +263,6 @@ int coroutine_fn bdrv_co_copy_range(BdrvChild *src, int64_t src_offset,
int64_t bytes, BdrvRequestFlags read_flags,
BdrvRequestFlags write_flags);
-/**
- * bdrv_drained_end_no_poll:
- *
- * Same as bdrv_drained_end(), but do not poll for the subgraph to
- * actually become unquiesced. Therefore, no graph changes will occur
- * with this function.
- *
- * *drained_end_counter is incremented for every background operation
- * that is scheduled, and will be decremented for every operation once
- * it settles. The caller must poll until it reaches 0. The counter
- * should be accessed using atomic operations only.
- */
-void bdrv_drained_end_no_poll(BlockDriverState *bs, int *drained_end_counter);
-
-
/*
* "I/O or GS" API functions. These functions can run without
* the BQL, but only in one specific iothread/main loop.
@@ -281,47 +292,54 @@ void bdrv_drained_end_no_poll(BlockDriverState *bs, int *drained_end_counter);
void bdrv_drain(BlockDriverState *bs);
-int generated_co_wrapper
+int co_wrapper_mixed_bdrv_rdlock
bdrv_truncate(BdrvChild *child, int64_t offset, bool exact,
PreallocMode prealloc, BdrvRequestFlags flags, Error **errp);
-int generated_co_wrapper bdrv_check(BlockDriverState *bs, BdrvCheckResult *res,
- BdrvCheckMode fix);
+int co_wrapper_mixed_bdrv_rdlock
+bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix);
/* Invalidate any cached metadata used by image formats */
-int generated_co_wrapper bdrv_invalidate_cache(BlockDriverState *bs,
- Error **errp);
-int generated_co_wrapper bdrv_flush(BlockDriverState *bs);
-int generated_co_wrapper bdrv_pdiscard(BdrvChild *child, int64_t offset,
- int64_t bytes);
-int generated_co_wrapper
+int co_wrapper_mixed_bdrv_rdlock
+bdrv_invalidate_cache(BlockDriverState *bs, Error **errp);
+
+int co_wrapper_mixed_bdrv_rdlock bdrv_flush(BlockDriverState *bs);
+
+int co_wrapper_mixed_bdrv_rdlock
+bdrv_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes);
+
+int co_wrapper_mixed_bdrv_rdlock
bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
-int generated_co_wrapper
+
+int co_wrapper_mixed_bdrv_rdlock
bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
/**
* bdrv_parent_drained_begin_single:
*
- * Begin a quiesced section for the parent of @c. If @poll is true, wait for
- * any pending activity to cease.
+ * Begin a quiesced section for the parent of @c.
+ */
+void bdrv_parent_drained_begin_single(BdrvChild *c);
+
+/**
+ * bdrv_parent_drained_poll_single:
+ *
+ * Returns true if there is any pending activity to cease before @c can be
+ * called quiesced, false otherwise.
*/
-void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll);
+bool bdrv_parent_drained_poll_single(BdrvChild *c);
/**
* bdrv_parent_drained_end_single:
*
* End a quiesced section for the parent of @c.
- *
- * This polls @bs's AioContext until all scheduled sub-drained_ends
- * have settled, which may result in graph changes.
*/
void bdrv_parent_drained_end_single(BdrvChild *c);
/**
* bdrv_drain_poll:
*
- * Poll for pending requests in @bs, its parents (except for @ignore_parent),
- * and if @recursive is true its children as well (used for subtree drain).
+ * Poll for pending requests in @bs and its parents (except for @ignore_parent).
*
* If @ignore_bds_parents is true, parents that are BlockDriverStates must
* ignore the drain request because they will be drained separately (used for
@@ -329,8 +347,8 @@ void bdrv_parent_drained_end_single(BdrvChild *c);
*
* This is part of bdrv_drained_begin.
*/
-bool bdrv_drain_poll(BlockDriverState *bs, bool recursive,
- BdrvChild *ignore_parent, bool ignore_bds_parents);
+bool bdrv_drain_poll(BlockDriverState *bs, BdrvChild *ignore_parent,
+ bool ignore_bds_parents);
/**
* bdrv_drained_begin:
@@ -348,31 +366,13 @@ void bdrv_drained_begin(BlockDriverState *bs);
* Quiesces a BDS like bdrv_drained_begin(), but does not wait for already
* running requests to complete.
*/
-void bdrv_do_drained_begin_quiesce(BlockDriverState *bs,
- BdrvChild *parent, bool ignore_bds_parents);
-
-/**
- * Like bdrv_drained_begin, but recursively begins a quiesced section for
- * exclusive access to all child nodes as well.
- */
-void bdrv_subtree_drained_begin(BlockDriverState *bs);
+void bdrv_do_drained_begin_quiesce(BlockDriverState *bs, BdrvChild *parent);
/**
* bdrv_drained_end:
*
* End a quiescent section started by bdrv_drained_begin().
- *
- * This polls @bs's AioContext until all scheduled sub-drained_ends
- * have settled. On one hand, that may result in graph changes. On
- * the other, this requires that the caller either runs in the main
- * loop; or that all involved nodes (@bs and all of its parents) are
- * in the caller's AioContext.
*/
void bdrv_drained_end(BlockDriverState *bs);
-/**
- * End a quiescent section started by bdrv_subtree_drained_begin().
- */
-void bdrv_subtree_drained_end(BlockDriverState *bs);
-
#endif /* BLOCK_IO_H */
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index 31ae91e56e..c34c525fa6 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -641,8 +641,8 @@ struct BlockDriver {
/*
* Invalidate any cached meta-data.
*/
- void coroutine_fn (*bdrv_co_invalidate_cache)(BlockDriverState *bs,
- Error **errp);
+ void coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_invalidate_cache)(
+ BlockDriverState *bs, Error **errp);
/*
* Flushes all data for all layers by calling bdrv_co_flush for underlying
@@ -701,12 +701,11 @@ struct BlockDriver {
Error **errp);
BlockStatsSpecific *(*bdrv_get_specific_stats)(BlockDriverState *bs);
- int coroutine_fn (*bdrv_save_vmstate)(BlockDriverState *bs,
- QEMUIOVector *qiov,
- int64_t pos);
- int coroutine_fn (*bdrv_load_vmstate)(BlockDriverState *bs,
- QEMUIOVector *qiov,
- int64_t pos);
+ int coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_save_vmstate)(
+ BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
+
+ int coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_load_vmstate)(
+ BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
/* removable device specific */
bool (*bdrv_is_inserted)(BlockDriverState *bs);
@@ -724,9 +723,8 @@ struct BlockDriver {
* Returns 0 for completed check, -errno for internal errors.
* The check results are stored in result.
*/
- int coroutine_fn (*bdrv_co_check)(BlockDriverState *bs,
- BdrvCheckResult *result,
- BdrvCheckMode fix);
+ int coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_check)(
+ BlockDriverState *bs, BdrvCheckResult *result, BdrvCheckMode fix);
void (*bdrv_debug_event)(BlockDriverState *bs, BlkdebugEvent event);
@@ -735,17 +733,19 @@ struct BlockDriver {
void (*bdrv_io_unplug)(BlockDriverState *bs);
/**
- * bdrv_co_drain_begin is called if implemented in the beginning of a
+ * bdrv_drain_begin is called if implemented in the beginning of a
* drain operation to drain and stop any internal sources of requests in
* the driver.
- * bdrv_co_drain_end is called if implemented at the end of the drain.
+ * bdrv_drain_end is called if implemented at the end of the drain.
*
* They should be used by the driver to e.g. manage scheduled I/O
* requests, or toggle an internal state. After the end of the drain new
* requests will continue normally.
+ *
+ * Implementations of both functions must not call aio_poll().
*/
- void coroutine_fn (*bdrv_co_drain_begin)(BlockDriverState *bs);
- void coroutine_fn (*bdrv_co_drain_end)(BlockDriverState *bs);
+ void (*bdrv_drain_begin)(BlockDriverState *bs);
+ void (*bdrv_drain_end)(BlockDriverState *bs);
bool (*bdrv_supports_persistent_dirty_bitmap)(BlockDriverState *bs);
bool coroutine_fn (*bdrv_co_can_store_new_dirty_bitmap)(
@@ -896,8 +896,8 @@ struct BdrvChildClass {
void (*activate)(BdrvChild *child, Error **errp);
int (*inactivate)(BdrvChild *child);
- void (*attach)(BdrvChild *child);
- void (*detach)(BdrvChild *child);
+ void GRAPH_WRLOCK_PTR (*attach)(BdrvChild *child);
+ void GRAPH_WRLOCK_PTR (*detach)(BdrvChild *child);
/*
* Notifies the parent that the filename of its child has changed (e.g.
@@ -937,15 +937,11 @@ struct BdrvChildClass {
* These functions must not change the graph (and therefore also must not
* call aio_poll(), which could change the graph indirectly).
*
- * If drained_end() schedules background operations, it must atomically
- * increment *drained_end_counter for each such operation and atomically
- * decrement it once the operation has settled.
- *
* Note that this can be nested. If drained_begin() was called twice, new
* I/O is allowed only after drained_end() was called twice, too.
*/
void (*drained_begin)(BdrvChild *child);
- void (*drained_end)(BdrvChild *child, int *drained_end_counter);
+ void (*drained_end)(BdrvChild *child);
/*
* Returns whether the parent has pending requests for the child. This
@@ -982,13 +978,13 @@ struct BdrvChild {
bool frozen;
/*
- * How many times the parent of this child has been drained
+ * True if the parent of this child has been drained by this BdrvChild
* (through klass->drained_*).
- * Usually, this is equal to bs->quiesce_counter (potentially
- * reduced by bdrv_drain_all_count). It may differ while the
+ *
+ * It is generally true if bs->quiesce_counter > 0. It may differ while the
* child is entering or leaving a drained section.
*/
- int parent_quiesce_counter;
+ bool quiesced_parent;
QLIST_ENTRY(BdrvChild) next;
QLIST_ENTRY(BdrvChild) next_parent;
@@ -1186,7 +1182,6 @@ struct BlockDriverState {
/* Accessed with atomic ops. */
int quiesce_counter;
- int recursive_quiesce_counter;
unsigned int write_gen; /* Current data generation */
diff --git a/include/block/block_int-global-state.h b/include/block/block_int-global-state.h
index b49f4eb35b..2f0993f6e9 100644
--- a/include/block/block_int-global-state.h
+++ b/include/block/block_int-global-state.h
@@ -310,21 +310,4 @@ void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
*/
void bdrv_drain_all_end_quiesce(BlockDriverState *bs);
-/**
- * Make sure that the function is running under both drain and BQL.
- * The latter protects from concurrent writings
- * from the GS API, while the former prevents concurrent reads
- * from I/O.
- */
-static inline void assert_bdrv_graph_writable(BlockDriverState *bs)
-{
- /*
- * TODO: this function is incomplete. Because the users of this
- * assert lack the necessary drains, check only for BQL.
- * Once the necessary drains are added,
- * assert also for qatomic_read(&bs->quiesce_counter) > 0
- */
- assert(qemu_in_main_thread());
-}
-
#endif /* BLOCK_INT_GLOBAL_STATE_H */
diff --git a/include/block/block_int-io.h b/include/block/block_int-io.h
index 4b0b3e17ef..8bc061ebb8 100644
--- a/include/block/block_int-io.h
+++ b/include/block/block_int-io.h
@@ -179,16 +179,4 @@ void bdrv_bsc_invalidate_range(BlockDriverState *bs,
*/
void bdrv_bsc_fill(BlockDriverState *bs, int64_t offset, int64_t bytes);
-
-/*
- * "I/O or GS" API functions. These functions can run without
- * the BQL, but only in one specific iothread/main loop.
- *
- * See include/block/block-io.h for more information about
- * the "I/O or GS" API.
- */
-
-void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent);
-void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent);
-
#endif /* BLOCK_INT_IO_H */
diff --git a/include/block/block_int.h b/include/block/block_int.h
index 7d50b6bbd1..b35b0138ed 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -26,6 +26,7 @@
#include "block_int-global-state.h"
#include "block_int-io.h"
+#include "block/graph-lock.h"
/* DO NOT ADD ANYTHING IN HERE. USE ONE OF THE HEADERS INCLUDED ABOVE */
diff --git a/include/block/dirty-bitmap.h b/include/block/dirty-bitmap.h
index 6528336c4c..c3700cec76 100644
--- a/include/block/dirty-bitmap.h
+++ b/include/block/dirty-bitmap.h
@@ -34,8 +34,14 @@ int bdrv_dirty_bitmap_check(const BdrvDirtyBitmap *bitmap, uint32_t flags,
Error **errp);
void bdrv_release_dirty_bitmap(BdrvDirtyBitmap *bitmap);
void bdrv_release_named_dirty_bitmaps(BlockDriverState *bs);
-int bdrv_remove_persistent_dirty_bitmap(BlockDriverState *bs, const char *name,
- Error **errp);
+
+int coroutine_fn bdrv_co_remove_persistent_dirty_bitmap(BlockDriverState *bs,
+ const char *name,
+ Error **errp);
+int co_wrapper bdrv_remove_persistent_dirty_bitmap(BlockDriverState *bs,
+ const char *name,
+ Error **errp);
+
void bdrv_disable_dirty_bitmap(BdrvDirtyBitmap *bitmap);
void bdrv_enable_dirty_bitmap(BdrvDirtyBitmap *bitmap);
void bdrv_enable_dirty_bitmap_locked(BdrvDirtyBitmap *bitmap);
diff --git a/include/block/graph-lock.h b/include/block/graph-lock.h
new file mode 100644
index 0000000000..4c92cd8edf
--- /dev/null
+++ b/include/block/graph-lock.h
@@ -0,0 +1,280 @@
+/*
+ * Graph lock: rwlock to protect block layer graph manipulations (add/remove
+ * edges and nodes)
+ *
+ * Copyright (c) 2022 Red Hat
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef GRAPH_LOCK_H
+#define GRAPH_LOCK_H
+
+#include "qemu/osdep.h"
+#include "qemu/clang-tsa.h"
+
+#include "qemu/coroutine.h"
+
+/**
+ * Graph Lock API
+ * This API provides a rwlock used to protect block layer
+ * graph modifications like edge (BdrvChild) and node (BlockDriverState)
+ * addition and removal.
+ * Currently we have 1 writer only, the Main loop, and many
+ * readers, mostly coroutines running in other AioContext thus other threads.
+ *
+ * We distinguish between writer (main loop, under BQL) that modifies the
+ * graph, and readers (all other coroutines running in various AioContext),
+ * that go through the graph edges, reading
+ * BlockDriverState ->parents and->children.
+ *
+ * The writer (main loop) has an "exclusive" access, so it first waits for
+ * current read to finish, and then prevents incoming ones from
+ * entering while it has the exclusive access.
+ *
+ * The readers (coroutines in multiple AioContext) are free to
+ * access the graph as long the writer is not modifying the graph.
+ * In case it is, they go in a CoQueue and sleep until the writer
+ * is done.
+ *
+ * If a coroutine changes AioContext, the counter in the original and new
+ * AioContext are left intact, since the writer does not care where is the
+ * reader, but only if there is one.
+ * As a result, some AioContexts might have a negative reader count, to
+ * balance the positive count of the AioContext that took the lock.
+ * This also means that when an AioContext is deleted it may have a nonzero
+ * reader count. In that case we transfer the count to a global shared counter
+ * so that the writer is always aware of all readers.
+ */
+typedef struct BdrvGraphRWlock BdrvGraphRWlock;
+
+/* Dummy lock object to use for Thread Safety Analysis (TSA) */
+typedef struct TSA_CAPABILITY("mutex") BdrvGraphLock {
+} BdrvGraphLock;
+
+extern BdrvGraphLock graph_lock;
+
+/*
+ * clang doesn't check consistency in locking annotations between forward
+ * declarations and the function definition. Having the annotation on the
+ * definition, but not the declaration in a header file, may give the reader
+ * a false sense of security because the condition actually remains unchecked
+ * for callers in other source files.
+ *
+ * Therefore, as a convention, for public functions, GRAPH_RDLOCK and
+ * GRAPH_WRLOCK annotations should be present only in the header file.
+ */
+#define GRAPH_WRLOCK TSA_REQUIRES(graph_lock)
+#define GRAPH_RDLOCK TSA_REQUIRES_SHARED(graph_lock)
+
+/*
+ * TSA annotations are not part of function types, so checks are defeated when
+ * using a function pointer. As a workaround, annotate function pointers with
+ * this macro that will require that the lock is at least taken while reading
+ * the pointer. In most cases this is equivalent to actually protecting the
+ * function call.
+ */
+#define GRAPH_RDLOCK_PTR TSA_GUARDED_BY(graph_lock)
+#define GRAPH_WRLOCK_PTR TSA_GUARDED_BY(graph_lock)
+
+/*
+ * register_aiocontext:
+ * Add AioContext @ctx to the list of AioContext.
+ * This list is used to obtain the total number of readers
+ * currently running the graph.
+ */
+void register_aiocontext(AioContext *ctx);
+
+/*
+ * unregister_aiocontext:
+ * Removes AioContext @ctx to the list of AioContext.
+ */
+void unregister_aiocontext(AioContext *ctx);
+
+/*
+ * bdrv_graph_wrlock:
+ * Start an exclusive write operation to modify the graph. This means we are
+ * adding or removing an edge or a node in the block layer graph. Nobody else
+ * is allowed to access the graph.
+ *
+ * Must only be called from outside bdrv_graph_co_rdlock.
+ *
+ * The wrlock can only be taken from the main loop, with BQL held, as only the
+ * main loop is allowed to modify the graph.
+ *
+ * This function polls. Callers must not hold the lock of any AioContext other
+ * than the current one.
+ */
+void bdrv_graph_wrlock(void) TSA_ACQUIRE(graph_lock) TSA_NO_TSA;
+
+/*
+ * bdrv_graph_wrunlock:
+ * Write finished, reset global has_writer to 0 and restart
+ * all readers that are waiting.
+ */
+void bdrv_graph_wrunlock(void) TSA_RELEASE(graph_lock) TSA_NO_TSA;
+
+/*
+ * bdrv_graph_co_rdlock:
+ * Read the bs graph. This usually means traversing all nodes in
+ * the graph, therefore it can't happen while another thread is
+ * modifying it.
+ * Increases the reader counter of the current aiocontext,
+ * and if has_writer is set, it means that the writer is modifying
+ * the graph, therefore wait in a coroutine queue.
+ * The writer will then wake this coroutine once it is done.
+ *
+ * This lock should be taken from Iothreads (IO_CODE() class of functions)
+ * because it signals the writer that there are some
+ * readers currently running, or waits until the current
+ * write is finished before continuing.
+ * Calling this function from the Main Loop with BQL held
+ * is not necessary, since the Main Loop itself is the only
+ * writer, thus won't be able to read and write at the same time.
+ * The only exception to that is when we can't take the lock in the
+ * function/coroutine itself, and need to delegate the caller (usually main
+ * loop) to take it and wait that the coroutine ends, so that
+ * we always signal that a reader is running.
+ */
+void coroutine_fn TSA_ACQUIRE_SHARED(graph_lock) TSA_NO_TSA
+bdrv_graph_co_rdlock(void);
+
+/*
+ * bdrv_graph_rdunlock:
+ * Read terminated, decrease the count of readers in the current aiocontext.
+ * If the writer is waiting for reads to finish (has_writer == 1), signal
+ * the writer that we are done via aio_wait_kick() to let it continue.
+ */
+void coroutine_fn TSA_RELEASE_SHARED(graph_lock) TSA_NO_TSA
+bdrv_graph_co_rdunlock(void);
+
+/*
+ * bdrv_graph_rd{un}lock_main_loop:
+ * Just a placeholder to mark where the graph rdlock should be taken
+ * in the main loop. It is just asserting that we are not
+ * in a coroutine and in GLOBAL_STATE_CODE.
+ */
+void TSA_ACQUIRE_SHARED(graph_lock) TSA_NO_TSA
+bdrv_graph_rdlock_main_loop(void);
+
+void TSA_RELEASE_SHARED(graph_lock) TSA_NO_TSA
+bdrv_graph_rdunlock_main_loop(void);
+
+/*
+ * assert_bdrv_graph_readable:
+ * Make sure that the reader is either the main loop,
+ * or there is at least a reader helding the rdlock.
+ * In this way an incoming writer is aware of the read and waits.
+ */
+void GRAPH_RDLOCK assert_bdrv_graph_readable(void);
+
+/*
+ * assert_bdrv_graph_writable:
+ * Make sure that the writer is the main loop and has set @has_writer,
+ * so that incoming readers will pause.
+ */
+void GRAPH_WRLOCK assert_bdrv_graph_writable(void);
+
+/*
+ * Calling this function tells TSA that we know that the lock is effectively
+ * taken even though we cannot prove it (yet) with GRAPH_RDLOCK. This can be
+ * useful in intermediate stages of a conversion to using the GRAPH_RDLOCK
+ * macro.
+ */
+static inline void TSA_ASSERT_SHARED(graph_lock) TSA_NO_TSA
+assume_graph_lock(void)
+{
+}
+
+typedef struct GraphLockable { } GraphLockable;
+
+/*
+ * In C, compound literals have the lifetime of an automatic variable.
+ * In C++ it would be different, but then C++ wouldn't need QemuLockable
+ * either...
+ */
+#define GML_OBJ_() (&(GraphLockable) { })
+
+/*
+ * This is not marked as TSA_ACQUIRE() because TSA doesn't understand the
+ * cleanup attribute and would therefore complain that the graph is never
+ * unlocked. TSA_ASSERT() makes sure that the following calls know that we
+ * hold the lock while unlocking is left unchecked.
+ */
+static inline GraphLockable * TSA_ASSERT(graph_lock) TSA_NO_TSA
+graph_lockable_auto_lock(GraphLockable *x)
+{
+ bdrv_graph_co_rdlock();
+ return x;
+}
+
+static inline void TSA_NO_TSA
+graph_lockable_auto_unlock(GraphLockable *x)
+{
+ bdrv_graph_co_rdunlock();
+}
+
+G_DEFINE_AUTOPTR_CLEANUP_FUNC(GraphLockable, graph_lockable_auto_unlock)
+
+#define WITH_GRAPH_RDLOCK_GUARD_(var) \
+ for (g_autoptr(GraphLockable) var = graph_lockable_auto_lock(GML_OBJ_()); \
+ var; \
+ graph_lockable_auto_unlock(var), var = NULL)
+
+#define WITH_GRAPH_RDLOCK_GUARD() \
+ WITH_GRAPH_RDLOCK_GUARD_(glue(graph_lockable_auto, __COUNTER__))
+
+#define GRAPH_RDLOCK_GUARD(x) \
+ g_autoptr(GraphLockable) \
+ glue(graph_lockable_auto, __COUNTER__) G_GNUC_UNUSED = \
+ graph_lockable_auto_lock(GML_OBJ_())
+
+
+typedef struct GraphLockableMainloop { } GraphLockableMainloop;
+
+/*
+ * In C, compound literals have the lifetime of an automatic variable.
+ * In C++ it would be different, but then C++ wouldn't need QemuLockable
+ * either...
+ */
+#define GMLML_OBJ_() (&(GraphLockableMainloop) { })
+
+/*
+ * This is not marked as TSA_ACQUIRE() because TSA doesn't understand the
+ * cleanup attribute and would therefore complain that the graph is never
+ * unlocked. TSA_ASSERT() makes sure that the following calls know that we
+ * hold the lock while unlocking is left unchecked.
+ */
+static inline GraphLockableMainloop * TSA_ASSERT(graph_lock) TSA_NO_TSA
+graph_lockable_auto_lock_mainloop(GraphLockableMainloop *x)
+{
+ bdrv_graph_rdlock_main_loop();
+ return x;
+}
+
+static inline void TSA_NO_TSA
+graph_lockable_auto_unlock_mainloop(GraphLockableMainloop *x)
+{
+ bdrv_graph_rdunlock_main_loop();
+}
+
+G_DEFINE_AUTOPTR_CLEANUP_FUNC(GraphLockableMainloop,
+ graph_lockable_auto_unlock_mainloop)
+
+#define GRAPH_RDLOCK_GUARD_MAINLOOP(x) \
+ g_autoptr(GraphLockableMainloop) \
+ glue(graph_lockable_auto, __COUNTER__) G_GNUC_UNUSED = \
+ graph_lockable_auto_lock_mainloop(GMLML_OBJ_())
+
+#endif /* GRAPH_LOCK_H */
+
diff --git a/include/qemu/clang-tsa.h b/include/qemu/clang-tsa.h
new file mode 100644
index 0000000000..ba06fb8c92
--- /dev/null
+++ b/include/qemu/clang-tsa.h
@@ -0,0 +1,114 @@
+#ifndef CLANG_TSA_H
+#define CLANG_TSA_H
+
+/*
+ * Copyright 2018 Jarkko Hietaniemi <jhi@iki.fi>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without
+ * limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/* http://clang.llvm.org/docs/ThreadSafetyAnalysis.html
+ *
+ * TSA is available since clang 3.6-ish.
+ */
+#ifdef __clang__
+# define TSA(x) __attribute__((x))
+#else
+# define TSA(x) /* No TSA, make TSA attributes no-ops. */
+#endif
+
+/* TSA_CAPABILITY() is used to annotate typedefs:
+ *
+ * typedef pthread_mutex_t TSA_CAPABILITY("mutex") tsa_mutex;
+ */
+#define TSA_CAPABILITY(x) TSA(capability(x))
+
+/* TSA_GUARDED_BY() is used to annotate global variables,
+ * the data is guarded:
+ *
+ * Foo foo TSA_GUARDED_BY(mutex);
+ */
+#define TSA_GUARDED_BY(x) TSA(guarded_by(x))
+
+/* TSA_PT_GUARDED_BY() is used to annotate global pointers, the data
+ * behind the pointer is guarded.
+ *
+ * Foo* ptr TSA_PT_GUARDED_BY(mutex);
+ */
+#define TSA_PT_GUARDED_BY(x) TSA(pt_guarded_by(x))
+
+/* The TSA_REQUIRES() is used to annotate functions: the caller of the
+ * function MUST hold the resource, the function will NOT release it.
+ *
+ * More than one mutex may be specified, comma-separated.
+ *
+ * void Foo(void) TSA_REQUIRES(mutex);
+ */
+#define TSA_REQUIRES(...) TSA(requires_capability(__VA_ARGS__))
+#define TSA_REQUIRES_SHARED(...) TSA(requires_shared_capability(__VA_ARGS__))
+
+/* TSA_EXCLUDES() is used to annotate functions: the caller of the
+ * function MUST NOT hold resource, the function first acquires the
+ * resource, and then releases it.
+ *
+ * More than one mutex may be specified, comma-separated.
+ *
+ * void Foo(void) TSA_EXCLUDES(mutex);
+ */
+#define TSA_EXCLUDES(...) TSA(locks_excluded(__VA_ARGS__))
+
+/* TSA_ACQUIRE() is used to annotate functions: the caller of the
+ * function MUST NOT hold the resource, the function will acquire the
+ * resource, but NOT release it.
+ *
+ * More than one mutex may be specified, comma-separated.
+ *
+ * void Foo(void) TSA_ACQUIRE(mutex);
+ */
+#define TSA_ACQUIRE(...) TSA(acquire_capability(__VA_ARGS__))
+#define TSA_ACQUIRE_SHARED(...) TSA(acquire_shared_capability(__VA_ARGS__))
+
+/* TSA_RELEASE() is used to annotate functions: the caller of the
+ * function MUST hold the resource, but the function will then release it.
+ *
+ * More than one mutex may be specified, comma-separated.
+ *
+ * void Foo(void) TSA_RELEASE(mutex);
+ */
+#define TSA_RELEASE(...) TSA(release_capability(__VA_ARGS__))
+#define TSA_RELEASE_SHARED(...) TSA(release_shared_capability(__VA_ARGS__))
+
+/* TSA_NO_TSA is used to annotate functions. Use only when you need to.
+ *
+ * void Foo(void) TSA_NO_TSA;
+ */
+#define TSA_NO_TSA TSA(no_thread_safety_analysis)
+
+/*
+ * TSA_ASSERT() is used to annotate functions: This function will assert that
+ * the lock is held. When it returns, the caller of the function is assumed to
+ * already hold the resource.
+ *
+ * More than one mutex may be specified, comma-separated.
+ */
+#define TSA_ASSERT(...) TSA(assert_capability(__VA_ARGS__))
+#define TSA_ASSERT_SHARED(...) TSA(assert_shared_capability(__VA_ARGS__))
+
+#endif /* #ifndef CLANG_TSA_H */
diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h
index 50f5aa2e07..7ec6d978d4 100644
--- a/include/sysemu/block-backend-io.h
+++ b/include/sysemu/block-backend-io.h
@@ -92,6 +92,15 @@ int coroutine_fn blk_co_copy_range(BlockBackend *blk_in, int64_t off_in,
int64_t bytes, BdrvRequestFlags read_flags,
BdrvRequestFlags write_flags);
+int coroutine_fn blk_co_block_status_above(BlockBackend *blk,
+ BlockDriverState *base,
+ int64_t offset, int64_t bytes,
+ int64_t *pnum, int64_t *map,
+ BlockDriverState **file);
+int coroutine_fn blk_co_is_allocated_above(BlockBackend *blk,
+ BlockDriverState *base,
+ bool include_base, int64_t offset,
+ int64_t bytes, int64_t *pnum);
/*
* "I/O or GS" API functions. These functions can run without
@@ -101,77 +110,77 @@ int coroutine_fn blk_co_copy_range(BlockBackend *blk_in, int64_t off_in,
* the "I/O or GS" API.
*/
-int generated_co_wrapper blk_pread(BlockBackend *blk, int64_t offset,
- int64_t bytes, void *buf,
- BdrvRequestFlags flags);
+int co_wrapper_mixed blk_pread(BlockBackend *blk, int64_t offset,
+ int64_t bytes, void *buf,
+ BdrvRequestFlags flags);
int coroutine_fn blk_co_pread(BlockBackend *blk, int64_t offset, int64_t bytes,
void *buf, BdrvRequestFlags flags);
-int generated_co_wrapper blk_preadv(BlockBackend *blk, int64_t offset,
- int64_t bytes, QEMUIOVector *qiov,
- BdrvRequestFlags flags);
+int co_wrapper_mixed blk_preadv(BlockBackend *blk, int64_t offset,
+ int64_t bytes, QEMUIOVector *qiov,
+ BdrvRequestFlags flags);
int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
int64_t bytes, QEMUIOVector *qiov,
BdrvRequestFlags flags);
-int generated_co_wrapper blk_preadv_part(BlockBackend *blk, int64_t offset,
- int64_t bytes, QEMUIOVector *qiov,
- size_t qiov_offset,
- BdrvRequestFlags flags);
+int co_wrapper_mixed blk_preadv_part(BlockBackend *blk, int64_t offset,
+ int64_t bytes, QEMUIOVector *qiov,
+ size_t qiov_offset,
+ BdrvRequestFlags flags);
int coroutine_fn blk_co_preadv_part(BlockBackend *blk, int64_t offset,
int64_t bytes, QEMUIOVector *qiov,
size_t qiov_offset, BdrvRequestFlags flags);
-int generated_co_wrapper blk_pwrite(BlockBackend *blk, int64_t offset,
- int64_t bytes, const void *buf,
- BdrvRequestFlags flags);
+int co_wrapper_mixed blk_pwrite(BlockBackend *blk, int64_t offset,
+ int64_t bytes, const void *buf,
+ BdrvRequestFlags flags);
int coroutine_fn blk_co_pwrite(BlockBackend *blk, int64_t offset, int64_t bytes,
const void *buf, BdrvRequestFlags flags);
-int generated_co_wrapper blk_pwritev(BlockBackend *blk, int64_t offset,
- int64_t bytes, QEMUIOVector *qiov,
- BdrvRequestFlags flags);
+int co_wrapper_mixed blk_pwritev(BlockBackend *blk, int64_t offset,
+ int64_t bytes, QEMUIOVector *qiov,
+ BdrvRequestFlags flags);
int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
int64_t bytes, QEMUIOVector *qiov,
BdrvRequestFlags flags);
-int generated_co_wrapper blk_pwritev_part(BlockBackend *blk, int64_t offset,
- int64_t bytes, QEMUIOVector *qiov,
- size_t qiov_offset,
- BdrvRequestFlags flags);
+int co_wrapper_mixed blk_pwritev_part(BlockBackend *blk, int64_t offset,
+ int64_t bytes, QEMUIOVector *qiov,
+ size_t qiov_offset,
+ BdrvRequestFlags flags);
int coroutine_fn blk_co_pwritev_part(BlockBackend *blk, int64_t offset,
int64_t bytes,
QEMUIOVector *qiov, size_t qiov_offset,
BdrvRequestFlags flags);
-int generated_co_wrapper blk_pwrite_compressed(BlockBackend *blk,
- int64_t offset, int64_t bytes,
- const void *buf);
+int co_wrapper_mixed blk_pwrite_compressed(BlockBackend *blk,
+ int64_t offset, int64_t bytes,
+ const void *buf);
int coroutine_fn blk_co_pwrite_compressed(BlockBackend *blk, int64_t offset,
int64_t bytes, const void *buf);
-int generated_co_wrapper blk_pwrite_zeroes(BlockBackend *blk, int64_t offset,
- int64_t bytes,
- BdrvRequestFlags flags);
+int co_wrapper_mixed blk_pwrite_zeroes(BlockBackend *blk, int64_t offset,
+ int64_t bytes,
+ BdrvRequestFlags flags);
int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset,
int64_t bytes, BdrvRequestFlags flags);
-int generated_co_wrapper blk_pdiscard(BlockBackend *blk, int64_t offset,
- int64_t bytes);
+int co_wrapper_mixed blk_pdiscard(BlockBackend *blk, int64_t offset,
+ int64_t bytes);
int coroutine_fn blk_co_pdiscard(BlockBackend *blk, int64_t offset,
int64_t bytes);
-int generated_co_wrapper blk_flush(BlockBackend *blk);
+int co_wrapper_mixed blk_flush(BlockBackend *blk);
int coroutine_fn blk_co_flush(BlockBackend *blk);
-int generated_co_wrapper blk_ioctl(BlockBackend *blk, unsigned long int req,
- void *buf);
+int co_wrapper_mixed blk_ioctl(BlockBackend *blk, unsigned long int req,
+ void *buf);
int coroutine_fn blk_co_ioctl(BlockBackend *blk, unsigned long int req,
void *buf);
-int generated_co_wrapper blk_truncate(BlockBackend *blk, int64_t offset,
- bool exact, PreallocMode prealloc,
- BdrvRequestFlags flags, Error **errp);
+int co_wrapper_mixed blk_truncate(BlockBackend *blk, int64_t offset,
+ bool exact, PreallocMode prealloc,
+ BdrvRequestFlags flags, Error **errp);
int coroutine_fn blk_co_truncate(BlockBackend *blk, int64_t offset, bool exact,
PreallocMode prealloc, BdrvRequestFlags flags,
Error **errp);
diff --git a/nbd/server.c b/nbd/server.c
index 0570596312..67ed333578 100644
--- a/nbd/server.c
+++ b/nbd/server.c
@@ -1988,7 +1988,7 @@ static int coroutine_fn nbd_co_send_structured_error(NBDClient *client,
}
/* Do a sparse read and send the structured reply to the client.
- * Returns -errno if sending fails. bdrv_block_status_above() failure is
+ * Returns -errno if sending fails. blk_co_block_status_above() failure is
* reported to the client, at which point this function succeeds.
*/
static int coroutine_fn nbd_co_send_sparse_read(NBDClient *client,
@@ -2004,10 +2004,10 @@ static int coroutine_fn nbd_co_send_sparse_read(NBDClient *client,
while (progress < size) {
int64_t pnum;
- int status = bdrv_block_status_above(blk_bs(exp->common.blk), NULL,
- offset + progress,
- size - progress, &pnum, NULL,
- NULL);
+ int status = blk_co_block_status_above(exp->common.blk, NULL,
+ offset + progress,
+ size - progress, &pnum, NULL,
+ NULL);
bool final;
if (status < 0) {
@@ -2138,14 +2138,15 @@ static int nbd_extent_array_add(NBDExtentArray *ea,
return 0;
}
-static int blockstatus_to_extents(BlockDriverState *bs, uint64_t offset,
- uint64_t bytes, NBDExtentArray *ea)
+static int coroutine_fn blockstatus_to_extents(BlockBackend *blk,
+ uint64_t offset, uint64_t bytes,
+ NBDExtentArray *ea)
{
while (bytes) {
uint32_t flags;
int64_t num;
- int ret = bdrv_block_status_above(bs, NULL, offset, bytes, &num,
- NULL, NULL);
+ int ret = blk_co_block_status_above(blk, NULL, offset, bytes, &num,
+ NULL, NULL);
if (ret < 0) {
return ret;
@@ -2165,13 +2166,14 @@ static int blockstatus_to_extents(BlockDriverState *bs, uint64_t offset,
return 0;
}
-static int blockalloc_to_extents(BlockDriverState *bs, uint64_t offset,
- uint64_t bytes, NBDExtentArray *ea)
+static int coroutine_fn blockalloc_to_extents(BlockBackend *blk,
+ uint64_t offset, uint64_t bytes,
+ NBDExtentArray *ea)
{
while (bytes) {
int64_t num;
- int ret = bdrv_is_allocated_above(bs, NULL, false, offset, bytes,
- &num);
+ int ret = blk_co_is_allocated_above(blk, NULL, false, offset, bytes,
+ &num);
if (ret < 0) {
return ret;
@@ -2217,20 +2219,21 @@ static int nbd_co_send_extents(NBDClient *client, uint64_t handle,
}
/* Get block status from the exported device and send it to the client */
-static int nbd_co_send_block_status(NBDClient *client, uint64_t handle,
- BlockDriverState *bs, uint64_t offset,
- uint32_t length, bool dont_fragment,
- bool last, uint32_t context_id,
- Error **errp)
+static int
+coroutine_fn nbd_co_send_block_status(NBDClient *client, uint64_t handle,
+ BlockBackend *blk, uint64_t offset,
+ uint32_t length, bool dont_fragment,
+ bool last, uint32_t context_id,
+ Error **errp)
{
int ret;
unsigned int nb_extents = dont_fragment ? 1 : NBD_MAX_BLOCK_STATUS_EXTENTS;
g_autoptr(NBDExtentArray) ea = nbd_extent_array_new(nb_extents);
if (context_id == NBD_META_ID_BASE_ALLOCATION) {
- ret = blockstatus_to_extents(bs, offset, length, ea);
+ ret = blockstatus_to_extents(blk, offset, length, ea);
} else {
- ret = blockalloc_to_extents(bs, offset, length, ea);
+ ret = blockalloc_to_extents(blk, offset, length, ea);
}
if (ret < 0) {
return nbd_co_send_structured_error(
@@ -2557,7 +2560,7 @@ static coroutine_fn int nbd_handle_request(NBDClient *client,
if (client->export_meta.base_allocation) {
ret = nbd_co_send_block_status(client, request->handle,
- blk_bs(exp->common.blk),
+ exp->common.blk,
request->from,
request->len, dont_fragment,
!--contexts_remaining,
@@ -2570,7 +2573,7 @@ static coroutine_fn int nbd_handle_request(NBDClient *client,
if (client->export_meta.allocation_depth) {
ret = nbd_co_send_block_status(client, request->handle,
- blk_bs(exp->common.blk),
+ exp->common.blk,
request->from, request->len,
dont_fragment,
!--contexts_remaining,
diff --git a/scripts/block-coroutine-wrapper.py b/scripts/block-coroutine-wrapper.py
index 08be813407..6e087fa0b7 100644
--- a/scripts/block-coroutine-wrapper.py
+++ b/scripts/block-coroutine-wrapper.py
@@ -2,7 +2,7 @@
"""Generate coroutine wrappers for block subsystem.
The program parses one or several concatenated c files from stdin,
-searches for functions with the 'generated_co_wrapper' specifier
+searches for functions with the 'co_wrapper' specifier
and generates corresponding wrappers on stdout.
Usage: block-coroutine-wrapper.py generated-file.c FILE.[ch]...
@@ -62,10 +62,28 @@ class ParamDecl:
class FuncDecl:
- def __init__(self, return_type: str, name: str, args: str) -> None:
+ def __init__(self, return_type: str, name: str, args: str,
+ variant: str) -> None:
self.return_type = return_type.strip()
self.name = name.strip()
+ self.struct_name = snake_to_camel(self.name)
self.args = [ParamDecl(arg.strip()) for arg in args.split(',')]
+ self.create_only_co = 'mixed' not in variant
+ self.graph_rdlock = 'bdrv_rdlock' in variant
+
+ subsystem, subname = self.name.split('_', 1)
+ self.co_name = f'{subsystem}_co_{subname}'
+
+ t = self.args[0].type
+ if t == 'BlockDriverState *':
+ ctx = 'bdrv_get_aio_context(bs)'
+ elif t == 'BdrvChild *':
+ ctx = 'bdrv_get_aio_context(child->bs)'
+ elif t == 'BlockBackend *':
+ ctx = 'blk_get_aio_context(blk)'
+ else:
+ ctx = 'qemu_get_aio_context()'
+ self.ctx = ctx
def gen_list(self, format: str) -> str:
return ', '.join(format.format_map(arg.__dict__) for arg in self.args)
@@ -74,17 +92,20 @@ class FuncDecl:
return '\n'.join(format.format_map(arg.__dict__) for arg in self.args)
-# Match wrappers declared with a generated_co_wrapper mark
-func_decl_re = re.compile(r'^int\s*generated_co_wrapper\s*'
+# Match wrappers declared with a co_wrapper mark
+func_decl_re = re.compile(r'^(?P<return_type>[a-zA-Z][a-zA-Z0-9_]* [\*]?)'
+ r'\s*co_wrapper'
+ r'(?P<variant>(_[a-z][a-z0-9_]*)?)\s*'
r'(?P<wrapper_name>[a-z][a-z0-9_]*)'
r'\((?P<args>[^)]*)\);$', re.MULTILINE)
def func_decl_iter(text: str) -> Iterator:
for m in func_decl_re.finditer(text):
- yield FuncDecl(return_type='int',
+ yield FuncDecl(return_type=m.group('return_type'),
name=m.group('wrapper_name'),
- args=m.group('args'))
+ args=m.group('args'),
+ variant=m.group('variant'))
def snake_to_camel(func_name: str) -> str:
@@ -97,24 +118,75 @@ def snake_to_camel(func_name: str) -> str:
return ''.join(words)
+def create_mixed_wrapper(func: FuncDecl) -> str:
+ """
+ Checks if we are already in coroutine
+ """
+ name = func.co_name
+ struct_name = func.struct_name
+ graph_assume_lock = 'assume_graph_lock();' if func.graph_rdlock else ''
+
+ return f"""\
+{func.return_type} {func.name}({ func.gen_list('{decl}') })
+{{
+ if (qemu_in_coroutine()) {{
+ {graph_assume_lock}
+ return {name}({ func.gen_list('{name}') });
+ }} else {{
+ {struct_name} s = {{
+ .poll_state.ctx = {func.ctx},
+ .poll_state.in_progress = true,
+
+{ func.gen_block(' .{name} = {name},') }
+ }};
+
+ s.poll_state.co = qemu_coroutine_create({name}_entry, &s);
+
+ bdrv_poll_co(&s.poll_state);
+ return s.ret;
+ }}
+}}"""
+
+
+def create_co_wrapper(func: FuncDecl) -> str:
+ """
+ Assumes we are not in coroutine, and creates one
+ """
+ name = func.co_name
+ struct_name = func.struct_name
+ return f"""\
+{func.return_type} {func.name}({ func.gen_list('{decl}') })
+{{
+ {struct_name} s = {{
+ .poll_state.ctx = {func.ctx},
+ .poll_state.in_progress = true,
+
+{ func.gen_block(' .{name} = {name},') }
+ }};
+ assert(!qemu_in_coroutine());
+
+ s.poll_state.co = qemu_coroutine_create({name}_entry, &s);
+
+ bdrv_poll_co(&s.poll_state);
+ return s.ret;
+}}"""
+
+
def gen_wrapper(func: FuncDecl) -> str:
assert not '_co_' in func.name
- assert func.return_type == 'int'
- assert func.args[0].type in ['BlockDriverState *', 'BdrvChild *',
- 'BlockBackend *']
- subsystem, subname = func.name.split('_', 1)
+ name = func.co_name
+ struct_name = func.struct_name
- name = f'{subsystem}_co_{subname}'
+ graph_lock=''
+ graph_unlock=''
+ if func.graph_rdlock:
+ graph_lock=' bdrv_graph_co_rdlock();'
+ graph_unlock=' bdrv_graph_co_rdunlock();'
- t = func.args[0].type
- if t == 'BlockDriverState *':
- bs = 'bs'
- elif t == 'BdrvChild *':
- bs = 'child->bs'
- else:
- bs = 'blk_bs(blk)'
- struct_name = snake_to_camel(name)
+ creation_function = create_mixed_wrapper
+ if func.create_only_co:
+ creation_function = create_co_wrapper
return f"""\
/*
@@ -123,6 +195,7 @@ def gen_wrapper(func: FuncDecl) -> str:
typedef struct {struct_name} {{
BdrvPollCo poll_state;
+ {func.return_type} ret;
{ func.gen_block(' {decl};') }
}} {struct_name};
@@ -130,29 +203,15 @@ static void coroutine_fn {name}_entry(void *opaque)
{{
{struct_name} *s = opaque;
- s->poll_state.ret = {name}({ func.gen_list('s->{name}') });
+{graph_lock}
+ s->ret = {name}({ func.gen_list('s->{name}') });
+{graph_unlock}
s->poll_state.in_progress = false;
aio_wait_kick();
}}
-int {func.name}({ func.gen_list('{decl}') })
-{{
- if (qemu_in_coroutine()) {{
- return {name}({ func.gen_list('{name}') });
- }} else {{
- {struct_name} s = {{
- .poll_state.bs = {bs},
- .poll_state.in_progress = true,
-
-{ func.gen_block(' .{name} = {name},') }
- }};
-
- s.poll_state.co = qemu_coroutine_create({name}_entry, &s);
-
- return bdrv_poll_co(&s.poll_state);
- }}
-}}"""
+{creation_function(func)}"""
def gen_wrappers(input_code: str) -> str:
diff --git a/stubs/graph-lock.c b/stubs/graph-lock.c
new file mode 100644
index 0000000000..177aa0a8ba
--- /dev/null
+++ b/stubs/graph-lock.c
@@ -0,0 +1,10 @@
+#include "qemu/osdep.h"
+#include "block/graph-lock.h"
+
+void register_aiocontext(AioContext *ctx)
+{
+}
+
+void unregister_aiocontext(AioContext *ctx)
+{
+}
diff --git a/stubs/meson.build b/stubs/meson.build
index c96a74f095..981585cbdf 100644
--- a/stubs/meson.build
+++ b/stubs/meson.build
@@ -13,6 +13,7 @@ stub_ss.add(files('error-printf.c'))
stub_ss.add(files('fdset.c'))
stub_ss.add(files('gdbstub.c'))
stub_ss.add(files('get-vm-name.c'))
+stub_ss.add(files('graph-lock.c'))
if linux_io_uring.found()
stub_ss.add(files('io_uring.c'))
endif
diff --git a/tests/unit/test-bdrv-drain.c b/tests/unit/test-bdrv-drain.c
index 09dc4a4891..8cedea4959 100644
--- a/tests/unit/test-bdrv-drain.c
+++ b/tests/unit/test-bdrv-drain.c
@@ -38,16 +38,26 @@ typedef struct BDRVTestState {
bool sleep_in_drain_begin;
} BDRVTestState;
-static void coroutine_fn bdrv_test_co_drain_begin(BlockDriverState *bs)
+static void coroutine_fn sleep_in_drain_begin(void *opaque)
+{
+ BlockDriverState *bs = opaque;
+
+ qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 100000);
+ bdrv_dec_in_flight(bs);
+}
+
+static void bdrv_test_drain_begin(BlockDriverState *bs)
{
BDRVTestState *s = bs->opaque;
s->drain_count++;
if (s->sleep_in_drain_begin) {
- qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 100000);
+ Coroutine *co = qemu_coroutine_create(sleep_in_drain_begin, bs);
+ bdrv_inc_in_flight(bs);
+ aio_co_enter(bdrv_get_aio_context(bs), co);
}
}
-static void coroutine_fn bdrv_test_co_drain_end(BlockDriverState *bs)
+static void bdrv_test_drain_end(BlockDriverState *bs)
{
BDRVTestState *s = bs->opaque;
s->drain_count--;
@@ -101,8 +111,8 @@ static BlockDriver bdrv_test = {
.bdrv_close = bdrv_test_close,
.bdrv_co_preadv = bdrv_test_co_preadv,
- .bdrv_co_drain_begin = bdrv_test_co_drain_begin,
- .bdrv_co_drain_end = bdrv_test_co_drain_end,
+ .bdrv_drain_begin = bdrv_test_drain_begin,
+ .bdrv_drain_end = bdrv_test_drain_end,
.bdrv_child_perm = bdrv_default_perms,
@@ -146,7 +156,6 @@ static void call_in_coroutine(void (*entry)(void))
enum drain_type {
BDRV_DRAIN_ALL,
BDRV_DRAIN,
- BDRV_SUBTREE_DRAIN,
DRAIN_TYPE_MAX,
};
@@ -155,7 +164,6 @@ static void do_drain_begin(enum drain_type drain_type, BlockDriverState *bs)
switch (drain_type) {
case BDRV_DRAIN_ALL: bdrv_drain_all_begin(); break;
case BDRV_DRAIN: bdrv_drained_begin(bs); break;
- case BDRV_SUBTREE_DRAIN: bdrv_subtree_drained_begin(bs); break;
default: g_assert_not_reached();
}
}
@@ -165,7 +173,6 @@ static void do_drain_end(enum drain_type drain_type, BlockDriverState *bs)
switch (drain_type) {
case BDRV_DRAIN_ALL: bdrv_drain_all_end(); break;
case BDRV_DRAIN: bdrv_drained_end(bs); break;
- case BDRV_SUBTREE_DRAIN: bdrv_subtree_drained_end(bs); break;
default: g_assert_not_reached();
}
}
@@ -261,11 +268,6 @@ static void test_drv_cb_drain(void)
test_drv_cb_common(BDRV_DRAIN, false);
}
-static void test_drv_cb_drain_subtree(void)
-{
- test_drv_cb_common(BDRV_SUBTREE_DRAIN, true);
-}
-
static void test_drv_cb_co_drain_all(void)
{
call_in_coroutine(test_drv_cb_drain_all);
@@ -276,11 +278,6 @@ static void test_drv_cb_co_drain(void)
call_in_coroutine(test_drv_cb_drain);
}
-static void test_drv_cb_co_drain_subtree(void)
-{
- call_in_coroutine(test_drv_cb_drain_subtree);
-}
-
static void test_quiesce_common(enum drain_type drain_type, bool recursive)
{
BlockBackend *blk;
@@ -299,7 +296,11 @@ static void test_quiesce_common(enum drain_type drain_type, bool recursive)
do_drain_begin(drain_type, bs);
- g_assert_cmpint(bs->quiesce_counter, ==, 1);
+ if (drain_type == BDRV_DRAIN_ALL) {
+ g_assert_cmpint(bs->quiesce_counter, ==, 2);
+ } else {
+ g_assert_cmpint(bs->quiesce_counter, ==, 1);
+ }
g_assert_cmpint(backing->quiesce_counter, ==, !!recursive);
do_drain_end(drain_type, bs);
@@ -322,11 +323,6 @@ static void test_quiesce_drain(void)
test_quiesce_common(BDRV_DRAIN, false);
}
-static void test_quiesce_drain_subtree(void)
-{
- test_quiesce_common(BDRV_SUBTREE_DRAIN, true);
-}
-
static void test_quiesce_co_drain_all(void)
{
call_in_coroutine(test_quiesce_drain_all);
@@ -337,11 +333,6 @@ static void test_quiesce_co_drain(void)
call_in_coroutine(test_quiesce_drain);
}
-static void test_quiesce_co_drain_subtree(void)
-{
- call_in_coroutine(test_quiesce_drain_subtree);
-}
-
static void test_nested(void)
{
BlockBackend *blk;
@@ -361,8 +352,8 @@ static void test_nested(void)
for (outer = 0; outer < DRAIN_TYPE_MAX; outer++) {
for (inner = 0; inner < DRAIN_TYPE_MAX; inner++) {
- int backing_quiesce = (outer != BDRV_DRAIN) +
- (inner != BDRV_DRAIN);
+ int backing_quiesce = (outer == BDRV_DRAIN_ALL) +
+ (inner == BDRV_DRAIN_ALL);
g_assert_cmpint(bs->quiesce_counter, ==, 0);
g_assert_cmpint(backing->quiesce_counter, ==, 0);
@@ -372,10 +363,10 @@ static void test_nested(void)
do_drain_begin(outer, bs);
do_drain_begin(inner, bs);
- g_assert_cmpint(bs->quiesce_counter, ==, 2);
+ g_assert_cmpint(bs->quiesce_counter, ==, 2 + !!backing_quiesce);
g_assert_cmpint(backing->quiesce_counter, ==, backing_quiesce);
- g_assert_cmpint(s->drain_count, ==, 2);
- g_assert_cmpint(backing_s->drain_count, ==, backing_quiesce);
+ g_assert_cmpint(s->drain_count, ==, 1);
+ g_assert_cmpint(backing_s->drain_count, ==, !!backing_quiesce);
do_drain_end(inner, bs);
do_drain_end(outer, bs);
@@ -392,158 +383,6 @@ static void test_nested(void)
blk_unref(blk);
}
-static void test_multiparent(void)
-{
- BlockBackend *blk_a, *blk_b;
- BlockDriverState *bs_a, *bs_b, *backing;
- BDRVTestState *a_s, *b_s, *backing_s;
-
- blk_a = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
- bs_a = bdrv_new_open_driver(&bdrv_test, "test-node-a", BDRV_O_RDWR,
- &error_abort);
- a_s = bs_a->opaque;
- blk_insert_bs(blk_a, bs_a, &error_abort);
-
- blk_b = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
- bs_b = bdrv_new_open_driver(&bdrv_test, "test-node-b", BDRV_O_RDWR,
- &error_abort);
- b_s = bs_b->opaque;
- blk_insert_bs(blk_b, bs_b, &error_abort);
-
- backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
- backing_s = backing->opaque;
- bdrv_set_backing_hd(bs_a, backing, &error_abort);
- bdrv_set_backing_hd(bs_b, backing, &error_abort);
-
- g_assert_cmpint(bs_a->quiesce_counter, ==, 0);
- g_assert_cmpint(bs_b->quiesce_counter, ==, 0);
- g_assert_cmpint(backing->quiesce_counter, ==, 0);
- g_assert_cmpint(a_s->drain_count, ==, 0);
- g_assert_cmpint(b_s->drain_count, ==, 0);
- g_assert_cmpint(backing_s->drain_count, ==, 0);
-
- do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a);
-
- g_assert_cmpint(bs_a->quiesce_counter, ==, 1);
- g_assert_cmpint(bs_b->quiesce_counter, ==, 1);
- g_assert_cmpint(backing->quiesce_counter, ==, 1);
- g_assert_cmpint(a_s->drain_count, ==, 1);
- g_assert_cmpint(b_s->drain_count, ==, 1);
- g_assert_cmpint(backing_s->drain_count, ==, 1);
-
- do_drain_begin(BDRV_SUBTREE_DRAIN, bs_b);
-
- g_assert_cmpint(bs_a->quiesce_counter, ==, 2);
- g_assert_cmpint(bs_b->quiesce_counter, ==, 2);
- g_assert_cmpint(backing->quiesce_counter, ==, 2);
- g_assert_cmpint(a_s->drain_count, ==, 2);
- g_assert_cmpint(b_s->drain_count, ==, 2);
- g_assert_cmpint(backing_s->drain_count, ==, 2);
-
- do_drain_end(BDRV_SUBTREE_DRAIN, bs_b);
-
- g_assert_cmpint(bs_a->quiesce_counter, ==, 1);
- g_assert_cmpint(bs_b->quiesce_counter, ==, 1);
- g_assert_cmpint(backing->quiesce_counter, ==, 1);
- g_assert_cmpint(a_s->drain_count, ==, 1);
- g_assert_cmpint(b_s->drain_count, ==, 1);
- g_assert_cmpint(backing_s->drain_count, ==, 1);
-
- do_drain_end(BDRV_SUBTREE_DRAIN, bs_a);
-
- g_assert_cmpint(bs_a->quiesce_counter, ==, 0);
- g_assert_cmpint(bs_b->quiesce_counter, ==, 0);
- g_assert_cmpint(backing->quiesce_counter, ==, 0);
- g_assert_cmpint(a_s->drain_count, ==, 0);
- g_assert_cmpint(b_s->drain_count, ==, 0);
- g_assert_cmpint(backing_s->drain_count, ==, 0);
-
- bdrv_unref(backing);
- bdrv_unref(bs_a);
- bdrv_unref(bs_b);
- blk_unref(blk_a);
- blk_unref(blk_b);
-}
-
-static void test_graph_change_drain_subtree(void)
-{
- BlockBackend *blk_a, *blk_b;
- BlockDriverState *bs_a, *bs_b, *backing;
- BDRVTestState *a_s, *b_s, *backing_s;
-
- blk_a = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
- bs_a = bdrv_new_open_driver(&bdrv_test, "test-node-a", BDRV_O_RDWR,
- &error_abort);
- a_s = bs_a->opaque;
- blk_insert_bs(blk_a, bs_a, &error_abort);
-
- blk_b = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
- bs_b = bdrv_new_open_driver(&bdrv_test, "test-node-b", BDRV_O_RDWR,
- &error_abort);
- b_s = bs_b->opaque;
- blk_insert_bs(blk_b, bs_b, &error_abort);
-
- backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
- backing_s = backing->opaque;
- bdrv_set_backing_hd(bs_a, backing, &error_abort);
-
- g_assert_cmpint(bs_a->quiesce_counter, ==, 0);
- g_assert_cmpint(bs_b->quiesce_counter, ==, 0);
- g_assert_cmpint(backing->quiesce_counter, ==, 0);
- g_assert_cmpint(a_s->drain_count, ==, 0);
- g_assert_cmpint(b_s->drain_count, ==, 0);
- g_assert_cmpint(backing_s->drain_count, ==, 0);
-
- do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a);
- do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a);
- do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a);
- do_drain_begin(BDRV_SUBTREE_DRAIN, bs_b);
- do_drain_begin(BDRV_SUBTREE_DRAIN, bs_b);
-
- bdrv_set_backing_hd(bs_b, backing, &error_abort);
- g_assert_cmpint(bs_a->quiesce_counter, ==, 5);
- g_assert_cmpint(bs_b->quiesce_counter, ==, 5);
- g_assert_cmpint(backing->quiesce_counter, ==, 5);
- g_assert_cmpint(a_s->drain_count, ==, 5);
- g_assert_cmpint(b_s->drain_count, ==, 5);
- g_assert_cmpint(backing_s->drain_count, ==, 5);
-
- bdrv_set_backing_hd(bs_b, NULL, &error_abort);
- g_assert_cmpint(bs_a->quiesce_counter, ==, 3);
- g_assert_cmpint(bs_b->quiesce_counter, ==, 2);
- g_assert_cmpint(backing->quiesce_counter, ==, 3);
- g_assert_cmpint(a_s->drain_count, ==, 3);
- g_assert_cmpint(b_s->drain_count, ==, 2);
- g_assert_cmpint(backing_s->drain_count, ==, 3);
-
- bdrv_set_backing_hd(bs_b, backing, &error_abort);
- g_assert_cmpint(bs_a->quiesce_counter, ==, 5);
- g_assert_cmpint(bs_b->quiesce_counter, ==, 5);
- g_assert_cmpint(backing->quiesce_counter, ==, 5);
- g_assert_cmpint(a_s->drain_count, ==, 5);
- g_assert_cmpint(b_s->drain_count, ==, 5);
- g_assert_cmpint(backing_s->drain_count, ==, 5);
-
- do_drain_end(BDRV_SUBTREE_DRAIN, bs_b);
- do_drain_end(BDRV_SUBTREE_DRAIN, bs_b);
- do_drain_end(BDRV_SUBTREE_DRAIN, bs_a);
- do_drain_end(BDRV_SUBTREE_DRAIN, bs_a);
- do_drain_end(BDRV_SUBTREE_DRAIN, bs_a);
-
- g_assert_cmpint(bs_a->quiesce_counter, ==, 0);
- g_assert_cmpint(bs_b->quiesce_counter, ==, 0);
- g_assert_cmpint(backing->quiesce_counter, ==, 0);
- g_assert_cmpint(a_s->drain_count, ==, 0);
- g_assert_cmpint(b_s->drain_count, ==, 0);
- g_assert_cmpint(backing_s->drain_count, ==, 0);
-
- bdrv_unref(backing);
- bdrv_unref(bs_a);
- bdrv_unref(bs_b);
- blk_unref(blk_a);
- blk_unref(blk_b);
-}
-
static void test_graph_change_drain_all(void)
{
BlockBackend *blk_a, *blk_b;
@@ -763,12 +602,6 @@ static void test_iothread_drain(void)
test_iothread_common(BDRV_DRAIN, 1);
}
-static void test_iothread_drain_subtree(void)
-{
- test_iothread_common(BDRV_SUBTREE_DRAIN, 0);
- test_iothread_common(BDRV_SUBTREE_DRAIN, 1);
-}
-
typedef struct TestBlockJob {
BlockJob common;
@@ -853,7 +686,6 @@ enum test_job_result {
enum test_job_drain_node {
TEST_JOB_DRAIN_SRC,
TEST_JOB_DRAIN_SRC_CHILD,
- TEST_JOB_DRAIN_SRC_PARENT,
};
static void test_blockjob_common_drain_node(enum drain_type drain_type,
@@ -891,9 +723,6 @@ static void test_blockjob_common_drain_node(enum drain_type drain_type,
case TEST_JOB_DRAIN_SRC_CHILD:
drain_bs = src_backing;
break;
- case TEST_JOB_DRAIN_SRC_PARENT:
- drain_bs = src_overlay;
- break;
default:
g_assert_not_reached();
}
@@ -1045,10 +874,6 @@ static void test_blockjob_common(enum drain_type drain_type, bool use_iothread,
TEST_JOB_DRAIN_SRC);
test_blockjob_common_drain_node(drain_type, use_iothread, result,
TEST_JOB_DRAIN_SRC_CHILD);
- if (drain_type == BDRV_SUBTREE_DRAIN) {
- test_blockjob_common_drain_node(drain_type, use_iothread, result,
- TEST_JOB_DRAIN_SRC_PARENT);
- }
}
static void test_blockjob_drain_all(void)
@@ -1061,11 +886,6 @@ static void test_blockjob_drain(void)
test_blockjob_common(BDRV_DRAIN, false, TEST_JOB_SUCCESS);
}
-static void test_blockjob_drain_subtree(void)
-{
- test_blockjob_common(BDRV_SUBTREE_DRAIN, false, TEST_JOB_SUCCESS);
-}
-
static void test_blockjob_error_drain_all(void)
{
test_blockjob_common(BDRV_DRAIN_ALL, false, TEST_JOB_FAIL_RUN);
@@ -1078,12 +898,6 @@ static void test_blockjob_error_drain(void)
test_blockjob_common(BDRV_DRAIN, false, TEST_JOB_FAIL_PREPARE);
}
-static void test_blockjob_error_drain_subtree(void)
-{
- test_blockjob_common(BDRV_SUBTREE_DRAIN, false, TEST_JOB_FAIL_RUN);
- test_blockjob_common(BDRV_SUBTREE_DRAIN, false, TEST_JOB_FAIL_PREPARE);
-}
-
static void test_blockjob_iothread_drain_all(void)
{
test_blockjob_common(BDRV_DRAIN_ALL, true, TEST_JOB_SUCCESS);
@@ -1094,11 +908,6 @@ static void test_blockjob_iothread_drain(void)
test_blockjob_common(BDRV_DRAIN, true, TEST_JOB_SUCCESS);
}
-static void test_blockjob_iothread_drain_subtree(void)
-{
- test_blockjob_common(BDRV_SUBTREE_DRAIN, true, TEST_JOB_SUCCESS);
-}
-
static void test_blockjob_iothread_error_drain_all(void)
{
test_blockjob_common(BDRV_DRAIN_ALL, true, TEST_JOB_FAIL_RUN);
@@ -1111,12 +920,6 @@ static void test_blockjob_iothread_error_drain(void)
test_blockjob_common(BDRV_DRAIN, true, TEST_JOB_FAIL_PREPARE);
}
-static void test_blockjob_iothread_error_drain_subtree(void)
-{
- test_blockjob_common(BDRV_SUBTREE_DRAIN, true, TEST_JOB_FAIL_RUN);
- test_blockjob_common(BDRV_SUBTREE_DRAIN, true, TEST_JOB_FAIL_PREPARE);
-}
-
typedef struct BDRVTestTopState {
BdrvChild *wait_child;
@@ -1263,14 +1066,6 @@ static void do_test_delete_by_drain(bool detach_instead_of_delete,
bdrv_drain(child_bs);
bdrv_unref(child_bs);
break;
- case BDRV_SUBTREE_DRAIN:
- /* Would have to ref/unref bs here for !detach_instead_of_delete, but
- * then the whole test becomes pointless because the graph changes
- * don't occur during the drain any more. */
- assert(detach_instead_of_delete);
- bdrv_subtree_drained_begin(bs);
- bdrv_subtree_drained_end(bs);
- break;
case BDRV_DRAIN_ALL:
bdrv_drain_all_begin();
bdrv_drain_all_end();
@@ -1305,11 +1100,6 @@ static void test_detach_by_drain(void)
do_test_delete_by_drain(true, BDRV_DRAIN);
}
-static void test_detach_by_drain_subtree(void)
-{
- do_test_delete_by_drain(true, BDRV_SUBTREE_DRAIN);
-}
-
struct detach_by_parent_data {
BlockDriverState *parent_b;
@@ -1317,6 +1107,7 @@ struct detach_by_parent_data {
BlockDriverState *c;
BdrvChild *child_c;
bool by_parent_cb;
+ bool detach_on_drain;
};
static struct detach_by_parent_data detach_by_parent_data;
@@ -1324,6 +1115,7 @@ static void detach_indirect_bh(void *opaque)
{
struct detach_by_parent_data *data = opaque;
+ bdrv_dec_in_flight(data->child_b->bs);
bdrv_unref_child(data->parent_b, data->child_b);
bdrv_ref(data->c);
@@ -1338,12 +1130,21 @@ static void detach_by_parent_aio_cb(void *opaque, int ret)
g_assert_cmpint(ret, ==, 0);
if (data->by_parent_cb) {
+ bdrv_inc_in_flight(data->child_b->bs);
detach_indirect_bh(data);
}
}
static void detach_by_driver_cb_drained_begin(BdrvChild *child)
{
+ struct detach_by_parent_data *data = &detach_by_parent_data;
+
+ if (!data->detach_on_drain) {
+ return;
+ }
+ data->detach_on_drain = false;
+
+ bdrv_inc_in_flight(data->child_b->bs);
aio_bh_schedule_oneshot(qemu_get_current_aio_context(),
detach_indirect_bh, &detach_by_parent_data);
child_of_bds.drained_begin(child);
@@ -1384,8 +1185,14 @@ static void test_detach_indirect(bool by_parent_cb)
detach_by_driver_cb_class = child_of_bds;
detach_by_driver_cb_class.drained_begin =
detach_by_driver_cb_drained_begin;
+ detach_by_driver_cb_class.drained_end = NULL;
+ detach_by_driver_cb_class.drained_poll = NULL;
}
+ detach_by_parent_data = (struct detach_by_parent_data) {
+ .detach_on_drain = false,
+ };
+
/* Create all involved nodes */
parent_a = bdrv_new_open_driver(&bdrv_test, "parent-a", BDRV_O_RDWR,
&error_abort);
@@ -1437,12 +1244,16 @@ static void test_detach_indirect(bool by_parent_cb)
.child_b = child_b,
.c = c,
.by_parent_cb = by_parent_cb,
+ .detach_on_drain = true,
};
acb = blk_aio_preadv(blk, 0, &qiov, 0, detach_by_parent_aio_cb, NULL);
g_assert(acb != NULL);
/* Drain and check the expected result */
- bdrv_subtree_drained_begin(parent_b);
+ bdrv_drained_begin(parent_b);
+ bdrv_drained_begin(a);
+ bdrv_drained_begin(b);
+ bdrv_drained_begin(c);
g_assert(detach_by_parent_data.child_c != NULL);
@@ -1457,12 +1268,15 @@ static void test_detach_indirect(bool by_parent_cb)
g_assert(QLIST_NEXT(child_a, next) == NULL);
g_assert_cmpint(parent_a->quiesce_counter, ==, 1);
- g_assert_cmpint(parent_b->quiesce_counter, ==, 1);
+ g_assert_cmpint(parent_b->quiesce_counter, ==, 3);
g_assert_cmpint(a->quiesce_counter, ==, 1);
- g_assert_cmpint(b->quiesce_counter, ==, 0);
+ g_assert_cmpint(b->quiesce_counter, ==, 1);
g_assert_cmpint(c->quiesce_counter, ==, 1);
- bdrv_subtree_drained_end(parent_b);
+ bdrv_drained_end(parent_b);
+ bdrv_drained_end(a);
+ bdrv_drained_end(b);
+ bdrv_drained_end(c);
bdrv_unref(parent_b);
blk_unref(blk);
@@ -1693,6 +1507,7 @@ static void test_blockjob_commit_by_drained_end(void)
bdrv_drained_begin(bs_child);
g_assert(!job_has_completed);
bdrv_drained_end(bs_child);
+ aio_poll(qemu_get_aio_context(), false);
g_assert(job_has_completed);
bdrv_unref(bs_parents[0]);
@@ -1848,6 +1663,7 @@ static void test_drop_intermediate_poll(void)
g_assert(!job_has_completed);
ret = bdrv_drop_intermediate(chain[1], chain[0], NULL);
+ aio_poll(qemu_get_aio_context(), false);
g_assert(ret == 0);
g_assert(job_has_completed);
@@ -1856,6 +1672,7 @@ static void test_drop_intermediate_poll(void)
typedef struct BDRVReplaceTestState {
+ bool setup_completed;
bool was_drained;
bool was_undrained;
bool has_read;
@@ -1916,52 +1733,78 @@ static int coroutine_fn bdrv_replace_test_co_preadv(BlockDriverState *bs,
return 0;
}
+static void coroutine_fn bdrv_replace_test_drain_co(void *opaque)
+{
+ BlockDriverState *bs = opaque;
+ BDRVReplaceTestState *s = bs->opaque;
+
+ /* Keep waking io_co up until it is done */
+ while (s->io_co) {
+ aio_co_wake(s->io_co);
+ s->io_co = NULL;
+ qemu_coroutine_yield();
+ }
+ s->drain_co = NULL;
+ bdrv_dec_in_flight(bs);
+}
+
/**
* If .drain_count is 0, wake up .io_co if there is one; and set
* .was_drained.
* Increment .drain_count.
*/
-static void coroutine_fn bdrv_replace_test_co_drain_begin(BlockDriverState *bs)
+static void bdrv_replace_test_drain_begin(BlockDriverState *bs)
{
BDRVReplaceTestState *s = bs->opaque;
- if (!s->drain_count) {
- /* Keep waking io_co up until it is done */
- s->drain_co = qemu_coroutine_self();
- while (s->io_co) {
- aio_co_wake(s->io_co);
- s->io_co = NULL;
- qemu_coroutine_yield();
- }
- s->drain_co = NULL;
+ if (!s->setup_completed) {
+ return;
+ }
+ if (!s->drain_count) {
+ s->drain_co = qemu_coroutine_create(bdrv_replace_test_drain_co, bs);
+ bdrv_inc_in_flight(bs);
+ aio_co_enter(bdrv_get_aio_context(bs), s->drain_co);
s->was_drained = true;
}
s->drain_count++;
}
+static void coroutine_fn bdrv_replace_test_read_entry(void *opaque)
+{
+ BlockDriverState *bs = opaque;
+ char data;
+ QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, &data, 1);
+ int ret;
+
+ /* Queue a read request post-drain */
+ ret = bdrv_replace_test_co_preadv(bs, 0, 1, &qiov, 0);
+ g_assert(ret >= 0);
+ bdrv_dec_in_flight(bs);
+}
+
/**
* Reduce .drain_count, set .was_undrained once it reaches 0.
* If .drain_count reaches 0 and the node has a backing file, issue a
* read request.
*/
-static void coroutine_fn bdrv_replace_test_co_drain_end(BlockDriverState *bs)
+static void bdrv_replace_test_drain_end(BlockDriverState *bs)
{
BDRVReplaceTestState *s = bs->opaque;
+ if (!s->setup_completed) {
+ return;
+ }
+
g_assert(s->drain_count > 0);
if (!--s->drain_count) {
- int ret;
-
s->was_undrained = true;
if (bs->backing) {
- char data;
- QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, &data, 1);
-
- /* Queue a read request post-drain */
- ret = bdrv_replace_test_co_preadv(bs, 0, 1, &qiov, 0);
- g_assert(ret >= 0);
+ Coroutine *co = qemu_coroutine_create(bdrv_replace_test_read_entry,
+ bs);
+ bdrv_inc_in_flight(bs);
+ aio_co_enter(bdrv_get_aio_context(bs), co);
}
}
}
@@ -1974,8 +1817,8 @@ static BlockDriver bdrv_replace_test = {
.bdrv_close = bdrv_replace_test_close,
.bdrv_co_preadv = bdrv_replace_test_co_preadv,
- .bdrv_co_drain_begin = bdrv_replace_test_co_drain_begin,
- .bdrv_co_drain_end = bdrv_replace_test_co_drain_end,
+ .bdrv_drain_begin = bdrv_replace_test_drain_begin,
+ .bdrv_drain_end = bdrv_replace_test_drain_end,
.bdrv_child_perm = bdrv_default_perms,
};
@@ -2051,6 +1894,7 @@ static void do_test_replace_child_mid_drain(int old_drain_count,
bdrv_ref(old_child_bs);
bdrv_attach_child(parent_bs, old_child_bs, "child", &child_of_bds,
BDRV_CHILD_COW, &error_abort);
+ parent_s->setup_completed = true;
for (i = 0; i < old_drain_count; i++) {
bdrv_drained_begin(old_child_bs);
@@ -2172,70 +2016,47 @@ int main(int argc, char **argv)
g_test_add_func("/bdrv-drain/driver-cb/drain_all", test_drv_cb_drain_all);
g_test_add_func("/bdrv-drain/driver-cb/drain", test_drv_cb_drain);
- g_test_add_func("/bdrv-drain/driver-cb/drain_subtree",
- test_drv_cb_drain_subtree);
g_test_add_func("/bdrv-drain/driver-cb/co/drain_all",
test_drv_cb_co_drain_all);
g_test_add_func("/bdrv-drain/driver-cb/co/drain", test_drv_cb_co_drain);
- g_test_add_func("/bdrv-drain/driver-cb/co/drain_subtree",
- test_drv_cb_co_drain_subtree);
-
g_test_add_func("/bdrv-drain/quiesce/drain_all", test_quiesce_drain_all);
g_test_add_func("/bdrv-drain/quiesce/drain", test_quiesce_drain);
- g_test_add_func("/bdrv-drain/quiesce/drain_subtree",
- test_quiesce_drain_subtree);
g_test_add_func("/bdrv-drain/quiesce/co/drain_all",
test_quiesce_co_drain_all);
g_test_add_func("/bdrv-drain/quiesce/co/drain", test_quiesce_co_drain);
- g_test_add_func("/bdrv-drain/quiesce/co/drain_subtree",
- test_quiesce_co_drain_subtree);
g_test_add_func("/bdrv-drain/nested", test_nested);
- g_test_add_func("/bdrv-drain/multiparent", test_multiparent);
- g_test_add_func("/bdrv-drain/graph-change/drain_subtree",
- test_graph_change_drain_subtree);
g_test_add_func("/bdrv-drain/graph-change/drain_all",
test_graph_change_drain_all);
g_test_add_func("/bdrv-drain/iothread/drain_all", test_iothread_drain_all);
g_test_add_func("/bdrv-drain/iothread/drain", test_iothread_drain);
- g_test_add_func("/bdrv-drain/iothread/drain_subtree",
- test_iothread_drain_subtree);
g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all);
g_test_add_func("/bdrv-drain/blockjob/drain", test_blockjob_drain);
- g_test_add_func("/bdrv-drain/blockjob/drain_subtree",
- test_blockjob_drain_subtree);
g_test_add_func("/bdrv-drain/blockjob/error/drain_all",
test_blockjob_error_drain_all);
g_test_add_func("/bdrv-drain/blockjob/error/drain",
test_blockjob_error_drain);
- g_test_add_func("/bdrv-drain/blockjob/error/drain_subtree",
- test_blockjob_error_drain_subtree);
g_test_add_func("/bdrv-drain/blockjob/iothread/drain_all",
test_blockjob_iothread_drain_all);
g_test_add_func("/bdrv-drain/blockjob/iothread/drain",
test_blockjob_iothread_drain);
- g_test_add_func("/bdrv-drain/blockjob/iothread/drain_subtree",
- test_blockjob_iothread_drain_subtree);
g_test_add_func("/bdrv-drain/blockjob/iothread/error/drain_all",
test_blockjob_iothread_error_drain_all);
g_test_add_func("/bdrv-drain/blockjob/iothread/error/drain",
test_blockjob_iothread_error_drain);
- g_test_add_func("/bdrv-drain/blockjob/iothread/error/drain_subtree",
- test_blockjob_iothread_error_drain_subtree);
g_test_add_func("/bdrv-drain/deletion/drain", test_delete_by_drain);
g_test_add_func("/bdrv-drain/detach/drain_all", test_detach_by_drain_all);
g_test_add_func("/bdrv-drain/detach/drain", test_detach_by_drain);
- g_test_add_func("/bdrv-drain/detach/drain_subtree", test_detach_by_drain_subtree);
g_test_add_func("/bdrv-drain/detach/parent_cb", test_detach_by_parent_cb);
g_test_add_func("/bdrv-drain/detach/driver_cb", test_detach_by_driver_cb);
diff --git a/util/async.c b/util/async.c
index 63434ddae4..14d63b3091 100644
--- a/util/async.c
+++ b/util/async.c
@@ -27,6 +27,7 @@
#include "qapi/error.h"
#include "block/aio.h"
#include "block/thread-pool.h"
+#include "block/graph-lock.h"
#include "qemu/main-loop.h"
#include "qemu/atomic.h"
#include "qemu/rcu_queue.h"
@@ -376,6 +377,7 @@ aio_ctx_finalize(GSource *source)
qemu_rec_mutex_destroy(&ctx->lock);
qemu_lockcnt_destroy(&ctx->list_lock);
timerlistgroup_deinit(&ctx->tlg);
+ unregister_aiocontext(ctx);
aio_context_destroy(ctx);
}
@@ -574,6 +576,8 @@ AioContext *aio_context_new(Error **errp)
ctx->thread_pool_min = 0;
ctx->thread_pool_max = THREAD_POOL_MAX_THREADS_DEFAULT;
+ register_aiocontext(ctx);
+
return ctx;
fail:
g_source_destroy(&ctx->source);