aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRichard Henderson <richard.henderson@linaro.org>2023-06-01 11:47:58 -0700
committerRichard Henderson <richard.henderson@linaro.org>2023-06-01 11:47:58 -0700
commit9eb400cdd7b0940bd696aa01462dd53004ae04e9 (patch)
tree0d2e2fc43a89da7aad2f9b71eb5446dedd484b95
parent19a720b74fde7e859d19f12c66a72e545947a657 (diff)
parent98b126f5e3228a346c774e569e26689943b401dd (diff)
Merge tag 'block-pull-request' of https://gitlab.com/stefanha/qemu into staging
Pull request - Stefano Garzarella's blkio block driver 'fd' parameter - My thread-local blk_io_plug() series # -----BEGIN PGP SIGNATURE----- # # iQEzBAABCAAdFiEEhpWov9P5fNqsNXdanKSrs4Grc8gFAmR4uHoACgkQnKSrs4Gr # c8hFBAgAo+SFrOteYgdELM9s0EWb0AU39MTOyNXW7i5mPZNXrn5J7pfRD/5wvI6l # wl5GNMQ+M5HVYO7CumKWr4M1IpKV5Jin6FN/2h15fWkeg17lBOmNHUF+LctLYQbq # HwtNA4hdw1+SEv8kQLBgiqSJMqWcn80X09emgPMCIwET9zxokRYwVjQJx2alM5bd # SqgitDp5qlHyj5HQPX2orT9KrXYWQdGr8i50bn0S67r1wdqTRMu93wrWdEUUncId # 7otlUaq8cARbRMJzIwDmy/cF24Ynr0wCJb4aHW+trRtf+PNgx1Ki+YOiz+LFyjq7 # t6KOMeignzhz9Uzq8EVG4XW8SHpGkw== # =Ms48 # -----END PGP SIGNATURE----- # gpg: Signature made Thu 01 Jun 2023 08:25:46 AM PDT # gpg: using RSA key 8695A8BFD3F97CDAAC35775A9CA4ABB381AB73C8 # gpg: Good signature from "Stefan Hajnoczi <stefanha@redhat.com>" [full] # gpg: aka "Stefan Hajnoczi <stefanha@gmail.com>" [full] * tag 'block-pull-request' of https://gitlab.com/stefanha/qemu: qapi: add '@fdset' feature for BlockdevOptionsVirtioBlkVhostVdpa block/blkio: use qemu_open() to support fd passing for virtio-blk block: remove bdrv_co_io_plug() API block/linux-aio: convert to blk_io_plug_call() API block/io_uring: convert to blk_io_plug_call() API block/blkio: convert to blk_io_plug_call() API block/nvme: convert to blk_io_plug_call() API block: add blk_io_plug_call() API Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
-rw-r--r--MAINTAINERS1
-rw-r--r--block/blkio.c96
-rw-r--r--block/block-backend.c22
-rw-r--r--block/file-posix.c38
-rw-r--r--block/io.c37
-rw-r--r--block/io_uring.c44
-rw-r--r--block/linux-aio.c41
-rw-r--r--block/meson.build1
-rw-r--r--block/nvme.c44
-rw-r--r--block/plug.c159
-rw-r--r--block/trace-events6
-rw-r--r--hw/block/dataplane/xen-block.c8
-rw-r--r--hw/block/virtio-blk.c4
-rw-r--r--hw/scsi/virtio-scsi.c6
-rw-r--r--include/block/block-io.h3
-rw-r--r--include/block/block_int-common.h11
-rw-r--r--include/block/raw-aio.h14
-rw-r--r--include/sysemu/block-backend-io.h13
-rw-r--r--meson.build4
-rw-r--r--qapi/block-core.json6
20 files changed, 293 insertions, 265 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index 4b025a7b63..89f274f85e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2650,6 +2650,7 @@ F: util/aio-*.c
F: util/aio-*.h
F: util/fdmon-*.c
F: block/io.c
+F: block/plug.c
F: migration/block*
F: include/block/aio.h
F: include/block/aio-wait.h
diff --git a/block/blkio.c b/block/blkio.c
index 72117fa005..527323d625 100644
--- a/block/blkio.c
+++ b/block/blkio.c
@@ -17,6 +17,7 @@
#include "qemu/error-report.h"
#include "qapi/qmp/qdict.h"
#include "qemu/module.h"
+#include "sysemu/block-backend.h"
#include "exec/memory.h" /* for ram_block_discard_disable() */
#include "block/block-io.h"
@@ -320,16 +321,30 @@ static void blkio_detach_aio_context(BlockDriverState *bs)
NULL, NULL, NULL);
}
-/* Call with s->blkio_lock held to submit I/O after enqueuing a new request */
-static void blkio_submit_io(BlockDriverState *bs)
+/*
+ * Called by blk_io_unplug() or immediately if not plugged. Called without
+ * blkio_lock.
+ */
+static void blkio_unplug_fn(void *opaque)
{
- if (qatomic_read(&bs->io_plugged) == 0) {
- BDRVBlkioState *s = bs->opaque;
+ BDRVBlkioState *s = opaque;
+ WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
blkioq_do_io(s->blkioq, NULL, 0, 0, NULL);
}
}
+/*
+ * Schedule I/O submission after enqueuing a new request. Called without
+ * blkio_lock.
+ */
+static void blkio_submit_io(BlockDriverState *bs)
+{
+ BDRVBlkioState *s = bs->opaque;
+
+ blk_io_plug_call(blkio_unplug_fn, s);
+}
+
static int coroutine_fn
blkio_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
{
@@ -340,9 +355,9 @@ blkio_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
blkioq_discard(s->blkioq, offset, bytes, &cod, 0);
- blkio_submit_io(bs);
}
+ blkio_submit_io(bs);
qemu_coroutine_yield();
return cod.ret;
}
@@ -373,9 +388,9 @@ blkio_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
blkioq_readv(s->blkioq, offset, iov, iovcnt, &cod, 0);
- blkio_submit_io(bs);
}
+ blkio_submit_io(bs);
qemu_coroutine_yield();
if (use_bounce_buffer) {
@@ -418,9 +433,9 @@ static int coroutine_fn blkio_co_pwritev(BlockDriverState *bs, int64_t offset,
WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
blkioq_writev(s->blkioq, offset, iov, iovcnt, &cod, blkio_flags);
- blkio_submit_io(bs);
}
+ blkio_submit_io(bs);
qemu_coroutine_yield();
if (use_bounce_buffer) {
@@ -439,9 +454,9 @@ static int coroutine_fn blkio_co_flush(BlockDriverState *bs)
WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
blkioq_flush(s->blkioq, &cod, 0);
- blkio_submit_io(bs);
}
+ blkio_submit_io(bs);
qemu_coroutine_yield();
return cod.ret;
}
@@ -467,22 +482,13 @@ static int coroutine_fn blkio_co_pwrite_zeroes(BlockDriverState *bs,
WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
blkioq_write_zeroes(s->blkioq, offset, bytes, &cod, blkio_flags);
- blkio_submit_io(bs);
}
+ blkio_submit_io(bs);
qemu_coroutine_yield();
return cod.ret;
}
-static void coroutine_fn blkio_co_io_unplug(BlockDriverState *bs)
-{
- BDRVBlkioState *s = bs->opaque;
-
- WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
- blkio_submit_io(bs);
- }
-}
-
typedef enum {
BMRR_OK,
BMRR_SKIP,
@@ -667,25 +673,60 @@ static int blkio_virtio_blk_common_open(BlockDriverState *bs,
{
const char *path = qdict_get_try_str(options, "path");
BDRVBlkioState *s = bs->opaque;
- int ret;
+ bool fd_supported = false;
+ int fd, ret;
if (!path) {
error_setg(errp, "missing 'path' option");
return -EINVAL;
}
- ret = blkio_set_str(s->blkio, "path", path);
- qdict_del(options, "path");
- if (ret < 0) {
- error_setg_errno(errp, -ret, "failed to set path: %s",
- blkio_get_error_msg());
- return ret;
- }
-
if (!(flags & BDRV_O_NOCACHE)) {
error_setg(errp, "cache.direct=off is not supported");
return -EINVAL;
}
+
+ if (blkio_get_int(s->blkio, "fd", &fd) == 0) {
+ fd_supported = true;
+ }
+
+ /*
+ * If the libblkio driver supports fd passing, let's always use qemu_open()
+ * to open the `path`, so we can handle fd passing from the management
+ * layer through the "/dev/fdset/N" special path.
+ */
+ if (fd_supported) {
+ int open_flags;
+
+ if (flags & BDRV_O_RDWR) {
+ open_flags = O_RDWR;
+ } else {
+ open_flags = O_RDONLY;
+ }
+
+ fd = qemu_open(path, open_flags, errp);
+ if (fd < 0) {
+ return -EINVAL;
+ }
+
+ ret = blkio_set_int(s->blkio, "fd", fd);
+ if (ret < 0) {
+ error_setg_errno(errp, -ret, "failed to set fd: %s",
+ blkio_get_error_msg());
+ qemu_close(fd);
+ return ret;
+ }
+ } else {
+ ret = blkio_set_str(s->blkio, "path", path);
+ if (ret < 0) {
+ error_setg_errno(errp, -ret, "failed to set path: %s",
+ blkio_get_error_msg());
+ return ret;
+ }
+ }
+
+ qdict_del(options, "path");
+
return 0;
}
@@ -1004,7 +1045,6 @@ static void blkio_refresh_limits(BlockDriverState *bs, Error **errp)
.bdrv_co_pwritev = blkio_co_pwritev, \
.bdrv_co_flush_to_disk = blkio_co_flush, \
.bdrv_co_pwrite_zeroes = blkio_co_pwrite_zeroes, \
- .bdrv_co_io_unplug = blkio_co_io_unplug, \
.bdrv_refresh_limits = blkio_refresh_limits, \
.bdrv_register_buf = blkio_register_buf, \
.bdrv_unregister_buf = blkio_unregister_buf, \
diff --git a/block/block-backend.c b/block/block-backend.c
index 241f643507..4009ed5fed 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -2582,28 +2582,6 @@ void blk_add_insert_bs_notifier(BlockBackend *blk, Notifier *notify)
notifier_list_add(&blk->insert_bs_notifiers, notify);
}
-void coroutine_fn blk_co_io_plug(BlockBackend *blk)
-{
- BlockDriverState *bs = blk_bs(blk);
- IO_CODE();
- GRAPH_RDLOCK_GUARD();
-
- if (bs) {
- bdrv_co_io_plug(bs);
- }
-}
-
-void coroutine_fn blk_co_io_unplug(BlockBackend *blk)
-{
- BlockDriverState *bs = blk_bs(blk);
- IO_CODE();
- GRAPH_RDLOCK_GUARD();
-
- if (bs) {
- bdrv_co_io_unplug(bs);
- }
-}
-
BlockAcctStats *blk_get_stats(BlockBackend *blk)
{
IO_CODE();
diff --git a/block/file-posix.c b/block/file-posix.c
index 0ab158efba..ac1ed54811 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -2550,36 +2550,6 @@ static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, int64_t offset,
return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_WRITE);
}
-static void coroutine_fn raw_co_io_plug(BlockDriverState *bs)
-{
- BDRVRawState __attribute__((unused)) *s = bs->opaque;
-#ifdef CONFIG_LINUX_AIO
- if (s->use_linux_aio) {
- laio_io_plug();
- }
-#endif
-#ifdef CONFIG_LINUX_IO_URING
- if (s->use_linux_io_uring) {
- luring_io_plug();
- }
-#endif
-}
-
-static void coroutine_fn raw_co_io_unplug(BlockDriverState *bs)
-{
- BDRVRawState __attribute__((unused)) *s = bs->opaque;
-#ifdef CONFIG_LINUX_AIO
- if (s->use_linux_aio) {
- laio_io_unplug(s->aio_max_batch);
- }
-#endif
-#ifdef CONFIG_LINUX_IO_URING
- if (s->use_linux_io_uring) {
- luring_io_unplug();
- }
-#endif
-}
-
static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs)
{
BDRVRawState *s = bs->opaque;
@@ -3924,8 +3894,6 @@ BlockDriver bdrv_file = {
.bdrv_co_copy_range_from = raw_co_copy_range_from,
.bdrv_co_copy_range_to = raw_co_copy_range_to,
.bdrv_refresh_limits = raw_refresh_limits,
- .bdrv_co_io_plug = raw_co_io_plug,
- .bdrv_co_io_unplug = raw_co_io_unplug,
.bdrv_attach_aio_context = raw_aio_attach_aio_context,
.bdrv_co_truncate = raw_co_truncate,
@@ -4296,8 +4264,6 @@ static BlockDriver bdrv_host_device = {
.bdrv_co_copy_range_from = raw_co_copy_range_from,
.bdrv_co_copy_range_to = raw_co_copy_range_to,
.bdrv_refresh_limits = raw_refresh_limits,
- .bdrv_co_io_plug = raw_co_io_plug,
- .bdrv_co_io_unplug = raw_co_io_unplug,
.bdrv_attach_aio_context = raw_aio_attach_aio_context,
.bdrv_co_truncate = raw_co_truncate,
@@ -4434,8 +4400,6 @@ static BlockDriver bdrv_host_cdrom = {
.bdrv_co_pwritev = raw_co_pwritev,
.bdrv_co_flush_to_disk = raw_co_flush_to_disk,
.bdrv_refresh_limits = cdrom_refresh_limits,
- .bdrv_co_io_plug = raw_co_io_plug,
- .bdrv_co_io_unplug = raw_co_io_unplug,
.bdrv_attach_aio_context = raw_aio_attach_aio_context,
.bdrv_co_truncate = raw_co_truncate,
@@ -4562,8 +4526,6 @@ static BlockDriver bdrv_host_cdrom = {
.bdrv_co_pwritev = raw_co_pwritev,
.bdrv_co_flush_to_disk = raw_co_flush_to_disk,
.bdrv_refresh_limits = cdrom_refresh_limits,
- .bdrv_co_io_plug = raw_co_io_plug,
- .bdrv_co_io_unplug = raw_co_io_unplug,
.bdrv_attach_aio_context = raw_aio_attach_aio_context,
.bdrv_co_truncate = raw_co_truncate,
diff --git a/block/io.c b/block/io.c
index 540bf8d26d..f2dfc7c405 100644
--- a/block/io.c
+++ b/block/io.c
@@ -3223,43 +3223,6 @@ void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
return mem;
}
-void coroutine_fn bdrv_co_io_plug(BlockDriverState *bs)
-{
- BdrvChild *child;
- IO_CODE();
- assert_bdrv_graph_readable();
-
- QLIST_FOREACH(child, &bs->children, next) {
- bdrv_co_io_plug(child->bs);
- }
-
- if (qatomic_fetch_inc(&bs->io_plugged) == 0) {
- BlockDriver *drv = bs->drv;
- if (drv && drv->bdrv_co_io_plug) {
- drv->bdrv_co_io_plug(bs);
- }
- }
-}
-
-void coroutine_fn bdrv_co_io_unplug(BlockDriverState *bs)
-{
- BdrvChild *child;
- IO_CODE();
- assert_bdrv_graph_readable();
-
- assert(bs->io_plugged);
- if (qatomic_fetch_dec(&bs->io_plugged) == 1) {
- BlockDriver *drv = bs->drv;
- if (drv && drv->bdrv_co_io_unplug) {
- drv->bdrv_co_io_unplug(bs);
- }
- }
-
- QLIST_FOREACH(child, &bs->children, next) {
- bdrv_co_io_unplug(child->bs);
- }
-}
-
/* Helper that undoes bdrv_register_buf() when it fails partway through */
static void GRAPH_RDLOCK
bdrv_register_buf_rollback(BlockDriverState *bs, void *host, size_t size,
diff --git a/block/io_uring.c b/block/io_uring.c
index 3a77480e16..69d9820928 100644
--- a/block/io_uring.c
+++ b/block/io_uring.c
@@ -16,6 +16,7 @@
#include "block/raw-aio.h"
#include "qemu/coroutine.h"
#include "qapi/error.h"
+#include "sysemu/block-backend.h"
#include "trace.h"
/* Only used for assertions. */
@@ -41,7 +42,6 @@ typedef struct LuringAIOCB {
} LuringAIOCB;
typedef struct LuringQueue {
- int plugged;
unsigned int in_queue;
unsigned int in_flight;
bool blocked;
@@ -267,7 +267,7 @@ static void luring_process_completions_and_submit(LuringState *s)
{
luring_process_completions(s);
- if (!s->io_q.plugged && s->io_q.in_queue > 0) {
+ if (s->io_q.in_queue > 0) {
ioq_submit(s);
}
}
@@ -301,29 +301,17 @@ static void qemu_luring_poll_ready(void *opaque)
static void ioq_init(LuringQueue *io_q)
{
QSIMPLEQ_INIT(&io_q->submit_queue);
- io_q->plugged = 0;
io_q->in_queue = 0;
io_q->in_flight = 0;
io_q->blocked = false;
}
-void luring_io_plug(void)
+static void luring_unplug_fn(void *opaque)
{
- AioContext *ctx = qemu_get_current_aio_context();
- LuringState *s = aio_get_linux_io_uring(ctx);
- trace_luring_io_plug(s);
- s->io_q.plugged++;
-}
-
-void luring_io_unplug(void)
-{
- AioContext *ctx = qemu_get_current_aio_context();
- LuringState *s = aio_get_linux_io_uring(ctx);
- assert(s->io_q.plugged);
- trace_luring_io_unplug(s, s->io_q.blocked, s->io_q.plugged,
- s->io_q.in_queue, s->io_q.in_flight);
- if (--s->io_q.plugged == 0 &&
- !s->io_q.blocked && s->io_q.in_queue > 0) {
+ LuringState *s = opaque;
+ trace_luring_unplug_fn(s, s->io_q.blocked, s->io_q.in_queue,
+ s->io_q.in_flight);
+ if (!s->io_q.blocked && s->io_q.in_queue > 0) {
ioq_submit(s);
}
}
@@ -370,14 +358,16 @@ static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s,
QSIMPLEQ_INSERT_TAIL(&s->io_q.submit_queue, luringcb, next);
s->io_q.in_queue++;
- trace_luring_do_submit(s, s->io_q.blocked, s->io_q.plugged,
- s->io_q.in_queue, s->io_q.in_flight);
- if (!s->io_q.blocked &&
- (!s->io_q.plugged ||
- s->io_q.in_flight + s->io_q.in_queue >= MAX_ENTRIES)) {
- ret = ioq_submit(s);
- trace_luring_do_submit_done(s, ret);
- return ret;
+ trace_luring_do_submit(s, s->io_q.blocked, s->io_q.in_queue,
+ s->io_q.in_flight);
+ if (!s->io_q.blocked) {
+ if (s->io_q.in_flight + s->io_q.in_queue >= MAX_ENTRIES) {
+ ret = ioq_submit(s);
+ trace_luring_do_submit_done(s, ret);
+ return ret;
+ }
+
+ blk_io_plug_call(luring_unplug_fn, s);
}
return 0;
}
diff --git a/block/linux-aio.c b/block/linux-aio.c
index 916f001e32..561c71a9ae 100644
--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@@ -15,6 +15,7 @@
#include "qemu/event_notifier.h"
#include "qemu/coroutine.h"
#include "qapi/error.h"
+#include "sysemu/block-backend.h"
/* Only used for assertions. */
#include "qemu/coroutine_int.h"
@@ -46,7 +47,6 @@ struct qemu_laiocb {
};
typedef struct {
- int plugged;
unsigned int in_queue;
unsigned int in_flight;
bool blocked;
@@ -236,7 +236,7 @@ static void qemu_laio_process_completions_and_submit(LinuxAioState *s)
{
qemu_laio_process_completions(s);
- if (!s->io_q.plugged && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
+ if (!QSIMPLEQ_EMPTY(&s->io_q.pending)) {
ioq_submit(s);
}
}
@@ -277,7 +277,6 @@ static void qemu_laio_poll_ready(EventNotifier *opaque)
static void ioq_init(LaioQueue *io_q)
{
QSIMPLEQ_INIT(&io_q->pending);
- io_q->plugged = 0;
io_q->in_queue = 0;
io_q->in_flight = 0;
io_q->blocked = false;
@@ -354,31 +353,11 @@ static uint64_t laio_max_batch(LinuxAioState *s, uint64_t dev_max_batch)
return max_batch;
}
-void laio_io_plug(void)
+static void laio_unplug_fn(void *opaque)
{
- AioContext *ctx = qemu_get_current_aio_context();
- LinuxAioState *s = aio_get_linux_aio(ctx);
-
- s->io_q.plugged++;
-}
-
-void laio_io_unplug(uint64_t dev_max_batch)
-{
- AioContext *ctx = qemu_get_current_aio_context();
- LinuxAioState *s = aio_get_linux_aio(ctx);
-
- assert(s->io_q.plugged);
- s->io_q.plugged--;
+ LinuxAioState *s = opaque;
- /*
- * Why max batch checking is performed here:
- * Another BDS may have queued requests with a higher dev_max_batch and
- * therefore in_queue could now exceed our dev_max_batch. Re-check the max
- * batch so we can honor our device's dev_max_batch.
- */
- if (s->io_q.in_queue >= laio_max_batch(s, dev_max_batch) ||
- (!s->io_q.plugged &&
- !s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending))) {
+ if (!s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
ioq_submit(s);
}
}
@@ -410,10 +389,12 @@ static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
QSIMPLEQ_INSERT_TAIL(&s->io_q.pending, laiocb, next);
s->io_q.in_queue++;
- if (!s->io_q.blocked &&
- (!s->io_q.plugged ||
- s->io_q.in_queue >= laio_max_batch(s, dev_max_batch))) {
- ioq_submit(s);
+ if (!s->io_q.blocked) {
+ if (s->io_q.in_queue >= laio_max_batch(s, dev_max_batch)) {
+ ioq_submit(s);
+ } else {
+ blk_io_plug_call(laio_unplug_fn, s);
+ }
}
return 0;
diff --git a/block/meson.build b/block/meson.build
index 486dda8b85..fb4332bd66 100644
--- a/block/meson.build
+++ b/block/meson.build
@@ -23,6 +23,7 @@ block_ss.add(files(
'mirror.c',
'nbd.c',
'null.c',
+ 'plug.c',
'qapi.c',
'qcow2-bitmap.c',
'qcow2-cache.c',
diff --git a/block/nvme.c b/block/nvme.c
index 17937d398d..7ca85bc44a 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -25,6 +25,7 @@
#include "qemu/vfio-helpers.h"
#include "block/block-io.h"
#include "block/block_int.h"
+#include "sysemu/block-backend.h"
#include "sysemu/replay.h"
#include "trace.h"
@@ -119,7 +120,6 @@ struct BDRVNVMeState {
int blkshift;
uint64_t max_transfer;
- bool plugged;
bool supports_write_zeroes;
bool supports_discard;
@@ -282,7 +282,7 @@ static void nvme_kick(NVMeQueuePair *q)
{
BDRVNVMeState *s = q->s;
- if (s->plugged || !q->need_kick) {
+ if (!q->need_kick) {
return;
}
trace_nvme_kick(s, q->index);
@@ -387,10 +387,6 @@ static bool nvme_process_completion(NVMeQueuePair *q)
NvmeCqe *c;
trace_nvme_process_completion(s, q->index, q->inflight);
- if (s->plugged) {
- trace_nvme_process_completion_queue_plugged(s, q->index);
- return false;
- }
/*
* Support re-entrancy when a request cb() function invokes aio_poll().
@@ -480,6 +476,15 @@ static void nvme_trace_command(const NvmeCmd *cmd)
}
}
+static void nvme_unplug_fn(void *opaque)
+{
+ NVMeQueuePair *q = opaque;
+
+ QEMU_LOCK_GUARD(&q->lock);
+ nvme_kick(q);
+ nvme_process_completion(q);
+}
+
static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req,
NvmeCmd *cmd, BlockCompletionFunc cb,
void *opaque)
@@ -496,8 +501,7 @@ static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req,
q->sq.tail * NVME_SQ_ENTRY_BYTES, cmd, sizeof(*cmd));
q->sq.tail = (q->sq.tail + 1) % NVME_QUEUE_SIZE;
q->need_kick++;
- nvme_kick(q);
- nvme_process_completion(q);
+ blk_io_plug_call(nvme_unplug_fn, q);
qemu_mutex_unlock(&q->lock);
}
@@ -1567,27 +1571,6 @@ static void nvme_attach_aio_context(BlockDriverState *bs,
}
}
-static void coroutine_fn nvme_co_io_plug(BlockDriverState *bs)
-{
- BDRVNVMeState *s = bs->opaque;
- assert(!s->plugged);
- s->plugged = true;
-}
-
-static void coroutine_fn nvme_co_io_unplug(BlockDriverState *bs)
-{
- BDRVNVMeState *s = bs->opaque;
- assert(s->plugged);
- s->plugged = false;
- for (unsigned i = INDEX_IO(0); i < s->queue_count; i++) {
- NVMeQueuePair *q = s->queues[i];
- qemu_mutex_lock(&q->lock);
- nvme_kick(q);
- nvme_process_completion(q);
- qemu_mutex_unlock(&q->lock);
- }
-}
-
static bool nvme_register_buf(BlockDriverState *bs, void *host, size_t size,
Error **errp)
{
@@ -1664,9 +1647,6 @@ static BlockDriver bdrv_nvme = {
.bdrv_detach_aio_context = nvme_detach_aio_context,
.bdrv_attach_aio_context = nvme_attach_aio_context,
- .bdrv_co_io_plug = nvme_co_io_plug,
- .bdrv_co_io_unplug = nvme_co_io_unplug,
-
.bdrv_register_buf = nvme_register_buf,
.bdrv_unregister_buf = nvme_unregister_buf,
};
diff --git a/block/plug.c b/block/plug.c
new file mode 100644
index 0000000000..98a155d2f4
--- /dev/null
+++ b/block/plug.c
@@ -0,0 +1,159 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Block I/O plugging
+ *
+ * Copyright Red Hat.
+ *
+ * This API defers a function call within a blk_io_plug()/blk_io_unplug()
+ * section, allowing multiple calls to batch up. This is a performance
+ * optimization that is used in the block layer to submit several I/O requests
+ * at once instead of individually:
+ *
+ * blk_io_plug(); <-- start of plugged region
+ * ...
+ * blk_io_plug_call(my_func, my_obj); <-- deferred my_func(my_obj) call
+ * blk_io_plug_call(my_func, my_obj); <-- another
+ * blk_io_plug_call(my_func, my_obj); <-- another
+ * ...
+ * blk_io_unplug(); <-- end of plugged region, my_func(my_obj) is called once
+ *
+ * This code is actually generic and not tied to the block layer. If another
+ * subsystem needs this functionality, it could be renamed.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/coroutine-tls.h"
+#include "qemu/notify.h"
+#include "qemu/thread.h"
+#include "sysemu/block-backend.h"
+
+/* A function call that has been deferred until unplug() */
+typedef struct {
+ void (*fn)(void *);
+ void *opaque;
+} UnplugFn;
+
+/* Per-thread state */
+typedef struct {
+ unsigned count; /* how many times has plug() been called? */
+ GArray *unplug_fns; /* functions to call at unplug time */
+} Plug;
+
+/* Use get_ptr_plug() to fetch this thread-local value */
+QEMU_DEFINE_STATIC_CO_TLS(Plug, plug);
+
+/* Called at thread cleanup time */
+static void blk_io_plug_atexit(Notifier *n, void *value)
+{
+ Plug *plug = get_ptr_plug();
+ g_array_free(plug->unplug_fns, TRUE);
+}
+
+/* This won't involve coroutines, so use __thread */
+static __thread Notifier blk_io_plug_atexit_notifier;
+
+/**
+ * blk_io_plug_call:
+ * @fn: a function pointer to be invoked
+ * @opaque: a user-defined argument to @fn()
+ *
+ * Call @fn(@opaque) immediately if not within a blk_io_plug()/blk_io_unplug()
+ * section.
+ *
+ * Otherwise defer the call until the end of the outermost
+ * blk_io_plug()/blk_io_unplug() section in this thread. If the same
+ * @fn/@opaque pair has already been deferred, it will only be called once upon
+ * blk_io_unplug() so that accumulated calls are batched into a single call.
+ *
+ * The caller must ensure that @opaque is not freed before @fn() is invoked.
+ */
+void blk_io_plug_call(void (*fn)(void *), void *opaque)
+{
+ Plug *plug = get_ptr_plug();
+
+ /* Call immediately if we're not plugged */
+ if (plug->count == 0) {
+ fn(opaque);
+ return;
+ }
+
+ GArray *array = plug->unplug_fns;
+ if (!array) {
+ array = g_array_new(FALSE, FALSE, sizeof(UnplugFn));
+ plug->unplug_fns = array;
+ blk_io_plug_atexit_notifier.notify = blk_io_plug_atexit;
+ qemu_thread_atexit_add(&blk_io_plug_atexit_notifier);
+ }
+
+ UnplugFn *fns = (UnplugFn *)array->data;
+ UnplugFn new_fn = {
+ .fn = fn,
+ .opaque = opaque,
+ };
+
+ /*
+ * There won't be many, so do a linear search. If this becomes a bottleneck
+ * then a binary search (glib 2.62+) or different data structure could be
+ * used.
+ */
+ for (guint i = 0; i < array->len; i++) {
+ if (memcmp(&fns[i], &new_fn, sizeof(new_fn)) == 0) {
+ return; /* already exists */
+ }
+ }
+
+ g_array_append_val(array, new_fn);
+}
+
+/**
+ * blk_io_plug: Defer blk_io_plug_call() functions until blk_io_unplug()
+ *
+ * blk_io_plug/unplug are thread-local operations. This means that multiple
+ * threads can simultaneously call plug/unplug, but the caller must ensure that
+ * each unplug() is called in the same thread of the matching plug().
+ *
+ * Nesting is supported. blk_io_plug_call() functions are only called at the
+ * outermost blk_io_unplug().
+ */
+void blk_io_plug(void)
+{
+ Plug *plug = get_ptr_plug();
+
+ assert(plug->count < UINT32_MAX);
+
+ plug->count++;
+}
+
+/**
+ * blk_io_unplug: Run any pending blk_io_plug_call() functions
+ *
+ * There must have been a matching blk_io_plug() call in the same thread prior
+ * to this blk_io_unplug() call.
+ */
+void blk_io_unplug(void)
+{
+ Plug *plug = get_ptr_plug();
+
+ assert(plug->count > 0);
+
+ if (--plug->count > 0) {
+ return;
+ }
+
+ GArray *array = plug->unplug_fns;
+ if (!array) {
+ return;
+ }
+
+ UnplugFn *fns = (UnplugFn *)array->data;
+
+ for (guint i = 0; i < array->len; i++) {
+ fns[i].fn(fns[i].opaque);
+ }
+
+ /*
+ * This resets the array without freeing memory so that appending is cheap
+ * in the future.
+ */
+ g_array_set_size(array, 0);
+}
diff --git a/block/trace-events b/block/trace-events
index 32665158d6..6f121b7636 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -64,9 +64,8 @@ file_paio_submit(void *acb, void *opaque, int64_t offset, int count, int type) "
# io_uring.c
luring_init_state(void *s, size_t size) "s %p size %zu"
luring_cleanup_state(void *s) "%p freed"
-luring_io_plug(void *s) "LuringState %p plug"
-luring_io_unplug(void *s, int blocked, int plugged, int queued, int inflight) "LuringState %p blocked %d plugged %d queued %d inflight %d"
-luring_do_submit(void *s, int blocked, int plugged, int queued, int inflight) "LuringState %p blocked %d plugged %d queued %d inflight %d"
+luring_unplug_fn(void *s, int blocked, int queued, int inflight) "LuringState %p blocked %d queued %d inflight %d"
+luring_do_submit(void *s, int blocked, int queued, int inflight) "LuringState %p blocked %d queued %d inflight %d"
luring_do_submit_done(void *s, int ret) "LuringState %p submitted to kernel %d"
luring_co_submit(void *bs, void *s, void *luringcb, int fd, uint64_t offset, size_t nbytes, int type) "bs %p s %p luringcb %p fd %d offset %" PRId64 " nbytes %zd type %d"
luring_process_completion(void *s, void *aiocb, int ret) "LuringState %p luringcb %p ret %d"
@@ -141,7 +140,6 @@ nvme_kick(void *s, unsigned q_index) "s %p q #%u"
nvme_dma_flush_queue_wait(void *s) "s %p"
nvme_error(int cmd_specific, int sq_head, int sqid, int cid, int status) "cmd_specific %d sq_head %d sqid %d cid %d status 0x%x"
nvme_process_completion(void *s, unsigned q_index, int inflight) "s %p q #%u inflight %d"
-nvme_process_completion_queue_plugged(void *s, unsigned q_index) "s %p q #%u"
nvme_complete_command(void *s, unsigned q_index, int cid) "s %p q #%u cid %d"
nvme_submit_command(void *s, unsigned q_index, int cid) "s %p q #%u cid %d"
nvme_submit_command_raw(int c0, int c1, int c2, int c3, int c4, int c5, int c6, int c7) "%02x %02x %02x %02x %02x %02x %02x %02x"
diff --git a/hw/block/dataplane/xen-block.c b/hw/block/dataplane/xen-block.c
index 2597f38805..3b6f2b0aa2 100644
--- a/hw/block/dataplane/xen-block.c
+++ b/hw/block/dataplane/xen-block.c
@@ -537,7 +537,7 @@ static bool xen_block_handle_requests(XenBlockDataPlane *dataplane)
* is below us.
*/
if (inflight_atstart > IO_PLUG_THRESHOLD) {
- blk_io_plug(dataplane->blk);
+ blk_io_plug();
}
while (rc != rp) {
/* pull request from ring */
@@ -577,12 +577,12 @@ static bool xen_block_handle_requests(XenBlockDataPlane *dataplane)
if (inflight_atstart > IO_PLUG_THRESHOLD &&
batched >= inflight_atstart) {
- blk_io_unplug(dataplane->blk);
+ blk_io_unplug();
}
xen_block_do_aio(request);
if (inflight_atstart > IO_PLUG_THRESHOLD) {
if (batched >= inflight_atstart) {
- blk_io_plug(dataplane->blk);
+ blk_io_plug();
batched = 0;
} else {
batched++;
@@ -590,7 +590,7 @@ static bool xen_block_handle_requests(XenBlockDataPlane *dataplane)
}
}
if (inflight_atstart > IO_PLUG_THRESHOLD) {
- blk_io_unplug(dataplane->blk);
+ blk_io_unplug();
}
return done_something;
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index 4ca66b5860..39e7f23fab 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -1134,7 +1134,7 @@ void virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq)
bool suppress_notifications = virtio_queue_get_notification(vq);
aio_context_acquire(blk_get_aio_context(s->blk));
- blk_io_plug(s->blk);
+ blk_io_plug();
do {
if (suppress_notifications) {
@@ -1158,7 +1158,7 @@ void virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq)
virtio_blk_submit_multireq(s, &mrb);
}
- blk_io_unplug(s->blk);
+ blk_io_unplug();
aio_context_release(blk_get_aio_context(s->blk));
}
diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
index 4a8849cc7e..9c8ef0aaa6 100644
--- a/hw/scsi/virtio-scsi.c
+++ b/hw/scsi/virtio-scsi.c
@@ -799,7 +799,7 @@ static int virtio_scsi_handle_cmd_req_prepare(VirtIOSCSI *s, VirtIOSCSIReq *req)
return -ENOBUFS;
}
scsi_req_ref(req->sreq);
- blk_io_plug(d->conf.blk);
+ blk_io_plug();
object_unref(OBJECT(d));
return 0;
}
@@ -810,7 +810,7 @@ static void virtio_scsi_handle_cmd_req_submit(VirtIOSCSI *s, VirtIOSCSIReq *req)
if (scsi_req_enqueue(sreq)) {
scsi_req_continue(sreq);
}
- blk_io_unplug(sreq->dev->conf.blk);
+ blk_io_unplug();
scsi_req_unref(sreq);
}
@@ -836,7 +836,7 @@ static void virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq)
while (!QTAILQ_EMPTY(&reqs)) {
req = QTAILQ_FIRST(&reqs);
QTAILQ_REMOVE(&reqs, req, next);
- blk_io_unplug(req->sreq->dev->conf.blk);
+ blk_io_unplug();
scsi_req_unref(req->sreq);
virtqueue_detach_element(req->vq, &req->elem, 0);
virtio_scsi_free_req(req);
diff --git a/include/block/block-io.h b/include/block/block-io.h
index a27e471a87..43af816d75 100644
--- a/include/block/block-io.h
+++ b/include/block/block-io.h
@@ -259,9 +259,6 @@ void coroutine_fn bdrv_co_leave(BlockDriverState *bs, AioContext *old_ctx);
AioContext *child_of_bds_get_parent_aio_context(BdrvChild *c);
-void coroutine_fn GRAPH_RDLOCK bdrv_co_io_plug(BlockDriverState *bs);
-void coroutine_fn GRAPH_RDLOCK bdrv_co_io_unplug(BlockDriverState *bs);
-
bool coroutine_fn GRAPH_RDLOCK
bdrv_co_can_store_new_dirty_bitmap(BlockDriverState *bs, const char *name,
uint32_t granularity, Error **errp);
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index b1cbc1e00c..74195c3004 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -768,11 +768,6 @@ struct BlockDriver {
void coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_debug_event)(
BlockDriverState *bs, BlkdebugEvent event);
- /* io queue for linux-aio */
- void coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_io_plug)(BlockDriverState *bs);
- void coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_io_unplug)(
- BlockDriverState *bs);
-
bool (*bdrv_supports_persistent_dirty_bitmap)(BlockDriverState *bs);
bool coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_can_store_new_dirty_bitmap)(
@@ -1227,12 +1222,6 @@ struct BlockDriverState {
unsigned int in_flight;
unsigned int serialising_in_flight;
- /*
- * counter for nested bdrv_io_plug.
- * Accessed with atomic ops.
- */
- unsigned io_plugged;
-
/* do we need to tell the quest if we have a volatile write cache? */
int enable_write_cache;
diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
index 0fe85ade77..0f63c2800c 100644
--- a/include/block/raw-aio.h
+++ b/include/block/raw-aio.h
@@ -62,13 +62,6 @@ int coroutine_fn laio_co_submit(int fd, uint64_t offset, QEMUIOVector *qiov,
void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context);
void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context);
-
-/*
- * laio_io_plug/unplug work in the thread's current AioContext, therefore the
- * caller must ensure that they are paired in the same IOThread.
- */
-void laio_io_plug(void);
-void laio_io_unplug(uint64_t dev_max_batch);
#endif
/* io_uring.c - Linux io_uring implementation */
#ifdef CONFIG_LINUX_IO_URING
@@ -81,13 +74,6 @@ int coroutine_fn luring_co_submit(BlockDriverState *bs, int fd, uint64_t offset,
QEMUIOVector *qiov, int type);
void luring_detach_aio_context(LuringState *s, AioContext *old_context);
void luring_attach_aio_context(LuringState *s, AioContext *new_context);
-
-/*
- * luring_io_plug/unplug work in the thread's current AioContext, therefore the
- * caller must ensure that they are paired in the same IOThread.
- */
-void luring_io_plug(void);
-void luring_io_unplug(void);
#endif
#ifdef _WIN32
diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h
index d62a7ee773..be4dcef59d 100644
--- a/include/sysemu/block-backend-io.h
+++ b/include/sysemu/block-backend-io.h
@@ -100,16 +100,9 @@ void blk_iostatus_set_err(BlockBackend *blk, int error);
int blk_get_max_iov(BlockBackend *blk);
int blk_get_max_hw_iov(BlockBackend *blk);
-/*
- * blk_io_plug/unplug are thread-local operations. This means that multiple
- * IOThreads can simultaneously call plug/unplug, but the caller must ensure
- * that each unplug() is called in the same IOThread of the matching plug().
- */
-void coroutine_fn blk_co_io_plug(BlockBackend *blk);
-void co_wrapper blk_io_plug(BlockBackend *blk);
-
-void coroutine_fn blk_co_io_unplug(BlockBackend *blk);
-void co_wrapper blk_io_unplug(BlockBackend *blk);
+void blk_io_plug(void);
+void blk_io_unplug(void);
+void blk_io_plug_call(void (*fn)(void *), void *opaque);
AioContext *blk_get_aio_context(BlockBackend *blk);
BlockAcctStats *blk_get_stats(BlockBackend *blk);
diff --git a/meson.build b/meson.build
index bc76ea96bf..a61d3e9b06 100644
--- a/meson.build
+++ b/meson.build
@@ -2106,6 +2106,10 @@ config_host_data.set('CONFIG_LZO', lzo.found())
config_host_data.set('CONFIG_MPATH', mpathpersist.found())
config_host_data.set('CONFIG_MPATH_NEW_API', mpathpersist_new_api)
config_host_data.set('CONFIG_BLKIO', blkio.found())
+if blkio.found()
+ config_host_data.set('CONFIG_BLKIO_VHOST_VDPA_FD',
+ blkio.version().version_compare('>=1.3.0'))
+endif
config_host_data.set('CONFIG_CURL', curl.found())
config_host_data.set('CONFIG_CURSES', curses.found())
config_host_data.set('CONFIG_GBM', gbm.found())
diff --git a/qapi/block-core.json b/qapi/block-core.json
index 98d9116dae..4bf89171c6 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -3955,10 +3955,16 @@
#
# @path: path to the vhost-vdpa character device.
#
+# Features:
+# @fdset: Member @path supports the special "/dev/fdset/N" path
+# (since 8.1)
+#
# Since: 7.2
##
{ 'struct': 'BlockdevOptionsVirtioBlkVhostVdpa',
'data': { 'path': 'str' },
+ 'features': [ { 'name' :'fdset',
+ 'if': 'CONFIG_BLKIO_VHOST_VDPA_FD' } ],
'if': 'CONFIG_BLKIO' }
##