aboutsummaryrefslogtreecommitdiff
path: root/util/fdmon-io_uring.c
diff options
context:
space:
mode:
Diffstat (limited to 'util/fdmon-io_uring.c')
-rw-r--r--util/fdmon-io_uring.c326
1 files changed, 326 insertions, 0 deletions
diff --git a/util/fdmon-io_uring.c b/util/fdmon-io_uring.c
new file mode 100644
index 0000000000..fb99b4b61e
--- /dev/null
+++ b/util/fdmon-io_uring.c
@@ -0,0 +1,326 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Linux io_uring file descriptor monitoring
+ *
+ * The Linux io_uring API supports file descriptor monitoring with a few
+ * advantages over existing APIs like poll(2) and epoll(7):
+ *
+ * 1. Userspace polling of events is possible because the completion queue (cq
+ * ring) is shared between the kernel and userspace. This allows
+ * applications that rely on userspace polling to also monitor file
+ * descriptors in the same userspace polling loop.
+ *
+ * 2. Submission and completion is batched and done together in a single system
+ * call. This minimizes the number of system calls.
+ *
+ * 3. File descriptor monitoring is O(1) like epoll(7) so it scales better than
+ * poll(2).
+ *
+ * 4. Nanosecond timeouts are supported so it requires fewer syscalls than
+ * epoll(7).
+ *
+ * This code only monitors file descriptors and does not do asynchronous disk
+ * I/O. Implementing disk I/O efficiently has other requirements and should
+ * use a separate io_uring so it does not make sense to unify the code.
+ *
+ * File descriptor monitoring is implemented using the following operations:
+ *
+ * 1. IORING_OP_POLL_ADD - adds a file descriptor to be monitored.
+ * 2. IORING_OP_POLL_REMOVE - removes a file descriptor being monitored. When
+ * the poll mask changes for a file descriptor it is first removed and then
+ * re-added with the new poll mask, so this operation is also used as part
+ * of modifying an existing monitored file descriptor.
+ * 3. IORING_OP_TIMEOUT - added every time a blocking syscall is made to wait
+ * for events. This operation self-cancels if another event completes
+ * before the timeout.
+ *
+ * io_uring calls the submission queue the "sq ring" and the completion queue
+ * the "cq ring". Ring entries are called "sqe" and "cqe", respectively.
+ *
+ * The code is structured so that sq/cq rings are only modified within
+ * fdmon_io_uring_wait(). Changes to AioHandlers are made by enqueuing them on
+ * ctx->submit_list so that fdmon_io_uring_wait() can submit IORING_OP_POLL_ADD
+ * and/or IORING_OP_POLL_REMOVE sqes for them.
+ */
+
+#include "qemu/osdep.h"
+#include <poll.h>
+#include "qemu/rcu_queue.h"
+#include "aio-posix.h"
+
+enum {
+ FDMON_IO_URING_ENTRIES = 128, /* sq/cq ring size */
+
+ /* AioHandler::flags */
+ FDMON_IO_URING_PENDING = (1 << 0),
+ FDMON_IO_URING_ADD = (1 << 1),
+ FDMON_IO_URING_REMOVE = (1 << 2),
+};
+
+static inline int poll_events_from_pfd(int pfd_events)
+{
+ return (pfd_events & G_IO_IN ? POLLIN : 0) |
+ (pfd_events & G_IO_OUT ? POLLOUT : 0) |
+ (pfd_events & G_IO_HUP ? POLLHUP : 0) |
+ (pfd_events & G_IO_ERR ? POLLERR : 0);
+}
+
+static inline int pfd_events_from_poll(int poll_events)
+{
+ return (poll_events & POLLIN ? G_IO_IN : 0) |
+ (poll_events & POLLOUT ? G_IO_OUT : 0) |
+ (poll_events & POLLHUP ? G_IO_HUP : 0) |
+ (poll_events & POLLERR ? G_IO_ERR : 0);
+}
+
+/*
+ * Returns an sqe for submitting a request. Only be called within
+ * fdmon_io_uring_wait().
+ */
+static struct io_uring_sqe *get_sqe(AioContext *ctx)
+{
+ struct io_uring *ring = &ctx->fdmon_io_uring;
+ struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
+ int ret;
+
+ if (likely(sqe)) {
+ return sqe;
+ }
+
+ /* No free sqes left, submit pending sqes first */
+ ret = io_uring_submit(ring);
+ assert(ret > 1);
+ sqe = io_uring_get_sqe(ring);
+ assert(sqe);
+ return sqe;
+}
+
+/* Atomically enqueue an AioHandler for sq ring submission */
+static void enqueue(AioHandlerSList *head, AioHandler *node, unsigned flags)
+{
+ unsigned old_flags;
+
+ old_flags = atomic_fetch_or(&node->flags, FDMON_IO_URING_PENDING | flags);
+ if (!(old_flags & FDMON_IO_URING_PENDING)) {
+ QSLIST_INSERT_HEAD_ATOMIC(head, node, node_submitted);
+ }
+}
+
+/* Dequeue an AioHandler for sq ring submission. Called by fill_sq_ring(). */
+static AioHandler *dequeue(AioHandlerSList *head, unsigned *flags)
+{
+ AioHandler *node = QSLIST_FIRST(head);
+
+ if (!node) {
+ return NULL;
+ }
+
+ /* Doesn't need to be atomic since fill_sq_ring() moves the list */
+ QSLIST_REMOVE_HEAD(head, node_submitted);
+
+ /*
+ * Don't clear FDMON_IO_URING_REMOVE. It's sticky so it can serve two
+ * purposes: telling fill_sq_ring() to submit IORING_OP_POLL_REMOVE and
+ * telling process_cqe() to delete the AioHandler when its
+ * IORING_OP_POLL_ADD completes.
+ */
+ *flags = atomic_fetch_and(&node->flags, ~(FDMON_IO_URING_PENDING |
+ FDMON_IO_URING_ADD));
+ return node;
+}
+
+static void fdmon_io_uring_update(AioContext *ctx,
+ AioHandler *old_node,
+ AioHandler *new_node)
+{
+ if (new_node) {
+ enqueue(&ctx->submit_list, new_node, FDMON_IO_URING_ADD);
+ }
+
+ if (old_node) {
+ /*
+ * Deletion is tricky because IORING_OP_POLL_ADD and
+ * IORING_OP_POLL_REMOVE are async. We need to wait for the original
+ * IORING_OP_POLL_ADD to complete before this handler can be freed
+ * safely.
+ *
+ * It's possible that the file descriptor becomes ready and the
+ * IORING_OP_POLL_ADD cqe is enqueued before IORING_OP_POLL_REMOVE is
+ * submitted, too.
+ *
+ * Mark this handler deleted right now but don't place it on
+ * ctx->deleted_aio_handlers yet. Instead, manually fudge the list
+ * entry to make QLIST_IS_INSERTED() think this handler has been
+ * inserted and other code recognizes this AioHandler as deleted.
+ *
+ * Once the original IORING_OP_POLL_ADD completes we enqueue the
+ * handler on the real ctx->deleted_aio_handlers list to be freed.
+ */
+ assert(!QLIST_IS_INSERTED(old_node, node_deleted));
+ old_node->node_deleted.le_prev = &old_node->node_deleted.le_next;
+
+ enqueue(&ctx->submit_list, old_node, FDMON_IO_URING_REMOVE);
+ }
+}
+
+static void add_poll_add_sqe(AioContext *ctx, AioHandler *node)
+{
+ struct io_uring_sqe *sqe = get_sqe(ctx);
+ int events = poll_events_from_pfd(node->pfd.events);
+
+ io_uring_prep_poll_add(sqe, node->pfd.fd, events);
+ io_uring_sqe_set_data(sqe, node);
+}
+
+static void add_poll_remove_sqe(AioContext *ctx, AioHandler *node)
+{
+ struct io_uring_sqe *sqe = get_sqe(ctx);
+
+ io_uring_prep_poll_remove(sqe, node);
+}
+
+/* Add a timeout that self-cancels when another cqe becomes ready */
+static void add_timeout_sqe(AioContext *ctx, int64_t ns)
+{
+ struct io_uring_sqe *sqe;
+ struct __kernel_timespec ts = {
+ .tv_sec = ns / NANOSECONDS_PER_SECOND,
+ .tv_nsec = ns % NANOSECONDS_PER_SECOND,
+ };
+
+ sqe = get_sqe(ctx);
+ io_uring_prep_timeout(sqe, &ts, 1, 0);
+}
+
+/* Add sqes from ctx->submit_list for submission */
+static void fill_sq_ring(AioContext *ctx)
+{
+ AioHandlerSList submit_list;
+ AioHandler *node;
+ unsigned flags;
+
+ QSLIST_MOVE_ATOMIC(&submit_list, &ctx->submit_list);
+
+ while ((node = dequeue(&submit_list, &flags))) {
+ /* Order matters, just in case both flags were set */
+ if (flags & FDMON_IO_URING_ADD) {
+ add_poll_add_sqe(ctx, node);
+ }
+ if (flags & FDMON_IO_URING_REMOVE) {
+ add_poll_remove_sqe(ctx, node);
+ }
+ }
+}
+
+/* Returns true if a handler became ready */
+static bool process_cqe(AioContext *ctx,
+ AioHandlerList *ready_list,
+ struct io_uring_cqe *cqe)
+{
+ AioHandler *node = io_uring_cqe_get_data(cqe);
+ unsigned flags;
+
+ /* poll_timeout and poll_remove have a zero user_data field */
+ if (!node) {
+ return false;
+ }
+
+ /*
+ * Deletion can only happen when IORING_OP_POLL_ADD completes. If we race
+ * with enqueue() here then we can safely clear the FDMON_IO_URING_REMOVE
+ * bit before IORING_OP_POLL_REMOVE is submitted.
+ */
+ flags = atomic_fetch_and(&node->flags, ~FDMON_IO_URING_REMOVE);
+ if (flags & FDMON_IO_URING_REMOVE) {
+ QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted);
+ return false;
+ }
+
+ aio_add_ready_handler(ready_list, node, pfd_events_from_poll(cqe->res));
+
+ /* IORING_OP_POLL_ADD is one-shot so we must re-arm it */
+ add_poll_add_sqe(ctx, node);
+ return true;
+}
+
+static int process_cq_ring(AioContext *ctx, AioHandlerList *ready_list)
+{
+ struct io_uring *ring = &ctx->fdmon_io_uring;
+ struct io_uring_cqe *cqe;
+ unsigned num_cqes = 0;
+ unsigned num_ready = 0;
+ unsigned head;
+
+ io_uring_for_each_cqe(ring, head, cqe) {
+ if (process_cqe(ctx, ready_list, cqe)) {
+ num_ready++;
+ }
+
+ num_cqes++;
+ }
+
+ io_uring_cq_advance(ring, num_cqes);
+ return num_ready;
+}
+
+static int fdmon_io_uring_wait(AioContext *ctx, AioHandlerList *ready_list,
+ int64_t timeout)
+{
+ unsigned wait_nr = 1; /* block until at least one cqe is ready */
+ int ret;
+
+ /* Fall back while external clients are disabled */
+ if (atomic_read(&ctx->external_disable_cnt)) {
+ return fdmon_poll_ops.wait(ctx, ready_list, timeout);
+ }
+
+ if (timeout == 0) {
+ wait_nr = 0; /* non-blocking */
+ } else if (timeout > 0) {
+ add_timeout_sqe(ctx, timeout);
+ }
+
+ fill_sq_ring(ctx);
+
+ ret = io_uring_submit_and_wait(&ctx->fdmon_io_uring, wait_nr);
+ assert(ret >= 0);
+
+ return process_cq_ring(ctx, ready_list);
+}
+
+static const FDMonOps fdmon_io_uring_ops = {
+ .update = fdmon_io_uring_update,
+ .wait = fdmon_io_uring_wait,
+};
+
+bool fdmon_io_uring_setup(AioContext *ctx)
+{
+ int ret;
+
+ ret = io_uring_queue_init(FDMON_IO_URING_ENTRIES, &ctx->fdmon_io_uring, 0);
+ if (ret != 0) {
+ return false;
+ }
+
+ QSLIST_INIT(&ctx->submit_list);
+ ctx->fdmon_ops = &fdmon_io_uring_ops;
+ return true;
+}
+
+void fdmon_io_uring_destroy(AioContext *ctx)
+{
+ if (ctx->fdmon_ops == &fdmon_io_uring_ops) {
+ AioHandler *node;
+
+ io_uring_queue_exit(&ctx->fdmon_io_uring);
+
+ /* No need to submit these anymore, just free them. */
+ while ((node = QSLIST_FIRST_RCU(&ctx->submit_list))) {
+ QSLIST_REMOVE_HEAD_RCU(&ctx->submit_list, node_submitted);
+ QLIST_REMOVE(node, node);
+ g_free(node);
+ }
+
+ ctx->fdmon_ops = &fdmon_poll_ops;
+ }
+}