aio-posix: remove idle poll handlers to improve scalability

When there are many poll handlers it's likely that some of them are idle most of the time. Remove handlers that haven't had activity recently so that the polling loop scales better for guests with a large number of devices. This feature only takes effect for the Linux io_uring fd monitoring implementation because it is capable of combining fd monitoring with userspace polling. The other implementations can't do that and risk starving fds in favor of poll handlers, so don't try this optimization when they are in use. IOPS improves from 10k to 105k when the guest has 100 virtio-blk-pci,num-queues=32 devices and 1 virtio-blk-pci,num-queues=1 device for rw=randread,iodepth=1,bs=4k,ioengine=libaio on NVMe. [Clarified aio_poll_handlers locking discipline explanation in comment after discussion with Paolo Bonzini <pbonzini@redhat.com>. --Stefan] Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> Link: https://lore.kernel.org/r/20200305170806.1313245-8-stefanha@redhat.com Message-Id: <20200305170806.1313245-8-stefanha@redhat.com>
author: Stefan Hajnoczi <stefanha@redhat.com> 2020-03-05 17:08:06 +0000
committer: Stefan Hajnoczi <stefanha@redhat.com> 2020-03-09 16:45:16 +0000
commit: d37d0e365afb6825a90d8356fc6adcc1f58f40f3 (patch)
tree: d5cc874fc5fffaaecd073a4f56dac12378449265 /util/aio-posix.c
parent: aa38e19f05c3a5ae64dff84f44e1aa31281a5b14 (diff)
1 files changed, 86 insertions, 7 deletions
diff --git a/util/aio-posix.c b/util/aio-posix.c
index 759989b45b..cd6cf0a4a9 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -22,6 +22,9 @@
 #include "trace.h"
 #include "aio-posix.h"
 
+/* Stop userspace polling on a handler if it isn't active for some time */
+#define POLL_IDLE_INTERVAL_NS (7 * NANOSECONDS_PER_SECOND)
+
 bool aio_poll_disabled(AioContext *ctx)
 {
     return atomic_read(&ctx->poll_disable_cnt);
@@ -78,6 +81,7 @@ static bool aio_remove_fd_handler(AioContext *ctx, AioHandler *node)
      * deleted because deleted nodes are only cleaned up while
      * no one is walking the handlers list.
      */
+    QLIST_SAFE_REMOVE(node, node_poll);
     QLIST_REMOVE(node, node);
     return true;
 }
@@ -205,7 +209,7 @@ static bool poll_set_started(AioContext *ctx, bool started)
     ctx->poll_started = started;
 
     qemu_lockcnt_inc(&ctx->list_lock);
-    QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
+    QLIST_FOREACH(node, &ctx->poll_aio_handlers, node_poll) {
         IOHandler *fn;
 
         if (QLIST_IS_INSERTED(node, node_deleted)) {
@@ -286,6 +290,7 @@ static void aio_free_deleted_handlers(AioContext *ctx)
     while ((node = QLIST_FIRST_RCU(&ctx->deleted_aio_handlers))) {
         QLIST_REMOVE(node, node);
         QLIST_REMOVE(node, node_deleted);
+        QLIST_SAFE_REMOVE(node, node_poll);
         g_free(node);
     }
 
@@ -300,6 +305,22 @@ static bool aio_dispatch_handler(AioContext *ctx, AioHandler *node)
     revents = node->pfd.revents & node->pfd.events;
     node->pfd.revents = 0;
 
+    /*
+     * Start polling AioHandlers when they become ready because activity is
+     * likely to continue.  Note that starvation is theoretically possible when
+     * fdmon_supports_polling(), but only until the fd fires for the first
+     * time.
+     */
+    if (!QLIST_IS_INSERTED(node, node_deleted) &&
+        !QLIST_IS_INSERTED(node, node_poll) &&
+        node->io_poll) {
+        trace_poll_add(ctx, node, node->pfd.fd, revents);
+        if (ctx->poll_started && node->io_poll_begin) {
+            node->io_poll_begin(node->opaque);
+        }
+        QLIST_INSERT_HEAD(&ctx->poll_aio_handlers, node, node_poll);
+    }
+
     if (!QLIST_IS_INSERTED(node, node_deleted) &&
         (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) &&
         aio_node_check(ctx, node->is_external) &&
@@ -364,15 +385,19 @@ void aio_dispatch(AioContext *ctx)
     timerlistgroup_run_timers(&ctx->tlg);
 }
 
-static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout)
+static bool run_poll_handlers_once(AioContext *ctx,
+                                   int64_t now,
+                                   int64_t *timeout)
 {
     bool progress = false;
     AioHandler *node;
+    AioHandler *tmp;
 
-    QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
-        if (!QLIST_IS_INSERTED(node, node_deleted) && node->io_poll &&
-            aio_node_check(ctx, node->is_external) &&
+    QLIST_FOREACH_SAFE(node, &ctx->poll_aio_handlers, node_poll, tmp) {
+        if (aio_node_check(ctx, node->is_external) &&
             node->io_poll(node->opaque)) {
+            node->poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS;
+
             /*
              * Polling was successful, exit try_poll_mode immediately
              * to adjust the next polling time.
@@ -389,6 +414,50 @@ static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout)
     return progress;
 }
 
+static bool fdmon_supports_polling(AioContext *ctx)
+{
+    return ctx->fdmon_ops->need_wait != aio_poll_disabled;
+}
+
+static bool remove_idle_poll_handlers(AioContext *ctx, int64_t now)
+{
+    AioHandler *node;
+    AioHandler *tmp;
+    bool progress = false;
+
+    /*
+     * File descriptor monitoring implementations without userspace polling
+     * support suffer from starvation when a subset of handlers is polled
+     * because fds will not be processed in a timely fashion.  Don't remove
+     * idle poll handlers.
+     */
+    if (!fdmon_supports_polling(ctx)) {
+        return false;
+    }
+
+    QLIST_FOREACH_SAFE(node, &ctx->poll_aio_handlers, node_poll, tmp) {
+        if (node->poll_idle_timeout == 0LL) {
+            node->poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS;
+        } else if (now >= node->poll_idle_timeout) {
+            trace_poll_remove(ctx, node, node->pfd.fd);
+            node->poll_idle_timeout = 0LL;
+            QLIST_SAFE_REMOVE(node, node_poll);
+            if (ctx->poll_started && node->io_poll_end) {
+                node->io_poll_end(node->opaque);
+
+                /*
+                 * Final poll in case ->io_poll_end() races with an event.
+                 * Nevermind about re-adding the handler in the rare case where
+                 * this causes progress.
+                 */
+                progress = node->io_poll(node->opaque) || progress;
+            }
+        }
+    }
+
+    return progress;
+}
+
 /* run_poll_handlers:
  * @ctx: the AioContext
  * @max_ns: maximum time to poll for, in nanoseconds
@@ -424,12 +493,17 @@ static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout)
 
     start_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
     do {
-        progress = run_poll_handlers_once(ctx, timeout);
+        progress = run_poll_handlers_once(ctx, start_time, timeout);
         elapsed_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start_time;
         max_ns = qemu_soonest_timeout(*timeout, max_ns);
         assert(!(max_ns && progress));
     } while (elapsed_time < max_ns && !ctx->fdmon_ops->need_wait(ctx));
 
+    if (remove_idle_poll_handlers(ctx, start_time + elapsed_time)) {
+        *timeout = 0;
+        progress = true;
+    }
+
     /* If time has passed with no successful polling, adjust *timeout to
      * keep the same ending time.
      */
@@ -454,8 +528,13 @@ static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout)
  */
 static bool try_poll_mode(AioContext *ctx, int64_t *timeout)
 {
-    int64_t max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns);
+    int64_t max_ns;
+
+    if (QLIST_EMPTY_RCU(&ctx->poll_aio_handlers)) {
+        return false;
+    }
 
+    max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns);
     if (max_ns && !ctx->fdmon_ops->need_wait(ctx)) {
         poll_set_started(ctx, true);
author	Stefan Hajnoczi <stefanha@redhat.com>	2020-03-05 17:08:06 +0000
committer	Stefan Hajnoczi <stefanha@redhat.com>	2020-03-09 16:45:16 +0000
commit	d37d0e365afb6825a90d8356fc6adcc1f58f40f3 (patch)
tree	d5cc874fc5fffaaecd073a4f56dac12378449265 /util/aio-posix.c
parent	aa38e19f05c3a5ae64dff84f44e1aa31281a5b14 (diff)