diff options
Diffstat (limited to 'block/linux-aio.c')
-rw-r--r-- | block/linux-aio.c | 178 |
1 files changed, 141 insertions, 37 deletions
diff --git a/block/linux-aio.c b/block/linux-aio.c index e906abebb3..d4e19d444c 100644 --- a/block/linux-aio.c +++ b/block/linux-aio.c @@ -59,7 +59,6 @@ struct LinuxAioState { /* I/O completion processing */ QEMUBH *completion_bh; - struct io_event events[MAX_EVENTS]; int event_idx; int event_max; }; @@ -95,64 +94,153 @@ static void qemu_laio_process_completion(struct qemu_laiocb *laiocb) laiocb->ret = ret; if (laiocb->co) { - qemu_coroutine_enter(laiocb->co); + /* Jump and continue completion for foreign requests, don't do + * anything for current request, it will be completed shortly. */ + if (laiocb->co != qemu_coroutine_self()) { + qemu_coroutine_enter(laiocb->co); + } } else { laiocb->common.cb(laiocb->common.opaque, ret); qemu_aio_unref(laiocb); } } -/* The completion BH fetches completed I/O requests and invokes their - * callbacks. +/** + * aio_ring buffer which is shared between userspace and kernel. * - * The function is somewhat tricky because it supports nested event loops, for - * example when a request callback invokes aio_poll(). In order to do this, - * the completion events array and index are kept in LinuxAioState. The BH - * reschedules itself as long as there are completions pending so it will - * either be called again in a nested event loop or will be called after all - * events have been completed. When there are no events left to complete, the - * BH returns without rescheduling. + * This is copied from linux/fs/aio.c; a common header does not exist, + * but AIO has existed for ages, so we assume the ABI is stable. */ -static void qemu_laio_completion_bh(void *opaque) +struct aio_ring { + unsigned id; /* kernel internal index number */ + unsigned nr; /* number of io_events */ + unsigned head; /* Written to by userland or by kernel. 
*/ + unsigned tail; + + unsigned magic; + unsigned compat_features; + unsigned incompat_features; + unsigned header_length; /* size of aio_ring */ + + struct io_event io_events[0]; +}; + +/** + * io_getevents_peek: + * @ctx: AIO context + * @events: pointer on events array, output value + + * Returns the number of completed events and sets a pointer + * on events array. This function does not update the internal + * ring buffer, only reads head and tail. When @events has been + * processed io_getevents_commit() must be called. + */ +static inline unsigned int io_getevents_peek(io_context_t ctx, + struct io_event **events) { - LinuxAioState *s = opaque; + struct aio_ring *ring = (struct aio_ring *)ctx; + unsigned int head = ring->head, tail = ring->tail; + unsigned int nr; - /* Fetch more completion events when empty */ - if (s->event_idx == s->event_max) { - do { - struct timespec ts = { 0 }; - s->event_max = io_getevents(s->ctx, MAX_EVENTS, MAX_EVENTS, - s->events, &ts); - } while (s->event_max == -EINTR); - - s->event_idx = 0; - if (s->event_max <= 0) { - s->event_max = 0; - return; /* no more events */ - } - s->io_q.in_flight -= s->event_max; + nr = tail >= head ? tail - head : ring->nr - head; + *events = ring->io_events + head; + /* To avoid speculative loads of s->events[i] before observing tail. + Paired with smp_wmb() inside linux/fs/aio.c: aio_complete(). */ + smp_rmb(); + + return nr; +} + +/** + * io_getevents_commit: + * @ctx: AIO context + * @nr: the number of events on which head should be advanced + * + * Advances head of a ring buffer. 
+ */ +static inline void io_getevents_commit(io_context_t ctx, unsigned int nr) +{ + struct aio_ring *ring = (struct aio_ring *)ctx; + + if (nr) { + ring->head = (ring->head + nr) % ring->nr; } +} + +/** + * io_getevents_advance_and_peek: + * @ctx: AIO context + * @events: pointer on events array, output value + * @nr: the number of events on which head should be advanced + * + * Advances head of a ring buffer and returns number of elements left. + */ +static inline unsigned int +io_getevents_advance_and_peek(io_context_t ctx, + struct io_event **events, + unsigned int nr) +{ + io_getevents_commit(ctx, nr); + return io_getevents_peek(ctx, events); +} + +/** + * qemu_laio_process_completions: + * @s: AIO state + * + * Fetches completed I/O requests and invokes their callbacks. + * + * The function is somewhat tricky because it supports nested event loops, for + * example when a request callback invokes aio_poll(). In order to do this, + * indices are kept in LinuxAioState. Function schedules BH completion so it + * can be called again in a nested event loop. When there are no events left + * to complete the BH is being canceled. 
+ */ +static void qemu_laio_process_completions(LinuxAioState *s) +{ + struct io_event *events; /* Reschedule so nested event loops see currently pending completions */ qemu_bh_schedule(s->completion_bh); - /* Process completion events */ - while (s->event_idx < s->event_max) { - struct iocb *iocb = s->events[s->event_idx].obj; - struct qemu_laiocb *laiocb = + while ((s->event_max = io_getevents_advance_and_peek(s->ctx, &events, + s->event_idx))) { + for (s->event_idx = 0; s->event_idx < s->event_max; ) { + struct iocb *iocb = events[s->event_idx].obj; + struct qemu_laiocb *laiocb = container_of(iocb, struct qemu_laiocb, iocb); - laiocb->ret = io_event_ret(&s->events[s->event_idx]); - s->event_idx++; + laiocb->ret = io_event_ret(&events[s->event_idx]); - qemu_laio_process_completion(laiocb); + /* Change counters one-by-one because we can be nested. */ + s->io_q.in_flight--; + s->event_idx++; + qemu_laio_process_completion(laiocb); + } } + qemu_bh_cancel(s->completion_bh); + + /* If we are nested we have to notify the level above that we are done + * by setting event_max to zero, upper level will then jump out of its + * own `for` loop. If we are the last, all counters have dropped to zero. 
*/ + s->event_max = 0; + s->event_idx = 0; +} + +static void qemu_laio_process_completions_and_submit(LinuxAioState *s) +{ + qemu_laio_process_completions(s); if (!s->io_q.plugged && !QSIMPLEQ_EMPTY(&s->io_q.pending)) { ioq_submit(s); } +} - qemu_bh_cancel(s->completion_bh); +static void qemu_laio_completion_bh(void *opaque) +{ + LinuxAioState *s = opaque; + + qemu_laio_process_completions_and_submit(s); } static void qemu_laio_completion_cb(EventNotifier *e) @@ -160,7 +248,7 @@ static void qemu_laio_completion_cb(EventNotifier *e) LinuxAioState *s = container_of(e, LinuxAioState, e); if (event_notifier_test_and_clear(&s->e)) { - qemu_laio_completion_bh(s); + qemu_laio_process_completions_and_submit(s); } } @@ -236,6 +324,19 @@ static void ioq_submit(LinuxAioState *s) QSIMPLEQ_SPLIT_AFTER(&s->io_q.pending, aiocb, next, &completed); } while (ret == len && !QSIMPLEQ_EMPTY(&s->io_q.pending)); s->io_q.blocked = (s->io_q.in_queue > 0); + + if (s->io_q.in_flight) { + /* We can try to complete something right away if there are + * still requests in-flight. */ + qemu_laio_process_completions(s); + /* + * Even if we have completed everything (in_flight == 0), the queue can + * still have pending requests (in_queue > 0). We do not attempt to + * repeat submission to avoid IO hang. The reason is simple: s->e is + * still set and completion callback will be called shortly and all + * pending requests will be submitted from there. + */ + } } void laio_io_plug(BlockDriverState *bs, LinuxAioState *s) @@ -293,6 +394,7 @@ int coroutine_fn laio_co_submit(BlockDriverState *bs, LinuxAioState *s, int fd, .co = qemu_coroutine_self(), .nbytes = qiov->size, .ctx = s, + .ret = -EINPROGRESS, .is_read = (type == QEMU_AIO_READ), .qiov = qiov, }; @@ -302,7 +404,9 @@ int coroutine_fn laio_co_submit(BlockDriverState *bs, LinuxAioState *s, int fd, return ret; } - qemu_coroutine_yield(); + if (laiocb.ret == -EINPROGRESS) { + qemu_coroutine_yield(); + } return laiocb.ret; } |