Diffstat (limited to 'block/qed.c')
-rw-r--r--  block/qed.c  779
1 file changed, 302 insertions, 477 deletions
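This change converts the QED driver's request pipeline from chained AIO completion callbacks (bdrv_aio_readv/writev plus GenericCB glue) into sequential coroutine functions built on bdrv_co_preadv/bdrv_co_pwritev: each former callback becomes an int-returning coroutine_fn, and errors propagate as negative errno return values instead of `ret` arguments threaded through opaque pointers. The shape of the conversion, as a minimal standalone C sketch (not QEMU code; `on_write_done`, `do_request_cb` and `do_request` are invented stand-ins):

```c
#include <stdio.h>

typedef void CompletionFunc(void *opaque, int ret);

/* Before: continuation-passing style.  Each step finishes by invoking the
 * next callback, and every callback must check and forward `ret`. */
static void on_write_done(void *opaque, int ret)
{
    printf("request complete: %d\n", ret);
}

static void do_request_cb(CompletionFunc *cb, void *opaque)
{
    int ret = 0;               /* pretend asynchronous I/O */
    cb(opaque, ret);           /* continue the chain */
}

/* After: straight-line code.  In QEMU this would be a coroutine_fn that can
 * yield inside bdrv_co_preadv()/bdrv_co_pwritev(); control flow and error
 * handling read like ordinary synchronous code. */
static int do_request(void)
{
    int ret = 0;               /* pretend I/O that suspends the coroutine */
    if (ret < 0) {
        return ret;            /* a single error path, no callback glue */
    }
    printf("request complete: %d\n", ret);
    return 0;
}

int main(void)
{
    do_request_cb(on_write_done, NULL);
    return do_request() < 0;
}
```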
diff --git a/block/qed.c b/block/qed.c
index 8d899fd479..385381a78a 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -21,10 +21,6 @@
 #include "qapi/qmp/qerror.h"
 #include "sysemu/block-backend.h"
 
-static const AIOCBInfo qed_aiocb_info = {
-    .aiocb_size = sizeof(QEDAIOCB),
-};
-
 static int bdrv_qed_probe(const uint8_t *buf, int buf_size,
                           const char *filename)
 {
@@ -92,49 +88,13 @@ int qed_write_header_sync(BDRVQEDState *s)
     return 0;
 }
 
-typedef struct {
-    GenericCB gencb;
-    BDRVQEDState *s;
-    struct iovec iov;
-    QEMUIOVector qiov;
-    int nsectors;
-    uint8_t *buf;
-} QEDWriteHeaderCB;
-
-static void qed_write_header_cb(void *opaque, int ret)
-{
-    QEDWriteHeaderCB *write_header_cb = opaque;
-
-    qemu_vfree(write_header_cb->buf);
-    gencb_complete(write_header_cb, ret);
-}
-
-static void qed_write_header_read_cb(void *opaque, int ret)
-{
-    QEDWriteHeaderCB *write_header_cb = opaque;
-    BDRVQEDState *s = write_header_cb->s;
-
-    if (ret) {
-        qed_write_header_cb(write_header_cb, ret);
-        return;
-    }
-
-    /* Update header */
-    qed_header_cpu_to_le(&s->header, (QEDHeader *)write_header_cb->buf);
-
-    bdrv_aio_writev(s->bs->file, 0, &write_header_cb->qiov,
-                    write_header_cb->nsectors, qed_write_header_cb,
-                    write_header_cb);
-}
-
 /**
  * Update header in-place (does not rewrite backing filename or other strings)
  *
  * This function only updates known header fields in-place and does not affect
  * extra data after the QED header.
  */
-static void qed_write_header(BDRVQEDState *s, BlockCompletionFunc cb,
-                             void *opaque)
+static int coroutine_fn qed_write_header(BDRVQEDState *s)
 {
     /* We must write full sectors for O_DIRECT but cannot necessarily generate
      * the data following the header if an unrecognized compat feature is
@@ -144,18 +104,35 @@ static void qed_write_header(BDRVQEDState *s, BlockCompletionFunc cb,
     int nsectors = DIV_ROUND_UP(sizeof(QEDHeader), BDRV_SECTOR_SIZE);
     size_t len = nsectors * BDRV_SECTOR_SIZE;
-    QEDWriteHeaderCB *write_header_cb = gencb_alloc(sizeof(*write_header_cb),
-                                                    cb, opaque);
-
-    write_header_cb->s = s;
-    write_header_cb->nsectors = nsectors;
-    write_header_cb->buf = qemu_blockalign(s->bs, len);
-    write_header_cb->iov.iov_base = write_header_cb->buf;
-    write_header_cb->iov.iov_len = len;
-    qemu_iovec_init_external(&write_header_cb->qiov, &write_header_cb->iov, 1);
-
-    bdrv_aio_readv(s->bs->file, 0, &write_header_cb->qiov, nsectors,
-                   qed_write_header_read_cb, write_header_cb);
+    uint8_t *buf;
+    struct iovec iov;
+    QEMUIOVector qiov;
+    int ret;
+
+    buf = qemu_blockalign(s->bs, len);
+    iov = (struct iovec) {
+        .iov_base = buf,
+        .iov_len = len,
+    };
+    qemu_iovec_init_external(&qiov, &iov, 1);
+
+    ret = bdrv_co_preadv(s->bs->file, 0, qiov.size, &qiov, 0);
+    if (ret < 0) {
+        goto out;
+    }
+
+    /* Update header */
+    qed_header_cpu_to_le(&s->header, (QEDHeader *) buf);
+
+    ret = bdrv_co_pwritev(s->bs->file, 0, qiov.size, &qiov, 0);
+    if (ret < 0) {
+        goto out;
+    }
+
+    ret = 0;
+out:
+    qemu_vfree(buf);
+    return ret;
 }
 
 static uint64_t qed_max_image_size(uint32_t cluster_size, uint32_t table_size)
@@ -272,20 +249,6 @@ static CachedL2Table *qed_new_l2_table(BDRVQEDState *s)
     return l2_table;
 }
 
-static void qed_aio_next_io(QEDAIOCB *acb, int ret);
-
-static void qed_aio_start_io(QEDAIOCB *acb)
-{
-    qed_aio_next_io(acb, 0);
-}
-
-static void qed_aio_next_io_cb(void *opaque, int ret)
-{
-    QEDAIOCB *acb = opaque;
-
-    qed_aio_next_io(acb, ret);
-}
-
 static void qed_plug_allocating_write_reqs(BDRVQEDState *s)
 {
     assert(!s->allocating_write_reqs_plugged);
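qed_write_header() becomes a plain read-modify-write sequence: because O_DIRECT requires whole sectors and unknown header extensions must be preserved, it reads the full sectors back, patches the known fields in place, writes them out, and cleans up through a single `out:` label. A compilable sketch of that pattern under simplified assumptions (a memory-backed sector stands in for bs->file; `pread_full`/`pwrite_full` are invented stand-ins):

```c
#include <errno.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#define SECTOR_SIZE 512

/* Hypothetical device: one memory-backed sector stands in for the image file. */
static uint8_t device[SECTOR_SIZE];

static int pread_full(uint8_t *buf, size_t len)  { memcpy(buf, device, len); return 0; }
static int pwrite_full(const uint8_t *buf, size_t len) { memcpy(device, buf, len); return 0; }

/* Update one header field without disturbing the rest of the sector,
 * mirroring the goto-out cleanup style of the converted function. */
static int write_header(uint32_t new_feature_bits)
{
    size_t len = SECTOR_SIZE;           /* full sectors for O_DIRECT */
    uint8_t *buf = malloc(len);
    int ret;

    if (!buf) {
        return -ENOMEM;
    }
    ret = pread_full(buf, len);         /* read back unknown trailing data */
    if (ret < 0) {
        goto out;
    }
    memcpy(buf, &new_feature_bits, sizeof(new_feature_bits)); /* update in place */
    ret = pwrite_full(buf, len);        /* write the whole sector back */
out:
    free(buf);                          /* one cleanup path for all outcomes */
    return ret;
}

int main(void) { return write_header(1) < 0; }
```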
@@ -295,61 +258,47 @@ static void qed_plug_allocating_write_reqs(BDRVQEDState *s)
 static void qed_unplug_allocating_write_reqs(BDRVQEDState *s)
 {
-    QEDAIOCB *acb;
-
     assert(s->allocating_write_reqs_plugged);
     s->allocating_write_reqs_plugged = false;
-
-    acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs);
-    if (acb) {
-        qed_aio_start_io(acb);
-    }
+    qemu_co_enter_next(&s->allocating_write_reqs);
 }
 
-static void qed_finish_clear_need_check(void *opaque, int ret)
-{
-    /* Do nothing */
-}
-
-static void qed_flush_after_clear_need_check(void *opaque, int ret)
+static void coroutine_fn qed_need_check_timer_entry(void *opaque)
 {
     BDRVQEDState *s = opaque;
+    int ret;
 
-    bdrv_aio_flush(s->bs, qed_finish_clear_need_check, s);
+    /* The timer should only fire when allocating writes have drained */
+    assert(!s->allocating_acb);
 
-    /* No need to wait until flush completes */
-    qed_unplug_allocating_write_reqs(s);
-}
+    trace_qed_need_check_timer_cb(s);
 
-static void qed_clear_need_check(void *opaque, int ret)
-{
-    BDRVQEDState *s = opaque;
+    qed_acquire(s);
+    qed_plug_allocating_write_reqs(s);
 
-    if (ret) {
+    /* Ensure writes are on disk before clearing flag */
+    ret = bdrv_co_flush(s->bs->file->bs);
+    qed_release(s);
+    if (ret < 0) {
         qed_unplug_allocating_write_reqs(s);
         return;
     }
 
     s->header.features &= ~QED_F_NEED_CHECK;
-    qed_write_header(s, qed_flush_after_clear_need_check, s);
+    ret = qed_write_header(s);
+    (void) ret;
+
+    qed_unplug_allocating_write_reqs(s);
+
+    ret = bdrv_co_flush(s->bs);
+    (void) ret;
 }
 
 static void qed_need_check_timer_cb(void *opaque)
 {
-    BDRVQEDState *s = opaque;
-
-    /* The timer should only fire when allocating writes have drained */
-    assert(!QSIMPLEQ_FIRST(&s->allocating_write_reqs));
-
-    trace_qed_need_check_timer_cb(s);
-
-    qed_acquire(s);
-    qed_plug_allocating_write_reqs(s);
-
-    /* Ensure writes are on disk before clearing flag */
-    bdrv_aio_flush(s->bs->file->bs, qed_clear_need_check, s);
-    qed_release(s);
+    Coroutine *co = qemu_coroutine_create(qed_need_check_timer_entry, opaque);
+    qemu_coroutine_enter(co);
 }
 
 void qed_acquire(BDRVQEDState *s)
@@ -423,7 +372,7 @@ static int bdrv_qed_do_open(BlockDriverState *bs, QDict *options, int flags,
     int ret;
 
     s->bs = bs;
-    QSIMPLEQ_INIT(&s->allocating_write_reqs);
+    qemu_co_queue_init(&s->allocating_write_reqs);
 
     ret = bdrv_pread(bs->file, 0, &le_header, sizeof(le_header));
     if (ret < 0) {
@@ -776,14 +725,14 @@ static int64_t coroutine_fn bdrv_qed_co_get_block_status(BlockDriverState *bs,
         .file = file,
     };
     QEDRequest request = { .l2_table = NULL };
+    uint64_t offset;
+    int ret;
 
-    qed_find_cluster(s, &request, cb.pos, len, qed_is_allocated_cb, &cb);
+    ret = qed_find_cluster(s, &request, cb.pos, &len, &offset);
+    qed_is_allocated_cb(&cb, ret, offset, len);
 
-    /* Now sleep if the callback wasn't invoked immediately */
-    while (cb.status == BDRV_BLOCK_OFFSET_MASK) {
-        cb.co = qemu_coroutine_self();
-        qemu_coroutine_yield();
-    }
+    /* The callback was invoked immediately */
+    assert(cb.status != BDRV_BLOCK_OFFSET_MASK);
 
     qed_unref_l2_cache_entry(request.l2_table);
 
@@ -792,7 +741,7 @@ static int64_t coroutine_fn bdrv_qed_co_get_block_status(BlockDriverState *bs,
 
 static BDRVQEDState *acb_to_s(QEDAIOCB *acb)
 {
-    return acb->common.bs->opaque;
+    return acb->bs->opaque;
 }
 
 /**
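The need-check timer path shows two recurring moves in this patch: a timer callback, which cannot block, spawns a coroutine to do the blocking work, and the hand-rolled QSIMPLEQ of allocating requests becomes a CoQueue, where qemu_co_enter_next() wakes exactly one waiter. A standalone sketch of the plug/unplug hand-off, with a trivial array-backed FIFO of request ids standing in for the CoQueue (everything here is invented for illustration):

```c
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define MAX_WAITERS 8

typedef struct {
    int waiters[MAX_WAITERS];   /* queued request ids */
    int head, tail;
    bool plugged;
} WriteQueue;

static void plug(WriteQueue *q)
{
    assert(!q->plugged);        /* plugging is not re-entrant */
    q->plugged = true;
}

/* Wake a single queued request, like qemu_co_enter_next() */
static void enter_next(WriteQueue *q)
{
    if (q->head != q->tail) {
        printf("resuming request %d\n", q->waiters[q->head++]);
    }
}

static void unplug(WriteQueue *q)
{
    assert(q->plugged);
    q->plugged = false;
    enter_next(q);              /* hand the device back to one waiter */
}

int main(void)
{
    WriteQueue q = { .head = 0, .tail = 0, .plugged = false };

    plug(&q);                   /* flush in progress: hold new writers */
    q.waiters[q.tail++] = 1;    /* a request arrives while plugged: it waits */
    unplug(&q);                 /* flag cleared on disk: wake one waiter */
    return 0;
}
```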
@@ -808,13 +757,13 @@ static BDRVQEDState *acb_to_s(QEDAIOCB *acb)
  * This function reads qiov->size bytes starting at pos from the backing file.
  * If there is no backing file then zeroes are read.
  */
-static void qed_read_backing_file(BDRVQEDState *s, uint64_t pos,
-                                  QEMUIOVector *qiov,
-                                  QEMUIOVector **backing_qiov,
-                                  BlockCompletionFunc *cb, void *opaque)
+static int coroutine_fn qed_read_backing_file(BDRVQEDState *s, uint64_t pos,
+                                              QEMUIOVector *qiov,
+                                              QEMUIOVector **backing_qiov)
 {
     uint64_t backing_length = 0;
     size_t size;
+    int ret;
 
     /* If there is a backing file, get its length.  Treat the absence of a
      * backing file like a zero length backing file.
@@ -822,8 +771,7 @@ static void qed_read_backing_file(BDRVQEDState *s, uint64_t pos,
     if (s->bs->backing) {
         int64_t l = bdrv_getlength(s->bs->backing->bs);
         if (l < 0) {
-            cb(opaque, l);
-            return;
+            return l;
         }
         backing_length = l;
     }
@@ -836,8 +784,7 @@ static void qed_read_backing_file(BDRVQEDState *s, uint64_t pos,
 
     /* Complete now if there are no backing file sectors to read */
     if (pos >= backing_length) {
-        cb(opaque, 0);
-        return;
+        return 0;
     }
 
     /* If the read straddles the end of the backing file, shorten it */
@@ -849,46 +796,11 @@ static void qed_read_backing_file(BDRVQEDState *s, uint64_t pos,
     qemu_iovec_concat(*backing_qiov, qiov, 0, size);
 
     BLKDBG_EVENT(s->bs->file, BLKDBG_READ_BACKING_AIO);
-    bdrv_aio_readv(s->bs->backing, pos / BDRV_SECTOR_SIZE,
-                   *backing_qiov, size / BDRV_SECTOR_SIZE, cb, opaque);
-}
-
-typedef struct {
-    GenericCB gencb;
-    BDRVQEDState *s;
-    QEMUIOVector qiov;
-    QEMUIOVector *backing_qiov;
-    struct iovec iov;
-    uint64_t offset;
-} CopyFromBackingFileCB;
-
-static void qed_copy_from_backing_file_cb(void *opaque, int ret)
-{
-    CopyFromBackingFileCB *copy_cb = opaque;
-    qemu_vfree(copy_cb->iov.iov_base);
-    gencb_complete(&copy_cb->gencb, ret);
-}
-
-static void qed_copy_from_backing_file_write(void *opaque, int ret)
-{
-    CopyFromBackingFileCB *copy_cb = opaque;
-    BDRVQEDState *s = copy_cb->s;
-
-    if (copy_cb->backing_qiov) {
-        qemu_iovec_destroy(copy_cb->backing_qiov);
-        g_free(copy_cb->backing_qiov);
-        copy_cb->backing_qiov = NULL;
-    }
-
-    if (ret) {
-        qed_copy_from_backing_file_cb(copy_cb, ret);
-        return;
+    ret = bdrv_co_preadv(s->bs->backing, pos, size, *backing_qiov, 0);
+    if (ret < 0) {
+        return ret;
     }
-
-    BLKDBG_EVENT(s->bs->file, BLKDBG_COW_WRITE);
-    bdrv_aio_writev(s->bs->file, copy_cb->offset / BDRV_SECTOR_SIZE,
-                    &copy_cb->qiov, copy_cb->qiov.size / BDRV_SECTOR_SIZE,
-                    qed_copy_from_backing_file_cb, copy_cb);
+    return 0;
 }
 
 /**
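qed_read_backing_file() keeps its clamping rules through the conversion: a read entirely past the end of the backing file is satisfied with zeroes, and a read straddling the end is shortened so only the overlapping part touches the backing file. The arithmetic in isolation (a sketch; `backing_read_size` is an invented helper):

```c
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* Returns how many bytes to actually read from the backing file for a
 * request of `len` bytes at `pos`; the remainder must be zero-filled. */
static size_t backing_read_size(uint64_t backing_length, uint64_t pos,
                                size_t len)
{
    if (pos >= backing_length) {
        return 0;                       /* entirely past EOF: all zeroes */
    }
    if (pos + len > backing_length) {
        return backing_length - pos;    /* straddles EOF: shorten the read */
    }
    return len;                         /* fully inside the backing file */
}

int main(void)
{
    assert(backing_read_size(4096, 8192, 512) == 0);    /* all zeroes */
    assert(backing_read_size(4096, 3840, 512) == 256);  /* shortened */
    assert(backing_read_size(4096, 0, 512) == 512);     /* full read */
    return 0;
}
```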
@@ -898,32 +810,48 @@ static void qed_copy_from_backing_file_write(void *opaque, int ret)
  * @pos:        Byte position in device
  * @len:        Number of bytes
  * @offset:     Byte offset in image file
- * @cb:         Completion function
- * @opaque:     User data for completion function
  */
-static void qed_copy_from_backing_file(BDRVQEDState *s, uint64_t pos,
-                                       uint64_t len, uint64_t offset,
-                                       BlockCompletionFunc *cb,
-                                       void *opaque)
+static int coroutine_fn qed_copy_from_backing_file(BDRVQEDState *s,
+                                                   uint64_t pos, uint64_t len,
+                                                   uint64_t offset)
 {
-    CopyFromBackingFileCB *copy_cb;
+    QEMUIOVector qiov;
+    QEMUIOVector *backing_qiov = NULL;
+    struct iovec iov;
+    int ret;
 
     /* Skip copy entirely if there is no work to do */
     if (len == 0) {
-        cb(opaque, 0);
-        return;
+        return 0;
     }
 
-    copy_cb = gencb_alloc(sizeof(*copy_cb), cb, opaque);
-    copy_cb->s = s;
-    copy_cb->offset = offset;
-    copy_cb->backing_qiov = NULL;
-    copy_cb->iov.iov_base = qemu_blockalign(s->bs, len);
-    copy_cb->iov.iov_len = len;
-    qemu_iovec_init_external(&copy_cb->qiov, &copy_cb->iov, 1);
+    iov = (struct iovec) {
+        .iov_base = qemu_blockalign(s->bs, len),
+        .iov_len = len,
+    };
+    qemu_iovec_init_external(&qiov, &iov, 1);
+
+    ret = qed_read_backing_file(s, pos, &qiov, &backing_qiov);
 
-    qed_read_backing_file(s, pos, &copy_cb->qiov, &copy_cb->backing_qiov,
-                          qed_copy_from_backing_file_write, copy_cb);
+    if (backing_qiov) {
+        qemu_iovec_destroy(backing_qiov);
+        g_free(backing_qiov);
+        backing_qiov = NULL;
+    }
+
+    if (ret) {
+        goto out;
+    }
+
+    BLKDBG_EVENT(s->bs->file, BLKDBG_COW_WRITE);
+    ret = bdrv_co_pwritev(s->bs->file, offset, qiov.size, &qiov, 0);
+    if (ret < 0) {
+        goto out;
+    }
+    ret = 0;
+out:
+    qemu_vfree(iov.iov_base);
+    return ret;
 }
 
 /**
@@ -938,8 +866,9 @@ static void qed_copy_from_backing_file(BDRVQEDState *s, uint64_t pos,
  * The cluster offset may be an allocated byte offset in the image file, the
  * zero cluster marker, or the unallocated cluster marker.
  */
-static void qed_update_l2_table(BDRVQEDState *s, QEDTable *table, int index,
-                                unsigned int n, uint64_t cluster)
+static void coroutine_fn qed_update_l2_table(BDRVQEDState *s, QEDTable *table,
+                                             int index, unsigned int n,
+                                             uint64_t cluster)
 {
     int i;
     for (i = index; i < index + n; i++) {
@@ -951,27 +880,9 @@ static void qed_update_l2_table(BDRVQEDState *s, QEDTable *table, int index,
     }
 }
 
-static void qed_aio_complete_bh(void *opaque)
+static void coroutine_fn qed_aio_complete(QEDAIOCB *acb)
 {
-    QEDAIOCB *acb = opaque;
     BDRVQEDState *s = acb_to_s(acb);
-    BlockCompletionFunc *cb = acb->common.cb;
-    void *user_opaque = acb->common.opaque;
-    int ret = acb->bh_ret;
-
-    qemu_aio_unref(acb);
-
-    /* Invoke callback */
-    qed_acquire(s);
-    cb(user_opaque, ret);
-    qed_release(s);
-}
-
-static void qed_aio_complete(QEDAIOCB *acb, int ret)
-{
-    BDRVQEDState *s = acb_to_s(acb);
-
-    trace_qed_aio_complete(s, acb, ret);
 
     /* Free resources */
     qemu_iovec_destroy(&acb->cur_qiov);
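With the CopyFromBackingFileCB machinery gone, qed_copy_from_backing_file() is one function again: allocate a bounce buffer, read from the backing chain, write into the newly allocated cluster, free the buffer on every path. A standalone sketch of that sequence (memory-backed arrays stand in for the backing file and image; `read_backing`/`write_image` are invented):

```c
#include <errno.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

static uint8_t backing[1 << 16];
static uint8_t image[1 << 16];

static int read_backing(uint64_t pos, void *buf, size_t len)
{
    memcpy(buf, backing + pos, len);
    return 0;
}

static int write_image(uint64_t offset, const void *buf, size_t len)
{
    memcpy(image + offset, buf, len);
    return 0;
}

/* One sequential function replaces the CopyFromBackingFileCB struct and its
 * two completion callbacks: allocate, read, write, free, in order. */
static int copy_from_backing_file(uint64_t pos, uint64_t len, uint64_t offset)
{
    uint8_t *buf;
    int ret;

    if (len == 0) {
        return 0;                   /* skip copy entirely: no work to do */
    }
    buf = malloc(len);              /* bounce buffer */
    if (!buf) {
        return -ENOMEM;
    }
    ret = read_backing(pos, buf, len);
    if (ret == 0) {
        ret = write_image(offset, buf, len);
    }
    free(buf);                      /* freed on both success and error */
    return ret;
}

int main(void)
{
    memset(backing, 0xab, sizeof(backing));
    return copy_from_backing_file(0, 4096, 8192);
}
```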
@@ -983,22 +894,16 @@ static void qed_aio_complete(QEDAIOCB *acb, int ret)
         acb->qiov->iov[0].iov_base = NULL;
     }
 
-    /* Arrange for a bh to invoke the completion function */
-    acb->bh_ret = ret;
-    aio_bh_schedule_oneshot(bdrv_get_aio_context(acb->common.bs),
-                            qed_aio_complete_bh, acb);
-
     /* Start next allocating write request waiting behind this one.  Note that
      * requests enqueue themselves when they first hit an unallocated cluster
      * but they wait until the entire request is finished before waking up the
      * next request in the queue.  This ensures that we don't cycle through
      * requests multiple times but rather finish one at a time completely.
      */
-    if (acb == QSIMPLEQ_FIRST(&s->allocating_write_reqs)) {
-        QSIMPLEQ_REMOVE_HEAD(&s->allocating_write_reqs, next);
-        acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs);
-        if (acb) {
-            qed_aio_start_io(acb);
+    if (acb == s->allocating_acb) {
+        s->allocating_acb = NULL;
+        if (!qemu_co_queue_empty(&s->allocating_write_reqs)) {
+            qemu_co_enter_next(&s->allocating_write_reqs);
         } else if (s->header.features & QED_F_NEED_CHECK) {
             qed_start_need_check_timer(s);
         }
@@ -1006,15 +911,21 @@ static void qed_aio_complete(QEDAIOCB *acb, int ret)
 }
 
 /**
- * Commit the current L2 table to the cache
+ * Update L1 table with new L2 table offset and write it out
  */
-static void qed_commit_l2_update(void *opaque, int ret)
+static int coroutine_fn qed_aio_write_l1_update(QEDAIOCB *acb)
 {
-    QEDAIOCB *acb = opaque;
     BDRVQEDState *s = acb_to_s(acb);
     CachedL2Table *l2_table = acb->request.l2_table;
     uint64_t l2_offset = l2_table->offset;
+    int index, ret;
+
+    index = qed_l1_index(s, acb->cur_pos);
+    s->l1_table->offsets[index] = l2_table->offset;
+    ret = qed_write_l1_table(s, index, 1);
 
+    /* Commit the current L2 table to the cache */
     qed_commit_l2_cache_entry(&s->l2_cache, l2_table);
 
     /* This is guaranteed to succeed because we just committed the entry to the
@@ -1023,41 +934,18 @@ static void qed_commit_l2_update(void *opaque, int ret)
     acb->request.l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset);
     assert(acb->request.l2_table != NULL);
 
-    qed_aio_next_io(acb, ret);
+    return ret;
 }
 
-/**
- * Update L1 table with new L2 table offset and write it out
- */
-static void qed_aio_write_l1_update(void *opaque, int ret)
-{
-    QEDAIOCB *acb = opaque;
-    BDRVQEDState *s = acb_to_s(acb);
-    int index;
-
-    if (ret) {
-        qed_aio_complete(acb, ret);
-        return;
-    }
-
-    index = qed_l1_index(s, acb->cur_pos);
-    s->l1_table->offsets[index] = acb->request.l2_table->offset;
-
-    qed_write_l1_table(s, index, 1, qed_commit_l2_update, acb);
-}
-
 /**
  * Update L2 table with new cluster offsets and write them out
  */
-static void qed_aio_write_l2_update(QEDAIOCB *acb, int ret, uint64_t offset)
+static int coroutine_fn qed_aio_write_l2_update(QEDAIOCB *acb, uint64_t offset)
 {
     BDRVQEDState *s = acb_to_s(acb);
     bool need_alloc = acb->find_cluster_ret == QED_CLUSTER_L1;
-    int index;
-
-    if (ret) {
-        goto err;
-    }
+    int index, ret;
 
     if (need_alloc) {
         qed_unref_l2_cache_entry(acb->request.l2_table);
@@ -1070,115 +958,99 @@ static void qed_aio_write_l2_update(QEDAIOCB *acb, int ret, uint64_t offset)
 
     if (need_alloc) {
         /* Write out the whole new L2 table */
-        qed_write_l2_table(s, &acb->request, 0, s->table_nelems, true,
-                           qed_aio_write_l1_update, acb);
+        ret = qed_write_l2_table(s, &acb->request, 0, s->table_nelems, true);
+        if (ret) {
+            return ret;
+        }
+        return qed_aio_write_l1_update(acb);
     } else {
         /* Write out only the updated part of the L2 table */
-        qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters, false,
-                           qed_aio_next_io_cb, acb);
-    }
-    return;
-
-err:
-    qed_aio_complete(acb, ret);
-}
-
-static void qed_aio_write_l2_update_cb(void *opaque, int ret)
-{
-    QEDAIOCB *acb = opaque;
-    qed_aio_write_l2_update(acb, ret, acb->cur_cluster);
-}
-
-/**
- * Flush new data clusters before updating the L2 table
- *
- * This flush is necessary when a backing file is in use.  A crash during an
- * allocating write could result in empty clusters in the image.  If the write
- * only touched a subregion of the cluster, then backing image sectors have
- * been lost in the untouched region.  The solution is to flush after writing a
- * new data cluster and before updating the L2 table.
- */
-static void qed_aio_write_flush_before_l2_update(void *opaque, int ret)
-{
-    QEDAIOCB *acb = opaque;
-    BDRVQEDState *s = acb_to_s(acb);
-
-    if (!bdrv_aio_flush(s->bs->file->bs, qed_aio_write_l2_update_cb, opaque)) {
-        qed_aio_complete(acb, -EIO);
+        ret = qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters,
+                                 false);
+        if (ret) {
+            return ret;
+        }
     }
+
+    return 0;
 }
 
 /**
  * Write data to the image file
  */
-static void qed_aio_write_main(void *opaque, int ret)
+static int coroutine_fn qed_aio_write_main(QEDAIOCB *acb)
 {
-    QEDAIOCB *acb = opaque;
     BDRVQEDState *s = acb_to_s(acb);
     uint64_t offset = acb->cur_cluster +
                       qed_offset_into_cluster(s, acb->cur_pos);
-    BlockCompletionFunc *next_fn;
+    int ret;
 
-    trace_qed_aio_write_main(s, acb, ret, offset, acb->cur_qiov.size);
+    trace_qed_aio_write_main(s, acb, 0, offset, acb->cur_qiov.size);
 
-    if (ret) {
-        qed_aio_complete(acb, ret);
-        return;
+    BLKDBG_EVENT(s->bs->file, BLKDBG_WRITE_AIO);
+    ret = bdrv_co_pwritev(s->bs->file, offset, acb->cur_qiov.size,
+                          &acb->cur_qiov, 0);
+    if (ret < 0) {
+        return ret;
     }
 
-    if (acb->find_cluster_ret == QED_CLUSTER_FOUND) {
-        next_fn = qed_aio_next_io_cb;
-    } else {
+    if (acb->find_cluster_ret != QED_CLUSTER_FOUND) {
         if (s->bs->backing) {
-            next_fn = qed_aio_write_flush_before_l2_update;
-        } else {
-            next_fn = qed_aio_write_l2_update_cb;
+            /*
+             * Flush new data clusters before updating the L2 table
+             *
+             * This flush is necessary when a backing file is in use.  A crash
+             * during an allocating write could result in empty clusters in the
+             * image.  If the write only touched a subregion of the cluster,
+             * then backing image sectors have been lost in the untouched
+             * region.  The solution is to flush after writing a new data
+             * cluster and before updating the L2 table.
+             */
+            ret = bdrv_co_flush(s->bs->file->bs);
+            if (ret < 0) {
+                return ret;
+            }
+        }
+        ret = qed_aio_write_l2_update(acb, acb->cur_cluster);
+        if (ret < 0) {
+            return ret;
         }
     }
-
-    BLKDBG_EVENT(s->bs->file, BLKDBG_WRITE_AIO);
-    bdrv_aio_writev(s->bs->file, offset / BDRV_SECTOR_SIZE,
-                    &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE,
-                    next_fn, acb);
+    return 0;
 }
 
 /**
- * Populate back untouched region of new data cluster
+ * Populate untouched regions of new data cluster
  */
-static void qed_aio_write_postfill(void *opaque, int ret)
+static int coroutine_fn qed_aio_write_cow(QEDAIOCB *acb)
 {
-    QEDAIOCB *acb = opaque;
     BDRVQEDState *s = acb_to_s(acb);
-    uint64_t start = acb->cur_pos + acb->cur_qiov.size;
-    uint64_t len =
-        qed_start_of_cluster(s, start + s->header.cluster_size - 1) - start;
-    uint64_t offset = acb->cur_cluster +
-                      qed_offset_into_cluster(s, acb->cur_pos) +
-                      acb->cur_qiov.size;
+    uint64_t start, len, offset;
+    int ret;
 
-    if (ret) {
-        qed_aio_complete(acb, ret);
-        return;
+    /* Populate front untouched region of new data cluster */
+    start = qed_start_of_cluster(s, acb->cur_pos);
+    len = qed_offset_into_cluster(s, acb->cur_pos);
+
+    trace_qed_aio_write_prefill(s, acb, start, len, acb->cur_cluster);
+    ret = qed_copy_from_backing_file(s, start, len, acb->cur_cluster);
+    if (ret < 0) {
+        return ret;
     }
 
-    trace_qed_aio_write_postfill(s, acb, start, len, offset);
-    qed_copy_from_backing_file(s, start, len, offset,
-                               qed_aio_write_main, acb);
-}
+    /* Populate back untouched region of new data cluster */
+    start = acb->cur_pos + acb->cur_qiov.size;
+    len = qed_start_of_cluster(s, start + s->header.cluster_size - 1) - start;
+    offset = acb->cur_cluster +
+             qed_offset_into_cluster(s, acb->cur_pos) +
+             acb->cur_qiov.size;
 
-/**
- * Populate front untouched region of new data cluster
- */
-static void qed_aio_write_prefill(void *opaque, int ret)
-{
-    QEDAIOCB *acb = opaque;
-    BDRVQEDState *s = acb_to_s(acb);
-    uint64_t start = qed_start_of_cluster(s, acb->cur_pos);
-    uint64_t len = qed_offset_into_cluster(s, acb->cur_pos);
+    trace_qed_aio_write_postfill(s, acb, start, len, offset);
+    ret = qed_copy_from_backing_file(s, start, len, offset);
+    if (ret < 0) {
+        return ret;
+    }
 
-    trace_qed_aio_write_prefill(s, acb, start, len, acb->cur_cluster);
-    qed_copy_from_backing_file(s, start, len, acb->cur_cluster,
-                               qed_aio_write_postfill, acb);
+    return qed_aio_write_main(acb);
 }
 
 /**
@@ -1194,18 +1066,6 @@ static bool qed_should_set_need_check(BDRVQEDState *s)
     return !(s->header.features & QED_F_NEED_CHECK);
 }
 
-static void qed_aio_write_zero_cluster(void *opaque, int ret)
-{
-    QEDAIOCB *acb = opaque;
-
-    if (ret) {
-        qed_aio_complete(acb, ret);
-        return;
-    }
-
-    qed_aio_write_l2_update(acb, 0, 1);
-}
-
 /**
  * Write new data cluster
  *
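qed_aio_write_cow() merges the old prefill and postfill callbacks, and the regions it copies come purely from cluster arithmetic: everything from the cluster start up to the write, and everything from the end of the write up to the next cluster boundary. A self-contained check of that arithmetic (fixed 64 KiB cluster; the helpers mirror qed_start_of_cluster()/qed_offset_into_cluster() for illustration only):

```c
#include <assert.h>
#include <stdint.h>

#define CLUSTER_SIZE 65536u

static uint64_t start_of_cluster(uint64_t pos)
{
    return pos & ~(uint64_t)(CLUSTER_SIZE - 1);
}

static uint64_t offset_into_cluster(uint64_t pos)
{
    return pos & (CLUSTER_SIZE - 1);
}

int main(void)
{
    uint64_t cur_pos = 70000;   /* guest offset of the write */
    uint64_t size = 1000;       /* bytes written by the guest */

    /* Front region: from the cluster boundary up to the write's start */
    uint64_t prefill_start = start_of_cluster(cur_pos);
    uint64_t prefill_len = offset_into_cluster(cur_pos);

    /* Back region: from the write's end up to the next cluster boundary */
    uint64_t end = cur_pos + size;
    uint64_t postfill_len = start_of_cluster(end + CLUSTER_SIZE - 1) - end;

    assert(prefill_start == 65536 && prefill_len == 70000 - 65536);
    /* For a write inside one cluster, the three pieces tile it exactly */
    assert(prefill_len + size + postfill_len == CLUSTER_SIZE);
    return 0;
}
```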
@@ -1214,23 +1074,24 @@ static void qed_aio_write_zero_cluster(void *opaque, int ret)
  *
  * This path is taken when writing to previously unallocated clusters.
  */
-static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
+static int coroutine_fn qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
 {
     BDRVQEDState *s = acb_to_s(acb);
-    BlockCompletionFunc *cb;
+    int ret;
 
     /* Cancel timer when the first allocating request comes in */
-    if (QSIMPLEQ_EMPTY(&s->allocating_write_reqs)) {
+    if (s->allocating_acb == NULL) {
         qed_cancel_need_check_timer(s);
     }
 
     /* Freeze this request if another allocating write is in progress */
-    if (acb != QSIMPLEQ_FIRST(&s->allocating_write_reqs)) {
-        QSIMPLEQ_INSERT_TAIL(&s->allocating_write_reqs, acb, next);
-    }
-    if (acb != QSIMPLEQ_FIRST(&s->allocating_write_reqs) ||
-        s->allocating_write_reqs_plugged) {
-        return; /* wait for existing request to finish */
+    if (s->allocating_acb != acb || s->allocating_write_reqs_plugged) {
+        if (s->allocating_acb != NULL) {
+            qemu_co_queue_wait(&s->allocating_write_reqs, NULL);
+            assert(s->allocating_acb == NULL);
+        }
+        s->allocating_acb = acb;
+        return -EAGAIN; /* start over with looking up table entries */
     }
 
     acb->cur_nclusters = qed_bytes_to_clusters(s,
@@ -1240,22 +1101,29 @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
 
     if (acb->flags & QED_AIOCB_ZERO) {
         /* Skip ahead if the clusters are already zero */
         if (acb->find_cluster_ret == QED_CLUSTER_ZERO) {
-            qed_aio_start_io(acb);
-            return;
+            return 0;
         }
-
-        cb = qed_aio_write_zero_cluster;
     } else {
-        cb = qed_aio_write_prefill;
         acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters);
     }
 
     if (qed_should_set_need_check(s)) {
         s->header.features |= QED_F_NEED_CHECK;
-        qed_write_header(s, cb, acb);
+        ret = qed_write_header(s);
+        if (ret < 0) {
+            return ret;
+        }
+    }
+
+    if (acb->flags & QED_AIOCB_ZERO) {
+        ret = qed_aio_write_l2_update(acb, 1);
     } else {
-        cb(acb, 0);
+        ret = qed_aio_write_cow(acb);
+    }
+    if (ret < 0) {
+        return ret;
     }
+    return 0;
 }
 
 /**
@@ -1267,17 +1135,17 @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
  *
  * This path is taken when writing to already allocated clusters.
  */
-static void qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len)
+static int coroutine_fn qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset,
+                                              size_t len)
 {
     /* Allocate buffer for zero writes */
     if (acb->flags & QED_AIOCB_ZERO) {
         struct iovec *iov = acb->qiov->iov;
 
         if (!iov->iov_base) {
-            iov->iov_base = qemu_try_blockalign(acb->common.bs, iov->iov_len);
+            iov->iov_base = qemu_try_blockalign(acb->bs, iov->iov_len);
            if (iov->iov_base == NULL) {
-                qed_aio_complete(acb, -ENOMEM);
-                return;
+                return -ENOMEM;
             }
             memset(iov->iov_base, 0, iov->iov_len);
         }
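Zero writes (from bdrv_qed_co_pwrite_zeroes) are queued with iov_base == NULL; qed_aio_write_inplace() materialises a zeroed buffer only if the request really has to overwrite an already allocated cluster. The lazy-allocation idiom on its own (a sketch; plain malloc stands in for qemu_try_blockalign()):

```c
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>

/* Allocate and zero-fill the buffer on first use only; requests that never
 * reach the in-place write path never pay for the allocation. */
static int ensure_zero_buffer(struct iovec *iov)
{
    if (!iov->iov_base) {
        iov->iov_base = malloc(iov->iov_len);
        if (iov->iov_base == NULL) {
            return -ENOMEM;
        }
        memset(iov->iov_base, 0, iov->iov_len);
    }
    return 0;
}

int main(void)
{
    struct iovec iov = { .iov_base = NULL, .iov_len = 4096 };
    int ret = ensure_zero_buffer(&iov);

    free(iov.iov_base);
    return ret < 0;
}
```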
@@ -1288,22 +1156,19 @@ static void qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len)
     qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
 
     /* Do the actual write */
-    qed_aio_write_main(acb, 0);
+    return qed_aio_write_main(acb);
 }
 
 /**
  * Write data cluster
  *
  * @opaque:     Write request
- * @ret:        QED_CLUSTER_FOUND, QED_CLUSTER_L2, QED_CLUSTER_L1,
- *              or -errno
+ * @ret:        QED_CLUSTER_FOUND, QED_CLUSTER_L2 or QED_CLUSTER_L1
  * @offset:     Cluster offset in bytes
  * @len:        Length in bytes
- *
- * Callback from qed_find_cluster().
  */
-static void qed_aio_write_data(void *opaque, int ret,
-                               uint64_t offset, size_t len)
+static int coroutine_fn qed_aio_write_data(void *opaque, int ret,
+                                           uint64_t offset, size_t len)
 {
     QEDAIOCB *acb = opaque;
 
@@ -1313,18 +1178,15 @@ static void qed_aio_write_data(void *opaque, int ret,
 
     switch (ret) {
     case QED_CLUSTER_FOUND:
-        qed_aio_write_inplace(acb, offset, len);
-        break;
+        return qed_aio_write_inplace(acb, offset, len);
 
     case QED_CLUSTER_L2:
     case QED_CLUSTER_L1:
     case QED_CLUSTER_ZERO:
-        qed_aio_write_alloc(acb, len);
-        break;
+        return qed_aio_write_alloc(acb, len);
 
     default:
-        qed_aio_complete(acb, ret);
-        break;
+        g_assert_not_reached();
     }
 }
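qed_aio_write_data() now returns the handler's status instead of continuing a callback chain, and lookup errors no longer reach the switch: qed_aio_next_io() checks qed_find_cluster()'s return value first, so only valid classifications arrive here. The dispatch in miniature (invented CLUSTER_* constants mirroring the QED_CLUSTER_* values):

```c
#include <assert.h>

enum { CLUSTER_FOUND, CLUSTER_L2, CLUSTER_L1, CLUSTER_ZERO };

static int write_inplace(void) { return 0; } /* cluster already allocated */
static int write_alloc(void)   { return 0; } /* allocate + COW + table update */

static int write_data(int find_ret)
{
    switch (find_ret) {
    case CLUSTER_FOUND:
        return write_inplace();
    case CLUSTER_L2:
    case CLUSTER_L1:
    case CLUSTER_ZERO:
        return write_alloc();
    default:
        /* lookup errors are filtered out by the caller before dispatching */
        assert(0);
        return -1;
    }
}

int main(void)
{
    return write_data(CLUSTER_L2);
}
```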
@@ -1332,166 +1194,139 @@ static void qed_aio_write_data(void *opaque, int ret,
  * Read data cluster
  *
  * @opaque:     Read request
- * @ret:        QED_CLUSTER_FOUND, QED_CLUSTER_L2, QED_CLUSTER_L1,
- *              or -errno
+ * @ret:        QED_CLUSTER_FOUND, QED_CLUSTER_L2 or QED_CLUSTER_L1
  * @offset:     Cluster offset in bytes
  * @len:        Length in bytes
- *
- * Callback from qed_find_cluster().
  */
-static void qed_aio_read_data(void *opaque, int ret,
-                              uint64_t offset, size_t len)
+static int coroutine_fn qed_aio_read_data(void *opaque, int ret,
                                          uint64_t offset, size_t len)
 {
     QEDAIOCB *acb = opaque;
     BDRVQEDState *s = acb_to_s(acb);
-    BlockDriverState *bs = acb->common.bs;
+    BlockDriverState *bs = acb->bs;
 
     /* Adjust offset into cluster */
     offset += qed_offset_into_cluster(s, acb->cur_pos);
 
     trace_qed_aio_read_data(s, acb, ret, offset, len);
 
-    if (ret < 0) {
-        goto err;
-    }
-
     qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
 
     /* Handle zero cluster and backing file reads */
     if (ret == QED_CLUSTER_ZERO) {
         qemu_iovec_memset(&acb->cur_qiov, 0, 0, acb->cur_qiov.size);
-        qed_aio_start_io(acb);
-        return;
+        return 0;
     } else if (ret != QED_CLUSTER_FOUND) {
-        qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov,
-                              &acb->backing_qiov, qed_aio_next_io_cb, acb);
-        return;
+        return qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov,
+                                     &acb->backing_qiov);
     }
 
     BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
-    bdrv_aio_readv(bs->file, offset / BDRV_SECTOR_SIZE,
-                   &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE,
-                   qed_aio_next_io_cb, acb);
-    return;
-
-err:
-    qed_aio_complete(acb, ret);
+    ret = bdrv_co_preadv(bs->file, offset, acb->cur_qiov.size,
+                         &acb->cur_qiov, 0);
+    if (ret < 0) {
+        return ret;
+    }
+    return 0;
 }
 
 /**
  * Begin next I/O or complete the request
  */
-static void qed_aio_next_io(QEDAIOCB *acb, int ret)
+static int coroutine_fn qed_aio_next_io(QEDAIOCB *acb)
 {
     BDRVQEDState *s = acb_to_s(acb);
-    QEDFindClusterFunc *io_fn = (acb->flags & QED_AIOCB_WRITE) ?
-                                qed_aio_write_data : qed_aio_read_data;
+    uint64_t offset;
+    size_t len;
+    int ret;
 
-    trace_qed_aio_next_io(s, acb, ret, acb->cur_pos + acb->cur_qiov.size);
+    while (1) {
+        trace_qed_aio_next_io(s, acb, 0, acb->cur_pos + acb->cur_qiov.size);
 
-    if (acb->backing_qiov) {
-        qemu_iovec_destroy(acb->backing_qiov);
-        g_free(acb->backing_qiov);
-        acb->backing_qiov = NULL;
-    }
+        if (acb->backing_qiov) {
+            qemu_iovec_destroy(acb->backing_qiov);
+            g_free(acb->backing_qiov);
+            acb->backing_qiov = NULL;
+        }
 
-    /* Handle I/O error */
-    if (ret) {
-        qed_aio_complete(acb, ret);
-        return;
-    }
+        acb->qiov_offset += acb->cur_qiov.size;
+        acb->cur_pos += acb->cur_qiov.size;
+        qemu_iovec_reset(&acb->cur_qiov);
 
-    acb->qiov_offset += acb->cur_qiov.size;
-    acb->cur_pos += acb->cur_qiov.size;
-    qemu_iovec_reset(&acb->cur_qiov);
+        /* Complete request */
+        if (acb->cur_pos >= acb->end_pos) {
+            ret = 0;
+            break;
+        }
 
-    /* Complete request */
-    if (acb->cur_pos >= acb->end_pos) {
-        qed_aio_complete(acb, 0);
-        return;
+        /* Find next cluster and start I/O */
+        len = acb->end_pos - acb->cur_pos;
+        ret = qed_find_cluster(s, &acb->request, acb->cur_pos, &len, &offset);
+        if (ret < 0) {
+            break;
+        }
+
+        if (acb->flags & QED_AIOCB_WRITE) {
+            ret = qed_aio_write_data(acb, ret, offset, len);
+        } else {
+            ret = qed_aio_read_data(acb, ret, offset, len);
+        }
+
+        if (ret < 0 && ret != -EAGAIN) {
+            break;
+        }
     }
 
-    /* Find next cluster and start I/O */
-    qed_find_cluster(s, &acb->request,
-                     acb->cur_pos, acb->end_pos - acb->cur_pos,
-                     io_fn, acb);
+    trace_qed_aio_complete(s, acb, ret);
+    qed_aio_complete(acb);
+    return ret;
 }
 
-static BlockAIOCB *qed_aio_setup(BlockDriverState *bs,
-                                 int64_t sector_num,
-                                 QEMUIOVector *qiov, int nb_sectors,
-                                 BlockCompletionFunc *cb,
-                                 void *opaque, int flags)
-{
-    QEDAIOCB *acb = qemu_aio_get(&qed_aiocb_info, bs, cb, opaque);
-
-    trace_qed_aio_setup(bs->opaque, acb, sector_num, nb_sectors,
-                        opaque, flags);
+static int coroutine_fn qed_co_request(BlockDriverState *bs, int64_t sector_num,
+                                       QEMUIOVector *qiov, int nb_sectors,
+                                       int flags)
+{
+    QEDAIOCB acb = {
+        .bs = bs,
+        .cur_pos = (uint64_t) sector_num * BDRV_SECTOR_SIZE,
+        .end_pos = (sector_num + nb_sectors) * BDRV_SECTOR_SIZE,
+        .qiov = qiov,
+        .flags = flags,
+    };
+    qemu_iovec_init(&acb.cur_qiov, qiov->niov);
 
-    acb->flags = flags;
-    acb->qiov = qiov;
-    acb->qiov_offset = 0;
-    acb->cur_pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE;
-    acb->end_pos = acb->cur_pos + nb_sectors * BDRV_SECTOR_SIZE;
-    acb->backing_qiov = NULL;
-    acb->request.l2_table = NULL;
-    qemu_iovec_init(&acb->cur_qiov, qiov->niov);
+    trace_qed_aio_setup(bs->opaque, &acb, sector_num, nb_sectors, NULL, flags);
 
     /* Start request */
-    qed_aio_start_io(acb);
-    return &acb->common;
+    return qed_aio_next_io(&acb);
 }
 
-static BlockAIOCB *bdrv_qed_aio_readv(BlockDriverState *bs,
-                                      int64_t sector_num,
-                                      QEMUIOVector *qiov, int nb_sectors,
-                                      BlockCompletionFunc *cb,
-                                      void *opaque)
+static int coroutine_fn bdrv_qed_co_readv(BlockDriverState *bs,
+                                          int64_t sector_num, int nb_sectors,
+                                          QEMUIOVector *qiov)
 {
-    return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
+    return qed_co_request(bs, sector_num, qiov, nb_sectors, 0);
 }
 
-static BlockAIOCB *bdrv_qed_aio_writev(BlockDriverState *bs,
-                                       int64_t sector_num,
-                                       QEMUIOVector *qiov, int nb_sectors,
-                                       BlockCompletionFunc *cb,
-                                       void *opaque)
-{
-    return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb,
-                         opaque, QED_AIOCB_WRITE);
-}
-
-typedef struct {
-    Coroutine *co;
-    int ret;
-    bool done;
-} QEDWriteZeroesCB;
-
-static void coroutine_fn qed_co_pwrite_zeroes_cb(void *opaque, int ret)
+static int coroutine_fn bdrv_qed_co_writev(BlockDriverState *bs,
+                                           int64_t sector_num, int nb_sectors,
+                                           QEMUIOVector *qiov)
 {
-    QEDWriteZeroesCB *cb = opaque;
-
-    cb->done = true;
-    cb->ret = ret;
-    if (cb->co) {
-        aio_co_wake(cb->co);
-    }
+    return qed_co_request(bs, sector_num, qiov, nb_sectors, QED_AIOCB_WRITE);
 }
 
 static int coroutine_fn bdrv_qed_co_pwrite_zeroes(BlockDriverState *bs,
                                                   int64_t offset,
-                                                  int count,
+                                                  int bytes,
                                                   BdrvRequestFlags flags)
 {
-    BlockAIOCB *blockacb;
     BDRVQEDState *s = bs->opaque;
-    QEDWriteZeroesCB cb = { .done = false };
     QEMUIOVector qiov;
     struct iovec iov;
 
     /* Fall back if the request is not aligned */
     if (qed_offset_into_cluster(s, offset) ||
-        qed_offset_into_cluster(s, count)) {
+        qed_offset_into_cluster(s, bytes)) {
         return -ENOTSUP;
     }
 
@@ -1499,22 +1334,12 @@ static int coroutine_fn bdrv_qed_co_pwrite_zeroes(BlockDriverState *bs,
      * then it will be allocated during request processing.
      */
     iov.iov_base = NULL;
-    iov.iov_len = count;
+    iov.iov_len = bytes;
 
     qemu_iovec_init_external(&qiov, &iov, 1);
-    blockacb = qed_aio_setup(bs, offset >> BDRV_SECTOR_BITS, &qiov,
-                             count >> BDRV_SECTOR_BITS,
-                             qed_co_pwrite_zeroes_cb, &cb,
-                             QED_AIOCB_WRITE | QED_AIOCB_ZERO);
-    if (!blockacb) {
-        return -EIO;
-    }
-    if (!cb.done) {
-        cb.co = qemu_coroutine_self();
-        qemu_coroutine_yield();
-    }
-    assert(cb.done);
-    return cb.ret;
+    return qed_co_request(bs, offset >> BDRV_SECTOR_BITS, &qiov,
+                          bytes >> BDRV_SECTOR_BITS,
+                          QED_AIOCB_WRITE | QED_AIOCB_ZERO);
 }
 
 static int bdrv_qed_truncate(BlockDriverState *bs, int64_t offset, Error **errp)
@@ -1710,8 +1535,8 @@ static BlockDriver bdrv_qed = {
     .bdrv_create              = bdrv_qed_create,
     .bdrv_has_zero_init       = bdrv_has_zero_init_1,
     .bdrv_co_get_block_status = bdrv_qed_co_get_block_status,
-    .bdrv_aio_readv           = bdrv_qed_aio_readv,
-    .bdrv_aio_writev          = bdrv_qed_aio_writev,
+    .bdrv_co_readv            = bdrv_qed_co_readv,
+    .bdrv_co_writev           = bdrv_qed_co_writev,
     .bdrv_co_pwrite_zeroes    = bdrv_qed_co_pwrite_zeroes,
     .bdrv_truncate            = bdrv_qed_truncate,
     .bdrv_getlength           = bdrv_qed_getlength,
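The driving loop replaces recursion-via-callbacks: qed_aio_next_io() walks the request cluster by cluster, and -EAGAIN from qed_aio_write_alloc() (returned after the request queues itself behind another allocating writer) means "retry the table lookup" rather than failure. A runnable sketch of that control flow (`handle_cluster` is a stub that asks for exactly one retry):

```c
#include <errno.h>
#include <stddef.h>
#include <stdint.h>

#define CLUSTER 65536u

static int attempts;

/* Stub: the first allocating touch of a cluster asks to start over, the way
 * qed_aio_write_alloc() returns -EAGAIN after queuing behind another writer. */
static int handle_cluster(uint64_t pos, size_t *len)
{
    *len = CLUSTER - (pos % CLUSTER);   /* bytes until the cluster boundary */
    return attempts++ == 0 ? -EAGAIN : 0;
}

static int do_request(uint64_t pos, uint64_t end)
{
    int ret;

    while (1) {
        size_t len;

        if (pos >= end) {
            ret = 0;            /* complete request */
            break;
        }
        ret = handle_cluster(pos, &len);
        if (ret == -EAGAIN) {
            continue;           /* look the table entries up again */
        }
        if (ret < 0) {
            break;              /* real errors complete the request */
        }
        pos += len;             /* advance to the next cluster */
    }
    return ret;
}

int main(void)
{
    return do_request(0, 3 * CLUSTER);
}
```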