aboutsummaryrefslogtreecommitdiff
path: root/block
diff options
context:
space:
mode:
author Sam Li <faithilikerun@gmail.com> 2023-05-08 13:15:08 +0800
committer Stefan Hajnoczi <stefanha@redhat.com> 2023-05-15 08:18:10 -0400
commit 4751d09adcc3dd76b4124f5c408055ee0940b3ee (patch)
tree a629f4ce1ef6be4c2c66995eb8b32b5d13c5dbe0 /block
parent a3c41f06d5a84bc2263e871b1e9fa4daba7edf0f (diff)
block: introduce zone append write for zoned devices
A zone append command is a write operation that specifies the first logical block of a zone as the write position. When writing to a zoned block device using zone append, the byte offset of the call may point at any position within the zone to which the data is being appended. Upon completion the device will respond with the position where the data has been written in the zone.

Signed-off-by: Sam Li <faithilikerun@gmail.com>
Reviewed-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20230508051510.177850-3-faithilikerun@gmail.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Diffstat (limited to 'block')
-rw-r--r-- block/block-backend.c 61
-rw-r--r-- block/file-posix.c 58
-rw-r--r-- block/io.c 27
-rw-r--r-- block/io_uring.c 4
-rw-r--r-- block/linux-aio.c 3
-rw-r--r-- block/raw-format.c 8
6 files changed, 154 insertions, 7 deletions
diff --git a/block/block-backend.c b/block/block-backend.c
index 4a8d5c4b23..ca537cd0ad 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1929,6 +1929,45 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
return &acb->common;
}
+/*
+ * Coroutine entry point used by blk_aio_zone_append().
+ *
+ * @opaque is the BlkAioEmAIOCB describing the request. The caller's offset
+ * pointer travels in acb->bytes as an integer; it is converted back to a
+ * pointer here and handed to blk_co_zone_append(). The result is stored in
+ * the request's rwco before blk_aio_complete() is called.
+ */
+static void coroutine_fn blk_aio_zone_append_entry(void *opaque)
+{
+    BlkAioEmAIOCB *acb = opaque;
+    BlkRwCo *rwco = &acb->rwco;
+    /* Undo the pointer-to-integer conversion done when the request was set up */
+    int64_t *offset = (int64_t *)(uintptr_t)acb->bytes;
+
+    rwco->ret = blk_co_zone_append(rwco->blk, offset,
+                                   rwco->iobuf, rwco->flags);
+    blk_aio_complete(acb);
+}
+
+/*
+ * Start an asynchronous zone append on @blk.
+ *
+ * @offset: pointer forwarded to blk_co_zone_append() by the entry coroutine
+ * @qiov:   data to append
+ * @flags:  request flags, forwarded unchanged
+ * @cb:     completion callback, registered through blk_aio_get()
+ * @opaque: argument for @cb
+ *
+ * Returns the BlockAIOCB embedded in the newly allocated request.
+ */
+BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
+                                QEMUIOVector *qiov, BdrvRequestFlags flags,
+                                BlockCompletionFunc *cb, void *opaque)
+{
+    BlkAioEmAIOCB *request;
+    Coroutine *entry;
+    IO_CODE();
+
+    blk_inc_in_flight(blk);
+    request = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
+    request->has_returned = false;
+    /* The offset pointer is carried in the integer 'bytes' field */
+    request->bytes = (int64_t)(uintptr_t)offset;
+    request->rwco = (BlkRwCo) {
+        .iobuf = qiov,
+        .flags = flags,
+        .ret = NOT_DONE,
+        .blk = blk,
+    };
+
+    entry = qemu_coroutine_create(blk_aio_zone_append_entry, request);
+    aio_co_enter(blk_get_aio_context(blk), entry);
+
+    request->has_returned = true;
+    if (request->rwco.ret != NOT_DONE) {
+        /*
+         * A result is already recorded by the time aio_co_enter() returned:
+         * schedule blk_aio_complete_bh() for this request.
+         */
+        replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
+                                         blk_aio_complete_bh, request);
+    }
+
+    return &request->common;
+}
+
/*
* Send a zone_report command.
* offset is a byte offset from the start of the device. No alignment
@@ -1982,6 +2021,28 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
return ret;
}
+/*
+ * Send a zone_append command.
+ *
+ * @offset, @qiov and @flags are forwarded unchanged to bdrv_co_zone_append()
+ * on the BlockDriverState attached to @blk.
+ *
+ * Returns -ENOMEDIUM when blk_is_available() reports that the backend is not
+ * available, otherwise the return value of bdrv_co_zone_append().
+ */
+int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
+                                    QEMUIOVector *qiov, BdrvRequestFlags flags)
+{
+    int ret = -ENOMEDIUM;
+    IO_CODE();
+
+    blk_inc_in_flight(blk);
+    blk_wait_while_drained(blk);
+    GRAPH_RDLOCK_GUARD();
+    if (blk_is_available(blk)) {
+        ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags);
+    }
+
+    /* The in-flight counter is dropped on both the success and error path */
+    blk_dec_in_flight(blk);
+    return ret;
+}
+
void blk_drain(BlockBackend *blk)
{
BlockDriverState *bs = blk_bs(blk);
diff --git a/block/file-posix.c b/block/file-posix.c
index 56f57515d4..179263fec6 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -160,6 +160,7 @@ typedef struct BDRVRawState {
bool has_write_zeroes:1;
bool use_linux_aio:1;
bool use_linux_io_uring:1;
+ int64_t *offset; /* offset of zone append operation */
int page_cache_inconsistent; /* errno from fdatasync failure */
bool has_fallocate;
bool needs_alignment;
@@ -1698,7 +1699,7 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
ssize_t len;
len = RETRY_ON_EINTR(
- (aiocb->aio_type & QEMU_AIO_WRITE) ?
+ (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) ?
qemu_pwritev(aiocb->aio_fildes,
aiocb->io.iov,
aiocb->io.niov,
@@ -1727,7 +1728,7 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
ssize_t len;
while (offset < aiocb->aio_nbytes) {
- if (aiocb->aio_type & QEMU_AIO_WRITE) {
+ if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
len = pwrite(aiocb->aio_fildes,
(const char *)buf + offset,
aiocb->aio_nbytes - offset,
@@ -1820,7 +1821,7 @@ static int handle_aiocb_rw(void *opaque)
}
nbytes = handle_aiocb_rw_linear(aiocb, buf);
- if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
+ if (!(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))) {
char *p = buf;
size_t count = aiocb->aio_nbytes, copy;
int i;
@@ -2453,8 +2454,12 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
if (fd_open(bs) < 0)
return -EIO;
#if defined(CONFIG_BLKZONED)
- if (type & QEMU_AIO_WRITE && bs->wps) {
+ if ((type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) && bs->wps) {
qemu_co_mutex_lock(&bs->wps->colock);
+ if (type & QEMU_AIO_ZONE_APPEND && bs->bl.zone_size) {
+ int index = offset / bs->bl.zone_size;
+ offset = bs->wps->wp[index];
+ }
}
#endif
@@ -2502,9 +2507,13 @@ out:
{
BlockZoneWps *wps = bs->wps;
if (ret == 0) {
- if (type & QEMU_AIO_WRITE && wps && bs->bl.zone_size) {
+ if ((type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))
+ && wps && bs->bl.zone_size) {
uint64_t *wp = &wps->wp[offset / bs->bl.zone_size];
if (!BDRV_ZT_IS_CONV(*wp)) {
+ if (type & QEMU_AIO_ZONE_APPEND) {
+ *s->offset = *wp;
+ }
/* Advance the wp if needed */
if (offset + bytes > *wp) {
*wp = offset + bytes;
@@ -2512,12 +2521,12 @@ out:
}
}
} else {
- if (type & QEMU_AIO_WRITE) {
+ if (type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
update_zones_wp(bs, s->fd, 0, 1);
}
}
- if (type & QEMU_AIO_WRITE && wps) {
+ if ((type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) && wps) {
qemu_co_mutex_unlock(&wps->colock);
}
}
@@ -3515,6 +3524,40 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
}
#endif
+#if defined(CONFIG_BLKZONED)
+/*
+ * Zone append for host zoned block devices.
+ *
+ * @offset: in: byte offset of the target zone, must be a multiple of
+ *          bs->bl.zone_size. The pointer is stored in BDRVRawState so that
+ *          raw_co_prw() can write through it on the QEMU_AIO_ZONE_APPEND path.
+ * @qiov:   data to append; every iovec length must be a multiple of
+ *          bs->bl.write_granularity
+ * @flags:  must be 0
+ *
+ * Returns -EINVAL if the offset or any iovec length is misaligned, otherwise
+ * the return value of raw_co_prw().
+ */
+static int coroutine_fn raw_co_zone_append(BlockDriverState *bs,
+                                           int64_t *offset,
+                                           QEMUIOVector *qiov,
+                                           BdrvRequestFlags flags)
+{
+    BDRVRawState *s = bs->opaque;
+    int64_t zone_size_mask = bs->bl.zone_size - 1;
+    int64_t wg = bs->bl.write_granularity;
+    int64_t wg_mask = wg - 1;
+    int64_t len = 0;
+
+    assert(flags == 0);
+
+    if (*offset & zone_size_mask) {
+        error_report("sector offset %" PRId64 " is not aligned to zone size "
+                     "%" PRId32 "", *offset / 512, bs->bl.zone_size / 512);
+        return -EINVAL;
+    }
+
+    for (int i = 0; i < qiov->niov; i++) {
+        int64_t iov_len = qiov->iov[i].iov_len;
+
+        if (iov_len & wg_mask) {
+            error_report("len of IOVector[%d] %" PRId64 " is not aligned to "
+                         "block size %" PRId64 "", i, iov_len, wg);
+            return -EINVAL;
+        }
+        len += iov_len;
+    }
+
+    /*
+     * Record the caller's pointer only once the request has been validated,
+     * so that a rejected request never leaves its pointer behind in the
+     * driver state.
+     *
+     * NOTE(review): s->offset is a single field per BDRVRawState and is set
+     * here before raw_co_prw() takes bs->wps->colock; overlapping zone append
+     * requests on the same node look like they can overwrite each other's
+     * pointer -- confirm how callers serialize these requests.
+     */
+    s->offset = offset;
+
+    return raw_co_prw(bs, *offset, len, qiov, QEMU_AIO_ZONE_APPEND);
+}
+#endif
+
static coroutine_fn int
raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
bool blkdev)
@@ -4276,6 +4319,7 @@ static BlockDriver bdrv_host_device = {
/* zone management operations */
.bdrv_co_zone_report = raw_co_zone_report,
.bdrv_co_zone_mgmt = raw_co_zone_mgmt,
+ .bdrv_co_zone_append = raw_co_zone_append,
#endif
};
diff --git a/block/io.c b/block/io.c
index 12bf90e9bc..4d54fda593 100644
--- a/block/io.c
+++ b/block/io.c
@@ -3154,6 +3154,33 @@ out:
return co.ret;
}
+/*
+ * Dispatch a zone append request to the driver of @bs.
+ *
+ * The request (*@offset, @qiov->size) is first checked with
+ * bdrv_check_qiov_request(); a negative result is returned as is.
+ *
+ * Returns -ENOTSUP if @bs has no driver, the driver does not implement
+ * .bdrv_co_zone_append, or bs->bl.zoned is BLK_Z_NONE; otherwise the
+ * driver callback's return value.
+ */
+int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset,
+                                     QEMUIOVector *qiov,
+                                     BdrvRequestFlags flags)
+{
+    int ret;
+    BlockDriver *drv = bs->drv;
+    CoroutineIOCompletion co = {
+        .coroutine = qemu_coroutine_self(),
+    };
+    IO_CODE();
+
+    ret = bdrv_check_qiov_request(*offset, qiov->size, qiov, 0, NULL);
+    if (ret < 0) {
+        return ret;
+    }
+
+    bdrv_inc_in_flight(bs);
+    if (drv && drv->bdrv_co_zone_append && bs->bl.zoned != BLK_Z_NONE) {
+        co.ret = drv->bdrv_co_zone_append(bs, offset, qiov, flags);
+    } else {
+        co.ret = -ENOTSUP;
+    }
+    bdrv_dec_in_flight(bs);
+
+    return co.ret;
+}
+
void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
IO_CODE();
diff --git a/block/io_uring.c b/block/io_uring.c
index 989f9a99ed..82cab6a5bd 100644
--- a/block/io_uring.c
+++ b/block/io_uring.c
@@ -350,6 +350,10 @@ static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s,
io_uring_prep_writev(sqes, fd, luringcb->qiov->iov,
luringcb->qiov->niov, offset);
break;
+ case QEMU_AIO_ZONE_APPEND:
+ io_uring_prep_writev(sqes, fd, luringcb->qiov->iov,
+ luringcb->qiov->niov, offset);
+ break;
case QEMU_AIO_READ:
io_uring_prep_readv(sqes, fd, luringcb->qiov->iov,
luringcb->qiov->niov, offset);
diff --git a/block/linux-aio.c b/block/linux-aio.c
index fc50cdd1bf..442c86209b 100644
--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@@ -394,6 +394,9 @@ static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
case QEMU_AIO_WRITE:
io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
break;
+ case QEMU_AIO_ZONE_APPEND:
+ io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
+ break;
case QEMU_AIO_READ:
io_prep_preadv(iocbs, fd, qiov->iov, qiov->niov, offset);
break;
diff --git a/block/raw-format.c b/block/raw-format.c
index bbb644cd95..3a3946213f 100644
--- a/block/raw-format.c
+++ b/block/raw-format.c
@@ -332,6 +332,13 @@ raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
return bdrv_co_zone_mgmt(bs->file->bs, op, offset, len);
}
+/*
+ * Zone append for the raw format driver: the request is forwarded unchanged
+ * to the node behind bs->file.
+ */
+static int coroutine_fn GRAPH_RDLOCK
+raw_co_zone_append(BlockDriverState *bs, int64_t *offset, QEMUIOVector *qiov,
+                   BdrvRequestFlags flags)
+{
+    BlockDriverState *file_bs = bs->file->bs;
+
+    return bdrv_co_zone_append(file_bs, offset, qiov, flags);
+}
+
static int64_t coroutine_fn GRAPH_RDLOCK
raw_co_getlength(BlockDriverState *bs)
{
@@ -637,6 +644,7 @@ BlockDriver bdrv_raw = {
.bdrv_co_pdiscard = &raw_co_pdiscard,
.bdrv_co_zone_report = &raw_co_zone_report,
.bdrv_co_zone_mgmt = &raw_co_zone_mgmt,
+ .bdrv_co_zone_append = &raw_co_zone_append,
.bdrv_co_block_status = &raw_co_block_status,
.bdrv_co_copy_range_from = &raw_co_copy_range_from,
.bdrv_co_copy_range_to = &raw_co_copy_range_to,