author    Richard Henderson <richard.henderson@linaro.org>  2023-05-15 13:54:33 -0700
committer Richard Henderson <richard.henderson@linaro.org>  2023-05-15 13:54:33 -0700
commit    ab4c44d657aeca7e1da6d6dcb1741c8e7d357b8b (patch)
tree      13b05307b2c4023bf21ef5acd38e325e7569e5ac /block
parent    c095228e8a8cdf5c15bb8a47c4d069582ae017d1 (diff)
parent    01562fee5f3ad4506d57dbcf4b1903b565eceec7 (diff)
Merge tag 'block-pull-request' of https://gitlab.com/stefanha/qemu into staging
Pull request

This pull request contains Sam Li's zoned storage support in the QEMU block
layer and virtio-blk emulation.

v2:
- Sam fixed the CI failures. CI passes for me now. [Richard]

# -----BEGIN PGP SIGNATURE-----
#
# iQEzBAABCAAdFiEEhpWov9P5fNqsNXdanKSrs4Grc8gFAmRiWCgACgkQnKSrs4Gr
# c8h/7gf+MMm2cGEaf376t8HMwTc6wbXVfbmAlZrge2EXPZfFvEaxj7HClcEraOgV
# yJsGWeU6mOw4r68ICJ/4KhrY1cdv+VZym/LsMLMcFUTXFHnyX4pyU3am31FPOI4K
# +wrDYJOJhc4DkAESWGgEWiMKpuO/uUEgBmHdW+qPFCl77Yl/eP6H5uNP6nGFn55p
# QpS/l8iha7PDkc81EsrjA+e/YI0ubfNSP7+zZElhQ98354CQ0MCfmZ6h9bT+o2bu
# R7SBUj80e+2X0a1b9s/2Jz/x8l4TEsl8kr48/Q1usq3GVVkbjEgqsk6wTN13Q/4g
# CeIR7E61ZeYzmpb4tLFRIqK2Jw+NEQ==
# =Q8xW
# -----END PGP SIGNATURE-----
# gpg: Signature made Mon 15 May 2023 09:04:56 AM PDT
# gpg:                using RSA key 8695A8BFD3F97CDAAC35775A9CA4ABB381AB73C8
# gpg: Good signature from "Stefan Hajnoczi <stefanha@redhat.com>" [full]
# gpg:                 aka "Stefan Hajnoczi <stefanha@gmail.com>" [full]

* tag 'block-pull-request' of https://gitlab.com/stefanha/qemu:
  docs/zoned-storage:add zoned emulation use case
  virtio-blk: add some trace events for zoned emulation
  block: add accounting for zone append operation
  virtio-blk: add zoned storage emulation for zoned devices
  block: add some trace events for zone append
  qemu-iotests: test zone append operation
  block: introduce zone append write for zoned devices
  file-posix: add tracking of the zone write pointers
  docs/zoned-storage: add zoned device documentation
  block: add some trace events for new block layer APIs
  iotests: test new zone operations
  block: add zoned BlockDriver check to block layer
  block/raw-format: add zone operations to pass through requests
  block/block-backend: add block layer APIs resembling Linux ZonedBlockDevice ioctls
  block/file-posix: introduce helper functions for sysfs attributes
  block/block-common: add zoned device structs

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
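The centerpiece of the series is a set of zone APIs on BlockBackend. As a
minimal caller sketch of the new blk_co_zone_report() coroutine API added in
block/block-backend.c below (illustrative only, not code from this series;
the header choice and the printf-style reporting are assumptions):

    #include "qemu/osdep.h"
    #include "sysemu/block-backend.h"

    /* Report up to 16 zones starting at byte offset 0 of a zoned backend. */
    static int coroutine_fn example_report_zones(BlockBackend *blk)
    {
        BlockZoneDescriptor zones[16];
        unsigned int nr_zones = 16;    /* IN: capacity of the zones[] array */
        int ret;

        ret = blk_co_zone_report(blk, 0, &nr_zones, zones);
        if (ret < 0) {
            return ret;                /* e.g. -ENOTSUP or -ENOMEDIUM */
        }
        /* OUT: nr_zones now holds the number of descriptors actually filled */
        for (unsigned int i = 0; i < nr_zones; i++) {
            printf("zone %u: start=%" PRIu64 " wp=%" PRIu64 "\n",
                   i, zones[i].start, zones[i].wp);
        }
        return 0;
    }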
Diffstat (limited to 'block')
-rw-r--r--  block/block-backend.c  | 198
-rw-r--r--  block/file-posix.c     | 680
-rw-r--r--  block/io.c             |  68
-rw-r--r--  block/io_uring.c       |   4
-rw-r--r--  block/linux-aio.c      |   3
-rw-r--r--  block/qapi-sysemu.c    |  11
-rw-r--r--  block/qapi.c           |  18
-rw-r--r--  block/raw-format.c     |  26
-rw-r--r--  block/trace-events     |   4
9 files changed, 972 insertions, 40 deletions
diff --git a/block/block-backend.c b/block/block-backend.c
index e37d55d3e9..ca537cd0ad 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1845,6 +1845,204 @@ int coroutine_fn blk_co_flush(BlockBackend *blk)
return ret;
}
+static void coroutine_fn blk_aio_zone_report_entry(void *opaque)
+{
+ BlkAioEmAIOCB *acb = opaque;
+ BlkRwCo *rwco = &acb->rwco;
+
+ rwco->ret = blk_co_zone_report(rwco->blk, rwco->offset,
+ (unsigned int*)(uintptr_t)acb->bytes,
+ rwco->iobuf);
+ blk_aio_complete(acb);
+}
+
+BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset,
+ unsigned int *nr_zones,
+ BlockZoneDescriptor *zones,
+ BlockCompletionFunc *cb, void *opaque)
+{
+ BlkAioEmAIOCB *acb;
+ Coroutine *co;
+ IO_CODE();
+
+ blk_inc_in_flight(blk);
+ acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
+ acb->rwco = (BlkRwCo) {
+ .blk = blk,
+ .offset = offset,
+ .iobuf = zones,
+ .ret = NOT_DONE,
+ };
+ acb->bytes = (int64_t)(uintptr_t)nr_zones;
+ acb->has_returned = false;
+
+ co = qemu_coroutine_create(blk_aio_zone_report_entry, acb);
+ aio_co_enter(blk_get_aio_context(blk), co);
+
+ acb->has_returned = true;
+ if (acb->rwco.ret != NOT_DONE) {
+ replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
+ blk_aio_complete_bh, acb);
+ }
+
+ return &acb->common;
+}
+
+static void coroutine_fn blk_aio_zone_mgmt_entry(void *opaque)
+{
+ BlkAioEmAIOCB *acb = opaque;
+ BlkRwCo *rwco = &acb->rwco;
+
+ rwco->ret = blk_co_zone_mgmt(rwco->blk,
+ (BlockZoneOp)(uintptr_t)rwco->iobuf,
+ rwco->offset, acb->bytes);
+ blk_aio_complete(acb);
+}
+
+BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
+ int64_t offset, int64_t len,
+ BlockCompletionFunc *cb, void *opaque) {
+ BlkAioEmAIOCB *acb;
+ Coroutine *co;
+ IO_CODE();
+
+ blk_inc_in_flight(blk);
+ acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
+ acb->rwco = (BlkRwCo) {
+ .blk = blk,
+ .offset = offset,
+ .iobuf = (void *)(uintptr_t)op,
+ .ret = NOT_DONE,
+ };
+ acb->bytes = len;
+ acb->has_returned = false;
+
+ co = qemu_coroutine_create(blk_aio_zone_mgmt_entry, acb);
+ aio_co_enter(blk_get_aio_context(blk), co);
+
+ acb->has_returned = true;
+ if (acb->rwco.ret != NOT_DONE) {
+ replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
+ blk_aio_complete_bh, acb);
+ }
+
+ return &acb->common;
+}
+
+static void coroutine_fn blk_aio_zone_append_entry(void *opaque)
+{
+ BlkAioEmAIOCB *acb = opaque;
+ BlkRwCo *rwco = &acb->rwco;
+
+ rwco->ret = blk_co_zone_append(rwco->blk, (int64_t *)(uintptr_t)acb->bytes,
+ rwco->iobuf, rwco->flags);
+ blk_aio_complete(acb);
+}
+
+BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
+ QEMUIOVector *qiov, BdrvRequestFlags flags,
+ BlockCompletionFunc *cb, void *opaque) {
+ BlkAioEmAIOCB *acb;
+ Coroutine *co;
+ IO_CODE();
+
+ blk_inc_in_flight(blk);
+ acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
+ acb->rwco = (BlkRwCo) {
+ .blk = blk,
+ .ret = NOT_DONE,
+ .flags = flags,
+ .iobuf = qiov,
+ };
+ acb->bytes = (int64_t)(uintptr_t)offset;
+ acb->has_returned = false;
+
+ co = qemu_coroutine_create(blk_aio_zone_append_entry, acb);
+ aio_co_enter(blk_get_aio_context(blk), co);
+ acb->has_returned = true;
+ if (acb->rwco.ret != NOT_DONE) {
+ replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
+ blk_aio_complete_bh, acb);
+ }
+
+ return &acb->common;
+}
+
+/*
+ * Send a zone_report command.
+ * offset is a byte offset from the start of the device. No alignment
+ * required for offset.
+ * nr_zones is the maximum on input and the actual number reported on output.
+ */
+int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
+ unsigned int *nr_zones,
+ BlockZoneDescriptor *zones)
+{
+ int ret;
+ IO_CODE();
+
+ blk_inc_in_flight(blk); /* increase before waiting */
+ blk_wait_while_drained(blk);
+ GRAPH_RDLOCK_GUARD();
+ if (!blk_is_available(blk)) {
+ blk_dec_in_flight(blk);
+ return -ENOMEDIUM;
+ }
+ ret = bdrv_co_zone_report(blk_bs(blk), offset, nr_zones, zones);
+ blk_dec_in_flight(blk);
+ return ret;
+}
+
+/*
+ * Send a zone_management command.
+ * op is the zone operation;
+ * offset is the byte offset from the start of the zoned device;
+ * len is the maximum number of bytes the command should operate on. It
+ * should be aligned with the device zone size.
+ */
+int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
+ int64_t offset, int64_t len)
+{
+ int ret;
+ IO_CODE();
+
+ blk_inc_in_flight(blk);
+ blk_wait_while_drained(blk);
+ GRAPH_RDLOCK_GUARD();
+
+ ret = blk_check_byte_request(blk, offset, len);
+ if (ret < 0) {
+ blk_dec_in_flight(blk);
+ return ret;
+ }
+
+ ret = bdrv_co_zone_mgmt(blk_bs(blk), op, offset, len);
+ blk_dec_in_flight(blk);
+ return ret;
+}
+
+/*
+ * Send a zone_append command.
+ */
+int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
+ QEMUIOVector *qiov, BdrvRequestFlags flags)
+{
+ int ret;
+ IO_CODE();
+
+ blk_inc_in_flight(blk);
+ blk_wait_while_drained(blk);
+ GRAPH_RDLOCK_GUARD();
+ if (!blk_is_available(blk)) {
+ blk_dec_in_flight(blk);
+ return -ENOMEDIUM;
+ }
+
+ ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags);
+ blk_dec_in_flight(blk);
+ return ret;
+}
+
void blk_drain(BlockBackend *blk)
{
BlockDriverState *bs = blk_bs(blk);
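The block-backend.c hunk above pairs each coroutine API with a callback-based
wrapper (blk_aio_zone_report/mgmt/append) that smuggles the extra argument
through BlkRwCo via uintptr_t casts. A hedged sketch of how a device model
might drive one of these wrappers; the example_* names are illustrative, not
part of the series:

    #include "qemu/osdep.h"
    #include "qemu/error-report.h"
    #include "sysemu/block-backend.h"

    /* BlockCompletionFunc: called with the request's return value. */
    static void example_zone_reset_done(void *opaque, int ret)
    {
        if (ret < 0) {
            error_report("zone reset failed: %s", strerror(-ret));
        }
    }

    static void example_reset_first_zone(BlockBackend *blk, int64_t zone_size)
    {
        /* offset 0 targets the first zone; len must be zone-size aligned */
        blk_aio_zone_mgmt(blk, BLK_ZO_RESET, 0, zone_size,
                          example_zone_reset_done, NULL);
    }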
diff --git a/block/file-posix.c b/block/file-posix.c
index c7b723368e..0ab158efba 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -68,6 +68,9 @@
#include <sys/param.h>
#include <sys/syscall.h>
#include <sys/vfs.h>
+#if defined(CONFIG_BLKZONED)
+#include <linux/blkzoned.h>
+#endif
#include <linux/cdrom.h>
#include <linux/fd.h>
#include <linux/fs.h>
@@ -157,6 +160,7 @@ typedef struct BDRVRawState {
bool has_write_zeroes:1;
bool use_linux_aio:1;
bool use_linux_io_uring:1;
+ int64_t *offset; /* offset of zone append operation */
int page_cache_inconsistent; /* errno from fdatasync failure */
bool has_fallocate;
bool needs_alignment;
@@ -216,6 +220,13 @@ typedef struct RawPosixAIOData {
PreallocMode prealloc;
Error **errp;
} truncate;
+ struct {
+ unsigned int *nr_zones;
+ BlockZoneDescriptor *zones;
+ } zone_report;
+ struct {
+ unsigned long op;
+ } zone_mgmt;
};
} RawPosixAIOData;
@@ -766,6 +777,18 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
goto fail;
}
}
+#ifdef CONFIG_BLKZONED
+ /*
+ * The kernel page cache does not reliably work for writes to SWR zones
+ * of a zoned block device because it cannot guarantee the order of writes.
+ */
+ if ((bs->bl.zoned != BLK_Z_NONE) &&
+ (!(s->open_flags & O_DIRECT))) {
+ error_setg(errp, "The driver supports zoned devices, and it requires "
+ "cache.direct=on, which was not specified.");
+ return -EINVAL; /* No host kernel page cache */
+ }
+#endif
if (S_ISBLK(st.st_mode)) {
#ifdef __linux__
@@ -1202,15 +1225,91 @@ static int hdev_get_max_hw_transfer(int fd, struct stat *st)
#endif
}
-static int hdev_get_max_segments(int fd, struct stat *st)
+/*
+ * Get a sysfs attribute value as character string.
+ */
+#ifdef CONFIG_LINUX
+static int get_sysfs_str_val(struct stat *st, const char *attribute,
+ char **val) {
+ g_autofree char *sysfspath = NULL;
+ int ret;
+ size_t len;
+
+ if (!S_ISBLK(st->st_mode)) {
+ return -ENOTSUP;
+ }
+
+ sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/%s",
+ major(st->st_rdev), minor(st->st_rdev),
+ attribute);
+ ret = g_file_get_contents(sysfspath, val, &len, NULL);
+ if (ret == -1) {
+ return -ENOENT;
+ }
+
+ /* The file ends with '\n'; strip it */
+ char *p;
+ p = *val;
+ if (*(p + len - 1) == '\n') {
+ *(p + len - 1) = '\0';
+ }
+ return ret;
+}
+#endif
+
+#if defined(CONFIG_BLKZONED)
+static int get_sysfs_zoned_model(struct stat *st, BlockZoneModel *zoned)
{
+ g_autofree char *val = NULL;
+ int ret;
+
+ ret = get_sysfs_str_val(st, "zoned", &val);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (strcmp(val, "host-managed") == 0) {
+ *zoned = BLK_Z_HM;
+ } else if (strcmp(val, "host-aware") == 0) {
+ *zoned = BLK_Z_HA;
+ } else if (strcmp(val, "none") == 0) {
+ *zoned = BLK_Z_NONE;
+ } else {
+ return -ENOTSUP;
+ }
+ return 0;
+}
+#endif /* defined(CONFIG_BLKZONED) */
+
+/*
+ * Get a sysfs attribute value as a long integer.
+ */
#ifdef CONFIG_LINUX
- char buf[32];
+static long get_sysfs_long_val(struct stat *st, const char *attribute)
+{
+ g_autofree char *str = NULL;
const char *end;
- char *sysfspath = NULL;
+ long val;
+ int ret;
+
+ ret = get_sysfs_str_val(st, attribute, &str);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* The trailing '\n' was already stripped by get_sysfs_str_val(). */
+ ret = qemu_strtol(str, &end, 10, &val);
+ if (ret == 0 && end && *end == '\0') {
+ ret = val;
+ }
+ return ret;
+}
+#endif
+
+static int hdev_get_max_segments(int fd, struct stat *st)
+{
+#ifdef CONFIG_LINUX
int ret;
- int sysfd = -1;
- long max_segments;
if (S_ISCHR(st->st_mode)) {
if (ioctl(fd, SG_GET_SG_TABLESIZE, &ret) == 0) {
@@ -1218,43 +1317,175 @@ static int hdev_get_max_segments(int fd, struct stat *st)
}
return -ENOTSUP;
}
+ return get_sysfs_long_val(st, "max_segments");
+#else
+ return -ENOTSUP;
+#endif
+}
- if (!S_ISBLK(st->st_mode)) {
- return -ENOTSUP;
+#if defined(CONFIG_BLKZONED)
+/*
+ * If the reset_all flag is true, the write pointers of all zones whose
+ * state is not read-only or offline are reset to the zone start sector.
+ * Otherwise, take the real write pointer reported by the device.
+ */
+static int get_zones_wp(BlockDriverState *bs, int fd, int64_t offset,
+ unsigned int nrz, bool reset_all)
+{
+ struct blk_zone *blkz;
+ size_t rep_size;
+ uint64_t sector = offset >> BDRV_SECTOR_BITS;
+ BlockZoneWps *wps = bs->wps;
+ unsigned int j = offset / bs->bl.zone_size;
+ unsigned int n = 0, i = 0;
+ int ret;
+ rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
+ g_autofree struct blk_zone_report *rep = NULL;
+
+ rep = g_malloc(rep_size);
+ blkz = (struct blk_zone *)(rep + 1);
+ while (n < nrz) {
+ memset(rep, 0, rep_size);
+ rep->sector = sector;
+ rep->nr_zones = nrz - n;
+
+ do {
+ ret = ioctl(fd, BLKREPORTZONE, rep);
+ } while (ret != 0 && errno == EINTR);
+ if (ret != 0) {
+ error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
+ fd, offset, errno);
+ return -errno;
+ }
+
+ if (!rep->nr_zones) {
+ break;
+ }
+
+ for (i = 0; i < rep->nr_zones; ++i, ++n, ++j) {
+ /*
+ * The wp tracking cares only about sequential writes required and
+ * sequential write preferred zones so that the wp can advance to
+ * the right location.
+ * Use the most significant bit of the wp location to indicate the
+ * zone type: 0 for SWR/SWP zones and 1 for conventional zones.
+ */
+ if (blkz[i].type == BLK_ZONE_TYPE_CONVENTIONAL) {
+ wps->wp[j] |= 1ULL << 63;
+ } else {
+ switch(blkz[i].cond) {
+ case BLK_ZONE_COND_FULL:
+ case BLK_ZONE_COND_READONLY:
+ /* Zone not writable */
+ wps->wp[j] = (blkz[i].start + blkz[i].len) << BDRV_SECTOR_BITS;
+ break;
+ case BLK_ZONE_COND_OFFLINE:
+ /* Zone not writable nor readable */
+ wps->wp[j] = (blkz[i].start) << BDRV_SECTOR_BITS;
+ break;
+ default:
+ if (reset_all) {
+ wps->wp[j] = blkz[i].start << BDRV_SECTOR_BITS;
+ } else {
+ wps->wp[j] = blkz[i].wp << BDRV_SECTOR_BITS;
+ }
+ break;
+ }
+ }
+ }
+ sector = blkz[i - 1].start + blkz[i - 1].len;
}
- sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
- major(st->st_rdev), minor(st->st_rdev));
- sysfd = open(sysfspath, O_RDONLY);
- if (sysfd == -1) {
- ret = -errno;
- goto out;
+ return 0;
+}
+
+static void update_zones_wp(BlockDriverState *bs, int fd, int64_t offset,
+ unsigned int nrz)
+{
+ if (get_zones_wp(bs, fd, offset, nrz, 0) < 0) {
+ error_report("update zone wp failed");
+ }
+}
+
+static void raw_refresh_zoned_limits(BlockDriverState *bs, struct stat *st,
+ Error **errp)
+{
+ BDRVRawState *s = bs->opaque;
+ BlockZoneModel zoned;
+ int ret;
+
+ bs->bl.zoned = BLK_Z_NONE;
+
+ ret = get_sysfs_zoned_model(st, &zoned);
+ if (ret < 0 || zoned == BLK_Z_NONE) {
+ return;
+ }
+ bs->bl.zoned = zoned;
+
+ ret = get_sysfs_long_val(st, "max_open_zones");
+ if (ret >= 0) {
+ bs->bl.max_open_zones = ret;
+ }
+
+ ret = get_sysfs_long_val(st, "max_active_zones");
+ if (ret >= 0) {
+ bs->bl.max_active_zones = ret;
}
- ret = RETRY_ON_EINTR(read(sysfd, buf, sizeof(buf) - 1));
+
+ /*
+ * The zoned device must at least have zone size and nr_zones fields.
+ */
+ ret = get_sysfs_long_val(st, "chunk_sectors");
if (ret < 0) {
- ret = -errno;
- goto out;
- } else if (ret == 0) {
- ret = -EIO;
- goto out;
+ error_setg_errno(errp, -ret, "Unable to read chunk_sectors "
+ "sysfs attribute");
+ return;
+ } else if (!ret) {
+ error_setg(errp, "Read 0 from chunk_sectors sysfs attribute");
+ return;
}
- buf[ret] = 0;
- /* The file is ended with '\n', pass 'end' to accept that. */
- ret = qemu_strtol(buf, &end, 10, &max_segments);
- if (ret == 0 && end && *end == '\n') {
- ret = max_segments;
+ bs->bl.zone_size = ret << BDRV_SECTOR_BITS;
+
+ ret = get_sysfs_long_val(st, "nr_zones");
+ if (ret < 0) {
+ error_setg_errno(errp, -ret, "Unable to read nr_zones "
+ "sysfs attribute");
+ return;
+ } else if (!ret) {
+ error_setg(errp, "Read 0 from nr_zones sysfs attribute");
+ return;
}
+ bs->bl.nr_zones = ret;
-out:
- if (sysfd != -1) {
- close(sysfd);
+ ret = get_sysfs_long_val(st, "zone_append_max_bytes");
+ if (ret > 0) {
+ bs->bl.max_append_sectors = ret >> BDRV_SECTOR_BITS;
}
- g_free(sysfspath);
- return ret;
-#else
- return -ENOTSUP;
-#endif
+
+ ret = get_sysfs_long_val(st, "physical_block_size");
+ if (ret >= 0) {
+ bs->bl.write_granularity = ret;
+ }
+
+ /* The refresh_limits() function can be called multiple times. */
+ g_free(bs->wps);
+ bs->wps = g_malloc(sizeof(BlockZoneWps) +
+ sizeof(int64_t) * bs->bl.nr_zones);
+ ret = get_zones_wp(bs, s->fd, 0, bs->bl.nr_zones, 0);
+ if (ret < 0) {
+ error_setg_errno(errp, -ret, "report wps failed");
+ bs->wps = NULL;
+ return;
+ }
+ qemu_co_mutex_init(&bs->wps->colock);
}
+#else /* !defined(CONFIG_BLKZONED) */
+static void raw_refresh_zoned_limits(BlockDriverState *bs, struct stat *st,
+ Error **errp)
+{
+ bs->bl.zoned = BLK_Z_NONE;
+}
+#endif /* !defined(CONFIG_BLKZONED) */
static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
{
@@ -1297,6 +1528,8 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
bs->bl.max_hw_iov = ret;
}
}
+
+ raw_refresh_zoned_limits(bs, &st, errp);
}
static int check_for_dasd(int fd)
@@ -1320,9 +1553,12 @@ static int hdev_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
BDRVRawState *s = bs->opaque;
int ret;
- /* If DASD, get blocksizes */
+ /* If DASD or zoned devices, get blocksizes */
if (check_for_dasd(s->fd) < 0) {
- return -ENOTSUP;
+ /* zoned devices are not DASD */
+ if (bs->bl.zoned == BLK_Z_NONE) {
+ return -ENOTSUP;
+ }
}
ret = probe_logical_blocksize(s->fd, &bsz->log);
if (ret < 0) {
@@ -1463,7 +1699,7 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
ssize_t len;
len = RETRY_ON_EINTR(
- (aiocb->aio_type & QEMU_AIO_WRITE) ?
+ (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) ?
qemu_pwritev(aiocb->aio_fildes,
aiocb->io.iov,
aiocb->io.niov,
@@ -1492,7 +1728,7 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
ssize_t len;
while (offset < aiocb->aio_nbytes) {
- if (aiocb->aio_type & QEMU_AIO_WRITE) {
+ if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
len = pwrite(aiocb->aio_fildes,
(const char *)buf + offset,
aiocb->aio_nbytes - offset,
@@ -1585,7 +1821,7 @@ static int handle_aiocb_rw(void *opaque)
}
nbytes = handle_aiocb_rw_linear(aiocb, buf);
- if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
+ if (!(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))) {
char *p = buf;
size_t count = aiocb->aio_nbytes, copy;
int i;
@@ -1790,6 +2026,147 @@ static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd,
}
#endif
+/*
+ * parse_zone - Fill a zone descriptor
+ */
+#if defined(CONFIG_BLKZONED)
+static inline int parse_zone(struct BlockZoneDescriptor *zone,
+ const struct blk_zone *blkz) {
+ zone->start = blkz->start << BDRV_SECTOR_BITS;
+ zone->length = blkz->len << BDRV_SECTOR_BITS;
+ zone->wp = blkz->wp << BDRV_SECTOR_BITS;
+
+#ifdef HAVE_BLK_ZONE_REP_CAPACITY
+ zone->cap = blkz->capacity << BDRV_SECTOR_BITS;
+#else
+ zone->cap = blkz->len << BDRV_SECTOR_BITS;
+#endif
+
+ switch (blkz->type) {
+ case BLK_ZONE_TYPE_SEQWRITE_REQ:
+ zone->type = BLK_ZT_SWR;
+ break;
+ case BLK_ZONE_TYPE_SEQWRITE_PREF:
+ zone->type = BLK_ZT_SWP;
+ break;
+ case BLK_ZONE_TYPE_CONVENTIONAL:
+ zone->type = BLK_ZT_CONV;
+ break;
+ default:
+ error_report("Unsupported zone type: 0x%x", blkz->type);
+ return -ENOTSUP;
+ }
+
+ switch (blkz->cond) {
+ case BLK_ZONE_COND_NOT_WP:
+ zone->state = BLK_ZS_NOT_WP;
+ break;
+ case BLK_ZONE_COND_EMPTY:
+ zone->state = BLK_ZS_EMPTY;
+ break;
+ case BLK_ZONE_COND_IMP_OPEN:
+ zone->state = BLK_ZS_IOPEN;
+ break;
+ case BLK_ZONE_COND_EXP_OPEN:
+ zone->state = BLK_ZS_EOPEN;
+ break;
+ case BLK_ZONE_COND_CLOSED:
+ zone->state = BLK_ZS_CLOSED;
+ break;
+ case BLK_ZONE_COND_READONLY:
+ zone->state = BLK_ZS_RDONLY;
+ break;
+ case BLK_ZONE_COND_FULL:
+ zone->state = BLK_ZS_FULL;
+ break;
+ case BLK_ZONE_COND_OFFLINE:
+ zone->state = BLK_ZS_OFFLINE;
+ break;
+ default:
+ error_report("Unsupported zone state: 0x%x", blkz->cond);
+ return -ENOTSUP;
+ }
+ return 0;
+}
+#endif
+
+#if defined(CONFIG_BLKZONED)
+static int handle_aiocb_zone_report(void *opaque)
+{
+ RawPosixAIOData *aiocb = opaque;
+ int fd = aiocb->aio_fildes;
+ unsigned int *nr_zones = aiocb->zone_report.nr_zones;
+ BlockZoneDescriptor *zones = aiocb->zone_report.zones;
+ /* zoned block devices use 512-byte sectors */
+ uint64_t sector = aiocb->aio_offset / 512;
+
+ struct blk_zone *blkz;
+ size_t rep_size;
+ unsigned int nrz;
+ int ret;
+ unsigned int n = 0, i = 0;
+
+ nrz = *nr_zones;
+ rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
+ g_autofree struct blk_zone_report *rep = NULL;
+ rep = g_malloc(rep_size);
+
+ blkz = (struct blk_zone *)(rep + 1);
+ while (n < nrz) {
+ memset(rep, 0, rep_size);
+ rep->sector = sector;
+ rep->nr_zones = nrz - n;
+
+ do {
+ ret = ioctl(fd, BLKREPORTZONE, rep);
+ } while (ret != 0 && errno == EINTR);
+ if (ret != 0) {
+ error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
+ fd, sector, errno);
+ return -errno;
+ }
+
+ if (!rep->nr_zones) {
+ break;
+ }
+
+ for (i = 0; i < rep->nr_zones; i++, n++) {
+ ret = parse_zone(&zones[n], &blkz[i]);
+ if (ret != 0) {
+ return ret;
+ }
+
+ /* The next report should start after the last zone reported */
+ sector = blkz[i].start + blkz[i].len;
+ }
+ }
+
+ *nr_zones = n;
+ return 0;
+}
+#endif
+
+#if defined(CONFIG_BLKZONED)
+static int handle_aiocb_zone_mgmt(void *opaque)
+{
+ RawPosixAIOData *aiocb = opaque;
+ int fd = aiocb->aio_fildes;
+ uint64_t sector = aiocb->aio_offset / 512;
+ int64_t nr_sectors = aiocb->aio_nbytes / 512;
+ struct blk_zone_range range;
+ int ret;
+
+ /* Execute the operation */
+ range.sector = sector;
+ range.nr_sectors = nr_sectors;
+ do {
+ ret = ioctl(fd, aiocb->zone_mgmt.op, &range);
+ } while (ret != 0 && errno == EINTR);
+
+ return ret < 0 ? -errno : ret;
+}
+#endif
+
static int handle_aiocb_copy_range(void *opaque)
{
RawPosixAIOData *aiocb = opaque;
@@ -2072,9 +2449,19 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
{
BDRVRawState *s = bs->opaque;
RawPosixAIOData acb;
+ int ret;
if (fd_open(bs) < 0)
return -EIO;
+#if defined(CONFIG_BLKZONED)
+ if ((type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) && bs->wps) {
+ qemu_co_mutex_lock(&bs->wps->colock);
+ if (type & QEMU_AIO_ZONE_APPEND && bs->bl.zone_size) {
+ int index = offset / bs->bl.zone_size;
+ offset = bs->wps->wp[index];
+ }
+ }
+#endif
/*
* When using O_DIRECT, the request must be aligned to be able to use
@@ -2087,12 +2474,15 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
#ifdef CONFIG_LINUX_IO_URING
} else if (s->use_linux_io_uring) {
assert(qiov->size == bytes);
- return luring_co_submit(bs, s->fd, offset, qiov, type);
+ ret = luring_co_submit(bs, s->fd, offset, qiov, type);
+ goto out;
#endif
#ifdef CONFIG_LINUX_AIO
} else if (s->use_linux_aio) {
assert(qiov->size == bytes);
- return laio_co_submit(s->fd, offset, qiov, type, s->aio_max_batch);
+ ret = laio_co_submit(s->fd, offset, qiov, type,
+ s->aio_max_batch);
+ goto out;
#endif
}
@@ -2109,7 +2499,41 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
};
assert(qiov->size == bytes);
- return raw_thread_pool_submit(handle_aiocb_rw, &acb);
+ ret = raw_thread_pool_submit(handle_aiocb_rw, &acb);
+ goto out; /* Avoid an "unused label" compiler error */
+
+out:
+#if defined(CONFIG_BLKZONED)
+{
+ BlockZoneWps *wps = bs->wps;
+ if (ret == 0) {
+ if ((type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))
+ && wps && bs->bl.zone_size) {
+ uint64_t *wp = &wps->wp[offset / bs->bl.zone_size];
+ if (!BDRV_ZT_IS_CONV(*wp)) {
+ if (type & QEMU_AIO_ZONE_APPEND) {
+ *s->offset = *wp;
+ trace_zbd_zone_append_complete(bs, *s->offset
+ >> BDRV_SECTOR_BITS);
+ }
+ /* Advance the wp if needed */
+ if (offset + bytes > *wp) {
+ *wp = offset + bytes;
+ }
+ }
+ }
+ } else {
+ if (type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
+ update_zones_wp(bs, s->fd, 0, 1);
+ }
+ }
+
+ if ((type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) && wps) {
+ qemu_co_mutex_unlock(&wps->colock);
+ }
+}
+#endif
+ return ret;
}
static int coroutine_fn raw_co_preadv(BlockDriverState *bs, int64_t offset,
@@ -2212,6 +2636,9 @@ static void raw_close(BlockDriverState *bs)
BDRVRawState *s = bs->opaque;
if (s->fd >= 0) {
+#if defined(CONFIG_BLKZONED)
+ g_free(bs->wps);
+#endif
qemu_close(s->fd);
s->fd = -1;
}
@@ -2969,6 +3396,171 @@ static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret)
}
}
+/*
+ * zone report - Get a zoned block device's information in the form
+ * of an array of zone descriptors.
+ * zones is an array of zone descriptors to hold zone information on reply;
+ * offset can be any byte within the entire size of the device;
+ * nr_zones is the maximum number of zones the command should operate on.
+ */
+#if defined(CONFIG_BLKZONED)
+static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset,
+ unsigned int *nr_zones,
+ BlockZoneDescriptor *zones) {
+ BDRVRawState *s = bs->opaque;
+ RawPosixAIOData acb = (RawPosixAIOData) {
+ .bs = bs,
+ .aio_fildes = s->fd,
+ .aio_type = QEMU_AIO_ZONE_REPORT,
+ .aio_offset = offset,
+ .zone_report = {
+ .nr_zones = nr_zones,
+ .zones = zones,
+ },
+ };
+
+ trace_zbd_zone_report(bs, *nr_zones, offset >> BDRV_SECTOR_BITS);
+ return raw_thread_pool_submit(handle_aiocb_zone_report, &acb);
+}
+#endif
+
+/*
+ * zone management operations - Execute an operation on a zone
+ */
+#if defined(CONFIG_BLKZONED)
+static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
+ int64_t offset, int64_t len) {
+ BDRVRawState *s = bs->opaque;
+ RawPosixAIOData acb;
+ int64_t zone_size, zone_size_mask;
+ const char *op_name;
+ unsigned long zo;
+ int ret;
+ BlockZoneWps *wps = bs->wps;
+ int64_t capacity = bs->total_sectors << BDRV_SECTOR_BITS;
+
+ zone_size = bs->bl.zone_size;
+ zone_size_mask = zone_size - 1;
+ if (offset & zone_size_mask) {
+ error_report("sector offset %" PRId64 " is not aligned to zone size "
+ "%" PRId64 "", offset / 512, zone_size / 512);
+ return -EINVAL;
+ }
+
+ if (((offset + len) < capacity && len & zone_size_mask) ||
+ offset + len > capacity) {
+ error_report("number of sectors %" PRId64 " is not aligned to zone size"
+ " %" PRId64 "", len / 512, zone_size / 512);
+ return -EINVAL;
+ }
+
+ uint32_t i = offset / bs->bl.zone_size;
+ uint32_t nrz = len / bs->bl.zone_size;
+ uint64_t *wp = &wps->wp[i];
+ if (BDRV_ZT_IS_CONV(*wp) && len != capacity) {
+ error_report("zone mgmt operations are not allowed for conventional zones");
+ return -EIO;
+ }
+
+ switch (op) {
+ case BLK_ZO_OPEN:
+ op_name = "BLKOPENZONE";
+ zo = BLKOPENZONE;
+ break;
+ case BLK_ZO_CLOSE:
+ op_name = "BLKCLOSEZONE";
+ zo = BLKCLOSEZONE;
+ break;
+ case BLK_ZO_FINISH:
+ op_name = "BLKFINISHZONE";
+ zo = BLKFINISHZONE;
+ break;
+ case BLK_ZO_RESET:
+ op_name = "BLKRESETZONE";
+ zo = BLKRESETZONE;
+ break;
+ default:
+ error_report("Unsupported zone op: 0x%x", op);
+ return -ENOTSUP;
+ }
+
+ acb = (RawPosixAIOData) {
+ .bs = bs,
+ .aio_fildes = s->fd,
+ .aio_type = QEMU_AIO_ZONE_MGMT,
+ .aio_offset = offset,
+ .aio_nbytes = len,
+ .zone_mgmt = {
+ .op = zo,
+ },
+ };
+
+ trace_zbd_zone_mgmt(bs, op_name, offset >> BDRV_SECTOR_BITS,
+ len >> BDRV_SECTOR_BITS);
+ ret = raw_thread_pool_submit(handle_aiocb_zone_mgmt, &acb);
+ if (ret != 0) {
+ update_zones_wp(bs, s->fd, offset, i);
+ error_report("ioctl %s failed %d", op_name, ret);
+ return ret;
+ }
+
+ if (zo == BLKRESETZONE && len == capacity) {
+ ret = get_zones_wp(bs, s->fd, 0, bs->bl.nr_zones, 1);
+ if (ret < 0) {
+ error_report("reporting single wp failed");
+ return ret;
+ }
+ } else if (zo == BLKRESETZONE) {
+ for (unsigned int j = 0; j < nrz; ++j) {
+ wp[j] = offset + j * zone_size;
+ }
+ } else if (zo == BLKFINISHZONE) {
+ for (unsigned int j = 0; j < nrz; ++j) {
+ /* The zoned device allows the last zone to be smaller than
+ * the zone size. */
+ wp[j] = MIN(offset + (j + 1) * zone_size, offset + len);
+ }
+ }
+
+ return ret;
+}
+#endif
+
+#if defined(CONFIG_BLKZONED)
+static int coroutine_fn raw_co_zone_append(BlockDriverState *bs,
+ int64_t *offset,
+ QEMUIOVector *qiov,
+ BdrvRequestFlags flags) {
+ assert(flags == 0);
+ int64_t zone_size_mask = bs->bl.zone_size - 1;
+ int64_t iov_len = 0;
+ int64_t len = 0;
+ BDRVRawState *s = bs->opaque;
+ s->offset = offset;
+
+ if (*offset & zone_size_mask) {
+ error_report("sector offset %" PRId64 " is not aligned to zone size "
+ "%" PRId32 "", *offset / 512, bs->bl.zone_size / 512);
+ return -EINVAL;
+ }
+
+ int64_t wg = bs->bl.write_granularity;
+ int64_t wg_mask = wg - 1;
+ for (int i = 0; i < qiov->niov; i++) {
+ iov_len = qiov->iov[i].iov_len;
+ if (iov_len & wg_mask) {
+ error_report("len of IOVector[%d] %" PRId64 " is not aligned to "
+ "block size %" PRId64 "", i, iov_len, wg);
+ return -EINVAL;
+ }
+ len += iov_len;
+ }
+
+ trace_zbd_zone_append(bs, *offset >> BDRV_SECTOR_BITS);
+ return raw_co_prw(bs, *offset, len, qiov, QEMU_AIO_ZONE_APPEND);
+}
+#endif
+
static coroutine_fn int
raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
bool blkdev)
@@ -3724,6 +4316,14 @@ static BlockDriver bdrv_host_device = {
#ifdef __linux__
.bdrv_co_ioctl = hdev_co_ioctl,
#endif
+
+ /* zoned device */
+#if defined(CONFIG_BLKZONED)
+ /* zone management operations */
+ .bdrv_co_zone_report = raw_co_zone_report,
+ .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
+ .bdrv_co_zone_append = raw_co_zone_append,
+#endif
};
#if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
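The file-posix.c changes above factor sysfs access into get_sysfs_str_val()
and get_sysfs_long_val(), which read /sys/dev/block/<major>:<minor>/queue/
<attribute>. A stand-alone sketch of the same idea without the glib/QEMU
helpers, with simplified error handling (illustrative only):

    #include <stdio.h>
    #include <sys/stat.h>
    #include <sys/sysmacros.h>   /* major(), minor() */

    /* Return a queue attribute of the block device behind st, or -1. */
    static long sysfs_queue_attr(const struct stat *st, const char *attr)
    {
        char path[128];
        long val;
        FILE *f;

        snprintf(path, sizeof(path), "/sys/dev/block/%u:%u/queue/%s",
                 major(st->st_rdev), minor(st->st_rdev), attr);
        f = fopen(path, "r");
        if (!f) {
            return -1;
        }
        if (fscanf(f, "%ld", &val) != 1) {
            val = -1;
        }
        fclose(f);
        return val;
    }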
diff --git a/block/io.c b/block/io.c
index 58557f2f96..4d54fda593 100644
--- a/block/io.c
+++ b/block/io.c
@@ -3113,6 +3113,74 @@ out:
return co.ret;
}
+int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset,
+ unsigned int *nr_zones,
+ BlockZoneDescriptor *zones)
+{
+ BlockDriver *drv = bs->drv;
+ CoroutineIOCompletion co = {
+ .coroutine = qemu_coroutine_self(),
+ };
+ IO_CODE();
+
+ bdrv_inc_in_flight(bs);
+ if (!drv || !drv->bdrv_co_zone_report || bs->bl.zoned == BLK_Z_NONE) {
+ co.ret = -ENOTSUP;
+ goto out;
+ }
+ co.ret = drv->bdrv_co_zone_report(bs, offset, nr_zones, zones);
+out:
+ bdrv_dec_in_flight(bs);
+ return co.ret;
+}
+
+int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
+ int64_t offset, int64_t len)
+{
+ BlockDriver *drv = bs->drv;
+ CoroutineIOCompletion co = {
+ .coroutine = qemu_coroutine_self(),
+ };
+ IO_CODE();
+
+ bdrv_inc_in_flight(bs);
+ if (!drv || !drv->bdrv_co_zone_mgmt || bs->bl.zoned == BLK_Z_NONE) {
+ co.ret = -ENOTSUP;
+ goto out;
+ }
+ co.ret = drv->bdrv_co_zone_mgmt(bs, op, offset, len);
+out:
+ bdrv_dec_in_flight(bs);
+ return co.ret;
+}
+
+int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset,
+ QEMUIOVector *qiov,
+ BdrvRequestFlags flags)
+{
+ int ret;
+ BlockDriver *drv = bs->drv;
+ CoroutineIOCompletion co = {
+ .coroutine = qemu_coroutine_self(),
+ };
+ IO_CODE();
+
+ ret = bdrv_check_qiov_request(*offset, qiov->size, qiov, 0, NULL);
+ if (ret < 0) {
+ return ret;
+ }
+
+ bdrv_inc_in_flight(bs);
+ if (!drv || !drv->bdrv_co_zone_append || bs->bl.zoned == BLK_Z_NONE) {
+ co.ret = -ENOTSUP;
+ goto out;
+ }
+ co.ret = drv->bdrv_co_zone_append(bs, offset, qiov, flags);
+out:
+ bdrv_dec_in_flight(bs);
+ return co.ret;
+}
+
void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
IO_CODE();
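bdrv_co_zone_append() above takes offset by pointer because zone append is a
write whose final location is chosen by the device: the caller passes the
zone's start offset and, on success, reads back where the data actually
landed. A hedged caller sketch; example_append(), appended_at, and the header
choice are illustrative:

    #include "qemu/osdep.h"
    #include "block/block-io.h"

    static int coroutine_fn example_append(BlockDriverState *bs,
                                           QEMUIOVector *qiov,
                                           int64_t zone_start,
                                           int64_t *appended_at)
    {
        int64_t offset = zone_start;   /* must be aligned to a zone start */
        int ret = bdrv_co_zone_append(bs, &offset, qiov, 0);
        if (ret < 0) {
            return ret;
        }
        *appended_at = offset;   /* where the data was actually written */
        return 0;
    }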
diff --git a/block/io_uring.c b/block/io_uring.c
index 989f9a99ed..82cab6a5bd 100644
--- a/block/io_uring.c
+++ b/block/io_uring.c
@@ -350,6 +350,10 @@ static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s,
io_uring_prep_writev(sqes, fd, luringcb->qiov->iov,
luringcb->qiov->niov, offset);
break;
+ case QEMU_AIO_ZONE_APPEND:
+ io_uring_prep_writev(sqes, fd, luringcb->qiov->iov,
+ luringcb->qiov->niov, offset);
+ break;
case QEMU_AIO_READ:
io_uring_prep_readv(sqes, fd, luringcb->qiov->iov,
luringcb->qiov->niov, offset);
diff --git a/block/linux-aio.c b/block/linux-aio.c
index fc50cdd1bf..442c86209b 100644
--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@@ -394,6 +394,9 @@ static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
case QEMU_AIO_WRITE:
io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
break;
+ case QEMU_AIO_ZONE_APPEND:
+ io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
+ break;
case QEMU_AIO_READ:
io_prep_preadv(iocbs, fd, qiov->iov, qiov->niov, offset);
break;
diff --git a/block/qapi-sysemu.c b/block/qapi-sysemu.c
index 7bd7554150..cec3c1afb4 100644
--- a/block/qapi-sysemu.c
+++ b/block/qapi-sysemu.c
@@ -517,6 +517,7 @@ void qmp_block_latency_histogram_set(
bool has_boundaries, uint64List *boundaries,
bool has_boundaries_read, uint64List *boundaries_read,
bool has_boundaries_write, uint64List *boundaries_write,
+ bool has_boundaries_append, uint64List *boundaries_append,
bool has_boundaries_flush, uint64List *boundaries_flush,
Error **errp)
{
@@ -557,6 +558,16 @@ void qmp_block_latency_histogram_set(
}
}
+ if (has_boundaries || has_boundaries_append) {
+ ret = block_latency_histogram_set(
+ stats, BLOCK_ACCT_ZONE_APPEND,
+ has_boundaries_append ? boundaries_append : boundaries);
+ if (ret) {
+ error_setg(errp, "Device '%s' set append write boundaries fail", id);
+ return;
+ }
+ }
+
if (has_boundaries || has_boundaries_flush) {
ret = block_latency_histogram_set(
stats, BLOCK_ACCT_FLUSH,
diff --git a/block/qapi.c b/block/qapi.c
index 71f2751257..f34f95e0ef 100644
--- a/block/qapi.c
+++ b/block/qapi.c
@@ -533,27 +533,36 @@ static void bdrv_query_blk_stats(BlockDeviceStats *ds, BlockBackend *blk)
ds->rd_bytes = stats->nr_bytes[BLOCK_ACCT_READ];
ds->wr_bytes = stats->nr_bytes[BLOCK_ACCT_WRITE];
+ ds->zone_append_bytes = stats->nr_bytes[BLOCK_ACCT_ZONE_APPEND];
ds->unmap_bytes = stats->nr_bytes[BLOCK_ACCT_UNMAP];
ds->rd_operations = stats->nr_ops[BLOCK_ACCT_READ];
ds->wr_operations = stats->nr_ops[BLOCK_ACCT_WRITE];
+ ds->zone_append_operations = stats->nr_ops[BLOCK_ACCT_ZONE_APPEND];
ds->unmap_operations = stats->nr_ops[BLOCK_ACCT_UNMAP];
ds->failed_rd_operations = stats->failed_ops[BLOCK_ACCT_READ];
ds->failed_wr_operations = stats->failed_ops[BLOCK_ACCT_WRITE];
+ ds->failed_zone_append_operations =
+ stats->failed_ops[BLOCK_ACCT_ZONE_APPEND];
ds->failed_flush_operations = stats->failed_ops[BLOCK_ACCT_FLUSH];
ds->failed_unmap_operations = stats->failed_ops[BLOCK_ACCT_UNMAP];
ds->invalid_rd_operations = stats->invalid_ops[BLOCK_ACCT_READ];
ds->invalid_wr_operations = stats->invalid_ops[BLOCK_ACCT_WRITE];
+ ds->invalid_zone_append_operations =
+ stats->invalid_ops[BLOCK_ACCT_ZONE_APPEND];
ds->invalid_flush_operations =
stats->invalid_ops[BLOCK_ACCT_FLUSH];
ds->invalid_unmap_operations = stats->invalid_ops[BLOCK_ACCT_UNMAP];
ds->rd_merged = stats->merged[BLOCK_ACCT_READ];
ds->wr_merged = stats->merged[BLOCK_ACCT_WRITE];
+ ds->zone_append_merged = stats->merged[BLOCK_ACCT_ZONE_APPEND];
ds->unmap_merged = stats->merged[BLOCK_ACCT_UNMAP];
ds->flush_operations = stats->nr_ops[BLOCK_ACCT_FLUSH];
ds->wr_total_time_ns = stats->total_time_ns[BLOCK_ACCT_WRITE];
+ ds->zone_append_total_time_ns =
+ stats->total_time_ns[BLOCK_ACCT_ZONE_APPEND];
ds->rd_total_time_ns = stats->total_time_ns[BLOCK_ACCT_READ];
ds->flush_total_time_ns = stats->total_time_ns[BLOCK_ACCT_FLUSH];
ds->unmap_total_time_ns = stats->total_time_ns[BLOCK_ACCT_UNMAP];
@@ -571,6 +580,7 @@ static void bdrv_query_blk_stats(BlockDeviceStats *ds, BlockBackend *blk)
TimedAverage *rd = &ts->latency[BLOCK_ACCT_READ];
TimedAverage *wr = &ts->latency[BLOCK_ACCT_WRITE];
+ TimedAverage *zap = &ts->latency[BLOCK_ACCT_ZONE_APPEND];
TimedAverage *fl = &ts->latency[BLOCK_ACCT_FLUSH];
dev_stats->interval_length = ts->interval_length;
@@ -583,6 +593,10 @@ static void bdrv_query_blk_stats(BlockDeviceStats *ds, BlockBackend *blk)
dev_stats->max_wr_latency_ns = timed_average_max(wr);
dev_stats->avg_wr_latency_ns = timed_average_avg(wr);
+ dev_stats->min_zone_append_latency_ns = timed_average_min(zap);
+ dev_stats->max_zone_append_latency_ns = timed_average_max(zap);
+ dev_stats->avg_zone_append_latency_ns = timed_average_avg(zap);
+
dev_stats->min_flush_latency_ns = timed_average_min(fl);
dev_stats->max_flush_latency_ns = timed_average_max(fl);
dev_stats->avg_flush_latency_ns = timed_average_avg(fl);
@@ -591,6 +605,8 @@ static void bdrv_query_blk_stats(BlockDeviceStats *ds, BlockBackend *blk)
block_acct_queue_depth(ts, BLOCK_ACCT_READ);
dev_stats->avg_wr_queue_depth =
block_acct_queue_depth(ts, BLOCK_ACCT_WRITE);
+ dev_stats->avg_zone_append_queue_depth =
+ block_acct_queue_depth(ts, BLOCK_ACCT_ZONE_APPEND);
QAPI_LIST_PREPEND(ds->timed_stats, dev_stats);
}
@@ -600,6 +616,8 @@ static void bdrv_query_blk_stats(BlockDeviceStats *ds, BlockBackend *blk)
= bdrv_latency_histogram_stats(&hgram[BLOCK_ACCT_READ]);
ds->wr_latency_histogram
= bdrv_latency_histogram_stats(&hgram[BLOCK_ACCT_WRITE]);
+ ds->zone_append_latency_histogram
+ = bdrv_latency_histogram_stats(&hgram[BLOCK_ACCT_ZONE_APPEND]);
ds->flush_latency_histogram
= bdrv_latency_histogram_stats(&hgram[BLOCK_ACCT_FLUSH]);
}
diff --git a/block/raw-format.c b/block/raw-format.c
index fd9e61f58e..3a3946213f 100644
--- a/block/raw-format.c
+++ b/block/raw-format.c
@@ -317,6 +317,28 @@ raw_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
return bdrv_co_pdiscard(bs->file, offset, bytes);
}
+static int coroutine_fn GRAPH_RDLOCK
+raw_co_zone_report(BlockDriverState *bs, int64_t offset,
+ unsigned int *nr_zones,
+ BlockZoneDescriptor *zones)
+{
+ return bdrv_co_zone_report(bs->file->bs, offset, nr_zones, zones);
+}
+
+static int coroutine_fn GRAPH_RDLOCK
+raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
+ int64_t offset, int64_t len)
+{
+ return bdrv_co_zone_mgmt(bs->file->bs, op, offset, len);
+}
+
+static int coroutine_fn GRAPH_RDLOCK
+raw_co_zone_append(BlockDriverState *bs, int64_t *offset, QEMUIOVector *qiov,
+ BdrvRequestFlags flags)
+{
+ return bdrv_co_zone_append(bs->file->bs, offset, qiov, flags);
+}
+
static int64_t coroutine_fn GRAPH_RDLOCK
raw_co_getlength(BlockDriverState *bs)
{
@@ -608,6 +630,7 @@ static void raw_child_perm(BlockDriverState *bs, BdrvChild *c,
BlockDriver bdrv_raw = {
.format_name = "raw",
.instance_size = sizeof(BDRVRawState),
+ .supports_zoned_children = true,
.bdrv_probe = &raw_probe,
.bdrv_reopen_prepare = &raw_reopen_prepare,
.bdrv_reopen_commit = &raw_reopen_commit,
@@ -619,6 +642,9 @@ BlockDriver bdrv_raw = {
.bdrv_co_pwritev = &raw_co_pwritev,
.bdrv_co_pwrite_zeroes = &raw_co_pwrite_zeroes,
.bdrv_co_pdiscard = &raw_co_pdiscard,
+ .bdrv_co_zone_report = &raw_co_zone_report,
+ .bdrv_co_zone_mgmt = &raw_co_zone_mgmt,
+ .bdrv_co_zone_append = &raw_co_zone_append,
.bdrv_co_block_status = &raw_co_block_status,
.bdrv_co_copy_range_from = &raw_co_copy_range_from,
.bdrv_co_copy_range_to = &raw_co_copy_range_to,
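raw-format passes the three zone callbacks straight through to bs->file->bs
and sets supports_zoned_children so the block layer will accept a zoned child
under it. The check itself is added by the "block: add zoned BlockDriver check
to block layer" commit, which lives in block.c and is outside this diffstat;
a hedged reconstruction of what such a check looks like (illustrative only):

    #include "qemu/osdep.h"
    #include "qapi/error.h"
    #include "block/block_int.h"   /* BlockDriver internals, assumed */

    /* Reject attaching a zoned child to a driver that cannot handle it. */
    static int example_check_zoned_child(BlockDriver *parent_drv,
                                         BlockDriverState *child_bs,
                                         Error **errp)
    {
        if (child_bs->bl.zoned != BLK_Z_NONE &&
            !parent_drv->supports_zoned_children) {
            error_setg(errp, "driver '%s' does not support zoned children",
                       parent_drv->format_name);
            return -EINVAL;
        }
        return 0;
    }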
diff --git a/block/trace-events b/block/trace-events
index 48dbf10c66..32665158d6 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -209,6 +209,10 @@ file_FindEjectableOpticalMedia(const char *media) "Matching using %s"
file_setup_cdrom(const char *partition) "Using %s as optical disc"
file_hdev_is_sg(int type, int version) "SG device found: type=%d, version=%d"
file_flush_fdatasync_failed(int err) "errno %d"
+zbd_zone_report(void *bs, unsigned int nr_zones, int64_t sector) "bs %p report %d zones starting at sector offset 0x%" PRIx64 ""
+zbd_zone_mgmt(void *bs, const char *op_name, int64_t sector, int64_t len) "bs %p %s starts at sector offset 0x%" PRIx64 " over a range of 0x%" PRIx64 " sectors"
+zbd_zone_append(void *bs, int64_t sector) "bs %p append at sector offset 0x%" PRIx64 ""
+zbd_zone_append_complete(void *bs, int64_t sector) "bs %p returns append sector 0x%" PRIx64 ""
# ssh.c
sftp_error(const char *op, const char *ssh_err, int ssh_err_code, int sftp_err_code) "%s failed: %s (libssh error code: %d, sftp error code: %d)"