aboutsummaryrefslogtreecommitdiff
path: root/block
diff options
context:
space:
mode:
author Richard Henderson <richard.henderson@linaro.org> 2021-11-03 00:32:56 -0400
committer Richard Henderson <richard.henderson@linaro.org> 2021-11-03 00:32:56 -0400
commit 741bdeb1d5a4024a2c54c6abb2de493a27b61953 (patch)
tree e2af8a1c355cbf56913499e9ea32814d5b86ad47 /block
parent 22d5760cb43e2fe73e61fda145a98f3217ca47bf (diff)
parent a8951438946d72d74c9bdbdb38fce95aa2973a88 (diff)
Merge remote-tracking branch 'remotes/kwolf/tags/for-upstream' into staging
Block layer patches

- Fail gracefully when blockdev-snapshot creates loops
- ide: Fix IDENTIFY DEVICE for disks > 128 GiB
- file-posix: Fix return value translation for AIO discards
- file-posix: add 'aio-max-batch' option
- rbd: implement bdrv_co_block_status
- Code cleanups and build fixes

# gpg: Signature made Tue 02 Nov 2021 12:04:02 PM EDT
# gpg: using RSA key DC3DEB159A9AF95D3D7456FE7F09B272C88F2FD6
# gpg: issuer "kwolf@redhat.com"
# gpg: Good signature from "Kevin Wolf <kwolf@redhat.com>" [full]

* remotes/kwolf/tags/for-upstream:
  block/nvme: Extract nvme_free_queue() from nvme_free_queue_pair()
  block/nvme: Display CQ/SQ pointer in nvme_free_queue_pair()
  block/nvme: Automatically free qemu_memalign() with QEMU_AUTO_VFREE
  block-backend: Silence clang -m32 compiler warning
  linux-aio: add `dev_max_batch` parameter to laio_io_unplug()
  linux-aio: add `dev_max_batch` parameter to laio_co_submit()
  file-posix: add `aio-max-batch` option
  block/export/fuse.c: fix musl build
  ide: Cap LBA28 capacity announcement to 2^28-1
  block/rbd: implement bdrv_co_block_status
  block: Fail gracefully when blockdev-snapshot creates loops
  block/file-posix: Fix return value translation for AIO discards

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Diffstat (limited to 'block')
-rw-r--r-- block/block-backend.c 2
-rw-r--r-- block/export/fuse.c 4
-rw-r--r-- block/file-posix.c 18
-rw-r--r-- block/linux-aio.c 38
-rw-r--r-- block/nvme.c 22
-rw-r--r-- block/rbd.c 112
-rw-r--r-- block/trace-events 2
7 files changed, 171 insertions, 27 deletions
diff --git a/block/block-backend.c b/block/block-backend.c
index 39cd99df2b..12ef80ea17 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1540,7 +1540,7 @@ BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t offset,
QEMUIOVector *qiov, BdrvRequestFlags flags,
BlockCompletionFunc *cb, void *opaque)
{
- assert(qiov->size <= INT64_MAX);
+ assert((uint64_t)qiov->size <= INT64_MAX);
return blk_aio_prwv(blk, offset, qiov->size, qiov,
blk_aio_write_entry, flags, cb, opaque);
}
diff --git a/block/export/fuse.c b/block/export/fuse.c
index 2e3bf8270b..823c126d23 100644
--- a/block/export/fuse.c
+++ b/block/export/fuse.c
@@ -31,6 +31,10 @@
#include <fuse.h>
#include <fuse_lowlevel.h>
+#if defined(CONFIG_FALLOCATE_ZERO_RANGE)
+#include <linux/falloc.h>
+#endif
+
#ifdef __linux__
#include <linux/fs.h>
#endif
diff --git a/block/file-posix.c b/block/file-posix.c
index 53be0bdc1b..7a27c83060 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -150,6 +150,8 @@ typedef struct BDRVRawState {
uint64_t locked_perm;
uint64_t locked_shared_perm;
+ uint64_t aio_max_batch;
+
int perm_change_fd;
int perm_change_flags;
BDRVReopenState *reopen_state;
@@ -531,6 +533,11 @@ static QemuOptsList raw_runtime_opts = {
.help = "host AIO implementation (threads, native, io_uring)",
},
{
+ .name = "aio-max-batch",
+ .type = QEMU_OPT_NUMBER,
+ .help = "AIO max batch size (0 = auto handled by AIO backend, default: 0)",
+ },
+ {
.name = "locking",
.type = QEMU_OPT_STRING,
.help = "file locking mode (on/off/auto, default: auto)",
@@ -609,6 +616,8 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
s->use_linux_io_uring = (aio == BLOCKDEV_AIO_OPTIONS_IO_URING);
#endif
+ s->aio_max_batch = qemu_opt_get_number(opts, "aio-max-batch", 0);
+
locking = qapi_enum_parse(&OnOffAuto_lookup,
qemu_opt_get(opts, "locking"),
ON_OFF_AUTO_AUTO, &local_err);
@@ -1807,7 +1816,7 @@ static int handle_aiocb_copy_range(void *opaque)
static int handle_aiocb_discard(void *opaque)
{
RawPosixAIOData *aiocb = opaque;
- int ret = -EOPNOTSUPP;
+ int ret = -ENOTSUP;
BDRVRawState *s = aiocb->bs->opaque;
if (!s->has_discard) {
@@ -1829,7 +1838,7 @@ static int handle_aiocb_discard(void *opaque)
#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
aiocb->aio_offset, aiocb->aio_nbytes);
- ret = translate_err(-errno);
+ ret = translate_err(ret);
#elif defined(__APPLE__) && (__MACH__)
fpunchhole_t fpunchhole;
fpunchhole.fp_flags = 0;
@@ -2057,7 +2066,8 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
} else if (s->use_linux_aio) {
LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
assert(qiov->size == bytes);
- return laio_co_submit(bs, aio, s->fd, offset, qiov, type);
+ return laio_co_submit(bs, aio, s->fd, offset, qiov, type,
+ s->aio_max_batch);
#endif
}
@@ -2115,7 +2125,7 @@ static void raw_aio_unplug(BlockDriverState *bs)
#ifdef CONFIG_LINUX_AIO
if (s->use_linux_aio) {
LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
- laio_io_unplug(bs, aio);
+ laio_io_unplug(bs, aio, s->aio_max_batch);
}
#endif
#ifdef CONFIG_LINUX_IO_URING
diff --git a/block/linux-aio.c b/block/linux-aio.c
index 0dab507b71..f53ae72e21 100644
--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@@ -334,30 +334,45 @@ static void ioq_submit(LinuxAioState *s)
}
}
+static uint64_t laio_max_batch(LinuxAioState *s, uint64_t dev_max_batch)
+{
+ uint64_t max_batch = s->aio_context->aio_max_batch ?: DEFAULT_MAX_BATCH;
+
+ /*
+ * AIO context can be shared between multiple block devices, so
+ * `dev_max_batch` allows reducing the batch size for latency-sensitive
+ * devices.
+ */
+ max_batch = MIN_NON_ZERO(dev_max_batch, max_batch);
+
+ /* limit the batch with the number of available events */
+ max_batch = MIN_NON_ZERO(MAX_EVENTS - s->io_q.in_flight, max_batch);
+
+ return max_batch;
+}
+
void laio_io_plug(BlockDriverState *bs, LinuxAioState *s)
{
s->io_q.plugged++;
}
-void laio_io_unplug(BlockDriverState *bs, LinuxAioState *s)
+void laio_io_unplug(BlockDriverState *bs, LinuxAioState *s,
+ uint64_t dev_max_batch)
{
assert(s->io_q.plugged);
- if (--s->io_q.plugged == 0 &&
- !s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
+ if (s->io_q.in_queue >= laio_max_batch(s, dev_max_batch) ||
+ (--s->io_q.plugged == 0 &&
+ !s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending))) {
ioq_submit(s);
}
}
static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
- int type)
+ int type, uint64_t dev_max_batch)
{
LinuxAioState *s = laiocb->ctx;
struct iocb *iocbs = &laiocb->iocb;
QEMUIOVector *qiov = laiocb->qiov;
- int64_t max_batch = s->aio_context->aio_max_batch ?: DEFAULT_MAX_BATCH;
-
- /* limit the batch with the number of available events */
- max_batch = MIN_NON_ZERO(MAX_EVENTS - s->io_q.in_flight, max_batch);
switch (type) {
case QEMU_AIO_WRITE:
@@ -378,7 +393,7 @@ static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
s->io_q.in_queue++;
if (!s->io_q.blocked &&
(!s->io_q.plugged ||
- s->io_q.in_queue >= max_batch)) {
+ s->io_q.in_queue >= laio_max_batch(s, dev_max_batch))) {
ioq_submit(s);
}
@@ -386,7 +401,8 @@ static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
}
int coroutine_fn laio_co_submit(BlockDriverState *bs, LinuxAioState *s, int fd,
- uint64_t offset, QEMUIOVector *qiov, int type)
+ uint64_t offset, QEMUIOVector *qiov, int type,
+ uint64_t dev_max_batch)
{
int ret;
struct qemu_laiocb laiocb = {
@@ -398,7 +414,7 @@ int coroutine_fn laio_co_submit(BlockDriverState *bs, LinuxAioState *s, int fd,
.qiov = qiov,
};
- ret = laio_do_submit(fd, &laiocb, offset, type);
+ ret = laio_do_submit(fd, &laiocb, offset, type, dev_max_batch);
if (ret < 0) {
return ret;
}
diff --git a/block/nvme.c b/block/nvme.c
index 1cc7b62bb4..e4f336d79c 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -183,15 +183,20 @@ static bool nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q,
return r == 0;
}
+static void nvme_free_queue(NVMeQueue *q)
+{
+ qemu_vfree(q->queue);
+}
+
static void nvme_free_queue_pair(NVMeQueuePair *q)
{
- trace_nvme_free_queue_pair(q->index, q);
+ trace_nvme_free_queue_pair(q->index, q, &q->cq, &q->sq);
if (q->completion_bh) {
qemu_bh_delete(q->completion_bh);
}
+ nvme_free_queue(&q->sq);
+ nvme_free_queue(&q->cq);
qemu_vfree(q->prp_list_pages);
- qemu_vfree(q->sq.queue);
- qemu_vfree(q->cq.queue);
qemu_mutex_destroy(&q->lock);
g_free(q);
}
@@ -514,10 +519,10 @@ static bool nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
{
BDRVNVMeState *s = bs->opaque;
bool ret = false;
- union {
+ QEMU_AUTO_VFREE union {
NvmeIdCtrl ctrl;
NvmeIdNs ns;
- } *id;
+ } *id = NULL;
NvmeLBAF *lbaf;
uint16_t oncs;
int r;
@@ -595,7 +600,6 @@ static bool nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
s->blkshift = lbaf->ds;
out:
qemu_vfio_dma_unmap(s->vfio, id);
- qemu_vfree(id);
return ret;
}
@@ -1219,7 +1223,7 @@ static int nvme_co_prw(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
{
BDRVNVMeState *s = bs->opaque;
int r;
- uint8_t *buf = NULL;
+ QEMU_AUTO_VFREE uint8_t *buf = NULL;
QEMUIOVector local_qiov;
size_t len = QEMU_ALIGN_UP(bytes, qemu_real_host_page_size);
assert(QEMU_IS_ALIGNED(offset, s->page_size));
@@ -1246,7 +1250,6 @@ static int nvme_co_prw(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
if (!r && !is_write) {
qemu_iovec_from_buf(qiov, 0, buf, bytes);
}
- qemu_vfree(buf);
return r;
}
@@ -1365,7 +1368,7 @@ static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
BDRVNVMeState *s = bs->opaque;
NVMeQueuePair *ioq = s->queues[INDEX_IO(0)];
NVMeRequest *req;
- NvmeDsmRange *buf;
+ QEMU_AUTO_VFREE NvmeDsmRange *buf = NULL;
QEMUIOVector local_qiov;
int ret;
@@ -1440,7 +1443,6 @@ static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
trace_nvme_dsm_done(s, offset, bytes, ret);
out:
qemu_iovec_destroy(&local_qiov);
- qemu_vfree(buf);
return ret;
}
diff --git a/block/rbd.c b/block/rbd.c
index 701fbf2b0c..def96292e0 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -97,6 +97,12 @@ typedef struct RBDTask {
int64_t ret;
} RBDTask;
+typedef struct RBDDiffIterateReq {
+ uint64_t offs;
+ uint64_t bytes;
+ bool exists;
+} RBDDiffIterateReq;
+
static int qemu_rbd_connect(rados_t *cluster, rados_ioctx_t *io_ctx,
BlockdevOptionsRbd *opts, bool cache,
const char *keypairs, const char *secretid,
@@ -1259,6 +1265,111 @@ static ImageInfoSpecific *qemu_rbd_get_specific_info(BlockDriverState *bs,
return spec_info;
}
+/*
+ * rbd_diff_iterate2 allows interrupting the execution by returning a negative
+ * value in the callback routine. Choose a value that does not conflict with
+ * an existing exitcode and return it if we want to prematurely stop the
+ * execution because we detected a change in the allocation status.
+ */
+#define QEMU_RBD_EXIT_DIFF_ITERATE2 -9000
+
+static int qemu_rbd_diff_iterate_cb(uint64_t offs, size_t len,
+ int exists, void *opaque)
+{
+ RBDDiffIterateReq *req = opaque;
+
+ assert(req->offs + req->bytes <= offs);
+ /*
+ * we do not diff against a snapshot so we should never receive a callback
+ * for a hole.
+ */
+ assert(exists);
+
+ if (!req->exists && offs > req->offs) {
+ /*
+ * we started in an unallocated area and hit the first allocated
+ * block. req->bytes must be set to the length of the unallocated area
+ * before the allocated area. stop further processing.
+ */
+ req->bytes = offs - req->offs;
+ return QEMU_RBD_EXIT_DIFF_ITERATE2;
+ }
+
+ if (req->exists && offs > req->offs + req->bytes) {
+ /*
+ * we started in an allocated area and jumped over an unallocated area,
+ * req->bytes contains the length of the allocated area before the
+ * unallocated area. stop further processing.
+ */
+ return QEMU_RBD_EXIT_DIFF_ITERATE2;
+ }
+
+ req->bytes += len;
+ req->exists = true;
+
+ return 0;
+}
+
+static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs,
+ bool want_zero, int64_t offset,
+ int64_t bytes, int64_t *pnum,
+ int64_t *map,
+ BlockDriverState **file)
+{
+ BDRVRBDState *s = bs->opaque;
+ int status, r;
+ RBDDiffIterateReq req = { .offs = offset };
+ uint64_t features, flags;
+
+ assert(offset + bytes <= s->image_size);
+
+ /* default to all sectors allocated */
+ status = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
+ *map = offset;
+ *file = bs;
+ *pnum = bytes;
+
+ /* check if RBD image supports fast-diff */
+ r = rbd_get_features(s->image, &features);
+ if (r < 0) {
+ return status;
+ }
+ if (!(features & RBD_FEATURE_FAST_DIFF)) {
+ return status;
+ }
+
+ /* check if RBD fast-diff result is valid */
+ r = rbd_get_flags(s->image, &flags);
+ if (r < 0) {
+ return status;
+ }
+ if (flags & RBD_FLAG_FAST_DIFF_INVALID) {
+ return status;
+ }
+
+ r = rbd_diff_iterate2(s->image, NULL, offset, bytes, true, true,
+ qemu_rbd_diff_iterate_cb, &req);
+ if (r < 0 && r != QEMU_RBD_EXIT_DIFF_ITERATE2) {
+ return status;
+ }
+ assert(req.bytes <= bytes);
+ if (!req.exists) {
+ if (r == 0) {
+ /*
+ * rbd_diff_iterate2 does not invoke callbacks for unallocated
+ * areas. This here catches the case where no callback was
+ * invoked at all (req.bytes == 0).
+ */
+ assert(req.bytes == 0);
+ req.bytes = bytes;
+ }
+ status = BDRV_BLOCK_ZERO | BDRV_BLOCK_OFFSET_VALID;
+ }
+
+ *pnum = req.bytes;
+ return status;
+}
+
static int64_t qemu_rbd_getlength(BlockDriverState *bs)
{
BDRVRBDState *s = bs->opaque;
@@ -1494,6 +1605,7 @@ static BlockDriver bdrv_rbd = {
#ifdef LIBRBD_SUPPORTS_WRITE_ZEROES
.bdrv_co_pwrite_zeroes = qemu_rbd_co_pwrite_zeroes,
#endif
+ .bdrv_co_block_status = qemu_rbd_co_block_status,
.bdrv_snapshot_create = qemu_rbd_snap_create,
.bdrv_snapshot_delete = qemu_rbd_snap_remove,
diff --git a/block/trace-events b/block/trace-events
index ab56edacb4..549090d453 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -157,7 +157,7 @@ nvme_dsm_done(void *s, int64_t offset, int64_t bytes, int ret) "s %p offset 0x%"
nvme_dma_map_flush(void *s) "s %p"
nvme_free_req_queue_wait(void *s, unsigned q_index) "s %p q #%u"
nvme_create_queue_pair(unsigned q_index, void *q, size_t size, void *aio_context, int fd) "index %u q %p size %zu aioctx %p fd %d"
-nvme_free_queue_pair(unsigned q_index, void *q) "index %u q %p"
+nvme_free_queue_pair(unsigned q_index, void *q, void *cq, void *sq) "index %u q %p cq %p sq %p"
nvme_cmd_map_qiov(void *s, void *cmd, void *req, void *qiov, int entries) "s %p cmd %p req %p qiov %p entries %d"
nvme_cmd_map_qiov_pages(void *s, int i, uint64_t page) "s %p page[%d] 0x%"PRIx64
nvme_cmd_map_qiov_iov(void *s, int i, void *page, int pages) "s %p iov[%d] %p pages %d"