aboutsummaryrefslogtreecommitdiff
path: root/block
diff options
context:
space:
mode:
authorPeter Maydell <peter.maydell@linaro.org>2020-12-31 23:26:46 +0000
committerPeter Maydell <peter.maydell@linaro.org>2020-12-31 23:26:46 +0000
commit26f6b15e2636eb20cb6757093170341b22fe6fbc (patch)
tree147896371dd546414557fdbed8f35b622ee4aae2 /block
parentc7e48f91653d6ace9dc42ec6b5b627b57d5d49e0 (diff)
parent0e72078128229bf9efb542e396ab44bf91b91340 (diff)
Merge remote-tracking branch 'remotes/maxreitz/tags/pull-block-2020-12-18' into staging
Block patches: - New block filter: preallocate (which, on writes beyond an image file's end, allocates big chunks of data so that such post-EOF writes will occur less frequently) - write-zeroes and block-status support for Quorum - Implementation of truncate for the nvme block driver similarly to the existing implementations for host block devices and iscsi devices - Block layer refactoring: Drop the tighten_restrictions concept in the block permission functions - iotest fixes # gpg: Signature made Fri 18 Dec 2020 14:45:30 GMT # gpg: using RSA key 91BEB60A30DB3E8857D11829F407DB0061D5CF40 # gpg: issuer "mreitz@redhat.com" # gpg: Good signature from "Max Reitz <mreitz@redhat.com>" [full] # Primary key fingerprint: 91BE B60A 30DB 3E88 57D1 1829 F407 DB00 61D5 CF40 * remotes/maxreitz/tags/pull-block-2020-12-18: (30 commits) iotests: Fix _send_qemu_cmd with bash 5.1 iotests/102: Pass $QEMU_HANDLE to _send_qemu_cmd block/nvme: Implement fake truncate() coroutine quorum: Implement bdrv_co_pwrite_zeroes() quorum: Implement bdrv_co_block_status() scripts/simplebench: add bench_prealloc.py simplebench/results_to_text: make executable simplebench/results_to_text: add difference line to the table simplebench/results_to_text: improve view of the table simplebench: move results_to_text() into separate file simplebench: rename ascii() to results_to_text() scripts/simplebench: use standard deviation for +- error scripts/simplebench: support iops scripts/simplebench: fix grammar: s/successed/succeeded/ iotests: add 298 to test new preallocate filter driver iotests.py: execute_setup_common(): add required_fmts argument iotests: qemu_io_silent: support --image-opts qemu-io: add preallocate mode parameter for truncate command block: introduce preallocate filter block: bdrv_check_perm(): process children anyway ... Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Diffstat (limited to 'block')
-rw-r--r--block/file-posix.c2
-rw-r--r--block/io.c130
-rw-r--r--block/meson.build1
-rw-r--r--block/nvme.c24
-rw-r--r--block/preallocate.c559
-rw-r--r--block/quorum.c88
6 files changed, 747 insertions, 57 deletions
diff --git a/block/file-posix.c b/block/file-posix.c
index 9804681d5c..00cdaaa2d4 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -2953,7 +2953,7 @@ raw_do_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int bytes,
assert(bdrv_check_request(req->offset, req->bytes) == 0);
- bdrv_mark_request_serialising(req, bs->bl.request_alignment);
+ bdrv_make_request_serialising(req, bs->bl.request_alignment);
}
#endif
diff --git a/block/io.c b/block/io.c
index 24205f5168..95b1c56c06 100644
--- a/block/io.c
+++ b/block/io.c
@@ -754,55 +754,65 @@ static bool tracked_request_overlaps(BdrvTrackedRequest *req,
return true;
}
-static bool coroutine_fn
-bdrv_wait_serialising_requests_locked(BlockDriverState *bs,
- BdrvTrackedRequest *self)
+/* Called with self->bs->reqs_lock held */
+static BdrvTrackedRequest *
+bdrv_find_conflicting_request(BdrvTrackedRequest *self)
{
BdrvTrackedRequest *req;
- bool retry;
- bool waited = false;
- do {
- retry = false;
- QLIST_FOREACH(req, &bs->tracked_requests, list) {
- if (req == self || (!req->serialising && !self->serialising)) {
- continue;
- }
- if (tracked_request_overlaps(req, self->overlap_offset,
- self->overlap_bytes))
- {
- /* Hitting this means there was a reentrant request, for
- * example, a block driver issuing nested requests. This must
- * never happen since it means deadlock.
- */
- assert(qemu_coroutine_self() != req->co);
-
- /* If the request is already (indirectly) waiting for us, or
- * will wait for us as soon as it wakes up, then just go on
- * (instead of producing a deadlock in the former case). */
- if (!req->waiting_for) {
- self->waiting_for = req;
- qemu_co_queue_wait(&req->wait_queue, &bs->reqs_lock);
- self->waiting_for = NULL;
- retry = true;
- waited = true;
- break;
- }
+ QLIST_FOREACH(req, &self->bs->tracked_requests, list) {
+ if (req == self || (!req->serialising && !self->serialising)) {
+ continue;
+ }
+ if (tracked_request_overlaps(req, self->overlap_offset,
+ self->overlap_bytes))
+ {
+ /*
+ * Hitting this means there was a reentrant request, for
+ * example, a block driver issuing nested requests. This must
+ * never happen since it means deadlock.
+ */
+ assert(qemu_coroutine_self() != req->co);
+
+ /*
+ * If the request is already (indirectly) waiting for us, or
+ * will wait for us as soon as it wakes up, then just go on
+ * (instead of producing a deadlock in the former case).
+ */
+ if (!req->waiting_for) {
+ return req;
}
}
- } while (retry);
+ }
+
+ return NULL;
+}
+
+/* Called with self->bs->reqs_lock held */
+static bool coroutine_fn
+bdrv_wait_serialising_requests_locked(BdrvTrackedRequest *self)
+{
+ BdrvTrackedRequest *req;
+ bool waited = false;
+
+ while ((req = bdrv_find_conflicting_request(self))) {
+ self->waiting_for = req;
+ qemu_co_queue_wait(&req->wait_queue, &self->bs->reqs_lock);
+ self->waiting_for = NULL;
+ waited = true;
+ }
+
return waited;
}
-bool bdrv_mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
+/* Called with req->bs->reqs_lock held */
+static void tracked_request_set_serialising(BdrvTrackedRequest *req,
+ uint64_t align)
{
- BlockDriverState *bs = req->bs;
int64_t overlap_offset = req->offset & ~(align - 1);
uint64_t overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
- overlap_offset;
- bool waited;
- qemu_co_mutex_lock(&bs->reqs_lock);
if (!req->serialising) {
qatomic_inc(&req->bs->serialising_in_flight);
req->serialising = true;
@@ -810,9 +820,6 @@ bool bdrv_mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
- waited = bdrv_wait_serialising_requests_locked(bs, req);
- qemu_co_mutex_unlock(&bs->reqs_lock);
- return waited;
}
/**
@@ -892,12 +899,27 @@ static bool coroutine_fn bdrv_wait_serialising_requests(BdrvTrackedRequest *self
}
qemu_co_mutex_lock(&bs->reqs_lock);
- waited = bdrv_wait_serialising_requests_locked(bs, self);
+ waited = bdrv_wait_serialising_requests_locked(self);
qemu_co_mutex_unlock(&bs->reqs_lock);
return waited;
}
+/*
+ * Mark @req as serialising with alignment @align and then wait for any
+ * conflicting tracked requests, all under req->bs->reqs_lock.
+ *
+ * Returns the result of bdrv_wait_serialising_requests_locked(), i.e. whether
+ * this call had to wait.
+ */
+bool coroutine_fn bdrv_make_request_serialising(BdrvTrackedRequest *req,
+ uint64_t align)
+{
+ bool waited;
+
+ qemu_co_mutex_lock(&req->bs->reqs_lock);
+
+ tracked_request_set_serialising(req, align);
+ waited = bdrv_wait_serialising_requests_locked(req);
+
+ qemu_co_mutex_unlock(&req->bs->reqs_lock);
+
+ return waited;
+}
+
int bdrv_check_request(int64_t offset, int64_t bytes)
{
if (offset < 0 || bytes < 0) {
@@ -1423,7 +1445,7 @@ static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child,
* with each other for the same cluster. For example, in copy-on-read
* it ensures that the CoR read and write operations are atomic and
* guest writes cannot interleave between them. */
- bdrv_mark_request_serialising(req, bdrv_get_cluster_size(bs));
+ bdrv_make_request_serialising(req, bdrv_get_cluster_size(bs));
} else {
bdrv_wait_serialising_requests(req);
}
@@ -1827,7 +1849,6 @@ bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, uint64_t bytes,
BdrvTrackedRequest *req, int flags)
{
BlockDriverState *bs = child->bs;
- bool waited;
int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
if (bs->read_only) {
@@ -1837,17 +1858,18 @@ bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, uint64_t bytes,
assert(!(bs->open_flags & BDRV_O_INACTIVE));
assert((bs->open_flags & BDRV_O_NO_IO) == 0);
assert(!(flags & ~BDRV_REQ_MASK));
+ assert(!((flags & BDRV_REQ_NO_WAIT) && !(flags & BDRV_REQ_SERIALISING)));
if (flags & BDRV_REQ_SERIALISING) {
- waited = bdrv_mark_request_serialising(req, bdrv_get_cluster_size(bs));
- /*
- * For a misaligned request we should have already waited earlier,
- * because we come after bdrv_padding_rmw_read which must be called
- * with the request already marked as serialising.
- */
- assert(!waited ||
- (req->offset == req->overlap_offset &&
- req->bytes == req->overlap_bytes));
+ QEMU_LOCK_GUARD(&bs->reqs_lock);
+
+ tracked_request_set_serialising(req, bdrv_get_cluster_size(bs));
+
+ if ((flags & BDRV_REQ_NO_WAIT) && bdrv_find_conflicting_request(req)) {
+ return -EBUSY;
+ }
+
+ bdrv_wait_serialising_requests_locked(req);
} else {
bdrv_wait_serialising_requests(req);
}
@@ -2013,7 +2035,7 @@ static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child,
padding = bdrv_init_padding(bs, offset, bytes, &pad);
if (padding) {
- bdrv_mark_request_serialising(req, align);
+ bdrv_make_request_serialising(req, align);
bdrv_padding_rmw_read(child, req, &pad, true);
@@ -2127,7 +2149,7 @@ int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child,
}
if (bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad)) {
- bdrv_mark_request_serialising(&req, align);
+ bdrv_make_request_serialising(&req, align);
bdrv_padding_rmw_read(child, &req, &pad, false);
}
@@ -3248,7 +3270,7 @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact,
* new area, we need to make sure that no write requests are made to it
* concurrently or they might be overwritten by preallocation. */
if (new_bytes) {
- bdrv_mark_request_serialising(&req, 1);
+ bdrv_make_request_serialising(&req, 1);
}
if (bs->read_only) {
error_setg(errp, "Image is read-only");
diff --git a/block/meson.build b/block/meson.build
index 5dcc1e5cce..7595d86c41 100644
--- a/block/meson.build
+++ b/block/meson.build
@@ -12,6 +12,7 @@ block_ss.add(files(
'block-copy.c',
'commit.c',
'copy-on-read.c',
+ 'preallocate.c',
'create.c',
'crypto.c',
'dirty-bitmap.c',
diff --git a/block/nvme.c b/block/nvme.c
index a06a188d53..5a6fbacf4a 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -1389,6 +1389,29 @@ out:
}
+/*
+ * Fake truncate for NVMe devices.
+ *
+ * Nothing is resized here: the call only decides whether @offset is
+ * acceptable given the length reported by nvme_getlength(). Any preallocation
+ * mode other than PREALLOC_MODE_OFF is rejected with -ENOTSUP. With @exact
+ * set, an @offset different from the current length fails with -ENOTSUP;
+ * otherwise only an @offset beyond the current length fails (-EINVAL).
+ * Returns 0 in all remaining cases.
+ */
+static int coroutine_fn nvme_co_truncate(BlockDriverState *bs, int64_t offset,
+                                         bool exact, PreallocMode prealloc,
+                                         BdrvRequestFlags flags, Error **errp)
+{
+    int64_t len;
+
+    if (prealloc != PREALLOC_MODE_OFF) {
+        error_setg(errp, "Unsupported preallocation mode '%s'",
+                   PreallocMode_str(prealloc));
+        return -ENOTSUP;
+    }
+
+    len = nvme_getlength(bs);
+
+    if (exact && offset != len) {
+        error_setg(errp, "Cannot resize NVMe devices");
+        return -ENOTSUP;
+    }
+
+    if (offset > len) {
+        error_setg(errp, "Cannot grow NVMe devices");
+        return -EINVAL;
+    }
+
+    return 0;
+}
static int nvme_reopen_prepare(BDRVReopenState *reopen_state,
BlockReopenQueue *queue, Error **errp)
@@ -1523,6 +1546,7 @@ static BlockDriver bdrv_nvme = {
.bdrv_close = nvme_close,
.bdrv_getlength = nvme_getlength,
.bdrv_probe_blocksizes = nvme_probe_blocksizes,
+ .bdrv_co_truncate = nvme_co_truncate,
.bdrv_co_preadv = nvme_co_preadv,
.bdrv_co_pwritev = nvme_co_pwritev,
diff --git a/block/preallocate.c b/block/preallocate.c
new file mode 100644
index 0000000000..b619206304
--- /dev/null
+++ b/block/preallocate.c
@@ -0,0 +1,559 @@
+/*
+ * preallocate filter driver
+ *
+ * The driver performs preallocate operation: it is injected above
+ * some node, and before each write over EOF it does additional preallocating
+ * write-zeroes request.
+ *
+ * Copyright (c) 2020 Virtuozzo International GmbH.
+ *
+ * Author:
+ * Sementsov-Ogievskiy Vladimir <vsementsov@virtuozzo.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+
+#include "qapi/error.h"
+#include "qemu/module.h"
+#include "qemu/option.h"
+#include "qemu/units.h"
+#include "block/block_int.h"
+
+
+/* Tunable parameters of the preallocate filter, parsed from runtime_opts. */
+typedef struct PreallocateOpts {
+ int64_t prealloc_size; /* "prealloc-size": how much to preallocate */
+ int64_t prealloc_align; /* "prealloc-align": file length alignment */
+} PreallocateOpts;
+
+/*
+ * Per-node state of the preallocate filter: the parsed options plus three
+ * cached offsets describing the file child. A negative value in any of the
+ * three offsets means that the value is unknown.
+ */
+typedef struct BDRVPreallocateState {
+ PreallocateOpts opts;
+
+ /*
+ * Track real data end, to crop preallocation on close. If < 0 the status is
+ * unknown.
+ *
+ * @data_end is a maximum of file size on open (or when we get write/resize
+ * permissions) and all write request ends after it. So it's safe to
+ * truncate to data_end if it is valid.
+ */
+ int64_t data_end;
+
+ /*
+ * Start of trailing preallocated area which reads as zero. May be smaller
+ * than data_end, if user does over-EOF write zero operation. If < 0 the
+ * status is unknown.
+ *
+ * If both @zero_start and @file_end are valid, the region
+ * [@zero_start, @file_end) is known to be preallocated zeroes. If @file_end
+ * is not valid, @zero_start doesn't make much sense.
+ */
+ int64_t zero_start;
+
+ /*
+ * Real end of file. Actually the cache for bdrv_getlength(bs->file->bs),
+ * to avoid extra lseek() calls on each write operation. If < 0 the status
+ * is unknown.
+ */
+ int64_t file_end;
+
+ /*
+ * All three states @data_end, @zero_start and @file_end are guaranteed to
+ * be invalid (< 0) when we don't have both exclusive BLK_PERM_RESIZE and
+ * BLK_PERM_WRITE permissions on file child.
+ */
+} BDRVPreallocateState;
+
+/* Names of the runtime options understood by the preallocate filter. */
+#define PREALLOCATE_OPT_PREALLOC_ALIGN "prealloc-align"
+#define PREALLOCATE_OPT_PREALLOC_SIZE "prealloc-size"
+/* Option list used by preallocate_absorb_opts() to parse the QDict options. */
+static QemuOptsList runtime_opts = {
+ .name = "preallocate",
+ .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
+ .desc = {
+ {
+ .name = PREALLOCATE_OPT_PREALLOC_ALIGN,
+ .type = QEMU_OPT_SIZE,
+ .help = "on preallocation, align file length to this number, "
+ "default 1M",
+ },
+ {
+ .name = PREALLOCATE_OPT_PREALLOC_SIZE,
+ .type = QEMU_OPT_SIZE,
+ .help = "how much to preallocate, default 128M",
+ },
+ { /* end of list */ }
+ },
+};
+
+/*
+ * Parse the preallocate filter options out of @options into @dest.
+ *
+ * Missing options take their defaults (prealloc-align: 1 MiB,
+ * prealloc-size: 128 MiB). prealloc-align must be a multiple of both
+ * BDRV_SECTOR_SIZE and the request alignment of @child_bs.
+ *
+ * Returns true on success. On failure returns false with @errp set; the
+ * temporary QemuOpts object is released on every path.
+ */
+static bool preallocate_absorb_opts(PreallocateOpts *dest, QDict *options,
+                                    BlockDriverState *child_bs, Error **errp)
+{
+    QemuOpts *opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
+
+    if (!qemu_opts_absorb_qdict(opts, options, errp)) {
+        /* Do not leak the option set when absorbing fails. */
+        qemu_opts_del(opts);
+        return false;
+    }
+
+    dest->prealloc_align =
+        qemu_opt_get_size(opts, PREALLOCATE_OPT_PREALLOC_ALIGN, 1 * MiB);
+    dest->prealloc_size =
+        qemu_opt_get_size(opts, PREALLOCATE_OPT_PREALLOC_SIZE, 128 * MiB);
+
+    /* Both values are copied out; the option set is no longer needed. */
+    qemu_opts_del(opts);
+
+    if (!QEMU_IS_ALIGNED(dest->prealloc_align, BDRV_SECTOR_SIZE)) {
+        error_setg(errp, "prealloc-align parameter of preallocate filter "
+                   "is not aligned to %llu", BDRV_SECTOR_SIZE);
+        return false;
+    }
+
+    if (!QEMU_IS_ALIGNED(dest->prealloc_align,
+                         child_bs->bl.request_alignment)) {
+        error_setg(errp, "prealloc-align parameter of preallocate filter "
+                   "is not aligned to underlying node request alignment "
+                   "(%" PRIi32 ")", child_bs->bl.request_alignment);
+        return false;
+    }
+
+    return true;
+}
+
+/*
+ * Open the filter: mark the cached offsets invalid, open the "file" child,
+ * parse the filter options and derive the supported write/zero flags from
+ * what the child supports.
+ *
+ * Returns 0 on success and -EINVAL if the child cannot be opened or the
+ * options are rejected.
+ */
+static int preallocate_open(BlockDriverState *bs, QDict *options, int flags,
+ Error **errp)
+{
+ BDRVPreallocateState *s = bs->opaque;
+
+ /*
+ * s->data_end and friends should be initialized on permission update.
+ * For this to work, mark them invalid.
+ */
+ s->file_end = s->zero_start = s->data_end = -EINVAL;
+
+ bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
+ BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY,
+ false, errp);
+ if (!bs->file) {
+ return -EINVAL;
+ }
+
+ if (!preallocate_absorb_opts(&s->opts, options, bs->file->bs, errp)) {
+ return -EINVAL;
+ }
+
+ /* FUA is passed through only if the child itself supports it. */
+ bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED |
+ (BDRV_REQ_FUA & bs->file->bs->supported_write_flags);
+
+ bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED |
+ ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) &
+ bs->file->bs->supported_zero_flags);
+
+ return 0;
+}
+
+/*
+ * On close, crop the file child back to s->data_end when the cached state is
+ * valid and the file is longer than that. Nothing is done if data_end is
+ * unknown or the file length cannot be determined.
+ */
+static void preallocate_close(BlockDriverState *bs)
+{
+ int ret;
+ BDRVPreallocateState *s = bs->opaque;
+
+ if (s->data_end < 0) {
+ return;
+ }
+
+ if (s->file_end < 0) {
+ s->file_end = bdrv_getlength(bs->file->bs);
+ if (s->file_end < 0) {
+ return;
+ }
+ }
+
+ if (s->data_end < s->file_end) {
+ /* Errors are not reported (NULL errp); file_end just becomes invalid. */
+ ret = bdrv_truncate(bs->file, s->data_end, true, PREALLOC_MODE_OFF, 0,
+ NULL);
+ s->file_end = ret < 0 ? ret : s->data_end;
+ }
+}
+
+
+/*
+ * Handle reopen.
+ *
+ * We must implement reopen handlers, otherwise reopen just doesn't work. Handle
+ * new options and don't care about preallocation state, as it is handled in
+ * set/check permission handlers.
+ */
+
+/*
+ * Reopen, prepare stage: parse the new options into a freshly allocated
+ * PreallocateOpts and park it in reopen_state->opaque until commit/abort.
+ * Returns 0 on success, -EINVAL (with @errp set) if the options are rejected.
+ */
+static int preallocate_reopen_prepare(BDRVReopenState *reopen_state,
+                                      BlockReopenQueue *queue, Error **errp)
+{
+    BlockDriverState *child_bs = reopen_state->bs->file->bs;
+    PreallocateOpts *new_opts = g_new0(PreallocateOpts, 1);
+
+    if (preallocate_absorb_opts(new_opts, reopen_state->options,
+                                child_bs, errp)) {
+        reopen_state->opaque = new_opts;
+        return 0;
+    }
+
+    g_free(new_opts);
+    return -EINVAL;
+}
+
+/*
+ * Reopen, commit stage: install the options stored in state->opaque as the
+ * filter's current options, then free the temporary copy.
+ */
+static void preallocate_reopen_commit(BDRVReopenState *state)
+{
+    BDRVPreallocateState *s = state->bs->opaque;
+    PreallocateOpts *new_opts = state->opaque;
+
+    s->opts = *new_opts;
+
+    g_free(new_opts);
+    state->opaque = NULL;
+}
+
+/* Reopen, abort stage: discard the options stored in state->opaque. */
+static void preallocate_reopen_abort(BDRVReopenState *state)
+{
+ g_free(state->opaque);
+ state->opaque = NULL;
+}
+
+/* Reads are forwarded to the file child unchanged. */
+static coroutine_fn int preallocate_co_preadv_part(
+ BlockDriverState *bs, uint64_t offset, uint64_t bytes,
+ QEMUIOVector *qiov, size_t qiov_offset, int flags)
+{
+ return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset,
+ flags);
+}
+
+/* Discards are forwarded to the file child unchanged. */
+static int coroutine_fn preallocate_co_pdiscard(BlockDriverState *bs,
+ int64_t offset, int bytes)
+{
+ return bdrv_co_pdiscard(bs->file, offset, bytes);
+}
+
+/* True iff @perm contains both BLK_PERM_WRITE and BLK_PERM_RESIZE. */
+static bool can_write_resize(uint64_t perm)
+{
+    const uint64_t needed = BLK_PERM_WRITE | BLK_PERM_RESIZE;
+
+    return (perm & needed) == needed;
+}
+
+/*
+ * Returns true if we currently hold both write and resize permissions on the
+ * file child; in that case neither of them may be shared (asserted).
+ * Otherwise asserts that all three cached offsets are invalid and returns
+ * false.
+ */
+static bool has_prealloc_perms(BlockDriverState *bs)
+{
+ BDRVPreallocateState *s = bs->opaque;
+
+ if (can_write_resize(bs->file->perm)) {
+ assert(!(bs->file->shared_perm & BLK_PERM_WRITE));
+ assert(!(bs->file->shared_perm & BLK_PERM_RESIZE));
+ return true;
+ }
+
+ assert(s->data_end < 0);
+ assert(s->zero_start < 0);
+ assert(s->file_end < 0);
+ return false;
+}
+
+/*
+ * Call on each write. Returns true if @want_merge_zero is true and the region
+ * [offset, offset + bytes) is zeroed (as a result of this call or earlier
+ * preallocation).
+ *
+ * want_merge_zero is used to merge write-zero request with preallocation in
+ * one bdrv_co_pwrite_zeroes() call.
+ *
+ * Returns false without preallocating when we lack write+resize permissions,
+ * when the file length cannot be determined, or when the request ends at or
+ * before s->data_end. Errors are never propagated; false is returned instead.
+ */
+static bool coroutine_fn handle_write(BlockDriverState *bs, int64_t offset,
+ int64_t bytes, bool want_merge_zero)
+{
+ BDRVPreallocateState *s = bs->opaque;
+ int64_t end = offset + bytes;
+ int64_t prealloc_start, prealloc_end;
+ int ret;
+
+ if (!has_prealloc_perms(bs)) {
+ /* We have no state and should not try to recover it */
+ return false;
+ }
+
+ if (s->data_end < 0) {
+ s->data_end = bdrv_getlength(bs->file->bs);
+ if (s->data_end < 0) {
+ return false;
+ }
+
+ if (s->file_end < 0) {
+ s->file_end = s->data_end;
+ }
+ }
+
+ if (end <= s->data_end) {
+ return false;
+ }
+
+ /* We have valid s->data_end, and request writes beyond it. */
+
+ s->data_end = end;
+ if (s->zero_start < 0 || !want_merge_zero) {
+ s->zero_start = end;
+ }
+
+ if (s->file_end < 0) {
+ s->file_end = bdrv_getlength(bs->file->bs);
+ if (s->file_end < 0) {
+ return false;
+ }
+ }
+
+ /* Now s->data_end, s->zero_start and s->file_end are valid. */
+
+ if (end <= s->file_end) {
+ /* No preallocation needed. */
+ return want_merge_zero && offset >= s->zero_start;
+ }
+
+ /* Now we want new preallocation, as request writes beyond s->file_end. */
+
+ /* When merging, the zeroing request also covers the caller's own region. */
+ prealloc_start = want_merge_zero ? MIN(offset, s->file_end) : s->file_end;
+ prealloc_end = QEMU_ALIGN_UP(end + s->opts.prealloc_size,
+ s->opts.prealloc_align);
+
+ ret = bdrv_co_pwrite_zeroes(
+ bs->file, prealloc_start, prealloc_end - prealloc_start,
+ BDRV_REQ_NO_FALLBACK | BDRV_REQ_SERIALISING | BDRV_REQ_NO_WAIT);
+ if (ret < 0) {
+ /* A negative file_end is invalid and gets re-queried on later use. */
+ s->file_end = ret;
+ return false;
+ }
+
+ s->file_end = prealloc_end;
+ return want_merge_zero;
+}
+
+/*
+ * Write zeroes. Merging with preallocation is requested only when @flags
+ * contains nothing beyond BDRV_REQ_ZERO_WRITE and BDRV_REQ_NO_FALLBACK. If
+ * handle_write() reports the region as zeroed, the request completes with 0
+ * without a separate request to the file child; otherwise it is forwarded.
+ */
+static int coroutine_fn preallocate_co_pwrite_zeroes(BlockDriverState *bs,
+ int64_t offset, int bytes, BdrvRequestFlags flags)
+{
+ bool want_merge_zero =
+ !(flags & ~(BDRV_REQ_ZERO_WRITE | BDRV_REQ_NO_FALLBACK));
+ if (handle_write(bs, offset, bytes, want_merge_zero)) {
+ return 0;
+ }
+
+ return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags);
+}
+
+/*
+ * Data write: let handle_write() update the state and preallocate if needed
+ * (its return value is deliberately ignored, as no zero-merge is requested),
+ * then forward the write to the file child.
+ */
+static coroutine_fn int preallocate_co_pwritev_part(BlockDriverState *bs,
+ uint64_t offset,
+ uint64_t bytes,
+ QEMUIOVector *qiov,
+ size_t qiov_offset,
+ int flags)
+{
+ handle_write(bs, offset, bytes, false);
+
+ return bdrv_co_pwritev_part(bs->file, offset, bytes, qiov, qiov_offset,
+ flags);
+}
+
+/*
+ * Truncate the node to @offset.
+ *
+ * When growing past a known s->data_end, the filter's own preallocation is
+ * either reused (PREALLOC_MODE_FALLOC with @offset within the already
+ * allocated file) or dropped first (all other modes) before the request is
+ * passed to the file child.
+ *
+ * Returns 0 on success or a negative value with @errp set. If the final
+ * truncate of the child fails, all three cached offsets become invalid.
+ */
+static int coroutine_fn
+preallocate_co_truncate(BlockDriverState *bs, int64_t offset,
+ bool exact, PreallocMode prealloc,
+ BdrvRequestFlags flags, Error **errp)
+{
+ ERRP_GUARD();
+ BDRVPreallocateState *s = bs->opaque;
+ int ret;
+
+ if (s->data_end >= 0 && offset > s->data_end) {
+ if (s->file_end < 0) {
+ s->file_end = bdrv_getlength(bs->file->bs);
+ if (s->file_end < 0) {
+ error_setg(errp, "failed to get file length");
+ return s->file_end;
+ }
+ }
+
+ if (prealloc == PREALLOC_MODE_FALLOC) {
+ /*
+ * If offset <= s->file_end, the task is already done, just
+ * update s->data_end, to move part of "filter preallocation"
+ * to "preallocation requested by user".
+ * Otherwise just proceed to preallocate missing part.
+ */
+ if (offset <= s->file_end) {
+ s->data_end = offset;
+ return 0;
+ }
+ } else {
+ /*
+ * We have to drop our preallocation, to
+ * - avoid "Cannot use preallocation for shrinking files" in
+ * case of offset < file_end
+ * - give PREALLOC_MODE_OFF a chance to keep small disk
+ * usage
+ * - give PREALLOC_MODE_FULL a chance to actually write the
+ * whole region as user expects
+ */
+ if (s->file_end > s->data_end) {
+ ret = bdrv_co_truncate(bs->file, s->data_end, true,
+ PREALLOC_MODE_OFF, 0, errp);
+ if (ret < 0) {
+ s->file_end = ret;
+ error_prepend(errp, "preallocate-filter: failed to drop "
+ "write-zero preallocation: ");
+ return ret;
+ }
+ s->file_end = s->data_end;
+ }
+ }
+
+ s->data_end = offset;
+ }
+
+ ret = bdrv_co_truncate(bs->file, offset, exact, prealloc, flags, errp);
+ if (ret < 0) {
+ s->file_end = s->zero_start = s->data_end = ret;
+ return ret;
+ }
+
+ /* Cache the new size only while the state is allowed to be valid. */
+ if (has_prealloc_perms(bs)) {
+ s->file_end = s->zero_start = s->data_end = offset;
+ }
+ return 0;
+}
+
+/* Flushes are forwarded to the node of the file child. */
+static int coroutine_fn preallocate_co_flush(BlockDriverState *bs)
+{
+ return bdrv_co_flush(bs->file->bs);
+}
+
+/*
+ * Report the node length. A valid s->data_end is returned directly, so the
+ * filter's own preallocation past it is not reported. Otherwise the child is
+ * queried, and the result is cached in all three offsets if we hold the
+ * preallocation permissions (a negative result leaves them invalid).
+ */
+static int64_t preallocate_getlength(BlockDriverState *bs)
+{
+ int64_t ret;
+ BDRVPreallocateState *s = bs->opaque;
+
+ if (s->data_end >= 0) {
+ return s->data_end;
+ }
+
+ ret = bdrv_getlength(bs->file->bs);
+
+ if (has_prealloc_perms(bs)) {
+ s->file_end = s->zero_start = s->data_end = ret;
+ }
+
+ return ret;
+}
+
+/*
+ * Permission check. If the state is valid and the new @perm no longer has
+ * both write and resize, the file child is cropped back to s->data_end here.
+ *
+ * Returns 0 on success, or a negative value with @errp set if the file
+ * length cannot be obtained or the truncation fails.
+ */
+static int preallocate_check_perm(BlockDriverState *bs,
+ uint64_t perm, uint64_t shared, Error **errp)
+{
+ BDRVPreallocateState *s = bs->opaque;
+
+ if (s->data_end >= 0 && !can_write_resize(perm)) {
+ /*
+ * Lose permissions.
+ * We should truncate in check_perm, as in set_perm bs->file->perm will
+ * be already changed, and we should not violate it.
+ */
+ if (s->file_end < 0) {
+ s->file_end = bdrv_getlength(bs->file->bs);
+ if (s->file_end < 0) {
+ error_setg(errp, "Failed to get file length");
+ return s->file_end;
+ }
+ }
+
+ if (s->data_end < s->file_end) {
+ /* The detailed truncate error is dropped (NULL errp). */
+ int ret = bdrv_truncate(bs->file, s->data_end, true,
+ PREALLOC_MODE_OFF, 0, NULL);
+ if (ret < 0) {
+ error_setg(errp, "Failed to drop preallocation");
+ s->file_end = ret;
+ return ret;
+ }
+ s->file_end = s->data_end;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Permission update. With write+resize in @perm, an invalid state is
+ * initialised from the child's current length (a negative result from
+ * bdrv_getlength() leaves it invalid). Without them, the state is
+ * invalidated.
+ */
+static void preallocate_set_perm(BlockDriverState *bs,
+ uint64_t perm, uint64_t shared)
+{
+ BDRVPreallocateState *s = bs->opaque;
+
+ if (can_write_resize(perm)) {
+ if (s->data_end < 0) {
+ s->data_end = s->file_end = s->zero_start =
+ bdrv_getlength(bs->file->bs);
+ }
+ } else {
+ /*
+ * We drop our permissions, as well as allow shared
+ * permissions (see preallocate_child_perm), anyone will be able to
+ * change the child, so mark all states invalid. We'll regain control if
+ * get good permissions back.
+ */
+ s->data_end = s->file_end = s->zero_start = -EINVAL;
+ }
+}
+
+/*
+ * Compute the permissions for the child: start from bdrv_default_perms()
+ * and, if the parent-side @perm has both write and resize, take both on the
+ * child and stop sharing them.
+ */
+static void preallocate_child_perm(BlockDriverState *bs, BdrvChild *c,
+ BdrvChildRole role, BlockReopenQueue *reopen_queue,
+ uint64_t perm, uint64_t shared, uint64_t *nperm, uint64_t *nshared)
+{
+ bdrv_default_perms(bs, c, role, reopen_queue, perm, shared, nperm, nshared);
+
+ if (can_write_resize(perm)) {
+ /* This should come by default, but let's enforce: */
+ *nperm |= BLK_PERM_WRITE | BLK_PERM_RESIZE;
+
+ /*
+ * Don't share, to keep our states s->file_end, s->data_end and
+ * s->zero_start valid.
+ */
+ *nshared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE);
+ }
+}
+
+/* Driver table of the "preallocate" filter; registered by bdrv_preallocate_init(). */
+BlockDriver bdrv_preallocate_filter = {
+ .format_name = "preallocate",
+ .instance_size = sizeof(BDRVPreallocateState),
+
+ .bdrv_getlength = preallocate_getlength,
+ .bdrv_open = preallocate_open,
+ .bdrv_close = preallocate_close,
+
+ .bdrv_reopen_prepare = preallocate_reopen_prepare,
+ .bdrv_reopen_commit = preallocate_reopen_commit,
+ .bdrv_reopen_abort = preallocate_reopen_abort,
+
+ .bdrv_co_preadv_part = preallocate_co_preadv_part,
+ .bdrv_co_pwritev_part = preallocate_co_pwritev_part,
+ .bdrv_co_pwrite_zeroes = preallocate_co_pwrite_zeroes,
+ .bdrv_co_pdiscard = preallocate_co_pdiscard,
+ .bdrv_co_flush = preallocate_co_flush,
+ .bdrv_co_truncate = preallocate_co_truncate,
+
+ .bdrv_check_perm = preallocate_check_perm,
+ .bdrv_set_perm = preallocate_set_perm,
+ .bdrv_child_perm = preallocate_child_perm,
+
+ .has_variable_length = true,
+ .is_filter = true,
+};
+
+/* Register the preallocate filter driver; hooked up through block_init(). */
+static void bdrv_preallocate_init(void)
+{
+ bdrv_register(&bdrv_preallocate_filter);
+}
+
+block_init(bdrv_preallocate_init);
diff --git a/block/quorum.c b/block/quorum.c
index 4b08a199b7..0bd75450de 100644
--- a/block/quorum.c
+++ b/block/quorum.c
@@ -18,6 +18,7 @@
#include "qemu/module.h"
#include "qemu/option.h"
#include "block/block_int.h"
+#include "block/coroutines.h"
#include "block/qdict.h"
#include "qapi/error.h"
#include "qapi/qapi-events-block.h"
@@ -691,8 +692,13 @@ static void write_quorum_entry(void *opaque)
QuorumChildRequest *sacb = &acb->qcrs[i];
sacb->bs = s->children[i]->bs;
- sacb->ret = bdrv_co_pwritev(s->children[i], acb->offset, acb->bytes,
- acb->qiov, acb->flags);
+ if (acb->flags & BDRV_REQ_ZERO_WRITE) {
+ sacb->ret = bdrv_co_pwrite_zeroes(s->children[i], acb->offset,
+ acb->bytes, acb->flags);
+ } else {
+ sacb->ret = bdrv_co_pwritev(s->children[i], acb->offset, acb->bytes,
+ acb->qiov, acb->flags);
+ }
if (sacb->ret == 0) {
acb->success_count++;
} else {
@@ -738,6 +744,14 @@ static int quorum_co_pwritev(BlockDriverState *bs, uint64_t offset,
return ret;
}
+/*
+ * Write zeroes by reusing quorum_co_pwritev() with a NULL qiov and
+ * BDRV_REQ_ZERO_WRITE added to @flags; write_quorum_entry() checks that flag
+ * to issue bdrv_co_pwrite_zeroes() on each child.
+ */
+static int quorum_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
+ int bytes, BdrvRequestFlags flags)
+
+{
+ return quorum_co_pwritev(bs, offset, bytes, NULL,
+ flags | BDRV_REQ_ZERO_WRITE);
+}
+
static int64_t quorum_getlength(BlockDriverState *bs)
{
BDRVQuorumState *s = bs->opaque;
@@ -896,6 +910,21 @@ static QemuOptsList quorum_runtime_opts = {
},
};
+/*
+ * Recompute bs->supported_zero_flags: FUA, MAY_UNMAP and NO_FALLBACK are
+ * kept only if every child supports them; WRITE_UNCHANGED is always set.
+ */
+static void quorum_refresh_flags(BlockDriverState *bs)
+{
+    BDRVQuorumState *s = bs->opaque;
+    unsigned int common =
+        BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK;
+    int i;
+
+    for (i = 0; i < s->num_children; i++) {
+        common &= s->children[i]->bs->supported_zero_flags;
+    }
+
+    bs->supported_zero_flags = common | BDRV_REQ_WRITE_UNCHANGED;
+}
+
static int quorum_open(BlockDriverState *bs, QDict *options, int flags,
Error **errp)
{
@@ -990,6 +1019,7 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags,
s->next_child_index = s->num_children;
bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED;
+ quorum_refresh_flags(bs);
g_free(opened);
goto exit;
@@ -1061,6 +1091,7 @@ static void quorum_add_child(BlockDriverState *bs, BlockDriverState *child_bs,
}
s->children = g_renew(BdrvChild *, s->children, s->num_children + 1);
s->children[s->num_children++] = child;
+ quorum_refresh_flags(bs);
out:
bdrv_drained_end(bs);
@@ -1105,6 +1136,7 @@ static void quorum_del_child(BlockDriverState *bs, BdrvChild *child,
s->children = g_renew(BdrvChild *, s->children, --s->num_children);
bdrv_unref_child(bs, child);
+ quorum_refresh_flags(bs);
bdrv_drained_end(bs);
}
@@ -1179,6 +1211,56 @@ static void quorum_child_perm(BlockDriverState *bs, BdrvChild *c,
| DEFAULT_PERM_UNCHANGED;
}
+/*
+ * Each one of the children can report different status flags even
+ * when they contain the same data, so what this function does is
+ * return BDRV_BLOCK_ZERO if *all* children agree that a certain
+ * region contains zeroes, and BDRV_BLOCK_DATA otherwise.
+ *
+ * A child whose status query fails is reported through quorum_report_bad()
+ * and makes the whole [@offset, @offset + @count) range count as data.
+ * Only *pnum is written; @map and @file are left untouched.
+ */
+static int coroutine_fn quorum_co_block_status(BlockDriverState *bs,
+ bool want_zero,
+ int64_t offset, int64_t count,
+ int64_t *pnum, int64_t *map,
+ BlockDriverState **file)
+{
+ BDRVQuorumState *s = bs->opaque;
+ int i, ret;
+ int64_t pnum_zero = count;
+ int64_t pnum_data = 0;
+
+ for (i = 0; i < s->num_children; i++) {
+ int64_t bytes;
+ ret = bdrv_co_common_block_status_above(s->children[i]->bs, NULL, false,
+ want_zero, offset, count,
+ &bytes, NULL, NULL, NULL);
+ if (ret < 0) {
+ quorum_report_bad(QUORUM_OP_TYPE_READ, offset, count,
+ s->children[i]->bs->node_name, ret);
+ pnum_data = count;
+ break;
+ }
+ /*
+ * Even if all children agree about whether there are zeroes
+ * or not at @offset they might disagree on the size, so use
+ * the smallest when reporting BDRV_BLOCK_ZERO and the largest
+ * when reporting BDRV_BLOCK_DATA.
+ */
+ if (ret & BDRV_BLOCK_ZERO) {
+ pnum_zero = MIN(pnum_zero, bytes);
+ } else {
+ pnum_data = MAX(pnum_data, bytes);
+ }
+ }
+
+ /* A non-zero pnum_data means at least one child did not report zeroes. */
+ if (pnum_data) {
+ *pnum = pnum_data;
+ return BDRV_BLOCK_DATA;
+ } else {
+ *pnum = pnum_zero;
+ return BDRV_BLOCK_ZERO;
+ }
+}
+
static const char *const quorum_strong_runtime_opts[] = {
QUORUM_OPT_VOTE_THRESHOLD,
QUORUM_OPT_BLKVERIFY,
@@ -1197,6 +1279,7 @@ static BlockDriver bdrv_quorum = {
.bdrv_close = quorum_close,
.bdrv_gather_child_options = quorum_gather_child_options,
.bdrv_dirname = quorum_dirname,
+ .bdrv_co_block_status = quorum_co_block_status,
.bdrv_co_flush_to_disk = quorum_co_flush,
@@ -1204,6 +1287,7 @@ static BlockDriver bdrv_quorum = {
.bdrv_co_preadv = quorum_co_preadv,
.bdrv_co_pwritev = quorum_co_pwritev,
+ .bdrv_co_pwrite_zeroes = quorum_co_pwrite_zeroes,
.bdrv_add_child = quorum_add_child,
.bdrv_del_child = quorum_del_child,