diff options
author | Peter Maydell <peter.maydell@linaro.org> | 2020-12-31 23:26:46 +0000 |
---|---|---|
committer | Peter Maydell <peter.maydell@linaro.org> | 2020-12-31 23:26:46 +0000 |
commit | 26f6b15e2636eb20cb6757093170341b22fe6fbc (patch) | |
tree | 147896371dd546414557fdbed8f35b622ee4aae2 /block | |
parent | c7e48f91653d6ace9dc42ec6b5b627b57d5d49e0 (diff) | |
parent | 0e72078128229bf9efb542e396ab44bf91b91340 (diff) |
Merge remote-tracking branch 'remotes/maxreitz/tags/pull-block-2020-12-18' into staging
Block patches:
- New block filter: preallocate (which, on writes beyond an image file's
end, allocates big chunks of data so that such post-EOF writes will
occur less frequently)
- write-zeroes and block-status support for Quorum
- Implementation of truncate for the nvme block driver similarly to the
existing implementations for host block devices and iscsi devices
- Block layer refactoring: Drop the tighten_restrictions concept in the
block permission functions
- iotest fixes
# gpg: Signature made Fri 18 Dec 2020 14:45:30 GMT
# gpg: using RSA key 91BEB60A30DB3E8857D11829F407DB0061D5CF40
# gpg: issuer "mreitz@redhat.com"
# gpg: Good signature from "Max Reitz <mreitz@redhat.com>" [full]
# Primary key fingerprint: 91BE B60A 30DB 3E88 57D1 1829 F407 DB00 61D5 CF40
* remotes/maxreitz/tags/pull-block-2020-12-18: (30 commits)
iotests: Fix _send_qemu_cmd with bash 5.1
iotests/102: Pass $QEMU_HANDLE to _send_qemu_cmd
block/nvme: Implement fake truncate() coroutine
quorum: Implement bdrv_co_pwrite_zeroes()
quorum: Implement bdrv_co_block_status()
scripts/simplebench: add bench_prealloc.py
simplebench/results_to_text: make executable
simplebench/results_to_text: add difference line to the table
simplebench/results_to_text: improve view of the table
simplebench: move results_to_text() into separate file
simplebench: rename ascii() to results_to_text()
scripts/simplebench: use standard deviation for +- error
scripts/simplebench: support iops
scripts/simplebench: fix grammar: s/successed/succeeded/
iotests: add 298 to test new preallocate filter driver
iotests.py: execute_setup_common(): add required_fmts argument
iotests: qemu_io_silent: support --image-opts
qemu-io: add preallocate mode parameter for truncate command
block: introduce preallocate filter
block: bdrv_check_perm(): process children anyway
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Diffstat (limited to 'block')
-rw-r--r-- | block/file-posix.c | 2 | ||||
-rw-r--r-- | block/io.c | 130 | ||||
-rw-r--r-- | block/meson.build | 1 | ||||
-rw-r--r-- | block/nvme.c | 24 | ||||
-rw-r--r-- | block/preallocate.c | 559 | ||||
-rw-r--r-- | block/quorum.c | 88 |
6 files changed, 747 insertions, 57 deletions
diff --git a/block/file-posix.c b/block/file-posix.c index 9804681d5c..00cdaaa2d4 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -2953,7 +2953,7 @@ raw_do_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int bytes, assert(bdrv_check_request(req->offset, req->bytes) == 0); - bdrv_mark_request_serialising(req, bs->bl.request_alignment); + bdrv_make_request_serialising(req, bs->bl.request_alignment); } #endif diff --git a/block/io.c b/block/io.c index 24205f5168..95b1c56c06 100644 --- a/block/io.c +++ b/block/io.c @@ -754,55 +754,65 @@ static bool tracked_request_overlaps(BdrvTrackedRequest *req, return true; } -static bool coroutine_fn -bdrv_wait_serialising_requests_locked(BlockDriverState *bs, - BdrvTrackedRequest *self) +/* Called with self->bs->reqs_lock held */ +static BdrvTrackedRequest * +bdrv_find_conflicting_request(BdrvTrackedRequest *self) { BdrvTrackedRequest *req; - bool retry; - bool waited = false; - do { - retry = false; - QLIST_FOREACH(req, &bs->tracked_requests, list) { - if (req == self || (!req->serialising && !self->serialising)) { - continue; - } - if (tracked_request_overlaps(req, self->overlap_offset, - self->overlap_bytes)) - { - /* Hitting this means there was a reentrant request, for - * example, a block driver issuing nested requests. This must - * never happen since it means deadlock. - */ - assert(qemu_coroutine_self() != req->co); - - /* If the request is already (indirectly) waiting for us, or - * will wait for us as soon as it wakes up, then just go on - * (instead of producing a deadlock in the former case). */ - if (!req->waiting_for) { - self->waiting_for = req; - qemu_co_queue_wait(&req->wait_queue, &bs->reqs_lock); - self->waiting_for = NULL; - retry = true; - waited = true; - break; - } + QLIST_FOREACH(req, &self->bs->tracked_requests, list) { + if (req == self || (!req->serialising && !self->serialising)) { + continue; + } + if (tracked_request_overlaps(req, self->overlap_offset, + self->overlap_bytes)) + { + /* + * Hitting this means there was a reentrant request, for + * example, a block driver issuing nested requests. This must + * never happen since it means deadlock. + */ + assert(qemu_coroutine_self() != req->co); + + /* + * If the request is already (indirectly) waiting for us, or + * will wait for us as soon as it wakes up, then just go on + * (instead of producing a deadlock in the former case). + */ + if (!req->waiting_for) { + return req; } } - } while (retry); + } + + return NULL; +} + +/* Called with self->bs->reqs_lock held */ +static bool coroutine_fn +bdrv_wait_serialising_requests_locked(BdrvTrackedRequest *self) +{ + BdrvTrackedRequest *req; + bool waited = false; + + while ((req = bdrv_find_conflicting_request(self))) { + self->waiting_for = req; + qemu_co_queue_wait(&req->wait_queue, &self->bs->reqs_lock); + self->waiting_for = NULL; + waited = true; + } + return waited; } -bool bdrv_mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) +/* Called with req->bs->reqs_lock held */ +static void tracked_request_set_serialising(BdrvTrackedRequest *req, + uint64_t align) { - BlockDriverState *bs = req->bs; int64_t overlap_offset = req->offset & ~(align - 1); uint64_t overlap_bytes = ROUND_UP(req->offset + req->bytes, align) - overlap_offset; - bool waited; - qemu_co_mutex_lock(&bs->reqs_lock); if (!req->serialising) { qatomic_inc(&req->bs->serialising_in_flight); req->serialising = true; @@ -810,9 +820,6 @@ bool bdrv_mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) req->overlap_offset = MIN(req->overlap_offset, overlap_offset); req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes); - waited = bdrv_wait_serialising_requests_locked(bs, req); - qemu_co_mutex_unlock(&bs->reqs_lock); - return waited; } /** @@ -892,12 +899,27 @@ static bool coroutine_fn bdrv_wait_serialising_requests(BdrvTrackedRequest *self } qemu_co_mutex_lock(&bs->reqs_lock); - waited = bdrv_wait_serialising_requests_locked(bs, self); + waited = bdrv_wait_serialising_requests_locked(self); qemu_co_mutex_unlock(&bs->reqs_lock); return waited; } +bool coroutine_fn bdrv_make_request_serialising(BdrvTrackedRequest *req, + uint64_t align) +{ + bool waited; + + qemu_co_mutex_lock(&req->bs->reqs_lock); + + tracked_request_set_serialising(req, align); + waited = bdrv_wait_serialising_requests_locked(req); + + qemu_co_mutex_unlock(&req->bs->reqs_lock); + + return waited; +} + int bdrv_check_request(int64_t offset, int64_t bytes) { if (offset < 0 || bytes < 0) { @@ -1423,7 +1445,7 @@ static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child, * with each other for the same cluster. For example, in copy-on-read * it ensures that the CoR read and write operations are atomic and * guest writes cannot interleave between them. */ - bdrv_mark_request_serialising(req, bdrv_get_cluster_size(bs)); + bdrv_make_request_serialising(req, bdrv_get_cluster_size(bs)); } else { bdrv_wait_serialising_requests(req); } @@ -1827,7 +1849,6 @@ bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, uint64_t bytes, BdrvTrackedRequest *req, int flags) { BlockDriverState *bs = child->bs; - bool waited; int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE); if (bs->read_only) { @@ -1837,17 +1858,18 @@ bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, uint64_t bytes, assert(!(bs->open_flags & BDRV_O_INACTIVE)); assert((bs->open_flags & BDRV_O_NO_IO) == 0); assert(!(flags & ~BDRV_REQ_MASK)); + assert(!((flags & BDRV_REQ_NO_WAIT) && !(flags & BDRV_REQ_SERIALISING))); if (flags & BDRV_REQ_SERIALISING) { - waited = bdrv_mark_request_serialising(req, bdrv_get_cluster_size(bs)); - /* - * For a misaligned request we should have already waited earlier, - * because we come after bdrv_padding_rmw_read which must be called - * with the request already marked as serialising. - */ - assert(!waited || - (req->offset == req->overlap_offset && - req->bytes == req->overlap_bytes)); + QEMU_LOCK_GUARD(&bs->reqs_lock); + + tracked_request_set_serialising(req, bdrv_get_cluster_size(bs)); + + if ((flags & BDRV_REQ_NO_WAIT) && bdrv_find_conflicting_request(req)) { + return -EBUSY; + } + + bdrv_wait_serialising_requests_locked(req); } else { bdrv_wait_serialising_requests(req); } @@ -2013,7 +2035,7 @@ static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child, padding = bdrv_init_padding(bs, offset, bytes, &pad); if (padding) { - bdrv_mark_request_serialising(req, align); + bdrv_make_request_serialising(req, align); bdrv_padding_rmw_read(child, req, &pad, true); @@ -2127,7 +2149,7 @@ int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child, } if (bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad)) { - bdrv_mark_request_serialising(&req, align); + bdrv_make_request_serialising(&req, align); bdrv_padding_rmw_read(child, &req, &pad, false); } @@ -3248,7 +3270,7 @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact, * new area, we need to make sure that no write requests are made to it * concurrently or they might be overwritten by preallocation. */ if (new_bytes) { - bdrv_mark_request_serialising(&req, 1); + bdrv_make_request_serialising(&req, 1); } if (bs->read_only) { error_setg(errp, "Image is read-only"); diff --git a/block/meson.build b/block/meson.build index 5dcc1e5cce..7595d86c41 100644 --- a/block/meson.build +++ b/block/meson.build @@ -12,6 +12,7 @@ block_ss.add(files( 'block-copy.c', 'commit.c', 'copy-on-read.c', + 'preallocate.c', 'create.c', 'crypto.c', 'dirty-bitmap.c', diff --git a/block/nvme.c b/block/nvme.c index a06a188d53..5a6fbacf4a 100644 --- a/block/nvme.c +++ b/block/nvme.c @@ -1389,6 +1389,29 @@ out: } +static int coroutine_fn nvme_co_truncate(BlockDriverState *bs, int64_t offset, + bool exact, PreallocMode prealloc, + BdrvRequestFlags flags, Error **errp) +{ + int64_t cur_length; + + if (prealloc != PREALLOC_MODE_OFF) { + error_setg(errp, "Unsupported preallocation mode '%s'", + PreallocMode_str(prealloc)); + return -ENOTSUP; + } + + cur_length = nvme_getlength(bs); + if (offset != cur_length && exact) { + error_setg(errp, "Cannot resize NVMe devices"); + return -ENOTSUP; + } else if (offset > cur_length) { + error_setg(errp, "Cannot grow NVMe devices"); + return -EINVAL; + } + + return 0; +} static int nvme_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue, Error **errp) @@ -1523,6 +1546,7 @@ static BlockDriver bdrv_nvme = { .bdrv_close = nvme_close, .bdrv_getlength = nvme_getlength, .bdrv_probe_blocksizes = nvme_probe_blocksizes, + .bdrv_co_truncate = nvme_co_truncate, .bdrv_co_preadv = nvme_co_preadv, .bdrv_co_pwritev = nvme_co_pwritev, diff --git a/block/preallocate.c b/block/preallocate.c new file mode 100644 index 0000000000..b619206304 --- /dev/null +++ b/block/preallocate.c @@ -0,0 +1,559 @@ +/* + * preallocate filter driver + * + * The driver performs preallocate operation: it is injected above + * some node, and before each write over EOF it does additional preallocating + * write-zeroes request. + * + * Copyright (c) 2020 Virtuozzo International GmbH. + * + * Author: + * Sementsov-Ogievskiy Vladimir <vsementsov@virtuozzo.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "qemu/osdep.h" + +#include "qapi/error.h" +#include "qemu/module.h" +#include "qemu/option.h" +#include "qemu/units.h" +#include "block/block_int.h" + + +typedef struct PreallocateOpts { + int64_t prealloc_size; + int64_t prealloc_align; +} PreallocateOpts; + +typedef struct BDRVPreallocateState { + PreallocateOpts opts; + + /* + * Track real data end, to crop preallocation on close. If < 0 the status is + * unknown. + * + * @data_end is a maximum of file size on open (or when we get write/resize + * permissions) and all write request ends after it. So it's safe to + * truncate to data_end if it is valid. + */ + int64_t data_end; + + /* + * Start of trailing preallocated area which reads as zero. May be smaller + * than data_end, if user does over-EOF write zero operation. If < 0 the + * status is unknown. + * + * If both @zero_start and @file_end are valid, the region + * [@zero_start, @file_end) is known to be preallocated zeroes. If @file_end + * is not valid, @zero_start doesn't make much sense. + */ + int64_t zero_start; + + /* + * Real end of file. Actually the cache for bdrv_getlength(bs->file->bs), + * to avoid extra lseek() calls on each write operation. If < 0 the status + * is unknown. + */ + int64_t file_end; + + /* + * All three states @data_end, @zero_start and @file_end are guaranteed to + * be invalid (< 0) when we don't have both exclusive BLK_PERM_RESIZE and + * BLK_PERM_WRITE permissions on file child. + */ +} BDRVPreallocateState; + +#define PREALLOCATE_OPT_PREALLOC_ALIGN "prealloc-align" +#define PREALLOCATE_OPT_PREALLOC_SIZE "prealloc-size" +static QemuOptsList runtime_opts = { + .name = "preallocate", + .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), + .desc = { + { + .name = PREALLOCATE_OPT_PREALLOC_ALIGN, + .type = QEMU_OPT_SIZE, + .help = "on preallocation, align file length to this number, " + "default 1M", + }, + { + .name = PREALLOCATE_OPT_PREALLOC_SIZE, + .type = QEMU_OPT_SIZE, + .help = "how much to preallocate, default 128M", + }, + { /* end of list */ } + }, +}; + +static bool preallocate_absorb_opts(PreallocateOpts *dest, QDict *options, + BlockDriverState *child_bs, Error **errp) +{ + QemuOpts *opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); + + if (!qemu_opts_absorb_qdict(opts, options, errp)) { + return false; + } + + dest->prealloc_align = + qemu_opt_get_size(opts, PREALLOCATE_OPT_PREALLOC_ALIGN, 1 * MiB); + dest->prealloc_size = + qemu_opt_get_size(opts, PREALLOCATE_OPT_PREALLOC_SIZE, 128 * MiB); + + qemu_opts_del(opts); + + if (!QEMU_IS_ALIGNED(dest->prealloc_align, BDRV_SECTOR_SIZE)) { + error_setg(errp, "prealloc-align parameter of preallocate filter " + "is not aligned to %llu", BDRV_SECTOR_SIZE); + return false; + } + + if (!QEMU_IS_ALIGNED(dest->prealloc_align, + child_bs->bl.request_alignment)) { + error_setg(errp, "prealloc-align parameter of preallocate filter " + "is not aligned to underlying node request alignment " + "(%" PRIi32 ")", child_bs->bl.request_alignment); + return false; + } + + return true; +} + +static int preallocate_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) +{ + BDRVPreallocateState *s = bs->opaque; + + /* + * s->data_end and friends should be initialized on permission update. + * For this to work, mark them invalid. + */ + s->file_end = s->zero_start = s->data_end = -EINVAL; + + bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds, + BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY, + false, errp); + if (!bs->file) { + return -EINVAL; + } + + if (!preallocate_absorb_opts(&s->opts, options, bs->file->bs, errp)) { + return -EINVAL; + } + + bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED | + (BDRV_REQ_FUA & bs->file->bs->supported_write_flags); + + bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED | + ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) & + bs->file->bs->supported_zero_flags); + + return 0; +} + +static void preallocate_close(BlockDriverState *bs) +{ + int ret; + BDRVPreallocateState *s = bs->opaque; + + if (s->data_end < 0) { + return; + } + + if (s->file_end < 0) { + s->file_end = bdrv_getlength(bs->file->bs); + if (s->file_end < 0) { + return; + } + } + + if (s->data_end < s->file_end) { + ret = bdrv_truncate(bs->file, s->data_end, true, PREALLOC_MODE_OFF, 0, + NULL); + s->file_end = ret < 0 ? ret : s->data_end; + } +} + + +/* + * Handle reopen. + * + * We must implement reopen handlers, otherwise reopen just don't work. Handle + * new options and don't care about preallocation state, as it is handled in + * set/check permission handlers. + */ + +static int preallocate_reopen_prepare(BDRVReopenState *reopen_state, + BlockReopenQueue *queue, Error **errp) +{ + PreallocateOpts *opts = g_new0(PreallocateOpts, 1); + + if (!preallocate_absorb_opts(opts, reopen_state->options, + reopen_state->bs->file->bs, errp)) { + g_free(opts); + return -EINVAL; + } + + reopen_state->opaque = opts; + + return 0; +} + +static void preallocate_reopen_commit(BDRVReopenState *state) +{ + BDRVPreallocateState *s = state->bs->opaque; + + s->opts = *(PreallocateOpts *)state->opaque; + + g_free(state->opaque); + state->opaque = NULL; +} + +static void preallocate_reopen_abort(BDRVReopenState *state) +{ + g_free(state->opaque); + state->opaque = NULL; +} + +static coroutine_fn int preallocate_co_preadv_part( + BlockDriverState *bs, uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, size_t qiov_offset, int flags) +{ + return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset, + flags); +} + +static int coroutine_fn preallocate_co_pdiscard(BlockDriverState *bs, + int64_t offset, int bytes) +{ + return bdrv_co_pdiscard(bs->file, offset, bytes); +} + +static bool can_write_resize(uint64_t perm) +{ + return (perm & BLK_PERM_WRITE) && (perm & BLK_PERM_RESIZE); +} + +static bool has_prealloc_perms(BlockDriverState *bs) +{ + BDRVPreallocateState *s = bs->opaque; + + if (can_write_resize(bs->file->perm)) { + assert(!(bs->file->shared_perm & BLK_PERM_WRITE)); + assert(!(bs->file->shared_perm & BLK_PERM_RESIZE)); + return true; + } + + assert(s->data_end < 0); + assert(s->zero_start < 0); + assert(s->file_end < 0); + return false; +} + +/* + * Call on each write. Returns true if @want_merge_zero is true and the region + * [offset, offset + bytes) is zeroed (as a result of this call or earlier + * preallocation). + * + * want_merge_zero is used to merge write-zero request with preallocation in + * one bdrv_co_pwrite_zeroes() call. + */ +static bool coroutine_fn handle_write(BlockDriverState *bs, int64_t offset, + int64_t bytes, bool want_merge_zero) +{ + BDRVPreallocateState *s = bs->opaque; + int64_t end = offset + bytes; + int64_t prealloc_start, prealloc_end; + int ret; + + if (!has_prealloc_perms(bs)) { + /* We don't have state neither should try to recover it */ + return false; + } + + if (s->data_end < 0) { + s->data_end = bdrv_getlength(bs->file->bs); + if (s->data_end < 0) { + return false; + } + + if (s->file_end < 0) { + s->file_end = s->data_end; + } + } + + if (end <= s->data_end) { + return false; + } + + /* We have valid s->data_end, and request writes beyond it. */ + + s->data_end = end; + if (s->zero_start < 0 || !want_merge_zero) { + s->zero_start = end; + } + + if (s->file_end < 0) { + s->file_end = bdrv_getlength(bs->file->bs); + if (s->file_end < 0) { + return false; + } + } + + /* Now s->data_end, s->zero_start and s->file_end are valid. */ + + if (end <= s->file_end) { + /* No preallocation needed. */ + return want_merge_zero && offset >= s->zero_start; + } + + /* Now we want new preallocation, as request writes beyond s->file_end. */ + + prealloc_start = want_merge_zero ? MIN(offset, s->file_end) : s->file_end; + prealloc_end = QEMU_ALIGN_UP(end + s->opts.prealloc_size, + s->opts.prealloc_align); + + ret = bdrv_co_pwrite_zeroes( + bs->file, prealloc_start, prealloc_end - prealloc_start, + BDRV_REQ_NO_FALLBACK | BDRV_REQ_SERIALISING | BDRV_REQ_NO_WAIT); + if (ret < 0) { + s->file_end = ret; + return false; + } + + s->file_end = prealloc_end; + return want_merge_zero; +} + +static int coroutine_fn preallocate_co_pwrite_zeroes(BlockDriverState *bs, + int64_t offset, int bytes, BdrvRequestFlags flags) +{ + bool want_merge_zero = + !(flags & ~(BDRV_REQ_ZERO_WRITE | BDRV_REQ_NO_FALLBACK)); + if (handle_write(bs, offset, bytes, want_merge_zero)) { + return 0; + } + + return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags); +} + +static coroutine_fn int preallocate_co_pwritev_part(BlockDriverState *bs, + uint64_t offset, + uint64_t bytes, + QEMUIOVector *qiov, + size_t qiov_offset, + int flags) +{ + handle_write(bs, offset, bytes, false); + + return bdrv_co_pwritev_part(bs->file, offset, bytes, qiov, qiov_offset, + flags); +} + +static int coroutine_fn +preallocate_co_truncate(BlockDriverState *bs, int64_t offset, + bool exact, PreallocMode prealloc, + BdrvRequestFlags flags, Error **errp) +{ + ERRP_GUARD(); + BDRVPreallocateState *s = bs->opaque; + int ret; + + if (s->data_end >= 0 && offset > s->data_end) { + if (s->file_end < 0) { + s->file_end = bdrv_getlength(bs->file->bs); + if (s->file_end < 0) { + error_setg(errp, "failed to get file length"); + return s->file_end; + } + } + + if (prealloc == PREALLOC_MODE_FALLOC) { + /* + * If offset <= s->file_end, the task is already done, just + * update s->data_end, to move part of "filter preallocation" + * to "preallocation requested by user". + * Otherwise just proceed to preallocate missing part. + */ + if (offset <= s->file_end) { + s->data_end = offset; + return 0; + } + } else { + /* + * We have to drop our preallocation, to + * - avoid "Cannot use preallocation for shrinking files" in + * case of offset < file_end + * - give PREALLOC_MODE_OFF a chance to keep small disk + * usage + * - give PREALLOC_MODE_FULL a chance to actually write the + * whole region as user expects + */ + if (s->file_end > s->data_end) { + ret = bdrv_co_truncate(bs->file, s->data_end, true, + PREALLOC_MODE_OFF, 0, errp); + if (ret < 0) { + s->file_end = ret; + error_prepend(errp, "preallocate-filter: failed to drop " + "write-zero preallocation: "); + return ret; + } + s->file_end = s->data_end; + } + } + + s->data_end = offset; + } + + ret = bdrv_co_truncate(bs->file, offset, exact, prealloc, flags, errp); + if (ret < 0) { + s->file_end = s->zero_start = s->data_end = ret; + return ret; + } + + if (has_prealloc_perms(bs)) { + s->file_end = s->zero_start = s->data_end = offset; + } + return 0; +} + +static int coroutine_fn preallocate_co_flush(BlockDriverState *bs) +{ + return bdrv_co_flush(bs->file->bs); +} + +static int64_t preallocate_getlength(BlockDriverState *bs) +{ + int64_t ret; + BDRVPreallocateState *s = bs->opaque; + + if (s->data_end >= 0) { + return s->data_end; + } + + ret = bdrv_getlength(bs->file->bs); + + if (has_prealloc_perms(bs)) { + s->file_end = s->zero_start = s->data_end = ret; + } + + return ret; +} + +static int preallocate_check_perm(BlockDriverState *bs, + uint64_t perm, uint64_t shared, Error **errp) +{ + BDRVPreallocateState *s = bs->opaque; + + if (s->data_end >= 0 && !can_write_resize(perm)) { + /* + * Lose permissions. + * We should truncate in check_perm, as in set_perm bs->file->perm will + * be already changed, and we should not violate it. + */ + if (s->file_end < 0) { + s->file_end = bdrv_getlength(bs->file->bs); + if (s->file_end < 0) { + error_setg(errp, "Failed to get file length"); + return s->file_end; + } + } + + if (s->data_end < s->file_end) { + int ret = bdrv_truncate(bs->file, s->data_end, true, + PREALLOC_MODE_OFF, 0, NULL); + if (ret < 0) { + error_setg(errp, "Failed to drop preallocation"); + s->file_end = ret; + return ret; + } + s->file_end = s->data_end; + } + } + + return 0; +} + +static void preallocate_set_perm(BlockDriverState *bs, + uint64_t perm, uint64_t shared) +{ + BDRVPreallocateState *s = bs->opaque; + + if (can_write_resize(perm)) { + if (s->data_end < 0) { + s->data_end = s->file_end = s->zero_start = + bdrv_getlength(bs->file->bs); + } + } else { + /* + * We drop our permissions, as well as allow shared + * permissions (see preallocate_child_perm), anyone will be able to + * change the child, so mark all states invalid. We'll regain control if + * get good permissions back. + */ + s->data_end = s->file_end = s->zero_start = -EINVAL; + } +} + +static void preallocate_child_perm(BlockDriverState *bs, BdrvChild *c, + BdrvChildRole role, BlockReopenQueue *reopen_queue, + uint64_t perm, uint64_t shared, uint64_t *nperm, uint64_t *nshared) +{ + bdrv_default_perms(bs, c, role, reopen_queue, perm, shared, nperm, nshared); + + if (can_write_resize(perm)) { + /* This should come by default, but let's enforce: */ + *nperm |= BLK_PERM_WRITE | BLK_PERM_RESIZE; + + /* + * Don't share, to keep our states s->file_end, s->data_end and + * s->zero_start valid. + */ + *nshared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE); + } +} + +BlockDriver bdrv_preallocate_filter = { + .format_name = "preallocate", + .instance_size = sizeof(BDRVPreallocateState), + + .bdrv_getlength = preallocate_getlength, + .bdrv_open = preallocate_open, + .bdrv_close = preallocate_close, + + .bdrv_reopen_prepare = preallocate_reopen_prepare, + .bdrv_reopen_commit = preallocate_reopen_commit, + .bdrv_reopen_abort = preallocate_reopen_abort, + + .bdrv_co_preadv_part = preallocate_co_preadv_part, + .bdrv_co_pwritev_part = preallocate_co_pwritev_part, + .bdrv_co_pwrite_zeroes = preallocate_co_pwrite_zeroes, + .bdrv_co_pdiscard = preallocate_co_pdiscard, + .bdrv_co_flush = preallocate_co_flush, + .bdrv_co_truncate = preallocate_co_truncate, + + .bdrv_check_perm = preallocate_check_perm, + .bdrv_set_perm = preallocate_set_perm, + .bdrv_child_perm = preallocate_child_perm, + + .has_variable_length = true, + .is_filter = true, +}; + +static void bdrv_preallocate_init(void) +{ + bdrv_register(&bdrv_preallocate_filter); +} + +block_init(bdrv_preallocate_init); diff --git a/block/quorum.c b/block/quorum.c index 4b08a199b7..0bd75450de 100644 --- a/block/quorum.c +++ b/block/quorum.c @@ -18,6 +18,7 @@ #include "qemu/module.h" #include "qemu/option.h" #include "block/block_int.h" +#include "block/coroutines.h" #include "block/qdict.h" #include "qapi/error.h" #include "qapi/qapi-events-block.h" @@ -691,8 +692,13 @@ static void write_quorum_entry(void *opaque) QuorumChildRequest *sacb = &acb->qcrs[i]; sacb->bs = s->children[i]->bs; - sacb->ret = bdrv_co_pwritev(s->children[i], acb->offset, acb->bytes, - acb->qiov, acb->flags); + if (acb->flags & BDRV_REQ_ZERO_WRITE) { + sacb->ret = bdrv_co_pwrite_zeroes(s->children[i], acb->offset, + acb->bytes, acb->flags); + } else { + sacb->ret = bdrv_co_pwritev(s->children[i], acb->offset, acb->bytes, + acb->qiov, acb->flags); + } if (sacb->ret == 0) { acb->success_count++; } else { @@ -738,6 +744,14 @@ static int quorum_co_pwritev(BlockDriverState *bs, uint64_t offset, return ret; } +static int quorum_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, + int bytes, BdrvRequestFlags flags) + +{ + return quorum_co_pwritev(bs, offset, bytes, NULL, + flags | BDRV_REQ_ZERO_WRITE); +} + static int64_t quorum_getlength(BlockDriverState *bs) { BDRVQuorumState *s = bs->opaque; @@ -896,6 +910,21 @@ static QemuOptsList quorum_runtime_opts = { }, }; +static void quorum_refresh_flags(BlockDriverState *bs) +{ + BDRVQuorumState *s = bs->opaque; + int i; + + bs->supported_zero_flags = + BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK; + + for (i = 0; i < s->num_children; i++) { + bs->supported_zero_flags &= s->children[i]->bs->supported_zero_flags; + } + + bs->supported_zero_flags |= BDRV_REQ_WRITE_UNCHANGED; +} + static int quorum_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { @@ -990,6 +1019,7 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags, s->next_child_index = s->num_children; bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED; + quorum_refresh_flags(bs); g_free(opened); goto exit; @@ -1061,6 +1091,7 @@ static void quorum_add_child(BlockDriverState *bs, BlockDriverState *child_bs, } s->children = g_renew(BdrvChild *, s->children, s->num_children + 1); s->children[s->num_children++] = child; + quorum_refresh_flags(bs); out: bdrv_drained_end(bs); @@ -1105,6 +1136,7 @@ static void quorum_del_child(BlockDriverState *bs, BdrvChild *child, s->children = g_renew(BdrvChild *, s->children, --s->num_children); bdrv_unref_child(bs, child); + quorum_refresh_flags(bs); bdrv_drained_end(bs); } @@ -1179,6 +1211,56 @@ static void quorum_child_perm(BlockDriverState *bs, BdrvChild *c, | DEFAULT_PERM_UNCHANGED; } +/* + * Each one of the children can report different status flags even + * when they contain the same data, so what this function does is + * return BDRV_BLOCK_ZERO if *all* children agree that a certain + * region contains zeroes, and BDRV_BLOCK_DATA otherwise. + */ +static int coroutine_fn quorum_co_block_status(BlockDriverState *bs, + bool want_zero, + int64_t offset, int64_t count, + int64_t *pnum, int64_t *map, + BlockDriverState **file) +{ + BDRVQuorumState *s = bs->opaque; + int i, ret; + int64_t pnum_zero = count; + int64_t pnum_data = 0; + + for (i = 0; i < s->num_children; i++) { + int64_t bytes; + ret = bdrv_co_common_block_status_above(s->children[i]->bs, NULL, false, + want_zero, offset, count, + &bytes, NULL, NULL, NULL); + if (ret < 0) { + quorum_report_bad(QUORUM_OP_TYPE_READ, offset, count, + s->children[i]->bs->node_name, ret); + pnum_data = count; + break; + } + /* + * Even if all children agree about whether there are zeroes + * or not at @offset they might disagree on the size, so use + * the smallest when reporting BDRV_BLOCK_ZERO and the largest + * when reporting BDRV_BLOCK_DATA. + */ + if (ret & BDRV_BLOCK_ZERO) { + pnum_zero = MIN(pnum_zero, bytes); + } else { + pnum_data = MAX(pnum_data, bytes); + } + } + + if (pnum_data) { + *pnum = pnum_data; + return BDRV_BLOCK_DATA; + } else { + *pnum = pnum_zero; + return BDRV_BLOCK_ZERO; + } +} + static const char *const quorum_strong_runtime_opts[] = { QUORUM_OPT_VOTE_THRESHOLD, QUORUM_OPT_BLKVERIFY, @@ -1197,6 +1279,7 @@ static BlockDriver bdrv_quorum = { .bdrv_close = quorum_close, .bdrv_gather_child_options = quorum_gather_child_options, .bdrv_dirname = quorum_dirname, + .bdrv_co_block_status = quorum_co_block_status, .bdrv_co_flush_to_disk = quorum_co_flush, @@ -1204,6 +1287,7 @@ static BlockDriver bdrv_quorum = { .bdrv_co_preadv = quorum_co_preadv, .bdrv_co_pwritev = quorum_co_pwritev, + .bdrv_co_pwrite_zeroes = quorum_co_pwrite_zeroes, .bdrv_add_child = quorum_add_child, .bdrv_del_child = quorum_del_child, |