diff options
author | Peter Maydell <peter.maydell@linaro.org> | 2020-09-11 14:47:49 +0100 |
---|---|---|
committer | Peter Maydell <peter.maydell@linaro.org> | 2020-09-11 14:47:49 +0100 |
commit | 2499453eb1cbb68a45d7562a180afd7659007fd4 (patch) | |
tree | 71e256143ef313c34e6e00b7bddb1c8e4bb6e5d8 /block | |
parent | 922781b7b37de22a06269f25c9c1ae66293c5991 (diff) | |
parent | b9be6faed1a069526efdc80df7318b16afc6e116 (diff) |
Merge remote-tracking branch 'remotes/kevin/tags/for-upstream' into staging
Block layer patches:
- qemu-img create: Fail gracefully when backing file is an empty string
- Fixes related to filter block nodes ("Deal with filters" series)
- block/nvme: Various cleanups required to use multiple queues
- block/nvme: Use NvmeBar structure from "block/nvme.h"
- file-win32: Fix "locking" option
- iotests: Allow running from different directory
# gpg: Signature made Thu 10 Sep 2020 10:11:19 BST
# gpg: using RSA key DC3DEB159A9AF95D3D7456FE7F09B272C88F2FD6
# gpg: issuer "kwolf@redhat.com"
# gpg: Good signature from "Kevin Wolf <kwolf@redhat.com>" [full]
# Primary key fingerprint: DC3D EB15 9A9A F95D 3D74 56FE 7F09 B272 C88F 2FD6
* remotes/kevin/tags/for-upstream: (65 commits)
block/qcow2-cluster: Add missing "fallthrough" annotation
block/nvme: Pair doorbell registers
block/nvme: Use generic NvmeBar structure
block/nvme: Group controller registers in NVMeRegs structure
file-win32: Fix "locking" option
iotests: Allow running from different directory
iotests: Test committing to overridden backing
iotests: Add test for commit in sub directory
iotests: Add filter mirror test cases
iotests: Add filter commit test cases
iotests: Let complete_and_wait() work with commit
iotests: Test that qcow2's data-file is flushed
block: Leave BDS.backing_{file,format} constant
block: Inline bdrv_co_block_status_from_*()
blockdev: Fix active commit choice
block: Drop backing_bs()
qemu-img: Use child access functions
nbd: Use CAF when looking for dirty bitmap
commit: Deal with filters
backup: Deal with filters
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Diffstat (limited to 'block')
-rw-r--r-- | block/backup-top.c | 4 | ||||
-rw-r--r-- | block/backup.c | 9 | ||||
-rw-r--r-- | block/blkdebug.c | 7 | ||||
-rw-r--r-- | block/blklogwrites.c | 1 | ||||
-rw-r--r-- | block/block-backend.c | 7 | ||||
-rw-r--r-- | block/block-copy.c | 4 | ||||
-rw-r--r-- | block/commit.c | 95 | ||||
-rw-r--r-- | block/copy-on-read.c | 13 | ||||
-rw-r--r-- | block/file-win32.c | 22 | ||||
-rw-r--r-- | block/filter-compress.c | 2 | ||||
-rw-r--r-- | block/io.c | 142 | ||||
-rw-r--r-- | block/mirror.c | 119 | ||||
-rw-r--r-- | block/monitor/block-hmp-cmds.c | 2 | ||||
-rw-r--r-- | block/null.c | 7 | ||||
-rw-r--r-- | block/nvme.c | 248 | ||||
-rw-r--r-- | block/qapi.c | 74 | ||||
-rw-r--r-- | block/qcow2-cluster.c | 1 | ||||
-rw-r--r-- | block/snapshot.c | 104 | ||||
-rw-r--r-- | block/stream.c | 63 | ||||
-rw-r--r-- | block/throttle.c | 11 | ||||
-rw-r--r-- | block/vmdk.c | 16 |
21 files changed, 622 insertions, 329 deletions
diff --git a/block/backup-top.c b/block/backup-top.c index af2f20f346..fe6883cc97 100644 --- a/block/backup-top.c +++ b/block/backup-top.c @@ -175,8 +175,6 @@ BlockDriver bdrv_backup_top_filter = { .bdrv_co_pdiscard = backup_top_co_pdiscard, .bdrv_co_flush = backup_top_co_flush, - .bdrv_co_block_status = bdrv_co_block_status_from_backing, - .bdrv_refresh_filename = backup_top_refresh_filename, .bdrv_child_perm = backup_top_child_perm, @@ -281,7 +279,7 @@ void bdrv_backup_top_drop(BlockDriverState *bs) s->active = false; bdrv_child_refresh_perms(bs, bs->backing, &error_abort); - bdrv_replace_node(bs, backing_bs(bs), &error_abort); + bdrv_replace_node(bs, bs->backing->bs, &error_abort); bdrv_set_backing_hd(bs, NULL, &error_abort); bdrv_drained_end(bs); diff --git a/block/backup.c b/block/backup.c index 4f13bb20a5..9afa0bf3b4 100644 --- a/block/backup.c +++ b/block/backup.c @@ -297,6 +297,7 @@ static int64_t backup_calculate_cluster_size(BlockDriverState *target, { int ret; BlockDriverInfo bdi; + bool target_does_cow = bdrv_backing_chain_next(target); /* * If there is no backing file on the target, we cannot rely on COW if our @@ -304,7 +305,7 @@ static int64_t backup_calculate_cluster_size(BlockDriverState *target, * targets with a backing file, try to avoid COW if possible. */ ret = bdrv_get_info(target, &bdi); - if (ret == -ENOTSUP && !target->backing) { + if (ret == -ENOTSUP && !target_does_cow) { /* Cluster size is not defined */ warn_report("The target block device doesn't provide " "information about the block size and it doesn't have a " @@ -313,14 +314,14 @@ static int64_t backup_calculate_cluster_size(BlockDriverState *target, "this default, the backup may be unusable", BACKUP_CLUSTER_SIZE_DEFAULT); return BACKUP_CLUSTER_SIZE_DEFAULT; - } else if (ret < 0 && !target->backing) { + } else if (ret < 0 && !target_does_cow) { error_setg_errno(errp, -ret, "Couldn't determine the cluster size of the target image, " "which has no backing file"); error_append_hint(errp, "Aborting, since this may create an unusable destination image\n"); return ret; - } else if (ret < 0 && target->backing) { + } else if (ret < 0 && target_does_cow) { /* Not fatal; just trudge on ahead. */ return BACKUP_CLUSTER_SIZE_DEFAULT; } @@ -371,7 +372,7 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs, return NULL; } - if (compress && !block_driver_can_compress(target->drv)) { + if (compress && !bdrv_supports_compressed_writes(target)) { error_setg(errp, "Compression is not supported for this drive %s", bdrv_get_device_name(target)); return NULL; diff --git a/block/blkdebug.c b/block/blkdebug.c index 9c08d8a005..eecbf3e5c4 100644 --- a/block/blkdebug.c +++ b/block/blkdebug.c @@ -752,8 +752,11 @@ static int coroutine_fn blkdebug_co_block_status(BlockDriverState *bs, return err; } - return bdrv_co_block_status_from_file(bs, want_zero, offset, bytes, - pnum, map, file); + assert(bs->file && bs->file->bs); + *pnum = bytes; + *map = offset; + *file = bs->file->bs; + return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID; } static void blkdebug_close(BlockDriverState *bs) diff --git a/block/blklogwrites.c b/block/blklogwrites.c index 57315f56b4..13ae63983b 100644 --- a/block/blklogwrites.c +++ b/block/blklogwrites.c @@ -515,7 +515,6 @@ static BlockDriver bdrv_blk_log_writes = { .bdrv_co_pwrite_zeroes = blk_log_writes_co_pwrite_zeroes, .bdrv_co_flush_to_disk = blk_log_writes_co_flush_to_disk, .bdrv_co_pdiscard = blk_log_writes_co_pdiscard, - .bdrv_co_block_status = bdrv_co_block_status_from_file, .is_filter = true, .strong_runtime_opts = blk_log_writes_strong_runtime_opts, diff --git a/block/block-backend.c b/block/block-backend.c index 3a13cb5f0b..24dd0670d1 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -2279,10 +2279,13 @@ int blk_commit_all(void) while ((blk = blk_all_next(blk)) != NULL) { AioContext *aio_context = blk_get_aio_context(blk); + BlockDriverState *unfiltered_bs = bdrv_skip_filters(blk_bs(blk)); aio_context_acquire(aio_context); - if (blk_is_inserted(blk) && blk->root->bs->backing) { - int ret = bdrv_commit(blk->root->bs); + if (blk_is_inserted(blk) && bdrv_cow_child(unfiltered_bs)) { + int ret; + + ret = bdrv_commit(unfiltered_bs); if (ret < 0) { aio_context_release(aio_context); return ret; diff --git a/block/block-copy.c b/block/block-copy.c index a30b9097ef..cd9bc47c8f 100644 --- a/block/block-copy.c +++ b/block/block-copy.c @@ -440,8 +440,8 @@ static int block_copy_block_status(BlockCopyState *s, int64_t offset, BlockDriverState *base; int ret; - if (s->skip_unallocated && s->source->bs->backing) { - base = s->source->bs->backing->bs; + if (s->skip_unallocated) { + base = bdrv_backing_chain_next(s->source->bs); } else { base = NULL; } diff --git a/block/commit.c b/block/commit.c index 7732d02dfe..1e85c306cc 100644 --- a/block/commit.c +++ b/block/commit.c @@ -37,6 +37,7 @@ typedef struct CommitBlockJob { BlockBackend *top; BlockBackend *base; BlockDriverState *base_bs; + BlockDriverState *base_overlay; BlockdevOnError on_error; bool base_read_only; bool chain_frozen; @@ -89,7 +90,7 @@ static void commit_abort(Job *job) * XXX Can (or should) we somehow keep 'consistent read' blocked even * after the failed/cancelled commit job is gone? If we already wrote * something to base, the intermediate images aren't valid any more. */ - bdrv_replace_node(s->commit_top_bs, backing_bs(s->commit_top_bs), + bdrv_replace_node(s->commit_top_bs, s->commit_top_bs->backing->bs, &error_abort); bdrv_unref(s->commit_top_bs); @@ -153,7 +154,7 @@ static int coroutine_fn commit_run(Job *job, Error **errp) break; } /* Copy if allocated above the base */ - ret = bdrv_is_allocated_above(blk_bs(s->top), blk_bs(s->base), false, + ret = bdrv_is_allocated_above(blk_bs(s->top), s->base_overlay, true, offset, COMMIT_BUFFER_SIZE, &n); copy = (ret == 1); trace_commit_one_iteration(s, offset, n, ret); @@ -237,7 +238,6 @@ static void bdrv_commit_top_child_perm(BlockDriverState *bs, BdrvChild *c, static BlockDriver bdrv_commit_top = { .format_name = "commit_top", .bdrv_co_preadv = bdrv_commit_top_preadv, - .bdrv_co_block_status = bdrv_co_block_status_from_backing, .bdrv_refresh_filename = bdrv_commit_top_refresh_filename, .bdrv_child_perm = bdrv_commit_top_child_perm, @@ -253,15 +253,35 @@ void commit_start(const char *job_id, BlockDriverState *bs, CommitBlockJob *s; BlockDriverState *iter; BlockDriverState *commit_top_bs = NULL; + BlockDriverState *filtered_base; Error *local_err = NULL; + int64_t base_size, top_size; + uint64_t base_perms, iter_shared_perms; int ret; assert(top != bs); - if (top == base) { + if (bdrv_skip_filters(top) == bdrv_skip_filters(base)) { error_setg(errp, "Invalid files for merge: top and base are the same"); return; } + base_size = bdrv_getlength(base); + if (base_size < 0) { + error_setg_errno(errp, -base_size, "Could not inquire base image size"); + return; + } + + top_size = bdrv_getlength(top); + if (top_size < 0) { + error_setg_errno(errp, -top_size, "Could not inquire top image size"); + return; + } + + base_perms = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE; + if (base_size < top_size) { + base_perms |= BLK_PERM_RESIZE; + } + s = block_job_create(job_id, &commit_job_driver, NULL, bs, 0, BLK_PERM_ALL, speed, creation_flags, NULL, NULL, errp); if (!s) { @@ -301,17 +321,43 @@ void commit_start(const char *job_id, BlockDriverState *bs, s->commit_top_bs = commit_top_bs; - /* Block all nodes between top and base, because they will - * disappear from the chain after this operation. */ - assert(bdrv_chain_contains(top, base)); - for (iter = top; iter != base; iter = backing_bs(iter)) { - /* XXX BLK_PERM_WRITE needs to be allowed so we don't block ourselves - * at s->base (if writes are blocked for a node, they are also blocked - * for its backing file). The other options would be a second filter - * driver above s->base. */ + /* + * Block all nodes between top and base, because they will + * disappear from the chain after this operation. + * Note that this assumes that the user is fine with removing all + * nodes (including R/W filters) between top and base. Assuring + * this is the responsibility of the interface (i.e. whoever calls + * commit_start()). + */ + s->base_overlay = bdrv_find_overlay(top, base); + assert(s->base_overlay); + + /* + * The topmost node with + * bdrv_skip_filters(filtered_base) == bdrv_skip_filters(base) + */ + filtered_base = bdrv_cow_bs(s->base_overlay); + assert(bdrv_skip_filters(filtered_base) == bdrv_skip_filters(base)); + + /* + * XXX BLK_PERM_WRITE needs to be allowed so we don't block ourselves + * at s->base (if writes are blocked for a node, they are also blocked + * for its backing file). The other options would be a second filter + * driver above s->base. + */ + iter_shared_perms = BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE; + + for (iter = top; iter != base; iter = bdrv_filter_or_cow_bs(iter)) { + if (iter == filtered_base) { + /* + * From here on, all nodes are filters on the base. This + * allows us to share BLK_PERM_CONSISTENT_READ. + */ + iter_shared_perms |= BLK_PERM_CONSISTENT_READ; + } + ret = block_job_add_bdrv(&s->common, "intermediate node", iter, 0, - BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE, - errp); + iter_shared_perms, errp); if (ret < 0) { goto fail; } @@ -328,9 +374,7 @@ void commit_start(const char *job_id, BlockDriverState *bs, } s->base = blk_new(s->common.job.aio_context, - BLK_PERM_CONSISTENT_READ - | BLK_PERM_WRITE - | BLK_PERM_RESIZE, + base_perms, BLK_PERM_CONSISTENT_READ | BLK_PERM_GRAPH_MOD | BLK_PERM_WRITE_UNCHANGED); @@ -398,19 +442,22 @@ int bdrv_commit(BlockDriverState *bs) if (!drv) return -ENOMEDIUM; - if (!bs->backing) { + backing_file_bs = bdrv_cow_bs(bs); + + if (!backing_file_bs) { return -ENOTSUP; } if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT_SOURCE, NULL) || - bdrv_op_is_blocked(bs->backing->bs, BLOCK_OP_TYPE_COMMIT_TARGET, NULL)) { + bdrv_op_is_blocked(backing_file_bs, BLOCK_OP_TYPE_COMMIT_TARGET, NULL)) + { return -EBUSY; } - ro = bs->backing->bs->read_only; + ro = backing_file_bs->read_only; if (ro) { - if (bdrv_reopen_set_read_only(bs->backing->bs, false, NULL)) { + if (bdrv_reopen_set_read_only(backing_file_bs, false, NULL)) { return -EACCES; } } @@ -428,8 +475,6 @@ int bdrv_commit(BlockDriverState *bs) } /* Insert commit_top block node above backing, so we can write to it */ - backing_file_bs = backing_bs(bs); - commit_top_bs = bdrv_new_open_driver(&bdrv_commit_top, NULL, BDRV_O_RDWR, &local_err); if (commit_top_bs == NULL) { @@ -515,7 +560,7 @@ ro_cleanup: qemu_vfree(buf); blk_unref(backing); - if (backing_file_bs) { + if (bdrv_cow_bs(bs) != backing_file_bs) { bdrv_set_backing_hd(bs, backing_file_bs, &error_abort); } bdrv_unref(commit_top_bs); @@ -523,7 +568,7 @@ ro_cleanup: if (ro) { /* ignoring error return here */ - bdrv_reopen_set_read_only(bs->backing->bs, true, NULL); + bdrv_reopen_set_read_only(backing_file_bs, true, NULL); } return ret; diff --git a/block/copy-on-read.c b/block/copy-on-read.c index a6e3c74a68..2816e61afe 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -107,6 +107,16 @@ static int coroutine_fn cor_co_pdiscard(BlockDriverState *bs, } +static int coroutine_fn cor_co_pwritev_compressed(BlockDriverState *bs, + uint64_t offset, + uint64_t bytes, + QEMUIOVector *qiov) +{ + return bdrv_co_pwritev(bs->file, offset, bytes, qiov, + BDRV_REQ_WRITE_COMPRESSED); +} + + static void cor_eject(BlockDriverState *bs, bool eject_flag) { bdrv_eject(bs->file->bs, eject_flag); @@ -131,12 +141,11 @@ static BlockDriver bdrv_copy_on_read = { .bdrv_co_pwritev = cor_co_pwritev, .bdrv_co_pwrite_zeroes = cor_co_pwrite_zeroes, .bdrv_co_pdiscard = cor_co_pdiscard, + .bdrv_co_pwritev_compressed = cor_co_pwritev_compressed, .bdrv_eject = cor_eject, .bdrv_lock_medium = cor_lock_medium, - .bdrv_co_block_status = bdrv_co_block_status_from_file, - .has_variable_length = true, .is_filter = true, }; diff --git a/block/file-win32.c b/block/file-win32.c index ab69bd811a..e2900c3a51 100644 --- a/block/file-win32.c +++ b/block/file-win32.c @@ -299,6 +299,11 @@ static QemuOptsList raw_runtime_opts = { .type = QEMU_OPT_STRING, .help = "host AIO implementation (threads, native)", }, + { + .name = "locking", + .type = QEMU_OPT_STRING, + .help = "file locking mode (on/off/auto, default: auto)", + }, { /* end of list */ } }, }; @@ -333,6 +338,7 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags, Error *local_err = NULL; const char *filename; bool use_aio; + OnOffAuto locking; int ret; s->type = FTYPE_FILE; @@ -343,10 +349,24 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags, goto fail; } - if (qdict_get_try_bool(options, "locking", false)) { + locking = qapi_enum_parse(&OnOffAuto_lookup, + qemu_opt_get(opts, "locking"), + ON_OFF_AUTO_AUTO, &local_err); + if (local_err) { + error_propagate(errp, local_err); + ret = -EINVAL; + goto fail; + } + switch (locking) { + case ON_OFF_AUTO_ON: error_setg(errp, "locking=on is not supported on Windows"); ret = -EINVAL; goto fail; + case ON_OFF_AUTO_OFF: + case ON_OFF_AUTO_AUTO: + break; + default: + g_assert_not_reached(); } filename = qemu_opt_get(opts, "filename"); diff --git a/block/filter-compress.c b/block/filter-compress.c index 8ec1991c1f..5136371bf8 100644 --- a/block/filter-compress.c +++ b/block/filter-compress.c @@ -146,8 +146,6 @@ static BlockDriver bdrv_compress = { .bdrv_eject = compress_eject, .bdrv_lock_medium = compress_lock_medium, - .bdrv_co_block_status = bdrv_co_block_status_from_file, - .has_variable_length = true, .is_filter = true, }; diff --git a/block/io.c b/block/io.c index ad3a51ed53..a2389bb38c 100644 --- a/block/io.c +++ b/block/io.c @@ -135,6 +135,8 @@ static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src) void bdrv_refresh_limits(BlockDriverState *bs, Error **errp) { BlockDriver *drv = bs->drv; + BdrvChild *c; + bool have_limits; Error *local_err = NULL; memset(&bs->bl, 0, sizeof(bs->bl)); @@ -149,14 +151,21 @@ void bdrv_refresh_limits(BlockDriverState *bs, Error **errp) drv->bdrv_co_preadv_part) ? 1 : 512; /* Take some limits from the children as a default */ - if (bs->file) { - bdrv_refresh_limits(bs->file->bs, &local_err); - if (local_err) { - error_propagate(errp, local_err); - return; + have_limits = false; + QLIST_FOREACH(c, &bs->children, next) { + if (c->role & (BDRV_CHILD_DATA | BDRV_CHILD_FILTERED | BDRV_CHILD_COW)) + { + bdrv_refresh_limits(c->bs, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + bdrv_merge_limits(&bs->bl, &c->bs->bl); + have_limits = true; } - bdrv_merge_limits(&bs->bl, &bs->file->bs->bl); - } else { + } + + if (!have_limits) { bs->bl.min_mem_alignment = 512; bs->bl.opt_mem_alignment = qemu_real_host_page_size; @@ -164,15 +173,6 @@ void bdrv_refresh_limits(BlockDriverState *bs, Error **errp) bs->bl.max_iov = IOV_MAX; } - if (bs->backing) { - bdrv_refresh_limits(bs->backing->bs, &local_err); - if (local_err) { - error_propagate(errp, local_err); - return; - } - bdrv_merge_limits(&bs->bl, &bs->backing->bs->bl); - } - /* Then let the driver override it */ if (drv->bdrv_refresh_limits) { drv->bdrv_refresh_limits(bs, errp); @@ -2255,36 +2255,6 @@ typedef struct BdrvCoBlockStatusData { BlockDriverState **file; } BdrvCoBlockStatusData; -int coroutine_fn bdrv_co_block_status_from_file(BlockDriverState *bs, - bool want_zero, - int64_t offset, - int64_t bytes, - int64_t *pnum, - int64_t *map, - BlockDriverState **file) -{ - assert(bs->file && bs->file->bs); - *pnum = bytes; - *map = offset; - *file = bs->file->bs; - return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID; -} - -int coroutine_fn bdrv_co_block_status_from_backing(BlockDriverState *bs, - bool want_zero, - int64_t offset, - int64_t bytes, - int64_t *pnum, - int64_t *map, - BlockDriverState **file) -{ - assert(bs->backing && bs->backing->bs); - *pnum = bytes; - *map = offset; - *file = bs->backing->bs; - return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID; -} - /* * Returns the allocation status of the specified sectors. * Drivers not implementing the functionality are assumed to not support @@ -2325,6 +2295,7 @@ static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs, BlockDriverState *local_file = NULL; int64_t aligned_offset, aligned_bytes; uint32_t align; + bool has_filtered_child; assert(pnum); *pnum = 0; @@ -2350,7 +2321,8 @@ static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs, /* Must be non-NULL or bdrv_getlength() would have failed */ assert(bs->drv); - if (!bs->drv->bdrv_co_block_status) { + has_filtered_child = bdrv_filter_child(bs); + if (!bs->drv->bdrv_co_block_status && !has_filtered_child) { *pnum = bytes; ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED; if (offset + bytes == total_size) { @@ -2371,9 +2343,20 @@ static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs, aligned_offset = QEMU_ALIGN_DOWN(offset, align); aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset; - ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset, - aligned_bytes, pnum, &local_map, - &local_file); + if (bs->drv->bdrv_co_block_status) { + ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset, + aligned_bytes, pnum, &local_map, + &local_file); + } else { + /* Default code for filters */ + + local_file = bdrv_filter_bs(bs); + assert(local_file); + + *pnum = aligned_bytes; + local_map = aligned_offset; + ret = BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID; + } if (ret < 0) { *pnum = 0; goto out; @@ -2409,9 +2392,10 @@ static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs, if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { ret |= BDRV_BLOCK_ALLOCATED; } else if (want_zero && bs->drv->supports_backing) { - if (bs->backing) { - BlockDriverState *bs2 = bs->backing->bs; - int64_t size2 = bdrv_getlength(bs2); + BlockDriverState *cow_bs = bdrv_cow_bs(bs); + + if (cow_bs) { + int64_t size2 = bdrv_getlength(cow_bs); if (size2 >= 0 && offset >= size2) { ret |= BDRV_BLOCK_ZERO; @@ -2479,7 +2463,7 @@ static int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs, bool first = true; assert(bs != base); - for (p = bs; p != base; p = backing_bs(p)) { + for (p = bs; p != base; p = bdrv_filter_or_cow_bs(p)) { ret = bdrv_co_block_status(p, want_zero, offset, bytes, pnum, map, file); if (ret < 0) { @@ -2553,7 +2537,7 @@ int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base, int bdrv_block_status(BlockDriverState *bs, int64_t offset, int64_t bytes, int64_t *pnum, int64_t *map, BlockDriverState **file) { - return bdrv_block_status_above(bs, backing_bs(bs), + return bdrv_block_status_above(bs, bdrv_filter_or_cow_bs(bs), offset, bytes, pnum, map, file); } @@ -2563,9 +2547,9 @@ int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset, int ret; int64_t dummy; - ret = bdrv_common_block_status_above(bs, backing_bs(bs), false, offset, - bytes, pnum ? pnum : &dummy, NULL, - NULL); + ret = bdrv_common_block_status_above(bs, bdrv_filter_or_cow_bs(bs), false, + offset, bytes, pnum ? pnum : &dummy, + NULL, NULL); if (ret < 0) { return ret; } @@ -2628,7 +2612,7 @@ int bdrv_is_allocated_above(BlockDriverState *top, break; } - intermediate = backing_bs(intermediate); + intermediate = bdrv_filter_or_cow_bs(intermediate); } *pnum = n; @@ -2647,6 +2631,7 @@ bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos, bool is_read) { BlockDriver *drv = bs->drv; + BlockDriverState *child_bs = bdrv_primary_bs(bs); int ret = -ENOTSUP; bdrv_inc_in_flight(bs); @@ -2659,8 +2644,8 @@ bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos, } else { ret = drv->bdrv_save_vmstate(bs, qiov, pos); } - } else if (bs->file) { - ret = bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read); + } else if (child_bs) { + ret = bdrv_co_rw_vmstate(child_bs, qiov, pos, is_read); } bdrv_dec_in_flight(bs); @@ -2770,6 +2755,8 @@ static int coroutine_fn bdrv_flush_co_entry(void *opaque) int coroutine_fn bdrv_co_flush(BlockDriverState *bs) { + BdrvChild *primary_child = bdrv_primary_child(bs); + BdrvChild *child; int current_gen; int ret = 0; @@ -2799,7 +2786,7 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs) } /* Write back cached data to the OS even with cache=unsafe */ - BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS); + BLKDBG_EVENT(primary_child, BLKDBG_FLUSH_TO_OS); if (bs->drv->bdrv_co_flush_to_os) { ret = bs->drv->bdrv_co_flush_to_os(bs); if (ret < 0) { @@ -2809,15 +2796,15 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs) /* But don't actually force it to the disk with cache=unsafe */ if (bs->open_flags & BDRV_O_NO_FLUSH) { - goto flush_parent; + goto flush_children; } /* Check if we really need to flush anything */ if (bs->flushed_gen == current_gen) { - goto flush_parent; + goto flush_children; } - BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK); + BLKDBG_EVENT(primary_child, BLKDBG_FLUSH_TO_DISK); if (!bs->drv) { /* bs->drv->bdrv_co_flush() might have ejected the BDS * (even in case of apparent success) */ @@ -2861,8 +2848,17 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs) /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH * in the case of cache=unsafe, so there are no useless flushes. */ -flush_parent: - ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0; +flush_children: + ret = 0; + QLIST_FOREACH(child, &bs->children, next) { + if (child->perm & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED)) { + int this_child_ret = bdrv_co_flush(child->bs); + if (!ret) { + ret = this_child_ret; + } + } + } + out: /* Notify any pending flushes that we have completed */ if (ret == 0) { @@ -3309,6 +3305,7 @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact, Error **errp) { BlockDriverState *bs = child->bs; + BdrvChild *filtered, *backing; BlockDriver *drv = bs->drv; BdrvTrackedRequest req; int64_t old_size, new_bytes; @@ -3360,6 +3357,9 @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact, goto out; } + filtered = bdrv_filter_child(bs); + backing = bdrv_cow_child(bs); + /* * If the image has a backing file that is large enough that it would * provide data for the new area, we cannot leave it unallocated because @@ -3370,10 +3370,10 @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact, * backing file, taking care of keeping things consistent with that backing * file is the user's responsibility. */ - if (new_bytes && bs->backing) { + if (new_bytes && backing) { int64_t backing_len; - backing_len = bdrv_getlength(backing_bs(bs)); + backing_len = bdrv_getlength(backing->bs); if (backing_len < 0) { ret = backing_len; error_setg_errno(errp, -ret, "Could not get backing file size"); @@ -3392,8 +3392,8 @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact, goto out; } ret = drv->bdrv_co_truncate(bs, offset, exact, prealloc, flags, errp); - } else if (bs->file && drv->is_filter) { - ret = bdrv_co_truncate(bs->file, offset, exact, prealloc, flags, errp); + } else if (filtered) { + ret = bdrv_co_truncate(filtered, offset, exact, prealloc, flags, errp); } else { error_setg(errp, "Image format driver does not support resize"); ret = -ENOTSUP; diff --git a/block/mirror.c b/block/mirror.c index e8e8844afc..26acf4af6f 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -42,6 +42,7 @@ typedef struct MirrorBlockJob { BlockBackend *target; BlockDriverState *mirror_top_bs; BlockDriverState *base; + BlockDriverState *base_overlay; /* The name of the graph node to replace */ char *replaces; @@ -677,8 +678,10 @@ static int mirror_exit_common(Job *job) &error_abort); if (!abort && s->backing_mode == MIRROR_SOURCE_BACKING_CHAIN) { BlockDriverState *backing = s->is_none_mode ? src : s->base; - if (backing_bs(target_bs) != backing) { - bdrv_set_backing_hd(target_bs, backing, &local_err); + BlockDriverState *unfiltered_target = bdrv_skip_filters(target_bs); + + if (bdrv_cow_bs(unfiltered_target) != backing) { + bdrv_set_backing_hd(unfiltered_target, backing, &local_err); if (local_err) { error_report_err(local_err); local_err = NULL; @@ -740,7 +743,7 @@ static int mirror_exit_common(Job *job) * valid. */ block_job_remove_all_bdrv(bjob); - bdrv_replace_node(mirror_top_bs, backing_bs(mirror_top_bs), &error_abort); + bdrv_replace_node(mirror_top_bs, mirror_top_bs->backing->bs, &error_abort); /* We just changed the BDS the job BB refers to (with either or both of the * bdrv_replace_node() calls), so switch the BB back so the cleanup does @@ -786,7 +789,6 @@ static void coroutine_fn mirror_throttle(MirrorBlockJob *s) static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s) { int64_t offset; - BlockDriverState *base = s->base; BlockDriverState *bs = s->mirror_top_bs->backing->bs; BlockDriverState *target_bs = blk_bs(s->target); int ret; @@ -837,7 +839,8 @@ static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s) return 0; } - ret = bdrv_is_allocated_above(bs, base, false, offset, bytes, &count); + ret = bdrv_is_allocated_above(bs, s->base_overlay, true, offset, bytes, + &count); if (ret < 0) { return ret; } @@ -936,7 +939,7 @@ static int coroutine_fn mirror_run(Job *job, Error **errp) } else { s->target_cluster_size = BDRV_SECTOR_SIZE; } - if (backing_filename[0] && !target_bs->backing && + if (backing_filename[0] && !bdrv_backing_chain_next(target_bs) && s->granularity < s->target_cluster_size) { s->buf_size = MAX(s->buf_size, s->target_cluster_size); s->cow_bitmap = bitmap_new(length); @@ -1116,8 +1119,9 @@ static void mirror_complete(Job *job, Error **errp) if (s->backing_mode == MIRROR_OPEN_BACKING_CHAIN) { int ret; - assert(!target->backing); - ret = bdrv_open_backing_file(target, NULL, "backing", errp); + assert(!bdrv_backing_chain_next(target)); + ret = bdrv_open_backing_file(bdrv_skip_filters(target), NULL, + "backing", errp); if (ret < 0) { return; } @@ -1527,7 +1531,6 @@ static BlockDriver bdrv_mirror_top = { .bdrv_co_pwrite_zeroes = bdrv_mirror_top_pwrite_zeroes, .bdrv_co_pdiscard = bdrv_mirror_top_pdiscard, .bdrv_co_flush = bdrv_mirror_top_flush, - .bdrv_co_block_status = bdrv_co_block_status_from_backing, .bdrv_refresh_filename = bdrv_mirror_top_refresh_filename, .bdrv_child_perm = bdrv_mirror_top_child_perm, @@ -1555,8 +1558,8 @@ static BlockJob *mirror_start_job( MirrorBlockJob *s; MirrorBDSOpaque *bs_opaque; BlockDriverState *mirror_top_bs; - bool target_graph_mod; bool target_is_backing; + uint64_t target_perms, target_shared_perms; Error *local_err = NULL; int ret; @@ -1575,7 +1578,7 @@ static BlockJob *mirror_start_job( buf_size = DEFAULT_MIRROR_BUF_SIZE; } - if (bs == target) { + if (bdrv_skip_filters(bs) == bdrv_skip_filters(target)) { error_setg(errp, "Can't mirror node into itself"); return NULL; } @@ -1639,15 +1642,50 @@ static BlockJob *mirror_start_job( * In the case of active commit, things look a bit different, though, * because the target is an already populated backing file in active use. * We can allow anything except resize there.*/ + + target_perms = BLK_PERM_WRITE; + target_shared_perms = BLK_PERM_WRITE_UNCHANGED; + target_is_backing = bdrv_chain_contains(bs, target); - target_graph_mod = (backing_mode != MIRROR_LEAVE_BACKING_CHAIN); + if (target_is_backing) { + int64_t bs_size, target_size; + bs_size = bdrv_getlength(bs); + if (bs_size < 0) { + error_setg_errno(errp, -bs_size, + "Could not inquire top image size"); + goto fail; + } + + target_size = bdrv_getlength(target); + if (target_size < 0) { + error_setg_errno(errp, -target_size, + "Could not inquire base image size"); + goto fail; + } + + if (target_size < bs_size) { + target_perms |= BLK_PERM_RESIZE; + } + + target_shared_perms |= BLK_PERM_CONSISTENT_READ + | BLK_PERM_WRITE + | BLK_PERM_GRAPH_MOD; + } else if (bdrv_chain_contains(bs, bdrv_skip_filters(target))) { + /* + * We may want to allow this in the future, but it would + * require taking some extra care. + */ + error_setg(errp, "Cannot mirror to a filter on top of a node in the " + "source's backing chain"); + goto fail; + } + + if (backing_mode != MIRROR_LEAVE_BACKING_CHAIN) { + target_perms |= BLK_PERM_GRAPH_MOD; + } + s->target = blk_new(s->common.job.aio_context, - BLK_PERM_WRITE | BLK_PERM_RESIZE | - (target_graph_mod ? BLK_PERM_GRAPH_MOD : 0), - BLK_PERM_WRITE_UNCHANGED | - (target_is_backing ? BLK_PERM_CONSISTENT_READ | - BLK_PERM_WRITE | - BLK_PERM_GRAPH_MOD : 0)); + target_perms, target_shared_perms); ret = blk_insert_bs(s->target, target, errp); if (ret < 0) { goto fail; @@ -1672,6 +1710,7 @@ static BlockJob *mirror_start_job( s->zero_target = zero_target; s->copy_mode = copy_mode; s->base = base; + s->base_overlay = bdrv_find_overlay(bs, base); s->granularity = granularity; s->buf_size = ROUND_UP(buf_size, granularity); s->unmap = unmap; @@ -1702,15 +1741,39 @@ static BlockJob *mirror_start_job( /* In commit_active_start() all intermediate nodes disappear, so * any jobs in them must be blocked */ if (target_is_backing) { - BlockDriverState *iter; - for (iter = backing_bs(bs); iter != target; iter = backing_bs(iter)) { - /* XXX BLK_PERM_WRITE needs to be allowed so we don't block - * ourselves at s->base (if writes are blocked for a node, they are - * also blocked for its backing file). The other options would be a - * second filter driver above s->base (== target). */ + BlockDriverState *iter, *filtered_target; + uint64_t iter_shared_perms; + + /* + * The topmost node with + * bdrv_skip_filters(filtered_target) == bdrv_skip_filters(target) + */ + filtered_target = bdrv_cow_bs(bdrv_find_overlay(bs, target)); + + assert(bdrv_skip_filters(filtered_target) == + bdrv_skip_filters(target)); + + /* + * XXX BLK_PERM_WRITE needs to be allowed so we don't block + * ourselves at s->base (if writes are blocked for a node, they are + * also blocked for its backing file). The other options would be a + * second filter driver above s->base (== target). + */ + iter_shared_perms = BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE; + + for (iter = bdrv_filter_or_cow_bs(bs); iter != target; + iter = bdrv_filter_or_cow_bs(iter)) + { + if (iter == filtered_target) { + /* + * From here on, all nodes are filters on the base. + * This allows us to share BLK_PERM_CONSISTENT_READ. + */ + iter_shared_perms |= BLK_PERM_CONSISTENT_READ; + } + ret = block_job_add_bdrv(&s->common, "intermediate node", iter, 0, - BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE, - errp); + iter_shared_perms, errp); if (ret < 0) { goto fail; } @@ -1746,7 +1809,7 @@ fail: bs_opaque->stop = true; bdrv_child_refresh_perms(mirror_top_bs, mirror_top_bs->backing, &error_abort); - bdrv_replace_node(mirror_top_bs, backing_bs(mirror_top_bs), &error_abort); + bdrv_replace_node(mirror_top_bs, mirror_top_bs->backing->bs, &error_abort); bdrv_unref(mirror_top_bs); @@ -1774,7 +1837,7 @@ void mirror_start(const char *job_id, BlockDriverState *bs, return; } is_none_mode = mode == MIRROR_SYNC_MODE_NONE; - base = mode == MIRROR_SYNC_MODE_TOP ? backing_bs(bs) : NULL; + base = mode == MIRROR_SYNC_MODE_TOP ? bdrv_backing_chain_next(bs) : NULL; mirror_start_job(job_id, bs, creation_flags, target, replaces, speed, granularity, buf_size, backing_mode, zero_target, on_source_error, on_target_error, unmap, NULL, NULL, diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c index 4c8c375172..4d3db5ed3c 100644 --- a/block/monitor/block-hmp-cmds.c +++ b/block/monitor/block-hmp-cmds.c @@ -217,7 +217,7 @@ void hmp_commit(Monitor *mon, const QDict *qdict) return; } - bs = blk_bs(blk); + bs = bdrv_skip_implicit_filters(blk_bs(blk)); aio_context = bdrv_get_aio_context(bs); aio_context_acquire(aio_context); diff --git a/block/null.c b/block/null.c index 15e1d56746..cc9b1d4ea7 100644 --- a/block/null.c +++ b/block/null.c @@ -262,6 +262,11 @@ static void null_refresh_filename(BlockDriverState *bs) bs->drv->format_name); } +static int64_t null_allocated_file_size(BlockDriverState *bs) +{ + return 0; +} + static const char *const null_strong_runtime_opts[] = { BLOCK_OPT_SIZE, NULL_OPT_ZEROES, @@ -277,6 +282,7 @@ static BlockDriver bdrv_null_co = { .bdrv_file_open = null_file_open, .bdrv_parse_filename = null_co_parse_filename, .bdrv_getlength = null_getlength, + .bdrv_get_allocated_file_size = null_allocated_file_size, .bdrv_co_preadv = null_co_preadv, .bdrv_co_pwritev = null_co_pwritev, @@ -297,6 +303,7 @@ static BlockDriver bdrv_null_aio = { .bdrv_file_open = null_file_open, .bdrv_parse_filename = null_aio_parse_filename, .bdrv_getlength = null_getlength, + .bdrv_get_allocated_file_size = null_allocated_file_size, .bdrv_aio_preadv = null_aio_preadv, .bdrv_aio_pwritev = null_aio_pwritev, diff --git a/block/nvme.c b/block/nvme.c index 05485fdd11..f4f27b6da7 100644 --- a/block/nvme.c +++ b/block/nvme.c @@ -83,25 +83,21 @@ typedef struct { /* Memory mapped registers */ typedef volatile struct { - uint64_t cap; - uint32_t vs; - uint32_t intms; - uint32_t intmc; - uint32_t cc; - uint32_t reserved0; - uint32_t csts; - uint32_t nssr; - uint32_t aqa; - uint64_t asq; - uint64_t acq; - uint32_t cmbloc; - uint32_t cmbsz; - uint8_t reserved1[0xec0]; - uint8_t cmd_set_specfic[0x100]; - uint32_t doorbells[]; + NvmeBar ctrl; + struct { + uint32_t sq_tail; + uint32_t cq_head; + } doorbells[]; } NVMeRegs; -QEMU_BUILD_BUG_ON(offsetof(NVMeRegs, doorbells) != 0x1000); +#define INDEX_ADMIN 0 +#define INDEX_IO(n) (1 + n) + +/* This driver shares a single MSIX IRQ for the admin and I/O queues */ +enum { + MSIX_SHARED_IRQ_IDX = 0, + MSIX_IRQ_COUNT = 1 +}; struct BDRVNVMeState { AioContext *aio_context; @@ -117,7 +113,7 @@ struct BDRVNVMeState { /* How many uint32_t elements does each doorbell entry take. */ size_t doorbell_scale; bool write_cache_supported; - EventNotifier irq_notifier; + EventNotifier irq_notifier[MSIX_IRQ_COUNT]; uint64_t nsze; /* Namespace size reported by identify command */ int nsid; /* The namespace id to read/write data. */ @@ -162,21 +158,20 @@ static QemuOptsList runtime_opts = { }, }; -static void nvme_init_queue(BlockDriverState *bs, NVMeQueue *q, +static void nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q, int nentries, int entry_bytes, Error **errp) { - BDRVNVMeState *s = bs->opaque; size_t bytes; int r; bytes = ROUND_UP(nentries * entry_bytes, s->page_size); q->head = q->tail = 0; - q->queue = qemu_try_blockalign0(bs, bytes); - + q->queue = qemu_try_memalign(s->page_size, bytes); if (!q->queue) { error_setg(errp, "Cannot allocate queue"); return; } + memset(q->queue, 0, bytes); r = qemu_vfio_dma_map(s->vfio, q->queue, bytes, false, &q->iova); if (r) { error_setg(errp, "Cannot map queue"); @@ -206,23 +201,31 @@ static void nvme_free_req_queue_cb(void *opaque) qemu_mutex_unlock(&q->lock); } -static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs, +static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s, + AioContext *aio_context, int idx, int size, Error **errp) { int i, r; - BDRVNVMeState *s = bs->opaque; Error *local_err = NULL; - NVMeQueuePair *q = g_new0(NVMeQueuePair, 1); + NVMeQueuePair *q; uint64_t prp_list_iova; + q = g_try_new0(NVMeQueuePair, 1); + if (!q) { + return NULL; + } + q->prp_list_pages = qemu_try_memalign(s->page_size, + s->page_size * NVME_NUM_REQS); + if (!q->prp_list_pages) { + goto fail; + } + memset(q->prp_list_pages, 0, s->page_size * NVME_NUM_REQS); qemu_mutex_init(&q->lock); q->s = s; q->index = idx; qemu_co_queue_init(&q->free_req_queue); - q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_NUM_REQS); - q->completion_bh = aio_bh_new(bdrv_get_aio_context(bs), - nvme_process_completion_bh, q); + q->completion_bh = aio_bh_new(aio_context, nvme_process_completion_bh, q); r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages, s->page_size * NVME_NUM_REQS, false, &prp_list_iova); @@ -239,19 +242,19 @@ static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs, req->prp_list_iova = prp_list_iova + i * s->page_size; } - nvme_init_queue(bs, &q->sq, size, NVME_SQ_ENTRY_BYTES, &local_err); + nvme_init_queue(s, &q->sq, size, NVME_SQ_ENTRY_BYTES, &local_err); if (local_err) { error_propagate(errp, local_err); goto fail; } - q->sq.doorbell = &s->regs->doorbells[idx * 2 * s->doorbell_scale]; + q->sq.doorbell = &s->regs->doorbells[idx * s->doorbell_scale].sq_tail; - nvme_init_queue(bs, &q->cq, size, NVME_CQ_ENTRY_BYTES, &local_err); + nvme_init_queue(s, &q->cq, size, NVME_CQ_ENTRY_BYTES, &local_err); if (local_err) { error_propagate(errp, local_err); goto fail; } - q->cq.doorbell = &s->regs->doorbells[(idx * 2 + 1) * s->doorbell_scale]; + q->cq.doorbell = &s->regs->doorbells[idx * s->doorbell_scale].cq_head; return q; fail: @@ -441,6 +444,9 @@ static void nvme_trace_command(const NvmeCmd *cmd) { int i; + if (!trace_event_get_state_backends(TRACE_NVME_SUBMIT_COMMAND_RAW)) { + return; + } for (i = 0; i < 8; ++i) { uint8_t *cmdp = (uint8_t *)cmd + i * 8; trace_nvme_submit_command_raw(cmdp[0], cmdp[1], cmdp[2], cmdp[3], @@ -479,6 +485,7 @@ static void nvme_cmd_sync_cb(void *opaque, int ret) static int nvme_cmd_sync(BlockDriverState *bs, NVMeQueuePair *q, NvmeCmd *cmd) { + AioContext *aio_context = bdrv_get_aio_context(bs); NVMeRequest *req; int ret = -EINPROGRESS; req = nvme_get_free_req(q); @@ -487,17 +494,18 @@ static int nvme_cmd_sync(BlockDriverState *bs, NVMeQueuePair *q, } nvme_submit_command(q, req, cmd, nvme_cmd_sync_cb, &ret); - BDRV_POLL_WHILE(bs, ret == -EINPROGRESS); + AIO_WAIT_WHILE(aio_context, ret == -EINPROGRESS); return ret; } static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp) { BDRVNVMeState *s = bs->opaque; - NvmeIdCtrl *idctrl; - NvmeIdNs *idns; + union { + NvmeIdCtrl ctrl; + NvmeIdNs ns; + } *id; NvmeLBAF *lbaf; - uint8_t *resp; uint16_t oncs; int r; uint64_t iova; @@ -506,54 +514,52 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp) .cdw10 = cpu_to_le32(0x1), }; - resp = qemu_try_blockalign0(bs, sizeof(NvmeIdCtrl)); - if (!resp) { + id = qemu_try_memalign(s->page_size, sizeof(*id)); + if (!id) { error_setg(errp, "Cannot allocate buffer for identify response"); goto out; } - idctrl = (NvmeIdCtrl *)resp; - idns = (NvmeIdNs *)resp; - r = qemu_vfio_dma_map(s->vfio, resp, sizeof(NvmeIdCtrl), true, &iova); + r = qemu_vfio_dma_map(s->vfio, id, sizeof(*id), true, &iova); if (r) { error_setg(errp, "Cannot map buffer for DMA"); goto out; } - cmd.dptr.prp1 = cpu_to_le64(iova); - if (nvme_cmd_sync(bs, s->queues[0], &cmd)) { + memset(id, 0, sizeof(*id)); + cmd.dptr.prp1 = cpu_to_le64(iova); + if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) { error_setg(errp, "Failed to identify controller"); goto out; } - if (le32_to_cpu(idctrl->nn) < namespace) { + if (le32_to_cpu(id->ctrl.nn) < namespace) { error_setg(errp, "Invalid namespace"); goto out; } - s->write_cache_supported = le32_to_cpu(idctrl->vwc) & 0x1; - s->max_transfer = (idctrl->mdts ? 1 << idctrl->mdts : 0) * s->page_size; + s->write_cache_supported = le32_to_cpu(id->ctrl.vwc) & 0x1; + s->max_transfer = (id->ctrl.mdts ? 1 << id->ctrl.mdts : 0) * s->page_size; /* For now the page list buffer per command is one page, to hold at most * s->page_size / sizeof(uint64_t) entries. */ s->max_transfer = MIN_NON_ZERO(s->max_transfer, s->page_size / sizeof(uint64_t) * s->page_size); - oncs = le16_to_cpu(idctrl->oncs); + oncs = le16_to_cpu(id->ctrl.oncs); s->supports_write_zeroes = !!(oncs & NVME_ONCS_WRITE_ZEROES); s->supports_discard = !!(oncs & NVME_ONCS_DSM); - memset(resp, 0, 4096); - + memset(id, 0, sizeof(*id)); cmd.cdw10 = 0; cmd.nsid = cpu_to_le32(namespace); - if (nvme_cmd_sync(bs, s->queues[0], &cmd)) { + if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) { error_setg(errp, "Failed to identify namespace"); goto out; } - s->nsze = le64_to_cpu(idns->nsze); - lbaf = &idns->lbaf[NVME_ID_NS_FLBAS_INDEX(idns->flbas)]; + s->nsze = le64_to_cpu(id->ns.nsze); + lbaf = &id->ns.lbaf[NVME_ID_NS_FLBAS_INDEX(id->ns.flbas)]; - if (NVME_ID_NS_DLFEAT_WRITE_ZEROES(idns->dlfeat) && - NVME_ID_NS_DLFEAT_READ_BEHAVIOR(idns->dlfeat) == + if (NVME_ID_NS_DLFEAT_WRITE_ZEROES(id->ns.dlfeat) && + NVME_ID_NS_DLFEAT_READ_BEHAVIOR(id->ns.dlfeat) == NVME_ID_NS_DLFEAT_READ_BEHAVIOR_ZEROES) { bs->supported_write_flags |= BDRV_REQ_MAY_UNMAP; } @@ -573,8 +579,34 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp) s->blkshift = lbaf->ds; out: - qemu_vfio_dma_unmap(s->vfio, resp); - qemu_vfree(resp); + qemu_vfio_dma_unmap(s->vfio, id); + qemu_vfree(id); +} + +static bool nvme_poll_queue(NVMeQueuePair *q) +{ + bool progress = false; + + const size_t cqe_offset = q->cq.head * NVME_CQ_ENTRY_BYTES; + NvmeCqe *cqe = (NvmeCqe *)&q->cq.queue[cqe_offset]; + + /* + * Do an early check for completions. q->lock isn't needed because + * nvme_process_completion() only runs in the event loop thread and + * cannot race with itself. + */ + if ((le16_to_cpu(cqe->status) & 0x1) == q->cq_phase) { + return false; + } + + qemu_mutex_lock(&q->lock); + while (nvme_process_completion(q)) { + /* Keep polling */ + progress = true; + } + qemu_mutex_unlock(&q->lock); + + return progress; } static bool nvme_poll_queues(BDRVNVMeState *s) @@ -583,32 +615,17 @@ static bool nvme_poll_queues(BDRVNVMeState *s) int i; for (i = 0; i < s->nr_queues; i++) { - NVMeQueuePair *q = s->queues[i]; - const size_t cqe_offset = q->cq.head * NVME_CQ_ENTRY_BYTES; - NvmeCqe *cqe = (NvmeCqe *)&q->cq.queue[cqe_offset]; - - /* - * Do an early check for completions. q->lock isn't needed because - * nvme_process_completion() only runs in the event loop thread and - * cannot race with itself. - */ - if ((le16_to_cpu(cqe->status) & 0x1) == q->cq_phase) { - continue; - } - - qemu_mutex_lock(&q->lock); - while (nvme_process_completion(q)) { - /* Keep polling */ + if (nvme_poll_queue(s->queues[i])) { progress = true; } - qemu_mutex_unlock(&q->lock); } return progress; } static void nvme_handle_event(EventNotifier *n) { - BDRVNVMeState *s = container_of(n, BDRVNVMeState, irq_notifier); + BDRVNVMeState *s = container_of(n, BDRVNVMeState, + irq_notifier[MSIX_SHARED_IRQ_IDX]); trace_nvme_handle_event(s); event_notifier_test_and_clear(n); @@ -623,7 +640,8 @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp) NvmeCmd cmd; int queue_size = NVME_QUEUE_SIZE; - q = nvme_create_queue_pair(bs, n, queue_size, errp); + q = nvme_create_queue_pair(s, bdrv_get_aio_context(bs), + n, queue_size, errp); if (!q) { return false; } @@ -633,10 +651,9 @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp) .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0xFFFF)), .cdw11 = cpu_to_le32(0x3), }; - if (nvme_cmd_sync(bs, s->queues[0], &cmd)) { - error_setg(errp, "Failed to create io queue [%d]", n); - nvme_free_queue_pair(q); - return false; + if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) { + error_setg(errp, "Failed to create CQ io queue [%d]", n); + goto out_error; } cmd = (NvmeCmd) { .opcode = NVME_ADM_CMD_CREATE_SQ, @@ -644,21 +661,24 @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp) .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0xFFFF)), .cdw11 = cpu_to_le32(0x1 | (n << 16)), }; - if (nvme_cmd_sync(bs, s->queues[0], &cmd)) { - error_setg(errp, "Failed to create io queue [%d]", n); - nvme_free_queue_pair(q); - return false; + if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) { + error_setg(errp, "Failed to create SQ io queue [%d]", n); + goto out_error; } s->queues = g_renew(NVMeQueuePair *, s->queues, n + 1); s->queues[n] = q; s->nr_queues++; return true; +out_error: + nvme_free_queue_pair(q); + return false; } static bool nvme_poll_cb(void *opaque) { EventNotifier *e = opaque; - BDRVNVMeState *s = container_of(e, BDRVNVMeState, irq_notifier); + BDRVNVMeState *s = container_of(e, BDRVNVMeState, + irq_notifier[MSIX_SHARED_IRQ_IDX]); trace_nvme_poll_cb(s); return nvme_poll_queues(s); @@ -668,6 +688,7 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace, Error **errp) { BDRVNVMeState *s = bs->opaque; + AioContext *aio_context = bdrv_get_aio_context(bs); int ret; uint64_t cap; uint64_t timeout_ms; @@ -679,7 +700,7 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace, s->device = g_strdup(device); s->nsid = namespace; s->aio_context = bdrv_get_aio_context(bs); - ret = event_notifier_init(&s->irq_notifier, 0); + ret = event_notifier_init(&s->irq_notifier[MSIX_SHARED_IRQ_IDX], 0); if (ret) { error_setg(errp, "Failed to init event notifier"); return ret; @@ -700,7 +721,7 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace, /* Perform initialize sequence as described in NVMe spec "7.6.1 * Initialization". */ - cap = le64_to_cpu(s->regs->cap); + cap = le64_to_cpu(s->regs->ctrl.cap); if (!(cap & (1ULL << 37))) { error_setg(errp, "Device doesn't support NVMe command set"); ret = -EINVAL; @@ -713,10 +734,10 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace, timeout_ms = MIN(500 * ((cap >> 24) & 0xFF), 30000); /* Reset device to get a clean state. */ - s->regs->cc = cpu_to_le32(le32_to_cpu(s->regs->cc) & 0xFE); + s->regs->ctrl.cc = cpu_to_le32(le32_to_cpu(s->regs->ctrl.cc) & 0xFE); /* Wait for CSTS.RDY = 0. */ - deadline = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + timeout_ms * 1000000ULL; - while (le32_to_cpu(s->regs->csts) & 0x1) { + deadline = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + timeout_ms * SCALE_MS; + while (le32_to_cpu(s->regs->ctrl.csts) & 0x1) { if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) > deadline) { error_setg(errp, "Timeout while waiting for device to reset (%" PRId64 " ms)", @@ -728,25 +749,27 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace, /* Set up admin queue. */ s->queues = g_new(NVMeQueuePair *, 1); - s->queues[0] = nvme_create_queue_pair(bs, 0, NVME_QUEUE_SIZE, errp); - if (!s->queues[0]) { + s->queues[INDEX_ADMIN] = nvme_create_queue_pair(s, aio_context, 0, + NVME_QUEUE_SIZE, + errp); + if (!s->queues[INDEX_ADMIN]) { ret = -EINVAL; goto out; } s->nr_queues = 1; QEMU_BUILD_BUG_ON(NVME_QUEUE_SIZE & 0xF000); - s->regs->aqa = cpu_to_le32((NVME_QUEUE_SIZE << 16) | NVME_QUEUE_SIZE); - s->regs->asq = cpu_to_le64(s->queues[0]->sq.iova); - s->regs->acq = cpu_to_le64(s->queues[0]->cq.iova); + s->regs->ctrl.aqa = cpu_to_le32((NVME_QUEUE_SIZE << 16) | NVME_QUEUE_SIZE); + s->regs->ctrl.asq = cpu_to_le64(s->queues[INDEX_ADMIN]->sq.iova); + s->regs->ctrl.acq = cpu_to_le64(s->queues[INDEX_ADMIN]->cq.iova); /* After setting up all control registers we can enable device now. */ - s->regs->cc = cpu_to_le32((ctz32(NVME_CQ_ENTRY_BYTES) << 20) | + s->regs->ctrl.cc = cpu_to_le32((ctz32(NVME_CQ_ENTRY_BYTES) << 20) | (ctz32(NVME_SQ_ENTRY_BYTES) << 16) | 0x1); /* Wait for CSTS.RDY = 1. */ now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); deadline = now + timeout_ms * 1000000; - while (!(le32_to_cpu(s->regs->csts) & 0x1)) { + while (!(le32_to_cpu(s->regs->ctrl.csts) & 0x1)) { if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) > deadline) { error_setg(errp, "Timeout while waiting for device to start (%" PRId64 " ms)", @@ -756,12 +779,13 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace, } } - ret = qemu_vfio_pci_init_irq(s->vfio, &s->irq_notifier, + ret = qemu_vfio_pci_init_irq(s->vfio, s->irq_notifier, VFIO_PCI_MSIX_IRQ_INDEX, errp); if (ret) { goto out; } - aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier, + aio_set_event_notifier(bdrv_get_aio_context(bs), + &s->irq_notifier[MSIX_SHARED_IRQ_IDX], false, nvme_handle_event, nvme_poll_cb); nvme_identify(bs, namespace, &local_err); @@ -828,7 +852,7 @@ static int nvme_enable_disable_write_cache(BlockDriverState *bs, bool enable, .cdw11 = cpu_to_le32(enable ? 0x01 : 0x00), }; - ret = nvme_cmd_sync(bs, s->queues[0], &cmd); + ret = nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd); if (ret) { error_setg(errp, "Failed to configure NVMe write cache"); } @@ -844,9 +868,10 @@ static void nvme_close(BlockDriverState *bs) nvme_free_queue_pair(s->queues[i]); } g_free(s->queues); - aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier, + aio_set_event_notifier(bdrv_get_aio_context(bs), + &s->irq_notifier[MSIX_SHARED_IRQ_IDX], false, NULL, NULL); - event_notifier_cleanup(&s->irq_notifier); + event_notifier_cleanup(&s->irq_notifier[MSIX_SHARED_IRQ_IDX]); qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)s->regs, 0, NVME_BAR_SIZE); qemu_vfio_close(s->vfio); @@ -1045,7 +1070,7 @@ static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs, { int r; BDRVNVMeState *s = bs->opaque; - NVMeQueuePair *ioq = s->queues[1]; + NVMeQueuePair *ioq = s->queues[INDEX_IO(0)]; NVMeRequest *req; uint32_t cdw12 = (((bytes >> s->blkshift) - 1) & 0xFFFF) | @@ -1124,7 +1149,7 @@ static int nvme_co_prw(BlockDriverState *bs, uint64_t offset, uint64_t bytes, return nvme_co_prw_aligned(bs, offset, bytes, qiov, is_write, flags); } trace_nvme_prw_buffered(s, offset, bytes, qiov->niov, is_write); - buf = qemu_try_blockalign(bs, bytes); + buf = qemu_try_memalign(s->page_size, bytes); if (!buf) { return -ENOMEM; @@ -1160,7 +1185,7 @@ static coroutine_fn int nvme_co_pwritev(BlockDriverState *bs, static coroutine_fn int nvme_co_flush(BlockDriverState *bs) { BDRVNVMeState *s = bs->opaque; - NVMeQueuePair *ioq = s->queues[1]; + NVMeQueuePair *ioq = s->queues[INDEX_IO(0)]; NVMeRequest *req; NvmeCmd cmd = { .opcode = NVME_CMD_FLUSH, @@ -1191,7 +1216,7 @@ static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs, BdrvRequestFlags flags) { BDRVNVMeState *s = bs->opaque; - NVMeQueuePair *ioq = s->queues[1]; + NVMeQueuePair *ioq = s->queues[INDEX_IO(0)]; NVMeRequest *req; uint32_t cdw12 = ((bytes >> s->blkshift) - 1) & 0xFFFF; @@ -1244,7 +1269,7 @@ static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs, int bytes) { BDRVNVMeState *s = bs->opaque; - NVMeQueuePair *ioq = s->queues[1]; + NVMeQueuePair *ioq = s->queues[INDEX_IO(0)]; NVMeRequest *req; NvmeDsmRange *buf; QEMUIOVector local_qiov; @@ -1268,11 +1293,11 @@ static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs, assert(s->nr_queues > 1); - buf = qemu_try_blockalign0(bs, s->page_size); + buf = qemu_try_memalign(s->page_size, s->page_size); if (!buf) { return -ENOMEM; } - + memset(buf, 0, s->page_size); buf->nlb = cpu_to_le32(bytes >> s->blkshift); buf->slba = cpu_to_le64(offset >> s->blkshift); buf->cattr = 0; @@ -1353,7 +1378,8 @@ static void nvme_detach_aio_context(BlockDriverState *bs) q->completion_bh = NULL; } - aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier, + aio_set_event_notifier(bdrv_get_aio_context(bs), + &s->irq_notifier[MSIX_SHARED_IRQ_IDX], false, NULL, NULL); } @@ -1363,7 +1389,7 @@ static void nvme_attach_aio_context(BlockDriverState *bs, BDRVNVMeState *s = bs->opaque; s->aio_context = new_context; - aio_set_event_notifier(new_context, &s->irq_notifier, + aio_set_event_notifier(new_context, &s->irq_notifier[MSIX_SHARED_IRQ_IDX], false, nvme_handle_event, nvme_poll_cb); for (int i = 0; i < s->nr_queues; i++) { @@ -1387,7 +1413,7 @@ static void nvme_aio_unplug(BlockDriverState *bs) BDRVNVMeState *s = bs->opaque; assert(s->plugged); s->plugged = false; - for (i = 1; i < s->nr_queues; i++) { + for (i = INDEX_IO(0); i < s->nr_queues; i++) { NVMeQueuePair *q = s->queues[i]; qemu_mutex_lock(&q->lock); nvme_kick(q); diff --git a/block/qapi.c b/block/qapi.c index afd9f3b4a7..f423ece98c 100644 --- a/block/qapi.c +++ b/block/qapi.c @@ -47,7 +47,7 @@ BlockDeviceInfo *bdrv_block_device_info(BlockBackend *blk, Error **errp) { ImageInfo **p_image_info; - BlockDriverState *bs0; + BlockDriverState *bs0, *backing; BlockDeviceInfo *info; if (!bs->drv) { @@ -76,9 +76,10 @@ BlockDeviceInfo *bdrv_block_device_info(BlockBackend *blk, info->node_name = g_strdup(bs->node_name); } - if (bs->backing_file[0]) { + backing = bdrv_cow_bs(bs); + if (backing) { info->has_backing_file = true; - info->backing_file = g_strdup(bs->backing_file); + info->backing_file = g_strdup(backing->filename); } if (!QLIST_EMPTY(&bs->dirty_bitmaps)) { @@ -163,9 +164,13 @@ BlockDeviceInfo *bdrv_block_device_info(BlockBackend *blk, break; } - if (bs0->drv && bs0->backing) { + if (bs0->drv && bdrv_filter_or_cow_child(bs0)) { + /* + * Put any filtered child here (for backwards compatibility to when + * we put bs0->backing here, which might be any filtered child). + */ info->backing_file_depth++; - bs0 = bs0->backing->bs; + bs0 = bdrv_filter_or_cow_bs(bs0); (*p_image_info)->has_backing_image = true; p_image_info = &((*p_image_info)->backing_image); } else { @@ -174,9 +179,8 @@ BlockDeviceInfo *bdrv_block_device_info(BlockBackend *blk, /* Skip automatically inserted nodes that the user isn't aware of for * query-block (blk != NULL), but not for query-named-block-nodes */ - while (blk && bs0->drv && bs0->implicit) { - bs0 = backing_bs(bs0); - assert(bs0); + if (blk) { + bs0 = bdrv_skip_implicit_filters(bs0); } } @@ -288,7 +292,7 @@ void bdrv_query_image_info(BlockDriverState *bs, info->virtual_size = size; info->actual_size = bdrv_get_allocated_file_size(bs); info->has_actual_size = info->actual_size >= 0; - if (bdrv_is_encrypted(bs)) { + if (bs->encrypted) { info->encrypted = true; info->has_encrypted = true; } @@ -311,6 +315,7 @@ void bdrv_query_image_info(BlockDriverState *bs, backing_filename = bs->backing_file; if (backing_filename[0] != '\0') { char *backing_filename2; + info->backing_filename = g_strdup(backing_filename); info->has_backing_filename = true; backing_filename2 = bdrv_get_full_backing_filename(bs, NULL); @@ -362,9 +367,7 @@ static void bdrv_query_info(BlockBackend *blk, BlockInfo **p_info, char *qdev; /* Skip automatically inserted nodes that the user isn't aware of */ - while (bs && bs->drv && bs->implicit) { - bs = backing_bs(bs); - } + bs = bdrv_skip_implicit_filters(bs); info->device = g_strdup(blk_name(blk)); info->type = g_strdup("unknown"); @@ -526,6 +529,8 @@ static void bdrv_query_blk_stats(BlockDeviceStats *ds, BlockBackend *blk) static BlockStats *bdrv_query_bds_stats(BlockDriverState *bs, bool blk_level) { + BdrvChild *parent_child; + BlockDriverState *filter_or_cow_bs; BlockStats *s = NULL; s = g_malloc0(sizeof(*s)); @@ -538,9 +543,8 @@ static BlockStats *bdrv_query_bds_stats(BlockDriverState *bs, /* Skip automatically inserted nodes that the user isn't aware of in * a BlockBackend-level command. Stay at the exact node for a node-level * command. */ - while (blk_level && bs->drv && bs->implicit) { - bs = backing_bs(bs); - assert(bs); + if (blk_level) { + bs = bdrv_skip_implicit_filters(bs); } if (bdrv_get_node_name(bs)[0]) { @@ -555,14 +559,46 @@ static BlockStats *bdrv_query_bds_stats(BlockDriverState *bs, s->has_driver_specific = true; } - if (bs->file) { + parent_child = bdrv_primary_child(bs); + if (!parent_child || + !(parent_child->role & (BDRV_CHILD_DATA | BDRV_CHILD_FILTERED))) + { + BdrvChild *c; + + /* + * Look for a unique data-storing child. We do not need to look for + * filtered children, as there would be only one and it would have been + * the primary child. + */ + parent_child = NULL; + QLIST_FOREACH(c, &bs->children, next) { + if (c->role & BDRV_CHILD_DATA) { + if (parent_child) { + /* + * There are multiple data-storing children and we cannot + * choose between them. + */ + parent_child = NULL; + break; + } + parent_child = c; + } + } + } + if (parent_child) { s->has_parent = true; - s->parent = bdrv_query_bds_stats(bs->file->bs, blk_level); + s->parent = bdrv_query_bds_stats(parent_child->bs, blk_level); } - if (blk_level && bs->backing) { + filter_or_cow_bs = bdrv_filter_or_cow_bs(bs); + if (blk_level && filter_or_cow_bs) { + /* + * Put any filtered or COW child here (for backwards + * compatibility to when we put bs0->backing here, which might + * be either) + */ s->has_backing = true; - s->backing = bdrv_query_bds_stats(bs->backing->bs, blk_level); + s->backing = bdrv_query_bds_stats(filter_or_cow_bs, blk_level); } return s; diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 996b3314f4..fcdf7af8e6 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -1320,6 +1320,7 @@ static bool cluster_needs_new_alloc(BlockDriverState *bs, uint64_t l2_entry) if (l2_entry & QCOW_OFLAG_COPIED) { return false; } + /* fallthrough */ case QCOW2_CLUSTER_UNALLOCATED: case QCOW2_CLUSTER_COMPRESSED: case QCOW2_CLUSTER_ZERO_PLAIN: diff --git a/block/snapshot.c b/block/snapshot.c index bd9fb01817..a2bf3a54eb 100644 --- a/block/snapshot.c +++ b/block/snapshot.c @@ -147,6 +147,56 @@ bool bdrv_snapshot_find_by_id_and_name(BlockDriverState *bs, return ret; } +/** + * Return a pointer to the child BDS pointer to which we can fall + * back if the given BDS does not support snapshots. + * Return NULL if there is no BDS to (safely) fall back to. + * + * We need to return an indirect pointer because bdrv_snapshot_goto() + * has to modify the BdrvChild pointer. + */ +static BdrvChild **bdrv_snapshot_fallback_ptr(BlockDriverState *bs) +{ + BdrvChild **fallback; + BdrvChild *child; + + /* + * The only BdrvChild pointers that are safe to modify (and which + * we can thus return a reference to) are bs->file and + * bs->backing. + */ + fallback = &bs->file; + if (!*fallback && bs->drv && bs->drv->is_filter) { + fallback = &bs->backing; + } + + if (!*fallback) { + return NULL; + } + + /* + * Check that there are no other children that would need to be + * snapshotted. If there are, it is not safe to fall back to + * *fallback. + */ + QLIST_FOREACH(child, &bs->children, next) { + if (child->role & (BDRV_CHILD_DATA | BDRV_CHILD_METADATA | + BDRV_CHILD_FILTERED) && + child != *fallback) + { + return NULL; + } + } + + return fallback; +} + +static BlockDriverState *bdrv_snapshot_fallback(BlockDriverState *bs) +{ + BdrvChild **child_ptr = bdrv_snapshot_fallback_ptr(bs); + return child_ptr ? (*child_ptr)->bs : NULL; +} + int bdrv_can_snapshot(BlockDriverState *bs) { BlockDriver *drv = bs->drv; @@ -155,8 +205,9 @@ int bdrv_can_snapshot(BlockDriverState *bs) } if (!drv->bdrv_snapshot_create) { - if (bs->file != NULL) { - return bdrv_can_snapshot(bs->file->bs); + BlockDriverState *fallback_bs = bdrv_snapshot_fallback(bs); + if (fallback_bs) { + return bdrv_can_snapshot(fallback_bs); } return 0; } @@ -168,14 +219,15 @@ int bdrv_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) { BlockDriver *drv = bs->drv; + BlockDriverState *fallback_bs = bdrv_snapshot_fallback(bs); if (!drv) { return -ENOMEDIUM; } if (drv->bdrv_snapshot_create) { return drv->bdrv_snapshot_create(bs, sn_info); } - if (bs->file) { - return bdrv_snapshot_create(bs->file->bs, sn_info); + if (fallback_bs) { + return bdrv_snapshot_create(fallback_bs, sn_info); } return -ENOTSUP; } @@ -185,6 +237,7 @@ int bdrv_snapshot_goto(BlockDriverState *bs, Error **errp) { BlockDriver *drv = bs->drv; + BdrvChild **fallback_ptr; int ret, open_ret; if (!drv) { @@ -205,39 +258,46 @@ int bdrv_snapshot_goto(BlockDriverState *bs, return ret; } - if (bs->file) { - BlockDriverState *file; - QDict *options = qdict_clone_shallow(bs->options); + fallback_ptr = bdrv_snapshot_fallback_ptr(bs); + if (fallback_ptr) { + QDict *options; QDict *file_options; Error *local_err = NULL; + BlockDriverState *fallback_bs = (*fallback_ptr)->bs; + char *subqdict_prefix = g_strdup_printf("%s.", (*fallback_ptr)->name); + + options = qdict_clone_shallow(bs->options); - file = bs->file->bs; /* Prevent it from getting deleted when detached from bs */ - bdrv_ref(file); + bdrv_ref(fallback_bs); - qdict_extract_subqdict(options, &file_options, "file."); + qdict_extract_subqdict(options, &file_options, subqdict_prefix); qobject_unref(file_options); - qdict_put_str(options, "file", bdrv_get_node_name(file)); + g_free(subqdict_prefix); + + qdict_put_str(options, (*fallback_ptr)->name, + bdrv_get_node_name(fallback_bs)); if (drv->bdrv_close) { drv->bdrv_close(bs); } - bdrv_unref_child(bs, bs->file); - bs->file = NULL; - ret = bdrv_snapshot_goto(file, snapshot_id, errp); + bdrv_unref_child(bs, *fallback_ptr); + *fallback_ptr = NULL; + + ret = bdrv_snapshot_goto(fallback_bs, snapshot_id, errp); open_ret = drv->bdrv_open(bs, options, bs->open_flags, &local_err); qobject_unref(options); if (open_ret < 0) { - bdrv_unref(file); + bdrv_unref(fallback_bs); bs->drv = NULL; /* A bdrv_snapshot_goto() error takes precedence */ error_propagate(errp, local_err); return ret < 0 ? ret : open_ret; } - assert(bs->file->bs == file); - bdrv_unref(file); + assert(fallback_bs == (*fallback_ptr)->bs); + bdrv_unref(fallback_bs); return ret; } @@ -273,6 +333,7 @@ int bdrv_snapshot_delete(BlockDriverState *bs, Error **errp) { BlockDriver *drv = bs->drv; + BlockDriverState *fallback_bs = bdrv_snapshot_fallback(bs); int ret; if (!drv) { @@ -289,8 +350,8 @@ int bdrv_snapshot_delete(BlockDriverState *bs, if (drv->bdrv_snapshot_delete) { ret = drv->bdrv_snapshot_delete(bs, snapshot_id, name, errp); - } else if (bs->file) { - ret = bdrv_snapshot_delete(bs->file->bs, snapshot_id, name, errp); + } else if (fallback_bs) { + ret = bdrv_snapshot_delete(fallback_bs, snapshot_id, name, errp); } else { error_setg(errp, "Block format '%s' used by device '%s' " "does not support internal snapshot deletion", @@ -306,14 +367,15 @@ int bdrv_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_info) { BlockDriver *drv = bs->drv; + BlockDriverState *fallback_bs = bdrv_snapshot_fallback(bs); if (!drv) { return -ENOMEDIUM; } if (drv->bdrv_snapshot_list) { return drv->bdrv_snapshot_list(bs, psn_info); } - if (bs->file) { - return bdrv_snapshot_list(bs->file->bs, psn_info); + if (fallback_bs) { + return bdrv_snapshot_list(fallback_bs, psn_info); } return -ENOTSUP; } diff --git a/block/stream.c b/block/stream.c index 310ccbaa4c..8ce6729a33 100644 --- a/block/stream.c +++ b/block/stream.c @@ -31,7 +31,8 @@ enum { typedef struct StreamBlockJob { BlockJob common; - BlockDriverState *bottom; + BlockDriverState *base_overlay; /* COW overlay (stream from this) */ + BlockDriverState *above_base; /* Node directly above the base */ BlockdevOnError on_error; char *backing_file_str; bool bs_read_only; @@ -53,7 +54,7 @@ static void stream_abort(Job *job) if (s->chain_frozen) { BlockJob *bjob = &s->common; - bdrv_unfreeze_backing_chain(blk_bs(bjob->blk), s->bottom); + bdrv_unfreeze_backing_chain(blk_bs(bjob->blk), s->above_base); } } @@ -62,14 +63,15 @@ static int stream_prepare(Job *job) StreamBlockJob *s = container_of(job, StreamBlockJob, common.job); BlockJob *bjob = &s->common; BlockDriverState *bs = blk_bs(bjob->blk); - BlockDriverState *base = backing_bs(s->bottom); + BlockDriverState *unfiltered_bs = bdrv_skip_filters(bs); + BlockDriverState *base = bdrv_filter_or_cow_bs(s->above_base); Error *local_err = NULL; int ret = 0; - bdrv_unfreeze_backing_chain(bs, s->bottom); + bdrv_unfreeze_backing_chain(bs, s->above_base); s->chain_frozen = false; - if (bs->backing) { + if (bdrv_cow_child(unfiltered_bs)) { const char *base_id = NULL, *base_fmt = NULL; if (base) { base_id = s->backing_file_str; @@ -77,8 +79,8 @@ static int stream_prepare(Job *job) base_fmt = base->drv->format_name; } } - bdrv_set_backing_hd(bs, base, &local_err); - ret = bdrv_change_backing_file(bs, base_id, base_fmt, false); + bdrv_set_backing_hd(unfiltered_bs, base, &local_err); + ret = bdrv_change_backing_file(unfiltered_bs, base_id, base_fmt, false); if (local_err) { error_report_err(local_err); return -EPERM; @@ -109,14 +111,15 @@ static int coroutine_fn stream_run(Job *job, Error **errp) StreamBlockJob *s = container_of(job, StreamBlockJob, common.job); BlockBackend *blk = s->common.blk; BlockDriverState *bs = blk_bs(blk); - bool enable_cor = !backing_bs(s->bottom); + BlockDriverState *unfiltered_bs = bdrv_skip_filters(bs); + bool enable_cor = !bdrv_cow_child(s->base_overlay); int64_t len; int64_t offset = 0; uint64_t delay_ns = 0; int error = 0; int64_t n = 0; /* bytes */ - if (bs == s->bottom) { + if (unfiltered_bs == s->base_overlay) { /* Nothing to stream */ return 0; } @@ -150,13 +153,14 @@ static int coroutine_fn stream_run(Job *job, Error **errp) copy = false; - ret = bdrv_is_allocated(bs, offset, STREAM_CHUNK, &n); + ret = bdrv_is_allocated(unfiltered_bs, offset, STREAM_CHUNK, &n); if (ret == 1) { /* Allocated in the top, no need to copy. */ } else if (ret >= 0) { /* Copy if allocated in the intermediate images. Limit to the * known-unallocated area [offset, offset+n*BDRV_SECTOR_SIZE). */ - ret = bdrv_is_allocated_above(backing_bs(bs), s->bottom, true, + ret = bdrv_is_allocated_above(bdrv_cow_bs(unfiltered_bs), + s->base_overlay, true, offset, n, &n); /* Finish early if end of backing file has been reached */ if (ret == 0 && n == 0) { @@ -223,9 +227,29 @@ void stream_start(const char *job_id, BlockDriverState *bs, BlockDriverState *iter; bool bs_read_only; int basic_flags = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED; - BlockDriverState *bottom = bdrv_find_overlay(bs, base); + BlockDriverState *base_overlay = bdrv_find_overlay(bs, base); + BlockDriverState *above_base; - if (bdrv_freeze_backing_chain(bs, bottom, errp) < 0) { + if (!base_overlay) { + error_setg(errp, "'%s' is not in the backing chain of '%s'", + base->node_name, bs->node_name); + return; + } + + /* + * Find the node directly above @base. @base_overlay is a COW overlay, so + * it must have a bdrv_cow_child(), but it is the immediate overlay of + * @base, so between the two there can only be filters. + */ + above_base = base_overlay; + if (bdrv_cow_bs(above_base) != base) { + above_base = bdrv_cow_bs(above_base); + while (bdrv_filter_bs(above_base) != base) { + above_base = bdrv_filter_bs(above_base); + } + } + + if (bdrv_freeze_backing_chain(bs, above_base, errp) < 0) { return; } @@ -255,14 +279,19 @@ void stream_start(const char *job_id, BlockDriverState *bs, * and resizes. Reassign the base node pointer because the backing BS of the * bottom node might change after the call to bdrv_reopen_set_read_only() * due to parallel block jobs running. + * above_base node might change after the call to + * bdrv_reopen_set_read_only() due to parallel block jobs running. */ - base = backing_bs(bottom); - for (iter = backing_bs(bs); iter && iter != base; iter = backing_bs(iter)) { + base = bdrv_filter_or_cow_bs(above_base); + for (iter = bdrv_filter_or_cow_bs(bs); iter != base; + iter = bdrv_filter_or_cow_bs(iter)) + { block_job_add_bdrv(&s->common, "intermediate node", iter, 0, basic_flags, &error_abort); } - s->bottom = bottom; + s->base_overlay = base_overlay; + s->above_base = above_base; s->backing_file_str = g_strdup(backing_file_str); s->bs_read_only = bs_read_only; s->chain_frozen = true; @@ -276,5 +305,5 @@ fail: if (bs_read_only) { bdrv_reopen_set_read_only(bs, true, NULL); } - bdrv_unfreeze_backing_chain(bs, bottom); + bdrv_unfreeze_backing_chain(bs, above_base); } diff --git a/block/throttle.c b/block/throttle.c index 1c1ac57bee..9a0f38149a 100644 --- a/block/throttle.c +++ b/block/throttle.c @@ -151,6 +151,15 @@ static int coroutine_fn throttle_co_pdiscard(BlockDriverState *bs, return bdrv_co_pdiscard(bs->file, offset, bytes); } +static int coroutine_fn throttle_co_pwritev_compressed(BlockDriverState *bs, + uint64_t offset, + uint64_t bytes, + QEMUIOVector *qiov) +{ + return throttle_co_pwritev(bs, offset, bytes, qiov, + BDRV_REQ_WRITE_COMPRESSED); +} + static int throttle_co_flush(BlockDriverState *bs) { return bdrv_co_flush(bs->file->bs); @@ -243,6 +252,7 @@ static BlockDriver bdrv_throttle = { .bdrv_co_pwrite_zeroes = throttle_co_pwrite_zeroes, .bdrv_co_pdiscard = throttle_co_pdiscard, + .bdrv_co_pwritev_compressed = throttle_co_pwritev_compressed, .bdrv_attach_aio_context = throttle_attach_aio_context, .bdrv_detach_aio_context = throttle_detach_aio_context, @@ -250,7 +260,6 @@ static BlockDriver bdrv_throttle = { .bdrv_reopen_prepare = throttle_reopen_prepare, .bdrv_reopen_commit = throttle_reopen_commit, .bdrv_reopen_abort = throttle_reopen_abort, - .bdrv_co_block_status = bdrv_co_block_status_from_file, .bdrv_co_drain_begin = throttle_co_drain_begin, .bdrv_co_drain_end = throttle_co_drain_end, diff --git a/block/vmdk.c b/block/vmdk.c index d90855446a..8ec62c7ab7 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -2803,21 +2803,6 @@ static void vmdk_close(BlockDriverState *bs) error_free(s->migration_blocker); } -static coroutine_fn int vmdk_co_flush(BlockDriverState *bs) -{ - BDRVVmdkState *s = bs->opaque; - int i, err; - int ret = 0; - - for (i = 0; i < s->num_extents; i++) { - err = bdrv_co_flush(s->extents[i].file->bs); - if (err < 0) { - ret = err; - } - } - return ret; -} - static int64_t vmdk_get_allocated_file_size(BlockDriverState *bs) { int i; @@ -3081,7 +3066,6 @@ static BlockDriver bdrv_vmdk = { .bdrv_close = vmdk_close, .bdrv_co_create_opts = vmdk_co_create_opts, .bdrv_co_create = vmdk_co_create, - .bdrv_co_flush_to_disk = vmdk_co_flush, .bdrv_co_block_status = vmdk_co_block_status, .bdrv_get_allocated_file_size = vmdk_get_allocated_file_size, .bdrv_has_zero_init = vmdk_has_zero_init, |