aboutsummaryrefslogtreecommitdiff
path: root/block
diff options
context:
space:
mode:
authorPeter Maydell <peter.maydell@linaro.org>2020-09-11 14:47:49 +0100
committerPeter Maydell <peter.maydell@linaro.org>2020-09-11 14:47:49 +0100
commit2499453eb1cbb68a45d7562a180afd7659007fd4 (patch)
tree71e256143ef313c34e6e00b7bddb1c8e4bb6e5d8 /block
parent922781b7b37de22a06269f25c9c1ae66293c5991 (diff)
parentb9be6faed1a069526efdc80df7318b16afc6e116 (diff)
Merge remote-tracking branch 'remotes/kevin/tags/for-upstream' into staging
Block layer patches: - qemu-img create: Fail gracefully when backing file is an empty string - Fixes related to filter block nodes ("Deal with filters" series) - block/nvme: Various cleanups required to use multiple queues - block/nvme: Use NvmeBar structure from "block/nvme.h" - file-win32: Fix "locking" option - iotests: Allow running from different directory # gpg: Signature made Thu 10 Sep 2020 10:11:19 BST # gpg: using RSA key DC3DEB159A9AF95D3D7456FE7F09B272C88F2FD6 # gpg: issuer "kwolf@redhat.com" # gpg: Good signature from "Kevin Wolf <kwolf@redhat.com>" [full] # Primary key fingerprint: DC3D EB15 9A9A F95D 3D74 56FE 7F09 B272 C88F 2FD6 * remotes/kevin/tags/for-upstream: (65 commits) block/qcow2-cluster: Add missing "fallthrough" annotation block/nvme: Pair doorbell registers block/nvme: Use generic NvmeBar structure block/nvme: Group controller registers in NVMeRegs structure file-win32: Fix "locking" option iotests: Allow running from different directory iotests: Test committing to overridden backing iotests: Add test for commit in sub directory iotests: Add filter mirror test cases iotests: Add filter commit test cases iotests: Let complete_and_wait() work with commit iotests: Test that qcow2's data-file is flushed block: Leave BDS.backing_{file,format} constant block: Inline bdrv_co_block_status_from_*() blockdev: Fix active commit choice block: Drop backing_bs() qemu-img: Use child access functions nbd: Use CAF when looking for dirty bitmap commit: Deal with filters backup: Deal with filters ... Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Diffstat (limited to 'block')
-rw-r--r--block/backup-top.c4
-rw-r--r--block/backup.c9
-rw-r--r--block/blkdebug.c7
-rw-r--r--block/blklogwrites.c1
-rw-r--r--block/block-backend.c7
-rw-r--r--block/block-copy.c4
-rw-r--r--block/commit.c95
-rw-r--r--block/copy-on-read.c13
-rw-r--r--block/file-win32.c22
-rw-r--r--block/filter-compress.c2
-rw-r--r--block/io.c142
-rw-r--r--block/mirror.c119
-rw-r--r--block/monitor/block-hmp-cmds.c2
-rw-r--r--block/null.c7
-rw-r--r--block/nvme.c248
-rw-r--r--block/qapi.c74
-rw-r--r--block/qcow2-cluster.c1
-rw-r--r--block/snapshot.c104
-rw-r--r--block/stream.c63
-rw-r--r--block/throttle.c11
-rw-r--r--block/vmdk.c16
21 files changed, 622 insertions, 329 deletions
diff --git a/block/backup-top.c b/block/backup-top.c
index af2f20f346..fe6883cc97 100644
--- a/block/backup-top.c
+++ b/block/backup-top.c
@@ -175,8 +175,6 @@ BlockDriver bdrv_backup_top_filter = {
.bdrv_co_pdiscard = backup_top_co_pdiscard,
.bdrv_co_flush = backup_top_co_flush,
- .bdrv_co_block_status = bdrv_co_block_status_from_backing,
-
.bdrv_refresh_filename = backup_top_refresh_filename,
.bdrv_child_perm = backup_top_child_perm,
@@ -281,7 +279,7 @@ void bdrv_backup_top_drop(BlockDriverState *bs)
s->active = false;
bdrv_child_refresh_perms(bs, bs->backing, &error_abort);
- bdrv_replace_node(bs, backing_bs(bs), &error_abort);
+ bdrv_replace_node(bs, bs->backing->bs, &error_abort);
bdrv_set_backing_hd(bs, NULL, &error_abort);
bdrv_drained_end(bs);
diff --git a/block/backup.c b/block/backup.c
index 4f13bb20a5..9afa0bf3b4 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -297,6 +297,7 @@ static int64_t backup_calculate_cluster_size(BlockDriverState *target,
{
int ret;
BlockDriverInfo bdi;
+ bool target_does_cow = bdrv_backing_chain_next(target);
/*
* If there is no backing file on the target, we cannot rely on COW if our
@@ -304,7 +305,7 @@ static int64_t backup_calculate_cluster_size(BlockDriverState *target,
* targets with a backing file, try to avoid COW if possible.
*/
ret = bdrv_get_info(target, &bdi);
- if (ret == -ENOTSUP && !target->backing) {
+ if (ret == -ENOTSUP && !target_does_cow) {
/* Cluster size is not defined */
warn_report("The target block device doesn't provide "
"information about the block size and it doesn't have a "
@@ -313,14 +314,14 @@ static int64_t backup_calculate_cluster_size(BlockDriverState *target,
"this default, the backup may be unusable",
BACKUP_CLUSTER_SIZE_DEFAULT);
return BACKUP_CLUSTER_SIZE_DEFAULT;
- } else if (ret < 0 && !target->backing) {
+ } else if (ret < 0 && !target_does_cow) {
error_setg_errno(errp, -ret,
"Couldn't determine the cluster size of the target image, "
"which has no backing file");
error_append_hint(errp,
"Aborting, since this may create an unusable destination image\n");
return ret;
- } else if (ret < 0 && target->backing) {
+ } else if (ret < 0 && target_does_cow) {
/* Not fatal; just trudge on ahead. */
return BACKUP_CLUSTER_SIZE_DEFAULT;
}
@@ -371,7 +372,7 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
return NULL;
}
- if (compress && !block_driver_can_compress(target->drv)) {
+ if (compress && !bdrv_supports_compressed_writes(target)) {
error_setg(errp, "Compression is not supported for this drive %s",
bdrv_get_device_name(target));
return NULL;
diff --git a/block/blkdebug.c b/block/blkdebug.c
index 9c08d8a005..eecbf3e5c4 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -752,8 +752,11 @@ static int coroutine_fn blkdebug_co_block_status(BlockDriverState *bs,
return err;
}
- return bdrv_co_block_status_from_file(bs, want_zero, offset, bytes,
- pnum, map, file);
+ assert(bs->file && bs->file->bs);
+ *pnum = bytes;
+ *map = offset;
+ *file = bs->file->bs;
+ return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
}
static void blkdebug_close(BlockDriverState *bs)
diff --git a/block/blklogwrites.c b/block/blklogwrites.c
index 57315f56b4..13ae63983b 100644
--- a/block/blklogwrites.c
+++ b/block/blklogwrites.c
@@ -515,7 +515,6 @@ static BlockDriver bdrv_blk_log_writes = {
.bdrv_co_pwrite_zeroes = blk_log_writes_co_pwrite_zeroes,
.bdrv_co_flush_to_disk = blk_log_writes_co_flush_to_disk,
.bdrv_co_pdiscard = blk_log_writes_co_pdiscard,
- .bdrv_co_block_status = bdrv_co_block_status_from_file,
.is_filter = true,
.strong_runtime_opts = blk_log_writes_strong_runtime_opts,
diff --git a/block/block-backend.c b/block/block-backend.c
index 3a13cb5f0b..24dd0670d1 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -2279,10 +2279,13 @@ int blk_commit_all(void)
while ((blk = blk_all_next(blk)) != NULL) {
AioContext *aio_context = blk_get_aio_context(blk);
+ BlockDriverState *unfiltered_bs = bdrv_skip_filters(blk_bs(blk));
aio_context_acquire(aio_context);
- if (blk_is_inserted(blk) && blk->root->bs->backing) {
- int ret = bdrv_commit(blk->root->bs);
+ if (blk_is_inserted(blk) && bdrv_cow_child(unfiltered_bs)) {
+ int ret;
+
+ ret = bdrv_commit(unfiltered_bs);
if (ret < 0) {
aio_context_release(aio_context);
return ret;
diff --git a/block/block-copy.c b/block/block-copy.c
index a30b9097ef..cd9bc47c8f 100644
--- a/block/block-copy.c
+++ b/block/block-copy.c
@@ -440,8 +440,8 @@ static int block_copy_block_status(BlockCopyState *s, int64_t offset,
BlockDriverState *base;
int ret;
- if (s->skip_unallocated && s->source->bs->backing) {
- base = s->source->bs->backing->bs;
+ if (s->skip_unallocated) {
+ base = bdrv_backing_chain_next(s->source->bs);
} else {
base = NULL;
}
diff --git a/block/commit.c b/block/commit.c
index 7732d02dfe..1e85c306cc 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -37,6 +37,7 @@ typedef struct CommitBlockJob {
BlockBackend *top;
BlockBackend *base;
BlockDriverState *base_bs;
+ BlockDriverState *base_overlay;
BlockdevOnError on_error;
bool base_read_only;
bool chain_frozen;
@@ -89,7 +90,7 @@ static void commit_abort(Job *job)
* XXX Can (or should) we somehow keep 'consistent read' blocked even
* after the failed/cancelled commit job is gone? If we already wrote
* something to base, the intermediate images aren't valid any more. */
- bdrv_replace_node(s->commit_top_bs, backing_bs(s->commit_top_bs),
+ bdrv_replace_node(s->commit_top_bs, s->commit_top_bs->backing->bs,
&error_abort);
bdrv_unref(s->commit_top_bs);
@@ -153,7 +154,7 @@ static int coroutine_fn commit_run(Job *job, Error **errp)
break;
}
/* Copy if allocated above the base */
- ret = bdrv_is_allocated_above(blk_bs(s->top), blk_bs(s->base), false,
+ ret = bdrv_is_allocated_above(blk_bs(s->top), s->base_overlay, true,
offset, COMMIT_BUFFER_SIZE, &n);
copy = (ret == 1);
trace_commit_one_iteration(s, offset, n, ret);
@@ -237,7 +238,6 @@ static void bdrv_commit_top_child_perm(BlockDriverState *bs, BdrvChild *c,
static BlockDriver bdrv_commit_top = {
.format_name = "commit_top",
.bdrv_co_preadv = bdrv_commit_top_preadv,
- .bdrv_co_block_status = bdrv_co_block_status_from_backing,
.bdrv_refresh_filename = bdrv_commit_top_refresh_filename,
.bdrv_child_perm = bdrv_commit_top_child_perm,
@@ -253,15 +253,35 @@ void commit_start(const char *job_id, BlockDriverState *bs,
CommitBlockJob *s;
BlockDriverState *iter;
BlockDriverState *commit_top_bs = NULL;
+ BlockDriverState *filtered_base;
Error *local_err = NULL;
+ int64_t base_size, top_size;
+ uint64_t base_perms, iter_shared_perms;
int ret;
assert(top != bs);
- if (top == base) {
+ if (bdrv_skip_filters(top) == bdrv_skip_filters(base)) {
error_setg(errp, "Invalid files for merge: top and base are the same");
return;
}
+ base_size = bdrv_getlength(base);
+ if (base_size < 0) {
+ error_setg_errno(errp, -base_size, "Could not inquire base image size");
+ return;
+ }
+
+ top_size = bdrv_getlength(top);
+ if (top_size < 0) {
+ error_setg_errno(errp, -top_size, "Could not inquire top image size");
+ return;
+ }
+
+ base_perms = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE;
+ if (base_size < top_size) {
+ base_perms |= BLK_PERM_RESIZE;
+ }
+
s = block_job_create(job_id, &commit_job_driver, NULL, bs, 0, BLK_PERM_ALL,
speed, creation_flags, NULL, NULL, errp);
if (!s) {
@@ -301,17 +321,43 @@ void commit_start(const char *job_id, BlockDriverState *bs,
s->commit_top_bs = commit_top_bs;
- /* Block all nodes between top and base, because they will
- * disappear from the chain after this operation. */
- assert(bdrv_chain_contains(top, base));
- for (iter = top; iter != base; iter = backing_bs(iter)) {
- /* XXX BLK_PERM_WRITE needs to be allowed so we don't block ourselves
- * at s->base (if writes are blocked for a node, they are also blocked
- * for its backing file). The other options would be a second filter
- * driver above s->base. */
+ /*
+ * Block all nodes between top and base, because they will
+ * disappear from the chain after this operation.
+ * Note that this assumes that the user is fine with removing all
+ * nodes (including R/W filters) between top and base. Assuring
+ * this is the responsibility of the interface (i.e. whoever calls
+ * commit_start()).
+ */
+ s->base_overlay = bdrv_find_overlay(top, base);
+ assert(s->base_overlay);
+
+ /*
+ * The topmost node with
+ * bdrv_skip_filters(filtered_base) == bdrv_skip_filters(base)
+ */
+ filtered_base = bdrv_cow_bs(s->base_overlay);
+ assert(bdrv_skip_filters(filtered_base) == bdrv_skip_filters(base));
+
+ /*
+ * XXX BLK_PERM_WRITE needs to be allowed so we don't block ourselves
+ * at s->base (if writes are blocked for a node, they are also blocked
+ * for its backing file). The other options would be a second filter
+ * driver above s->base.
+ */
+ iter_shared_perms = BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE;
+
+ for (iter = top; iter != base; iter = bdrv_filter_or_cow_bs(iter)) {
+ if (iter == filtered_base) {
+ /*
+ * From here on, all nodes are filters on the base. This
+ * allows us to share BLK_PERM_CONSISTENT_READ.
+ */
+ iter_shared_perms |= BLK_PERM_CONSISTENT_READ;
+ }
+
ret = block_job_add_bdrv(&s->common, "intermediate node", iter, 0,
- BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE,
- errp);
+ iter_shared_perms, errp);
if (ret < 0) {
goto fail;
}
@@ -328,9 +374,7 @@ void commit_start(const char *job_id, BlockDriverState *bs,
}
s->base = blk_new(s->common.job.aio_context,
- BLK_PERM_CONSISTENT_READ
- | BLK_PERM_WRITE
- | BLK_PERM_RESIZE,
+ base_perms,
BLK_PERM_CONSISTENT_READ
| BLK_PERM_GRAPH_MOD
| BLK_PERM_WRITE_UNCHANGED);
@@ -398,19 +442,22 @@ int bdrv_commit(BlockDriverState *bs)
if (!drv)
return -ENOMEDIUM;
- if (!bs->backing) {
+ backing_file_bs = bdrv_cow_bs(bs);
+
+ if (!backing_file_bs) {
return -ENOTSUP;
}
if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT_SOURCE, NULL) ||
- bdrv_op_is_blocked(bs->backing->bs, BLOCK_OP_TYPE_COMMIT_TARGET, NULL)) {
+ bdrv_op_is_blocked(backing_file_bs, BLOCK_OP_TYPE_COMMIT_TARGET, NULL))
+ {
return -EBUSY;
}
- ro = bs->backing->bs->read_only;
+ ro = backing_file_bs->read_only;
if (ro) {
- if (bdrv_reopen_set_read_only(bs->backing->bs, false, NULL)) {
+ if (bdrv_reopen_set_read_only(backing_file_bs, false, NULL)) {
return -EACCES;
}
}
@@ -428,8 +475,6 @@ int bdrv_commit(BlockDriverState *bs)
}
/* Insert commit_top block node above backing, so we can write to it */
- backing_file_bs = backing_bs(bs);
-
commit_top_bs = bdrv_new_open_driver(&bdrv_commit_top, NULL, BDRV_O_RDWR,
&local_err);
if (commit_top_bs == NULL) {
@@ -515,7 +560,7 @@ ro_cleanup:
qemu_vfree(buf);
blk_unref(backing);
- if (backing_file_bs) {
+ if (bdrv_cow_bs(bs) != backing_file_bs) {
bdrv_set_backing_hd(bs, backing_file_bs, &error_abort);
}
bdrv_unref(commit_top_bs);
@@ -523,7 +568,7 @@ ro_cleanup:
if (ro) {
/* ignoring error return here */
- bdrv_reopen_set_read_only(bs->backing->bs, true, NULL);
+ bdrv_reopen_set_read_only(backing_file_bs, true, NULL);
}
return ret;
diff --git a/block/copy-on-read.c b/block/copy-on-read.c
index a6e3c74a68..2816e61afe 100644
--- a/block/copy-on-read.c
+++ b/block/copy-on-read.c
@@ -107,6 +107,16 @@ static int coroutine_fn cor_co_pdiscard(BlockDriverState *bs,
}
+static int coroutine_fn cor_co_pwritev_compressed(BlockDriverState *bs,
+ uint64_t offset,
+ uint64_t bytes,
+ QEMUIOVector *qiov)
+{
+ return bdrv_co_pwritev(bs->file, offset, bytes, qiov,
+ BDRV_REQ_WRITE_COMPRESSED);
+}
+
+
static void cor_eject(BlockDriverState *bs, bool eject_flag)
{
bdrv_eject(bs->file->bs, eject_flag);
@@ -131,12 +141,11 @@ static BlockDriver bdrv_copy_on_read = {
.bdrv_co_pwritev = cor_co_pwritev,
.bdrv_co_pwrite_zeroes = cor_co_pwrite_zeroes,
.bdrv_co_pdiscard = cor_co_pdiscard,
+ .bdrv_co_pwritev_compressed = cor_co_pwritev_compressed,
.bdrv_eject = cor_eject,
.bdrv_lock_medium = cor_lock_medium,
- .bdrv_co_block_status = bdrv_co_block_status_from_file,
-
.has_variable_length = true,
.is_filter = true,
};
diff --git a/block/file-win32.c b/block/file-win32.c
index ab69bd811a..e2900c3a51 100644
--- a/block/file-win32.c
+++ b/block/file-win32.c
@@ -299,6 +299,11 @@ static QemuOptsList raw_runtime_opts = {
.type = QEMU_OPT_STRING,
.help = "host AIO implementation (threads, native)",
},
+ {
+ .name = "locking",
+ .type = QEMU_OPT_STRING,
+ .help = "file locking mode (on/off/auto, default: auto)",
+ },
{ /* end of list */ }
},
};
@@ -333,6 +338,7 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags,
Error *local_err = NULL;
const char *filename;
bool use_aio;
+ OnOffAuto locking;
int ret;
s->type = FTYPE_FILE;
@@ -343,10 +349,24 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags,
goto fail;
}
- if (qdict_get_try_bool(options, "locking", false)) {
+ locking = qapi_enum_parse(&OnOffAuto_lookup,
+ qemu_opt_get(opts, "locking"),
+ ON_OFF_AUTO_AUTO, &local_err);
+ if (local_err) {
+ error_propagate(errp, local_err);
+ ret = -EINVAL;
+ goto fail;
+ }
+ switch (locking) {
+ case ON_OFF_AUTO_ON:
error_setg(errp, "locking=on is not supported on Windows");
ret = -EINVAL;
goto fail;
+ case ON_OFF_AUTO_OFF:
+ case ON_OFF_AUTO_AUTO:
+ break;
+ default:
+ g_assert_not_reached();
}
filename = qemu_opt_get(opts, "filename");
diff --git a/block/filter-compress.c b/block/filter-compress.c
index 8ec1991c1f..5136371bf8 100644
--- a/block/filter-compress.c
+++ b/block/filter-compress.c
@@ -146,8 +146,6 @@ static BlockDriver bdrv_compress = {
.bdrv_eject = compress_eject,
.bdrv_lock_medium = compress_lock_medium,
- .bdrv_co_block_status = bdrv_co_block_status_from_file,
-
.has_variable_length = true,
.is_filter = true,
};
diff --git a/block/io.c b/block/io.c
index ad3a51ed53..a2389bb38c 100644
--- a/block/io.c
+++ b/block/io.c
@@ -135,6 +135,8 @@ static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src)
void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
BlockDriver *drv = bs->drv;
+ BdrvChild *c;
+ bool have_limits;
Error *local_err = NULL;
memset(&bs->bl, 0, sizeof(bs->bl));
@@ -149,14 +151,21 @@ void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
drv->bdrv_co_preadv_part) ? 1 : 512;
/* Take some limits from the children as a default */
- if (bs->file) {
- bdrv_refresh_limits(bs->file->bs, &local_err);
- if (local_err) {
- error_propagate(errp, local_err);
- return;
+ have_limits = false;
+ QLIST_FOREACH(c, &bs->children, next) {
+ if (c->role & (BDRV_CHILD_DATA | BDRV_CHILD_FILTERED | BDRV_CHILD_COW))
+ {
+ bdrv_refresh_limits(c->bs, &local_err);
+ if (local_err) {
+ error_propagate(errp, local_err);
+ return;
+ }
+ bdrv_merge_limits(&bs->bl, &c->bs->bl);
+ have_limits = true;
}
- bdrv_merge_limits(&bs->bl, &bs->file->bs->bl);
- } else {
+ }
+
+ if (!have_limits) {
bs->bl.min_mem_alignment = 512;
bs->bl.opt_mem_alignment = qemu_real_host_page_size;
@@ -164,15 +173,6 @@ void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
bs->bl.max_iov = IOV_MAX;
}
- if (bs->backing) {
- bdrv_refresh_limits(bs->backing->bs, &local_err);
- if (local_err) {
- error_propagate(errp, local_err);
- return;
- }
- bdrv_merge_limits(&bs->bl, &bs->backing->bs->bl);
- }
-
/* Then let the driver override it */
if (drv->bdrv_refresh_limits) {
drv->bdrv_refresh_limits(bs, errp);
@@ -2255,36 +2255,6 @@ typedef struct BdrvCoBlockStatusData {
BlockDriverState **file;
} BdrvCoBlockStatusData;
-int coroutine_fn bdrv_co_block_status_from_file(BlockDriverState *bs,
- bool want_zero,
- int64_t offset,
- int64_t bytes,
- int64_t *pnum,
- int64_t *map,
- BlockDriverState **file)
-{
- assert(bs->file && bs->file->bs);
- *pnum = bytes;
- *map = offset;
- *file = bs->file->bs;
- return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
-}
-
-int coroutine_fn bdrv_co_block_status_from_backing(BlockDriverState *bs,
- bool want_zero,
- int64_t offset,
- int64_t bytes,
- int64_t *pnum,
- int64_t *map,
- BlockDriverState **file)
-{
- assert(bs->backing && bs->backing->bs);
- *pnum = bytes;
- *map = offset;
- *file = bs->backing->bs;
- return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
-}
-
/*
* Returns the allocation status of the specified sectors.
* Drivers not implementing the functionality are assumed to not support
@@ -2325,6 +2295,7 @@ static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs,
BlockDriverState *local_file = NULL;
int64_t aligned_offset, aligned_bytes;
uint32_t align;
+ bool has_filtered_child;
assert(pnum);
*pnum = 0;
@@ -2350,7 +2321,8 @@ static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs,
/* Must be non-NULL or bdrv_getlength() would have failed */
assert(bs->drv);
- if (!bs->drv->bdrv_co_block_status) {
+ has_filtered_child = bdrv_filter_child(bs);
+ if (!bs->drv->bdrv_co_block_status && !has_filtered_child) {
*pnum = bytes;
ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
if (offset + bytes == total_size) {
@@ -2371,9 +2343,20 @@ static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs,
aligned_offset = QEMU_ALIGN_DOWN(offset, align);
aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset;
- ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset,
- aligned_bytes, pnum, &local_map,
- &local_file);
+ if (bs->drv->bdrv_co_block_status) {
+ ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset,
+ aligned_bytes, pnum, &local_map,
+ &local_file);
+ } else {
+ /* Default code for filters */
+
+ local_file = bdrv_filter_bs(bs);
+ assert(local_file);
+
+ *pnum = aligned_bytes;
+ local_map = aligned_offset;
+ ret = BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
+ }
if (ret < 0) {
*pnum = 0;
goto out;
@@ -2409,9 +2392,10 @@ static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs,
if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
ret |= BDRV_BLOCK_ALLOCATED;
} else if (want_zero && bs->drv->supports_backing) {
- if (bs->backing) {
- BlockDriverState *bs2 = bs->backing->bs;
- int64_t size2 = bdrv_getlength(bs2);
+ BlockDriverState *cow_bs = bdrv_cow_bs(bs);
+
+ if (cow_bs) {
+ int64_t size2 = bdrv_getlength(cow_bs);
if (size2 >= 0 && offset >= size2) {
ret |= BDRV_BLOCK_ZERO;
@@ -2479,7 +2463,7 @@ static int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs,
bool first = true;
assert(bs != base);
- for (p = bs; p != base; p = backing_bs(p)) {
+ for (p = bs; p != base; p = bdrv_filter_or_cow_bs(p)) {
ret = bdrv_co_block_status(p, want_zero, offset, bytes, pnum, map,
file);
if (ret < 0) {
@@ -2553,7 +2537,7 @@ int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base,
int bdrv_block_status(BlockDriverState *bs, int64_t offset, int64_t bytes,
int64_t *pnum, int64_t *map, BlockDriverState **file)
{
- return bdrv_block_status_above(bs, backing_bs(bs),
+ return bdrv_block_status_above(bs, bdrv_filter_or_cow_bs(bs),
offset, bytes, pnum, map, file);
}
@@ -2563,9 +2547,9 @@ int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset,
int ret;
int64_t dummy;
- ret = bdrv_common_block_status_above(bs, backing_bs(bs), false, offset,
- bytes, pnum ? pnum : &dummy, NULL,
- NULL);
+ ret = bdrv_common_block_status_above(bs, bdrv_filter_or_cow_bs(bs), false,
+ offset, bytes, pnum ? pnum : &dummy,
+ NULL, NULL);
if (ret < 0) {
return ret;
}
@@ -2628,7 +2612,7 @@ int bdrv_is_allocated_above(BlockDriverState *top,
break;
}
- intermediate = backing_bs(intermediate);
+ intermediate = bdrv_filter_or_cow_bs(intermediate);
}
*pnum = n;
@@ -2647,6 +2631,7 @@ bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
bool is_read)
{
BlockDriver *drv = bs->drv;
+ BlockDriverState *child_bs = bdrv_primary_bs(bs);
int ret = -ENOTSUP;
bdrv_inc_in_flight(bs);
@@ -2659,8 +2644,8 @@ bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
} else {
ret = drv->bdrv_save_vmstate(bs, qiov, pos);
}
- } else if (bs->file) {
- ret = bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read);
+ } else if (child_bs) {
+ ret = bdrv_co_rw_vmstate(child_bs, qiov, pos, is_read);
}
bdrv_dec_in_flight(bs);
@@ -2770,6 +2755,8 @@ static int coroutine_fn bdrv_flush_co_entry(void *opaque)
int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
+ BdrvChild *primary_child = bdrv_primary_child(bs);
+ BdrvChild *child;
int current_gen;
int ret = 0;
@@ -2799,7 +2786,7 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
}
/* Write back cached data to the OS even with cache=unsafe */
- BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
+ BLKDBG_EVENT(primary_child, BLKDBG_FLUSH_TO_OS);
if (bs->drv->bdrv_co_flush_to_os) {
ret = bs->drv->bdrv_co_flush_to_os(bs);
if (ret < 0) {
@@ -2809,15 +2796,15 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
/* But don't actually force it to the disk with cache=unsafe */
if (bs->open_flags & BDRV_O_NO_FLUSH) {
- goto flush_parent;
+ goto flush_children;
}
/* Check if we really need to flush anything */
if (bs->flushed_gen == current_gen) {
- goto flush_parent;
+ goto flush_children;
}
- BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
+ BLKDBG_EVENT(primary_child, BLKDBG_FLUSH_TO_DISK);
if (!bs->drv) {
/* bs->drv->bdrv_co_flush() might have ejected the BDS
* (even in case of apparent success) */
@@ -2861,8 +2848,17 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
/* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
* in the case of cache=unsafe, so there are no useless flushes.
*/
-flush_parent:
- ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
+flush_children:
+ ret = 0;
+ QLIST_FOREACH(child, &bs->children, next) {
+ if (child->perm & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED)) {
+ int this_child_ret = bdrv_co_flush(child->bs);
+ if (!ret) {
+ ret = this_child_ret;
+ }
+ }
+ }
+
out:
/* Notify any pending flushes that we have completed */
if (ret == 0) {
@@ -3309,6 +3305,7 @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact,
Error **errp)
{
BlockDriverState *bs = child->bs;
+ BdrvChild *filtered, *backing;
BlockDriver *drv = bs->drv;
BdrvTrackedRequest req;
int64_t old_size, new_bytes;
@@ -3360,6 +3357,9 @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact,
goto out;
}
+ filtered = bdrv_filter_child(bs);
+ backing = bdrv_cow_child(bs);
+
/*
* If the image has a backing file that is large enough that it would
* provide data for the new area, we cannot leave it unallocated because
@@ -3370,10 +3370,10 @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact,
* backing file, taking care of keeping things consistent with that backing
* file is the user's responsibility.
*/
- if (new_bytes && bs->backing) {
+ if (new_bytes && backing) {
int64_t backing_len;
- backing_len = bdrv_getlength(backing_bs(bs));
+ backing_len = bdrv_getlength(backing->bs);
if (backing_len < 0) {
ret = backing_len;
error_setg_errno(errp, -ret, "Could not get backing file size");
@@ -3392,8 +3392,8 @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact,
goto out;
}
ret = drv->bdrv_co_truncate(bs, offset, exact, prealloc, flags, errp);
- } else if (bs->file && drv->is_filter) {
- ret = bdrv_co_truncate(bs->file, offset, exact, prealloc, flags, errp);
+ } else if (filtered) {
+ ret = bdrv_co_truncate(filtered, offset, exact, prealloc, flags, errp);
} else {
error_setg(errp, "Image format driver does not support resize");
ret = -ENOTSUP;
diff --git a/block/mirror.c b/block/mirror.c
index e8e8844afc..26acf4af6f 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -42,6 +42,7 @@ typedef struct MirrorBlockJob {
BlockBackend *target;
BlockDriverState *mirror_top_bs;
BlockDriverState *base;
+ BlockDriverState *base_overlay;
/* The name of the graph node to replace */
char *replaces;
@@ -677,8 +678,10 @@ static int mirror_exit_common(Job *job)
&error_abort);
if (!abort && s->backing_mode == MIRROR_SOURCE_BACKING_CHAIN) {
BlockDriverState *backing = s->is_none_mode ? src : s->base;
- if (backing_bs(target_bs) != backing) {
- bdrv_set_backing_hd(target_bs, backing, &local_err);
+ BlockDriverState *unfiltered_target = bdrv_skip_filters(target_bs);
+
+ if (bdrv_cow_bs(unfiltered_target) != backing) {
+ bdrv_set_backing_hd(unfiltered_target, backing, &local_err);
if (local_err) {
error_report_err(local_err);
local_err = NULL;
@@ -740,7 +743,7 @@ static int mirror_exit_common(Job *job)
* valid.
*/
block_job_remove_all_bdrv(bjob);
- bdrv_replace_node(mirror_top_bs, backing_bs(mirror_top_bs), &error_abort);
+ bdrv_replace_node(mirror_top_bs, mirror_top_bs->backing->bs, &error_abort);
/* We just changed the BDS the job BB refers to (with either or both of the
* bdrv_replace_node() calls), so switch the BB back so the cleanup does
@@ -786,7 +789,6 @@ static void coroutine_fn mirror_throttle(MirrorBlockJob *s)
static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s)
{
int64_t offset;
- BlockDriverState *base = s->base;
BlockDriverState *bs = s->mirror_top_bs->backing->bs;
BlockDriverState *target_bs = blk_bs(s->target);
int ret;
@@ -837,7 +839,8 @@ static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s)
return 0;
}
- ret = bdrv_is_allocated_above(bs, base, false, offset, bytes, &count);
+ ret = bdrv_is_allocated_above(bs, s->base_overlay, true, offset, bytes,
+ &count);
if (ret < 0) {
return ret;
}
@@ -936,7 +939,7 @@ static int coroutine_fn mirror_run(Job *job, Error **errp)
} else {
s->target_cluster_size = BDRV_SECTOR_SIZE;
}
- if (backing_filename[0] && !target_bs->backing &&
+ if (backing_filename[0] && !bdrv_backing_chain_next(target_bs) &&
s->granularity < s->target_cluster_size) {
s->buf_size = MAX(s->buf_size, s->target_cluster_size);
s->cow_bitmap = bitmap_new(length);
@@ -1116,8 +1119,9 @@ static void mirror_complete(Job *job, Error **errp)
if (s->backing_mode == MIRROR_OPEN_BACKING_CHAIN) {
int ret;
- assert(!target->backing);
- ret = bdrv_open_backing_file(target, NULL, "backing", errp);
+ assert(!bdrv_backing_chain_next(target));
+ ret = bdrv_open_backing_file(bdrv_skip_filters(target), NULL,
+ "backing", errp);
if (ret < 0) {
return;
}
@@ -1527,7 +1531,6 @@ static BlockDriver bdrv_mirror_top = {
.bdrv_co_pwrite_zeroes = bdrv_mirror_top_pwrite_zeroes,
.bdrv_co_pdiscard = bdrv_mirror_top_pdiscard,
.bdrv_co_flush = bdrv_mirror_top_flush,
- .bdrv_co_block_status = bdrv_co_block_status_from_backing,
.bdrv_refresh_filename = bdrv_mirror_top_refresh_filename,
.bdrv_child_perm = bdrv_mirror_top_child_perm,
@@ -1555,8 +1558,8 @@ static BlockJob *mirror_start_job(
MirrorBlockJob *s;
MirrorBDSOpaque *bs_opaque;
BlockDriverState *mirror_top_bs;
- bool target_graph_mod;
bool target_is_backing;
+ uint64_t target_perms, target_shared_perms;
Error *local_err = NULL;
int ret;
@@ -1575,7 +1578,7 @@ static BlockJob *mirror_start_job(
buf_size = DEFAULT_MIRROR_BUF_SIZE;
}
- if (bs == target) {
+ if (bdrv_skip_filters(bs) == bdrv_skip_filters(target)) {
error_setg(errp, "Can't mirror node into itself");
return NULL;
}
@@ -1639,15 +1642,50 @@ static BlockJob *mirror_start_job(
* In the case of active commit, things look a bit different, though,
* because the target is an already populated backing file in active use.
* We can allow anything except resize there.*/
+
+ target_perms = BLK_PERM_WRITE;
+ target_shared_perms = BLK_PERM_WRITE_UNCHANGED;
+
target_is_backing = bdrv_chain_contains(bs, target);
- target_graph_mod = (backing_mode != MIRROR_LEAVE_BACKING_CHAIN);
+ if (target_is_backing) {
+ int64_t bs_size, target_size;
+ bs_size = bdrv_getlength(bs);
+ if (bs_size < 0) {
+ error_setg_errno(errp, -bs_size,
+ "Could not inquire top image size");
+ goto fail;
+ }
+
+ target_size = bdrv_getlength(target);
+ if (target_size < 0) {
+ error_setg_errno(errp, -target_size,
+ "Could not inquire base image size");
+ goto fail;
+ }
+
+ if (target_size < bs_size) {
+ target_perms |= BLK_PERM_RESIZE;
+ }
+
+ target_shared_perms |= BLK_PERM_CONSISTENT_READ
+ | BLK_PERM_WRITE
+ | BLK_PERM_GRAPH_MOD;
+ } else if (bdrv_chain_contains(bs, bdrv_skip_filters(target))) {
+ /*
+ * We may want to allow this in the future, but it would
+ * require taking some extra care.
+ */
+ error_setg(errp, "Cannot mirror to a filter on top of a node in the "
+ "source's backing chain");
+ goto fail;
+ }
+
+ if (backing_mode != MIRROR_LEAVE_BACKING_CHAIN) {
+ target_perms |= BLK_PERM_GRAPH_MOD;
+ }
+
s->target = blk_new(s->common.job.aio_context,
- BLK_PERM_WRITE | BLK_PERM_RESIZE |
- (target_graph_mod ? BLK_PERM_GRAPH_MOD : 0),
- BLK_PERM_WRITE_UNCHANGED |
- (target_is_backing ? BLK_PERM_CONSISTENT_READ |
- BLK_PERM_WRITE |
- BLK_PERM_GRAPH_MOD : 0));
+ target_perms, target_shared_perms);
ret = blk_insert_bs(s->target, target, errp);
if (ret < 0) {
goto fail;
@@ -1672,6 +1710,7 @@ static BlockJob *mirror_start_job(
s->zero_target = zero_target;
s->copy_mode = copy_mode;
s->base = base;
+ s->base_overlay = bdrv_find_overlay(bs, base);
s->granularity = granularity;
s->buf_size = ROUND_UP(buf_size, granularity);
s->unmap = unmap;
@@ -1702,15 +1741,39 @@ static BlockJob *mirror_start_job(
/* In commit_active_start() all intermediate nodes disappear, so
* any jobs in them must be blocked */
if (target_is_backing) {
- BlockDriverState *iter;
- for (iter = backing_bs(bs); iter != target; iter = backing_bs(iter)) {
- /* XXX BLK_PERM_WRITE needs to be allowed so we don't block
- * ourselves at s->base (if writes are blocked for a node, they are
- * also blocked for its backing file). The other options would be a
- * second filter driver above s->base (== target). */
+ BlockDriverState *iter, *filtered_target;
+ uint64_t iter_shared_perms;
+
+ /*
+ * The topmost node with
+ * bdrv_skip_filters(filtered_target) == bdrv_skip_filters(target)
+ */
+ filtered_target = bdrv_cow_bs(bdrv_find_overlay(bs, target));
+
+ assert(bdrv_skip_filters(filtered_target) ==
+ bdrv_skip_filters(target));
+
+ /*
+ * XXX BLK_PERM_WRITE needs to be allowed so we don't block
+ * ourselves at s->base (if writes are blocked for a node, they are
+ * also blocked for its backing file). The other options would be a
+ * second filter driver above s->base (== target).
+ */
+ iter_shared_perms = BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE;
+
+ for (iter = bdrv_filter_or_cow_bs(bs); iter != target;
+ iter = bdrv_filter_or_cow_bs(iter))
+ {
+ if (iter == filtered_target) {
+ /*
+ * From here on, all nodes are filters on the base.
+ * This allows us to share BLK_PERM_CONSISTENT_READ.
+ */
+ iter_shared_perms |= BLK_PERM_CONSISTENT_READ;
+ }
+
ret = block_job_add_bdrv(&s->common, "intermediate node", iter, 0,
- BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE,
- errp);
+ iter_shared_perms, errp);
if (ret < 0) {
goto fail;
}
@@ -1746,7 +1809,7 @@ fail:
bs_opaque->stop = true;
bdrv_child_refresh_perms(mirror_top_bs, mirror_top_bs->backing,
&error_abort);
- bdrv_replace_node(mirror_top_bs, backing_bs(mirror_top_bs), &error_abort);
+ bdrv_replace_node(mirror_top_bs, mirror_top_bs->backing->bs, &error_abort);
bdrv_unref(mirror_top_bs);
@@ -1774,7 +1837,7 @@ void mirror_start(const char *job_id, BlockDriverState *bs,
return;
}
is_none_mode = mode == MIRROR_SYNC_MODE_NONE;
- base = mode == MIRROR_SYNC_MODE_TOP ? backing_bs(bs) : NULL;
+ base = mode == MIRROR_SYNC_MODE_TOP ? bdrv_backing_chain_next(bs) : NULL;
mirror_start_job(job_id, bs, creation_flags, target, replaces,
speed, granularity, buf_size, backing_mode, zero_target,
on_source_error, on_target_error, unmap, NULL, NULL,
diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c
index 4c8c375172..4d3db5ed3c 100644
--- a/block/monitor/block-hmp-cmds.c
+++ b/block/monitor/block-hmp-cmds.c
@@ -217,7 +217,7 @@ void hmp_commit(Monitor *mon, const QDict *qdict)
return;
}
- bs = blk_bs(blk);
+ bs = bdrv_skip_implicit_filters(blk_bs(blk));
aio_context = bdrv_get_aio_context(bs);
aio_context_acquire(aio_context);
diff --git a/block/null.c b/block/null.c
index 15e1d56746..cc9b1d4ea7 100644
--- a/block/null.c
+++ b/block/null.c
@@ -262,6 +262,11 @@ static void null_refresh_filename(BlockDriverState *bs)
bs->drv->format_name);
}
+static int64_t null_allocated_file_size(BlockDriverState *bs)
+{
+ return 0;
+}
+
static const char *const null_strong_runtime_opts[] = {
BLOCK_OPT_SIZE,
NULL_OPT_ZEROES,
@@ -277,6 +282,7 @@ static BlockDriver bdrv_null_co = {
.bdrv_file_open = null_file_open,
.bdrv_parse_filename = null_co_parse_filename,
.bdrv_getlength = null_getlength,
+ .bdrv_get_allocated_file_size = null_allocated_file_size,
.bdrv_co_preadv = null_co_preadv,
.bdrv_co_pwritev = null_co_pwritev,
@@ -297,6 +303,7 @@ static BlockDriver bdrv_null_aio = {
.bdrv_file_open = null_file_open,
.bdrv_parse_filename = null_aio_parse_filename,
.bdrv_getlength = null_getlength,
+ .bdrv_get_allocated_file_size = null_allocated_file_size,
.bdrv_aio_preadv = null_aio_preadv,
.bdrv_aio_pwritev = null_aio_pwritev,
diff --git a/block/nvme.c b/block/nvme.c
index 05485fdd11..f4f27b6da7 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -83,25 +83,21 @@ typedef struct {
/* Memory mapped registers */
typedef volatile struct {
- uint64_t cap;
- uint32_t vs;
- uint32_t intms;
- uint32_t intmc;
- uint32_t cc;
- uint32_t reserved0;
- uint32_t csts;
- uint32_t nssr;
- uint32_t aqa;
- uint64_t asq;
- uint64_t acq;
- uint32_t cmbloc;
- uint32_t cmbsz;
- uint8_t reserved1[0xec0];
- uint8_t cmd_set_specfic[0x100];
- uint32_t doorbells[];
+ NvmeBar ctrl;
+ struct {
+ uint32_t sq_tail;
+ uint32_t cq_head;
+ } doorbells[];
} NVMeRegs;
-QEMU_BUILD_BUG_ON(offsetof(NVMeRegs, doorbells) != 0x1000);
+#define INDEX_ADMIN 0
+#define INDEX_IO(n) (1 + n)
+
+/* This driver shares a single MSIX IRQ for the admin and I/O queues */
+enum {
+ MSIX_SHARED_IRQ_IDX = 0,
+ MSIX_IRQ_COUNT = 1
+};
struct BDRVNVMeState {
AioContext *aio_context;
@@ -117,7 +113,7 @@ struct BDRVNVMeState {
/* How many uint32_t elements does each doorbell entry take. */
size_t doorbell_scale;
bool write_cache_supported;
- EventNotifier irq_notifier;
+ EventNotifier irq_notifier[MSIX_IRQ_COUNT];
uint64_t nsze; /* Namespace size reported by identify command */
int nsid; /* The namespace id to read/write data. */
@@ -162,21 +158,20 @@ static QemuOptsList runtime_opts = {
},
};
-static void nvme_init_queue(BlockDriverState *bs, NVMeQueue *q,
+static void nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q,
int nentries, int entry_bytes, Error **errp)
{
- BDRVNVMeState *s = bs->opaque;
size_t bytes;
int r;
bytes = ROUND_UP(nentries * entry_bytes, s->page_size);
q->head = q->tail = 0;
- q->queue = qemu_try_blockalign0(bs, bytes);
-
+ q->queue = qemu_try_memalign(s->page_size, bytes);
if (!q->queue) {
error_setg(errp, "Cannot allocate queue");
return;
}
+ memset(q->queue, 0, bytes);
r = qemu_vfio_dma_map(s->vfio, q->queue, bytes, false, &q->iova);
if (r) {
error_setg(errp, "Cannot map queue");
@@ -206,23 +201,31 @@ static void nvme_free_req_queue_cb(void *opaque)
qemu_mutex_unlock(&q->lock);
}
-static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs,
+static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s,
+ AioContext *aio_context,
int idx, int size,
Error **errp)
{
int i, r;
- BDRVNVMeState *s = bs->opaque;
Error *local_err = NULL;
- NVMeQueuePair *q = g_new0(NVMeQueuePair, 1);
+ NVMeQueuePair *q;
uint64_t prp_list_iova;
+ q = g_try_new0(NVMeQueuePair, 1);
+ if (!q) {
+ return NULL;
+ }
+ q->prp_list_pages = qemu_try_memalign(s->page_size,
+ s->page_size * NVME_NUM_REQS);
+ if (!q->prp_list_pages) {
+ goto fail;
+ }
+ memset(q->prp_list_pages, 0, s->page_size * NVME_NUM_REQS);
qemu_mutex_init(&q->lock);
q->s = s;
q->index = idx;
qemu_co_queue_init(&q->free_req_queue);
- q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_NUM_REQS);
- q->completion_bh = aio_bh_new(bdrv_get_aio_context(bs),
- nvme_process_completion_bh, q);
+ q->completion_bh = aio_bh_new(aio_context, nvme_process_completion_bh, q);
r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages,
s->page_size * NVME_NUM_REQS,
false, &prp_list_iova);
@@ -239,19 +242,19 @@ static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs,
req->prp_list_iova = prp_list_iova + i * s->page_size;
}
- nvme_init_queue(bs, &q->sq, size, NVME_SQ_ENTRY_BYTES, &local_err);
+ nvme_init_queue(s, &q->sq, size, NVME_SQ_ENTRY_BYTES, &local_err);
if (local_err) {
error_propagate(errp, local_err);
goto fail;
}
- q->sq.doorbell = &s->regs->doorbells[idx * 2 * s->doorbell_scale];
+ q->sq.doorbell = &s->regs->doorbells[idx * s->doorbell_scale].sq_tail;
- nvme_init_queue(bs, &q->cq, size, NVME_CQ_ENTRY_BYTES, &local_err);
+ nvme_init_queue(s, &q->cq, size, NVME_CQ_ENTRY_BYTES, &local_err);
if (local_err) {
error_propagate(errp, local_err);
goto fail;
}
- q->cq.doorbell = &s->regs->doorbells[(idx * 2 + 1) * s->doorbell_scale];
+ q->cq.doorbell = &s->regs->doorbells[idx * s->doorbell_scale].cq_head;
return q;
fail:
@@ -441,6 +444,9 @@ static void nvme_trace_command(const NvmeCmd *cmd)
{
int i;
+ if (!trace_event_get_state_backends(TRACE_NVME_SUBMIT_COMMAND_RAW)) {
+ return;
+ }
for (i = 0; i < 8; ++i) {
uint8_t *cmdp = (uint8_t *)cmd + i * 8;
trace_nvme_submit_command_raw(cmdp[0], cmdp[1], cmdp[2], cmdp[3],
@@ -479,6 +485,7 @@ static void nvme_cmd_sync_cb(void *opaque, int ret)
static int nvme_cmd_sync(BlockDriverState *bs, NVMeQueuePair *q,
NvmeCmd *cmd)
{
+ AioContext *aio_context = bdrv_get_aio_context(bs);
NVMeRequest *req;
int ret = -EINPROGRESS;
req = nvme_get_free_req(q);
@@ -487,17 +494,18 @@ static int nvme_cmd_sync(BlockDriverState *bs, NVMeQueuePair *q,
}
nvme_submit_command(q, req, cmd, nvme_cmd_sync_cb, &ret);
- BDRV_POLL_WHILE(bs, ret == -EINPROGRESS);
+ AIO_WAIT_WHILE(aio_context, ret == -EINPROGRESS);
return ret;
}
static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
{
BDRVNVMeState *s = bs->opaque;
- NvmeIdCtrl *idctrl;
- NvmeIdNs *idns;
+ union {
+ NvmeIdCtrl ctrl;
+ NvmeIdNs ns;
+ } *id;
NvmeLBAF *lbaf;
- uint8_t *resp;
uint16_t oncs;
int r;
uint64_t iova;
@@ -506,54 +514,52 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
.cdw10 = cpu_to_le32(0x1),
};
- resp = qemu_try_blockalign0(bs, sizeof(NvmeIdCtrl));
- if (!resp) {
+ id = qemu_try_memalign(s->page_size, sizeof(*id));
+ if (!id) {
error_setg(errp, "Cannot allocate buffer for identify response");
goto out;
}
- idctrl = (NvmeIdCtrl *)resp;
- idns = (NvmeIdNs *)resp;
- r = qemu_vfio_dma_map(s->vfio, resp, sizeof(NvmeIdCtrl), true, &iova);
+ r = qemu_vfio_dma_map(s->vfio, id, sizeof(*id), true, &iova);
if (r) {
error_setg(errp, "Cannot map buffer for DMA");
goto out;
}
- cmd.dptr.prp1 = cpu_to_le64(iova);
- if (nvme_cmd_sync(bs, s->queues[0], &cmd)) {
+ memset(id, 0, sizeof(*id));
+ cmd.dptr.prp1 = cpu_to_le64(iova);
+ if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
error_setg(errp, "Failed to identify controller");
goto out;
}
- if (le32_to_cpu(idctrl->nn) < namespace) {
+ if (le32_to_cpu(id->ctrl.nn) < namespace) {
error_setg(errp, "Invalid namespace");
goto out;
}
- s->write_cache_supported = le32_to_cpu(idctrl->vwc) & 0x1;
- s->max_transfer = (idctrl->mdts ? 1 << idctrl->mdts : 0) * s->page_size;
+ s->write_cache_supported = le32_to_cpu(id->ctrl.vwc) & 0x1;
+ s->max_transfer = (id->ctrl.mdts ? 1 << id->ctrl.mdts : 0) * s->page_size;
/* For now the page list buffer per command is one page, to hold at most
* s->page_size / sizeof(uint64_t) entries. */
s->max_transfer = MIN_NON_ZERO(s->max_transfer,
s->page_size / sizeof(uint64_t) * s->page_size);
- oncs = le16_to_cpu(idctrl->oncs);
+ oncs = le16_to_cpu(id->ctrl.oncs);
s->supports_write_zeroes = !!(oncs & NVME_ONCS_WRITE_ZEROES);
s->supports_discard = !!(oncs & NVME_ONCS_DSM);
- memset(resp, 0, 4096);
-
+ memset(id, 0, sizeof(*id));
cmd.cdw10 = 0;
cmd.nsid = cpu_to_le32(namespace);
- if (nvme_cmd_sync(bs, s->queues[0], &cmd)) {
+ if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
error_setg(errp, "Failed to identify namespace");
goto out;
}
- s->nsze = le64_to_cpu(idns->nsze);
- lbaf = &idns->lbaf[NVME_ID_NS_FLBAS_INDEX(idns->flbas)];
+ s->nsze = le64_to_cpu(id->ns.nsze);
+ lbaf = &id->ns.lbaf[NVME_ID_NS_FLBAS_INDEX(id->ns.flbas)];
- if (NVME_ID_NS_DLFEAT_WRITE_ZEROES(idns->dlfeat) &&
- NVME_ID_NS_DLFEAT_READ_BEHAVIOR(idns->dlfeat) ==
+ if (NVME_ID_NS_DLFEAT_WRITE_ZEROES(id->ns.dlfeat) &&
+ NVME_ID_NS_DLFEAT_READ_BEHAVIOR(id->ns.dlfeat) ==
NVME_ID_NS_DLFEAT_READ_BEHAVIOR_ZEROES) {
bs->supported_write_flags |= BDRV_REQ_MAY_UNMAP;
}
@@ -573,8 +579,34 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
s->blkshift = lbaf->ds;
out:
- qemu_vfio_dma_unmap(s->vfio, resp);
- qemu_vfree(resp);
+ qemu_vfio_dma_unmap(s->vfio, id);
+ qemu_vfree(id);
+}
+
+static bool nvme_poll_queue(NVMeQueuePair *q)
+{
+ bool progress = false;
+
+ const size_t cqe_offset = q->cq.head * NVME_CQ_ENTRY_BYTES;
+ NvmeCqe *cqe = (NvmeCqe *)&q->cq.queue[cqe_offset];
+
+ /*
+ * Do an early check for completions. q->lock isn't needed because
+ * nvme_process_completion() only runs in the event loop thread and
+ * cannot race with itself.
+ */
+ if ((le16_to_cpu(cqe->status) & 0x1) == q->cq_phase) {
+ return false;
+ }
+
+ qemu_mutex_lock(&q->lock);
+ while (nvme_process_completion(q)) {
+ /* Keep polling */
+ progress = true;
+ }
+ qemu_mutex_unlock(&q->lock);
+
+ return progress;
}
static bool nvme_poll_queues(BDRVNVMeState *s)
@@ -583,32 +615,17 @@ static bool nvme_poll_queues(BDRVNVMeState *s)
int i;
for (i = 0; i < s->nr_queues; i++) {
- NVMeQueuePair *q = s->queues[i];
- const size_t cqe_offset = q->cq.head * NVME_CQ_ENTRY_BYTES;
- NvmeCqe *cqe = (NvmeCqe *)&q->cq.queue[cqe_offset];
-
- /*
- * Do an early check for completions. q->lock isn't needed because
- * nvme_process_completion() only runs in the event loop thread and
- * cannot race with itself.
- */
- if ((le16_to_cpu(cqe->status) & 0x1) == q->cq_phase) {
- continue;
- }
-
- qemu_mutex_lock(&q->lock);
- while (nvme_process_completion(q)) {
- /* Keep polling */
+ if (nvme_poll_queue(s->queues[i])) {
progress = true;
}
- qemu_mutex_unlock(&q->lock);
}
return progress;
}
static void nvme_handle_event(EventNotifier *n)
{
- BDRVNVMeState *s = container_of(n, BDRVNVMeState, irq_notifier);
+ BDRVNVMeState *s = container_of(n, BDRVNVMeState,
+ irq_notifier[MSIX_SHARED_IRQ_IDX]);
trace_nvme_handle_event(s);
event_notifier_test_and_clear(n);
@@ -623,7 +640,8 @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
NvmeCmd cmd;
int queue_size = NVME_QUEUE_SIZE;
- q = nvme_create_queue_pair(bs, n, queue_size, errp);
+ q = nvme_create_queue_pair(s, bdrv_get_aio_context(bs),
+ n, queue_size, errp);
if (!q) {
return false;
}
@@ -633,10 +651,9 @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
.cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0xFFFF)),
.cdw11 = cpu_to_le32(0x3),
};
- if (nvme_cmd_sync(bs, s->queues[0], &cmd)) {
- error_setg(errp, "Failed to create io queue [%d]", n);
- nvme_free_queue_pair(q);
- return false;
+ if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
+ error_setg(errp, "Failed to create CQ io queue [%d]", n);
+ goto out_error;
}
cmd = (NvmeCmd) {
.opcode = NVME_ADM_CMD_CREATE_SQ,
@@ -644,21 +661,24 @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
.cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0xFFFF)),
.cdw11 = cpu_to_le32(0x1 | (n << 16)),
};
- if (nvme_cmd_sync(bs, s->queues[0], &cmd)) {
- error_setg(errp, "Failed to create io queue [%d]", n);
- nvme_free_queue_pair(q);
- return false;
+ if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
+ error_setg(errp, "Failed to create SQ io queue [%d]", n);
+ goto out_error;
}
s->queues = g_renew(NVMeQueuePair *, s->queues, n + 1);
s->queues[n] = q;
s->nr_queues++;
return true;
+out_error:
+ nvme_free_queue_pair(q);
+ return false;
}
static bool nvme_poll_cb(void *opaque)
{
EventNotifier *e = opaque;
- BDRVNVMeState *s = container_of(e, BDRVNVMeState, irq_notifier);
+ BDRVNVMeState *s = container_of(e, BDRVNVMeState,
+ irq_notifier[MSIX_SHARED_IRQ_IDX]);
trace_nvme_poll_cb(s);
return nvme_poll_queues(s);
@@ -668,6 +688,7 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
Error **errp)
{
BDRVNVMeState *s = bs->opaque;
+ AioContext *aio_context = bdrv_get_aio_context(bs);
int ret;
uint64_t cap;
uint64_t timeout_ms;
@@ -679,7 +700,7 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
s->device = g_strdup(device);
s->nsid = namespace;
s->aio_context = bdrv_get_aio_context(bs);
- ret = event_notifier_init(&s->irq_notifier, 0);
+ ret = event_notifier_init(&s->irq_notifier[MSIX_SHARED_IRQ_IDX], 0);
if (ret) {
error_setg(errp, "Failed to init event notifier");
return ret;
@@ -700,7 +721,7 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
/* Perform initialize sequence as described in NVMe spec "7.6.1
* Initialization". */
- cap = le64_to_cpu(s->regs->cap);
+ cap = le64_to_cpu(s->regs->ctrl.cap);
if (!(cap & (1ULL << 37))) {
error_setg(errp, "Device doesn't support NVMe command set");
ret = -EINVAL;
@@ -713,10 +734,10 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
timeout_ms = MIN(500 * ((cap >> 24) & 0xFF), 30000);
/* Reset device to get a clean state. */
- s->regs->cc = cpu_to_le32(le32_to_cpu(s->regs->cc) & 0xFE);
+ s->regs->ctrl.cc = cpu_to_le32(le32_to_cpu(s->regs->ctrl.cc) & 0xFE);
/* Wait for CSTS.RDY = 0. */
- deadline = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + timeout_ms * 1000000ULL;
- while (le32_to_cpu(s->regs->csts) & 0x1) {
+ deadline = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + timeout_ms * SCALE_MS;
+ while (le32_to_cpu(s->regs->ctrl.csts) & 0x1) {
if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) > deadline) {
error_setg(errp, "Timeout while waiting for device to reset (%"
PRId64 " ms)",
@@ -728,25 +749,27 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
/* Set up admin queue. */
s->queues = g_new(NVMeQueuePair *, 1);
- s->queues[0] = nvme_create_queue_pair(bs, 0, NVME_QUEUE_SIZE, errp);
- if (!s->queues[0]) {
+ s->queues[INDEX_ADMIN] = nvme_create_queue_pair(s, aio_context, 0,
+ NVME_QUEUE_SIZE,
+ errp);
+ if (!s->queues[INDEX_ADMIN]) {
ret = -EINVAL;
goto out;
}
s->nr_queues = 1;
QEMU_BUILD_BUG_ON(NVME_QUEUE_SIZE & 0xF000);
- s->regs->aqa = cpu_to_le32((NVME_QUEUE_SIZE << 16) | NVME_QUEUE_SIZE);
- s->regs->asq = cpu_to_le64(s->queues[0]->sq.iova);
- s->regs->acq = cpu_to_le64(s->queues[0]->cq.iova);
+ s->regs->ctrl.aqa = cpu_to_le32((NVME_QUEUE_SIZE << 16) | NVME_QUEUE_SIZE);
+ s->regs->ctrl.asq = cpu_to_le64(s->queues[INDEX_ADMIN]->sq.iova);
+ s->regs->ctrl.acq = cpu_to_le64(s->queues[INDEX_ADMIN]->cq.iova);
/* After setting up all control registers we can enable device now. */
- s->regs->cc = cpu_to_le32((ctz32(NVME_CQ_ENTRY_BYTES) << 20) |
+ s->regs->ctrl.cc = cpu_to_le32((ctz32(NVME_CQ_ENTRY_BYTES) << 20) |
(ctz32(NVME_SQ_ENTRY_BYTES) << 16) |
0x1);
/* Wait for CSTS.RDY = 1. */
now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
deadline = now + timeout_ms * 1000000;
- while (!(le32_to_cpu(s->regs->csts) & 0x1)) {
+ while (!(le32_to_cpu(s->regs->ctrl.csts) & 0x1)) {
if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) > deadline) {
error_setg(errp, "Timeout while waiting for device to start (%"
PRId64 " ms)",
@@ -756,12 +779,13 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
}
}
- ret = qemu_vfio_pci_init_irq(s->vfio, &s->irq_notifier,
+ ret = qemu_vfio_pci_init_irq(s->vfio, s->irq_notifier,
VFIO_PCI_MSIX_IRQ_INDEX, errp);
if (ret) {
goto out;
}
- aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier,
+ aio_set_event_notifier(bdrv_get_aio_context(bs),
+ &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
false, nvme_handle_event, nvme_poll_cb);
nvme_identify(bs, namespace, &local_err);
@@ -828,7 +852,7 @@ static int nvme_enable_disable_write_cache(BlockDriverState *bs, bool enable,
.cdw11 = cpu_to_le32(enable ? 0x01 : 0x00),
};
- ret = nvme_cmd_sync(bs, s->queues[0], &cmd);
+ ret = nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd);
if (ret) {
error_setg(errp, "Failed to configure NVMe write cache");
}
@@ -844,9 +868,10 @@ static void nvme_close(BlockDriverState *bs)
nvme_free_queue_pair(s->queues[i]);
}
g_free(s->queues);
- aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier,
+ aio_set_event_notifier(bdrv_get_aio_context(bs),
+ &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
false, NULL, NULL);
- event_notifier_cleanup(&s->irq_notifier);
+ event_notifier_cleanup(&s->irq_notifier[MSIX_SHARED_IRQ_IDX]);
qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)s->regs, 0, NVME_BAR_SIZE);
qemu_vfio_close(s->vfio);
@@ -1045,7 +1070,7 @@ static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs,
{
int r;
BDRVNVMeState *s = bs->opaque;
- NVMeQueuePair *ioq = s->queues[1];
+ NVMeQueuePair *ioq = s->queues[INDEX_IO(0)];
NVMeRequest *req;
uint32_t cdw12 = (((bytes >> s->blkshift) - 1) & 0xFFFF) |
@@ -1124,7 +1149,7 @@ static int nvme_co_prw(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
return nvme_co_prw_aligned(bs, offset, bytes, qiov, is_write, flags);
}
trace_nvme_prw_buffered(s, offset, bytes, qiov->niov, is_write);
- buf = qemu_try_blockalign(bs, bytes);
+ buf = qemu_try_memalign(s->page_size, bytes);
if (!buf) {
return -ENOMEM;
@@ -1160,7 +1185,7 @@ static coroutine_fn int nvme_co_pwritev(BlockDriverState *bs,
static coroutine_fn int nvme_co_flush(BlockDriverState *bs)
{
BDRVNVMeState *s = bs->opaque;
- NVMeQueuePair *ioq = s->queues[1];
+ NVMeQueuePair *ioq = s->queues[INDEX_IO(0)];
NVMeRequest *req;
NvmeCmd cmd = {
.opcode = NVME_CMD_FLUSH,
@@ -1191,7 +1216,7 @@ static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs,
BdrvRequestFlags flags)
{
BDRVNVMeState *s = bs->opaque;
- NVMeQueuePair *ioq = s->queues[1];
+ NVMeQueuePair *ioq = s->queues[INDEX_IO(0)];
NVMeRequest *req;
uint32_t cdw12 = ((bytes >> s->blkshift) - 1) & 0xFFFF;
@@ -1244,7 +1269,7 @@ static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
int bytes)
{
BDRVNVMeState *s = bs->opaque;
- NVMeQueuePair *ioq = s->queues[1];
+ NVMeQueuePair *ioq = s->queues[INDEX_IO(0)];
NVMeRequest *req;
NvmeDsmRange *buf;
QEMUIOVector local_qiov;
@@ -1268,11 +1293,11 @@ static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
assert(s->nr_queues > 1);
- buf = qemu_try_blockalign0(bs, s->page_size);
+ buf = qemu_try_memalign(s->page_size, s->page_size);
if (!buf) {
return -ENOMEM;
}
-
+ memset(buf, 0, s->page_size);
buf->nlb = cpu_to_le32(bytes >> s->blkshift);
buf->slba = cpu_to_le64(offset >> s->blkshift);
buf->cattr = 0;
@@ -1353,7 +1378,8 @@ static void nvme_detach_aio_context(BlockDriverState *bs)
q->completion_bh = NULL;
}
- aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier,
+ aio_set_event_notifier(bdrv_get_aio_context(bs),
+ &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
false, NULL, NULL);
}
@@ -1363,7 +1389,7 @@ static void nvme_attach_aio_context(BlockDriverState *bs,
BDRVNVMeState *s = bs->opaque;
s->aio_context = new_context;
- aio_set_event_notifier(new_context, &s->irq_notifier,
+ aio_set_event_notifier(new_context, &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
false, nvme_handle_event, nvme_poll_cb);
for (int i = 0; i < s->nr_queues; i++) {
@@ -1387,7 +1413,7 @@ static void nvme_aio_unplug(BlockDriverState *bs)
BDRVNVMeState *s = bs->opaque;
assert(s->plugged);
s->plugged = false;
- for (i = 1; i < s->nr_queues; i++) {
+ for (i = INDEX_IO(0); i < s->nr_queues; i++) {
NVMeQueuePair *q = s->queues[i];
qemu_mutex_lock(&q->lock);
nvme_kick(q);
diff --git a/block/qapi.c b/block/qapi.c
index afd9f3b4a7..f423ece98c 100644
--- a/block/qapi.c
+++ b/block/qapi.c
@@ -47,7 +47,7 @@ BlockDeviceInfo *bdrv_block_device_info(BlockBackend *blk,
Error **errp)
{
ImageInfo **p_image_info;
- BlockDriverState *bs0;
+ BlockDriverState *bs0, *backing;
BlockDeviceInfo *info;
if (!bs->drv) {
@@ -76,9 +76,10 @@ BlockDeviceInfo *bdrv_block_device_info(BlockBackend *blk,
info->node_name = g_strdup(bs->node_name);
}
- if (bs->backing_file[0]) {
+ backing = bdrv_cow_bs(bs);
+ if (backing) {
info->has_backing_file = true;
- info->backing_file = g_strdup(bs->backing_file);
+ info->backing_file = g_strdup(backing->filename);
}
if (!QLIST_EMPTY(&bs->dirty_bitmaps)) {
@@ -163,9 +164,13 @@ BlockDeviceInfo *bdrv_block_device_info(BlockBackend *blk,
break;
}
- if (bs0->drv && bs0->backing) {
+ if (bs0->drv && bdrv_filter_or_cow_child(bs0)) {
+ /*
+ * Put any filtered child here (for backwards compatibility to when
+ * we put bs0->backing here, which might be any filtered child).
+ */
info->backing_file_depth++;
- bs0 = bs0->backing->bs;
+ bs0 = bdrv_filter_or_cow_bs(bs0);
(*p_image_info)->has_backing_image = true;
p_image_info = &((*p_image_info)->backing_image);
} else {
@@ -174,9 +179,8 @@ BlockDeviceInfo *bdrv_block_device_info(BlockBackend *blk,
/* Skip automatically inserted nodes that the user isn't aware of for
* query-block (blk != NULL), but not for query-named-block-nodes */
- while (blk && bs0->drv && bs0->implicit) {
- bs0 = backing_bs(bs0);
- assert(bs0);
+ if (blk) {
+ bs0 = bdrv_skip_implicit_filters(bs0);
}
}
@@ -288,7 +292,7 @@ void bdrv_query_image_info(BlockDriverState *bs,
info->virtual_size = size;
info->actual_size = bdrv_get_allocated_file_size(bs);
info->has_actual_size = info->actual_size >= 0;
- if (bdrv_is_encrypted(bs)) {
+ if (bs->encrypted) {
info->encrypted = true;
info->has_encrypted = true;
}
@@ -311,6 +315,7 @@ void bdrv_query_image_info(BlockDriverState *bs,
backing_filename = bs->backing_file;
if (backing_filename[0] != '\0') {
char *backing_filename2;
+
info->backing_filename = g_strdup(backing_filename);
info->has_backing_filename = true;
backing_filename2 = bdrv_get_full_backing_filename(bs, NULL);
@@ -362,9 +367,7 @@ static void bdrv_query_info(BlockBackend *blk, BlockInfo **p_info,
char *qdev;
/* Skip automatically inserted nodes that the user isn't aware of */
- while (bs && bs->drv && bs->implicit) {
- bs = backing_bs(bs);
- }
+ bs = bdrv_skip_implicit_filters(bs);
info->device = g_strdup(blk_name(blk));
info->type = g_strdup("unknown");
@@ -526,6 +529,8 @@ static void bdrv_query_blk_stats(BlockDeviceStats *ds, BlockBackend *blk)
static BlockStats *bdrv_query_bds_stats(BlockDriverState *bs,
bool blk_level)
{
+ BdrvChild *parent_child;
+ BlockDriverState *filter_or_cow_bs;
BlockStats *s = NULL;
s = g_malloc0(sizeof(*s));
@@ -538,9 +543,8 @@ static BlockStats *bdrv_query_bds_stats(BlockDriverState *bs,
/* Skip automatically inserted nodes that the user isn't aware of in
* a BlockBackend-level command. Stay at the exact node for a node-level
* command. */
- while (blk_level && bs->drv && bs->implicit) {
- bs = backing_bs(bs);
- assert(bs);
+ if (blk_level) {
+ bs = bdrv_skip_implicit_filters(bs);
}
if (bdrv_get_node_name(bs)[0]) {
@@ -555,14 +559,46 @@ static BlockStats *bdrv_query_bds_stats(BlockDriverState *bs,
s->has_driver_specific = true;
}
- if (bs->file) {
+ parent_child = bdrv_primary_child(bs);
+ if (!parent_child ||
+ !(parent_child->role & (BDRV_CHILD_DATA | BDRV_CHILD_FILTERED)))
+ {
+ BdrvChild *c;
+
+ /*
+ * Look for a unique data-storing child. We do not need to look for
+ * filtered children, as there would be only one and it would have been
+ * the primary child.
+ */
+ parent_child = NULL;
+ QLIST_FOREACH(c, &bs->children, next) {
+ if (c->role & BDRV_CHILD_DATA) {
+ if (parent_child) {
+ /*
+ * There are multiple data-storing children and we cannot
+ * choose between them.
+ */
+ parent_child = NULL;
+ break;
+ }
+ parent_child = c;
+ }
+ }
+ }
+ if (parent_child) {
s->has_parent = true;
- s->parent = bdrv_query_bds_stats(bs->file->bs, blk_level);
+ s->parent = bdrv_query_bds_stats(parent_child->bs, blk_level);
}
- if (blk_level && bs->backing) {
+ filter_or_cow_bs = bdrv_filter_or_cow_bs(bs);
+ if (blk_level && filter_or_cow_bs) {
+ /*
+ * Put any filtered or COW child here (for backwards
+ * compatibility to when we put bs0->backing here, which might
+ * be either)
+ */
s->has_backing = true;
- s->backing = bdrv_query_bds_stats(bs->backing->bs, blk_level);
+ s->backing = bdrv_query_bds_stats(filter_or_cow_bs, blk_level);
}
return s;
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 996b3314f4..fcdf7af8e6 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -1320,6 +1320,7 @@ static bool cluster_needs_new_alloc(BlockDriverState *bs, uint64_t l2_entry)
if (l2_entry & QCOW_OFLAG_COPIED) {
return false;
}
+ /* fallthrough */
case QCOW2_CLUSTER_UNALLOCATED:
case QCOW2_CLUSTER_COMPRESSED:
case QCOW2_CLUSTER_ZERO_PLAIN:
diff --git a/block/snapshot.c b/block/snapshot.c
index bd9fb01817..a2bf3a54eb 100644
--- a/block/snapshot.c
+++ b/block/snapshot.c
@@ -147,6 +147,56 @@ bool bdrv_snapshot_find_by_id_and_name(BlockDriverState *bs,
return ret;
}
+/**
+ * Return a pointer to the child BDS pointer to which we can fall
+ * back if the given BDS does not support snapshots.
+ * Return NULL if there is no BDS to (safely) fall back to.
+ *
+ * We need to return an indirect pointer because bdrv_snapshot_goto()
+ * has to modify the BdrvChild pointer.
+ */
+static BdrvChild **bdrv_snapshot_fallback_ptr(BlockDriverState *bs)
+{
+ BdrvChild **fallback;
+ BdrvChild *child;
+
+ /*
+ * The only BdrvChild pointers that are safe to modify (and which
+ * we can thus return a reference to) are bs->file and
+ * bs->backing.
+ */
+ fallback = &bs->file;
+ if (!*fallback && bs->drv && bs->drv->is_filter) {
+ fallback = &bs->backing;
+ }
+
+ if (!*fallback) {
+ return NULL;
+ }
+
+ /*
+ * Check that there are no other children that would need to be
+ * snapshotted. If there are, it is not safe to fall back to
+ * *fallback.
+ */
+ QLIST_FOREACH(child, &bs->children, next) {
+ if (child->role & (BDRV_CHILD_DATA | BDRV_CHILD_METADATA |
+ BDRV_CHILD_FILTERED) &&
+ child != *fallback)
+ {
+ return NULL;
+ }
+ }
+
+ return fallback;
+}
+
+static BlockDriverState *bdrv_snapshot_fallback(BlockDriverState *bs)
+{
+ BdrvChild **child_ptr = bdrv_snapshot_fallback_ptr(bs);
+ return child_ptr ? (*child_ptr)->bs : NULL;
+}
+
int bdrv_can_snapshot(BlockDriverState *bs)
{
BlockDriver *drv = bs->drv;
@@ -155,8 +205,9 @@ int bdrv_can_snapshot(BlockDriverState *bs)
}
if (!drv->bdrv_snapshot_create) {
- if (bs->file != NULL) {
- return bdrv_can_snapshot(bs->file->bs);
+ BlockDriverState *fallback_bs = bdrv_snapshot_fallback(bs);
+ if (fallback_bs) {
+ return bdrv_can_snapshot(fallback_bs);
}
return 0;
}
@@ -168,14 +219,15 @@ int bdrv_snapshot_create(BlockDriverState *bs,
QEMUSnapshotInfo *sn_info)
{
BlockDriver *drv = bs->drv;
+ BlockDriverState *fallback_bs = bdrv_snapshot_fallback(bs);
if (!drv) {
return -ENOMEDIUM;
}
if (drv->bdrv_snapshot_create) {
return drv->bdrv_snapshot_create(bs, sn_info);
}
- if (bs->file) {
- return bdrv_snapshot_create(bs->file->bs, sn_info);
+ if (fallback_bs) {
+ return bdrv_snapshot_create(fallback_bs, sn_info);
}
return -ENOTSUP;
}
@@ -185,6 +237,7 @@ int bdrv_snapshot_goto(BlockDriverState *bs,
Error **errp)
{
BlockDriver *drv = bs->drv;
+ BdrvChild **fallback_ptr;
int ret, open_ret;
if (!drv) {
@@ -205,39 +258,46 @@ int bdrv_snapshot_goto(BlockDriverState *bs,
return ret;
}
- if (bs->file) {
- BlockDriverState *file;
- QDict *options = qdict_clone_shallow(bs->options);
+ fallback_ptr = bdrv_snapshot_fallback_ptr(bs);
+ if (fallback_ptr) {
+ QDict *options;
QDict *file_options;
Error *local_err = NULL;
+ BlockDriverState *fallback_bs = (*fallback_ptr)->bs;
+ char *subqdict_prefix = g_strdup_printf("%s.", (*fallback_ptr)->name);
+
+ options = qdict_clone_shallow(bs->options);
- file = bs->file->bs;
/* Prevent it from getting deleted when detached from bs */
- bdrv_ref(file);
+ bdrv_ref(fallback_bs);
- qdict_extract_subqdict(options, &file_options, "file.");
+ qdict_extract_subqdict(options, &file_options, subqdict_prefix);
qobject_unref(file_options);
- qdict_put_str(options, "file", bdrv_get_node_name(file));
+ g_free(subqdict_prefix);
+
+ qdict_put_str(options, (*fallback_ptr)->name,
+ bdrv_get_node_name(fallback_bs));
if (drv->bdrv_close) {
drv->bdrv_close(bs);
}
- bdrv_unref_child(bs, bs->file);
- bs->file = NULL;
- ret = bdrv_snapshot_goto(file, snapshot_id, errp);
+ bdrv_unref_child(bs, *fallback_ptr);
+ *fallback_ptr = NULL;
+
+ ret = bdrv_snapshot_goto(fallback_bs, snapshot_id, errp);
open_ret = drv->bdrv_open(bs, options, bs->open_flags, &local_err);
qobject_unref(options);
if (open_ret < 0) {
- bdrv_unref(file);
+ bdrv_unref(fallback_bs);
bs->drv = NULL;
/* A bdrv_snapshot_goto() error takes precedence */
error_propagate(errp, local_err);
return ret < 0 ? ret : open_ret;
}
- assert(bs->file->bs == file);
- bdrv_unref(file);
+ assert(fallback_bs == (*fallback_ptr)->bs);
+ bdrv_unref(fallback_bs);
return ret;
}
@@ -273,6 +333,7 @@ int bdrv_snapshot_delete(BlockDriverState *bs,
Error **errp)
{
BlockDriver *drv = bs->drv;
+ BlockDriverState *fallback_bs = bdrv_snapshot_fallback(bs);
int ret;
if (!drv) {
@@ -289,8 +350,8 @@ int bdrv_snapshot_delete(BlockDriverState *bs,
if (drv->bdrv_snapshot_delete) {
ret = drv->bdrv_snapshot_delete(bs, snapshot_id, name, errp);
- } else if (bs->file) {
- ret = bdrv_snapshot_delete(bs->file->bs, snapshot_id, name, errp);
+ } else if (fallback_bs) {
+ ret = bdrv_snapshot_delete(fallback_bs, snapshot_id, name, errp);
} else {
error_setg(errp, "Block format '%s' used by device '%s' "
"does not support internal snapshot deletion",
@@ -306,14 +367,15 @@ int bdrv_snapshot_list(BlockDriverState *bs,
QEMUSnapshotInfo **psn_info)
{
BlockDriver *drv = bs->drv;
+ BlockDriverState *fallback_bs = bdrv_snapshot_fallback(bs);
if (!drv) {
return -ENOMEDIUM;
}
if (drv->bdrv_snapshot_list) {
return drv->bdrv_snapshot_list(bs, psn_info);
}
- if (bs->file) {
- return bdrv_snapshot_list(bs->file->bs, psn_info);
+ if (fallback_bs) {
+ return bdrv_snapshot_list(fallback_bs, psn_info);
}
return -ENOTSUP;
}
diff --git a/block/stream.c b/block/stream.c
index 310ccbaa4c..8ce6729a33 100644
--- a/block/stream.c
+++ b/block/stream.c
@@ -31,7 +31,8 @@ enum {
typedef struct StreamBlockJob {
BlockJob common;
- BlockDriverState *bottom;
+ BlockDriverState *base_overlay; /* COW overlay (stream from this) */
+ BlockDriverState *above_base; /* Node directly above the base */
BlockdevOnError on_error;
char *backing_file_str;
bool bs_read_only;
@@ -53,7 +54,7 @@ static void stream_abort(Job *job)
if (s->chain_frozen) {
BlockJob *bjob = &s->common;
- bdrv_unfreeze_backing_chain(blk_bs(bjob->blk), s->bottom);
+ bdrv_unfreeze_backing_chain(blk_bs(bjob->blk), s->above_base);
}
}
@@ -62,14 +63,15 @@ static int stream_prepare(Job *job)
StreamBlockJob *s = container_of(job, StreamBlockJob, common.job);
BlockJob *bjob = &s->common;
BlockDriverState *bs = blk_bs(bjob->blk);
- BlockDriverState *base = backing_bs(s->bottom);
+ BlockDriverState *unfiltered_bs = bdrv_skip_filters(bs);
+ BlockDriverState *base = bdrv_filter_or_cow_bs(s->above_base);
Error *local_err = NULL;
int ret = 0;
- bdrv_unfreeze_backing_chain(bs, s->bottom);
+ bdrv_unfreeze_backing_chain(bs, s->above_base);
s->chain_frozen = false;
- if (bs->backing) {
+ if (bdrv_cow_child(unfiltered_bs)) {
const char *base_id = NULL, *base_fmt = NULL;
if (base) {
base_id = s->backing_file_str;
@@ -77,8 +79,8 @@ static int stream_prepare(Job *job)
base_fmt = base->drv->format_name;
}
}
- bdrv_set_backing_hd(bs, base, &local_err);
- ret = bdrv_change_backing_file(bs, base_id, base_fmt, false);
+ bdrv_set_backing_hd(unfiltered_bs, base, &local_err);
+ ret = bdrv_change_backing_file(unfiltered_bs, base_id, base_fmt, false);
if (local_err) {
error_report_err(local_err);
return -EPERM;
@@ -109,14 +111,15 @@ static int coroutine_fn stream_run(Job *job, Error **errp)
StreamBlockJob *s = container_of(job, StreamBlockJob, common.job);
BlockBackend *blk = s->common.blk;
BlockDriverState *bs = blk_bs(blk);
- bool enable_cor = !backing_bs(s->bottom);
+ BlockDriverState *unfiltered_bs = bdrv_skip_filters(bs);
+ bool enable_cor = !bdrv_cow_child(s->base_overlay);
int64_t len;
int64_t offset = 0;
uint64_t delay_ns = 0;
int error = 0;
int64_t n = 0; /* bytes */
- if (bs == s->bottom) {
+ if (unfiltered_bs == s->base_overlay) {
/* Nothing to stream */
return 0;
}
@@ -150,13 +153,14 @@ static int coroutine_fn stream_run(Job *job, Error **errp)
copy = false;
- ret = bdrv_is_allocated(bs, offset, STREAM_CHUNK, &n);
+ ret = bdrv_is_allocated(unfiltered_bs, offset, STREAM_CHUNK, &n);
if (ret == 1) {
/* Allocated in the top, no need to copy. */
} else if (ret >= 0) {
/* Copy if allocated in the intermediate images. Limit to the
* known-unallocated area [offset, offset+n*BDRV_SECTOR_SIZE). */
- ret = bdrv_is_allocated_above(backing_bs(bs), s->bottom, true,
+ ret = bdrv_is_allocated_above(bdrv_cow_bs(unfiltered_bs),
+ s->base_overlay, true,
offset, n, &n);
/* Finish early if end of backing file has been reached */
if (ret == 0 && n == 0) {
@@ -223,9 +227,29 @@ void stream_start(const char *job_id, BlockDriverState *bs,
BlockDriverState *iter;
bool bs_read_only;
int basic_flags = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED;
- BlockDriverState *bottom = bdrv_find_overlay(bs, base);
+ BlockDriverState *base_overlay = bdrv_find_overlay(bs, base);
+ BlockDriverState *above_base;
- if (bdrv_freeze_backing_chain(bs, bottom, errp) < 0) {
+ if (!base_overlay) {
+ error_setg(errp, "'%s' is not in the backing chain of '%s'",
+ base->node_name, bs->node_name);
+ return;
+ }
+
+ /*
+ * Find the node directly above @base. @base_overlay is a COW overlay, so
+ * it must have a bdrv_cow_child(), but it is the immediate overlay of
+ * @base, so between the two there can only be filters.
+ */
+ above_base = base_overlay;
+ if (bdrv_cow_bs(above_base) != base) {
+ above_base = bdrv_cow_bs(above_base);
+ while (bdrv_filter_bs(above_base) != base) {
+ above_base = bdrv_filter_bs(above_base);
+ }
+ }
+
+ if (bdrv_freeze_backing_chain(bs, above_base, errp) < 0) {
return;
}
@@ -255,14 +279,19 @@ void stream_start(const char *job_id, BlockDriverState *bs,
* and resizes. Reassign the base node pointer because the backing BS of the
* bottom node might change after the call to bdrv_reopen_set_read_only()
* due to parallel block jobs running.
+ * above_base node might change after the call to
+ * bdrv_reopen_set_read_only() due to parallel block jobs running.
*/
- base = backing_bs(bottom);
- for (iter = backing_bs(bs); iter && iter != base; iter = backing_bs(iter)) {
+ base = bdrv_filter_or_cow_bs(above_base);
+ for (iter = bdrv_filter_or_cow_bs(bs); iter != base;
+ iter = bdrv_filter_or_cow_bs(iter))
+ {
block_job_add_bdrv(&s->common, "intermediate node", iter, 0,
basic_flags, &error_abort);
}
- s->bottom = bottom;
+ s->base_overlay = base_overlay;
+ s->above_base = above_base;
s->backing_file_str = g_strdup(backing_file_str);
s->bs_read_only = bs_read_only;
s->chain_frozen = true;
@@ -276,5 +305,5 @@ fail:
if (bs_read_only) {
bdrv_reopen_set_read_only(bs, true, NULL);
}
- bdrv_unfreeze_backing_chain(bs, bottom);
+ bdrv_unfreeze_backing_chain(bs, above_base);
}
diff --git a/block/throttle.c b/block/throttle.c
index 1c1ac57bee..9a0f38149a 100644
--- a/block/throttle.c
+++ b/block/throttle.c
@@ -151,6 +151,15 @@ static int coroutine_fn throttle_co_pdiscard(BlockDriverState *bs,
return bdrv_co_pdiscard(bs->file, offset, bytes);
}
+static int coroutine_fn throttle_co_pwritev_compressed(BlockDriverState *bs,
+ uint64_t offset,
+ uint64_t bytes,
+ QEMUIOVector *qiov)
+{
+ return throttle_co_pwritev(bs, offset, bytes, qiov,
+ BDRV_REQ_WRITE_COMPRESSED);
+}
+
static int throttle_co_flush(BlockDriverState *bs)
{
return bdrv_co_flush(bs->file->bs);
@@ -243,6 +252,7 @@ static BlockDriver bdrv_throttle = {
.bdrv_co_pwrite_zeroes = throttle_co_pwrite_zeroes,
.bdrv_co_pdiscard = throttle_co_pdiscard,
+ .bdrv_co_pwritev_compressed = throttle_co_pwritev_compressed,
.bdrv_attach_aio_context = throttle_attach_aio_context,
.bdrv_detach_aio_context = throttle_detach_aio_context,
@@ -250,7 +260,6 @@ static BlockDriver bdrv_throttle = {
.bdrv_reopen_prepare = throttle_reopen_prepare,
.bdrv_reopen_commit = throttle_reopen_commit,
.bdrv_reopen_abort = throttle_reopen_abort,
- .bdrv_co_block_status = bdrv_co_block_status_from_file,
.bdrv_co_drain_begin = throttle_co_drain_begin,
.bdrv_co_drain_end = throttle_co_drain_end,
diff --git a/block/vmdk.c b/block/vmdk.c
index d90855446a..8ec62c7ab7 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -2803,21 +2803,6 @@ static void vmdk_close(BlockDriverState *bs)
error_free(s->migration_blocker);
}
-static coroutine_fn int vmdk_co_flush(BlockDriverState *bs)
-{
- BDRVVmdkState *s = bs->opaque;
- int i, err;
- int ret = 0;
-
- for (i = 0; i < s->num_extents; i++) {
- err = bdrv_co_flush(s->extents[i].file->bs);
- if (err < 0) {
- ret = err;
- }
- }
- return ret;
-}
-
static int64_t vmdk_get_allocated_file_size(BlockDriverState *bs)
{
int i;
@@ -3081,7 +3066,6 @@ static BlockDriver bdrv_vmdk = {
.bdrv_close = vmdk_close,
.bdrv_co_create_opts = vmdk_co_create_opts,
.bdrv_co_create = vmdk_co_create,
- .bdrv_co_flush_to_disk = vmdk_co_flush,
.bdrv_co_block_status = vmdk_co_block_status,
.bdrv_get_allocated_file_size = vmdk_get_allocated_file_size,
.bdrv_has_zero_init = vmdk_has_zero_init,