diff options
author | Peter Maydell <peter.maydell@linaro.org> | 2016-06-08 17:17:16 +0100 |
---|---|---|
committer | Peter Maydell <peter.maydell@linaro.org> | 2016-06-08 17:17:16 +0100 |
commit | 6f50f25c825fbd0edd2dc43b9a63edfecfd4a3e9 (patch) | |
tree | 80fee285db6153395963b01591d27d5053c1ec4c | |
parent | d36ebffe945d744a1605b30a8fd208b5325c1505 (diff) | |
parent | 55d539c8f75abab70301a43d8c94b976f6ddc358 (diff) |
Merge remote-tracking branch 'remotes/kevin/tags/for-upstream' into staging
Block layer patches
# gpg: Signature made Wed 08 Jun 2016 09:31:38 BST
# gpg: using RSA key 0x7F09B272C88F2FD6
# gpg: Good signature from "Kevin Wolf <kwolf@redhat.com>"
* remotes/kevin/tags/for-upstream: (31 commits)
qemu-img bench: Add --flush-interval
qemu-img bench: Implement -S (step size)
qemu-img bench: Make start offset configurable
qemu-img bench: Sequential writes
qemu-img bench
block: Don't emulate natively supported pwritev flags
blockdev: clean up error handling in do_open_tray
block: Fix bdrv_all_delete_snapshot() error handling
qcow2: avoid extra flushes in qcow2
raw-posix: Fetch max sectors for host block device
block: assert that bs->request_alignment is a power of 2
migration/block: Convert saving to BlockBackend
migration/block: Convert load to BlockBackend
block: Kill bdrv_co_write_zeroes()
vmdk: Convert to bdrv_co_pwrite_zeroes()
raw_bsd: Convert to bdrv_co_pwrite_zeroes()
raw-posix: Convert to bdrv_co_pwrite_zeroes()
qed: Convert to bdrv_co_pwrite_zeroes()
gluster: Convert to bdrv_co_pwrite_zeroes()
blkreplay: Convert to bdrv_co_pwrite_zeroes()
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
-rw-r--r-- | block.c | 2 | ||||
-rwxr-xr-x | block/blkreplay.c | 8 | ||||
-rw-r--r-- | block/gluster.c | 14 | ||||
-rw-r--r-- | block/io.c | 121 | ||||
-rw-r--r-- | block/iscsi.c | 72 | ||||
-rw-r--r-- | block/parallels.c | 4 | ||||
-rw-r--r-- | block/qcow2-cache.c | 11 | ||||
-rw-r--r-- | block/qcow2-cluster.c | 3 | ||||
-rw-r--r-- | block/qcow2.c | 91 | ||||
-rw-r--r-- | block/qcow2.h | 1 | ||||
-rw-r--r-- | block/qed.c | 35 | ||||
-rw-r--r-- | block/raw-posix.c | 58 | ||||
-rw-r--r-- | block/raw_bsd.c | 10 | ||||
-rw-r--r-- | block/snapshot.c | 3 | ||||
-rw-r--r-- | block/vmdk.c | 18 | ||||
-rw-r--r-- | blockdev.c | 46 | ||||
-rw-r--r-- | include/block/block.h | 10 | ||||
-rw-r--r-- | include/block/block_int.h | 16 | ||||
-rw-r--r-- | migration/block.c | 147 | ||||
-rw-r--r-- | qemu-img-cmds.hx | 6 | ||||
-rw-r--r-- | qemu-img.c | 329 | ||||
-rw-r--r-- | qemu-img.texi | 24 | ||||
-rwxr-xr-x | tests/qemu-iotests/034 | 2 | ||||
-rwxr-xr-x | tests/qemu-iotests/154 | 42 | ||||
-rw-r--r-- | tests/qemu-iotests/154.out | 55 | ||||
-rw-r--r-- | trace-events | 6 |
26 files changed, 831 insertions, 303 deletions
@@ -1018,7 +1018,7 @@ static int bdrv_open_common(BlockDriverState *bs, BdrvChild *file, assert(bdrv_opt_mem_align(bs) != 0); assert(bdrv_min_mem_align(bs) != 0); - assert((bs->request_alignment != 0) || bdrv_is_sg(bs)); + assert(is_power_of_2(bs->request_alignment) || bdrv_is_sg(bs)); qemu_opts_del(opts); return 0; diff --git a/block/blkreplay.c b/block/blkreplay.c index 42f1813af1..525c2d54e0 100755 --- a/block/blkreplay.c +++ b/block/blkreplay.c @@ -103,11 +103,11 @@ static int coroutine_fn blkreplay_co_writev(BlockDriverState *bs, return ret; } -static int coroutine_fn blkreplay_co_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) +static int coroutine_fn blkreplay_co_pwrite_zeroes(BlockDriverState *bs, + int64_t offset, int count, BdrvRequestFlags flags) { uint64_t reqid = request_id++; - int ret = bdrv_co_write_zeroes(bs->file->bs, sector_num, nb_sectors, flags); + int ret = bdrv_co_pwrite_zeroes(bs->file->bs, offset, count, flags); block_request_create(reqid, bs, qemu_coroutine_self()); qemu_coroutine_yield(); @@ -147,7 +147,7 @@ static BlockDriver bdrv_blkreplay = { .bdrv_co_readv = blkreplay_co_readv, .bdrv_co_writev = blkreplay_co_writev, - .bdrv_co_write_zeroes = blkreplay_co_write_zeroes, + .bdrv_co_pwrite_zeroes = blkreplay_co_pwrite_zeroes, .bdrv_co_discard = blkreplay_co_discard, .bdrv_co_flush = blkreplay_co_flush, }; diff --git a/block/gluster.c b/block/gluster.c index a8aaacf645..d361d8e847 100644 --- a/block/gluster.c +++ b/block/gluster.c @@ -454,14 +454,12 @@ static void qemu_gluster_reopen_abort(BDRVReopenState *state) } #ifdef CONFIG_GLUSTERFS_ZEROFILL -static coroutine_fn int qemu_gluster_co_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) +static coroutine_fn int qemu_gluster_co_pwrite_zeroes(BlockDriverState *bs, + int64_t offset, int size, BdrvRequestFlags flags) { int ret; GlusterAIOCB acb; BDRVGlusterState *s = bs->opaque; - off_t size = nb_sectors * BDRV_SECTOR_SIZE; - off_t offset = sector_num * BDRV_SECTOR_SIZE; acb.size = size; acb.ret = 0; @@ -769,7 +767,7 @@ static BlockDriver bdrv_gluster = { .bdrv_co_discard = qemu_gluster_co_discard, #endif #ifdef CONFIG_GLUSTERFS_ZEROFILL - .bdrv_co_write_zeroes = qemu_gluster_co_write_zeroes, + .bdrv_co_pwrite_zeroes = qemu_gluster_co_pwrite_zeroes, #endif .create_opts = &qemu_gluster_create_opts, }; @@ -796,7 +794,7 @@ static BlockDriver bdrv_gluster_tcp = { .bdrv_co_discard = qemu_gluster_co_discard, #endif #ifdef CONFIG_GLUSTERFS_ZEROFILL - .bdrv_co_write_zeroes = qemu_gluster_co_write_zeroes, + .bdrv_co_pwrite_zeroes = qemu_gluster_co_pwrite_zeroes, #endif .create_opts = &qemu_gluster_create_opts, }; @@ -823,7 +821,7 @@ static BlockDriver bdrv_gluster_unix = { .bdrv_co_discard = qemu_gluster_co_discard, #endif #ifdef CONFIG_GLUSTERFS_ZEROFILL - .bdrv_co_write_zeroes = qemu_gluster_co_write_zeroes, + .bdrv_co_pwrite_zeroes = qemu_gluster_co_pwrite_zeroes, #endif .create_opts = &qemu_gluster_create_opts, }; @@ -850,7 +848,7 @@ static BlockDriver bdrv_gluster_rdma = { .bdrv_co_discard = qemu_gluster_co_discard, #endif #ifdef CONFIG_GLUSTERFS_ZEROFILL - .bdrv_co_write_zeroes = qemu_gluster_co_write_zeroes, + .bdrv_co_pwrite_zeroes = qemu_gluster_co_pwrite_zeroes, #endif .create_opts = &qemu_gluster_create_opts, }; diff --git a/block/io.c b/block/io.c index 6070e773b7..fb99a7151c 100644 --- a/block/io.c +++ b/block/io.c @@ -42,8 +42,8 @@ static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, void *opaque, bool is_write); static void coroutine_fn bdrv_co_do_rw(void *opaque); -static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, BdrvRequestFlags flags); +static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, + int64_t offset, int count, BdrvRequestFlags flags); static void bdrv_parent_drained_begin(BlockDriverState *bs) { @@ -620,18 +620,25 @@ int bdrv_write(BlockDriverState *bs, int64_t sector_num, return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0); } -int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, BdrvRequestFlags flags) +int bdrv_pwrite_zeroes(BlockDriverState *bs, int64_t offset, + int count, BdrvRequestFlags flags) { - return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true, - BDRV_REQ_ZERO_WRITE | flags); + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = NULL, + .iov_len = count, + }; + + qemu_iovec_init_external(&qiov, &iov, 1); + return bdrv_prwv_co(bs, offset, &qiov, true, + BDRV_REQ_ZERO_WRITE | flags); } /* - * Completely zero out a block device with the help of bdrv_write_zeroes. + * Completely zero out a block device with the help of bdrv_pwrite_zeroes. * The operation is sped up by checking the block status and only writing * zeroes to the device if they currently do not return zeroes. Optional - * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP, + * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP, * BDRV_REQ_FUA). * * Returns < 0 on error, 0 on success. For error codes see bdrv_write(). @@ -662,7 +669,8 @@ int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags) sector_num += n; continue; } - ret = bdrv_write_zeroes(bs, sector_num, n, flags); + ret = bdrv_pwrite_zeroes(bs, sector_num << BDRV_SECTOR_BITS, + n << BDRV_SECTOR_BITS, flags); if (ret < 0) { error_report("error writing zeroes at sector %" PRId64 ": %s", sector_num, strerror(-ret)); @@ -808,7 +816,9 @@ static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs, int ret; if (drv->bdrv_co_pwritev) { - ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov, flags); + ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov, + flags & bs->supported_write_flags); + flags &= ~bs->supported_write_flags; goto emulate_flags; } @@ -893,10 +903,12 @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, goto err; } - if (drv->bdrv_co_write_zeroes && + if (drv->bdrv_co_pwrite_zeroes && buffer_is_zero(bounce_buffer, iov.iov_len)) { - ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num, - cluster_nb_sectors, 0); + ret = bdrv_co_do_pwrite_zeroes(bs, + cluster_sector_num * BDRV_SECTOR_SIZE, + cluster_nb_sectors * BDRV_SECTOR_SIZE, + 0); } else { /* This does not change the data on the disk, it is not necessary * to flush even in cache=writethrough mode. @@ -1110,36 +1122,40 @@ int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, #define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768 -static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) +static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, + int64_t offset, int count, BdrvRequestFlags flags) { BlockDriver *drv = bs->drv; QEMUIOVector qiov; struct iovec iov = {0}; int ret = 0; bool need_flush = false; + int head = 0; + int tail = 0; - int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes, - BDRV_REQUEST_MAX_SECTORS); + int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX); + int alignment = MAX(bs->bl.pwrite_zeroes_alignment ?: 1, + bs->request_alignment); - while (nb_sectors > 0 && !ret) { - int num = nb_sectors; + assert(is_power_of_2(alignment)); + head = offset & (alignment - 1); + tail = (offset + count) & (alignment - 1); + max_write_zeroes &= ~(alignment - 1); + + while (count > 0 && !ret) { + int num = count; /* Align request. Block drivers can expect the "bulk" of the request - * to be aligned. + * to be aligned, and that unaligned requests do not cross cluster + * boundaries. */ - if (bs->bl.write_zeroes_alignment - && num > bs->bl.write_zeroes_alignment) { - if (sector_num % bs->bl.write_zeroes_alignment != 0) { - /* Make a small request up to the first aligned sector. */ - num = bs->bl.write_zeroes_alignment; - num -= sector_num % bs->bl.write_zeroes_alignment; - } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) { - /* Shorten the request to the last aligned sector. num cannot - * underflow because num > bs->bl.write_zeroes_alignment. - */ - num -= (sector_num + num) % bs->bl.write_zeroes_alignment; - } + if (head) { + /* Make a small request up to the first aligned sector. */ + num = MIN(count, alignment - head); + head = 0; + } else if (tail && num > alignment) { + /* Shorten the request to the last aligned sector. */ + num -= tail; } /* limit request size */ @@ -1149,9 +1165,9 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, ret = -ENOTSUP; /* First try the efficient write zeroes operation */ - if (drv->bdrv_co_write_zeroes) { - ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, - flags & bs->supported_zero_flags); + if (drv->bdrv_co_pwrite_zeroes) { + ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num, + flags & bs->supported_zero_flags); if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) && !(bs->supported_zero_flags & BDRV_REQ_FUA)) { need_flush = true; @@ -1173,33 +1189,31 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, write_flags &= ~BDRV_REQ_FUA; need_flush = true; } - num = MIN(num, max_xfer_len); - iov.iov_len = num * BDRV_SECTOR_SIZE; + num = MIN(num, max_xfer_len << BDRV_SECTOR_BITS); + iov.iov_len = num; if (iov.iov_base == NULL) { - iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE); + iov.iov_base = qemu_try_blockalign(bs, num); if (iov.iov_base == NULL) { ret = -ENOMEM; goto fail; } - memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE); + memset(iov.iov_base, 0, num); } qemu_iovec_init_external(&qiov, &iov, 1); - ret = bdrv_driver_pwritev(bs, sector_num * BDRV_SECTOR_SIZE, - num * BDRV_SECTOR_SIZE, &qiov, - write_flags); + ret = bdrv_driver_pwritev(bs, offset, num, &qiov, write_flags); /* Keep bounce buffer around if it is big enough for all * all future requests. */ - if (num < max_xfer_len) { + if (num < max_xfer_len << BDRV_SECTOR_BITS) { qemu_vfree(iov.iov_base); iov.iov_base = NULL; } } - sector_num += num; - nb_sectors -= num; + offset += num; + count -= num; } fail: @@ -1237,7 +1251,7 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req); if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF && - !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes && + !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes && qemu_iovec_is_zero(qiov)) { flags |= BDRV_REQ_ZERO_WRITE; if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) { @@ -1249,7 +1263,8 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, /* Do nothing, write notifier decided to fail this request */ } else if (flags & BDRV_REQ_ZERO_WRITE) { bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO); - ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags); + ret = bdrv_co_do_pwrite_zeroes(bs, sector_num << BDRV_SECTOR_BITS, + nb_sectors << BDRV_SECTOR_BITS, flags); } else { bdrv_debug_event(bs, BLKDBG_PWRITEV); ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, flags); @@ -1510,18 +1525,18 @@ int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0); } -int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - BdrvRequestFlags flags) +int coroutine_fn bdrv_co_pwrite_zeroes(BlockDriverState *bs, + int64_t offset, int count, + BdrvRequestFlags flags) { - trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags); + trace_bdrv_co_pwrite_zeroes(bs, offset, count, flags); if (!(bs->open_flags & BDRV_O_UNMAP)) { flags &= ~BDRV_REQ_MAY_UNMAP; } - return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL, - BDRV_REQ_ZERO_WRITE | flags); + return bdrv_co_pwritev(bs, offset, count, NULL, + BDRV_REQ_ZERO_WRITE | flags); } typedef struct BdrvCoGetBlockStatusData { diff --git a/block/iscsi.c b/block/iscsi.c index e7d5f7b0c3..7e78adea15 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -401,18 +401,26 @@ static int64_t sector_qemu2lun(int64_t sector, IscsiLun *iscsilun) return sector * BDRV_SECTOR_SIZE / iscsilun->block_size; } -static bool is_request_lun_aligned(int64_t sector_num, int nb_sectors, - IscsiLun *iscsilun) +static bool is_byte_request_lun_aligned(int64_t offset, int count, + IscsiLun *iscsilun) { - if ((sector_num * BDRV_SECTOR_SIZE) % iscsilun->block_size || - (nb_sectors * BDRV_SECTOR_SIZE) % iscsilun->block_size) { - error_report("iSCSI misaligned request: " - "iscsilun->block_size %u, sector_num %" PRIi64 - ", nb_sectors %d", - iscsilun->block_size, sector_num, nb_sectors); - return 0; - } - return 1; + if (offset % iscsilun->block_size || count % iscsilun->block_size) { + error_report("iSCSI misaligned request: " + "iscsilun->block_size %u, offset %" PRIi64 + ", count %d", + iscsilun->block_size, offset, count); + return false; + } + return true; +} + +static bool is_sector_request_lun_aligned(int64_t sector_num, int nb_sectors, + IscsiLun *iscsilun) +{ + assert(nb_sectors < BDRV_REQUEST_MAX_SECTORS); + return is_byte_request_lun_aligned(sector_num << BDRV_SECTOR_BITS, + nb_sectors << BDRV_SECTOR_BITS, + iscsilun); } static unsigned long *iscsi_allocationmap_init(IscsiLun *iscsilun) @@ -461,7 +469,7 @@ iscsi_co_writev_flags(BlockDriverState *bs, int64_t sector_num, int nb_sectors, if (fua) { assert(iscsilun->dpofua); } - if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { + if (!is_sector_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { return -EINVAL; } @@ -541,7 +549,7 @@ static int64_t coroutine_fn iscsi_co_get_block_status(BlockDriverState *bs, iscsi_co_init_iscsitask(iscsilun, &iTask); - if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { + if (!is_sector_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { ret = -EINVAL; goto out; } @@ -638,7 +646,7 @@ static int coroutine_fn iscsi_co_readv(BlockDriverState *bs, uint64_t lba; uint32_t num_sectors; - if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { + if (!is_sector_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { return -EINVAL; } @@ -926,7 +934,7 @@ coroutine_fn iscsi_co_discard(BlockDriverState *bs, int64_t sector_num, struct IscsiTask iTask; struct unmap_list list; - if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { + if (!is_sector_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { return -EINVAL; } @@ -977,8 +985,8 @@ retry: } static int -coroutine_fn iscsi_co_write_zeroes(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, BdrvRequestFlags flags) +coroutine_fn iscsi_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, + int count, BdrvRequestFlags flags) { IscsiLun *iscsilun = bs->opaque; struct IscsiTask iTask; @@ -986,8 +994,8 @@ coroutine_fn iscsi_co_write_zeroes(BlockDriverState *bs, int64_t sector_num, uint32_t nb_blocks; bool use_16_for_ws = iscsilun->use_16_for_rw; - if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { - return -EINVAL; + if (!is_byte_request_lun_aligned(offset, count, iscsilun)) { + return -ENOTSUP; } if (flags & BDRV_REQ_MAY_UNMAP) { @@ -1008,8 +1016,8 @@ coroutine_fn iscsi_co_write_zeroes(BlockDriverState *bs, int64_t sector_num, return -ENOTSUP; } - lba = sector_qemu2lun(sector_num, iscsilun); - nb_blocks = sector_qemu2lun(nb_sectors, iscsilun); + lba = offset / iscsilun->block_size; + nb_blocks = count / iscsilun->block_size; if (iscsilun->zeroblock == NULL) { iscsilun->zeroblock = g_try_malloc0(iscsilun->block_size); @@ -1065,9 +1073,11 @@ retry: } if (flags & BDRV_REQ_MAY_UNMAP) { - iscsi_allocationmap_clear(iscsilun, sector_num, nb_sectors); + iscsi_allocationmap_clear(iscsilun, offset >> BDRV_SECTOR_BITS, + count >> BDRV_SECTOR_BITS); } else { - iscsi_allocationmap_set(iscsilun, sector_num, nb_sectors); + iscsi_allocationmap_set(iscsilun, offset >> BDRV_SECTOR_BITS, + count >> BDRV_SECTOR_BITS); } return 0; @@ -1711,15 +1721,19 @@ static void iscsi_refresh_limits(BlockDriverState *bs, Error **errp) } bs->bl.discard_alignment = sector_limits_lun2qemu(iscsilun->bl.opt_unmap_gran, iscsilun); + } else { + bs->bl.discard_alignment = iscsilun->block_size >> BDRV_SECTOR_BITS; } - if (iscsilun->bl.max_ws_len < 0xffffffff) { - bs->bl.max_write_zeroes = - sector_limits_lun2qemu(iscsilun->bl.max_ws_len, iscsilun); + if (iscsilun->bl.max_ws_len < 0xffffffff / iscsilun->block_size) { + bs->bl.max_pwrite_zeroes = + iscsilun->bl.max_ws_len * iscsilun->block_size; } if (iscsilun->lbp.lbpws) { - bs->bl.write_zeroes_alignment = - sector_limits_lun2qemu(iscsilun->bl.opt_unmap_gran, iscsilun); + bs->bl.pwrite_zeroes_alignment = + iscsilun->bl.opt_unmap_gran * iscsilun->block_size; + } else { + bs->bl.pwrite_zeroes_alignment = iscsilun->block_size; } bs->bl.opt_transfer_length = sector_limits_lun2qemu(iscsilun->bl.opt_xfer_len, iscsilun); @@ -1852,7 +1866,7 @@ static BlockDriver bdrv_iscsi = { .bdrv_co_get_block_status = iscsi_co_get_block_status, .bdrv_co_discard = iscsi_co_discard, - .bdrv_co_write_zeroes = iscsi_co_write_zeroes, + .bdrv_co_pwrite_zeroes = iscsi_co_pwrite_zeroes, .bdrv_co_readv = iscsi_co_readv, .bdrv_co_writev_flags = iscsi_co_writev_flags, .bdrv_co_flush_to_disk = iscsi_co_flush, diff --git a/block/parallels.c b/block/parallels.c index b9b5c6dc3d..d6a1a616b6 100644 --- a/block/parallels.c +++ b/block/parallels.c @@ -210,7 +210,9 @@ static int64_t allocate_clusters(BlockDriverState *bs, int64_t sector_num, int ret; space += s->prealloc_size; if (s->prealloc_mode == PRL_PREALLOC_MODE_FALLOCATE) { - ret = bdrv_write_zeroes(bs->file->bs, s->data_end, space, 0); + ret = bdrv_pwrite_zeroes(bs->file->bs, + s->data_end << BDRV_SECTOR_BITS, + space << BDRV_SECTOR_BITS, 0); } else { ret = bdrv_truncate(bs->file->bs, (s->data_end + space) << BDRV_SECTOR_BITS); diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c index 0fe8edae41..208a060421 100644 --- a/block/qcow2-cache.c +++ b/block/qcow2-cache.c @@ -226,7 +226,7 @@ static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i) return 0; } -int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c) +int qcow2_cache_write(BlockDriverState *bs, Qcow2Cache *c) { BDRVQcow2State *s = bs->opaque; int result = 0; @@ -242,8 +242,15 @@ int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c) } } + return result; +} + +int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c) +{ + int result = qcow2_cache_write(bs, c); + if (result == 0) { - ret = bdrv_flush(bs->file->bs); + int ret = bdrv_flush(bs->file->bs); if (ret < 0) { result = ret; } diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index c73797378e..b04bfafd65 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -1765,8 +1765,7 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table, goto fail; } - ret = bdrv_write_zeroes(bs->file->bs, offset / BDRV_SECTOR_SIZE, - s->cluster_sectors, 0); + ret = bdrv_pwrite_zeroes(bs->file->bs, offset, s->cluster_size, 0); if (ret < 0) { if (!preallocated) { qcow2_free_clusters(bs, offset, s->cluster_size, diff --git a/block/qcow2.c b/block/qcow2.c index c9306a78c1..6f5fb810e4 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -1193,7 +1193,7 @@ static void qcow2_refresh_limits(BlockDriverState *bs, Error **errp) { BDRVQcow2State *s = bs->opaque; - bs->bl.write_zeroes_alignment = s->cluster_sectors; + bs->bl.pwrite_zeroes_alignment = s->cluster_size; } static int qcow2_set_key(BlockDriverState *bs, const char *key) @@ -2406,65 +2406,55 @@ finish: } -static bool is_zero_cluster(BlockDriverState *bs, int64_t start) +static bool is_zero_sectors(BlockDriverState *bs, int64_t start, + uint32_t count) { - BDRVQcow2State *s = bs->opaque; int nr; BlockDriverState *file; - int64_t res = bdrv_get_block_status_above(bs, NULL, start, - s->cluster_sectors, &nr, &file); - return res >= 0 && (res & BDRV_BLOCK_ZERO) && nr == s->cluster_sectors; -} - -static bool is_zero_cluster_top_locked(BlockDriverState *bs, int64_t start) -{ - BDRVQcow2State *s = bs->opaque; - int nr = s->cluster_sectors; - uint64_t off; - int ret; + int64_t res; - ret = qcow2_get_cluster_offset(bs, start << BDRV_SECTOR_BITS, &nr, &off); - assert(nr == s->cluster_sectors); - return ret == QCOW2_CLUSTER_UNALLOCATED || ret == QCOW2_CLUSTER_ZERO; + if (!count) { + return true; + } + res = bdrv_get_block_status_above(bs, NULL, start, count, + &nr, &file); + return res >= 0 && (res & BDRV_BLOCK_ZERO) && nr == count; } -static coroutine_fn int qcow2_co_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) +static coroutine_fn int qcow2_co_pwrite_zeroes(BlockDriverState *bs, + int64_t offset, int count, BdrvRequestFlags flags) { int ret; BDRVQcow2State *s = bs->opaque; - int head = sector_num % s->cluster_sectors; - int tail = (sector_num + nb_sectors) % s->cluster_sectors; + uint32_t head = offset % s->cluster_size; + uint32_t tail = (offset + count) % s->cluster_size; - if (head != 0 || tail != 0) { - int64_t cl_end = -1; + trace_qcow2_pwrite_zeroes_start_req(qemu_coroutine_self(), offset, count); - sector_num -= head; - nb_sectors += head; + if (head || tail) { + int64_t cl_start = (offset - head) >> BDRV_SECTOR_BITS; + uint64_t off; + int nr; - if (tail != 0) { - nb_sectors += s->cluster_sectors - tail; - } + assert(head + count <= s->cluster_size); - if (!is_zero_cluster(bs, sector_num)) { + /* check whether remainder of cluster already reads as zero */ + if (!(is_zero_sectors(bs, cl_start, + DIV_ROUND_UP(head, BDRV_SECTOR_SIZE)) && + is_zero_sectors(bs, (offset + count) >> BDRV_SECTOR_BITS, + DIV_ROUND_UP(-tail & (s->cluster_size - 1), + BDRV_SECTOR_SIZE)))) { return -ENOTSUP; } - if (nb_sectors > s->cluster_sectors) { - /* Technically the request can cover 2 clusters, f.e. 4k write - at s->cluster_sectors - 2k offset. One of these cluster can - be zeroed, one unallocated */ - cl_end = sector_num + nb_sectors - s->cluster_sectors; - if (!is_zero_cluster(bs, cl_end)) { - return -ENOTSUP; - } - } - qemu_co_mutex_lock(&s->lock); /* We can have new write after previous check */ - if (!is_zero_cluster_top_locked(bs, sector_num) || - (cl_end > 0 && !is_zero_cluster_top_locked(bs, cl_end))) { + offset = cl_start << BDRV_SECTOR_BITS; + count = s->cluster_size; + nr = s->cluster_sectors; + ret = qcow2_get_cluster_offset(bs, offset, &nr, &off); + if (ret != QCOW2_CLUSTER_UNALLOCATED && ret != QCOW2_CLUSTER_ZERO) { qemu_co_mutex_unlock(&s->lock); return -ENOTSUP; } @@ -2472,8 +2462,10 @@ static coroutine_fn int qcow2_co_write_zeroes(BlockDriverState *bs, qemu_co_mutex_lock(&s->lock); } + trace_qcow2_pwrite_zeroes(qemu_coroutine_self(), offset, count); + /* Whatever is left can use real zero clusters */ - ret = qcow2_zero_clusters(bs, sector_num << BDRV_SECTOR_BITS, nb_sectors); + ret = qcow2_zero_clusters(bs, offset, count >> BDRV_SECTOR_BITS); qemu_co_mutex_unlock(&s->lock); return ret; @@ -2664,8 +2656,8 @@ static int make_completely_empty(BlockDriverState *bs) /* After this call, neither the in-memory nor the on-disk refcount * information accurately describe the actual references */ - ret = bdrv_write_zeroes(bs->file->bs, s->l1_table_offset / BDRV_SECTOR_SIZE, - l1_clusters * s->cluster_sectors, 0); + ret = bdrv_pwrite_zeroes(bs->file->bs, s->l1_table_offset, + l1_clusters * s->cluster_size, 0); if (ret < 0) { goto fail_broken_refcounts; } @@ -2678,9 +2670,8 @@ static int make_completely_empty(BlockDriverState *bs) * overwrite parts of the existing refcount and L1 table, which is not * an issue because the dirty flag is set, complete data loss is in fact * desired and partial data loss is consequently fine as well */ - ret = bdrv_write_zeroes(bs->file->bs, s->cluster_size / BDRV_SECTOR_SIZE, - (2 + l1_clusters) * s->cluster_size / - BDRV_SECTOR_SIZE, 0); + ret = bdrv_pwrite_zeroes(bs->file->bs, s->cluster_size, + (2 + l1_clusters) * s->cluster_size, 0); /* This call (even if it failed overall) may have overwritten on-disk * refcount structures; in that case, the in-memory refcount information * will probably differ from the on-disk information which makes the BDS @@ -2822,14 +2813,14 @@ static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs) int ret; qemu_co_mutex_lock(&s->lock); - ret = qcow2_cache_flush(bs, s->l2_table_cache); + ret = qcow2_cache_write(bs, s->l2_table_cache); if (ret < 0) { qemu_co_mutex_unlock(&s->lock); return ret; } if (qcow2_need_accurate_refcounts(s)) { - ret = qcow2_cache_flush(bs, s->refcount_block_cache); + ret = qcow2_cache_write(bs, s->refcount_block_cache); if (ret < 0) { qemu_co_mutex_unlock(&s->lock); return ret; @@ -3381,7 +3372,7 @@ BlockDriver bdrv_qcow2 = { .bdrv_co_writev = qcow2_co_writev, .bdrv_co_flush_to_os = qcow2_co_flush_to_os, - .bdrv_co_write_zeroes = qcow2_co_write_zeroes, + .bdrv_co_pwrite_zeroes = qcow2_co_pwrite_zeroes, .bdrv_co_discard = qcow2_co_discard, .bdrv_truncate = qcow2_truncate, .bdrv_write_compressed = qcow2_write_compressed, diff --git a/block/qcow2.h b/block/qcow2.h index a063a3c1a1..7db9795d44 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -583,6 +583,7 @@ int qcow2_cache_destroy(BlockDriverState* bs, Qcow2Cache *c); void qcow2_cache_entry_mark_dirty(BlockDriverState *bs, Qcow2Cache *c, void *table); int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c); +int qcow2_cache_write(BlockDriverState *bs, Qcow2Cache *c); int qcow2_cache_set_dependency(BlockDriverState *bs, Qcow2Cache *c, Qcow2Cache *dependency); void qcow2_cache_depends_on_flush(Qcow2Cache *c); diff --git a/block/qed.c b/block/qed.c index 0f621924f8..12068061ac 100644 --- a/block/qed.c +++ b/block/qed.c @@ -517,7 +517,7 @@ static void bdrv_qed_refresh_limits(BlockDriverState *bs, Error **errp) { BDRVQEDState *s = bs->opaque; - bs->bl.write_zeroes_alignment = s->header.cluster_size >> BDRV_SECTOR_BITS; + bs->bl.pwrite_zeroes_alignment = s->header.cluster_size; } /* We have nothing to do for QED reopen, stubs just return @@ -1418,7 +1418,7 @@ typedef struct { bool done; } QEDWriteZeroesCB; -static void coroutine_fn qed_co_write_zeroes_cb(void *opaque, int ret) +static void coroutine_fn qed_co_pwrite_zeroes_cb(void *opaque, int ret) { QEDWriteZeroesCB *cb = opaque; @@ -1429,10 +1429,10 @@ static void coroutine_fn qed_co_write_zeroes_cb(void *opaque, int ret) } } -static int coroutine_fn bdrv_qed_co_write_zeroes(BlockDriverState *bs, - int64_t sector_num, - int nb_sectors, - BdrvRequestFlags flags) +static int coroutine_fn bdrv_qed_co_pwrite_zeroes(BlockDriverState *bs, + int64_t offset, + int count, + BdrvRequestFlags flags) { BlockAIOCB *blockacb; BDRVQEDState *s = bs->opaque; @@ -1440,25 +1440,22 @@ static int coroutine_fn bdrv_qed_co_write_zeroes(BlockDriverState *bs, QEMUIOVector qiov; struct iovec iov; - /* Refuse if there are untouched backing file sectors */ - if (bs->backing) { - if (qed_offset_into_cluster(s, sector_num * BDRV_SECTOR_SIZE) != 0) { - return -ENOTSUP; - } - if (qed_offset_into_cluster(s, nb_sectors * BDRV_SECTOR_SIZE) != 0) { - return -ENOTSUP; - } + /* Fall back if the request is not aligned */ + if (qed_offset_into_cluster(s, offset) || + qed_offset_into_cluster(s, count)) { + return -ENOTSUP; } /* Zero writes start without an I/O buffer. If a buffer becomes necessary * then it will be allocated during request processing. */ - iov.iov_base = NULL, - iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE, + iov.iov_base = NULL; + iov.iov_len = count; qemu_iovec_init_external(&qiov, &iov, 1); - blockacb = qed_aio_setup(bs, sector_num, &qiov, nb_sectors, - qed_co_write_zeroes_cb, &cb, + blockacb = qed_aio_setup(bs, offset >> BDRV_SECTOR_BITS, &qiov, + count >> BDRV_SECTOR_BITS, + qed_co_pwrite_zeroes_cb, &cb, QED_AIOCB_WRITE | QED_AIOCB_ZERO); if (!blockacb) { return -EIO; @@ -1663,7 +1660,7 @@ static BlockDriver bdrv_qed = { .bdrv_co_get_block_status = bdrv_qed_co_get_block_status, .bdrv_aio_readv = bdrv_qed_aio_readv, .bdrv_aio_writev = bdrv_qed_aio_writev, - .bdrv_co_write_zeroes = bdrv_qed_co_write_zeroes, + .bdrv_co_pwrite_zeroes = bdrv_qed_co_pwrite_zeroes, .bdrv_truncate = bdrv_qed_truncate, .bdrv_getlength = bdrv_qed_getlength, .bdrv_get_info = bdrv_qed_get_info, diff --git a/block/raw-posix.c b/block/raw-posix.c index a4f5a1ba5f..ce2e20f203 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -729,9 +729,33 @@ static void raw_reopen_abort(BDRVReopenState *state) state->opaque = NULL; } +static int hdev_get_max_transfer_length(int fd) +{ +#ifdef BLKSECTGET + int max_sectors = 0; + if (ioctl(fd, BLKSECTGET, &max_sectors) == 0) { + return max_sectors; + } else { + return -errno; + } +#else + return -ENOSYS; +#endif +} + static void raw_refresh_limits(BlockDriverState *bs, Error **errp) { BDRVRawState *s = bs->opaque; + struct stat st; + + if (!fstat(s->fd, &st)) { + if (S_ISBLK(st.st_mode)) { + int ret = hdev_get_max_transfer_length(s->fd); + if (ret >= 0) { + bs->bl.max_transfer_length = ret; + } + } + } raw_probe_alignment(bs, s->fd, errp); bs->bl.min_mem_alignment = s->buf_align; @@ -1252,8 +1276,8 @@ static int aio_worker(void *arg) } static int paio_submit_co(BlockDriverState *bs, int fd, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - int type) + int64_t offset, QEMUIOVector *qiov, + int count, int type) { RawPosixAIOData *acb = g_new(RawPosixAIOData, 1); ThreadPool *pool; @@ -1262,16 +1286,16 @@ static int paio_submit_co(BlockDriverState *bs, int fd, acb->aio_type = type; acb->aio_fildes = fd; - acb->aio_nbytes = nb_sectors * BDRV_SECTOR_SIZE; - acb->aio_offset = sector_num * BDRV_SECTOR_SIZE; + acb->aio_nbytes = count; + acb->aio_offset = offset; if (qiov) { acb->aio_iov = qiov->iov; acb->aio_niov = qiov->niov; - assert(qiov->size == acb->aio_nbytes); + assert(qiov->size == count); } - trace_paio_submit_co(sector_num, nb_sectors, type); + trace_paio_submit_co(offset, count, type); pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); return thread_pool_submit_co(pool, aio_worker, acb); } @@ -1868,17 +1892,17 @@ static coroutine_fn BlockAIOCB *raw_aio_discard(BlockDriverState *bs, cb, opaque, QEMU_AIO_DISCARD); } -static int coroutine_fn raw_co_write_zeroes( - BlockDriverState *bs, int64_t sector_num, - int nb_sectors, BdrvRequestFlags flags) +static int coroutine_fn raw_co_pwrite_zeroes( + BlockDriverState *bs, int64_t offset, + int count, BdrvRequestFlags flags) { BDRVRawState *s = bs->opaque; if (!(flags & BDRV_REQ_MAY_UNMAP)) { - return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors, + return paio_submit_co(bs, s->fd, offset, NULL, count, QEMU_AIO_WRITE_ZEROES); } else if (s->discard_zeroes) { - return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors, + return paio_submit_co(bs, s->fd, offset, NULL, count, QEMU_AIO_DISCARD); } return -ENOTSUP; @@ -1931,7 +1955,7 @@ BlockDriver bdrv_file = { .bdrv_create = raw_create, .bdrv_has_zero_init = bdrv_has_zero_init_1, .bdrv_co_get_block_status = raw_co_get_block_status, - .bdrv_co_write_zeroes = raw_co_write_zeroes, + .bdrv_co_pwrite_zeroes = raw_co_pwrite_zeroes, .bdrv_aio_readv = raw_aio_readv, .bdrv_aio_writev = raw_aio_writev, @@ -2293,8 +2317,8 @@ static coroutine_fn BlockAIOCB *hdev_aio_discard(BlockDriverState *bs, cb, opaque, QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV); } -static coroutine_fn int hdev_co_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) +static coroutine_fn int hdev_co_pwrite_zeroes(BlockDriverState *bs, + int64_t offset, int count, BdrvRequestFlags flags) { BDRVRawState *s = bs->opaque; int rc; @@ -2304,10 +2328,10 @@ static coroutine_fn int hdev_co_write_zeroes(BlockDriverState *bs, return rc; } if (!(flags & BDRV_REQ_MAY_UNMAP)) { - return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors, + return paio_submit_co(bs, s->fd, offset, NULL, count, QEMU_AIO_WRITE_ZEROES|QEMU_AIO_BLKDEV); } else if (s->discard_zeroes) { - return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors, + return paio_submit_co(bs, s->fd, offset, NULL, count, QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV); } return -ENOTSUP; @@ -2379,7 +2403,7 @@ static BlockDriver bdrv_host_device = { .bdrv_reopen_abort = raw_reopen_abort, .bdrv_create = hdev_create, .create_opts = &raw_create_opts, - .bdrv_co_write_zeroes = hdev_co_write_zeroes, + .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes, .bdrv_aio_readv = raw_aio_readv, .bdrv_aio_writev = raw_aio_writev, diff --git a/block/raw_bsd.c b/block/raw_bsd.c index 3385ed448d..b1d5237135 100644 --- a/block/raw_bsd.c +++ b/block/raw_bsd.c @@ -127,11 +127,11 @@ static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs, (sector_num << BDRV_SECTOR_BITS); } -static int coroutine_fn raw_co_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - BdrvRequestFlags flags) +static int coroutine_fn raw_co_pwrite_zeroes(BlockDriverState *bs, + int64_t offset, int count, + BdrvRequestFlags flags) { - return bdrv_co_write_zeroes(bs->file->bs, sector_num, nb_sectors, flags); + return bdrv_co_pwrite_zeroes(bs->file->bs, offset, count, flags); } static int coroutine_fn raw_co_discard(BlockDriverState *bs, @@ -252,7 +252,7 @@ BlockDriver bdrv_raw = { .bdrv_create = &raw_create, .bdrv_co_readv = &raw_co_readv, .bdrv_co_writev_flags = &raw_co_writev_flags, - .bdrv_co_write_zeroes = &raw_co_write_zeroes, + .bdrv_co_pwrite_zeroes = &raw_co_pwrite_zeroes, .bdrv_co_discard = &raw_co_discard, .bdrv_co_get_block_status = &raw_co_get_block_status, .bdrv_truncate = &raw_truncate, diff --git a/block/snapshot.c b/block/snapshot.c index 6e6e34fcf4..da89d2b085 100644 --- a/block/snapshot.c +++ b/block/snapshot.c @@ -409,9 +409,6 @@ int bdrv_all_delete_snapshot(const char *name, BlockDriverState **first_bad_bs, if (bdrv_can_snapshot(bs) && bdrv_snapshot_find(bs, snapshot, name) >= 0) { ret = bdrv_snapshot_delete_by_id_or_name(bs, name, err); - if (ret < 0) { - goto fail; - } } aio_context_release(ctx); if (ret < 0) { diff --git a/block/vmdk.c b/block/vmdk.c index 2205cc888f..ee09423b46 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -997,9 +997,9 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp) for (i = 0; i < s->num_extents; i++) { if (!s->extents[i].flat) { - bs->bl.write_zeroes_alignment = - MAX(bs->bl.write_zeroes_alignment, - s->extents[i].cluster_sectors); + bs->bl.pwrite_zeroes_alignment = + MAX(bs->bl.pwrite_zeroes_alignment, + s->extents[i].cluster_sectors << BDRV_SECTOR_BITS); } } } @@ -1703,15 +1703,13 @@ static int vmdk_write_compressed(BlockDriverState *bs, } } -static int coroutine_fn vmdk_co_write_zeroes(BlockDriverState *bs, - int64_t sector_num, - int nb_sectors, - BdrvRequestFlags flags) +static int coroutine_fn vmdk_co_pwrite_zeroes(BlockDriverState *bs, + int64_t offset, + int bytes, + BdrvRequestFlags flags) { int ret; BDRVVmdkState *s = bs->opaque; - uint64_t offset = sector_num * BDRV_SECTOR_SIZE; - uint64_t bytes = nb_sectors * BDRV_SECTOR_SIZE; qemu_co_mutex_lock(&s->lock); /* write zeroes could fail if sectors not aligned to cluster, test it with @@ -2402,7 +2400,7 @@ static BlockDriver bdrv_vmdk = { .bdrv_co_preadv = vmdk_co_preadv, .bdrv_co_pwritev = vmdk_co_pwritev, .bdrv_write_compressed = vmdk_write_compressed, - .bdrv_co_write_zeroes = vmdk_co_write_zeroes, + .bdrv_co_pwrite_zeroes = vmdk_co_pwrite_zeroes, .bdrv_close = vmdk_close, .bdrv_create = vmdk_create, .bdrv_co_flush_to_disk = vmdk_co_flush, diff --git a/blockdev.c b/blockdev.c index 6ccb8e1f84..7fd515a4fa 100644 --- a/blockdev.c +++ b/blockdev.c @@ -56,6 +56,8 @@ static QTAILQ_HEAD(, BlockDriverState) monitor_bdrv_states = QTAILQ_HEAD_INITIALIZER(monitor_bdrv_states); +static int do_open_tray(const char *device, bool force, Error **errp); + static const char *const if_name[IF_COUNT] = { [IF_NONE] = "none", [IF_IDE] = "ide", @@ -2274,8 +2276,6 @@ exit: block_job_txn_unref(block_job_txn); } -static int do_open_tray(const char *device, bool force, Error **errp); - void qmp_eject(const char *device, bool has_force, bool force, Error **errp) { Error *local_err = NULL; @@ -2286,16 +2286,11 @@ void qmp_eject(const char *device, bool has_force, bool force, Error **errp) } rc = do_open_tray(device, force, &local_err); - if (local_err) { + if (rc && rc != -ENOSYS) { error_propagate(errp, local_err); return; } - - if (rc == EINPROGRESS) { - error_setg(errp, "Device '%s' is locked and force was not specified, " - "wait for tray to open and try again", device); - return; - } + error_free(local_err); qmp_x_blockdev_remove_medium(device, errp); } @@ -2324,11 +2319,16 @@ void qmp_block_passwd(bool has_device, const char *device, aio_context_release(aio_context); } -/** - * returns -errno on fatal error, +errno for non-fatal situations. - * errp will always be set when the return code is negative. - * May return +ENOSYS if the device has no tray, - * or +EINPROGRESS if the tray is locked and the guest has been notified. +/* + * Attempt to open the tray of @device. + * If @force, ignore its tray lock. + * Else, if the tray is locked, don't open it, but ask the guest to open it. + * On error, store an error through @errp and return -errno. + * If @device does not exist, return -ENODEV. + * If it has no removable media, return -ENOTSUP. + * If it has no tray, return -ENOSYS. + * If the guest was asked to open the tray, return -EINPROGRESS. + * Else, return 0. */ static int do_open_tray(const char *device, bool force, Error **errp) { @@ -2348,8 +2348,8 @@ static int do_open_tray(const char *device, bool force, Error **errp) } if (!blk_dev_has_tray(blk)) { - /* Ignore this command on tray-less devices */ - return ENOSYS; + error_setg(errp, "Device '%s' does not have a tray", device); + return -ENOSYS; } if (blk_dev_is_tray_open(blk)) { @@ -2366,7 +2366,9 @@ static int do_open_tray(const char *device, bool force, Error **errp) } if (locked && !force) { - return EINPROGRESS; + error_setg(errp, "Device '%s' is locked and force was not specified, " + "wait for tray to open and try again", device); + return -EINPROGRESS; } return 0; @@ -2375,10 +2377,18 @@ static int do_open_tray(const char *device, bool force, Error **errp) void qmp_blockdev_open_tray(const char *device, bool has_force, bool force, Error **errp) { + Error *local_err = NULL; + int rc; + if (!has_force) { force = false; } - do_open_tray(device, force, errp); + rc = do_open_tray(device, force, &local_err); + if (rc && rc != -ENOSYS && rc != -EINPROGRESS) { + error_propagate(errp, local_err); + return; + } + error_free(local_err); } void qmp_blockdev_close_tray(const char *device, Error **errp) diff --git a/include/block/block.h b/include/block/block.h index 3fd5043d01..54cca28bac 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -33,7 +33,7 @@ typedef struct BlockDriverInfo { * True if the driver can optimize writing zeroes by unmapping * sectors. This is equivalent to the BLKDISCARDZEROES ioctl in Linux * with the difference that in qemu a discard is allowed to silently - * fail. Therefore we have to use bdrv_write_zeroes with the + * fail. Therefore we have to use bdrv_pwrite_zeroes with the * BDRV_REQ_MAY_UNMAP flag for an optimized zero write with unmapping. * After this call the driver has to guarantee that the contents read * back as zero. It is additionally required that the block device is @@ -227,8 +227,8 @@ int bdrv_read(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, int nb_sectors); int bdrv_write(BlockDriverState *bs, int64_t sector_num, const uint8_t *buf, int nb_sectors); -int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, BdrvRequestFlags flags); +int bdrv_pwrite_zeroes(BlockDriverState *bs, int64_t offset, + int count, BdrvRequestFlags flags); int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags); int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int count); @@ -247,8 +247,8 @@ int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, * function is not suitable for zeroing the entire image in a single request * because it may allocate memory for the entire region. */ -int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, BdrvRequestFlags flags); +int coroutine_fn bdrv_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, + int count, BdrvRequestFlags flags); BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs, const char *backing_file); int bdrv_get_backing_file_depth(BlockDriverState *bs); diff --git a/include/block/block_int.h b/include/block/block_int.h index 30a97178c8..8a4963c4fe 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -163,8 +163,8 @@ struct BlockDriver { * function pointer may be NULL or return -ENOSUP and .bdrv_co_writev() * will be called instead. */ - int coroutine_fn (*bdrv_co_write_zeroes)(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, BdrvRequestFlags flags); + int coroutine_fn (*bdrv_co_pwrite_zeroes)(BlockDriverState *bs, + int64_t offset, int count, BdrvRequestFlags flags); int coroutine_fn (*bdrv_co_discard)(BlockDriverState *bs, int64_t sector_num, int nb_sectors); int64_t coroutine_fn (*bdrv_co_get_block_status)(BlockDriverState *bs, @@ -328,11 +328,13 @@ typedef struct BlockLimits { /* optimal alignment for discard requests in sectors */ int64_t discard_alignment; - /* maximum number of sectors that can zeroized at once */ - int max_write_zeroes; + /* maximum number of bytes that can zeroized at once (since it is + * signed, it must be < 2G, if set) */ + int32_t max_pwrite_zeroes; - /* optimal alignment for write zeroes requests in sectors */ - int64_t write_zeroes_alignment; + /* optimal alignment for write zeroes requests in bytes, must be + * power of 2, and less than max_pwrite_zeroes if that is set */ + uint32_t pwrite_zeroes_alignment; /* optimal transfer length in sectors */ int opt_transfer_length; @@ -454,7 +456,7 @@ struct BlockDriverState { unsigned int request_alignment; /* Flags honored during pwrite (so far: BDRV_REQ_FUA) */ unsigned int supported_write_flags; - /* Flags honored during write_zeroes (so far: BDRV_REQ_FUA, + /* Flags honored during pwrite_zeroes (so far: BDRV_REQ_FUA, * BDRV_REQ_MAY_UNMAP) */ unsigned int supported_zero_flags; diff --git a/migration/block.c b/migration/block.c index e0628d187f..ebc10e628d 100644 --- a/migration/block.c +++ b/migration/block.c @@ -52,7 +52,8 @@ typedef struct BlkMigDevState { /* Written during setup phase. Can be read without a lock. */ - BlockDriverState *bs; + BlockBackend *blk; + char *blk_name; int shared_base; int64_t total_sectors; QSIMPLEQ_ENTRY(BlkMigDevState) entry; @@ -145,9 +146,9 @@ static void blk_send(QEMUFile *f, BlkMigBlock * blk) | flags); /* device name */ - len = strlen(bdrv_get_device_name(blk->bmds->bs)); + len = strlen(blk->bmds->blk_name); qemu_put_byte(f, len); - qemu_put_buffer(f, (uint8_t *)bdrv_get_device_name(blk->bmds->bs), len); + qemu_put_buffer(f, (uint8_t *) blk->bmds->blk_name, len); /* if a block is zero we need to flush here since the network * bandwidth is now a lot higher than the storage device bandwidth. @@ -201,7 +202,7 @@ static int bmds_aio_inflight(BlkMigDevState *bmds, int64_t sector) { int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK; - if (sector < bdrv_nb_sectors(bmds->bs)) { + if (sector < blk_nb_sectors(bmds->blk)) { return !!(bmds->aio_bitmap[chunk / (sizeof(unsigned long) * 8)] & (1UL << (chunk % (sizeof(unsigned long) * 8)))); } else { @@ -235,10 +236,10 @@ static void bmds_set_aio_inflight(BlkMigDevState *bmds, int64_t sector_num, static void alloc_aio_bitmap(BlkMigDevState *bmds) { - BlockDriverState *bs = bmds->bs; + BlockBackend *bb = bmds->blk; int64_t bitmap_size; - bitmap_size = bdrv_nb_sectors(bs) + BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1; + bitmap_size = blk_nb_sectors(bb) + BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1; bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8; bmds->aio_bitmap = g_malloc0(bitmap_size); @@ -268,19 +269,19 @@ static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds) { int64_t total_sectors = bmds->total_sectors; int64_t cur_sector = bmds->cur_sector; - BlockDriverState *bs = bmds->bs; + BlockBackend *bb = bmds->blk; BlkMigBlock *blk; int nr_sectors; if (bmds->shared_base) { qemu_mutex_lock_iothread(); - aio_context_acquire(bdrv_get_aio_context(bs)); + aio_context_acquire(blk_get_aio_context(bb)); while (cur_sector < total_sectors && - !bdrv_is_allocated(bs, cur_sector, MAX_IS_ALLOCATED_SEARCH, - &nr_sectors)) { + !bdrv_is_allocated(blk_bs(bb), cur_sector, + MAX_IS_ALLOCATED_SEARCH, &nr_sectors)) { cur_sector += nr_sectors; } - aio_context_release(bdrv_get_aio_context(bs)); + aio_context_release(blk_get_aio_context(bb)); qemu_mutex_unlock_iothread(); } @@ -323,12 +324,12 @@ static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds) * without the need to acquire the AioContext. */ qemu_mutex_lock_iothread(); - aio_context_acquire(bdrv_get_aio_context(bmds->bs)); - blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov, - nr_sectors, blk_mig_read_cb, blk); + aio_context_acquire(blk_get_aio_context(bmds->blk)); + blk->aiocb = blk_aio_preadv(bb, cur_sector * BDRV_SECTOR_SIZE, &blk->qiov, + 0, blk_mig_read_cb, blk); bdrv_reset_dirty_bitmap(bmds->dirty_bitmap, cur_sector, nr_sectors); - aio_context_release(bdrv_get_aio_context(bmds->bs)); + aio_context_release(blk_get_aio_context(bmds->blk)); qemu_mutex_unlock_iothread(); bmds->cur_sector = cur_sector + nr_sectors; @@ -343,10 +344,10 @@ static int set_dirty_tracking(void) int ret; QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { - aio_context_acquire(bdrv_get_aio_context(bmds->bs)); - bmds->dirty_bitmap = bdrv_create_dirty_bitmap(bmds->bs, BLOCK_SIZE, - NULL, NULL); - aio_context_release(bdrv_get_aio_context(bmds->bs)); + aio_context_acquire(blk_get_aio_context(bmds->blk)); + bmds->dirty_bitmap = bdrv_create_dirty_bitmap(blk_bs(bmds->blk), + BLOCK_SIZE, NULL, NULL); + aio_context_release(blk_get_aio_context(bmds->blk)); if (!bmds->dirty_bitmap) { ret = -errno; goto fail; @@ -357,9 +358,9 @@ static int set_dirty_tracking(void) fail: QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { if (bmds->dirty_bitmap) { - aio_context_acquire(bdrv_get_aio_context(bmds->bs)); - bdrv_release_dirty_bitmap(bmds->bs, bmds->dirty_bitmap); - aio_context_release(bdrv_get_aio_context(bmds->bs)); + aio_context_acquire(blk_get_aio_context(bmds->blk)); + bdrv_release_dirty_bitmap(blk_bs(bmds->blk), bmds->dirty_bitmap); + aio_context_release(blk_get_aio_context(bmds->blk)); } } return ret; @@ -372,9 +373,9 @@ static void unset_dirty_tracking(void) BlkMigDevState *bmds; QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { - aio_context_acquire(bdrv_get_aio_context(bmds->bs)); - bdrv_release_dirty_bitmap(bmds->bs, bmds->dirty_bitmap); - aio_context_release(bdrv_get_aio_context(bmds->bs)); + aio_context_acquire(blk_get_aio_context(bmds->blk)); + bdrv_release_dirty_bitmap(blk_bs(bmds->blk), bmds->dirty_bitmap); + aio_context_release(blk_get_aio_context(bmds->blk)); } } @@ -384,6 +385,11 @@ static void init_blk_migration(QEMUFile *f) BlkMigDevState *bmds; int64_t sectors; BdrvNextIterator it; + int i, num_bs = 0; + struct { + BlkMigDevState *bmds; + BlockDriverState *bs; + } *bmds_bs; block_mig_state.submitted = 0; block_mig_state.read_done = 0; @@ -393,27 +399,32 @@ static void init_blk_migration(QEMUFile *f) block_mig_state.bulk_completed = 0; block_mig_state.zero_blocks = migrate_zero_blocks(); - for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { + num_bs++; + } + bmds_bs = g_malloc0(num_bs * sizeof(*bmds_bs)); + + for (i = 0, bs = bdrv_first(&it); bs; bs = bdrv_next(&it), i++) { if (bdrv_is_read_only(bs)) { continue; } sectors = bdrv_nb_sectors(bs); if (sectors <= 0) { - return; + goto out; } bmds = g_new0(BlkMigDevState, 1); - bmds->bs = bs; + bmds->blk = blk_new(); + bmds->blk_name = g_strdup(bdrv_get_device_name(bs)); bmds->bulk_completed = 0; bmds->total_sectors = sectors; bmds->completed_sectors = 0; bmds->shared_base = block_mig_state.shared_base; - alloc_aio_bitmap(bmds); - error_setg(&bmds->blocker, "block device is in use by migration"); - bdrv_op_block_all(bs, bmds->blocker); - bdrv_ref(bs); + + assert(i < num_bs); + bmds_bs[i].bmds = bmds; + bmds_bs[i].bs = bs; block_mig_state.total_sector_sum += sectors; @@ -426,6 +437,24 @@ static void init_blk_migration(QEMUFile *f) QSIMPLEQ_INSERT_TAIL(&block_mig_state.bmds_list, bmds, entry); } + + /* Can only insert new BDSes now because doing so while iterating block + * devices may end up in a deadlock (iterating the new BDSes, too). */ + for (i = 0; i < num_bs; i++) { + BlkMigDevState *bmds = bmds_bs[i].bmds; + BlockDriverState *bs = bmds_bs[i].bs; + + if (bmds) { + blk_insert_bs(bmds->blk, bs); + + alloc_aio_bitmap(bmds); + error_setg(&bmds->blocker, "block device is in use by migration"); + bdrv_op_block_all(bs, bmds->blocker); + } + } + +out: + g_free(bmds_bs); } /* Called with no lock taken. */ @@ -482,6 +511,7 @@ static int mig_save_device_dirty(QEMUFile *f, BlkMigDevState *bmds, int is_async) { BlkMigBlock *blk; + BlockDriverState *bs = blk_bs(bmds->blk); int64_t total_sectors = bmds->total_sectors; int64_t sector; int nr_sectors; @@ -491,11 +521,11 @@ static int mig_save_device_dirty(QEMUFile *f, BlkMigDevState *bmds, blk_mig_lock(); if (bmds_aio_inflight(bmds, sector)) { blk_mig_unlock(); - bdrv_drain(bmds->bs); + blk_drain(bmds->blk); } else { blk_mig_unlock(); } - if (bdrv_get_dirty(bmds->bs, bmds->dirty_bitmap, sector)) { + if (bdrv_get_dirty(bs, bmds->dirty_bitmap, sector)) { if (total_sectors - sector < BDRV_SECTORS_PER_DIRTY_CHUNK) { nr_sectors = total_sectors - sector; @@ -513,15 +543,18 @@ static int mig_save_device_dirty(QEMUFile *f, BlkMigDevState *bmds, blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE; qemu_iovec_init_external(&blk->qiov, &blk->iov, 1); - blk->aiocb = bdrv_aio_readv(bmds->bs, sector, &blk->qiov, - nr_sectors, blk_mig_read_cb, blk); + blk->aiocb = blk_aio_preadv(bmds->blk, + sector * BDRV_SECTOR_SIZE, + &blk->qiov, 0, blk_mig_read_cb, + blk); blk_mig_lock(); block_mig_state.submitted++; bmds_set_aio_inflight(bmds, sector, nr_sectors, 1); blk_mig_unlock(); } else { - ret = bdrv_read(bmds->bs, sector, blk->buf, nr_sectors); + ret = blk_pread(bmds->blk, sector * BDRV_SECTOR_SIZE, blk->buf, + nr_sectors * BDRV_SECTOR_SIZE); if (ret < 0) { goto error; } @@ -559,9 +592,9 @@ static int blk_mig_save_dirty_block(QEMUFile *f, int is_async) int ret = 1; QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { - aio_context_acquire(bdrv_get_aio_context(bmds->bs)); + aio_context_acquire(blk_get_aio_context(bmds->blk)); ret = mig_save_device_dirty(f, bmds, is_async); - aio_context_release(bdrv_get_aio_context(bmds->bs)); + aio_context_release(blk_get_aio_context(bmds->blk)); if (ret <= 0) { break; } @@ -619,9 +652,9 @@ static int64_t get_remaining_dirty(void) int64_t dirty = 0; QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { - aio_context_acquire(bdrv_get_aio_context(bmds->bs)); + aio_context_acquire(blk_get_aio_context(bmds->blk)); dirty += bdrv_get_dirty_count(bmds->dirty_bitmap); - aio_context_release(bdrv_get_aio_context(bmds->bs)); + aio_context_release(blk_get_aio_context(bmds->blk)); } return dirty << BDRV_SECTOR_BITS; @@ -641,15 +674,16 @@ static void block_migration_cleanup(void *opaque) while ((bmds = QSIMPLEQ_FIRST(&block_mig_state.bmds_list)) != NULL) { QSIMPLEQ_REMOVE_HEAD(&block_mig_state.bmds_list, entry); - bdrv_op_unblock_all(bmds->bs, bmds->blocker); + bdrv_op_unblock_all(blk_bs(bmds->blk), bmds->blocker); error_free(bmds->blocker); - /* Save ctx, because bmds->bs can disappear during bdrv_unref. */ - ctx = bdrv_get_aio_context(bmds->bs); + /* Save ctx, because bmds->blk can disappear during blk_unref. */ + ctx = blk_get_aio_context(bmds->blk); aio_context_acquire(ctx); - bdrv_unref(bmds->bs); + blk_unref(bmds->blk); aio_context_release(ctx); + g_free(bmds->blk_name); g_free(bmds->aio_bitmap); g_free(bmds); } @@ -827,8 +861,7 @@ static int block_load(QEMUFile *f, void *opaque, int version_id) int len, flags; char device_name[256]; int64_t addr; - BlockDriverState *bs, *bs_prev = NULL; - BlockBackend *blk; + BlockBackend *blk, *blk_prev = NULL;; Error *local_err = NULL; uint8_t *buf; int64_t total_sectors = 0; @@ -853,23 +886,17 @@ static int block_load(QEMUFile *f, void *opaque, int version_id) device_name); return -EINVAL; } - bs = blk_bs(blk); - if (!bs) { - fprintf(stderr, "Block device %s has no medium\n", - device_name); - return -EINVAL; - } - if (bs != bs_prev) { - bs_prev = bs; - total_sectors = bdrv_nb_sectors(bs); + if (blk != blk_prev) { + blk_prev = blk; + total_sectors = blk_nb_sectors(blk); if (total_sectors <= 0) { error_report("Error getting length of block device %s", device_name); return -EINVAL; } - bdrv_invalidate_cache(bs, &local_err); + blk_invalidate_cache(blk, &local_err); if (local_err) { error_report_err(local_err); return -EINVAL; @@ -883,12 +910,14 @@ static int block_load(QEMUFile *f, void *opaque, int version_id) } if (flags & BLK_MIG_FLAG_ZERO_BLOCK) { - ret = bdrv_write_zeroes(bs, addr, nr_sectors, + ret = blk_pwrite_zeroes(blk, addr * BDRV_SECTOR_SIZE, + nr_sectors * BDRV_SECTOR_SIZE, BDRV_REQ_MAY_UNMAP); } else { buf = g_malloc(BLOCK_SIZE); qemu_get_buffer(f, buf, BLOCK_SIZE); - ret = bdrv_write(bs, addr, buf, nr_sectors); + ret = blk_pwrite(blk, addr * BDRV_SECTOR_SIZE, buf, + nr_sectors * BDRV_SECTOR_SIZE, 0); g_free(buf); } diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx index e7cded6e24..7e95b2da79 100644 --- a/qemu-img-cmds.hx +++ b/qemu-img-cmds.hx @@ -9,6 +9,12 @@ STEXI @table @option ETEXI +DEF("bench", img_bench, + "bench [-c count] [-d depth] [-f fmt] [--flush-interval=flush_interval] [-n] [--no-drain] [-o offset] [--pattern=pattern] [-q] [-s buffer_size] [-S step_size] [-t cache] [-w] filename") +STEXI +@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [--flush-interval=@var{flush_interval}] [-n] [--no-drain] [-o @var{offset}] [--pattern=@var{pattern}] [-q] [-s @var{buffer_size}] [-S @var{step_size}] [-t @var{cache}] [-w] @var{filename} +ETEXI + DEF("check", img_check, "check [-q] [--object objectdef] [--image-opts] [-f fmt] [--output=ofmt] [-r [leaks | all]] [-T src_cache] filename") STEXI diff --git a/qemu-img.c b/qemu-img.c index 32e307c075..251386b49d 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -54,6 +54,9 @@ enum { OPTION_BACKING_CHAIN = 257, OPTION_OBJECT = 258, OPTION_IMAGE_OPTS = 259, + OPTION_PATTERN = 260, + OPTION_FLUSH_INTERVAL = 261, + OPTION_NO_DRAIN = 262, }; typedef enum OutputFormat { @@ -3460,6 +3463,332 @@ out_no_progress: return 0; } +typedef struct BenchData { + BlockBackend *blk; + uint64_t image_size; + bool write; + int bufsize; + int step; + int nrreq; + int n; + int flush_interval; + bool drain_on_flush; + uint8_t *buf; + QEMUIOVector *qiov; + + int in_flight; + bool in_flush; + uint64_t offset; +} BenchData; + +static void bench_undrained_flush_cb(void *opaque, int ret) +{ + if (ret < 0) { + error_report("Failed flush request: %s\n", strerror(-ret)); + exit(EXIT_FAILURE); + } +} + +static void bench_cb(void *opaque, int ret) +{ + BenchData *b = opaque; + BlockAIOCB *acb; + + if (ret < 0) { + error_report("Failed request: %s\n", strerror(-ret)); + exit(EXIT_FAILURE); + } + + if (b->in_flush) { + /* Just finished a flush with drained queue: Start next requests */ + assert(b->in_flight == 0); + b->in_flush = false; + } else if (b->in_flight > 0) { + int remaining = b->n - b->in_flight; + + b->n--; + b->in_flight--; + + /* Time for flush? Drain queue if requested, then flush */ + if (b->flush_interval && remaining % b->flush_interval == 0) { + if (!b->in_flight || !b->drain_on_flush) { + BlockCompletionFunc *cb; + + if (b->drain_on_flush) { + b->in_flush = true; + cb = bench_cb; + } else { + cb = bench_undrained_flush_cb; + } + + acb = blk_aio_flush(b->blk, cb, b); + if (!acb) { + error_report("Failed to issue flush request"); + exit(EXIT_FAILURE); + } + } + if (b->drain_on_flush) { + return; + } + } + } + + while (b->n > b->in_flight && b->in_flight < b->nrreq) { + if (b->write) { + acb = blk_aio_pwritev(b->blk, b->offset, b->qiov, 0, + bench_cb, b); + } else { + acb = blk_aio_preadv(b->blk, b->offset, b->qiov, 0, + bench_cb, b); + } + if (!acb) { + error_report("Failed to issue request"); + exit(EXIT_FAILURE); + } + b->in_flight++; + b->offset += b->step; + b->offset %= b->image_size; + } +} + +static int img_bench(int argc, char **argv) +{ + int c, ret = 0; + const char *fmt = NULL, *filename; + bool quiet = false; + bool image_opts = false; + bool is_write = false; + int count = 75000; + int depth = 64; + int64_t offset = 0; + size_t bufsize = 4096; + int pattern = 0; + size_t step = 0; + int flush_interval = 0; + bool drain_on_flush = true; + int64_t image_size; + BlockBackend *blk = NULL; + BenchData data = {}; + int flags = 0; + bool writethrough; + struct timeval t1, t2; + int i; + + for (;;) { + static const struct option long_options[] = { + {"help", no_argument, 0, 'h'}, + {"flush-interval", required_argument, 0, OPTION_FLUSH_INTERVAL}, + {"image-opts", no_argument, 0, OPTION_IMAGE_OPTS}, + {"pattern", required_argument, 0, OPTION_PATTERN}, + {"no-drain", no_argument, 0, OPTION_NO_DRAIN}, + {0, 0, 0, 0} + }; + c = getopt_long(argc, argv, "hc:d:f:no:qs:S:t:w", long_options, NULL); + if (c == -1) { + break; + } + + switch (c) { + case 'h': + case '?': + help(); + break; + case 'c': + { + char *end; + errno = 0; + count = strtoul(optarg, &end, 0); + if (errno || *end || count > INT_MAX) { + error_report("Invalid request count specified"); + return 1; + } + break; + } + case 'd': + { + char *end; + errno = 0; + depth = strtoul(optarg, &end, 0); + if (errno || *end || depth > INT_MAX) { + error_report("Invalid queue depth specified"); + return 1; + } + break; + } + case 'f': + fmt = optarg; + break; + case 'n': + flags |= BDRV_O_NATIVE_AIO; + break; + case 'o': + { + char *end; + errno = 0; + offset = qemu_strtosz_suffix(optarg, &end, + QEMU_STRTOSZ_DEFSUFFIX_B); + if (offset < 0|| *end) { + error_report("Invalid offset specified"); + return 1; + } + break; + } + break; + case 'q': + quiet = true; + break; + case 's': + { + int64_t sval; + char *end; + + sval = qemu_strtosz_suffix(optarg, &end, QEMU_STRTOSZ_DEFSUFFIX_B); + if (sval < 0 || sval > INT_MAX || *end) { + error_report("Invalid buffer size specified"); + return 1; + } + + bufsize = sval; + break; + } + case 'S': + { + int64_t sval; + char *end; + + sval = qemu_strtosz_suffix(optarg, &end, QEMU_STRTOSZ_DEFSUFFIX_B); + if (sval < 0 || sval > INT_MAX || *end) { + error_report("Invalid step size specified"); + return 1; + } + + step = sval; + break; + } + case 't': + ret = bdrv_parse_cache_mode(optarg, &flags, &writethrough); + if (ret < 0) { + error_report("Invalid cache mode"); + ret = -1; + goto out; + } + break; + case 'w': + flags |= BDRV_O_RDWR; + is_write = true; + break; + case OPTION_PATTERN: + { + char *end; + errno = 0; + pattern = strtoul(optarg, &end, 0); + if (errno || *end || pattern > 0xff) { + error_report("Invalid pattern byte specified"); + return 1; + } + break; + } + case OPTION_FLUSH_INTERVAL: + { + char *end; + errno = 0; + flush_interval = strtoul(optarg, &end, 0); + if (errno || *end || flush_interval > INT_MAX) { + error_report("Invalid flush interval specified"); + return 1; + } + break; + } + case OPTION_NO_DRAIN: + drain_on_flush = false; + break; + case OPTION_IMAGE_OPTS: + image_opts = true; + break; + } + } + + if (optind != argc - 1) { + error_exit("Expecting one image file name"); + } + filename = argv[argc - 1]; + + if (!is_write && flush_interval) { + error_report("--flush-interval is only available in write tests"); + ret = -1; + goto out; + } + if (flush_interval && flush_interval < depth) { + error_report("Flush interval can't be smaller than depth"); + ret = -1; + goto out; + } + + blk = img_open(image_opts, filename, fmt, flags, writethrough, quiet); + if (!blk) { + ret = -1; + goto out; + } + + image_size = blk_getlength(blk); + if (image_size < 0) { + ret = image_size; + goto out; + } + + data = (BenchData) { + .blk = blk, + .image_size = image_size, + .bufsize = bufsize, + .step = step ?: bufsize, + .nrreq = depth, + .n = count, + .offset = offset, + .write = is_write, + .flush_interval = flush_interval, + .drain_on_flush = drain_on_flush, + }; + printf("Sending %d %s requests, %d bytes each, %d in parallel " + "(starting at offset %" PRId64 ", step size %d)\n", + data.n, data.write ? "write" : "read", data.bufsize, data.nrreq, + data.offset, data.step); + if (flush_interval) { + printf("Sending flush every %d requests\n", flush_interval); + } + + data.buf = blk_blockalign(blk, data.nrreq * data.bufsize); + memset(data.buf, pattern, data.nrreq * data.bufsize); + + data.qiov = g_new(QEMUIOVector, data.nrreq); + for (i = 0; i < data.nrreq; i++) { + qemu_iovec_init(&data.qiov[i], 1); + qemu_iovec_add(&data.qiov[i], + data.buf + i * data.bufsize, data.bufsize); + } + + gettimeofday(&t1, NULL); + bench_cb(&data, 0); + + while (data.n > 0) { + main_loop_wait(false); + } + gettimeofday(&t2, NULL); + + printf("Run completed in %3.3f seconds.\n", + (t2.tv_sec - t1.tv_sec) + + ((double)(t2.tv_usec - t1.tv_usec) / 1000000)); + +out: + qemu_vfree(data.buf); + blk_unref(blk); + + if (ret) { + return 1; + } + return 0; +} + + static const img_cmd_t img_cmds[] = { #define DEF(option, callback, arg_string) \ { option, callback }, diff --git a/qemu-img.texi b/qemu-img.texi index afaebdd408..cbe50e9b88 100644 --- a/qemu-img.texi +++ b/qemu-img.texi @@ -131,6 +131,30 @@ Skip the creation of the target volume Command description: @table @option +@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [--flush-interval=@var{flush_interval}] [-n] [--no-drain] [-o @var{offset}] [--pattern=@var{pattern}] [-q] [-s @var{buffer_size}] [-S @var{step_size}] [-t @var{cache}] [-w] @var{filename} + +Run a simple sequential I/O benchmark on the specified image. If @code{-w} is +specified, a write test is performed, otherwise a read test is performed. + +A total number of @var{count} I/O requests is performed, each @var{buffer_size} +bytes in size, and with @var{depth} requests in parallel. The first request +starts at the position given by @var{offset}, each following request increases +the current position by @var{step_size}. If @var{step_size} is not given, +@var{buffer_size} is used for its value. + +If @var{flush_interval} is specified for a write test, the request queue is +drained and a flush is issued before new writes are made whenever the number of +remaining requests is a multiple of @var{flush_interval}. If additionally +@code{--no-drain} is specified, a flush is issued without draining the request +queue first. + +If @code{-n} is specified, the native AIO backend is used if possible. On +Linux, this option only works if @code{-t none} or @code{-t directsync} is +specified as well. + +For write tests, by default a buffer filled with zeros is written. This can be +overridden with a pattern byte specified by @var{pattern}. + @item check [-f @var{fmt}] [--output=@var{ofmt}] [-r [leaks | all]] [-T @var{src_cache}] @var{filename} Perform a consistency check on the disk image @var{filename}. The command can diff --git a/tests/qemu-iotests/034 b/tests/qemu-iotests/034 index c711cfce94..1b28bdae63 100755 --- a/tests/qemu-iotests/034 +++ b/tests/qemu-iotests/034 @@ -1,6 +1,6 @@ #!/bin/bash # -# Test bdrv_write_zeroes with backing files +# Test bdrv_pwrite_zeroes with backing files (see also 154) # # Copyright (C) 2012 Red Hat, Inc. # diff --git a/tests/qemu-iotests/154 b/tests/qemu-iotests/154 index 23f1b3ab16..7ca7219f08 100755 --- a/tests/qemu-iotests/154 +++ b/tests/qemu-iotests/154 @@ -1,6 +1,6 @@ #!/bin/bash # -# qcow2 specific bdrv_write_zeroes tests with backing files (complements 034) +# qcow2 specific bdrv_pwrite_zeroes tests with backing files (complements 034) # # Copyright (C) 2016 Red Hat, Inc. # @@ -115,6 +115,46 @@ $QEMU_IO -c "read -P 0 40k 3k" "$TEST_IMG" | _filter_qemu_io $QEMU_IMG map --output=json "$TEST_IMG" | _filter_qemu_img_map echo +echo == write_zeroes covers non-zero data == + +CLUSTER_SIZE=512 TEST_IMG="$TEST_IMG.base" _make_test_img $size +_make_test_img -b "$TEST_IMG.base" + +# non-zero data at front of request +# Backing file: -- XX -- -- +# Active layer: -- 00 00 -- + +$QEMU_IO -c "write -P 0x11 5k 1k" "$TEST_IMG.base" | _filter_qemu_io +$QEMU_IO -c "write -z 5k 2k" "$TEST_IMG" | _filter_qemu_io +$QEMU_IO -c "read -P 0 4k 4k" "$TEST_IMG" | _filter_qemu_io + +# non-zero data at end of request +# Backing file: -- -- XX -- +# Active layer: -- 00 00 -- + +$QEMU_IO -c "write -P 0x11 14k 1k" "$TEST_IMG.base" | _filter_qemu_io +$QEMU_IO -c "write -z 13k 2k" "$TEST_IMG" | _filter_qemu_io +$QEMU_IO -c "read -P 0 12k 4k" "$TEST_IMG" | _filter_qemu_io + +# non-zero data matches size of request +# Backing file: -- XX XX -- +# Active layer: -- 00 00 -- + +$QEMU_IO -c "write -P 0x11 21k 2k" "$TEST_IMG.base" | _filter_qemu_io +$QEMU_IO -c "write -z 21k 2k" "$TEST_IMG" | _filter_qemu_io +$QEMU_IO -c "read -P 0 20k 4k" "$TEST_IMG" | _filter_qemu_io + +# non-zero data smaller than request +# Backing file: -- -X X- -- +# Active layer: -- 00 00 -- + +$QEMU_IO -c "write -P 0x11 30208 1k" "$TEST_IMG.base" | _filter_qemu_io +$QEMU_IO -c "write -z 29k 2k" "$TEST_IMG" | _filter_qemu_io +$QEMU_IO -c "read -P 0 28k 4k" "$TEST_IMG" | _filter_qemu_io + +$QEMU_IMG map --output=json "$TEST_IMG" | _filter_qemu_img_map + +echo echo == spanning two clusters, non-zero before request == CLUSTER_SIZE=512 TEST_IMG="$TEST_IMG.base" _make_test_img $size diff --git a/tests/qemu-iotests/154.out b/tests/qemu-iotests/154.out index 8946b734ac..da9eabdda8 100644 --- a/tests/qemu-iotests/154.out +++ b/tests/qemu-iotests/154.out @@ -74,6 +74,43 @@ read 3072/3072 bytes at offset 40960 { "start": 40960, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 24576}, { "start": 45056, "length": 134172672, "depth": 1, "zero": true, "data": false}] +== write_zeroes covers non-zero data == +Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=134217728 +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 backing_file=TEST_DIR/t.IMGFMT.base +wrote 1024/1024 bytes at offset 5120 +1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 2048/2048 bytes at offset 5120 +2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 4096/4096 bytes at offset 4096 +4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 1024/1024 bytes at offset 14336 +1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 2048/2048 bytes at offset 13312 +2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 4096/4096 bytes at offset 12288 +4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 2048/2048 bytes at offset 21504 +2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 2048/2048 bytes at offset 21504 +2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 4096/4096 bytes at offset 20480 +4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 1024/1024 bytes at offset 30208 +1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 2048/2048 bytes at offset 29696 +2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 4096/4096 bytes at offset 28672 +4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +[{ "start": 0, "length": 4096, "depth": 1, "zero": true, "data": false}, +{ "start": 4096, "length": 4096, "depth": 0, "zero": true, "data": false}, +{ "start": 8192, "length": 4096, "depth": 1, "zero": true, "data": false}, +{ "start": 12288, "length": 4096, "depth": 0, "zero": true, "data": false}, +{ "start": 16384, "length": 4096, "depth": 1, "zero": true, "data": false}, +{ "start": 20480, "length": 4096, "depth": 0, "zero": true, "data": false}, +{ "start": 24576, "length": 4096, "depth": 1, "zero": true, "data": false}, +{ "start": 28672, "length": 4096, "depth": 0, "zero": true, "data": false}, +{ "start": 32768, "length": 134184960, "depth": 1, "zero": true, "data": false}] + == spanning two clusters, non-zero before request == Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=134217728 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 backing_file=TEST_DIR/t.IMGFMT.base @@ -106,11 +143,14 @@ read 1024/1024 bytes at offset 67584 read 5120/5120 bytes at offset 68608 5 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) [{ "start": 0, "length": 32768, "depth": 1, "zero": true, "data": false}, -{ "start": 32768, "length": 8192, "depth": 0, "zero": false, "data": true, "offset": 20480}, +{ "start": 32768, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 20480}, +{ "start": 36864, "length": 4096, "depth": 0, "zero": true, "data": false}, { "start": 40960, "length": 8192, "depth": 1, "zero": true, "data": false}, -{ "start": 49152, "length": 8192, "depth": 0, "zero": false, "data": true, "offset": 28672}, +{ "start": 49152, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 24576}, +{ "start": 53248, "length": 4096, "depth": 0, "zero": true, "data": false}, { "start": 57344, "length": 8192, "depth": 1, "zero": true, "data": false}, -{ "start": 65536, "length": 8192, "depth": 0, "zero": false, "data": true, "offset": 36864}, +{ "start": 65536, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 28672}, +{ "start": 69632, "length": 4096, "depth": 0, "zero": true, "data": false}, { "start": 73728, "length": 134144000, "depth": 1, "zero": true, "data": false}] == spanning two clusters, non-zero after request == @@ -145,11 +185,14 @@ read 7168/7168 bytes at offset 65536 read 1024/1024 bytes at offset 72704 1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) [{ "start": 0, "length": 32768, "depth": 1, "zero": true, "data": false}, -{ "start": 32768, "length": 8192, "depth": 0, "zero": false, "data": true, "offset": 20480}, +{ "start": 32768, "length": 4096, "depth": 0, "zero": true, "data": false}, +{ "start": 36864, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 20480}, { "start": 40960, "length": 8192, "depth": 1, "zero": true, "data": false}, -{ "start": 49152, "length": 8192, "depth": 0, "zero": false, "data": true, "offset": 28672}, +{ "start": 49152, "length": 4096, "depth": 0, "zero": true, "data": false}, +{ "start": 53248, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 24576}, { "start": 57344, "length": 8192, "depth": 1, "zero": true, "data": false}, -{ "start": 65536, "length": 8192, "depth": 0, "zero": false, "data": true, "offset": 36864}, +{ "start": 65536, "length": 4096, "depth": 0, "zero": true, "data": false}, +{ "start": 69632, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 28672}, { "start": 73728, "length": 134144000, "depth": 1, "zero": true, "data": false}] == spanning two clusters, partially overwriting backing file == diff --git a/trace-events b/trace-events index c55d7084fb..421d89f476 100644 --- a/trace-events +++ b/trace-events @@ -72,7 +72,7 @@ bdrv_aio_readv(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs % bdrv_aio_writev(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs %p sector_num %"PRId64" nb_sectors %d opaque %p" bdrv_co_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d" bdrv_co_writev(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d" -bdrv_co_write_zeroes(void *bs, int64_t sector_num, int nb_sector, int flags) "bs %p sector_num %"PRId64" nb_sectors %d flags %#x" +bdrv_co_pwrite_zeroes(void *bs, int64_t offset, int count, int flags) "bs %p offset %"PRId64" count %d flags %#x" bdrv_co_do_copy_on_readv(void *bs, int64_t sector_num, int nb_sectors, int64_t cluster_sector_num, int cluster_nb_sectors) "bs %p sector_num %"PRId64" nb_sectors %d cluster_sector_num %"PRId64" cluster_nb_sectors %d" # block/stream.c @@ -131,7 +131,7 @@ thread_pool_cancel(void *req, void *opaque) "req %p opaque %p" # block/raw-win32.c # block/raw-posix.c -paio_submit_co(int64_t sector_num, int nb_sectors, int type) "sector_num %"PRId64" nb_sectors %d type %d" +paio_submit_co(int64_t offset, int count, int type) "offset %"PRId64" count %d type %d" paio_submit(void *acb, void *opaque, int64_t sector_num, int nb_sectors, int type) "acb %p opaque %p sector_num %"PRId64" nb_sectors %d type %d" # ioport.c @@ -611,6 +611,8 @@ qcow2_writev_done_req(void *co, int ret) "co %p ret %d" qcow2_writev_start_part(void *co) "co %p" qcow2_writev_done_part(void *co, int cur_nr_sectors) "co %p cur_nr_sectors %d" qcow2_writev_data(void *co, uint64_t offset) "co %p offset %" PRIx64 +qcow2_pwrite_zeroes_start_req(void *co, int64_t offset, int count) "co %p offset %" PRIx64 " count %d" +qcow2_pwrite_zeroes(void *co, int64_t offset, int count) "co %p offset %" PRIx64 " count %d" # block/qcow2-cluster.c qcow2_alloc_clusters_offset(void *co, uint64_t offset, int num) "co %p offset %" PRIx64 " num %d" |