diff options
author | Peter Maydell <peter.maydell@linaro.org> | 2016-05-12 16:33:40 +0100 |
---|---|---|
committer | Peter Maydell <peter.maydell@linaro.org> | 2016-05-12 16:33:40 +0100 |
commit | f68419eee9a966f5a915314c43cda6778f976a77 (patch) | |
tree | 5b63c0bf712a978a50c43be7000b6a9332258c20 /block | |
parent | e4f70d635863cfc3e3fa7d9a6e37b569ae94d82f (diff) | |
parent | efc2645f714aae1bcf22e8165cad51c57f34fdf3 (diff) |
Merge remote-tracking branch 'remotes/kevin/tags/for-upstream' into staging
Block layer patches
# gpg: Signature made Thu 12 May 2016 14:37:05 BST using RSA key ID C88F2FD6
# gpg: Good signature from "Kevin Wolf <kwolf@redhat.com>"
* remotes/kevin/tags/for-upstream: (69 commits)
qemu-iotests: iotests: fail hard if not run via "check"
block: enable testing of LUKS driver with block I/O tests
block: add support for encryption secrets in block I/O tests
block: add support for --image-opts in block I/O tests
qemu-io: Add 'write -z -u' to test MAY_UNMAP flag
qemu-io: Add 'write -f' to test FUA flag
qemu-io: Allow unaligned access by default
qemu-io: Use bool for command line flags
qemu-io: Make 'open' subcommand more like command line
qemu-io: Add missing option documentation
qmp: add monitor command to add/remove a child
quorum: implement bdrv_add_child() and bdrv_del_child()
Add new block driver interface to add/delete a BDS's child
qemu-img: check block status of backing file when converting.
iotests: fix the redirection order in 083
block: Inactivate all children
block: Drop superfluous invalidating bs->file from drivers
block: Invalidate all children
nbd: Simplify client FUA handling
block: Honor BDRV_REQ_FUA during write_zeroes
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Diffstat (limited to 'block')
-rw-r--r-- | block/block-backend.c | 114 | ||||
-rw-r--r-- | block/bochs.c | 51 | ||||
-rw-r--r-- | block/cloop.c | 38 | ||||
-rw-r--r-- | block/crypto.c | 2 | ||||
-rw-r--r-- | block/curl.c | 10 | ||||
-rw-r--r-- | block/dmg.c | 40 | ||||
-rw-r--r-- | block/io.c | 514 | ||||
-rw-r--r-- | block/iscsi.c | 19 | ||||
-rw-r--r-- | block/linux-aio.c | 57 | ||||
-rw-r--r-- | block/nbd-client.c | 11 | ||||
-rw-r--r-- | block/nbd-client.h | 2 | ||||
-rw-r--r-- | block/nbd.c | 37 | ||||
-rw-r--r-- | block/parallels.c | 5 | ||||
-rw-r--r-- | block/qcow.c | 8 | ||||
-rw-r--r-- | block/qcow2.c | 76 | ||||
-rw-r--r-- | block/qed.c | 12 | ||||
-rw-r--r-- | block/quorum.c | 94 | ||||
-rw-r--r-- | block/raw-aio.h | 15 | ||||
-rw-r--r-- | block/raw-posix.c | 21 | ||||
-rw-r--r-- | block/raw_bsd.c | 15 | ||||
-rw-r--r-- | block/sheepdog.c | 15 | ||||
-rw-r--r-- | block/throttle-groups.c | 18 | ||||
-rw-r--r-- | block/vdi.c | 131 | ||||
-rw-r--r-- | block/vhdx.c | 5 | ||||
-rw-r--r-- | block/vmdk.c | 367 | ||||
-rw-r--r-- | block/vpc.c | 175 | ||||
-rw-r--r-- | block/vvfat.c | 55 |
27 files changed, 1051 insertions, 856 deletions
diff --git a/block/block-backend.c b/block/block-backend.c index 16c9d5e0f2..a1e2c7fa20 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -1,7 +1,7 @@ /* * QEMU Block backends * - * Copyright (C) 2014 Red Hat, Inc. + * Copyright (C) 2014-2016 Red Hat, Inc. * * Authors: * Markus Armbruster <armbru@redhat.com>, @@ -692,7 +692,7 @@ static int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset, return ret; } - return bdrv_co_do_preadv(blk_bs(blk), offset, bytes, qiov, flags); + return bdrv_co_preadv(blk_bs(blk), offset, bytes, qiov, flags); } static int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset, @@ -710,7 +710,7 @@ static int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset, flags |= BDRV_REQ_FUA; } - return bdrv_co_do_pwritev(blk_bs(blk), offset, bytes, qiov, flags); + return bdrv_co_pwritev(blk_bs(blk), offset, bytes, qiov, flags); } typedef struct BlkRwCo { @@ -772,55 +772,28 @@ static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf, return rwco.ret; } -static int blk_rw(BlockBackend *blk, int64_t sector_num, uint8_t *buf, - int nb_sectors, CoroutineEntry co_entry, - BdrvRequestFlags flags) -{ - if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { - return -EINVAL; - } - - return blk_prw(blk, sector_num << BDRV_SECTOR_BITS, buf, - nb_sectors << BDRV_SECTOR_BITS, co_entry, flags); -} - -int blk_read(BlockBackend *blk, int64_t sector_num, uint8_t *buf, - int nb_sectors) -{ - return blk_rw(blk, sector_num, buf, nb_sectors, blk_read_entry, 0); -} - -int blk_read_unthrottled(BlockBackend *blk, int64_t sector_num, uint8_t *buf, - int nb_sectors) +int blk_pread_unthrottled(BlockBackend *blk, int64_t offset, uint8_t *buf, + int count) { BlockDriverState *bs = blk_bs(blk); - bool enabled; int ret; - ret = blk_check_request(blk, sector_num, nb_sectors); + ret = blk_check_byte_request(blk, offset, count); if (ret < 0) { return ret; } - enabled = bs->io_limits_enabled; - bs->io_limits_enabled = false; - ret = blk_read(blk, sector_num, buf, nb_sectors); - bs->io_limits_enabled = enabled; + bdrv_no_throttling_begin(bs); + ret = blk_pread(blk, offset, buf, count); + bdrv_no_throttling_end(bs); return ret; } -int blk_write(BlockBackend *blk, int64_t sector_num, const uint8_t *buf, - int nb_sectors) -{ - return blk_rw(blk, sector_num, (uint8_t*) buf, nb_sectors, - blk_write_entry, 0); -} - -int blk_write_zeroes(BlockBackend *blk, int64_t sector_num, - int nb_sectors, BdrvRequestFlags flags) +int blk_write_zeroes(BlockBackend *blk, int64_t offset, + int count, BdrvRequestFlags flags) { - return blk_rw(blk, sector_num, NULL, nb_sectors, blk_write_entry, - flags | BDRV_REQ_ZERO_WRITE); + return blk_prw(blk, offset, NULL, count, blk_write_entry, + flags | BDRV_REQ_ZERO_WRITE); } static void error_callback_bh(void *opaque) @@ -932,18 +905,12 @@ static void blk_aio_write_entry(void *opaque) blk_aio_complete(acb); } -BlockAIOCB *blk_aio_write_zeroes(BlockBackend *blk, int64_t sector_num, - int nb_sectors, BdrvRequestFlags flags, +BlockAIOCB *blk_aio_write_zeroes(BlockBackend *blk, int64_t offset, + int count, BdrvRequestFlags flags, BlockCompletionFunc *cb, void *opaque) { - if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { - return blk_abort_aio_request(blk, cb, opaque, -EINVAL); - } - - return blk_aio_prwv(blk, sector_num << BDRV_SECTOR_BITS, - nb_sectors << BDRV_SECTOR_BITS, NULL, - blk_aio_write_entry, flags | BDRV_REQ_ZERO_WRITE, - cb, opaque); + return blk_aio_prwv(blk, offset, count, NULL, blk_aio_write_entry, + flags | BDRV_REQ_ZERO_WRITE, cb, opaque); } int blk_pread(BlockBackend *blk, int64_t offset, void *buf, int count) @@ -955,9 +922,11 @@ int blk_pread(BlockBackend *blk, int64_t offset, void *buf, int count) return count; } -int blk_pwrite(BlockBackend *blk, int64_t offset, const void *buf, int count) +int blk_pwrite(BlockBackend *blk, int64_t offset, const void *buf, int count, + BdrvRequestFlags flags) { - int ret = blk_prw(blk, offset, (void*) buf, count, blk_write_entry, 0); + int ret = blk_prw(blk, offset, (void *) buf, count, blk_write_entry, + flags); if (ret < 0) { return ret; } @@ -991,30 +960,20 @@ int64_t blk_nb_sectors(BlockBackend *blk) return bdrv_nb_sectors(blk_bs(blk)); } -BlockAIOCB *blk_aio_readv(BlockBackend *blk, int64_t sector_num, - QEMUIOVector *iov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) +BlockAIOCB *blk_aio_preadv(BlockBackend *blk, int64_t offset, + QEMUIOVector *qiov, BdrvRequestFlags flags, + BlockCompletionFunc *cb, void *opaque) { - if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { - return blk_abort_aio_request(blk, cb, opaque, -EINVAL); - } - - assert(nb_sectors << BDRV_SECTOR_BITS == iov->size); - return blk_aio_prwv(blk, sector_num << BDRV_SECTOR_BITS, iov->size, iov, - blk_aio_read_entry, 0, cb, opaque); + return blk_aio_prwv(blk, offset, qiov->size, qiov, + blk_aio_read_entry, flags, cb, opaque); } -BlockAIOCB *blk_aio_writev(BlockBackend *blk, int64_t sector_num, - QEMUIOVector *iov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) +BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t offset, + QEMUIOVector *qiov, BdrvRequestFlags flags, + BlockCompletionFunc *cb, void *opaque) { - if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { - return blk_abort_aio_request(blk, cb, opaque, -EINVAL); - } - - assert(nb_sectors << BDRV_SECTOR_BITS == iov->size); - return blk_aio_prwv(blk, sector_num << BDRV_SECTOR_BITS, iov->size, iov, - blk_aio_write_entry, 0, cb, opaque); + return blk_aio_prwv(blk, offset, qiov->size, qiov, + blk_aio_write_entry, flags, cb, opaque); } BlockAIOCB *blk_aio_flush(BlockBackend *blk, @@ -1444,15 +1403,10 @@ void *blk_aio_get(const AIOCBInfo *aiocb_info, BlockBackend *blk, return qemu_aio_get(aiocb_info, blk_bs(blk), cb, opaque); } -int coroutine_fn blk_co_write_zeroes(BlockBackend *blk, int64_t sector_num, - int nb_sectors, BdrvRequestFlags flags) +int coroutine_fn blk_co_write_zeroes(BlockBackend *blk, int64_t offset, + int count, BdrvRequestFlags flags) { - if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { - return -EINVAL; - } - - return blk_co_pwritev(blk, sector_num << BDRV_SECTOR_BITS, - nb_sectors << BDRV_SECTOR_BITS, NULL, + return blk_co_pwritev(blk, offset, count, NULL, flags | BDRV_REQ_ZERO_WRITE); } diff --git a/block/bochs.c b/block/bochs.c index af8b7abdfd..f0e18c0b84 100644 --- a/block/bochs.c +++ b/block/bochs.c @@ -104,6 +104,7 @@ static int bochs_open(BlockDriverState *bs, QDict *options, int flags, int ret; bs->read_only = 1; // no write support yet + bs->request_alignment = BDRV_SECTOR_SIZE; /* No sub-sector I/O supported */ ret = bdrv_pread(bs->file->bs, 0, &bochs, sizeof(bochs)); if (ret < 0) { @@ -221,38 +222,52 @@ static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num) return bitmap_offset + (512 * (s->bitmap_blocks + extent_offset)); } -static int bochs_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) +static int coroutine_fn +bochs_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) { + BDRVBochsState *s = bs->opaque; + uint64_t sector_num = offset >> BDRV_SECTOR_BITS; + int nb_sectors = bytes >> BDRV_SECTOR_BITS; + uint64_t bytes_done = 0; + QEMUIOVector local_qiov; int ret; + assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + + qemu_iovec_init(&local_qiov, qiov->niov); + qemu_co_mutex_lock(&s->lock); + while (nb_sectors > 0) { int64_t block_offset = seek_to_sector(bs, sector_num); if (block_offset < 0) { - return block_offset; - } else if (block_offset > 0) { - ret = bdrv_pread(bs->file->bs, block_offset, buf, 512); + ret = block_offset; + goto fail; + } + + qemu_iovec_reset(&local_qiov); + qemu_iovec_concat(&local_qiov, qiov, bytes_done, 512); + + if (block_offset > 0) { + ret = bdrv_co_preadv(bs->file->bs, block_offset, 512, + &local_qiov, 0); if (ret < 0) { - return ret; + goto fail; } } else { - memset(buf, 0, 512); + qemu_iovec_memset(&local_qiov, 0, 0, 512); } nb_sectors--; sector_num++; - buf += 512; + bytes_done += 512; } - return 0; -} -static coroutine_fn int bochs_co_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) -{ - int ret; - BDRVBochsState *s = bs->opaque; - qemu_co_mutex_lock(&s->lock); - ret = bochs_read(bs, sector_num, buf, nb_sectors); + ret = 0; +fail: qemu_co_mutex_unlock(&s->lock); + qemu_iovec_destroy(&local_qiov); + return ret; } @@ -267,7 +282,7 @@ static BlockDriver bdrv_bochs = { .instance_size = sizeof(BDRVBochsState), .bdrv_probe = bochs_probe, .bdrv_open = bochs_open, - .bdrv_read = bochs_co_read, + .bdrv_co_preadv = bochs_co_preadv, .bdrv_close = bochs_close, }; diff --git a/block/cloop.c b/block/cloop.c index a84f14019c..fc1ca3a05a 100644 --- a/block/cloop.c +++ b/block/cloop.c @@ -66,6 +66,7 @@ static int cloop_open(BlockDriverState *bs, QDict *options, int flags, int ret; bs->read_only = 1; + bs->request_alignment = BDRV_SECTOR_SIZE; /* No sub-sector I/O supported */ /* read header */ ret = bdrv_pread(bs->file->bs, 128, &s->block_size, 4); @@ -229,33 +230,38 @@ static inline int cloop_read_block(BlockDriverState *bs, int block_num) return 0; } -static int cloop_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) +static int coroutine_fn +cloop_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) { BDRVCloopState *s = bs->opaque; - int i; + uint64_t sector_num = offset >> BDRV_SECTOR_BITS; + int nb_sectors = bytes >> BDRV_SECTOR_BITS; + int ret, i; + + assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + + qemu_co_mutex_lock(&s->lock); for (i = 0; i < nb_sectors; i++) { + void *data; uint32_t sector_offset_in_block = ((sector_num + i) % s->sectors_per_block), block_num = (sector_num + i) / s->sectors_per_block; if (cloop_read_block(bs, block_num) != 0) { - return -1; + ret = -EIO; + goto fail; } - memcpy(buf + i * 512, - s->uncompressed_block + sector_offset_in_block * 512, 512); + + data = s->uncompressed_block + sector_offset_in_block * 512; + qemu_iovec_from_buf(qiov, i * 512, data, 512); } - return 0; -} -static coroutine_fn int cloop_co_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) -{ - int ret; - BDRVCloopState *s = bs->opaque; - qemu_co_mutex_lock(&s->lock); - ret = cloop_read(bs, sector_num, buf, nb_sectors); + ret = 0; +fail: qemu_co_mutex_unlock(&s->lock); + return ret; } @@ -273,7 +279,7 @@ static BlockDriver bdrv_cloop = { .instance_size = sizeof(BDRVCloopState), .bdrv_probe = cloop_probe, .bdrv_open = cloop_open, - .bdrv_read = cloop_co_read, + .bdrv_co_preadv = cloop_co_preadv, .bdrv_close = cloop_close, }; diff --git a/block/crypto.c b/block/crypto.c index 2424a4c9c9..758e14e032 100644 --- a/block/crypto.c +++ b/block/crypto.c @@ -91,7 +91,7 @@ static ssize_t block_crypto_write_func(QCryptoBlock *block, struct BlockCryptoCreateData *data = opaque; ssize_t ret; - ret = blk_pwrite(data->blk, offset, buf, buflen); + ret = blk_pwrite(data->blk, offset, buf, buflen, 0); if (ret < 0) { error_setg_errno(errp, -ret, "Could not write encryption header"); return ret; diff --git a/block/curl.c b/block/curl.c index 5a8f8b6239..da9f5e85de 100644 --- a/block/curl.c +++ b/block/curl.c @@ -36,10 +36,16 @@ // #define DEBUG_VERBOSE #ifdef DEBUG_CURL -#define DPRINTF(fmt, ...) do { printf(fmt, ## __VA_ARGS__); } while (0) +#define DEBUG_CURL_PRINT 1 #else -#define DPRINTF(fmt, ...) do { } while (0) +#define DEBUG_CURL_PRINT 0 #endif +#define DPRINTF(fmt, ...) \ + do { \ + if (DEBUG_CURL_PRINT) { \ + fprintf(stderr, fmt, ## __VA_ARGS__); \ + } \ + } while (0) #if LIBCURL_VERSION_NUM >= 0x071000 /* The multi interface timer callback was introduced in 7.16.0 */ diff --git a/block/dmg.c b/block/dmg.c index a496eb7c9b..1ea5f22d82 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -440,6 +440,8 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags, int ret; bs->read_only = 1; + bs->request_alignment = BDRV_SECTOR_SIZE; /* No sub-sector I/O supported */ + s->n_chunks = 0; s->offsets = s->lengths = s->sectors = s->sectorcounts = NULL; /* used by dmg_read_mish_block to keep track of the current I/O position */ @@ -659,38 +661,42 @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num) return 0; } -static int dmg_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) +static int coroutine_fn +dmg_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) { BDRVDMGState *s = bs->opaque; - int i; + uint64_t sector_num = offset >> BDRV_SECTOR_BITS; + int nb_sectors = bytes >> BDRV_SECTOR_BITS; + int ret, i; + + assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + + qemu_co_mutex_lock(&s->lock); for (i = 0; i < nb_sectors; i++) { uint32_t sector_offset_in_chunk; + void *data; + if (dmg_read_chunk(bs, sector_num + i) != 0) { - return -1; + ret = -EIO; + goto fail; } /* Special case: current chunk is all zeroes. Do not perform a memcpy as * s->uncompressed_chunk may be too small to cover the large all-zeroes * section. dmg_read_chunk is called to find s->current_chunk */ if (s->types[s->current_chunk] == 2) { /* all zeroes block entry */ - memset(buf + i * 512, 0, 512); + qemu_iovec_memset(qiov, i * 512, 0, 512); continue; } sector_offset_in_chunk = sector_num + i - s->sectors[s->current_chunk]; - memcpy(buf + i * 512, - s->uncompressed_chunk + sector_offset_in_chunk * 512, 512); + data = s->uncompressed_chunk + sector_offset_in_chunk * 512; + qemu_iovec_from_buf(qiov, i * 512, data, 512); } - return 0; -} -static coroutine_fn int dmg_co_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) -{ - int ret; - BDRVDMGState *s = bs->opaque; - qemu_co_mutex_lock(&s->lock); - ret = dmg_read(bs, sector_num, buf, nb_sectors); + ret = 0; +fail: qemu_co_mutex_unlock(&s->lock); return ret; } @@ -715,7 +721,7 @@ static BlockDriver bdrv_dmg = { .instance_size = sizeof(BDRVDMGState), .bdrv_probe = dmg_probe, .bdrv_open = dmg_open, - .bdrv_read = dmg_co_read, + .bdrv_co_preadv = dmg_co_preadv, .bdrv_close = dmg_close, }; diff --git a/block/io.c b/block/io.c index a7dbf85b19..cd6d71a503 100644 --- a/block/io.c +++ b/block/io.c @@ -34,18 +34,6 @@ #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */ -static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque); -static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque); -static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - QEMUIOVector *iov); -static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - QEMUIOVector *iov); static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov, @@ -62,48 +50,35 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, void bdrv_set_io_limits(BlockDriverState *bs, ThrottleConfig *cfg) { - int i; - throttle_group_config(bs, cfg); - - for (i = 0; i < 2; i++) { - qemu_co_enter_next(&bs->throttled_reqs[i]); - } } -/* this function drain all the throttled IOs */ -static bool bdrv_start_throttled_reqs(BlockDriverState *bs) +void bdrv_no_throttling_begin(BlockDriverState *bs) { - bool drained = false; - bool enabled = bs->io_limits_enabled; - int i; - - bs->io_limits_enabled = false; - - for (i = 0; i < 2; i++) { - while (qemu_co_enter_next(&bs->throttled_reqs[i])) { - drained = true; - } + if (bs->io_limits_disabled++ == 0) { + throttle_group_restart_bs(bs); } +} - bs->io_limits_enabled = enabled; - - return drained; +void bdrv_no_throttling_end(BlockDriverState *bs) +{ + assert(bs->io_limits_disabled); + --bs->io_limits_disabled; } void bdrv_io_limits_disable(BlockDriverState *bs) { - bs->io_limits_enabled = false; - bdrv_start_throttled_reqs(bs); + assert(bs->throttle_state); + bdrv_no_throttling_begin(bs); throttle_group_unregister_bs(bs); + bdrv_no_throttling_end(bs); } /* should be called before bdrv_set_io_limits if a limit is set */ void bdrv_io_limits_enable(BlockDriverState *bs, const char *group) { - assert(!bs->io_limits_enabled); + assert(!bs->throttle_state); throttle_group_register_bs(bs, group); - bs->io_limits_enabled = true; } void bdrv_io_limits_update_group(BlockDriverState *bs, const char *group) @@ -123,24 +98,6 @@ void bdrv_io_limits_update_group(BlockDriverState *bs, const char *group) bdrv_io_limits_enable(bs, group); } -void bdrv_setup_io_funcs(BlockDriver *bdrv) -{ - /* Block drivers without coroutine functions need emulation */ - if (!bdrv->bdrv_co_readv) { - bdrv->bdrv_co_readv = bdrv_co_readv_em; - bdrv->bdrv_co_writev = bdrv_co_writev_em; - - /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if - * the block driver lacks aio we need to emulate that too. - */ - if (!bdrv->bdrv_aio_readv) { - /* add AIO emulation layer */ - bdrv->bdrv_aio_readv = bdrv_aio_readv_em; - bdrv->bdrv_aio_writev = bdrv_aio_writev_em; - } - } -} - void bdrv_refresh_limits(BlockDriverState *bs, Error **errp) { BlockDriver *drv = bs->drv; @@ -260,18 +217,29 @@ typedef struct { bool done; } BdrvCoDrainData; +static void bdrv_drain_poll(BlockDriverState *bs) +{ + bool busy = true; + + while (busy) { + /* Keep iterating */ + busy = bdrv_requests_pending(bs); + busy |= aio_poll(bdrv_get_aio_context(bs), busy); + } +} + static void bdrv_co_drain_bh_cb(void *opaque) { BdrvCoDrainData *data = opaque; Coroutine *co = data->co; qemu_bh_delete(data->bh); - bdrv_drain(data->bs); + bdrv_drain_poll(data->bs); data->done = true; qemu_coroutine_enter(co, NULL); } -void coroutine_fn bdrv_co_drain(BlockDriverState *bs) +static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs) { BdrvCoDrainData data; @@ -305,21 +273,28 @@ void coroutine_fn bdrv_co_drain(BlockDriverState *bs) * not depend on events in other AioContexts. In that case, use * bdrv_drain_all() instead. */ -void bdrv_drain(BlockDriverState *bs) +void coroutine_fn bdrv_co_drain(BlockDriverState *bs) { - bool busy = true; + bdrv_no_throttling_begin(bs); + bdrv_io_unplugged_begin(bs); + bdrv_drain_recurse(bs); + bdrv_co_yield_to_drain(bs); + bdrv_io_unplugged_end(bs); + bdrv_no_throttling_end(bs); +} +void bdrv_drain(BlockDriverState *bs) +{ + bdrv_no_throttling_begin(bs); + bdrv_io_unplugged_begin(bs); bdrv_drain_recurse(bs); if (qemu_in_coroutine()) { - bdrv_co_drain(bs); - return; - } - while (busy) { - /* Keep iterating */ - bdrv_flush_io_queue(bs); - busy = bdrv_requests_pending(bs); - busy |= aio_poll(bdrv_get_aio_context(bs), busy); + bdrv_co_yield_to_drain(bs); + } else { + bdrv_drain_poll(bs); } + bdrv_io_unplugged_end(bs); + bdrv_no_throttling_end(bs); } /* @@ -342,6 +317,8 @@ void bdrv_drain_all(void) if (bs->job) { block_job_pause(bs->job); } + bdrv_no_throttling_begin(bs); + bdrv_io_unplugged_begin(bs); bdrv_drain_recurse(bs); aio_context_release(aio_context); @@ -366,7 +343,6 @@ void bdrv_drain_all(void) aio_context_acquire(aio_context); while ((bs = bdrv_next(bs))) { if (aio_context == bdrv_get_aio_context(bs)) { - bdrv_flush_io_queue(bs); if (bdrv_requests_pending(bs)) { busy = true; aio_poll(aio_context, busy); @@ -383,6 +359,8 @@ void bdrv_drain_all(void) AioContext *aio_context = bdrv_get_aio_context(bs); aio_context_acquire(aio_context); + bdrv_io_unplugged_end(bs); + bdrv_no_throttling_end(bs); if (bs->job) { block_job_resume(bs->job); } @@ -581,13 +559,13 @@ static void coroutine_fn bdrv_rw_co_entry(void *opaque) RwCo *rwco = opaque; if (!rwco->is_write) { - rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset, - rwco->qiov->size, rwco->qiov, - rwco->flags); + rwco->ret = bdrv_co_preadv(rwco->bs, rwco->offset, + rwco->qiov->size, rwco->qiov, + rwco->flags); } else { - rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset, - rwco->qiov->size, rwco->qiov, - rwco->flags); + rwco->ret = bdrv_co_pwritev(rwco->bs, rwco->offset, + rwco->qiov->size, rwco->qiov, + rwco->flags); } } @@ -608,17 +586,6 @@ static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset, .flags = flags, }; - /** - * In sync call context, when the vcpu is blocked, this throttling timer - * will not fire; so the I/O throttling function has to be disabled here - * if it has been enabled. - */ - if (bs->io_limits_enabled) { - fprintf(stderr, "Disabling I/O throttling on '%s' due " - "to synchronous I/O.\n", bdrv_get_device_name(bs)); - bdrv_io_limits_disable(bs); - } - if (qemu_in_coroutine()) { /* Fast-path if already in coroutine context */ bdrv_rw_co_entry(&rwco); @@ -685,7 +652,8 @@ int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, * Completely zero out a block device with the help of bdrv_write_zeroes. * The operation is sped up by checking the block status and only writing * zeroes to the device if they currently do not return zeroes. Optional - * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP). + * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP, + * BDRV_REQ_FUA). * * Returns < 0 on error, 0 on success. For error codes see bdrv_write(). */ @@ -800,6 +768,109 @@ int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, return 0; } +typedef struct CoroutineIOCompletion { + Coroutine *coroutine; + int ret; +} CoroutineIOCompletion; + +static void bdrv_co_io_em_complete(void *opaque, int ret) +{ + CoroutineIOCompletion *co = opaque; + + co->ret = ret; + qemu_coroutine_enter(co->coroutine, NULL); +} + +static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs, + uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) +{ + BlockDriver *drv = bs->drv; + int64_t sector_num; + unsigned int nb_sectors; + + if (drv->bdrv_co_preadv) { + return drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags); + } + + sector_num = offset >> BDRV_SECTOR_BITS; + nb_sectors = bytes >> BDRV_SECTOR_BITS; + + assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS); + + if (drv->bdrv_co_readv) { + return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); + } else { + BlockAIOCB *acb; + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + + acb = bs->drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors, + bdrv_co_io_em_complete, &co); + if (acb == NULL) { + return -EIO; + } else { + qemu_coroutine_yield(); + return co.ret; + } + } +} + +static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs, + uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) +{ + BlockDriver *drv = bs->drv; + int64_t sector_num; + unsigned int nb_sectors; + int ret; + + if (drv->bdrv_co_pwritev) { + ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov, flags); + goto emulate_flags; + } + + sector_num = offset >> BDRV_SECTOR_BITS; + nb_sectors = bytes >> BDRV_SECTOR_BITS; + + assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS); + + if (drv->bdrv_co_writev_flags) { + ret = drv->bdrv_co_writev_flags(bs, sector_num, nb_sectors, qiov, + flags & bs->supported_write_flags); + flags &= ~bs->supported_write_flags; + } else if (drv->bdrv_co_writev) { + assert(!bs->supported_write_flags); + ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); + } else { + BlockAIOCB *acb; + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + + acb = bs->drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors, + bdrv_co_io_em_complete, &co); + if (acb == NULL) { + ret = -EIO; + } else { + qemu_coroutine_yield(); + ret = co.ret; + } + } + +emulate_flags: + if (ret == 0 && (flags & BDRV_REQ_FUA)) { + ret = bdrv_co_flush(bs); + } + + return ret; +} + static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { @@ -836,8 +907,9 @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, qemu_iovec_init_external(&bounce_qiov, &iov, 1); - ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors, - &bounce_qiov); + ret = bdrv_driver_preadv(bs, cluster_sector_num * BDRV_SECTOR_SIZE, + cluster_nb_sectors * BDRV_SECTOR_SIZE, + &bounce_qiov, 0); if (ret < 0) { goto err; } @@ -850,8 +922,9 @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, /* This does not change the data on the disk, it is not necessary * to flush even in cache=writethrough mode. */ - ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors, - &bounce_qiov); + ret = bdrv_driver_pwritev(bs, cluster_sector_num * BDRV_SECTOR_SIZE, + cluster_nb_sectors * BDRV_SECTOR_SIZE, + &bounce_qiov, 0); } if (ret < 0) { @@ -880,7 +953,6 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, int64_t align, QEMUIOVector *qiov, int flags) { - BlockDriver *drv = bs->drv; int ret; int64_t sector_num = offset >> BDRV_SECTOR_BITS; @@ -921,7 +993,7 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, /* Forward the request to the BlockDriver */ if (!bs->zero_beyond_eof) { - ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); + ret = bdrv_driver_preadv(bs, offset, bytes, qiov, 0); } else { /* Read zeros after EOF */ int64_t total_sectors, max_nb_sectors; @@ -935,7 +1007,7 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num), align >> BDRV_SECTOR_BITS); if (nb_sectors < max_nb_sectors) { - ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); + ret = bdrv_driver_preadv(bs, offset, bytes, qiov, 0); } else if (max_nb_sectors > 0) { QEMUIOVector local_qiov; @@ -943,8 +1015,9 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, qemu_iovec_concat(&local_qiov, qiov, 0, max_nb_sectors * BDRV_SECTOR_SIZE); - ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors, - &local_qiov); + ret = bdrv_driver_preadv(bs, offset, + max_nb_sectors * BDRV_SECTOR_SIZE, + &local_qiov, 0); qemu_iovec_destroy(&local_qiov); } else { @@ -967,7 +1040,7 @@ out: /* * Handle a read request in coroutine context */ -int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, +int coroutine_fn bdrv_co_preadv(BlockDriverState *bs, int64_t offset, unsigned int bytes, QEMUIOVector *qiov, BdrvRequestFlags flags) { @@ -997,7 +1070,7 @@ int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, } /* throttling disk I/O */ - if (bs->io_limits_enabled) { + if (bs->throttle_state) { throttle_group_co_io_limits_intercept(bs, bytes, false); } @@ -1049,8 +1122,8 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, return -EINVAL; } - return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS, - nb_sectors << BDRV_SECTOR_BITS, qiov, flags); + return bdrv_co_preadv(bs, sector_num << BDRV_SECTOR_BITS, + nb_sectors << BDRV_SECTOR_BITS, qiov, flags); } int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, @@ -1088,6 +1161,7 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, QEMUIOVector qiov; struct iovec iov = {0}; int ret = 0; + bool need_flush = false; int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes, BDRV_REQUEST_MAX_SECTORS); @@ -1120,13 +1194,29 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, ret = -ENOTSUP; /* First try the efficient write zeroes operation */ if (drv->bdrv_co_write_zeroes) { - ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags); + ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, + flags & bs->supported_zero_flags); + if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) && + !(bs->supported_zero_flags & BDRV_REQ_FUA)) { + need_flush = true; + } + } else { + assert(!bs->supported_zero_flags); } if (ret == -ENOTSUP) { /* Fall back to bounce buffer if write zeroes is unsupported */ int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length, MAX_WRITE_ZEROES_BOUNCE_BUFFER); + BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE; + + if ((flags & BDRV_REQ_FUA) && + !(bs->supported_write_flags & BDRV_REQ_FUA)) { + /* No need for bdrv_driver_pwrite() to do a fallback + * flush on each chunk; use just one at the end */ + write_flags &= ~BDRV_REQ_FUA; + need_flush = true; + } num = MIN(num, max_xfer_len); iov.iov_len = num * BDRV_SECTOR_SIZE; if (iov.iov_base == NULL) { @@ -1139,7 +1229,9 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, } qemu_iovec_init_external(&qiov, &iov, 1); - ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov); + ret = bdrv_driver_pwritev(bs, sector_num * BDRV_SECTOR_SIZE, + num * BDRV_SECTOR_SIZE, &qiov, + write_flags); /* Keep bounce buffer around if it is big enough for all * all future requests. @@ -1155,6 +1247,9 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, } fail: + if (ret == 0 && need_flush) { + ret = bdrv_co_flush(bs); + } qemu_vfree(iov.iov_base); return ret; } @@ -1199,23 +1294,12 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, } else if (flags & BDRV_REQ_ZERO_WRITE) { bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO); ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags); - } else if (drv->bdrv_co_writev_flags) { - bdrv_debug_event(bs, BLKDBG_PWRITEV); - ret = drv->bdrv_co_writev_flags(bs, sector_num, nb_sectors, qiov, - flags); } else { - assert(drv->supported_write_flags == 0); bdrv_debug_event(bs, BLKDBG_PWRITEV); - ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); + ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, flags); } bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE); - if (ret == 0 && (flags & BDRV_REQ_FUA) && - !(drv->supported_write_flags & BDRV_REQ_FUA)) - { - ret = bdrv_co_flush(bs); - } - bdrv_set_dirty(bs, sector_num, nb_sectors); if (bs->wr_highest_offset < offset + bytes) { @@ -1320,7 +1404,7 @@ fail: /* * Handle a write request in coroutine context */ -int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, +int coroutine_fn bdrv_co_pwritev(BlockDriverState *bs, int64_t offset, unsigned int bytes, QEMUIOVector *qiov, BdrvRequestFlags flags) { @@ -1347,7 +1431,7 @@ int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, } /* throttling disk I/O */ - if (bs->io_limits_enabled) { + if (bs->throttle_state) { throttle_group_co_io_limits_intercept(bs, bytes, true); } @@ -1455,8 +1539,8 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, return -EINVAL; } - return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS, - nb_sectors << BDRV_SECTOR_BITS, qiov, flags); + return bdrv_co_pwritev(bs, sector_num << BDRV_SECTOR_BITS, + nb_sectors << BDRV_SECTOR_BITS, qiov, flags); } int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, @@ -2064,80 +2148,6 @@ void bdrv_aio_cancel_async(BlockAIOCB *acb) /**************************************************************/ /* async block device emulation */ -typedef struct BlockAIOCBSync { - BlockAIOCB common; - QEMUBH *bh; - int ret; - /* vector translation state */ - QEMUIOVector *qiov; - uint8_t *bounce; - int is_write; -} BlockAIOCBSync; - -static const AIOCBInfo bdrv_em_aiocb_info = { - .aiocb_size = sizeof(BlockAIOCBSync), -}; - -static void bdrv_aio_bh_cb(void *opaque) -{ - BlockAIOCBSync *acb = opaque; - - if (!acb->is_write && acb->ret >= 0) { - qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size); - } - qemu_vfree(acb->bounce); - acb->common.cb(acb->common.opaque, acb->ret); - qemu_bh_delete(acb->bh); - acb->bh = NULL; - qemu_aio_unref(acb); -} - -static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *qiov, - int nb_sectors, - BlockCompletionFunc *cb, - void *opaque, - int is_write) - -{ - BlockAIOCBSync *acb; - - acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque); - acb->is_write = is_write; - acb->qiov = qiov; - acb->bounce = qemu_try_blockalign(bs, qiov->size); - acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb); - - if (acb->bounce == NULL) { - acb->ret = -ENOMEM; - } else if (is_write) { - qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size); - acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors); - } else { - acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors); - } - - qemu_bh_schedule(acb->bh); - - return &acb->common; -} - -static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) -{ - return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); -} - -static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) -{ - return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1); -} - - typedef struct BlockAIOCBCoroutine { BlockAIOCB common; BlockRequest req; @@ -2314,59 +2324,6 @@ void qemu_aio_unref(void *p) /**************************************************************/ /* Coroutine block device emulation */ -typedef struct CoroutineIOCompletion { - Coroutine *coroutine; - int ret; -} CoroutineIOCompletion; - -static void bdrv_co_io_em_complete(void *opaque, int ret) -{ - CoroutineIOCompletion *co = opaque; - - co->ret = ret; - qemu_coroutine_enter(co->coroutine, NULL); -} - -static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *iov, - bool is_write) -{ - CoroutineIOCompletion co = { - .coroutine = qemu_coroutine_self(), - }; - BlockAIOCB *acb; - - if (is_write) { - acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors, - bdrv_co_io_em_complete, &co); - } else { - acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors, - bdrv_co_io_em_complete, &co); - } - - trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb); - if (!acb) { - return -EIO; - } - qemu_coroutine_yield(); - - return co.ret; -} - -static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - QEMUIOVector *iov) -{ - return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false); -} - -static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - QEMUIOVector *iov) -{ - return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true); -} - static void coroutine_fn bdrv_flush_co_entry(void *opaque) { RwCo *rwco = opaque; @@ -2763,33 +2720,68 @@ void bdrv_add_before_write_notifier(BlockDriverState *bs, void bdrv_io_plug(BlockDriverState *bs) { - BlockDriver *drv = bs->drv; - if (drv && drv->bdrv_io_plug) { - drv->bdrv_io_plug(bs); - } else if (bs->file) { - bdrv_io_plug(bs->file->bs); + BdrvChild *child; + + QLIST_FOREACH(child, &bs->children, next) { + bdrv_io_plug(child->bs); + } + + if (bs->io_plugged++ == 0 && bs->io_plug_disabled == 0) { + BlockDriver *drv = bs->drv; + if (drv && drv->bdrv_io_plug) { + drv->bdrv_io_plug(bs); + } } } void bdrv_io_unplug(BlockDriverState *bs) { - BlockDriver *drv = bs->drv; - if (drv && drv->bdrv_io_unplug) { - drv->bdrv_io_unplug(bs); - } else if (bs->file) { - bdrv_io_unplug(bs->file->bs); + BdrvChild *child; + + assert(bs->io_plugged); + if (--bs->io_plugged == 0 && bs->io_plug_disabled == 0) { + BlockDriver *drv = bs->drv; + if (drv && drv->bdrv_io_unplug) { + drv->bdrv_io_unplug(bs); + } + } + + QLIST_FOREACH(child, &bs->children, next) { + bdrv_io_unplug(child->bs); } } -void bdrv_flush_io_queue(BlockDriverState *bs) +void bdrv_io_unplugged_begin(BlockDriverState *bs) { - BlockDriver *drv = bs->drv; - if (drv && drv->bdrv_flush_io_queue) { - drv->bdrv_flush_io_queue(bs); - } else if (bs->file) { - bdrv_flush_io_queue(bs->file->bs); + BdrvChild *child; + + if (bs->io_plug_disabled++ == 0 && bs->io_plugged > 0) { + BlockDriver *drv = bs->drv; + if (drv && drv->bdrv_io_unplug) { + drv->bdrv_io_unplug(bs); + } + } + + QLIST_FOREACH(child, &bs->children, next) { + bdrv_io_unplugged_begin(child->bs); + } +} + +void bdrv_io_unplugged_end(BlockDriverState *bs) +{ + BdrvChild *child; + + assert(bs->io_plug_disabled); + QLIST_FOREACH(child, &bs->children, next) { + bdrv_io_unplugged_end(child->bs); + } + + if (--bs->io_plug_disabled == 0 && bs->io_plugged > 0) { + BlockDriver *drv = bs->drv; + if (drv && drv->bdrv_io_plug) { + drv->bdrv_io_plug(bs); + } } - bdrv_start_throttled_reqs(bs); } void bdrv_drained_begin(BlockDriverState *bs) diff --git a/block/iscsi.c b/block/iscsi.c index 302baf84c1..10f3906bcc 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -456,8 +456,11 @@ iscsi_co_writev_flags(BlockDriverState *bs, int64_t sector_num, int nb_sectors, struct IscsiTask iTask; uint64_t lba; uint32_t num_sectors; - bool fua; + bool fua = flags & BDRV_REQ_FUA; + if (fua) { + assert(iscsilun->dpofua); + } if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { return -EINVAL; } @@ -472,7 +475,6 @@ iscsi_co_writev_flags(BlockDriverState *bs, int64_t sector_num, int nb_sectors, num_sectors = sector_qemu2lun(nb_sectors, iscsilun); iscsi_co_init_iscsitask(iscsilun, &iTask); retry: - fua = iscsilun->dpofua && (flags & BDRV_REQ_FUA); if (iscsilun->use_16_for_rw) { iTask.task = iscsi_write16_task(iscsilun->iscsi, iscsilun->lun, lba, NULL, num_sectors * iscsilun->block_size, @@ -513,13 +515,6 @@ retry: return 0; } -static int coroutine_fn -iscsi_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, - QEMUIOVector *iov) -{ - return iscsi_co_writev_flags(bs, sector_num, nb_sectors, iov, 0); -} - static bool iscsi_allocationmap_is_allocated(IscsiLun *iscsilun, int64_t sector_num, int nb_sectors) @@ -1555,6 +1550,10 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags, task = NULL; iscsi_modesense_sync(iscsilun); + if (iscsilun->dpofua) { + bs->supported_write_flags = BDRV_REQ_FUA; + } + bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP; /* Check the write protect flag of the LUN if we want to write */ if (iscsilun->type == TYPE_DISK && (flags & BDRV_O_RDWR) && @@ -1847,9 +1846,7 @@ static BlockDriver bdrv_iscsi = { .bdrv_co_discard = iscsi_co_discard, .bdrv_co_write_zeroes = iscsi_co_write_zeroes, .bdrv_co_readv = iscsi_co_readv, - .bdrv_co_writev = iscsi_co_writev, .bdrv_co_writev_flags = iscsi_co_writev_flags, - .supported_write_flags = BDRV_REQ_FUA, .bdrv_co_flush_to_disk = iscsi_co_flush, #ifdef __linux__ diff --git a/block/linux-aio.c b/block/linux-aio.c index 805757e02e..90ec98ee23 100644 --- a/block/linux-aio.c +++ b/block/linux-aio.c @@ -30,7 +30,7 @@ struct qemu_laiocb { BlockAIOCB common; - struct qemu_laio_state *ctx; + LinuxAioState *ctx; struct iocb iocb; ssize_t ret; size_t nbytes; @@ -46,7 +46,7 @@ typedef struct { QSIMPLEQ_HEAD(, qemu_laiocb) pending; } LaioQueue; -struct qemu_laio_state { +struct LinuxAioState { io_context_t ctx; EventNotifier e; @@ -60,7 +60,7 @@ struct qemu_laio_state { int event_max; }; -static void ioq_submit(struct qemu_laio_state *s); +static void ioq_submit(LinuxAioState *s); static inline ssize_t io_event_ret(struct io_event *ev) { @@ -70,8 +70,7 @@ static inline ssize_t io_event_ret(struct io_event *ev) /* * Completes an AIO request (calls the callback and frees the ACB). */ -static void qemu_laio_process_completion(struct qemu_laio_state *s, - struct qemu_laiocb *laiocb) +static void qemu_laio_process_completion(struct qemu_laiocb *laiocb) { int ret; @@ -99,7 +98,7 @@ static void qemu_laio_process_completion(struct qemu_laio_state *s, * * The function is somewhat tricky because it supports nested event loops, for * example when a request callback invokes aio_poll(). In order to do this, - * the completion events array and index are kept in qemu_laio_state. The BH + * the completion events array and index are kept in LinuxAioState. The BH * reschedules itself as long as there are completions pending so it will * either be called again in a nested event loop or will be called after all * events have been completed. When there are no events left to complete, the @@ -107,7 +106,7 @@ static void qemu_laio_process_completion(struct qemu_laio_state *s, */ static void qemu_laio_completion_bh(void *opaque) { - struct qemu_laio_state *s = opaque; + LinuxAioState *s = opaque; /* Fetch more completion events when empty */ if (s->event_idx == s->event_max) { @@ -136,7 +135,7 @@ static void qemu_laio_completion_bh(void *opaque) laiocb->ret = io_event_ret(&s->events[s->event_idx]); s->event_idx++; - qemu_laio_process_completion(s, laiocb); + qemu_laio_process_completion(laiocb); } if (!s->io_q.plugged && !QSIMPLEQ_EMPTY(&s->io_q.pending)) { @@ -146,7 +145,7 @@ static void qemu_laio_completion_bh(void *opaque) static void qemu_laio_completion_cb(EventNotifier *e) { - struct qemu_laio_state *s = container_of(e, struct qemu_laio_state, e); + LinuxAioState *s = container_of(e, LinuxAioState, e); if (event_notifier_test_and_clear(&s->e)) { qemu_bh_schedule(s->completion_bh); @@ -185,7 +184,7 @@ static void ioq_init(LaioQueue *io_q) io_q->blocked = false; } -static void ioq_submit(struct qemu_laio_state *s) +static void ioq_submit(LinuxAioState *s) { int ret, len; struct qemu_laiocb *aiocb; @@ -216,33 +215,25 @@ static void ioq_submit(struct qemu_laio_state *s) s->io_q.blocked = (s->io_q.n > 0); } -void laio_io_plug(BlockDriverState *bs, void *aio_ctx) +void laio_io_plug(BlockDriverState *bs, LinuxAioState *s) { - struct qemu_laio_state *s = aio_ctx; - - s->io_q.plugged++; + assert(!s->io_q.plugged); + s->io_q.plugged = 1; } -void laio_io_unplug(BlockDriverState *bs, void *aio_ctx, bool unplug) +void laio_io_unplug(BlockDriverState *bs, LinuxAioState *s) { - struct qemu_laio_state *s = aio_ctx; - - assert(s->io_q.plugged > 0 || !unplug); - - if (unplug && --s->io_q.plugged > 0) { - return; - } - + assert(s->io_q.plugged); + s->io_q.plugged = 0; if (!s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending)) { ioq_submit(s); } } -BlockAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd, +BlockAIOCB *laio_submit(BlockDriverState *bs, LinuxAioState *s, int fd, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockCompletionFunc *cb, void *opaque, int type) { - struct qemu_laio_state *s = aio_ctx; struct qemu_laiocb *laiocb; struct iocb *iocbs; off_t offset = sector_num * 512; @@ -284,26 +275,22 @@ out_free_aiocb: return NULL; } -void laio_detach_aio_context(void *s_, AioContext *old_context) +void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context) { - struct qemu_laio_state *s = s_; - aio_set_event_notifier(old_context, &s->e, false, NULL); qemu_bh_delete(s->completion_bh); } -void laio_attach_aio_context(void *s_, AioContext *new_context) +void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context) { - struct qemu_laio_state *s = s_; - s->completion_bh = aio_bh_new(new_context, qemu_laio_completion_bh, s); aio_set_event_notifier(new_context, &s->e, false, qemu_laio_completion_cb); } -void *laio_init(void) +LinuxAioState *laio_init(void) { - struct qemu_laio_state *s; + LinuxAioState *s; s = g_malloc0(sizeof(*s)); if (event_notifier_init(&s->e, false) < 0) { @@ -325,10 +312,8 @@ out_free_state: return NULL; } -void laio_cleanup(void *s_) +void laio_cleanup(LinuxAioState *s) { - struct qemu_laio_state *s = s_; - event_notifier_cleanup(&s->e); if (io_destroy(s->ctx) != 0) { diff --git a/block/nbd-client.c b/block/nbd-client.c index 878e879ace..4d13444409 100644 --- a/block/nbd-client.c +++ b/block/nbd-client.c @@ -243,15 +243,15 @@ static int nbd_co_readv_1(BlockDriverState *bs, int64_t sector_num, static int nbd_co_writev_1(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, - int offset, int *flags) + int offset, int flags) { NbdClientSession *client = nbd_get_client_session(bs); struct nbd_request request = { .type = NBD_CMD_WRITE }; struct nbd_reply reply; ssize_t ret; - if ((*flags & BDRV_REQ_FUA) && (client->nbdflags & NBD_FLAG_SEND_FUA)) { - *flags &= ~BDRV_REQ_FUA; + if (flags & BDRV_REQ_FUA) { + assert(client->nbdflags & NBD_FLAG_SEND_FUA); request.type |= NBD_CMD_FLAG_FUA; } @@ -291,7 +291,7 @@ int nbd_client_co_readv(BlockDriverState *bs, int64_t sector_num, } int nbd_client_co_writev(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov, int *flags) + int nb_sectors, QEMUIOVector *qiov, int flags) { int offset = 0; int ret; @@ -414,6 +414,9 @@ int nbd_client_init(BlockDriverState *bs, logout("Failed to negotiate with the NBD server\n"); return ret; } + if (client->nbdflags & NBD_FLAG_SEND_FUA) { + bs->supported_write_flags = BDRV_REQ_FUA; + } qemu_co_mutex_init(&client->send_mutex); qemu_co_mutex_init(&client->free_sema); diff --git a/block/nbd-client.h b/block/nbd-client.h index bc7aec0795..c618dadc39 100644 --- a/block/nbd-client.h +++ b/block/nbd-client.h @@ -48,7 +48,7 @@ int nbd_client_co_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors); int nbd_client_co_flush(BlockDriverState *bs); int nbd_client_co_writev(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov, int *flags); + int nb_sectors, QEMUIOVector *qiov, int flags); int nbd_client_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov); diff --git a/block/nbd.c b/block/nbd.c index f7ea3b3608..6015e8b537 100644 --- a/block/nbd.c +++ b/block/nbd.c @@ -355,31 +355,6 @@ static int nbd_co_readv(BlockDriverState *bs, int64_t sector_num, return nbd_client_co_readv(bs, sector_num, nb_sectors, qiov); } -static int nbd_co_writev_flags(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov, int flags) -{ - int ret; - - ret = nbd_client_co_writev(bs, sector_num, nb_sectors, qiov, &flags); - if (ret < 0) { - return ret; - } - - /* The flag wasn't sent to the server, so we need to emulate it with an - * explicit flush */ - if (flags & BDRV_REQ_FUA) { - ret = nbd_client_co_flush(bs); - } - - return ret; -} - -static int nbd_co_writev(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) -{ - return nbd_co_writev_flags(bs, sector_num, nb_sectors, qiov, 0); -} - static int nbd_co_flush(BlockDriverState *bs) { return nbd_client_co_flush(bs); @@ -476,9 +451,7 @@ static BlockDriver bdrv_nbd = { .bdrv_parse_filename = nbd_parse_filename, .bdrv_file_open = nbd_open, .bdrv_co_readv = nbd_co_readv, - .bdrv_co_writev = nbd_co_writev, - .bdrv_co_writev_flags = nbd_co_writev_flags, - .supported_write_flags = BDRV_REQ_FUA, + .bdrv_co_writev_flags = nbd_client_co_writev, .bdrv_close = nbd_close, .bdrv_co_flush_to_os = nbd_co_flush, .bdrv_co_discard = nbd_co_discard, @@ -496,9 +469,7 @@ static BlockDriver bdrv_nbd_tcp = { .bdrv_parse_filename = nbd_parse_filename, .bdrv_file_open = nbd_open, .bdrv_co_readv = nbd_co_readv, - .bdrv_co_writev = nbd_co_writev, - .bdrv_co_writev_flags = nbd_co_writev_flags, - .supported_write_flags = BDRV_REQ_FUA, + .bdrv_co_writev_flags = nbd_client_co_writev, .bdrv_close = nbd_close, .bdrv_co_flush_to_os = nbd_co_flush, .bdrv_co_discard = nbd_co_discard, @@ -516,9 +487,7 @@ static BlockDriver bdrv_nbd_unix = { .bdrv_parse_filename = nbd_parse_filename, .bdrv_file_open = nbd_open, .bdrv_co_readv = nbd_co_readv, - .bdrv_co_writev = nbd_co_writev, - .bdrv_co_writev_flags = nbd_co_writev_flags, - .supported_write_flags = BDRV_REQ_FUA, + .bdrv_co_writev_flags = nbd_client_co_writev, .bdrv_close = nbd_close, .bdrv_co_flush_to_os = nbd_co_flush, .bdrv_co_discard = nbd_co_discard, diff --git a/block/parallels.c b/block/parallels.c index 324ed43ac4..cddbfc4012 100644 --- a/block/parallels.c +++ b/block/parallels.c @@ -512,11 +512,12 @@ static int parallels_create(const char *filename, QemuOpts *opts, Error **errp) memset(tmp, 0, sizeof(tmp)); memcpy(tmp, &header, sizeof(header)); - ret = blk_pwrite(file, 0, tmp, BDRV_SECTOR_SIZE); + ret = blk_pwrite(file, 0, tmp, BDRV_SECTOR_SIZE, 0); if (ret < 0) { goto exit; } - ret = blk_write_zeroes(file, 1, bat_sectors - 1, 0); + ret = blk_write_zeroes(file, BDRV_SECTOR_SIZE, + (bat_sectors - 1) << BDRV_SECTOR_BITS, 0); if (ret < 0) { goto exit; } diff --git a/block/qcow.c b/block/qcow.c index 60ddb12eca..d6dc1b05b3 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -853,14 +853,14 @@ static int qcow_create(const char *filename, QemuOpts *opts, Error **errp) } /* write all the data */ - ret = blk_pwrite(qcow_blk, 0, &header, sizeof(header)); + ret = blk_pwrite(qcow_blk, 0, &header, sizeof(header), 0); if (ret != sizeof(header)) { goto exit; } if (backing_file) { ret = blk_pwrite(qcow_blk, sizeof(header), - backing_file, backing_filename_len); + backing_file, backing_filename_len, 0); if (ret != backing_filename_len) { goto exit; } @@ -869,8 +869,8 @@ static int qcow_create(const char *filename, QemuOpts *opts, Error **errp) tmp = g_malloc0(BDRV_SECTOR_SIZE); for (i = 0; i < ((sizeof(uint64_t)*l1_size + BDRV_SECTOR_SIZE - 1)/ BDRV_SECTOR_SIZE); i++) { - ret = blk_pwrite(qcow_blk, header_size + - BDRV_SECTOR_SIZE*i, tmp, BDRV_SECTOR_SIZE); + ret = blk_pwrite(qcow_blk, header_size + BDRV_SECTOR_SIZE * i, + tmp, BDRV_SECTOR_SIZE, 0); if (ret != BDRV_SECTOR_SIZE) { g_free(tmp); goto exit; diff --git a/block/qcow2.c b/block/qcow2.c index 470734be9f..62febfc386 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -1757,13 +1757,6 @@ static void qcow2_invalidate_cache(BlockDriverState *bs, Error **errp) qcow2_close(bs); - bdrv_invalidate_cache(bs->file->bs, &local_err); - if (local_err) { - error_propagate(errp, local_err); - bs->drv = NULL; - return; - } - memset(s, 0, sizeof(BDRVQcow2State)); options = qdict_clone_shallow(bs->options); @@ -2207,7 +2200,7 @@ static int qcow2_create2(const char *filename, int64_t total_size, cpu_to_be64(QCOW2_COMPAT_LAZY_REFCOUNTS); } - ret = blk_pwrite(blk, 0, header, cluster_size); + ret = blk_pwrite(blk, 0, header, cluster_size, 0); g_free(header); if (ret < 0) { error_setg_errno(errp, -ret, "Could not write qcow2 header"); @@ -2217,7 +2210,7 @@ static int qcow2_create2(const char *filename, int64_t total_size, /* Write a refcount table with one refcount block */ refcount_table = g_malloc0(2 * cluster_size); refcount_table[0] = cpu_to_be64(2 * cluster_size); - ret = blk_pwrite(blk, cluster_size, refcount_table, 2 * cluster_size); + ret = blk_pwrite(blk, cluster_size, refcount_table, 2 * cluster_size, 0); g_free(refcount_table); if (ret < 0) { @@ -2411,21 +2404,74 @@ finish: return ret; } + +static bool is_zero_cluster(BlockDriverState *bs, int64_t start) +{ + BDRVQcow2State *s = bs->opaque; + int nr; + BlockDriverState *file; + int64_t res = bdrv_get_block_status_above(bs, NULL, start, + s->cluster_sectors, &nr, &file); + return res >= 0 && ((res & BDRV_BLOCK_ZERO) || !(res & BDRV_BLOCK_DATA)); +} + +static bool is_zero_cluster_top_locked(BlockDriverState *bs, int64_t start) +{ + BDRVQcow2State *s = bs->opaque; + int nr = s->cluster_sectors; + uint64_t off; + int ret; + + ret = qcow2_get_cluster_offset(bs, start << BDRV_SECTOR_BITS, &nr, &off); + return ret == QCOW2_CLUSTER_UNALLOCATED || ret == QCOW2_CLUSTER_ZERO; +} + static coroutine_fn int qcow2_co_write_zeroes(BlockDriverState *bs, int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) { int ret; BDRVQcow2State *s = bs->opaque; - /* Emulate misaligned zero writes */ - if (sector_num % s->cluster_sectors || nb_sectors % s->cluster_sectors) { - return -ENOTSUP; + int head = sector_num % s->cluster_sectors; + int tail = (sector_num + nb_sectors) % s->cluster_sectors; + + if (head != 0 || tail != 0) { + int64_t cl_end = -1; + + sector_num -= head; + nb_sectors += head; + + if (tail != 0) { + nb_sectors += s->cluster_sectors - tail; + } + + if (!is_zero_cluster(bs, sector_num)) { + return -ENOTSUP; + } + + if (nb_sectors > s->cluster_sectors) { + /* Technically the request can cover 2 clusters, f.e. 4k write + at s->cluster_sectors - 2k offset. One of these cluster can + be zeroed, one unallocated */ + cl_end = sector_num + nb_sectors - s->cluster_sectors; + if (!is_zero_cluster(bs, cl_end)) { + return -ENOTSUP; + } + } + + qemu_co_mutex_lock(&s->lock); + /* We can have new write after previous check */ + if (!is_zero_cluster_top_locked(bs, sector_num) || + (cl_end > 0 && !is_zero_cluster_top_locked(bs, cl_end))) { + qemu_co_mutex_unlock(&s->lock); + return -ENOTSUP; + } + } else { + qemu_co_mutex_lock(&s->lock); } /* Whatever is left can use real zero clusters */ - qemu_co_mutex_lock(&s->lock); - ret = qcow2_zero_clusters(bs, sector_num << BDRV_SECTOR_BITS, - nb_sectors); + ret = qcow2_zero_clusters(bs, sector_num << BDRV_SECTOR_BITS, nb_sectors); qemu_co_mutex_unlock(&s->lock); return ret; diff --git a/block/qed.c b/block/qed.c index 0af52741df..10ce18eb66 100644 --- a/block/qed.c +++ b/block/qed.c @@ -601,18 +601,18 @@ static int qed_create(const char *filename, uint32_t cluster_size, } qed_header_cpu_to_le(&header, &le_header); - ret = blk_pwrite(blk, 0, &le_header, sizeof(le_header)); + ret = blk_pwrite(blk, 0, &le_header, sizeof(le_header), 0); if (ret < 0) { goto out; } ret = blk_pwrite(blk, sizeof(le_header), backing_file, - header.backing_filename_size); + header.backing_filename_size, 0); if (ret < 0) { goto out; } l1_table = g_malloc0(l1_size); - ret = blk_pwrite(blk, header.l1_table_offset, l1_table, l1_size); + ret = blk_pwrite(blk, header.l1_table_offset, l1_table, l1_size, 0); if (ret < 0) { goto out; } @@ -1594,12 +1594,6 @@ static void bdrv_qed_invalidate_cache(BlockDriverState *bs, Error **errp) bdrv_qed_close(bs); - bdrv_invalidate_cache(bs->file->bs, &local_err); - if (local_err) { - error_propagate(errp, local_err); - return; - } - memset(s, 0, sizeof(BDRVQEDState)); ret = bdrv_qed_open(bs, NULL, bs->open_flags, &local_err); if (local_err) { diff --git a/block/quorum.c b/block/quorum.c index da15465a9a..1ec3511528 100644 --- a/block/quorum.c +++ b/block/quorum.c @@ -14,6 +14,7 @@ */ #include "qemu/osdep.h" +#include "qemu/cutils.h" #include "block/block_int.h" #include "qapi/qmp/qbool.h" #include "qapi/qmp/qdict.h" @@ -67,6 +68,9 @@ typedef struct QuorumVotes { typedef struct BDRVQuorumState { BdrvChild **children; /* children BlockDriverStates */ int num_children; /* children count */ + unsigned next_child_index; /* the index of the next child that should + * be added + */ int threshold; /* if less than threshold children reads gave the * same result a quorum error occurs. */ @@ -747,21 +751,6 @@ static int64_t quorum_getlength(BlockDriverState *bs) return result; } -static void quorum_invalidate_cache(BlockDriverState *bs, Error **errp) -{ - BDRVQuorumState *s = bs->opaque; - Error *local_err = NULL; - int i; - - for (i = 0; i < s->num_children; i++) { - bdrv_invalidate_cache(s->children[i]->bs, &local_err); - if (local_err) { - error_propagate(errp, local_err); - return; - } - } -} - static coroutine_fn int quorum_co_flush(BlockDriverState *bs) { BDRVQuorumState *s = bs->opaque; @@ -898,9 +887,9 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags, ret = -EINVAL; goto exit; } - if (s->num_children < 2) { + if (s->num_children < 1) { error_setg(&local_err, - "Number of provided children must be greater than 1"); + "Number of provided children must be 1 or more"); ret = -EINVAL; goto exit; } @@ -964,6 +953,7 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags, opened[i] = true; } + s->next_child_index = s->num_children; g_free(opened); goto exit; @@ -1020,6 +1010,72 @@ static void quorum_attach_aio_context(BlockDriverState *bs, } } +static void quorum_add_child(BlockDriverState *bs, BlockDriverState *child_bs, + Error **errp) +{ + BDRVQuorumState *s = bs->opaque; + BdrvChild *child; + char indexstr[32]; + int ret; + + assert(s->num_children <= INT_MAX / sizeof(BdrvChild *)); + if (s->num_children == INT_MAX / sizeof(BdrvChild *) || + s->next_child_index == UINT_MAX) { + error_setg(errp, "Too many children"); + return; + } + + ret = snprintf(indexstr, 32, "children.%u", s->next_child_index); + if (ret < 0 || ret >= 32) { + error_setg(errp, "cannot generate child name"); + return; + } + s->next_child_index++; + + bdrv_drained_begin(bs); + + /* We can safely add the child now */ + bdrv_ref(child_bs); + child = bdrv_attach_child(bs, child_bs, indexstr, &child_format); + s->children = g_renew(BdrvChild *, s->children, s->num_children + 1); + s->children[s->num_children++] = child; + + bdrv_drained_end(bs); +} + +static void quorum_del_child(BlockDriverState *bs, BdrvChild *child, + Error **errp) +{ + BDRVQuorumState *s = bs->opaque; + int i; + + for (i = 0; i < s->num_children; i++) { + if (s->children[i] == child) { + break; + } + } + + /* we have checked it in bdrv_del_child() */ + assert(i < s->num_children); + + if (s->num_children <= s->threshold) { + error_setg(errp, + "The number of children cannot be lower than the vote threshold %d", + s->threshold); + return; + } + + bdrv_drained_begin(bs); + + /* We can safely remove this child now */ + memmove(&s->children[i], &s->children[i + 1], + (s->num_children - i - 1) * sizeof(BdrvChild *)); + s->children = g_renew(BdrvChild *, s->children, --s->num_children); + bdrv_unref_child(bs, child); + + bdrv_drained_end(bs); +} + static void quorum_refresh_filename(BlockDriverState *bs, QDict *options) { BDRVQuorumState *s = bs->opaque; @@ -1070,11 +1126,13 @@ static BlockDriver bdrv_quorum = { .bdrv_aio_readv = quorum_aio_readv, .bdrv_aio_writev = quorum_aio_writev, - .bdrv_invalidate_cache = quorum_invalidate_cache, .bdrv_detach_aio_context = quorum_detach_aio_context, .bdrv_attach_aio_context = quorum_attach_aio_context, + .bdrv_add_child = quorum_add_child, + .bdrv_del_child = quorum_del_child, + .is_filter = true, .bdrv_recurse_is_first_non_filter = quorum_recurse_is_first_non_filter, }; diff --git a/block/raw-aio.h b/block/raw-aio.h index 811e375018..714714e016 100644 --- a/block/raw-aio.h +++ b/block/raw-aio.h @@ -35,15 +35,16 @@ /* linux-aio.c - Linux native implementation */ #ifdef CONFIG_LINUX_AIO -void *laio_init(void); -void laio_cleanup(void *s); -BlockAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd, +typedef struct LinuxAioState LinuxAioState; +LinuxAioState *laio_init(void); +void laio_cleanup(LinuxAioState *s); +BlockAIOCB *laio_submit(BlockDriverState *bs, LinuxAioState *s, int fd, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockCompletionFunc *cb, void *opaque, int type); -void laio_detach_aio_context(void *s, AioContext *old_context); -void laio_attach_aio_context(void *s, AioContext *new_context); -void laio_io_plug(BlockDriverState *bs, void *aio_ctx); -void laio_io_unplug(BlockDriverState *bs, void *aio_ctx, bool unplug); +void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context); +void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context); +void laio_io_plug(BlockDriverState *bs, LinuxAioState *s); +void laio_io_unplug(BlockDriverState *bs, LinuxAioState *s); #endif #ifdef _WIN32 diff --git a/block/raw-posix.c b/block/raw-posix.c index 906d5c9411..a4f5a1ba5f 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -139,7 +139,7 @@ typedef struct BDRVRawState { #ifdef CONFIG_LINUX_AIO int use_aio; - void *aio_ctx; + LinuxAioState *aio_ctx; #endif #ifdef CONFIG_XFS bool is_xfs:1; @@ -398,7 +398,7 @@ static void raw_attach_aio_context(BlockDriverState *bs, } #ifdef CONFIG_LINUX_AIO -static int raw_set_aio(void **aio_ctx, int *use_aio, int bdrv_flags) +static int raw_set_aio(LinuxAioState **aio_ctx, int *use_aio, int bdrv_flags) { int ret = -1; assert(aio_ctx != NULL); @@ -517,6 +517,7 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, s->has_discard = true; s->has_write_zeroes = true; + bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP; if ((bs->open_flags & BDRV_O_NOCACHE) != 0) { s->needs_alignment = true; } @@ -1345,17 +1346,7 @@ static void raw_aio_unplug(BlockDriverState *bs) #ifdef CONFIG_LINUX_AIO BDRVRawState *s = bs->opaque; if (s->use_aio) { - laio_io_unplug(bs, s->aio_ctx, true); - } -#endif -} - -static void raw_aio_flush_io_queue(BlockDriverState *bs) -{ -#ifdef CONFIG_LINUX_AIO - BDRVRawState *s = bs->opaque; - if (s->use_aio) { - laio_io_unplug(bs, s->aio_ctx, false); + laio_io_unplug(bs, s->aio_ctx); } #endif } @@ -1949,7 +1940,6 @@ BlockDriver bdrv_file = { .bdrv_refresh_limits = raw_refresh_limits, .bdrv_io_plug = raw_aio_plug, .bdrv_io_unplug = raw_aio_unplug, - .bdrv_flush_io_queue = raw_aio_flush_io_queue, .bdrv_truncate = raw_truncate, .bdrv_getlength = raw_getlength, @@ -2398,7 +2388,6 @@ static BlockDriver bdrv_host_device = { .bdrv_refresh_limits = raw_refresh_limits, .bdrv_io_plug = raw_aio_plug, .bdrv_io_unplug = raw_aio_unplug, - .bdrv_flush_io_queue = raw_aio_flush_io_queue, .bdrv_truncate = raw_truncate, .bdrv_getlength = raw_getlength, @@ -2528,7 +2517,6 @@ static BlockDriver bdrv_host_cdrom = { .bdrv_refresh_limits = raw_refresh_limits, .bdrv_io_plug = raw_aio_plug, .bdrv_io_unplug = raw_aio_unplug, - .bdrv_flush_io_queue = raw_aio_flush_io_queue, .bdrv_truncate = raw_truncate, .bdrv_getlength = raw_getlength, @@ -2664,7 +2652,6 @@ static BlockDriver bdrv_host_cdrom = { .bdrv_refresh_limits = raw_refresh_limits, .bdrv_io_plug = raw_aio_plug, .bdrv_io_unplug = raw_aio_unplug, - .bdrv_flush_io_queue = raw_aio_flush_io_queue, .bdrv_truncate = raw_truncate, .bdrv_getlength = raw_getlength, diff --git a/block/raw_bsd.c b/block/raw_bsd.c index a6cc7e9918..3385ed448d 100644 --- a/block/raw_bsd.c +++ b/block/raw_bsd.c @@ -105,8 +105,8 @@ raw_co_writev_flags(BlockDriverState *bs, int64_t sector_num, int nb_sectors, } BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO); - ret = bdrv_co_do_pwritev(bs->file->bs, sector_num * BDRV_SECTOR_SIZE, - nb_sectors * BDRV_SECTOR_SIZE, qiov, flags); + ret = bdrv_co_pwritev(bs->file->bs, sector_num * BDRV_SECTOR_SIZE, + nb_sectors * BDRV_SECTOR_SIZE, qiov, flags); fail: if (qiov == &local_qiov) { @@ -116,13 +116,6 @@ fail: return ret; } -static int coroutine_fn -raw_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, - QEMUIOVector *qiov) -{ - return raw_co_writev_flags(bs, sector_num, nb_sectors, qiov, 0); -} - static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum, @@ -211,6 +204,8 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { bs->sg = bs->file->bs->sg; + bs->supported_write_flags = BDRV_REQ_FUA; + bs->supported_zero_flags = BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP; if (bs->probed && !bdrv_is_read_only(bs)) { fprintf(stderr, @@ -256,9 +251,7 @@ BlockDriver bdrv_raw = { .bdrv_close = &raw_close, .bdrv_create = &raw_create, .bdrv_co_readv = &raw_co_readv, - .bdrv_co_writev = &raw_co_writev, .bdrv_co_writev_flags = &raw_co_writev_flags, - .supported_write_flags = BDRV_REQ_FUA, .bdrv_co_write_zeroes = &raw_co_write_zeroes, .bdrv_co_discard = &raw_co_discard, .bdrv_co_get_block_status = &raw_co_get_block_status, diff --git a/block/sheepdog.c b/block/sheepdog.c index 33e0a33824..23fbace1f9 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -294,13 +294,16 @@ static inline size_t count_data_objs(const struct SheepdogInode *inode) #undef DPRINTF #ifdef DEBUG_SDOG -#define DPRINTF(fmt, args...) \ - do { \ - fprintf(stdout, "%s %d: " fmt, __func__, __LINE__, ##args); \ - } while (0) +#define DEBUG_SDOG_PRINT 1 #else -#define DPRINTF(fmt, args...) +#define DEBUG_SDOG_PRINT 0 #endif +#define DPRINTF(fmt, args...) \ + do { \ + if (DEBUG_SDOG_PRINT) { \ + fprintf(stderr, "%s %d: " fmt, __func__, __LINE__, ##args); \ + } \ + } while (0) typedef struct SheepdogAIOCB SheepdogAIOCB; @@ -1678,7 +1681,7 @@ static int sd_prealloc(const char *filename, Error **errp) if (ret < 0) { goto out; } - ret = blk_pwrite(blk, idx * buf_size, buf, buf_size); + ret = blk_pwrite(blk, idx * buf_size, buf, buf_size, 0); if (ret < 0) { goto out; } diff --git a/block/throttle-groups.c b/block/throttle-groups.c index 4920e09495..9ac063a0cd 100644 --- a/block/throttle-groups.c +++ b/block/throttle-groups.c @@ -219,6 +219,10 @@ static bool throttle_group_schedule_timer(BlockDriverState *bs, ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); bool must_wait; + if (bs->io_limits_disabled) { + return false; + } + /* Check if any of the timers in this group is already armed */ if (tg->any_timer_armed[is_write]) { return true; @@ -313,6 +317,17 @@ void coroutine_fn throttle_group_co_io_limits_intercept(BlockDriverState *bs, qemu_mutex_unlock(&tg->lock); } +void throttle_group_restart_bs(BlockDriverState *bs) +{ + int i; + + for (i = 0; i < 2; i++) { + while (qemu_co_enter_next(&bs->throttled_reqs[i])) { + ; + } + } +} + /* Update the throttle configuration for a particular group. Similar * to throttle_config(), but guarantees atomicity within the * throttling group. @@ -335,6 +350,9 @@ void throttle_group_config(BlockDriverState *bs, ThrottleConfig *cfg) } throttle_config(ts, tt, cfg); qemu_mutex_unlock(&tg->lock); + + qemu_co_enter_next(&bs->throttled_reqs[0]); + qemu_co_enter_next(&bs->throttled_reqs[1]); } /* Get the throttle configuration from a particular group. Similar to diff --git a/block/vdi.c b/block/vdi.c index 75d4819edb..54e11447c3 100644 --- a/block/vdi.c +++ b/block/vdi.c @@ -557,98 +557,109 @@ static int64_t coroutine_fn vdi_co_get_block_status(BlockDriverState *bs, return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset; } -static int vdi_co_read(BlockDriverState *bs, - int64_t sector_num, uint8_t *buf, int nb_sectors) +static int coroutine_fn +vdi_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) { BDRVVdiState *s = bs->opaque; + QEMUIOVector local_qiov; uint32_t bmap_entry; uint32_t block_index; - uint32_t sector_in_block; - uint32_t n_sectors; + uint32_t offset_in_block; + uint32_t n_bytes; + uint64_t bytes_done = 0; int ret = 0; logout("\n"); - while (ret >= 0 && nb_sectors > 0) { - block_index = sector_num / s->block_sectors; - sector_in_block = sector_num % s->block_sectors; - n_sectors = s->block_sectors - sector_in_block; - if (n_sectors > nb_sectors) { - n_sectors = nb_sectors; - } + qemu_iovec_init(&local_qiov, qiov->niov); + + while (ret >= 0 && bytes > 0) { + block_index = offset / s->block_size; + offset_in_block = offset % s->block_size; + n_bytes = MIN(bytes, s->block_size - offset_in_block); - logout("will read %u sectors starting at sector %" PRIu64 "\n", - n_sectors, sector_num); + logout("will read %u bytes starting at offset %" PRIu64 "\n", + n_bytes, offset); /* prepare next AIO request */ bmap_entry = le32_to_cpu(s->bmap[block_index]); if (!VDI_IS_ALLOCATED(bmap_entry)) { /* Block not allocated, return zeros, no need to wait. */ - memset(buf, 0, n_sectors * SECTOR_SIZE); + qemu_iovec_memset(qiov, bytes_done, 0, n_bytes); ret = 0; } else { - uint64_t offset = s->header.offset_data / SECTOR_SIZE + - (uint64_t)bmap_entry * s->block_sectors + - sector_in_block; - ret = bdrv_read(bs->file->bs, offset, buf, n_sectors); + uint64_t data_offset = s->header.offset_data + + (uint64_t)bmap_entry * s->block_size + + offset_in_block; + + qemu_iovec_reset(&local_qiov); + qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes); + + ret = bdrv_co_preadv(bs->file->bs, data_offset, n_bytes, + &local_qiov, 0); } - logout("%u sectors read\n", n_sectors); + logout("%u bytes read\n", n_bytes); - nb_sectors -= n_sectors; - sector_num += n_sectors; - buf += n_sectors * SECTOR_SIZE; + bytes -= n_bytes; + offset += n_bytes; + bytes_done += n_bytes; } + qemu_iovec_destroy(&local_qiov); + return ret; } -static int vdi_co_write(BlockDriverState *bs, - int64_t sector_num, const uint8_t *buf, int nb_sectors) +static int coroutine_fn +vdi_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) { BDRVVdiState *s = bs->opaque; + QEMUIOVector local_qiov; uint32_t bmap_entry; uint32_t block_index; - uint32_t sector_in_block; - uint32_t n_sectors; + uint32_t offset_in_block; + uint32_t n_bytes; uint32_t bmap_first = VDI_UNALLOCATED; uint32_t bmap_last = VDI_UNALLOCATED; uint8_t *block = NULL; + uint64_t bytes_done = 0; int ret = 0; logout("\n"); - while (ret >= 0 && nb_sectors > 0) { - block_index = sector_num / s->block_sectors; - sector_in_block = sector_num % s->block_sectors; - n_sectors = s->block_sectors - sector_in_block; - if (n_sectors > nb_sectors) { - n_sectors = nb_sectors; - } + qemu_iovec_init(&local_qiov, qiov->niov); + + while (ret >= 0 && bytes > 0) { + block_index = offset / s->block_size; + offset_in_block = offset % s->block_size; + n_bytes = MIN(bytes, s->block_size - offset_in_block); - logout("will write %u sectors starting at sector %" PRIu64 "\n", - n_sectors, sector_num); + logout("will write %u bytes starting at offset %" PRIu64 "\n", + n_bytes, offset); /* prepare next AIO request */ bmap_entry = le32_to_cpu(s->bmap[block_index]); if (!VDI_IS_ALLOCATED(bmap_entry)) { /* Allocate new block and write to it. */ - uint64_t offset; + uint64_t data_offset; bmap_entry = s->header.blocks_allocated; s->bmap[block_index] = cpu_to_le32(bmap_entry); s->header.blocks_allocated++; - offset = s->header.offset_data / SECTOR_SIZE + - (uint64_t)bmap_entry * s->block_sectors; + data_offset = s->header.offset_data + + (uint64_t)bmap_entry * s->block_size; if (block == NULL) { block = g_malloc(s->block_size); bmap_first = block_index; } bmap_last = block_index; /* Copy data to be written to new block and zero unused parts. */ - memset(block, 0, sector_in_block * SECTOR_SIZE); - memcpy(block + sector_in_block * SECTOR_SIZE, - buf, n_sectors * SECTOR_SIZE); - memset(block + (sector_in_block + n_sectors) * SECTOR_SIZE, 0, - (s->block_sectors - n_sectors - sector_in_block) * SECTOR_SIZE); + memset(block, 0, offset_in_block); + qemu_iovec_to_buf(qiov, bytes_done, block + offset_in_block, + n_bytes); + memset(block + offset_in_block + n_bytes, 0, + s->block_size - n_bytes - offset_in_block); /* Note that this coroutine does not yield anywhere from reading the * bmap entry until here, so in regards to all the coroutines trying @@ -658,12 +669,12 @@ static int vdi_co_write(BlockDriverState *bs, * acquire the lock and thus the padded cluster is written before * the other coroutines can write to the affected area. */ qemu_co_mutex_lock(&s->write_lock); - ret = bdrv_write(bs->file->bs, offset, block, s->block_sectors); + ret = bdrv_pwrite(bs->file->bs, data_offset, block, s->block_size); qemu_co_mutex_unlock(&s->write_lock); } else { - uint64_t offset = s->header.offset_data / SECTOR_SIZE + - (uint64_t)bmap_entry * s->block_sectors + - sector_in_block; + uint64_t data_offset = s->header.offset_data + + (uint64_t)bmap_entry * s->block_size + + offset_in_block; qemu_co_mutex_lock(&s->write_lock); /* This lock is only used to make sure the following write operation * is executed after the write issued by the coroutine allocating @@ -674,16 +685,23 @@ static int vdi_co_write(BlockDriverState *bs, * that that write operation has returned (there may be other writes * in flight, but they do not concern this very operation). */ qemu_co_mutex_unlock(&s->write_lock); - ret = bdrv_write(bs->file->bs, offset, buf, n_sectors); + + qemu_iovec_reset(&local_qiov); + qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes); + + ret = bdrv_co_pwritev(bs->file->bs, data_offset, n_bytes, + &local_qiov, 0); } - nb_sectors -= n_sectors; - sector_num += n_sectors; - buf += n_sectors * SECTOR_SIZE; + bytes -= n_bytes; + offset += n_bytes; + bytes_done += n_bytes; - logout("%u sectors written\n", n_sectors); + logout("%u bytes written\n", n_bytes); } + qemu_iovec_destroy(&local_qiov); + logout("finished data write\n"); if (ret < 0) { return ret; @@ -694,6 +712,7 @@ static int vdi_co_write(BlockDriverState *bs, VdiHeader *header = (VdiHeader *) block; uint8_t *base; uint64_t offset; + uint32_t n_sectors; logout("now writing modified header\n"); assert(VDI_IS_ALLOCATED(bmap_first)); @@ -808,7 +827,7 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp) vdi_header_print(&header); #endif vdi_header_to_le(&header); - ret = blk_pwrite(blk, offset, &header, sizeof(header)); + ret = blk_pwrite(blk, offset, &header, sizeof(header), 0); if (ret < 0) { error_setg(errp, "Error writing header to %s", filename); goto exit; @@ -829,7 +848,7 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp) bmap[i] = VDI_UNALLOCATED; } } - ret = blk_pwrite(blk, offset, bmap, bmap_size); + ret = blk_pwrite(blk, offset, bmap, bmap_size, 0); if (ret < 0) { error_setg(errp, "Error writing bmap to %s", filename); goto exit; @@ -903,9 +922,9 @@ static BlockDriver bdrv_vdi = { .bdrv_co_get_block_status = vdi_co_get_block_status, .bdrv_make_empty = vdi_make_empty, - .bdrv_read = vdi_co_read, + .bdrv_co_preadv = vdi_co_preadv, #if defined(CONFIG_VDI_WRITE) - .bdrv_write = vdi_co_write, + .bdrv_co_pwritev = vdi_co_pwritev, #endif .bdrv_get_info = vdi_get_info, diff --git a/block/vhdx.c b/block/vhdx.c index 2b7b332404..ec778fe2a7 100644 --- a/block/vhdx.c +++ b/block/vhdx.c @@ -1856,13 +1856,14 @@ static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp) creator = g_utf8_to_utf16("QEMU v" QEMU_VERSION, -1, NULL, &creator_items, NULL); signature = cpu_to_le64(VHDX_FILE_SIGNATURE); - ret = blk_pwrite(blk, VHDX_FILE_ID_OFFSET, &signature, sizeof(signature)); + ret = blk_pwrite(blk, VHDX_FILE_ID_OFFSET, &signature, sizeof(signature), + 0); if (ret < 0) { goto delete_and_exit; } if (creator) { ret = blk_pwrite(blk, VHDX_FILE_ID_OFFSET + sizeof(signature), - creator, creator_items * sizeof(gunichar2)); + creator, creator_items * sizeof(gunichar2), 0); if (ret < 0) { goto delete_and_exit; } diff --git a/block/vmdk.c b/block/vmdk.c index 45f9d3c5b9..e6c97c25a6 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1016,27 +1016,26 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp) */ static int get_whole_cluster(BlockDriverState *bs, VmdkExtent *extent, - uint64_t cluster_sector_num, - uint64_t sector_num, - uint64_t skip_start_sector, - uint64_t skip_end_sector) + uint64_t cluster_offset, + uint64_t offset, + uint64_t skip_start_bytes, + uint64_t skip_end_bytes) { int ret = VMDK_OK; int64_t cluster_bytes; uint8_t *whole_grain; /* For COW, align request sector_num to cluster start */ - sector_num = QEMU_ALIGN_DOWN(sector_num, extent->cluster_sectors); cluster_bytes = extent->cluster_sectors << BDRV_SECTOR_BITS; + offset = QEMU_ALIGN_DOWN(offset, cluster_bytes); whole_grain = qemu_blockalign(bs, cluster_bytes); if (!bs->backing) { - memset(whole_grain, 0, skip_start_sector << BDRV_SECTOR_BITS); - memset(whole_grain + (skip_end_sector << BDRV_SECTOR_BITS), 0, - cluster_bytes - (skip_end_sector << BDRV_SECTOR_BITS)); + memset(whole_grain, 0, skip_start_bytes); + memset(whole_grain + skip_end_bytes, 0, cluster_bytes - skip_end_bytes); } - assert(skip_end_sector <= extent->cluster_sectors); + assert(skip_end_bytes <= cluster_bytes); /* we will be here if it's first write on non-exist grain(cluster). * try to read from parent image, if exist */ if (bs->backing && !vmdk_is_cid_valid(bs)) { @@ -1045,42 +1044,43 @@ static int get_whole_cluster(BlockDriverState *bs, } /* Read backing data before skip range */ - if (skip_start_sector > 0) { + if (skip_start_bytes > 0) { if (bs->backing) { - ret = bdrv_read(bs->backing->bs, sector_num, - whole_grain, skip_start_sector); + ret = bdrv_pread(bs->backing->bs, offset, whole_grain, + skip_start_bytes); if (ret < 0) { ret = VMDK_ERROR; goto exit; } } - ret = bdrv_write(extent->file->bs, cluster_sector_num, whole_grain, - skip_start_sector); + ret = bdrv_pwrite(extent->file->bs, cluster_offset, whole_grain, + skip_start_bytes); if (ret < 0) { ret = VMDK_ERROR; goto exit; } } /* Read backing data after skip range */ - if (skip_end_sector < extent->cluster_sectors) { + if (skip_end_bytes < cluster_bytes) { if (bs->backing) { - ret = bdrv_read(bs->backing->bs, sector_num + skip_end_sector, - whole_grain + (skip_end_sector << BDRV_SECTOR_BITS), - extent->cluster_sectors - skip_end_sector); + ret = bdrv_pread(bs->backing->bs, offset + skip_end_bytes, + whole_grain + skip_end_bytes, + cluster_bytes - skip_end_bytes); if (ret < 0) { ret = VMDK_ERROR; goto exit; } } - ret = bdrv_write(extent->file->bs, cluster_sector_num + skip_end_sector, - whole_grain + (skip_end_sector << BDRV_SECTOR_BITS), - extent->cluster_sectors - skip_end_sector); + ret = bdrv_pwrite(extent->file->bs, cluster_offset + skip_end_bytes, + whole_grain + skip_end_bytes, + cluster_bytes - skip_end_bytes); if (ret < 0) { ret = VMDK_ERROR; goto exit; } } + ret = VMDK_OK; exit: qemu_vfree(whole_grain); return ret; @@ -1142,8 +1142,8 @@ static int get_cluster_offset(BlockDriverState *bs, uint64_t offset, bool allocate, uint64_t *cluster_offset, - uint64_t skip_start_sector, - uint64_t skip_end_sector) + uint64_t skip_start_bytes, + uint64_t skip_end_bytes) { unsigned int l1_index, l2_offset, l2_index; int min_index, i, j; @@ -1230,10 +1230,8 @@ static int get_cluster_offset(BlockDriverState *bs, * This problem may occur because of insufficient space on host disk * or inappropriate VM shutdown. */ - ret = get_whole_cluster(bs, extent, - cluster_sector, - offset >> BDRV_SECTOR_BITS, - skip_start_sector, skip_end_sector); + ret = get_whole_cluster(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, + offset, skip_start_bytes, skip_end_bytes); if (ret) { return ret; } @@ -1259,15 +1257,26 @@ static VmdkExtent *find_extent(BDRVVmdkState *s, return NULL; } +static inline uint64_t vmdk_find_offset_in_cluster(VmdkExtent *extent, + int64_t offset) +{ + uint64_t offset_in_cluster, extent_begin_offset, extent_relative_offset; + uint64_t cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE; + + extent_begin_offset = + (extent->end_sector - extent->sectors) * BDRV_SECTOR_SIZE; + extent_relative_offset = offset - extent_begin_offset; + offset_in_cluster = extent_relative_offset % cluster_size; + + return offset_in_cluster; +} + static inline uint64_t vmdk_find_index_in_cluster(VmdkExtent *extent, int64_t sector_num) { - uint64_t index_in_cluster, extent_begin_sector, extent_relative_sector_num; - - extent_begin_sector = extent->end_sector - extent->sectors; - extent_relative_sector_num = sector_num - extent_begin_sector; - index_in_cluster = extent_relative_sector_num % extent->cluster_sectors; - return index_in_cluster; + uint64_t offset; + offset = vmdk_find_offset_in_cluster(extent, sector_num * BDRV_SECTOR_SIZE); + return offset / BDRV_SECTOR_SIZE; } static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs, @@ -1319,38 +1328,57 @@ static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs, } static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset, - int64_t offset_in_cluster, const uint8_t *buf, - int nb_sectors, int64_t sector_num) + int64_t offset_in_cluster, QEMUIOVector *qiov, + uint64_t qiov_offset, uint64_t n_bytes, + uint64_t offset) { int ret; VmdkGrainMarker *data = NULL; uLongf buf_len; - const uint8_t *write_buf = buf; - int write_len = nb_sectors * 512; + QEMUIOVector local_qiov; + struct iovec iov; int64_t write_offset; int64_t write_end_sector; if (extent->compressed) { + void *compressed_data; + if (!extent->has_marker) { ret = -EINVAL; goto out; } buf_len = (extent->cluster_sectors << 9) * 2; data = g_malloc(buf_len + sizeof(VmdkGrainMarker)); - if (compress(data->data, &buf_len, buf, nb_sectors << 9) != Z_OK || - buf_len == 0) { + + compressed_data = g_malloc(n_bytes); + qemu_iovec_to_buf(qiov, qiov_offset, compressed_data, n_bytes); + ret = compress(data->data, &buf_len, compressed_data, n_bytes); + g_free(compressed_data); + + if (ret != Z_OK || buf_len == 0) { ret = -EINVAL; goto out; } - data->lba = sector_num; + + data->lba = offset >> BDRV_SECTOR_BITS; data->size = buf_len; - write_buf = (uint8_t *)data; - write_len = buf_len + sizeof(VmdkGrainMarker); + + n_bytes = buf_len + sizeof(VmdkGrainMarker); + iov = (struct iovec) { + .iov_base = data, + .iov_len = n_bytes, + }; + qemu_iovec_init_external(&local_qiov, &iov, 1); + } else { + qemu_iovec_init(&local_qiov, qiov->niov); + qemu_iovec_concat(&local_qiov, qiov, qiov_offset, n_bytes); } + write_offset = cluster_offset + offset_in_cluster, - ret = bdrv_pwrite(extent->file->bs, write_offset, write_buf, write_len); + ret = bdrv_co_pwritev(extent->file->bs, write_offset, n_bytes, + &local_qiov, 0); - write_end_sector = DIV_ROUND_UP(write_offset + write_len, BDRV_SECTOR_SIZE); + write_end_sector = DIV_ROUND_UP(write_offset + n_bytes, BDRV_SECTOR_SIZE); if (extent->compressed) { extent->next_cluster_sector = write_end_sector; @@ -1359,19 +1387,21 @@ static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset, write_end_sector); } - if (ret != write_len) { - ret = ret < 0 ? ret : -EIO; + if (ret < 0) { goto out; } ret = 0; out: g_free(data); + if (!extent->compressed) { + qemu_iovec_destroy(&local_qiov); + } return ret; } static int vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset, - int64_t offset_in_cluster, uint8_t *buf, - int nb_sectors) + int64_t offset_in_cluster, QEMUIOVector *qiov, + int bytes) { int ret; int cluster_bytes, buf_bytes; @@ -1383,14 +1413,13 @@ static int vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset, if (!extent->compressed) { - ret = bdrv_pread(extent->file->bs, - cluster_offset + offset_in_cluster, - buf, nb_sectors * 512); - if (ret == nb_sectors * 512) { - return 0; - } else { - return -EIO; + ret = bdrv_co_preadv(extent->file->bs, + cluster_offset + offset_in_cluster, bytes, + qiov, 0); + if (ret < 0) { + return ret; } + return 0; } cluster_bytes = extent->cluster_sectors * 512; /* Read two clusters in case GrainMarker + compressed data > one cluster */ @@ -1422,11 +1451,11 @@ static int vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset, } if (offset_in_cluster < 0 || - offset_in_cluster + nb_sectors * 512 > buf_len) { + offset_in_cluster + bytes > buf_len) { ret = -EINVAL; goto out; } - memcpy(buf, uncomp_buf + offset_in_cluster, nb_sectors * 512); + qemu_iovec_from_buf(qiov, 0, uncomp_buf + offset_in_cluster, bytes); ret = 0; out: @@ -1435,64 +1464,73 @@ static int vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset, return ret; } -static int vmdk_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) +static int coroutine_fn +vmdk_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) { BDRVVmdkState *s = bs->opaque; int ret; - uint64_t n, index_in_cluster; + uint64_t n_bytes, offset_in_cluster; VmdkExtent *extent = NULL; + QEMUIOVector local_qiov; uint64_t cluster_offset; + uint64_t bytes_done = 0; - while (nb_sectors > 0) { - extent = find_extent(s, sector_num, extent); + qemu_iovec_init(&local_qiov, qiov->niov); + qemu_co_mutex_lock(&s->lock); + + while (bytes > 0) { + extent = find_extent(s, offset >> BDRV_SECTOR_BITS, extent); if (!extent) { - return -EIO; + ret = -EIO; + goto fail; } ret = get_cluster_offset(bs, extent, NULL, - sector_num << 9, false, &cluster_offset, - 0, 0); - index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num); - n = extent->cluster_sectors - index_in_cluster; - if (n > nb_sectors) { - n = nb_sectors; - } + offset, false, &cluster_offset, 0, 0); + offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset); + + n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE + - offset_in_cluster); + if (ret != VMDK_OK) { /* if not allocated, try to read from parent image, if exist */ if (bs->backing && ret != VMDK_ZEROED) { if (!vmdk_is_cid_valid(bs)) { - return -EINVAL; + ret = -EINVAL; + goto fail; } - ret = bdrv_read(bs->backing->bs, sector_num, buf, n); + + qemu_iovec_reset(&local_qiov); + qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes); + + ret = bdrv_co_preadv(bs->backing->bs, offset, n_bytes, + &local_qiov, 0); if (ret < 0) { - return ret; + goto fail; } } else { - memset(buf, 0, 512 * n); + qemu_iovec_memset(qiov, bytes_done, 0, n_bytes); } } else { - ret = vmdk_read_extent(extent, - cluster_offset, index_in_cluster * 512, - buf, n); + qemu_iovec_reset(&local_qiov); + qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes); + + ret = vmdk_read_extent(extent, cluster_offset, offset_in_cluster, + &local_qiov, n_bytes); if (ret) { - return ret; + goto fail; } } - nb_sectors -= n; - sector_num += n; - buf += n * 512; + bytes -= n_bytes; + offset += n_bytes; + bytes_done += n_bytes; } - return 0; -} -static coroutine_fn int vmdk_co_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) -{ - int ret; - BDRVVmdkState *s = bs->opaque; - qemu_co_mutex_lock(&s->lock); - ret = vmdk_read(bs, sector_num, buf, nb_sectors); + ret = 0; +fail: qemu_co_mutex_unlock(&s->lock); + qemu_iovec_destroy(&local_qiov); + return ret; } @@ -1506,38 +1544,38 @@ static coroutine_fn int vmdk_co_read(BlockDriverState *bs, int64_t sector_num, * * Returns: error code with 0 for success. */ -static int vmdk_write(BlockDriverState *bs, int64_t sector_num, - const uint8_t *buf, int nb_sectors, - bool zeroed, bool zero_dry_run) +static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, + uint64_t bytes, QEMUIOVector *qiov, + bool zeroed, bool zero_dry_run) { BDRVVmdkState *s = bs->opaque; VmdkExtent *extent = NULL; int ret; - int64_t index_in_cluster, n; + int64_t offset_in_cluster, n_bytes; uint64_t cluster_offset; + uint64_t bytes_done = 0; VmdkMetaData m_data; - if (sector_num > bs->total_sectors) { - error_report("Wrong offset: sector_num=0x%" PRIx64 + if (DIV_ROUND_UP(offset, BDRV_SECTOR_SIZE) > bs->total_sectors) { + error_report("Wrong offset: offset=0x%" PRIx64 " total_sectors=0x%" PRIx64, - sector_num, bs->total_sectors); + offset, bs->total_sectors); return -EIO; } - while (nb_sectors > 0) { - extent = find_extent(s, sector_num, extent); + while (bytes > 0) { + extent = find_extent(s, offset >> BDRV_SECTOR_BITS, extent); if (!extent) { return -EIO; } - index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num); - n = extent->cluster_sectors - index_in_cluster; - if (n > nb_sectors) { - n = nb_sectors; - } - ret = get_cluster_offset(bs, extent, &m_data, sector_num << 9, + offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset); + n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE + - offset_in_cluster); + + ret = get_cluster_offset(bs, extent, &m_data, offset, !(extent->compressed || zeroed), - &cluster_offset, - index_in_cluster, index_in_cluster + n); + &cluster_offset, offset_in_cluster, + offset_in_cluster + n_bytes); if (extent->compressed) { if (ret == VMDK_OK) { /* Refuse write to allocated cluster for streamOptimized */ @@ -1546,7 +1584,7 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num, return -EIO; } else { /* allocate */ - ret = get_cluster_offset(bs, extent, &m_data, sector_num << 9, + ret = get_cluster_offset(bs, extent, &m_data, offset, true, &cluster_offset, 0, 0); } } @@ -1556,9 +1594,9 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num, if (zeroed) { /* Do zeroed write, buf is ignored */ if (extent->has_zero_grain && - index_in_cluster == 0 && - n >= extent->cluster_sectors) { - n = extent->cluster_sectors; + offset_in_cluster == 0 && + n_bytes >= extent->cluster_sectors * BDRV_SECTOR_SIZE) { + n_bytes = extent->cluster_sectors * BDRV_SECTOR_SIZE; if (!zero_dry_run) { /* update L2 tables */ if (vmdk_L2update(extent, &m_data, VMDK_GTE_ZEROED) @@ -1570,9 +1608,8 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num, return -ENOTSUP; } } else { - ret = vmdk_write_extent(extent, - cluster_offset, index_in_cluster * 512, - buf, n, sector_num); + ret = vmdk_write_extent(extent, cluster_offset, offset_in_cluster, + qiov, bytes_done, n_bytes, offset); if (ret) { return ret; } @@ -1585,9 +1622,9 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num, } } } - nb_sectors -= n; - sector_num += n; - buf += n * 512; + bytes -= n_bytes; + offset += n_bytes; + bytes_done += n_bytes; /* update CID on the first write every time the virtual disk is * opened */ @@ -1602,25 +1639,65 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num, return 0; } -static coroutine_fn int vmdk_co_write(BlockDriverState *bs, int64_t sector_num, - const uint8_t *buf, int nb_sectors) +static int coroutine_fn +vmdk_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) { int ret; BDRVVmdkState *s = bs->opaque; qemu_co_mutex_lock(&s->lock); - ret = vmdk_write(bs, sector_num, buf, nb_sectors, false, false); + ret = vmdk_pwritev(bs, offset, bytes, qiov, false, false); qemu_co_mutex_unlock(&s->lock); return ret; } +typedef struct VmdkWriteCompressedCo { + BlockDriverState *bs; + int64_t sector_num; + const uint8_t *buf; + int nb_sectors; + int ret; +} VmdkWriteCompressedCo; + +static void vmdk_co_write_compressed(void *opaque) +{ + VmdkWriteCompressedCo *co = opaque; + QEMUIOVector local_qiov; + uint64_t offset = co->sector_num * BDRV_SECTOR_SIZE; + uint64_t bytes = co->nb_sectors * BDRV_SECTOR_SIZE; + + struct iovec iov = (struct iovec) { + .iov_base = (uint8_t*) co->buf, + .iov_len = bytes, + }; + qemu_iovec_init_external(&local_qiov, &iov, 1); + + co->ret = vmdk_pwritev(co->bs, offset, bytes, &local_qiov, false, false); +} + static int vmdk_write_compressed(BlockDriverState *bs, int64_t sector_num, const uint8_t *buf, int nb_sectors) { BDRVVmdkState *s = bs->opaque; + if (s->num_extents == 1 && s->extents[0].compressed) { - return vmdk_write(bs, sector_num, buf, nb_sectors, false, false); + Coroutine *co; + AioContext *aio_context = bdrv_get_aio_context(bs); + VmdkWriteCompressedCo data = { + .bs = bs, + .sector_num = sector_num, + .buf = buf, + .nb_sectors = nb_sectors, + .ret = -EINPROGRESS, + }; + co = qemu_coroutine_create(vmdk_co_write_compressed); + qemu_coroutine_enter(co, &data); + while (data.ret == -EINPROGRESS) { + aio_poll(aio_context, true); + } + return data.ret; } else { return -ENOTSUP; } @@ -1633,12 +1710,15 @@ static int coroutine_fn vmdk_co_write_zeroes(BlockDriverState *bs, { int ret; BDRVVmdkState *s = bs->opaque; + uint64_t offset = sector_num * BDRV_SECTOR_SIZE; + uint64_t bytes = nb_sectors * BDRV_SECTOR_SIZE; + qemu_co_mutex_lock(&s->lock); /* write zeroes could fail if sectors not aligned to cluster, test it with * dry_run == true before really updating image */ - ret = vmdk_write(bs, sector_num, NULL, nb_sectors, true, true); + ret = vmdk_pwritev(bs, offset, bytes, NULL, true, true); if (!ret) { - ret = vmdk_write(bs, sector_num, NULL, nb_sectors, true, false); + ret = vmdk_pwritev(bs, offset, bytes, NULL, true, false); } qemu_co_mutex_unlock(&s->lock); return ret; @@ -1728,12 +1808,12 @@ static int vmdk_create_extent(const char *filename, int64_t filesize, header.check_bytes[3] = 0xa; /* write all the data */ - ret = blk_pwrite(blk, 0, &magic, sizeof(magic)); + ret = blk_pwrite(blk, 0, &magic, sizeof(magic), 0); if (ret < 0) { error_setg(errp, QERR_IO_ERROR); goto exit; } - ret = blk_pwrite(blk, sizeof(magic), &header, sizeof(header)); + ret = blk_pwrite(blk, sizeof(magic), &header, sizeof(header), 0); if (ret < 0) { error_setg(errp, QERR_IO_ERROR); goto exit; @@ -1753,7 +1833,7 @@ static int vmdk_create_extent(const char *filename, int64_t filesize, gd_buf[i] = cpu_to_le32(tmp); } ret = blk_pwrite(blk, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE, - gd_buf, gd_buf_size); + gd_buf, gd_buf_size, 0); if (ret < 0) { error_setg(errp, QERR_IO_ERROR); goto exit; @@ -1765,7 +1845,7 @@ static int vmdk_create_extent(const char *filename, int64_t filesize, gd_buf[i] = cpu_to_le32(tmp); } ret = blk_pwrite(blk, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE, - gd_buf, gd_buf_size); + gd_buf, gd_buf_size, 0); if (ret < 0) { error_setg(errp, QERR_IO_ERROR); goto exit; @@ -1829,8 +1909,8 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp) int64_t total_size = 0, filesize; char *adapter_type = NULL; char *backing_file = NULL; + char *hw_version = NULL; char *fmt = NULL; - int flags = 0; int ret = 0; bool flat, split, compress; GString *ext_desc_lines; @@ -1861,7 +1941,7 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp) "# The Disk Data Base\n" "#DDB\n" "\n" - "ddb.virtualHWVersion = \"%d\"\n" + "ddb.virtualHWVersion = \"%s\"\n" "ddb.geometry.cylinders = \"%" PRId64 "\"\n" "ddb.geometry.heads = \"%" PRIu32 "\"\n" "ddb.geometry.sectors = \"63\"\n" @@ -1878,8 +1958,20 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp) BDRV_SECTOR_SIZE); adapter_type = qemu_opt_get_del(opts, BLOCK_OPT_ADAPTER_TYPE); backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE); + hw_version = qemu_opt_get_del(opts, BLOCK_OPT_HWVERSION); if (qemu_opt_get_bool_del(opts, BLOCK_OPT_COMPAT6, false)) { - flags |= BLOCK_FLAG_COMPAT6; + if (strcmp(hw_version, "undefined")) { + error_setg(errp, + "compat6 cannot be enabled with hwversion set"); + ret = -EINVAL; + goto exit; + } + g_free(hw_version); + hw_version = g_strdup("6"); + } + if (strcmp(hw_version, "undefined") == 0) { + g_free(hw_version); + hw_version = g_strdup("4"); } fmt = qemu_opt_get_del(opts, BLOCK_OPT_SUBFMT); if (qemu_opt_get_bool_del(opts, BLOCK_OPT_ZEROED_GRAIN, false)) { @@ -2001,7 +2093,7 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp) fmt, parent_desc_line, ext_desc_lines->str, - (flags & BLOCK_FLAG_COMPAT6 ? 6 : 4), + hw_version, total_size / (int64_t)(63 * number_heads * BDRV_SECTOR_SIZE), number_heads, @@ -2028,7 +2120,7 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp) blk_set_allow_write_beyond_eof(new_blk, true); - ret = blk_pwrite(new_blk, desc_offset, desc, desc_len); + ret = blk_pwrite(new_blk, desc_offset, desc, desc_len, 0); if (ret < 0) { error_setg_errno(errp, -ret, "Could not write description"); goto exit; @@ -2047,6 +2139,7 @@ exit: } g_free(adapter_type); g_free(backing_file); + g_free(hw_version); g_free(fmt); g_free(desc); g_free(path); @@ -2298,6 +2391,12 @@ static QemuOptsList vmdk_create_opts = { .def_value_str = "off" }, { + .name = BLOCK_OPT_HWVERSION, + .type = QEMU_OPT_STRING, + .help = "VMDK hardware version", + .def_value_str = "undefined" + }, + { .name = BLOCK_OPT_SUBFMT, .type = QEMU_OPT_STRING, .help = @@ -2321,8 +2420,8 @@ static BlockDriver bdrv_vmdk = { .bdrv_open = vmdk_open, .bdrv_check = vmdk_check, .bdrv_reopen_prepare = vmdk_reopen_prepare, - .bdrv_read = vmdk_co_read, - .bdrv_write = vmdk_co_write, + .bdrv_co_preadv = vmdk_co_preadv, + .bdrv_co_pwritev = vmdk_co_pwritev, .bdrv_write_compressed = vmdk_write_compressed, .bdrv_co_write_zeroes = vmdk_co_write_zeroes, .bdrv_close = vmdk_close, diff --git a/block/vpc.c b/block/vpc.c index 3e2ea698d9..0379813e2f 100644 --- a/block/vpc.c +++ b/block/vpc.c @@ -454,22 +454,21 @@ static int vpc_reopen_prepare(BDRVReopenState *state, * The parameter write must be 1 if the offset will be used for a write * operation (the block bitmaps is updated then), 0 otherwise. */ -static inline int64_t get_sector_offset(BlockDriverState *bs, - int64_t sector_num, int write) +static inline int64_t get_image_offset(BlockDriverState *bs, uint64_t offset, + bool write) { BDRVVPCState *s = bs->opaque; - uint64_t offset = sector_num * 512; uint64_t bitmap_offset, block_offset; - uint32_t pagetable_index, pageentry_index; + uint32_t pagetable_index, offset_in_block; pagetable_index = offset / s->block_size; - pageentry_index = (offset % s->block_size) / 512; + offset_in_block = offset % s->block_size; if (pagetable_index >= s->max_table_entries || s->pagetable[pagetable_index] == 0xffffffff) return -1; /* not allocated */ bitmap_offset = 512 * (uint64_t) s->pagetable[pagetable_index]; - block_offset = bitmap_offset + s->bitmap_size + (512 * pageentry_index); + block_offset = bitmap_offset + s->bitmap_size + offset_in_block; /* We must ensure that we don't write to any sectors which are marked as unused in the bitmap. We get away with setting all bits in the block @@ -487,6 +486,12 @@ static inline int64_t get_sector_offset(BlockDriverState *bs, return block_offset; } +static inline int64_t get_sector_offset(BlockDriverState *bs, + int64_t sector_num, bool write) +{ + return get_image_offset(bs, sector_num * BDRV_SECTOR_SIZE, write); +} + /* * Writes the footer to the end of the image file. This is needed when the * file grows as it overwrites the old footer @@ -513,7 +518,7 @@ static int rewrite_footer(BlockDriverState* bs) * * Returns the sectors' offset in the image file on success and < 0 on error */ -static int64_t alloc_block(BlockDriverState* bs, int64_t sector_num) +static int64_t alloc_block(BlockDriverState* bs, int64_t offset) { BDRVVPCState *s = bs->opaque; int64_t bat_offset; @@ -522,14 +527,13 @@ static int64_t alloc_block(BlockDriverState* bs, int64_t sector_num) uint8_t bitmap[s->bitmap_size]; /* Check if sector_num is valid */ - if ((sector_num < 0) || (sector_num > bs->total_sectors)) - return -1; + if ((offset < 0) || (offset > bs->total_sectors * BDRV_SECTOR_SIZE)) { + return -EINVAL; + } /* Write entry into in-memory BAT */ - index = (sector_num * 512) / s->block_size; - if (s->pagetable[index] != 0xFFFFFFFF) - return -1; - + index = offset / s->block_size; + assert(s->pagetable[index] == 0xFFFFFFFF); s->pagetable[index] = s->free_data_block_offset / 512; /* Initialize the block's bitmap */ @@ -553,11 +557,11 @@ static int64_t alloc_block(BlockDriverState* bs, int64_t sector_num) if (ret < 0) goto fail; - return get_sector_offset(bs, sector_num, 0); + return get_image_offset(bs, offset, false); fail: s->free_data_block_offset -= (s->block_size + s->bitmap_size); - return -1; + return ret; } static int vpc_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) @@ -573,104 +577,105 @@ static int vpc_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) return 0; } -static int vpc_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) +static int coroutine_fn +vpc_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) { BDRVVPCState *s = bs->opaque; int ret; - int64_t offset; - int64_t sectors, sectors_per_block; + int64_t image_offset; + int64_t n_bytes; + int64_t bytes_done = 0; VHDFooter *footer = (VHDFooter *) s->footer_buf; + QEMUIOVector local_qiov; if (be32_to_cpu(footer->type) == VHD_FIXED) { - return bdrv_read(bs->file->bs, sector_num, buf, nb_sectors); + return bdrv_co_preadv(bs->file->bs, offset, bytes, qiov, 0); } - while (nb_sectors > 0) { - offset = get_sector_offset(bs, sector_num, 0); - sectors_per_block = s->block_size >> BDRV_SECTOR_BITS; - sectors = sectors_per_block - (sector_num % sectors_per_block); - if (sectors > nb_sectors) { - sectors = nb_sectors; - } + qemu_co_mutex_lock(&s->lock); + qemu_iovec_init(&local_qiov, qiov->niov); + + while (bytes > 0) { + image_offset = get_image_offset(bs, offset, false); + n_bytes = MIN(bytes, s->block_size - (offset % s->block_size)); - if (offset == -1) { - memset(buf, 0, sectors * BDRV_SECTOR_SIZE); + if (image_offset == -1) { + qemu_iovec_memset(qiov, bytes_done, 0, n_bytes); } else { - ret = bdrv_pread(bs->file->bs, offset, buf, - sectors * BDRV_SECTOR_SIZE); - if (ret != sectors * BDRV_SECTOR_SIZE) { - return -1; + qemu_iovec_reset(&local_qiov); + qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes); + + ret = bdrv_co_preadv(bs->file->bs, image_offset, n_bytes, + &local_qiov, 0); + if (ret < 0) { + goto fail; } } - nb_sectors -= sectors; - sector_num += sectors; - buf += sectors * BDRV_SECTOR_SIZE; + bytes -= n_bytes; + offset += n_bytes; + bytes_done += n_bytes; } - return 0; -} -static coroutine_fn int vpc_co_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) -{ - int ret; - BDRVVPCState *s = bs->opaque; - qemu_co_mutex_lock(&s->lock); - ret = vpc_read(bs, sector_num, buf, nb_sectors); + ret = 0; +fail: + qemu_iovec_destroy(&local_qiov); qemu_co_mutex_unlock(&s->lock); + return ret; } -static int vpc_write(BlockDriverState *bs, int64_t sector_num, - const uint8_t *buf, int nb_sectors) +static int coroutine_fn +vpc_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) { BDRVVPCState *s = bs->opaque; - int64_t offset; - int64_t sectors, sectors_per_block; + int64_t image_offset; + int64_t n_bytes; + int64_t bytes_done = 0; int ret; VHDFooter *footer = (VHDFooter *) s->footer_buf; + QEMUIOVector local_qiov; if (be32_to_cpu(footer->type) == VHD_FIXED) { - return bdrv_write(bs->file->bs, sector_num, buf, nb_sectors); + return bdrv_co_pwritev(bs->file->bs, offset, bytes, qiov, 0); } - while (nb_sectors > 0) { - offset = get_sector_offset(bs, sector_num, 1); - sectors_per_block = s->block_size >> BDRV_SECTOR_BITS; - sectors = sectors_per_block - (sector_num % sectors_per_block); - if (sectors > nb_sectors) { - sectors = nb_sectors; - } + qemu_co_mutex_lock(&s->lock); + qemu_iovec_init(&local_qiov, qiov->niov); + + while (bytes > 0) { + image_offset = get_image_offset(bs, offset, true); + n_bytes = MIN(bytes, s->block_size - (offset % s->block_size)); - if (offset == -1) { - offset = alloc_block(bs, sector_num); - if (offset < 0) - return -1; + if (image_offset == -1) { + image_offset = alloc_block(bs, offset); + if (image_offset < 0) { + ret = image_offset; + goto fail; + } } - ret = bdrv_pwrite(bs->file->bs, offset, buf, - sectors * BDRV_SECTOR_SIZE); - if (ret != sectors * BDRV_SECTOR_SIZE) { - return -1; + qemu_iovec_reset(&local_qiov); + qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes); + + ret = bdrv_co_pwritev(bs->file->bs, image_offset, n_bytes, + &local_qiov, 0); + if (ret < 0) { + goto fail; } - nb_sectors -= sectors; - sector_num += sectors; - buf += sectors * BDRV_SECTOR_SIZE; + bytes -= n_bytes; + offset += n_bytes; + bytes_done += n_bytes; } - return 0; -} - -static coroutine_fn int vpc_co_write(BlockDriverState *bs, int64_t sector_num, - const uint8_t *buf, int nb_sectors) -{ - int ret; - BDRVVPCState *s = bs->opaque; - qemu_co_mutex_lock(&s->lock); - ret = vpc_write(bs, sector_num, buf, nb_sectors); + ret = 0; +fail: + qemu_iovec_destroy(&local_qiov); qemu_co_mutex_unlock(&s->lock); + return ret; } @@ -783,13 +788,13 @@ static int create_dynamic_disk(BlockBackend *blk, uint8_t *buf, block_size = 0x200000; num_bat_entries = (total_sectors + block_size / 512) / (block_size / 512); - ret = blk_pwrite(blk, offset, buf, HEADER_SIZE); + ret = blk_pwrite(blk, offset, buf, HEADER_SIZE, 0); if (ret < 0) { goto fail; } offset = 1536 + ((num_bat_entries * 4 + 511) & ~511); - ret = blk_pwrite(blk, offset, buf, HEADER_SIZE); + ret = blk_pwrite(blk, offset, buf, HEADER_SIZE, 0); if (ret < 0) { goto fail; } @@ -799,7 +804,7 @@ static int create_dynamic_disk(BlockBackend *blk, uint8_t *buf, memset(buf, 0xFF, 512); for (i = 0; i < (num_bat_entries * 4 + 511) / 512; i++) { - ret = blk_pwrite(blk, offset, buf, 512); + ret = blk_pwrite(blk, offset, buf, 512, 0); if (ret < 0) { goto fail; } @@ -826,7 +831,7 @@ static int create_dynamic_disk(BlockBackend *blk, uint8_t *buf, /* Write the header */ offset = 512; - ret = blk_pwrite(blk, offset, buf, 1024); + ret = blk_pwrite(blk, offset, buf, 1024, 0); if (ret < 0) { goto fail; } @@ -848,7 +853,7 @@ static int create_fixed_disk(BlockBackend *blk, uint8_t *buf, return ret; } - ret = blk_pwrite(blk, total_size - HEADER_SIZE, buf, HEADER_SIZE); + ret = blk_pwrite(blk, total_size - HEADER_SIZE, buf, HEADER_SIZE, 0); if (ret < 0) { return ret; } @@ -1056,8 +1061,8 @@ static BlockDriver bdrv_vpc = { .bdrv_reopen_prepare = vpc_reopen_prepare, .bdrv_create = vpc_create, - .bdrv_read = vpc_co_read, - .bdrv_write = vpc_co_write, + .bdrv_co_preadv = vpc_co_preadv, + .bdrv_co_pwritev = vpc_co_pwritev, .bdrv_co_get_block_status = vpc_co_get_block_status, .bdrv_get_info = vpc_get_info, diff --git a/block/vvfat.c b/block/vvfat.c index 183fc4f049..5b0c8dd639 100644 --- a/block/vvfat.c +++ b/block/vvfat.c @@ -1179,6 +1179,7 @@ static int vvfat_open(BlockDriverState *bs, QDict *options, int flags, bs->read_only = 0; } + bs->request_alignment = BDRV_SECTOR_SIZE; /* No sub-sector I/O supported */ bs->total_sectors = cyls * heads * secs; if (init_directories(s, dirname, heads, secs, errp)) { @@ -1421,14 +1422,31 @@ DLOG(fprintf(stderr, "sector %d not allocated\n", (int)sector_num)); return 0; } -static coroutine_fn int vvfat_co_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) +static int coroutine_fn +vvfat_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) { int ret; BDRVVVFATState *s = bs->opaque; + uint64_t sector_num = offset >> BDRV_SECTOR_BITS; + int nb_sectors = bytes >> BDRV_SECTOR_BITS; + void *buf; + + assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + + buf = g_try_malloc(bytes); + if (bytes && buf == NULL) { + return -ENOMEM; + } + qemu_co_mutex_lock(&s->lock); ret = vvfat_read(bs, sector_num, buf, nb_sectors); qemu_co_mutex_unlock(&s->lock); + + qemu_iovec_from_buf(qiov, 0, buf, bytes); + g_free(buf); + return ret; } @@ -2880,14 +2898,31 @@ DLOG(checkpoint()); return 0; } -static coroutine_fn int vvfat_co_write(BlockDriverState *bs, int64_t sector_num, - const uint8_t *buf, int nb_sectors) +static int coroutine_fn +vvfat_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) { int ret; BDRVVVFATState *s = bs->opaque; + uint64_t sector_num = offset >> BDRV_SECTOR_BITS; + int nb_sectors = bytes >> BDRV_SECTOR_BITS; + void *buf; + + assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + + buf = g_try_malloc(bytes); + if (bytes && buf == NULL) { + return -ENOMEM; + } + qemu_iovec_to_buf(qiov, 0, buf, bytes); + qemu_co_mutex_lock(&s->lock); ret = vvfat_write(bs, sector_num, buf, nb_sectors); qemu_co_mutex_unlock(&s->lock); + + g_free(buf); + return ret; } @@ -2904,8 +2939,10 @@ static int64_t coroutine_fn vvfat_co_get_block_status(BlockDriverState *bs, return BDRV_BLOCK_DATA; } -static int write_target_commit(BlockDriverState *bs, int64_t sector_num, - const uint8_t* buffer, int nb_sectors) { +static int coroutine_fn +write_target_commit(BlockDriverState *bs, uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) +{ BDRVVVFATState* s = *((BDRVVVFATState**) bs->opaque); return try_commit(s); } @@ -2918,7 +2955,7 @@ static void write_target_close(BlockDriverState *bs) { static BlockDriver vvfat_write_target = { .format_name = "vvfat_write_target", - .bdrv_write = write_target_commit, + .bdrv_co_pwritev = write_target_commit, .bdrv_close = write_target_close, }; @@ -3014,8 +3051,8 @@ static BlockDriver bdrv_vvfat = { .bdrv_file_open = vvfat_open, .bdrv_close = vvfat_close, - .bdrv_read = vvfat_co_read, - .bdrv_write = vvfat_co_write, + .bdrv_co_preadv = vvfat_co_preadv, + .bdrv_co_pwritev = vvfat_co_pwritev, .bdrv_co_get_block_status = vvfat_co_get_block_status, }; |