diff options
author | Peter Maydell <peter.maydell@linaro.org> | 2018-06-04 18:34:04 +0100 |
---|---|---|
committer | Peter Maydell <peter.maydell@linaro.org> | 2018-06-04 18:34:04 +0100 |
commit | 0d514fa23402ab7b4f1c965e0631d953bbe4d3b7 (patch) | |
tree | 15694d41fba306b5b8e545d9a6e15bb199c64b25 | |
parent | 5d7ad3ce103af3ab7c860a4ca97653f8ffa6e29c (diff) | |
parent | 21891a5a3011608845b5d7f1f9cce60cdc2bcc62 (diff) |
Merge remote-tracking branch 'remotes/stefanha/tags/block-pull-request' into staging
Pull request
* Copy offloading for qemu-img convert (iSCSI, raw, and qcow2)
If the underlying storage supports copy offloading, qemu-img convert will
use it instead of performing reads and writes. This avoids data transfers
and thus frees up storage bandwidth for other purposes. SCSI EXTENDED COPY
and Linux copy_file_range(2) are used to implement this optimization.
* Drop spurious "WARNING: I/O thread spun for 1000 iterations" warning
# gpg: Signature made Mon 04 Jun 2018 12:20:08 BST
# gpg: using RSA key 9CA4ABB381AB73C8
# gpg: Good signature from "Stefan Hajnoczi <stefanha@redhat.com>"
# gpg: aka "Stefan Hajnoczi <stefanha@gmail.com>"
# Primary key fingerprint: 8695 A8BF D3F9 7CDA AC35 775A 9CA4 ABB3 81AB 73C8
* remotes/stefanha/tags/block-pull-request:
main-loop: drop spin_counter
qemu-img: Convert with copy offloading
block-backend: Add blk_co_copy_range
iscsi: Implement copy offloading
iscsi: Create and use iscsi_co_wait_for_task
iscsi: Query and save device designator when opening
file-posix: Implement bdrv_co_copy_range
qcow2: Implement copy offloading
raw: Implement copy offloading
raw: Check byte range uniformly
block: Introduce API for copy offloading
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
-rw-r--r-- | block/block-backend.c | 18 | ||||
-rw-r--r-- | block/file-posix.c | 98 | ||||
-rw-r--r-- | block/io.c | 97 | ||||
-rw-r--r-- | block/iscsi.c | 314 | ||||
-rw-r--r-- | block/qcow2.c | 229 | ||||
-rw-r--r-- | block/raw-format.c | 96 | ||||
-rwxr-xr-x | configure | 17 | ||||
-rw-r--r-- | include/block/block.h | 32 | ||||
-rw-r--r-- | include/block/block_int.h | 38 | ||||
-rw-r--r-- | include/block/raw-aio.h | 10 | ||||
-rw-r--r-- | include/scsi/constants.h | 4 | ||||
-rw-r--r-- | include/sysemu/block-backend.h | 4 | ||||
-rw-r--r-- | qemu-img.c | 50 | ||||
-rw-r--r-- | tests/qemu-iotests/common.filter | 1 | ||||
-rw-r--r-- | util/main-loop.c | 25 |
15 files changed, 908 insertions, 125 deletions
diff --git a/block/block-backend.c b/block/block-backend.c index 89f47b00ea..d55c328736 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -2211,3 +2211,21 @@ void blk_unregister_buf(BlockBackend *blk, void *host) { bdrv_unregister_buf(blk_bs(blk), host); } + +int coroutine_fn blk_co_copy_range(BlockBackend *blk_in, int64_t off_in, + BlockBackend *blk_out, int64_t off_out, + int bytes, BdrvRequestFlags flags) +{ + int r; + r = blk_check_byte_request(blk_in, off_in, bytes); + if (r) { + return r; + } + r = blk_check_byte_request(blk_out, off_out, bytes); + if (r) { + return r; + } + return bdrv_co_copy_range(blk_in->root, off_in, + blk_out->root, off_out, + bytes, flags); +} diff --git a/block/file-posix.c b/block/file-posix.c index 5a602cfe37..513d371bb1 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -59,6 +59,7 @@ #ifdef __linux__ #include <sys/ioctl.h> #include <sys/param.h> +#include <sys/syscall.h> #include <linux/cdrom.h> #include <linux/fd.h> #include <linux/fs.h> @@ -187,6 +188,8 @@ typedef struct RawPosixAIOData { #define aio_ioctl_cmd aio_nbytes /* for QEMU_AIO_IOCTL */ off_t aio_offset; int aio_type; + int aio_fd2; + off_t aio_offset2; } RawPosixAIOData; #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) @@ -1446,6 +1449,49 @@ static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb) return -ENOTSUP; } +#ifndef HAVE_COPY_FILE_RANGE +static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd, + off_t *out_off, size_t len, unsigned int flags) +{ +#ifdef __NR_copy_file_range + return syscall(__NR_copy_file_range, in_fd, in_off, out_fd, + out_off, len, flags); +#else + errno = ENOSYS; + return -1; +#endif +} +#endif + +static ssize_t handle_aiocb_copy_range(RawPosixAIOData *aiocb) +{ + uint64_t bytes = aiocb->aio_nbytes; + off_t in_off = aiocb->aio_offset; + off_t out_off = aiocb->aio_offset2; + + while (bytes) { + ssize_t ret = copy_file_range(aiocb->aio_fildes, &in_off, + aiocb->aio_fd2, &out_off, + bytes, 0); 
+ if (ret == -EINTR) { + continue; + } + if (ret < 0) { + if (errno == ENOSYS) { + return -ENOTSUP; + } else { + return -errno; + } + } + if (!ret) { + /* No progress (e.g. when beyond EOF), fall back to buffer I/O. */ + return -ENOTSUP; + } + bytes -= ret; + } + return 0; +} + static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb) { int ret = -EOPNOTSUPP; @@ -1526,6 +1572,9 @@ static int aio_worker(void *arg) case QEMU_AIO_WRITE_ZEROES: ret = handle_aiocb_write_zeroes(aiocb); break; + case QEMU_AIO_COPY_RANGE: + ret = handle_aiocb_copy_range(aiocb); + break; default: fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type); ret = -EINVAL; @@ -1536,9 +1585,10 @@ static int aio_worker(void *arg) return ret; } -static int paio_submit_co(BlockDriverState *bs, int fd, - int64_t offset, QEMUIOVector *qiov, - int bytes, int type) +static int paio_submit_co_full(BlockDriverState *bs, int fd, + int64_t offset, int fd2, int64_t offset2, + QEMUIOVector *qiov, + int bytes, int type) { RawPosixAIOData *acb = g_new(RawPosixAIOData, 1); ThreadPool *pool; @@ -1546,6 +1596,8 @@ static int paio_submit_co(BlockDriverState *bs, int fd, acb->bs = bs; acb->aio_type = type; acb->aio_fildes = fd; + acb->aio_fd2 = fd2; + acb->aio_offset2 = offset2; acb->aio_nbytes = bytes; acb->aio_offset = offset; @@ -1561,6 +1613,13 @@ static int paio_submit_co(BlockDriverState *bs, int fd, return thread_pool_submit_co(pool, aio_worker, acb); } +static inline int paio_submit_co(BlockDriverState *bs, int fd, + int64_t offset, QEMUIOVector *qiov, + int bytes, int type) +{ + return paio_submit_co_full(bs, fd, offset, -1, 0, qiov, bytes, type); +} + static BlockAIOCB *paio_submit(BlockDriverState *bs, int fd, int64_t offset, QEMUIOVector *qiov, int bytes, BlockCompletionFunc *cb, void *opaque, int type) @@ -2451,6 +2510,35 @@ static void raw_abort_perm_update(BlockDriverState *bs) raw_handle_perm_lock(bs, RAW_PL_ABORT, 0, 0, NULL); } +static int coroutine_fn 
raw_co_copy_range_from(BlockDriverState *bs, + BdrvChild *src, uint64_t src_offset, + BdrvChild *dst, uint64_t dst_offset, + uint64_t bytes, BdrvRequestFlags flags) +{ + return bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes, flags); +} + +static int coroutine_fn raw_co_copy_range_to(BlockDriverState *bs, + BdrvChild *src, uint64_t src_offset, + BdrvChild *dst, uint64_t dst_offset, + uint64_t bytes, BdrvRequestFlags flags) +{ + BDRVRawState *s = bs->opaque; + BDRVRawState *src_s; + + assert(dst->bs == bs); + if (src->bs->drv->bdrv_co_copy_range_to != raw_co_copy_range_to) { + return -ENOTSUP; + } + + src_s = src->bs->opaque; + if (fd_open(bs) < 0 || fd_open(bs) < 0) { + return -EIO; + } + return paio_submit_co_full(bs, src_s->fd, src_offset, s->fd, dst_offset, + NULL, bytes, QEMU_AIO_COPY_RANGE); +} + BlockDriver bdrv_file = { .format_name = "file", .protocol_name = "file", @@ -2474,6 +2562,8 @@ BlockDriver bdrv_file = { .bdrv_co_pwritev = raw_co_pwritev, .bdrv_aio_flush = raw_aio_flush, .bdrv_aio_pdiscard = raw_aio_pdiscard, + .bdrv_co_copy_range_from = raw_co_copy_range_from, + .bdrv_co_copy_range_to = raw_co_copy_range_to, .bdrv_refresh_limits = raw_refresh_limits, .bdrv_io_plug = raw_aio_plug, .bdrv_io_unplug = raw_aio_unplug, @@ -2952,6 +3042,8 @@ static BlockDriver bdrv_host_device = { .bdrv_co_pwritev = raw_co_pwritev, .bdrv_aio_flush = raw_aio_flush, .bdrv_aio_pdiscard = hdev_aio_pdiscard, + .bdrv_co_copy_range_from = raw_co_copy_range_from, + .bdrv_co_copy_range_to = raw_co_copy_range_to, .bdrv_refresh_limits = raw_refresh_limits, .bdrv_io_plug = raw_aio_plug, .bdrv_io_unplug = raw_aio_unplug, diff --git a/block/io.c b/block/io.c index ca96b487eb..b7beaeeb9f 100644 --- a/block/io.c +++ b/block/io.c @@ -2835,3 +2835,100 @@ void bdrv_unregister_buf(BlockDriverState *bs, void *host) bdrv_unregister_buf(child->bs, host); } } + +static int coroutine_fn bdrv_co_copy_range_internal(BdrvChild *src, + uint64_t src_offset, + BdrvChild *dst, + uint64_t 
dst_offset, + uint64_t bytes, + BdrvRequestFlags flags, + bool recurse_src) +{ + int ret; + + if (!src || !dst || !src->bs || !dst->bs) { + return -ENOMEDIUM; + } + ret = bdrv_check_byte_request(src->bs, src_offset, bytes); + if (ret) { + return ret; + } + + ret = bdrv_check_byte_request(dst->bs, dst_offset, bytes); + if (ret) { + return ret; + } + if (flags & BDRV_REQ_ZERO_WRITE) { + return bdrv_co_pwrite_zeroes(dst, dst_offset, bytes, flags); + } + + if (!src->bs->drv->bdrv_co_copy_range_from + || !dst->bs->drv->bdrv_co_copy_range_to + || src->bs->encrypted || dst->bs->encrypted) { + return -ENOTSUP; + } + if (recurse_src) { + return src->bs->drv->bdrv_co_copy_range_from(src->bs, + src, src_offset, + dst, dst_offset, + bytes, flags); + } else { + return dst->bs->drv->bdrv_co_copy_range_to(dst->bs, + src, src_offset, + dst, dst_offset, + bytes, flags); + } +} + +/* Copy range from @src to @dst. + * + * See the comment of bdrv_co_copy_range for the parameter and return value + * semantics. */ +int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, uint64_t src_offset, + BdrvChild *dst, uint64_t dst_offset, + uint64_t bytes, BdrvRequestFlags flags) +{ + return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset, + bytes, flags, true); +} + +/* Copy range from @src to @dst. + * + * See the comment of bdrv_co_copy_range for the parameter and return value + * semantics. 
*/ +int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, uint64_t src_offset, + BdrvChild *dst, uint64_t dst_offset, + uint64_t bytes, BdrvRequestFlags flags) +{ + return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset, + bytes, flags, false); +} + +int coroutine_fn bdrv_co_copy_range(BdrvChild *src, uint64_t src_offset, + BdrvChild *dst, uint64_t dst_offset, + uint64_t bytes, BdrvRequestFlags flags) +{ + BdrvTrackedRequest src_req, dst_req; + BlockDriverState *src_bs = src->bs; + BlockDriverState *dst_bs = dst->bs; + int ret; + + bdrv_inc_in_flight(src_bs); + bdrv_inc_in_flight(dst_bs); + tracked_request_begin(&src_req, src_bs, src_offset, + bytes, BDRV_TRACKED_READ); + tracked_request_begin(&dst_req, dst_bs, dst_offset, + bytes, BDRV_TRACKED_WRITE); + + wait_serialising_requests(&src_req); + wait_serialising_requests(&dst_req); + ret = bdrv_co_copy_range_from(src, src_offset, + dst, dst_offset, + bytes, flags); + + tracked_request_end(&src_req); + tracked_request_end(&dst_req); + bdrv_dec_in_flight(src_bs); + bdrv_dec_in_flight(dst_bs); + return ret; +} diff --git a/block/iscsi.c b/block/iscsi.c index 3fd7203916..c2fbd8a8aa 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -68,6 +68,7 @@ typedef struct IscsiLun { QemuMutex mutex; struct scsi_inquiry_logical_block_provisioning lbp; struct scsi_inquiry_block_limits bl; + struct scsi_inquiry_device_designator *dd; unsigned char *zeroblock; /* The allocmap tracks which clusters (pages) on the iSCSI target are * allocated and which are not. 
In case a target returns zeros for @@ -555,6 +556,17 @@ static inline bool iscsi_allocmap_is_valid(IscsiLun *iscsilun, offset / iscsilun->cluster_size) == size); } +static void coroutine_fn iscsi_co_wait_for_task(IscsiTask *iTask, + IscsiLun *iscsilun) +{ + while (!iTask->complete) { + iscsi_set_events(iscsilun); + qemu_mutex_unlock(&iscsilun->mutex); + qemu_coroutine_yield(); + qemu_mutex_lock(&iscsilun->mutex); + } +} + static int coroutine_fn iscsi_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *iov, int flags) @@ -616,12 +628,7 @@ retry: scsi_task_set_iov_out(iTask.task, (struct scsi_iovec *) iov->iov, iov->niov); #endif - while (!iTask.complete) { - iscsi_set_events(iscsilun); - qemu_mutex_unlock(&iscsilun->mutex); - qemu_coroutine_yield(); - qemu_mutex_lock(&iscsilun->mutex); - } + iscsi_co_wait_for_task(&iTask, iscsilun); if (iTask.task != NULL) { scsi_free_scsi_task(iTask.task); @@ -692,13 +699,7 @@ retry: ret = -ENOMEM; goto out_unlock; } - - while (!iTask.complete) { - iscsi_set_events(iscsilun); - qemu_mutex_unlock(&iscsilun->mutex); - qemu_coroutine_yield(); - qemu_mutex_lock(&iscsilun->mutex); - } + iscsi_co_wait_for_task(&iTask, iscsilun); if (iTask.do_retry) { if (iTask.task != NULL) { @@ -862,13 +863,8 @@ retry: #if LIBISCSI_API_VERSION < (20160603) scsi_task_set_iov_in(iTask.task, (struct scsi_iovec *) iov->iov, iov->niov); #endif - while (!iTask.complete) { - iscsi_set_events(iscsilun); - qemu_mutex_unlock(&iscsilun->mutex); - qemu_coroutine_yield(); - qemu_mutex_lock(&iscsilun->mutex); - } + iscsi_co_wait_for_task(&iTask, iscsilun); if (iTask.task != NULL) { scsi_free_scsi_task(iTask.task); iTask.task = NULL; @@ -905,12 +901,7 @@ retry: return -ENOMEM; } - while (!iTask.complete) { - iscsi_set_events(iscsilun); - qemu_mutex_unlock(&iscsilun->mutex); - qemu_coroutine_yield(); - qemu_mutex_lock(&iscsilun->mutex); - } + iscsi_co_wait_for_task(&iTask, iscsilun); if (iTask.task != NULL) { 
scsi_free_scsi_task(iTask.task); @@ -1142,12 +1133,7 @@ retry: goto out_unlock; } - while (!iTask.complete) { - iscsi_set_events(iscsilun); - qemu_mutex_unlock(&iscsilun->mutex); - qemu_coroutine_yield(); - qemu_mutex_lock(&iscsilun->mutex); - } + iscsi_co_wait_for_task(&iTask, iscsilun); if (iTask.task != NULL) { scsi_free_scsi_task(iTask.task); @@ -1243,12 +1229,7 @@ retry: return -ENOMEM; } - while (!iTask.complete) { - iscsi_set_events(iscsilun); - qemu_mutex_unlock(&iscsilun->mutex); - qemu_coroutine_yield(); - qemu_mutex_lock(&iscsilun->mutex); - } + iscsi_co_wait_for_task(&iTask, iscsilun); if (iTask.status == SCSI_STATUS_CHECK_CONDITION && iTask.task->sense.key == SCSI_SENSE_ILLEGAL_REQUEST && @@ -1740,6 +1721,30 @@ static QemuOptsList runtime_opts = { }, }; +static void iscsi_save_designator(IscsiLun *lun, + struct scsi_inquiry_device_identification *inq_di) +{ + struct scsi_inquiry_device_designator *desig, *copy = NULL; + + for (desig = inq_di->designators; desig; desig = desig->next) { + if (desig->association || + desig->designator_type > SCSI_DESIGNATOR_TYPE_NAA) { + continue; + } + /* NAA works better than T10 vendor ID based designator. 
*/ + if (!copy || copy->designator_type < desig->designator_type) { + copy = desig; + } + } + if (copy) { + lun->dd = g_new(struct scsi_inquiry_device_designator, 1); + *lun->dd = *copy; + lun->dd->next = NULL; + lun->dd->designator = g_malloc(copy->designator_length); + memcpy(lun->dd->designator, copy->designator, copy->designator_length); + } +} + static int iscsi_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { @@ -1922,6 +1927,7 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags, struct scsi_task *inq_task; struct scsi_inquiry_logical_block_provisioning *inq_lbp; struct scsi_inquiry_block_limits *inq_bl; + struct scsi_inquiry_device_identification *inq_di; switch (inq_vpd->pages[i]) { case SCSI_INQUIRY_PAGECODE_LOGICAL_BLOCK_PROVISIONING: inq_task = iscsi_do_inquiry(iscsilun->iscsi, iscsilun->lun, 1, @@ -1947,6 +1953,17 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags, sizeof(struct scsi_inquiry_block_limits)); scsi_free_scsi_task(inq_task); break; + case SCSI_INQUIRY_PAGECODE_DEVICE_IDENTIFICATION: + inq_task = iscsi_do_inquiry(iscsilun->iscsi, iscsilun->lun, 1, + SCSI_INQUIRY_PAGECODE_DEVICE_IDENTIFICATION, + (void **) &inq_di, errp); + if (inq_task == NULL) { + ret = -EINVAL; + goto out; + } + iscsi_save_designator(iscsilun, inq_di); + scsi_free_scsi_task(inq_task); + break; default: break; } @@ -2003,6 +2020,10 @@ static void iscsi_close(BlockDriverState *bs) iscsi_logout_sync(iscsi); } iscsi_destroy_context(iscsi); + if (iscsilun->dd) { + g_free(iscsilun->dd->designator); + g_free(iscsilun->dd); + } g_free(iscsilun->zeroblock); iscsi_allocmap_free(iscsilun); qemu_mutex_destroy(&iscsilun->mutex); @@ -2184,6 +2205,221 @@ static void coroutine_fn iscsi_co_invalidate_cache(BlockDriverState *bs, iscsi_allocmap_invalidate(iscsilun); } +static int coroutine_fn iscsi_co_copy_range_from(BlockDriverState *bs, + BdrvChild *src, + uint64_t src_offset, + BdrvChild *dst, + uint64_t dst_offset, + 
uint64_t bytes, + BdrvRequestFlags flags) +{ + return bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes, flags); +} + +static struct scsi_task *iscsi_xcopy_task(int param_len) +{ + struct scsi_task *task; + + task = g_new0(struct scsi_task, 1); + + task->cdb[0] = EXTENDED_COPY; + task->cdb[10] = (param_len >> 24) & 0xFF; + task->cdb[11] = (param_len >> 16) & 0xFF; + task->cdb[12] = (param_len >> 8) & 0xFF; + task->cdb[13] = param_len & 0xFF; + task->cdb_size = 16; + task->xfer_dir = SCSI_XFER_WRITE; + task->expxferlen = param_len; + + return task; +} + +static void iscsi_populate_target_desc(unsigned char *desc, IscsiLun *lun) +{ + struct scsi_inquiry_device_designator *dd = lun->dd; + + memset(desc, 0, 32); + desc[0] = 0xE4; /* IDENT_DESCR_TGT_DESCR */ + desc[4] = dd->code_set; + desc[5] = (dd->designator_type & 0xF) + | ((dd->association & 3) << 4); + desc[7] = dd->designator_length; + memcpy(desc + 8, dd->designator, dd->designator_length); + + desc[28] = 0; + desc[29] = (lun->block_size >> 16) & 0xFF; + desc[30] = (lun->block_size >> 8) & 0xFF; + desc[31] = lun->block_size & 0xFF; +} + +static void iscsi_xcopy_desc_hdr(uint8_t *hdr, int dc, int cat, int src_index, + int dst_index) +{ + hdr[0] = 0x02; /* BLK_TO_BLK_SEG_DESCR */ + hdr[1] = ((dc << 1) | cat) & 0xFF; + hdr[2] = (XCOPY_BLK2BLK_SEG_DESC_SIZE >> 8) & 0xFF; + /* don't account for the first 4 bytes in descriptor header*/ + hdr[3] = (XCOPY_BLK2BLK_SEG_DESC_SIZE - 4 /* SEG_DESC_SRC_INDEX_OFFSET */) & 0xFF; + hdr[4] = (src_index >> 8) & 0xFF; + hdr[5] = src_index & 0xFF; + hdr[6] = (dst_index >> 8) & 0xFF; + hdr[7] = dst_index & 0xFF; +} + +static void iscsi_xcopy_populate_desc(uint8_t *desc, int dc, int cat, + int src_index, int dst_index, int num_blks, + uint64_t src_lba, uint64_t dst_lba) +{ + iscsi_xcopy_desc_hdr(desc, dc, cat, src_index, dst_index); + + /* The caller should verify the request size */ + assert(num_blks < 65536); + desc[10] = (num_blks >> 8) & 0xFF; + desc[11] = num_blks & 
0xFF; + desc[12] = (src_lba >> 56) & 0xFF; + desc[13] = (src_lba >> 48) & 0xFF; + desc[14] = (src_lba >> 40) & 0xFF; + desc[15] = (src_lba >> 32) & 0xFF; + desc[16] = (src_lba >> 24) & 0xFF; + desc[17] = (src_lba >> 16) & 0xFF; + desc[18] = (src_lba >> 8) & 0xFF; + desc[19] = src_lba & 0xFF; + desc[20] = (dst_lba >> 56) & 0xFF; + desc[21] = (dst_lba >> 48) & 0xFF; + desc[22] = (dst_lba >> 40) & 0xFF; + desc[23] = (dst_lba >> 32) & 0xFF; + desc[24] = (dst_lba >> 24) & 0xFF; + desc[25] = (dst_lba >> 16) & 0xFF; + desc[26] = (dst_lba >> 8) & 0xFF; + desc[27] = dst_lba & 0xFF; +} + +static void iscsi_xcopy_populate_header(unsigned char *buf, int list_id, int str, + int list_id_usage, int prio, + int tgt_desc_len, + int seg_desc_len, int inline_data_len) +{ + buf[0] = list_id; + buf[1] = ((str & 1) << 5) | ((list_id_usage & 3) << 3) | (prio & 7); + buf[2] = (tgt_desc_len >> 8) & 0xFF; + buf[3] = tgt_desc_len & 0xFF; + buf[8] = (seg_desc_len >> 24) & 0xFF; + buf[9] = (seg_desc_len >> 16) & 0xFF; + buf[10] = (seg_desc_len >> 8) & 0xFF; + buf[11] = seg_desc_len & 0xFF; + buf[12] = (inline_data_len >> 24) & 0xFF; + buf[13] = (inline_data_len >> 16) & 0xFF; + buf[14] = (inline_data_len >> 8) & 0xFF; + buf[15] = inline_data_len & 0xFF; +} + +static void iscsi_xcopy_data(struct iscsi_data *data, + IscsiLun *src, int64_t src_lba, + IscsiLun *dst, int64_t dst_lba, + uint16_t num_blocks) +{ + uint8_t *buf; + const int src_offset = XCOPY_DESC_OFFSET; + const int dst_offset = XCOPY_DESC_OFFSET + IDENT_DESCR_TGT_DESCR_SIZE; + const int seg_offset = dst_offset + IDENT_DESCR_TGT_DESCR_SIZE; + + data->size = XCOPY_DESC_OFFSET + + IDENT_DESCR_TGT_DESCR_SIZE * 2 + + XCOPY_BLK2BLK_SEG_DESC_SIZE; + data->data = g_malloc0(data->size); + buf = data->data; + + /* Initialise the parameter list header */ + iscsi_xcopy_populate_header(buf, 1, 0, 2 /* LIST_ID_USAGE_DISCARD */, + 0, 2 * IDENT_DESCR_TGT_DESCR_SIZE, + XCOPY_BLK2BLK_SEG_DESC_SIZE, + 0); + + /* Initialise CSCD list with one src + one 
dst descriptor */ + iscsi_populate_target_desc(&buf[src_offset], src); + iscsi_populate_target_desc(&buf[dst_offset], dst); + + /* Initialise one segment descriptor */ + iscsi_xcopy_populate_desc(&buf[seg_offset], 0, 0, 0, 1, num_blocks, + src_lba, dst_lba); +} + +static int coroutine_fn iscsi_co_copy_range_to(BlockDriverState *bs, + BdrvChild *src, + uint64_t src_offset, + BdrvChild *dst, + uint64_t dst_offset, + uint64_t bytes, + BdrvRequestFlags flags) +{ + IscsiLun *dst_lun = dst->bs->opaque; + IscsiLun *src_lun; + struct IscsiTask iscsi_task; + struct iscsi_data data; + int r = 0; + int block_size; + + if (src->bs->drv->bdrv_co_copy_range_to != iscsi_co_copy_range_to) { + return -ENOTSUP; + } + src_lun = src->bs->opaque; + + if (!src_lun->dd || !dst_lun->dd) { + return -ENOTSUP; + } + if (!is_byte_request_lun_aligned(dst_offset, bytes, dst_lun)) { + return -ENOTSUP; + } + if (!is_byte_request_lun_aligned(src_offset, bytes, src_lun)) { + return -ENOTSUP; + } + if (dst_lun->block_size != src_lun->block_size || + !dst_lun->block_size) { + return -ENOTSUP; + } + + block_size = dst_lun->block_size; + if (bytes / block_size > 65535) { + return -ENOTSUP; + } + + iscsi_xcopy_data(&data, + src_lun, src_offset / block_size, + dst_lun, dst_offset / block_size, + bytes / block_size); + + iscsi_co_init_iscsitask(dst_lun, &iscsi_task); + + qemu_mutex_lock(&dst_lun->mutex); + iscsi_task.task = iscsi_xcopy_task(data.size); +retry: + if (iscsi_scsi_command_async(dst_lun->iscsi, dst_lun->lun, + iscsi_task.task, iscsi_co_generic_cb, + &data, + &iscsi_task) != 0) { + r = -EIO; + goto out_unlock; + } + + iscsi_co_wait_for_task(&iscsi_task, dst_lun); + + if (iscsi_task.do_retry) { + iscsi_task.complete = 0; + goto retry; + } + + if (iscsi_task.status != SCSI_STATUS_GOOD) { + r = iscsi_task.err_code; + goto out_unlock; + } + +out_unlock: + g_free(iscsi_task.task); + qemu_mutex_unlock(&dst_lun->mutex); + g_free(iscsi_task.err_str); + return r; +} + static QemuOptsList 
iscsi_create_opts = { .name = "iscsi-create-opts", .head = QTAILQ_HEAD_INITIALIZER(iscsi_create_opts.head), @@ -2218,6 +2454,8 @@ static BlockDriver bdrv_iscsi = { .bdrv_co_block_status = iscsi_co_block_status, .bdrv_co_pdiscard = iscsi_co_pdiscard, + .bdrv_co_copy_range_from = iscsi_co_copy_range_from, + .bdrv_co_copy_range_to = iscsi_co_copy_range_to, .bdrv_co_pwrite_zeroes = iscsi_co_pwrite_zeroes, .bdrv_co_readv = iscsi_co_readv, .bdrv_co_writev = iscsi_co_writev, @@ -2253,6 +2491,8 @@ static BlockDriver bdrv_iser = { .bdrv_co_block_status = iscsi_co_block_status, .bdrv_co_pdiscard = iscsi_co_pdiscard, + .bdrv_co_copy_range_from = iscsi_co_copy_range_from, + .bdrv_co_copy_range_to = iscsi_co_copy_range_to, .bdrv_co_pwrite_zeroes = iscsi_co_pwrite_zeroes, .bdrv_co_readv = iscsi_co_readv, .bdrv_co_writev = iscsi_co_writev, diff --git a/block/qcow2.c b/block/qcow2.c index c87c593e83..549fee9b69 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -1761,6 +1761,39 @@ static int coroutine_fn qcow2_co_block_status(BlockDriverState *bs, return status; } +static coroutine_fn int qcow2_handle_l2meta(BlockDriverState *bs, + QCowL2Meta **pl2meta, + bool link_l2) +{ + int ret = 0; + QCowL2Meta *l2meta = *pl2meta; + + while (l2meta != NULL) { + QCowL2Meta *next; + + if (!ret && link_l2) { + ret = qcow2_alloc_cluster_link_l2(bs, l2meta); + if (ret) { + goto out; + } + } + + /* Take the request off the list of running requests */ + if (l2meta->nb_clusters != 0) { + QLIST_REMOVE(l2meta, next_in_flight); + } + + qemu_co_queue_restart_all(&l2meta->dependent_requests); + + next = l2meta->next; + g_free(l2meta); + l2meta = next; + } +out: + *pl2meta = l2meta; + return ret; +} + static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags) @@ -2047,24 +2080,9 @@ static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset, } } - while (l2meta != NULL) { - QCowL2Meta *next; - - ret = 
qcow2_alloc_cluster_link_l2(bs, l2meta); - if (ret < 0) { - goto fail; - } - - /* Take the request off the list of running requests */ - if (l2meta->nb_clusters != 0) { - QLIST_REMOVE(l2meta, next_in_flight); - } - - qemu_co_queue_restart_all(&l2meta->dependent_requests); - - next = l2meta->next; - g_free(l2meta); - l2meta = next; + ret = qcow2_handle_l2meta(bs, &l2meta, true); + if (ret) { + goto fail; } bytes -= cur_bytes; @@ -2075,18 +2093,7 @@ static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset, ret = 0; fail: - while (l2meta != NULL) { - QCowL2Meta *next; - - if (l2meta->nb_clusters != 0) { - QLIST_REMOVE(l2meta, next_in_flight); - } - qemu_co_queue_restart_all(&l2meta->dependent_requests); - - next = l2meta->next; - g_free(l2meta); - l2meta = next; - } + qcow2_handle_l2meta(bs, &l2meta, false); qemu_co_mutex_unlock(&s->lock); @@ -3273,6 +3280,166 @@ static coroutine_fn int qcow2_co_pdiscard(BlockDriverState *bs, return ret; } +static int coroutine_fn +qcow2_co_copy_range_from(BlockDriverState *bs, + BdrvChild *src, uint64_t src_offset, + BdrvChild *dst, uint64_t dst_offset, + uint64_t bytes, BdrvRequestFlags flags) +{ + BDRVQcow2State *s = bs->opaque; + int ret; + unsigned int cur_bytes; /* number of bytes in current iteration */ + BdrvChild *child = NULL; + BdrvRequestFlags cur_flags; + + assert(!bs->encrypted); + qemu_co_mutex_lock(&s->lock); + + while (bytes != 0) { + uint64_t copy_offset = 0; + /* prepare next request */ + cur_bytes = MIN(bytes, INT_MAX); + cur_flags = flags; + + ret = qcow2_get_cluster_offset(bs, src_offset, &cur_bytes, ©_offset); + if (ret < 0) { + goto out; + } + + switch (ret) { + case QCOW2_CLUSTER_UNALLOCATED: + if (bs->backing && bs->backing->bs) { + int64_t backing_length = bdrv_getlength(bs->backing->bs); + if (src_offset >= backing_length) { + cur_flags |= BDRV_REQ_ZERO_WRITE; + } else { + child = bs->backing; + cur_bytes = MIN(cur_bytes, backing_length - src_offset); + copy_offset = src_offset; + } + 
} else { + cur_flags |= BDRV_REQ_ZERO_WRITE; + } + break; + + case QCOW2_CLUSTER_ZERO_PLAIN: + case QCOW2_CLUSTER_ZERO_ALLOC: + cur_flags |= BDRV_REQ_ZERO_WRITE; + break; + + case QCOW2_CLUSTER_COMPRESSED: + ret = -ENOTSUP; + goto out; + break; + + case QCOW2_CLUSTER_NORMAL: + child = bs->file; + copy_offset += offset_into_cluster(s, src_offset); + if ((copy_offset & 511) != 0) { + ret = -EIO; + goto out; + } + break; + + default: + abort(); + } + qemu_co_mutex_unlock(&s->lock); + ret = bdrv_co_copy_range_from(child, + copy_offset, + dst, dst_offset, + cur_bytes, cur_flags); + qemu_co_mutex_lock(&s->lock); + if (ret < 0) { + goto out; + } + + bytes -= cur_bytes; + src_offset += cur_bytes; + dst_offset += cur_bytes; + } + ret = 0; + +out: + qemu_co_mutex_unlock(&s->lock); + return ret; +} + +static int coroutine_fn +qcow2_co_copy_range_to(BlockDriverState *bs, + BdrvChild *src, uint64_t src_offset, + BdrvChild *dst, uint64_t dst_offset, + uint64_t bytes, BdrvRequestFlags flags) +{ + BDRVQcow2State *s = bs->opaque; + int offset_in_cluster; + int ret; + unsigned int cur_bytes; /* number of sectors in current iteration */ + uint64_t cluster_offset; + uint8_t *cluster_data = NULL; + QCowL2Meta *l2meta = NULL; + + assert(!bs->encrypted); + s->cluster_cache_offset = -1; /* disable compressed cache */ + + qemu_co_mutex_lock(&s->lock); + + while (bytes != 0) { + + l2meta = NULL; + + offset_in_cluster = offset_into_cluster(s, dst_offset); + cur_bytes = MIN(bytes, INT_MAX); + + /* TODO: + * If src->bs == dst->bs, we could simply copy by incrementing + * the refcnt, without copying user data. + * Or if src->bs == dst->bs->backing->bs, we could copy by discarding. 
*/ + ret = qcow2_alloc_cluster_offset(bs, dst_offset, &cur_bytes, + &cluster_offset, &l2meta); + if (ret < 0) { + goto fail; + } + + assert((cluster_offset & 511) == 0); + + ret = qcow2_pre_write_overlap_check(bs, 0, + cluster_offset + offset_in_cluster, cur_bytes); + if (ret < 0) { + goto fail; + } + + qemu_co_mutex_unlock(&s->lock); + ret = bdrv_co_copy_range_to(src, src_offset, + bs->file, + cluster_offset + offset_in_cluster, + cur_bytes, flags); + qemu_co_mutex_lock(&s->lock); + if (ret < 0) { + goto fail; + } + + ret = qcow2_handle_l2meta(bs, &l2meta, true); + if (ret) { + goto fail; + } + + bytes -= cur_bytes; + dst_offset += cur_bytes; + } + ret = 0; + +fail: + qcow2_handle_l2meta(bs, &l2meta, false); + + qemu_co_mutex_unlock(&s->lock); + + qemu_vfree(cluster_data); + trace_qcow2_writev_done_req(qemu_coroutine_self(), ret); + + return ret; +} + static int qcow2_truncate(BlockDriverState *bs, int64_t offset, PreallocMode prealloc, Error **errp) { @@ -4521,6 +4688,8 @@ BlockDriver bdrv_qcow2 = { .bdrv_co_pwrite_zeroes = qcow2_co_pwrite_zeroes, .bdrv_co_pdiscard = qcow2_co_pdiscard, + .bdrv_co_copy_range_from = qcow2_co_copy_range_from, + .bdrv_co_copy_range_to = qcow2_co_copy_range_to, .bdrv_truncate = qcow2_truncate, .bdrv_co_pwritev_compressed = qcow2_co_pwritev_compressed, .bdrv_make_empty = qcow2_make_empty, diff --git a/block/raw-format.c b/block/raw-format.c index fe33693a2d..f2e468df6f 100644 --- a/block/raw-format.c +++ b/block/raw-format.c @@ -167,16 +167,37 @@ static void raw_reopen_abort(BDRVReopenState *state) state->opaque = NULL; } +/* Check and adjust the offset, against 'offset' and 'size' options. */ +static inline int raw_adjust_offset(BlockDriverState *bs, uint64_t *offset, + uint64_t bytes, bool is_write) +{ + BDRVRawState *s = bs->opaque; + + if (s->has_size && (*offset > s->size || bytes > (s->size - *offset))) { + /* There's not enough space for the write, or the read request is + * out-of-range. 
Don't read/write anything to prevent leaking out of + * the size specified in options. */ + return is_write ? -ENOSPC : -EINVAL;; + } + + if (*offset > INT64_MAX - s->offset) { + return -EINVAL; + } + *offset += s->offset; + + return 0; +} + static int coroutine_fn raw_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags) { - BDRVRawState *s = bs->opaque; + int ret; - if (offset > UINT64_MAX - s->offset) { - return -EINVAL; + ret = raw_adjust_offset(bs, &offset, bytes, false); + if (ret) { + return ret; } - offset += s->offset; BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags); @@ -186,23 +207,11 @@ static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags) { - BDRVRawState *s = bs->opaque; void *buf = NULL; BlockDriver *drv; QEMUIOVector local_qiov; int ret; - if (s->has_size && (offset > s->size || bytes > (s->size - offset))) { - /* There's not enough space for the data. Don't write anything and just - * fail to prevent leaking out of the size specified in options. 
*/ - return -ENOSPC; - } - - if (offset > UINT64_MAX - s->offset) { - ret = -EINVAL; - goto fail; - } - if (bs->probed && offset < BLOCK_PROBE_BUF_SIZE && bytes) { /* Handling partial writes would be a pain - so we just * require that guests have 512-byte request alignment if @@ -237,7 +246,10 @@ static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset, qiov = &local_qiov; } - offset += s->offset; + ret = raw_adjust_offset(bs, &offset, bytes, true); + if (ret) { + goto fail; + } BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO); ret = bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags); @@ -267,22 +279,24 @@ static int coroutine_fn raw_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int bytes, BdrvRequestFlags flags) { - BDRVRawState *s = bs->opaque; - if (offset > UINT64_MAX - s->offset) { - return -EINVAL; + int ret; + + ret = raw_adjust_offset(bs, (uint64_t *)&offset, bytes, true); + if (ret) { + return ret; } - offset += s->offset; return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags); } static int coroutine_fn raw_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes) { - BDRVRawState *s = bs->opaque; - if (offset > UINT64_MAX - s->offset) { - return -EINVAL; + int ret; + + ret = raw_adjust_offset(bs, (uint64_t *)&offset, bytes, true); + if (ret) { + return ret; } - offset += s->offset; return bdrv_co_pdiscard(bs->file->bs, offset, bytes); } @@ -483,6 +497,36 @@ static int raw_probe_geometry(BlockDriverState *bs, HDGeometry *geo) return bdrv_probe_geometry(bs->file->bs, geo); } +static int coroutine_fn raw_co_copy_range_from(BlockDriverState *bs, + BdrvChild *src, uint64_t src_offset, + BdrvChild *dst, uint64_t dst_offset, + uint64_t bytes, BdrvRequestFlags flags) +{ + int ret; + + ret = raw_adjust_offset(bs, &src_offset, bytes, false); + if (ret) { + return ret; + } + return bdrv_co_copy_range_from(bs->file, src_offset, dst, dst_offset, + bytes, flags); +} + +static int coroutine_fn raw_co_copy_range_to(BlockDriverState *bs, + 
BdrvChild *src, uint64_t src_offset, + BdrvChild *dst, uint64_t dst_offset, + uint64_t bytes, BdrvRequestFlags flags) +{ + int ret; + + ret = raw_adjust_offset(bs, &dst_offset, bytes, true); + if (ret) { + return ret; + } + return bdrv_co_copy_range_to(src, src_offset, bs->file, dst_offset, bytes, + flags); +} + BlockDriver bdrv_raw = { .format_name = "raw", .instance_size = sizeof(BDRVRawState), @@ -499,6 +543,8 @@ BlockDriver bdrv_raw = { .bdrv_co_pwrite_zeroes = &raw_co_pwrite_zeroes, .bdrv_co_pdiscard = &raw_co_pdiscard, .bdrv_co_block_status = &raw_co_block_status, + .bdrv_co_copy_range_from = &raw_co_copy_range_from, + .bdrv_co_copy_range_to = &raw_co_copy_range_to, .bdrv_truncate = &raw_truncate, .bdrv_getlength = &raw_getlength, .has_variable_length = true, @@ -5170,6 +5170,20 @@ if test "$fortify_source" != "no"; then fi fi +############################################### +# Check if copy_file_range is provided by glibc +have_copy_file_range=no +cat > $TMPC << EOF +#include <unistd.h> +int main(void) { + copy_file_range(0, NULL, 0, NULL, 0, 0); + return 0; +} +EOF +if compile_prog "" "" ; then + have_copy_file_range=yes +fi + ########################################## # check if struct fsxattr is available via linux/fs.h @@ -6273,6 +6287,9 @@ fi if test "$have_fsxattr" = "yes" ; then echo "HAVE_FSXATTR=y" >> $config_host_mak fi +if test "$have_copy_file_range" = "yes" ; then + echo "HAVE_COPY_FILE_RANGE=y" >> $config_host_mak +fi if test "$vte" = "yes" ; then echo "CONFIG_VTE=y" >> $config_host_mak echo "VTE_CFLAGS=$vte_cflags" >> $config_host_mak diff --git a/include/block/block.h b/include/block/block.h index 3894edda9d..6cc6c7e699 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -611,4 +611,36 @@ bool bdrv_can_store_new_dirty_bitmap(BlockDriverState *bs, const char *name, */ void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size); void bdrv_unregister_buf(BlockDriverState *bs, void *host); + +/** + * + * 
bdrv_co_copy_range: + * + * Do offloaded copy between two children. If the operation is not implemented + * by the driver, or if the backend storage doesn't support it, a negative + * error code will be returned. + * + * Note: block layer doesn't emulate or fallback to a bounce buffer approach + * because usually the caller shouldn't attempt offloaded copy any more (e.g. + * calling copy_file_range(2)) after the first error, thus it should fall back + * to a read+write path in the caller level. + * + * @src: Source child to copy data from + * @src_offset: offset in @src image to read data + * @dst: Destination child to copy data to + * @dst_offset: offset in @dst image to write data + * @bytes: number of bytes to copy + * @flags: request flags. Must be one of: + * 0 - actually read data from src; + * BDRV_REQ_ZERO_WRITE - treat the @src range as zero data and do zero + * write on @dst as if bdrv_co_pwrite_zeroes is + * called. Used to simplify caller code, or + * during BlockDriver.bdrv_co_copy_range_from() + * recursion. + * + * Returns: 0 if succeeded; negative error code if failed. + **/ +int coroutine_fn bdrv_co_copy_range(BdrvChild *src, uint64_t src_offset, + BdrvChild *dst, uint64_t dst_offset, + uint64_t bytes, BdrvRequestFlags flags); #endif diff --git a/include/block/block_int.h b/include/block/block_int.h index 6c0927bce3..888b7f7bff 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -204,6 +204,37 @@ struct BlockDriver { int coroutine_fn (*bdrv_co_pdiscard)(BlockDriverState *bs, int64_t offset, int bytes); + /* Map [offset, offset + nbytes) range onto a child of @bs to copy from, + * and invoke bdrv_co_copy_range_from(child, ...), or invoke + * bdrv_co_copy_range_to() if @bs is the leaf child to copy data from. + * + * See the comment of bdrv_co_copy_range for the parameter and return value + * semantics. 
+ */ + int coroutine_fn (*bdrv_co_copy_range_from)(BlockDriverState *bs, + BdrvChild *src, + uint64_t offset, + BdrvChild *dst, + uint64_t dst_offset, + uint64_t bytes, + BdrvRequestFlags flags); + + /* Map [offset, offset + nbytes) range onto a child of bs to copy data to, + * and invoke bdrv_co_copy_range_to(child, src, ...), or perform the copy + * operation if @bs is the leaf and @src has the same BlockDriver. Return + * -ENOTSUP if @bs is the leaf but @src has a different BlockDriver. + * + * See the comment of bdrv_co_copy_range for the parameter and return value + * semantics. + */ + int coroutine_fn (*bdrv_co_copy_range_to)(BlockDriverState *bs, + BdrvChild *src, + uint64_t src_offset, + BdrvChild *dst, + uint64_t dst_offset, + uint64_t bytes, + BdrvRequestFlags flags); + /* * Building block for bdrv_block_status[_above] and * bdrv_is_allocated[_above]. The driver should answer only @@ -1102,4 +1133,11 @@ void bdrv_dec_in_flight(BlockDriverState *bs); void blockdev_close_all_bdrv_states(void); +int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, uint64_t src_offset, + BdrvChild *dst, uint64_t dst_offset, + uint64_t bytes, BdrvRequestFlags flags); +int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, uint64_t src_offset, + BdrvChild *dst, uint64_t dst_offset, + uint64_t bytes, BdrvRequestFlags flags); + #endif /* BLOCK_INT_H */ diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h index 9e47b8a629..0e717fd475 100644 --- a/include/block/raw-aio.h +++ b/include/block/raw-aio.h @@ -25,9 +25,15 @@ #define QEMU_AIO_FLUSH 0x0008 #define QEMU_AIO_DISCARD 0x0010 #define QEMU_AIO_WRITE_ZEROES 0x0020 +#define QEMU_AIO_COPY_RANGE 0x0040 #define QEMU_AIO_TYPE_MASK \ - (QEMU_AIO_READ|QEMU_AIO_WRITE|QEMU_AIO_IOCTL|QEMU_AIO_FLUSH| \ - QEMU_AIO_DISCARD|QEMU_AIO_WRITE_ZEROES) + (QEMU_AIO_READ | \ + QEMU_AIO_WRITE | \ + QEMU_AIO_IOCTL | \ + QEMU_AIO_FLUSH | \ + QEMU_AIO_DISCARD | \ + QEMU_AIO_WRITE_ZEROES | \ + QEMU_AIO_COPY_RANGE) /* AIO flags */ #define 
QEMU_AIO_MISALIGNED 0x1000 diff --git a/include/scsi/constants.h b/include/scsi/constants.h index a141dd71f8..083a8e887a 100644 --- a/include/scsi/constants.h +++ b/include/scsi/constants.h @@ -311,4 +311,8 @@ #define MMC_PROFILE_HDDVD_RW_DL 0x005A #define MMC_PROFILE_INVALID 0xFFFF +#define XCOPY_DESC_OFFSET 16 +#define IDENT_DESCR_TGT_DESCR_SIZE 32 +#define XCOPY_BLK2BLK_SEG_DESC_SIZE 28 + #endif diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h index 92ab624fac..8d03d493c2 100644 --- a/include/sysemu/block-backend.h +++ b/include/sysemu/block-backend.h @@ -232,4 +232,8 @@ void blk_set_force_allow_inactivate(BlockBackend *blk); void blk_register_buf(BlockBackend *blk, void *host, size_t size); void blk_unregister_buf(BlockBackend *blk, void *host); +int coroutine_fn blk_co_copy_range(BlockBackend *blk_in, int64_t off_in, + BlockBackend *blk_out, int64_t off_out, + int bytes, BdrvRequestFlags flags); + #endif diff --git a/qemu-img.c b/qemu-img.c index 976b437da0..75f1610aa0 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -1547,6 +1547,7 @@ typedef struct ImgConvertState { bool compressed; bool target_has_backing; bool wr_in_order; + bool copy_range; int min_sparse; size_t cluster_sectors; size_t buf_sectors; @@ -1740,6 +1741,37 @@ static int coroutine_fn convert_co_write(ImgConvertState *s, int64_t sector_num, return 0; } +static int coroutine_fn convert_co_copy_range(ImgConvertState *s, int64_t sector_num, + int nb_sectors) +{ + int n, ret; + + while (nb_sectors > 0) { + BlockBackend *blk; + int src_cur; + int64_t bs_sectors, src_cur_offset; + int64_t offset; + + convert_select_part(s, sector_num, &src_cur, &src_cur_offset); + offset = (sector_num - src_cur_offset) << BDRV_SECTOR_BITS; + blk = s->src[src_cur]; + bs_sectors = s->src_sectors[src_cur]; + + n = MIN(nb_sectors, bs_sectors - (sector_num - src_cur_offset)); + + ret = blk_co_copy_range(blk, offset, s->target, + sector_num << BDRV_SECTOR_BITS, + n << BDRV_SECTOR_BITS, 0); + if (ret 
< 0) { + return ret; + } + + sector_num += n; + nb_sectors -= n; + } + return 0; +} + static void coroutine_fn convert_co_do_copy(void *opaque) { ImgConvertState *s = opaque; @@ -1762,6 +1794,7 @@ static void coroutine_fn convert_co_do_copy(void *opaque) int n; int64_t sector_num; enum ImgConvertBlockStatus status; + bool copy_range; qemu_co_mutex_lock(&s->lock); if (s->ret != -EINPROGRESS || s->sector_num >= s->total_sectors) { @@ -1791,7 +1824,9 @@ static void coroutine_fn convert_co_do_copy(void *opaque) s->allocated_sectors, 0); } - if (status == BLK_DATA) { +retry: + copy_range = s->copy_range && s->status == BLK_DATA; + if (status == BLK_DATA && !copy_range) { ret = convert_co_read(s, sector_num, n, buf); if (ret < 0) { error_report("error while reading sector %" PRId64 @@ -1813,7 +1848,15 @@ static void coroutine_fn convert_co_do_copy(void *opaque) } if (s->ret == -EINPROGRESS) { - ret = convert_co_write(s, sector_num, n, buf, status); + if (copy_range) { + ret = convert_co_copy_range(s, sector_num, n); + if (ret) { + s->copy_range = false; + goto retry; + } + } else { + ret = convert_co_write(s, sector_num, n, buf, status); + } if (ret < 0) { error_report("error while writing sector %" PRId64 ": %s", sector_num, strerror(-ret)); @@ -1936,6 +1979,7 @@ static int img_convert(int argc, char **argv) ImgConvertState s = (ImgConvertState) { /* Need at least 4k of zeros for sparse detection */ .min_sparse = 8, + .copy_range = true, .buf_sectors = IO_BUF_SIZE / BDRV_SECTOR_SIZE, .wr_in_order = true, .num_coroutines = 8, @@ -1976,6 +2020,7 @@ static int img_convert(int argc, char **argv) break; case 'c': s.compressed = true; + s.copy_range = false; break; case 'o': if (!is_valid_option_list(optarg)) { @@ -2017,6 +2062,7 @@ static int img_convert(int argc, char **argv) } s.min_sparse = sval / BDRV_SECTOR_SIZE; + s.copy_range = false; break; } case 'p': diff --git a/tests/qemu-iotests/common.filter b/tests/qemu-iotests/common.filter index f08ee55046..2031e353a5 100644 
--- a/tests/qemu-iotests/common.filter +++ b/tests/qemu-iotests/common.filter @@ -77,7 +77,6 @@ _filter_qemu() { sed -e "s#\\(^\\|(qemu) \\)$(basename $QEMU_PROG):#\1QEMU_PROG:#" \ -e 's#^QEMU [0-9]\+\.[0-9]\+\.[0-9]\+ monitor#QEMU X.Y.Z monitor#' \ - -e '/main-loop: WARNING: I\/O thread spun for [0-9]\+ iterations/d' \ -e $'s#\r##' # QEMU monitor uses \r\n line endings } diff --git a/util/main-loop.c b/util/main-loop.c index 992f9b0f34..affe0403c5 100644 --- a/util/main-loop.c +++ b/util/main-loop.c @@ -222,36 +222,11 @@ static int os_host_main_loop_wait(int64_t timeout) { GMainContext *context = g_main_context_default(); int ret; - static int spin_counter; g_main_context_acquire(context); glib_pollfds_fill(&timeout); - /* If the I/O thread is very busy or we are incorrectly busy waiting in - * the I/O thread, this can lead to starvation of the BQL such that the - * VCPU threads never run. To make sure we can detect the later case, - * print a message to the screen. If we run into this condition, create - * a fake timeout in order to give the VCPU threads a chance to run. - */ - if (!timeout && (spin_counter > MAX_MAIN_LOOP_SPIN)) { - static bool notified; - - if (!notified && !qtest_enabled() && !qtest_driver()) { - warn_report("I/O thread spun for %d iterations", - MAX_MAIN_LOOP_SPIN); - notified = true; - } - - timeout = SCALE_MS; - } - - - if (timeout) { - spin_counter = 0; - } else { - spin_counter++; - } qemu_mutex_unlock_iothread(); replay_mutex_unlock(); |