diff options
-rw-r--r-- | block/commit.c | 8 | ||||
-rw-r--r-- | block/iscsi.c | 5 | ||||
-rw-r--r-- | block/mirror.c | 78 | ||||
-rw-r--r-- | block/sheepdog.c | 7 | ||||
-rw-r--r-- | block/vhdx-log.c | 13 | ||||
-rw-r--r-- | block/vhdx.c | 22 | ||||
-rw-r--r-- | block/vhdx.h | 5 | ||||
-rw-r--r-- | block/vmdk.c | 173 | ||||
-rw-r--r-- | blockdev.c | 9 | ||||
-rwxr-xr-x | configure | 22 | ||||
-rw-r--r-- | hw/block/dataplane/virtio-blk.c | 86 | ||||
-rw-r--r-- | hw/virtio/dataplane/Makefile.objs | 2 | ||||
-rw-r--r-- | hw/virtio/dataplane/hostmem.c | 183 | ||||
-rw-r--r-- | hw/virtio/dataplane/vring.c | 253 | ||||
-rw-r--r-- | include/block/block_int.h | 22 | ||||
-rw-r--r-- | include/hw/virtio/dataplane/hostmem.h | 58 | ||||
-rw-r--r-- | include/hw/virtio/dataplane/vring.h | 10 | ||||
-rw-r--r-- | qapi-schema.json | 7 | ||||
-rw-r--r-- | qemu-doc.texi | 15 | ||||
-rw-r--r-- | qemu-img.texi | 4 | ||||
-rwxr-xr-x | tests/qemu-iotests/040 | 74 | ||||
-rw-r--r-- | tests/qemu-iotests/051.out | 1 | ||||
-rwxr-xr-x | tests/qemu-iotests/059 | 14 | ||||
-rw-r--r-- | tests/qemu-iotests/059.out | 5 |
24 files changed, 535 insertions, 541 deletions
diff --git a/block/commit.c b/block/commit.c index d4090cbf7d..acec4ac5a8 100644 --- a/block/commit.c +++ b/block/commit.c @@ -198,13 +198,7 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base, return; } - /* Once we support top == active layer, remove this check */ - if (top == bs) { - error_setg(errp, - "Top image as the active layer is currently unsupported"); - return; - } - + assert(top != bs); if (top == base) { error_setg(errp, "Invalid files for merge: top and base are the same"); return; diff --git a/block/iscsi.c b/block/iscsi.c index fa69408df9..294b2c6ab3 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -359,7 +359,10 @@ retry: default: iTask.task = iscsi_read10_task(iscsilun->iscsi, iscsilun->lun, lba, num_sectors * iscsilun->block_size, - iscsilun->block_size, 0, 0, 0, 0, 0, + iscsilun->block_size, +#if !defined(CONFIG_LIBISCSI_1_4) /* API change from 1.4.0 to 1.5.0 */ + 0, 0, 0, 0, 0, +#endif iscsi_co_generic_cb, &iTask); break; } diff --git a/block/mirror.c b/block/mirror.c index 6dc27ad35d..2932bab27a 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -31,7 +31,8 @@ typedef struct MirrorBlockJob { BlockJob common; RateLimit limit; BlockDriverState *target; - MirrorSyncMode mode; + BlockDriverState *base; + bool is_none_mode; BlockdevOnError on_source_error, on_target_error; bool synced; bool should_complete; @@ -335,10 +336,9 @@ static void coroutine_fn mirror_run(void *opaque) sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS; mirror_free_init(s); - if (s->mode != MIRROR_SYNC_MODE_NONE) { + if (!s->is_none_mode) { /* First part, loop on the sectors and initialize the dirty bitmap. */ - BlockDriverState *base; - base = s->mode == MIRROR_SYNC_MODE_FULL ? NULL : bs->backing_hd; + BlockDriverState *base = s->base; for (sector_num = 0; sector_num < end; ) { int64_t next = (sector_num | (sectors_per_chunk - 1)) + 1; ret = bdrv_is_allocated_above(bs, base, @@ -481,8 +481,14 @@ immediate_exit: bdrv_reopen(s->target, bdrv_get_flags(s->common.bs), NULL); } bdrv_swap(s->target, s->common.bs); + if (s->common.driver->job_type == BLOCK_JOB_TYPE_COMMIT) { + /* drop the bs loop chain formed by the swap: break the loop then + * trigger the unref from the top one */ + BlockDriverState *p = s->base->backing_hd; + s->base->backing_hd = NULL; + bdrv_unref(p); + } } - bdrv_close(s->target); bdrv_unref(s->target); block_job_completed(&s->common, ret); } @@ -536,12 +542,24 @@ static const BlockJobDriver mirror_job_driver = { .complete = mirror_complete, }; -void mirror_start(BlockDriverState *bs, BlockDriverState *target, - int64_t speed, int64_t granularity, int64_t buf_size, - MirrorSyncMode mode, BlockdevOnError on_source_error, - BlockdevOnError on_target_error, - BlockDriverCompletionFunc *cb, - void *opaque, Error **errp) +static const BlockJobDriver commit_active_job_driver = { + .instance_size = sizeof(MirrorBlockJob), + .job_type = BLOCK_JOB_TYPE_COMMIT, + .set_speed = mirror_set_speed, + .iostatus_reset + = mirror_iostatus_reset, + .complete = mirror_complete, +}; + +static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, + int64_t speed, int64_t granularity, + int64_t buf_size, + BlockdevOnError on_source_error, + BlockdevOnError on_target_error, + BlockDriverCompletionFunc *cb, + void *opaque, Error **errp, + const BlockJobDriver *driver, + bool is_none_mode, BlockDriverState *base) { MirrorBlockJob *s; @@ -566,7 +584,8 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target, return; } - s = block_job_create(&mirror_job_driver, bs, speed, cb, opaque, errp); + + s = block_job_create(driver, bs, speed, cb, opaque, errp); if (!s) { return; } @@ -574,7 +593,8 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target, s->on_source_error = on_source_error; s->on_target_error = on_target_error; s->target = target; - s->mode = mode; + s->is_none_mode = is_none_mode; + s->base = base; s->granularity = granularity; s->buf_size = MAX(buf_size, granularity); @@ -586,3 +606,35 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target, trace_mirror_start(bs, s, s->common.co, opaque); qemu_coroutine_enter(s->common.co, s); } + +void mirror_start(BlockDriverState *bs, BlockDriverState *target, + int64_t speed, int64_t granularity, int64_t buf_size, + MirrorSyncMode mode, BlockdevOnError on_source_error, + BlockdevOnError on_target_error, + BlockDriverCompletionFunc *cb, + void *opaque, Error **errp) +{ + bool is_none_mode; + BlockDriverState *base; + + is_none_mode = mode == MIRROR_SYNC_MODE_NONE; + base = mode == MIRROR_SYNC_MODE_TOP ? bs->backing_hd : NULL; + mirror_start_job(bs, target, speed, granularity, buf_size, + on_source_error, on_target_error, cb, opaque, errp, + &mirror_job_driver, is_none_mode, base); +} + +void commit_active_start(BlockDriverState *bs, BlockDriverState *base, + int64_t speed, + BlockdevOnError on_error, + BlockDriverCompletionFunc *cb, + void *opaque, Error **errp) +{ + if (bdrv_reopen(base, bs->open_flags, errp)) { + return; + } + bdrv_ref(base); + mirror_start_job(bs, base, speed, 0, 0, + on_error, on_error, cb, opaque, errp, + &commit_active_job_driver, false, base); +} diff --git a/block/sheepdog.c b/block/sheepdog.c index d1c812df3d..ba451a97a4 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -2048,13 +2048,14 @@ static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num, { SheepdogAIOCB *acb; int ret; + int64_t offset = (sector_num + nb_sectors) * BDRV_SECTOR_SIZE; + BDRVSheepdogState *s = bs->opaque; - if (bs->growable && sector_num + nb_sectors > bs->total_sectors) { - ret = sd_truncate(bs, (sector_num + nb_sectors) * BDRV_SECTOR_SIZE); + if (bs->growable && offset > s->inode.vdi_size) { + ret = sd_truncate(bs, offset); if (ret < 0) { return ret; } - bs->total_sectors = sector_num + nb_sectors; } acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors); diff --git a/block/vhdx-log.c b/block/vhdx-log.c index ee5583c309..8c9ae0d8e7 100644 --- a/block/vhdx-log.c +++ b/block/vhdx-log.c @@ -706,7 +706,8 @@ exit: * * If read-only, we must replay the log in RAM (or refuse to open * a dirty VHDX file read-only) */ -int vhdx_parse_log(BlockDriverState *bs, BDRVVHDXState *s, bool *flushed) +int vhdx_parse_log(BlockDriverState *bs, BDRVVHDXState *s, bool *flushed, + Error **errp) { int ret = 0; VHDXHeader *hdr; @@ -761,6 +762,16 @@ int vhdx_parse_log(BlockDriverState *bs, BDRVVHDXState *s, bool *flushed) } if (logs.valid) { + if (bs->read_only) { + ret = -EPERM; + error_setg_errno(errp, EPERM, + "VHDX image file '%s' opened read-only, but " + "contains a log that needs to be replayed. To " + "replay the log, execute:\n qemu-img check -r " + "all '%s'", + bs->filename, bs->filename); + goto exit; + } /* now flush the log */ ret = vhdx_log_flush(bs, s, &logs); if (ret < 0) { diff --git a/block/vhdx.c b/block/vhdx.c index 67bbe103a1..1995778945 100644 --- a/block/vhdx.c +++ b/block/vhdx.c @@ -878,7 +878,6 @@ static int vhdx_open(BlockDriverState *bs, QDict *options, int flags, int ret = 0; uint32_t i; uint64_t signature; - bool log_flushed = false; s->bat = NULL; @@ -907,7 +906,7 @@ static int vhdx_open(BlockDriverState *bs, QDict *options, int flags, goto fail; } - ret = vhdx_parse_log(bs, s, &log_flushed); + ret = vhdx_parse_log(bs, s, &s->log_replayed_on_open, errp); if (ret < 0) { goto fail; } @@ -1854,6 +1853,24 @@ exit: return ret; } +/* If opened r/w, the VHDX driver will automatically replay the log, + * if one is present, inside the vhdx_open() call. + * + * If qemu-img check -r all is called, the image is automatically opened + * r/w and any log has already been replayed, so there is nothing (currently) + * for us to do here + */ +static int vhdx_check(BlockDriverState *bs, BdrvCheckResult *result, + BdrvCheckMode fix) +{ + BDRVVHDXState *s = bs->opaque; + + if (s->log_replayed_on_open) { + result->corruptions_fixed++; + } + return 0; +} + static QEMUOptionParameter vhdx_create_options[] = { { .name = BLOCK_OPT_SIZE, @@ -1898,6 +1915,7 @@ static BlockDriver bdrv_vhdx = { .bdrv_co_writev = vhdx_co_writev, .bdrv_create = vhdx_create, .bdrv_get_info = vhdx_get_info, + .bdrv_check = vhdx_check, .create_options = vhdx_create_options, }; diff --git a/block/vhdx.h b/block/vhdx.h index 51183b243c..2acd7c2d19 100644 --- a/block/vhdx.h +++ b/block/vhdx.h @@ -394,6 +394,8 @@ typedef struct BDRVVHDXState { Error *migration_blocker; + bool log_replayed_on_open; + QLIST_HEAD(VHDXRegionHead, VHDXRegionEntry) regions; } BDRVVHDXState; @@ -408,7 +410,8 @@ uint32_t vhdx_checksum_calc(uint32_t crc, uint8_t *buf, size_t size, bool vhdx_checksum_is_valid(uint8_t *buf, size_t size, int crc_offset); -int vhdx_parse_log(BlockDriverState *bs, BDRVVHDXState *s, bool *flushed); +int vhdx_parse_log(BlockDriverState *bs, BDRVVHDXState *s, bool *flushed, + Error **errp); int vhdx_log_write_and_flush(BlockDriverState *bs, BDRVVHDXState *s, void *data, uint32_t length, uint64_t offset); diff --git a/block/vmdk.c b/block/vmdk.c index 0734bc200c..c6b60b4a91 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -749,9 +749,14 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs, return -EINVAL; } } else if (!strcmp(type, "VMFS")) { - flat_offset = 0; + if (ret == 4) { + flat_offset = 0; + } else { + error_setg(errp, "Invalid extent lines:\n%s", p); + return -EINVAL; + } } else if (ret != 4) { - error_setg(errp, "Invalid extent lines: \n%s", p); + error_setg(errp, "Invalid extent lines:\n%s", p); return -EINVAL; } @@ -1447,23 +1452,33 @@ static int coroutine_fn vmdk_co_write_zeroes(BlockDriverState *bs, } static int vmdk_create_extent(const char *filename, int64_t filesize, - bool flat, bool compress, bool zeroed_grain) + bool flat, bool compress, bool zeroed_grain, + Error **errp) { int ret, i; - int fd = 0; + BlockDriverState *bs = NULL; VMDK4Header header; - uint32_t tmp, magic, grains, gd_size, gt_size, gt_count; + Error *local_err; + uint32_t tmp, magic, grains, gd_sectors, gt_size, gt_count; + uint32_t *gd_buf = NULL; + int gd_buf_size; + + ret = bdrv_create_file(filename, NULL, &local_err); + if (ret < 0) { + error_propagate(errp, local_err); + goto exit; + } - fd = qemu_open(filename, - O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE, - 0644); - if (fd < 0) { - return -errno; + ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR, &local_err); + if (ret < 0) { + error_propagate(errp, local_err); + goto exit; } + if (flat) { - ret = ftruncate(fd, filesize); + ret = bdrv_truncate(bs, filesize); if (ret < 0) { - ret = -errno; + error_setg(errp, "Could not truncate file"); } goto exit; } @@ -1474,24 +1489,23 @@ static int vmdk_create_extent(const char *filename, int64_t filesize, | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER : 0) | (zeroed_grain ? VMDK4_FLAG_ZERO_GRAIN : 0); header.compressAlgorithm = compress ? VMDK4_COMPRESSION_DEFLATE : 0; - header.capacity = filesize / 512; + header.capacity = filesize / BDRV_SECTOR_SIZE; header.granularity = 128; - header.num_gtes_per_gt = 512; + header.num_gtes_per_gt = BDRV_SECTOR_SIZE; - grains = (filesize / 512 + header.granularity - 1) / header.granularity; - gt_size = ((header.num_gtes_per_gt * sizeof(uint32_t)) + 511) >> 9; - gt_count = - (grains + header.num_gtes_per_gt - 1) / header.num_gtes_per_gt; - gd_size = (gt_count * sizeof(uint32_t) + 511) >> 9; + grains = DIV_ROUND_UP(filesize / BDRV_SECTOR_SIZE, header.granularity); + gt_size = DIV_ROUND_UP(header.num_gtes_per_gt * sizeof(uint32_t), + BDRV_SECTOR_SIZE); + gt_count = DIV_ROUND_UP(grains, header.num_gtes_per_gt); + gd_sectors = DIV_ROUND_UP(gt_count * sizeof(uint32_t), BDRV_SECTOR_SIZE); header.desc_offset = 1; header.desc_size = 20; header.rgd_offset = header.desc_offset + header.desc_size; - header.gd_offset = header.rgd_offset + gd_size + (gt_size * gt_count); + header.gd_offset = header.rgd_offset + gd_sectors + (gt_size * gt_count); header.grain_offset = - ((header.gd_offset + gd_size + (gt_size * gt_count) + - header.granularity - 1) / header.granularity) * - header.granularity; + ROUND_UP(header.gd_offset + gd_sectors + (gt_size * gt_count), + header.granularity); /* swap endianness for all header fields */ header.version = cpu_to_le32(header.version); header.flags = cpu_to_le32(header.flags); @@ -1511,48 +1525,55 @@ static int vmdk_create_extent(const char *filename, int64_t filesize, header.check_bytes[3] = 0xa; /* write all the data */ - ret = qemu_write_full(fd, &magic, sizeof(magic)); - if (ret != sizeof(magic)) { - ret = -errno; + ret = bdrv_pwrite(bs, 0, &magic, sizeof(magic)); + if (ret < 0) { + error_set(errp, QERR_IO_ERROR); goto exit; } - ret = qemu_write_full(fd, &header, sizeof(header)); - if (ret != sizeof(header)) { - ret = -errno; + ret = bdrv_pwrite(bs, sizeof(magic), &header, sizeof(header)); + if (ret < 0) { + error_set(errp, QERR_IO_ERROR); goto exit; } - ret = ftruncate(fd, le64_to_cpu(header.grain_offset) << 9); + ret = bdrv_truncate(bs, le64_to_cpu(header.grain_offset) << 9); if (ret < 0) { - ret = -errno; + error_setg(errp, "Could not truncate file"); goto exit; } /* write grain directory */ - lseek(fd, le64_to_cpu(header.rgd_offset) << 9, SEEK_SET); - for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_size; + gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE; + gd_buf = g_malloc0(gd_buf_size); + for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_sectors; i < gt_count; i++, tmp += gt_size) { - ret = qemu_write_full(fd, &tmp, sizeof(tmp)); - if (ret != sizeof(tmp)) { - ret = -errno; - goto exit; - } + gd_buf[i] = cpu_to_le32(tmp); + } + ret = bdrv_pwrite(bs, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE, + gd_buf, gd_buf_size); + if (ret < 0) { + error_set(errp, QERR_IO_ERROR); + goto exit; } /* write backup grain directory */ - lseek(fd, le64_to_cpu(header.gd_offset) << 9, SEEK_SET); - for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_size; + for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_sectors; i < gt_count; i++, tmp += gt_size) { - ret = qemu_write_full(fd, &tmp, sizeof(tmp)); - if (ret != sizeof(tmp)) { - ret = -errno; - goto exit; - } + gd_buf[i] = cpu_to_le32(tmp); + } + ret = bdrv_pwrite(bs, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE, + gd_buf, gd_buf_size); + if (ret < 0) { + error_set(errp, QERR_IO_ERROR); + goto exit; } ret = 0; - exit: - qemu_close(fd); +exit: + if (bs) { + bdrv_unref(bs); + } + g_free(gd_buf); return ret; } @@ -1599,7 +1620,9 @@ static int filename_decompose(const char *filename, char *path, char *prefix, static int vmdk_create(const char *filename, QEMUOptionParameter *options, Error **errp) { - int fd, idx = 0; + int idx = 0; + BlockDriverState *new_bs = NULL; + Error *local_err; char *desc = NULL; int64_t total_size = 0, filesize; const char *adapter_type = NULL; @@ -1616,6 +1639,7 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options, uint32_t parent_cid = 0xffffffff; uint32_t number_heads = 16; bool zeroed_grain = false; + uint32_t desc_offset = 0, desc_len; const char desc_template[] = "# Disk DescriptorFile\n" "version=1\n" @@ -1749,7 +1773,7 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options, path, desc_filename); if (vmdk_create_extent(ext_filename, size, - flat, compress, zeroed_grain)) { + flat, compress, zeroed_grain, errp)) { ret = -EINVAL; goto exit; } @@ -1757,7 +1781,7 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options, /* Format description line */ snprintf(desc_line, sizeof(desc_line), - desc_extent_line, size / 512, desc_filename); + desc_extent_line, size / BDRV_SECTOR_SIZE, desc_filename); g_string_append(ext_desc_lines, desc_line); } /* generate descriptor file */ @@ -1768,36 +1792,43 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options, parent_desc_line, ext_desc_lines->str, (flags & BLOCK_FLAG_COMPAT6 ? 6 : 4), - total_size / (int64_t)(63 * number_heads * 512), + total_size / + (int64_t)(63 * number_heads * BDRV_SECTOR_SIZE), number_heads, adapter_type); - if (split || flat) { - fd = qemu_open(filename, - O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE, - 0644); + desc_len = strlen(desc); + /* the descriptor offset = 0x200 */ + if (!split && !flat) { + desc_offset = 0x200; } else { - fd = qemu_open(filename, - O_WRONLY | O_BINARY | O_LARGEFILE, - 0644); + ret = bdrv_create_file(filename, options, &local_err); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not create image file"); + goto exit; + } } - if (fd < 0) { - ret = -errno; + ret = bdrv_file_open(&new_bs, filename, NULL, BDRV_O_RDWR, &local_err); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not write description"); goto exit; } - /* the descriptor offset = 0x200 */ - if (!split && !flat && 0x200 != lseek(fd, 0x200, SEEK_SET)) { - ret = -errno; - goto close_exit; + ret = bdrv_pwrite(new_bs, desc_offset, desc, desc_len); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not write description"); + goto exit; } - ret = qemu_write_full(fd, desc, strlen(desc)); - if (ret != strlen(desc)) { - ret = -errno; - goto close_exit; + /* bdrv_pwrite write padding zeros to align to sector, we don't need that + * for description file */ + if (desc_offset == 0) { + ret = bdrv_truncate(new_bs, desc_len); + if (ret < 0) { + error_setg(errp, "Could not truncate file"); + } } - ret = 0; -close_exit: - qemu_close(fd); exit: + if (new_bs) { + bdrv_unref(new_bs); + } g_free(desc); g_string_free(ext_desc_lines, true); return ret; diff --git a/blockdev.c b/blockdev.c index 6a85961af2..2c3242b87a 100644 --- a/blockdev.c +++ b/blockdev.c @@ -1820,8 +1820,13 @@ void qmp_block_commit(const char *device, return; } - commit_start(bs, base_bs, top_bs, speed, on_error, block_job_cb, bs, - &local_err); + if (top_bs == bs) { + commit_active_start(bs, base_bs, speed, on_error, block_job_cb, + bs, &local_err); + } else { + commit_start(bs, base_bs, top_bs, speed, on_error, block_job_cb, bs, + &local_err); + } if (local_err != NULL) { error_propagate(errp, local_err); return; @@ -3078,6 +3078,21 @@ EOF fi fi +# We also need to know the API version because there was an +# API change from 1.4.0 to 1.5.0. +if test "$libiscsi" = "yes"; then + cat >$TMPC <<EOF +#include <iscsi/iscsi.h> +int main(void) +{ + iscsi_read10_task(0, 0, 0, 0, 0, 0, 0); + return 0; +} +EOF + if compile_prog "" "-liscsi"; then + libiscsi_version="1.4.0" + fi +fi ########################################## # Do we need libm @@ -3805,7 +3820,11 @@ echo "nss used $smartcard_nss" echo "libusb $libusb" echo "usb net redir $usb_redir" echo "GLX support $glx" +if test "$libiscsi_version" = "1.4.0"; then +echo "libiscsi support $libiscsi (1.4.0)" +else echo "libiscsi support $libiscsi" +fi echo "build guest agent $guest_agent" echo "QGA VSS support $guest_agent_with_vss" echo "seccomp support $seccomp" @@ -4137,6 +4156,9 @@ fi if test "$libiscsi" = "yes" ; then echo "CONFIG_LIBISCSI=y" >> $config_host_mak + if test "$libiscsi_version" = "1.4.0"; then + echo "CONFIG_LIBISCSI_1_4=y" >> $config_host_mak + fi fi if test "$seccomp" = "yes"; then diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c index 1e57f3aabd..456d437ac3 100644 --- a/hw/block/dataplane/virtio-blk.c +++ b/hw/block/dataplane/virtio-blk.c @@ -35,7 +35,7 @@ enum { typedef struct { struct iocb iocb; /* Linux AIO control block */ QEMUIOVector *inhdr; /* iovecs for virtio_blk_inhdr */ - unsigned int head; /* vring descriptor index */ + VirtQueueElement *elem; /* saved data from the virtqueue */ struct iovec *bounce_iov; /* used if guest buffers are unaligned */ QEMUIOVector *read_qiov; /* for read completion /w bounce buffer */ } VirtIOBlockRequest; @@ -96,7 +96,7 @@ static void complete_request(struct iocb *iocb, ssize_t ret, void *opaque) len = 0; } - trace_virtio_blk_data_plane_complete_request(s, req->head, ret); + trace_virtio_blk_data_plane_complete_request(s, req->elem->index, ret); if (req->read_qiov) { assert(req->bounce_iov); @@ -118,12 +118,12 @@ static void complete_request(struct iocb *iocb, ssize_t ret, void *opaque) * written to, but for virtio-blk it seems to be the number of bytes * transferred plus the status bytes. */ - vring_push(&s->vring, req->head, len + sizeof(hdr)); - + vring_push(&s->vring, req->elem, len + sizeof(hdr)); + req->elem = NULL; s->num_reqs--; } -static void complete_request_early(VirtIOBlockDataPlane *s, unsigned int head, +static void complete_request_early(VirtIOBlockDataPlane *s, VirtQueueElement *elem, QEMUIOVector *inhdr, unsigned char status) { struct virtio_blk_inhdr hdr = { @@ -134,26 +134,26 @@ static void complete_request_early(VirtIOBlockDataPlane *s, unsigned int head, qemu_iovec_destroy(inhdr); g_slice_free(QEMUIOVector, inhdr); - vring_push(&s->vring, head, sizeof(hdr)); + vring_push(&s->vring, elem, sizeof(hdr)); notify_guest(s); } /* Get disk serial number */ static void do_get_id_cmd(VirtIOBlockDataPlane *s, struct iovec *iov, unsigned int iov_cnt, - unsigned int head, QEMUIOVector *inhdr) + VirtQueueElement *elem, QEMUIOVector *inhdr) { char id[VIRTIO_BLK_ID_BYTES]; /* Serial number not NUL-terminated when shorter than buffer */ strncpy(id, s->blk->serial ? s->blk->serial : "", sizeof(id)); iov_from_buf(iov, iov_cnt, 0, id, sizeof(id)); - complete_request_early(s, head, inhdr, VIRTIO_BLK_S_OK); + complete_request_early(s, elem, inhdr, VIRTIO_BLK_S_OK); } static int do_rdwr_cmd(VirtIOBlockDataPlane *s, bool read, - struct iovec *iov, unsigned int iov_cnt, - long long offset, unsigned int head, + struct iovec *iov, unsigned iov_cnt, + long long offset, VirtQueueElement *elem, QEMUIOVector *inhdr) { struct iocb *iocb; @@ -186,19 +186,20 @@ static int do_rdwr_cmd(VirtIOBlockDataPlane *s, bool read, /* Fill in virtio block metadata needed for completion */ VirtIOBlockRequest *req = container_of(iocb, VirtIOBlockRequest, iocb); - req->head = head; + req->elem = elem; req->inhdr = inhdr; req->bounce_iov = bounce_iov; req->read_qiov = read_qiov; return 0; } -static int process_request(IOQueue *ioq, struct iovec iov[], - unsigned int out_num, unsigned int in_num, - unsigned int head) +static int process_request(IOQueue *ioq, VirtQueueElement *elem) { VirtIOBlockDataPlane *s = container_of(ioq, VirtIOBlockDataPlane, ioqueue); - struct iovec *in_iov = &iov[out_num]; + struct iovec *iov = elem->out_sg; + struct iovec *in_iov = elem->in_sg; + unsigned out_num = elem->out_num; + unsigned in_num = elem->in_num; struct virtio_blk_outhdr outhdr; QEMUIOVector *inhdr; size_t in_size; @@ -229,29 +230,29 @@ static int process_request(IOQueue *ioq, struct iovec iov[], switch (outhdr.type) { case VIRTIO_BLK_T_IN: - do_rdwr_cmd(s, true, in_iov, in_num, outhdr.sector * 512, head, inhdr); + do_rdwr_cmd(s, true, in_iov, in_num, outhdr.sector * 512, elem, inhdr); return 0; case VIRTIO_BLK_T_OUT: - do_rdwr_cmd(s, false, iov, out_num, outhdr.sector * 512, head, inhdr); + do_rdwr_cmd(s, false, iov, out_num, outhdr.sector * 512, elem, inhdr); return 0; case VIRTIO_BLK_T_SCSI_CMD: /* TODO support SCSI commands */ - complete_request_early(s, head, inhdr, VIRTIO_BLK_S_UNSUPP); + complete_request_early(s, elem, inhdr, VIRTIO_BLK_S_UNSUPP); return 0; case VIRTIO_BLK_T_FLUSH: /* TODO fdsync not supported by Linux AIO, do it synchronously here! */ if (qemu_fdatasync(s->fd) < 0) { - complete_request_early(s, head, inhdr, VIRTIO_BLK_S_IOERR); + complete_request_early(s, elem, inhdr, VIRTIO_BLK_S_IOERR); } else { - complete_request_early(s, head, inhdr, VIRTIO_BLK_S_OK); + complete_request_early(s, elem, inhdr, VIRTIO_BLK_S_OK); } return 0; case VIRTIO_BLK_T_GET_ID: - do_get_id_cmd(s, in_iov, in_num, head, inhdr); + do_get_id_cmd(s, in_iov, in_num, elem, inhdr); return 0; default: @@ -267,29 +268,8 @@ static void handle_notify(EventNotifier *e) VirtIOBlockDataPlane *s = container_of(e, VirtIOBlockDataPlane, host_notifier); - /* There is one array of iovecs into which all new requests are extracted - * from the vring. Requests are read from the vring and the translated - * descriptors are written to the iovecs array. The iovecs do not have to - * persist across handle_notify() calls because the kernel copies the - * iovecs on io_submit(). - * - * Handling io_submit() EAGAIN may require storing the requests across - * handle_notify() calls until the kernel has sufficient resources to - * accept more I/O. This is not implemented yet. - */ - struct iovec iovec[VRING_MAX]; - struct iovec *end = &iovec[VRING_MAX]; - struct iovec *iov = iovec; - - /* When a request is read from the vring, the index of the first descriptor - * (aka head) is returned so that the completed request can be pushed onto - * the vring later. - * - * The number of hypervisor read-only iovecs is out_num. The number of - * hypervisor write-only iovecs is in_num. - */ - int head; - unsigned int out_num = 0, in_num = 0; + VirtQueueElement *elem; + int ret; unsigned int num_queued; event_notifier_test_and_clear(&s->host_notifier); @@ -298,29 +278,31 @@ static void handle_notify(EventNotifier *e) vring_disable_notification(s->vdev, &s->vring); for (;;) { - head = vring_pop(s->vdev, &s->vring, iov, end, &out_num, &in_num); - if (head < 0) { + ret = vring_pop(s->vdev, &s->vring, &elem); + if (ret < 0) { + assert(elem == NULL); break; /* no more requests */ } - trace_virtio_blk_data_plane_process_request(s, out_num, in_num, - head); + trace_virtio_blk_data_plane_process_request(s, elem->out_num, + elem->in_num, elem->index); - if (process_request(&s->ioqueue, iov, out_num, in_num, head) < 0) { + if (process_request(&s->ioqueue, elem) < 0) { vring_set_broken(&s->vring); + vring_free_element(elem); + ret = -EFAULT; break; } - iov += out_num + in_num; } - if (likely(head == -EAGAIN)) { /* vring emptied */ + if (likely(ret == -EAGAIN)) { /* vring emptied */ /* Re-enable guest->host notifies and stop processing the vring. * But if the guest has snuck in more descriptors, keep processing. */ if (vring_enable_notification(s->vdev, &s->vring)) { break; } - } else { /* head == -ENOBUFS or fatal error, iovecs[] is depleted */ + } else { /* ret == -ENOBUFS or fatal error, iovecs[] is depleted */ /* Since there are no iovecs[] left, stop processing for now. Do * not re-enable guest->host notifies since the I/O completion * handler knows to check for more vring descriptors anyway. diff --git a/hw/virtio/dataplane/Makefile.objs b/hw/virtio/dataplane/Makefile.objs index a91bf33c8b..9a8cfc0297 100644 --- a/hw/virtio/dataplane/Makefile.objs +++ b/hw/virtio/dataplane/Makefile.objs @@ -1 +1 @@ -common-obj-y += hostmem.o vring.o +common-obj-y += vring.o diff --git a/hw/virtio/dataplane/hostmem.c b/hw/virtio/dataplane/hostmem.c deleted file mode 100644 index 901d98b8a0..0000000000 --- a/hw/virtio/dataplane/hostmem.c +++ /dev/null @@ -1,183 +0,0 @@ -/* - * Thread-safe guest to host memory mapping - * - * Copyright 2012 Red Hat, Inc. and/or its affiliates - * - * Authors: - * Stefan Hajnoczi <stefanha@redhat.com> - * - * This work is licensed under the terms of the GNU GPL, version 2 or later. - * See the COPYING file in the top-level directory. - * - */ - -#include "exec/address-spaces.h" -#include "hw/virtio/dataplane/hostmem.h" - -static int hostmem_lookup_cmp(const void *phys_, const void *region_) -{ - hwaddr phys = *(const hwaddr *)phys_; - const HostMemRegion *region = region_; - - if (phys < region->guest_addr) { - return -1; - } else if (phys >= region->guest_addr + region->size) { - return 1; - } else { - return 0; - } -} - -/** - * Map guest physical address to host pointer - */ -void *hostmem_lookup(HostMem *hostmem, hwaddr phys, hwaddr len, bool is_write) -{ - HostMemRegion *region; - void *host_addr = NULL; - hwaddr offset_within_region; - - qemu_mutex_lock(&hostmem->current_regions_lock); - region = bsearch(&phys, hostmem->current_regions, - hostmem->num_current_regions, - sizeof(hostmem->current_regions[0]), - hostmem_lookup_cmp); - if (!region) { - goto out; - } - if (is_write && region->readonly) { - goto out; - } - offset_within_region = phys - region->guest_addr; - if (len <= region->size - offset_within_region) { - host_addr = region->host_addr + offset_within_region; - } -out: - qemu_mutex_unlock(&hostmem->current_regions_lock); - - return host_addr; -} - -/** - * Install new regions list - */ -static void hostmem_listener_commit(MemoryListener *listener) -{ - HostMem *hostmem = container_of(listener, HostMem, listener); - int i; - - qemu_mutex_lock(&hostmem->current_regions_lock); - for (i = 0; i < hostmem->num_current_regions; i++) { - memory_region_unref(hostmem->current_regions[i].mr); - } - g_free(hostmem->current_regions); - hostmem->current_regions = hostmem->new_regions; - hostmem->num_current_regions = hostmem->num_new_regions; - qemu_mutex_unlock(&hostmem->current_regions_lock); - - /* Reset new regions list */ - hostmem->new_regions = NULL; - hostmem->num_new_regions = 0; -} - -/** - * Add a MemoryRegionSection to the new regions list - */ -static void hostmem_append_new_region(HostMem *hostmem, - MemoryRegionSection *section) -{ - void *ram_ptr = memory_region_get_ram_ptr(section->mr); - size_t num = hostmem->num_new_regions; - size_t new_size = (num + 1) * sizeof(hostmem->new_regions[0]); - - hostmem->new_regions = g_realloc(hostmem->new_regions, new_size); - hostmem->new_regions[num] = (HostMemRegion){ - .host_addr = ram_ptr + section->offset_within_region, - .guest_addr = section->offset_within_address_space, - .size = int128_get64(section->size), - .readonly = section->readonly, - .mr = section->mr, - }; - hostmem->num_new_regions++; - - memory_region_ref(section->mr); -} - -static void hostmem_listener_append_region(MemoryListener *listener, - MemoryRegionSection *section) -{ - HostMem *hostmem = container_of(listener, HostMem, listener); - - /* Ignore non-RAM regions, we may not be able to map them */ - if (!memory_region_is_ram(section->mr)) { - return; - } - - /* Ignore regions with dirty logging, we cannot mark them dirty */ - if (memory_region_is_logging(section->mr)) { - return; - } - - hostmem_append_new_region(hostmem, section); -} - -/* We don't implement most MemoryListener callbacks, use these nop stubs */ -static void hostmem_listener_dummy(MemoryListener *listener) -{ -} - -static void hostmem_listener_section_dummy(MemoryListener *listener, - MemoryRegionSection *section) -{ -} - -static void hostmem_listener_eventfd_dummy(MemoryListener *listener, - MemoryRegionSection *section, - bool match_data, uint64_t data, - EventNotifier *e) -{ -} - -static void hostmem_listener_coalesced_mmio_dummy(MemoryListener *listener, - MemoryRegionSection *section, - hwaddr addr, hwaddr len) -{ -} - -void hostmem_init(HostMem *hostmem) -{ - memset(hostmem, 0, sizeof(*hostmem)); - - qemu_mutex_init(&hostmem->current_regions_lock); - - hostmem->listener = (MemoryListener){ - .begin = hostmem_listener_dummy, - .commit = hostmem_listener_commit, - .region_add = hostmem_listener_append_region, - .region_del = hostmem_listener_section_dummy, - .region_nop = hostmem_listener_append_region, - .log_start = hostmem_listener_section_dummy, - .log_stop = hostmem_listener_section_dummy, - .log_sync = hostmem_listener_section_dummy, - .log_global_start = hostmem_listener_dummy, - .log_global_stop = hostmem_listener_dummy, - .eventfd_add = hostmem_listener_eventfd_dummy, - .eventfd_del = hostmem_listener_eventfd_dummy, - .coalesced_mmio_add = hostmem_listener_coalesced_mmio_dummy, - .coalesced_mmio_del = hostmem_listener_coalesced_mmio_dummy, - .priority = 10, - }; - - memory_listener_register(&hostmem->listener, &address_space_memory); - if (hostmem->num_new_regions > 0) { - hostmem_listener_commit(&hostmem->listener); - } -} - -void hostmem_finalize(HostMem *hostmem) -{ - memory_listener_unregister(&hostmem->listener); - g_free(hostmem->new_regions); - g_free(hostmem->current_regions); - qemu_mutex_destroy(&hostmem->current_regions_lock); -} diff --git a/hw/virtio/dataplane/vring.c b/hw/virtio/dataplane/vring.c index 351a343806..250d45ec3d 100644 --- a/hw/virtio/dataplane/vring.c +++ b/hw/virtio/dataplane/vring.c @@ -15,9 +15,53 @@ */ #include "trace.h" +#include "hw/hw.h" +#include "exec/memory.h" +#include "exec/address-spaces.h" #include "hw/virtio/dataplane/vring.h" #include "qemu/error-report.h" +/* vring_map can be coupled with vring_unmap or (if you still have the + * value returned in *mr) memory_region_unref. + */ +static void *vring_map(MemoryRegion **mr, hwaddr phys, hwaddr len, + bool is_write) +{ + MemoryRegionSection section = memory_region_find(get_system_memory(), phys, len); + + if (!section.mr || int128_get64(section.size) < len) { + goto out; + } + if (is_write && section.readonly) { + goto out; + } + if (!memory_region_is_ram(section.mr)) { + goto out; + } + + /* Ignore regions with dirty logging, we cannot mark them dirty */ + if (memory_region_is_logging(section.mr)) { + goto out; + } + + *mr = section.mr; + return memory_region_get_ram_ptr(section.mr) + section.offset_within_region; + +out: + memory_region_unref(section.mr); + *mr = NULL; + return NULL; +} + +static void vring_unmap(void *buffer, bool is_write) +{ + ram_addr_t addr; + MemoryRegion *mr; + + mr = qemu_ram_addr_from_host(buffer, &addr); + memory_region_unref(mr); +} + /* Map the guest's vring to host memory */ bool vring_setup(Vring *vring, VirtIODevice *vdev, int n) { @@ -27,8 +71,7 @@ bool vring_setup(Vring *vring, VirtIODevice *vdev, int n) vring->broken = false; - hostmem_init(&vring->hostmem); - vring_ptr = hostmem_lookup(&vring->hostmem, vring_addr, vring_size, true); + vring_ptr = vring_map(&vring->mr, vring_addr, vring_size, true); if (!vring_ptr) { error_report("Failed to map vring " "addr %#" HWADDR_PRIx " size %" HWADDR_PRIu, @@ -54,7 +97,7 @@ void vring_teardown(Vring *vring, VirtIODevice *vdev, int n) virtio_queue_set_last_avail_idx(vdev, n, vring->last_avail_idx); virtio_queue_invalidate_signalled_used(vdev, n); - hostmem_finalize(&vring->hostmem); + memory_region_unref(vring->mr); } /* Disable guest->host notifies */ @@ -110,14 +153,61 @@ bool vring_should_notify(VirtIODevice *vdev, Vring *vring) return vring_need_event(vring_used_event(&vring->vr), new, old); } + +static int get_desc(Vring *vring, VirtQueueElement *elem, + struct vring_desc *desc) +{ + unsigned *num; + struct iovec *iov; + hwaddr *addr; + MemoryRegion *mr; + + if (desc->flags & VRING_DESC_F_WRITE) { + num = &elem->in_num; + iov = &elem->in_sg[*num]; + addr = &elem->in_addr[*num]; + } else { + num = &elem->out_num; + iov = &elem->out_sg[*num]; + addr = &elem->out_addr[*num]; + + /* If it's an output descriptor, they're all supposed + * to come before any input descriptors. */ + if (unlikely(elem->in_num)) { + error_report("Descriptor has out after in"); + return -EFAULT; + } + } + + /* Stop for now if there are not enough iovecs available. */ + if (*num >= VIRTQUEUE_MAX_SIZE) { + return -ENOBUFS; + } + + /* TODO handle non-contiguous memory across region boundaries */ + iov->iov_base = vring_map(&mr, desc->addr, desc->len, + desc->flags & VRING_DESC_F_WRITE); + if (!iov->iov_base) { + error_report("Failed to map descriptor addr %#" PRIx64 " len %u", + (uint64_t)desc->addr, desc->len); + return -EFAULT; + } + + /* The MemoryRegion is looked up again and unref'ed later, leave the + * ref in place. */ + iov->iov_len = desc->len; + *addr = desc->addr; + *num += 1; + return 0; +} + /* This is stolen from linux/drivers/vhost/vhost.c. */ -static int get_indirect(Vring *vring, - struct iovec iov[], struct iovec *iov_end, - unsigned int *out_num, unsigned int *in_num, +static int get_indirect(Vring *vring, VirtQueueElement *elem, struct vring_desc *indirect) { struct vring_desc desc; unsigned int i = 0, count, found = 0; + int ret; /* Sanity check */ if (unlikely(indirect->len % sizeof(desc))) { @@ -139,11 +229,12 @@ static int get_indirect(Vring *vring, do { struct vring_desc *desc_ptr; + MemoryRegion *mr; /* Translate indirect descriptor */ - desc_ptr = hostmem_lookup(&vring->hostmem, - indirect->addr + found * sizeof(desc), - sizeof(desc), false); + desc_ptr = vring_map(&mr, + indirect->addr + found * sizeof(desc), + sizeof(desc), false); if (!desc_ptr) { error_report("Failed to map indirect descriptor " "addr %#" PRIx64 " len %zu", @@ -153,6 +244,7 @@ static int get_indirect(Vring *vring, return -EFAULT; } desc = *desc_ptr; + memory_region_unref(mr); /* Ensure descriptor has been loaded before accessing fields */ barrier(); /* read_barrier_depends(); */ @@ -170,42 +262,35 @@ static int get_indirect(Vring *vring, return -EFAULT; } - /* Stop for now if there are not enough iovecs available. */ - if (iov >= iov_end) { - return -ENOBUFS; - } - - iov->iov_base = hostmem_lookup(&vring->hostmem, desc.addr, desc.len, - desc.flags & VRING_DESC_F_WRITE); - if (!iov->iov_base) { - error_report("Failed to map indirect descriptor" - "addr %#" PRIx64 " len %u", - (uint64_t)desc.addr, desc.len); - vring->broken = true; - return -EFAULT; - } - iov->iov_len = desc.len; - iov++; - - /* If this is an input descriptor, increment that count. */ - if (desc.flags & VRING_DESC_F_WRITE) { - *in_num += 1; - } else { - /* If it's an output descriptor, they're all supposed - * to come before any input descriptors. */ - if (unlikely(*in_num)) { - error_report("Indirect descriptor " - "has out after in: idx %u", i); - vring->broken = true; - return -EFAULT; - } - *out_num += 1; + ret = get_desc(vring, elem, &desc); + if (ret < 0) { + vring->broken |= (ret == -EFAULT); + return ret; } i = desc.next; } while (desc.flags & VRING_DESC_F_NEXT); return 0; } +void vring_free_element(VirtQueueElement *elem) +{ + int i; + + /* This assumes that the iovecs, if changed, are never moved past + * the end of the valid area. This is true if iovec manipulations + * are done with iov_discard_front and iov_discard_back. + */ + for (i = 0; i < elem->out_num; i++) { + vring_unmap(elem->out_sg[i].iov_base, false); + } + + for (i = 0; i < elem->in_num; i++) { + vring_unmap(elem->in_sg[i].iov_base, true); + } + + g_slice_free(VirtQueueElement, elem); +} + /* This looks in the virtqueue and for the first available buffer, and converts * it to an iovec for convenient access. Since descriptors consist of some * number of output then some number of input descriptors, it's actually two @@ -218,16 +303,18 @@ static int get_indirect(Vring *vring, * Stolen from linux/drivers/vhost/vhost.c. */ int vring_pop(VirtIODevice *vdev, Vring *vring, - struct iovec iov[], struct iovec *iov_end, - unsigned int *out_num, unsigned int *in_num) + VirtQueueElement **p_elem) { struct vring_desc desc; unsigned int i, head, found = 0, num = vring->vr.num; uint16_t avail_idx, last_avail_idx; + VirtQueueElement *elem = NULL; + int ret; /* If there was a fatal error then refuse operation */ if (vring->broken) { - return -EFAULT; + ret = -EFAULT; + goto out; } /* Check it isn't doing very strange things with descriptor numbers. */ @@ -238,13 +325,14 @@ int vring_pop(VirtIODevice *vdev, Vring *vring, if (unlikely((uint16_t)(avail_idx - last_avail_idx) > num)) { error_report("Guest moved used index from %u to %u", last_avail_idx, avail_idx); - vring->broken = true; - return -EFAULT; + ret = -EFAULT; + goto out; } /* If there's nothing new since last we looked. */ if (avail_idx == last_avail_idx) { - return -EAGAIN; + ret = -EAGAIN; + goto out; } /* Only get avail ring entries after they have been exposed by guest. */ @@ -254,32 +342,33 @@ int vring_pop(VirtIODevice *vdev, Vring *vring, * the index we've seen. */ head = vring->vr.avail->ring[last_avail_idx % num]; + elem = g_slice_new(VirtQueueElement); + elem->index = head; + elem->in_num = elem->out_num = 0; + /* If their number is silly, that's an error. */ if (unlikely(head >= num)) { error_report("Guest says index %u > %u is available", head, num); - vring->broken = true; - return -EFAULT; + ret = -EFAULT; + goto out; } if (vdev->guest_features & (1 << VIRTIO_RING_F_EVENT_IDX)) { vring_avail_event(&vring->vr) = vring->vr.avail->idx; } - /* When we start there are none of either input nor output. */ - *out_num = *in_num = 0; - i = head; do { if (unlikely(i >= num)) { error_report("Desc index is %u > %u, head = %u", i, num, head); - vring->broken = true; - return -EFAULT; + ret = -EFAULT; + goto out; } if (unlikely(++found > num)) { error_report("Loop detected: last one at %u vq size %u head %u", i, num, head); - vring->broken = true; - return -EFAULT; + ret = -EFAULT; + goto out; } desc = vring->vr.desc[i]; @@ -287,64 +376,50 @@ int vring_pop(VirtIODevice *vdev, Vring *vring, barrier(); if (desc.flags & VRING_DESC_F_INDIRECT) { - int ret = get_indirect(vring, iov, iov_end, out_num, in_num, &desc); + int ret = get_indirect(vring, elem, &desc); if (ret < 0) { - return ret; + goto out; } continue; } - /* If there are not enough iovecs left, stop for now. The caller - * should check if there are more descs available once they have dealt - * with the current set. - */ - if (iov >= iov_end) { - return -ENOBUFS; + ret = get_desc(vring, elem, &desc); + if (ret < 0) { + goto out; } - /* TODO handle non-contiguous memory across region boundaries */ - iov->iov_base = hostmem_lookup(&vring->hostmem, desc.addr, desc.len, - desc.flags & VRING_DESC_F_WRITE); - if (!iov->iov_base) { - error_report("Failed to map vring desc addr %#" PRIx64 " len %u", - (uint64_t)desc.addr, desc.len); - vring->broken = true; - return -EFAULT; - } - iov->iov_len = desc.len; - iov++; - - if (desc.flags & VRING_DESC_F_WRITE) { - /* If this is an input descriptor, - * increment that count. */ - *in_num += 1; - } else { - /* If it's an output descriptor, they're all supposed - * to come before any input descriptors. */ - if (unlikely(*in_num)) { - error_report("Descriptor has out after in: idx %d", i); - vring->broken = true; - return -EFAULT; - } - *out_num += 1; - } i = desc.next; } while (desc.flags & VRING_DESC_F_NEXT); /* On success, increment avail index. */ vring->last_avail_idx++; + *p_elem = elem; return head; + +out: + assert(ret < 0); + if (ret == -EFAULT) { + vring->broken = true; + } + if (elem) { + vring_free_element(elem); + } + *p_elem = NULL; + return ret; } /* After we've used one of their buffers, we tell them about it. * * Stolen from linux/drivers/vhost/vhost.c. */ -void vring_push(Vring *vring, unsigned int head, int len) +void vring_push(Vring *vring, VirtQueueElement *elem, int len) { struct vring_used_elem *used; + unsigned int head = elem->index; uint16_t new; + vring_free_element(elem); + /* Don't touch vring if a fatal error occurred */ if (vring->broken) { return; diff --git a/include/block/block_int.h b/include/block/block_int.h index 8b132d7178..2772f2f1bd 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -394,8 +394,9 @@ void stream_start(BlockDriverState *bs, BlockDriverState *base, /** * commit_start: - * @bs: Top Block device - * @base: Block device that will be written into, and become the new top + * @bs: Active block device. + * @top: Top block device to be committed. + * @base: Block device that will be written into, and become the new top. * @speed: The maximum speed, in bytes per second, or 0 for unlimited. * @on_error: The action to take upon error. * @cb: Completion function for the job. @@ -407,7 +408,22 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base, BlockDriverState *top, int64_t speed, BlockdevOnError on_error, BlockDriverCompletionFunc *cb, void *opaque, Error **errp); - +/** + * commit_active_start: + * @bs: Active block device to be committed. + * @base: Block device that will be written into, and become the new top. + * @speed: The maximum speed, in bytes per second, or 0 for unlimited. + * @on_error: The action to take upon error. + * @cb: Completion function for the job. + * @opaque: Opaque pointer value passed to @cb. + * @errp: Error object. + * + */ +void commit_active_start(BlockDriverState *bs, BlockDriverState *base, + int64_t speed, + BlockdevOnError on_error, + BlockDriverCompletionFunc *cb, + void *opaque, Error **errp); /* * mirror_start: * @bs: Block device to operate on. diff --git a/include/hw/virtio/dataplane/hostmem.h b/include/hw/virtio/dataplane/hostmem.h deleted file mode 100644 index 2810f4b44e..0000000000 --- a/include/hw/virtio/dataplane/hostmem.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Thread-safe guest to host memory mapping - * - * Copyright 2012 Red Hat, Inc. and/or its affiliates - * - * Authors: - * Stefan Hajnoczi <stefanha@redhat.com> - * - * This work is licensed under the terms of the GNU GPL, version 2 or later. - * See the COPYING file in the top-level directory. - * - */ - -#ifndef HOSTMEM_H -#define HOSTMEM_H - -#include "exec/memory.h" -#include "qemu/thread.h" - -typedef struct { - MemoryRegion *mr; - void *host_addr; - hwaddr guest_addr; - uint64_t size; - bool readonly; -} HostMemRegion; - -typedef struct { - /* The listener is invoked when regions change and a new list of regions is - * built up completely before they are installed. - */ - MemoryListener listener; - HostMemRegion *new_regions; - size_t num_new_regions; - - /* Current regions are accessed from multiple threads either to lookup - * addresses or to install a new list of regions. The lock protects the - * pointer and the regions. - */ - QemuMutex current_regions_lock; - HostMemRegion *current_regions; - size_t num_current_regions; -} HostMem; - -void hostmem_init(HostMem *hostmem); -void hostmem_finalize(HostMem *hostmem); - -/** - * Map a guest physical address to a pointer - * - * Note that there is map/unmap mechanism here. The caller must ensure that - * mapped memory is no longer used across events like hot memory unplug. This - * can be done with other mechanisms like bdrv_drain_all() that quiesce - * in-flight I/O. - */ -void *hostmem_lookup(HostMem *hostmem, hwaddr phys, hwaddr len, bool is_write); - -#endif /* HOSTMEM_H */ diff --git a/include/hw/virtio/dataplane/vring.h b/include/hw/virtio/dataplane/vring.h index c0b69ff18f..63e7bf4256 100644 --- a/include/hw/virtio/dataplane/vring.h +++ b/include/hw/virtio/dataplane/vring.h @@ -19,11 +19,10 @@ #include <linux/virtio_ring.h> #include "qemu-common.h" -#include "hostmem.h" #include "hw/virtio/virtio.h" typedef struct { - HostMem hostmem; /* guest memory mapper */ + MemoryRegion *mr; /* memory region containing the vring */ struct vring vr; /* virtqueue vring mapped to host memory */ uint16_t last_avail_idx; /* last processed avail ring index */ uint16_t last_used_idx; /* last processed used ring index */ @@ -54,9 +53,8 @@ void vring_teardown(Vring *vring, VirtIODevice *vdev, int n); void vring_disable_notification(VirtIODevice *vdev, Vring *vring); bool vring_enable_notification(VirtIODevice *vdev, Vring *vring); bool vring_should_notify(VirtIODevice *vdev, Vring *vring); -int vring_pop(VirtIODevice *vdev, Vring *vring, - struct iovec iov[], struct iovec *iov_end, - unsigned int *out_num, unsigned int *in_num); -void vring_push(Vring *vring, unsigned int head, int len); +int vring_pop(VirtIODevice *vdev, Vring *vring, VirtQueueElement **elem); +void vring_push(Vring *vring, VirtQueueElement *elem, int len); +void vring_free_element(VirtQueueElement *elem); #endif /* VRING_H */ diff --git a/qapi-schema.json b/qapi-schema.json index c3c939c8c3..b9b051c1fa 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -1967,9 +1967,11 @@ # # @top: The file name of the backing image within the image chain, # which contains the topmost data to be committed down. -# Note, the active layer as 'top' is currently unsupported. # # If top == base, that is an error. +# If top == active, the job will not be completed by itself, +# user needs to complete the job with the block-job-complete +# command after getting the ready event. (Since 2.0) # # # @speed: #optional the maximum speed, in bytes per second @@ -1979,7 +1981,6 @@ # If @device does not exist, DeviceNotFound # If image commit is not supported by this device, NotSupported # If @base or @top is invalid, a generic error is returned -# If @top is the active layer, or omitted, a generic error is returned # If @speed is invalid, InvalidParameter # # Since: 1.3 @@ -3022,7 +3023,7 @@ # # @devname: #optional path of the netmap device (default: '/dev/netmap'). # -# Since 1.8 +# Since 2.0 ## { 'type': 'NetdevNetmapOptions', 'data': { diff --git a/qemu-doc.texi b/qemu-doc.texi index 185dd47a03..4e9c6e9b6e 100644 --- a/qemu-doc.texi +++ b/qemu-doc.texi @@ -654,6 +654,21 @@ Supported options: Specifies which VHD subformat to use. Valid options are @code{dynamic} (default) and @code{fixed}. @end table + +@item VHDX +Hyper-V compatible image format (VHDX). +Supported options: +@table @code +@item subformat +Specifies which VHDX subformat to use. Valid options are +@code{dynamic} (default) and @code{fixed}. +@item block_state_zero +Force use of payload blocks of type 'ZERO'. +@item block_size +Block size; min 1 MB, max 256 MB. 0 means auto-calculate based on image size. +@item log_size +Log size; min 1 MB. +@end table @end table @subsubsection Read-only formats diff --git a/qemu-img.texi b/qemu-img.texi index be31191e43..1bba91efde 100644 --- a/qemu-img.texi +++ b/qemu-img.texi @@ -431,8 +431,8 @@ This option can only be enabled if @code{compat=1.1} is specified. @item Other QEMU also supports various other image file formats for compatibility with -older QEMU versions or other hypervisors, including VMDK, VDI, VHD (vpc), qcow1 -and QED. For a full list of supported formats see @code{qemu-img --help}. +older QEMU versions or other hypervisors, including VMDK, VDI, VHD (vpc), VHDX, +qcow1 and QED. For a full list of supported formats see @code{qemu-img --help}. For a more detailed description of these formats, see the QEMU Emulation User Documentation. diff --git a/tests/qemu-iotests/040 b/tests/qemu-iotests/040 index 18dcd61ef2..72eaad5b08 100755 --- a/tests/qemu-iotests/040 +++ b/tests/qemu-iotests/040 @@ -39,6 +39,29 @@ class ImageCommitTestCase(iotests.QMPTestCase): result = self.vm.qmp('query-block-jobs') self.assert_qmp(result, 'return', []) + def run_commit_test(self, top, base): + self.assert_no_active_commit() + result = self.vm.qmp('block-commit', device='drive0', top=top, base=base) + self.assert_qmp(result, 'return', {}) + + completed = False + while not completed: + for event in self.vm.get_qmp_events(wait=True): + if event['event'] == 'BLOCK_JOB_COMPLETED': + self.assert_qmp(event, 'data/type', 'commit') + self.assert_qmp(event, 'data/device', 'drive0') + self.assert_qmp(event, 'data/offset', self.image_len) + self.assert_qmp(event, 'data/len', self.image_len) + completed = True + elif event['event'] == 'BLOCK_JOB_READY': + self.assert_qmp(event, 'data/type', 'commit') + self.assert_qmp(event, 'data/device', 'drive0') + self.assert_qmp(event, 'data/len', self.image_len) + self.vm.qmp('block-job-complete', device='drive0') + + self.assert_no_active_commit() + self.vm.shutdown() + class TestSingleDrive(ImageCommitTestCase): image_len = 1 * 1024 * 1024 test_len = 1 * 1024 * 256 @@ -59,23 +82,7 @@ class TestSingleDrive(ImageCommitTestCase): os.remove(backing_img) def test_commit(self): - self.assert_no_active_commit() - result = self.vm.qmp('block-commit', device='drive0', top='%s' % mid_img) - self.assert_qmp(result, 'return', {}) - - completed = False - while not completed: - for event in self.vm.get_qmp_events(wait=True): - if event['event'] == 'BLOCK_JOB_COMPLETED': - self.assert_qmp(event, 'data/type', 'commit') - self.assert_qmp(event, 'data/device', 'drive0') - self.assert_qmp(event, 'data/offset', self.image_len) - self.assert_qmp(event, 'data/len', self.image_len) - completed = True - - self.assert_no_active_commit() - self.vm.shutdown() - + self.run_commit_test(mid_img, backing_img) self.assertEqual(-1, qemu_io('-c', 'read -P 0xab 0 524288', backing_img).find("verification failed")) self.assertEqual(-1, qemu_io('-c', 'read -P 0xef 524288 524288', backing_img).find("verification failed")) @@ -102,10 +109,9 @@ class TestSingleDrive(ImageCommitTestCase): self.assert_qmp(result, 'error/desc', 'Base \'badfile\' not found') def test_top_is_active(self): - self.assert_no_active_commit() - result = self.vm.qmp('block-commit', device='drive0', top='%s' % test_img, base='%s' % backing_img) - self.assert_qmp(result, 'error/class', 'GenericError') - self.assert_qmp(result, 'error/desc', 'Top image as the active layer is currently unsupported') + self.run_commit_test(test_img, backing_img) + self.assertEqual(-1, qemu_io('-c', 'read -P 0xab 0 524288', backing_img).find("verification failed")) + self.assertEqual(-1, qemu_io('-c', 'read -P 0xef 524288 524288', backing_img).find("verification failed")) def test_top_and_base_reversed(self): self.assert_no_active_commit() @@ -166,23 +172,7 @@ class TestRelativePaths(ImageCommitTestCase): raise def test_commit(self): - self.assert_no_active_commit() - result = self.vm.qmp('block-commit', device='drive0', top='%s' % self.mid_img) - self.assert_qmp(result, 'return', {}) - - completed = False - while not completed: - for event in self.vm.get_qmp_events(wait=True): - if event['event'] == 'BLOCK_JOB_COMPLETED': - self.assert_qmp(event, 'data/type', 'commit') - self.assert_qmp(event, 'data/device', 'drive0') - self.assert_qmp(event, 'data/offset', self.image_len) - self.assert_qmp(event, 'data/len', self.image_len) - completed = True - - self.assert_no_active_commit() - self.vm.shutdown() - + self.run_commit_test(self.mid_img, self.backing_img) self.assertEqual(-1, qemu_io('-c', 'read -P 0xab 0 524288', self.backing_img_abs).find("verification failed")) self.assertEqual(-1, qemu_io('-c', 'read -P 0xef 524288 524288', self.backing_img_abs).find("verification failed")) @@ -209,10 +199,9 @@ class TestRelativePaths(ImageCommitTestCase): self.assert_qmp(result, 'error/desc', 'Base \'badfile\' not found') def test_top_is_active(self): - self.assert_no_active_commit() - result = self.vm.qmp('block-commit', device='drive0', top='%s' % self.test_img, base='%s' % self.backing_img) - self.assert_qmp(result, 'error/class', 'GenericError') - self.assert_qmp(result, 'error/desc', 'Top image as the active layer is currently unsupported') + self.run_commit_test(self.test_img, self.backing_img) + self.assertEqual(-1, qemu_io('-c', 'read -P 0xab 0 524288', self.backing_img_abs).find("verification failed")) + self.assertEqual(-1, qemu_io('-c', 'read -P 0xef 524288 524288', self.backing_img_abs).find("verification failed")) def test_top_and_base_reversed(self): self.assert_no_active_commit() @@ -229,6 +218,7 @@ class TestSetSpeed(ImageCommitTestCase): qemu_img('create', '-f', iotests.imgfmt, '-o', 'backing_file=%s' % backing_img, mid_img) qemu_img('create', '-f', iotests.imgfmt, '-o', 'backing_file=%s' % mid_img, test_img) qemu_io('-c', 'write -P 0x1 0 512', test_img) + qemu_io('-c', 'write -P 0xef 524288 524288', mid_img) self.vm = iotests.VM().add_drive(test_img) self.vm.launch() diff --git a/tests/qemu-iotests/051.out b/tests/qemu-iotests/051.out index 49e95a20cf..c2cadba2fc 100644 --- a/tests/qemu-iotests/051.out +++ b/tests/qemu-iotests/051.out @@ -91,7 +91,6 @@ Testing: -drive if=virtio QEMU X.Y.Z monitor - type 'help' for more information (qemu) QEMU_PROG: -drive if=virtio: Device needs media, but drive is empty QEMU_PROG: -drive if=virtio: Device initialization failed. -QEMU_PROG: -drive if=virtio: Device initialization failed. QEMU_PROG: -drive if=virtio: Device 'virtio-blk-pci' could not be initialized Testing: -drive if=scsi diff --git a/tests/qemu-iotests/059 b/tests/qemu-iotests/059 index 73941c3e61..65bea1d6c6 100755 --- a/tests/qemu-iotests/059 +++ b/tests/qemu-iotests/059 @@ -81,6 +81,20 @@ IMGOPTS="subformat=twoGbMaxExtentFlat" _make_test_img 1000G $QEMU_IMG info $TEST_IMG | _filter_testdir | sed -e 's/cid: [0-9]*/cid: XXXXXXXX/' echo +echo "=== Testing malformed VMFS extent description line ===" +cat >"$TEST_IMG" <<EOF +# Disk DescriptorFile +version=1 +CID=58ab4847 +parentCID=ffffffff +createType="vmfs" + +# Extent description +RW 12582912 VMFS "dummy.vmdk" 1 +EOF +_img_info + +echo echo "=== Testing version 3 ===" _use_sample_img iotest-version3.vmdk.bz2 _img_info diff --git a/tests/qemu-iotests/059.out b/tests/qemu-iotests/059.out index 4ff935c6f4..16ab7c6c1f 100644 --- a/tests/qemu-iotests/059.out +++ b/tests/qemu-iotests/059.out @@ -2038,6 +2038,11 @@ Format specific information: filename: TEST_DIR/t-f500.vmdk format: FLAT +=== Testing malformed VMFS extent description line === +qemu-img: Could not open 'TEST_DIR/t.IMGFMT': Invalid extent lines: +RW 12582912 VMFS "dummy.IMGFMT" 1 + + === Testing version 3 === image: TEST_DIR/iotest-version3.IMGFMT file format: IMGFMT |