diff options
author | Anthony Liguori <aliguori@us.ibm.com> | 2012-01-27 08:58:52 -0600 |
---|---|---|
committer | Anthony Liguori <aliguori@us.ibm.com> | 2012-01-27 08:58:52 -0600 |
commit | 21fe5bc678b16d748db385fb1be95caa96b00eee (patch) | |
tree | 33342168c916f107f2c1e89c8fff8490b9e46f88 | |
parent | 96bab41df61b532bb6954a38527ad8403859a6c9 (diff) | |
parent | e2f0c49ffae8d3a00272c3cbc68850cc5aafbffa (diff) |
Merge remote-tracking branch 'kwolf/for-anthony' into staging
* kwolf/for-anthony: (22 commits)
scsi: Guard against buflen exceeding req->cmd.xfer in scsi_disk_emulate_command
qcow: Use bdrv functions to replace file operation
qcow: Return real error code in qcow_open
block/vdi: Zero unused parts when allocating a new block (fix #919242)
virtio-blk: add virtio_blk_handle_read trace event
docs: describe live block operations
block: add support for partial streaming
add QERR_BASE_NOT_FOUND
block: add bdrv_find_backing_image
blockdev: make image streaming safe across hotplug
qmp: add query-block-jobs
qmp: add block_job_cancel command
qmp: add block_job_set_speed command
qmp: add block_stream command
block: rate-limit streaming operations
block: add image streaming block job
block: add BlockJob interface for long-running operations
block: make copy-on-read a per-request flag
block: check bdrv_in_use() before blockdev operations
coroutine: add co_sleep_ns() coroutine sleep function
...
-rw-r--r-- | Makefile.objs | 2 | ||||
-rw-r--r-- | QMP/qmp-events.txt | 53 | ||||
-rw-r--r-- | block.c | 119 | ||||
-rw-r--r-- | block.h | 4 | ||||
-rw-r--r-- | block/blkdebug.c | 4 | ||||
-rw-r--r-- | block/blkverify.c | 4 | ||||
-rw-r--r-- | block/qcow.c | 104 | ||||
-rw-r--r-- | block/rbd.c | 22 | ||||
-rw-r--r-- | block/stream.c | 269 | ||||
-rw-r--r-- | block/vdi.c | 8 | ||||
-rw-r--r-- | block_int.h | 47 | ||||
-rw-r--r-- | blockdev.c | 199 | ||||
-rw-r--r-- | docs/live-block-ops.txt | 58 | ||||
-rw-r--r-- | hmp-commands.hx | 41 | ||||
-rw-r--r-- | hmp.c | 68 | ||||
-rw-r--r-- | hmp.h | 4 | ||||
-rw-r--r-- | hw/scsi-disk.c | 10 | ||||
-rw-r--r-- | hw/virtio-blk.c | 2 | ||||
-rw-r--r-- | monitor.c | 13 | ||||
-rw-r--r-- | monitor.h | 2 | ||||
-rw-r--r-- | qapi-schema.json | 115 | ||||
-rw-r--r-- | qemu-coroutine-sleep.c | 38 | ||||
-rw-r--r-- | qemu-coroutine.h | 9 | ||||
-rw-r--r-- | qemu-io.c | 48 | ||||
-rw-r--r-- | qerror.c | 8 | ||||
-rw-r--r-- | qerror.h | 6 | ||||
-rw-r--r-- | qmp-commands.hx | 24 | ||||
-rw-r--r-- | trace-events | 13 |
28 files changed, 1206 insertions, 88 deletions
diff --git a/Makefile.objs b/Makefile.objs index 9ca606356f..06a147b0b0 100644 --- a/Makefile.objs +++ b/Makefile.objs @@ -13,6 +13,7 @@ oslib-obj-$(CONFIG_POSIX) += oslib-posix.o qemu-thread-posix.o ####################################################################### # coroutines coroutine-obj-y = qemu-coroutine.o qemu-coroutine-lock.o qemu-coroutine-io.o +coroutine-obj-y += qemu-coroutine-sleep.o ifeq ($(CONFIG_UCONTEXT_COROUTINE),y) coroutine-obj-$(CONFIG_POSIX) += coroutine-ucontext.o else @@ -34,6 +35,7 @@ block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o block-nested-y += qed-check.o block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o +block-nested-y += stream.o block-nested-$(CONFIG_WIN32) += raw-win32.o block-nested-$(CONFIG_POSIX) += raw-posix.o block-nested-$(CONFIG_LIBISCSI) += iscsi.o diff --git a/QMP/qmp-events.txt b/QMP/qmp-events.txt index af586ec855..06cb404837 100644 --- a/QMP/qmp-events.txt +++ b/QMP/qmp-events.txt @@ -264,3 +264,56 @@ Example: Note: If action is "reset", "shutdown", or "pause" the WATCHDOG event is followed respectively by the RESET, SHUTDOWN, or STOP events. + + +BLOCK_JOB_COMPLETED +------------------- + +Emitted when a block job has completed. + +Data: + +- "type": Job type ("stream" for image streaming, json-string) +- "device": Device name (json-string) +- "len": Maximum progress value (json-int) +- "offset": Current progress value (json-int) + On success this is equal to len. + On failure this is less than len. +- "speed": Rate limit, bytes per second (json-int) +- "error": Error message (json-string, optional) + Only present on failure. This field contains a human-readable + error message. There are no semantics other than that streaming + has failed and clients should not try to interpret the error + string. 
+ +Example: + +{ "event": "BLOCK_JOB_COMPLETED", + "data": { "type": "stream", "device": "virtio-disk0", + "len": 10737418240, "offset": 10737418240, + "speed": 0 }, + "timestamp": { "seconds": 1267061043, "microseconds": 959568 } } + + +BLOCK_JOB_CANCELLED +------------------- + +Emitted when a block job has been cancelled. + +Data: + +- "type": Job type ("stream" for image streaming, json-string) +- "device": Device name (json-string) +- "len": Maximum progress value (json-int) +- "offset": Current progress value (json-int) + On success this is equal to len. + On failure this is less than len. +- "speed": Rate limit, bytes per second (json-int) + +Example: + +{ "event": "BLOCK_JOB_CANCELLED", + "data": { "type": "stream", "device": "virtio-disk0", + "len": 10737418240, "offset": 134217728, + "speed": 0 }, + "timestamp": { "seconds": 1267061043, "microseconds": 959568 } } @@ -48,6 +48,10 @@ #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */ +typedef enum { + BDRV_REQ_COPY_ON_READ = 0x1, +} BdrvRequestFlags; + static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load); static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, @@ -62,7 +66,8 @@ static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *iov); static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov); + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, + BdrvRequestFlags flags); static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov); static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, @@ -1020,6 +1025,10 @@ int bdrv_commit(BlockDriverState *bs) return -EACCES; } + if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) { + return -EBUSY; + } + backing_drv = bs->backing_hd->drv; ro = 
bs->backing_hd->read_only; strncpy(filename, bs->backing_hd->filename, sizeof(filename)); @@ -1288,7 +1297,7 @@ static void coroutine_fn bdrv_rw_co_entry(void *opaque) if (!rwco->is_write) { rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num, - rwco->nb_sectors, rwco->qiov); + rwco->nb_sectors, rwco->qiov, 0); } else { rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num, rwco->nb_sectors, rwco->qiov); @@ -1496,7 +1505,7 @@ int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, return 0; } -static int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, +static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { /* Perform I/O through a temporary buffer so that users who scribble over @@ -1519,8 +1528,8 @@ static int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, round_to_clusters(bs, sector_num, nb_sectors, &cluster_sector_num, &cluster_nb_sectors); - trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors, - cluster_sector_num, cluster_nb_sectors); + trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, + cluster_sector_num, cluster_nb_sectors); iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE; iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len); @@ -1555,7 +1564,8 @@ err: * Handle a read request in coroutine context */ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, + BdrvRequestFlags flags) { BlockDriver *drv = bs->drv; BdrvTrackedRequest req; @@ -1574,12 +1584,19 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, } if (bs->copy_on_read) { + flags |= BDRV_REQ_COPY_ON_READ; + } + if (flags & BDRV_REQ_COPY_ON_READ) { + bs->copy_on_read_in_flight++; + } + + if (bs->copy_on_read_in_flight) { wait_for_overlapping_requests(bs, sector_num, nb_sectors); } tracked_request_begin(&req, bs, sector_num, nb_sectors, 
false); - if (bs->copy_on_read) { + if (flags & BDRV_REQ_COPY_ON_READ) { int pnum; ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum); @@ -1588,7 +1605,7 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, } if (!ret || pnum != nb_sectors) { - ret = bdrv_co_copy_on_readv(bs, sector_num, nb_sectors, qiov); + ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov); goto out; } } @@ -1597,6 +1614,11 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, out: tracked_request_end(&req); + + if (flags & BDRV_REQ_COPY_ON_READ) { + bs->copy_on_read_in_flight--; + } + return ret; } @@ -1605,7 +1627,16 @@ int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, { trace_bdrv_co_readv(bs, sector_num, nb_sectors); - return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov); + return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0); +} + +int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) +{ + trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors); + + return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, + BDRV_REQ_COPY_ON_READ); } /* @@ -1633,7 +1664,7 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, bdrv_io_limits_intercept(bs, true, nb_sectors); } - if (bs->copy_on_read) { + if (bs->copy_on_read_in_flight) { wait_for_overlapping_requests(bs, sector_num, nb_sectors); } @@ -2564,6 +2595,24 @@ int bdrv_snapshot_load_tmp(BlockDriverState *bs, return -ENOTSUP; } +BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs, + const char *backing_file) +{ + if (!bs->drv) { + return NULL; + } + + if (bs->backing_hd) { + if (strcmp(bs->backing_file, backing_file) == 0) { + return bs->backing_hd; + } else { + return bdrv_find_backing_image(bs->backing_hd, backing_file); + } + } + + return NULL; +} + #define NB_SUFFIXES 4 char *get_human_readable_size(char *buf, int buf_size, int64_t size) @@ -3140,7 +3189,7 @@ static void 
coroutine_fn bdrv_co_do_rw(void *opaque) if (!acb->is_write) { acb->req.error = bdrv_co_do_readv(bs, acb->req.sector, - acb->req.nb_sectors, acb->req.qiov); + acb->req.nb_sectors, acb->req.qiov, 0); } else { acb->req.error = bdrv_co_do_writev(bs, acb->req.sector, acb->req.nb_sectors, acb->req.qiov); @@ -3827,3 +3876,51 @@ out: return ret; } + +void *block_job_create(const BlockJobType *job_type, BlockDriverState *bs, + BlockDriverCompletionFunc *cb, void *opaque) +{ + BlockJob *job; + + if (bs->job || bdrv_in_use(bs)) { + return NULL; + } + bdrv_set_in_use(bs, 1); + + job = g_malloc0(job_type->instance_size); + job->job_type = job_type; + job->bs = bs; + job->cb = cb; + job->opaque = opaque; + bs->job = job; + return job; +} + +void block_job_complete(BlockJob *job, int ret) +{ + BlockDriverState *bs = job->bs; + + assert(bs->job == job); + job->cb(job->opaque, ret); + bs->job = NULL; + g_free(job); + bdrv_set_in_use(bs, 0); +} + +int block_job_set_speed(BlockJob *job, int64_t value) +{ + if (!job->job_type->set_speed) { + return -ENOTSUP; + } + return job->job_type->set_speed(job, value); +} + +void block_job_cancel(BlockJob *job) +{ + job->cancelled = true; +} + +bool block_job_is_cancelled(BlockJob *job) +{ + return job->cancelled; +} @@ -142,10 +142,14 @@ int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, const void *buf, int count); int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov); +int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov); int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov); int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum); +BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs, + const char *backing_file); int bdrv_truncate(BlockDriverState *bs, int64_t offset); int64_t 
bdrv_getlength(BlockDriverState *bs); int64_t bdrv_get_allocated_file_size(BlockDriverState *bs); diff --git a/block/blkdebug.c b/block/blkdebug.c index 9b885359e4..a251802ad4 100644 --- a/block/blkdebug.c +++ b/block/blkdebug.c @@ -292,10 +292,10 @@ static int blkdebug_open(BlockDriverState *bs, const char *filename, int flags) return -EINVAL; } - config = strdup(filename); + config = g_strdup(filename); config[c - filename] = '\0'; ret = read_config(s, config); - free(config); + g_free(config); if (ret < 0) { return ret; } diff --git a/block/blkverify.c b/block/blkverify.c index 4ca8584b88..9d5f1ec5b9 100644 --- a/block/blkverify.c +++ b/block/blkverify.c @@ -87,10 +87,10 @@ static int blkverify_open(BlockDriverState *bs, const char *filename, int flags) return -EINVAL; } - raw = strdup(filename); + raw = g_strdup(filename); raw[c - filename] = '\0'; ret = bdrv_file_open(&bs->file, raw, flags); - free(raw); + g_free(raw); if (ret < 0) { return ret; } diff --git a/block/qcow.c b/block/qcow.c index b16955d764..b1cfe1f696 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -95,11 +95,13 @@ static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename) static int qcow_open(BlockDriverState *bs, int flags) { BDRVQcowState *s = bs->opaque; - int len, i, shift; + int len, i, shift, ret; QCowHeader header; - if (bdrv_pread(bs->file, 0, &header, sizeof(header)) != sizeof(header)) + ret = bdrv_pread(bs->file, 0, &header, sizeof(header)); + if (ret < 0) { goto fail; + } be32_to_cpus(&header.magic); be32_to_cpus(&header.version); be64_to_cpus(&header.backing_file_offset); @@ -109,15 +111,31 @@ static int qcow_open(BlockDriverState *bs, int flags) be32_to_cpus(&header.crypt_method); be64_to_cpus(&header.l1_table_offset); - if (header.magic != QCOW_MAGIC || header.version != QCOW_VERSION) + if (header.magic != QCOW_MAGIC) { + ret = -EINVAL; goto fail; - if (header.size <= 1 || header.cluster_bits < 9) + } + if (header.version != QCOW_VERSION) { + char version[64]; + 
snprintf(version, sizeof(version), "QCOW version %d", header.version); + qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, + bs->device_name, "qcow", version); + ret = -ENOTSUP; goto fail; - if (header.crypt_method > QCOW_CRYPT_AES) + } + + if (header.size <= 1 || header.cluster_bits < 9) { + ret = -EINVAL; + goto fail; + } + if (header.crypt_method > QCOW_CRYPT_AES) { + ret = -EINVAL; goto fail; + } s->crypt_method_header = header.crypt_method; - if (s->crypt_method_header) + if (s->crypt_method_header) { bs->encrypted = 1; + } s->cluster_bits = header.cluster_bits; s->cluster_size = 1 << s->cluster_bits; s->cluster_sectors = 1 << (s->cluster_bits - 9); @@ -132,33 +150,33 @@ static int qcow_open(BlockDriverState *bs, int flags) s->l1_table_offset = header.l1_table_offset; s->l1_table = g_malloc(s->l1_size * sizeof(uint64_t)); - if (!s->l1_table) - goto fail; - if (bdrv_pread(bs->file, s->l1_table_offset, s->l1_table, s->l1_size * sizeof(uint64_t)) != - s->l1_size * sizeof(uint64_t)) + + ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table, + s->l1_size * sizeof(uint64_t)); + if (ret < 0) { goto fail; + } + for(i = 0;i < s->l1_size; i++) { be64_to_cpus(&s->l1_table[i]); } /* alloc L2 cache */ s->l2_cache = g_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t)); - if (!s->l2_cache) - goto fail; s->cluster_cache = g_malloc(s->cluster_size); - if (!s->cluster_cache) - goto fail; s->cluster_data = g_malloc(s->cluster_size); - if (!s->cluster_data) - goto fail; s->cluster_cache_offset = -1; /* read the backing file name */ if (header.backing_file_offset != 0) { len = header.backing_file_size; - if (len > 1023) + if (len > 1023) { len = 1023; - if (bdrv_pread(bs->file, header.backing_file_offset, bs->backing_file, len) != len) + } + ret = bdrv_pread(bs->file, header.backing_file_offset, + bs->backing_file, len); + if (ret < 0) { goto fail; + } bs->backing_file[len] = '\0'; } @@ -176,7 +194,7 @@ static int qcow_open(BlockDriverState *bs, int flags) 
g_free(s->l2_cache); g_free(s->cluster_cache); g_free(s->cluster_data); - return -1; + return ret; } static int qcow_set_key(BlockDriverState *bs, const char *key) @@ -626,13 +644,14 @@ static void qcow_close(BlockDriverState *bs) static int qcow_create(const char *filename, QEMUOptionParameter *options) { - int fd, header_size, backing_filename_len, l1_size, i, shift; + int header_size, backing_filename_len, l1_size, shift, i; QCowHeader header; - uint64_t tmp; + uint8_t *tmp; int64_t total_size = 0; const char *backing_file = NULL; int flags = 0; int ret; + BlockDriverState *qcow_bs; /* Read out options */ while (options && options->name) { @@ -646,9 +665,21 @@ static int qcow_create(const char *filename, QEMUOptionParameter *options) options++; } - fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644); - if (fd < 0) - return -errno; + ret = bdrv_create_file(filename, options); + if (ret < 0) { + return ret; + } + + ret = bdrv_file_open(&qcow_bs, filename, BDRV_O_RDWR); + if (ret < 0) { + return ret; + } + + ret = bdrv_truncate(qcow_bs, 0); + if (ret < 0) { + goto exit; + } + memset(&header, 0, sizeof(header)); header.magic = cpu_to_be32(QCOW_MAGIC); header.version = cpu_to_be32(QCOW_VERSION); @@ -684,33 +715,34 @@ static int qcow_create(const char *filename, QEMUOptionParameter *options) } /* write all the data */ - ret = qemu_write_full(fd, &header, sizeof(header)); + ret = bdrv_pwrite(qcow_bs, 0, &header, sizeof(header)); if (ret != sizeof(header)) { - ret = -errno; goto exit; } if (backing_file) { - ret = qemu_write_full(fd, backing_file, backing_filename_len); + ret = bdrv_pwrite(qcow_bs, sizeof(header), + backing_file, backing_filename_len); if (ret != backing_filename_len) { - ret = -errno; goto exit; } - } - lseek(fd, header_size, SEEK_SET); - tmp = 0; - for(i = 0;i < l1_size; i++) { - ret = qemu_write_full(fd, &tmp, sizeof(tmp)); - if (ret != sizeof(tmp)) { - ret = -errno; + + tmp = g_malloc0(BDRV_SECTOR_SIZE); + for (i = 0; i < 
((sizeof(uint64_t)*l1_size + BDRV_SECTOR_SIZE - 1)/ + BDRV_SECTOR_SIZE); i++) { + ret = bdrv_pwrite(qcow_bs, header_size + + BDRV_SECTOR_SIZE*i, tmp, BDRV_SECTOR_SIZE); + if (ret != BDRV_SECTOR_SIZE) { + g_free(tmp); goto exit; } } + g_free(tmp); ret = 0; exit: - close(fd); + bdrv_delete(qcow_bs); return ret; } diff --git a/block/rbd.c b/block/rbd.c index db5abf240b..46a8579018 100644 --- a/block/rbd.c +++ b/block/rbd.c @@ -789,6 +789,26 @@ static int qemu_rbd_snap_create(BlockDriverState *bs, return 0; } +static int qemu_rbd_snap_remove(BlockDriverState *bs, + const char *snapshot_name) +{ + BDRVRBDState *s = bs->opaque; + int r; + + r = rbd_snap_remove(s->image, snapshot_name); + return r; +} + +static int qemu_rbd_snap_rollback(BlockDriverState *bs, + const char *snapshot_name) +{ + BDRVRBDState *s = bs->opaque; + int r; + + r = rbd_snap_rollback(s->image, snapshot_name); + return r; +} + static int qemu_rbd_snap_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab) { @@ -862,7 +882,9 @@ static BlockDriver bdrv_rbd = { .bdrv_co_flush_to_disk = qemu_rbd_co_flush, .bdrv_snapshot_create = qemu_rbd_snap_create, + .bdrv_snapshot_delete = qemu_rbd_snap_remove, .bdrv_snapshot_list = qemu_rbd_snap_list, + .bdrv_snapshot_goto = qemu_rbd_snap_rollback, }; static void bdrv_rbd_init(void) diff --git a/block/stream.c b/block/stream.c new file mode 100644 index 0000000000..d1b3986a8a --- /dev/null +++ b/block/stream.c @@ -0,0 +1,269 @@ +/* + * Image streaming + * + * Copyright IBM, Corp. 2011 + * + * Authors: + * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#include "trace.h" +#include "block_int.h" + +enum { + /* + * Size of data buffer for populating the image file. This should be large + * enough to process multiple clusters in a single call, so that populating + * contiguous regions of the image is efficient. 
+ */ + STREAM_BUFFER_SIZE = 512 * 1024, /* in bytes */ +}; + +#define SLICE_TIME 100000000ULL /* ns */ + +typedef struct { + int64_t next_slice_time; + uint64_t slice_quota; + uint64_t dispatched; +} RateLimit; + +static int64_t ratelimit_calculate_delay(RateLimit *limit, uint64_t n) +{ + int64_t delay_ns = 0; + int64_t now = qemu_get_clock_ns(rt_clock); + + if (limit->next_slice_time < now) { + limit->next_slice_time = now + SLICE_TIME; + limit->dispatched = 0; + } + if (limit->dispatched + n > limit->slice_quota) { + delay_ns = limit->next_slice_time - now; + } else { + limit->dispatched += n; + } + return delay_ns; +} + +static void ratelimit_set_speed(RateLimit *limit, uint64_t speed) +{ + limit->slice_quota = speed / (1000000000ULL / SLICE_TIME); +} + +typedef struct StreamBlockJob { + BlockJob common; + RateLimit limit; + BlockDriverState *base; + char backing_file_id[1024]; +} StreamBlockJob; + +static int coroutine_fn stream_populate(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + void *buf) +{ + struct iovec iov = { + .iov_base = buf, + .iov_len = nb_sectors * BDRV_SECTOR_SIZE, + }; + QEMUIOVector qiov; + + qemu_iovec_init_external(&qiov, &iov, 1); + + /* Copy-on-read the unallocated clusters */ + return bdrv_co_copy_on_readv(bs, sector_num, nb_sectors, &qiov); +} + +/* + * Given an image chain: [BASE] -> [INTER1] -> [INTER2] -> [TOP] + * + * Return true if the given sector is allocated in top. + * Return false if the given sector is allocated in intermediate images. + * Return true otherwise. + * + * 'pnum' is set to the number of sectors (including and immediately following + * the specified sector) that are known to be in the same + * allocated/unallocated state. 
+ * + */ +static int coroutine_fn is_allocated_base(BlockDriverState *top, + BlockDriverState *base, + int64_t sector_num, + int nb_sectors, int *pnum) +{ + BlockDriverState *intermediate; + int ret, n; + + ret = bdrv_co_is_allocated(top, sector_num, nb_sectors, &n); + if (ret) { + *pnum = n; + return ret; + } + + /* + * Is the unallocated chunk [sector_num, n] also + * unallocated between base and top? + */ + intermediate = top->backing_hd; + + while (intermediate) { + int pnum_inter; + + /* reached base */ + if (intermediate == base) { + *pnum = n; + return 1; + } + ret = bdrv_co_is_allocated(intermediate, sector_num, nb_sectors, + &pnum_inter); + if (ret < 0) { + return ret; + } else if (ret) { + *pnum = pnum_inter; + return 0; + } + + /* + * [sector_num, nb_sectors] is unallocated on top but intermediate + * might have + * + * [sector_num+x, nr_sectors] allocated. + */ + if (n > pnum_inter) { + n = pnum_inter; + } + + intermediate = intermediate->backing_hd; + } + + return 1; +} + +static void coroutine_fn stream_run(void *opaque) +{ + StreamBlockJob *s = opaque; + BlockDriverState *bs = s->common.bs; + BlockDriverState *base = s->base; + int64_t sector_num, end; + int ret = 0; + int n; + void *buf; + + s->common.len = bdrv_getlength(bs); + if (s->common.len < 0) { + block_job_complete(&s->common, s->common.len); + return; + } + + end = s->common.len >> BDRV_SECTOR_BITS; + buf = qemu_blockalign(bs, STREAM_BUFFER_SIZE); + + /* Turn on copy-on-read for the whole block device so that guest read + * requests help us make progress. Only do this when copying the entire + * backing chain since the copy-on-read operation does not take base into + * account. 
+ */ + if (!base) { + bdrv_enable_copy_on_read(bs); + } + + for (sector_num = 0; sector_num < end; sector_num += n) { +retry: + if (block_job_is_cancelled(&s->common)) { + break; + } + + + if (base) { + ret = is_allocated_base(bs, base, sector_num, + STREAM_BUFFER_SIZE / BDRV_SECTOR_SIZE, &n); + } else { + ret = bdrv_co_is_allocated(bs, sector_num, + STREAM_BUFFER_SIZE / BDRV_SECTOR_SIZE, + &n); + } + trace_stream_one_iteration(s, sector_num, n, ret); + if (ret == 0) { + if (s->common.speed) { + uint64_t delay_ns = ratelimit_calculate_delay(&s->limit, n); + if (delay_ns > 0) { + co_sleep_ns(rt_clock, delay_ns); + + /* Recheck cancellation and that sectors are unallocated */ + goto retry; + } + } + ret = stream_populate(bs, sector_num, n, buf); + } + if (ret < 0) { + break; + } + ret = 0; + + /* Publish progress */ + s->common.offset += n * BDRV_SECTOR_SIZE; + + /* Note that even when no rate limit is applied we need to yield + * with no pending I/O here so that qemu_aio_flush() returns. 
+ */ + co_sleep_ns(rt_clock, 0); + } + + if (!base) { + bdrv_disable_copy_on_read(bs); + } + + if (sector_num == end && ret == 0) { + const char *base_id = NULL; + if (base) { + base_id = s->backing_file_id; + } + ret = bdrv_change_backing_file(bs, base_id, NULL); + } + + qemu_vfree(buf); + block_job_complete(&s->common, ret); +} + +static int stream_set_speed(BlockJob *job, int64_t value) +{ + StreamBlockJob *s = container_of(job, StreamBlockJob, common); + + if (value < 0) { + return -EINVAL; + } + job->speed = value; + ratelimit_set_speed(&s->limit, value / BDRV_SECTOR_SIZE); + return 0; +} + +static BlockJobType stream_job_type = { + .instance_size = sizeof(StreamBlockJob), + .job_type = "stream", + .set_speed = stream_set_speed, +}; + +int stream_start(BlockDriverState *bs, BlockDriverState *base, + const char *base_id, BlockDriverCompletionFunc *cb, + void *opaque) +{ + StreamBlockJob *s; + Coroutine *co; + + s = block_job_create(&stream_job_type, bs, cb, opaque); + if (!s) { + return -EBUSY; /* bs must already be in use */ + } + + s->base = base; + if (base_id) { + pstrcpy(s->backing_file_id, sizeof(s->backing_file_id), base_id); + } + + co = qemu_coroutine_create(stream_run); + trace_stream_start(bs, base, s, co, opaque); + qemu_coroutine_enter(co, s); + return 0; +} diff --git a/block/vdi.c b/block/vdi.c index 31cdfabdea..6a0011fbcc 100644 --- a/block/vdi.c +++ b/block/vdi.c @@ -1,7 +1,7 @@ /* * Block driver for the Virtual Disk Image (VDI) format * - * Copyright (c) 2009 Stefan Weil + * Copyright (c) 2009, 2012 Stefan Weil * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -756,15 +756,19 @@ static void vdi_aio_write_cb(void *opaque, int ret) (uint64_t)bmap_entry * s->block_sectors; block = acb->block_buffer; if (block == NULL) { - block = g_malloc0(s->block_size); + block = g_malloc(s->block_size); acb->block_buffer = block; acb->bmap_first = block_index; 
assert(!acb->header_modified); acb->header_modified = 1; } acb->bmap_last = block_index; + /* Copy data to be written to new block and zero unused parts. */ + memset(block, 0, sector_in_block * SECTOR_SIZE); memcpy(block + sector_in_block * SECTOR_SIZE, acb->buf, n_sectors * SECTOR_SIZE); + memset(block + (sector_in_block + n_sectors) * SECTOR_SIZE, 0, + (s->block_sectors - n_sectors - sector_in_block) * SECTOR_SIZE); acb->hd_iov.iov_base = (void *)block; acb->hd_iov.iov_len = s->block_size; qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1); diff --git a/block_int.h b/block_int.h index 311bd2a6fa..7be2988ca7 100644 --- a/block_int.h +++ b/block_int.h @@ -69,6 +69,36 @@ typedef struct BlockIOBaseValue { uint64_t ios[2]; } BlockIOBaseValue; +typedef void BlockJobCancelFunc(void *opaque); +typedef struct BlockJob BlockJob; +typedef struct BlockJobType { + /** Derived BlockJob struct size */ + size_t instance_size; + + /** String describing the operation, part of query-block-jobs QMP API */ + const char *job_type; + + /** Optional callback for job types that support setting a speed limit */ + int (*set_speed)(BlockJob *job, int64_t value); +} BlockJobType; + +/** + * Long-running operation on a BlockDriverState + */ +struct BlockJob { + const BlockJobType *job_type; + BlockDriverState *bs; + bool cancelled; + + /* These fields are published by the query-block-jobs QMP API */ + int64_t offset; + int64_t len; + int64_t speed; + + BlockDriverCompletionFunc *cb; + void *opaque; +}; + struct BlockDriver { const char *format_name; int instance_size; @@ -218,6 +248,9 @@ struct BlockDriverState { BlockDriverState *backing_hd; BlockDriverState *file; + /* number of in-flight copy-on-read requests */ + unsigned int copy_on_read_in_flight; + /* async read/write emulation */ void *sync_aiocb; @@ -261,6 +294,9 @@ struct BlockDriverState { void *private; QLIST_HEAD(, BdrvTrackedRequest) tracked_requests; + + /* long-running background operation */ + BlockJob *job; }; struct 
BlockDriverAIOCB { @@ -284,4 +320,15 @@ void bdrv_set_io_limits(BlockDriverState *bs, int is_windows_drive(const char *filename); #endif +void *block_job_create(const BlockJobType *job_type, BlockDriverState *bs, + BlockDriverCompletionFunc *cb, void *opaque); +void block_job_complete(BlockJob *job, int ret); +int block_job_set_speed(BlockJob *job, int64_t value); +void block_job_cancel(BlockJob *job); +bool block_job_is_cancelled(BlockJob *job); + +int stream_start(BlockDriverState *bs, BlockDriverState *base, + const char *base_id, BlockDriverCompletionFunc *cb, + void *opaque); + #endif /* BLOCK_INT_H */ diff --git a/blockdev.c b/blockdev.c index 1f83c888e7..7e4c548426 100644 --- a/blockdev.c +++ b/blockdev.c @@ -13,9 +13,11 @@ #include "qerror.h" #include "qemu-option.h" #include "qemu-config.h" +#include "qemu-objects.h" #include "sysemu.h" #include "block_int.h" #include "qmp-commands.h" +#include "trace.h" static QTAILQ_HEAD(drivelist, DriveInfo) drives = QTAILQ_HEAD_INITIALIZER(drives); @@ -200,6 +202,37 @@ void drive_get_ref(DriveInfo *dinfo) dinfo->refcount++; } +typedef struct { + QEMUBH *bh; + DriveInfo *dinfo; +} DrivePutRefBH; + +static void drive_put_ref_bh(void *opaque) +{ + DrivePutRefBH *s = opaque; + + drive_put_ref(s->dinfo); + qemu_bh_delete(s->bh); + g_free(s); +} + +/* + * Release a drive reference in a BH + * + * It is not possible to use drive_put_ref() from a callback function when the + * callers still need the drive. In such cases we schedule a BH to release the + * reference. 
+ */ +static void drive_put_ref_bh_schedule(DriveInfo *dinfo) +{ + DrivePutRefBH *s; + + s = g_new(DrivePutRefBH, 1); + s->bh = qemu_bh_new(drive_put_ref_bh, s); + s->dinfo = dinfo; + qemu_bh_schedule(s->bh); +} + static int parse_block_error_action(const char *buf, int is_read) { if (!strcmp(buf, "ignore")) { @@ -592,12 +625,18 @@ void do_commit(Monitor *mon, const QDict *qdict) if (!strcmp(device, "all")) { bdrv_commit_all(); } else { + int ret; + bs = bdrv_find(device); if (!bs) { qerror_report(QERR_DEVICE_NOT_FOUND, device); return; } - bdrv_commit(bs); + ret = bdrv_commit(bs); + if (ret == -EBUSY) { + qerror_report(QERR_DEVICE_IN_USE, device); + return; + } } } @@ -616,6 +655,10 @@ void qmp_blockdev_snapshot_sync(const char *device, const char *snapshot_file, error_set(errp, QERR_DEVICE_NOT_FOUND, device); return; } + if (bdrv_in_use(bs)) { + error_set(errp, QERR_DEVICE_IN_USE, device); + return; + } pstrcpy(old_filename, sizeof(old_filename), bs->filename); @@ -667,6 +710,10 @@ void qmp_blockdev_snapshot_sync(const char *device, const char *snapshot_file, static void eject_device(BlockDriverState *bs, int force, Error **errp) { + if (bdrv_in_use(bs)) { + error_set(errp, QERR_DEVICE_IN_USE, bdrv_get_device_name(bs)); + return; + } if (!bdrv_dev_has_removable_media(bs)) { error_set(errp, QERR_DEVICE_NOT_REMOVABLE, bdrv_get_device_name(bs)); return; @@ -883,3 +930,153 @@ void qmp_block_resize(const char *device, int64_t size, Error **errp) break; } } + +static QObject *qobject_from_block_job(BlockJob *job) +{ + return qobject_from_jsonf("{ 'type': %s," + "'device': %s," + "'len': %" PRId64 "," + "'offset': %" PRId64 "," + "'speed': %" PRId64 " }", + job->job_type->job_type, + bdrv_get_device_name(job->bs), + job->len, + job->offset, + job->speed); +} + +static void block_stream_cb(void *opaque, int ret) +{ + BlockDriverState *bs = opaque; + QObject *obj; + + trace_block_stream_cb(bs, bs->job, ret); + + assert(bs->job); + obj = qobject_from_block_job(bs->job); + 
if (ret < 0) { + QDict *dict = qobject_to_qdict(obj); + qdict_put(dict, "error", qstring_from_str(strerror(-ret))); + } + + if (block_job_is_cancelled(bs->job)) { + monitor_protocol_event(QEVENT_BLOCK_JOB_CANCELLED, obj); + } else { + monitor_protocol_event(QEVENT_BLOCK_JOB_COMPLETED, obj); + } + qobject_decref(obj); + + drive_put_ref_bh_schedule(drive_get_by_blockdev(bs)); +} + +void qmp_block_stream(const char *device, bool has_base, + const char *base, Error **errp) +{ + BlockDriverState *bs; + BlockDriverState *base_bs = NULL; + int ret; + + bs = bdrv_find(device); + if (!bs) { + error_set(errp, QERR_DEVICE_NOT_FOUND, device); + return; + } + + if (base) { + base_bs = bdrv_find_backing_image(bs, base); + if (base_bs == NULL) { + error_set(errp, QERR_BASE_NOT_FOUND, base); + return; + } + } + + ret = stream_start(bs, base_bs, base, block_stream_cb, bs); + if (ret < 0) { + switch (ret) { + case -EBUSY: + error_set(errp, QERR_DEVICE_IN_USE, device); + return; + default: + error_set(errp, QERR_NOT_SUPPORTED); + return; + } + } + + /* Grab a reference so hotplug does not delete the BlockDriverState from + * underneath us. 
+ */ + drive_get_ref(drive_get_by_blockdev(bs)); + + trace_qmp_block_stream(bs, bs->job); +} + +static BlockJob *find_block_job(const char *device) +{ + BlockDriverState *bs; + + bs = bdrv_find(device); + if (!bs || !bs->job) { + return NULL; + } + return bs->job; +} + +void qmp_block_job_set_speed(const char *device, int64_t value, Error **errp) +{ + BlockJob *job = find_block_job(device); + + if (!job) { + error_set(errp, QERR_DEVICE_NOT_ACTIVE, device); + return; + } + + if (block_job_set_speed(job, value) < 0) { + error_set(errp, QERR_NOT_SUPPORTED); + } +} + +void qmp_block_job_cancel(const char *device, Error **errp) +{ + BlockJob *job = find_block_job(device); + + if (!job) { + error_set(errp, QERR_DEVICE_NOT_ACTIVE, device); + return; + } + + trace_qmp_block_job_cancel(job); + block_job_cancel(job); +} + +static void do_qmp_query_block_jobs_one(void *opaque, BlockDriverState *bs) +{ + BlockJobInfoList **prev = opaque; + BlockJob *job = bs->job; + + if (job) { + BlockJobInfoList *elem; + BlockJobInfo *info = g_new(BlockJobInfo, 1); + *info = (BlockJobInfo){ + .type = g_strdup(job->job_type->job_type), + .device = g_strdup(bdrv_get_device_name(bs)), + .len = job->len, + .offset = job->offset, + .speed = job->speed, + }; + + elem = g_new0(BlockJobInfoList, 1); + elem->value = info; + + (*prev)->next = elem; + *prev = elem; + } +} + +BlockJobInfoList *qmp_query_block_jobs(Error **errp) +{ + /* Dummy is a fake list element for holding the head pointer */ + BlockJobInfoList dummy = {}; + BlockJobInfoList *prev = &dummy; + bdrv_iterate(do_qmp_query_block_jobs_one, &prev); + return dummy.next; +} diff --git a/docs/live-block-ops.txt b/docs/live-block-ops.txt new file mode 100644 index 0000000000..a257087401 --- /dev/null +++ b/docs/live-block-ops.txt @@ -0,0 +1,58 @@ +LIVE BLOCK OPERATIONS +===================== + +High level description of live block operations. Note these are not +supported for use with the raw format at the moment. 
+ +Snapshot live merge +=================== + +Given a snapshot chain, described in this document in the following +format: + +[A] -> [B] -> [C] -> [D] + +Where the rightmost object ([D] in the example) is the current +image which the guest OS has write access to. To the left of it is its base +image, and so on accordingly until the leftmost image, which has no +base. + +The snapshot live merge operation transforms such a chain into a +smaller one with fewer elements, such as this transformation relative +to the first example: + +[A] -> [D] + +Currently only forward merge with target being the active image is +supported, that is, data copy is performed in the rightward direction with +destination being the rightmost image. + +The operation is implemented in QEMU through image streaming facilities. + +The basic idea is to execute 'block_stream virtio0' while the guest is +running. Progress can be monitored using 'info block-jobs'. When the +streaming operation completes it raises a QMP event. 'block_stream' +copies data from the backing file(s) into the active image. When finished, +it adjusts the backing file pointer. + +The 'base' parameter specifies an image which data need not be streamed from. +This image will be used as the backing file for the active image when the +operation is finished. + +In the example above, the command would be: + +(qemu) block_stream virtio0 A + + +Live block copy +=============== + +To copy an in-use image to another destination in the filesystem, one +should create a live snapshot in the desired destination, then stream +into that image. Example: + +(qemu) snapshot_blkdev ide0-hd0 /new-path/disk.img qcow2 + +(qemu) block_stream ide0-hd0 + + diff --git a/hmp-commands.hx b/hmp-commands.hx index e6506fc9d3..573b823347 100644 --- a/hmp-commands.hx +++ b/hmp-commands.hx @@ -69,6 +69,47 @@ but should be used with extreme caution. Note that this command only resizes image files, it can not resize block devices like LVM volumes. 
ETEXI + { + .name = "block_stream", + .args_type = "device:B,base:s?", + .params = "device [base]", + .help = "copy data from a backing file into a block device", + .mhandler.cmd = hmp_block_stream, + }, + +STEXI +@item block_stream +@findex block_stream +Copy data from a backing file into a block device. +ETEXI + + { + .name = "block_job_set_speed", + .args_type = "device:B,value:o", + .params = "device value", + .help = "set maximum speed for a background block operation", + .mhandler.cmd = hmp_block_job_set_speed, + }, + +STEXI +@item block_job_set_speed +@findex block_job_set_speed +Set maximum speed for a background block operation. +ETEXI + + { + .name = "block_job_cancel", + .args_type = "device:B", + .params = "device", + .help = "stop an active block streaming operation", + .mhandler.cmd = hmp_block_job_cancel, + }, + +STEXI +@item block_job_cancel +@findex block_job_cancel +Stop an active block streaming operation. +ETEXI { .name = "eject", @@ -509,6 +509,42 @@ void hmp_info_pci(Monitor *mon) qapi_free_PciInfoList(info_list); } +void hmp_info_block_jobs(Monitor *mon) +{ + BlockJobInfoList *list; + Error *err = NULL; + + list = qmp_query_block_jobs(&err); + assert(!err); + + if (!list) { + monitor_printf(mon, "No active jobs\n"); + return; + } + + while (list) { + if (strcmp(list->value->type, "stream") == 0) { + monitor_printf(mon, "Streaming device %s: Completed %" PRId64 + " of %" PRId64 " bytes, speed limit %" PRId64 + " bytes/s\n", + list->value->device, + list->value->offset, + list->value->len, + list->value->speed); + } else { + monitor_printf(mon, "Type %s, device %s: Completed %" PRId64 + " of %" PRId64 " bytes, speed limit %" PRId64 + " bytes/s\n", + list->value->type, + list->value->device, + list->value->offset, + list->value->len, + list->value->speed); + } + list = list->next; + } +} + void hmp_quit(Monitor *mon, const QDict *qdict) { monitor_suspend(mon); @@ -783,3 +819,35 @@ void hmp_block_set_io_throttle(Monitor *mon, const QDict 
*qdict) qdict_get_int(qdict, "iops_wr"), &err); hmp_handle_error(mon, &err); } + +void hmp_block_stream(Monitor *mon, const QDict *qdict) +{ + Error *error = NULL; + const char *device = qdict_get_str(qdict, "device"); + const char *base = qdict_get_try_str(qdict, "base"); + + qmp_block_stream(device, base != NULL, base, &error); + + hmp_handle_error(mon, &error); +} + +void hmp_block_job_set_speed(Monitor *mon, const QDict *qdict) +{ + Error *error = NULL; + const char *device = qdict_get_str(qdict, "device"); + int64_t value = qdict_get_int(qdict, "value"); + + qmp_block_job_set_speed(device, value, &error); + + hmp_handle_error(mon, &error); +} + +void hmp_block_job_cancel(Monitor *mon, const QDict *qdict) +{ + Error *error = NULL; + const char *device = qdict_get_str(qdict, "device"); + + qmp_block_job_cancel(device, &error); + + hmp_handle_error(mon, &error); +} @@ -32,6 +32,7 @@ void hmp_info_vnc(Monitor *mon); void hmp_info_spice(Monitor *mon); void hmp_info_balloon(Monitor *mon); void hmp_info_pci(Monitor *mon); +void hmp_info_block_jobs(Monitor *mon); void hmp_quit(Monitor *mon, const QDict *qdict); void hmp_stop(Monitor *mon, const QDict *qdict); void hmp_system_reset(Monitor *mon, const QDict *qdict); @@ -54,5 +55,8 @@ void hmp_expire_password(Monitor *mon, const QDict *qdict); void hmp_eject(Monitor *mon, const QDict *qdict); void hmp_change(Monitor *mon, const QDict *qdict); void hmp_block_set_io_throttle(Monitor *mon, const QDict *qdict); +void hmp_block_stream(Monitor *mon, const QDict *qdict); +void hmp_block_job_set_speed(Monitor *mon, const QDict *qdict); +void hmp_block_job_cancel(Monitor *mon, const QDict *qdict); #endif diff --git a/hw/scsi-disk.c b/hw/scsi-disk.c index 5d8bf53586..11cfe73df8 100644 --- a/hw/scsi-disk.c +++ b/hw/scsi-disk.c @@ -391,9 +391,6 @@ static int scsi_disk_emulate_inquiry(SCSIRequest *req, uint8_t *outbuf) } l = strlen(s->serial); - if (l > req->cmd.xfer) { - l = req->cmd.xfer; - } if (l > 20) { l = 20; } @@ -1002,9 
+999,6 @@ static int scsi_disk_emulate_mode_sense(SCSIDiskReq *r, uint8_t *outbuf) outbuf[0] = ((buflen - 2) >> 8) & 0xff; outbuf[1] = (buflen - 2) & 0xff; } - if (buflen > r->req.cmd.xfer) { - buflen = r->req.cmd.xfer; - } return buflen; } @@ -1038,9 +1032,6 @@ static int scsi_disk_emulate_read_toc(SCSIRequest *req, uint8_t *outbuf) default: return -1; } - if (toclen > req->cmd.xfer) { - toclen = req->cmd.xfer; - } return toclen; } @@ -1251,6 +1242,7 @@ static int scsi_disk_emulate_command(SCSIDiskReq *r) scsi_check_condition(r, SENSE_CODE(INVALID_OPCODE)); return -1; } + buflen = MIN(buflen, req->cmd.xfer); return buflen; not_ready: diff --git a/hw/virtio-blk.c b/hw/virtio-blk.c index 5b416c36ee..a5a439668b 100644 --- a/hw/virtio-blk.c +++ b/hw/virtio-blk.c @@ -346,6 +346,8 @@ static void virtio_blk_handle_read(VirtIOBlockReq *req) bdrv_acct_start(req->dev->bs, &req->acct, req->qiov.size, BDRV_ACCT_READ); + trace_virtio_blk_handle_read(req, sector, req->qiov.size / 512); + if (sector & req->dev->sector_mask) { virtio_blk_rw_complete(req, -EIO); return; @@ -479,6 +479,12 @@ void monitor_protocol_event(MonitorEvent event, QObject *data) case QEVENT_SPICE_DISCONNECTED: event_name = "SPICE_DISCONNECTED"; break; + case QEVENT_BLOCK_JOB_COMPLETED: + event_name = "BLOCK_JOB_COMPLETED"; + break; + case QEVENT_BLOCK_JOB_CANCELLED: + event_name = "BLOCK_JOB_CANCELLED"; + break; default: abort(); break; @@ -2312,6 +2318,13 @@ static mon_cmd_t info_cmds[] = { .mhandler.info = hmp_info_blockstats, }, { + .name = "block-jobs", + .args_type = "", + .params = "", + .help = "show progress of ongoing block device operations", + .mhandler.info = hmp_info_block_jobs, + }, + { .name = "registers", .args_type = "", .params = "", @@ -36,6 +36,8 @@ typedef enum MonitorEvent { QEVENT_SPICE_CONNECTED, QEVENT_SPICE_INITIALIZED, QEVENT_SPICE_DISCONNECTED, + QEVENT_BLOCK_JOB_COMPLETED, + QEVENT_BLOCK_JOB_CANCELLED, QEVENT_MAX, } MonitorEvent; diff --git a/qapi-schema.json b/qapi-schema.json 
index 735eb352b5..80debe679a 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -845,6 +845,38 @@ { 'command': 'query-pci', 'returns': ['PciInfo'] } ## +# @BlockJobInfo: +# +# Information about a long-running block device operation. +# +# @type: the job type ('stream' for image streaming) +# +# @device: the block device name +# +# @len: the maximum progress value +# +# @offset: the current progress value +# +# @speed: the rate limit, bytes per second +# +# Since: 1.1 +## +{ 'type': 'BlockJobInfo', + 'data': {'type': 'str', 'device': 'str', 'len': 'int', + 'offset': 'int', 'speed': 'int'} } + +## +# @query-block-jobs: +# +# Return information about long-running block device operations. +# +# Returns: a list of @BlockJobInfo for each active block job +# +# Since: 1.1 +## +{ 'command': 'query-block-jobs', 'returns': ['BlockJobInfo'] } + +## # @quit: # # This command will cause the QEMU process to exit gracefully. While every @@ -1434,3 +1466,86 @@ { 'command': 'block_set_io_throttle', 'data': { 'device': 'str', 'bps': 'int', 'bps_rd': 'int', 'bps_wr': 'int', 'iops': 'int', 'iops_rd': 'int', 'iops_wr': 'int' } } + +# @block_stream: +# +# Copy data from a backing file into a block device. +# +# The block streaming operation is performed in the background until the entire +# backing file has been copied. This command returns immediately once streaming +# has started. The status of ongoing block streaming operations can be checked +# with query-block-jobs. The operation can be stopped before it has completed +# using the block_job_cancel command. +# +# If a base file is specified then sectors are not copied from that base file and +# its backing chain. When streaming completes the image file will have the base +# file as its backing file. This can be used to stream a subset of the backing +# file chain instead of flattening the entire image. 
+# +# On successful completion the image file is updated to drop the backing file +# and the BLOCK_JOB_COMPLETED event is emitted. +# +# @device: the device name +# +# @base: #optional the common backing file name +# +# Returns: Nothing on success +# If streaming is already active on this device, DeviceInUse +# If @device does not exist, DeviceNotFound +# If image streaming is not supported by this device, NotSupported +# If @base does not exist, BaseNotFound +# +# Since: 1.1 +## +{ 'command': 'block_stream', 'data': { 'device': 'str', '*base': 'str' } } + +## +# @block_job_set_speed: +# +# Set maximum speed for a background block operation. +# +# This command can only be issued when there is an active block job. +# +# Throttling can be disabled by setting the speed to 0. +# +# @device: the device name +# +# @value: the maximum speed, in bytes per second +# +# Returns: Nothing on success +# If the job type does not support throttling, NotSupported +# If streaming is not active on this device, DeviceNotActive +# +# Since: 1.1 +## +{ 'command': 'block_job_set_speed', + 'data': { 'device': 'str', 'value': 'int' } } + +## +# @block_job_cancel: +# +# Stop an active block streaming operation. +# +# This command returns immediately after marking the active block streaming +# operation for cancellation. It is an error to call this command if no +# operation is in progress. +# +# The operation will cancel as soon as possible and then emit the +# BLOCK_JOB_CANCELLED event. Before that happens the job is still visible when +# enumerated using query-block-jobs. +# +# The image file retains its backing file unless the streaming operation happens +# to complete just as it is being cancelled. +# +# A new block streaming operation can be started at a later time to finish +# copying all data from the backing file. 
+# +# @device: the device name +# +# Returns: Nothing on success +# If streaming is not active on this device, DeviceNotActive +# If cancellation already in progress, DeviceInUse +# +# Since: 1.1 +## +{ 'command': 'block_job_cancel', 'data': { 'device': 'str' } } diff --git a/qemu-coroutine-sleep.c b/qemu-coroutine-sleep.c new file mode 100644 index 0000000000..fd65274446 --- /dev/null +++ b/qemu-coroutine-sleep.c @@ -0,0 +1,38 @@ +/* + * QEMU coroutine sleep + * + * Copyright IBM, Corp. 2011 + * + * Authors: + * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#include "qemu-coroutine.h" +#include "qemu-timer.h" + +typedef struct CoSleepCB { + QEMUTimer *ts; + Coroutine *co; +} CoSleepCB; + +static void co_sleep_cb(void *opaque) +{ + CoSleepCB *sleep_cb = opaque; + + qemu_free_timer(sleep_cb->ts); + qemu_coroutine_enter(sleep_cb->co, NULL); +} + +void coroutine_fn co_sleep_ns(QEMUClock *clock, int64_t ns) +{ + CoSleepCB sleep_cb = { + .co = qemu_coroutine_self(), + }; + sleep_cb.ts = qemu_new_timer(clock, SCALE_NS, co_sleep_cb, &sleep_cb); + qemu_mod_timer(sleep_cb.ts, qemu_get_clock_ns(clock) + ns); + qemu_coroutine_yield(); +} diff --git a/qemu-coroutine.h b/qemu-coroutine.h index 8a55fe125e..34c15d4116 100644 --- a/qemu-coroutine.h +++ b/qemu-coroutine.h @@ -17,6 +17,7 @@ #include <stdbool.h> #include "qemu-queue.h" +#include "qemu-timer.h" /** * Coroutines are a mechanism for stack switching and can be used for @@ -199,4 +200,12 @@ void qemu_co_rwlock_wrlock(CoRwlock *lock); */ void qemu_co_rwlock_unlock(CoRwlock *lock); +/** + * Yield the coroutine for a given duration + * + * Note this function uses timers and hence only works when a main loop is in + * use. See main-loop.h and do not use from qemu-tool programs. 
+ */ +void coroutine_fn co_sleep_ns(QEMUClock *clock, int64_t ns); + #endif /* QEMU_COROUTINE_H */ @@ -130,7 +130,7 @@ static void print_report(const char *op, struct timeval *t, int64_t offset, static void * create_iovec(QEMUIOVector *qiov, char **argv, int nr_iov, int pattern) { - size_t *sizes = calloc(nr_iov, sizeof(size_t)); + size_t *sizes = g_new0(size_t, nr_iov); size_t count = 0; void *buf = NULL; void *p; @@ -172,7 +172,7 @@ create_iovec(QEMUIOVector *qiov, char **argv, int nr_iov, int pattern) } fail: - free(sizes); + g_free(sizes); return buf; } @@ -471,14 +471,14 @@ static int read_f(int argc, char **argv) } if (Pflag) { - void *cmp_buf = malloc(pattern_count); + void *cmp_buf = g_malloc(pattern_count); memset(cmp_buf, pattern, pattern_count); if (memcmp(buf + pattern_offset, cmp_buf, pattern_count)) { printf("Pattern verification failed at offset %" PRId64 ", %d bytes\n", offset + pattern_offset, pattern_count); } - free(cmp_buf); + g_free(cmp_buf); } if (qflag) { @@ -601,13 +601,13 @@ static int readv_f(int argc, char **argv) } if (Pflag) { - void *cmp_buf = malloc(qiov.size); + void *cmp_buf = g_malloc(qiov.size); memset(cmp_buf, pattern, qiov.size); if (memcmp(buf, cmp_buf, qiov.size)) { printf("Pattern verification failed at offset %" PRId64 ", %zd bytes\n", offset, qiov.size); } - free(cmp_buf); + g_free(cmp_buf); } if (qflag) { @@ -1063,7 +1063,7 @@ static void aio_write_done(void *opaque, int ret) ctx->qiov.size, 1, ctx->Cflag); out: qemu_io_free(ctx->buf); - free(ctx); + g_free(ctx); } static void aio_read_done(void *opaque, int ret) @@ -1079,14 +1079,14 @@ static void aio_read_done(void *opaque, int ret) } if (ctx->Pflag) { - void *cmp_buf = malloc(ctx->qiov.size); + void *cmp_buf = g_malloc(ctx->qiov.size); memset(cmp_buf, ctx->pattern, ctx->qiov.size); if (memcmp(ctx->buf, cmp_buf, ctx->qiov.size)) { printf("Pattern verification failed at offset %" PRId64 ", %zd bytes\n", ctx->offset, ctx->qiov.size); } - free(cmp_buf); + g_free(cmp_buf); } 
if (ctx->qflag) { @@ -1103,7 +1103,7 @@ static void aio_read_done(void *opaque, int ret) ctx->qiov.size, 1, ctx->Cflag); out: qemu_io_free(ctx->buf); - free(ctx); + g_free(ctx); } static void aio_read_help(void) @@ -1141,7 +1141,7 @@ static const cmdinfo_t aio_read_cmd = { static int aio_read_f(int argc, char **argv) { int nr_iov, c; - struct aio_ctx *ctx = calloc(1, sizeof(struct aio_ctx)); + struct aio_ctx *ctx = g_new0(struct aio_ctx, 1); while ((c = getopt(argc, argv, "CP:qv")) != EOF) { switch (c) { @@ -1152,7 +1152,7 @@ static int aio_read_f(int argc, char **argv) ctx->Pflag = 1; ctx->pattern = parse_pattern(optarg); if (ctx->pattern < 0) { - free(ctx); + g_free(ctx); return 0; } break; @@ -1163,20 +1163,20 @@ static int aio_read_f(int argc, char **argv) ctx->vflag = 1; break; default: - free(ctx); + g_free(ctx); return command_usage(&aio_read_cmd); } } if (optind > argc - 2) { - free(ctx); + g_free(ctx); return command_usage(&aio_read_cmd); } ctx->offset = cvtnum(argv[optind]); if (ctx->offset < 0) { printf("non-numeric length argument -- %s\n", argv[optind]); - free(ctx); + g_free(ctx); return 0; } optind++; @@ -1184,14 +1184,14 @@ static int aio_read_f(int argc, char **argv) if (ctx->offset & 0x1ff) { printf("offset %" PRId64 " is not sector aligned\n", ctx->offset); - free(ctx); + g_free(ctx); return 0; } nr_iov = argc - optind; ctx->buf = create_iovec(&ctx->qiov, &argv[optind], nr_iov, 0xab); if (ctx->buf == NULL) { - free(ctx); + g_free(ctx); return 0; } @@ -1237,7 +1237,7 @@ static int aio_write_f(int argc, char **argv) { int nr_iov, c; int pattern = 0xcd; - struct aio_ctx *ctx = calloc(1, sizeof(struct aio_ctx)); + struct aio_ctx *ctx = g_new0(struct aio_ctx, 1); while ((c = getopt(argc, argv, "CqP:")) != EOF) { switch (c) { @@ -1250,25 +1250,25 @@ static int aio_write_f(int argc, char **argv) case 'P': pattern = parse_pattern(optarg); if (pattern < 0) { - free(ctx); + g_free(ctx); return 0; } break; default: - free(ctx); + g_free(ctx); return 
command_usage(&aio_write_cmd); } } if (optind > argc - 2) { - free(ctx); + g_free(ctx); return command_usage(&aio_write_cmd); } ctx->offset = cvtnum(argv[optind]); if (ctx->offset < 0) { printf("non-numeric length argument -- %s\n", argv[optind]); - free(ctx); + g_free(ctx); return 0; } optind++; @@ -1276,14 +1276,14 @@ static int aio_write_f(int argc, char **argv) if (ctx->offset & 0x1ff) { printf("offset %" PRId64 " is not sector aligned\n", ctx->offset); - free(ctx); + g_free(ctx); return 0; } nr_iov = argc - optind; ctx->buf = create_iovec(&ctx->qiov, &argv[optind], nr_iov, pattern); if (ctx->buf == NULL) { - free(ctx); + g_free(ctx); return 0; } @@ -52,6 +52,10 @@ static const QErrorStringTable qerror_table[] = { .desc = "Device '%(device)' can't go on a %(bad_bus_type) bus", }, { + .error_fmt = QERR_BASE_NOT_FOUND, + .desc = "Base '%(base)' not found", + }, + { .error_fmt = QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, .desc = "Block format '%(format)' used by device '%(name)' does not support feature '%(feature)'", }, @@ -197,6 +201,10 @@ static const QErrorStringTable qerror_table[] = { .desc = "No '%(bus)' bus found for device '%(device)'", }, { + .error_fmt = QERR_NOT_SUPPORTED, + .desc = "Not supported", + }, + { .error_fmt = QERR_OPEN_FILE_FAILED, .desc = "Could not open '%(filename)'", }, @@ -57,6 +57,9 @@ QError *qobject_to_qerror(const QObject *obj); #define QERR_BAD_BUS_FOR_DEVICE \ "{ 'class': 'BadBusForDevice', 'data': { 'device': %s, 'bad_bus_type': %s } }" +#define QERR_BASE_NOT_FOUND \ + "{ 'class': 'BaseNotFound', 'data': { 'base': %s } }" + #define QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED \ "{ 'class': 'BlockFormatFeatureNotSupported', 'data': { 'format': %s, 'name': %s, 'feature': %s } }" @@ -168,6 +171,9 @@ QError *qobject_to_qerror(const QObject *obj); #define QERR_NO_BUS_FOR_DEVICE \ "{ 'class': 'NoBusForDevice', 'data': { 'device': %s, 'bus': %s } }" +#define QERR_NOT_SUPPORTED \ + "{ 'class': 'NotSupported', 'data': {} }" + #define 
QERR_OPEN_FILE_FAILED \ "{ 'class': 'OpenFileFailed', 'data': { 'filename': %s } }" diff --git a/qmp-commands.hx b/qmp-commands.hx index 799e655988..bd6b6410ad 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -649,6 +649,24 @@ Example: EQMP { + .name = "block_stream", + .args_type = "device:B,base:s?", + .mhandler.cmd_new = qmp_marshal_input_block_stream, + }, + + { + .name = "block_job_set_speed", + .args_type = "device:B,value:o", + .mhandler.cmd_new = qmp_marshal_input_block_job_set_speed, + }, + + { + .name = "block_job_cancel", + .args_type = "device:B", + .mhandler.cmd_new = qmp_marshal_input_block_job_cancel, + }, + + { .name = "blockdev-snapshot-sync", .args_type = "device:B,snapshot-file:s,format:s?", .mhandler.cmd_new = qmp_marshal_input_blockdev_snapshot_sync, @@ -1996,6 +2014,12 @@ EQMP }, { + .name = "query-block-jobs", + .args_type = "", + .mhandler.cmd_new = qmp_marshal_input_query_block_jobs, + }, + + { .name = "qom-list", .args_type = "path:s", .mhandler.cmd_new = qmp_marshal_input_qom_list, diff --git a/trace-events b/trace-events index d2b0c6181d..75f6e17abe 100644 --- a/trace-events +++ b/trace-events @@ -65,14 +65,25 @@ bdrv_aio_readv(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs % bdrv_aio_writev(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs %p sector_num %"PRId64" nb_sectors %d opaque %p" bdrv_lock_medium(void *bs, bool locked) "bs %p locked %d" bdrv_co_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d" +bdrv_co_copy_on_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d" bdrv_co_writev(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d" bdrv_co_io_em(void *bs, int64_t sector_num, int nb_sectors, int is_write, void *acb) "bs %p sector_num %"PRId64" nb_sectors %d is_write %d acb %p" -bdrv_co_copy_on_readv(void *bs, int64_t sector_num, int nb_sectors, int64_t cluster_sector_num, int 
cluster_nb_sectors) "bs %p sector_num %"PRId64" nb_sectors %d cluster_sector_num %"PRId64" cluster_nb_sectors %d" +bdrv_co_do_copy_on_readv(void *bs, int64_t sector_num, int nb_sectors, int64_t cluster_sector_num, int cluster_nb_sectors) "bs %p sector_num %"PRId64" nb_sectors %d cluster_sector_num %"PRId64" cluster_nb_sectors %d" + +# block/stream.c +stream_one_iteration(void *s, int64_t sector_num, int nb_sectors, int is_allocated) "s %p sector_num %"PRId64" nb_sectors %d is_allocated %d" +stream_start(void *bs, void *base, void *s, void *co, void *opaque) "bs %p base %p s %p co %p opaque %p" + +# blockdev.c +qmp_block_job_cancel(void *job) "job %p" +block_stream_cb(void *bs, void *job, int ret) "bs %p job %p ret %d" +qmp_block_stream(void *bs, void *job) "bs %p job %p" # hw/virtio-blk.c virtio_blk_req_complete(void *req, int status) "req %p status %d" virtio_blk_rw_complete(void *req, int ret) "req %p ret %d" virtio_blk_handle_write(void *req, uint64_t sector, size_t nsectors) "req %p sector %"PRIu64" nsectors %zu" +virtio_blk_handle_read(void *req, uint64_t sector, size_t nsectors) "req %p sector %"PRIu64" nsectors %zu" # posix-aio-compat.c paio_submit(void *acb, void *opaque, int64_t sector_num, int nb_sectors, int type) "acb %p opaque %p sector_num %"PRId64" nb_sectors %d type %d" |