diff options
author | Peter Maydell <peter.maydell@linaro.org> | 2015-11-12 17:22:06 +0000 |
---|---|---|
committer | Peter Maydell <peter.maydell@linaro.org> | 2015-11-12 17:22:06 +0000 |
commit | b2df6a79df6343d0ed4ea05d83b3ff1d849e8d25 (patch) | |
tree | dbdbb73641e92a85f29ff12b7d010320ef6cd619 | |
parent | cfcc7c144879ebe61ac2472216314fc1331b4450 (diff) | |
parent | aece5edc96f211eec6febdafc9bbbb99315a2efd (diff) |
Merge remote-tracking branch 'remotes/kevin/tags/for-upstream' into staging
Block layer patches (rebased Stefan's pull request)
# gpg: Signature made Thu 12 Nov 2015 15:34:16 GMT using RSA key ID C88F2FD6
# gpg: Good signature from "Kevin Wolf <kwolf@redhat.com>"
* remotes/kevin/tags/for-upstream: (43 commits)
block: Update copyright of the accounting code
scsi-disk: Account for failed operations
macio: Account for failed operations
ide: Account for failed and invalid operations
atapi: Account for failed and invalid operations
xen_disk: Account for failed and invalid operations
virtio-blk: Account for failed and invalid operations
nvme: Account for failed and invalid operations
iotests: Add test for the block device statistics
block: Use QEMU_CLOCK_VIRTUAL for the accounting code in qtest mode
qemu-io: Account for failed, invalid and flush operations
block: New option to define the intervals for collecting I/O statistics
block: Add average I/O queue depth to BlockDeviceTimedStats
block: Compute minimum, maximum and average I/O latencies
block: Allow configuring whether to account failed and invalid ops
block: Add statistics for failed and invalid I/O operations
block: Add idle_time_ns to BlockDeviceStats
util: Infrastructure for computing recent averages
block: define 'clock_type' for the accounting code
ide: Account for write operations correctly
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
42 files changed, 2618 insertions, 296 deletions
@@ -3404,10 +3404,25 @@ void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap, hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors); } -void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap) +void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out) { assert(bdrv_dirty_bitmap_enabled(bitmap)); - hbitmap_reset_all(bitmap->bitmap); + if (!out) { + hbitmap_reset_all(bitmap->bitmap); + } else { + HBitmap *backup = bitmap->bitmap; + bitmap->bitmap = hbitmap_alloc(bitmap->size, + hbitmap_granularity(backup)); + *out = backup; + } +} + +void bdrv_undo_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *in) +{ + HBitmap *tmp = bitmap->bitmap; + assert(bdrv_dirty_bitmap_enabled(bitmap)); + bitmap->bitmap = in; + hbitmap_free(tmp); } void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, diff --git a/block/accounting.c b/block/accounting.c index a423560206..185025ec1e 100644 --- a/block/accounting.c +++ b/block/accounting.c @@ -2,6 +2,7 @@ * QEMU System Emulator block accounting * * Copyright (c) 2011 Christoph Hellwig + * Copyright (c) 2015 Igalia, S.L. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -25,6 +26,54 @@ #include "block/accounting.h" #include "block/block_int.h" #include "qemu/timer.h" +#include "sysemu/qtest.h" + +static QEMUClockType clock_type = QEMU_CLOCK_REALTIME; +static const int qtest_latency_ns = NANOSECONDS_PER_SECOND / 1000; + +void block_acct_init(BlockAcctStats *stats, bool account_invalid, + bool account_failed) +{ + stats->account_invalid = account_invalid; + stats->account_failed = account_failed; + + if (qtest_enabled()) { + clock_type = QEMU_CLOCK_VIRTUAL; + } +} + +void block_acct_cleanup(BlockAcctStats *stats) +{ + BlockAcctTimedStats *s, *next; + QSLIST_FOREACH_SAFE(s, &stats->intervals, entries, next) { + g_free(s); + } +} + +void block_acct_add_interval(BlockAcctStats *stats, unsigned interval_length) +{ + BlockAcctTimedStats *s; + unsigned i; + + s = g_new0(BlockAcctTimedStats, 1); + s->interval_length = interval_length; + QSLIST_INSERT_HEAD(&stats->intervals, s, entries); + + for (i = 0; i < BLOCK_MAX_IOTYPE; i++) { + timed_average_init(&s->latency[i], clock_type, + (uint64_t) interval_length * NANOSECONDS_PER_SECOND); + } +} + +BlockAcctTimedStats *block_acct_interval_next(BlockAcctStats *stats, + BlockAcctTimedStats *s) +{ + if (s == NULL) { + return QSLIST_FIRST(&stats->intervals); + } else { + return QSLIST_NEXT(s, entries); + } +} void block_acct_start(BlockAcctStats *stats, BlockAcctCookie *cookie, int64_t bytes, enum BlockAcctType type) @@ -32,20 +81,71 @@ void block_acct_start(BlockAcctStats *stats, BlockAcctCookie *cookie, assert(type < BLOCK_MAX_IOTYPE); cookie->bytes = bytes; - cookie->start_time_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); + cookie->start_time_ns = qemu_clock_get_ns(clock_type); cookie->type = type; } void block_acct_done(BlockAcctStats *stats, BlockAcctCookie *cookie) { + BlockAcctTimedStats *s; + int64_t time_ns = qemu_clock_get_ns(clock_type); + int64_t latency_ns = time_ns - cookie->start_time_ns; + + if (qtest_enabled()) { + latency_ns = qtest_latency_ns; + } + assert(cookie->type < BLOCK_MAX_IOTYPE); stats->nr_bytes[cookie->type] += cookie->bytes; stats->nr_ops[cookie->type]++; - stats->total_time_ns[cookie->type] += - qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - cookie->start_time_ns; + stats->total_time_ns[cookie->type] += latency_ns; + stats->last_access_time_ns = time_ns; + + QSLIST_FOREACH(s, &stats->intervals, entries) { + timed_average_account(&s->latency[cookie->type], latency_ns); + } +} + +void block_acct_failed(BlockAcctStats *stats, BlockAcctCookie *cookie) +{ + assert(cookie->type < BLOCK_MAX_IOTYPE); + + stats->failed_ops[cookie->type]++; + + if (stats->account_failed) { + BlockAcctTimedStats *s; + int64_t time_ns = qemu_clock_get_ns(clock_type); + int64_t latency_ns = time_ns - cookie->start_time_ns; + + if (qtest_enabled()) { + latency_ns = qtest_latency_ns; + } + + stats->total_time_ns[cookie->type] += latency_ns; + stats->last_access_time_ns = time_ns; + + QSLIST_FOREACH(s, &stats->intervals, entries) { + timed_average_account(&s->latency[cookie->type], latency_ns); + } + } } +void block_acct_invalid(BlockAcctStats *stats, enum BlockAcctType type) +{ + assert(type < BLOCK_MAX_IOTYPE); + + /* block_acct_done() and block_acct_failed() update + * total_time_ns[], but this one does not. The reason is that + * invalid requests are accounted during their submission, + * therefore there's no actual I/O involved. */ + + stats->invalid_ops[type]++; + + if (stats->account_invalid) { + stats->last_access_time_ns = qemu_clock_get_ns(clock_type); + } +} void block_acct_merge_done(BlockAcctStats *stats, enum BlockAcctType type, int num_requests) @@ -53,3 +153,20 @@ void block_acct_merge_done(BlockAcctStats *stats, enum BlockAcctType type, assert(type < BLOCK_MAX_IOTYPE); stats->merged[type] += num_requests; } + +int64_t block_acct_idle_time_ns(BlockAcctStats *stats) +{ + return qemu_clock_get_ns(clock_type) - stats->last_access_time_ns; +} + +double block_acct_queue_depth(BlockAcctTimedStats *stats, + enum BlockAcctType type) +{ + uint64_t sum, elapsed; + + assert(type < BLOCK_MAX_IOTYPE); + + sum = timed_average_sum(&stats->latency[type], &elapsed); + + return (double) sum / elapsed; +} diff --git a/block/backup.c b/block/backup.c index ec01db8eff..3b39119256 100644 --- a/block/backup.c +++ b/block/backup.c @@ -221,11 +221,45 @@ static void backup_iostatus_reset(BlockJob *job) } } +static void backup_cleanup_sync_bitmap(BackupBlockJob *job, int ret) +{ + BdrvDirtyBitmap *bm; + BlockDriverState *bs = job->common.bs; + + if (ret < 0 || block_job_is_cancelled(&job->common)) { + /* Merge the successor back into the parent, delete nothing. */ + bm = bdrv_reclaim_dirty_bitmap(bs, job->sync_bitmap, NULL); + assert(bm); + } else { + /* Everything is fine, delete this bitmap and install the backup. */ + bm = bdrv_dirty_bitmap_abdicate(bs, job->sync_bitmap, NULL); + assert(bm); + } +} + +static void backup_commit(BlockJob *job) +{ + BackupBlockJob *s = container_of(job, BackupBlockJob, common); + if (s->sync_bitmap) { + backup_cleanup_sync_bitmap(s, 0); + } +} + +static void backup_abort(BlockJob *job) +{ + BackupBlockJob *s = container_of(job, BackupBlockJob, common); + if (s->sync_bitmap) { + backup_cleanup_sync_bitmap(s, -1); + } +} + static const BlockJobDriver backup_job_driver = { .instance_size = sizeof(BackupBlockJob), .job_type = BLOCK_JOB_TYPE_BACKUP, .set_speed = backup_set_speed, .iostatus_reset = backup_iostatus_reset, + .commit = backup_commit, + .abort = backup_abort, }; static BlockErrorAction backup_error_action(BackupBlockJob *job, @@ -441,19 +475,6 @@ static void coroutine_fn backup_run(void *opaque) /* wait until pending backup_do_cow() calls have completed */ qemu_co_rwlock_wrlock(&job->flush_rwlock); qemu_co_rwlock_unlock(&job->flush_rwlock); - - if (job->sync_bitmap) { - BdrvDirtyBitmap *bm; - if (ret < 0 || block_job_is_cancelled(&job->common)) { - /* Merge the successor back into the parent, delete nothing. */ - bm = bdrv_reclaim_dirty_bitmap(bs, job->sync_bitmap, NULL); - assert(bm); - } else { - /* Everything is fine, delete this bitmap and install the backup. */ - bm = bdrv_dirty_bitmap_abdicate(bs, job->sync_bitmap, NULL); - assert(bm); - } - } hbitmap_free(job->bitmap); if (target->blk) { @@ -472,7 +493,7 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target, BlockdevOnError on_source_error, BlockdevOnError on_target_error, BlockCompletionFunc *cb, void *opaque, - Error **errp) + BlockJobTxn *txn, Error **errp) { int64_t len; @@ -554,6 +575,7 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target, sync_bitmap : NULL; job->common.len = len; job->common.co = qemu_coroutine_create(backup_run); + block_job_txn_add_job(txn, &job->common); qemu_coroutine_enter(job->common.co, job); return; diff --git a/block/block-backend.c b/block/block-backend.c index 6f9309fef4..9889e813b6 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -176,6 +176,7 @@ static void blk_delete(BlockBackend *blk) } g_free(blk->name); drive_info_del(blk->legacy_dinfo); + block_acct_cleanup(&blk->stats); g_free(blk); } diff --git a/block/io.c b/block/io.c index 8dcad3b3fe..adc1eabef0 100644 --- a/block/io.c +++ b/block/io.c @@ -237,8 +237,21 @@ bool bdrv_requests_pending(BlockDriverState *bs) return false; } +static void bdrv_drain_recurse(BlockDriverState *bs) +{ + BdrvChild *child; + + if (bs->drv && bs->drv->bdrv_drain) { + bs->drv->bdrv_drain(bs); + } + QLIST_FOREACH(child, &bs->children, next) { + bdrv_drain_recurse(child->bs); + } +} + /* - * Wait for pending requests to complete on a single BlockDriverState subtree + * Wait for pending requests to complete on a single BlockDriverState subtree, + * and suspend block driver's internal I/O until next request arrives. * * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState * AioContext. @@ -251,6 +264,7 @@ void bdrv_drain(BlockDriverState *bs) { bool busy = true; + bdrv_drain_recurse(bs); while (busy) { /* Keep iterating */ bdrv_flush_io_queue(bs); @@ -348,13 +362,14 @@ static void tracked_request_end(BdrvTrackedRequest *req) static void tracked_request_begin(BdrvTrackedRequest *req, BlockDriverState *bs, int64_t offset, - unsigned int bytes, bool is_write) + unsigned int bytes, + enum BdrvTrackedRequestType type) { *req = (BdrvTrackedRequest){ .bs = bs, .offset = offset, .bytes = bytes, - .is_write = is_write, + .type = type, .co = qemu_coroutine_self(), .serialising = false, .overlap_offset = offset, @@ -971,7 +986,7 @@ static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, bytes = ROUND_UP(bytes, align); } - tracked_request_begin(&req, bs, offset, bytes, false); + tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ); ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align, use_local_qiov ? &local_qiov : qiov, flags); @@ -1292,7 +1307,7 @@ static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, * Pad qiov with the read parts and be sure to have a tracked request not * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle. */ - tracked_request_begin(&req, bs, offset, bytes, true); + tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE); if (!qiov) { ret = bdrv_co_do_zero_pwritev(bs, offset, bytes, flags, &req); @@ -2317,18 +2332,20 @@ static void coroutine_fn bdrv_flush_co_entry(void *opaque) int coroutine_fn bdrv_co_flush(BlockDriverState *bs) { int ret; + BdrvTrackedRequest req; if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) || bdrv_is_sg(bs)) { return 0; } + tracked_request_begin(&req, bs, 0, 0, BDRV_TRACKED_FLUSH); /* Write back cached data to the OS even with cache=unsafe */ BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS); if (bs->drv->bdrv_co_flush_to_os) { ret = bs->drv->bdrv_co_flush_to_os(bs); if (ret < 0) { - return ret; + goto out; } } @@ -2368,14 +2385,17 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs) ret = 0; } if (ret < 0) { - return ret; + goto out; } /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH * in the case of cache=unsafe, so there are no useless flushes. */ flush_parent: - return bs->file ? bdrv_co_flush(bs->file->bs) : 0; + ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0; +out: + tracked_request_end(&req); + return ret; } int bdrv_flush(BlockDriverState *bs) @@ -2418,6 +2438,7 @@ static void coroutine_fn bdrv_discard_co_entry(void *opaque) int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) { + BdrvTrackedRequest req; int max_discard, ret; if (!bs->drv) { @@ -2440,6 +2461,8 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, return 0; } + tracked_request_begin(&req, bs, sector_num, nb_sectors, + BDRV_TRACKED_DISCARD); bdrv_set_dirty(bs, sector_num, nb_sectors); max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS); @@ -2473,20 +2496,24 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors, bdrv_co_io_em_complete, &co); if (acb == NULL) { - return -EIO; + ret = -EIO; + goto out; } else { qemu_coroutine_yield(); ret = co.ret; } } if (ret && ret != -ENOTSUP) { - return ret; + goto out; } sector_num += num; nb_sectors -= num; } - return 0; + ret = 0; +out: + tracked_request_end(&req); + return ret; } int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) @@ -2515,26 +2542,109 @@ int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) return rwco.ret; } -/* needed for generic scsi interface */ +typedef struct { + CoroutineIOCompletion *co; + QEMUBH *bh; +} BdrvIoctlCompletionData; -int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) +static void bdrv_ioctl_bh_cb(void *opaque) +{ + BdrvIoctlCompletionData *data = opaque; + + bdrv_co_io_em_complete(data->co, -ENOTSUP); + qemu_bh_delete(data->bh); +} + +static int bdrv_co_do_ioctl(BlockDriverState *bs, int req, void *buf) { BlockDriver *drv = bs->drv; + BdrvTrackedRequest tracked_req; + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + BlockAIOCB *acb; - if (drv && drv->bdrv_ioctl) - return drv->bdrv_ioctl(bs, req, buf); - return -ENOTSUP; + tracked_request_begin(&tracked_req, bs, 0, 0, BDRV_TRACKED_IOCTL); + if (!drv || !drv->bdrv_aio_ioctl) { + co.ret = -ENOTSUP; + goto out; + } + + acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co); + if (!acb) { + BdrvIoctlCompletionData *data = g_new(BdrvIoctlCompletionData, 1); + data->bh = aio_bh_new(bdrv_get_aio_context(bs), + bdrv_ioctl_bh_cb, data); + data->co = &co; + qemu_bh_schedule(data->bh); + } + qemu_coroutine_yield(); +out: + tracked_request_end(&tracked_req); + return co.ret; +} + +typedef struct { + BlockDriverState *bs; + int req; + void *buf; + int ret; +} BdrvIoctlCoData; + +static void coroutine_fn bdrv_co_ioctl_entry(void *opaque) +{ + BdrvIoctlCoData *data = opaque; + data->ret = bdrv_co_do_ioctl(data->bs, data->req, data->buf); +} + +/* needed for generic scsi interface */ +int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) +{ + BdrvIoctlCoData data = { + .bs = bs, + .req = req, + .buf = buf, + .ret = -EINPROGRESS, + }; + + if (qemu_in_coroutine()) { + /* Fast-path if already in coroutine context */ + bdrv_co_ioctl_entry(&data); + } else { + Coroutine *co = qemu_coroutine_create(bdrv_co_ioctl_entry); + qemu_coroutine_enter(co, &data); + } + while (data.ret == -EINPROGRESS) { + aio_poll(bdrv_get_aio_context(bs), true); + } + return data.ret; +} + +static void coroutine_fn bdrv_co_aio_ioctl_entry(void *opaque) +{ + BlockAIOCBCoroutine *acb = opaque; + acb->req.error = bdrv_co_do_ioctl(acb->common.bs, + acb->req.req, acb->req.buf); + bdrv_co_complete(acb); } BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs, unsigned long int req, void *buf, BlockCompletionFunc *cb, void *opaque) { - BlockDriver *drv = bs->drv; + BlockAIOCBCoroutine *acb = qemu_aio_get(&bdrv_em_co_aiocb_info, + bs, cb, opaque); + Coroutine *co; - if (drv && drv->bdrv_aio_ioctl) - return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque); - return NULL; + acb->need_bh = true; + acb->req.error = -EINPROGRESS; + acb->req.req = req; + acb->req.buf = buf; + co = qemu_coroutine_create(bdrv_co_aio_ioctl_entry); + qemu_coroutine_enter(co, acb); + + bdrv_co_maybe_schedule_bh(acb); + return &acb->common; } void *qemu_blockalign(BlockDriverState *bs, size_t size) diff --git a/block/iscsi.c b/block/iscsi.c index 080ef52345..bd1f1bfcd1 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -97,6 +97,7 @@ typedef struct IscsiAIOCB { int status; int64_t sector_num; int nb_sectors; + int ret; #ifdef __linux__ sg_io_hdr_t *ioh; #endif @@ -779,6 +780,38 @@ iscsi_aio_ioctl_cb(struct iscsi_context *iscsi, int status, iscsi_schedule_bh(acb); } +static void iscsi_ioctl_bh_completion(void *opaque) +{ + IscsiAIOCB *acb = opaque; + + qemu_bh_delete(acb->bh); + acb->common.cb(acb->common.opaque, acb->ret); + qemu_aio_unref(acb); +} + +static void iscsi_ioctl_handle_emulated(IscsiAIOCB *acb, int req, void *buf) +{ + BlockDriverState *bs = acb->common.bs; + IscsiLun *iscsilun = bs->opaque; + int ret = 0; + + switch (req) { + case SG_GET_VERSION_NUM: + *(int *)buf = 30000; + break; + case SG_GET_SCSI_ID: + ((struct sg_scsi_id *)buf)->scsi_type = iscsilun->type; + break; + default: + ret = -EINVAL; + } + assert(!acb->bh); + acb->bh = aio_bh_new(bdrv_get_aio_context(bs), + iscsi_ioctl_bh_completion, acb); + acb->ret = ret; + qemu_bh_schedule(acb->bh); +} + static BlockAIOCB *iscsi_aio_ioctl(BlockDriverState *bs, unsigned long int req, void *buf, BlockCompletionFunc *cb, void *opaque) @@ -788,8 +821,6 @@ static BlockAIOCB *iscsi_aio_ioctl(BlockDriverState *bs, struct iscsi_data data; IscsiAIOCB *acb; - assert(req == SG_IO); - acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque); acb->iscsilun = iscsilun; @@ -798,6 +829,11 @@ static BlockAIOCB *iscsi_aio_ioctl(BlockDriverState *bs, acb->buf = NULL; acb->ioh = buf; + if (req != SG_IO) { + iscsi_ioctl_handle_emulated(acb, req, buf); + return &acb->common; + } + acb->task = malloc(sizeof(struct scsi_task)); if (acb->task == NULL) { error_report("iSCSI: Failed to allocate task for scsi command. %s", @@ -862,38 +898,6 @@ static BlockAIOCB *iscsi_aio_ioctl(BlockDriverState *bs, return &acb->common; } -static void ioctl_cb(void *opaque, int status) -{ - int *p_status = opaque; - *p_status = status; -} - -static int iscsi_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) -{ - IscsiLun *iscsilun = bs->opaque; - int status; - - switch (req) { - case SG_GET_VERSION_NUM: - *(int *)buf = 30000; - break; - case SG_GET_SCSI_ID: - ((struct sg_scsi_id *)buf)->scsi_type = iscsilun->type; - break; - case SG_IO: - status = -EINPROGRESS; - iscsi_aio_ioctl(bs, req, buf, ioctl_cb, &status); - - while (status == -EINPROGRESS) { - aio_poll(iscsilun->aio_context, true); - } - - return 0; - default: - return -1; - } - return 0; -} #endif static int64_t @@ -1824,7 +1828,6 @@ static BlockDriver bdrv_iscsi = { .bdrv_co_flush_to_disk = iscsi_co_flush, #ifdef __linux__ - .bdrv_ioctl = iscsi_ioctl, .bdrv_aio_ioctl = iscsi_aio_ioctl, #endif diff --git a/block/mirror.c b/block/mirror.c index 60f1cb589d..52c9abfe14 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -742,7 +742,7 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp); if (!s->dirty_bitmap) { g_free(s->replaces); - block_job_release(bs); + block_job_unref(&s->common); return; } diff --git a/block/qapi.c b/block/qapi.c index 89d4274177..d20262decb 100644 --- a/block/qapi.c +++ b/block/qapi.c @@ -346,17 +346,68 @@ static BlockStats *bdrv_query_stats(const BlockDriverState *bs, s->stats = g_malloc0(sizeof(*s->stats)); if (bs->blk) { BlockAcctStats *stats = blk_get_stats(bs->blk); + BlockAcctTimedStats *ts = NULL; s->stats->rd_bytes = stats->nr_bytes[BLOCK_ACCT_READ]; s->stats->wr_bytes = stats->nr_bytes[BLOCK_ACCT_WRITE]; s->stats->rd_operations = stats->nr_ops[BLOCK_ACCT_READ]; s->stats->wr_operations = stats->nr_ops[BLOCK_ACCT_WRITE]; + + s->stats->failed_rd_operations = stats->failed_ops[BLOCK_ACCT_READ]; + s->stats->failed_wr_operations = stats->failed_ops[BLOCK_ACCT_WRITE]; + s->stats->failed_flush_operations = stats->failed_ops[BLOCK_ACCT_FLUSH]; + + s->stats->invalid_rd_operations = stats->invalid_ops[BLOCK_ACCT_READ]; + s->stats->invalid_wr_operations = stats->invalid_ops[BLOCK_ACCT_WRITE]; + s->stats->invalid_flush_operations = + stats->invalid_ops[BLOCK_ACCT_FLUSH]; + s->stats->rd_merged = stats->merged[BLOCK_ACCT_READ]; s->stats->wr_merged = stats->merged[BLOCK_ACCT_WRITE]; s->stats->flush_operations = stats->nr_ops[BLOCK_ACCT_FLUSH]; s->stats->wr_total_time_ns = stats->total_time_ns[BLOCK_ACCT_WRITE]; s->stats->rd_total_time_ns = stats->total_time_ns[BLOCK_ACCT_READ]; s->stats->flush_total_time_ns = stats->total_time_ns[BLOCK_ACCT_FLUSH]; + + s->stats->has_idle_time_ns = stats->last_access_time_ns > 0; + if (s->stats->has_idle_time_ns) { + s->stats->idle_time_ns = block_acct_idle_time_ns(stats); + } + + s->stats->account_invalid = stats->account_invalid; + s->stats->account_failed = stats->account_failed; + + while ((ts = block_acct_interval_next(stats, ts))) { + BlockDeviceTimedStatsList *timed_stats = + g_malloc0(sizeof(*timed_stats)); + BlockDeviceTimedStats *dev_stats = g_malloc0(sizeof(*dev_stats)); + timed_stats->next = s->stats->timed_stats; + timed_stats->value = dev_stats; + s->stats->timed_stats = timed_stats; + + TimedAverage *rd = &ts->latency[BLOCK_ACCT_READ]; + TimedAverage *wr = &ts->latency[BLOCK_ACCT_WRITE]; + TimedAverage *fl = &ts->latency[BLOCK_ACCT_FLUSH]; + + dev_stats->interval_length = ts->interval_length; + + dev_stats->min_rd_latency_ns = timed_average_min(rd); + dev_stats->max_rd_latency_ns = timed_average_max(rd); + dev_stats->avg_rd_latency_ns = timed_average_avg(rd); + + dev_stats->min_wr_latency_ns = timed_average_min(wr); + dev_stats->max_wr_latency_ns = timed_average_max(wr); + dev_stats->avg_wr_latency_ns = timed_average_avg(wr); + + dev_stats->min_flush_latency_ns = timed_average_min(fl); + dev_stats->max_flush_latency_ns = timed_average_max(fl); + dev_stats->avg_flush_latency_ns = timed_average_avg(fl); + + dev_stats->avg_rd_queue_depth = + block_acct_queue_depth(ts, BLOCK_ACCT_READ); + dev_stats->avg_wr_queue_depth = + block_acct_queue_depth(ts, BLOCK_ACCT_WRITE); + } } s->stats->wr_highest_offset = bs->wr_highest_offset; diff --git a/block/qed.c b/block/qed.c index 5ea05d4909..9b88895038 100644 --- a/block/qed.c +++ b/block/qed.c @@ -375,6 +375,18 @@ static void bdrv_qed_attach_aio_context(BlockDriverState *bs, } } +static void bdrv_qed_drain(BlockDriverState *bs) +{ + BDRVQEDState *s = bs->opaque; + + /* Cancel timer and start doing I/O that were meant to happen as if it + * fired, that way we get bdrv_drain() taking care of the ongoing requests + * correctly. */ + qed_cancel_need_check_timer(s); + qed_plug_allocating_write_reqs(s); + bdrv_aio_flush(s->bs, qed_clear_need_check, s); +} + static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { @@ -1676,6 +1688,7 @@ static BlockDriver bdrv_qed = { .bdrv_check = bdrv_qed_check, .bdrv_detach_aio_context = bdrv_qed_detach_aio_context, .bdrv_attach_aio_context = bdrv_qed_attach_aio_context, + .bdrv_drain = bdrv_qed_drain, }; static void bdrv_qed_init(void) diff --git a/block/raw-posix.c b/block/raw-posix.c index 918c756c2e..aec9ec6bbb 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -2175,12 +2175,6 @@ static int hdev_open(BlockDriverState *bs, QDict *options, int flags, } #if defined(__linux__) -static int hdev_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) -{ - BDRVRawState *s = bs->opaque; - - return ioctl(s->fd, req, buf); -} static BlockAIOCB *hdev_aio_ioctl(BlockDriverState *bs, unsigned long int req, void *buf, @@ -2338,7 +2332,6 @@ static BlockDriver bdrv_host_device = { /* generic scsi device */ #ifdef __linux__ - .bdrv_ioctl = hdev_ioctl, .bdrv_aio_ioctl = hdev_aio_ioctl, #endif }; @@ -2471,7 +2464,6 @@ static BlockDriver bdrv_host_cdrom = { .bdrv_lock_medium = cdrom_lock_medium, /* generic scsi device */ - .bdrv_ioctl = hdev_ioctl, .bdrv_aio_ioctl = hdev_aio_ioctl, }; #endif /* __linux__ */ diff --git a/block/raw_bsd.c b/block/raw_bsd.c index 0aded31c22..915d6fd0e6 100644 --- a/block/raw_bsd.c +++ b/block/raw_bsd.c @@ -169,11 +169,6 @@ static void raw_lock_medium(BlockDriverState *bs, bool locked) bdrv_lock_medium(bs->file->bs, locked); } -static int raw_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) -{ - return bdrv_ioctl(bs->file->bs, req, buf); -} - static BlockAIOCB *raw_aio_ioctl(BlockDriverState *bs, unsigned long int req, void *buf, BlockCompletionFunc *cb, @@ -262,7 +257,6 @@ BlockDriver bdrv_raw = { .bdrv_media_changed = &raw_media_changed, .bdrv_eject = &raw_eject, .bdrv_lock_medium = &raw_lock_medium, - .bdrv_ioctl = &raw_ioctl, .bdrv_aio_ioctl = &raw_aio_ioctl, .create_opts = &raw_create_opts, .bdrv_has_zero_init = &raw_has_zero_init diff --git a/blockdev.c b/blockdev.c index 8607df90a9..fc85128e94 100644 --- a/blockdev.c +++ b/blockdev.c @@ -283,32 +283,6 @@ typedef struct { BlockDriverState *bs; } BDRVPutRefBH; -static void bdrv_put_ref_bh(void *opaque) -{ - BDRVPutRefBH *s = opaque; - - bdrv_unref(s->bs); - qemu_bh_delete(s->bh); - g_free(s); -} - -/* - * Release a BDS reference in a BH - * - * It is not safe to use bdrv_unref() from a callback function when the callers - * still need the BlockDriverState. In such cases we schedule a BH to release - * the reference. - */ -static void bdrv_put_ref_bh_schedule(BlockDriverState *bs) -{ - BDRVPutRefBH *s; - - s = g_new(BDRVPutRefBH, 1); - s->bh = qemu_bh_new(bdrv_put_ref_bh, s); - s->bs = bs; - qemu_bh_schedule(s->bh); -} - static int parse_block_error_action(const char *buf, bool is_read, Error **errp) { if (!strcmp(buf, "ignore")) { @@ -467,6 +441,8 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts, const char *buf; int bdrv_flags = 0; int on_read_error, on_write_error; + bool account_invalid, account_failed; + const char *stats_intervals; BlockBackend *blk; BlockDriverState *bs; ThrottleConfig cfg; @@ -503,6 +479,11 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts, /* extract parameters */ snapshot = qemu_opt_get_bool(opts, "snapshot", 0); + account_invalid = qemu_opt_get_bool(opts, "stats-account-invalid", true); + account_failed = qemu_opt_get_bool(opts, "stats-account-failed", true); + + stats_intervals = qemu_opt_get(opts, "stats-intervals"); + extract_common_blockdev_options(opts, &bdrv_flags, &throttling_group, &cfg, &detect_zeroes, &error); if (error) { @@ -599,6 +580,37 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts, if (bdrv_key_required(bs)) { autostart = 0; } + + block_acct_init(blk_get_stats(blk), account_invalid, account_failed); + + if (stats_intervals) { + char **intervals = g_strsplit(stats_intervals, ":", 0); + unsigned i; + + if (*stats_intervals == '\0') { + error_setg(&error, "stats-intervals can't have an empty value"); + } + + for (i = 0; !error && intervals[i] != NULL; i++) { + unsigned long long val; + if (parse_uint_full(intervals[i], &val, 10) == 0 && + val > 0 && val <= UINT_MAX) { + block_acct_add_interval(blk_get_stats(blk), val); + } else { + error_setg(&error, "Invalid interval length: '%s'", + intervals[i]); + } + } + + g_strfreev(intervals); + + if (error) { + error_propagate(errp, error); + blk_unref(blk); + blk = NULL; + goto err_no_bs_opts; + } + } } blk_set_on_error(blk, on_read_error, on_write_error); @@ -1157,7 +1169,7 @@ static void blockdev_do_action(TransactionActionKind type, void *data, action.u.data = data; list.value = &action; list.next = NULL; - qmp_transaction(&list, errp); + qmp_transaction(&list, false, NULL, errp); } void qmp_blockdev_snapshot_sync(bool has_device, const char *device, @@ -1359,44 +1371,75 @@ static BdrvDirtyBitmap *block_dirty_bitmap_lookup(const char *node, /* New and old BlockDriverState structs for atomic group operations */ -typedef struct BlkTransactionState BlkTransactionState; +typedef struct BlkActionState BlkActionState; -/* Only prepare() may fail. In a single transaction, only one of commit() or - abort() will be called, clean() will always be called if it present. */ -typedef struct BdrvActionOps { - /* Size of state struct, in bytes. */ +/** + * BlkActionOps: + * Table of operations that define an Action. + * + * @instance_size: Size of state struct, in bytes. + * @prepare: Prepare the work, must NOT be NULL. + * @commit: Commit the changes, can be NULL. + * @abort: Abort the changes on fail, can be NULL. + * @clean: Clean up resources after all transaction actions have called + * commit() or abort(). Can be NULL. + * + * Only prepare() may fail. In a single transaction, only one of commit() or + * abort() will be called. clean() will always be called if it is present. + */ +typedef struct BlkActionOps { size_t instance_size; - /* Prepare the work, must NOT be NULL. */ - void (*prepare)(BlkTransactionState *common, Error **errp); - /* Commit the changes, can be NULL. */ - void (*commit)(BlkTransactionState *common); - /* Abort the changes on fail, can be NULL. */ - void (*abort)(BlkTransactionState *common); - /* Clean up resource in the end, can be NULL. */ - void (*clean)(BlkTransactionState *common); -} BdrvActionOps; + void (*prepare)(BlkActionState *common, Error **errp); + void (*commit)(BlkActionState *common); + void (*abort)(BlkActionState *common); + void (*clean)(BlkActionState *common); +} BlkActionOps; -/* - * This structure must be arranged as first member in child type, assuming - * that compiler will also arrange it to the same address with parent instance. - * Later it will be used in free(). +/** + * BlkActionState: + * Describes one Action's state within a Transaction. + * + * @action: QAPI-defined enum identifying which Action to perform. + * @ops: Table of ActionOps this Action can perform. + * @block_job_txn: Transaction which this action belongs to. + * @entry: List membership for all Actions in this Transaction. + * + * This structure must be arranged as first member in a subclassed type, + * assuming that the compiler will also arrange it to the same offsets as the + * base class. */ -struct BlkTransactionState { +struct BlkActionState { TransactionAction *action; - const BdrvActionOps *ops; - QSIMPLEQ_ENTRY(BlkTransactionState) entry; + const BlkActionOps *ops; + BlockJobTxn *block_job_txn; + TransactionProperties *txn_props; + QSIMPLEQ_ENTRY(BlkActionState) entry; }; /* internal snapshot private data */ typedef struct InternalSnapshotState { - BlkTransactionState common; + BlkActionState common; BlockDriverState *bs; AioContext *aio_context; QEMUSnapshotInfo sn; bool created; } InternalSnapshotState; -static void internal_snapshot_prepare(BlkTransactionState *common, + +static int action_check_completion_mode(BlkActionState *s, Error **errp) +{ + if (s->txn_props->completion_mode != ACTION_COMPLETION_MODE_INDIVIDUAL) { + error_setg(errp, + "Action '%s' does not support Transaction property " + "completion-mode = %s", + TransactionActionKind_lookup[s->action->type], + ActionCompletionMode_lookup[s->txn_props->completion_mode]); + return -1; + } + return 0; +} + +static void internal_snapshot_prepare(BlkActionState *common, Error **errp) { Error *local_err = NULL; @@ -1421,6 +1464,10 @@ static void internal_snapshot_prepare(BlkTransactionState *common, name = internal->name; /* 2. check for validation */ + if (action_check_completion_mode(common, errp) < 0) { + return; + } + blk = blk_by_name(device); if (!blk) { error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND, @@ -1495,7 +1542,7 @@ static void internal_snapshot_prepare(BlkTransactionState *common, state->created = true; } -static void internal_snapshot_abort(BlkTransactionState *common) +static void internal_snapshot_abort(BlkActionState *common) { InternalSnapshotState *state = DO_UPCAST(InternalSnapshotState, common, common); @@ -1518,7 +1565,7 @@ static void internal_snapshot_abort(BlkTransactionState *common) } } -static void internal_snapshot_clean(BlkTransactionState *common) +static void internal_snapshot_clean(BlkActionState *common) { InternalSnapshotState *state = DO_UPCAST(InternalSnapshotState, common, common); @@ -1533,13 +1580,13 @@ static void internal_snapshot_clean(BlkTransactionState *common) /* external snapshot private data */ typedef struct ExternalSnapshotState { - BlkTransactionState common; + BlkActionState common; BlockDriverState *old_bs; BlockDriverState *new_bs; AioContext *aio_context; } ExternalSnapshotState; -static void external_snapshot_prepare(BlkTransactionState *common, +static void external_snapshot_prepare(BlkActionState *common, Error **errp) { int flags = 0, ret; @@ -1582,6 +1629,10 @@ static void external_snapshot_prepare(BlkTransactionState *common, } /* start processing */ + if (action_check_completion_mode(common, errp) < 0) { + return; + } + state->old_bs = bdrv_lookup_bs(device, node_name, errp); if (!state->old_bs) { return; @@ -1686,7 +1737,7 @@ static void external_snapshot_prepare(BlkTransactionState *common, } } -static void external_snapshot_commit(BlkTransactionState *common) +static void external_snapshot_commit(BlkActionState *common) { ExternalSnapshotState *state = DO_UPCAST(ExternalSnapshotState, common, common); @@ -1702,7 +1753,7 @@ static void external_snapshot_commit(BlkTransactionState *common) NULL); } -static void external_snapshot_abort(BlkTransactionState *common) +static void external_snapshot_abort(BlkActionState *common) { ExternalSnapshotState *state = DO_UPCAST(ExternalSnapshotState, common, common); @@ -1711,7 +1762,7 @@ static void external_snapshot_abort(BlkTransactionState *common) } } -static void external_snapshot_clean(BlkTransactionState *common) +static void external_snapshot_clean(BlkActionState *common) { ExternalSnapshotState *state = DO_UPCAST(ExternalSnapshotState, common, common); @@ -1722,13 +1773,25 @@ static void external_snapshot_clean(BlkTransactionState *common) } typedef struct DriveBackupState { - BlkTransactionState common; + BlkActionState common; BlockDriverState *bs; AioContext *aio_context; BlockJob *job; } DriveBackupState; -static void drive_backup_prepare(BlkTransactionState *common, Error **errp) +static void do_drive_backup(const char *device, const char *target, + bool has_format, const char *format, + enum MirrorSyncMode sync, + bool has_mode, enum NewImageMode mode, + bool has_speed, int64_t speed, + bool has_bitmap, const char *bitmap, + bool has_on_source_error, + BlockdevOnError on_source_error, + bool has_on_target_error, + BlockdevOnError on_target_error, + BlockJobTxn *txn, Error **errp); + +static void drive_backup_prepare(BlkActionState *common, Error **errp) { DriveBackupState *state = DO_UPCAST(DriveBackupState, common, common); BlockBackend *blk; @@ -1756,15 +1819,15 @@ static void drive_backup_prepare(BlkTransactionState *common, Error **errp) bdrv_drained_begin(blk_bs(blk)); state->bs = blk_bs(blk); - qmp_drive_backup(backup->device, backup->target, - backup->has_format, backup->format, - backup->sync, - backup->has_mode, backup->mode, - backup->has_speed, backup->speed, - backup->has_bitmap, backup->bitmap, - backup->has_on_source_error, backup->on_source_error, - backup->has_on_target_error, backup->on_target_error, - &local_err); + do_drive_backup(backup->device, backup->target, + backup->has_format, backup->format, + backup->sync, + backup->has_mode, backup->mode, + backup->has_speed, backup->speed, + backup->has_bitmap, backup->bitmap, + backup->has_on_source_error, backup->on_source_error, + backup->has_on_target_error, backup->on_target_error, + common->block_job_txn, &local_err); if (local_err) { error_propagate(errp, local_err); return; @@ -1773,7 +1836,7 @@ static void drive_backup_prepare(BlkTransactionState *common, Error **errp) state->job = state->bs->job; } -static void drive_backup_abort(BlkTransactionState *common) +static void drive_backup_abort(BlkActionState *common) { DriveBackupState *state = DO_UPCAST(DriveBackupState, common, common); BlockDriverState *bs = state->bs; @@ -1784,7 +1847,7 @@ static void drive_backup_abort(BlkTransactionState *common) } } -static void drive_backup_clean(BlkTransactionState *common) +static void drive_backup_clean(BlkActionState *common) { DriveBackupState *state = DO_UPCAST(DriveBackupState, common, common); @@ -1795,13 +1858,22 @@ static void drive_backup_clean(BlkTransactionState *common) } typedef struct BlockdevBackupState { - BlkTransactionState common; + BlkActionState common; BlockDriverState *bs; BlockJob *job; AioContext *aio_context; } BlockdevBackupState; -static void blockdev_backup_prepare(BlkTransactionState *common, Error **errp) +static void do_blockdev_backup(const char *device, const char *target, + enum MirrorSyncMode sync, + bool has_speed, int64_t speed, + bool has_on_source_error, + BlockdevOnError on_source_error, + bool has_on_target_error, + BlockdevOnError on_target_error, + BlockJobTxn *txn, Error **errp); + +static void blockdev_backup_prepare(BlkActionState *common, Error **errp) { BlockdevBackupState *state = DO_UPCAST(BlockdevBackupState, common, common); BlockdevBackup *backup; @@ -1839,12 +1911,12 @@ static void blockdev_backup_prepare(BlkTransactionState *common, Error **errp) state->bs = blk_bs(blk); bdrv_drained_begin(state->bs); - qmp_blockdev_backup(backup->device, backup->target, - backup->sync, - backup->has_speed, backup->speed, - backup->has_on_source_error, backup->on_source_error, - backup->has_on_target_error, backup->on_target_error, - &local_err); + do_blockdev_backup(backup->device, backup->target, + backup->sync, + backup->has_speed, backup->speed, + backup->has_on_source_error, backup->on_source_error, + backup->has_on_target_error, backup->on_target_error, + common->block_job_txn, &local_err); if (local_err) { error_propagate(errp, local_err); return; @@ -1853,7 +1925,7 @@ static void blockdev_backup_prepare(BlkTransactionState *common, Error **errp) state->job = state->bs->job; } -static void blockdev_backup_abort(BlkTransactionState *common) +static void blockdev_backup_abort(BlkActionState *common) { BlockdevBackupState *state = DO_UPCAST(BlockdevBackupState, common, common); BlockDriverState *bs = state->bs; @@ -1864,7 +1936,7 @@ static void blockdev_backup_abort(BlkTransactionState *common) } } -static void blockdev_backup_clean(BlkTransactionState *common) +static void blockdev_backup_clean(BlkActionState *common) { BlockdevBackupState *state = DO_UPCAST(BlockdevBackupState, common, common); @@ -1874,17 +1946,125 @@ static void blockdev_backup_clean(BlkTransactionState *common) } } -static void abort_prepare(BlkTransactionState *common, Error **errp) +typedef struct BlockDirtyBitmapState { + BlkActionState common; + BdrvDirtyBitmap *bitmap; + BlockDriverState *bs; + AioContext *aio_context; + HBitmap *backup; + bool prepared; +} BlockDirtyBitmapState; + +static void block_dirty_bitmap_add_prepare(BlkActionState *common, + Error **errp) +{ + Error *local_err = NULL; + BlockDirtyBitmapAdd *action; + BlockDirtyBitmapState *state = DO_UPCAST(BlockDirtyBitmapState, + common, common); + + if (action_check_completion_mode(common, errp) < 0) { + return; + } + + action = common->action->u.block_dirty_bitmap_add; + /* AIO context taken and released within qmp_block_dirty_bitmap_add */ + qmp_block_dirty_bitmap_add(action->node, action->name, + action->has_granularity, action->granularity, + &local_err); + + if (!local_err) { + state->prepared = true; + } else { + error_propagate(errp, local_err); + } +} + +static void block_dirty_bitmap_add_abort(BlkActionState *common) +{ + BlockDirtyBitmapAdd *action; + BlockDirtyBitmapState *state = DO_UPCAST(BlockDirtyBitmapState, + common, common); + + action = common->action->u.block_dirty_bitmap_add; + /* Should not be able to fail: IF the bitmap was added via .prepare(), + * then the node reference and bitmap name must have been valid. + */ + if (state->prepared) { + qmp_block_dirty_bitmap_remove(action->node, action->name, &error_abort); + } +} + +static void block_dirty_bitmap_clear_prepare(BlkActionState *common, + Error **errp) +{ + BlockDirtyBitmapState *state = DO_UPCAST(BlockDirtyBitmapState, + common, common); + BlockDirtyBitmap *action; + + if (action_check_completion_mode(common, errp) < 0) { + return; + } + + action = common->action->u.block_dirty_bitmap_clear; + state->bitmap = block_dirty_bitmap_lookup(action->node, + action->name, + &state->bs, + &state->aio_context, + errp); + if (!state->bitmap) { + return; + } + + if (bdrv_dirty_bitmap_frozen(state->bitmap)) { + error_setg(errp, "Cannot modify a frozen bitmap"); + return; + } else if (!bdrv_dirty_bitmap_enabled(state->bitmap)) { + error_setg(errp, "Cannot clear a disabled bitmap"); + return; + } + + bdrv_clear_dirty_bitmap(state->bitmap, &state->backup); + /* AioContext is released in .clean() */ +} + +static void block_dirty_bitmap_clear_abort(BlkActionState *common) +{ + BlockDirtyBitmapState *state = DO_UPCAST(BlockDirtyBitmapState, + common, common); + + bdrv_undo_clear_dirty_bitmap(state->bitmap, state->backup); +} + +static void block_dirty_bitmap_clear_commit(BlkActionState *common) +{ + BlockDirtyBitmapState *state = DO_UPCAST(BlockDirtyBitmapState, + common, common); + + hbitmap_free(state->backup); +} + +static void block_dirty_bitmap_clear_clean(BlkActionState *common) +{ + BlockDirtyBitmapState *state = DO_UPCAST(BlockDirtyBitmapState, + common, common); + + if (state->aio_context) { + aio_context_release(state->aio_context); + } +} + +static void abort_prepare(BlkActionState *common, Error **errp) { error_setg(errp, "Transaction aborted using Abort action"); } -static void abort_commit(BlkTransactionState *common) +static void abort_commit(BlkActionState *common) { g_assert_not_reached(); /* this action never succeeds */ } -static const BdrvActionOps actions[] = { +static const BlkActionOps actions[] = { [TRANSACTION_ACTION_KIND_BLOCKDEV_SNAPSHOT] = { .instance_size = sizeof(ExternalSnapshotState), .prepare = external_snapshot_prepare, @@ -1911,7 +2091,7 @@ static const BdrvActionOps actions[] = { .clean = blockdev_backup_clean, }, [TRANSACTION_ACTION_KIND_ABORT] = { - .instance_size = sizeof(BlkTransactionState), + .instance_size = sizeof(BlkActionState), .prepare = abort_prepare, .commit = abort_commit, }, @@ -1921,28 +2101,71 @@ static const BdrvActionOps actions[] = { .abort = internal_snapshot_abort, .clean = internal_snapshot_clean, }, + [TRANSACTION_ACTION_KIND_BLOCK_DIRTY_BITMAP_ADD] = { + .instance_size = sizeof(BlockDirtyBitmapState), + .prepare = block_dirty_bitmap_add_prepare, + .abort = block_dirty_bitmap_add_abort, + }, + [TRANSACTION_ACTION_KIND_BLOCK_DIRTY_BITMAP_CLEAR] = { + .instance_size = sizeof(BlockDirtyBitmapState), + .prepare = block_dirty_bitmap_clear_prepare, + .commit = block_dirty_bitmap_clear_commit, + .abort = block_dirty_bitmap_clear_abort, + .clean = block_dirty_bitmap_clear_clean, + } }; +/** + * Allocate a TransactionProperties structure if necessary, and fill + * that structure with desired defaults if they are unset. + */ +static TransactionProperties *get_transaction_properties( + TransactionProperties *props) +{ + if (!props) { + props = g_new0(TransactionProperties, 1); + } + + if (!props->has_completion_mode) { + props->has_completion_mode = true; + props->completion_mode = ACTION_COMPLETION_MODE_INDIVIDUAL; + } + + return props; +} + /* * 'Atomic' group operations. The operations are performed as a set, and if * any fail then we roll back all operations in the group. */ -void qmp_transaction(TransactionActionList *dev_list, Error **errp) +void qmp_transaction(TransactionActionList *dev_list, + bool has_props, + struct TransactionProperties *props, + Error **errp) { TransactionActionList *dev_entry = dev_list; - BlkTransactionState *state, *next; + BlockJobTxn *block_job_txn = NULL; + BlkActionState *state, *next; Error *local_err = NULL; - QSIMPLEQ_HEAD(snap_bdrv_states, BlkTransactionState) snap_bdrv_states; + QSIMPLEQ_HEAD(snap_bdrv_states, BlkActionState) snap_bdrv_states; QSIMPLEQ_INIT(&snap_bdrv_states); + /* Does this transaction get canceled as a group on failure? + * If not, we don't really need to make a BlockJobTxn. + */ + props = get_transaction_properties(props); + if (props->completion_mode != ACTION_COMPLETION_MODE_INDIVIDUAL) { + block_job_txn = block_job_txn_new(); + } + /* drain all i/o before any operations */ bdrv_drain_all(); /* We don't do anything in this loop that commits us to the operations */ while (NULL != dev_entry) { TransactionAction *dev_info = NULL; - const BdrvActionOps *ops; + const BlkActionOps *ops; dev_info = dev_entry->value; dev_entry = dev_entry->next; @@ -1955,6 +2178,8 @@ void qmp_transaction(TransactionActionList *dev_list, Error **errp) state = g_malloc0(ops->instance_size); state->ops = ops; state->action = dev_info; + state->block_job_txn = block_job_txn; + state->txn_props = props; QSIMPLEQ_INSERT_TAIL(&snap_bdrv_states, state, entry); state->ops->prepare(state, &local_err); @@ -1987,6 +2212,10 @@ exit: } g_free(state); } + if (!has_props) { + qapi_free_TransactionProperties(props); + } + block_job_txn_unref(block_job_txn); } void qmp_eject(const char *device, bool has_force, bool force, Error **errp) @@ -2472,7 +2701,7 @@ void qmp_block_dirty_bitmap_clear(const char *node, const char *name, goto out; } - bdrv_clear_dirty_bitmap(bitmap); + bdrv_clear_dirty_bitmap(bitmap, NULL); out: aio_context_release(aio_context); @@ -2615,8 +2844,6 @@ static void block_job_cb(void *opaque, int ret) } else { block_job_event_completed(bs->job, msg); } - - bdrv_put_ref_bh_schedule(bs); } void qmp_block_stream(const char *device, @@ -2797,15 +3024,17 @@ out: aio_context_release(aio_context); } -void qmp_drive_backup(const char *device, const char *target, - bool has_format, const char *format, - enum MirrorSyncMode sync, - bool has_mode, enum NewImageMode mode, - bool has_speed, int64_t speed, - bool has_bitmap, const char *bitmap, - bool has_on_source_error, BlockdevOnError on_source_error, - bool has_on_target_error, BlockdevOnError on_target_error, - Error **errp) +static void do_drive_backup(const char *device, const char *target, + bool has_format, const char *format, + enum MirrorSyncMode sync, + bool has_mode, enum NewImageMode mode, + bool has_speed, int64_t speed, + bool has_bitmap, const char *bitmap, + bool has_on_source_error, + BlockdevOnError on_source_error, + bool has_on_target_error, + BlockdevOnError on_target_error, + BlockJobTxn *txn, Error **errp) { BlockBackend *blk; BlockDriverState *bs; @@ -2920,7 +3149,7 @@ void qmp_drive_backup(const char *device, const char *target, backup_start(bs, target_bs, speed, sync, bmap, on_source_error, on_target_error, - block_job_cb, bs, &local_err); + block_job_cb, bs, txn, &local_err); if (local_err != NULL) { bdrv_unref(target_bs); error_propagate(errp, local_err); @@ -2931,19 +3160,37 @@ out: aio_context_release(aio_context); } +void qmp_drive_backup(const char *device, const char *target, + bool has_format, const char *format, + enum MirrorSyncMode sync, + bool has_mode, enum NewImageMode mode, + bool has_speed, int64_t speed, + bool has_bitmap, const char *bitmap, + bool has_on_source_error, BlockdevOnError on_source_error, + bool has_on_target_error, BlockdevOnError on_target_error, + Error **errp) +{ + return do_drive_backup(device, target, has_format, format, sync, + has_mode, mode, has_speed, speed, + has_bitmap, bitmap, + has_on_source_error, on_source_error, + has_on_target_error, on_target_error, + NULL, errp); +} + BlockDeviceInfoList *qmp_query_named_block_nodes(Error **errp) { return bdrv_named_nodes_list(errp); } -void qmp_blockdev_backup(const char *device, const char *target, +void do_blockdev_backup(const char *device, const char *target, enum MirrorSyncMode sync, bool has_speed, int64_t speed, bool has_on_source_error, BlockdevOnError on_source_error, bool has_on_target_error, BlockdevOnError on_target_error, - Error **errp) + BlockJobTxn *txn, Error **errp) { BlockBackend *blk, *target_blk; BlockDriverState *bs; @@ -2991,7 +3238,7 @@ void qmp_blockdev_backup(const char *device, const char *target, bdrv_ref(target_bs); bdrv_set_aio_context(target_bs, aio_context); backup_start(bs, target_bs, speed, sync, NULL, on_source_error, - on_target_error, block_job_cb, bs, &local_err); + on_target_error, block_job_cb, bs, txn, &local_err); if (local_err != NULL) { bdrv_unref(target_bs); error_propagate(errp, local_err); @@ -3000,6 +3247,21 @@ out: aio_context_release(aio_context); } +void qmp_blockdev_backup(const char *device, const char *target, + enum MirrorSyncMode sync, + bool has_speed, int64_t speed, + bool has_on_source_error, + BlockdevOnError on_source_error, + bool has_on_target_error, + BlockdevOnError on_target_error, + Error **errp) +{ + do_blockdev_backup(device, target, sync, has_speed, speed, + has_on_source_error, on_source_error, + has_on_target_error, on_target_error, + NULL, errp); +} + void qmp_drive_mirror(const char *device, const char *target, bool has_format, const char *format, bool has_node_name, const char *node_name, @@ -3676,6 +3938,21 @@ QemuOptsList qemu_common_drive_opts = { .name = "detect-zeroes", .type = QEMU_OPT_STRING, .help = "try to optimize zero writes (off, on, unmap)", + },{ + .name = "stats-account-invalid", + .type = QEMU_OPT_BOOL, + .help = "whether to account for invalid I/O operations " + "in the statistics", + },{ + .name = "stats-account-failed", + .type = QEMU_OPT_BOOL, + .help = "whether to account for failed I/O operations " + "in the statistics", + },{ + .name = "stats-intervals", + .type = QEMU_OPT_STRING, + .help = "colon-separated list of intervals " + "for collecting I/O statistics, in seconds", }, { /* end of list */ } }, diff --git a/blockjob.c b/blockjob.c index c02fe598b8..80adb9d52a 100644 --- a/blockjob.c +++ b/blockjob.c @@ -37,6 +37,19 @@ #include "qemu/timer.h" #include "qapi-event.h" +/* Transactional group of block jobs */ +struct BlockJobTxn { + + /* Is this txn being cancelled? */ + bool aborting; + + /* List of jobs */ + QLIST_HEAD(, BlockJob) jobs; + + /* Reference count */ + int refcnt; +}; + void *block_job_create(const BlockJobDriver *driver, BlockDriverState *bs, int64_t speed, BlockCompletionFunc *cb, void *opaque, Error **errp) @@ -60,6 +73,7 @@ void *block_job_create(const BlockJobDriver *driver, BlockDriverState *bs, job->cb = cb; job->opaque = opaque; job->busy = true; + job->refcnt = 1; bs->job = job; /* Only set speed when necessary to avoid NotSupported error */ @@ -68,7 +82,7 @@ void *block_job_create(const BlockJobDriver *driver, BlockDriverState *bs, block_job_set_speed(job, speed, &local_err); if (local_err) { - block_job_release(bs); + block_job_unref(job); error_propagate(errp, local_err); return NULL; } @@ -76,15 +90,101 @@ void *block_job_create(const BlockJobDriver *driver, BlockDriverState *bs, return job; } -void block_job_release(BlockDriverState *bs) +void block_job_ref(BlockJob *job) { - BlockJob *job = bs->job; + ++job->refcnt; +} - bs->job = NULL; - bdrv_op_unblock_all(bs, job->blocker); - error_free(job->blocker); - g_free(job->id); - g_free(job); +void block_job_unref(BlockJob *job) +{ + if (--job->refcnt == 0) { + job->bs->job = NULL; + bdrv_op_unblock_all(job->bs, job->blocker); + bdrv_unref(job->bs); + error_free(job->blocker); + g_free(job->id); + g_free(job); + } +} + +static void block_job_completed_single(BlockJob *job) +{ + if (!job->ret) { + if (job->driver->commit) { + job->driver->commit(job); + } + } else { + if (job->driver->abort) { + job->driver->abort(job); + } + } + job->cb(job->opaque, job->ret); + if (job->txn) { + block_job_txn_unref(job->txn); + } + block_job_unref(job); +} + +static void block_job_completed_txn_abort(BlockJob *job) +{ + AioContext *ctx; + BlockJobTxn *txn = job->txn; + BlockJob *other_job, *next; + + if (txn->aborting) { + /* + * We are cancelled by another job, which will handle everything. + */ + return; + } + txn->aborting = true; + /* We are the first failed job. Cancel other jobs. */ + QLIST_FOREACH(other_job, &txn->jobs, txn_list) { + ctx = bdrv_get_aio_context(other_job->bs); + aio_context_acquire(ctx); + } + QLIST_FOREACH(other_job, &txn->jobs, txn_list) { + if (other_job == job || other_job->completed) { + /* Other jobs are "effectively" cancelled by us, set the status for + * them; this job, however, may or may not be cancelled, depending + * on the caller, so leave it. */ + if (other_job != job) { + other_job->cancelled = true; + } + continue; + } + block_job_cancel_sync(other_job); + assert(other_job->completed); + } + QLIST_FOREACH_SAFE(other_job, &txn->jobs, txn_list, next) { + ctx = bdrv_get_aio_context(other_job->bs); + block_job_completed_single(other_job); + aio_context_release(ctx); + } +} + +static void block_job_completed_txn_success(BlockJob *job) +{ + AioContext *ctx; + BlockJobTxn *txn = job->txn; + BlockJob *other_job, *next; + /* + * Successful completion, see if there are other running jobs in this + * txn. + */ + QLIST_FOREACH(other_job, &txn->jobs, txn_list) { + if (!other_job->completed) { + return; + } + } + /* We are the last completed job, commit the transaction. */ + QLIST_FOREACH_SAFE(other_job, &txn->jobs, txn_list, next) { + ctx = bdrv_get_aio_context(other_job->bs); + aio_context_acquire(ctx); + assert(other_job->ret == 0); + block_job_completed_single(other_job); + aio_context_release(ctx); + } } void block_job_completed(BlockJob *job, int ret) @@ -92,8 +192,16 @@ void block_job_completed(BlockJob *job, int ret) BlockDriverState *bs = job->bs; assert(bs->job == job); - job->cb(job->opaque, ret); - block_job_release(bs); + assert(!job->completed); + job->completed = true; + job->ret = ret; + if (!job->txn) { + block_job_completed_single(job); + } else if (ret < 0 || block_job_is_cancelled(job)) { + block_job_completed_txn_abort(job); + } else { + block_job_completed_txn_success(job); + } } void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp) @@ -178,43 +286,29 @@ struct BlockFinishData { int ret; }; -static void block_job_finish_cb(void *opaque, int ret) -{ - struct BlockFinishData *data = opaque; - - data->cancelled = block_job_is_cancelled(data->job); - data->ret = ret; - data->cb(data->opaque, ret); -} - static int block_job_finish_sync(BlockJob *job, void (*finish)(BlockJob *, Error **errp), Error **errp) { - struct BlockFinishData data; BlockDriverState *bs = job->bs; Error *local_err = NULL; + int ret; assert(bs->job == job); - /* Set up our own callback to store the result and chain to - * the original callback. - */ - data.job = job; - data.cb = job->cb; - data.opaque = job->opaque; - data.ret = -EINPROGRESS; - job->cb = block_job_finish_cb; - job->opaque = &data; + block_job_ref(job); finish(job, &local_err); if (local_err) { error_propagate(errp, local_err); + block_job_unref(job); return -EBUSY; } - while (data.ret == -EINPROGRESS) { + while (!job->completed) { aio_poll(bdrv_get_aio_context(bs), true); } - return (data.cancelled && data.ret == 0) ? -ECANCELED : data.ret; + ret = (job->cancelled && job->ret == 0) ? -ECANCELED : job->ret; + block_job_unref(job); + return ret; } /* A wrapper around block_job_cancel() taking an Error ** parameter so it may be @@ -406,3 +500,36 @@ void block_job_defer_to_main_loop(BlockJob *job, qemu_bh_schedule(data->bh); } + +BlockJobTxn *block_job_txn_new(void) +{ + BlockJobTxn *txn = g_new0(BlockJobTxn, 1); + QLIST_INIT(&txn->jobs); + txn->refcnt = 1; + return txn; +} + +static void block_job_txn_ref(BlockJobTxn *txn) +{ + txn->refcnt++; +} + +void block_job_txn_unref(BlockJobTxn *txn) +{ + if (txn && --txn->refcnt == 0) { + g_free(txn); + } +} + +void block_job_txn_add_job(BlockJobTxn *txn, BlockJob *job) +{ + if (!txn) { + return; + } + + assert(!job->txn); + job->txn = txn; + + QLIST_INSERT_HEAD(&txn->jobs, job, txn_list); + block_job_txn_ref(txn); +} diff --git a/docs/bitmaps.md b/docs/bitmaps.md index fa87f077fe..9fd8ea65ea 100644 --- a/docs/bitmaps.md +++ b/docs/bitmaps.md @@ -97,11 +97,7 @@ which is included at the end of this document. } ``` -## Transactions (Not yet implemented) - -* Transactional commands are forthcoming in a future version, - and are not yet available for use. This section serves as - documentation of intent for their design and usage. +## Transactions ### Justification @@ -522,6 +522,7 @@ void hmp_info_blockstats(Monitor *mon, const QDict *qdict) " flush_total_time_ns=%" PRId64 " rd_merged=%" PRId64 " wr_merged=%" PRId64 + " idle_time_ns=%" PRId64 "\n", stats->value->stats->rd_bytes, stats->value->stats->wr_bytes, @@ -532,7 +533,8 @@ void hmp_info_blockstats(Monitor *mon, const QDict *qdict) stats->value->stats->rd_total_time_ns, stats->value->stats->flush_total_time_ns, stats->value->stats->rd_merged, - stats->value->stats->wr_merged); + stats->value->stats->wr_merged, + stats->value->stats->idle_time_ns); } qapi_free_BlockStatsList(stats_list); diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 5da41b23cf..169e4fa7a5 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -201,10 +201,11 @@ static void nvme_rw_cb(void *opaque, int ret) NvmeCtrl *n = sq->ctrl; NvmeCQueue *cq = n->cq[sq->cqid]; - block_acct_done(blk_get_stats(n->conf.blk), &req->acct); if (!ret) { + block_acct_done(blk_get_stats(n->conf.blk), &req->acct); req->status = NVME_SUCCESS; } else { + block_acct_failed(blk_get_stats(n->conf.blk), &req->acct); req->status = NVME_INTERNAL_DEV_ERROR; } if (req->has_sg) { @@ -238,18 +239,22 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd, uint64_t data_size = (uint64_t)nlb << data_shift; uint64_t aio_slba = slba << (data_shift - BDRV_SECTOR_BITS); int is_write = rw->opcode == NVME_CMD_WRITE ? 1 : 0; + enum BlockAcctType acct = is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ; if ((slba + nlb) > ns->id_ns.nsze) { + block_acct_invalid(blk_get_stats(n->conf.blk), acct); return NVME_LBA_RANGE | NVME_DNR; } + if (nvme_map_prp(&req->qsg, prp1, prp2, data_size, n)) { + block_acct_invalid(blk_get_stats(n->conf.blk), acct); return NVME_INVALID_FIELD | NVME_DNR; } + assert((nlb << data_shift) == req->qsg.size); req->has_sg = true; - dma_acct_start(n->conf.blk, &req->acct, &req->qsg, - is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ); + dma_acct_start(n->conf.blk, &req->acct, &req->qsg, acct); req->aiocb = is_write ? dma_blk_write(n->conf.blk, &req->qsg, aio_slba, nvme_rw_cb, req) : dma_blk_read(n->conf.blk, &req->qsg, aio_slba, nvme_rw_cb, req); diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c index 093e475dc9..e70fccf80c 100644 --- a/hw/block/virtio-blk.c +++ b/hw/block/virtio-blk.c @@ -76,7 +76,7 @@ static int virtio_blk_handle_rw_error(VirtIOBlockReq *req, int error, s->rq = req; } else if (action == BLOCK_ERROR_ACTION_REPORT) { virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR); - block_acct_done(blk_get_stats(s->blk), &req->acct); + block_acct_failed(blk_get_stats(s->blk), &req->acct); virtio_blk_free_request(req); } @@ -536,6 +536,8 @@ void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb) if (!virtio_blk_sect_range_ok(req->dev, req->sector_num, req->qiov.size)) { virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR); + block_acct_invalid(blk_get_stats(req->dev->blk), + is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ); virtio_blk_free_request(req); return; } diff --git a/hw/block/xen_disk.c b/hw/block/xen_disk.c index 1bbc111939..02eda6efbc 100644 --- a/hw/block/xen_disk.c +++ b/hw/block/xen_disk.c @@ -537,7 +537,11 @@ static void qemu_aio_complete(void *opaque, int ret) break; } case BLKIF_OP_READ: - block_acct_done(blk_get_stats(ioreq->blkdev->blk), &ioreq->acct); + if (ioreq->status == BLKIF_RSP_OKAY) { + block_acct_done(blk_get_stats(ioreq->blkdev->blk), &ioreq->acct); + } else { + block_acct_failed(blk_get_stats(ioreq->blkdev->blk), &ioreq->acct); + } break; case BLKIF_OP_DISCARD: default: @@ -576,7 +580,9 @@ static int ioreq_runio_qemu_aio(struct ioreq *ioreq) } block_acct_start(blk_get_stats(blkdev->blk), &ioreq->acct, - ioreq->v.size, BLOCK_ACCT_WRITE); + ioreq->v.size, + ioreq->req.operation == BLKIF_OP_WRITE ? + BLOCK_ACCT_WRITE : BLOCK_ACCT_FLUSH); ioreq->aio_inflight++; blk_aio_writev(blkdev->blk, ioreq->start / BLOCK_SIZE, &ioreq->v, ioreq->v.size / BLOCK_SIZE, @@ -720,6 +726,23 @@ static void blk_handle_requests(struct XenBlkDev *blkdev) /* parse them */ if (ioreq_parse(ioreq) != 0) { + + switch (ioreq->req.operation) { + case BLKIF_OP_READ: + block_acct_invalid(blk_get_stats(blkdev->blk), + BLOCK_ACCT_READ); + break; + case BLKIF_OP_WRITE: + block_acct_invalid(blk_get_stats(blkdev->blk), + BLOCK_ACCT_WRITE); + break; + case BLKIF_OP_FLUSH_DISKCACHE: + block_acct_invalid(blk_get_stats(blkdev->blk), + BLOCK_ACCT_FLUSH); + default: + break; + }; + if (blk_send_response_one(ioreq)) { xen_be_send_notify(&blkdev->xendev); } diff --git a/hw/ide/atapi.c b/hw/ide/atapi.c index 747f46611e..cf0b78e3a4 100644 --- a/hw/ide/atapi.c +++ b/hw/ide/atapi.c @@ -108,27 +108,30 @@ static void cd_data_to_raw(uint8_t *buf, int lba) static int cd_read_sector(IDEState *s, int lba, uint8_t *buf, int sector_size) { int ret; + block_acct_start(blk_get_stats(s->blk), &s->acct, + 4 * BDRV_SECTOR_SIZE, BLOCK_ACCT_READ); switch(sector_size) { case 2048: - block_acct_start(blk_get_stats(s->blk), &s->acct, - 4 * BDRV_SECTOR_SIZE, BLOCK_ACCT_READ); ret = blk_read(s->blk, (int64_t)lba << 2, buf, 4); - block_acct_done(blk_get_stats(s->blk), &s->acct); break; case 2352: - block_acct_start(blk_get_stats(s->blk), &s->acct, - 4 * BDRV_SECTOR_SIZE, BLOCK_ACCT_READ); ret = blk_read(s->blk, (int64_t)lba << 2, buf + 16, 4); - block_acct_done(blk_get_stats(s->blk), &s->acct); - if (ret < 0) - return ret; - cd_data_to_raw(buf, lba); + if (ret >= 0) { + cd_data_to_raw(buf, lba); + } break; default: - ret = -EIO; - break; + block_acct_invalid(blk_get_stats(s->blk), BLOCK_ACCT_READ); + return -EIO; + } + + if (ret < 0) { + block_acct_failed(blk_get_stats(s->blk), &s->acct); + } else { + block_acct_done(blk_get_stats(s->blk), &s->acct); } + return ret; } @@ -357,7 +360,11 @@ static void ide_atapi_cmd_read_dma_cb(void *opaque, int ret) return; eot: - block_acct_done(blk_get_stats(s->blk), &s->acct); + if (ret < 0) { + block_acct_failed(blk_get_stats(s->blk), &s->acct); + } else { + block_acct_done(blk_get_stats(s->blk), &s->acct); + } ide_set_inactive(s, false); } diff --git a/hw/ide/core.c b/hw/ide/core.c index 364ba21e0c..2725dd3b81 100644 --- a/hw/ide/core.c +++ b/hw/ide/core.c @@ -574,7 +574,6 @@ static void ide_sector_read_cb(void *opaque, int ret) if (ret == -ECANCELED) { return; } - block_acct_done(blk_get_stats(s->blk), &s->acct); if (ret != 0) { if (ide_handle_rw_error(s, -ret, IDE_RETRY_PIO | IDE_RETRY_READ)) { @@ -582,6 +581,8 @@ static void ide_sector_read_cb(void *opaque, int ret) } } + block_acct_done(blk_get_stats(s->blk), &s->acct); + n = s->nsector; if (n > s->req_nb_sectors) { n = s->req_nb_sectors; @@ -621,6 +622,7 @@ static void ide_sector_read(IDEState *s) if (!ide_sect_range_ok(s, sector_num, n)) { ide_rw_error(s); + block_acct_invalid(blk_get_stats(s->blk), BLOCK_ACCT_READ); return; } @@ -672,6 +674,7 @@ static int ide_handle_rw_error(IDEState *s, int error, int op) assert(s->bus->retry_unit == s->unit); s->bus->error_status = op; } else if (action == BLOCK_ERROR_ACTION_REPORT) { + block_acct_failed(blk_get_stats(s->blk), &s->acct); if (op & IDE_RETRY_DMA) { ide_dma_error(s); } else { @@ -750,6 +753,7 @@ static void ide_dma_cb(void *opaque, int ret) if ((s->dma_cmd == IDE_DMA_READ || s->dma_cmd == IDE_DMA_WRITE) && !ide_sect_range_ok(s, sector_num, n)) { ide_dma_error(s); + block_acct_invalid(blk_get_stats(s->blk), s->acct.type); return; } @@ -826,7 +830,6 @@ static void ide_sector_write_cb(void *opaque, int ret) if (ret == -ECANCELED) { return; } - block_acct_done(blk_get_stats(s->blk), &s->acct); s->pio_aiocb = NULL; s->status &= ~BUSY_STAT; @@ -837,6 +840,8 @@ static void ide_sector_write_cb(void *opaque, int ret) } } + block_acct_done(blk_get_stats(s->blk), &s->acct); + n = s->nsector; if (n > s->req_nb_sectors) { n = s->req_nb_sectors; @@ -887,6 +892,7 @@ static void ide_sector_write(IDEState *s) if (!ide_sect_range_ok(s, sector_num, n)) { ide_rw_error(s); + block_acct_invalid(blk_get_stats(s->blk), BLOCK_ACCT_WRITE); return; } @@ -895,7 +901,7 @@ static void ide_sector_write(IDEState *s) qemu_iovec_init_external(&s->qiov, &s->iov, 1); block_acct_start(blk_get_stats(s->blk), &s->acct, - n * BDRV_SECTOR_SIZE, BLOCK_ACCT_READ); + n * BDRV_SECTOR_SIZE, BLOCK_ACCT_WRITE); s->pio_aiocb = blk_aio_writev(s->blk, sector_num, &s->qiov, n, ide_sector_write_cb, s); } diff --git a/hw/ide/macio.c b/hw/ide/macio.c index 893c9b9bae..3ee962f830 100644 --- a/hw/ide/macio.c +++ b/hw/ide/macio.c @@ -286,7 +286,11 @@ static void pmac_ide_atapi_transfer_cb(void *opaque, int ret) return; done: - block_acct_done(blk_get_stats(s->blk), &s->acct); + if (ret < 0) { + block_acct_failed(blk_get_stats(s->blk), &s->acct); + } else { + block_acct_done(blk_get_stats(s->blk), &s->acct); + } io->dma_end(opaque); return; @@ -348,7 +352,11 @@ static void pmac_ide_transfer_cb(void *opaque, int ret) done: if (s->dma_cmd == IDE_DMA_READ || s->dma_cmd == IDE_DMA_WRITE) { - block_acct_done(blk_get_stats(s->blk), &s->acct); + if (ret < 0) { + block_acct_failed(blk_get_stats(s->blk), &s->acct); + } else { + block_acct_done(blk_get_stats(s->blk), &s->acct); + } } io->dma_end(opaque); } diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c index 707e7349a2..4797d83683 100644 --- a/hw/scsi/scsi-disk.c +++ b/hw/scsi/scsi-disk.c @@ -90,7 +90,7 @@ struct SCSIDiskState bool tray_locked; }; -static int scsi_handle_rw_error(SCSIDiskReq *r, int error); +static int scsi_handle_rw_error(SCSIDiskReq *r, int error, bool acct_failed); static void scsi_free_request(SCSIRequest *req) { @@ -169,18 +169,18 @@ static void scsi_aio_complete(void *opaque, int ret) assert(r->req.aiocb != NULL); r->req.aiocb = NULL; - block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct); if (r->req.io_canceled) { scsi_req_cancel_complete(&r->req); goto done; } if (ret < 0) { - if (scsi_handle_rw_error(r, -ret)) { + if (scsi_handle_rw_error(r, -ret, true)) { goto done; } } + block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct); scsi_req_complete(&r->req, GOOD); done: @@ -247,7 +247,7 @@ static void scsi_dma_complete_noio(SCSIDiskReq *r, int ret) } if (ret < 0) { - if (scsi_handle_rw_error(r, -ret)) { + if (scsi_handle_rw_error(r, -ret, false)) { goto done; } } @@ -273,7 +273,11 @@ static void scsi_dma_complete(void *opaque, int ret) assert(r->req.aiocb != NULL); r->req.aiocb = NULL; - block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct); + if (ret < 0) { + block_acct_failed(blk_get_stats(s->qdev.conf.blk), &r->acct); + } else { + block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct); + } scsi_dma_complete_noio(r, ret); } @@ -285,18 +289,18 @@ static void scsi_read_complete(void * opaque, int ret) assert(r->req.aiocb != NULL); r->req.aiocb = NULL; - block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct); if (r->req.io_canceled) { scsi_req_cancel_complete(&r->req); goto done; } if (ret < 0) { - if (scsi_handle_rw_error(r, -ret)) { + if (scsi_handle_rw_error(r, -ret, true)) { goto done; } } + block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct); DPRINTF("Data ready tag=0x%x len=%zd\n", r->req.tag, r->qiov.size); n = r->qiov.size / 512; @@ -322,7 +326,7 @@ static void scsi_do_read(SCSIDiskReq *r, int ret) } if (ret < 0) { - if (scsi_handle_rw_error(r, -ret)) { + if (scsi_handle_rw_error(r, -ret, false)) { goto done; } } @@ -355,7 +359,11 @@ static void scsi_do_read_cb(void *opaque, int ret) assert (r->req.aiocb != NULL); r->req.aiocb = NULL; - block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct); + if (ret < 0) { + block_acct_failed(blk_get_stats(s->qdev.conf.blk), &r->acct); + } else { + block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct); + } scsi_do_read(opaque, ret); } @@ -407,7 +415,7 @@ static void scsi_read_data(SCSIRequest *req) * scsi_handle_rw_error always manages its reference counts, independent * of the return value. */ -static int scsi_handle_rw_error(SCSIDiskReq *r, int error) +static int scsi_handle_rw_error(SCSIDiskReq *r, int error, bool acct_failed) { bool is_read = (r->req.cmd.mode == SCSI_XFER_FROM_DEV); SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev); @@ -415,6 +423,9 @@ static int scsi_handle_rw_error(SCSIDiskReq *r, int error) is_read, error); if (action == BLOCK_ERROR_ACTION_REPORT) { + if (acct_failed) { + block_acct_failed(blk_get_stats(s->qdev.conf.blk), &r->acct); + } switch (error) { case ENOMEDIUM: scsi_check_condition(r, SENSE_CODE(NO_MEDIUM)); @@ -452,7 +463,7 @@ static void scsi_write_complete_noio(SCSIDiskReq *r, int ret) } if (ret < 0) { - if (scsi_handle_rw_error(r, -ret)) { + if (scsi_handle_rw_error(r, -ret, false)) { goto done; } } @@ -481,7 +492,11 @@ static void scsi_write_complete(void * opaque, int ret) assert (r->req.aiocb != NULL); r->req.aiocb = NULL; - block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct); + if (ret < 0) { + block_acct_failed(blk_get_stats(s->qdev.conf.blk), &r->acct); + } else { + block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct); + } scsi_write_complete_noio(r, ret); } @@ -1592,7 +1607,7 @@ static void scsi_unmap_complete_noio(UnmapCBData *data, int ret) } if (ret < 0) { - if (scsi_handle_rw_error(r, -ret)) { + if (scsi_handle_rw_error(r, -ret, false)) { goto done; } } @@ -1696,18 +1711,19 @@ static void scsi_write_same_complete(void *opaque, int ret) assert(r->req.aiocb != NULL); r->req.aiocb = NULL; - block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct); if (r->req.io_canceled) { scsi_req_cancel_complete(&r->req); goto done; } if (ret < 0) { - if (scsi_handle_rw_error(r, -ret)) { + if (scsi_handle_rw_error(r, -ret, true)) { goto done; } } + block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct); + data->nb_sectors -= data->iov.iov_len / 512; data->sector += data->iov.iov_len / 512; data->iov.iov_len = MIN(data->nb_sectors * 512, data->iov.iov_len); diff --git a/include/block/accounting.h b/include/block/accounting.h index 66637cdfed..0f46cb4ec1 100644 --- a/include/block/accounting.h +++ b/include/block/accounting.h @@ -2,6 +2,7 @@ * QEMU System Emulator block accounting * * Copyright (c) 2011 Christoph Hellwig + * Copyright (c) 2015 Igalia, S.L. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -25,8 +26,12 @@ #define BLOCK_ACCOUNTING_H #include <stdint.h> +#include <stdbool.h> #include "qemu/typedefs.h" +#include "qemu/timed-average.h" + +typedef struct BlockAcctTimedStats BlockAcctTimedStats; enum BlockAcctType { BLOCK_ACCT_READ, @@ -35,11 +40,23 @@ enum BlockAcctType { BLOCK_MAX_IOTYPE, }; +struct BlockAcctTimedStats { + TimedAverage latency[BLOCK_MAX_IOTYPE]; + unsigned interval_length; /* in seconds */ + QSLIST_ENTRY(BlockAcctTimedStats) entries; +}; + typedef struct BlockAcctStats { uint64_t nr_bytes[BLOCK_MAX_IOTYPE]; uint64_t nr_ops[BLOCK_MAX_IOTYPE]; + uint64_t invalid_ops[BLOCK_MAX_IOTYPE]; + uint64_t failed_ops[BLOCK_MAX_IOTYPE]; uint64_t total_time_ns[BLOCK_MAX_IOTYPE]; uint64_t merged[BLOCK_MAX_IOTYPE]; + int64_t last_access_time_ns; + QSLIST_HEAD(, BlockAcctTimedStats) intervals; + bool account_invalid; + bool account_failed; } BlockAcctStats; typedef struct BlockAcctCookie { @@ -48,10 +65,21 @@ typedef struct BlockAcctCookie { enum BlockAcctType type; } BlockAcctCookie; +void block_acct_init(BlockAcctStats *stats, bool account_invalid, + bool account_failed); +void block_acct_cleanup(BlockAcctStats *stats); +void block_acct_add_interval(BlockAcctStats *stats, unsigned interval_length); +BlockAcctTimedStats *block_acct_interval_next(BlockAcctStats *stats, + BlockAcctTimedStats *s); void block_acct_start(BlockAcctStats *stats, BlockAcctCookie *cookie, int64_t bytes, enum BlockAcctType type); void block_acct_done(BlockAcctStats *stats, BlockAcctCookie *cookie); +void block_acct_failed(BlockAcctStats *stats, BlockAcctCookie *cookie); +void block_acct_invalid(BlockAcctStats *stats, enum BlockAcctType type); void block_acct_merge_done(BlockAcctStats *stats, enum BlockAcctType type, int num_requests); +int64_t block_acct_idle_time_ns(BlockAcctStats *stats); +double block_acct_queue_depth(BlockAcctTimedStats *stats, + enum BlockAcctType type); #endif diff --git a/include/block/block.h b/include/block/block.h index 610db923d5..73edb1a79c 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -14,6 +14,7 @@ typedef struct BlockDriver BlockDriver; typedef struct BlockJob BlockJob; typedef struct BdrvChild BdrvChild; typedef struct BdrvChildRole BdrvChildRole; +typedef struct BlockJobTxn BlockJobTxn; typedef struct BlockDriverInfo { /* in bytes, 0 if irrelevant */ @@ -335,10 +336,18 @@ void bdrv_aio_cancel_async(BlockAIOCB *acb); typedef struct BlockRequest { /* Fields to be filled by multiwrite caller */ - int64_t sector; - int nb_sectors; - int flags; - QEMUIOVector *qiov; + union { + struct { + int64_t sector; + int nb_sectors; + int flags; + QEMUIOVector *qiov; + }; + struct { + int req; + void *buf; + }; + }; BlockCompletionFunc *cb; void *opaque; @@ -493,7 +502,6 @@ void bdrv_set_dirty_bitmap(BdrvDirtyBitmap *bitmap, int64_t cur_sector, int nr_sectors); void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap, int64_t cur_sector, int nr_sectors); -void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap); void bdrv_dirty_iter_init(BdrvDirtyBitmap *bitmap, struct HBitmapIter *hbi); void bdrv_set_dirty_iter(struct HBitmapIter *hbi, int64_t offset); int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap); diff --git a/include/block/block_int.h b/include/block/block_int.h index 603145a21d..4012e36437 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -60,11 +60,19 @@ #define BLOCK_PROBE_BUF_SIZE 512 +enum BdrvTrackedRequestType { + BDRV_TRACKED_READ, + BDRV_TRACKED_WRITE, + BDRV_TRACKED_FLUSH, + BDRV_TRACKED_IOCTL, + BDRV_TRACKED_DISCARD, +}; + typedef struct BdrvTrackedRequest { BlockDriverState *bs; int64_t offset; unsigned int bytes; - bool is_write; + enum BdrvTrackedRequestType type; bool serialising; int64_t overlap_offset; @@ -219,7 +227,6 @@ struct BlockDriver { void (*bdrv_lock_medium)(BlockDriverState *bs, bool locked); /* to control generic scsi devices */ - int (*bdrv_ioctl)(BlockDriverState *bs, unsigned long int req, void *buf); BlockAIOCB *(*bdrv_aio_ioctl)(BlockDriverState *bs, unsigned long int req, void *buf, BlockCompletionFunc *cb, void *opaque); @@ -288,6 +295,12 @@ struct BlockDriver { */ int (*bdrv_probe_geometry)(BlockDriverState *bs, HDGeometry *geo); + /** + * Drain and stop any internal sources of requests in the driver, and + * remain so until next I/O callback (e.g. bdrv_co_writev) is called. + */ + void (*bdrv_drain)(BlockDriverState *bs); + QLIST_ENTRY(BlockDriver) list; }; @@ -656,6 +669,7 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target, * @on_target_error: The action to take upon error writing to the target. * @cb: Completion function for the job. * @opaque: Opaque pointer value passed to @cb. + * @txn: Transaction that this job is part of (may be NULL). * * Start a backup operation on @bs. Clusters in @bs are written to @target * until the job is cancelled or manually completed. @@ -666,7 +680,7 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target, BlockdevOnError on_source_error, BlockdevOnError on_target_error, BlockCompletionFunc *cb, void *opaque, - Error **errp); + BlockJobTxn *txn, Error **errp); void blk_set_bs(BlockBackend *blk, BlockDriverState *bs); @@ -680,4 +694,7 @@ void blk_dev_resize_cb(BlockBackend *blk); void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors); bool bdrv_requests_pending(BlockDriverState *bs); +void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out); +void bdrv_undo_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *in); + #endif /* BLOCK_INT_H */ diff --git a/include/block/blockjob.h b/include/block/blockjob.h index 289b13f0c0..d84ccd8d2c 100644 --- a/include/block/blockjob.h +++ b/include/block/blockjob.h @@ -50,6 +50,26 @@ typedef struct BlockJobDriver { * manually. */ void (*complete)(BlockJob *job, Error **errp); + + /** + * If the callback is not NULL, it will be invoked when all the jobs + * belonging to the same transaction complete; or upon this job's + * completion if it is not in a transaction. Skipped if NULL. + * + * All jobs will complete with a call to either .commit() or .abort() but + * never both. + */ + void (*commit)(BlockJob *job); + + /** + * If the callback is not NULL, it will be invoked when any job in the + * same transaction fails; or upon this job's failure (due to error or + * cancellation) if it is not in a transaction. Skipped if NULL. + * + * All jobs will complete with a call to either .commit() or .abort() but + * never both. + */ + void (*abort)(BlockJob *job); } BlockJobDriver; /** @@ -130,6 +150,21 @@ struct BlockJob { /** The opaque value that is passed to the completion function. */ void *opaque; + + /** Reference count of the block job */ + int refcnt; + + /* True if this job has reported completion by calling block_job_completed. + */ + bool completed; + + /* ret code passed to block_job_completed. + */ + int ret; + + /** Non-NULL if this job is part of a transaction */ + BlockJobTxn *txn; + QLIST_ENTRY(BlockJob) txn_list; }; /** @@ -174,12 +209,21 @@ void block_job_sleep_ns(BlockJob *job, QEMUClockType type, int64_t ns); void block_job_yield(BlockJob *job); /** - * block_job_release: + * block_job_ref: * @bs: The block device. * - * Release job resources when an error occurred or job completed. + * Grab a reference to the block job. Should be paired with block_job_unref. */ -void block_job_release(BlockDriverState *bs); +void block_job_ref(BlockJob *job); + +/** + * block_job_unref: + * @bs: The block device. + * + * Release reference to the block job and release resources if it is the last + * reference. + */ +void block_job_unref(BlockJob *job); /** * block_job_completed: @@ -364,4 +408,39 @@ void block_job_defer_to_main_loop(BlockJob *job, BlockJobDeferToMainLoopFn *fn, void *opaque); +/** + * block_job_txn_new: + * + * Allocate and return a new block job transaction. Jobs can be added to the + * transaction using block_job_txn_add_job(). + * + * The transaction is automatically freed when the last job completes or is + * cancelled. + * + * All jobs in the transaction either complete successfully or fail/cancel as a + * group. Jobs wait for each other before completing. Cancelling one job + * cancels all jobs in the transaction. + */ +BlockJobTxn *block_job_txn_new(void); + +/** + * block_job_txn_unref: + * + * Release a reference that was previously acquired with block_job_txn_add_job + * or block_job_txn_new. If it's the last reference to the object, it will be + * freed. + */ +void block_job_txn_unref(BlockJobTxn *txn); + +/** + * block_job_txn_add_job: + * @txn: The transaction (may be NULL) + * @job: Job to add to the transaction + * + * Add @job to the transaction. The @job must not already be in a transaction. + * The caller must call either block_job_txn_unref() or block_job_completed() + * to release the reference that is automatically grabbed here. + */ +void block_job_txn_add_job(BlockJobTxn *txn, BlockJob *job); + #endif diff --git a/include/qemu/timed-average.h b/include/qemu/timed-average.h new file mode 100644 index 0000000000..364bf88f70 --- /dev/null +++ b/include/qemu/timed-average.h @@ -0,0 +1,64 @@ +/* + * QEMU timed average computation + * + * Copyright (C) Nodalink, EURL. 2014 + * Copyright (C) Igalia, S.L. 2015 + * + * Authors: + * Benoît Canet <benoit.canet@nodalink.com> + * Alberto Garcia <berto@igalia.com> + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 2 of the License, or + * (at your option) version 3 or any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef TIMED_AVERAGE_H +#define TIMED_AVERAGE_H + +#include <stdint.h> + +#include "qemu/timer.h" + +typedef struct TimedAverageWindow TimedAverageWindow; +typedef struct TimedAverage TimedAverage; + +/* All fields of both structures are private */ + +struct TimedAverageWindow { + uint64_t min; /* minimum value accounted in the window */ + uint64_t max; /* maximum value accounted in the window */ + uint64_t sum; /* sum of all values */ + uint64_t count; /* number of values */ + int64_t expiration; /* the end of the current window in ns */ +}; + +struct TimedAverage { + uint64_t period; /* period in nanoseconds */ + TimedAverageWindow windows[2]; /* two overlapping windows of with + * an offset of period / 2 between them */ + unsigned current; /* the current window index: it's also the + * oldest window index */ + QEMUClockType clock_type; /* the clock used */ +}; + +void timed_average_init(TimedAverage *ta, QEMUClockType clock_type, + uint64_t period); + +void timed_average_account(TimedAverage *ta, uint64_t value); + +uint64_t timed_average_min(TimedAverage *ta); +uint64_t timed_average_avg(TimedAverage *ta); +uint64_t timed_average_max(TimedAverage *ta); +uint64_t timed_average_sum(TimedAverage *ta, uint64_t *elapsed); + +#endif diff --git a/qapi-schema.json b/qapi-schema.json index c3f95ab170..36e59b8932 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -1534,6 +1534,26 @@ 'data': { } } ## +# @ActionCompletionMode +# +# An enumeration of Transactional completion modes. +# +# @individual: Do not attempt to cancel any other Actions if any Actions fail +# after the Transaction request succeeds. All Actions that +# can complete successfully will do so without waiting on others. +# This is the default. +# +# @grouped: If any Action fails after the Transaction succeeds, cancel all +# Actions. Actions do not complete until all Actions are ready to +# complete. May be rejected by Actions that do not support this +# completion mode. +# +# Since: 2.5 +## +{ 'enum': 'ActionCompletionMode', + 'data': [ 'individual', 'grouped' ] } + +## # @TransactionAction # # A discriminated record of operations that can be performed with @@ -1546,6 +1566,8 @@ # blockdev-snapshot-internal-sync since 1.7 # blockdev-backup since 2.3 # blockdev-snapshot since 2.5 +# block-dirty-bitmap-add since 2.5 +# block-dirty-bitmap-clear since 2.5 ## { 'union': 'TransactionAction', 'data': { @@ -1554,18 +1576,41 @@ 'drive-backup': 'DriveBackup', 'blockdev-backup': 'BlockdevBackup', 'abort': 'Abort', - 'blockdev-snapshot-internal-sync': 'BlockdevSnapshotInternal' + 'blockdev-snapshot-internal-sync': 'BlockdevSnapshotInternal', + 'block-dirty-bitmap-add': 'BlockDirtyBitmapAdd', + 'block-dirty-bitmap-clear': 'BlockDirtyBitmap' } } ## +# @TransactionProperties +# +# Optional arguments to modify the behavior of a Transaction. +# +# @completion-mode: #optional Controls how jobs launched asynchronously by +# Actions will complete or fail as a group. +# See @ActionCompletionMode for details. +# +# Since: 2.5 +## +{ 'struct': 'TransactionProperties', + 'data': { + '*completion-mode': 'ActionCompletionMode' + } +} + +## # @transaction # # Executes a number of transactionable QMP commands atomically. If any # operation fails, then the entire set of actions will be abandoned and the # appropriate error returned. # -# List of: -# @TransactionAction: information needed for the respective operation +# @actions: List of @TransactionAction; +# information needed for the respective operations. +# +# @properties: #optional structure of additional options to control the +# execution of the transaction. See @TransactionProperties +# for additional detail. # # Returns: nothing on success # Errors depend on the operations of the transaction @@ -1577,7 +1622,10 @@ # Since 1.1 ## { 'command': 'transaction', - 'data': { 'actions': [ 'TransactionAction' ] } } + 'data': { 'actions': [ 'TransactionAction' ], + '*properties': 'TransactionProperties' + } +} ## # @human-monitor-command: diff --git a/qapi/block-core.json b/qapi/block-core.json index 470e86c6df..f97c250ce9 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -414,6 +414,59 @@ ## { 'command': 'query-block', 'returns': ['BlockInfo'] } + +## +# @BlockDeviceTimedStats: +# +# Statistics of a block device during a given interval of time. +# +# @interval_length: Interval used for calculating the statistics, +# in seconds. +# +# @min_rd_latency_ns: Minimum latency of read operations in the +# defined interval, in nanoseconds. +# +# @min_wr_latency_ns: Minimum latency of write operations in the +# defined interval, in nanoseconds. +# +# @min_flush_latency_ns: Minimum latency of flush operations in the +# defined interval, in nanoseconds. +# +# @max_rd_latency_ns: Maximum latency of read operations in the +# defined interval, in nanoseconds. +# +# @max_wr_latency_ns: Maximum latency of write operations in the +# defined interval, in nanoseconds. +# +# @max_flush_latency_ns: Maximum latency of flush operations in the +# defined interval, in nanoseconds. +# +# @avg_rd_latency_ns: Average latency of read operations in the +# defined interval, in nanoseconds. +# +# @avg_wr_latency_ns: Average latency of write operations in the +# defined interval, in nanoseconds. +# +# @avg_flush_latency_ns: Average latency of flush operations in the +# defined interval, in nanoseconds. +# +# @avg_rd_queue_depth: Average number of pending read operations +# in the defined interval. +# +# @avg_wr_queue_depth: Average number of pending write operations +# in the defined interval. +# +# Since: 2.5 +## + +{ 'struct': 'BlockDeviceTimedStats', + 'data': { 'interval_length': 'int', 'min_rd_latency_ns': 'int', + 'max_rd_latency_ns': 'int', 'avg_rd_latency_ns': 'int', + 'min_wr_latency_ns': 'int', 'max_wr_latency_ns': 'int', + 'avg_wr_latency_ns': 'int', 'min_flush_latency_ns': 'int', + 'max_flush_latency_ns': 'int', 'avg_flush_latency_ns': 'int', + 'avg_rd_queue_depth': 'number', 'avg_wr_queue_depth': 'number' } } + ## # @BlockDeviceStats: # @@ -448,6 +501,37 @@ # @wr_merged: Number of write requests that have been merged into another # request (Since 2.3). # +# @idle_time_ns: #optional Time since the last I/O operation, in +# nanoseconds. If the field is absent it means that +# there haven't been any operations yet (Since 2.5). +# +# @failed_rd_operations: The number of failed read operations +# performed by the device (Since 2.5) +# +# @failed_wr_operations: The number of failed write operations +# performed by the device (Since 2.5) +# +# @failed_flush_operations: The number of failed flush operations +# performed by the device (Since 2.5) +# +# @invalid_rd_operations: The number of invalid read operations +# performed by the device (Since 2.5) +# +# @invalid_wr_operations: The number of invalid write operations +# performed by the device (Since 2.5) +# +# @invalid_flush_operations: The number of invalid flush operations +# performed by the device (Since 2.5) +# +# @account_invalid: Whether invalid operations are included in the +# last access statistics (Since 2.5) +# +# @account_failed: Whether failed operations are included in the +# latency and last access statistics (Since 2.5) +# +# @timed_stats: Statistics specific to the set of previously defined +# intervals of time (Since 2.5) +# # Since: 0.14.0 ## { 'struct': 'BlockDeviceStats', @@ -455,7 +539,12 @@ 'wr_operations': 'int', 'flush_operations': 'int', 'flush_total_time_ns': 'int', 'wr_total_time_ns': 'int', 'rd_total_time_ns': 'int', 'wr_highest_offset': 'int', - 'rd_merged': 'int', 'wr_merged': 'int' } } + 'rd_merged': 'int', 'wr_merged': 'int', '*idle_time_ns': 'int', + 'failed_rd_operations': 'int', 'failed_wr_operations': 'int', + 'failed_flush_operations': 'int', 'invalid_rd_operations': 'int', + 'invalid_wr_operations': 'int', 'invalid_flush_operations': 'int', + 'account_invalid': 'bool', 'account_failed': 'bool', + 'timed_stats': ['BlockDeviceTimedStats'] } } ## # @BlockStats: @@ -1436,6 +1525,15 @@ # (default: enospc) # @read-only: #optional whether the block device should be read-only # (default: false) +# @stats-account-invalid: #optional whether to include invalid +# operations when computing last access statistics +# (default: true) (Since 2.5) +# @stats-account-failed: #optional whether to include failed +# operations when computing latency and last +# access statistics (default: true) (Since 2.5) +# @stats-intervals: #optional colon-separated list of intervals for +# collecting I/O statistics, in seconds (default: none) +# (Since 2.5) # @detect-zeroes: #optional detect and optimize zero writes (Since 2.1) # (default: off) # @@ -1451,6 +1549,9 @@ '*rerror': 'BlockdevOnError', '*werror': 'BlockdevOnError', '*read-only': 'bool', + '*stats-account-invalid': 'bool', + '*stats-account-failed': 'bool', + '*stats-intervals': 'str', '*detect-zeroes': 'BlockdevDetectZeroesOptions' } } ## diff --git a/qemu-img.c b/qemu-img.c index 9831db75ef..033011c4e7 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -645,9 +645,6 @@ static void common_block_job_cb(void *opaque, int ret) if (ret < 0) { error_setg_errno(cbi->errp, -ret, "Block job failed"); } - - /* Drop this block job's reference */ - bdrv_unref(cbi->bs); } static void run_block_job(BlockJob *job, Error **errp) diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c index 9c77aafb99..18fc2bdc10 100644 --- a/qemu-io-cmds.c +++ b/qemu-io-cmds.c @@ -1428,6 +1428,7 @@ static void aio_write_done(void *opaque, int ret) if (ret < 0) { printf("aio_write failed: %s\n", strerror(-ret)); + block_acct_failed(blk_get_stats(ctx->blk), &ctx->acct); goto out; } @@ -1456,6 +1457,7 @@ static void aio_read_done(void *opaque, int ret) if (ret < 0) { printf("readv failed: %s\n", strerror(-ret)); + block_acct_failed(blk_get_stats(ctx->blk), &ctx->acct); goto out; } @@ -1569,6 +1571,7 @@ static int aio_read_f(BlockBackend *blk, int argc, char **argv) if (ctx->offset & 0x1ff) { printf("offset %" PRId64 " is not sector aligned\n", ctx->offset); + block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_READ); g_free(ctx); return 0; } @@ -1576,6 +1579,7 @@ static int aio_read_f(BlockBackend *blk, int argc, char **argv) nr_iov = argc - optind; ctx->buf = create_iovec(blk, &ctx->qiov, &argv[optind], nr_iov, 0xab); if (ctx->buf == NULL) { + block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_READ); g_free(ctx); return 0; } @@ -1664,6 +1668,7 @@ static int aio_write_f(BlockBackend *blk, int argc, char **argv) if (ctx->offset & 0x1ff) { printf("offset %" PRId64 " is not sector aligned\n", ctx->offset); + block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_WRITE); g_free(ctx); return 0; } @@ -1671,6 +1676,7 @@ static int aio_write_f(BlockBackend *blk, int argc, char **argv) nr_iov = argc - optind; ctx->buf = create_iovec(blk, &ctx->qiov, &argv[optind], nr_iov, pattern); if (ctx->buf == NULL) { + block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_WRITE); g_free(ctx); return 0; } @@ -1685,7 +1691,10 @@ static int aio_write_f(BlockBackend *blk, int argc, char **argv) static int aio_flush_f(BlockBackend *blk, int argc, char **argv) { + BlockAcctCookie cookie; + block_acct_start(blk_get_stats(blk), &cookie, 0, BLOCK_ACCT_FLUSH); blk_drain_all(); + block_acct_done(blk_get_stats(blk), &cookie); return 0; } diff --git a/qmp-commands.hx b/qmp-commands.hx index 02c0c5bc42..9d8b42f59a 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -1281,7 +1281,7 @@ EQMP }, { .name = "transaction", - .args_type = "actions:q", + .args_type = "actions:q,properties:q?", .mhandler.cmd_new = qmp_marshal_transaction, }, @@ -2583,6 +2583,64 @@ Each json-object contain the following: another request (json-int) - "wr_merged": number of write requests that have been merged into another request (json-int) + - "idle_time_ns": time since the last I/O operation, in + nanoseconds. If the field is absent it means + that there haven't been any operations yet + (json-int, optional) + - "failed_rd_operations": number of failed read operations + (json-int) + - "failed_wr_operations": number of failed write operations + (json-int) + - "failed_flush_operations": number of failed flush operations + (json-int) + - "invalid_rd_operations": number of invalid read operations + (json-int) + - "invalid_wr_operations": number of invalid write operations + (json-int) + - "invalid_flush_operations": number of invalid flush operations + (json-int) + - "account_invalid": whether invalid operations are included in + the last access statistics (json-bool) + - "account_failed": whether failed operations are included in the + latency and last access statistics + (json-bool) + - "timed_stats": A json-array containing statistics collected in + specific intervals, with the following members: + - "interval_length": interval used for calculating the + statistics, in seconds (json-int) + - "min_rd_latency_ns": minimum latency of read operations in + the defined interval, in nanoseconds + (json-int) + - "min_wr_latency_ns": minimum latency of write operations in + the defined interval, in nanoseconds + (json-int) + - "min_flush_latency_ns": minimum latency of flush operations + in the defined interval, in + nanoseconds (json-int) + - "max_rd_latency_ns": maximum latency of read operations in + the defined interval, in nanoseconds + (json-int) + - "max_wr_latency_ns": maximum latency of write operations in + the defined interval, in nanoseconds + (json-int) + - "max_flush_latency_ns": maximum latency of flush operations + in the defined interval, in + nanoseconds (json-int) + - "avg_rd_latency_ns": average latency of read operations in + the defined interval, in nanoseconds + (json-int) + - "avg_wr_latency_ns": average latency of write operations in + the defined interval, in nanoseconds + (json-int) + - "avg_flush_latency_ns": average latency of flush operations + in the defined interval, in + nanoseconds (json-int) + - "avg_rd_queue_depth": average number of pending read + operations in the defined interval + (json-number) + - "avg_wr_queue_depth": average number of pending write + operations in the defined interval + (json-number). - "parent": Contains recursively the statistics of the underlying protocol (e.g. the host file for a qcow2 image). If there is no underlying protocol, this field is omitted @@ -2607,7 +2665,10 @@ Example: "flush_total_times_ns":49653 "flush_operations":61, "rd_merged":0, - "wr_merged":0 + "wr_merged":0, + "idle_time_ns":2953431879, + "account_invalid":true, + "account_failed":false } }, "stats":{ @@ -2621,7 +2682,10 @@ Example: "rd_total_times_ns":3465673657 "flush_total_times_ns":49653, "rd_merged":0, - "wr_merged":0 + "wr_merged":0, + "idle_time_ns":2953431879, + "account_invalid":true, + "account_failed":false } }, { @@ -2637,7 +2701,9 @@ Example: "rd_total_times_ns":0 "flush_total_times_ns":0, "rd_merged":0, - "wr_merged":0 + "wr_merged":0, + "account_invalid":false, + "account_failed":false } }, { @@ -2653,7 +2719,9 @@ Example: "rd_total_times_ns":0 "flush_total_times_ns":0, "rd_merged":0, - "wr_merged":0 + "wr_merged":0, + "account_invalid":false, + "account_failed":false } }, { @@ -2669,7 +2737,9 @@ Example: "rd_total_times_ns":0 "flush_total_times_ns":0, "rd_merged":0, - "wr_merged":0 + "wr_merged":0, + "account_invalid":false, + "account_failed":false } } ] diff --git a/tests/Makefile b/tests/Makefile index 92969e8288..90c4141ac5 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -47,6 +47,8 @@ check-unit-y += tests/test-thread-pool$(EXESUF) gcov-files-test-thread-pool-y = thread-pool.c gcov-files-test-hbitmap-y = util/hbitmap.c check-unit-y += tests/test-hbitmap$(EXESUF) +gcov-files-test-hbitmap-y = blockjob.c +check-unit-y += tests/test-blockjob-txn$(EXESUF) check-unit-y += tests/test-x86-cpuid$(EXESUF) # all code tested by test-x86-cpuid is inside topology.h gcov-files-test-x86-cpuid-y = @@ -81,6 +83,7 @@ check-unit-y += tests/test-crypto-cipher$(EXESUF) check-unit-$(CONFIG_GNUTLS) += tests/test-crypto-tlscredsx509$(EXESUF) check-unit-$(CONFIG_GNUTLS) += tests/test-crypto-tlssession$(EXESUF) check-unit-$(CONFIG_LINUX) += tests/test-qga$(EXESUF) +check-unit-y += tests/test-timed-average$(EXESUF) check-block-$(CONFIG_POSIX) += tests/qemu-iotests-quick.sh @@ -390,6 +393,7 @@ tests/test-coroutine$(EXESUF): tests/test-coroutine.o $(test-block-obj-y) tests/test-aio$(EXESUF): tests/test-aio.o $(test-block-obj-y) tests/test-rfifolock$(EXESUF): tests/test-rfifolock.o $(test-util-obj-y) tests/test-throttle$(EXESUF): tests/test-throttle.o $(test-block-obj-y) +tests/test-blockjob-txn$(EXESUF): tests/test-blockjob-txn.o $(test-block-obj-y) $(test-util-obj-y) tests/test-thread-pool$(EXESUF): tests/test-thread-pool.o $(test-block-obj-y) tests/test-iov$(EXESUF): tests/test-iov.o $(test-util-obj-y) tests/test-hbitmap$(EXESUF): tests/test-hbitmap.o $(test-util-obj-y) @@ -409,6 +413,9 @@ tests/test-vmstate$(EXESUF): tests/test-vmstate.o \ migration/vmstate.o migration/qemu-file.o migration/qemu-file-buf.o \ migration/qemu-file-unix.o qjson.o \ $(test-qom-obj-y) +tests/test-timed-average$(EXESUF): tests/test-timed-average.o qemu-timer.o \ + libqemuutil.a stubs/clock-warp.o stubs/cpu-get-icount.o \ + stubs/notify-event.o stubs/replay.o tests/test-qapi-types.c tests/test-qapi-types.h :\ $(SRC_PATH)/tests/qapi-schema/qapi-schema-test.json $(SRC_PATH)/scripts/qapi-types.py $(qapi-py) diff --git a/tests/qemu-iotests/124 b/tests/qemu-iotests/124 index 9ccd11809f..c928f0101b 100644 --- a/tests/qemu-iotests/124 +++ b/tests/qemu-iotests/124 @@ -36,6 +36,23 @@ def try_remove(img): pass +def transaction_action(action, **kwargs): + return { + 'type': action, + 'data': dict((k.replace('_', '-'), v) for k, v in kwargs.iteritems()) + } + + +def transaction_bitmap_clear(node, name, **kwargs): + return transaction_action('block-dirty-bitmap-clear', + node=node, name=name, **kwargs) + + +def transaction_drive_backup(device, target, **kwargs): + return transaction_action('drive-backup', device=device, target=target, + **kwargs) + + class Bitmap: def __init__(self, name, drive): self.name = name @@ -122,9 +139,12 @@ class TestIncrementalBackup(iotests.QMPTestCase): def do_qmp_backup(self, error='Input/output error', **kwargs): res = self.vm.qmp('drive-backup', **kwargs) self.assert_qmp(res, 'return', {}) + return self.wait_qmp_backup(kwargs['device'], error) + + def wait_qmp_backup(self, device, error='Input/output error'): event = self.vm.event_wait(name="BLOCK_JOB_COMPLETED", - match={'data': {'device': kwargs['device']}}) + match={'data': {'device': device}}) self.assertNotEqual(event, None) try: @@ -139,6 +159,12 @@ class TestIncrementalBackup(iotests.QMPTestCase): return False + def wait_qmp_backup_cancelled(self, device): + event = self.vm.event_wait(name='BLOCK_JOB_CANCELLED', + match={'data': {'device': device}}) + self.assertNotEqual(event, None) + + def create_anchor_backup(self, drive=None): if drive is None: drive = self.drives[-1] @@ -264,6 +290,43 @@ class TestIncrementalBackup(iotests.QMPTestCase): return self.do_incremental_simple(granularity=131072) + def test_incremental_transaction(self): + '''Test: Verify backups made from transactionally created bitmaps. + + Create a bitmap "before" VM execution begins, then create a second + bitmap AFTER writes have already occurred. Use transactions to create + a full backup and synchronize both bitmaps to this backup. + Create an incremental backup through both bitmaps and verify that + both backups match the current drive0 image. + ''' + + drive0 = self.drives[0] + bitmap0 = self.add_bitmap('bitmap0', drive0) + self.hmp_io_writes(drive0['id'], (('0xab', 0, 512), + ('0xfe', '16M', '256k'), + ('0x64', '32736k', '64k'))) + bitmap1 = self.add_bitmap('bitmap1', drive0) + + result = self.vm.qmp('transaction', actions=[ + transaction_bitmap_clear(bitmap0.drive['id'], bitmap0.name), + transaction_bitmap_clear(bitmap1.drive['id'], bitmap1.name), + transaction_drive_backup(drive0['id'], drive0['backup'], + sync='full', format=drive0['fmt']) + ]) + self.assert_qmp(result, 'return', {}) + self.wait_until_completed(drive0['id']) + self.files.append(drive0['backup']) + + self.hmp_io_writes(drive0['id'], (('0x9a', 0, 512), + ('0x55', '8M', '352k'), + ('0x78', '15872k', '1M'))) + # Both bitmaps should be correctly in sync. + self.create_incremental(bitmap0) + self.create_incremental(bitmap1) + self.vm.shutdown() + self.check_backups() + + def test_incremental_failure(self): '''Test: Verify backups made after a failure are correct. @@ -321,6 +384,123 @@ class TestIncrementalBackup(iotests.QMPTestCase): self.check_backups() + def test_transaction_failure(self): + '''Test: Verify backups made from a transaction that partially fails. + + Add a second drive with its own unique pattern, and add a bitmap to each + drive. Use blkdebug to interfere with the backup on just one drive and + attempt to create a coherent incremental backup across both drives. + + verify a failure in one but not both, then delete the failed stubs and + re-run the same transaction. + + verify that both incrementals are created successfully. + ''' + + # Create a second drive, with pattern: + drive1 = self.add_node('drive1') + self.img_create(drive1['file'], drive1['fmt']) + io_write_patterns(drive1['file'], (('0x14', 0, 512), + ('0x5d', '1M', '32k'), + ('0xcd', '32M', '124k'))) + + # Create a blkdebug interface to this img as 'drive1' + result = self.vm.qmp('blockdev-add', options={ + 'id': drive1['id'], + 'driver': drive1['fmt'], + 'file': { + 'driver': 'blkdebug', + 'image': { + 'driver': 'file', + 'filename': drive1['file'] + }, + 'set-state': [{ + 'event': 'flush_to_disk', + 'state': 1, + 'new_state': 2 + }], + 'inject-error': [{ + 'event': 'read_aio', + 'errno': 5, + 'state': 2, + 'immediately': False, + 'once': True + }], + } + }) + self.assert_qmp(result, 'return', {}) + + # Create bitmaps and full backups for both drives + drive0 = self.drives[0] + dr0bm0 = self.add_bitmap('bitmap0', drive0) + dr1bm0 = self.add_bitmap('bitmap0', drive1) + self.create_anchor_backup(drive0) + self.create_anchor_backup(drive1) + self.assert_no_active_block_jobs() + self.assertFalse(self.vm.get_qmp_events(wait=False)) + + # Emulate some writes + self.hmp_io_writes(drive0['id'], (('0xab', 0, 512), + ('0xfe', '16M', '256k'), + ('0x64', '32736k', '64k'))) + self.hmp_io_writes(drive1['id'], (('0xba', 0, 512), + ('0xef', '16M', '256k'), + ('0x46', '32736k', '64k'))) + + # Create incremental backup targets + target0 = self.prepare_backup(dr0bm0) + target1 = self.prepare_backup(dr1bm0) + + # Ask for a new incremental backup per-each drive, + # expecting drive1's backup to fail: + transaction = [ + transaction_drive_backup(drive0['id'], target0, sync='incremental', + format=drive0['fmt'], mode='existing', + bitmap=dr0bm0.name), + transaction_drive_backup(drive1['id'], target1, sync='incremental', + format=drive1['fmt'], mode='existing', + bitmap=dr1bm0.name) + ] + result = self.vm.qmp('transaction', actions=transaction, + properties={'completion-mode': 'grouped'} ) + self.assert_qmp(result, 'return', {}) + + # Observe that drive0's backup is cancelled and drive1 completes with + # an error. + self.wait_qmp_backup_cancelled(drive0['id']) + self.assertFalse(self.wait_qmp_backup(drive1['id'])) + error = self.vm.event_wait('BLOCK_JOB_ERROR') + self.assert_qmp(error, 'data', {'device': drive1['id'], + 'action': 'report', + 'operation': 'read'}) + self.assertFalse(self.vm.get_qmp_events(wait=False)) + self.assert_no_active_block_jobs() + + # Delete drive0's successful target and eliminate our record of the + # unsuccessful drive1 target. Then re-run the same transaction. + dr0bm0.del_target() + dr1bm0.del_target() + target0 = self.prepare_backup(dr0bm0) + target1 = self.prepare_backup(dr1bm0) + + # Re-run the exact same transaction. + result = self.vm.qmp('transaction', actions=transaction, + properties={'completion-mode':'grouped'}) + self.assert_qmp(result, 'return', {}) + + # Both should complete successfully this time. + self.assertTrue(self.wait_qmp_backup(drive0['id'])) + self.assertTrue(self.wait_qmp_backup(drive1['id'])) + self.make_reference_backup(dr0bm0) + self.make_reference_backup(dr1bm0) + self.assertFalse(self.vm.get_qmp_events(wait=False)) + self.assert_no_active_block_jobs() + + # And the images should of course validate. + self.vm.shutdown() + self.check_backups() + + def test_sync_dirty_bitmap_missing(self): self.assert_no_active_block_jobs() self.files.append(self.err_img) diff --git a/tests/qemu-iotests/124.out b/tests/qemu-iotests/124.out index 2f7d3902f2..dae404e278 100644 --- a/tests/qemu-iotests/124.out +++ b/tests/qemu-iotests/124.out @@ -1,5 +1,5 @@ -....... +......... ---------------------------------------------------------------------- -Ran 7 tests +Ran 9 tests OK diff --git a/tests/qemu-iotests/136 b/tests/qemu-iotests/136 new file mode 100644 index 0000000000..f574d83ff7 --- /dev/null +++ b/tests/qemu-iotests/136 @@ -0,0 +1,349 @@ +#!/usr/bin/env python +# +# Tests for block device statistics +# +# Copyright (C) 2015 Igalia, S.L. +# Author: Alberto Garcia <berto@igalia.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# + +import iotests +import os + +interval_length = 10 +nsec_per_sec = 1000000000 +op_latency = nsec_per_sec / 1000 # See qtest_latency_ns in accounting.c +bad_sector = 8192 +bad_offset = bad_sector * 512 +blkdebug_file = os.path.join(iotests.test_dir, 'blkdebug.conf') + +class BlockDeviceStatsTestCase(iotests.QMPTestCase): + test_img = "null-aio://" + total_rd_bytes = 0 + total_rd_ops = 0 + total_wr_bytes = 0 + total_wr_ops = 0 + total_wr_merged = 0 + total_flush_ops = 0 + failed_rd_ops = 0 + failed_wr_ops = 0 + invalid_rd_ops = 0 + invalid_wr_ops = 0 + wr_highest_offset = 0 + account_invalid = False + account_failed = False + + def blockstats(self, device): + result = self.vm.qmp("query-blockstats") + for r in result['return']: + if r['device'] == device: + return r['stats'] + raise Exception("Device not found for blockstats: %s" % device) + + def create_blkdebug_file(self): + file = open(blkdebug_file, 'w') + file.write(''' +[inject-error] +event = "read_aio" +errno = "5" +sector = "%d" + +[inject-error] +event = "write_aio" +errno = "5" +sector = "%d" +''' % (bad_sector, bad_sector)) + file.close() + + def setUp(self): + drive_args = [] + drive_args.append("stats-intervals=%d" % interval_length) + drive_args.append("stats-account-invalid=%s" % + (self.account_invalid and "on" or "off")) + drive_args.append("stats-account-failed=%s" % + (self.account_failed and "on" or "off")) + self.create_blkdebug_file() + self.vm = iotests.VM().add_drive('blkdebug:%s:%s ' % + (blkdebug_file, self.test_img), + ','.join(drive_args)) + self.vm.launch() + # Set an initial value for the clock + self.vm.qtest("clock_step %d" % nsec_per_sec) + + def tearDown(self): + self.vm.shutdown() + os.remove(blkdebug_file) + + def accounted_ops(self, read = False, write = False, flush = False): + ops = 0 + if write: + ops += self.total_wr_ops + if self.account_failed: + ops += self.failed_wr_ops + if self.account_invalid: + ops += self.invalid_wr_ops + if read: + ops += self.total_rd_ops + if self.account_failed: + ops += self.failed_rd_ops + if self.account_invalid: + ops += self.invalid_rd_ops + if flush: + ops += self.total_flush_ops + return ops + + def accounted_latency(self, read = False, write = False, flush = False): + latency = 0 + if write: + latency += self.total_wr_ops * op_latency + if self.account_failed: + latency += self.failed_wr_ops * op_latency + if read: + latency += self.total_rd_ops * op_latency + if self.account_failed: + latency += self.failed_rd_ops * op_latency + if flush: + latency += self.total_flush_ops * op_latency + return latency + + def check_values(self): + stats = self.blockstats('drive0') + + # Check that the totals match with what we have calculated + self.assertEqual(self.total_rd_bytes, stats['rd_bytes']) + self.assertEqual(self.total_wr_bytes, stats['wr_bytes']) + self.assertEqual(self.total_rd_ops, stats['rd_operations']) + self.assertEqual(self.total_wr_ops, stats['wr_operations']) + self.assertEqual(self.total_flush_ops, stats['flush_operations']) + self.assertEqual(self.wr_highest_offset, stats['wr_highest_offset']) + self.assertEqual(self.failed_rd_ops, stats['failed_rd_operations']) + self.assertEqual(self.failed_wr_ops, stats['failed_wr_operations']) + self.assertEqual(self.invalid_rd_ops, stats['invalid_rd_operations']) + self.assertEqual(self.invalid_wr_ops, stats['invalid_wr_operations']) + self.assertEqual(self.account_invalid, stats['account_invalid']) + self.assertEqual(self.account_failed, stats['account_failed']) + self.assertEqual(self.total_wr_merged, stats['wr_merged']) + + # Check that there's exactly one interval with the length we defined + self.assertEqual(1, len(stats['timed_stats'])) + timed_stats = stats['timed_stats'][0] + self.assertEqual(interval_length, timed_stats['interval_length']) + + total_rd_latency = self.accounted_latency(read = True) + if (total_rd_latency != 0): + self.assertEqual(total_rd_latency, stats['rd_total_time_ns']) + self.assertEqual(op_latency, timed_stats['min_rd_latency_ns']) + self.assertEqual(op_latency, timed_stats['max_rd_latency_ns']) + self.assertEqual(op_latency, timed_stats['avg_rd_latency_ns']) + self.assertLess(0, timed_stats['avg_rd_queue_depth']) + else: + self.assertEqual(0, stats['rd_total_time_ns']) + self.assertEqual(0, timed_stats['min_rd_latency_ns']) + self.assertEqual(0, timed_stats['max_rd_latency_ns']) + self.assertEqual(0, timed_stats['avg_rd_latency_ns']) + self.assertEqual(0, timed_stats['avg_rd_queue_depth']) + + # min read latency <= avg read latency <= max read latency + self.assertLessEqual(timed_stats['min_rd_latency_ns'], + timed_stats['avg_rd_latency_ns']) + self.assertLessEqual(timed_stats['avg_rd_latency_ns'], + timed_stats['max_rd_latency_ns']) + + total_wr_latency = self.accounted_latency(write = True) + if (total_wr_latency != 0): + self.assertEqual(total_wr_latency, stats['wr_total_time_ns']) + self.assertEqual(op_latency, timed_stats['min_wr_latency_ns']) + self.assertEqual(op_latency, timed_stats['max_wr_latency_ns']) + self.assertEqual(op_latency, timed_stats['avg_wr_latency_ns']) + self.assertLess(0, timed_stats['avg_wr_queue_depth']) + else: + self.assertEqual(0, stats['wr_total_time_ns']) + self.assertEqual(0, timed_stats['min_wr_latency_ns']) + self.assertEqual(0, timed_stats['max_wr_latency_ns']) + self.assertEqual(0, timed_stats['avg_wr_latency_ns']) + self.assertEqual(0, timed_stats['avg_wr_queue_depth']) + + # min write latency <= avg write latency <= max write latency + self.assertLessEqual(timed_stats['min_wr_latency_ns'], + timed_stats['avg_wr_latency_ns']) + self.assertLessEqual(timed_stats['avg_wr_latency_ns'], + timed_stats['max_wr_latency_ns']) + + total_flush_latency = self.accounted_latency(flush = True) + if (total_flush_latency != 0): + self.assertEqual(total_flush_latency, stats['flush_total_time_ns']) + self.assertEqual(op_latency, timed_stats['min_flush_latency_ns']) + self.assertEqual(op_latency, timed_stats['max_flush_latency_ns']) + self.assertEqual(op_latency, timed_stats['avg_flush_latency_ns']) + else: + self.assertEqual(0, stats['flush_total_time_ns']) + self.assertEqual(0, timed_stats['min_flush_latency_ns']) + self.assertEqual(0, timed_stats['max_flush_latency_ns']) + self.assertEqual(0, timed_stats['avg_flush_latency_ns']) + + # min flush latency <= avg flush latency <= max flush latency + self.assertLessEqual(timed_stats['min_flush_latency_ns'], + timed_stats['avg_flush_latency_ns']) + self.assertLessEqual(timed_stats['avg_flush_latency_ns'], + timed_stats['max_flush_latency_ns']) + + # idle_time_ns must be > 0 if we have performed any operation + if (self.accounted_ops(read = True, write = True, flush = True) != 0): + self.assertLess(0, stats['idle_time_ns']) + else: + self.assertFalse(stats.has_key('idle_time_ns')) + + # This test does not alter these, so they must be all 0 + self.assertEqual(0, stats['rd_merged']) + self.assertEqual(0, stats['failed_flush_operations']) + self.assertEqual(0, stats['invalid_flush_operations']) + + def do_test_stats(self, rd_size = 0, rd_ops = 0, wr_size = 0, wr_ops = 0, + flush_ops = 0, invalid_rd_ops = 0, invalid_wr_ops = 0, + failed_rd_ops = 0, failed_wr_ops = 0, wr_merged = 0): + # The 'ops' list will contain all the requested I/O operations + ops = [] + for i in range(rd_ops): + ops.append("aio_read %d %d" % (i * rd_size, rd_size)) + + for i in range(wr_ops): + ops.append("aio_write %d %d" % (i * wr_size, wr_size)) + + for i in range(flush_ops): + ops.append("aio_flush") + + highest_offset = wr_ops * wr_size + + # Two types of invalid operations: unaligned length and unaligned offset + for i in range(invalid_rd_ops / 2): + ops.append("aio_read 0 511") + + for i in range(invalid_rd_ops / 2, invalid_rd_ops): + ops.append("aio_read 13 512") + + for i in range(invalid_wr_ops / 2): + ops.append("aio_write 0 511") + + for i in range(invalid_wr_ops / 2, invalid_wr_ops): + ops.append("aio_write 13 512") + + for i in range(failed_rd_ops): + ops.append("aio_read %d 512" % bad_offset) + + for i in range(failed_wr_ops): + ops.append("aio_write %d 512" % bad_offset) + + if failed_wr_ops > 0: + highest_offset = max(highest_offset, bad_offset + 512) + + for i in range(wr_merged): + first = i * wr_size * 2 + second = first + wr_size + ops.append("multiwrite %d %d ; %d %d" % + (first, wr_size, second, wr_size)) + + highest_offset = max(highest_offset, wr_merged * wr_size * 2) + + # Now perform all operations + for op in ops: + self.vm.hmp_qemu_io("drive0", op) + + # Update the expected totals + self.total_rd_bytes += rd_ops * rd_size + self.total_rd_ops += rd_ops + self.total_wr_bytes += wr_ops * wr_size + self.total_wr_ops += wr_ops + self.total_wr_merged += wr_merged + self.total_flush_ops += flush_ops + self.invalid_rd_ops += invalid_rd_ops + self.invalid_wr_ops += invalid_wr_ops + self.failed_rd_ops += failed_rd_ops + self.failed_wr_ops += failed_wr_ops + + self.wr_highest_offset = max(self.wr_highest_offset, highest_offset) + + # Advance the clock so idle_time_ns has a meaningful value + self.vm.qtest("clock_step %d" % nsec_per_sec) + + # And check that the actual statistics match the expected ones + self.check_values() + + def test_read_only(self): + test_values = [[512, 1], + [65536, 1], + [512, 12], + [65536, 12]] + for i in test_values: + self.do_test_stats(rd_size = i[0], rd_ops = i[1]) + + def test_write_only(self): + test_values = [[512, 1], + [65536, 1], + [512, 12], + [65536, 12]] + for i in test_values: + self.do_test_stats(wr_size = i[0], wr_ops = i[1]) + + def test_invalid(self): + self.do_test_stats(invalid_rd_ops = 7) + self.do_test_stats(invalid_wr_ops = 3) + self.do_test_stats(invalid_rd_ops = 4, invalid_wr_ops = 5) + + def test_failed(self): + self.do_test_stats(failed_rd_ops = 8) + self.do_test_stats(failed_wr_ops = 6) + self.do_test_stats(failed_rd_ops = 5, failed_wr_ops = 12) + + def test_flush(self): + self.do_test_stats(flush_ops = 8) + + def test_merged(self): + for i in range(5): + self.do_test_stats(wr_merged = i * 3) + + def test_all(self): + # rd_size, rd_ops, wr_size, wr_ops, flush_ops + # invalid_rd_ops, invalid_wr_ops, + # failed_rd_ops, failed_wr_ops + # wr_merged + test_values = [[512, 1, 512, 1, 1, 4, 7, 5, 2, 1], + [65536, 1, 2048, 12, 7, 7, 5, 2, 5, 5], + [32768, 9, 8192, 1, 4, 3, 2, 4, 6, 4], + [16384, 11, 3584, 16, 9, 8, 6, 7, 3, 4]] + for i in test_values: + self.do_test_stats(*i) + + def test_no_op(self): + # All values must be sane before doing any I/O + self.check_values() + + +class BlockDeviceStatsTestAccountInvalid(BlockDeviceStatsTestCase): + account_invalid = True + account_failed = False + +class BlockDeviceStatsTestAccountFailed(BlockDeviceStatsTestCase): + account_invalid = False + account_failed = True + +class BlockDeviceStatsTestAccountBoth(BlockDeviceStatsTestCase): + account_invalid = True + account_failed = True + +class BlockDeviceStatsTestCoroutine(BlockDeviceStatsTestCase): + test_img = "null-co://" + +if __name__ == '__main__': + iotests.main(supported_fmts=["raw"]) diff --git a/tests/qemu-iotests/136.out b/tests/qemu-iotests/136.out new file mode 100644 index 0000000000..0a5e9583a4 --- /dev/null +++ b/tests/qemu-iotests/136.out @@ -0,0 +1,5 @@ +........................................ +---------------------------------------------------------------------- +Ran 40 tests + +OK diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group index c69265decd..5a0880893a 100644 --- a/tests/qemu-iotests/group +++ b/tests/qemu-iotests/group @@ -136,6 +136,7 @@ 132 rw auto quick 134 rw auto quick 135 rw auto +136 rw auto 137 rw auto 138 rw auto quick 139 rw auto quick diff --git a/tests/test-blockjob-txn.c b/tests/test-blockjob-txn.c new file mode 100644 index 0000000000..34747e924d --- /dev/null +++ b/tests/test-blockjob-txn.c @@ -0,0 +1,250 @@ +/* + * Blockjob transactions tests + * + * Copyright Red Hat, Inc. 2015 + * + * Authors: + * Stefan Hajnoczi <stefanha@redhat.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + */ + +#include <glib.h> +#include "qapi/error.h" +#include "qemu/main-loop.h" +#include "block/blockjob.h" + +typedef struct { + BlockJob common; + unsigned int iterations; + bool use_timer; + int rc; + int *result; +} TestBlockJob; + +static const BlockJobDriver test_block_job_driver = { + .instance_size = sizeof(TestBlockJob), +}; + +static void test_block_job_complete(BlockJob *job, void *opaque) +{ + BlockDriverState *bs = job->bs; + int rc = (intptr_t)opaque; + + if (block_job_is_cancelled(job)) { + rc = -ECANCELED; + } + + block_job_completed(job, rc); + bdrv_unref(bs); +} + +static void coroutine_fn test_block_job_run(void *opaque) +{ + TestBlockJob *s = opaque; + BlockJob *job = &s->common; + + while (s->iterations--) { + if (s->use_timer) { + block_job_sleep_ns(job, QEMU_CLOCK_REALTIME, 0); + } else { + block_job_yield(job); + } + + if (block_job_is_cancelled(job)) { + break; + } + } + + block_job_defer_to_main_loop(job, test_block_job_complete, + (void *)(intptr_t)s->rc); +} + +typedef struct { + TestBlockJob *job; + int *result; +} TestBlockJobCBData; + +static void test_block_job_cb(void *opaque, int ret) +{ + TestBlockJobCBData *data = opaque; + if (!ret && block_job_is_cancelled(&data->job->common)) { + ret = -ECANCELED; + } + *data->result = ret; + g_free(data); +} + +/* Create a block job that completes with a given return code after a given + * number of event loop iterations. The return code is stored in the given + * result pointer. + * + * The event loop iterations can either be handled automatically with a 0 delay + * timer, or they can be stepped manually by entering the coroutine. + */ +static BlockJob *test_block_job_start(unsigned int iterations, + bool use_timer, + int rc, int *result) +{ + BlockDriverState *bs; + TestBlockJob *s; + TestBlockJobCBData *data; + + data = g_new0(TestBlockJobCBData, 1); + bs = bdrv_new(); + s = block_job_create(&test_block_job_driver, bs, 0, test_block_job_cb, + data, &error_abort); + s->iterations = iterations; + s->use_timer = use_timer; + s->rc = rc; + s->result = result; + s->common.co = qemu_coroutine_create(test_block_job_run); + data->job = s; + data->result = result; + qemu_coroutine_enter(s->common.co, s); + return &s->common; +} + +static void test_single_job(int expected) +{ + BlockJob *job; + BlockJobTxn *txn; + int result = -EINPROGRESS; + + txn = block_job_txn_new(); + job = test_block_job_start(1, true, expected, &result); + block_job_txn_add_job(txn, job); + + if (expected == -ECANCELED) { + block_job_cancel(job); + } + + while (result == -EINPROGRESS) { + aio_poll(qemu_get_aio_context(), true); + } + g_assert_cmpint(result, ==, expected); + + block_job_txn_unref(txn); +} + +static void test_single_job_success(void) +{ + test_single_job(0); +} + +static void test_single_job_failure(void) +{ + test_single_job(-EIO); +} + +static void test_single_job_cancel(void) +{ + test_single_job(-ECANCELED); +} + +static void test_pair_jobs(int expected1, int expected2) +{ + BlockJob *job1; + BlockJob *job2; + BlockJobTxn *txn; + int result1 = -EINPROGRESS; + int result2 = -EINPROGRESS; + + txn = block_job_txn_new(); + job1 = test_block_job_start(1, true, expected1, &result1); + block_job_txn_add_job(txn, job1); + job2 = test_block_job_start(2, true, expected2, &result2); + block_job_txn_add_job(txn, job2); + + if (expected1 == -ECANCELED) { + block_job_cancel(job1); + } + if (expected2 == -ECANCELED) { + block_job_cancel(job2); + } + + while (result1 == -EINPROGRESS || result2 == -EINPROGRESS) { + aio_poll(qemu_get_aio_context(), true); + } + + /* Failure or cancellation of one job cancels the other job */ + if (expected1 != 0) { + expected2 = -ECANCELED; + } else if (expected2 != 0) { + expected1 = -ECANCELED; + } + + g_assert_cmpint(result1, ==, expected1); + g_assert_cmpint(result2, ==, expected2); + + block_job_txn_unref(txn); +} + +static void test_pair_jobs_success(void) +{ + test_pair_jobs(0, 0); +} + +static void test_pair_jobs_failure(void) +{ + /* Test both orderings. The two jobs run for a different number of + * iterations so the code path is different depending on which job fails + * first. + */ + test_pair_jobs(-EIO, 0); + test_pair_jobs(0, -EIO); +} + +static void test_pair_jobs_cancel(void) +{ + test_pair_jobs(-ECANCELED, 0); + test_pair_jobs(0, -ECANCELED); +} + +static void test_pair_jobs_fail_cancel_race(void) +{ + BlockJob *job1; + BlockJob *job2; + BlockJobTxn *txn; + int result1 = -EINPROGRESS; + int result2 = -EINPROGRESS; + + txn = block_job_txn_new(); + job1 = test_block_job_start(1, true, -ECANCELED, &result1); + block_job_txn_add_job(txn, job1); + job2 = test_block_job_start(2, false, 0, &result2); + block_job_txn_add_job(txn, job2); + + block_job_cancel(job1); + + /* Now make job2 finish before the main loop kicks jobs. This simulates + * the race between a pending kick and another job completing. + */ + block_job_enter(job2); + block_job_enter(job2); + + while (result1 == -EINPROGRESS || result2 == -EINPROGRESS) { + aio_poll(qemu_get_aio_context(), true); + } + + g_assert_cmpint(result1, ==, -ECANCELED); + g_assert_cmpint(result2, ==, -ECANCELED); + + block_job_txn_unref(txn); +} + +int main(int argc, char **argv) +{ + qemu_init_main_loop(&error_abort); + + g_test_init(&argc, &argv, NULL); + g_test_add_func("/single/success", test_single_job_success); + g_test_add_func("/single/failure", test_single_job_failure); + g_test_add_func("/single/cancel", test_single_job_cancel); + g_test_add_func("/pair/success", test_pair_jobs_success); + g_test_add_func("/pair/failure", test_pair_jobs_failure); + g_test_add_func("/pair/cancel", test_pair_jobs_cancel); + g_test_add_func("/pair/fail-cancel-race", test_pair_jobs_fail_cancel_race); + return g_test_run(); +} diff --git a/tests/test-timed-average.c b/tests/test-timed-average.c new file mode 100644 index 0000000000..a049799b80 --- /dev/null +++ b/tests/test-timed-average.c @@ -0,0 +1,90 @@ +/* + * Timed average computation tests + * + * Copyright Nodalink, EURL. 2014 + * + * Authors: + * Benoît Canet <benoit.canet@nodalink.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + */ + +#include <glib.h> +#include <unistd.h> + +#include "qemu/timed-average.h" + +/* This is the clock for QEMU_CLOCK_VIRTUAL */ +static int64_t my_clock_value; + +int64_t cpu_get_clock(void) +{ + return my_clock_value; +} + +static void account(TimedAverage *ta) +{ + timed_average_account(ta, 1); + timed_average_account(ta, 5); + timed_average_account(ta, 2); + timed_average_account(ta, 4); + timed_average_account(ta, 3); +} + +static void test_average(void) +{ + TimedAverage ta; + uint64_t result; + int i; + + /* we will compute some average on a period of 1 second */ + timed_average_init(&ta, QEMU_CLOCK_VIRTUAL, NANOSECONDS_PER_SECOND); + + result = timed_average_min(&ta); + g_assert(result == 0); + result = timed_average_avg(&ta); + g_assert(result == 0); + result = timed_average_max(&ta); + g_assert(result == 0); + + for (i = 0; i < 100; i++) { + account(&ta); + result = timed_average_min(&ta); + g_assert(result == 1); + result = timed_average_avg(&ta); + g_assert(result == 3); + result = timed_average_max(&ta); + g_assert(result == 5); + my_clock_value += NANOSECONDS_PER_SECOND / 10; + } + + my_clock_value += NANOSECONDS_PER_SECOND * 100; + + result = timed_average_min(&ta); + g_assert(result == 0); + result = timed_average_avg(&ta); + g_assert(result == 0); + result = timed_average_max(&ta); + g_assert(result == 0); + + for (i = 0; i < 100; i++) { + account(&ta); + result = timed_average_min(&ta); + g_assert(result == 1); + result = timed_average_avg(&ta); + g_assert(result == 3); + result = timed_average_max(&ta); + g_assert(result == 5); + my_clock_value += NANOSECONDS_PER_SECOND / 10; + } +} + +int main(int argc, char **argv) +{ + /* tests in the same order as the header function declarations */ + g_test_init(&argc, &argv, NULL); + g_test_add_func("/timed-average/average", test_average); + return g_test_run(); +} + diff --git a/util/Makefile.objs b/util/Makefile.objs index d7cc39907f..89dd80ef86 100644 --- a/util/Makefile.objs +++ b/util/Makefile.objs @@ -29,3 +29,4 @@ util-obj-y += qemu-coroutine.o qemu-coroutine-lock.o qemu-coroutine-io.o util-obj-y += qemu-coroutine-sleep.o util-obj-y += coroutine-$(CONFIG_COROUTINE_BACKEND).o util-obj-y += buffer.o +util-obj-y += timed-average.o diff --git a/util/timed-average.c b/util/timed-average.c new file mode 100644 index 0000000000..a2dfb4834d --- /dev/null +++ b/util/timed-average.c @@ -0,0 +1,231 @@ +/* + * QEMU timed average computation + * + * Copyright (C) Nodalink, EURL. 2014 + * Copyright (C) Igalia, S.L. 2015 + * + * Authors: + * Benoît Canet <benoit.canet@nodalink.com> + * Alberto Garcia <berto@igalia.com> + * + * This program is free sofware: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Sofware Foundation, either version 2 of the License, or + * (at your option) version 3 or any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <string.h> + +#include "qemu/timed-average.h" + +/* This module computes an average of a set of values within a time + * window. + * + * Algorithm: + * + * - Create two windows with a certain expiration period, and + * offsetted by period / 2. + * - Each time you want to account a new value, do it in both windows. + * - The minimum / maximum / average values are always returned from + * the oldest window. + * + * Example: + * + * t=0 |t=0.5 |t=1 |t=1.5 |t=2 + * wnd0: [0,0.5)|wnd0: [0.5,1.5) | |wnd0: [1.5,2.5) | + * wnd1: [0,1) | |wnd1: [1,2) | | + * + * Values are returned from: + * + * wnd0---------|wnd1------------|wnd0---------|wnd1-------------| + */ + +/* Update the expiration of a time window + * + * @w: the window used + * @now: the current time in nanoseconds + * @period: the expiration period in nanoseconds + */ +static void update_expiration(TimedAverageWindow *w, int64_t now, + int64_t period) +{ + /* time elapsed since the last theoretical expiration */ + int64_t elapsed = (now - w->expiration) % period; + /* time remaininging until the next expiration */ + int64_t remaining = period - elapsed; + /* compute expiration */ + w->expiration = now + remaining; +} + +/* Reset a window + * + * @w: the window to reset + */ +static void window_reset(TimedAverageWindow *w) +{ + w->min = UINT64_MAX; + w->max = 0; + w->sum = 0; + w->count = 0; +} + +/* Get the current window (that is, the one with the earliest + * expiration time). + * + * @ta: the TimedAverage structure + * @ret: a pointer to the current window + */ +static TimedAverageWindow *current_window(TimedAverage *ta) +{ + return &ta->windows[ta->current]; +} + +/* Initialize a TimedAverage structure + * + * @ta: the TimedAverage structure + * @clock_type: the type of clock to use + * @period: the time window period in nanoseconds + */ +void timed_average_init(TimedAverage *ta, QEMUClockType clock_type, + uint64_t period) +{ + int64_t now = qemu_clock_get_ns(clock_type); + + /* Returned values are from the oldest window, so they belong to + * the interval [ta->period/2,ta->period). By adjusting the + * requested period by 4/3, we guarantee that they're in the + * interval [2/3 period,4/3 period), closer to the requested + * period on average */ + ta->period = (uint64_t) period * 4 / 3; + ta->clock_type = clock_type; + ta->current = 0; + + window_reset(&ta->windows[0]); + window_reset(&ta->windows[1]); + + /* Both windows are offsetted by half a period */ + ta->windows[0].expiration = now + ta->period / 2; + ta->windows[1].expiration = now + ta->period; +} + +/* Check if the time windows have expired, updating their counters and + * expiration time if that's the case. + * + * @ta: the TimedAverage structure + * @elapsed: if non-NULL, the elapsed time (in ns) within the current + * window will be stored here + */ +static void check_expirations(TimedAverage *ta, uint64_t *elapsed) +{ + int64_t now = qemu_clock_get_ns(ta->clock_type); + int i; + + assert(ta->period != 0); + + /* Check if the windows have expired */ + for (i = 0; i < 2; i++) { + TimedAverageWindow *w = &ta->windows[i]; + if (w->expiration <= now) { + window_reset(w); + update_expiration(w, now, ta->period); + } + } + + /* Make ta->current point to the oldest window */ + if (ta->windows[0].expiration < ta->windows[1].expiration) { + ta->current = 0; + } else { + ta->current = 1; + } + + /* Calculate the elapsed time within the current window */ + if (elapsed) { + int64_t remaining = ta->windows[ta->current].expiration - now; + *elapsed = ta->period - remaining; + } +} + +/* Account a value + * + * @ta: the TimedAverage structure + * @value: the value to account + */ +void timed_average_account(TimedAverage *ta, uint64_t value) +{ + int i; + check_expirations(ta, NULL); + + /* Do the accounting in both windows at the same time */ + for (i = 0; i < 2; i++) { + TimedAverageWindow *w = &ta->windows[i]; + + w->sum += value; + w->count++; + + if (value < w->min) { + w->min = value; + } + + if (value > w->max) { + w->max = value; + } + } +} + +/* Get the minimum value + * + * @ta: the TimedAverage structure + * @ret: the minimum value + */ +uint64_t timed_average_min(TimedAverage *ta) +{ + TimedAverageWindow *w; + check_expirations(ta, NULL); + w = current_window(ta); + return w->min < UINT64_MAX ? w->min : 0; +} + +/* Get the average value + * + * @ta: the TimedAverage structure + * @ret: the average value + */ +uint64_t timed_average_avg(TimedAverage *ta) +{ + TimedAverageWindow *w; + check_expirations(ta, NULL); + w = current_window(ta); + return w->count > 0 ? w->sum / w->count : 0; +} + +/* Get the maximum value + * + * @ta: the TimedAverage structure + * @ret: the maximum value + */ +uint64_t timed_average_max(TimedAverage *ta) +{ + check_expirations(ta, NULL); + return current_window(ta)->max; +} + +/* Get the sum of all accounted values + * @ta: the TimedAverage structure + * @elapsed: if non-NULL, the elapsed time (in ns) will be stored here + * @ret: the sum of all accounted values + */ +uint64_t timed_average_sum(TimedAverage *ta, uint64_t *elapsed) +{ + TimedAverageWindow *w; + check_expirations(ta, elapsed); + w = current_window(ta); + return w->sum; +} |