diff options
36 files changed, 2417 insertions, 319 deletions
diff --git a/block-migration.c b/block-migration.c index 6acf3e1682..9ac7de6d78 100644 --- a/block-migration.c +++ b/block-migration.c @@ -23,7 +23,8 @@ #include "sysemu/blockdev.h" #include <assert.h> -#define BLOCK_SIZE (BDRV_SECTORS_PER_DIRTY_CHUNK << BDRV_SECTOR_BITS) +#define BLOCK_SIZE (1 << 20) +#define BDRV_SECTORS_PER_DIRTY_CHUNK (BLOCK_SIZE >> BDRV_SECTOR_BITS) #define BLK_MIG_FLAG_DEVICE_BLOCK 0x01 #define BLK_MIG_FLAG_EOS 0x02 @@ -254,7 +255,7 @@ static void set_dirty_tracking(int enable) BlkMigDevState *bmds; QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { - bdrv_set_dirty_tracking(bmds->bs, enable); + bdrv_set_dirty_tracking(bmds->bs, enable ? BLOCK_SIZE : 0); } } @@ -478,7 +479,7 @@ static int64_t get_remaining_dirty(void) dirty += bdrv_get_dirty_count(bmds->bs); } - return dirty * BLOCK_SIZE; + return dirty << BDRV_SECTOR_BITS; } static void blk_mig_cleanup(void) @@ -1286,7 +1286,6 @@ static void bdrv_move_feature_fields(BlockDriverState *bs_dest, bs_dest->iostatus = bs_src->iostatus; /* dirty bitmap */ - bs_dest->dirty_count = bs_src->dirty_count; bs_dest->dirty_bitmap = bs_src->dirty_bitmap; /* job */ @@ -1674,10 +1673,10 @@ static void tracked_request_begin(BdrvTrackedRequest *req, /** * Round a region to cluster boundaries */ -static void round_to_clusters(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - int64_t *cluster_sector_num, - int *cluster_nb_sectors) +void bdrv_round_to_clusters(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + int64_t *cluster_sector_num, + int *cluster_nb_sectors) { BlockDriverInfo bdi; @@ -1719,8 +1718,8 @@ static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs, * CoR read and write operations are atomic and guest writes cannot * interleave between them. */ - round_to_clusters(bs, sector_num, nb_sectors, - &cluster_sector_num, &cluster_nb_sectors); + bdrv_round_to_clusters(bs, sector_num, nb_sectors, + &cluster_sector_num, &cluster_nb_sectors); do { retry = false; @@ -2035,36 +2034,6 @@ int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num, return ret; } -#define BITS_PER_LONG (sizeof(unsigned long) * 8) - -static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, int dirty) -{ - int64_t start, end; - unsigned long val, idx, bit; - - start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK; - end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK; - - for (; start <= end; start++) { - idx = start / BITS_PER_LONG; - bit = start % BITS_PER_LONG; - val = bs->dirty_bitmap[idx]; - if (dirty) { - if (!(val & (1UL << bit))) { - bs->dirty_count++; - val |= 1UL << bit; - } - } else { - if (val & (1UL << bit)) { - bs->dirty_count--; - val &= ~(1UL << bit); - } - } - bs->dirty_bitmap[idx] = val; - } -} - /* Return < 0 if error. Important errors are: -EIO generic I/O error (may happen for all errors) -ENOMEDIUM No media inserted. @@ -2216,8 +2185,8 @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, /* Cover entire cluster so no additional backing file I/O is required when * allocating cluster in the image file. */ - round_to_clusters(bs, sector_num, nb_sectors, - &cluster_sector_num, &cluster_nb_sectors); + bdrv_round_to_clusters(bs, sector_num, nb_sectors, + &cluster_sector_num, &cluster_nb_sectors); trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, cluster_sector_num, cluster_nb_sectors); @@ -2863,8 +2832,9 @@ BlockInfo *bdrv_query_info(BlockDriverState *bs) if (bs->dirty_bitmap) { info->has_dirty = true; info->dirty = g_malloc0(sizeof(*info->dirty)); - info->dirty->count = bdrv_get_dirty_count(bs) * - BDRV_SECTORS_PER_DIRTY_CHUNK * BDRV_SECTOR_SIZE; + info->dirty->count = bdrv_get_dirty_count(bs) * BDRV_SECTOR_SIZE; + info->dirty->granularity = + ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bs->dirty_bitmap)); } if (bs->drv) { @@ -4173,7 +4143,7 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, } if (bs->dirty_bitmap) { - set_dirty_bitmap(bs, sector_num, nb_sectors, 0); + bdrv_reset_dirty(bs, sector_num, nb_sectors); } if (bs->drv->bdrv_co_discard) { @@ -4331,22 +4301,20 @@ bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov) return true; } -void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable) +void bdrv_set_dirty_tracking(BlockDriverState *bs, int granularity) { int64_t bitmap_size; - bs->dirty_count = 0; - if (enable) { - if (!bs->dirty_bitmap) { - bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) + - BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG - 1; - bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG; + assert((granularity & (granularity - 1)) == 0); - bs->dirty_bitmap = g_new0(unsigned long, bitmap_size); - } + if (granularity) { + granularity >>= BDRV_SECTOR_BITS; + assert(!bs->dirty_bitmap); + bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS); + bs->dirty_bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1); } else { if (bs->dirty_bitmap) { - g_free(bs->dirty_bitmap); + hbitmap_free(bs->dirty_bitmap); bs->dirty_bitmap = NULL; } } @@ -4354,67 +4322,37 @@ void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable) int bdrv_get_dirty(BlockDriverState *bs, int64_t sector) { - int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK; - - if (bs->dirty_bitmap && - (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) { - return !!(bs->dirty_bitmap[chunk / BITS_PER_LONG] & - (1UL << (chunk % BITS_PER_LONG))); + if (bs->dirty_bitmap) { + return hbitmap_get(bs->dirty_bitmap, sector); } else { return 0; } } -int64_t bdrv_get_next_dirty(BlockDriverState *bs, int64_t sector) +void bdrv_dirty_iter_init(BlockDriverState *bs, HBitmapIter *hbi) { - int64_t chunk; - int bit, elem; - - /* Avoid an infinite loop. */ - assert(bs->dirty_count > 0); - - sector = (sector | (BDRV_SECTORS_PER_DIRTY_CHUNK - 1)) + 1; - chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK; - - QEMU_BUILD_BUG_ON(sizeof(bs->dirty_bitmap[0]) * 8 != BITS_PER_LONG); - elem = chunk / BITS_PER_LONG; - bit = chunk % BITS_PER_LONG; - for (;;) { - if (sector >= bs->total_sectors) { - sector = 0; - bit = elem = 0; - } - if (bit == 0 && bs->dirty_bitmap[elem] == 0) { - sector += BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG; - elem++; - } else { - if (bs->dirty_bitmap[elem] & (1UL << bit)) { - return sector; - } - sector += BDRV_SECTORS_PER_DIRTY_CHUNK; - if (++bit == BITS_PER_LONG) { - bit = 0; - elem++; - } - } - } + hbitmap_iter_init(hbi, bs->dirty_bitmap, 0); } void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors) { - set_dirty_bitmap(bs, cur_sector, nr_sectors, 1); + hbitmap_set(bs->dirty_bitmap, cur_sector, nr_sectors); } void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors) { - set_dirty_bitmap(bs, cur_sector, nr_sectors, 0); + hbitmap_reset(bs->dirty_bitmap, cur_sector, nr_sectors); } int64_t bdrv_get_dirty_count(BlockDriverState *bs) { - return bs->dirty_count; + if (bs->dirty_bitmap) { + return hbitmap_count(bs->dirty_bitmap); + } else { + return 0; + } } void bdrv_set_in_use(BlockDriverState *bs, int in_use) diff --git a/block/bochs.c b/block/bochs.c index 1b1d9cdbe5..37375834e9 100644 --- a/block/bochs.c +++ b/block/bochs.c @@ -126,7 +126,7 @@ static int bochs_open(BlockDriverState *bs, int flags) strcmp(bochs.subtype, GROWING_TYPE) || ((le32_to_cpu(bochs.version) != HEADER_VERSION) && (le32_to_cpu(bochs.version) != HEADER_V1))) { - goto fail; + return -EMEDIUMTYPE; } if (le32_to_cpu(bochs.version) == HEADER_V1) { diff --git a/block/cow.c b/block/cow.c index a33ce950d4..4baf9042fe 100644 --- a/block/cow.c +++ b/block/cow.c @@ -73,7 +73,7 @@ static int cow_open(BlockDriverState *bs, int flags) } if (be32_to_cpu(cow_header.magic) != COW_MAGIC) { - ret = -EINVAL; + ret = -EMEDIUMTYPE; goto fail; } diff --git a/block/mirror.c b/block/mirror.c index 6180aa30e5..a62ad86c28 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -15,17 +15,17 @@ #include "block/blockjob.h" #include "block/block_int.h" #include "qemu/ratelimit.h" +#include "qemu/bitmap.h" -enum { - /* - * Size of data buffer for populating the image file. This should be large - * enough to process multiple clusters in a single call, so that populating - * contiguous regions of the image is efficient. - */ - BLOCK_SIZE = 512 * BDRV_SECTORS_PER_DIRTY_CHUNK, /* in bytes */ -}; +#define SLICE_TIME 100000000ULL /* ns */ +#define MAX_IN_FLIGHT 16 -#define SLICE_TIME 100000000ULL /* ns */ +/* The mirroring buffer is a list of granularity-sized chunks. + * Free chunks are organized in a list. + */ +typedef struct MirrorBuffer { + QSIMPLEQ_ENTRY(MirrorBuffer) next; +} MirrorBuffer; typedef struct MirrorBlockJob { BlockJob common; @@ -36,9 +36,26 @@ typedef struct MirrorBlockJob { bool synced; bool should_complete; int64_t sector_num; + int64_t granularity; + size_t buf_size; + unsigned long *cow_bitmap; + HBitmapIter hbi; uint8_t *buf; + QSIMPLEQ_HEAD(, MirrorBuffer) buf_free; + int buf_free_count; + + unsigned long *in_flight_bitmap; + int in_flight; + int ret; } MirrorBlockJob; +typedef struct MirrorOp { + MirrorBlockJob *s; + QEMUIOVector qiov; + int64_t sector_num; + int nb_sectors; +} MirrorOp; + static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read, int error) { @@ -52,51 +69,234 @@ static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read, } } -static int coroutine_fn mirror_iteration(MirrorBlockJob *s, - BlockErrorAction *p_action) +static void mirror_iteration_done(MirrorOp *op, int ret) { - BlockDriverState *source = s->common.bs; - BlockDriverState *target = s->target; - QEMUIOVector qiov; - int ret, nb_sectors; - int64_t end; - struct iovec iov; + MirrorBlockJob *s = op->s; + struct iovec *iov; + int64_t chunk_num; + int i, nb_chunks, sectors_per_chunk; + + trace_mirror_iteration_done(s, op->sector_num, op->nb_sectors, ret); + + s->in_flight--; + iov = op->qiov.iov; + for (i = 0; i < op->qiov.niov; i++) { + MirrorBuffer *buf = (MirrorBuffer *) iov[i].iov_base; + QSIMPLEQ_INSERT_TAIL(&s->buf_free, buf, next); + s->buf_free_count++; + } - end = s->common.len >> BDRV_SECTOR_BITS; - s->sector_num = bdrv_get_next_dirty(source, s->sector_num); - nb_sectors = MIN(BDRV_SECTORS_PER_DIRTY_CHUNK, end - s->sector_num); - bdrv_reset_dirty(source, s->sector_num, nb_sectors); + sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS; + chunk_num = op->sector_num / sectors_per_chunk; + nb_chunks = op->nb_sectors / sectors_per_chunk; + bitmap_clear(s->in_flight_bitmap, chunk_num, nb_chunks); + if (s->cow_bitmap && ret >= 0) { + bitmap_set(s->cow_bitmap, chunk_num, nb_chunks); + } - /* Copy the dirty cluster. */ - iov.iov_base = s->buf; - iov.iov_len = nb_sectors * 512; - qemu_iovec_init_external(&qiov, &iov, 1); + g_slice_free(MirrorOp, op); + qemu_coroutine_enter(s->common.co, NULL); +} - trace_mirror_one_iteration(s, s->sector_num, nb_sectors); - ret = bdrv_co_readv(source, s->sector_num, nb_sectors, &qiov); +static void mirror_write_complete(void *opaque, int ret) +{ + MirrorOp *op = opaque; + MirrorBlockJob *s = op->s; if (ret < 0) { - *p_action = mirror_error_action(s, true, -ret); - goto fail; + BlockDriverState *source = s->common.bs; + BlockErrorAction action; + + bdrv_set_dirty(source, op->sector_num, op->nb_sectors); + action = mirror_error_action(s, false, -ret); + if (action == BDRV_ACTION_REPORT && s->ret >= 0) { + s->ret = ret; + } } - ret = bdrv_co_writev(target, s->sector_num, nb_sectors, &qiov); + mirror_iteration_done(op, ret); +} + +static void mirror_read_complete(void *opaque, int ret) +{ + MirrorOp *op = opaque; + MirrorBlockJob *s = op->s; if (ret < 0) { - *p_action = mirror_error_action(s, false, -ret); - s->synced = false; - goto fail; + BlockDriverState *source = s->common.bs; + BlockErrorAction action; + + bdrv_set_dirty(source, op->sector_num, op->nb_sectors); + action = mirror_error_action(s, true, -ret); + if (action == BDRV_ACTION_REPORT && s->ret >= 0) { + s->ret = ret; + } + + mirror_iteration_done(op, ret); + return; + } + bdrv_aio_writev(s->target, op->sector_num, &op->qiov, op->nb_sectors, + mirror_write_complete, op); +} + +static void coroutine_fn mirror_iteration(MirrorBlockJob *s) +{ + BlockDriverState *source = s->common.bs; + int nb_sectors, sectors_per_chunk, nb_chunks; + int64_t end, sector_num, next_chunk, next_sector, hbitmap_next_sector; + MirrorOp *op; + + s->sector_num = hbitmap_iter_next(&s->hbi); + if (s->sector_num < 0) { + bdrv_dirty_iter_init(source, &s->hbi); + s->sector_num = hbitmap_iter_next(&s->hbi); + trace_mirror_restart_iter(s, bdrv_get_dirty_count(source)); + assert(s->sector_num >= 0); + } + + hbitmap_next_sector = s->sector_num; + sector_num = s->sector_num; + sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS; + end = s->common.len >> BDRV_SECTOR_BITS; + + /* Extend the QEMUIOVector to include all adjacent blocks that will + * be copied in this operation. + * + * We have to do this if we have no backing file yet in the destination, + * and the cluster size is very large. Then we need to do COW ourselves. + * The first time a cluster is copied, copy it entirely. Note that, + * because both the granularity and the cluster size are powers of two, + * the number of sectors to copy cannot exceed one cluster. + * + * We also want to extend the QEMUIOVector to include more adjacent + * dirty blocks if possible, to limit the number of I/O operations and + * run efficiently even with a small granularity. + */ + nb_chunks = 0; + nb_sectors = 0; + next_sector = sector_num; + next_chunk = sector_num / sectors_per_chunk; + + /* Wait for I/O to this cluster (from a previous iteration) to be done. */ + while (test_bit(next_chunk, s->in_flight_bitmap)) { + trace_mirror_yield_in_flight(s, sector_num, s->in_flight); + qemu_coroutine_yield(); + } + + do { + int added_sectors, added_chunks; + + if (!bdrv_get_dirty(source, next_sector) || + test_bit(next_chunk, s->in_flight_bitmap)) { + assert(nb_sectors > 0); + break; + } + + added_sectors = sectors_per_chunk; + if (s->cow_bitmap && !test_bit(next_chunk, s->cow_bitmap)) { + bdrv_round_to_clusters(s->target, + next_sector, added_sectors, + &next_sector, &added_sectors); + + /* On the first iteration, the rounding may make us copy + * sectors before the first dirty one. + */ + if (next_sector < sector_num) { + assert(nb_sectors == 0); + sector_num = next_sector; + next_chunk = next_sector / sectors_per_chunk; + } + } + + added_sectors = MIN(added_sectors, end - (sector_num + nb_sectors)); + added_chunks = (added_sectors + sectors_per_chunk - 1) / sectors_per_chunk; + + /* When doing COW, it may happen that there is not enough space for + * a full cluster. Wait if that is the case. + */ + while (nb_chunks == 0 && s->buf_free_count < added_chunks) { + trace_mirror_yield_buf_busy(s, nb_chunks, s->in_flight); + qemu_coroutine_yield(); + } + if (s->buf_free_count < nb_chunks + added_chunks) { + trace_mirror_break_buf_busy(s, nb_chunks, s->in_flight); + break; + } + + /* We have enough free space to copy these sectors. */ + bitmap_set(s->in_flight_bitmap, next_chunk, added_chunks); + + nb_sectors += added_sectors; + nb_chunks += added_chunks; + next_sector += added_sectors; + next_chunk += added_chunks; + } while (next_sector < end); + + /* Allocate a MirrorOp that is used as an AIO callback. */ + op = g_slice_new(MirrorOp); + op->s = s; + op->sector_num = sector_num; + op->nb_sectors = nb_sectors; + + /* Now make a QEMUIOVector taking enough granularity-sized chunks + * from s->buf_free. + */ + qemu_iovec_init(&op->qiov, nb_chunks); + next_sector = sector_num; + while (nb_chunks-- > 0) { + MirrorBuffer *buf = QSIMPLEQ_FIRST(&s->buf_free); + QSIMPLEQ_REMOVE_HEAD(&s->buf_free, next); + s->buf_free_count--; + qemu_iovec_add(&op->qiov, buf, s->granularity); + + /* Advance the HBitmapIter in parallel, so that we do not examine + * the same sector twice. + */ + if (next_sector > hbitmap_next_sector && bdrv_get_dirty(source, next_sector)) { + hbitmap_next_sector = hbitmap_iter_next(&s->hbi); + } + + next_sector += sectors_per_chunk; } - return 0; -fail: - /* Try again later. */ - bdrv_set_dirty(source, s->sector_num, nb_sectors); - return ret; + bdrv_reset_dirty(source, sector_num, nb_sectors); + + /* Copy the dirty cluster. */ + s->in_flight++; + trace_mirror_one_iteration(s, sector_num, nb_sectors); + bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors, + mirror_read_complete, op); +} + +static void mirror_free_init(MirrorBlockJob *s) +{ + int granularity = s->granularity; + size_t buf_size = s->buf_size; + uint8_t *buf = s->buf; + + assert(s->buf_free_count == 0); + QSIMPLEQ_INIT(&s->buf_free); + while (buf_size != 0) { + MirrorBuffer *cur = (MirrorBuffer *)buf; + QSIMPLEQ_INSERT_TAIL(&s->buf_free, cur, next); + s->buf_free_count++; + buf_size -= granularity; + buf += granularity; + } +} + +static void mirror_drain(MirrorBlockJob *s) +{ + while (s->in_flight > 0) { + qemu_coroutine_yield(); + } } static void coroutine_fn mirror_run(void *opaque) { MirrorBlockJob *s = opaque; BlockDriverState *bs = s->common.bs; - int64_t sector_num, end; + int64_t sector_num, end, sectors_per_chunk, length; + uint64_t last_pause_ns; + BlockDriverInfo bdi; + char backing_filename[1024]; int ret = 0; int n; @@ -105,20 +305,39 @@ static void coroutine_fn mirror_run(void *opaque) } s->common.len = bdrv_getlength(bs); - if (s->common.len < 0) { + if (s->common.len <= 0) { block_job_completed(&s->common, s->common.len); return; } + length = (bdrv_getlength(bs) + s->granularity - 1) / s->granularity; + s->in_flight_bitmap = bitmap_new(length); + + /* If we have no backing file yet in the destination, we cannot let + * the destination do COW. Instead, we copy sectors around the + * dirty data if needed. We need a bitmap to do that. + */ + bdrv_get_backing_filename(s->target, backing_filename, + sizeof(backing_filename)); + if (backing_filename[0] && !s->target->backing_hd) { + bdrv_get_info(s->target, &bdi); + if (s->granularity < bdi.cluster_size) { + s->buf_size = MAX(s->buf_size, bdi.cluster_size); + s->cow_bitmap = bitmap_new(length); + } + } + end = s->common.len >> BDRV_SECTOR_BITS; - s->buf = qemu_blockalign(bs, BLOCK_SIZE); + s->buf = qemu_blockalign(bs, s->buf_size); + sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS; + mirror_free_init(s); if (s->mode != MIRROR_SYNC_MODE_NONE) { /* First part, loop on the sectors and initialize the dirty bitmap. */ BlockDriverState *base; base = s->mode == MIRROR_SYNC_MODE_FULL ? NULL : bs->backing_hd; for (sector_num = 0; sector_num < end; ) { - int64_t next = (sector_num | (BDRV_SECTORS_PER_DIRTY_CHUNK - 1)) + 1; + int64_t next = (sector_num | (sectors_per_chunk - 1)) + 1; ret = bdrv_co_is_allocated_above(bs, base, sector_num, next - sector_num, &n); @@ -136,24 +355,40 @@ static void coroutine_fn mirror_run(void *opaque) } } - s->sector_num = -1; + bdrv_dirty_iter_init(bs, &s->hbi); + last_pause_ns = qemu_get_clock_ns(rt_clock); for (;;) { uint64_t delay_ns; int64_t cnt; bool should_complete; + if (s->ret < 0) { + ret = s->ret; + goto immediate_exit; + } + cnt = bdrv_get_dirty_count(bs); - if (cnt != 0) { - BlockErrorAction action = BDRV_ACTION_REPORT; - ret = mirror_iteration(s, &action); - if (ret < 0 && action == BDRV_ACTION_REPORT) { - goto immediate_exit; + + /* Note that even when no rate limit is applied we need to yield + * periodically with no pending I/O so that qemu_aio_flush() returns. + * We do so every SLICE_TIME nanoseconds, or when there is an error, + * or when the source is clean, whichever comes first. + */ + if (qemu_get_clock_ns(rt_clock) - last_pause_ns < SLICE_TIME && + s->common.iostatus == BLOCK_DEVICE_IO_STATUS_OK) { + if (s->in_flight == MAX_IN_FLIGHT || s->buf_free_count == 0 || + (cnt == 0 && s->in_flight > 0)) { + trace_mirror_yield(s, s->in_flight, s->buf_free_count, cnt); + qemu_coroutine_yield(); + continue; + } else if (cnt != 0) { + mirror_iteration(s); + continue; } - cnt = bdrv_get_dirty_count(bs); } should_complete = false; - if (cnt == 0) { + if (s->in_flight == 0 && cnt == 0) { trace_mirror_before_flush(s); ret = bdrv_flush(s->target); if (ret < 0) { @@ -196,23 +431,20 @@ static void coroutine_fn mirror_run(void *opaque) trace_mirror_before_sleep(s, cnt, s->synced); if (!s->synced) { /* Publish progress */ - s->common.offset = end * BDRV_SECTOR_SIZE - cnt * BLOCK_SIZE; + s->common.offset = (end - cnt) * BDRV_SECTOR_SIZE; if (s->common.speed) { - delay_ns = ratelimit_calculate_delay(&s->limit, BDRV_SECTORS_PER_DIRTY_CHUNK); + delay_ns = ratelimit_calculate_delay(&s->limit, sectors_per_chunk); } else { delay_ns = 0; } - /* Note that even when no rate limit is applied we need to yield - * with no pending I/O here so that bdrv_drain_all() returns. - */ block_job_sleep_ns(&s->common, rt_clock, delay_ns); if (block_job_is_cancelled(&s->common)) { break; } } else if (!should_complete) { - delay_ns = (cnt == 0 ? SLICE_TIME : 0); + delay_ns = (s->in_flight == 0 && cnt == 0 ? SLICE_TIME : 0); block_job_sleep_ns(&s->common, rt_clock, delay_ns); } else if (cnt == 0) { /* The two disks are in sync. Exit and report successful @@ -222,11 +454,24 @@ static void coroutine_fn mirror_run(void *opaque) s->common.cancelled = false; break; } + last_pause_ns = qemu_get_clock_ns(rt_clock); } immediate_exit: + if (s->in_flight > 0) { + /* We get here only if something went wrong. Either the job failed, + * or it was cancelled prematurely so that we do not guarantee that + * the target is a copy of the source. + */ + assert(ret < 0 || (!s->synced && block_job_is_cancelled(&s->common))); + mirror_drain(s); + } + + assert(s->in_flight == 0); qemu_vfree(s->buf); - bdrv_set_dirty_tracking(bs, false); + g_free(s->cow_bitmap); + g_free(s->in_flight_bitmap); + bdrv_set_dirty_tracking(bs, 0); bdrv_iostatus_disable(s->target); if (s->should_complete && ret == 0) { if (bdrv_get_flags(s->target) != bdrv_get_flags(s->common.bs)) { @@ -288,14 +533,28 @@ static BlockJobType mirror_job_type = { }; void mirror_start(BlockDriverState *bs, BlockDriverState *target, - int64_t speed, MirrorSyncMode mode, - BlockdevOnError on_source_error, + int64_t speed, int64_t granularity, int64_t buf_size, + MirrorSyncMode mode, BlockdevOnError on_source_error, BlockdevOnError on_target_error, BlockDriverCompletionFunc *cb, void *opaque, Error **errp) { MirrorBlockJob *s; + if (granularity == 0) { + /* Choose the default granularity based on the target file's cluster + * size, clamped between 4k and 64k. */ + BlockDriverInfo bdi; + if (bdrv_get_info(target, &bdi) >= 0 && bdi.cluster_size != 0) { + granularity = MAX(4096, bdi.cluster_size); + granularity = MIN(65536, granularity); + } else { + granularity = 65536; + } + } + + assert ((granularity & (granularity - 1)) == 0); + if ((on_source_error == BLOCKDEV_ON_ERROR_STOP || on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) && !bdrv_iostatus_is_enabled(bs)) { @@ -312,7 +571,10 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target, s->on_target_error = on_target_error; s->target = target; s->mode = mode; - bdrv_set_dirty_tracking(bs, true); + s->granularity = granularity; + s->buf_size = MAX(buf_size, granularity); + + bdrv_set_dirty_tracking(bs, granularity); bdrv_set_enable_write_cache(s->target, true); bdrv_set_on_error(s->target, on_target_error, on_target_error); bdrv_iostatus_enable(s->target); diff --git a/block/qcow.c b/block/qcow.c index 4276610afd..a7135eea47 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -112,7 +112,7 @@ static int qcow_open(BlockDriverState *bs, int flags) be64_to_cpus(&header.l1_table_offset); if (header.magic != QCOW_MAGIC) { - ret = -EINVAL; + ret = -EMEDIUMTYPE; goto fail; } if (header.version != QCOW_VERSION) { diff --git a/block/qcow2.c b/block/qcow2.c index f6abff6111..7610e569e3 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -311,7 +311,7 @@ static int qcow2_open(BlockDriverState *bs, int flags) be32_to_cpus(&header.nb_snapshots); if (header.magic != QCOW_MAGIC) { - ret = -EINVAL; + ret = -EMEDIUMTYPE; goto fail; } if (header.version < 2 || header.version > 3) { diff --git a/block/qed.c b/block/qed.c index cf85d8f2b4..b8515e58b3 100644 --- a/block/qed.c +++ b/block/qed.c @@ -390,7 +390,7 @@ static int bdrv_qed_open(BlockDriverState *bs, int flags) qed_header_le_to_cpu(&le_header, &s->header); if (s->header.magic != QED_MAGIC) { - return -EINVAL; + return -EMEDIUMTYPE; } if (s->header.features & ~QED_FEATURE_MASK) { /* image uses unsupported feature bits */ diff --git a/block/vdi.c b/block/vdi.c index 021abaa227..257a592ea9 100644 --- a/block/vdi.c +++ b/block/vdi.c @@ -246,7 +246,7 @@ static void vdi_header_print(VdiHeader *header) { char uuid[37]; logout("text %s", header->text); - logout("signature 0x%04x\n", header->signature); + logout("signature 0x%08x\n", header->signature); logout("header size 0x%04x\n", header->header_size); logout("image type 0x%04x\n", header->image_type); logout("image flags 0x%04x\n", header->image_flags); @@ -369,10 +369,12 @@ static int vdi_open(BlockDriverState *bs, int flags) BDRVVdiState *s = bs->opaque; VdiHeader header; size_t bmap_size; + int ret; logout("\n"); - if (bdrv_read(bs->file, 0, (uint8_t *)&header, 1) < 0) { + ret = bdrv_read(bs->file, 0, (uint8_t *)&header, 1); + if (ret < 0) { goto fail; } @@ -390,33 +392,45 @@ static int vdi_open(BlockDriverState *bs, int flags) header.disk_size &= ~(SECTOR_SIZE - 1); } - if (header.version != VDI_VERSION_1_1) { + if (header.signature != VDI_SIGNATURE) { + logout("bad vdi signature %08x\n", header.signature); + ret = -EMEDIUMTYPE; + goto fail; + } else if (header.version != VDI_VERSION_1_1) { logout("unsupported version %u.%u\n", header.version >> 16, header.version & 0xffff); + ret = -ENOTSUP; goto fail; } else if (header.offset_bmap % SECTOR_SIZE != 0) { /* We only support block maps which start on a sector boundary. */ logout("unsupported block map offset 0x%x B\n", header.offset_bmap); + ret = -ENOTSUP; goto fail; } else if (header.offset_data % SECTOR_SIZE != 0) { /* We only support data blocks which start on a sector boundary. */ logout("unsupported data offset 0x%x B\n", header.offset_data); + ret = -ENOTSUP; goto fail; } else if (header.sector_size != SECTOR_SIZE) { logout("unsupported sector size %u B\n", header.sector_size); + ret = -ENOTSUP; goto fail; } else if (header.block_size != 1 * MiB) { logout("unsupported block size %u B\n", header.block_size); + ret = -ENOTSUP; goto fail; } else if (header.disk_size > (uint64_t)header.blocks_in_image * header.block_size) { logout("unsupported disk size %" PRIu64 " B\n", header.disk_size); + ret = -ENOTSUP; goto fail; } else if (!uuid_is_null(header.uuid_link)) { logout("link uuid != 0, unsupported\n"); + ret = -ENOTSUP; goto fail; } else if (!uuid_is_null(header.uuid_parent)) { logout("parent uuid != 0, unsupported\n"); + ret = -ENOTSUP; goto fail; } @@ -432,7 +446,8 @@ static int vdi_open(BlockDriverState *bs, int flags) if (bmap_size > 0) { s->bmap = g_malloc(bmap_size * SECTOR_SIZE); } - if (bdrv_read(bs->file, s->bmap_sector, (uint8_t *)s->bmap, bmap_size) < 0) { + ret = bdrv_read(bs->file, s->bmap_sector, (uint8_t *)s->bmap, bmap_size); + if (ret < 0) { goto fail_free_bmap; } @@ -448,7 +463,7 @@ static int vdi_open(BlockDriverState *bs, int flags) g_free(s->bmap); fail: - return -1; + return ret; } static int vdi_reopen_prepare(BDRVReopenState *state, diff --git a/block/vmdk.c b/block/vmdk.c index 19298c2a3e..8333afb5e3 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -616,7 +616,7 @@ static int vmdk_open_sparse(BlockDriverState *bs, return vmdk_open_vmdk4(bs, file, flags); break; default: - return -EINVAL; + return -EMEDIUMTYPE; break; } } @@ -718,7 +718,7 @@ static int vmdk_open_desc_file(BlockDriverState *bs, int flags, } buf[2047] = '\0'; if (vmdk_parse_description(buf, "createType", ct, sizeof(ct))) { - return -EINVAL; + return -EMEDIUMTYPE; } if (strcmp(ct, "monolithicFlat") && strcmp(ct, "twoGbMaxExtentSparse") && diff --git a/blockdev.c b/blockdev.c index 9126587c45..63e6f1eafa 100644 --- a/blockdev.c +++ b/blockdev.c @@ -617,8 +617,13 @@ DriveInfo *drive_init(QemuOpts *opts, BlockInterfaceType block_default_type) ret = bdrv_open(dinfo->bdrv, file, bdrv_flags, drv); if (ret < 0) { - error_report("could not open disk image %s: %s", - file, strerror(-ret)); + if (ret == -EMEDIUMTYPE) { + error_report("could not open disk image %s: not in %s format", + file, drv->format_name); + } else { + error_report("could not open disk image %s: %s", + file, strerror(-ret)); + } goto err; } @@ -642,21 +647,17 @@ void do_commit(Monitor *mon, const QDict *qdict) if (!strcmp(device, "all")) { ret = bdrv_commit_all(); - if (ret == -EBUSY) { - qerror_report(QERR_DEVICE_IN_USE, device); - return; - } } else { bs = bdrv_find(device); if (!bs) { - qerror_report(QERR_DEVICE_NOT_FOUND, device); + monitor_printf(mon, "Device '%s' not found\n", device); return; } ret = bdrv_commit(bs); - if (ret == -EBUSY) { - qerror_report(QERR_DEVICE_IN_USE, device); - return; - } + } + if (ret < 0) { + monitor_printf(mon, "'commit' error for '%s': %s\n", device, + strerror(-ret)); } } @@ -1188,16 +1189,19 @@ void qmp_block_commit(const char *device, drive_get_ref(drive_get_by_blockdev(bs)); } +#define DEFAULT_MIRROR_BUF_SIZE (10 << 20) + void qmp_drive_mirror(const char *device, const char *target, bool has_format, const char *format, enum MirrorSyncMode sync, bool has_mode, enum NewImageMode mode, bool has_speed, int64_t speed, + bool has_granularity, uint32_t granularity, + bool has_buf_size, int64_t buf_size, bool has_on_source_error, BlockdevOnError on_source_error, bool has_on_target_error, BlockdevOnError on_target_error, Error **errp) { - BlockDriverInfo bdi; BlockDriverState *bs; BlockDriverState *source, *target_bs; BlockDriver *proto_drv; @@ -1219,6 +1223,21 @@ void qmp_drive_mirror(const char *device, const char *target, if (!has_mode) { mode = NEW_IMAGE_MODE_ABSOLUTE_PATHS; } + if (!has_granularity) { + granularity = 0; + } + if (!has_buf_size) { + buf_size = DEFAULT_MIRROR_BUF_SIZE; + } + + if (granularity != 0 && (granularity < 512 || granularity > 1048576 * 64)) { + error_set(errp, QERR_INVALID_PARAMETER, device); + return; + } + if (granularity & (granularity - 1)) { + error_set(errp, QERR_INVALID_PARAMETER, device); + return; + } bs = bdrv_find(device); if (!bs) { @@ -1259,11 +1278,11 @@ void qmp_drive_mirror(const char *device, const char *target, return; } + bdrv_get_geometry(bs, &size); + size *= 512; if (sync == MIRROR_SYNC_MODE_FULL && mode != NEW_IMAGE_MODE_EXISTING) { /* create new image w/o backing file */ assert(format && drv); - bdrv_get_geometry(bs, &size); - size *= 512; bdrv_img_create(target, format, NULL, NULL, NULL, size, flags, &local_err); } else { @@ -1276,7 +1295,7 @@ void qmp_drive_mirror(const char *device, const char *target, bdrv_img_create(target, format, source->filename, source->drv->format_name, - NULL, -1, flags, &local_err); + NULL, size, flags, &local_err); break; default: abort(); @@ -1288,6 +1307,9 @@ void qmp_drive_mirror(const char *device, const char *target, return; } + /* Mirroring takes care of copy-on-write using the source's backing + * file. + */ target_bs = bdrv_new(""); ret = bdrv_open(target_bs, target, flags | BDRV_O_NO_BACKING, drv); @@ -1297,18 +1319,8 @@ void qmp_drive_mirror(const char *device, const char *target, return; } - /* We need a backing file if we will copy parts of a cluster. */ - if (bdrv_get_info(target_bs, &bdi) >= 0 && bdi.cluster_size != 0 && - bdi.cluster_size >= BDRV_SECTORS_PER_DIRTY_CHUNK * 512) { - ret = bdrv_open_backing_file(target_bs); - if (ret < 0) { - bdrv_delete(target_bs); - error_set(errp, QERR_OPEN_FILE_FAILED, target); - return; - } - } - - mirror_start(bs, target_bs, speed, sync, on_source_error, on_target_error, + mirror_start(bs, target_bs, speed, granularity, buf_size, sync, + on_source_error, on_target_error, block_job_cb, bs, &local_err); if (local_err != NULL) { bdrv_delete(target_bs); diff --git a/docs/virtio-balloon-stats.txt b/docs/virtio-balloon-stats.txt new file mode 100644 index 0000000000..f74612f468 --- /dev/null +++ b/docs/virtio-balloon-stats.txt @@ -0,0 +1,104 @@ +virtio balloon memory statistics +================================ + +The virtio balloon driver supports guest memory statistics reporting. These +statistics are available to QEMU users as QOM (QEMU Object Model) device +properties via a polling mechanism. + +Before querying the available stats, clients first have to enable polling. +This is done by writing a time interval value (in seconds) to the +guest-stats-polling-interval property. This value can be: + + > 0 enables polling in the specified interval. If polling is already + enabled, the polling time interval is changed to the new value + + 0 disables polling. Previous polled statistics are still valid and + can be queried. + +Once polling is enabled, the virtio-balloon device in QEMU will start +polling the guest's balloon driver for new stats in the specified time +interval. + +To retrieve those stats, clients have to query the guest-stats property, +which will return a dictionary containing: + + o A key named 'stats', containing all available stats. If the guest + doesn't support a particular stat, or if it couldn't be retrieved, + its value will be -1. Currently, the following stats are supported: + + - stat-swap-in + - stat-swap-out + - stat-major-faults + - stat-minor-faults + - stat-free-memory + - stat-total-memory + + o A key named last-update, which contains the last stats update + timestamp in seconds. Since this timestamp is generated by the host, + a buggy guest can't influence its value + +It's also important to note the following: + + - Previously polled statistics remain available even if the polling is + later disabled + + - As noted above, if a guest doesn't support a particular stat its value + will always be -1. However, it's also possible that a guest temporarily + couldn't update one or even all stats. If this happens, just wait for + the next update + + - Polling can be enabled even if the guest doesn't have stats support + or the balloon driver wasn't loaded in the guest. If this is the case + and stats are queried, an error will be returned + + - The polling timer is only re-armed when the guest responds to the + statistics request. This means that if a (buggy) guest doesn't ever + respond to the request the timer will never be re-armed, which has + the same effect as disabling polling + +Here are a few examples. QEMU is started with '-balloon virtio', which +generates '/machine/peripheral-anon/device[1]' as the QOM path for the +balloon device. + +Enable polling with 2 seconds interval: + +{ "execute": "qom-set", + "arguments": { "path": "/machine/peripheral-anon/device[1]", + "property": "guest-stats-polling-interval", "value": 2 } } + +{ "return": {} } + +Change polling to 10 seconds: + +{ "execute": "qom-set", + "arguments": { "path": "/machine/peripheral-anon/device[1]", + "property": "guest-stats-polling-interval", "value": 10 } } + +{ "return": {} } + +Get stats: + +{ "execute": "qom-get", + "arguments": { "path": "/machine/peripheral-anon/device[1]", + "property": "guest-stats" } } +{ + "return": { + "stats": { + "stat-swap-out": 0, + "stat-free-memory": 844943360, + "stat-minor-faults": 219028, + "stat-major-faults": 235, + "stat-total-memory": 1044406272, + "stat-swap-in": 0 + }, + "last-update": 1358529861 + } +} + +Disable polling: + +{ "execute": "qom-set", + "arguments": { "path": "/machine/peripheral-anon/device[1]", + "property": "stats-polling-interval", "value": 0 } } + +{ "return": {} } diff --git a/hmp-commands.hx b/hmp-commands.hx index 0934b9b915..bdd48f3469 100644 --- a/hmp-commands.hx +++ b/hmp-commands.hx @@ -837,6 +837,45 @@ STEXI @item nmi @var{cpu} @findex nmi Inject an NMI on the given CPU (x86 only). + +ETEXI + + { + .name = "memchar_write", + .args_type = "device:s,data:s", + .params = "device data", + .help = "Provide writing interface for CirMemCharDriver. Write" + "'data' to it.", + .mhandler.cmd = hmp_memchar_write, + }, + +STEXI +@item memchar_write @var{device} @var{data} +@findex memchar_write +Provide writing interface for CirMemCharDriver. Write @var{data} +to char device 'memory'. + +ETEXI + + { + .name = "memchar_read", + .args_type = "device:s,size:i", + .params = "device size", + .help = "Provide read interface for CirMemCharDriver. Read from" + "it and return the data with size.", + .mhandler.cmd = hmp_memchar_read, + }, + +STEXI +@item memchar_read @var{device} +@findex memchar_read +Provide read interface for CirMemCharDriver. Read from char device +'memory' and return the data. + +@var{size} is the size of data want to read from. Refer to unencoded +size of the raw data, would adjust to the init size of the memchar +if the requested size is larger than it. + ETEXI { @@ -465,29 +465,7 @@ void hmp_info_balloon(Monitor *mon, const QDict *qdict) return; } - monitor_printf(mon, "balloon: actual=%" PRId64, info->actual >> 20); - if (info->has_mem_swapped_in) { - monitor_printf(mon, " mem_swapped_in=%" PRId64, info->mem_swapped_in); - } - if (info->has_mem_swapped_out) { - monitor_printf(mon, " mem_swapped_out=%" PRId64, info->mem_swapped_out); - } - if (info->has_major_page_faults) { - monitor_printf(mon, " major_page_faults=%" PRId64, - info->major_page_faults); - } - if (info->has_minor_page_faults) { - monitor_printf(mon, " minor_page_faults=%" PRId64, - info->minor_page_faults); - } - if (info->has_free_mem) { - monitor_printf(mon, " free_mem=%" PRId64, info->free_mem); - } - if (info->has_total_mem) { - monitor_printf(mon, " total_mem=%" PRId64, info->total_mem); - } - - monitor_printf(mon, "\n"); + monitor_printf(mon, "balloon: actual=%" PRId64 "\n", info->actual >> 20); qapi_free_BalloonInfo(info); } @@ -684,6 +662,40 @@ void hmp_pmemsave(Monitor *mon, const QDict *qdict) hmp_handle_error(mon, &errp); } +void hmp_memchar_write(Monitor *mon, const QDict *qdict) +{ + uint32_t size; + const char *chardev = qdict_get_str(qdict, "device"); + const char *data = qdict_get_str(qdict, "data"); + Error *errp = NULL; + + size = strlen(data); + qmp_memchar_write(chardev, size, data, false, 0, &errp); + + hmp_handle_error(mon, &errp); +} + +void hmp_memchar_read(Monitor *mon, const QDict *qdict) +{ + uint32_t size = qdict_get_int(qdict, "size"); + const char *chardev = qdict_get_str(qdict, "device"); + MemCharRead *meminfo; + Error *errp = NULL; + + meminfo = qmp_memchar_read(chardev, size, false, 0, &errp); + if (errp) { + monitor_printf(mon, "%s\n", error_get_pretty(errp)); + error_free(errp); + return; + } + + if (meminfo->count > 0) { + monitor_printf(mon, "%s\n", meminfo->data); + } + + qapi_free_MemCharRead(meminfo); +} + static void hmp_cont_cb(void *opaque, int err) { if (!err) { @@ -796,7 +808,7 @@ void hmp_drive_mirror(Monitor *mon, const QDict *qdict) qmp_drive_mirror(device, filename, !!format, format, full ? MIRROR_SYNC_MODE_FULL : MIRROR_SYNC_MODE_TOP, - true, mode, false, 0, + true, mode, false, 0, false, 0, false, 0, false, 0, false, 0, &errp); hmp_handle_error(mon, &errp); } @@ -43,6 +43,8 @@ void hmp_system_powerdown(Monitor *mon, const QDict *qdict); void hmp_cpu(Monitor *mon, const QDict *qdict); void hmp_memsave(Monitor *mon, const QDict *qdict); void hmp_pmemsave(Monitor *mon, const QDict *qdict); +void hmp_memchar_write(Monitor *mon, const QDict *qdict); +void hmp_memchar_read(Monitor *mon, const QDict *qdict); void hmp_cont(Monitor *mon, const QDict *qdict); void hmp_system_wakeup(Monitor *mon, const QDict *qdict); void hmp_inject_nmi(Monitor *mon, const QDict *qdict); diff --git a/hw/ide/ahci.c b/hw/ide/ahci.c index 21f50ea5be..ad0094f532 100644 --- a/hw/ide/ahci.c +++ b/hw/ide/ahci.c @@ -241,7 +241,7 @@ static void ahci_port_write(AHCIState *s, int port, int offset, uint32_t val) if ((pr->cmd & PORT_CMD_FIS_ON) && !s->dev[port].init_d2h_sent) { ahci_init_d2h(&s->dev[port]); - s->dev[port].init_d2h_sent = 1; + s->dev[port].init_d2h_sent = true; } check_cmd(s, port); @@ -494,7 +494,7 @@ static void ahci_reset_port(AHCIState *s, int port) pr->scr_err = 0; pr->scr_act = 0; d->busy_slot = -1; - d->init_d2h_sent = 0; + d->init_d2h_sent = false; ide_state = &s->dev[port].port.ifs[0]; if (!ide_state->bs) { @@ -946,7 +946,7 @@ static int handle_cmd(AHCIState *s, int port, int slot) ide_state->hcyl = 0xeb; debug_print_fis(ide_state->io_buffer, 0x10); ide_state->feature = IDE_FEATURE_DMA; - s->dev[port].done_atapi_packet = 0; + s->dev[port].done_atapi_packet = false; /* XXX send PIO setup FIS */ } @@ -991,7 +991,7 @@ static int ahci_start_transfer(IDEDMA *dma) if (is_atapi && !ad->done_atapi_packet) { /* already prepopulated iobuffer */ - ad->done_atapi_packet = 1; + ad->done_atapi_packet = true; goto out; } @@ -1035,11 +1035,10 @@ out: static void ahci_start_dma(IDEDMA *dma, IDEState *s, BlockDriverCompletionFunc *dma_cb) { +#ifdef DEBUG_AHCI AHCIDevice *ad = DO_UPCAST(AHCIDevice, dma, dma); - +#endif DPRINTF(ad->port_no, "\n"); - ad->dma_cb = dma_cb; - ad->dma_status |= BM_STATUS_DMAING; s->io_buffer_offset = 0; dma_cb(s, 0); } @@ -1095,7 +1094,6 @@ static int ahci_dma_set_unit(IDEDMA *dma, int unit) static int ahci_dma_add_status(IDEDMA *dma, int status) { AHCIDevice *ad = DO_UPCAST(AHCIDevice, dma, dma); - ad->dma_status |= status; DPRINTF(ad->port_no, "set status: %x\n", status); if (status & BM_STATUS_INT) { @@ -1114,8 +1112,6 @@ static int ahci_dma_set_inactive(IDEDMA *dma) /* update d2h status */ ahci_write_fis_d2h(ad, NULL); - ad->dma_cb = NULL; - if (!ad->check_bh) { /* maybe we still have something to process, check later */ ad->check_bh = qemu_bh_new(ahci_check_cmd_bh, ad); @@ -1203,6 +1199,82 @@ void ahci_reset(AHCIState *s) } } +static const VMStateDescription vmstate_ahci_device = { + .name = "ahci port", + .version_id = 1, + .fields = (VMStateField []) { + VMSTATE_IDE_BUS(port, AHCIDevice), + VMSTATE_UINT32(port_state, AHCIDevice), + VMSTATE_UINT32(finished, AHCIDevice), + VMSTATE_UINT32(port_regs.lst_addr, AHCIDevice), + VMSTATE_UINT32(port_regs.lst_addr_hi, AHCIDevice), + VMSTATE_UINT32(port_regs.fis_addr, AHCIDevice), + VMSTATE_UINT32(port_regs.fis_addr_hi, AHCIDevice), + VMSTATE_UINT32(port_regs.irq_stat, AHCIDevice), + VMSTATE_UINT32(port_regs.irq_mask, AHCIDevice), + VMSTATE_UINT32(port_regs.cmd, AHCIDevice), + VMSTATE_UINT32(port_regs.tfdata, AHCIDevice), + VMSTATE_UINT32(port_regs.sig, AHCIDevice), + VMSTATE_UINT32(port_regs.scr_stat, AHCIDevice), + VMSTATE_UINT32(port_regs.scr_ctl, AHCIDevice), + VMSTATE_UINT32(port_regs.scr_err, AHCIDevice), + VMSTATE_UINT32(port_regs.scr_act, AHCIDevice), + VMSTATE_UINT32(port_regs.cmd_issue, AHCIDevice), + VMSTATE_BOOL(done_atapi_packet, AHCIDevice), + VMSTATE_INT32(busy_slot, AHCIDevice), + VMSTATE_BOOL(init_d2h_sent, AHCIDevice), + VMSTATE_END_OF_LIST() + }, +}; + +static int ahci_state_post_load(void *opaque, int version_id) +{ + int i; + struct AHCIDevice *ad; + AHCIState *s = opaque; + + for (i = 0; i < s->ports; i++) { + ad = &s->dev[i]; + AHCIPortRegs *pr = &ad->port_regs; + + map_page(&ad->lst, + ((uint64_t)pr->lst_addr_hi << 32) | pr->lst_addr, 1024); + map_page(&ad->res_fis, + ((uint64_t)pr->fis_addr_hi << 32) | pr->fis_addr, 256); + /* + * All pending i/o should be flushed out on a migrate. However, + * we might not have cleared the busy_slot since this is done + * in a bh. Also, issue i/o against any slots that are pending. + */ + if ((ad->busy_slot != -1) && + !(ad->port.ifs[0].status & (BUSY_STAT|DRQ_STAT))) { + pr->cmd_issue &= ~(1 << ad->busy_slot); + ad->busy_slot = -1; + } + check_cmd(s, i); + } + + return 0; +} + +const VMStateDescription vmstate_ahci = { + .name = "ahci", + .version_id = 1, + .post_load = ahci_state_post_load, + .fields = (VMStateField []) { + VMSTATE_STRUCT_VARRAY_POINTER_INT32(dev, AHCIState, ports, + vmstate_ahci_device, AHCIDevice), + VMSTATE_UINT32(control_regs.cap, AHCIState), + VMSTATE_UINT32(control_regs.ghc, AHCIState), + VMSTATE_UINT32(control_regs.irqstatus, AHCIState), + VMSTATE_UINT32(control_regs.impl, AHCIState), + VMSTATE_UINT32(control_regs.version, AHCIState), + VMSTATE_UINT32(idp_index, AHCIState), + VMSTATE_INT32(ports, AHCIState), + VMSTATE_END_OF_LIST() + }, +}; + typedef struct SysbusAHCIState { SysBusDevice busdev; AHCIState ahci; @@ -1211,7 +1283,11 @@ typedef struct SysbusAHCIState { static const VMStateDescription vmstate_sysbus_ahci = { .name = "sysbus-ahci", - .unmigratable = 1, + .unmigratable = 1, /* Still buggy under I/O load */ + .fields = (VMStateField []) { + VMSTATE_AHCI(ahci, AHCIPCIState), + VMSTATE_END_OF_LIST() + }, }; static void sysbus_ahci_reset(DeviceState *dev) diff --git a/hw/ide/ahci.h b/hw/ide/ahci.h index 1200a56ada..85f37fe99d 100644 --- a/hw/ide/ahci.h +++ b/hw/ide/ahci.h @@ -281,11 +281,9 @@ struct AHCIDevice { QEMUBH *check_bh; uint8_t *lst; uint8_t *res_fis; - int dma_status; - int done_atapi_packet; - int busy_slot; - int init_d2h_sent; - BlockDriverCompletionFunc *dma_cb; + bool done_atapi_packet; + int32_t busy_slot; + bool init_d2h_sent; AHCICmdHdr *cur_cmd; NCQTransferState ncq_tfs[AHCI_MAX_CMDS]; }; @@ -297,7 +295,7 @@ typedef struct AHCIState { MemoryRegion idp; /* Index-Data Pair I/O port space */ unsigned idp_offset; /* Offset of index in I/O port space */ uint32_t idp_index; /* Current IDP index */ - int ports; + int32_t ports; qemu_irq irq; DMAContext *dma; } AHCIState; @@ -307,6 +305,16 @@ typedef struct AHCIPCIState { AHCIState ahci; } AHCIPCIState; +extern const VMStateDescription vmstate_ahci; + +#define VMSTATE_AHCI(_field, _state) { \ + .name = (stringify(_field)), \ + .size = sizeof(AHCIState), \ + .vmsd = &vmstate_ahci, \ + .flags = VMS_STRUCT, \ + .offset = vmstate_offset_value(_state, _field, AHCIState), \ +} + typedef struct NCQFrame { uint8_t fis_type; uint8_t c; diff --git a/hw/ide/core.c b/hw/ide/core.c index 14ad0799c3..3743dc3b55 100644 --- a/hw/ide/core.c +++ b/hw/ide/core.c @@ -1149,8 +1149,10 @@ void ide_exec_cmd(IDEBus *bus, uint32_t val) } ide_set_irq(s->bus); break; + case WIN_VERIFY_EXT: - lba48 = 1; + lba48 = 1; + /* fall through */ case WIN_VERIFY: case WIN_VERIFY_ONCE: /* do sector number check ? */ @@ -1158,8 +1160,10 @@ void ide_exec_cmd(IDEBus *bus, uint32_t val) s->status = READY_STAT | SEEK_STAT; ide_set_irq(s->bus); break; + case WIN_READ_EXT: - lba48 = 1; + lba48 = 1; + /* fall through */ case WIN_READ: case WIN_READ_ONCE: if (s->drive_kind == IDE_CD) { @@ -1173,8 +1177,10 @@ void ide_exec_cmd(IDEBus *bus, uint32_t val) s->req_nb_sectors = 1; ide_sector_read(s); break; + case WIN_WRITE_EXT: - lba48 = 1; + lba48 = 1; + /* fall through */ case WIN_WRITE: case WIN_WRITE_ONCE: case CFA_WRITE_SECT_WO_ERASE: @@ -1189,8 +1195,10 @@ void ide_exec_cmd(IDEBus *bus, uint32_t val) ide_transfer_start(s, s->io_buffer, 512, ide_sector_write); s->media_changed = 1; break; + case WIN_MULTREAD_EXT: - lba48 = 1; + lba48 = 1; + /* fall through */ case WIN_MULTREAD: if (!s->bs) { goto abort_cmd; @@ -1202,8 +1210,10 @@ void ide_exec_cmd(IDEBus *bus, uint32_t val) s->req_nb_sectors = s->mult_sectors; ide_sector_read(s); break; + case WIN_MULTWRITE_EXT: - lba48 = 1; + lba48 = 1; + /* fall through */ case WIN_MULTWRITE: case CFA_WRITE_MULTI_WO_ERASE: if (!s->bs) { @@ -1222,8 +1232,10 @@ void ide_exec_cmd(IDEBus *bus, uint32_t val) ide_transfer_start(s, s->io_buffer, 512 * n, ide_sector_write); s->media_changed = 1; break; + case WIN_READDMA_EXT: - lba48 = 1; + lba48 = 1; + /* fall through */ case WIN_READDMA: case WIN_READDMA_ONCE: if (!s->bs) { @@ -1232,8 +1244,10 @@ void ide_exec_cmd(IDEBus *bus, uint32_t val) ide_cmd_lba48_transform(s, lba48); ide_sector_start_dma(s, IDE_DMA_READ); break; + case WIN_WRITEDMA_EXT: - lba48 = 1; + lba48 = 1; + /* fall through */ case WIN_WRITEDMA: case WIN_WRITEDMA_ONCE: if (!s->bs) { @@ -1243,14 +1257,17 @@ void ide_exec_cmd(IDEBus *bus, uint32_t val) ide_sector_start_dma(s, IDE_DMA_WRITE); s->media_changed = 1; break; + case WIN_READ_NATIVE_MAX_EXT: - lba48 = 1; + lba48 = 1; + /* fall through */ case WIN_READ_NATIVE_MAX: ide_cmd_lba48_transform(s, lba48); ide_set_sector(s, s->nb_sectors - 1); s->status = READY_STAT | SEEK_STAT; ide_set_irq(s->bus); break; + case WIN_CHECKPOWERMODE1: case WIN_CHECKPOWERMODE2: s->error = 0; diff --git a/hw/ide/ich.c b/hw/ide/ich.c index 1fb803d340..cc30adc701 100644 --- a/hw/ide/ich.c +++ b/hw/ide/ich.c @@ -79,9 +79,15 @@ #define ICH9_IDP_INDEX 0x10 #define ICH9_IDP_INDEX_LOG2 0x04 -static const VMStateDescription vmstate_ahci = { - .name = "ahci", - .unmigratable = 1, +static const VMStateDescription vmstate_ich9_ahci = { + .name = "ich9_ahci", + .unmigratable = 1, /* Still buggy under I/O load */ + .version_id = 1, + .fields = (VMStateField []) { + VMSTATE_PCI_DEVICE(card, AHCIPCIState), + VMSTATE_AHCI(ahci, AHCIPCIState), + VMSTATE_END_OF_LIST() + }, }; static void pci_ich9_reset(DeviceState *dev) @@ -152,7 +158,7 @@ static void ich_ahci_class_init(ObjectClass *klass, void *data) k->device_id = PCI_DEVICE_ID_INTEL_82801IR; k->revision = 0x02; k->class_id = PCI_CLASS_STORAGE_SATA; - dc->vmsd = &vmstate_ahci; + dc->vmsd = &vmstate_ich9_ahci; dc->reset = pci_ich9_reset; } diff --git a/hw/virtio-balloon.c b/hw/virtio-balloon.c index 3040bc63ab..c0a790264c 100644 --- a/hw/virtio-balloon.c +++ b/hw/virtio-balloon.c @@ -14,6 +14,7 @@ */ #include "qemu/iov.h" +#include "qemu/timer.h" #include "qemu-common.h" #include "virtio.h" #include "pc.h" @@ -22,6 +23,7 @@ #include "virtio-balloon.h" #include "sysemu/kvm.h" #include "exec/address-spaces.h" +#include "qapi/visitor.h" #if defined(__linux__) #include <sys/mman.h> @@ -36,6 +38,9 @@ typedef struct VirtIOBalloon uint64_t stats[VIRTIO_BALLOON_S_NR]; VirtQueueElement stats_vq_elem; size_t stats_vq_offset; + QEMUTimer *stats_timer; + int64_t stats_last_update; + int64_t stats_poll_interval; DeviceState *qdev; } VirtIOBalloon; @@ -53,6 +58,16 @@ static void balloon_page(void *addr, int deflate) #endif } +static const char *balloon_stat_names[] = { + [VIRTIO_BALLOON_S_SWAP_IN] = "stat-swap-in", + [VIRTIO_BALLOON_S_SWAP_OUT] = "stat-swap-out", + [VIRTIO_BALLOON_S_MAJFLT] = "stat-major-faults", + [VIRTIO_BALLOON_S_MINFLT] = "stat-minor-faults", + [VIRTIO_BALLOON_S_MEMFREE] = "stat-free-memory", + [VIRTIO_BALLOON_S_MEMTOT] = "stat-total-memory", + [VIRTIO_BALLOON_S_NR] = NULL +}; + /* * reset_stats - Mark all items in the stats array as unset * @@ -67,6 +82,118 @@ static inline void reset_stats(VirtIOBalloon *dev) for (i = 0; i < VIRTIO_BALLOON_S_NR; dev->stats[i++] = -1); } +static bool balloon_stats_supported(const VirtIOBalloon *s) +{ + return s->vdev.guest_features & (1 << VIRTIO_BALLOON_F_STATS_VQ); +} + +static bool balloon_stats_enabled(const VirtIOBalloon *s) +{ + return s->stats_poll_interval > 0; +} + +static void balloon_stats_destroy_timer(VirtIOBalloon *s) +{ + if (balloon_stats_enabled(s)) { + qemu_del_timer(s->stats_timer); + qemu_free_timer(s->stats_timer); + s->stats_timer = NULL; + s->stats_poll_interval = 0; + } +} + +static void balloon_stats_change_timer(VirtIOBalloon *s, int secs) +{ + qemu_mod_timer(s->stats_timer, qemu_get_clock_ms(vm_clock) + secs * 1000); +} + +static void balloon_stats_poll_cb(void *opaque) +{ + VirtIOBalloon *s = opaque; + + if (!balloon_stats_supported(s)) { + /* re-schedule */ + balloon_stats_change_timer(s, s->stats_poll_interval); + return; + } + + virtqueue_push(s->svq, &s->stats_vq_elem, s->stats_vq_offset); + virtio_notify(&s->vdev, s->svq); +} + +static void balloon_stats_get_all(Object *obj, struct Visitor *v, + void *opaque, const char *name, Error **errp) +{ + VirtIOBalloon *s = opaque; + int i; + + if (!s->stats_last_update) { + error_setg(errp, "guest hasn't updated any stats yet"); + return; + } + + visit_start_struct(v, NULL, "guest-stats", name, 0, errp); + visit_type_int(v, &s->stats_last_update, "last-update", errp); + + visit_start_struct(v, NULL, NULL, "stats", 0, errp); + for (i = 0; i < VIRTIO_BALLOON_S_NR; i++) { + visit_type_int64(v, (int64_t *) &s->stats[i], balloon_stat_names[i], + errp); + } + visit_end_struct(v, errp); + + visit_end_struct(v, errp); +} + +static void balloon_stats_get_poll_interval(Object *obj, struct Visitor *v, + void *opaque, const char *name, + Error **errp) +{ + VirtIOBalloon *s = opaque; + visit_type_int(v, &s->stats_poll_interval, name, errp); +} + +static void balloon_stats_set_poll_interval(Object *obj, struct Visitor *v, + void *opaque, const char *name, + Error **errp) +{ + VirtIOBalloon *s = opaque; + int64_t value; + + visit_type_int(v, &value, name, errp); + if (error_is_set(errp)) { + return; + } + + if (value < 0) { + error_setg(errp, "timer value must be greater than zero"); + return; + } + + if (value == s->stats_poll_interval) { + return; + } + + if (value == 0) { + /* timer=0 disables the timer */ + balloon_stats_destroy_timer(s); + return; + } + + if (balloon_stats_enabled(s)) { + /* timer interval change */ + s->stats_poll_interval = value; + balloon_stats_change_timer(s, value); + return; + } + + /* create a new timer */ + g_assert(s->stats_timer == NULL); + s->stats_timer = qemu_new_timer_ms(vm_clock, balloon_stats_poll_cb, s); + s->stats_poll_interval = value; + balloon_stats_change_timer(s, 0); +} + static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq) { VirtIOBalloon *s = to_virtio_balloon(vdev); @@ -107,9 +234,10 @@ static void virtio_balloon_receive_stats(VirtIODevice *vdev, VirtQueue *vq) VirtQueueElement *elem = &s->stats_vq_elem; VirtIOBalloonStat stat; size_t offset = 0; + qemu_timeval tv; if (!virtqueue_pop(vq, elem)) { - return; + goto out; } /* Initialize the stats to get rid of any stale values. This is only @@ -128,6 +256,18 @@ static void virtio_balloon_receive_stats(VirtIODevice *vdev, VirtQueue *vq) s->stats[tag] = val; } s->stats_vq_offset = offset; + + if (qemu_gettimeofday(&tv) < 0) { + fprintf(stderr, "warning: %s: failed to get time of day\n", __func__); + goto out; + } + + s->stats_last_update = tv.tv_sec; + +out: + if (balloon_stats_enabled(s)) { + balloon_stats_change_timer(s, s->stats_poll_interval); + } } static void virtio_balloon_get_config(VirtIODevice *vdev, uint8_t *config_data) @@ -164,28 +304,6 @@ static uint32_t virtio_balloon_get_features(VirtIODevice *vdev, uint32_t f) static void virtio_balloon_stat(void *opaque, BalloonInfo *info) { VirtIOBalloon *dev = opaque; - -#if 0 - /* Disable guest-provided stats for now. For more details please check: - * https://bugzilla.redhat.com/show_bug.cgi?id=623903 - * - * If you do enable it (which is probably not going to happen as we - * need a new command for it), remember that you also need to fill the - * appropriate members of the BalloonInfo structure so that the stats - * are returned to the client. - */ - if (dev->vdev.guest_features & (1 << VIRTIO_BALLOON_F_STATS_VQ)) { - virtqueue_push(dev->svq, &dev->stats_vq_elem, dev->stats_vq_offset); - virtio_notify(&dev->vdev, dev->svq); - return; - } -#endif - - /* Stats are not supported. Clear out any stale values that might - * have been set by a more featureful guest kernel. - */ - reset_stats(dev); - info->actual = ram_size - ((uint64_t) dev->actual << VIRTIO_BALLOON_PFN_SHIFT); } @@ -255,12 +373,18 @@ VirtIODevice *virtio_balloon_init(DeviceState *dev) s->dvq = virtio_add_queue(&s->vdev, 128, virtio_balloon_handle_output); s->svq = virtio_add_queue(&s->vdev, 128, virtio_balloon_receive_stats); - reset_stats(s); - s->qdev = dev; register_savevm(dev, "virtio-balloon", -1, 1, virtio_balloon_save, virtio_balloon_load, s); + object_property_add(OBJECT(dev), "guest-stats", "guest statistics", + balloon_stats_get_all, NULL, NULL, s, NULL); + + object_property_add(OBJECT(dev), "guest-stats-polling-interval", "int", + balloon_stats_get_poll_interval, + balloon_stats_set_poll_interval, + NULL, s, NULL); + return &s->vdev; } @@ -268,6 +392,7 @@ void virtio_balloon_exit(VirtIODevice *vdev) { VirtIOBalloon *s = DO_UPCAST(VirtIOBalloon, vdev, vdev); + balloon_stats_destroy_timer(s); qemu_remove_balloon_handler(s); unregister_savevm(s->qdev, "virtio-balloon", s); virtio_cleanup(vdev); diff --git a/include/block/block.h b/include/block/block.h index ffd193637d..5c3b911c1b 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -309,6 +309,10 @@ int bdrv_get_flags(BlockDriverState *bs); int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num, const uint8_t *buf, int nb_sectors); int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi); +void bdrv_round_to_clusters(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + int64_t *cluster_sector_num, + int *cluster_nb_sectors); const char *bdrv_get_encrypted_filename(BlockDriverState *bs); void bdrv_get_backing_filename(BlockDriverState *bs, @@ -351,13 +355,12 @@ void bdrv_set_buffer_alignment(BlockDriverState *bs, int align); void *qemu_blockalign(BlockDriverState *bs, size_t size); bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov); -#define BDRV_SECTORS_PER_DIRTY_CHUNK 2048 - -void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable); +struct HBitmapIter; +void bdrv_set_dirty_tracking(BlockDriverState *bs, int granularity); int bdrv_get_dirty(BlockDriverState *bs, int64_t sector); void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors); void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors); -int64_t bdrv_get_next_dirty(BlockDriverState *bs, int64_t sector); +void bdrv_dirty_iter_init(BlockDriverState *bs, struct HBitmapIter *hbi); int64_t bdrv_get_dirty_count(BlockDriverState *bs); void bdrv_enable_copy_on_read(BlockDriverState *bs); diff --git a/include/block/block_int.h b/include/block/block_int.h index f83ffb8a08..f7279b978a 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -32,6 +32,7 @@ #include "qapi-types.h" #include "qapi/qmp/qerror.h" #include "monitor/monitor.h" +#include "qemu/hbitmap.h" #define BLOCK_FLAG_ENCRYPT 1 #define BLOCK_FLAG_COMPAT6 4 @@ -275,8 +276,7 @@ struct BlockDriverState { bool iostatus_enabled; BlockDeviceIoStatus iostatus; char device_name[32]; - unsigned long *dirty_bitmap; - int64_t dirty_count; + HBitmap *dirty_bitmap; int in_use; /* users other than guest access, eg. block migration */ QTAILQ_ENTRY(BlockDriverState) list; @@ -344,6 +344,8 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base, * @bs: Block device to operate on. * @target: Block device to write to. * @speed: The maximum speed, in bytes per second, or 0 for unlimited. + * @granularity: The chosen granularity for the dirty bitmap. + * @buf_size: The amount of data that can be in flight at one time. * @mode: Whether to collapse all images in the chain to the target. * @on_source_error: The action to take upon error reading from the source. * @on_target_error: The action to take upon error writing to the target. @@ -357,8 +359,8 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base, * @bs will be switched to read from @target. */ void mirror_start(BlockDriverState *bs, BlockDriverState *target, - int64_t speed, MirrorSyncMode mode, - BlockdevOnError on_source_error, + int64_t speed, int64_t granularity, int64_t buf_size, + MirrorSyncMode mode, BlockdevOnError on_source_error, BlockdevOnError on_target_error, BlockDriverCompletionFunc *cb, void *opaque, Error **errp); diff --git a/include/qemu-common.h b/include/qemu-common.h index ca464bb367..af2379ff38 100644 --- a/include/qemu-common.h +++ b/include/qemu-common.h @@ -68,6 +68,9 @@ #if !defined(ECANCELED) #define ECANCELED 4097 #endif +#if !defined(EMEDIUMTYPE) +#define EMEDIUMTYPE 4098 +#endif #ifndef TIME_MAX #define TIME_MAX LONG_MAX #endif diff --git a/include/qemu/hbitmap.h b/include/qemu/hbitmap.h new file mode 100644 index 0000000000..73f5d1d8d3 --- /dev/null +++ b/include/qemu/hbitmap.h @@ -0,0 +1,208 @@ +/* + * Hierarchical Bitmap Data Type + * + * Copyright Red Hat, Inc., 2012 + * + * Author: Paolo Bonzini <pbonzini@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. See the COPYING file in the top-level directory. + */ + +#ifndef HBITMAP_H +#define HBITMAP_H 1 + +#include <limits.h> +#include <stdint.h> +#include <stdbool.h> +#include "bitops.h" + +typedef struct HBitmap HBitmap; +typedef struct HBitmapIter HBitmapIter; + +#define BITS_PER_LEVEL (BITS_PER_LONG == 32 ? 5 : 6) + +/* For 32-bit, the largest that fits in a 4 GiB address space. + * For 64-bit, the number of sectors in 1 PiB. Good luck, in + * either case... :) + */ +#define HBITMAP_LOG_MAX_SIZE (BITS_PER_LONG == 32 ? 34 : 41) + +/* We need to place a sentinel in level 0 to speed up iteration. Thus, + * we do this instead of HBITMAP_LOG_MAX_SIZE / BITS_PER_LEVEL. The + * difference is that it allocates an extra level when HBITMAP_LOG_MAX_SIZE + * is an exact multiple of BITS_PER_LEVEL. + */ +#define HBITMAP_LEVELS ((HBITMAP_LOG_MAX_SIZE / BITS_PER_LEVEL) + 1) + +struct HBitmapIter { + const HBitmap *hb; + + /* Copied from hb for access in the inline functions (hb is opaque). */ + int granularity; + + /* Entry offset into the last-level array of longs. */ + size_t pos; + + /* The currently-active path in the tree. Each item of cur[i] stores + * the bits (i.e. the subtrees) yet to be processed under that node. + */ + unsigned long cur[HBITMAP_LEVELS]; +}; + +/** + * hbitmap_alloc: + * @size: Number of bits in the bitmap. + * @granularity: Granularity of the bitmap. Aligned groups of 2^@granularity + * bits will be represented by a single bit. Each operation on a + * range of bits first rounds the bits to determine which group they land + * in, and then affect the entire set; iteration will only visit the first + * bit of each group. + * + * Allocate a new HBitmap. + */ +HBitmap *hbitmap_alloc(uint64_t size, int granularity); + +/** + * hbitmap_empty: + * @hb: HBitmap to operate on. + * + * Return whether the bitmap is empty. + */ +bool hbitmap_empty(const HBitmap *hb); + +/** + * hbitmap_granularity: + * @hb: HBitmap to operate on. + * + * Return the granularity of the HBitmap. + */ +int hbitmap_granularity(const HBitmap *hb); + +/** + * hbitmap_count: + * @hb: HBitmap to operate on. + * + * Return the number of bits set in the HBitmap. + */ +uint64_t hbitmap_count(const HBitmap *hb); + +/** + * hbitmap_set: + * @hb: HBitmap to operate on. + * @start: First bit to set (0-based). + * @count: Number of bits to set. + * + * Set a consecutive range of bits in an HBitmap. + */ +void hbitmap_set(HBitmap *hb, uint64_t start, uint64_t count); + +/** + * hbitmap_reset: + * @hb: HBitmap to operate on. + * @start: First bit to reset (0-based). + * @count: Number of bits to reset. + * + * Reset a consecutive range of bits in an HBitmap. + */ +void hbitmap_reset(HBitmap *hb, uint64_t start, uint64_t count); + +/** + * hbitmap_get: + * @hb: HBitmap to operate on. + * @item: Bit to query (0-based). + * + * Return whether the @item-th bit in an HBitmap is set. + */ +bool hbitmap_get(const HBitmap *hb, uint64_t item); + +/** + * hbitmap_free: + * @hb: HBitmap to operate on. + * + * Free an HBitmap and all of its associated memory. + */ +void hbitmap_free(HBitmap *hb); + +/** + * hbitmap_iter_init: + * @hbi: HBitmapIter to initialize. + * @hb: HBitmap to iterate on. + * @first: First bit to visit (0-based, must be strictly less than the + * size of the bitmap). + * + * Set up @hbi to iterate on the HBitmap @hb. hbitmap_iter_next will return + * the lowest-numbered bit that is set in @hb, starting at @first. + * + * Concurrent setting of bits is acceptable, and will at worst cause the + * iteration to miss some of those bits. Resetting bits before the current + * position of the iterator is also okay. However, concurrent resetting of + * bits can lead to unexpected behavior if the iterator has not yet reached + * those bits. + */ +void hbitmap_iter_init(HBitmapIter *hbi, const HBitmap *hb, uint64_t first); + +/* hbitmap_iter_skip_words: + * @hbi: HBitmapIter to operate on. + * + * Internal function used by hbitmap_iter_next and hbitmap_iter_next_word. + */ +unsigned long hbitmap_iter_skip_words(HBitmapIter *hbi); + +/** + * hbitmap_iter_next: + * @hbi: HBitmapIter to operate on. + * + * Return the next bit that is set in @hbi's associated HBitmap, + * or -1 if all remaining bits are zero. + */ +static inline int64_t hbitmap_iter_next(HBitmapIter *hbi) +{ + unsigned long cur = hbi->cur[HBITMAP_LEVELS - 1]; + int64_t item; + + if (cur == 0) { + cur = hbitmap_iter_skip_words(hbi); + if (cur == 0) { + return -1; + } + } + + /* The next call will resume work from the next bit. */ + hbi->cur[HBITMAP_LEVELS - 1] = cur & (cur - 1); + item = ((uint64_t)hbi->pos << BITS_PER_LEVEL) + ffsl(cur) - 1; + + return item << hbi->granularity; +} + +/** + * hbitmap_iter_next_word: + * @hbi: HBitmapIter to operate on. + * @p_cur: Location where to store the next non-zero word. + * + * Return the index of the next nonzero word that is set in @hbi's + * associated HBitmap, and set *p_cur to the content of that word + * (bits before the index that was passed to hbitmap_iter_init are + * trimmed on the first call). Return -1, and set *p_cur to zero, + * if all remaining words are zero. + */ +static inline size_t hbitmap_iter_next_word(HBitmapIter *hbi, unsigned long *p_cur) +{ + unsigned long cur = hbi->cur[HBITMAP_LEVELS - 1]; + + if (cur == 0) { + cur = hbitmap_iter_skip_words(hbi); + if (cur == 0) { + *p_cur = 0; + return -1; + } + } + + /* The next call will resume work from the next word. */ + hbi->cur[HBITMAP_LEVELS - 1] = 0; + *p_cur = cur; + return hbi->pos; +} + + +#endif diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h index 81c9a754ae..2a32be4cc0 100644 --- a/include/qemu/host-utils.h +++ b/include/qemu/host-utils.h @@ -26,6 +26,7 @@ #define HOST_UTILS_H 1 #include "qemu/compiler.h" /* QEMU_GNUC_PREREQ */ +#include <string.h> /* ffsl */ #if defined(__x86_64__) #define __HAVE_FAST_MULU64__ @@ -237,4 +238,29 @@ static inline int ctpop64(uint64_t val) #endif } +/* glibc does not provide an inline version of ffsl, so always define + * ours. We need to give it a different name, however. + */ +#ifdef __GLIBC__ +#define ffsl qemu_ffsl +#endif +static inline int ffsl(long val) +{ + if (!val) { + return 0; + } + +#if QEMU_GNUC_PREREQ(3, 4) + return __builtin_ctzl(val) + 1; +#else + if (sizeof(long) == 4) { + return ctz32(val) + 1; + } else if (sizeof(long) == 8) { + return ctz64(val) + 1; + } else { + abort(); + } +#endif +} + #endif diff --git a/qapi-schema.json b/qapi-schema.json index 6d7252b9e8..3a4817b391 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -325,6 +325,80 @@ { 'command': 'query-chardev', 'returns': ['ChardevInfo'] } ## +# @DataFormat: +# +# An enumeration of data format. +# +# @utf8: The data format is 'utf8'. +# +# @base64: The data format is 'base64'. +# +# Since: 1.4 +## +{ 'enum': 'DataFormat' + 'data': [ 'utf8', 'base64' ] } + +## +# @memchar-write: +# +# Provide writing interface for memchardev. Write data to char +# device 'memory'. +# +# @device: the name of the memory char device. +# +# @size: the size to write in bytes. +# +# @data: the source data write to memchar. +# +# @format: #optional the format of the data write to chardev 'memory', +# by default is 'utf8'. +# +# Returns: Nothing on success +# If @device is not a valid char device, DeviceNotFound +# +# Since: 1.4 +## +{ 'command': 'memchar-write', + 'data': {'device': 'str', 'size': 'int', 'data': 'str', + '*format': 'DataFormat'} } + +## +# @MemCharRead +# +# Result of QMP command memchar-read. +# +# @data: The data read from memchar as string. +# +# @count: The numbers of bytes read from. +# +# Since: 1.4 +## +{ 'type': 'MemCharRead', + 'data': { 'data': 'str', 'count': 'int' } } + +## +# @memchar-read: +# +# Provide read interface for memchardev. Read from the char +# device 'memory' and return the data. +# +# @device: the name of the memory char device. +# +# @size: the size to read in bytes. +# +# @format: #optional the format of the data want to read from +# memchardev, by default is 'utf8'. +# +# Returns: @MemCharRead +# If @device is not a valid memchr device, DeviceNotFound +# +# Since: 1.4 +## +{ 'command': 'memchar-read', + 'data': {'device': 'str', 'size': 'int', '*format': 'DataFormat'}, + 'returns': 'MemCharRead' } + +## # @CommandInfo: # # Information about a QMP command @@ -667,10 +741,12 @@ # # @count: number of dirty bytes according to the dirty bitmap # +# @granularity: granularity of the dirty bitmap in bytes (since 1.4) +# # Since: 1.3 ## { 'type': 'BlockDirtyInfo', - 'data': {'count': 'int'} } + 'data': {'count': 'int', 'granularity': 'int'} } ## # @BlockInfo: @@ -977,28 +1053,10 @@ # # @actual: the number of bytes the balloon currently contains # -# @mem_swapped_in: #optional number of pages swapped in within the guest -# -# @mem_swapped_out: #optional number of pages swapped out within the guest -# -# @major_page_faults: #optional number of major page faults within the guest -# -# @minor_page_faults: #optional number of minor page faults within the guest -# -# @free_mem: #optional amount of memory (in bytes) free in the guest -# -# @total_mem: #optional amount of memory (in bytes) visible to the guest -# # Since: 0.14.0 # -# Notes: all current versions of QEMU do not fill out optional information in -# this structure. ## -{ 'type': 'BalloonInfo', - 'data': {'actual': 'int', '*mem_swapped_in': 'int', - '*mem_swapped_out': 'int', '*major_page_faults': 'int', - '*minor_page_faults': 'int', '*free_mem': 'int', - '*total_mem': 'int'} } +{ 'type': 'BalloonInfo', 'data': {'actual': 'int' } } ## # @query-balloon: @@ -1634,6 +1692,14 @@ # (all the disk, only the sectors allocated in the topmost image, or # only new I/O). # +# @granularity: #optional granularity of the dirty bitmap, default is 64K +# if the image format doesn't have clusters, 4K if the clusters +# are smaller than that, else the cluster size. Must be a +# power of 2 between 512 and 64M (since 1.4). +# +# @buf-size: #optional maximum amount of data in flight from source to +# target (since 1.4). +# # @on-source-error: #optional the action to take on an error on the source, # default 'report'. 'stop' and 'enospc' can only be used # if the block device supports io-status (see BlockInfo). @@ -1650,7 +1716,8 @@ { 'command': 'drive-mirror', 'data': { 'device': 'str', 'target': 'str', '*format': 'str', 'sync': 'MirrorSyncMode', '*mode': 'NewImageMode', - '*speed': 'int', '*on-source-error': 'BlockdevOnError', + '*speed': 'int', '*granularity': 'uint32', + '*buf-size': 'int', '*on-source-error': 'BlockdevOnError', '*on-target-error': 'BlockdevOnError' } } ## diff --git a/qemu-char.c b/qemu-char.c index da1db1d104..ac5d62dd9a 100644 --- a/qemu-char.c +++ b/qemu-char.c @@ -98,6 +98,7 @@ #include "ui/qemu-spice.h" #define READ_BUF_LEN 4096 +#define CBUFF_SIZE 65536 /***********************************************************/ /* character device */ @@ -2643,6 +2644,199 @@ size_t qemu_chr_mem_osize(const CharDriverState *chr) return d->outbuf_size; } +/*********************************************************/ +/*CircularMemory chardev*/ + +typedef struct { + size_t size; + size_t prod; + size_t cons; + uint8_t *cbuf; +} CirMemCharDriver; + +static bool cirmem_chr_is_empty(const CharDriverState *chr) +{ + const CirMemCharDriver *d = chr->opaque; + + return d->cons == d->prod; +} + +static size_t qemu_chr_cirmem_count(const CharDriverState *chr) +{ + const CirMemCharDriver *d = chr->opaque; + + return (d->prod - d->cons); +} + +static int cirmem_chr_write(CharDriverState *chr, const uint8_t *buf, int len) +{ + CirMemCharDriver *d = chr->opaque; + int i; + + if (!buf || (len < 0)) { + return -1; + } + + for (i = 0; i < len; i++ ) { + /* Avoid writing the IAC information to the queue. */ + if ((unsigned char)buf[i] == IAC) { + continue; + } + + d->cbuf[d->prod++ % d->size] = buf[i]; + if ((d->prod - d->cons) > d->size) { + d->cons = d->prod - d->size; + } + } + + return 0; +} + +static int cirmem_chr_read(CharDriverState *chr, uint8_t *buf, int len) +{ + CirMemCharDriver *d = chr->opaque; + int i; + + for (i = 0; i < len && !cirmem_chr_is_empty(chr); i++) { + buf[i] = d->cbuf[d->cons++ % d->size]; + } + + return i; +} + +static void cirmem_chr_close(struct CharDriverState *chr) +{ + CirMemCharDriver *d = chr->opaque; + + g_free(d->cbuf); + g_free(d); + chr->opaque = NULL; +} + +static CharDriverState *qemu_chr_open_cirmemchr(QemuOpts *opts) +{ + CharDriverState *chr; + CirMemCharDriver *d; + + chr = g_malloc0(sizeof(CharDriverState)); + d = g_malloc(sizeof(*d)); + + d->size = qemu_opt_get_number(opts, "maxcapacity", 0); + if (d->size == 0) { + d->size = CBUFF_SIZE; + } + + /* The size must be power of 2 */ + if (d->size & (d->size - 1)) { + fprintf(stderr, "chardev: size of memory device must be power of 2\n"); + goto fail; + } + + d->prod = 0; + d->cons = 0; + d->cbuf = g_malloc0(d->size); + + chr->opaque = d; + chr->chr_write = cirmem_chr_write; + chr->chr_close = cirmem_chr_close; + + return chr; + +fail: + g_free(d); + g_free(chr); + return NULL; +} + +static bool qemu_is_chr(const CharDriverState *chr, const char *filename) +{ + return strcmp(chr->filename, filename); +} + +void qmp_memchar_write(const char *device, int64_t size, + const char *data, bool has_format, + enum DataFormat format, + Error **errp) +{ + CharDriverState *chr; + guchar *write_data; + int ret; + gsize write_count; + + chr = qemu_chr_find(device); + if (!chr) { + error_set(errp, QERR_DEVICE_NOT_FOUND, device); + return; + } + + if (qemu_is_chr(chr, "memory")) { + error_setg(errp,"%s is not memory char device", device); + return; + } + + write_count = (gsize)size; + + if (has_format && (format == DATA_FORMAT_BASE64)) { + write_data = g_base64_decode(data, &write_count); + } else { + write_data = (uint8_t *)data; + } + + ret = cirmem_chr_write(chr, write_data, write_count); + + if (ret < 0) { + error_setg(errp, "Failed to write to device %s", device); + return; + } +} + +MemCharRead *qmp_memchar_read(const char *device, int64_t size, + bool has_format, enum DataFormat format, + Error **errp) +{ + CharDriverState *chr; + guchar *read_data; + MemCharRead *meminfo; + size_t count; + + chr = qemu_chr_find(device); + if (!chr) { + error_set(errp, QERR_DEVICE_NOT_FOUND, device); + return NULL; + } + + if (qemu_is_chr(chr, "memory")) { + error_setg(errp,"%s is not memory char device", device); + return NULL; + } + + if (size <= 0) { + error_setg(errp, "size must be greater than zero"); + return NULL; + } + + meminfo = g_malloc0(sizeof(MemCharRead)); + + count = qemu_chr_cirmem_count(chr); + if (count == 0) { + meminfo->data = g_strdup(""); + return meminfo; + } + + size = size > count ? count : size; + read_data = g_malloc0(size + 1); + + meminfo->count = cirmem_chr_read(chr, read_data, size); + + if (has_format && (format == DATA_FORMAT_BASE64)) { + meminfo->data = g_base64_encode(read_data, (size_t)meminfo->count); + } else { + meminfo->data = (char *)read_data; + } + + return meminfo; +} + QemuOpts *qemu_chr_parse_compat(const char *label, const char *filename) { char host[65], port[33], width[8], height[8]; @@ -2697,6 +2891,11 @@ QemuOpts *qemu_chr_parse_compat(const char *label, const char *filename) qemu_opt_set(opts, "path", filename); return opts; } + if (strstart(filename, "memory", &p)) { + qemu_opt_set(opts, "backend", "memory"); + qemu_opt_set(opts, "maxcapacity", p); + return opts; + } if (strstart(filename, "file:", &p)) { qemu_opt_set(opts, "backend", "file"); qemu_opt_set(opts, "path", p); @@ -2796,6 +2995,7 @@ static const struct { { .name = "udp", .open = qemu_chr_open_udp }, { .name = "msmouse", .open = qemu_chr_open_msmouse }, { .name = "vc", .open = text_console_init }, + { .name = "memory", .open = qemu_chr_open_cirmemchr }, #ifdef _WIN32 { .name = "file", .open = qemu_chr_open_win_file_out }, { .name = "pipe", .open = qemu_chr_open_win_pipe }, @@ -3055,6 +3255,9 @@ QemuOptsList qemu_chardev_opts = { },{ .name = "debug", .type = QEMU_OPT_NUMBER, + },{ + .name = "maxcapacity", + .type = QEMU_OPT_NUMBER, }, { /* end of list */ } }, diff --git a/qemu-options.hx b/qemu-options.hx index 4e2b4994a2..2d44137bf9 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -1736,6 +1736,7 @@ DEF("chardev", HAS_ARG, QEMU_OPTION_chardev, "-chardev msmouse,id=id[,mux=on|off]\n" "-chardev vc,id=id[[,width=width][,height=height]][[,cols=cols][,rows=rows]]\n" " [,mux=on|off]\n" + "-chardev memory,id=id,maxcapacity=maxcapacity\n" "-chardev file,id=id,path=path[,mux=on|off]\n" "-chardev pipe,id=id,path=path[,mux=on|off]\n" #ifdef _WIN32 @@ -1777,6 +1778,7 @@ Backend is one of: @option{udp}, @option{msmouse}, @option{vc}, +@option{memory}, @option{file}, @option{pipe}, @option{console}, @@ -1885,6 +1887,14 @@ the console, in pixels. @option{cols} and @option{rows} specify that the console be sized to fit a text console with the given dimensions. +@item -chardev memory ,id=@var{id} ,maxcapacity=@var{maxcapacity} + +Create a circular buffer with fixed size indicated by optionally @option{maxcapacity} +which will be default 64K if it is not given. + +@option{maxcapacity} specifies the max capacity of the size of circular buffer +to create. Should be power of 2. + @item -chardev file ,id=@var{id} ,path=@var{path} Log all traffic received from the guest to a file. diff --git a/qmp-commands.hx b/qmp-commands.hx index cbf12804be..f58a8411ea 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -466,6 +466,72 @@ Note: inject-nmi fails when the guest doesn't support injecting. EQMP { + .name = "memchar-write", + .args_type = "device:s,size:i,data:s,format:s?", + .mhandler.cmd_new = qmp_marshal_input_memchar_write, + }, + +SQMP +memchar-write +------------- + +Provide writing interface for CirMemCharDriver. Write data to memory +char device. + +Arguments: + +- "device": the name of the char device, must be unique (json-string) +- "size": the memory size, in bytes, should be power of 2 (json-int) +- "data": the source data write to memory (json-string) +- "format": the data format write to memory, default is + utf8. (json-string, optional) + - Possible values: "utf8", "base64" + +Example: + +-> { "execute": "memchar-write", + "arguments": { "device": foo, + "size": 8, + "data": "abcdefgh", + "format": "utf8" } } +<- { "return": {} } + +EQMP + + { + .name = "memchar-read", + .args_type = "device:s,size:i,format:s?", + .mhandler.cmd_new = qmp_marshal_input_memchar_read, + }, + +SQMP +memchar-read +------------- + +Provide read interface for CirMemCharDriver. Read from the char +device memory and return the data with size. + +Arguments: + +- "device": the name of the char device, must be unique (json-string) +- "size": the memory size wanted to read in bytes (refer to unencoded + size of the raw data), would adjust to the init size of the + memchar if the requested size is larger than it. (json-int) +- "format": the data format write to memchardev, default is + utf8. (json-string, optional) + - Possible values: "utf8", "base64" + +Example: + +-> { "execute": "memchar-read", + "arguments": { "device": foo, + "size": 1000, + "format": "utf8" } } +<- { "return": { "data": "data string...", "count": 1000 } } + +EQMP + + { .name = "xen-save-devices-state", .args_type = "filename:F", .mhandler.cmd_new = qmp_marshal_input_xen_save_devices_state, @@ -938,7 +1004,8 @@ EQMP { .name = "drive-mirror", .args_type = "sync:s,device:B,target:s,speed:i?,mode:s?,format:s?," - "on-source-error:s?,on-target-error:s?", + "on-source-error:s?,on-target-error:s?," + "granularity:i?,buf-size:i?", .mhandler.cmd_new = qmp_marshal_input_drive_mirror, }, @@ -962,6 +1029,9 @@ Arguments: file/device (NewImageMode, optional, default 'absolute-paths') - "speed": maximum speed of the streaming job, in bytes per second (json-int) +- "granularity": granularity of the dirty bitmap, in bytes (json-int, optional) +- "buf_size": maximum amount of data in flight from source to target, in bytes + (json-int, default 10M) - "sync": what parts of the disk image should be copied to the destination; possibilities include "full" for all the disk, "top" for only the sectors allocated in the topmost image, or "none" to only replicate new I/O @@ -971,6 +1041,10 @@ Arguments: - "on-target-error": the action to take on an error on the target (BlockdevOnError, default 'report') +The default value of the granularity is the image cluster size clamped +between 4096 and 65536, if the image format defines one. If the format +does not define a cluster size, the default value of the granularity +is 65536. Example: @@ -2549,13 +2623,6 @@ Make an asynchronous request for balloon info. When the request completes a json-object will be returned containing the following data: - "actual": current balloon value in bytes (json-int) -- "mem_swapped_in": Amount of memory swapped in bytes (json-int, optional) -- "mem_swapped_out": Amount of memory swapped out in bytes (json-int, optional) -- "major_page_faults": Number of major faults (json-int, optional) -- "minor_page_faults": Number of minor faults (json-int, optional) -- "free_mem": Total amount of free and unused memory in - bytes (json-int, optional) -- "total_mem": Total amount of available memory in bytes (json-int, optional) Example: @@ -2563,12 +2630,6 @@ Example: <- { "return":{ "actual":1073741824, - "mem_swapped_in":0, - "mem_swapped_out":0, - "major_page_faults":142, - "minor_page_faults":239245, - "free_mem":1014185984, - "total_mem":1044668416 } } diff --git a/tests/Makefile b/tests/Makefile index 804ce42962..c681cebd18 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -45,6 +45,8 @@ gcov-files-test-aio-$(CONFIG_WIN32) = aio-win32.c gcov-files-test-aio-$(CONFIG_POSIX) = aio-posix.c check-unit-y += tests/test-thread-pool$(EXESUF) gcov-files-test-thread-pool-y = thread-pool.c +gcov-files-test-hbitmap-y = util/hbitmap.c +check-unit-y += tests/test-hbitmap$(EXESUF) check-unit-y += tests/test-x86-cpuid$(EXESUF) # all code tested by test-x86-cpuid is inside topology.h gcov-files-test-x86-cpuid-y = @@ -94,6 +96,7 @@ tests/test-coroutine$(EXESUF): tests/test-coroutine.o $(block-obj-y) libqemuutil tests/test-aio$(EXESUF): tests/test-aio.o $(block-obj-y) libqemuutil.a libqemustub.a tests/test-thread-pool$(EXESUF): tests/test-thread-pool.o $(block-obj-y) libqemuutil.a libqemustub.a tests/test-iov$(EXESUF): tests/test-iov.o libqemuutil.a +tests/test-hbitmap$(EXESUF): tests/test-hbitmap.o libqemuutil.a libqemustub.a tests/test-x86-cpuid$(EXESUF): tests/test-x86-cpuid.o tests/test-qapi-types.c tests/test-qapi-types.h :\ diff --git a/tests/qemu-iotests/041 b/tests/qemu-iotests/041 index c6eb851871..b040820c51 100755 --- a/tests/qemu-iotests/041 +++ b/tests/qemu-iotests/041 @@ -207,6 +207,37 @@ class TestSingleDrive(ImageMirroringTestCase): self.assertTrue(self.compare_images(test_img, target_img), 'target image does not match source after mirroring') + def test_small_buffer(self): + self.assert_no_active_mirrors() + + # A small buffer is rounded up automatically + result = self.vm.qmp('drive-mirror', device='drive0', sync='full', + buf_size=4096, target=target_img) + self.assert_qmp(result, 'return', {}) + + self.complete_and_wait() + result = self.vm.qmp('query-block') + self.assert_qmp(result, 'return[0]/inserted/file', target_img) + self.vm.shutdown() + self.assertTrue(self.compare_images(test_img, target_img), + 'target image does not match source after mirroring') + + def test_small_buffer2(self): + self.assert_no_active_mirrors() + + qemu_img('create', '-f', iotests.imgfmt, '-o', 'cluster_size=%d,size=%d' + % (TestSingleDrive.image_len, TestSingleDrive.image_len), target_img) + result = self.vm.qmp('drive-mirror', device='drive0', sync='full', + buf_size=65536, mode='existing', target=target_img) + self.assert_qmp(result, 'return', {}) + + self.complete_and_wait() + result = self.vm.qmp('query-block') + self.assert_qmp(result, 'return[0]/inserted/file', target_img) + self.vm.shutdown() + self.assertTrue(self.compare_images(test_img, target_img), + 'target image does not match source after mirroring') + def test_large_cluster(self): self.assert_no_active_mirrors() @@ -292,6 +323,27 @@ class TestMirrorNoBacking(ImageMirroringTestCase): self.assertTrue(self.compare_images(test_img, target_img), 'target image does not match source after mirroring') + def test_large_cluster(self): + self.assert_no_active_mirrors() + + # qemu-img create fails if the image is not there + qemu_img('create', '-f', iotests.imgfmt, '-o', 'size=%d' + %(TestMirrorNoBacking.image_len), target_backing_img) + qemu_img('create', '-f', iotests.imgfmt, '-o', 'cluster_size=%d,backing_file=%s' + % (TestMirrorNoBacking.image_len, target_backing_img), target_img) + os.remove(target_backing_img) + + result = self.vm.qmp('drive-mirror', device='drive0', sync='full', + mode='existing', target=target_img) + self.assert_qmp(result, 'return', {}) + + self.complete_and_wait() + result = self.vm.qmp('query-block') + self.assert_qmp(result, 'return[0]/inserted/file', target_img) + self.vm.shutdown() + self.assertTrue(self.compare_images(test_img, target_img), + 'target image does not match source after mirroring') + class TestReadErrors(ImageMirroringTestCase): image_len = 2 * 1024 * 1024 # MB @@ -330,6 +382,9 @@ new_state = "1" '-o', 'backing_file=blkdebug:%s:%s,backing_fmt=raw' % (self.blkdebug_file, backing_img), test_img) + # Write something for tests that use sync='top' + qemu_io('-c', 'write %d 512' % (self.MIRROR_GRANULARITY + 65536), + test_img) self.vm = iotests.VM().add_drive(test_img) self.vm.launch() @@ -383,6 +438,32 @@ new_state = "1" self.complete_and_wait() self.vm.shutdown() + def test_large_cluster(self): + self.assert_no_active_mirrors() + + # Test COW into the target image. The first half of the + # cluster at MIRROR_GRANULARITY has to be copied from + # backing_img, even though sync='top'. + qemu_img('create', '-f', iotests.imgfmt, '-ocluster_size=131072,backing_file=%s' %(backing_img), target_img) + result = self.vm.qmp('drive-mirror', device='drive0', sync='top', + on_source_error='ignore', + mode='existing', target=target_img) + self.assert_qmp(result, 'return', {}) + + event = self.vm.get_qmp_event(wait=True) + self.assertEquals(event['event'], 'BLOCK_JOB_ERROR') + self.assert_qmp(event, 'data/device', 'drive0') + self.assert_qmp(event, 'data/operation', 'read') + result = self.vm.qmp('query-block-jobs') + self.assert_qmp(result, 'return[0]/paused', False) + self.complete_and_wait() + self.vm.shutdown() + + # Detach blkdebug to compare images successfully + qemu_img('rebase', '-f', iotests.imgfmt, '-u', '-b', backing_img, test_img) + self.assertTrue(self.compare_images(test_img, target_img), + 'target image does not match source after mirroring') + def test_stop_read(self): self.assert_no_active_mirrors() diff --git a/tests/qemu-iotests/041.out b/tests/qemu-iotests/041.out index 71009c239f..84bfd63fba 100644 --- a/tests/qemu-iotests/041.out +++ b/tests/qemu-iotests/041.out @@ -1,5 +1,5 @@ -.................. +...................... ---------------------------------------------------------------------- -Ran 18 tests +Ran 22 tests OK diff --git a/tests/test-hbitmap.c b/tests/test-hbitmap.c new file mode 100644 index 0000000000..8c902f2055 --- /dev/null +++ b/tests/test-hbitmap.c @@ -0,0 +1,401 @@ +/* + * Hierarchical bitmap unit-tests. + * + * Copyright (C) 2012 Red Hat Inc. + * + * Author: Paolo Bonzini <pbonzini@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include <glib.h> +#include <stdarg.h> +#include "qemu/hbitmap.h" + +#define LOG_BITS_PER_LONG (BITS_PER_LONG == 32 ? 5 : 6) + +#define L1 BITS_PER_LONG +#define L2 (BITS_PER_LONG * L1) +#define L3 (BITS_PER_LONG * L2) + +typedef struct TestHBitmapData { + HBitmap *hb; + unsigned long *bits; + size_t size; + int granularity; +} TestHBitmapData; + + +/* Check that the HBitmap and the shadow bitmap contain the same data, + * ignoring the same "first" bits. + */ +static void hbitmap_test_check(TestHBitmapData *data, + uint64_t first) +{ + uint64_t count = 0; + size_t pos; + int bit; + HBitmapIter hbi; + int64_t i, next; + + hbitmap_iter_init(&hbi, data->hb, first); + + i = first; + for (;;) { + next = hbitmap_iter_next(&hbi); + if (next < 0) { + next = data->size; + } + + while (i < next) { + pos = i >> LOG_BITS_PER_LONG; + bit = i & (BITS_PER_LONG - 1); + i++; + g_assert_cmpint(data->bits[pos] & (1UL << bit), ==, 0); + } + + if (next == data->size) { + break; + } + + pos = i >> LOG_BITS_PER_LONG; + bit = i & (BITS_PER_LONG - 1); + i++; + count++; + g_assert_cmpint(data->bits[pos] & (1UL << bit), !=, 0); + } + + if (first == 0) { + g_assert_cmpint(count << data->granularity, ==, hbitmap_count(data->hb)); + } +} + +/* This is provided instead of a test setup function so that the sizes + are kept in the test functions (and not in main()) */ +static void hbitmap_test_init(TestHBitmapData *data, + uint64_t size, int granularity) +{ + size_t n; + data->hb = hbitmap_alloc(size, granularity); + + n = (size + BITS_PER_LONG - 1) / BITS_PER_LONG; + if (n == 0) { + n = 1; + } + data->bits = g_new0(unsigned long, n); + data->size = size; + data->granularity = granularity; + if (size) { + hbitmap_test_check(data, 0); + } +} + +static void hbitmap_test_teardown(TestHBitmapData *data, + const void *unused) +{ + if (data->hb) { + hbitmap_free(data->hb); + data->hb = NULL; + } + if (data->bits) { + g_free(data->bits); + data->bits = NULL; + } +} + +/* Set a range in the HBitmap and in the shadow "simple" bitmap. + * The two bitmaps are then tested against each other. + */ +static void hbitmap_test_set(TestHBitmapData *data, + uint64_t first, uint64_t count) +{ + hbitmap_set(data->hb, first, count); + while (count-- != 0) { + size_t pos = first >> LOG_BITS_PER_LONG; + int bit = first & (BITS_PER_LONG - 1); + first++; + + data->bits[pos] |= 1UL << bit; + } + + if (data->granularity == 0) { + hbitmap_test_check(data, 0); + } +} + +/* Reset a range in the HBitmap and in the shadow "simple" bitmap. + */ +static void hbitmap_test_reset(TestHBitmapData *data, + uint64_t first, uint64_t count) +{ + hbitmap_reset(data->hb, first, count); + while (count-- != 0) { + size_t pos = first >> LOG_BITS_PER_LONG; + int bit = first & (BITS_PER_LONG - 1); + first++; + + data->bits[pos] &= ~(1UL << bit); + } + + if (data->granularity == 0) { + hbitmap_test_check(data, 0); + } +} + +static void hbitmap_test_check_get(TestHBitmapData *data) +{ + uint64_t count = 0; + uint64_t i; + + for (i = 0; i < data->size; i++) { + size_t pos = i >> LOG_BITS_PER_LONG; + int bit = i & (BITS_PER_LONG - 1); + unsigned long val = data->bits[pos] & (1UL << bit); + count += hbitmap_get(data->hb, i); + g_assert_cmpint(hbitmap_get(data->hb, i), ==, val != 0); + } + g_assert_cmpint(count, ==, hbitmap_count(data->hb)); +} + +static void test_hbitmap_zero(TestHBitmapData *data, + const void *unused) +{ + hbitmap_test_init(data, 0, 0); +} + +static void test_hbitmap_unaligned(TestHBitmapData *data, + const void *unused) +{ + hbitmap_test_init(data, L3 + 23, 0); + hbitmap_test_set(data, 0, 1); + hbitmap_test_set(data, L3 + 22, 1); +} + +static void test_hbitmap_iter_empty(TestHBitmapData *data, + const void *unused) +{ + hbitmap_test_init(data, L1, 0); +} + +static void test_hbitmap_iter_partial(TestHBitmapData *data, + const void *unused) +{ + hbitmap_test_init(data, L3, 0); + hbitmap_test_set(data, 0, L3); + hbitmap_test_check(data, 1); + hbitmap_test_check(data, L1 - 1); + hbitmap_test_check(data, L1); + hbitmap_test_check(data, L1 * 2 - 1); + hbitmap_test_check(data, L2 - 1); + hbitmap_test_check(data, L2); + hbitmap_test_check(data, L2 + 1); + hbitmap_test_check(data, L2 + L1); + hbitmap_test_check(data, L2 + L1 * 2 - 1); + hbitmap_test_check(data, L2 * 2 - 1); + hbitmap_test_check(data, L2 * 2); + hbitmap_test_check(data, L2 * 2 + 1); + hbitmap_test_check(data, L2 * 2 + L1); + hbitmap_test_check(data, L2 * 2 + L1 * 2 - 1); + hbitmap_test_check(data, L3 / 2); +} + +static void test_hbitmap_set_all(TestHBitmapData *data, + const void *unused) +{ + hbitmap_test_init(data, L3, 0); + hbitmap_test_set(data, 0, L3); +} + +static void test_hbitmap_get_all(TestHBitmapData *data, + const void *unused) +{ + hbitmap_test_init(data, L3, 0); + hbitmap_test_set(data, 0, L3); + hbitmap_test_check_get(data); +} + +static void test_hbitmap_get_some(TestHBitmapData *data, + const void *unused) +{ + hbitmap_test_init(data, 2 * L2, 0); + hbitmap_test_set(data, 10, 1); + hbitmap_test_check_get(data); + hbitmap_test_set(data, L1 - 1, 1); + hbitmap_test_check_get(data); + hbitmap_test_set(data, L1, 1); + hbitmap_test_check_get(data); + hbitmap_test_set(data, L2 - 1, 1); + hbitmap_test_check_get(data); + hbitmap_test_set(data, L2, 1); + hbitmap_test_check_get(data); +} + +static void test_hbitmap_set_one(TestHBitmapData *data, + const void *unused) +{ + hbitmap_test_init(data, 2 * L2, 0); + hbitmap_test_set(data, 10, 1); + hbitmap_test_set(data, L1 - 1, 1); + hbitmap_test_set(data, L1, 1); + hbitmap_test_set(data, L2 - 1, 1); + hbitmap_test_set(data, L2, 1); +} + +static void test_hbitmap_set_two_elem(TestHBitmapData *data, + const void *unused) +{ + hbitmap_test_init(data, 2 * L2, 0); + hbitmap_test_set(data, L1 - 1, 2); + hbitmap_test_set(data, L1 * 2 - 1, 4); + hbitmap_test_set(data, L1 * 4, L1 + 1); + hbitmap_test_set(data, L1 * 8 - 1, L1 + 1); + hbitmap_test_set(data, L2 - 1, 2); + hbitmap_test_set(data, L2 + L1 - 1, 8); + hbitmap_test_set(data, L2 + L1 * 4, L1 + 1); + hbitmap_test_set(data, L2 + L1 * 8 - 1, L1 + 1); +} + +static void test_hbitmap_set(TestHBitmapData *data, + const void *unused) +{ + hbitmap_test_init(data, L3 * 2, 0); + hbitmap_test_set(data, L1 - 1, L1 + 2); + hbitmap_test_set(data, L1 * 3 - 1, L1 + 2); + hbitmap_test_set(data, L1 * 5, L1 * 2 + 1); + hbitmap_test_set(data, L1 * 8 - 1, L1 * 2 + 1); + hbitmap_test_set(data, L2 - 1, L1 + 2); + hbitmap_test_set(data, L2 + L1 * 2 - 1, L1 + 2); + hbitmap_test_set(data, L2 + L1 * 4, L1 * 2 + 1); + hbitmap_test_set(data, L2 + L1 * 7 - 1, L1 * 2 + 1); + hbitmap_test_set(data, L2 * 2 - 1, L3 * 2 - L2 * 2); +} + +static void test_hbitmap_set_twice(TestHBitmapData *data, + const void *unused) +{ + hbitmap_test_init(data, L1 * 3, 0); + hbitmap_test_set(data, 0, L1 * 3); + hbitmap_test_set(data, L1, 1); +} + +static void test_hbitmap_set_overlap(TestHBitmapData *data, + const void *unused) +{ + hbitmap_test_init(data, L3 * 2, 0); + hbitmap_test_set(data, L1 - 1, L1 + 2); + hbitmap_test_set(data, L1 * 2 - 1, L1 * 2 + 2); + hbitmap_test_set(data, 0, L1 * 3); + hbitmap_test_set(data, L1 * 8 - 1, L2); + hbitmap_test_set(data, L2, L1); + hbitmap_test_set(data, L2 - L1 - 1, L1 * 8 + 2); + hbitmap_test_set(data, L2, L3 - L2 + 1); + hbitmap_test_set(data, L3 - L1, L1 * 3); + hbitmap_test_set(data, L3 - 1, 3); + hbitmap_test_set(data, L3 - 1, L2); +} + +static void test_hbitmap_reset_empty(TestHBitmapData *data, + const void *unused) +{ + hbitmap_test_init(data, L3, 0); + hbitmap_test_reset(data, 0, L3); +} + +static void test_hbitmap_reset(TestHBitmapData *data, + const void *unused) +{ + hbitmap_test_init(data, L3 * 2, 0); + hbitmap_test_set(data, L1 - 1, L1 + 2); + hbitmap_test_reset(data, L1 * 2 - 1, L1 * 2 + 2); + hbitmap_test_set(data, 0, L1 * 3); + hbitmap_test_reset(data, L1 * 8 - 1, L2); + hbitmap_test_set(data, L2, L1); + hbitmap_test_reset(data, L2 - L1 - 1, L1 * 8 + 2); + hbitmap_test_set(data, L2, L3 - L2 + 1); + hbitmap_test_reset(data, L3 - L1, L1 * 3); + hbitmap_test_set(data, L3 - 1, 3); + hbitmap_test_reset(data, L3 - 1, L2); + hbitmap_test_set(data, 0, L3 * 2); + hbitmap_test_reset(data, 0, L1); + hbitmap_test_reset(data, 0, L2); + hbitmap_test_reset(data, L3, L3); + hbitmap_test_set(data, L3 / 2, L3); +} + +static void test_hbitmap_granularity(TestHBitmapData *data, + const void *unused) +{ + /* Note that hbitmap_test_check has to be invoked manually in this test. */ + hbitmap_test_init(data, L1, 1); + hbitmap_test_set(data, 0, 1); + g_assert_cmpint(hbitmap_count(data->hb), ==, 2); + hbitmap_test_check(data, 0); + hbitmap_test_set(data, 2, 1); + g_assert_cmpint(hbitmap_count(data->hb), ==, 4); + hbitmap_test_check(data, 0); + hbitmap_test_set(data, 0, 3); + g_assert_cmpint(hbitmap_count(data->hb), ==, 4); + hbitmap_test_reset(data, 0, 1); + g_assert_cmpint(hbitmap_count(data->hb), ==, 2); +} + +static void test_hbitmap_iter_granularity(TestHBitmapData *data, + const void *unused) +{ + HBitmapIter hbi; + + /* Note that hbitmap_test_check has to be invoked manually in this test. */ + hbitmap_test_init(data, 131072 << 7, 7); + hbitmap_iter_init(&hbi, data->hb, 0); + g_assert_cmpint(hbitmap_iter_next(&hbi), <, 0); + + hbitmap_test_set(data, ((L2 + L1 + 1) << 7) + 8, 8); + hbitmap_iter_init(&hbi, data->hb, 0); + g_assert_cmpint(hbitmap_iter_next(&hbi), ==, (L2 + L1 + 1) << 7); + g_assert_cmpint(hbitmap_iter_next(&hbi), <, 0); + + hbitmap_iter_init(&hbi, data->hb, (L2 + L1 + 2) << 7); + g_assert_cmpint(hbitmap_iter_next(&hbi), <, 0); + + hbitmap_test_set(data, (131072 << 7) - 8, 8); + hbitmap_iter_init(&hbi, data->hb, 0); + g_assert_cmpint(hbitmap_iter_next(&hbi), ==, (L2 + L1 + 1) << 7); + g_assert_cmpint(hbitmap_iter_next(&hbi), ==, 131071 << 7); + g_assert_cmpint(hbitmap_iter_next(&hbi), <, 0); + + hbitmap_iter_init(&hbi, data->hb, (L2 + L1 + 2) << 7); + g_assert_cmpint(hbitmap_iter_next(&hbi), ==, 131071 << 7); + g_assert_cmpint(hbitmap_iter_next(&hbi), <, 0); +} + +static void hbitmap_test_add(const char *testpath, + void (*test_func)(TestHBitmapData *data, const void *user_data)) +{ + g_test_add(testpath, TestHBitmapData, NULL, NULL, test_func, + hbitmap_test_teardown); +} + +int main(int argc, char **argv) +{ + g_test_init(&argc, &argv, NULL); + hbitmap_test_add("/hbitmap/size/0", test_hbitmap_zero); + hbitmap_test_add("/hbitmap/size/unaligned", test_hbitmap_unaligned); + hbitmap_test_add("/hbitmap/iter/empty", test_hbitmap_iter_empty); + hbitmap_test_add("/hbitmap/iter/partial", test_hbitmap_iter_partial); + hbitmap_test_add("/hbitmap/iter/granularity", test_hbitmap_iter_granularity); + hbitmap_test_add("/hbitmap/get/all", test_hbitmap_get_all); + hbitmap_test_add("/hbitmap/get/some", test_hbitmap_get_some); + hbitmap_test_add("/hbitmap/set/all", test_hbitmap_set_all); + hbitmap_test_add("/hbitmap/set/one", test_hbitmap_set_one); + hbitmap_test_add("/hbitmap/set/two-elem", test_hbitmap_set_two_elem); + hbitmap_test_add("/hbitmap/set/general", test_hbitmap_set); + hbitmap_test_add("/hbitmap/set/twice", test_hbitmap_set_twice); + hbitmap_test_add("/hbitmap/set/overlap", test_hbitmap_set_overlap); + hbitmap_test_add("/hbitmap/reset/empty", test_hbitmap_reset_empty); + hbitmap_test_add("/hbitmap/reset/general", test_hbitmap_reset); + hbitmap_test_add("/hbitmap/granularity", test_hbitmap_granularity); + g_test_run(); + + return 0; +} diff --git a/trace-events b/trace-events index 09091e6d17..2b28076e45 100644 --- a/trace-events +++ b/trace-events @@ -79,10 +79,17 @@ commit_start(void *bs, void *base, void *top, void *s, void *co, void *opaque) " # block/mirror.c mirror_start(void *bs, void *s, void *co, void *opaque) "bs %p s %p co %p opaque %p" +mirror_restart_iter(void *s, int64_t cnt) "s %p dirty count %"PRId64 mirror_before_flush(void *s) "s %p" mirror_before_drain(void *s, int64_t cnt) "s %p dirty count %"PRId64 mirror_before_sleep(void *s, int64_t cnt, int synced) "s %p dirty count %"PRId64" synced %d" mirror_one_iteration(void *s, int64_t sector_num, int nb_sectors) "s %p sector_num %"PRId64" nb_sectors %d" +mirror_cow(void *s, int64_t sector_num) "s %p sector_num %"PRId64 +mirror_iteration_done(void *s, int64_t sector_num, int nb_sectors, int ret) "s %p sector_num %"PRId64" nb_sectors %d ret %d" +mirror_yield(void *s, int64_t cnt, int buf_free_count, int in_flight) "s %p dirty count %"PRId64" free buffers %d in_flight %d" +mirror_yield_in_flight(void *s, int64_t sector_num, int in_flight) "s %p sector_num %"PRId64" in_flight %d" +mirror_yield_buf_busy(void *s, int nb_chunks, int in_flight) "s %p requested chunks %d in_flight %d" +mirror_break_buf_busy(void *s, int nb_chunks, int in_flight) "s %p requested chunks %d in_flight %d" # blockdev.c qmp_block_job_cancel(void *job) "job %p" @@ -1060,3 +1067,8 @@ xics_set_irq_lsi(int srcno, int nr) "set_irq_lsi: srcno %d [irq %#x]" xics_ics_write_xive(int nr, int srcno, int server, uint8_t priority) "ics_write_xive: irq %#x [src %d] server %#x prio %#x" xics_ics_reject(int nr, int srcno) "reject irq %#x [src %d]" xics_ics_eoi(int nr) "ics_eoi: irq %#x" + +# hbitmap.c +hbitmap_iter_skip_words(const void *hb, void *hbi, uint64_t pos, unsigned long cur) "hb %p hbi %p pos %"PRId64" cur 0x%lx" +hbitmap_reset(void *hb, uint64_t start, uint64_t count, uint64_t sbit, uint64_t ebit) "hb %p items %"PRIu64",%"PRIu64" bits %"PRIu64"..%"PRIu64 +hbitmap_set(void *hb, uint64_t start, uint64_t count, uint64_t sbit, uint64_t ebit) "hb %p items %"PRIu64",%"PRIu64" bits %"PRIu64"..%"PRIu64 diff --git a/util/Makefile.objs b/util/Makefile.objs index 5baeb53af6..495a178557 100644 --- a/util/Makefile.objs +++ b/util/Makefile.objs @@ -2,7 +2,7 @@ util-obj-y = osdep.o cutils.o qemu-timer-common.o util-obj-$(CONFIG_WIN32) += oslib-win32.o qemu-thread-win32.o event_notifier-win32.o util-obj-$(CONFIG_POSIX) += oslib-posix.o qemu-thread-posix.o event_notifier-posix.o util-obj-y += envlist.o path.o host-utils.o cache-utils.o module.o -util-obj-y += bitmap.o bitops.o +util-obj-y += bitmap.o bitops.o hbitmap.o util-obj-y += acl.o util-obj-y += error.o qemu-error.o util-obj-$(CONFIG_POSIX) += compatfd.o diff --git a/util/hbitmap.c b/util/hbitmap.c new file mode 100644 index 0000000000..2aa487db74 --- /dev/null +++ b/util/hbitmap.c @@ -0,0 +1,401 @@ +/* + * Hierarchical Bitmap Data Type + * + * Copyright Red Hat, Inc., 2012 + * + * Author: Paolo Bonzini <pbonzini@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. See the COPYING file in the top-level directory. + */ + +#include <string.h> +#include <glib.h> +#include <assert.h> +#include "qemu/osdep.h" +#include "qemu/hbitmap.h" +#include "qemu/host-utils.h" +#include "trace.h" + +/* HBitmaps provides an array of bits. The bits are stored as usual in an + * array of unsigned longs, but HBitmap is also optimized to provide fast + * iteration over set bits; going from one bit to the next is O(logB n) + * worst case, with B = sizeof(long) * CHAR_BIT: the result is low enough + * that the number of levels is in fact fixed. + * + * In order to do this, it stacks multiple bitmaps with progressively coarser + * granularity; in all levels except the last, bit N is set iff the N-th + * unsigned long is nonzero in the immediately next level. When iteration + * completes on the last level it can examine the 2nd-last level to quickly + * skip entire words, and even do so recursively to skip blocks of 64 words or + * powers thereof (32 on 32-bit machines). + * + * Given an index in the bitmap, it can be split in group of bits like + * this (for the 64-bit case): + * + * bits 0-57 => word in the last bitmap | bits 58-63 => bit in the word + * bits 0-51 => word in the 2nd-last bitmap | bits 52-57 => bit in the word + * bits 0-45 => word in the 3rd-last bitmap | bits 46-51 => bit in the word + * + * So it is easy to move up simply by shifting the index right by + * log2(BITS_PER_LONG) bits. To move down, you shift the index left + * similarly, and add the word index within the group. Iteration uses + * ffs (find first set bit) to find the next word to examine; this + * operation can be done in constant time in most current architectures. + * + * Setting or clearing a range of m bits on all levels, the work to perform + * is O(m + m/W + m/W^2 + ...), which is O(m) like on a regular bitmap. + * + * When iterating on a bitmap, each bit (on any level) is only visited + * once. Hence, The total cost of visiting a bitmap with m bits in it is + * the number of bits that are set in all bitmaps. Unless the bitmap is + * extremely sparse, this is also O(m + m/W + m/W^2 + ...), so the amortized + * cost of advancing from one bit to the next is usually constant (worst case + * O(logB n) as in the non-amortized complexity). + */ + +struct HBitmap { + /* Number of total bits in the bottom level. */ + uint64_t size; + + /* Number of set bits in the bottom level. */ + uint64_t count; + + /* A scaling factor. Given a granularity of G, each bit in the bitmap will + * will actually represent a group of 2^G elements. Each operation on a + * range of bits first rounds the bits to determine which group they land + * in, and then affect the entire page; iteration will only visit the first + * bit of each group. Here is an example of operations in a size-16, + * granularity-1 HBitmap: + * + * initial state 00000000 + * set(start=0, count=9) 11111000 (iter: 0, 2, 4, 6, 8) + * reset(start=1, count=3) 00111000 (iter: 4, 6, 8) + * set(start=9, count=2) 00111100 (iter: 4, 6, 8, 10) + * reset(start=5, count=5) 00000000 + * + * From an implementation point of view, when setting or resetting bits, + * the bitmap will scale bit numbers right by this amount of bits. When + * iterating, the bitmap will scale bit numbers left by this amount of + * bits. + */ + int granularity; + + /* A number of progressively less coarse bitmaps (i.e. level 0 is the + * coarsest). Each bit in level N represents a word in level N+1 that + * has a set bit, except the last level where each bit represents the + * actual bitmap. + * + * Note that all bitmaps have the same number of levels. Even a 1-bit + * bitmap will still allocate HBITMAP_LEVELS arrays. + */ + unsigned long *levels[HBITMAP_LEVELS]; +}; + +static inline int popcountl(unsigned long l) +{ + return BITS_PER_LONG == 32 ? ctpop32(l) : ctpop64(l); +} + +/* Advance hbi to the next nonzero word and return it. hbi->pos + * is updated. Returns zero if we reach the end of the bitmap. + */ +unsigned long hbitmap_iter_skip_words(HBitmapIter *hbi) +{ + size_t pos = hbi->pos; + const HBitmap *hb = hbi->hb; + unsigned i = HBITMAP_LEVELS - 1; + + unsigned long cur; + do { + cur = hbi->cur[--i]; + pos >>= BITS_PER_LEVEL; + } while (cur == 0); + + /* Check for end of iteration. We always use fewer than BITS_PER_LONG + * bits in the level 0 bitmap; thus we can repurpose the most significant + * bit as a sentinel. The sentinel is set in hbitmap_alloc and ensures + * that the above loop ends even without an explicit check on i. + */ + + if (i == 0 && cur == (1UL << (BITS_PER_LONG - 1))) { + return 0; + } + for (; i < HBITMAP_LEVELS - 1; i++) { + /* Shift back pos to the left, matching the right shifts above. + * The index of this word's least significant set bit provides + * the low-order bits. + */ + pos = (pos << BITS_PER_LEVEL) + ffsl(cur) - 1; + hbi->cur[i] = cur & (cur - 1); + + /* Set up next level for iteration. */ + cur = hb->levels[i + 1][pos]; + } + + hbi->pos = pos; + trace_hbitmap_iter_skip_words(hbi->hb, hbi, pos, cur); + + assert(cur); + return cur; +} + +void hbitmap_iter_init(HBitmapIter *hbi, const HBitmap *hb, uint64_t first) +{ + unsigned i, bit; + uint64_t pos; + + hbi->hb = hb; + pos = first >> hb->granularity; + assert(pos < hb->size); + hbi->pos = pos >> BITS_PER_LEVEL; + hbi->granularity = hb->granularity; + + for (i = HBITMAP_LEVELS; i-- > 0; ) { + bit = pos & (BITS_PER_LONG - 1); + pos >>= BITS_PER_LEVEL; + + /* Drop bits representing items before first. */ + hbi->cur[i] = hb->levels[i][pos] & ~((1UL << bit) - 1); + + /* We have already added level i+1, so the lowest set bit has + * been processed. Clear it. + */ + if (i != HBITMAP_LEVELS - 1) { + hbi->cur[i] &= ~(1UL << bit); + } + } +} + +bool hbitmap_empty(const HBitmap *hb) +{ + return hb->count == 0; +} + +int hbitmap_granularity(const HBitmap *hb) +{ + return hb->granularity; +} + +uint64_t hbitmap_count(const HBitmap *hb) +{ + return hb->count << hb->granularity; +} + +/* Count the number of set bits between start and end, not accounting for + * the granularity. Also an example of how to use hbitmap_iter_next_word. + */ +static uint64_t hb_count_between(HBitmap *hb, uint64_t start, uint64_t last) +{ + HBitmapIter hbi; + uint64_t count = 0; + uint64_t end = last + 1; + unsigned long cur; + size_t pos; + + hbitmap_iter_init(&hbi, hb, start << hb->granularity); + for (;;) { + pos = hbitmap_iter_next_word(&hbi, &cur); + if (pos >= (end >> BITS_PER_LEVEL)) { + break; + } + count += popcountl(cur); + } + + if (pos == (end >> BITS_PER_LEVEL)) { + /* Drop bits representing the END-th and subsequent items. */ + int bit = end & (BITS_PER_LONG - 1); + cur &= (1UL << bit) - 1; + count += popcountl(cur); + } + + return count; +} + +/* Setting starts at the last layer and propagates up if an element + * changes from zero to non-zero. + */ +static inline bool hb_set_elem(unsigned long *elem, uint64_t start, uint64_t last) +{ + unsigned long mask; + bool changed; + + assert((last >> BITS_PER_LEVEL) == (start >> BITS_PER_LEVEL)); + assert(start <= last); + + mask = 2UL << (last & (BITS_PER_LONG - 1)); + mask -= 1UL << (start & (BITS_PER_LONG - 1)); + changed = (*elem == 0); + *elem |= mask; + return changed; +} + +/* The recursive workhorse (the depth is limited to HBITMAP_LEVELS)... */ +static void hb_set_between(HBitmap *hb, int level, uint64_t start, uint64_t last) +{ + size_t pos = start >> BITS_PER_LEVEL; + size_t lastpos = last >> BITS_PER_LEVEL; + bool changed = false; + size_t i; + + i = pos; + if (i < lastpos) { + uint64_t next = (start | (BITS_PER_LONG - 1)) + 1; + changed |= hb_set_elem(&hb->levels[level][i], start, next - 1); + for (;;) { + start = next; + next += BITS_PER_LONG; + if (++i == lastpos) { + break; + } + changed |= (hb->levels[level][i] == 0); + hb->levels[level][i] = ~0UL; + } + } + changed |= hb_set_elem(&hb->levels[level][i], start, last); + + /* If there was any change in this layer, we may have to update + * the one above. + */ + if (level > 0 && changed) { + hb_set_between(hb, level - 1, pos, lastpos); + } +} + +void hbitmap_set(HBitmap *hb, uint64_t start, uint64_t count) +{ + /* Compute range in the last layer. */ + uint64_t last = start + count - 1; + + trace_hbitmap_set(hb, start, count, + start >> hb->granularity, last >> hb->granularity); + + start >>= hb->granularity; + last >>= hb->granularity; + count = last - start + 1; + + hb->count += count - hb_count_between(hb, start, last); + hb_set_between(hb, HBITMAP_LEVELS - 1, start, last); +} + +/* Resetting works the other way round: propagate up if the new + * value is zero. + */ +static inline bool hb_reset_elem(unsigned long *elem, uint64_t start, uint64_t last) +{ + unsigned long mask; + bool blanked; + + assert((last >> BITS_PER_LEVEL) == (start >> BITS_PER_LEVEL)); + assert(start <= last); + + mask = 2UL << (last & (BITS_PER_LONG - 1)); + mask -= 1UL << (start & (BITS_PER_LONG - 1)); + blanked = *elem != 0 && ((*elem & ~mask) == 0); + *elem &= ~mask; + return blanked; +} + +/* The recursive workhorse (the depth is limited to HBITMAP_LEVELS)... */ +static void hb_reset_between(HBitmap *hb, int level, uint64_t start, uint64_t last) +{ + size_t pos = start >> BITS_PER_LEVEL; + size_t lastpos = last >> BITS_PER_LEVEL; + bool changed = false; + size_t i; + + i = pos; + if (i < lastpos) { + uint64_t next = (start | (BITS_PER_LONG - 1)) + 1; + + /* Here we need a more complex test than when setting bits. Even if + * something was changed, we must not blank bits in the upper level + * unless the lower-level word became entirely zero. So, remove pos + * from the upper-level range if bits remain set. + */ + if (hb_reset_elem(&hb->levels[level][i], start, next - 1)) { + changed = true; + } else { + pos++; + } + + for (;;) { + start = next; + next += BITS_PER_LONG; + if (++i == lastpos) { + break; + } + changed |= (hb->levels[level][i] != 0); + hb->levels[level][i] = 0UL; + } + } + + /* Same as above, this time for lastpos. */ + if (hb_reset_elem(&hb->levels[level][i], start, last)) { + changed = true; + } else { + lastpos--; + } + + if (level > 0 && changed) { + hb_reset_between(hb, level - 1, pos, lastpos); + } +} + +void hbitmap_reset(HBitmap *hb, uint64_t start, uint64_t count) +{ + /* Compute range in the last layer. */ + uint64_t last = start + count - 1; + + trace_hbitmap_reset(hb, start, count, + start >> hb->granularity, last >> hb->granularity); + + start >>= hb->granularity; + last >>= hb->granularity; + + hb->count -= hb_count_between(hb, start, last); + hb_reset_between(hb, HBITMAP_LEVELS - 1, start, last); +} + +bool hbitmap_get(const HBitmap *hb, uint64_t item) +{ + /* Compute position and bit in the last layer. */ + uint64_t pos = item >> hb->granularity; + unsigned long bit = 1UL << (pos & (BITS_PER_LONG - 1)); + + return (hb->levels[HBITMAP_LEVELS - 1][pos >> BITS_PER_LEVEL] & bit) != 0; +} + +void hbitmap_free(HBitmap *hb) +{ + unsigned i; + for (i = HBITMAP_LEVELS; i-- > 0; ) { + g_free(hb->levels[i]); + } + g_free(hb); +} + +HBitmap *hbitmap_alloc(uint64_t size, int granularity) +{ + HBitmap *hb = g_malloc0(sizeof (struct HBitmap)); + unsigned i; + + assert(granularity >= 0 && granularity < 64); + size = (size + (1ULL << granularity) - 1) >> granularity; + assert(size <= ((uint64_t)1 << HBITMAP_LOG_MAX_SIZE)); + + hb->size = size; + hb->granularity = granularity; + for (i = HBITMAP_LEVELS; i-- > 0; ) { + size = MAX((size + BITS_PER_LONG - 1) >> BITS_PER_LEVEL, 1); + hb->levels[i] = g_malloc0(size * sizeof(unsigned long)); + } + + /* We necessarily have free bits in level 0 due to the definition + * of HBITMAP_LEVELS, so use one for a sentinel. This speeds up + * hbitmap_iter_skip_words. + */ + assert(size == 1); + hb->levels[0][0] |= 1UL << (BITS_PER_LONG - 1); + return hb; +} |