91 files changed, 5962 insertions, 3071 deletions
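The central change in the aio-posix.c and aio-win32.c hunks below is that aio_poll() now acquires the AioContext itself and drops that lock only around the blocking wait (qemu_poll_ns() or WaitForMultipleObjects()), so other threads can take the context while this one sleeps. The following is a minimal, self-contained sketch of that locking pattern, not QEMU code: "ctx_lock" and "poll_with_ctx_lock" are illustrative names standing in for the AioContext's RFifoLock and for aio_poll().

/* Sketch only: take the lock, set up the poll set, release the lock just
 * for the blocking poll(), then re-acquire it before dispatching. */
#include <poll.h>
#include <pthread.h>

static pthread_mutex_t ctx_lock = PTHREAD_MUTEX_INITIALIZER;

int poll_with_ctx_lock(struct pollfd *fds, nfds_t nfds, int timeout_ms)
{
    int ret;

    pthread_mutex_lock(&ctx_lock);       /* fill fds[] under the lock */

    if (timeout_ms != 0) {
        pthread_mutex_unlock(&ctx_lock); /* never sleep while holding it */
    }
    ret = poll(fds, nfds, timeout_ms);   /* blocking wait, lock dropped */
    if (timeout_ms != 0) {
        pthread_mutex_lock(&ctx_lock);   /* re-take before dispatch */
    }

    /* ... dispatch ready handlers and timers here, still under the lock ... */

    pthread_mutex_unlock(&ctx_lock);
    return ret;
}

A non-blocking call (timeout_ms == 0) keeps the lock for the whole function, exactly as the patched aio_poll() does when its computed timeout is zero.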
diff --git a/MAINTAINERS b/MAINTAINERS index 7aab80b59f..b5ab755de5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1182,7 +1182,19 @@ S: Supported F: block/gluster.c T: git git://github.com/codyprime/qemu-kvm-jtc.git block +Null Block Driver +M: Fam Zheng <famz@redhat.com> +L: qemu-block@nongnu.org +S: Supported +F: block/null.c + Bootdevice M: Gonglei <arei.gonglei@huawei.com> S: Maintained F: bootdevice.c + +Quorum +M: Alberto Garcia <berto@igalia.com> +S: Supported +F: block/quorum.c +L: qemu-block@nongnu.org diff --git a/aio-posix.c b/aio-posix.c index cbd4c3438c..4abec38866 100644 --- a/aio-posix.c +++ b/aio-posix.c @@ -24,7 +24,6 @@ struct AioHandler IOHandler *io_read; IOHandler *io_write; int deleted; - int pollfds_idx; void *opaque; QLIST_ENTRY(AioHandler) node; }; @@ -83,7 +82,6 @@ void aio_set_fd_handler(AioContext *ctx, node->io_read = io_read; node->io_write = io_write; node->opaque = opaque; - node->pollfds_idx = -1; node->pfd.events = (io_read ? G_IO_IN | G_IO_HUP | G_IO_ERR : 0); node->pfd.events |= (io_write ? G_IO_OUT | G_IO_ERR : 0); @@ -186,13 +184,61 @@ bool aio_dispatch(AioContext *ctx) return progress; } +/* These thread-local variables are used only in a small part of aio_poll + * around the call to the poll() system call. In particular they are not + * used while aio_poll is performing callbacks, which makes it much easier + * to think about reentrancy! + * + * Stack-allocated arrays would be perfect but they have size limitations; + * heap allocation is expensive enough that we want to reuse arrays across + * calls to aio_poll(). And because poll() has to be called without holding + * any lock, the arrays cannot be stored in AioContext. Thread-local data + * has none of the disadvantages of these three options. + */ +static __thread GPollFD *pollfds; +static __thread AioHandler **nodes; +static __thread unsigned npfd, nalloc; +static __thread Notifier pollfds_cleanup_notifier; + +static void pollfds_cleanup(Notifier *n, void *unused) +{ + g_assert(npfd == 0); + g_free(pollfds); + g_free(nodes); + nalloc = 0; +} + +static void add_pollfd(AioHandler *node) +{ + if (npfd == nalloc) { + if (nalloc == 0) { + pollfds_cleanup_notifier.notify = pollfds_cleanup; + qemu_thread_atexit_add(&pollfds_cleanup_notifier); + nalloc = 8; + } else { + g_assert(nalloc <= INT_MAX); + nalloc *= 2; + } + pollfds = g_renew(GPollFD, pollfds, nalloc); + nodes = g_renew(AioHandler *, nodes, nalloc); + } + nodes[npfd] = node; + pollfds[npfd] = (GPollFD) { + .fd = node->pfd.fd, + .events = node->pfd.events, + }; + npfd++; +} + bool aio_poll(AioContext *ctx, bool blocking) { AioHandler *node; bool was_dispatching; - int ret; + int i, ret; bool progress; + int64_t timeout; + aio_context_acquire(ctx); was_dispatching = ctx->dispatching; progress = false; @@ -210,39 +256,36 @@ bool aio_poll(AioContext *ctx, bool blocking) ctx->walking_handlers++; - g_array_set_size(ctx->pollfds, 0); + assert(npfd == 0); /* fill pollfds */ QLIST_FOREACH(node, &ctx->aio_handlers, node) { - node->pollfds_idx = -1; if (!node->deleted && node->pfd.events) { - GPollFD pfd = { - .fd = node->pfd.fd, - .events = node->pfd.events, - }; - node->pollfds_idx = ctx->pollfds->len; - g_array_append_val(ctx->pollfds, pfd); + add_pollfd(node); } } - ctx->walking_handlers--; + timeout = blocking ? aio_compute_timeout(ctx) : 0; /* wait until next event */ - ret = qemu_poll_ns((GPollFD *)ctx->pollfds->data, - ctx->pollfds->len, - blocking ? 
aio_compute_timeout(ctx) : 0); + if (timeout) { + aio_context_release(ctx); + } + ret = qemu_poll_ns((GPollFD *)pollfds, npfd, timeout); + if (timeout) { + aio_context_acquire(ctx); + } /* if we have any readable fds, dispatch event */ if (ret > 0) { - QLIST_FOREACH(node, &ctx->aio_handlers, node) { - if (node->pollfds_idx != -1) { - GPollFD *pfd = &g_array_index(ctx->pollfds, GPollFD, - node->pollfds_idx); - node->pfd.revents = pfd->revents; - } + for (i = 0; i < npfd; i++) { + nodes[i]->pfd.revents = pollfds[i].revents; } } + npfd = 0; + ctx->walking_handlers--; + /* Run dispatch even if there were no readable fds to run timers */ aio_set_dispatching(ctx, true); if (aio_dispatch(ctx)) { @@ -250,5 +293,7 @@ bool aio_poll(AioContext *ctx, bool blocking) } aio_set_dispatching(ctx, was_dispatching); + aio_context_release(ctx); + return progress; } diff --git a/aio-win32.c b/aio-win32.c index e6f4cedf48..233d8f5d79 100644 --- a/aio-win32.c +++ b/aio-win32.c @@ -283,6 +283,7 @@ bool aio_poll(AioContext *ctx, bool blocking) int count; int timeout; + aio_context_acquire(ctx); have_select_revents = aio_prepare(ctx); if (have_select_revents) { blocking = false; @@ -323,7 +324,13 @@ bool aio_poll(AioContext *ctx, bool blocking) timeout = blocking ? qemu_timeout_ns_to_ms(aio_compute_timeout(ctx)) : 0; + if (timeout) { + aio_context_release(ctx); + } ret = WaitForMultipleObjects(count, events, FALSE, timeout); + if (timeout) { + aio_context_acquire(ctx); + } aio_set_dispatching(ctx, true); if (first && aio_bh_poll(ctx)) { @@ -349,5 +356,6 @@ bool aio_poll(AioContext *ctx, bool blocking) progress |= timerlistgroup_run_timers(&ctx->tlg); aio_set_dispatching(ctx, was_dispatching); + aio_context_release(ctx); return progress; } @@ -230,7 +230,6 @@ aio_ctx_finalize(GSource *source) event_notifier_cleanup(&ctx->notifier); rfifolock_destroy(&ctx->lock); qemu_mutex_destroy(&ctx->bh_lock); - g_array_free(ctx->pollfds, TRUE); timerlistgroup_deinit(&ctx->tlg); } @@ -281,12 +280,6 @@ static void aio_timerlist_notify(void *opaque) aio_notify(opaque); } -static void aio_rfifolock_cb(void *opaque) -{ - /* Kick owner thread in case they are blocked in aio_poll() */ - aio_notify(opaque); -} - AioContext *aio_context_new(Error **errp) { int ret; @@ -302,10 +295,9 @@ AioContext *aio_context_new(Error **errp) aio_set_event_notifier(ctx, &ctx->notifier, (EventNotifierHandler *) event_notifier_test_and_clear); - ctx->pollfds = g_array_new(FALSE, FALSE, sizeof(GPollFD)); ctx->thread_pool = NULL; qemu_mutex_init(&ctx->bh_lock); - rfifolock_init(&ctx->lock, aio_rfifolock_cb, ctx); + rfifolock_init(&ctx->lock, NULL, NULL); timerlistgroup_init(&ctx->tlg, aio_timerlist_notify, ctx); return ctx; @@ -51,43 +51,25 @@ #include <windows.h> #endif +/** + * A BdrvDirtyBitmap can be in three possible states: + * (1) successor is NULL and disabled is false: full r/w mode + * (2) successor is NULL and disabled is true: read only mode ("disabled") + * (3) successor is set: frozen mode. + * A frozen bitmap cannot be renamed, deleted, anonymized, cleared, set, + * or enabled. A frozen bitmap can only abdicate() or reclaim(). 
+ */ struct BdrvDirtyBitmap { - HBitmap *bitmap; + HBitmap *bitmap; /* Dirty sector bitmap implementation */ + BdrvDirtyBitmap *successor; /* Anonymous child; implies frozen status */ + char *name; /* Optional non-empty unique ID */ + int64_t size; /* Size of the bitmap (Number of sectors) */ + bool disabled; /* Bitmap is read-only */ QLIST_ENTRY(BdrvDirtyBitmap) list; }; #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */ -static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque); -static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque); -static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - QEMUIOVector *iov); -static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - QEMUIOVector *iov); -static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, - int64_t offset, unsigned int bytes, QEMUIOVector *qiov, - BdrvRequestFlags flags); -static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, - int64_t offset, unsigned int bytes, QEMUIOVector *qiov, - BdrvRequestFlags flags); -static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *qiov, - int nb_sectors, - BdrvRequestFlags flags, - BlockCompletionFunc *cb, - void *opaque, - bool is_write); -static void coroutine_fn bdrv_co_do_rw(void *opaque); -static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, BdrvRequestFlags flags); - static QTAILQ_HEAD(, BlockDriverState) bdrv_states = QTAILQ_HEAD_INITIALIZER(bdrv_states); @@ -97,10 +79,7 @@ static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states = static QLIST_HEAD(, BlockDriver) bdrv_drivers = QLIST_HEAD_INITIALIZER(bdrv_drivers); -static void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, - int nr_sectors); -static void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, - int nr_sectors); +static void bdrv_dirty_bitmap_truncate(BlockDriverState *bs); /* If non-zero, use only whitelisted block drivers */ static int use_bdrv_whitelist; @@ -124,104 +103,6 @@ int is_windows_drive(const char *filename) } #endif -/* throttling disk I/O limits */ -void bdrv_set_io_limits(BlockDriverState *bs, - ThrottleConfig *cfg) -{ - int i; - - throttle_config(&bs->throttle_state, cfg); - - for (i = 0; i < 2; i++) { - qemu_co_enter_next(&bs->throttled_reqs[i]); - } -} - -/* this function drain all the throttled IOs */ -static bool bdrv_start_throttled_reqs(BlockDriverState *bs) -{ - bool drained = false; - bool enabled = bs->io_limits_enabled; - int i; - - bs->io_limits_enabled = false; - - for (i = 0; i < 2; i++) { - while (qemu_co_enter_next(&bs->throttled_reqs[i])) { - drained = true; - } - } - - bs->io_limits_enabled = enabled; - - return drained; -} - -void bdrv_io_limits_disable(BlockDriverState *bs) -{ - bs->io_limits_enabled = false; - - bdrv_start_throttled_reqs(bs); - - throttle_destroy(&bs->throttle_state); -} - -static void bdrv_throttle_read_timer_cb(void *opaque) -{ - BlockDriverState *bs = opaque; - qemu_co_enter_next(&bs->throttled_reqs[0]); -} - -static void bdrv_throttle_write_timer_cb(void *opaque) -{ - BlockDriverState *bs = opaque; - qemu_co_enter_next(&bs->throttled_reqs[1]); -} - -/* should be called before bdrv_set_io_limits if a limit is set */ 
-void bdrv_io_limits_enable(BlockDriverState *bs) -{ - assert(!bs->io_limits_enabled); - throttle_init(&bs->throttle_state, - bdrv_get_aio_context(bs), - QEMU_CLOCK_VIRTUAL, - bdrv_throttle_read_timer_cb, - bdrv_throttle_write_timer_cb, - bs); - bs->io_limits_enabled = true; -} - -/* This function makes an IO wait if needed - * - * @nb_sectors: the number of sectors of the IO - * @is_write: is the IO a write - */ -static void bdrv_io_limits_intercept(BlockDriverState *bs, - unsigned int bytes, - bool is_write) -{ - /* does this io must wait */ - bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write); - - /* if must wait or any request of this type throttled queue the IO */ - if (must_wait || - !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) { - qemu_co_queue_wait(&bs->throttled_reqs[is_write]); - } - - /* the IO will be executed, do the accounting */ - throttle_account(&bs->throttle_state, is_write, bytes); - - - /* if the next request must wait -> do nothing */ - if (throttle_schedule_timer(&bs->throttle_state, is_write)) { - return; - } - - /* else queue next request for execution */ - qemu_co_queue_next(&bs->throttled_reqs[is_write]); -} - size_t bdrv_opt_mem_align(BlockDriverState *bs) { if (!bs || !bs->drv) { @@ -335,20 +216,7 @@ void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz, void bdrv_register(BlockDriver *bdrv) { - /* Block drivers without coroutine functions need emulation */ - if (!bdrv->bdrv_co_readv) { - bdrv->bdrv_co_readv = bdrv_co_readv_em; - bdrv->bdrv_co_writev = bdrv_co_writev_em; - - /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if - * the block driver lacks aio we need to emulate that too. - */ - if (!bdrv->bdrv_aio_readv) { - /* add AIO emulation layer */ - bdrv->bdrv_aio_readv = bdrv_aio_readv_em; - bdrv->bdrv_aio_writev = bdrv_aio_writev_em; - } - } + bdrv_setup_io_funcs(bdrv); QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list); } @@ -520,54 +388,6 @@ int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp) return ret; } -void bdrv_refresh_limits(BlockDriverState *bs, Error **errp) -{ - BlockDriver *drv = bs->drv; - Error *local_err = NULL; - - memset(&bs->bl, 0, sizeof(bs->bl)); - - if (!drv) { - return; - } - - /* Take some limits from the children as a default */ - if (bs->file) { - bdrv_refresh_limits(bs->file, &local_err); - if (local_err) { - error_propagate(errp, local_err); - return; - } - bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length; - bs->bl.max_transfer_length = bs->file->bl.max_transfer_length; - bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment; - } else { - bs->bl.opt_mem_alignment = 512; - } - - if (bs->backing_hd) { - bdrv_refresh_limits(bs->backing_hd, &local_err); - if (local_err) { - error_propagate(errp, local_err); - return; - } - bs->bl.opt_transfer_length = - MAX(bs->bl.opt_transfer_length, - bs->backing_hd->bl.opt_transfer_length); - bs->bl.max_transfer_length = - MIN_NON_ZERO(bs->bl.max_transfer_length, - bs->backing_hd->bl.max_transfer_length); - bs->bl.opt_mem_alignment = - MAX(bs->bl.opt_mem_alignment, - bs->backing_hd->bl.opt_mem_alignment); - } - - /* Then let the driver override it */ - if (drv->bdrv_refresh_limits) { - drv->bdrv_refresh_limits(bs, errp); - } -} - /** * Try to get @bs's logical and physical block size. * On success, store them in @bsz struct and return 0. 
@@ -841,22 +661,6 @@ int bdrv_parse_cache_flags(const char *mode, int *flags) return 0; } -/** - * The copy-on-read flag is actually a reference count so multiple users may - * use the feature without worrying about clobbering its previous state. - * Copy-on-read stays enabled until all users have called to disable it. - */ -void bdrv_enable_copy_on_read(BlockDriverState *bs) -{ - bs->copy_on_read++; -} - -void bdrv_disable_copy_on_read(BlockDriverState *bs) -{ - assert(bs->copy_on_read > 0); - bs->copy_on_read--; -} - /* * Returns the flags that a temporary snapshot should get, based on the * originally requested flags (the originally requested image will have flags @@ -1224,8 +1028,8 @@ void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd) bdrv_op_unblock_all(bs->backing_hd, bs->backing_blocker); } else if (backing_hd) { error_setg(&bs->backing_blocker, - "device is used as backing hd of '%s'", - bdrv_get_device_name(bs)); + "node is used as backing hd of '%s'", + bdrv_get_device_or_node_name(bs)); } bs->backing_hd = backing_hd; @@ -1812,8 +1616,8 @@ int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue, * to r/w */ if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) && reopen_state->flags & BDRV_O_RDWR) { - error_set(errp, QERR_DEVICE_IS_READ_ONLY, - bdrv_get_device_name(reopen_state->bs)); + error_setg(errp, "Node '%s' is read only", + bdrv_get_device_or_node_name(reopen_state->bs)); goto error; } @@ -1839,9 +1643,9 @@ int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue, } else { /* It is currently mandatory to have a bdrv_reopen_prepare() * handler for each supported drv. */ - error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, - drv->format_name, bdrv_get_device_name(reopen_state->bs), - "reopening of file"); + error_setg(errp, "Block format '%s' used by node '%s' " + "does not support reopening files", drv->format_name, + bdrv_get_device_or_node_name(reopen_state->bs)); ret = -1; goto error; } @@ -1966,86 +1770,6 @@ void bdrv_close_all(void) } } -/* Check if any requests are in-flight (including throttled requests) */ -static bool bdrv_requests_pending(BlockDriverState *bs) -{ - if (!QLIST_EMPTY(&bs->tracked_requests)) { - return true; - } - if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) { - return true; - } - if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) { - return true; - } - if (bs->file && bdrv_requests_pending(bs->file)) { - return true; - } - if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) { - return true; - } - return false; -} - -static bool bdrv_drain_one(BlockDriverState *bs) -{ - bool bs_busy; - - bdrv_flush_io_queue(bs); - bdrv_start_throttled_reqs(bs); - bs_busy = bdrv_requests_pending(bs); - bs_busy |= aio_poll(bdrv_get_aio_context(bs), bs_busy); - return bs_busy; -} - -/* - * Wait for pending requests to complete on a single BlockDriverState subtree - * - * See the warning in bdrv_drain_all(). This function can only be called if - * you are sure nothing can generate I/O because you have op blockers - * installed. - * - * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState - * AioContext. - */ -void bdrv_drain(BlockDriverState *bs) -{ - while (bdrv_drain_one(bs)) { - /* Keep iterating */ - } -} - -/* - * Wait for pending requests to complete across all BlockDriverStates - * - * This function does not flush data to disk, use bdrv_flush_all() for that - * after calling this function. 
- * - * Note that completion of an asynchronous I/O operation can trigger any - * number of other I/O operations on other devices---for example a coroutine - * can be arbitrarily complex and a constant flow of I/O can come until the - * coroutine is complete. Because of this, it is not possible to have a - * function to drain a single device's I/O queue. - */ -void bdrv_drain_all(void) -{ - /* Always run first iteration so any pending completion BHs run */ - bool busy = true; - BlockDriverState *bs; - - while (busy) { - busy = false; - - QTAILQ_FOREACH(bs, &bdrv_states, device_list) { - AioContext *aio_context = bdrv_get_aio_context(bs); - - aio_context_acquire(aio_context); - busy |= bdrv_drain_one(bs); - aio_context_release(aio_context); - } - } -} - /* make a BlockDriverState anonymous by removing from bdrv_state and * graph_bdrv_state list. Also, NULL terminate the device_name to prevent double remove */ @@ -2367,152 +2091,6 @@ int bdrv_commit_all(void) return 0; } -/** - * Remove an active request from the tracked requests list - * - * This function should be called when a tracked request is completing. - */ -static void tracked_request_end(BdrvTrackedRequest *req) -{ - if (req->serialising) { - req->bs->serialising_in_flight--; - } - - QLIST_REMOVE(req, list); - qemu_co_queue_restart_all(&req->wait_queue); -} - -/** - * Add an active request to the tracked requests list - */ -static void tracked_request_begin(BdrvTrackedRequest *req, - BlockDriverState *bs, - int64_t offset, - unsigned int bytes, bool is_write) -{ - *req = (BdrvTrackedRequest){ - .bs = bs, - .offset = offset, - .bytes = bytes, - .is_write = is_write, - .co = qemu_coroutine_self(), - .serialising = false, - .overlap_offset = offset, - .overlap_bytes = bytes, - }; - - qemu_co_queue_init(&req->wait_queue); - - QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); -} - -static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) -{ - int64_t overlap_offset = req->offset & ~(align - 1); - unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align) - - overlap_offset; - - if (!req->serialising) { - req->bs->serialising_in_flight++; - req->serialising = true; - } - - req->overlap_offset = MIN(req->overlap_offset, overlap_offset); - req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes); -} - -/** - * Round a region to cluster boundaries - */ -void bdrv_round_to_clusters(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - int64_t *cluster_sector_num, - int *cluster_nb_sectors) -{ - BlockDriverInfo bdi; - - if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) { - *cluster_sector_num = sector_num; - *cluster_nb_sectors = nb_sectors; - } else { - int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE; - *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c); - *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num + - nb_sectors, c); - } -} - -static int bdrv_get_cluster_size(BlockDriverState *bs) -{ - BlockDriverInfo bdi; - int ret; - - ret = bdrv_get_info(bs, &bdi); - if (ret < 0 || bdi.cluster_size == 0) { - return bs->request_alignment; - } else { - return bdi.cluster_size; - } -} - -static bool tracked_request_overlaps(BdrvTrackedRequest *req, - int64_t offset, unsigned int bytes) -{ - /* aaaa bbbb */ - if (offset >= req->overlap_offset + req->overlap_bytes) { - return false; - } - /* bbbb aaaa */ - if (req->overlap_offset >= offset + bytes) { - return false; - } - return true; -} - -static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self) -{ - 
BlockDriverState *bs = self->bs; - BdrvTrackedRequest *req; - bool retry; - bool waited = false; - - if (!bs->serialising_in_flight) { - return false; - } - - do { - retry = false; - QLIST_FOREACH(req, &bs->tracked_requests, list) { - if (req == self || (!req->serialising && !self->serialising)) { - continue; - } - if (tracked_request_overlaps(req, self->overlap_offset, - self->overlap_bytes)) - { - /* Hitting this means there was a reentrant request, for - * example, a block driver issuing nested requests. This must - * never happen since it means deadlock. - */ - assert(qemu_coroutine_self() != req->co); - - /* If the request is already (indirectly) waiting for us, or - * will wait for us as soon as it wakes up, then just go on - * (instead of producing a deadlock in the former case). */ - if (!req->waiting_for) { - self->waiting_for = req; - qemu_co_queue_wait(&req->wait_queue); - self->waiting_for = NULL; - retry = true; - waited = true; - break; - } - } - } - } while (retry); - - return waited; -} - /* * Return values: * 0 - success @@ -2681,879 +2259,6 @@ exit: return ret; } - -static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset, - size_t size) -{ - if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) { - return -EIO; - } - - if (!bdrv_is_inserted(bs)) { - return -ENOMEDIUM; - } - - if (offset < 0) { - return -EIO; - } - - return 0; -} - -static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num, - int nb_sectors) -{ - if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { - return -EIO; - } - - return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE, - nb_sectors * BDRV_SECTOR_SIZE); -} - -typedef struct RwCo { - BlockDriverState *bs; - int64_t offset; - QEMUIOVector *qiov; - bool is_write; - int ret; - BdrvRequestFlags flags; -} RwCo; - -static void coroutine_fn bdrv_rw_co_entry(void *opaque) -{ - RwCo *rwco = opaque; - - if (!rwco->is_write) { - rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset, - rwco->qiov->size, rwco->qiov, - rwco->flags); - } else { - rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset, - rwco->qiov->size, rwco->qiov, - rwco->flags); - } -} - -/* - * Process a vectored synchronous request using coroutines - */ -static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset, - QEMUIOVector *qiov, bool is_write, - BdrvRequestFlags flags) -{ - Coroutine *co; - RwCo rwco = { - .bs = bs, - .offset = offset, - .qiov = qiov, - .is_write = is_write, - .ret = NOT_DONE, - .flags = flags, - }; - - /** - * In sync call context, when the vcpu is blocked, this throttling timer - * will not fire; so the I/O throttling function has to be disabled here - * if it has been enabled. 
- */ - if (bs->io_limits_enabled) { - fprintf(stderr, "Disabling I/O throttling on '%s' due " - "to synchronous I/O.\n", bdrv_get_device_name(bs)); - bdrv_io_limits_disable(bs); - } - - if (qemu_in_coroutine()) { - /* Fast-path if already in coroutine context */ - bdrv_rw_co_entry(&rwco); - } else { - AioContext *aio_context = bdrv_get_aio_context(bs); - - co = qemu_coroutine_create(bdrv_rw_co_entry); - qemu_coroutine_enter(co, &rwco); - while (rwco.ret == NOT_DONE) { - aio_poll(aio_context, true); - } - } - return rwco.ret; -} - -/* - * Process a synchronous request using coroutines - */ -static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, - int nb_sectors, bool is_write, BdrvRequestFlags flags) -{ - QEMUIOVector qiov; - struct iovec iov = { - .iov_base = (void *)buf, - .iov_len = nb_sectors * BDRV_SECTOR_SIZE, - }; - - if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { - return -EINVAL; - } - - qemu_iovec_init_external(&qiov, &iov, 1); - return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS, - &qiov, is_write, flags); -} - -/* return < 0 if error. See bdrv_write() for the return codes */ -int bdrv_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) -{ - return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0); -} - -/* Just like bdrv_read(), but with I/O throttling temporarily disabled */ -int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) -{ - bool enabled; - int ret; - - enabled = bs->io_limits_enabled; - bs->io_limits_enabled = false; - ret = bdrv_read(bs, sector_num, buf, nb_sectors); - bs->io_limits_enabled = enabled; - return ret; -} - -/* Return < 0 if error. Important errors are: - -EIO generic I/O error (may happen for all errors) - -ENOMEDIUM No media inserted. - -EINVAL Invalid sector number or nb_sectors - -EACCES Trying to write a read-only device -*/ -int bdrv_write(BlockDriverState *bs, int64_t sector_num, - const uint8_t *buf, int nb_sectors) -{ - return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0); -} - -int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, BdrvRequestFlags flags) -{ - return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true, - BDRV_REQ_ZERO_WRITE | flags); -} - -/* - * Completely zero out a block device with the help of bdrv_write_zeroes. - * The operation is sped up by checking the block status and only writing - * zeroes to the device if they currently do not return zeroes. Optional - * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP). - * - * Returns < 0 on error, 0 on success. For error codes see bdrv_write(). 
- */ -int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags) -{ - int64_t target_sectors, ret, nb_sectors, sector_num = 0; - int n; - - target_sectors = bdrv_nb_sectors(bs); - if (target_sectors < 0) { - return target_sectors; - } - - for (;;) { - nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS); - if (nb_sectors <= 0) { - return 0; - } - ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n); - if (ret < 0) { - error_report("error getting block status at sector %" PRId64 ": %s", - sector_num, strerror(-ret)); - return ret; - } - if (ret & BDRV_BLOCK_ZERO) { - sector_num += n; - continue; - } - ret = bdrv_write_zeroes(bs, sector_num, n, flags); - if (ret < 0) { - error_report("error writing zeroes at sector %" PRId64 ": %s", - sector_num, strerror(-ret)); - return ret; - } - sector_num += n; - } -} - -int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes) -{ - QEMUIOVector qiov; - struct iovec iov = { - .iov_base = (void *)buf, - .iov_len = bytes, - }; - int ret; - - if (bytes < 0) { - return -EINVAL; - } - - qemu_iovec_init_external(&qiov, &iov, 1); - ret = bdrv_prwv_co(bs, offset, &qiov, false, 0); - if (ret < 0) { - return ret; - } - - return bytes; -} - -int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov) -{ - int ret; - - ret = bdrv_prwv_co(bs, offset, qiov, true, 0); - if (ret < 0) { - return ret; - } - - return qiov->size; -} - -int bdrv_pwrite(BlockDriverState *bs, int64_t offset, - const void *buf, int bytes) -{ - QEMUIOVector qiov; - struct iovec iov = { - .iov_base = (void *) buf, - .iov_len = bytes, - }; - - if (bytes < 0) { - return -EINVAL; - } - - qemu_iovec_init_external(&qiov, &iov, 1); - return bdrv_pwritev(bs, offset, &qiov); -} - -/* - * Writes to the file and ensures that no writes are reordered across this - * request (acts as a barrier) - * - * Returns 0 on success, -errno in error cases. - */ -int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, - const void *buf, int count) -{ - int ret; - - ret = bdrv_pwrite(bs, offset, buf, count); - if (ret < 0) { - return ret; - } - - /* No flush needed for cache modes that already do it */ - if (bs->enable_write_cache) { - bdrv_flush(bs); - } - - return 0; -} - -static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) -{ - /* Perform I/O through a temporary buffer so that users who scribble over - * their read buffer while the operation is in progress do not end up - * modifying the image file. This is critical for zero-copy guest I/O - * where anything might happen inside guest memory. - */ - void *bounce_buffer; - - BlockDriver *drv = bs->drv; - struct iovec iov; - QEMUIOVector bounce_qiov; - int64_t cluster_sector_num; - int cluster_nb_sectors; - size_t skip_bytes; - int ret; - - /* Cover entire cluster so no additional backing file I/O is required when - * allocating cluster in the image file. 
- */ - bdrv_round_to_clusters(bs, sector_num, nb_sectors, - &cluster_sector_num, &cluster_nb_sectors); - - trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, - cluster_sector_num, cluster_nb_sectors); - - iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE; - iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len); - if (bounce_buffer == NULL) { - ret = -ENOMEM; - goto err; - } - - qemu_iovec_init_external(&bounce_qiov, &iov, 1); - - ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors, - &bounce_qiov); - if (ret < 0) { - goto err; - } - - if (drv->bdrv_co_write_zeroes && - buffer_is_zero(bounce_buffer, iov.iov_len)) { - ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num, - cluster_nb_sectors, 0); - } else { - /* This does not change the data on the disk, it is not necessary - * to flush even in cache=writethrough mode. - */ - ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors, - &bounce_qiov); - } - - if (ret < 0) { - /* It might be okay to ignore write errors for guest requests. If this - * is a deliberate copy-on-read then we don't want to ignore the error. - * Simply report it in all cases. - */ - goto err; - } - - skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE; - qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, - nb_sectors * BDRV_SECTOR_SIZE); - -err: - qemu_vfree(bounce_buffer); - return ret; -} - -/* - * Forwards an already correctly aligned request to the BlockDriver. This - * handles copy on read and zeroing after EOF; any other features must be - * implemented by the caller. - */ -static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, - BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, - int64_t align, QEMUIOVector *qiov, int flags) -{ - BlockDriver *drv = bs->drv; - int ret; - - int64_t sector_num = offset >> BDRV_SECTOR_BITS; - unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; - - assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); - assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); - assert(!qiov || bytes == qiov->size); - - /* Handle Copy on Read and associated serialisation */ - if (flags & BDRV_REQ_COPY_ON_READ) { - /* If we touch the same cluster it counts as an overlap. This - * guarantees that allocating writes will be serialized and not race - * with each other for the same cluster. For example, in copy-on-read - * it ensures that the CoR read and write operations are atomic and - * guest writes cannot interleave between them. 
*/ - mark_request_serialising(req, bdrv_get_cluster_size(bs)); - } - - wait_serialising_requests(req); - - if (flags & BDRV_REQ_COPY_ON_READ) { - int pnum; - - ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum); - if (ret < 0) { - goto out; - } - - if (!ret || pnum != nb_sectors) { - ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov); - goto out; - } - } - - /* Forward the request to the BlockDriver */ - if (!bs->zero_beyond_eof) { - ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); - } else { - /* Read zeros after EOF */ - int64_t total_sectors, max_nb_sectors; - - total_sectors = bdrv_nb_sectors(bs); - if (total_sectors < 0) { - ret = total_sectors; - goto out; - } - - max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num), - align >> BDRV_SECTOR_BITS); - if (nb_sectors < max_nb_sectors) { - ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); - } else if (max_nb_sectors > 0) { - QEMUIOVector local_qiov; - - qemu_iovec_init(&local_qiov, qiov->niov); - qemu_iovec_concat(&local_qiov, qiov, 0, - max_nb_sectors * BDRV_SECTOR_SIZE); - - ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors, - &local_qiov); - - qemu_iovec_destroy(&local_qiov); - } else { - ret = 0; - } - - /* Reading beyond end of file is supposed to produce zeroes */ - if (ret == 0 && total_sectors < sector_num + nb_sectors) { - uint64_t offset = MAX(0, total_sectors - sector_num); - uint64_t bytes = (sector_num + nb_sectors - offset) * - BDRV_SECTOR_SIZE; - qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes); - } - } - -out: - return ret; -} - -static inline uint64_t bdrv_get_align(BlockDriverState *bs) -{ - /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ - return MAX(BDRV_SECTOR_SIZE, bs->request_alignment); -} - -static inline bool bdrv_req_is_aligned(BlockDriverState *bs, - int64_t offset, size_t bytes) -{ - int64_t align = bdrv_get_align(bs); - return !(offset & (align - 1) || (bytes & (align - 1))); -} - -/* - * Handle a read request in coroutine context - */ -static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, - int64_t offset, unsigned int bytes, QEMUIOVector *qiov, - BdrvRequestFlags flags) -{ - BlockDriver *drv = bs->drv; - BdrvTrackedRequest req; - - uint64_t align = bdrv_get_align(bs); - uint8_t *head_buf = NULL; - uint8_t *tail_buf = NULL; - QEMUIOVector local_qiov; - bool use_local_qiov = false; - int ret; - - if (!drv) { - return -ENOMEDIUM; - } - - ret = bdrv_check_byte_request(bs, offset, bytes); - if (ret < 0) { - return ret; - } - - if (bs->copy_on_read) { - flags |= BDRV_REQ_COPY_ON_READ; - } - - /* throttling disk I/O */ - if (bs->io_limits_enabled) { - bdrv_io_limits_intercept(bs, bytes, false); - } - - /* Align read if necessary by padding qiov */ - if (offset & (align - 1)) { - head_buf = qemu_blockalign(bs, align); - qemu_iovec_init(&local_qiov, qiov->niov + 2); - qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); - qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); - use_local_qiov = true; - - bytes += offset & (align - 1); - offset = offset & ~(align - 1); - } - - if ((offset + bytes) & (align - 1)) { - if (!use_local_qiov) { - qemu_iovec_init(&local_qiov, qiov->niov + 1); - qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); - use_local_qiov = true; - } - tail_buf = qemu_blockalign(bs, align); - qemu_iovec_add(&local_qiov, tail_buf, - align - ((offset + bytes) & (align - 1))); - - bytes = ROUND_UP(bytes, align); - } - - tracked_request_begin(&req, bs, offset, bytes, false); - ret = 
bdrv_aligned_preadv(bs, &req, offset, bytes, align, - use_local_qiov ? &local_qiov : qiov, - flags); - tracked_request_end(&req); - - if (use_local_qiov) { - qemu_iovec_destroy(&local_qiov); - qemu_vfree(head_buf); - qemu_vfree(tail_buf); - } - - return ret; -} - -static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, - BdrvRequestFlags flags) -{ - if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { - return -EINVAL; - } - - return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS, - nb_sectors << BDRV_SECTOR_BITS, qiov, flags); -} - -int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) -{ - trace_bdrv_co_readv(bs, sector_num, nb_sectors); - - return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0); -} - -int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) -{ - trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors); - - return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, - BDRV_REQ_COPY_ON_READ); -} - -#define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768 - -static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) -{ - BlockDriver *drv = bs->drv; - QEMUIOVector qiov; - struct iovec iov = {0}; - int ret = 0; - - int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes, - BDRV_REQUEST_MAX_SECTORS); - - while (nb_sectors > 0 && !ret) { - int num = nb_sectors; - - /* Align request. Block drivers can expect the "bulk" of the request - * to be aligned. - */ - if (bs->bl.write_zeroes_alignment - && num > bs->bl.write_zeroes_alignment) { - if (sector_num % bs->bl.write_zeroes_alignment != 0) { - /* Make a small request up to the first aligned sector. */ - num = bs->bl.write_zeroes_alignment; - num -= sector_num % bs->bl.write_zeroes_alignment; - } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) { - /* Shorten the request to the last aligned sector. num cannot - * underflow because num > bs->bl.write_zeroes_alignment. - */ - num -= (sector_num + num) % bs->bl.write_zeroes_alignment; - } - } - - /* limit request size */ - if (num > max_write_zeroes) { - num = max_write_zeroes; - } - - ret = -ENOTSUP; - /* First try the efficient write zeroes operation */ - if (drv->bdrv_co_write_zeroes) { - ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags); - } - - if (ret == -ENOTSUP) { - /* Fall back to bounce buffer if write zeroes is unsupported */ - int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length, - MAX_WRITE_ZEROES_BOUNCE_BUFFER); - num = MIN(num, max_xfer_len); - iov.iov_len = num * BDRV_SECTOR_SIZE; - if (iov.iov_base == NULL) { - iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE); - if (iov.iov_base == NULL) { - ret = -ENOMEM; - goto fail; - } - memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE); - } - qemu_iovec_init_external(&qiov, &iov, 1); - - ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov); - - /* Keep bounce buffer around if it is big enough for all - * all future requests. - */ - if (num < max_xfer_len) { - qemu_vfree(iov.iov_base); - iov.iov_base = NULL; - } - } - - sector_num += num; - nb_sectors -= num; - } - -fail: - qemu_vfree(iov.iov_base); - return ret; -} - -/* - * Forwards an already correctly aligned write request to the BlockDriver. 
- */ -static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, - BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, - QEMUIOVector *qiov, int flags) -{ - BlockDriver *drv = bs->drv; - bool waited; - int ret; - - int64_t sector_num = offset >> BDRV_SECTOR_BITS; - unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; - - assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); - assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); - assert(!qiov || bytes == qiov->size); - - waited = wait_serialising_requests(req); - assert(!waited || !req->serialising); - assert(req->overlap_offset <= offset); - assert(offset + bytes <= req->overlap_offset + req->overlap_bytes); - - ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req); - - if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF && - !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes && - qemu_iovec_is_zero(qiov)) { - flags |= BDRV_REQ_ZERO_WRITE; - if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) { - flags |= BDRV_REQ_MAY_UNMAP; - } - } - - if (ret < 0) { - /* Do nothing, write notifier decided to fail this request */ - } else if (flags & BDRV_REQ_ZERO_WRITE) { - BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO); - ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags); - } else { - BLKDBG_EVENT(bs, BLKDBG_PWRITEV); - ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); - } - BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE); - - if (ret == 0 && !bs->enable_write_cache) { - ret = bdrv_co_flush(bs); - } - - bdrv_set_dirty(bs, sector_num, nb_sectors); - - block_acct_highest_sector(&bs->stats, sector_num, nb_sectors); - - if (ret >= 0) { - bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors); - } - - return ret; -} - -/* - * Handle a write request in coroutine context - */ -static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, - int64_t offset, unsigned int bytes, QEMUIOVector *qiov, - BdrvRequestFlags flags) -{ - BdrvTrackedRequest req; - uint64_t align = bdrv_get_align(bs); - uint8_t *head_buf = NULL; - uint8_t *tail_buf = NULL; - QEMUIOVector local_qiov; - bool use_local_qiov = false; - int ret; - - if (!bs->drv) { - return -ENOMEDIUM; - } - if (bs->read_only) { - return -EACCES; - } - - ret = bdrv_check_byte_request(bs, offset, bytes); - if (ret < 0) { - return ret; - } - - /* throttling disk I/O */ - if (bs->io_limits_enabled) { - bdrv_io_limits_intercept(bs, bytes, true); - } - - /* - * Align write if necessary by performing a read-modify-write cycle. - * Pad qiov with the read parts and be sure to have a tracked request not - * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle. 
- */ - tracked_request_begin(&req, bs, offset, bytes, true); - - if (offset & (align - 1)) { - QEMUIOVector head_qiov; - struct iovec head_iov; - - mark_request_serialising(&req, align); - wait_serialising_requests(&req); - - head_buf = qemu_blockalign(bs, align); - head_iov = (struct iovec) { - .iov_base = head_buf, - .iov_len = align, - }; - qemu_iovec_init_external(&head_qiov, &head_iov, 1); - - BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD); - ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align, - align, &head_qiov, 0); - if (ret < 0) { - goto fail; - } - BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); - - qemu_iovec_init(&local_qiov, qiov->niov + 2); - qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); - qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); - use_local_qiov = true; - - bytes += offset & (align - 1); - offset = offset & ~(align - 1); - } - - if ((offset + bytes) & (align - 1)) { - QEMUIOVector tail_qiov; - struct iovec tail_iov; - size_t tail_bytes; - bool waited; - - mark_request_serialising(&req, align); - waited = wait_serialising_requests(&req); - assert(!waited || !use_local_qiov); - - tail_buf = qemu_blockalign(bs, align); - tail_iov = (struct iovec) { - .iov_base = tail_buf, - .iov_len = align, - }; - qemu_iovec_init_external(&tail_qiov, &tail_iov, 1); - - BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL); - ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align, - align, &tail_qiov, 0); - if (ret < 0) { - goto fail; - } - BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); - - if (!use_local_qiov) { - qemu_iovec_init(&local_qiov, qiov->niov + 1); - qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); - use_local_qiov = true; - } - - tail_bytes = (offset + bytes) & (align - 1); - qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes); - - bytes = ROUND_UP(bytes, align); - } - - if (use_local_qiov) { - /* Local buffer may have non-zero data. */ - flags &= ~BDRV_REQ_ZERO_WRITE; - } - ret = bdrv_aligned_pwritev(bs, &req, offset, bytes, - use_local_qiov ? 
&local_qiov : qiov, - flags); - -fail: - tracked_request_end(&req); - - if (use_local_qiov) { - qemu_iovec_destroy(&local_qiov); - } - qemu_vfree(head_buf); - qemu_vfree(tail_buf); - - return ret; -} - -static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, - BdrvRequestFlags flags) -{ - if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { - return -EINVAL; - } - - return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS, - nb_sectors << BDRV_SECTOR_BITS, qiov, flags); -} - -int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) -{ - trace_bdrv_co_writev(bs, sector_num, nb_sectors); - - return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0); -} - -int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - BdrvRequestFlags flags) -{ - int ret; - - trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags); - - if (!(bs->open_flags & BDRV_O_UNMAP)) { - flags &= ~BDRV_REQ_MAY_UNMAP; - } - if (bdrv_req_is_aligned(bs, sector_num << BDRV_SECTOR_BITS, - nb_sectors << BDRV_SECTOR_BITS)) { - ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL, - BDRV_REQ_ZERO_WRITE | flags); - } else { - uint8_t *buf; - QEMUIOVector local_qiov; - size_t bytes = nb_sectors << BDRV_SECTOR_BITS; - - buf = qemu_memalign(bdrv_opt_mem_align(bs), bytes); - memset(buf, 0, bytes); - qemu_iovec_init(&local_qiov, 1); - qemu_iovec_add(&local_qiov, buf, bytes); - - ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, &local_qiov, - BDRV_REQ_ZERO_WRITE | flags); - qemu_vfree(buf); - } - return ret; -} - /** * Truncate file to 'offset' bytes (needed only for file protocols) */ @@ -3571,6 +2276,7 @@ int bdrv_truncate(BlockDriverState *bs, int64_t offset) ret = drv->bdrv_truncate(bs, offset); if (ret == 0) { ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS); + bdrv_dirty_bitmap_truncate(bs); if (bs->blk) { blk_dev_resize_cb(bs->blk); } @@ -3797,8 +2503,8 @@ void bdrv_add_key(BlockDriverState *bs, const char *key, Error **errp) { if (key) { if (!bdrv_is_encrypted(bs)) { - error_setg(errp, "Device '%s' is not encrypted", - bdrv_get_device_name(bs)); + error_setg(errp, "Node '%s' is not encrypted", + bdrv_get_device_or_node_name(bs)); } else if (bdrv_set_key(bs, key) < 0) { error_set(errp, QERR_INVALID_PASSWORD); } @@ -3806,7 +2512,7 @@ void bdrv_add_key(BlockDriverState *bs, const char *key, Error **errp) if (bdrv_key_required(bs)) { error_set(errp, ERROR_CLASS_DEVICE_ENCRYPTED, "'%s' (%s) is encrypted", - bdrv_get_device_name(bs), + bdrv_get_device_or_node_name(bs), bdrv_get_encrypted_filename(bs)); } } @@ -3870,15 +2576,20 @@ BlockDriverState *bdrv_find_node(const char *node_name) } /* Put this QMP function here so it can access the static graph_bdrv_states. */ -BlockDeviceInfoList *bdrv_named_nodes_list(void) +BlockDeviceInfoList *bdrv_named_nodes_list(Error **errp) { BlockDeviceInfoList *list, *entry; BlockDriverState *bs; list = NULL; QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) { + BlockDeviceInfo *info = bdrv_block_device_info(bs, errp); + if (!info) { + qapi_free_BlockDeviceInfoList(list); + return NULL; + } entry = g_malloc0(sizeof(*entry)); - entry->value = bdrv_block_device_info(bs); + entry->value = info; entry->next = list; list = entry; } @@ -3953,29 +2664,18 @@ const char *bdrv_get_device_name(const BlockDriverState *bs) return bs->blk ? 
blk_name(bs->blk) : ""; } -int bdrv_get_flags(BlockDriverState *bs) +/* This can be used to identify nodes that might not have a device + * name associated. Since node and device names live in the same + * namespace, the result is unambiguous. The exception is if both are + * absent, then this returns an empty (non-null) string. */ +const char *bdrv_get_device_or_node_name(const BlockDriverState *bs) { - return bs->open_flags; + return bs->blk ? blk_name(bs->blk) : bs->node_name; } -int bdrv_flush_all(void) +int bdrv_get_flags(BlockDriverState *bs) { - BlockDriverState *bs; - int result = 0; - - QTAILQ_FOREACH(bs, &bdrv_states, device_list) { - AioContext *aio_context = bdrv_get_aio_context(bs); - int ret; - - aio_context_acquire(aio_context); - ret = bdrv_flush(bs); - if (ret < 0 && !result) { - result = ret; - } - aio_context_release(aio_context); - } - - return result; + return bs->open_flags; } int bdrv_has_zero_init_1(BlockDriverState *bs) @@ -4030,222 +2730,6 @@ bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs) return false; } -typedef struct BdrvCoGetBlockStatusData { - BlockDriverState *bs; - BlockDriverState *base; - int64_t sector_num; - int nb_sectors; - int *pnum; - int64_t ret; - bool done; -} BdrvCoGetBlockStatusData; - -/* - * Returns the allocation status of the specified sectors. - * Drivers not implementing the functionality are assumed to not support - * backing files, hence all their sectors are reported as allocated. - * - * If 'sector_num' is beyond the end of the disk image the return value is 0 - * and 'pnum' is set to 0. - * - * 'pnum' is set to the number of sectors (including and immediately following - * the specified sector) that are known to be in the same - * allocated/unallocated state. - * - * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes - * beyond the end of the disk image it will be clamped. 
- */ -static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, - int64_t sector_num, - int nb_sectors, int *pnum) -{ - int64_t total_sectors; - int64_t n; - int64_t ret, ret2; - - total_sectors = bdrv_nb_sectors(bs); - if (total_sectors < 0) { - return total_sectors; - } - - if (sector_num >= total_sectors) { - *pnum = 0; - return 0; - } - - n = total_sectors - sector_num; - if (n < nb_sectors) { - nb_sectors = n; - } - - if (!bs->drv->bdrv_co_get_block_status) { - *pnum = nb_sectors; - ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED; - if (bs->drv->protocol_name) { - ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE); - } - return ret; - } - - ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum); - if (ret < 0) { - *pnum = 0; - return ret; - } - - if (ret & BDRV_BLOCK_RAW) { - assert(ret & BDRV_BLOCK_OFFSET_VALID); - return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS, - *pnum, pnum); - } - - if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { - ret |= BDRV_BLOCK_ALLOCATED; - } - - if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) { - if (bdrv_unallocated_blocks_are_zero(bs)) { - ret |= BDRV_BLOCK_ZERO; - } else if (bs->backing_hd) { - BlockDriverState *bs2 = bs->backing_hd; - int64_t nb_sectors2 = bdrv_nb_sectors(bs2); - if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) { - ret |= BDRV_BLOCK_ZERO; - } - } - } - - if (bs->file && - (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && - (ret & BDRV_BLOCK_OFFSET_VALID)) { - int file_pnum; - - ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS, - *pnum, &file_pnum); - if (ret2 >= 0) { - /* Ignore errors. This is just providing extra information, it - * is useful but not necessary. - */ - if (!file_pnum) { - /* !file_pnum indicates an offset at or beyond the EOF; it is - * perfectly valid for the format block driver to point to such - * offsets, so catch it and mark everything as zero */ - ret |= BDRV_BLOCK_ZERO; - } else { - /* Limit request to the range reported by the protocol driver */ - *pnum = file_pnum; - ret |= (ret2 & BDRV_BLOCK_ZERO); - } - } - } - - return ret; -} - -/* Coroutine wrapper for bdrv_get_block_status() */ -static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque) -{ - BdrvCoGetBlockStatusData *data = opaque; - BlockDriverState *bs = data->bs; - - data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors, - data->pnum); - data->done = true; -} - -/* - * Synchronous wrapper around bdrv_co_get_block_status(). - * - * See bdrv_co_get_block_status() for details. - */ -int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, int *pnum) -{ - Coroutine *co; - BdrvCoGetBlockStatusData data = { - .bs = bs, - .sector_num = sector_num, - .nb_sectors = nb_sectors, - .pnum = pnum, - .done = false, - }; - - if (qemu_in_coroutine()) { - /* Fast-path if already in coroutine context */ - bdrv_get_block_status_co_entry(&data); - } else { - AioContext *aio_context = bdrv_get_aio_context(bs); - - co = qemu_coroutine_create(bdrv_get_block_status_co_entry); - qemu_coroutine_enter(co, &data); - while (!data.done) { - aio_poll(aio_context, true); - } - } - return data.ret; -} - -int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, int *pnum) -{ - int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum); - if (ret < 0) { - return ret; - } - return !!(ret & BDRV_BLOCK_ALLOCATED); -} - -/* - * Given an image chain: ... 
-> [BASE] -> [INTER1] -> [INTER2] -> [TOP] - * - * Return true if the given sector is allocated in any image between - * BASE and TOP (inclusive). BASE can be NULL to check if the given - * sector is allocated in any image of the chain. Return false otherwise. - * - * 'pnum' is set to the number of sectors (including and immediately following - * the specified sector) that are known to be in the same - * allocated/unallocated state. - * - */ -int bdrv_is_allocated_above(BlockDriverState *top, - BlockDriverState *base, - int64_t sector_num, - int nb_sectors, int *pnum) -{ - BlockDriverState *intermediate; - int ret, n = nb_sectors; - - intermediate = top; - while (intermediate && intermediate != base) { - int pnum_inter; - ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors, - &pnum_inter); - if (ret < 0) { - return ret; - } else if (ret) { - *pnum = pnum_inter; - return 1; - } - - /* - * [sector_num, nb_sectors] is unallocated on top but intermediate - * might have - * - * [sector_num+x, nr_sectors] allocated. - */ - if (n > pnum_inter && - (intermediate == top || - sector_num + pnum_inter < intermediate->total_sectors)) { - n = pnum_inter; - } - - intermediate = intermediate->backing_hd; - } - - *pnum = n; - return 0; -} - const char *bdrv_get_encrypted_filename(BlockDriverState *bs) { if (bs->backing_hd && bs->backing_hd->encrypted) @@ -4262,28 +2746,6 @@ void bdrv_get_backing_filename(BlockDriverState *bs, pstrcpy(filename, filename_size, bs->backing_file); } -int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num, - const uint8_t *buf, int nb_sectors) -{ - BlockDriver *drv = bs->drv; - int ret; - - if (!drv) { - return -ENOMEDIUM; - } - if (!drv->bdrv_write_compressed) { - return -ENOTSUP; - } - ret = bdrv_check_request(bs, sector_num, nb_sectors); - if (ret < 0) { - return ret; - } - - assert(QLIST_EMPTY(&bs->dirty_bitmaps)); - - return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors); -} - int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) { BlockDriver *drv = bs->drv; @@ -4304,47 +2766,6 @@ ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs) return NULL; } -int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, - int64_t pos, int size) -{ - QEMUIOVector qiov; - struct iovec iov = { - .iov_base = (void *) buf, - .iov_len = size, - }; - - qemu_iovec_init_external(&qiov, &iov, 1); - return bdrv_writev_vmstate(bs, &qiov, pos); -} - -int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) -{ - BlockDriver *drv = bs->drv; - - if (!drv) { - return -ENOMEDIUM; - } else if (drv->bdrv_save_vmstate) { - return drv->bdrv_save_vmstate(bs, qiov, pos); - } else if (bs->file) { - return bdrv_writev_vmstate(bs->file, qiov, pos); - } - - return -ENOTSUP; -} - -int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, - int64_t pos, int size) -{ - BlockDriver *drv = bs->drv; - if (!drv) - return -ENOMEDIUM; - if (drv->bdrv_load_vmstate) - return drv->bdrv_load_vmstate(bs, buf, pos, size); - if (bs->file) - return bdrv_load_vmstate(bs->file, buf, pos, size); - return -ENOTSUP; -} - void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event) { if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) { @@ -4491,452 +2912,6 @@ int bdrv_get_backing_file_depth(BlockDriverState *bs) return 1 + bdrv_get_backing_file_depth(bs->backing_hd); } -/**************************************************************/ -/* async I/Os */ - -BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, - QEMUIOVector *qiov, int 
nb_sectors, - BlockCompletionFunc *cb, void *opaque) -{ - trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque); - - return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, - cb, opaque, false); -} - -BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, - QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) -{ - trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque); - - return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, - cb, opaque, true); -} - -BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, BdrvRequestFlags flags, - BlockCompletionFunc *cb, void *opaque) -{ - trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque); - - return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors, - BDRV_REQ_ZERO_WRITE | flags, - cb, opaque, true); -} - - -typedef struct MultiwriteCB { - int error; - int num_requests; - int num_callbacks; - struct { - BlockCompletionFunc *cb; - void *opaque; - QEMUIOVector *free_qiov; - } callbacks[]; -} MultiwriteCB; - -static void multiwrite_user_cb(MultiwriteCB *mcb) -{ - int i; - - for (i = 0; i < mcb->num_callbacks; i++) { - mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error); - if (mcb->callbacks[i].free_qiov) { - qemu_iovec_destroy(mcb->callbacks[i].free_qiov); - } - g_free(mcb->callbacks[i].free_qiov); - } -} - -static void multiwrite_cb(void *opaque, int ret) -{ - MultiwriteCB *mcb = opaque; - - trace_multiwrite_cb(mcb, ret); - - if (ret < 0 && !mcb->error) { - mcb->error = ret; - } - - mcb->num_requests--; - if (mcb->num_requests == 0) { - multiwrite_user_cb(mcb); - g_free(mcb); - } -} - -static int multiwrite_req_compare(const void *a, const void *b) -{ - const BlockRequest *req1 = a, *req2 = b; - - /* - * Note that we can't simply subtract req2->sector from req1->sector - * here as that could overflow the return value. - */ - if (req1->sector > req2->sector) { - return 1; - } else if (req1->sector < req2->sector) { - return -1; - } else { - return 0; - } -} - -/* - * Takes a bunch of requests and tries to merge them. Returns the number of - * requests that remain after merging. - */ -static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs, - int num_reqs, MultiwriteCB *mcb) -{ - int i, outidx; - - // Sort requests by start sector - qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare); - - // Check if adjacent requests touch the same clusters. If so, combine them, - // filling up gaps with zero sectors. - outidx = 0; - for (i = 1; i < num_reqs; i++) { - int merge = 0; - int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors; - - // Handle exactly sequential writes and overlapping writes. - if (reqs[i].sector <= oldreq_last) { - merge = 1; - } - - if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) { - merge = 0; - } - - if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors + - reqs[i].nb_sectors > bs->bl.max_transfer_length) { - merge = 0; - } - - if (merge) { - size_t size; - QEMUIOVector *qiov = g_malloc0(sizeof(*qiov)); - qemu_iovec_init(qiov, - reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1); - - // Add the first request to the merged one. If the requests are - // overlapping, drop the last sectors of the first request. 
- size = (reqs[i].sector - reqs[outidx].sector) << 9; - qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size); - - // We should need to add any zeros between the two requests - assert (reqs[i].sector <= oldreq_last); - - // Add the second request - qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size); - - // Add tail of first request, if necessary - if (qiov->size < reqs[outidx].qiov->size) { - qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size, - reqs[outidx].qiov->size - qiov->size); - } - - reqs[outidx].nb_sectors = qiov->size >> 9; - reqs[outidx].qiov = qiov; - - mcb->callbacks[i].free_qiov = reqs[outidx].qiov; - } else { - outidx++; - reqs[outidx].sector = reqs[i].sector; - reqs[outidx].nb_sectors = reqs[i].nb_sectors; - reqs[outidx].qiov = reqs[i].qiov; - } - } - - block_acct_merge_done(&bs->stats, BLOCK_ACCT_WRITE, num_reqs - outidx - 1); - - return outidx + 1; -} - -/* - * Submit multiple AIO write requests at once. - * - * On success, the function returns 0 and all requests in the reqs array have - * been submitted. In error case this function returns -1, and any of the - * requests may or may not be submitted yet. In particular, this means that the - * callback will be called for some of the requests, for others it won't. The - * caller must check the error field of the BlockRequest to wait for the right - * callbacks (if error != 0, no callback will be called). - * - * The implementation may modify the contents of the reqs array, e.g. to merge - * requests. However, the fields opaque and error are left unmodified as they - * are used to signal failure for a single request to the caller. - */ -int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs) -{ - MultiwriteCB *mcb; - int i; - - /* don't submit writes if we don't have a medium */ - if (bs->drv == NULL) { - for (i = 0; i < num_reqs; i++) { - reqs[i].error = -ENOMEDIUM; - } - return -1; - } - - if (num_reqs == 0) { - return 0; - } - - // Create MultiwriteCB structure - mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks)); - mcb->num_requests = 0; - mcb->num_callbacks = num_reqs; - - for (i = 0; i < num_reqs; i++) { - mcb->callbacks[i].cb = reqs[i].cb; - mcb->callbacks[i].opaque = reqs[i].opaque; - } - - // Check for mergable requests - num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb); - - trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs); - - /* Run the aio requests. */ - mcb->num_requests = num_reqs; - for (i = 0; i < num_reqs; i++) { - bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov, - reqs[i].nb_sectors, reqs[i].flags, - multiwrite_cb, mcb, - true); - } - - return 0; -} - -void bdrv_aio_cancel(BlockAIOCB *acb) -{ - qemu_aio_ref(acb); - bdrv_aio_cancel_async(acb); - while (acb->refcnt > 1) { - if (acb->aiocb_info->get_aio_context) { - aio_poll(acb->aiocb_info->get_aio_context(acb), true); - } else if (acb->bs) { - aio_poll(bdrv_get_aio_context(acb->bs), true); - } else { - abort(); - } - } - qemu_aio_unref(acb); -} - -/* Async version of aio cancel. The caller is not blocked if the acb implements - * cancel_async, otherwise we do nothing and let the request normally complete. - * In either case the completion callback must be called. 
*/ -void bdrv_aio_cancel_async(BlockAIOCB *acb) -{ - if (acb->aiocb_info->cancel_async) { - acb->aiocb_info->cancel_async(acb); - } -} - -/**************************************************************/ -/* async block device emulation */ - -typedef struct BlockAIOCBSync { - BlockAIOCB common; - QEMUBH *bh; - int ret; - /* vector translation state */ - QEMUIOVector *qiov; - uint8_t *bounce; - int is_write; -} BlockAIOCBSync; - -static const AIOCBInfo bdrv_em_aiocb_info = { - .aiocb_size = sizeof(BlockAIOCBSync), -}; - -static void bdrv_aio_bh_cb(void *opaque) -{ - BlockAIOCBSync *acb = opaque; - - if (!acb->is_write && acb->ret >= 0) { - qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size); - } - qemu_vfree(acb->bounce); - acb->common.cb(acb->common.opaque, acb->ret); - qemu_bh_delete(acb->bh); - acb->bh = NULL; - qemu_aio_unref(acb); -} - -static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *qiov, - int nb_sectors, - BlockCompletionFunc *cb, - void *opaque, - int is_write) - -{ - BlockAIOCBSync *acb; - - acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque); - acb->is_write = is_write; - acb->qiov = qiov; - acb->bounce = qemu_try_blockalign(bs, qiov->size); - acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb); - - if (acb->bounce == NULL) { - acb->ret = -ENOMEM; - } else if (is_write) { - qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size); - acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors); - } else { - acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors); - } - - qemu_bh_schedule(acb->bh); - - return &acb->common; -} - -static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) -{ - return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); -} - -static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) -{ - return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1); -} - - -typedef struct BlockAIOCBCoroutine { - BlockAIOCB common; - BlockRequest req; - bool is_write; - bool *done; - QEMUBH* bh; -} BlockAIOCBCoroutine; - -static const AIOCBInfo bdrv_em_co_aiocb_info = { - .aiocb_size = sizeof(BlockAIOCBCoroutine), -}; - -static void bdrv_co_em_bh(void *opaque) -{ - BlockAIOCBCoroutine *acb = opaque; - - acb->common.cb(acb->common.opaque, acb->req.error); - - qemu_bh_delete(acb->bh); - qemu_aio_unref(acb); -} - -/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */ -static void coroutine_fn bdrv_co_do_rw(void *opaque) -{ - BlockAIOCBCoroutine *acb = opaque; - BlockDriverState *bs = acb->common.bs; - - if (!acb->is_write) { - acb->req.error = bdrv_co_do_readv(bs, acb->req.sector, - acb->req.nb_sectors, acb->req.qiov, acb->req.flags); - } else { - acb->req.error = bdrv_co_do_writev(bs, acb->req.sector, - acb->req.nb_sectors, acb->req.qiov, acb->req.flags); - } - - acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb); - qemu_bh_schedule(acb->bh); -} - -static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *qiov, - int nb_sectors, - BdrvRequestFlags flags, - BlockCompletionFunc *cb, - void *opaque, - bool is_write) -{ - Coroutine *co; - BlockAIOCBCoroutine *acb; - - acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); - acb->req.sector = sector_num; - acb->req.nb_sectors = nb_sectors; 
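(The emulation removed in this hunk implements the callback-based AIO entry points on top of coroutines plus a bottom half that delivers the completion callback. From a caller's point of view the interface is used roughly as in the sketch below; the callback, the 'done' flag and the polling loop are illustrative and not taken from the patch.)

    #include "block/block.h"    /* bdrv_aio_readv, bdrv_get_aio_context, aio_poll */

    static void example_read_cb(void *opaque, int ret)
    {
        bool *done = opaque;
        /* ret is 0 on success or a negative errno value. */
        *done = true;
    }

    /* Submit one asynchronous read and wait for its completion callback. */
    static void example_read_one_sector(BlockDriverState *bs, QEMUIOVector *qiov)
    {
        bool done = false;
        BlockAIOCB *acb;

        acb = bdrv_aio_readv(bs, 0 /* sector_num */, qiov, 1 /* nb_sectors */,
                             example_read_cb, &done);
        if (!acb) {
            return;             /* submission failed, no callback will run */
        }
        while (!done) {
            /* Drive the event loop until the bottom half has run the callback. */
            aio_poll(bdrv_get_aio_context(bs), true);
        }
    }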
- acb->req.qiov = qiov; - acb->req.flags = flags; - acb->is_write = is_write; - - co = qemu_coroutine_create(bdrv_co_do_rw); - qemu_coroutine_enter(co, acb); - - return &acb->common; -} - -static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque) -{ - BlockAIOCBCoroutine *acb = opaque; - BlockDriverState *bs = acb->common.bs; - - acb->req.error = bdrv_co_flush(bs); - acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb); - qemu_bh_schedule(acb->bh); -} - -BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs, - BlockCompletionFunc *cb, void *opaque) -{ - trace_bdrv_aio_flush(bs, opaque); - - Coroutine *co; - BlockAIOCBCoroutine *acb; - - acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); - - co = qemu_coroutine_create(bdrv_aio_flush_co_entry); - qemu_coroutine_enter(co, acb); - - return &acb->common; -} - -static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque) -{ - BlockAIOCBCoroutine *acb = opaque; - BlockDriverState *bs = acb->common.bs; - - acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors); - acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb); - qemu_bh_schedule(acb->bh); -} - -BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) -{ - Coroutine *co; - BlockAIOCBCoroutine *acb; - - trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque); - - acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); - acb->req.sector = sector_num; - acb->req.nb_sectors = nb_sectors; - co = qemu_coroutine_create(bdrv_aio_discard_co_entry); - qemu_coroutine_enter(co, acb); - - return &acb->common; -} - void bdrv_init(void) { module_call_init(MODULE_INIT_BLOCK); @@ -4948,161 +2923,6 @@ void bdrv_init_with_whitelist(void) bdrv_init(); } -void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs, - BlockCompletionFunc *cb, void *opaque) -{ - BlockAIOCB *acb; - - acb = g_slice_alloc(aiocb_info->aiocb_size); - acb->aiocb_info = aiocb_info; - acb->bs = bs; - acb->cb = cb; - acb->opaque = opaque; - acb->refcnt = 1; - return acb; -} - -void qemu_aio_ref(void *p) -{ - BlockAIOCB *acb = p; - acb->refcnt++; -} - -void qemu_aio_unref(void *p) -{ - BlockAIOCB *acb = p; - assert(acb->refcnt > 0); - if (--acb->refcnt == 0) { - g_slice_free1(acb->aiocb_info->aiocb_size, acb); - } -} - -/**************************************************************/ -/* Coroutine block device emulation */ - -typedef struct CoroutineIOCompletion { - Coroutine *coroutine; - int ret; -} CoroutineIOCompletion; - -static void bdrv_co_io_em_complete(void *opaque, int ret) -{ - CoroutineIOCompletion *co = opaque; - - co->ret = ret; - qemu_coroutine_enter(co->coroutine, NULL); -} - -static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *iov, - bool is_write) -{ - CoroutineIOCompletion co = { - .coroutine = qemu_coroutine_self(), - }; - BlockAIOCB *acb; - - if (is_write) { - acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors, - bdrv_co_io_em_complete, &co); - } else { - acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors, - bdrv_co_io_em_complete, &co); - } - - trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb); - if (!acb) { - return -EIO; - } - qemu_coroutine_yield(); - - return co.ret; -} - -static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - QEMUIOVector *iov) -{ - return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false); -} - -static int 
coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - QEMUIOVector *iov) -{ - return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true); -} - -static void coroutine_fn bdrv_flush_co_entry(void *opaque) -{ - RwCo *rwco = opaque; - - rwco->ret = bdrv_co_flush(rwco->bs); -} - -int coroutine_fn bdrv_co_flush(BlockDriverState *bs) -{ - int ret; - - if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) { - return 0; - } - - /* Write back cached data to the OS even with cache=unsafe */ - BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS); - if (bs->drv->bdrv_co_flush_to_os) { - ret = bs->drv->bdrv_co_flush_to_os(bs); - if (ret < 0) { - return ret; - } - } - - /* But don't actually force it to the disk with cache=unsafe */ - if (bs->open_flags & BDRV_O_NO_FLUSH) { - goto flush_parent; - } - - BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK); - if (bs->drv->bdrv_co_flush_to_disk) { - ret = bs->drv->bdrv_co_flush_to_disk(bs); - } else if (bs->drv->bdrv_aio_flush) { - BlockAIOCB *acb; - CoroutineIOCompletion co = { - .coroutine = qemu_coroutine_self(), - }; - - acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co); - if (acb == NULL) { - ret = -EIO; - } else { - qemu_coroutine_yield(); - ret = co.ret; - } - } else { - /* - * Some block drivers always operate in either writethrough or unsafe - * mode and don't support bdrv_flush therefore. Usually qemu doesn't - * know how the server works (because the behaviour is hardcoded or - * depends on server-side configuration), so we can't ensure that - * everything is safe on disk. Returning an error doesn't work because - * that would break guests even if the server operates in writethrough - * mode. - * - * Let's hope the user knows what he's doing. - */ - ret = 0; - } - if (ret < 0) { - return ret; - } - - /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH - * in the case of cache=unsafe, so there are no useless flushes. - */ -flush_parent: - return bdrv_co_flush(bs->file); -} - void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp) { Error *local_err = NULL; @@ -5152,143 +2972,6 @@ void bdrv_invalidate_cache_all(Error **errp) } } -int bdrv_flush(BlockDriverState *bs) -{ - Coroutine *co; - RwCo rwco = { - .bs = bs, - .ret = NOT_DONE, - }; - - if (qemu_in_coroutine()) { - /* Fast-path if already in coroutine context */ - bdrv_flush_co_entry(&rwco); - } else { - AioContext *aio_context = bdrv_get_aio_context(bs); - - co = qemu_coroutine_create(bdrv_flush_co_entry); - qemu_coroutine_enter(co, &rwco); - while (rwco.ret == NOT_DONE) { - aio_poll(aio_context, true); - } - } - - return rwco.ret; -} - -typedef struct DiscardCo { - BlockDriverState *bs; - int64_t sector_num; - int nb_sectors; - int ret; -} DiscardCo; -static void coroutine_fn bdrv_discard_co_entry(void *opaque) -{ - DiscardCo *rwco = opaque; - - rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors); -} - -int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, - int nb_sectors) -{ - int max_discard, ret; - - if (!bs->drv) { - return -ENOMEDIUM; - } - - ret = bdrv_check_request(bs, sector_num, nb_sectors); - if (ret < 0) { - return ret; - } else if (bs->read_only) { - return -EROFS; - } - - bdrv_reset_dirty(bs, sector_num, nb_sectors); - - /* Do nothing if disabled. 
*/ - if (!(bs->open_flags & BDRV_O_UNMAP)) { - return 0; - } - - if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) { - return 0; - } - - max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS); - while (nb_sectors > 0) { - int ret; - int num = nb_sectors; - - /* align request */ - if (bs->bl.discard_alignment && - num >= bs->bl.discard_alignment && - sector_num % bs->bl.discard_alignment) { - if (num > bs->bl.discard_alignment) { - num = bs->bl.discard_alignment; - } - num -= sector_num % bs->bl.discard_alignment; - } - - /* limit request size */ - if (num > max_discard) { - num = max_discard; - } - - if (bs->drv->bdrv_co_discard) { - ret = bs->drv->bdrv_co_discard(bs, sector_num, num); - } else { - BlockAIOCB *acb; - CoroutineIOCompletion co = { - .coroutine = qemu_coroutine_self(), - }; - - acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors, - bdrv_co_io_em_complete, &co); - if (acb == NULL) { - return -EIO; - } else { - qemu_coroutine_yield(); - ret = co.ret; - } - } - if (ret && ret != -ENOTSUP) { - return ret; - } - - sector_num += num; - nb_sectors -= num; - } - return 0; -} - -int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) -{ - Coroutine *co; - DiscardCo rwco = { - .bs = bs, - .sector_num = sector_num, - .nb_sectors = nb_sectors, - .ret = NOT_DONE, - }; - - if (qemu_in_coroutine()) { - /* Fast-path if already in coroutine context */ - bdrv_discard_co_entry(&rwco); - } else { - AioContext *aio_context = bdrv_get_aio_context(bs); - - co = qemu_coroutine_create(bdrv_discard_co_entry); - qemu_coroutine_enter(co, &rwco); - while (rwco.ret == NOT_DONE) { - aio_poll(aio_context, true); - } - } - - return rwco.ret; -} - /**************************************************************/ /* removable device support */ @@ -5354,107 +3037,171 @@ void bdrv_lock_medium(BlockDriverState *bs, bool locked) } } -/* needed for generic scsi interface */ - -int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) +void bdrv_set_guest_block_size(BlockDriverState *bs, int align) { - BlockDriver *drv = bs->drv; - - if (drv && drv->bdrv_ioctl) - return drv->bdrv_ioctl(bs, req, buf); - return -ENOTSUP; + bs->guest_block_size = align; } -BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs, - unsigned long int req, void *buf, - BlockCompletionFunc *cb, void *opaque) +BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs, const char *name) { - BlockDriver *drv = bs->drv; + BdrvDirtyBitmap *bm; - if (drv && drv->bdrv_aio_ioctl) - return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque); + assert(name); + QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) { + if (bm->name && !strcmp(name, bm->name)) { + return bm; + } + } return NULL; } -void bdrv_set_guest_block_size(BlockDriverState *bs, int align) +void bdrv_dirty_bitmap_make_anon(BdrvDirtyBitmap *bitmap) { - bs->guest_block_size = align; + assert(!bdrv_dirty_bitmap_frozen(bitmap)); + g_free(bitmap->name); + bitmap->name = NULL; +} + +BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, + uint32_t granularity, + const char *name, + Error **errp) +{ + int64_t bitmap_size; + BdrvDirtyBitmap *bitmap; + uint32_t sector_granularity; + + assert((granularity & (granularity - 1)) == 0); + + if (name && bdrv_find_dirty_bitmap(bs, name)) { + error_setg(errp, "Bitmap already exists: %s", name); + return NULL; + } + sector_granularity = granularity >> BDRV_SECTOR_BITS; + assert(sector_granularity); + bitmap_size = bdrv_nb_sectors(bs); + if (bitmap_size < 0) { + error_setg_errno(errp, 
-bitmap_size, "could not get length of device"); + errno = -bitmap_size; + return NULL; + } + bitmap = g_new0(BdrvDirtyBitmap, 1); + bitmap->bitmap = hbitmap_alloc(bitmap_size, ctz32(sector_granularity)); + bitmap->size = bitmap_size; + bitmap->name = g_strdup(name); + bitmap->disabled = false; + QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list); + return bitmap; } -void *qemu_blockalign(BlockDriverState *bs, size_t size) +bool bdrv_dirty_bitmap_frozen(BdrvDirtyBitmap *bitmap) { - return qemu_memalign(bdrv_opt_mem_align(bs), size); + return bitmap->successor; } -void *qemu_blockalign0(BlockDriverState *bs, size_t size) +bool bdrv_dirty_bitmap_enabled(BdrvDirtyBitmap *bitmap) { - return memset(qemu_blockalign(bs, size), 0, size); + return !(bitmap->disabled || bitmap->successor); } -void *qemu_try_blockalign(BlockDriverState *bs, size_t size) +/** + * Create a successor bitmap destined to replace this bitmap after an operation. + * Requires that the bitmap is not frozen and has no successor. + */ +int bdrv_dirty_bitmap_create_successor(BlockDriverState *bs, + BdrvDirtyBitmap *bitmap, Error **errp) { - size_t align = bdrv_opt_mem_align(bs); + uint64_t granularity; + BdrvDirtyBitmap *child; + + if (bdrv_dirty_bitmap_frozen(bitmap)) { + error_setg(errp, "Cannot create a successor for a bitmap that is " + "currently frozen"); + return -1; + } + assert(!bitmap->successor); - /* Ensure that NULL is never returned on success */ - assert(align > 0); - if (size == 0) { - size = align; + /* Create an anonymous successor */ + granularity = bdrv_dirty_bitmap_granularity(bitmap); + child = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp); + if (!child) { + return -1; } - return qemu_try_memalign(align, size); + /* Successor will be on or off based on our current state. */ + child->disabled = bitmap->disabled; + + /* Install the successor and freeze the parent */ + bitmap->successor = child; + return 0; } -void *qemu_try_blockalign0(BlockDriverState *bs, size_t size) +/** + * For a bitmap with a successor, yield our name to the successor, + * delete the old bitmap, and return a handle to the new bitmap. + */ +BdrvDirtyBitmap *bdrv_dirty_bitmap_abdicate(BlockDriverState *bs, + BdrvDirtyBitmap *bitmap, + Error **errp) { - void *mem = qemu_try_blockalign(bs, size); + char *name; + BdrvDirtyBitmap *successor = bitmap->successor; - if (mem) { - memset(mem, 0, size); + if (successor == NULL) { + error_setg(errp, "Cannot relinquish control if " + "there's no successor present"); + return NULL; } - return mem; + name = bitmap->name; + bitmap->name = NULL; + successor->name = name; + bitmap->successor = NULL; + bdrv_release_dirty_bitmap(bs, bitmap); + + return successor; } -/* - * Check if all memory in this vector is sector aligned. +/** + * In cases of failure where we can no longer safely delete the parent, + * we may wish to re-join the parent and child/successor. + * The merged parent will be un-frozen, but not explicitly re-enabled. 
*/ -bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov) +BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap(BlockDriverState *bs, + BdrvDirtyBitmap *parent, + Error **errp) { - int i; - size_t alignment = bdrv_opt_mem_align(bs); + BdrvDirtyBitmap *successor = parent->successor; - for (i = 0; i < qiov->niov; i++) { - if ((uintptr_t) qiov->iov[i].iov_base % alignment) { - return false; - } - if (qiov->iov[i].iov_len % alignment) { - return false; - } + if (!successor) { + error_setg(errp, "Cannot reclaim a successor when none is present"); + return NULL; } - return true; + if (!hbitmap_merge(parent->bitmap, successor->bitmap)) { + error_setg(errp, "Merging of parent and successor bitmap failed"); + return NULL; + } + bdrv_release_dirty_bitmap(bs, successor); + parent->successor = NULL; + + return parent; } -BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity, - Error **errp) +/** + * Truncates _all_ bitmaps attached to a BDS. + */ +static void bdrv_dirty_bitmap_truncate(BlockDriverState *bs) { - int64_t bitmap_size; BdrvDirtyBitmap *bitmap; + uint64_t size = bdrv_nb_sectors(bs); - assert((granularity & (granularity - 1)) == 0); - - granularity >>= BDRV_SECTOR_BITS; - assert(granularity); - bitmap_size = bdrv_nb_sectors(bs); - if (bitmap_size < 0) { - error_setg_errno(errp, -bitmap_size, "could not get length of device"); - errno = -bitmap_size; - return NULL; + QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) { + if (bdrv_dirty_bitmap_frozen(bitmap)) { + continue; + } + hbitmap_truncate(bitmap->bitmap, size); } - bitmap = g_new0(BdrvDirtyBitmap, 1); - bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1); - QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list); - return bitmap; } void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap) @@ -5462,14 +3209,28 @@ void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap) BdrvDirtyBitmap *bm, *next; QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) { if (bm == bitmap) { + assert(!bdrv_dirty_bitmap_frozen(bm)); QLIST_REMOVE(bitmap, list); hbitmap_free(bitmap->bitmap); + g_free(bitmap->name); g_free(bitmap); return; } } } +void bdrv_disable_dirty_bitmap(BdrvDirtyBitmap *bitmap) +{ + assert(!bdrv_dirty_bitmap_frozen(bitmap)); + bitmap->disabled = true; +} + +void bdrv_enable_dirty_bitmap(BdrvDirtyBitmap *bitmap) +{ + assert(!bdrv_dirty_bitmap_frozen(bitmap)); + bitmap->disabled = false; +} + BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs) { BdrvDirtyBitmap *bm; @@ -5479,9 +3240,11 @@ BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs) QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) { BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1); BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1); - info->count = bdrv_get_dirty_count(bs, bm); - info->granularity = - ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap)); + info->count = bdrv_get_dirty_count(bm); + info->granularity = bdrv_dirty_bitmap_granularity(bm); + info->has_name = !!bm->name; + info->name = g_strdup(bm->name); + info->frozen = bdrv_dirty_bitmap_frozen(bm); entry->value = info; *plist = entry; plist = &entry->next; @@ -5499,43 +3262,90 @@ int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector } } -void bdrv_dirty_iter_init(BlockDriverState *bs, - BdrvDirtyBitmap *bitmap, HBitmapIter *hbi) +/** + * Chooses a default granularity based on the existing cluster size, + * but clamped between [4K, 64K]. 
Defaults to 64K in the case that there + * is no cluster size information available. + */ +uint32_t bdrv_get_default_bitmap_granularity(BlockDriverState *bs) +{ + BlockDriverInfo bdi; + uint32_t granularity; + + if (bdrv_get_info(bs, &bdi) >= 0 && bdi.cluster_size > 0) { + granularity = MAX(4096, bdi.cluster_size); + granularity = MIN(65536, granularity); + } else { + granularity = 65536; + } + + return granularity; +} + +uint32_t bdrv_dirty_bitmap_granularity(BdrvDirtyBitmap *bitmap) +{ + return BDRV_SECTOR_SIZE << hbitmap_granularity(bitmap->bitmap); +} + +void bdrv_dirty_iter_init(BdrvDirtyBitmap *bitmap, HBitmapIter *hbi) { hbitmap_iter_init(hbi, bitmap->bitmap, 0); } -void bdrv_set_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, +void bdrv_set_dirty_bitmap(BdrvDirtyBitmap *bitmap, int64_t cur_sector, int nr_sectors) { + assert(bdrv_dirty_bitmap_enabled(bitmap)); hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors); } -void bdrv_reset_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, +void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap, int64_t cur_sector, int nr_sectors) { + assert(bdrv_dirty_bitmap_enabled(bitmap)); hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors); } -static void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, - int nr_sectors) +void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap) +{ + assert(bdrv_dirty_bitmap_enabled(bitmap)); + hbitmap_reset(bitmap->bitmap, 0, bitmap->size); +} + +void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, + int nr_sectors) { BdrvDirtyBitmap *bitmap; QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) { + if (!bdrv_dirty_bitmap_enabled(bitmap)) { + continue; + } hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors); } } -static void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, - int nr_sectors) +void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, + int nr_sectors) { BdrvDirtyBitmap *bitmap; QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) { + if (!bdrv_dirty_bitmap_enabled(bitmap)) { + continue; + } hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors); } } -int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap) +/** + * Advance an HBitmapIter to an arbitrary offset. 
+ */ +void bdrv_set_dirty_iter(HBitmapIter *hbi, int64_t offset) +{ + assert(hbi->hb); + hbitmap_iter_init(hbi, hbi->hb, offset); +} + +int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap) { return hbitmap_count(bitmap->bitmap); } @@ -5572,8 +3382,8 @@ bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp) if (!QLIST_EMPTY(&bs->op_blockers[op])) { blocker = QLIST_FIRST(&bs->op_blockers[op]); if (errp) { - error_setg(errp, "Device '%s' is busy: %s", - bdrv_get_device_name(bs), + error_setg(errp, "Node '%s' is busy: %s", + bdrv_get_device_or_node_name(bs), error_get_pretty(blocker->reason)); } return true; @@ -5953,12 +3763,6 @@ void bdrv_remove_aio_context_notifier(BlockDriverState *bs, abort(); } -void bdrv_add_before_write_notifier(BlockDriverState *bs, - NotifierWithReturn *notifier) -{ - notifier_with_return_list_add(&bs->before_write_notifiers, notifier); -} - int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts, BlockDriverAmendStatusCB *status_cb) { @@ -6059,36 +3863,6 @@ out: return to_replace_bs; } -void bdrv_io_plug(BlockDriverState *bs) -{ - BlockDriver *drv = bs->drv; - if (drv && drv->bdrv_io_plug) { - drv->bdrv_io_plug(bs); - } else if (bs->file) { - bdrv_io_plug(bs->file); - } -} - -void bdrv_io_unplug(BlockDriverState *bs) -{ - BlockDriver *drv = bs->drv; - if (drv && drv->bdrv_io_unplug) { - drv->bdrv_io_unplug(bs); - } else if (bs->file) { - bdrv_io_unplug(bs->file); - } -} - -void bdrv_flush_io_queue(BlockDriverState *bs) -{ - BlockDriver *drv = bs->drv; - if (drv && drv->bdrv_flush_io_queue) { - drv->bdrv_flush_io_queue(bs); - } else if (bs->file) { - bdrv_flush_io_queue(bs->file); - } -} - static bool append_open_options(QDict *d, BlockDriverState *bs) { const QDictEntry *entry; diff --git a/block/Makefile.objs b/block/Makefile.objs index db2933e469..0d8c2a4ab6 100644 --- a/block/Makefile.objs +++ b/block/Makefile.objs @@ -1,4 +1,4 @@ -block-obj-y += raw_bsd.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o +block-obj-y += raw_bsd.o qcow.o vdi.o vmdk.o cloop.o bochs.o vpc.o vvfat.o block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o block-obj-y += qed-check.o @@ -9,7 +9,7 @@ block-obj-y += block-backend.o snapshot.o qapi.o block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o block-obj-$(CONFIG_POSIX) += raw-posix.o block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o -block-obj-y += null.o mirror.o +block-obj-y += null.o mirror.o io.o block-obj-y += nbd.o nbd-client.o sheepdog.o block-obj-$(CONFIG_LIBISCSI) += iscsi.o @@ -37,6 +37,7 @@ gluster.o-libs := $(GLUSTERFS_LIBS) ssh.o-cflags := $(LIBSSH2_CFLAGS) ssh.o-libs := $(LIBSSH2_LIBS) archipelago.o-libs := $(ARCHIPELAGO_LIBS) +block-obj-m += dmg.o dmg.o-libs := $(BZIP2_LIBS) qcow.o-libs := -lz linux-aio.o-libs := -laio diff --git a/block/backup.c b/block/backup.c index 1c535b1ab9..d3f648ddd7 100644 --- a/block/backup.c +++ b/block/backup.c @@ -37,6 +37,8 @@ typedef struct CowRequest { typedef struct BackupBlockJob { BlockJob common; BlockDriverState *target; + /* bitmap for sync=dirty-bitmap */ + BdrvDirtyBitmap *sync_bitmap; MirrorSyncMode sync_mode; RateLimit limit; BlockdevOnError on_source_error; @@ -242,6 +244,91 @@ static void backup_complete(BlockJob *job, void *opaque) g_free(data); } +static bool coroutine_fn yield_and_check(BackupBlockJob *job) +{ + if (block_job_is_cancelled(&job->common)) { + return true; + } + + /* we need to yield so that bdrv_drain_all() returns. 
+ * (without, VM does not reboot) + */ + if (job->common.speed) { + uint64_t delay_ns = ratelimit_calculate_delay(&job->limit, + job->sectors_read); + job->sectors_read = 0; + block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, delay_ns); + } else { + block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, 0); + } + + if (block_job_is_cancelled(&job->common)) { + return true; + } + + return false; +} + +static int coroutine_fn backup_run_incremental(BackupBlockJob *job) +{ + bool error_is_read; + int ret = 0; + int clusters_per_iter; + uint32_t granularity; + int64_t sector; + int64_t cluster; + int64_t end; + int64_t last_cluster = -1; + BlockDriverState *bs = job->common.bs; + HBitmapIter hbi; + + granularity = bdrv_dirty_bitmap_granularity(job->sync_bitmap); + clusters_per_iter = MAX((granularity / BACKUP_CLUSTER_SIZE), 1); + bdrv_dirty_iter_init(job->sync_bitmap, &hbi); + + /* Find the next dirty sector(s) */ + while ((sector = hbitmap_iter_next(&hbi)) != -1) { + cluster = sector / BACKUP_SECTORS_PER_CLUSTER; + + /* Fake progress updates for any clusters we skipped */ + if (cluster != last_cluster + 1) { + job->common.offset += ((cluster - last_cluster - 1) * + BACKUP_CLUSTER_SIZE); + } + + for (end = cluster + clusters_per_iter; cluster < end; cluster++) { + do { + if (yield_and_check(job)) { + return ret; + } + ret = backup_do_cow(bs, cluster * BACKUP_SECTORS_PER_CLUSTER, + BACKUP_SECTORS_PER_CLUSTER, &error_is_read); + if ((ret < 0) && + backup_error_action(job, error_is_read, -ret) == + BLOCK_ERROR_ACTION_REPORT) { + return ret; + } + } while (ret < 0); + } + + /* If the bitmap granularity is smaller than the backup granularity, + * we need to advance the iterator pointer to the next cluster. */ + if (granularity < BACKUP_CLUSTER_SIZE) { + bdrv_set_dirty_iter(&hbi, cluster * BACKUP_SECTORS_PER_CLUSTER); + } + + last_cluster = cluster - 1; + } + + /* Play some final catchup with the progress meter */ + end = DIV_ROUND_UP(job->common.len, BACKUP_CLUSTER_SIZE); + if (last_cluster + 1 < end) { + job->common.offset += ((end - last_cluster - 1) * BACKUP_CLUSTER_SIZE); + } + + return ret; +} + static void coroutine_fn backup_run(void *opaque) { BackupBlockJob *job = opaque; @@ -259,8 +346,7 @@ static void coroutine_fn backup_run(void *opaque) qemu_co_rwlock_init(&job->flush_rwlock); start = 0; - end = DIV_ROUND_UP(job->common.len / BDRV_SECTOR_SIZE, - BACKUP_SECTORS_PER_CLUSTER); + end = DIV_ROUND_UP(job->common.len, BACKUP_CLUSTER_SIZE); job->bitmap = hbitmap_alloc(end, 0); @@ -278,28 +364,13 @@ static void coroutine_fn backup_run(void *opaque) qemu_coroutine_yield(); job->common.busy = true; } + } else if (job->sync_mode == MIRROR_SYNC_MODE_DIRTY_BITMAP) { + ret = backup_run_incremental(job); } else { /* Both FULL and TOP SYNC_MODE's require copying.. */ for (; start < end; start++) { bool error_is_read; - - if (block_job_is_cancelled(&job->common)) { - break; - } - - /* we need to yield so that qemu_aio_flush() returns. 
- * (without, VM does not reboot) - */ - if (job->common.speed) { - uint64_t delay_ns = ratelimit_calculate_delay( - &job->limit, job->sectors_read); - job->sectors_read = 0; - block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, delay_ns); - } else { - block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, 0); - } - - if (block_job_is_cancelled(&job->common)) { + if (yield_and_check(job)) { break; } @@ -357,6 +428,18 @@ static void coroutine_fn backup_run(void *opaque) qemu_co_rwlock_wrlock(&job->flush_rwlock); qemu_co_rwlock_unlock(&job->flush_rwlock); + if (job->sync_bitmap) { + BdrvDirtyBitmap *bm; + if (ret < 0) { + /* Merge the successor back into the parent, delete nothing. */ + bm = bdrv_reclaim_dirty_bitmap(bs, job->sync_bitmap, NULL); + assert(bm); + } else { + /* Everything is fine, delete this bitmap and install the backup. */ + bm = bdrv_dirty_bitmap_abdicate(bs, job->sync_bitmap, NULL); + assert(bm); + } + } hbitmap_free(job->bitmap); bdrv_iostatus_disable(target); @@ -369,6 +452,7 @@ static void coroutine_fn backup_run(void *opaque) void backup_start(BlockDriverState *bs, BlockDriverState *target, int64_t speed, MirrorSyncMode sync_mode, + BdrvDirtyBitmap *sync_bitmap, BlockdevOnError on_source_error, BlockdevOnError on_target_error, BlockCompletionFunc *cb, void *opaque, @@ -412,17 +496,36 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target, return; } + if (sync_mode == MIRROR_SYNC_MODE_DIRTY_BITMAP) { + if (!sync_bitmap) { + error_setg(errp, "must provide a valid bitmap name for " + "\"dirty-bitmap\" sync mode"); + return; + } + + /* Create a new bitmap, and freeze/disable this one. */ + if (bdrv_dirty_bitmap_create_successor(bs, sync_bitmap, errp) < 0) { + return; + } + } else if (sync_bitmap) { + error_setg(errp, + "a sync_bitmap was provided to backup_run, " + "but received an incompatible sync_mode (%s)", + MirrorSyncMode_lookup[sync_mode]); + return; + } + len = bdrv_getlength(bs); if (len < 0) { error_setg_errno(errp, -len, "unable to get length for '%s'", bdrv_get_device_name(bs)); - return; + goto error; } BackupBlockJob *job = block_job_create(&backup_job_driver, bs, speed, cb, opaque, errp); if (!job) { - return; + goto error; } bdrv_op_block_all(target, job->common.blocker); @@ -431,7 +534,15 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target, job->on_target_error = on_target_error; job->target = target; job->sync_mode = sync_mode; + job->sync_bitmap = sync_mode == MIRROR_SYNC_MODE_DIRTY_BITMAP ? 
+ sync_bitmap : NULL; job->common.len = len; job->common.co = qemu_coroutine_create(backup_run); qemu_coroutine_enter(job->common.co, job); + return; + + error: + if (sync_bitmap) { + bdrv_reclaim_dirty_bitmap(bs, sync_bitmap, NULL); + } } diff --git a/block/blkdebug.c b/block/blkdebug.c index 63611e0a33..3c30edba73 100644 --- a/block/blkdebug.c +++ b/block/blkdebug.c @@ -721,6 +721,11 @@ static int64_t blkdebug_getlength(BlockDriverState *bs) return bdrv_getlength(bs->file); } +static int blkdebug_truncate(BlockDriverState *bs, int64_t offset) +{ + return bdrv_truncate(bs->file, offset); +} + static void blkdebug_refresh_filename(BlockDriverState *bs) { QDict *opts; @@ -779,6 +784,7 @@ static BlockDriver bdrv_blkdebug = { .bdrv_file_open = blkdebug_open, .bdrv_close = blkdebug_close, .bdrv_getlength = blkdebug_getlength, + .bdrv_truncate = blkdebug_truncate, .bdrv_refresh_filename = blkdebug_refresh_filename, .bdrv_aio_readv = blkdebug_aio_readv, diff --git a/block/block-backend.c b/block/block-backend.c index 48b6e4c05c..93e46f376a 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -515,6 +515,17 @@ int blk_write(BlockBackend *blk, int64_t sector_num, const uint8_t *buf, return bdrv_write(blk->bs, sector_num, buf, nb_sectors); } +int blk_write_zeroes(BlockBackend *blk, int64_t sector_num, + int nb_sectors, BdrvRequestFlags flags) +{ + int ret = blk_check_request(blk, sector_num, nb_sectors); + if (ret < 0) { + return ret; + } + + return bdrv_write_zeroes(blk->bs, sector_num, nb_sectors, flags); +} + static void error_callback_bh(void *opaque) { struct BlockBackendAIOCB *acb = opaque; diff --git a/block/io.c b/block/io.c new file mode 100644 index 0000000000..1ce62c4fbc --- /dev/null +++ b/block/io.c @@ -0,0 +1,2540 @@ +/* + * Block layer I/O functions + * + * Copyright (c) 2003 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "trace.h" +#include "sysemu/qtest.h" +#include "block/blockjob.h" +#include "block/block_int.h" + +#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */ + +static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque); +static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque); +static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov); +static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov); +static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, + int64_t offset, unsigned int bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags); +static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, + int64_t offset, unsigned int bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags); +static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, + int nb_sectors, + BdrvRequestFlags flags, + BlockCompletionFunc *cb, + void *opaque, + bool is_write); +static void coroutine_fn bdrv_co_do_rw(void *opaque); +static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, BdrvRequestFlags flags); + +/* throttling disk I/O limits */ +void bdrv_set_io_limits(BlockDriverState *bs, + ThrottleConfig *cfg) +{ + int i; + + throttle_config(&bs->throttle_state, cfg); + + for (i = 0; i < 2; i++) { + qemu_co_enter_next(&bs->throttled_reqs[i]); + } +} + +/* this function drain all the throttled IOs */ +static bool bdrv_start_throttled_reqs(BlockDriverState *bs) +{ + bool drained = false; + bool enabled = bs->io_limits_enabled; + int i; + + bs->io_limits_enabled = false; + + for (i = 0; i < 2; i++) { + while (qemu_co_enter_next(&bs->throttled_reqs[i])) { + drained = true; + } + } + + bs->io_limits_enabled = enabled; + + return drained; +} + +void bdrv_io_limits_disable(BlockDriverState *bs) +{ + bs->io_limits_enabled = false; + + bdrv_start_throttled_reqs(bs); + + throttle_destroy(&bs->throttle_state); +} + +static void bdrv_throttle_read_timer_cb(void *opaque) +{ + BlockDriverState *bs = opaque; + qemu_co_enter_next(&bs->throttled_reqs[0]); +} + +static void bdrv_throttle_write_timer_cb(void *opaque) +{ + BlockDriverState *bs = opaque; + qemu_co_enter_next(&bs->throttled_reqs[1]); +} + +/* should be called before bdrv_set_io_limits if a limit is set */ +void bdrv_io_limits_enable(BlockDriverState *bs) +{ + int clock_type = QEMU_CLOCK_REALTIME; + + if (qtest_enabled()) { + /* For testing block IO throttling only */ + clock_type = QEMU_CLOCK_VIRTUAL; + } + assert(!bs->io_limits_enabled); + throttle_init(&bs->throttle_state, + bdrv_get_aio_context(bs), + clock_type, + bdrv_throttle_read_timer_cb, + bdrv_throttle_write_timer_cb, + bs); + bs->io_limits_enabled = true; +} + +/* This function makes an IO wait if needed + * + * @nb_sectors: the number of sectors of the IO + * @is_write: is the IO a write + */ +static void bdrv_io_limits_intercept(BlockDriverState *bs, + unsigned int bytes, + bool is_write) +{ + /* does this io must wait */ + bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write); + + /* if must wait or any request of this type throttled queue the IO */ + if (must_wait || + !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) 
{ + qemu_co_queue_wait(&bs->throttled_reqs[is_write]); + } + + /* the IO will be executed, do the accounting */ + throttle_account(&bs->throttle_state, is_write, bytes); + + + /* if the next request must wait -> do nothing */ + if (throttle_schedule_timer(&bs->throttle_state, is_write)) { + return; + } + + /* else queue next request for execution */ + qemu_co_queue_next(&bs->throttled_reqs[is_write]); +} + +void bdrv_setup_io_funcs(BlockDriver *bdrv) +{ + /* Block drivers without coroutine functions need emulation */ + if (!bdrv->bdrv_co_readv) { + bdrv->bdrv_co_readv = bdrv_co_readv_em; + bdrv->bdrv_co_writev = bdrv_co_writev_em; + + /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if + * the block driver lacks aio we need to emulate that too. + */ + if (!bdrv->bdrv_aio_readv) { + /* add AIO emulation layer */ + bdrv->bdrv_aio_readv = bdrv_aio_readv_em; + bdrv->bdrv_aio_writev = bdrv_aio_writev_em; + } + } +} + +void bdrv_refresh_limits(BlockDriverState *bs, Error **errp) +{ + BlockDriver *drv = bs->drv; + Error *local_err = NULL; + + memset(&bs->bl, 0, sizeof(bs->bl)); + + if (!drv) { + return; + } + + /* Take some limits from the children as a default */ + if (bs->file) { + bdrv_refresh_limits(bs->file, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length; + bs->bl.max_transfer_length = bs->file->bl.max_transfer_length; + bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment; + } else { + bs->bl.opt_mem_alignment = 512; + } + + if (bs->backing_hd) { + bdrv_refresh_limits(bs->backing_hd, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + bs->bl.opt_transfer_length = + MAX(bs->bl.opt_transfer_length, + bs->backing_hd->bl.opt_transfer_length); + bs->bl.max_transfer_length = + MIN_NON_ZERO(bs->bl.max_transfer_length, + bs->backing_hd->bl.max_transfer_length); + bs->bl.opt_mem_alignment = + MAX(bs->bl.opt_mem_alignment, + bs->backing_hd->bl.opt_mem_alignment); + } + + /* Then let the driver override it */ + if (drv->bdrv_refresh_limits) { + drv->bdrv_refresh_limits(bs, errp); + } +} + +/** + * The copy-on-read flag is actually a reference count so multiple users may + * use the feature without worrying about clobbering its previous state. + * Copy-on-read stays enabled until all users have called to disable it. + */ +void bdrv_enable_copy_on_read(BlockDriverState *bs) +{ + bs->copy_on_read++; +} + +void bdrv_disable_copy_on_read(BlockDriverState *bs) +{ + assert(bs->copy_on_read > 0); + bs->copy_on_read--; +} + +/* Check if any requests are in-flight (including throttled requests) */ +static bool bdrv_requests_pending(BlockDriverState *bs) +{ + if (!QLIST_EMPTY(&bs->tracked_requests)) { + return true; + } + if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) { + return true; + } + if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) { + return true; + } + if (bs->file && bdrv_requests_pending(bs->file)) { + return true; + } + if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) { + return true; + } + return false; +} + +static bool bdrv_drain_one(BlockDriverState *bs) +{ + bool bs_busy; + + bdrv_flush_io_queue(bs); + bdrv_start_throttled_reqs(bs); + bs_busy = bdrv_requests_pending(bs); + bs_busy |= aio_poll(bdrv_get_aio_context(bs), bs_busy); + return bs_busy; +} + +/* + * Wait for pending requests to complete on a single BlockDriverState subtree + * + * See the warning in bdrv_drain_all(). 
This function can only be called if + * you are sure nothing can generate I/O because you have op blockers + * installed. + * + * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState + * AioContext. + */ +void bdrv_drain(BlockDriverState *bs) +{ + while (bdrv_drain_one(bs)) { + /* Keep iterating */ + } +} + +/* + * Wait for pending requests to complete across all BlockDriverStates + * + * This function does not flush data to disk, use bdrv_flush_all() for that + * after calling this function. + * + * Note that completion of an asynchronous I/O operation can trigger any + * number of other I/O operations on other devices---for example a coroutine + * can be arbitrarily complex and a constant flow of I/O can come until the + * coroutine is complete. Because of this, it is not possible to have a + * function to drain a single device's I/O queue. + */ +void bdrv_drain_all(void) +{ + /* Always run first iteration so any pending completion BHs run */ + bool busy = true; + BlockDriverState *bs = NULL; + + while ((bs = bdrv_next(bs))) { + AioContext *aio_context = bdrv_get_aio_context(bs); + + aio_context_acquire(aio_context); + if (bs->job) { + block_job_pause(bs->job); + } + aio_context_release(aio_context); + } + + while (busy) { + busy = false; + bs = NULL; + + while ((bs = bdrv_next(bs))) { + AioContext *aio_context = bdrv_get_aio_context(bs); + + aio_context_acquire(aio_context); + busy |= bdrv_drain_one(bs); + aio_context_release(aio_context); + } + } + + bs = NULL; + while ((bs = bdrv_next(bs))) { + AioContext *aio_context = bdrv_get_aio_context(bs); + + aio_context_acquire(aio_context); + if (bs->job) { + block_job_resume(bs->job); + } + aio_context_release(aio_context); + } +} + +/** + * Remove an active request from the tracked requests list + * + * This function should be called when a tracked request is completing. 
+ */ +static void tracked_request_end(BdrvTrackedRequest *req) +{ + if (req->serialising) { + req->bs->serialising_in_flight--; + } + + QLIST_REMOVE(req, list); + qemu_co_queue_restart_all(&req->wait_queue); +} + +/** + * Add an active request to the tracked requests list + */ +static void tracked_request_begin(BdrvTrackedRequest *req, + BlockDriverState *bs, + int64_t offset, + unsigned int bytes, bool is_write) +{ + *req = (BdrvTrackedRequest){ + .bs = bs, + .offset = offset, + .bytes = bytes, + .is_write = is_write, + .co = qemu_coroutine_self(), + .serialising = false, + .overlap_offset = offset, + .overlap_bytes = bytes, + }; + + qemu_co_queue_init(&req->wait_queue); + + QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); +} + +static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) +{ + int64_t overlap_offset = req->offset & ~(align - 1); + unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align) + - overlap_offset; + + if (!req->serialising) { + req->bs->serialising_in_flight++; + req->serialising = true; + } + + req->overlap_offset = MIN(req->overlap_offset, overlap_offset); + req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes); +} + +/** + * Round a region to cluster boundaries + */ +void bdrv_round_to_clusters(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + int64_t *cluster_sector_num, + int *cluster_nb_sectors) +{ + BlockDriverInfo bdi; + + if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) { + *cluster_sector_num = sector_num; + *cluster_nb_sectors = nb_sectors; + } else { + int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE; + *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c); + *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num + + nb_sectors, c); + } +} + +static int bdrv_get_cluster_size(BlockDriverState *bs) +{ + BlockDriverInfo bdi; + int ret; + + ret = bdrv_get_info(bs, &bdi); + if (ret < 0 || bdi.cluster_size == 0) { + return bs->request_alignment; + } else { + return bdi.cluster_size; + } +} + +static bool tracked_request_overlaps(BdrvTrackedRequest *req, + int64_t offset, unsigned int bytes) +{ + /* aaaa bbbb */ + if (offset >= req->overlap_offset + req->overlap_bytes) { + return false; + } + /* bbbb aaaa */ + if (req->overlap_offset >= offset + bytes) { + return false; + } + return true; +} + +static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self) +{ + BlockDriverState *bs = self->bs; + BdrvTrackedRequest *req; + bool retry; + bool waited = false; + + if (!bs->serialising_in_flight) { + return false; + } + + do { + retry = false; + QLIST_FOREACH(req, &bs->tracked_requests, list) { + if (req == self || (!req->serialising && !self->serialising)) { + continue; + } + if (tracked_request_overlaps(req, self->overlap_offset, + self->overlap_bytes)) + { + /* Hitting this means there was a reentrant request, for + * example, a block driver issuing nested requests. This must + * never happen since it means deadlock. + */ + assert(qemu_coroutine_self() != req->co); + + /* If the request is already (indirectly) waiting for us, or + * will wait for us as soon as it wakes up, then just go on + * (instead of producing a deadlock in the former case). 
*/ + if (!req->waiting_for) { + self->waiting_for = req; + qemu_co_queue_wait(&req->wait_queue); + self->waiting_for = NULL; + retry = true; + waited = true; + break; + } + } + } + } while (retry); + + return waited; +} + +static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset, + size_t size) +{ + if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) { + return -EIO; + } + + if (!bdrv_is_inserted(bs)) { + return -ENOMEDIUM; + } + + if (offset < 0) { + return -EIO; + } + + return 0; +} + +static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num, + int nb_sectors) +{ + if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { + return -EIO; + } + + return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE, + nb_sectors * BDRV_SECTOR_SIZE); +} + +typedef struct RwCo { + BlockDriverState *bs; + int64_t offset; + QEMUIOVector *qiov; + bool is_write; + int ret; + BdrvRequestFlags flags; +} RwCo; + +static void coroutine_fn bdrv_rw_co_entry(void *opaque) +{ + RwCo *rwco = opaque; + + if (!rwco->is_write) { + rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset, + rwco->qiov->size, rwco->qiov, + rwco->flags); + } else { + rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset, + rwco->qiov->size, rwco->qiov, + rwco->flags); + } +} + +/* + * Process a vectored synchronous request using coroutines + */ +static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset, + QEMUIOVector *qiov, bool is_write, + BdrvRequestFlags flags) +{ + Coroutine *co; + RwCo rwco = { + .bs = bs, + .offset = offset, + .qiov = qiov, + .is_write = is_write, + .ret = NOT_DONE, + .flags = flags, + }; + + /** + * In sync call context, when the vcpu is blocked, this throttling timer + * will not fire; so the I/O throttling function has to be disabled here + * if it has been enabled. + */ + if (bs->io_limits_enabled) { + fprintf(stderr, "Disabling I/O throttling on '%s' due " + "to synchronous I/O.\n", bdrv_get_device_name(bs)); + bdrv_io_limits_disable(bs); + } + + if (qemu_in_coroutine()) { + /* Fast-path if already in coroutine context */ + bdrv_rw_co_entry(&rwco); + } else { + AioContext *aio_context = bdrv_get_aio_context(bs); + + co = qemu_coroutine_create(bdrv_rw_co_entry); + qemu_coroutine_enter(co, &rwco); + while (rwco.ret == NOT_DONE) { + aio_poll(aio_context, true); + } + } + return rwco.ret; +} + +/* + * Process a synchronous request using coroutines + */ +static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, + int nb_sectors, bool is_write, BdrvRequestFlags flags) +{ + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = (void *)buf, + .iov_len = nb_sectors * BDRV_SECTOR_SIZE, + }; + + if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { + return -EINVAL; + } + + qemu_iovec_init_external(&qiov, &iov, 1); + return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS, + &qiov, is_write, flags); +} + +/* return < 0 if error. See bdrv_write() for the return codes */ +int bdrv_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0); +} + +/* Just like bdrv_read(), but with I/O throttling temporarily disabled */ +int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + bool enabled; + int ret; + + enabled = bs->io_limits_enabled; + bs->io_limits_enabled = false; + ret = bdrv_read(bs, sector_num, buf, nb_sectors); + bs->io_limits_enabled = enabled; + return ret; +} + +/* Return < 0 if error. 
Important errors are: + -EIO generic I/O error (may happen for all errors) + -ENOMEDIUM No media inserted. + -EINVAL Invalid sector number or nb_sectors + -EACCES Trying to write a read-only device +*/ +int bdrv_write(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0); +} + +int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, BdrvRequestFlags flags) +{ + return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true, + BDRV_REQ_ZERO_WRITE | flags); +} + +/* + * Completely zero out a block device with the help of bdrv_write_zeroes. + * The operation is sped up by checking the block status and only writing + * zeroes to the device if they currently do not return zeroes. Optional + * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP). + * + * Returns < 0 on error, 0 on success. For error codes see bdrv_write(). + */ +int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags) +{ + int64_t target_sectors, ret, nb_sectors, sector_num = 0; + int n; + + target_sectors = bdrv_nb_sectors(bs); + if (target_sectors < 0) { + return target_sectors; + } + + for (;;) { + nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS); + if (nb_sectors <= 0) { + return 0; + } + ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n); + if (ret < 0) { + error_report("error getting block status at sector %" PRId64 ": %s", + sector_num, strerror(-ret)); + return ret; + } + if (ret & BDRV_BLOCK_ZERO) { + sector_num += n; + continue; + } + ret = bdrv_write_zeroes(bs, sector_num, n, flags); + if (ret < 0) { + error_report("error writing zeroes at sector %" PRId64 ": %s", + sector_num, strerror(-ret)); + return ret; + } + sector_num += n; + } +} + +int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes) +{ + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = (void *)buf, + .iov_len = bytes, + }; + int ret; + + if (bytes < 0) { + return -EINVAL; + } + + qemu_iovec_init_external(&qiov, &iov, 1); + ret = bdrv_prwv_co(bs, offset, &qiov, false, 0); + if (ret < 0) { + return ret; + } + + return bytes; +} + +int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov) +{ + int ret; + + ret = bdrv_prwv_co(bs, offset, qiov, true, 0); + if (ret < 0) { + return ret; + } + + return qiov->size; +} + +int bdrv_pwrite(BlockDriverState *bs, int64_t offset, + const void *buf, int bytes) +{ + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = (void *) buf, + .iov_len = bytes, + }; + + if (bytes < 0) { + return -EINVAL; + } + + qemu_iovec_init_external(&qiov, &iov, 1); + return bdrv_pwritev(bs, offset, &qiov); +} + +/* + * Writes to the file and ensures that no writes are reordered across this + * request (acts as a barrier) + * + * Returns 0 on success, -errno in error cases. + */ +int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, + const void *buf, int count) +{ + int ret; + + ret = bdrv_pwrite(bs, offset, buf, count); + if (ret < 0) { + return ret; + } + + /* No flush needed for cache modes that already do it */ + if (bs->enable_write_cache) { + bdrv_flush(bs); + } + + return 0; +} + +static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) +{ + /* Perform I/O through a temporary buffer so that users who scribble over + * their read buffer while the operation is in progress do not end up + * modifying the image file. 
This is critical for zero-copy guest I/O + * where anything might happen inside guest memory. + */ + void *bounce_buffer; + + BlockDriver *drv = bs->drv; + struct iovec iov; + QEMUIOVector bounce_qiov; + int64_t cluster_sector_num; + int cluster_nb_sectors; + size_t skip_bytes; + int ret; + + /* Cover entire cluster so no additional backing file I/O is required when + * allocating cluster in the image file. + */ + bdrv_round_to_clusters(bs, sector_num, nb_sectors, + &cluster_sector_num, &cluster_nb_sectors); + + trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, + cluster_sector_num, cluster_nb_sectors); + + iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE; + iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len); + if (bounce_buffer == NULL) { + ret = -ENOMEM; + goto err; + } + + qemu_iovec_init_external(&bounce_qiov, &iov, 1); + + ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors, + &bounce_qiov); + if (ret < 0) { + goto err; + } + + if (drv->bdrv_co_write_zeroes && + buffer_is_zero(bounce_buffer, iov.iov_len)) { + ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num, + cluster_nb_sectors, 0); + } else { + /* This does not change the data on the disk, it is not necessary + * to flush even in cache=writethrough mode. + */ + ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors, + &bounce_qiov); + } + + if (ret < 0) { + /* It might be okay to ignore write errors for guest requests. If this + * is a deliberate copy-on-read then we don't want to ignore the error. + * Simply report it in all cases. + */ + goto err; + } + + skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE; + qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, + nb_sectors * BDRV_SECTOR_SIZE); + +err: + qemu_vfree(bounce_buffer); + return ret; +} + +/* + * Forwards an already correctly aligned request to the BlockDriver. This + * handles copy on read and zeroing after EOF; any other features must be + * implemented by the caller. + */ +static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, + BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, + int64_t align, QEMUIOVector *qiov, int flags) +{ + BlockDriver *drv = bs->drv; + int ret; + + int64_t sector_num = offset >> BDRV_SECTOR_BITS; + unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; + + assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + assert(!qiov || bytes == qiov->size); + + /* Handle Copy on Read and associated serialisation */ + if (flags & BDRV_REQ_COPY_ON_READ) { + /* If we touch the same cluster it counts as an overlap. This + * guarantees that allocating writes will be serialized and not race + * with each other for the same cluster. For example, in copy-on-read + * it ensures that the CoR read and write operations are atomic and + * guest writes cannot interleave between them. 
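 *
 * As a rough sketch (mark_request_serialising() is defined earlier in
 * this file; its behaviour is assumed here), serialising at cluster
 * granularity widens the tracked request to cluster boundaries:
 *
 *     overlap_offset = offset & ~(cluster_size - 1);
 *     overlap_bytes  = ROUND_UP(offset + bytes, cluster_size)
 *                      - overlap_offset;
 *
 * so any two requests touching the same cluster are guaranteed to
 * overlap, and the later one blocks in wait_serialising_requests().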
*/ + mark_request_serialising(req, bdrv_get_cluster_size(bs)); + } + + wait_serialising_requests(req); + + if (flags & BDRV_REQ_COPY_ON_READ) { + int pnum; + + ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum); + if (ret < 0) { + goto out; + } + + if (!ret || pnum != nb_sectors) { + ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov); + goto out; + } + } + + /* Forward the request to the BlockDriver */ + if (!bs->zero_beyond_eof) { + ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); + } else { + /* Read zeros after EOF */ + int64_t total_sectors, max_nb_sectors; + + total_sectors = bdrv_nb_sectors(bs); + if (total_sectors < 0) { + ret = total_sectors; + goto out; + } + + max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num), + align >> BDRV_SECTOR_BITS); + if (nb_sectors < max_nb_sectors) { + ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); + } else if (max_nb_sectors > 0) { + QEMUIOVector local_qiov; + + qemu_iovec_init(&local_qiov, qiov->niov); + qemu_iovec_concat(&local_qiov, qiov, 0, + max_nb_sectors * BDRV_SECTOR_SIZE); + + ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors, + &local_qiov); + + qemu_iovec_destroy(&local_qiov); + } else { + ret = 0; + } + + /* Reading beyond end of file is supposed to produce zeroes */ + if (ret == 0 && total_sectors < sector_num + nb_sectors) { + uint64_t offset = MAX(0, total_sectors - sector_num); + uint64_t bytes = (sector_num + nb_sectors - offset) * + BDRV_SECTOR_SIZE; + qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes); + } + } + +out: + return ret; +} + +static inline uint64_t bdrv_get_align(BlockDriverState *bs) +{ + /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ + return MAX(BDRV_SECTOR_SIZE, bs->request_alignment); +} + +static inline bool bdrv_req_is_aligned(BlockDriverState *bs, + int64_t offset, size_t bytes) +{ + int64_t align = bdrv_get_align(bs); + return !(offset & (align - 1) || (bytes & (align - 1))); +} + +/* + * Handle a read request in coroutine context + */ +static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, + int64_t offset, unsigned int bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + BlockDriver *drv = bs->drv; + BdrvTrackedRequest req; + + uint64_t align = bdrv_get_align(bs); + uint8_t *head_buf = NULL; + uint8_t *tail_buf = NULL; + QEMUIOVector local_qiov; + bool use_local_qiov = false; + int ret; + + if (!drv) { + return -ENOMEDIUM; + } + + ret = bdrv_check_byte_request(bs, offset, bytes); + if (ret < 0) { + return ret; + } + + if (bs->copy_on_read) { + flags |= BDRV_REQ_COPY_ON_READ; + } + + /* throttling disk I/O */ + if (bs->io_limits_enabled) { + bdrv_io_limits_intercept(bs, bytes, false); + } + + /* Align read if necessary by padding qiov */ + if (offset & (align - 1)) { + head_buf = qemu_blockalign(bs, align); + qemu_iovec_init(&local_qiov, qiov->niov + 2); + qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); + qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); + use_local_qiov = true; + + bytes += offset & (align - 1); + offset = offset & ~(align - 1); + } + + if ((offset + bytes) & (align - 1)) { + if (!use_local_qiov) { + qemu_iovec_init(&local_qiov, qiov->niov + 1); + qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); + use_local_qiov = true; + } + tail_buf = qemu_blockalign(bs, align); + qemu_iovec_add(&local_qiov, tail_buf, + align - ((offset + bytes) & (align - 1))); + + bytes = ROUND_UP(bytes, align); + } + + tracked_request_begin(&req, bs, offset, bytes, false); + ret = 
bdrv_aligned_preadv(bs, &req, offset, bytes, align, + use_local_qiov ? &local_qiov : qiov, + flags); + tracked_request_end(&req); + + if (use_local_qiov) { + qemu_iovec_destroy(&local_qiov); + qemu_vfree(head_buf); + qemu_vfree(tail_buf); + } + + return ret; +} + +static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { + return -EINVAL; + } + + return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS, + nb_sectors << BDRV_SECTOR_BITS, qiov, flags); +} + +int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + trace_bdrv_co_readv(bs, sector_num, nb_sectors); + + return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0); +} + +int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) +{ + trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors); + + return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, + BDRV_REQ_COPY_ON_READ); +} + +#define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768 + +static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) +{ + BlockDriver *drv = bs->drv; + QEMUIOVector qiov; + struct iovec iov = {0}; + int ret = 0; + + int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes, + BDRV_REQUEST_MAX_SECTORS); + + while (nb_sectors > 0 && !ret) { + int num = nb_sectors; + + /* Align request. Block drivers can expect the "bulk" of the request + * to be aligned. + */ + if (bs->bl.write_zeroes_alignment + && num > bs->bl.write_zeroes_alignment) { + if (sector_num % bs->bl.write_zeroes_alignment != 0) { + /* Make a small request up to the first aligned sector. */ + num = bs->bl.write_zeroes_alignment; + num -= sector_num % bs->bl.write_zeroes_alignment; + } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) { + /* Shorten the request to the last aligned sector. num cannot + * underflow because num > bs->bl.write_zeroes_alignment. + */ + num -= (sector_num + num) % bs->bl.write_zeroes_alignment; + } + } + + /* limit request size */ + if (num > max_write_zeroes) { + num = max_write_zeroes; + } + + ret = -ENOTSUP; + /* First try the efficient write zeroes operation */ + if (drv->bdrv_co_write_zeroes) { + ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags); + } + + if (ret == -ENOTSUP) { + /* Fall back to bounce buffer if write zeroes is unsupported */ + int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length, + MAX_WRITE_ZEROES_BOUNCE_BUFFER); + num = MIN(num, max_xfer_len); + iov.iov_len = num * BDRV_SECTOR_SIZE; + if (iov.iov_base == NULL) { + iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE); + if (iov.iov_base == NULL) { + ret = -ENOMEM; + goto fail; + } + memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE); + } + qemu_iovec_init_external(&qiov, &iov, 1); + + ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov); + + /* Keep bounce buffer around if it is big enough for all + * all future requests. + */ + if (num < max_xfer_len) { + qemu_vfree(iov.iov_base); + iov.iov_base = NULL; + } + } + + sector_num += num; + nb_sectors -= num; + } + +fail: + qemu_vfree(iov.iov_base); + return ret; +} + +/* + * Forwards an already correctly aligned write request to the BlockDriver. 
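 * ("Aligned" here means that offset and bytes are multiples of
 *  bdrv_get_align(bs), i.e. MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
 *  the asserts below only check sector alignment.)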
+ */ +static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, + BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, + QEMUIOVector *qiov, int flags) +{ + BlockDriver *drv = bs->drv; + bool waited; + int ret; + + int64_t sector_num = offset >> BDRV_SECTOR_BITS; + unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; + + assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + assert(!qiov || bytes == qiov->size); + + waited = wait_serialising_requests(req); + assert(!waited || !req->serialising); + assert(req->overlap_offset <= offset); + assert(offset + bytes <= req->overlap_offset + req->overlap_bytes); + + ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req); + + if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF && + !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes && + qemu_iovec_is_zero(qiov)) { + flags |= BDRV_REQ_ZERO_WRITE; + if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) { + flags |= BDRV_REQ_MAY_UNMAP; + } + } + + if (ret < 0) { + /* Do nothing, write notifier decided to fail this request */ + } else if (flags & BDRV_REQ_ZERO_WRITE) { + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO); + ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags); + } else { + BLKDBG_EVENT(bs, BLKDBG_PWRITEV); + ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); + } + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE); + + if (ret == 0 && !bs->enable_write_cache) { + ret = bdrv_co_flush(bs); + } + + bdrv_set_dirty(bs, sector_num, nb_sectors); + + block_acct_highest_sector(&bs->stats, sector_num, nb_sectors); + + if (ret >= 0) { + bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors); + } + + return ret; +} + +/* + * Handle a write request in coroutine context + */ +static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, + int64_t offset, unsigned int bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + BdrvTrackedRequest req; + uint64_t align = bdrv_get_align(bs); + uint8_t *head_buf = NULL; + uint8_t *tail_buf = NULL; + QEMUIOVector local_qiov; + bool use_local_qiov = false; + int ret; + + if (!bs->drv) { + return -ENOMEDIUM; + } + if (bs->read_only) { + return -EACCES; + } + + ret = bdrv_check_byte_request(bs, offset, bytes); + if (ret < 0) { + return ret; + } + + /* throttling disk I/O */ + if (bs->io_limits_enabled) { + bdrv_io_limits_intercept(bs, bytes, true); + } + + /* + * Align write if necessary by performing a read-modify-write cycle. + * Pad qiov with the read parts and be sure to have a tracked request not + * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle. 
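 *
 * A worked example with purely illustrative numbers: assume align = 4096
 * and a guest write of 4096 bytes at offset 4608.  The head RMW reads
 * 4096 bytes at offset 4096 and keeps bytes [0, 512); the tail RMW reads
 * 4096 bytes at offset 8192 and keeps bytes [512, 4096); the final
 * aligned request then writes 8192 bytes at offset 4096, assembled from
 * head data + guest data + tail data (512 + 4096 + 3584 bytes).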
+ */ + tracked_request_begin(&req, bs, offset, bytes, true); + + if (offset & (align - 1)) { + QEMUIOVector head_qiov; + struct iovec head_iov; + + mark_request_serialising(&req, align); + wait_serialising_requests(&req); + + head_buf = qemu_blockalign(bs, align); + head_iov = (struct iovec) { + .iov_base = head_buf, + .iov_len = align, + }; + qemu_iovec_init_external(&head_qiov, &head_iov, 1); + + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD); + ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align, + align, &head_qiov, 0); + if (ret < 0) { + goto fail; + } + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); + + qemu_iovec_init(&local_qiov, qiov->niov + 2); + qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); + qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); + use_local_qiov = true; + + bytes += offset & (align - 1); + offset = offset & ~(align - 1); + } + + if ((offset + bytes) & (align - 1)) { + QEMUIOVector tail_qiov; + struct iovec tail_iov; + size_t tail_bytes; + bool waited; + + mark_request_serialising(&req, align); + waited = wait_serialising_requests(&req); + assert(!waited || !use_local_qiov); + + tail_buf = qemu_blockalign(bs, align); + tail_iov = (struct iovec) { + .iov_base = tail_buf, + .iov_len = align, + }; + qemu_iovec_init_external(&tail_qiov, &tail_iov, 1); + + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL); + ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align, + align, &tail_qiov, 0); + if (ret < 0) { + goto fail; + } + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); + + if (!use_local_qiov) { + qemu_iovec_init(&local_qiov, qiov->niov + 1); + qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); + use_local_qiov = true; + } + + tail_bytes = (offset + bytes) & (align - 1); + qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes); + + bytes = ROUND_UP(bytes, align); + } + + if (use_local_qiov) { + /* Local buffer may have non-zero data. */ + flags &= ~BDRV_REQ_ZERO_WRITE; + } + ret = bdrv_aligned_pwritev(bs, &req, offset, bytes, + use_local_qiov ? 
&local_qiov : qiov, + flags); + +fail: + tracked_request_end(&req); + + if (use_local_qiov) { + qemu_iovec_destroy(&local_qiov); + } + qemu_vfree(head_buf); + qemu_vfree(tail_buf); + + return ret; +} + +static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { + return -EINVAL; + } + + return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS, + nb_sectors << BDRV_SECTOR_BITS, qiov, flags); +} + +int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + trace_bdrv_co_writev(bs, sector_num, nb_sectors); + + return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0); +} + +int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + BdrvRequestFlags flags) +{ + int ret; + + trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags); + + if (!(bs->open_flags & BDRV_O_UNMAP)) { + flags &= ~BDRV_REQ_MAY_UNMAP; + } + if (bdrv_req_is_aligned(bs, sector_num << BDRV_SECTOR_BITS, + nb_sectors << BDRV_SECTOR_BITS)) { + ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL, + BDRV_REQ_ZERO_WRITE | flags); + } else { + uint8_t *buf; + QEMUIOVector local_qiov; + size_t bytes = nb_sectors << BDRV_SECTOR_BITS; + + buf = qemu_memalign(bdrv_opt_mem_align(bs), bytes); + memset(buf, 0, bytes); + qemu_iovec_init(&local_qiov, 1); + qemu_iovec_add(&local_qiov, buf, bytes); + + ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, &local_qiov, + BDRV_REQ_ZERO_WRITE | flags); + qemu_vfree(buf); + } + return ret; +} + +int bdrv_flush_all(void) +{ + BlockDriverState *bs = NULL; + int result = 0; + + while ((bs = bdrv_next(bs))) { + AioContext *aio_context = bdrv_get_aio_context(bs); + int ret; + + aio_context_acquire(aio_context); + ret = bdrv_flush(bs); + if (ret < 0 && !result) { + result = ret; + } + aio_context_release(aio_context); + } + + return result; +} + +typedef struct BdrvCoGetBlockStatusData { + BlockDriverState *bs; + BlockDriverState *base; + int64_t sector_num; + int nb_sectors; + int *pnum; + int64_t ret; + bool done; +} BdrvCoGetBlockStatusData; + +/* + * Returns the allocation status of the specified sectors. + * Drivers not implementing the functionality are assumed to not support + * backing files, hence all their sectors are reported as allocated. + * + * If 'sector_num' is beyond the end of the disk image the return value is 0 + * and 'pnum' is set to 0. + * + * 'pnum' is set to the number of sectors (including and immediately following + * the specified sector) that are known to be in the same + * allocated/unallocated state. + * + * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes + * beyond the end of the disk image it will be clamped. 
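 *
 * The return value is a combination of BDRV_BLOCK_* flags and, when
 * BDRV_BLOCK_OFFSET_VALID is set, a host byte offset.  As an
 * illustration, the fallback below for drivers that do not implement
 * bdrv_co_get_block_status reports BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED,
 * ORs in BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE) for
 * protocol drivers, and sets *pnum = nb_sectors.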
+ */ +static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, + int64_t sector_num, + int nb_sectors, int *pnum) +{ + int64_t total_sectors; + int64_t n; + int64_t ret, ret2; + + total_sectors = bdrv_nb_sectors(bs); + if (total_sectors < 0) { + return total_sectors; + } + + if (sector_num >= total_sectors) { + *pnum = 0; + return 0; + } + + n = total_sectors - sector_num; + if (n < nb_sectors) { + nb_sectors = n; + } + + if (!bs->drv->bdrv_co_get_block_status) { + *pnum = nb_sectors; + ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED; + if (bs->drv->protocol_name) { + ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE); + } + return ret; + } + + ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum); + if (ret < 0) { + *pnum = 0; + return ret; + } + + if (ret & BDRV_BLOCK_RAW) { + assert(ret & BDRV_BLOCK_OFFSET_VALID); + return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS, + *pnum, pnum); + } + + if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { + ret |= BDRV_BLOCK_ALLOCATED; + } + + if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) { + if (bdrv_unallocated_blocks_are_zero(bs)) { + ret |= BDRV_BLOCK_ZERO; + } else if (bs->backing_hd) { + BlockDriverState *bs2 = bs->backing_hd; + int64_t nb_sectors2 = bdrv_nb_sectors(bs2); + if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) { + ret |= BDRV_BLOCK_ZERO; + } + } + } + + if (bs->file && + (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && + (ret & BDRV_BLOCK_OFFSET_VALID)) { + int file_pnum; + + ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS, + *pnum, &file_pnum); + if (ret2 >= 0) { + /* Ignore errors. This is just providing extra information, it + * is useful but not necessary. + */ + if (!file_pnum) { + /* !file_pnum indicates an offset at or beyond the EOF; it is + * perfectly valid for the format block driver to point to such + * offsets, so catch it and mark everything as zero */ + ret |= BDRV_BLOCK_ZERO; + } else { + /* Limit request to the range reported by the protocol driver */ + *pnum = file_pnum; + ret |= (ret2 & BDRV_BLOCK_ZERO); + } + } + } + + return ret; +} + +/* Coroutine wrapper for bdrv_get_block_status() */ +static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque) +{ + BdrvCoGetBlockStatusData *data = opaque; + BlockDriverState *bs = data->bs; + + data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors, + data->pnum); + data->done = true; +} + +/* + * Synchronous wrapper around bdrv_co_get_block_status(). + * + * See bdrv_co_get_block_status() for details. + */ +int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, int *pnum) +{ + Coroutine *co; + BdrvCoGetBlockStatusData data = { + .bs = bs, + .sector_num = sector_num, + .nb_sectors = nb_sectors, + .pnum = pnum, + .done = false, + }; + + if (qemu_in_coroutine()) { + /* Fast-path if already in coroutine context */ + bdrv_get_block_status_co_entry(&data); + } else { + AioContext *aio_context = bdrv_get_aio_context(bs); + + co = qemu_coroutine_create(bdrv_get_block_status_co_entry); + qemu_coroutine_enter(co, &data); + while (!data.done) { + aio_poll(aio_context, true); + } + } + return data.ret; +} + +int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, int *pnum) +{ + int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum); + if (ret < 0) { + return ret; + } + return !!(ret & BDRV_BLOCK_ALLOCATED); +} + +/* + * Given an image chain: ... 
-> [BASE] -> [INTER1] -> [INTER2] -> [TOP] + * + * Return true if the given sector is allocated in any image between + * BASE and TOP (inclusive). BASE can be NULL to check if the given + * sector is allocated in any image of the chain. Return false otherwise. + * + * 'pnum' is set to the number of sectors (including and immediately following + * the specified sector) that are known to be in the same + * allocated/unallocated state. + * + */ +int bdrv_is_allocated_above(BlockDriverState *top, + BlockDriverState *base, + int64_t sector_num, + int nb_sectors, int *pnum) +{ + BlockDriverState *intermediate; + int ret, n = nb_sectors; + + intermediate = top; + while (intermediate && intermediate != base) { + int pnum_inter; + ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors, + &pnum_inter); + if (ret < 0) { + return ret; + } else if (ret) { + *pnum = pnum_inter; + return 1; + } + + /* + * [sector_num, nb_sectors] is unallocated on top but intermediate + * might have + * + * [sector_num+x, nr_sectors] allocated. + */ + if (n > pnum_inter && + (intermediate == top || + sector_num + pnum_inter < intermediate->total_sectors)) { + n = pnum_inter; + } + + intermediate = intermediate->backing_hd; + } + + *pnum = n; + return 0; +} + +int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + BlockDriver *drv = bs->drv; + int ret; + + if (!drv) { + return -ENOMEDIUM; + } + if (!drv->bdrv_write_compressed) { + return -ENOTSUP; + } + ret = bdrv_check_request(bs, sector_num, nb_sectors); + if (ret < 0) { + return ret; + } + + assert(QLIST_EMPTY(&bs->dirty_bitmaps)); + + return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors); +} + +int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, + int64_t pos, int size) +{ + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = (void *) buf, + .iov_len = size, + }; + + qemu_iovec_init_external(&qiov, &iov, 1); + return bdrv_writev_vmstate(bs, &qiov, pos); +} + +int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) +{ + BlockDriver *drv = bs->drv; + + if (!drv) { + return -ENOMEDIUM; + } else if (drv->bdrv_save_vmstate) { + return drv->bdrv_save_vmstate(bs, qiov, pos); + } else if (bs->file) { + return bdrv_writev_vmstate(bs->file, qiov, pos); + } + + return -ENOTSUP; +} + +int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, + int64_t pos, int size) +{ + BlockDriver *drv = bs->drv; + if (!drv) + return -ENOMEDIUM; + if (drv->bdrv_load_vmstate) + return drv->bdrv_load_vmstate(bs, buf, pos, size); + if (bs->file) + return bdrv_load_vmstate(bs->file, buf, pos, size); + return -ENOTSUP; +} + +/**************************************************************/ +/* async I/Os */ + +BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, + QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque); + + return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, + cb, opaque, false); +} + +BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, + QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque); + + return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, + cb, opaque, true); +} + +BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, BdrvRequestFlags flags, + BlockCompletionFunc *cb, void *opaque) +{ + 
trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque); + + return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors, + BDRV_REQ_ZERO_WRITE | flags, + cb, opaque, true); +} + + +typedef struct MultiwriteCB { + int error; + int num_requests; + int num_callbacks; + struct { + BlockCompletionFunc *cb; + void *opaque; + QEMUIOVector *free_qiov; + } callbacks[]; +} MultiwriteCB; + +static void multiwrite_user_cb(MultiwriteCB *mcb) +{ + int i; + + for (i = 0; i < mcb->num_callbacks; i++) { + mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error); + if (mcb->callbacks[i].free_qiov) { + qemu_iovec_destroy(mcb->callbacks[i].free_qiov); + } + g_free(mcb->callbacks[i].free_qiov); + } +} + +static void multiwrite_cb(void *opaque, int ret) +{ + MultiwriteCB *mcb = opaque; + + trace_multiwrite_cb(mcb, ret); + + if (ret < 0 && !mcb->error) { + mcb->error = ret; + } + + mcb->num_requests--; + if (mcb->num_requests == 0) { + multiwrite_user_cb(mcb); + g_free(mcb); + } +} + +static int multiwrite_req_compare(const void *a, const void *b) +{ + const BlockRequest *req1 = a, *req2 = b; + + /* + * Note that we can't simply subtract req2->sector from req1->sector + * here as that could overflow the return value. + */ + if (req1->sector > req2->sector) { + return 1; + } else if (req1->sector < req2->sector) { + return -1; + } else { + return 0; + } +} + +/* + * Takes a bunch of requests and tries to merge them. Returns the number of + * requests that remain after merging. + */ +static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs, + int num_reqs, MultiwriteCB *mcb) +{ + int i, outidx; + + // Sort requests by start sector + qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare); + + // Check if adjacent requests touch the same clusters. If so, combine them, + // filling up gaps with zero sectors. + outidx = 0; + for (i = 1; i < num_reqs; i++) { + int merge = 0; + int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors; + + // Handle exactly sequential writes and overlapping writes. + if (reqs[i].sector <= oldreq_last) { + merge = 1; + } + + if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) { + merge = 0; + } + + if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors + + reqs[i].nb_sectors > bs->bl.max_transfer_length) { + merge = 0; + } + + if (merge) { + size_t size; + QEMUIOVector *qiov = g_malloc0(sizeof(*qiov)); + qemu_iovec_init(qiov, + reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1); + + // Add the first request to the merged one. If the requests are + // overlapping, drop the last sectors of the first request. 
+ size = (reqs[i].sector - reqs[outidx].sector) << 9; + qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size); + + // We should not need to add any zeros between the two requests + assert (reqs[i].sector <= oldreq_last); + + // Add the second request + qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size); + + // Add tail of first request, if necessary + if (qiov->size < reqs[outidx].qiov->size) { + qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size, + reqs[outidx].qiov->size - qiov->size); + } + + reqs[outidx].nb_sectors = qiov->size >> 9; + reqs[outidx].qiov = qiov; + + mcb->callbacks[i].free_qiov = reqs[outidx].qiov; + } else { + outidx++; + reqs[outidx].sector = reqs[i].sector; + reqs[outidx].nb_sectors = reqs[i].nb_sectors; + reqs[outidx].qiov = reqs[i].qiov; + } + } + + block_acct_merge_done(&bs->stats, BLOCK_ACCT_WRITE, num_reqs - outidx - 1); + + return outidx + 1; +} + +/* + * Submit multiple AIO write requests at once. + * + * On success, the function returns 0 and all requests in the reqs array have + * been submitted. In the error case this function returns -1, and any of the + * requests may or may not be submitted yet. In particular, this means that the + * callback will be called for some of the requests, for others it won't. The + * caller must check the error field of the BlockRequest to wait for the right + * callbacks (if error != 0, no callback will be called). + * + * The implementation may modify the contents of the reqs array, e.g. to merge + * requests. However, the fields opaque and error are left unmodified as they + * are used to signal failure for a single request to the caller. + */ +int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs) +{ + MultiwriteCB *mcb; + int i; + + /* don't submit writes if we don't have a medium */ + if (bs->drv == NULL) { + for (i = 0; i < num_reqs; i++) { + reqs[i].error = -ENOMEDIUM; + } + return -1; + } + + if (num_reqs == 0) { + return 0; + } + + // Create MultiwriteCB structure + mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks)); + mcb->num_requests = 0; + mcb->num_callbacks = num_reqs; + + for (i = 0; i < num_reqs; i++) { + mcb->callbacks[i].cb = reqs[i].cb; + mcb->callbacks[i].opaque = reqs[i].opaque; + } + + // Check for mergeable requests + num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb); + + trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs); + + /* Run the aio requests. */ + mcb->num_requests = num_reqs; + for (i = 0; i < num_reqs; i++) { + bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov, + reqs[i].nb_sectors, reqs[i].flags, + multiwrite_cb, mcb, + true); + } + + return 0; +} + +void bdrv_aio_cancel(BlockAIOCB *acb) +{ + qemu_aio_ref(acb); + bdrv_aio_cancel_async(acb); + while (acb->refcnt > 1) { + if (acb->aiocb_info->get_aio_context) { + aio_poll(acb->aiocb_info->get_aio_context(acb), true); + } else if (acb->bs) { + aio_poll(bdrv_get_aio_context(acb->bs), true); + } else { + abort(); + } + } + qemu_aio_unref(acb); +} + +/* Async version of aio cancel. The caller is not blocked if the acb implements + * cancel_async, otherwise we do nothing and let the request complete normally. + * In either case the completion callback must be called.
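 * A blocking caller composes this primitive with aio_poll(), which is
 * essentially what bdrv_aio_cancel() above does (simplified sketch that
 * ignores the aiocb_info->get_aio_context case):
 *
 *     qemu_aio_ref(acb);
 *     bdrv_aio_cancel_async(acb);
 *     while (acb->refcnt > 1) {
 *         aio_poll(bdrv_get_aio_context(acb->bs), true);
 *     }
 *     qemu_aio_unref(acb);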
*/ +void bdrv_aio_cancel_async(BlockAIOCB *acb) +{ + if (acb->aiocb_info->cancel_async) { + acb->aiocb_info->cancel_async(acb); + } +} + +/**************************************************************/ +/* async block device emulation */ + +typedef struct BlockAIOCBSync { + BlockAIOCB common; + QEMUBH *bh; + int ret; + /* vector translation state */ + QEMUIOVector *qiov; + uint8_t *bounce; + int is_write; +} BlockAIOCBSync; + +static const AIOCBInfo bdrv_em_aiocb_info = { + .aiocb_size = sizeof(BlockAIOCBSync), +}; + +static void bdrv_aio_bh_cb(void *opaque) +{ + BlockAIOCBSync *acb = opaque; + + if (!acb->is_write && acb->ret >= 0) { + qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size); + } + qemu_vfree(acb->bounce); + acb->common.cb(acb->common.opaque, acb->ret); + qemu_bh_delete(acb->bh); + acb->bh = NULL; + qemu_aio_unref(acb); +} + +static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, + int nb_sectors, + BlockCompletionFunc *cb, + void *opaque, + int is_write) + +{ + BlockAIOCBSync *acb; + + acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque); + acb->is_write = is_write; + acb->qiov = qiov; + acb->bounce = qemu_try_blockalign(bs, qiov->size); + acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb); + + if (acb->bounce == NULL) { + acb->ret = -ENOMEM; + } else if (is_write) { + qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size); + acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors); + } else { + acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors); + } + + qemu_bh_schedule(acb->bh); + + return &acb->common; +} + +static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); +} + +static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1); +} + + +typedef struct BlockAIOCBCoroutine { + BlockAIOCB common; + BlockRequest req; + bool is_write; + bool need_bh; + bool *done; + QEMUBH* bh; +} BlockAIOCBCoroutine; + +static const AIOCBInfo bdrv_em_co_aiocb_info = { + .aiocb_size = sizeof(BlockAIOCBCoroutine), +}; + +static void bdrv_co_complete(BlockAIOCBCoroutine *acb) +{ + if (!acb->need_bh) { + acb->common.cb(acb->common.opaque, acb->req.error); + qemu_aio_unref(acb); + } +} + +static void bdrv_co_em_bh(void *opaque) +{ + BlockAIOCBCoroutine *acb = opaque; + + assert(!acb->need_bh); + qemu_bh_delete(acb->bh); + bdrv_co_complete(acb); +} + +static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb) +{ + acb->need_bh = false; + if (acb->req.error != -EINPROGRESS) { + BlockDriverState *bs = acb->common.bs; + + acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb); + qemu_bh_schedule(acb->bh); + } +} + +/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */ +static void coroutine_fn bdrv_co_do_rw(void *opaque) +{ + BlockAIOCBCoroutine *acb = opaque; + BlockDriverState *bs = acb->common.bs; + + if (!acb->is_write) { + acb->req.error = bdrv_co_do_readv(bs, acb->req.sector, + acb->req.nb_sectors, acb->req.qiov, acb->req.flags); + } else { + acb->req.error = bdrv_co_do_writev(bs, acb->req.sector, + acb->req.nb_sectors, acb->req.qiov, acb->req.flags); + } + + bdrv_co_complete(acb); +} + +static BlockAIOCB 
*bdrv_co_aio_rw_vector(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, + int nb_sectors, + BdrvRequestFlags flags, + BlockCompletionFunc *cb, + void *opaque, + bool is_write) +{ + Coroutine *co; + BlockAIOCBCoroutine *acb; + + acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); + acb->need_bh = true; + acb->req.error = -EINPROGRESS; + acb->req.sector = sector_num; + acb->req.nb_sectors = nb_sectors; + acb->req.qiov = qiov; + acb->req.flags = flags; + acb->is_write = is_write; + + co = qemu_coroutine_create(bdrv_co_do_rw); + qemu_coroutine_enter(co, acb); + + bdrv_co_maybe_schedule_bh(acb); + return &acb->common; +} + +static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque) +{ + BlockAIOCBCoroutine *acb = opaque; + BlockDriverState *bs = acb->common.bs; + + acb->req.error = bdrv_co_flush(bs); + bdrv_co_complete(acb); +} + +BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs, + BlockCompletionFunc *cb, void *opaque) +{ + trace_bdrv_aio_flush(bs, opaque); + + Coroutine *co; + BlockAIOCBCoroutine *acb; + + acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); + acb->need_bh = true; + acb->req.error = -EINPROGRESS; + + co = qemu_coroutine_create(bdrv_aio_flush_co_entry); + qemu_coroutine_enter(co, acb); + + bdrv_co_maybe_schedule_bh(acb); + return &acb->common; +} + +static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque) +{ + BlockAIOCBCoroutine *acb = opaque; + BlockDriverState *bs = acb->common.bs; + + acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors); + bdrv_co_complete(acb); +} + +BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + Coroutine *co; + BlockAIOCBCoroutine *acb; + + trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque); + + acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); + acb->need_bh = true; + acb->req.error = -EINPROGRESS; + acb->req.sector = sector_num; + acb->req.nb_sectors = nb_sectors; + co = qemu_coroutine_create(bdrv_aio_discard_co_entry); + qemu_coroutine_enter(co, acb); + + bdrv_co_maybe_schedule_bh(acb); + return &acb->common; +} + +void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs, + BlockCompletionFunc *cb, void *opaque) +{ + BlockAIOCB *acb; + + acb = g_slice_alloc(aiocb_info->aiocb_size); + acb->aiocb_info = aiocb_info; + acb->bs = bs; + acb->cb = cb; + acb->opaque = opaque; + acb->refcnt = 1; + return acb; +} + +void qemu_aio_ref(void *p) +{ + BlockAIOCB *acb = p; + acb->refcnt++; +} + +void qemu_aio_unref(void *p) +{ + BlockAIOCB *acb = p; + assert(acb->refcnt > 0); + if (--acb->refcnt == 0) { + g_slice_free1(acb->aiocb_info->aiocb_size, acb); + } +} + +/**************************************************************/ +/* Coroutine block device emulation */ + +typedef struct CoroutineIOCompletion { + Coroutine *coroutine; + int ret; +} CoroutineIOCompletion; + +static void bdrv_co_io_em_complete(void *opaque, int ret) +{ + CoroutineIOCompletion *co = opaque; + + co->ret = ret; + qemu_coroutine_enter(co->coroutine, NULL); +} + +static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *iov, + bool is_write) +{ + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + BlockAIOCB *acb; + + if (is_write) { + acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors, + bdrv_co_io_em_complete, &co); + } else { + acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors, + bdrv_co_io_em_complete, &co); + 
} + + trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb); + if (!acb) { + return -EIO; + } + qemu_coroutine_yield(); + + return co.ret; +} + +static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov) +{ + return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false); +} + +static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov) +{ + return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true); +} + +static void coroutine_fn bdrv_flush_co_entry(void *opaque) +{ + RwCo *rwco = opaque; + + rwco->ret = bdrv_co_flush(rwco->bs); +} + +int coroutine_fn bdrv_co_flush(BlockDriverState *bs) +{ + int ret; + + if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) { + return 0; + } + + /* Write back cached data to the OS even with cache=unsafe */ + BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS); + if (bs->drv->bdrv_co_flush_to_os) { + ret = bs->drv->bdrv_co_flush_to_os(bs); + if (ret < 0) { + return ret; + } + } + + /* But don't actually force it to the disk with cache=unsafe */ + if (bs->open_flags & BDRV_O_NO_FLUSH) { + goto flush_parent; + } + + BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK); + if (bs->drv->bdrv_co_flush_to_disk) { + ret = bs->drv->bdrv_co_flush_to_disk(bs); + } else if (bs->drv->bdrv_aio_flush) { + BlockAIOCB *acb; + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + + acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co); + if (acb == NULL) { + ret = -EIO; + } else { + qemu_coroutine_yield(); + ret = co.ret; + } + } else { + /* + * Some block drivers always operate in either writethrough or unsafe + * mode and don't support bdrv_flush therefore. Usually qemu doesn't + * know how the server works (because the behaviour is hardcoded or + * depends on server-side configuration), so we can't ensure that + * everything is safe on disk. Returning an error doesn't work because + * that would break guests even if the server operates in writethrough + * mode. + * + * Let's hope the user knows what he's doing. + */ + ret = 0; + } + if (ret < 0) { + return ret; + } + + /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH + * in the case of cache=unsafe, so there are no useless flushes. + */ +flush_parent: + return bdrv_co_flush(bs->file); +} + +int bdrv_flush(BlockDriverState *bs) +{ + Coroutine *co; + RwCo rwco = { + .bs = bs, + .ret = NOT_DONE, + }; + + if (qemu_in_coroutine()) { + /* Fast-path if already in coroutine context */ + bdrv_flush_co_entry(&rwco); + } else { + AioContext *aio_context = bdrv_get_aio_context(bs); + + co = qemu_coroutine_create(bdrv_flush_co_entry); + qemu_coroutine_enter(co, &rwco); + while (rwco.ret == NOT_DONE) { + aio_poll(aio_context, true); + } + } + + return rwco.ret; +} + +typedef struct DiscardCo { + BlockDriverState *bs; + int64_t sector_num; + int nb_sectors; + int ret; +} DiscardCo; +static void coroutine_fn bdrv_discard_co_entry(void *opaque) +{ + DiscardCo *rwco = opaque; + + rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors); +} + +int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, + int nb_sectors) +{ + int max_discard, ret; + + if (!bs->drv) { + return -ENOMEDIUM; + } + + ret = bdrv_check_request(bs, sector_num, nb_sectors); + if (ret < 0) { + return ret; + } else if (bs->read_only) { + return -EROFS; + } + + bdrv_reset_dirty(bs, sector_num, nb_sectors); + + /* Do nothing if disabled. 
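 * (that is, the image was opened without BDRV_O_UNMAP, which is
 * typically the case unless discard=unmap was requested)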
*/ + if (!(bs->open_flags & BDRV_O_UNMAP)) { + return 0; + } + + if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) { + return 0; + } + + max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS); + while (nb_sectors > 0) { + int ret; + int num = nb_sectors; + + /* align request */ + if (bs->bl.discard_alignment && + num >= bs->bl.discard_alignment && + sector_num % bs->bl.discard_alignment) { + if (num > bs->bl.discard_alignment) { + num = bs->bl.discard_alignment; + } + num -= sector_num % bs->bl.discard_alignment; + } + + /* limit request size */ + if (num > max_discard) { + num = max_discard; + } + + if (bs->drv->bdrv_co_discard) { + ret = bs->drv->bdrv_co_discard(bs, sector_num, num); + } else { + BlockAIOCB *acb; + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + + acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors, + bdrv_co_io_em_complete, &co); + if (acb == NULL) { + return -EIO; + } else { + qemu_coroutine_yield(); + ret = co.ret; + } + } + if (ret && ret != -ENOTSUP) { + return ret; + } + + sector_num += num; + nb_sectors -= num; + } + return 0; +} + +int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) +{ + Coroutine *co; + DiscardCo rwco = { + .bs = bs, + .sector_num = sector_num, + .nb_sectors = nb_sectors, + .ret = NOT_DONE, + }; + + if (qemu_in_coroutine()) { + /* Fast-path if already in coroutine context */ + bdrv_discard_co_entry(&rwco); + } else { + AioContext *aio_context = bdrv_get_aio_context(bs); + + co = qemu_coroutine_create(bdrv_discard_co_entry); + qemu_coroutine_enter(co, &rwco); + while (rwco.ret == NOT_DONE) { + aio_poll(aio_context, true); + } + } + + return rwco.ret; +} + +/* needed for generic scsi interface */ + +int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) +{ + BlockDriver *drv = bs->drv; + + if (drv && drv->bdrv_ioctl) + return drv->bdrv_ioctl(bs, req, buf); + return -ENOTSUP; +} + +BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs, + unsigned long int req, void *buf, + BlockCompletionFunc *cb, void *opaque) +{ + BlockDriver *drv = bs->drv; + + if (drv && drv->bdrv_aio_ioctl) + return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque); + return NULL; +} + +void *qemu_blockalign(BlockDriverState *bs, size_t size) +{ + return qemu_memalign(bdrv_opt_mem_align(bs), size); +} + +void *qemu_blockalign0(BlockDriverState *bs, size_t size) +{ + return memset(qemu_blockalign(bs, size), 0, size); +} + +void *qemu_try_blockalign(BlockDriverState *bs, size_t size) +{ + size_t align = bdrv_opt_mem_align(bs); + + /* Ensure that NULL is never returned on success */ + assert(align > 0); + if (size == 0) { + size = align; + } + + return qemu_try_memalign(align, size); +} + +void *qemu_try_blockalign0(BlockDriverState *bs, size_t size) +{ + void *mem = qemu_try_blockalign(bs, size); + + if (mem) { + memset(mem, 0, size); + } + + return mem; +} + +/* + * Check if all memory in this vector is sector aligned. 
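 * More precisely, the base address and length of every iovec element are
 * checked against bdrv_opt_mem_align(bs), which may be larger than a
 * sector (e.g. for O_DIRECT-backed files), as the loop below shows.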
+ */ +bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov) +{ + int i; + size_t alignment = bdrv_opt_mem_align(bs); + + for (i = 0; i < qiov->niov; i++) { + if ((uintptr_t) qiov->iov[i].iov_base % alignment) { + return false; + } + if (qiov->iov[i].iov_len % alignment) { + return false; + } + } + + return true; +} + +void bdrv_add_before_write_notifier(BlockDriverState *bs, + NotifierWithReturn *notifier) +{ + notifier_with_return_list_add(&bs->before_write_notifiers, notifier); +} + +void bdrv_io_plug(BlockDriverState *bs) +{ + BlockDriver *drv = bs->drv; + if (drv && drv->bdrv_io_plug) { + drv->bdrv_io_plug(bs); + } else if (bs->file) { + bdrv_io_plug(bs->file); + } +} + +void bdrv_io_unplug(BlockDriverState *bs) +{ + BlockDriver *drv = bs->drv; + if (drv && drv->bdrv_io_unplug) { + drv->bdrv_io_unplug(bs); + } else if (bs->file) { + bdrv_io_unplug(bs->file); + } +} + +void bdrv_flush_io_queue(BlockDriverState *bs) +{ + BlockDriver *drv = bs->drv; + if (drv && drv->bdrv_flush_io_queue) { + drv->bdrv_flush_io_queue(bs); + } else if (bs->file) { + bdrv_flush_io_queue(bs->file); + } +} diff --git a/block/iscsi.c b/block/iscsi.c index ba33290000..8fca1d32cb 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -2,7 +2,7 @@ * QEMU Block driver for iSCSI images * * Copyright (c) 2010-2011 Ronnie Sahlberg <ronniesahlberg@gmail.com> - * Copyright (c) 2012-2014 Peter Lieven <pl@kamp.de> + * Copyright (c) 2012-2015 Peter Lieven <pl@kamp.de> * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -57,9 +57,6 @@ typedef struct IscsiLun { int events; QEMUTimer *nop_timer; QEMUTimer *event_timer; - uint8_t lbpme; - uint8_t lbprz; - uint8_t has_write_same; struct scsi_inquiry_logical_block_provisioning lbp; struct scsi_inquiry_block_limits bl; unsigned char *zeroblock; @@ -67,6 +64,11 @@ typedef struct IscsiLun { int cluster_sectors; bool use_16_for_rw; bool write_protected; + bool lbpme; + bool lbprz; + bool dpofua; + bool has_write_same; + bool force_next_flush; } IscsiLun; typedef struct IscsiTask { @@ -79,6 +81,7 @@ typedef struct IscsiTask { QEMUBH *bh; IscsiLun *iscsilun; QEMUTimer retry_timer; + bool force_next_flush; } IscsiTask; typedef struct IscsiAIOCB { @@ -100,7 +103,7 @@ typedef struct IscsiAIOCB { #define NOP_INTERVAL 5000 #define MAX_NOP_FAILURES 3 #define ISCSI_CMD_RETRIES ARRAY_SIZE(iscsi_retry_times) -static const unsigned iscsi_retry_times[] = {8, 32, 128, 512, 2048}; +static const unsigned iscsi_retry_times[] = {8, 32, 128, 512, 2048, 8192, 32768}; /* this threshold is a trade-off knob to choose between * the potential additional overhead of an extra GET_LBA_STATUS request @@ -183,10 +186,13 @@ iscsi_co_generic_cb(struct iscsi_context *iscsi, int status, iTask->do_retry = 1; goto out; } - if (status == SCSI_STATUS_BUSY) { + /* status 0x28 is SCSI_TASK_SET_FULL. It was first introduced + * in libiscsi 1.10.0. 
Hardcode this value here to avoid + * the need to bump the libiscsi requirement to 1.10.0 */ + if (status == SCSI_STATUS_BUSY || status == 0x28) { unsigned retry_time = exp_random(iscsi_retry_times[iTask->retries - 1]); - error_report("iSCSI Busy (retry #%u in %u ms): %s", + error_report("iSCSI Busy/TaskSetFull (retry #%u in %u ms): %s", iTask->retries, retry_time, iscsi_get_error(iscsi)); aio_timer_init(iTask->iscsilun->aio_context, @@ -199,6 +205,8 @@ iscsi_co_generic_cb(struct iscsi_context *iscsi, int status, } } error_report("iSCSI Failure: %s", iscsi_get_error(iscsi)); + } else { + iTask->iscsilun->force_next_flush |= iTask->force_next_flush; } out: @@ -369,6 +377,7 @@ static int coroutine_fn iscsi_co_writev(BlockDriverState *bs, struct IscsiTask iTask; uint64_t lba; uint32_t num_sectors; + int fua; if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { return -EINVAL; @@ -384,15 +393,17 @@ static int coroutine_fn iscsi_co_writev(BlockDriverState *bs, num_sectors = sector_qemu2lun(nb_sectors, iscsilun); iscsi_co_init_iscsitask(iscsilun, &iTask); retry: + fua = iscsilun->dpofua && !bs->enable_write_cache; + iTask.force_next_flush = !fua; if (iscsilun->use_16_for_rw) { iTask.task = iscsi_write16_task(iscsilun->iscsi, iscsilun->lun, lba, NULL, num_sectors * iscsilun->block_size, - iscsilun->block_size, 0, 0, 0, 0, 0, + iscsilun->block_size, 0, 0, fua, 0, 0, iscsi_co_generic_cb, &iTask); } else { iTask.task = iscsi_write10_task(iscsilun->iscsi, iscsilun->lun, lba, NULL, num_sectors * iscsilun->block_size, - iscsilun->block_size, 0, 0, 0, 0, 0, + iscsilun->block_size, 0, 0, fua, 0, 0, iscsi_co_generic_cb, &iTask); } if (iTask.task == NULL) { @@ -460,7 +471,7 @@ static int64_t coroutine_fn iscsi_co_get_block_status(BlockDriverState *bs, *pnum = nb_sectors; /* LUN does not support logical block provisioning */ - if (iscsilun->lbpme == 0) { + if (!iscsilun->lbpme) { goto out; } @@ -620,8 +631,12 @@ static int coroutine_fn iscsi_co_flush(BlockDriverState *bs) return 0; } - iscsi_co_init_iscsitask(iscsilun, &iTask); + if (!iscsilun->force_next_flush) { + return 0; + } + iscsilun->force_next_flush = false; + iscsi_co_init_iscsitask(iscsilun, &iTask); retry: if (iscsi_synchronizecache10_task(iscsilun->iscsi, iscsilun->lun, 0, 0, 0, 0, iscsi_co_generic_cb, &iTask) == NULL) { @@ -917,6 +932,7 @@ coroutine_fn iscsi_co_write_zeroes(BlockDriverState *bs, int64_t sector_num, } iscsi_co_init_iscsitask(iscsilun, &iTask); + iTask.force_next_flush = true; retry: if (use_16_for_ws) { iTask.task = iscsi_writesame16_task(iscsilun->iscsi, iscsilun->lun, lba, @@ -1121,8 +1137,8 @@ static void iscsi_readcapacity_sync(IscsiLun *iscsilun, Error **errp) } else { iscsilun->block_size = rc16->block_length; iscsilun->num_blocks = rc16->returned_lba + 1; - iscsilun->lbpme = rc16->lbpme; - iscsilun->lbprz = rc16->lbprz; + iscsilun->lbpme = !!rc16->lbpme; + iscsilun->lbprz = !!rc16->lbprz; iscsilun->use_16_for_rw = (rc16->returned_lba > 0xffffffff); } } @@ -1253,11 +1269,12 @@ static void iscsi_attach_aio_context(BlockDriverState *bs, iscsi_timed_set_events, iscsilun); } -static bool iscsi_is_write_protected(IscsiLun *iscsilun) +static void iscsi_modesense_sync(IscsiLun *iscsilun) { struct scsi_task *task; struct scsi_mode_sense *ms = NULL; - bool wrprotected = false; + iscsilun->write_protected = false; + iscsilun->dpofua = false; task = iscsi_modesense6_sync(iscsilun->iscsi, iscsilun->lun, 1, SCSI_MODESENSE_PC_CURRENT, @@ -1278,13 +1295,13 @@ static bool iscsi_is_write_protected(IscsiLun *iscsilun) 
iscsi_get_error(iscsilun->iscsi)); goto out; } - wrprotected = ms->device_specific_parameter & 0x80; + iscsilun->write_protected = ms->device_specific_parameter & 0x80; + iscsilun->dpofua = ms->device_specific_parameter & 0x10; out: if (task) { scsi_free_scsi_task(task); } - return wrprotected; } /* @@ -1403,7 +1420,8 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags, scsi_free_scsi_task(task); task = NULL; - iscsilun->write_protected = iscsi_is_write_protected(iscsilun); + iscsi_modesense_sync(iscsilun); + /* Check the write protect flag of the LUN if we want to write */ if (iscsilun->type == TYPE_DISK && (flags & BDRV_O_RDWR) && iscsilun->write_protected) { @@ -1481,7 +1499,7 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags, iscsilun->bl.opt_unmap_gran * iscsilun->block_size <= 16 * 1024 * 1024) { iscsilun->cluster_sectors = (iscsilun->bl.opt_unmap_gran * iscsilun->block_size) >> BDRV_SECTOR_BITS; - if (iscsilun->lbprz && !(bs->open_flags & BDRV_O_NOCACHE)) { + if (iscsilun->lbprz) { iscsilun->allocationmap = iscsi_allocationmap_init(iscsilun); if (iscsilun->allocationmap == NULL) { ret = -ENOMEM; @@ -1501,6 +1519,9 @@ out: if (ret) { if (iscsi != NULL) { + if (iscsi_is_logged_in(iscsi)) { + iscsi_logout_sync(iscsi); + } iscsi_destroy_context(iscsi); } memset(iscsilun, 0, sizeof(IscsiLun)); @@ -1514,6 +1535,9 @@ static void iscsi_close(BlockDriverState *bs) struct iscsi_context *iscsi = iscsilun->iscsi; iscsi_detach_aio_context(bs); + if (iscsi_is_logged_in(iscsi)) { + iscsi_logout_sync(iscsi); + } iscsi_destroy_context(iscsi); g_free(iscsilun->zeroblock); g_free(iscsilun->allocationmap); @@ -1649,7 +1673,7 @@ out: static int iscsi_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) { IscsiLun *iscsilun = bs->opaque; - bdi->unallocated_blocks_are_zero = !!iscsilun->lbprz; + bdi->unallocated_blocks_are_zero = iscsilun->lbprz; bdi->can_write_zeroes_with_unmap = iscsilun->lbprz && iscsilun->lbp.lbpws; bdi->cluster_size = iscsilun->cluster_sectors * BDRV_SECTOR_SIZE; return 0; diff --git a/block/mirror.c b/block/mirror.c index 405616422b..58f391a6d6 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -125,11 +125,9 @@ static void mirror_write_complete(void *opaque, int ret) MirrorOp *op = opaque; MirrorBlockJob *s = op->s; if (ret < 0) { - BlockDriverState *source = s->common.bs; BlockErrorAction action; - bdrv_set_dirty_bitmap(source, s->dirty_bitmap, op->sector_num, - op->nb_sectors); + bdrv_set_dirty_bitmap(s->dirty_bitmap, op->sector_num, op->nb_sectors); action = mirror_error_action(s, false, -ret); if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) { s->ret = ret; @@ -143,11 +141,9 @@ static void mirror_read_complete(void *opaque, int ret) MirrorOp *op = opaque; MirrorBlockJob *s = op->s; if (ret < 0) { - BlockDriverState *source = s->common.bs; BlockErrorAction action; - bdrv_set_dirty_bitmap(source, s->dirty_bitmap, op->sector_num, - op->nb_sectors); + bdrv_set_dirty_bitmap(s->dirty_bitmap, op->sector_num, op->nb_sectors); action = mirror_error_action(s, true, -ret); if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) { s->ret = ret; @@ -170,10 +166,9 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) s->sector_num = hbitmap_iter_next(&s->hbi); if (s->sector_num < 0) { - bdrv_dirty_iter_init(source, s->dirty_bitmap, &s->hbi); + bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi); s->sector_num = hbitmap_iter_next(&s->hbi); - trace_mirror_restart_iter(s, - bdrv_get_dirty_count(source, s->dirty_bitmap)); + 
trace_mirror_restart_iter(s, bdrv_get_dirty_count(s->dirty_bitmap)); assert(s->sector_num >= 0); } @@ -288,8 +283,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) next_sector += sectors_per_chunk; } - bdrv_reset_dirty_bitmap(source, s->dirty_bitmap, sector_num, - nb_sectors); + bdrv_reset_dirty_bitmap(s->dirty_bitmap, sector_num, nb_sectors); /* Copy the dirty cluster. */ s->in_flight++; @@ -446,7 +440,7 @@ static void coroutine_fn mirror_run(void *opaque) assert(n > 0); if (ret == 1) { - bdrv_set_dirty_bitmap(bs, s->dirty_bitmap, sector_num, n); + bdrv_set_dirty_bitmap(s->dirty_bitmap, sector_num, n); sector_num = next; } else { sector_num += n; @@ -454,7 +448,7 @@ static void coroutine_fn mirror_run(void *opaque) } } - bdrv_dirty_iter_init(bs, s->dirty_bitmap, &s->hbi); + bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi); last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); for (;;) { uint64_t delay_ns = 0; @@ -466,7 +460,7 @@ static void coroutine_fn mirror_run(void *opaque) goto immediate_exit; } - cnt = bdrv_get_dirty_count(bs, s->dirty_bitmap); + cnt = bdrv_get_dirty_count(s->dirty_bitmap); /* s->common.offset contains the number of bytes already processed so * far, cnt is the number of dirty sectors remaining and * s->sectors_in_flight is the number of sectors currently being @@ -475,7 +469,7 @@ static void coroutine_fn mirror_run(void *opaque) (cnt + s->sectors_in_flight) * BDRV_SECTOR_SIZE; /* Note that even when no rate limit is applied we need to yield - * periodically with no pending I/O so that qemu_aio_flush() returns. + * periodically with no pending I/O so that bdrv_drain_all() returns. * We do so every SLICE_TIME nanoseconds, or when there is an error, * or when the source is clean, whichever comes first. */ @@ -488,9 +482,6 @@ static void coroutine_fn mirror_run(void *opaque) continue; } else if (cnt != 0) { delay_ns = mirror_iteration(s); - if (delay_ns == 0) { - continue; - } } } @@ -516,7 +507,7 @@ static void coroutine_fn mirror_run(void *opaque) should_complete = s->should_complete || block_job_is_cancelled(&s->common); - cnt = bdrv_get_dirty_count(bs, s->dirty_bitmap); + cnt = bdrv_get_dirty_count(s->dirty_bitmap); } } @@ -531,7 +522,7 @@ static void coroutine_fn mirror_run(void *opaque) */ trace_mirror_before_drain(s, cnt); bdrv_drain(bs); - cnt = bdrv_get_dirty_count(bs, s->dirty_bitmap); + cnt = bdrv_get_dirty_count(s->dirty_bitmap); } ret = 0; @@ -634,7 +625,7 @@ static void mirror_complete(BlockJob *job, Error **errp) } s->should_complete = true; - block_job_resume(job); + block_job_enter(&s->common); } static const BlockJobDriver mirror_job_driver = { @@ -656,7 +647,7 @@ static const BlockJobDriver commit_active_job_driver = { static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, const char *replaces, - int64_t speed, int64_t granularity, + int64_t speed, uint32_t granularity, int64_t buf_size, BlockdevOnError on_source_error, BlockdevOnError on_target_error, @@ -668,15 +659,7 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, MirrorBlockJob *s; if (granularity == 0) { - /* Choose the default granularity based on the target file's cluster - * size, clamped between 4k and 64k. 
*/ - BlockDriverInfo bdi; - if (bdrv_get_info(target, &bdi) >= 0 && bdi.cluster_size != 0) { - granularity = MAX(4096, bdi.cluster_size); - granularity = MIN(65536, granularity); - } else { - granularity = 65536; - } + granularity = bdrv_get_default_bitmap_granularity(target); } assert ((granularity & (granularity - 1)) == 0); @@ -703,7 +686,7 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, s->granularity = granularity; s->buf_size = MAX(buf_size, granularity); - s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, errp); + s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp); if (!s->dirty_bitmap) { return; } @@ -717,7 +700,7 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, void mirror_start(BlockDriverState *bs, BlockDriverState *target, const char *replaces, - int64_t speed, int64_t granularity, int64_t buf_size, + int64_t speed, uint32_t granularity, int64_t buf_size, MirrorSyncMode mode, BlockdevOnError on_source_error, BlockdevOnError on_target_error, BlockCompletionFunc *cb, @@ -726,6 +709,10 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target, bool is_none_mode; BlockDriverState *base; + if (mode == MIRROR_SYNC_MODE_DIRTY_BITMAP) { + error_setg(errp, "Sync mode 'dirty-bitmap' not supported"); + return; + } is_none_mode = mode == MIRROR_SYNC_MODE_NONE; base = mode == MIRROR_SYNC_MODE_TOP ? bs->backing_hd : NULL; mirror_start_job(bs, target, replaces, diff --git a/block/null.c b/block/null.c index ec2bd27a4b..7d083233fb 100644 --- a/block/null.c +++ b/block/null.c @@ -12,8 +12,11 @@ #include "block/block_int.h" +#define NULL_OPT_LATENCY "latency-ns" + typedef struct { int64_t length; + int64_t latency_ns; } BDRVNullState; static QemuOptsList runtime_opts = { @@ -30,6 +33,12 @@ static QemuOptsList runtime_opts = { .type = QEMU_OPT_SIZE, .help = "size of the null block", }, + { + .name = NULL_OPT_LATENCY, + .type = QEMU_OPT_NUMBER, + .help = "nanoseconds (approximated) to wait " + "before completing request", + }, { /* end of list */ } }, }; @@ -39,13 +48,20 @@ static int null_file_open(BlockDriverState *bs, QDict *options, int flags, { QemuOpts *opts; BDRVNullState *s = bs->opaque; + int ret = 0; opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); qemu_opts_absorb_qdict(opts, options, &error_abort); s->length = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 1 << 30); + s->latency_ns = + qemu_opt_get_number(opts, NULL_OPT_LATENCY, 0); + if (s->latency_ns < 0) { + error_setg(errp, "latency-ns is invalid"); + ret = -EINVAL; + } qemu_opts_del(opts); - return 0; + return ret; } static void null_close(BlockDriverState *bs) @@ -58,28 +74,40 @@ static int64_t null_getlength(BlockDriverState *bs) return s->length; } +static coroutine_fn int null_co_common(BlockDriverState *bs) +{ + BDRVNullState *s = bs->opaque; + + if (s->latency_ns) { + co_aio_sleep_ns(bdrv_get_aio_context(bs), QEMU_CLOCK_REALTIME, + s->latency_ns); + } + return 0; +} + static coroutine_fn int null_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { - return 0; + return null_co_common(bs); } static coroutine_fn int null_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { - return 0; + return null_co_common(bs); } static coroutine_fn int null_co_flush(BlockDriverState *bs) { - return 0; + return null_co_common(bs); } typedef struct { BlockAIOCB common; QEMUBH *bh; + QEMUTimer timer; } NullAIOCB; static const AIOCBInfo null_aiocb_info = { @@ 
-94,15 +122,33 @@ static void null_bh_cb(void *opaque) qemu_aio_unref(acb); } +static void null_timer_cb(void *opaque) +{ + NullAIOCB *acb = opaque; + acb->common.cb(acb->common.opaque, 0); + timer_deinit(&acb->timer); + qemu_aio_unref(acb); +} + static inline BlockAIOCB *null_aio_common(BlockDriverState *bs, BlockCompletionFunc *cb, void *opaque) { NullAIOCB *acb; + BDRVNullState *s = bs->opaque; acb = qemu_aio_get(&null_aiocb_info, bs, cb, opaque); - acb->bh = aio_bh_new(bdrv_get_aio_context(bs), null_bh_cb, acb); - qemu_bh_schedule(acb->bh); + /* Only emulate latency after vcpu is running. */ + if (s->latency_ns) { + aio_timer_init(bdrv_get_aio_context(bs), &acb->timer, + QEMU_CLOCK_REALTIME, SCALE_NS, + null_timer_cb, acb); + timer_mod_ns(&acb->timer, + qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + s->latency_ns); + } else { + acb->bh = aio_bh_new(bdrv_get_aio_context(bs), null_bh_cb, acb); + qemu_bh_schedule(acb->bh); + } return &acb->common; } @@ -131,6 +177,12 @@ static BlockAIOCB *null_aio_flush(BlockDriverState *bs, return null_aio_common(bs, cb, opaque); } +static int null_reopen_prepare(BDRVReopenState *reopen_state, + BlockReopenQueue *queue, Error **errp) +{ + return 0; +} + static BlockDriver bdrv_null_co = { .format_name = "null-co", .protocol_name = "null-co", @@ -143,6 +195,7 @@ static BlockDriver bdrv_null_co = { .bdrv_co_readv = null_co_readv, .bdrv_co_writev = null_co_writev, .bdrv_co_flush_to_disk = null_co_flush, + .bdrv_reopen_prepare = null_reopen_prepare, }; static BlockDriver bdrv_null_aio = { @@ -157,6 +210,7 @@ static BlockDriver bdrv_null_aio = { .bdrv_aio_readv = null_aio_readv, .bdrv_aio_writev = null_aio_writev, .bdrv_aio_flush = null_aio_flush, + .bdrv_reopen_prepare = null_reopen_prepare, }; static void bdrv_null_init(void) diff --git a/block/qapi.c b/block/qapi.c index 8a19aed446..063dd1bc1f 100644 --- a/block/qapi.c +++ b/block/qapi.c @@ -31,8 +31,10 @@ #include "qapi/qmp/types.h" #include "sysemu/block-backend.h" -BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs) +BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs, Error **errp) { + ImageInfo **p_image_info; + BlockDriverState *bs0; BlockDeviceInfo *info = g_malloc0(sizeof(*info)); info->file = g_strdup(bs->filename); @@ -92,6 +94,25 @@ BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs) info->write_threshold = bdrv_write_threshold_get(bs); + bs0 = bs; + p_image_info = &info->image; + while (1) { + Error *local_err = NULL; + bdrv_query_image_info(bs0, p_image_info, &local_err); + if (local_err) { + error_propagate(errp, local_err); + qapi_free_BlockDeviceInfo(info); + return NULL; + } + if (bs0->drv && bs0->backing_hd) { + bs0 = bs0->backing_hd; + (*p_image_info)->has_backing_image = true; + p_image_info = &((*p_image_info)->backing_image); + } else { + break; + } + } + return info; } @@ -264,9 +285,6 @@ static void bdrv_query_info(BlockBackend *blk, BlockInfo **p_info, { BlockInfo *info = g_malloc0(sizeof(*info)); BlockDriverState *bs = blk_bs(blk); - BlockDriverState *bs0; - ImageInfo **p_image_info; - Error *local_err = NULL; info->device = g_strdup(blk_name(blk)); info->type = g_strdup("unknown"); info->locked = blk_dev_is_medium_locked(blk); @@ -289,23 +307,9 @@ static void bdrv_query_info(BlockBackend *blk, BlockInfo **p_info, if (bs->drv) { info->has_inserted = true; - info->inserted = bdrv_block_device_info(bs); - - bs0 = bs; - p_image_info = &info->inserted->image; - while (1) { - bdrv_query_image_info(bs0, p_image_info, &local_err); - if (local_err) { - 
error_propagate(errp, local_err); - goto err; - } - if (bs0->drv && bs0->backing_hd) { - bs0 = bs0->backing_hd; - (*p_image_info)->has_backing_image = true; - p_image_info = &((*p_image_info)->backing_image); - } else { - break; - } + info->inserted = bdrv_block_device_info(bs, errp); + if (info->inserted == NULL) { + goto err; } } diff --git a/block/qcow.c b/block/qcow.c index 055896910e..ab893284d2 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -124,7 +124,7 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags, snprintf(version, sizeof(version), "QCOW version %" PRIu32, header.version); error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, - bdrv_get_device_name(bs), "qcow", version); + bdrv_get_device_or_node_name(bs), "qcow", version); ret = -ENOTSUP; goto fail; } @@ -229,9 +229,9 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags, } /* Disable migration when qcow images are used */ - error_set(&s->migration_blocker, - QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, - "qcow", bdrv_get_device_name(bs), "live migration"); + error_setg(&s->migration_blocker, "The qcow format used by node '%s' " + "does not support live migration", + bdrv_get_device_or_node_name(bs)); migrate_add_blocker(s->migration_blocker); qemu_co_mutex_init(&s->lock); diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 6cbae1d205..f47260b808 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -2450,7 +2450,7 @@ int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset, if (ret < 0) { return ret; } else if (ret > 0) { - int metadata_ol_bitnr = ffs(ret) - 1; + int metadata_ol_bitnr = ctz32(ret); assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR); qcow2_signal_corruption(bs, true, offset, size, "Preventing invalid " diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c index 2aa9dcb1d1..17bb2119b2 100644 --- a/block/qcow2-snapshot.c +++ b/block/qcow2-snapshot.c @@ -351,10 +351,8 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) memset(sn, 0, sizeof(*sn)); - /* Generate an ID if it wasn't passed */ - if (sn_info->id_str[0] == '\0') { - find_new_snapshot_id(bs, sn_info->id_str, sizeof(sn_info->id_str)); - } + /* Generate an ID */ + find_new_snapshot_id(bs, sn_info->id_str, sizeof(sn_info->id_str)); /* Check that the ID is unique */ if (find_snapshot_by_id_and_name(bs, sn_info->id_str, NULL) >= 0) { diff --git a/block/qcow2.c b/block/qcow2.c index 316a8db22b..b9a72e39d4 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -208,7 +208,7 @@ static void GCC_FMT_ATTR(3, 4) report_unsupported(BlockDriverState *bs, va_end(ap); error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, - bdrv_get_device_name(bs), "qcow2", msg); + bdrv_get_device_or_node_name(bs), "qcow2", msg); } static void report_unsupported_feature(BlockDriverState *bs, @@ -1802,7 +1802,7 @@ static int qcow2_create2(const char *filename, int64_t total_size, { /* Calculate cluster_bits */ int cluster_bits; - cluster_bits = ffs(cluster_size) - 1; + cluster_bits = ctz32(cluster_size); if (cluster_bits < MIN_CLUSTER_BITS || cluster_bits > MAX_CLUSTER_BITS || (1 << cluster_bits) != cluster_size) { @@ -2110,7 +2110,7 @@ static int qcow2_create(const char *filename, QemuOpts *opts, Error **errp) goto finish; } - refcount_order = ffs(refcount_bits) - 1; + refcount_order = ctz32(refcount_bits); ret = qcow2_create2(filename, size, backing_file, backing_fmt, flags, cluster_size, prealloc, opts, version, refcount_order, @@ -2824,6 +2824,7 @@ void 
qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset, int64_t size, const char *message_format, ...) { BDRVQcowState *s = bs->opaque; + const char *node_name; char *message; va_list ap; @@ -2847,8 +2848,11 @@ void qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset, "corruption events will be suppressed\n", message); } - qapi_event_send_block_image_corrupted(bdrv_get_device_name(bs), message, - offset >= 0, offset, size >= 0, size, + node_name = bdrv_get_node_name(bs); + qapi_event_send_block_image_corrupted(bdrv_get_device_name(bs), + *node_name != '\0', node_name, + message, offset >= 0, offset, + size >= 0, size, fatal, &error_abort); g_free(message); diff --git a/block/qed.c b/block/qed.c index 892b13c806..5bbe069ce9 100644 --- a/block/qed.c +++ b/block/qed.c @@ -408,7 +408,7 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags, snprintf(buf, sizeof(buf), "%" PRIx64, s->header.features & ~QED_FEATURE_MASK); error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, - bdrv_get_device_name(bs), "QED", buf); + bdrv_get_device_or_node_name(bs), "QED", buf); return -ENOTSUP; } if (!qed_is_cluster_size_valid(s->header.cluster_size)) { @@ -436,9 +436,9 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags, s->table_nelems = (s->header.cluster_size * s->header.table_size) / sizeof(uint64_t); - s->l2_shift = ffs(s->header.cluster_size) - 1; + s->l2_shift = ctz32(s->header.cluster_size); s->l2_mask = s->table_nelems - 1; - s->l1_shift = s->l2_shift + ffs(s->table_nelems) - 1; + s->l1_shift = s->l2_shift + ctz32(s->table_nelems); /* Header size calculation must not overflow uint32_t */ if (s->header.header_size > UINT32_MAX / s->header.cluster_size) { diff --git a/block/quorum.c b/block/quorum.c index 437b12251d..f91ef75a84 100644 --- a/block/quorum.c +++ b/block/quorum.c @@ -226,10 +226,7 @@ static void quorum_report_bad(QuorumAIOCB *acb, char *node_name, int ret) static void quorum_report_failure(QuorumAIOCB *acb) { - const char *reference = bdrv_get_device_name(acb->common.bs)[0] ? - bdrv_get_device_name(acb->common.bs) : - acb->common.bs->node_name; - + const char *reference = bdrv_get_device_or_node_name(acb->common.bs); qapi_event_send_quorum_failure(reference, acb->sector_num, acb->nb_sectors, &error_abort); } diff --git a/block/rbd.c b/block/rbd.c index f3ab2ddd5a..fbe87e035b 100644 --- a/block/rbd.c +++ b/block/rbd.c @@ -325,7 +325,7 @@ static int qemu_rbd_create(const char *filename, QemuOpts *opts, Error **errp) error_setg(errp, "obj size too small"); return -EINVAL; } - obj_order = ffs(objsize) - 1; + obj_order = ctz32(objsize); } clientname = qemu_rbd_parse_clientname(conf, clientname_buf); diff --git a/block/sheepdog.c b/block/sheepdog.c index c14172cfa6..2d5f06a390 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -1716,7 +1716,7 @@ static int parse_block_size_shift(BDRVSheepdogState *s, QemuOpts *opt) if ((object_size - 1) & object_size) { /* not a power of 2? 
*/ return -EINVAL; } - obj_order = ffs(object_size) - 1; + obj_order = ctz32(object_size); if (obj_order < 20 || obj_order > 31) { return -EINVAL; } diff --git a/block/snapshot.c b/block/snapshot.c index 698e1a1d58..50ae610139 100644 --- a/block/snapshot.c +++ b/block/snapshot.c @@ -246,9 +246,9 @@ int bdrv_snapshot_delete(BlockDriverState *bs, if (bs->file) { return bdrv_snapshot_delete(bs->file, snapshot_id, name, errp); } - error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, - drv->format_name, bdrv_get_device_name(bs), - "internal snapshot deletion"); + error_setg(errp, "Block format '%s' used by device '%s' " + "does not support internal snapshot deletion", + drv->format_name, bdrv_get_device_name(bs)); return -ENOTSUP; } @@ -329,9 +329,9 @@ int bdrv_snapshot_load_tmp(BlockDriverState *bs, if (drv->bdrv_snapshot_load_tmp) { return drv->bdrv_snapshot_load_tmp(bs, snapshot_id, name, errp); } - error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, - drv->format_name, bdrv_get_device_name(bs), - "temporarily load internal snapshot"); + error_setg(errp, "Block format '%s' used by device '%s' " + "does not support temporarily loading internal snapshots", + drv->format_name, bdrv_get_device_name(bs)); return -ENOTSUP; } diff --git a/block/vdi.c b/block/vdi.c index 53bd02fe22..7642ef3597 100644 --- a/block/vdi.c +++ b/block/vdi.c @@ -502,9 +502,9 @@ static int vdi_open(BlockDriverState *bs, QDict *options, int flags, } /* Disable migration when vdi images are used */ - error_set(&s->migration_blocker, - QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, - "vdi", bdrv_get_device_name(bs), "live migration"); + error_setg(&s->migration_blocker, "The vdi format used by node '%s' " + "does not support live migration", + bdrv_get_device_or_node_name(bs)); migrate_add_blocker(s->migration_blocker); qemu_co_mutex_init(&s->write_lock); diff --git a/block/vhdx.c b/block/vhdx.c index bb3ed45d5c..0776de7174 100644 --- a/block/vhdx.c +++ b/block/vhdx.c @@ -1002,9 +1002,9 @@ static int vhdx_open(BlockDriverState *bs, QDict *options, int flags, /* TODO: differencing files */ /* Disable migration when VHDX images are used */ - error_set(&s->migration_blocker, - QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, - "vhdx", bdrv_get_device_name(bs), "live migration"); + error_setg(&s->migration_blocker, "The vhdx format used by node '%s' " + "does not support live migration", + bdrv_get_device_or_node_name(bs)); migrate_add_blocker(s->migration_blocker); return 0; @@ -1269,7 +1269,7 @@ static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num, iov1.iov_base = qemu_blockalign(bs, iov1.iov_len); memset(iov1.iov_base, 0, iov1.iov_len); qemu_iovec_concat_iov(&hd_qiov, &iov1, 1, 0, - sinfo.block_offset); + iov1.iov_len); sectors_to_write += iov1.iov_len >> BDRV_SECTOR_BITS; } @@ -1285,7 +1285,7 @@ static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num, iov2.iov_base = qemu_blockalign(bs, iov2.iov_len); memset(iov2.iov_base, 0, iov2.iov_len); qemu_iovec_concat_iov(&hd_qiov, &iov2, 1, 0, - sinfo.block_offset); + iov2.iov_len); sectors_to_write += iov2.iov_len >> BDRV_SECTOR_BITS; } } diff --git a/block/vmdk.c b/block/vmdk.c index 8410a158a2..1c5e2ef1b3 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -523,7 +523,7 @@ static int vmdk_open_vmfs_sparse(BlockDriverState *bs, } ret = vmdk_add_extent(bs, file, false, le32_to_cpu(header.disk_sectors), - le32_to_cpu(header.l1dir_offset) << 9, + (int64_t)le32_to_cpu(header.l1dir_offset) << 9, 0, le32_to_cpu(header.l1dir_size), 4096, @@ -669,7 
+669,7 @@ static int vmdk_open_vmdk4(BlockDriverState *bs, snprintf(buf, sizeof(buf), "VMDK version %" PRId32, le32_to_cpu(header.version)); error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, - bdrv_get_device_name(bs), "vmdk", buf); + bdrv_get_device_or_node_name(bs), "vmdk", buf); return -ENOTSUP; } else if (le32_to_cpu(header.version) == 3 && (flags & BDRV_O_RDWR)) { /* VMware KB 2064959 explains that version 3 added support for @@ -962,9 +962,9 @@ static int vmdk_open(BlockDriverState *bs, QDict *options, int flags, qemu_co_mutex_init(&s->lock); /* Disable migration when VMDK images are used */ - error_set(&s->migration_blocker, - QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, - "vmdk", bdrv_get_device_name(bs), "live migration"); + error_setg(&s->migration_blocker, "The vmdk format used by node '%s' " + "does not support live migration", + bdrv_get_device_or_node_name(bs)); migrate_add_blocker(s->migration_blocker); g_free(buf); return 0; diff --git a/block/vpc.c b/block/vpc.c index 43e768ee76..37572bab86 100644 --- a/block/vpc.c +++ b/block/vpc.c @@ -318,9 +318,9 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags, qemu_co_mutex_init(&s->lock); /* Disable migration when VHD images are used */ - error_set(&s->migration_blocker, - QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, - "vpc", bdrv_get_device_name(bs), "live migration"); + error_setg(&s->migration_blocker, "The vpc format used by node '%s' " + "does not support live migration", + bdrv_get_device_or_node_name(bs)); migrate_add_blocker(s->migration_blocker); return 0; diff --git a/block/vvfat.c b/block/vvfat.c index 9be632f404..e803589675 100644 --- a/block/vvfat.c +++ b/block/vvfat.c @@ -1180,9 +1180,10 @@ static int vvfat_open(BlockDriverState *bs, QDict *options, int flags, /* Disable migration when vvfat is used rw */ if (s->qcow) { - error_set(&s->migration_blocker, - QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, - "vvfat (rw)", bdrv_get_device_name(bs), "live migration"); + error_setg(&s->migration_blocker, + "The vvfat (rw) format used by node '%s' " + "does not support live migration", + bdrv_get_device_or_node_name(bs)); migrate_add_blocker(s->migration_blocker); } diff --git a/blockdev.c b/blockdev.c index fbb3a79978..5eaf77e599 100644 --- a/blockdev.c +++ b/blockdev.c @@ -1164,6 +1164,68 @@ out_aio_context: return NULL; } +/** + * block_dirty_bitmap_lookup: + * Return a dirty bitmap (if present), after validating + * the node reference and bitmap names. + * + * @node: The name of the BDS node to search for bitmaps + * @name: The name of the bitmap to search for + * @pbs: Output pointer for BDS lookup, if desired. Can be NULL. + * @paio: Output pointer for aio_context acquisition, if desired. Can be NULL. + * @errp: Output pointer for error information. Can be NULL. + * + * @return: A bitmap object on success, or NULL on failure. 
+ */ +static BdrvDirtyBitmap *block_dirty_bitmap_lookup(const char *node, + const char *name, + BlockDriverState **pbs, + AioContext **paio, + Error **errp) +{ + BlockDriverState *bs; + BdrvDirtyBitmap *bitmap; + AioContext *aio_context; + + if (!node) { + error_setg(errp, "Node cannot be NULL"); + return NULL; + } + if (!name) { + error_setg(errp, "Bitmap name cannot be NULL"); + return NULL; + } + bs = bdrv_lookup_bs(node, node, NULL); + if (!bs) { + error_setg(errp, "Node '%s' not found", node); + return NULL; + } + + aio_context = bdrv_get_aio_context(bs); + aio_context_acquire(aio_context); + + bitmap = bdrv_find_dirty_bitmap(bs, name); + if (!bitmap) { + error_setg(errp, "Dirty bitmap '%s' not found", name); + goto fail; + } + + if (pbs) { + *pbs = bs; + } + if (paio) { + *paio = aio_context; + } else { + aio_context_release(aio_context); + } + + return bitmap; + + fail: + aio_context_release(aio_context); + return NULL; +} + /* New and old BlockDriverState structs for atomic group operations */ typedef struct BlkTransactionState BlkTransactionState; @@ -1248,13 +1310,14 @@ static void internal_snapshot_prepare(BlkTransactionState *common, } if (bdrv_is_read_only(bs)) { - error_set(errp, QERR_DEVICE_IS_READ_ONLY, device); + error_setg(errp, "Device '%s' is read only", device); return; } if (!bdrv_can_snapshot(bs)) { - error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, - bs->drv->format_name, device, "internal snapshot"); + error_setg(errp, "Block format '%s' used by device '%s' " + "does not support internal snapshots", + bs->drv->format_name, device); return; } @@ -1522,6 +1585,7 @@ static void drive_backup_prepare(BlkTransactionState *common, Error **errp) backup->sync, backup->has_mode, backup->mode, backup->has_speed, backup->speed, + backup->has_bitmap, backup->bitmap, backup->has_on_source_error, backup->on_source_error, backup->has_on_target_error, backup->on_target_error, &local_err); @@ -1953,6 +2017,102 @@ void qmp_block_set_io_throttle(const char *device, int64_t bps, int64_t bps_rd, aio_context_release(aio_context); } +void qmp_block_dirty_bitmap_add(const char *node, const char *name, + bool has_granularity, uint32_t granularity, + Error **errp) +{ + AioContext *aio_context; + BlockDriverState *bs; + + if (!name || name[0] == '\0') { + error_setg(errp, "Bitmap name cannot be empty"); + return; + } + + bs = bdrv_lookup_bs(node, node, errp); + if (!bs) { + return; + } + + aio_context = bdrv_get_aio_context(bs); + aio_context_acquire(aio_context); + + if (has_granularity) { + if (granularity < 512 || !is_power_of_2(granularity)) { + error_setg(errp, "Granularity must be power of 2 " + "and at least 512"); + goto out; + } + } else { + /* Default to cluster size, if available: */ + granularity = bdrv_get_default_bitmap_granularity(bs); + } + + bdrv_create_dirty_bitmap(bs, granularity, name, errp); + + out: + aio_context_release(aio_context); +} + +void qmp_block_dirty_bitmap_remove(const char *node, const char *name, + Error **errp) +{ + AioContext *aio_context; + BlockDriverState *bs; + BdrvDirtyBitmap *bitmap; + + bitmap = block_dirty_bitmap_lookup(node, name, &bs, &aio_context, errp); + if (!bitmap || !bs) { + return; + } + + if (bdrv_dirty_bitmap_frozen(bitmap)) { + error_setg(errp, + "Bitmap '%s' is currently frozen and cannot be removed", + name); + goto out; + } + bdrv_dirty_bitmap_make_anon(bitmap); + bdrv_release_dirty_bitmap(bs, bitmap); + + out: + aio_context_release(aio_context); +} + +/** + * Completely clear a bitmap, for the purposes of synchronizing a 
bitmap + * immediately after a full backup operation. + */ +void qmp_block_dirty_bitmap_clear(const char *node, const char *name, + Error **errp) +{ + AioContext *aio_context; + BdrvDirtyBitmap *bitmap; + BlockDriverState *bs; + + bitmap = block_dirty_bitmap_lookup(node, name, &bs, &aio_context, errp); + if (!bitmap || !bs) { + return; + } + + if (bdrv_dirty_bitmap_frozen(bitmap)) { + error_setg(errp, + "Bitmap '%s' is currently frozen and cannot be modified", + name); + goto out; + } else if (!bdrv_dirty_bitmap_enabled(bitmap)) { + error_setg(errp, + "Bitmap '%s' is currently disabled and cannot be cleared", + name); + goto out; + } + + bdrv_clear_dirty_bitmap(bitmap); + + out: + aio_context_release(aio_context); +} + int hmp_drive_del(Monitor *mon, const QDict *qdict, QObject **ret_data) { const char *id = qdict_get_str(qdict, "id"); @@ -2055,7 +2215,7 @@ void qmp_block_resize(bool has_device, const char *device, error_set(errp, QERR_UNSUPPORTED); break; case -EACCES: - error_set(errp, QERR_DEVICE_IS_READ_ONLY, device); + error_setg(errp, "Device '%s' is read only", device); break; case -EBUSY: error_set(errp, QERR_DEVICE_IN_USE, device); @@ -2270,6 +2430,7 @@ void qmp_drive_backup(const char *device, const char *target, enum MirrorSyncMode sync, bool has_mode, enum NewImageMode mode, bool has_speed, int64_t speed, + bool has_bitmap, const char *bitmap, bool has_on_source_error, BlockdevOnError on_source_error, bool has_on_target_error, BlockdevOnError on_target_error, Error **errp) @@ -2278,6 +2439,7 @@ void qmp_drive_backup(const char *device, const char *target, BlockDriverState *bs; BlockDriverState *target_bs; BlockDriverState *source = NULL; + BdrvDirtyBitmap *bmap = NULL; AioContext *aio_context; BlockDriver *drv = NULL; Error *local_err = NULL; @@ -2377,7 +2539,16 @@ void qmp_drive_backup(const char *device, const char *target, bdrv_set_aio_context(target_bs, aio_context); - backup_start(bs, target_bs, speed, sync, on_source_error, on_target_error, + if (has_bitmap) { + bmap = bdrv_find_dirty_bitmap(bs, bitmap); + if (!bmap) { + error_setg(errp, "Bitmap '%s' could not be found", bitmap); + goto out; + } + } + + backup_start(bs, target_bs, speed, sync, bmap, + on_source_error, on_target_error, block_job_cb, bs, &local_err); if (local_err != NULL) { bdrv_unref(target_bs); @@ -2391,7 +2562,7 @@ out: BlockDeviceInfoList *qmp_query_named_block_nodes(Error **errp) { - return bdrv_named_nodes_list(); + return bdrv_named_nodes_list(errp); } void qmp_blockdev_backup(const char *device, const char *target, @@ -2438,8 +2609,8 @@ void qmp_blockdev_backup(const char *device, const char *target, bdrv_ref(target_bs); bdrv_set_aio_context(target_bs, aio_context); - backup_start(bs, target_bs, speed, sync, on_source_error, on_target_error, - block_job_cb, bs, &local_err); + backup_start(bs, target_bs, speed, sync, NULL, on_source_error, + on_target_error, block_job_cb, bs, &local_err); if (local_err != NULL) { bdrv_unref(target_bs); error_propagate(errp, local_err); @@ -2699,7 +2870,7 @@ void qmp_block_job_cancel(const char *device, force = false; } - if (job->paused && !force) { + if (job->user_paused && !force) { error_setg(errp, "The block job for device '%s' is currently paused", device); goto out; @@ -2716,10 +2887,11 @@ void qmp_block_job_pause(const char *device, Error **errp) AioContext *aio_context; BlockJob *job = find_block_job(device, &aio_context, errp); - if (!job) { + if (!job || job->user_paused) { return; } + job->user_paused = true; trace_qmp_block_job_pause(job); 
block_job_pause(job); aio_context_release(aio_context); @@ -2730,10 +2902,11 @@ void qmp_block_job_resume(const char *device, Error **errp) AioContext *aio_context; BlockJob *job = find_block_job(device, &aio_context, errp); - if (!job) { + if (!job || !job->user_paused) { return; } + job->user_paused = false; trace_qmp_block_job_resume(job); block_job_resume(job); aio_context_release(aio_context); diff --git a/blockjob.c b/blockjob.c index ba2255d91f..2755465259 100644 --- a/blockjob.c +++ b/blockjob.c @@ -107,7 +107,7 @@ void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp) void block_job_complete(BlockJob *job, Error **errp) { - if (job->paused || job->cancelled || !job->driver->complete) { + if (job->pause_count || job->cancelled || !job->driver->complete) { error_set(errp, QERR_BLOCK_JOB_NOT_READY, bdrv_get_device_name(job->bs)); return; @@ -118,17 +118,26 @@ void block_job_complete(BlockJob *job, Error **errp) void block_job_pause(BlockJob *job) { - job->paused = true; + job->pause_count++; } bool block_job_is_paused(BlockJob *job) { - return job->paused; + return job->pause_count > 0; } void block_job_resume(BlockJob *job) { - job->paused = false; + assert(job->pause_count > 0); + job->pause_count--; + if (job->pause_count) { + return; + } + block_job_enter(job); +} + +void block_job_enter(BlockJob *job) +{ block_job_iostatus_reset(job); if (job->co && !job->busy) { qemu_coroutine_enter(job->co, NULL); @@ -138,7 +147,7 @@ void block_job_resume(BlockJob *job) void block_job_cancel(BlockJob *job) { job->cancelled = true; - block_job_resume(job); + block_job_enter(job); } bool block_job_is_cancelled(BlockJob *job) @@ -258,7 +267,7 @@ BlockJobInfo *block_job_query(BlockJob *job) info->device = g_strdup(bdrv_get_device_name(job->bs)); info->len = job->len; info->busy = job->busy; - info->paused = job->paused; + info->paused = job->pause_count > 0; info->offset = job->offset; info->speed = job->speed; info->io_status = job->iostatus; @@ -335,6 +344,8 @@ BlockErrorAction block_job_error_action(BlockJob *job, BlockDriverState *bs, IO_OPERATION_TYPE_WRITE, action, &error_abort); if (action == BLOCK_ERROR_ACTION_STOP) { + /* make the pause user visible, which will be resumed from QMP. */ + job->user_paused = true; block_job_pause(job); block_job_iostatus_set_err(job, error); if (bs != job->bs) { diff --git a/docs/bitmaps.md b/docs/bitmaps.md new file mode 100644 index 0000000000..f066b48aa5 --- /dev/null +++ b/docs/bitmaps.md @@ -0,0 +1,352 @@ +<!-- +Copyright 2015 John Snow <jsnow@redhat.com> and Red Hat, Inc. +All rights reserved. + +This file is licensed via The FreeBSD Documentation License, the full text of +which is included at the end of this document. +--> + +# Dirty Bitmaps and Incremental Backup + +* Dirty Bitmaps are objects that track which data needs to be backed up for the + next incremental backup. + +* Dirty bitmaps can be created at any time and attached to any node + (not just complete drives.) + +## Dirty Bitmap Names + +* A dirty bitmap's name is unique to the node, but bitmaps attached to different + nodes can share the same name. + +## Bitmap Modes + +* A Bitmap can be "frozen," which means that it is currently in-use by a backup + operation and cannot be deleted, renamed, written to, reset, + etc. 
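+
+* As a sketch of what the frozen state means in practice (using the QMP
+  commands described below), an attempt to remove a bitmap that is in use
+  by a running backup job is rejected. The reply shown here is illustrative
+  only; the exact error text may differ:
+
+```json
+{ "execute": "block-dirty-bitmap-remove",
+  "arguments": {
+    "node": "drive0",
+    "name": "bitmap0"
+  }
+}
+```
+
+```json
+{ "error": { "class": "GenericError",
+             "desc": "Bitmap 'bitmap0' is currently frozen and cannot be removed" } }
+```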
+ +## Basic QMP Usage + +### Supported Commands ### + +* block-dirty-bitmap-add +* block-dirty-bitmap-remove +* block-dirty-bitmap-clear + +### Creation + +* To create a new bitmap, enabled, on the drive with id=drive0: + +```json +{ "execute": "block-dirty-bitmap-add", + "arguments": { + "node": "drive0", + "name": "bitmap0" + } +} +``` + +* This bitmap will have a default granularity that matches the cluster size of + its associated drive, if available, clamped to between [4KiB, 64KiB]. + The current default for qcow2 is 64KiB. + +* To create a new bitmap that tracks changes in 32KiB segments: + +```json +{ "execute": "block-dirty-bitmap-add", + "arguments": { + "node": "drive0", + "name": "bitmap0", + "granularity": 32768 + } +} +``` + +### Deletion + +* Bitmaps that are frozen cannot be deleted. + +* Deleting the bitmap does not impact any other bitmaps attached to the same + node, nor does it affect any backups already created from this node. + +* Because bitmaps are only unique to the node to which they are attached, + you must specify the node/drive name here, too. + +```json +{ "execute": "block-dirty-bitmap-remove", + "arguments": { + "node": "drive0", + "name": "bitmap0" + } +} +``` + +### Resetting + +* Resetting a bitmap will clear all information it holds. + +* An incremental backup created from an empty bitmap will copy no data, + as if nothing has changed. + +```json +{ "execute": "block-dirty-bitmap-clear", + "arguments": { + "node": "drive0", + "name": "bitmap0" + } +} +``` + +## Transactions (Not yet implemented) + +* Transactional commands are forthcoming in a future version, + and are not yet available for use. This section serves as + documentation of intent for their design and usage. + +### Justification + +Bitmaps can be safely modified when the VM is paused or halted by using +the basic QMP commands. For instance, you might perform the following actions: + +1. Boot the VM in a paused state. +2. Create a full drive backup of drive0. +3. Create a new bitmap attached to drive0. +4. Resume execution of the VM. +5. Incremental backups are ready to be created. + +At this point, the bitmap and drive backup would be correctly in sync, +and incremental backups made from this point forward would be correctly aligned +to the full drive backup. + +This is not particularly useful if we decide we want to start incremental +backups after the VM has been running for a while, for which we will need to +perform actions such as the following: + +1. Boot the VM and begin execution. +2. Using a single transaction, perform the following operations: + * Create bitmap0. + * Create a full drive backup of drive0. +3. Incremental backups are now ready to be created. + +### Supported Bitmap Transactions + +* block-dirty-bitmap-add +* block-dirty-bitmap-clear + +The usages are identical to their respective QMP commands, but see below +for examples. + +### Example: New Incremental Backup + +As outlined in the justification, perhaps we want to create a new incremental +backup chain attached to a drive. + +```json +{ "execute": "transaction", + "arguments": { + "actions": [ + {"type": "block-dirty-bitmap-add", + "data": {"node": "drive0", "name": "bitmap0"} }, + {"type": "drive-backup", + "data": {"device": "drive0", "target": "/path/to/full_backup.img", + "sync": "full", "format": "qcow2"} } + ] + } +} +``` + +### Example: New Incremental Backup Anchor Point + +Maybe we just want to create a new full backup with an existing bitmap and +want to reset the bitmap to track the new chain. 
+ +```json +{ "execute": "transaction", + "arguments": { + "actions": [ + {"type": "block-dirty-bitmap-clear", + "data": {"node": "drive0", "name": "bitmap0"} }, + {"type": "drive-backup", + "data": {"device": "drive0", "target": "/path/to/new_full_backup.img", + "sync": "full", "format": "qcow2"} } + ] + } +} +``` + +## Incremental Backups + +The star of the show. + +**Nota Bene!** Only incremental backups of entire drives are supported for now. +So despite the fact that you can attach a bitmap to any arbitrary node, they are +only currently useful when attached to the root node. This is because +drive-backup only supports drives/devices instead of arbitrary nodes. + +### Example: First Incremental Backup + +1. Create a full backup and sync it to the dirty bitmap, as in the transactional +examples above; or with the VM offline, manually create a full copy and then +create a new bitmap before the VM begins execution. + + * Let's assume the full backup is named 'full_backup.img'. + * Let's assume the bitmap you created is 'bitmap0' attached to 'drive0'. + +2. Create a destination image for the incremental backup that utilizes the +full backup as a backing image. + + * Let's assume it is named 'incremental.0.img'. + + ```sh + # qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2 + ``` + +3. Issue the incremental backup command: + + ```json + { "execute": "drive-backup", + "arguments": { + "device": "drive0", + "bitmap": "bitmap0", + "target": "incremental.0.img", + "format": "qcow2", + "sync": "dirty-bitmap", + "mode": "existing" + } + } + ``` + +### Example: Second Incremental Backup + +1. Create a new destination image for the incremental backup that points to the + previous one, e.g.: 'incremental.1.img' + + ```sh + # qemu-img create -f qcow2 incremental.1.img -b incremental.0.img -F qcow2 + ``` + +2. Issue a new incremental backup command. The only difference here is that we + have changed the target image below. + + ```json + { "execute": "drive-backup", + "arguments": { + "device": "drive0", + "bitmap": "bitmap0", + "target": "incremental.1.img", + "format": "qcow2", + "sync": "dirty-bitmap", + "mode": "existing" + } + } + ``` + +## Errors + +* In the event of an error that occurs after a backup job is successfully + launched, either by a direct QMP command or a QMP transaction, the user + will receive a BLOCK_JOB_COMPLETE event with a failure message, accompanied + by a BLOCK_JOB_ERROR event. + +* In the case of an event being cancelled, the user will receive a + BLOCK_JOB_CANCELLED event instead of a pair of COMPLETE and ERROR events. + +* In either case, the incremental backup data contained within the bitmap is + safely rolled back, and the data within the bitmap is not lost. The image + file created for the failed attempt can be safely deleted. + +* Once the underlying problem is fixed (e.g. more storage space is freed up), + you can simply retry the incremental backup command with the same bitmap. + +### Example + +1. Create a target image: + + ```sh + # qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2 + ``` + +2. Attempt to create an incremental backup via QMP: + + ```json + { "execute": "drive-backup", + "arguments": { + "device": "drive0", + "bitmap": "bitmap0", + "target": "incremental.0.img", + "format": "qcow2", + "sync": "dirty-bitmap", + "mode": "existing" + } + } + ``` + +3. 
Receive an event notifying us of failure: + + ```json + { "timestamp": { "seconds": 1424709442, "microseconds": 844524 }, + "data": { "speed": 0, "offset": 0, "len": 67108864, + "error": "No space left on device", + "device": "drive1", "type": "backup" }, + "event": "BLOCK_JOB_COMPLETED" } + ``` + +4. Delete the failed incremental, and re-create the image. + + ```sh + # rm incremental.0.img + # qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2 + ``` + +5. Retry the command after fixing the underlying problem, + such as freeing up space on the backup volume: + + ```json + { "execute": "drive-backup", + "arguments": { + "device": "drive0", + "bitmap": "bitmap0", + "target": "incremental.0.img", + "format": "qcow2", + "sync": "dirty-bitmap", + "mode": "existing" + } + } + ``` + +6. Receive confirmation that the job completed successfully: + + ```json + { "timestamp": { "seconds": 1424709668, "microseconds": 526525 }, + "data": { "device": "drive1", "type": "backup", + "speed": 0, "len": 67108864, "offset": 67108864}, + "event": "BLOCK_JOB_COMPLETED" } + ``` + +<!-- +The FreeBSD Documentation License + +Redistribution and use in source (Markdown) and 'compiled' forms (SGML, HTML, +PDF, PostScript, RTF and so forth) with or without modification, are permitted +provided that the following conditions are met: + +Redistributions of source code (Markdown) must retain the above copyright +notice, this list of conditions and the following disclaimer of this file +unmodified. + +Redistributions in compiled form (transformed to other DTDs, converted to PDF, +PostScript, RTF and other formats) must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation and/or +other materials provided with the distribution. + +THIS DOCUMENTATION IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +THIS DOCUMENTATION, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +--> diff --git a/docs/qmp/qmp-events.txt b/docs/qmp/qmp-events.txt index d759d19748..b19e490eb5 100644 --- a/docs/qmp/qmp-events.txt +++ b/docs/qmp/qmp-events.txt @@ -31,21 +31,26 @@ Example: BLOCK_IMAGE_CORRUPTED --------------------- -Emitted when a disk image is being marked corrupt. +Emitted when a disk image is being marked corrupt. The image can be +identified by its device or node name. The 'device' field is always +present for compatibility reasons, but it can be empty ("") if the +image does not have a device name associated. 
Data: -- "device": Device name (json-string) -- "msg": Informative message (e.g., reason for the corruption) (json-string) -- "offset": If the corruption resulted from an image access, this is the access - offset into the image (json-int) -- "size": If the corruption resulted from an image access, this is the access - size (json-int) +- "device": Device name (json-string) +- "node-name": Node name (json-string, optional) +- "msg": Informative message (e.g., reason for the corruption) + (json-string) +- "offset": If the corruption resulted from an image access, this + is the access offset into the image (json-int) +- "size": If the corruption resulted from an image access, this + is the access size (json-int) Example: { "event": "BLOCK_IMAGE_CORRUPTED", - "data": { "device": "ide0-hd0", + "data": { "device": "ide0-hd0", "node-name": "node0", "msg": "Prevented active L1 table overwrite", "offset": 196608, "size": 65536 }, "timestamp": { "seconds": 1378126126, "microseconds": 966463 } } @@ -391,8 +391,7 @@ static void print_block_info(Monitor *mon, BlockInfo *info, inserted->iops_size); } - /* TODO: inserted->image should never be null */ - if (verbose && inserted->image) { + if (verbose) { monitor_printf(mon, "\nImages:\n"); image_info = inserted->image; while (1) { @@ -1062,7 +1061,8 @@ void hmp_drive_backup(Monitor *mon, const QDict *qdict) qmp_drive_backup(device, filename, !!format, format, full ? MIRROR_SYNC_MODE_FULL : MIRROR_SYNC_MODE_TOP, - true, mode, false, 0, false, 0, false, 0, &err); + true, mode, false, 0, false, NULL, + false, 0, false, 0, &err); hmp_handle_error(mon, &err); } diff --git a/hw/acpi/pcihp.c b/hw/acpi/pcihp.c index 612fec03ee..77e1126f8f 100644 --- a/hw/acpi/pcihp.c +++ b/hw/acpi/pcihp.c @@ -120,7 +120,7 @@ static bool acpi_pcihp_pc_no_hotplug(AcpiPciHpState *s, PCIDevice *dev) static void acpi_pcihp_eject_slot(AcpiPciHpState *s, unsigned bsel, unsigned slots) { BusChild *kid, *next; - int slot = ffs(slots) - 1; + int slot = ctz32(slots); PCIBus *bus = acpi_pcihp_find_hotplug_bus(s, bsel); if (!bus) { diff --git a/hw/arm/nseries.c b/hw/arm/nseries.c index 2a5406d98d..d243159664 100644 --- a/hw/arm/nseries.c +++ b/hw/arm/nseries.c @@ -579,7 +579,10 @@ static uint32_t mipid_txrx(void *opaque, uint32_t cmd, int len) case 0x26: /* GAMSET */ if (!s->pm) { - s->gamma = ffs(s->param[0] & 0xf) - 1; + s->gamma = ctz32(s->param[0] & 0xf); + if (s->gamma == 32) { + s->gamma = -1; /* XXX: should this be 0? 
*/ + } } else if (s->pm < 0) { s->pm = 1; } diff --git a/hw/arm/omap1.c b/hw/arm/omap1.c index 91ffb589e5..de2b289257 100644 --- a/hw/arm/omap1.c +++ b/hw/arm/omap1.c @@ -2004,8 +2004,7 @@ static void omap_mpuio_write(void *opaque, hwaddr addr, case 0x04: /* OUTPUT_REG */ diff = (s->outputs ^ value) & ~s->dir; s->outputs = value; - while ((ln = ffs(diff))) { - ln --; + while ((ln = ctz32(diff)) != 32) { if (s->handler[ln]) qemu_set_irq(s->handler[ln], (value >> ln) & 1); diff &= ~(1 << ln); @@ -2017,8 +2016,7 @@ static void omap_mpuio_write(void *opaque, hwaddr addr, s->dir = value; value = s->outputs & ~s->dir; - while ((ln = ffs(diff))) { - ln --; + while ((ln = ctz32(diff)) != 32) { if (s->handler[ln]) qemu_set_irq(s->handler[ln], (value >> ln) & 1); diff &= ~(1 << ln); diff --git a/hw/arm/pxa2xx_gpio.c b/hw/arm/pxa2xx_gpio.c index 354ccf1ea1..c89c8045c3 100644 --- a/hw/arm/pxa2xx_gpio.c +++ b/hw/arm/pxa2xx_gpio.c @@ -137,7 +137,7 @@ static void pxa2xx_gpio_handler_update(PXA2xxGPIOInfo *s) { level = s->olevel[i] & s->dir[i]; for (diff = s->prev_level[i] ^ level; diff; diff ^= 1 << bit) { - bit = ffs(diff) - 1; + bit = ctz32(diff); line = bit + 32 * i; qemu_set_irq(s->handler[line], (level >> bit) & 1); } diff --git a/hw/arm/strongarm.c b/hw/arm/strongarm.c index 1ddea6d89c..da9fc1d51b 100644 --- a/hw/arm/strongarm.c +++ b/hw/arm/strongarm.c @@ -528,7 +528,7 @@ static void strongarm_gpio_handler_update(StrongARMGPIOInfo *s) level = s->olevel & s->dir; for (diff = s->prev_level ^ level; diff; diff ^= 1 << bit) { - bit = ffs(diff) - 1; + bit = ctz32(diff); qemu_set_irq(s->handler[bit], (level >> bit) & 1); } @@ -745,7 +745,7 @@ static void strongarm_ppc_handler_update(StrongARMPPCInfo *s) level = s->olevel & s->dir; for (diff = s->prev_level ^ level; diff; diff ^= 1 << bit) { - bit = ffs(diff) - 1; + bit = ctz32(diff); qemu_set_irq(s->handler[bit], (level >> bit) & 1); } diff --git a/hw/block/m25p80.c b/hw/block/m25p80.c index afe243b811..efc43dde6a 100644 --- a/hw/block/m25p80.c +++ b/hw/block/m25p80.c @@ -621,7 +621,6 @@ static int m25p80_init(SSISlave *ss) s->size = s->pi->sector_size * s->pi->n_sectors; s->dirty_page = -1; - s->storage = blk_blockalign(s->blk, s->size); /* FIXME use a qdev drive property instead of drive_get_next() */ dinfo = drive_get_next(IF_MTD); @@ -629,6 +628,9 @@ static int m25p80_init(SSISlave *ss) if (dinfo) { DB_PRINT_L(0, "Binding to IF_MTD drive\n"); s->blk = blk_by_legacy_dinfo(dinfo); + blk_attach_dev_nofail(s->blk, s); + + s->storage = blk_blockalign(s->blk, s->size); /* FIXME: Move to late init */ if (blk_read(s->blk, 0, s->storage, @@ -638,6 +640,7 @@ static int m25p80_init(SSISlave *ss) } } else { DB_PRINT_L(0, "No BDRV - binding to RAM\n"); + s->storage = blk_blockalign(NULL, s->size); memset(s->storage, 0xFF, s->size); } diff --git a/hw/bt/sdp.c b/hw/bt/sdp.c index 218e075df7..c903747952 100644 --- a/hw/bt/sdp.c +++ b/hw/bt/sdp.c @@ -707,7 +707,7 @@ static void sdp_service_record_build(struct sdp_service_record_s *record, len += sdp_attr_max_size(&def->attributes[record->attributes ++].data, &record->uuids); } - record->uuids = 1 << ffs(record->uuids - 1); + record->uuids = pow2ceil(record->uuids); record->attribute_list = g_malloc0(record->attributes * sizeof(*record->attribute_list)); record->uuid = diff --git a/hw/char/virtio-serial-bus.c b/hw/char/virtio-serial-bus.c index e336bdb4a9..6e2ad8221b 100644 --- a/hw/char/virtio-serial-bus.c +++ b/hw/char/virtio-serial-bus.c @@ -814,12 +814,12 @@ static uint32_t find_free_port_id(VirtIOSerial *vser) 
max_nr_ports = vser->serial.max_virtserial_ports; for (i = 0; i < (max_nr_ports + 31) / 32; i++) { - uint32_t map, bit; + uint32_t map, zeroes; map = vser->ports_map[i]; - bit = ffs(~map); - if (bit) { - return (bit - 1) + i * 32; + zeroes = ctz32(~map); + if (zeroes != 32) { + return zeroes + i * 32; } } return VIRTIO_CONSOLE_BAD_ID; diff --git a/hw/display/tc6393xb.c b/hw/display/tc6393xb.c index 4306adc959..66b7ade8da 100644 --- a/hw/display/tc6393xb.c +++ b/hw/display/tc6393xb.c @@ -171,7 +171,7 @@ static void tc6393xb_gpio_handler_update(TC6393xbState *s) level = s->gpio_level & s->gpio_dir; for (diff = s->prev_level ^ level; diff; diff ^= 1 << bit) { - bit = ffs(diff) - 1; + bit = ctz32(diff); qemu_set_irq(s->handler[bit], (level >> bit) & 1); } diff --git a/hw/gpio/max7310.c b/hw/gpio/max7310.c index 7fbf313ce8..2f59b134ee 100644 --- a/hw/gpio/max7310.c +++ b/hw/gpio/max7310.c @@ -96,7 +96,7 @@ static int max7310_tx(I2CSlave *i2c, uint8_t data) case 0x01: /* Output port */ for (diff = (data ^ s->level) & ~s->direction; diff; diff &= ~(1 << line)) { - line = ffs(diff) - 1; + line = ctz32(diff); if (s->handler[line]) qemu_set_irq(s->handler[line], (data >> line) & 1); } diff --git a/hw/gpio/omap_gpio.c b/hw/gpio/omap_gpio.c index 9a43486890..d92f8cfbae 100644 --- a/hw/gpio/omap_gpio.c +++ b/hw/gpio/omap_gpio.c @@ -125,8 +125,7 @@ static void omap_gpio_write(void *opaque, hwaddr addr, case 0x04: /* DATA_OUTPUT */ diff = (s->outputs ^ value) & ~s->dir; s->outputs = value; - while ((ln = ffs(diff))) { - ln --; + while ((ln = ctz32(diff)) != 32) { if (s->handler[ln]) qemu_set_irq(s->handler[ln], (value >> ln) & 1); diff &= ~(1 << ln); @@ -138,8 +137,7 @@ static void omap_gpio_write(void *opaque, hwaddr addr, s->dir = value; value = s->outputs & ~s->dir; - while ((ln = ffs(diff))) { - ln --; + while ((ln = ctz32(diff)) != 32) { if (s->handler[ln]) qemu_set_irq(s->handler[ln], (value >> ln) & 1); diff &= ~(1 << ln); @@ -253,8 +251,7 @@ static inline void omap2_gpio_module_out_update(struct omap2_gpio_s *s, s->outputs ^= diff; diff &= ~s->dir; - while ((ln = ffs(diff))) { - ln --; + while ((ln = ctz32(diff)) != 32) { qemu_set_irq(s->handler[ln], (s->outputs >> ln) & 1); diff &= ~(1 << ln); } @@ -442,8 +439,8 @@ static void omap2_gpio_module_write(void *opaque, hwaddr addr, s->dir = value; value = s->outputs & ~s->dir; - while ((ln = ffs(diff))) { - diff &= ~(1 <<-- ln); + while ((ln = ctz32(diff)) != 32) { + diff &= ~(1 << ln); qemu_set_irq(s->handler[ln], (value >> ln) & 1); } diff --git a/hw/gpio/zaurus.c b/hw/gpio/zaurus.c index 94083424f8..24a77272d7 100644 --- a/hw/gpio/zaurus.c +++ b/hw/gpio/zaurus.c @@ -65,7 +65,7 @@ static inline void scoop_gpio_handler_update(ScoopInfo *s) { level = s->gpio_level & s->gpio_dir; for (diff = s->prev_level ^ level; diff; diff ^= 1 << bit) { - bit = ffs(diff) - 1; + bit = ctz32(diff); qemu_set_irq(s->handler[bit], (level >> bit) & 1); } diff --git a/hw/i2c/omap_i2c.c b/hw/i2c/omap_i2c.c index d63278dbde..b6f544a221 100644 --- a/hw/i2c/omap_i2c.c +++ b/hw/i2c/omap_i2c.c @@ -171,9 +171,13 @@ static uint32_t omap_i2c_read(void *opaque, hwaddr addr) case 0x0c: /* I2C_IV */ if (s->revision >= OMAP2_INTR_REV) break; - ret = ffs(s->stat & s->mask); - if (ret) - s->stat ^= 1 << (ret - 1); + ret = ctz32(s->stat & s->mask); + if (ret != 32) { + s->stat ^= 1 << ret; + ret++; + } else { + ret = 0; + } omap_i2c_interrupts_update(s); return ret; diff --git a/hw/intc/allwinner-a10-pic.c b/hw/intc/allwinner-a10-pic.c index de820b9723..eed7621f13 100644 --- 
a/hw/intc/allwinner-a10-pic.c +++ b/hw/intc/allwinner-a10-pic.c @@ -23,7 +23,7 @@ static void aw_a10_pic_update(AwA10PICState *s) { uint8_t i; - int irq = 0, fiq = 0, pending; + int irq = 0, fiq = 0, zeroes; s->vector = 0; @@ -32,9 +32,9 @@ static void aw_a10_pic_update(AwA10PICState *s) fiq |= s->select[i] & s->irq_pending[i] & ~s->mask[i]; if (!s->vector) { - pending = ffs(s->irq_pending[i] & ~s->mask[i]); - if (pending) { - s->vector = (i * 32 + pending - 1) * 4; + zeroes = ctz32(s->irq_pending[i] & ~s->mask[i]); + if (zeroes != 32) { + s->vector = (i * 32 + zeroes) * 4; } } } diff --git a/hw/intc/omap_intc.c b/hw/intc/omap_intc.c index ad3931c112..e9b38a3c63 100644 --- a/hw/intc/omap_intc.c +++ b/hw/intc/omap_intc.c @@ -60,7 +60,7 @@ struct omap_intr_handler_s { static void omap_inth_sir_update(struct omap_intr_handler_s *s, int is_fiq) { - int i, j, sir_intr, p_intr, p, f; + int i, j, sir_intr, p_intr, p; uint32_t level; sir_intr = 0; p_intr = 255; @@ -72,14 +72,15 @@ static void omap_inth_sir_update(struct omap_intr_handler_s *s, int is_fiq) for (j = 0; j < s->nbanks; ++j) { level = s->bank[j].irqs & ~s->bank[j].mask & (is_fiq ? s->bank[j].fiq : ~s->bank[j].fiq); - for (f = ffs(level), i = f - 1, level >>= f - 1; f; i += f, - level >>= f) { + + while (level != 0) { + i = ctz32(level); p = s->bank[j].priority[i]; if (p <= p_intr) { p_intr = p; sir_intr = 32 * j + i; } - f = ffs(level >> 1); + level &= level - 1; } } s->sir_intr[is_fiq] = sir_intr; diff --git a/hw/pci-host/bonito.c b/hw/pci-host/bonito.c index 8134d0bcd0..3a731fe18d 100644 --- a/hw/pci-host/bonito.c +++ b/hw/pci-host/bonito.c @@ -427,7 +427,7 @@ static uint32_t bonito_sbridge_pciaddr(void *opaque, hwaddr addr) cfgaddr |= (s->regs[BONITO_PCIMAP_CFG] & 0xffff) << 16; idsel = (cfgaddr & BONITO_PCICONF_IDSEL_MASK) >> BONITO_PCICONF_IDSEL_OFFSET; - devno = ffs(idsel) - 1; + devno = ctz32(idsel); funno = (cfgaddr & BONITO_PCICONF_FUN_MASK) >> BONITO_PCICONF_FUN_OFFSET; regno = (cfgaddr & BONITO_PCICONF_REG_MASK) >> BONITO_PCICONF_REG_OFFSET; diff --git a/hw/pci-host/uninorth.c b/hw/pci-host/uninorth.c index 53f2b59ae8..f0144eb7b0 100644 --- a/hw/pci-host/uninorth.c +++ b/hw/pci-host/uninorth.c @@ -92,7 +92,10 @@ static uint32_t unin_get_config_reg(uint32_t reg, uint32_t addr) uint32_t slot, func; /* Grab CFA0 style values */ - slot = ffs(reg & 0xfffff800) - 1; + slot = ctz32(reg & 0xfffff800); + if (slot == 32) { + slot = -1; /* XXX: should this be 0? */ + } func = (reg >> 8) & 7; /* ... 
and then convert them to x86 format */ diff --git a/hw/pci/msi.c b/hw/pci/msi.c index 916e1a1e5b..2949938223 100644 --- a/hw/pci/msi.c +++ b/hw/pci/msi.c @@ -72,7 +72,7 @@ static inline uint8_t msi_cap_sizeof(uint16_t flags) static inline unsigned int msi_nr_vectors(uint16_t flags) { return 1U << - ((flags & PCI_MSI_FLAGS_QSIZE) >> (ffs(PCI_MSI_FLAGS_QSIZE) - 1)); + ((flags & PCI_MSI_FLAGS_QSIZE) >> ctz32(PCI_MSI_FLAGS_QSIZE)); } static inline uint8_t msi_flags_off(const PCIDevice* dev) @@ -175,9 +175,9 @@ int msi_init(struct PCIDevice *dev, uint8_t offset, assert(nr_vectors > 0); assert(nr_vectors <= PCI_MSI_VECTORS_MAX); /* the nr of MSI vectors is up to 32 */ - vectors_order = ffs(nr_vectors) - 1; + vectors_order = ctz32(nr_vectors); - flags = vectors_order << (ffs(PCI_MSI_FLAGS_QMASK) - 1); + flags = vectors_order << ctz32(PCI_MSI_FLAGS_QMASK); if (msi64bit) { flags |= PCI_MSI_FLAGS_64BIT; } @@ -355,12 +355,12 @@ void msi_write_config(PCIDevice *dev, uint32_t addr, uint32_t val, int len) * just don't crash the host */ log_num_vecs = - (flags & PCI_MSI_FLAGS_QSIZE) >> (ffs(PCI_MSI_FLAGS_QSIZE) - 1); + (flags & PCI_MSI_FLAGS_QSIZE) >> ctz32(PCI_MSI_FLAGS_QSIZE); log_max_vecs = - (flags & PCI_MSI_FLAGS_QMASK) >> (ffs(PCI_MSI_FLAGS_QMASK) - 1); + (flags & PCI_MSI_FLAGS_QMASK) >> ctz32(PCI_MSI_FLAGS_QMASK); if (log_num_vecs > log_max_vecs) { flags &= ~PCI_MSI_FLAGS_QSIZE; - flags |= log_max_vecs << (ffs(PCI_MSI_FLAGS_QSIZE) - 1); + flags |= log_max_vecs << ctz32(PCI_MSI_FLAGS_QSIZE); pci_set_word(dev->config + msi_flags_off(dev), flags); } diff --git a/hw/pci/pcie_aer.c b/hw/pci/pcie_aer.c index eaa3e6ea94..b48c09cd11 100644 --- a/hw/pci/pcie_aer.c +++ b/hw/pci/pcie_aer.c @@ -410,7 +410,7 @@ static void pcie_aer_msg(PCIDevice *dev, const PCIEAERMsg *msg) static void pcie_aer_update_log(PCIDevice *dev, const PCIEAERErr *err) { uint8_t *aer_cap = dev->config + dev->exp.aer_cap; - uint8_t first_bit = ffs(err->status) - 1; + uint8_t first_bit = ctz32(err->status); uint32_t errcap = pci_get_long(aer_cap + PCI_ERR_CAP); int i; diff --git a/hw/pci/shpc.c b/hw/pci/shpc.c index 759910f79a..a706486394 100644 --- a/hw/pci/shpc.c +++ b/hw/pci/shpc.c @@ -61,7 +61,7 @@ /* Same slot state masks are used for command and status registers */ #define SHPC_SLOT_STATE_MASK 0x03 #define SHPC_SLOT_STATE_SHIFT \ - (ffs(SHPC_SLOT_STATE_MASK) - 1) + ctz32(SHPC_SLOT_STATE_MASK) #define SHPC_STATE_NO 0x0 #define SHPC_STATE_PWRONLY 0x1 @@ -70,10 +70,10 @@ #define SHPC_SLOT_PWR_LED_MASK 0xC #define SHPC_SLOT_PWR_LED_SHIFT \ - (ffs(SHPC_SLOT_PWR_LED_MASK) - 1) + ctz32(SHPC_SLOT_PWR_LED_MASK) #define SHPC_SLOT_ATTN_LED_MASK 0x30 #define SHPC_SLOT_ATTN_LED_SHIFT \ - (ffs(SHPC_SLOT_ATTN_LED_MASK) - 1) + ctz32(SHPC_SLOT_ATTN_LED_MASK) #define SHPC_LED_NO 0x0 #define SHPC_LED_ON 0x1 @@ -136,7 +136,7 @@ static int roundup_pow_of_two(int x) static uint16_t shpc_get_status(SHPCDevice *shpc, int slot, uint16_t msk) { uint8_t *status = shpc->config + SHPC_SLOT_STATUS(slot); - return (pci_get_word(status) & msk) >> (ffs(msk) - 1); + return (pci_get_word(status) & msk) >> ctz32(msk); } static void shpc_set_status(SHPCDevice *shpc, @@ -144,7 +144,7 @@ static void shpc_set_status(SHPCDevice *shpc, { uint8_t *status = shpc->config + SHPC_SLOT_STATUS(slot); pci_word_test_and_clear_mask(status, msk); - pci_word_test_and_set_mask(status, value << (ffs(msk) - 1)); + pci_word_test_and_set_mask(status, value << ctz32(msk)); } static void shpc_interrupt_update(PCIDevice *d) diff --git a/hw/pci/slotid_cap.c b/hw/pci/slotid_cap.c index 
62f7bae2f1..1c01d346c9 100644 --- a/hw/pci/slotid_cap.c +++ b/hw/pci/slotid_cap.c @@ -3,7 +3,7 @@ #include "qemu/error-report.h" #define SLOTID_CAP_LENGTH 4 -#define SLOTID_NSLOTS_SHIFT (ffs(PCI_SID_ESR_NSLOTS) - 1) +#define SLOTID_NSLOTS_SHIFT ctz32(PCI_SID_ESR_NSLOTS) int slotid_cap_init(PCIDevice *d, int nslots, uint8_t chassis, diff --git a/hw/ppc/ppce500_spin.c b/hw/ppc/ppce500_spin.c index d49f2b8803..a99f7b0397 100644 --- a/hw/ppc/ppce500_spin.c +++ b/hw/ppc/ppce500_spin.c @@ -74,7 +74,7 @@ static void spin_reset(void *opaque) /* Create -kernel TLB entries for BookE, linearly spanning 256MB. */ static inline hwaddr booke206_page_size_to_tlb(uint64_t size) { - return (ffs(size >> 10) - 1) >> 1; + return ctz32(size >> 10) >> 1; } static void mmubooke_create_initial_mapping(CPUPPCState *env, diff --git a/hw/scsi/megasas.c b/hw/scsi/megasas.c index ad7317bfe9..91a5d97c73 100644 --- a/hw/scsi/megasas.c +++ b/hw/scsi/megasas.c @@ -804,7 +804,7 @@ static int megasas_ctrl_get_info(MegasasState *s, MegasasCmd *cmd) MFI_INFO_LDOPS_READ_POLICY); info.max_strips_per_io = cpu_to_le16(s->fw_sge); info.stripe_sz_ops.min = 3; - info.stripe_sz_ops.max = ffs(MEGASAS_MAX_SECTORS + 1) - 1; + info.stripe_sz_ops.max = ctz32(MEGASAS_MAX_SECTORS + 1); info.properties.pred_fail_poll_interval = cpu_to_le16(300); info.properties.intr_throttle_cnt = cpu_to_le16(16); info.properties.intr_throttle_timeout = cpu_to_le16(50); diff --git a/hw/sd/sd.c b/hw/sd/sd.c index f955265f74..8abf0c9e31 100644 --- a/hw/sd/sd.c +++ b/hw/sd/sd.c @@ -796,8 +796,9 @@ static sd_rsp_type_t sd_normal_command(SDState *sd, sd->vhs = 0; /* No response if not exactly one VHS bit is set. */ - if (!(req.arg >> 8) || (req.arg >> ffs(req.arg & ~0xff))) + if (!(req.arg >> 8) || (req.arg >> (ctz32(req.arg & ~0xff) + 1))) { return sd->spi ? sd_r7 : sd_r0; + } /* Accept. */ sd->vhs = req.arg; diff --git a/include/block/aio.h b/include/block/aio.h index 7d1e26b33b..d2bb423de1 100644 --- a/include/block/aio.h +++ b/include/block/aio.h @@ -82,9 +82,6 @@ struct AioContext { /* Used for aio_notify. */ EventNotifier notifier; - /* GPollFDs for aio_poll() */ - GArray *pollfds; - /* Thread pool for performing work and receiving completion callbacks */ struct ThreadPool *thread_pool; @@ -121,13 +118,14 @@ void aio_context_ref(AioContext *ctx); void aio_context_unref(AioContext *ctx); /* Take ownership of the AioContext. If the AioContext will be shared between - * threads, a thread must have ownership when calling aio_poll(). + * threads, and a thread does not want to be interrupted, it will have to + * take ownership around calls to aio_poll(). Otherwise, aio_poll() + * automatically takes care of calling aio_context_acquire and + * aio_context_release. * - * Note that multiple threads calling aio_poll() means timers, BHs, and - * callbacks may be invoked from a different thread than they were registered - * from. Therefore, code must use AioContext acquire/release or use - * fine-grained synchronization to protect shared state if other threads will - * be accessing it simultaneously. + * Access to timers and BHs from a thread that has not acquired AioContext + * is possible. Access to callbacks for now must be done while the AioContext + * is owned by the thread (FIXME). 
*/ void aio_context_acquire(AioContext *ctx); diff --git a/include/block/block.h b/include/block/block.h index 4c57d63fe2..7d1a7174f6 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -382,7 +382,7 @@ void bdrv_lock_medium(BlockDriverState *bs, bool locked); void bdrv_eject(BlockDriverState *bs, bool eject_flag); const char *bdrv_get_format_name(BlockDriverState *bs); BlockDriverState *bdrv_find_node(const char *node_name); -BlockDeviceInfoList *bdrv_named_nodes_list(void); +BlockDeviceInfoList *bdrv_named_nodes_list(Error **errp); BlockDriverState *bdrv_lookup_bs(const char *device, const char *node_name, Error **errp); @@ -398,6 +398,7 @@ void bdrv_iterate_format(void (*it)(void *opaque, const char *name), void *opaque); const char *bdrv_get_node_name(const BlockDriverState *bs); const char *bdrv_get_device_name(const BlockDriverState *bs); +const char *bdrv_get_device_or_node_name(const BlockDriverState *bs); int bdrv_get_flags(BlockDriverState *bs); int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num, const uint8_t *buf, int nb_sectors); @@ -449,18 +450,39 @@ bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov); struct HBitmapIter; typedef struct BdrvDirtyBitmap BdrvDirtyBitmap; -BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity, +BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, + uint32_t granularity, + const char *name, Error **errp); +int bdrv_dirty_bitmap_create_successor(BlockDriverState *bs, + BdrvDirtyBitmap *bitmap, + Error **errp); +BdrvDirtyBitmap *bdrv_dirty_bitmap_abdicate(BlockDriverState *bs, + BdrvDirtyBitmap *bitmap, + Error **errp); +BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap(BlockDriverState *bs, + BdrvDirtyBitmap *bitmap, + Error **errp); +BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs, + const char *name); +void bdrv_dirty_bitmap_make_anon(BdrvDirtyBitmap *bitmap); void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap); +void bdrv_disable_dirty_bitmap(BdrvDirtyBitmap *bitmap); +void bdrv_enable_dirty_bitmap(BdrvDirtyBitmap *bitmap); BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs); +uint32_t bdrv_get_default_bitmap_granularity(BlockDriverState *bs); +uint32_t bdrv_dirty_bitmap_granularity(BdrvDirtyBitmap *bitmap); +bool bdrv_dirty_bitmap_enabled(BdrvDirtyBitmap *bitmap); +bool bdrv_dirty_bitmap_frozen(BdrvDirtyBitmap *bitmap); int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector); -void bdrv_set_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, +void bdrv_set_dirty_bitmap(BdrvDirtyBitmap *bitmap, int64_t cur_sector, int nr_sectors); -void bdrv_reset_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, +void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap, int64_t cur_sector, int nr_sectors); -void bdrv_dirty_iter_init(BlockDriverState *bs, - BdrvDirtyBitmap *bitmap, struct HBitmapIter *hbi); -int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap); +void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap); +void bdrv_dirty_iter_init(BdrvDirtyBitmap *bitmap, struct HBitmapIter *hbi); +void bdrv_set_dirty_iter(struct HBitmapIter *hbi, int64_t offset); +int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap); void bdrv_enable_copy_on_read(BlockDriverState *bs); void bdrv_disable_copy_on_read(BlockDriverState *bs); diff --git a/include/block/block_int.h b/include/block/block_int.h index dccb092df7..db29b7424e 100644 --- a/include/block/block_int.h +++ 
b/include/block/block_int.h @@ -439,6 +439,14 @@ extern BlockDriver bdrv_file; extern BlockDriver bdrv_raw; extern BlockDriver bdrv_qcow2; +/** + * bdrv_setup_io_funcs: + * + * Prepare a #BlockDriver for I/O request processing by populating + * unimplemented coroutine and AIO interfaces with generic wrapper functions + * that fall back to implemented interfaces. + */ +void bdrv_setup_io_funcs(BlockDriver *bdrv); int get_tmp_filename(char *filename, int size); BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size, @@ -590,7 +598,7 @@ void commit_active_start(BlockDriverState *bs, BlockDriverState *base, */ void mirror_start(BlockDriverState *bs, BlockDriverState *target, const char *replaces, - int64_t speed, int64_t granularity, int64_t buf_size, + int64_t speed, uint32_t granularity, int64_t buf_size, MirrorSyncMode mode, BlockdevOnError on_source_error, BlockdevOnError on_target_error, BlockCompletionFunc *cb, @@ -602,6 +610,7 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target, * @target: Block device to write to. * @speed: The maximum speed, in bytes per second, or 0 for unlimited. * @sync_mode: What parts of the disk image should be copied to the destination. + * @sync_bitmap: The dirty bitmap if sync_mode is MIRROR_SYNC_MODE_DIRTY_BITMAP. * @on_source_error: The action to take upon error reading from the source. * @on_target_error: The action to take upon error writing to the target. * @cb: Completion function for the job. @@ -612,6 +621,7 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target, */ void backup_start(BlockDriverState *bs, BlockDriverState *target, int64_t speed, MirrorSyncMode sync_mode, + BdrvDirtyBitmap *sync_bitmap, BlockdevOnError on_source_error, BlockdevOnError on_target_error, BlockCompletionFunc *cb, void *opaque, @@ -624,4 +634,8 @@ bool blk_dev_is_tray_open(BlockBackend *blk); bool blk_dev_is_medium_locked(BlockBackend *blk); void blk_dev_resize_cb(BlockBackend *blk); +void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors); +void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, + int nr_sectors); + #endif /* BLOCK_INT_H */ diff --git a/include/block/blockjob.h b/include/block/blockjob.h index b6d4ebbe03..57d8ef13e2 100644 --- a/include/block/blockjob.h +++ b/include/block/blockjob.h @@ -79,10 +79,16 @@ struct BlockJob { bool cancelled; /** - * Set to true if the job is either paused, or will pause itself - * as soon as possible (if busy == true). + * Counter for pause request. If non-zero, the block job is either paused, + * or if busy == true will pause itself as soon as possible. */ - bool paused; + int pause_count; + + /** + * Set to true if the job is paused by user. Can be unpaused with the + * block-job-resume QMP command. + */ + bool user_paused; /** * Set to false by the job while it is in a quiescent state, where @@ -225,11 +231,19 @@ void block_job_pause(BlockJob *job); * block_job_resume: * @job: The job to be resumed. * - * Resume the specified job. + * Resume the specified job. Must be paired with a preceding block_job_pause. */ void block_job_resume(BlockJob *job); /** + * block_job_enter: + * @job: The job to enter. + * + * Continue the specified job by entering the coroutine. + */ +void block_job_enter(BlockJob *job); + +/** * block_job_event_cancelled: * @job: The job whose information is requested. 
* diff --git a/include/block/qapi.h b/include/block/qapi.h index 168d788521..327549d917 100644 --- a/include/block/qapi.h +++ b/include/block/qapi.h @@ -29,7 +29,7 @@ #include "block/block.h" #include "block/snapshot.h" -BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs); +BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs, Error **errp); int bdrv_query_snapshot_info_list(BlockDriverState *bs, SnapshotInfoList **p_list, Error **errp); diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h index b97c2956ec..d4ffead48a 100644 --- a/include/hw/pci/pci.h +++ b/include/hw/pci/pci.h @@ -568,7 +568,7 @@ static inline void pci_set_byte_by_mask(uint8_t *config, uint8_t mask, uint8_t reg) { uint8_t val = pci_get_byte(config); - uint8_t rval = reg << (ffs(mask) - 1); + uint8_t rval = reg << ctz32(mask); pci_set_byte(config, (~mask & val) | (mask & rval)); } @@ -576,14 +576,14 @@ static inline uint8_t pci_get_byte_by_mask(uint8_t *config, uint8_t mask) { uint8_t val = pci_get_byte(config); - return (val & mask) >> (ffs(mask) - 1); + return (val & mask) >> ctz32(mask); } static inline void pci_set_word_by_mask(uint8_t *config, uint16_t mask, uint16_t reg) { uint16_t val = pci_get_word(config); - uint16_t rval = reg << (ffs(mask) - 1); + uint16_t rval = reg << ctz32(mask); pci_set_word(config, (~mask & val) | (mask & rval)); } @@ -591,14 +591,14 @@ static inline uint16_t pci_get_word_by_mask(uint8_t *config, uint16_t mask) { uint16_t val = pci_get_word(config); - return (val & mask) >> (ffs(mask) - 1); + return (val & mask) >> ctz32(mask); } static inline void pci_set_long_by_mask(uint8_t *config, uint32_t mask, uint32_t reg) { uint32_t val = pci_get_long(config); - uint32_t rval = reg << (ffs(mask) - 1); + uint32_t rval = reg << ctz32(mask); pci_set_long(config, (~mask & val) | (mask & rval)); } @@ -606,14 +606,14 @@ static inline uint32_t pci_get_long_by_mask(uint8_t *config, uint32_t mask) { uint32_t val = pci_get_long(config); - return (val & mask) >> (ffs(mask) - 1); + return (val & mask) >> ctz32(mask); } static inline void pci_set_quad_by_mask(uint8_t *config, uint64_t mask, uint64_t reg) { uint64_t val = pci_get_quad(config); - uint64_t rval = reg << (ffs(mask) - 1); + uint64_t rval = reg << ctz32(mask); pci_set_quad(config, (~mask & val) | (mask & rval)); } @@ -621,7 +621,7 @@ static inline uint64_t pci_get_quad_by_mask(uint8_t *config, uint64_t mask) { uint64_t val = pci_get_quad(config); - return (val & mask) >> (ffs(mask) - 1); + return (val & mask) >> ctz32(mask); } PCIDevice *pci_create_multifunction(PCIBus *bus, int devfn, bool multifunction, diff --git a/include/hw/pci/pcie_regs.h b/include/hw/pci/pcie_regs.h index 848ab1c206..6a28b33e69 100644 --- a/include/hw/pci/pcie_regs.h +++ b/include/hw/pci/pcie_regs.h @@ -27,34 +27,34 @@ /* PCI_EXP_FLAGS */ #define PCI_EXP_FLAGS_VER2 2 /* for now, supports only ver. 
2 */ -#define PCI_EXP_FLAGS_IRQ_SHIFT (ffs(PCI_EXP_FLAGS_IRQ) - 1) -#define PCI_EXP_FLAGS_TYPE_SHIFT (ffs(PCI_EXP_FLAGS_TYPE) - 1) +#define PCI_EXP_FLAGS_IRQ_SHIFT ctz32(PCI_EXP_FLAGS_IRQ) +#define PCI_EXP_FLAGS_TYPE_SHIFT ctz32(PCI_EXP_FLAGS_TYPE) /* PCI_EXP_LINK{CAP, STA} */ /* link speed */ #define PCI_EXP_LNK_LS_25 1 -#define PCI_EXP_LNK_MLW_SHIFT (ffs(PCI_EXP_LNKCAP_MLW) - 1) +#define PCI_EXP_LNK_MLW_SHIFT ctz32(PCI_EXP_LNKCAP_MLW) #define PCI_EXP_LNK_MLW_1 (1 << PCI_EXP_LNK_MLW_SHIFT) /* PCI_EXP_LINKCAP */ -#define PCI_EXP_LNKCAP_ASPMS_SHIFT (ffs(PCI_EXP_LNKCAP_ASPMS) - 1) +#define PCI_EXP_LNKCAP_ASPMS_SHIFT ctz32(PCI_EXP_LNKCAP_ASPMS) #define PCI_EXP_LNKCAP_ASPMS_0S (1 << PCI_EXP_LNKCAP_ASPMS_SHIFT) -#define PCI_EXP_LNKCAP_PN_SHIFT (ffs(PCI_EXP_LNKCAP_PN) - 1) +#define PCI_EXP_LNKCAP_PN_SHIFT ctz32(PCI_EXP_LNKCAP_PN) -#define PCI_EXP_SLTCAP_PSN_SHIFT (ffs(PCI_EXP_SLTCAP_PSN) - 1) +#define PCI_EXP_SLTCAP_PSN_SHIFT ctz32(PCI_EXP_SLTCAP_PSN) #define PCI_EXP_SLTCTL_IND_RESERVED 0x0 #define PCI_EXP_SLTCTL_IND_ON 0x1 #define PCI_EXP_SLTCTL_IND_BLINK 0x2 #define PCI_EXP_SLTCTL_IND_OFF 0x3 -#define PCI_EXP_SLTCTL_AIC_SHIFT (ffs(PCI_EXP_SLTCTL_AIC) - 1) +#define PCI_EXP_SLTCTL_AIC_SHIFT ctz32(PCI_EXP_SLTCTL_AIC) #define PCI_EXP_SLTCTL_AIC_OFF \ (PCI_EXP_SLTCTL_IND_OFF << PCI_EXP_SLTCTL_AIC_SHIFT) -#define PCI_EXP_SLTCTL_PIC_SHIFT (ffs(PCI_EXP_SLTCTL_PIC) - 1) +#define PCI_EXP_SLTCTL_PIC_SHIFT ctz32(PCI_EXP_SLTCTL_PIC) #define PCI_EXP_SLTCTL_PIC_OFF \ (PCI_EXP_SLTCTL_IND_OFF << PCI_EXP_SLTCTL_PIC_SHIFT) #define PCI_EXP_SLTCTL_PIC_ON \ @@ -109,7 +109,7 @@ #define PCI_ERR_ROOT_IRQ_MAX 32 #define PCI_ERR_ROOT_IRQ 0xf8000000 -#define PCI_ERR_ROOT_IRQ_SHIFT (ffs(PCI_ERR_ROOT_IRQ) - 1) +#define PCI_ERR_ROOT_IRQ_SHIFT ctz32(PCI_ERR_ROOT_IRQ) #define PCI_ERR_ROOT_STATUS_REPORT_MASK (PCI_ERR_ROOT_COR_RCV | \ PCI_ERR_ROOT_MULTI_COR_RCV | \ PCI_ERR_ROOT_UNCOR_RCV | \ diff --git a/include/qapi/qmp/qerror.h b/include/qapi/qmp/qerror.h index 57a62d4b76..e5673394d3 100644 --- a/include/qapi/qmp/qerror.h +++ b/include/qapi/qmp/qerror.h @@ -37,9 +37,6 @@ void qerror_report_err(Error *err); #define QERR_BASE_NOT_FOUND \ ERROR_CLASS_GENERIC_ERROR, "Base '%s' not found" -#define QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED \ - ERROR_CLASS_GENERIC_ERROR, "Block format '%s' used by device '%s' does not support feature '%s'" - #define QERR_BLOCK_JOB_NOT_READY \ ERROR_CLASS_GENERIC_ERROR, "The active block job for device '%s' cannot be completed" @@ -58,9 +55,6 @@ void qerror_report_err(Error *err); #define QERR_DEVICE_IN_USE \ ERROR_CLASS_GENERIC_ERROR, "Device '%s' is in use" -#define QERR_DEVICE_IS_READ_ONLY \ - ERROR_CLASS_GENERIC_ERROR, "Device '%s' is read only" - #define QERR_DEVICE_NO_HOTPLUG \ ERROR_CLASS_GENERIC_ERROR, "Device '%s' does not support hotplugging" diff --git a/include/qemu/hbitmap.h b/include/qemu/hbitmap.h index 550d7ce2c3..f0a85f8649 100644 --- a/include/qemu/hbitmap.h +++ b/include/qemu/hbitmap.h @@ -65,6 +65,29 @@ struct HBitmapIter { HBitmap *hbitmap_alloc(uint64_t size, int granularity); /** + * hbitmap_truncate: + * @hb: The bitmap to change the size of. + * @size: The number of elements to change the bitmap to accommodate. + * + * truncate or grow an existing bitmap to accommodate a new number of elements. + * This may invalidate existing HBitmapIterators. + */ +void hbitmap_truncate(HBitmap *hb, uint64_t size); + +/** + * hbitmap_merge: + * @a: The bitmap to store the result in. + * @b: The bitmap to merge into @a. 
+ * @return true if the merge was successful, + * false if it was not attempted. + * + * Merge two bitmaps together. + * A := A (BITOR) B. + * B is left unmodified. + */ +bool hbitmap_merge(HBitmap *a, const HBitmap *b); + +/** * hbitmap_empty: * @hb: HBitmap to operate on. * diff --git a/include/standard-headers/linux/virtio_blk.h b/include/standard-headers/linux/virtio_blk.h index 12016b47f3..cd601f4069 100644 --- a/include/standard-headers/linux/virtio_blk.h +++ b/include/standard-headers/linux/virtio_blk.h @@ -58,7 +58,7 @@ struct virtio_blk_config { uint32_t size_max; /* The maximum number of segments (if VIRTIO_BLK_F_SEG_MAX) */ uint32_t seg_max; - /* geometry the device (if VIRTIO_BLK_F_GEOMETRY) */ + /* geometry of the device (if VIRTIO_BLK_F_GEOMETRY) */ struct virtio_blk_geometry { uint16_t cylinders; uint8_t heads; @@ -117,7 +117,11 @@ struct virtio_blk_config { #define VIRTIO_BLK_T_BARRIER 0x80000000 #endif /* !VIRTIO_BLK_NO_LEGACY */ -/* This is the first element of the read scatter-gather list. */ +/* + * This comes first in the read scatter-gather list. + * For legacy virtio, if VIRTIO_F_ANY_LAYOUT is not negotiated, + * this is the first element of the read scatter-gather list. + */ struct virtio_blk_outhdr { /* VIRTIO_BLK_T* */ __virtio32 type; diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h index 77e9b9c370..b4a4d5e0b9 100644 --- a/include/sysemu/block-backend.h +++ b/include/sysemu/block-backend.h @@ -87,6 +87,8 @@ int blk_read_unthrottled(BlockBackend *blk, int64_t sector_num, uint8_t *buf, int nb_sectors); int blk_write(BlockBackend *blk, int64_t sector_num, const uint8_t *buf, int nb_sectors); +int blk_write_zeroes(BlockBackend *blk, int64_t sector_num, + int nb_sectors, BdrvRequestFlags flags); BlockAIOCB *blk_aio_write_zeroes(BlockBackend *blk, int64_t sector_num, int nb_sectors, BdrvRequestFlags flags, BlockCompletionFunc *cb, void *opaque); diff --git a/include/sysemu/os-win32.h b/include/sysemu/os-win32.h index 9cc9e08139..4035c4fe54 100644 --- a/include/sysemu/os-win32.h +++ b/include/sysemu/os-win32.h @@ -72,9 +72,6 @@ #define sigsetjmp(env, savemask) setjmp(env) #define siglongjmp(env, val) longjmp(env, val) -/* Declaration of ffs() is missing in MinGW's strings.h. */ -int ffs(int i); - /* Missing POSIX functions. Don't use MinGW-w64 macros. 
*/ #undef gmtime_r struct tm *gmtime_r(const time_t *timep, struct tm *result); diff --git a/iothread.c b/iothread.c index 342a23fcb0..a1f91099bc 100644 --- a/iothread.c +++ b/iothread.c @@ -31,21 +31,14 @@ typedef ObjectClass IOThreadClass; static void *iothread_run(void *opaque) { IOThread *iothread = opaque; - bool blocking; qemu_mutex_lock(&iothread->init_done_lock); iothread->thread_id = qemu_get_thread_id(); qemu_cond_signal(&iothread->init_done_cond); qemu_mutex_unlock(&iothread->init_done_lock); - while (!iothread->stopping) { - aio_context_acquire(iothread->ctx); - blocking = true; - while (!iothread->stopping && aio_poll(iothread->ctx, blocking)) { - /* Progress was made, keep going */ - blocking = false; - } - aio_context_release(iothread->ctx); + while (!atomic_read(&iothread->stopping)) { + aio_poll(iothread->ctx, true); } return NULL; } @@ -1141,18 +1141,18 @@ static int kvm_irqchip_get_virq(KVMState *s) { uint32_t *word = s->used_gsi_bitmap; int max_words = ALIGN(s->gsi_count, 32) / 32; - int i, bit; + int i, zeroes; bool retry = true; again: /* Return the lowest unused GSI in the bitmap */ for (i = 0; i < max_words; i++) { - bit = ffs(~word[i]); - if (!bit) { + zeroes = ctz32(~word[i]); + if (zeroes == 32) { continue; } - return bit - 1 + i * 32; + return zeroes + i * 32; } if (!s->direct_msi && retry) { retry = false; diff --git a/migration/block.c b/migration/block.c index 085c0fae05..ddb59ccf87 100644 --- a/migration/block.c +++ b/migration/block.c @@ -304,7 +304,7 @@ static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds) blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov, nr_sectors, blk_mig_read_cb, blk); - bdrv_reset_dirty_bitmap(bs, bmds->dirty_bitmap, cur_sector, nr_sectors); + bdrv_reset_dirty_bitmap(bmds->dirty_bitmap, cur_sector, nr_sectors); qemu_mutex_unlock_iothread(); bmds->cur_sector = cur_sector + nr_sectors; @@ -320,7 +320,7 @@ static int set_dirty_tracking(void) QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { bmds->dirty_bitmap = bdrv_create_dirty_bitmap(bmds->bs, BLOCK_SIZE, - NULL); + NULL, NULL); if (!bmds->dirty_bitmap) { ret = -errno; goto fail; @@ -497,8 +497,7 @@ static int mig_save_device_dirty(QEMUFile *f, BlkMigDevState *bmds, g_free(blk); } - bdrv_reset_dirty_bitmap(bmds->bs, bmds->dirty_bitmap, sector, - nr_sectors); + bdrv_reset_dirty_bitmap(bmds->dirty_bitmap, sector, nr_sectors); break; } sector += BDRV_SECTORS_PER_DIRTY_CHUNK; @@ -584,7 +583,7 @@ static int64_t get_remaining_dirty(void) int64_t dirty = 0; QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { - dirty += bdrv_get_dirty_count(bmds->bs, bmds->dirty_bitmap); + dirty += bdrv_get_dirty_count(bmds->dirty_bitmap); } return dirty << BDRV_SECTOR_BITS; diff --git a/qapi/block-core.json b/qapi/block-core.json index 78730846c2..1c17224c77 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -330,14 +330,19 @@ # # Block dirty bitmap information. # +# @name: #optional the name of the dirty bitmap (Since 2.4) +# # @count: number of dirty bytes according to the dirty bitmap # # @granularity: granularity of the dirty bitmap in bytes (since 1.4) # +# @frozen: whether the dirty bitmap is frozen (Since 2.4) +# # Since: 1.3 ## { 'type': 'BlockDirtyInfo', - 'data': {'count': 'int', 'granularity': 'int'} } + 'data': {'*name': 'str', 'count': 'int', 'granularity': 'uint32', + 'frozen': 'bool'} } ## # @BlockInfo: @@ -510,10 +515,12 @@ # # @none: only copy data written from now on # +# @dirty-bitmap: only copy data described by the dirty bitmap. 
Since: 2.4 +# # Since: 1.3 ## { 'enum': 'MirrorSyncMode', - 'data': ['top', 'full', 'none'] } + 'data': ['top', 'full', 'none', 'dirty-bitmap'] } ## # @BlockJobType: @@ -688,14 +695,18 @@ # probe if @mode is 'existing', else the format of the source # # @sync: what parts of the disk image should be copied to the destination -# (all the disk, only the sectors allocated in the topmost image, or -# only new I/O). +# (all the disk, only the sectors allocated in the topmost image, from a +# dirty bitmap, or only new I/O). # # @mode: #optional whether and how QEMU should create a new image, default is # 'absolute-paths'. # # @speed: #optional the maximum speed, in bytes per second # +# @bitmap: #optional the name of dirty bitmap if sync is "dirty-bitmap". +# Must be present if sync is "dirty-bitmap", must NOT be present +# otherwise. (Since 2.4) +# # @on-source-error: #optional the action to take on an error on the source, # default 'report'. 'stop' and 'enospc' can only be used # if the block device supports io-status (see BlockInfo). @@ -713,7 +724,7 @@ { 'type': 'DriveBackup', 'data': { 'device': 'str', 'target': 'str', '*format': 'str', 'sync': 'MirrorSyncMode', '*mode': 'NewImageMode', - '*speed': 'int', + '*speed': 'int', '*bitmap': 'str', '*on-source-error': 'BlockdevOnError', '*on-target-error': 'BlockdevOnError' } } @@ -958,6 +969,76 @@ '*on-target-error': 'BlockdevOnError' } } ## +# @BlockDirtyBitmap +# +# @node: name of device/node which the bitmap is tracking +# +# @name: name of the dirty bitmap +# +# Since 2.4 +## +{ 'type': 'BlockDirtyBitmap', + 'data': { 'node': 'str', 'name': 'str' } } + +## +# @BlockDirtyBitmapAdd +# +# @node: name of device/node which the bitmap is tracking +# +# @name: name of the dirty bitmap +# +# @granularity: #optional the bitmap granularity, default is 64k for +# block-dirty-bitmap-add +# +# Since 2.4 +## +{ 'type': 'BlockDirtyBitmapAdd', + 'data': { 'node': 'str', 'name': 'str', '*granularity': 'uint32' } } + +## +# @block-dirty-bitmap-add +# +# Create a dirty bitmap with a name on the node +# +# Returns: nothing on success +# If @node is not a valid block device or node, DeviceNotFound +# If @name is already taken, GenericError with an explanation +# +# Since 2.4 +## +{ 'command': 'block-dirty-bitmap-add', + 'data': 'BlockDirtyBitmapAdd' } + +## +# @block-dirty-bitmap-remove +# +# Remove a dirty bitmap on the node +# +# Returns: nothing on success +# If @node is not a valid block device or node, DeviceNotFound +# If @name is not found, GenericError with an explanation +# if @name is frozen by an operation, GenericError +# +# Since 2.4 +## +{ 'command': 'block-dirty-bitmap-remove', + 'data': 'BlockDirtyBitmap' } + +## +# @block-dirty-bitmap-clear +# +# Clear (reset) a dirty bitmap on the device +# +# Returns: nothing on success +# If @node is not a valid block device, DeviceNotFound +# If @name is not found, GenericError with an explanation +# +# Since 2.4 +## +{ 'command': 'block-dirty-bitmap-clear', + 'data': 'BlockDirtyBitmap' } + +## # @block_set_io_throttle: # # Change I/O throttle limits for a block drive. @@ -1310,11 +1391,14 @@ # Driver specific block device options for the null backend. # # @size: #optional size of the device in bytes. +# @latency-ns: #optional emulated latency (in nanoseconds) in processing +# requests. Default to zero which completes requests immediately. 
+# (Since 2.4) # # Since: 2.2 ## { 'type': 'BlockdevOptionsNull', - 'data': { '*size': 'int' } } + 'data': { '*size': 'int', '*latency-ns': 'uint64' } } ## # @BlockdevOptionsVVFAT @@ -1754,7 +1838,11 @@ # # Emitted when a corruption has been detected in a disk image # -# @device: device name +# @device: device name. This is always present for compatibility +# reasons, but it can be empty ("") if the image does not +# have a device name associated. +# +# @node-name: #optional node name (Since: 2.4) # # @msg: informative message for human consumption, such as the kind of # corruption being detected. It should not be parsed by machine as it is @@ -1773,11 +1861,12 @@ # Since: 1.7 ## { 'event': 'BLOCK_IMAGE_CORRUPTED', - 'data': { 'device' : 'str', - 'msg' : 'str', - '*offset': 'int', - '*size' : 'int', - 'fatal' : 'bool' } } + 'data': { 'device' : 'str', + '*node-name' : 'str', + 'msg' : 'str', + '*offset' : 'int', + '*size' : 'int', + 'fatal' : 'bool' } } ## # @BLOCK_IO_ERROR diff --git a/qemu-img.c b/qemu-img.c index 9dddfbefce..8d30e43b53 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -1305,20 +1305,312 @@ out3: return ret; } +enum ImgConvertBlockStatus { + BLK_DATA, + BLK_ZERO, + BLK_BACKING_FILE, +}; + +typedef struct ImgConvertState { + BlockBackend **src; + int64_t *src_sectors; + int src_cur, src_num; + int64_t src_cur_offset; + int64_t total_sectors; + int64_t allocated_sectors; + enum ImgConvertBlockStatus status; + int64_t sector_next_status; + BlockBackend *target; + bool has_zero_init; + bool compressed; + bool target_has_backing; + int min_sparse; + size_t cluster_sectors; + size_t buf_sectors; +} ImgConvertState; + +static void convert_select_part(ImgConvertState *s, int64_t sector_num) +{ + assert(sector_num >= s->src_cur_offset); + while (sector_num - s->src_cur_offset >= s->src_sectors[s->src_cur]) { + s->src_cur_offset += s->src_sectors[s->src_cur]; + s->src_cur++; + assert(s->src_cur < s->src_num); + } +} + +static int convert_iteration_sectors(ImgConvertState *s, int64_t sector_num) +{ + int64_t ret; + int n; + + convert_select_part(s, sector_num); + + assert(s->total_sectors > sector_num); + n = MIN(s->total_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS); + + if (s->sector_next_status <= sector_num) { + ret = bdrv_get_block_status(blk_bs(s->src[s->src_cur]), + sector_num - s->src_cur_offset, + n, &n); + if (ret < 0) { + return ret; + } + + if (ret & BDRV_BLOCK_ZERO) { + s->status = BLK_ZERO; + } else if (ret & BDRV_BLOCK_DATA) { + s->status = BLK_DATA; + } else if (!s->target_has_backing) { + /* Without a target backing file we must copy over the contents of + * the backing file as well. */ + /* TODO Check block status of the backing file chain to avoid + * needlessly reading zeroes and limiting the iteration to the + * buffer size */ + s->status = BLK_DATA; + } else { + s->status = BLK_BACKING_FILE; + } + + s->sector_next_status = sector_num + n; + } + + n = MIN(n, s->sector_next_status - sector_num); + if (s->status == BLK_DATA) { + n = MIN(n, s->buf_sectors); + } + + /* We need to write complete clusters for compressed images, so if an + * unallocated area is shorter than that, we must consider the whole + * cluster allocated. 
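 * For example (illustrative, assuming qcow2's default 64k clusters, i.e.
 * cluster_sectors == 128): a 20-sector unallocated hole is widened to a
 * full 128-sector cluster and reported as BLK_DATA (provided at least one
 * cluster remains before the end of the image), while a 300-sector run is
 * rounded down to 256 sectors and the tail handled in the next iteration.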
*/ + if (s->compressed) { + if (n < s->cluster_sectors) { + n = MIN(s->cluster_sectors, s->total_sectors - sector_num); + s->status = BLK_DATA; + } else { + n = QEMU_ALIGN_DOWN(n, s->cluster_sectors); + } + } + + return n; +} + +static int convert_read(ImgConvertState *s, int64_t sector_num, int nb_sectors, + uint8_t *buf) +{ + int n; + int ret; + + if (s->status == BLK_ZERO || s->status == BLK_BACKING_FILE) { + return 0; + } + + assert(nb_sectors <= s->buf_sectors); + while (nb_sectors > 0) { + BlockBackend *blk; + int64_t bs_sectors; + + /* In the case of compression with multiple source files, we can get a + * nb_sectors that spreads into the next part. So we must be able to + * read across multiple BDSes for one convert_read() call. */ + convert_select_part(s, sector_num); + blk = s->src[s->src_cur]; + bs_sectors = s->src_sectors[s->src_cur]; + + n = MIN(nb_sectors, bs_sectors - (sector_num - s->src_cur_offset)); + ret = blk_read(blk, sector_num - s->src_cur_offset, buf, n); + if (ret < 0) { + return ret; + } + + sector_num += n; + nb_sectors -= n; + buf += n * BDRV_SECTOR_SIZE; + } + + return 0; +} + +static int convert_write(ImgConvertState *s, int64_t sector_num, int nb_sectors, + const uint8_t *buf) +{ + int ret; + + while (nb_sectors > 0) { + int n = nb_sectors; + + switch (s->status) { + case BLK_BACKING_FILE: + /* If we have a backing file, leave clusters unallocated that are + * unallocated in the source image, so that the backing file is + * visible at the respective offset. */ + assert(s->target_has_backing); + break; + + case BLK_DATA: + /* We must always write compressed clusters as a whole, so don't + * try to find zeroed parts in the buffer. We can only save the + * write if the buffer is completely zeroed and we're allowed to + * keep the target sparse. */ + if (s->compressed) { + if (s->has_zero_init && s->min_sparse && + buffer_is_zero(buf, n * BDRV_SECTOR_SIZE)) + { + assert(!s->target_has_backing); + break; + } + + ret = blk_write_compressed(s->target, sector_num, buf, n); + if (ret < 0) { + return ret; + } + break; + } + + /* If there is real non-zero data or we're told to keep the target + * fully allocated (-S 0), we must write it. Otherwise we can treat + * it as zero sectors. */ + if (!s->min_sparse || + is_allocated_sectors_min(buf, n, &n, s->min_sparse)) + { + ret = blk_write(s->target, sector_num, buf, n); + if (ret < 0) { + return ret; + } + break; + } + /* fall-through */ + + case BLK_ZERO: + if (s->has_zero_init) { + break; + } + ret = blk_write_zeroes(s->target, sector_num, n, 0); + if (ret < 0) { + return ret; + } + break; + } + + sector_num += n; + nb_sectors -= n; + buf += n * BDRV_SECTOR_SIZE; + } + + return 0; +} + +static int convert_do_copy(ImgConvertState *s) +{ + uint8_t *buf = NULL; + int64_t sector_num, allocated_done; + int ret; + int n; + + /* Check whether we have zero initialisation or can get it efficiently */ + s->has_zero_init = s->min_sparse && !s->target_has_backing + ? bdrv_has_zero_init(blk_bs(s->target)) + : false; + + if (!s->has_zero_init && !s->target_has_backing && + bdrv_can_write_zeroes_with_unmap(blk_bs(s->target))) + { + ret = bdrv_make_zero(blk_bs(s->target), BDRV_REQ_MAY_UNMAP); + if (ret == 0) { + s->has_zero_init = true; + } + } + + /* Allocate buffer for copied data. For compressed images, only one cluster + * can be copied at a time. 
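 * (Illustrative arithmetic: with the default 2 MB I/O buffer, buf_sectors
 * starts out at 4096; for compressed qcow2 output with 64k clusters it is
 * shrunk here to cluster_sectors == 128, i.e. a 64 kB buffer.)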
*/ + if (s->compressed) { + if (s->cluster_sectors <= 0 || s->cluster_sectors > s->buf_sectors) { + error_report("invalid cluster size"); + ret = -EINVAL; + goto fail; + } + s->buf_sectors = s->cluster_sectors; + } + buf = blk_blockalign(s->target, s->buf_sectors * BDRV_SECTOR_SIZE); + + /* Calculate allocated sectors for progress */ + s->allocated_sectors = 0; + sector_num = 0; + while (sector_num < s->total_sectors) { + n = convert_iteration_sectors(s, sector_num); + if (n < 0) { + ret = n; + goto fail; + } + if (s->status == BLK_DATA) { + s->allocated_sectors += n; + } + sector_num += n; + } + + /* Do the copy */ + s->src_cur = 0; + s->src_cur_offset = 0; + s->sector_next_status = 0; + + sector_num = 0; + allocated_done = 0; + + while (sector_num < s->total_sectors) { + n = convert_iteration_sectors(s, sector_num); + if (n < 0) { + ret = n; + goto fail; + } + if (s->status == BLK_DATA) { + allocated_done += n; + qemu_progress_print(100.0 * allocated_done / s->allocated_sectors, + 0); + } + + ret = convert_read(s, sector_num, n, buf); + if (ret < 0) { + error_report("error while reading sector %" PRId64 + ": %s", sector_num, strerror(-ret)); + goto fail; + } + + ret = convert_write(s, sector_num, n, buf); + if (ret < 0) { + error_report("error while writing sector %" PRId64 + ": %s", sector_num, strerror(-ret)); + goto fail; + } + + sector_num += n; + } + + if (s->compressed) { + /* signal EOF to align */ + ret = blk_write_compressed(s->target, 0, NULL, 0); + if (ret < 0) { + goto fail; + } + } + + ret = 0; +fail: + qemu_vfree(buf); + return ret; +} + static int img_convert(int argc, char **argv) { - int c, n, n1, bs_n, bs_i, compress, cluster_sectors, skip_create; + int c, bs_n, bs_i, compress, cluster_sectors, skip_create; int64_t ret = 0; int progress = 0, flags, src_flags; const char *fmt, *out_fmt, *cache, *src_cache, *out_baseimg, *out_filename; BlockDriver *drv, *proto_drv; BlockBackend **blk = NULL, *out_blk = NULL; BlockDriverState **bs = NULL, *out_bs = NULL; - int64_t total_sectors, nb_sectors, sector_num, bs_offset; + int64_t total_sectors; int64_t *bs_sectors = NULL; - uint8_t * buf = NULL; size_t bufsectors = IO_BUF_SIZE / BDRV_SECTOR_SIZE; - const uint8_t *buf1; BlockDriverInfo bdi; QemuOpts *opts = NULL; QemuOptsList *create_opts = NULL; @@ -1329,6 +1621,7 @@ static int img_convert(int argc, char **argv) bool quiet = false; Error *local_err = NULL; QemuOpts *sn_opts = NULL; + ImgConvertState state; fmt = NULL; out_fmt = "raw"; @@ -1627,9 +1920,6 @@ static int img_convert(int argc, char **argv) } out_bs = blk_bs(out_blk); - bs_i = 0; - bs_offset = 0; - /* increase bufsectors from the default 4096 (2M) if opt_transfer_length * or discard_alignment of the out_bs is greater. Limit to 32768 (16MB) * as maximum. 
*/ @@ -1638,8 +1928,6 @@ static int img_convert(int argc, char **argv) out_bs->bl.discard_alignment)) ); - buf = blk_blockalign(out_blk, bufsectors * BDRV_SECTOR_SIZE); - if (skip_create) { int64_t output_sectors = blk_nb_sectors(out_blk); if (output_sectors < 0) { @@ -1666,203 +1954,20 @@ static int img_convert(int argc, char **argv) cluster_sectors = bdi.cluster_size / BDRV_SECTOR_SIZE; } - if (compress) { - if (cluster_sectors <= 0 || cluster_sectors > bufsectors) { - error_report("invalid cluster size"); - ret = -1; - goto out; - } - sector_num = 0; - - nb_sectors = total_sectors; - - for(;;) { - int64_t bs_num; - int remainder; - uint8_t *buf2; - - nb_sectors = total_sectors - sector_num; - if (nb_sectors <= 0) - break; - if (nb_sectors >= cluster_sectors) - n = cluster_sectors; - else - n = nb_sectors; - - bs_num = sector_num - bs_offset; - assert (bs_num >= 0); - remainder = n; - buf2 = buf; - while (remainder > 0) { - int nlow; - while (bs_num == bs_sectors[bs_i]) { - bs_offset += bs_sectors[bs_i]; - bs_i++; - assert (bs_i < bs_n); - bs_num = 0; - /* printf("changing part: sector_num=%" PRId64 ", " - "bs_i=%d, bs_offset=%" PRId64 ", bs_sectors=%" PRId64 - "\n", sector_num, bs_i, bs_offset, bs_sectors[bs_i]); */ - } - assert (bs_num < bs_sectors[bs_i]); - - nlow = remainder > bs_sectors[bs_i] - bs_num - ? bs_sectors[bs_i] - bs_num : remainder; - - ret = blk_read(blk[bs_i], bs_num, buf2, nlow); - if (ret < 0) { - error_report("error while reading sector %" PRId64 ": %s", - bs_num, strerror(-ret)); - goto out; - } - - buf2 += nlow * 512; - bs_num += nlow; - - remainder -= nlow; - } - assert (remainder == 0); - - if (!buffer_is_zero(buf, n * BDRV_SECTOR_SIZE)) { - ret = blk_write_compressed(out_blk, sector_num, buf, n); - if (ret != 0) { - error_report("error while compressing sector %" PRId64 - ": %s", sector_num, strerror(-ret)); - goto out; - } - } - sector_num += n; - qemu_progress_print(100.0 * sector_num / total_sectors, 0); - } - /* signal EOF to align */ - blk_write_compressed(out_blk, 0, NULL, 0); - } else { - int64_t sectors_to_read, sectors_read, sector_num_next_status; - bool count_allocated_sectors; - int has_zero_init = min_sparse ? bdrv_has_zero_init(out_bs) : 0; - - if (!has_zero_init && bdrv_can_write_zeroes_with_unmap(out_bs)) { - ret = bdrv_make_zero(out_bs, BDRV_REQ_MAY_UNMAP); - if (ret < 0) { - goto out; - } - has_zero_init = 1; - } - - sectors_to_read = total_sectors; - count_allocated_sectors = progress && (out_baseimg || has_zero_init); -restart: - sector_num = 0; // total number of sectors converted so far - sectors_read = 0; - sector_num_next_status = 0; - - for(;;) { - nb_sectors = total_sectors - sector_num; - if (nb_sectors <= 0) { - if (count_allocated_sectors) { - sectors_to_read = sectors_read; - count_allocated_sectors = false; - goto restart; - } - ret = 0; - break; - } - - while (sector_num - bs_offset >= bs_sectors[bs_i]) { - bs_offset += bs_sectors[bs_i]; - bs_i ++; - assert (bs_i < bs_n); - /* printf("changing part: sector_num=%" PRId64 ", bs_i=%d, " - "bs_offset=%" PRId64 ", bs_sectors=%" PRId64 "\n", - sector_num, bs_i, bs_offset, bs_sectors[bs_i]); */ - } - - if ((out_baseimg || has_zero_init) && - sector_num >= sector_num_next_status) { - n = nb_sectors > INT_MAX ? 
INT_MAX : nb_sectors; - ret = bdrv_get_block_status(bs[bs_i], sector_num - bs_offset, - n, &n1); - if (ret < 0) { - error_report("error while reading block status of sector %" - PRId64 ": %s", sector_num - bs_offset, - strerror(-ret)); - goto out; - } - /* If the output image is zero initialized, we are not working - * on a shared base and the input is zero we can skip the next - * n1 sectors */ - if (has_zero_init && !out_baseimg && (ret & BDRV_BLOCK_ZERO)) { - sector_num += n1; - continue; - } - /* If the output image is being created as a copy on write - * image, assume that sectors which are unallocated in the - * input image are present in both the output's and input's - * base images (no need to copy them). */ - if (out_baseimg) { - if (!(ret & BDRV_BLOCK_DATA)) { - sector_num += n1; - continue; - } - /* The next 'n1' sectors are allocated in the input image. - * Copy only those as they may be followed by unallocated - * sectors. */ - nb_sectors = n1; - } - /* avoid redundant callouts to get_block_status */ - sector_num_next_status = sector_num + n1; - } - - n = MIN(nb_sectors, bufsectors); - - /* round down request length to an aligned sector, but - * do not bother doing this on short requests. They happen - * when we found an all-zero area, and the next sector to - * write will not be sector_num + n. */ - if (cluster_sectors > 0 && n >= cluster_sectors) { - int64_t next_aligned_sector = (sector_num + n); - next_aligned_sector -= next_aligned_sector % cluster_sectors; - if (sector_num + n > next_aligned_sector) { - n = next_aligned_sector - sector_num; - } - } - - n = MIN(n, bs_sectors[bs_i] - (sector_num - bs_offset)); - - sectors_read += n; - if (count_allocated_sectors) { - sector_num += n; - continue; - } + state = (ImgConvertState) { + .src = blk, + .src_sectors = bs_sectors, + .src_num = bs_n, + .total_sectors = total_sectors, + .target = out_blk, + .compressed = compress, + .target_has_backing = (bool) out_baseimg, + .min_sparse = min_sparse, + .cluster_sectors = cluster_sectors, + .buf_sectors = bufsectors, + }; + ret = convert_do_copy(&state); - n1 = n; - ret = blk_read(blk[bs_i], sector_num - bs_offset, buf, n); - if (ret < 0) { - error_report("error while reading sector %" PRId64 ": %s", - sector_num - bs_offset, strerror(-ret)); - goto out; - } - /* NOTE: at the same time we convert, we do not write zero - sectors to have a chance to compress the image. Ideally, we - should add a specific call to have the info to go faster */ - buf1 = buf; - while (n > 0) { - if (!has_zero_init || - is_allocated_sectors_min(buf1, n, &n1, min_sparse)) { - ret = blk_write(out_blk, sector_num, buf1, n1); - if (ret < 0) { - error_report("error while writing sector %" PRId64 - ": %s", sector_num, strerror(-ret)); - goto out; - } - } - sector_num += n1; - n -= n1; - buf1 += n1 * 512; - } - qemu_progress_print(100.0 * sectors_read / sectors_to_read, 0); - } - } out: if (!ret) { qemu_progress_print(100, 0); @@ -1870,7 +1975,6 @@ out: qemu_progress_end(); qemu_opts_del(opts); qemu_opts_free(create_opts); - qemu_vfree(buf); qemu_opts_del(sn_opts); blk_unref(out_blk); g_free(bs); diff --git a/qmp-commands.hx b/qmp-commands.hx index 1e59541665..213508fe5d 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -1007,6 +1007,43 @@ EQMP .mhandler.cmd_new = qmp_marshal_input_block_stream, }, +SQMP +block-stream +------------ + +Copy data from a backing file into a block device. 
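block-stream starts a long-running background job; a client can tell when streaming
has finished by watching for the job-completion event. A sketch of what that looks
like on the wire (all field values are illustrative):

<- { "event": "BLOCK_JOB_COMPLETED",
     "data": { "type": "stream", "device": "virtio0",
               "len": 10737418240, "offset": 10737418240, "speed": 0 },
     "timestamp": { "seconds": 1434543000, "microseconds": 0 } }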
+ +Arguments: + +- "device": The device's ID, must be unique (json-string) +- "base": The file name of the backing image above which copying starts + (json-string, optional) +- "backing-file": The backing file string to write into the active layer. This + filename is not validated. + + If a pathname string is such that it cannot be resolved by + QEMU, that means that subsequent QMP or HMP commands must use + node-names for the image in question, as filename lookup + methods will fail. + + If not specified, QEMU will automatically determine the + backing file string to use, or error out if there is no + obvious choice. Care should be taken when specifying the + string, to specify a valid filename or protocol. + (json-string, optional) (Since 2.1) +- "speed": the maximum speed, in bytes per second (json-int, optional) +- "on-error": the action to take on an error (default 'report'). 'stop' and + 'enospc' can only be used if the block device supports io-status. + (json-string, optional) (Since 2.1) + +Example: + +-> { "execute": "block-stream", "arguments": { "device": "virtio0", + "base": "/tmp/master.qcow2" } } +<- { "return": {} } + +EQMP + { .name = "block-commit", .args_type = "device:B,base:s?,top:s?,backing-file:s?,speed:o?", @@ -1073,7 +1110,7 @@ EQMP { .name = "drive-backup", .args_type = "sync:s,device:B,target:s,speed:i?,mode:s?,format:s?," - "on-source-error:s?,on-target-error:s?", + "bitmap:s?,on-source-error:s?,on-target-error:s?", .mhandler.cmd_new = qmp_marshal_input_drive_backup, }, @@ -1100,8 +1137,10 @@ Arguments: (json-string, optional) - "sync": what parts of the disk image should be copied to the destination; possibilities include "full" for all the disk, "top" for only the sectors - allocated in the topmost image, or "none" to only replicate new I/O - (MirrorSyncMode). + allocated in the topmost image, "dirty-bitmap" for only the dirty sectors in + the bitmap, or "none" to only replicate new I/O (MirrorSyncMode). +- "bitmap": dirty bitmap name for sync==dirty-bitmap. Must be present if sync + is "dirty-bitmap", must NOT be present otherwise. - "mode": whether and how QEMU should create a new image (NewImageMode, optional, default 'absolute-paths') - "speed": the maximum speed, in bytes per second (json-int, optional) @@ -1269,6 +1308,91 @@ Example: EQMP { + .name = "block-dirty-bitmap-add", + .args_type = "node:B,name:s,granularity:i?", + .mhandler.cmd_new = qmp_marshal_input_block_dirty_bitmap_add, + }, + +SQMP + +block-dirty-bitmap-add +---------------------- +Since 2.4 + +Create a dirty bitmap with a name on the device, and start tracking the writes. + +Arguments: + +- "node": device/node on which to create dirty bitmap (json-string) +- "name": name of the new dirty bitmap (json-string) +- "granularity": granularity to track writes with (int, optional) + +Example: + +-> { "execute": "block-dirty-bitmap-add", "arguments": { "node": "drive0", + "name": "bitmap0" } } +<- { "return": {} } + +EQMP + + { + .name = "block-dirty-bitmap-remove", + .args_type = "node:B,name:s", + .mhandler.cmd_new = qmp_marshal_input_block_dirty_bitmap_remove, + }, + +SQMP + +block-dirty-bitmap-remove +------------------------- +Since 2.4 + +Stop write tracking and remove the dirty bitmap that was created with +block-dirty-bitmap-add. 
+ +Arguments: + +- "node": device/node on which to remove dirty bitmap (json-string) +- "name": name of the dirty bitmap to remove (json-string) + +Example: + +-> { "execute": "block-dirty-bitmap-remove", "arguments": { "node": "drive0", + "name": "bitmap0" } } +<- { "return": {} } + +EQMP + + { + .name = "block-dirty-bitmap-clear", + .args_type = "node:B,name:s", + .mhandler.cmd_new = qmp_marshal_input_block_dirty_bitmap_clear, + }, + +SQMP + +block-dirty-bitmap-clear +------------------------ +Since 2.4 + +Reset the dirty bitmap associated with a node so that an incremental backup +from this point in time forward will only backup clusters modified after this +clear operation. + +Arguments: + +- "node": device/node on which to remove dirty bitmap (json-string) +- "name": name of the dirty bitmap to remove (json-string) + +Example: + +-> { "execute": "block-dirty-bitmap-clear", "arguments": { "node": "drive0", + "name": "bitmap0" } } +<- { "return": {} } + +EQMP + + { .name = "blockdev-snapshot-sync", .args_type = "device:s?,node-name:s?,snapshot-file:s,snapshot-node-name:s?,format:s?,mode:s?", .mhandler.cmd_new = qmp_marshal_input_blockdev_snapshot_sync, diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 5df61f9aa9..7f0aae977d 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2911,6 +2911,17 @@ sub process { if ($rawline =~ /\b(?:Qemu|QEmu)\b/) { WARN("use QEMU instead of Qemu or QEmu\n" . $herecurr); } + +# check for non-portable ffs() calls that have portable alternatives in QEMU + if ($line =~ /\bffs\(/) { + ERROR("use ctz32() instead of ffs()\n" . $herecurr); + } + if ($line =~ /\bffsl\(/) { + ERROR("use ctz32() or ctz64() instead of ffsl()\n" . $herecurr); + } + if ($line =~ /\bffsll\(/) { + ERROR("use ctz64() instead of ffsll()\n" . 
$herecurr); + } } # If we have no input at all, then there is nothing to report on diff --git a/scripts/qemu-gdb.py b/scripts/qemu-gdb.py index 8a0f30534f..6c7f4fbe53 100644 --- a/scripts/qemu-gdb.py +++ b/scripts/qemu-gdb.py @@ -22,12 +22,86 @@ def isnull(ptr): def int128(p): return long(p['lo']) + (long(p['hi']) << 64) +def get_fs_base(): + '''Fetch %fs base value using arch_prctl(ARCH_GET_FS)''' + # %rsp - 120 is scratch space according to the SystemV ABI + old = gdb.parse_and_eval('*(uint64_t*)($rsp - 120)') + gdb.execute('call arch_prctl(0x1003, $rsp - 120)', False, True) + fs_base = gdb.parse_and_eval('*(uint64_t*)($rsp - 120)') + gdb.execute('set *(uint64_t*)($rsp - 120) = %s' % old, False, True) + return fs_base + +def get_glibc_pointer_guard(): + '''Fetch glibc pointer guard value''' + fs_base = get_fs_base() + return gdb.parse_and_eval('*(uint64_t*)((uint64_t)%s + 0x30)' % fs_base) + +def glibc_ptr_demangle(val, pointer_guard): + '''Undo effect of glibc's PTR_MANGLE()''' + return gdb.parse_and_eval('(((uint64_t)%s >> 0x11) | ((uint64_t)%s << (64 - 0x11))) ^ (uint64_t)%s' % (val, val, pointer_guard)) + +def bt_jmpbuf(jmpbuf): + '''Backtrace a jmpbuf''' + JB_RBX = 0 + JB_RBP = 1 + JB_R12 = 2 + JB_R13 = 3 + JB_R14 = 4 + JB_R15 = 5 + JB_RSP = 6 + JB_PC = 7 + + old_rbx = gdb.parse_and_eval('(uint64_t)$rbx') + old_rbp = gdb.parse_and_eval('(uint64_t)$rbp') + old_rsp = gdb.parse_and_eval('(uint64_t)$rsp') + old_r12 = gdb.parse_and_eval('(uint64_t)$r12') + old_r13 = gdb.parse_and_eval('(uint64_t)$r13') + old_r14 = gdb.parse_and_eval('(uint64_t)$r14') + old_r15 = gdb.parse_and_eval('(uint64_t)$r15') + old_rip = gdb.parse_and_eval('(uint64_t)$rip') + + pointer_guard = get_glibc_pointer_guard() + gdb.execute('set $rbx = %s' % jmpbuf[JB_RBX]) + gdb.execute('set $rbp = %s' % glibc_ptr_demangle(jmpbuf[JB_RBP], pointer_guard)) + gdb.execute('set $rsp = %s' % glibc_ptr_demangle(jmpbuf[JB_RSP], pointer_guard)) + gdb.execute('set $r12 = %s' % jmpbuf[JB_R12]) + gdb.execute('set $r13 = %s' % jmpbuf[JB_R13]) + gdb.execute('set $r14 = %s' % jmpbuf[JB_R14]) + gdb.execute('set $r15 = %s' % jmpbuf[JB_R15]) + gdb.execute('set $rip = %s' % glibc_ptr_demangle(jmpbuf[JB_PC], pointer_guard)) + + gdb.execute('bt') + + gdb.execute('set $rbx = %s' % old_rbx) + gdb.execute('set $rbp = %s' % old_rbp) + gdb.execute('set $rsp = %s' % old_rsp) + gdb.execute('set $r12 = %s' % old_r12) + gdb.execute('set $r13 = %s' % old_r13) + gdb.execute('set $r14 = %s' % old_r14) + gdb.execute('set $r15 = %s' % old_r15) + gdb.execute('set $rip = %s' % old_rip) + class QemuCommand(gdb.Command): '''Prefix for QEMU debug support commands''' def __init__(self): gdb.Command.__init__(self, 'qemu', gdb.COMMAND_DATA, gdb.COMPLETE_NONE, True) +class CoroutineCommand(gdb.Command): + '''Display coroutine backtrace''' + def __init__(self): + gdb.Command.__init__(self, 'qemu coroutine', gdb.COMMAND_DATA, + gdb.COMPLETE_NONE) + + def invoke(self, arg, from_tty): + argv = gdb.string_to_argv(arg) + if len(argv) != 1: + gdb.write('usage: qemu coroutine <coroutine-pointer>\n') + return + + coroutine_pointer = gdb.parse_and_eval(argv[0]).cast(gdb.lookup_type('CoroutineUContext').pointer()) + bt_jmpbuf(coroutine_pointer['env']['__jmpbuf']) + class MtreeCommand(gdb.Command): '''Display the memory tree hierarchy''' def __init__(self): @@ -86,4 +160,5 @@ class MtreeCommand(gdb.Command): subregion = subregion['subregions_link']['tqe_next'] QemuCommand() +CoroutineCommand() MtreeCommand() diff --git a/scripts/qmp/qmp.py b/scripts/qmp/qmp.py index 
20b6ec795e..1d38e3e9e7 100644 --- a/scripts/qmp/qmp.py +++ b/scripts/qmp/qmp.py @@ -21,6 +21,9 @@ class QMPConnectError(QMPError): class QMPCapabilitiesError(QMPError): pass +class QMPTimeoutError(QMPError): + pass + class QEMUMonitorProtocol: def __init__(self, address, server=False): """ @@ -72,6 +75,44 @@ class QEMUMonitorProtocol: error = socket.error + def __get_events(self, wait=False): + """ + Check for new events in the stream and cache them in __events. + + @param wait (bool): block until an event is available. + @param wait (float): If wait is a float, treat it as a timeout value. + + @raise QMPTimeoutError: If a timeout float is provided and the timeout + period elapses. + @raise QMPConnectError: If wait is True but no events could be retrieved + or if some other error occurred. + """ + + # Check for new events regardless and pull them into the cache: + self.__sock.setblocking(0) + try: + self.__json_read() + except socket.error, err: + if err[0] == errno.EAGAIN: + # No data available + pass + self.__sock.setblocking(1) + + # Wait for new events, if needed. + # if wait is 0.0, this means "no wait" and is also implicitly false. + if not self.__events and wait: + if isinstance(wait, float): + self.__sock.settimeout(wait) + try: + ret = self.__json_read(only_event=True) + except socket.timeout: + raise QMPTimeoutError("Timeout waiting for event") + except: + raise QMPConnectError("Error while reading from socket") + if ret is None: + raise QMPConnectError("Error while reading from socket") + self.__sock.settimeout(None) + def connect(self, negotiate=True): """ Connect to the QMP Monitor and perform capabilities negotiation. @@ -140,43 +181,37 @@ class QEMUMonitorProtocol: """ Get and delete the first available QMP event. - @param wait: block until an event is available (bool) + @param wait (bool): block until an event is available. + @param wait (float): If wait is a float, treat it as a timeout value. + + @raise QMPTimeoutError: If a timeout float is provided and the timeout + period elapses. + @raise QMPConnectError: If wait is True but no events could be retrieved + or if some other error occurred. + + @return The first available QMP event, or None. """ - self.__sock.setblocking(0) - try: - self.__json_read() - except socket.error, err: - if err[0] == errno.EAGAIN: - # No data available - pass - self.__sock.setblocking(1) - if not self.__events and wait: - self.__json_read(only_event=True) - event = self.__events[0] - del self.__events[0] - return event + self.__get_events(wait) + + if self.__events: + return self.__events.pop(0) + return None def get_events(self, wait=False): """ Get a list of available QMP events. - @param wait: block until an event is available (bool) - """ - self.__sock.setblocking(0) - try: - self.__json_read() - except socket.error, err: - if err[0] == errno.EAGAIN: - # No data available - pass - self.__sock.setblocking(1) - if not self.__events and wait: - ret = self.__json_read(only_event=True) - if ret == None: - # We are in blocking mode, if don't get anything, something - # went wrong - raise QMPConnectError("Error while reading from socket") + @param wait (bool): block until an event is available. + @param wait (float): If wait is a float, treat it as a timeout value. + @raise QMPTimeoutError: If a timeout float is provided and the timeout + period elapses. + @raise QMPConnectError: If wait is True but no events could be retrieved + or if some other error occurred. + + @return The list of available QMP events. 
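        Example (illustrative, with mon a connected QEMUMonitorProtocol):
        mon.get_events(wait=10.0) blocks for up to ten seconds and raises
        QMPTimeoutError if no event arrives within that time.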
+ """ + self.__get_events(wait) return self.__events def clear_events(self): diff --git a/target-ppc/cpu.h b/target-ppc/cpu.h index f15815f11b..c05c503305 100644 --- a/target-ppc/cpu.h +++ b/target-ppc/cpu.h @@ -2251,8 +2251,8 @@ static inline ppcmas_tlb_t *booke206_get_tlbm(CPUPPCState *env, const int tlbn, { int r; uint32_t ways = booke206_tlb_ways(env, tlbn); - int ways_bits = ffs(ways) - 1; - int tlb_bits = ffs(booke206_tlb_size(env, tlbn)) - 1; + int ways_bits = ctz32(ways); + int tlb_bits = ctz32(booke206_tlb_size(env, tlbn)); int i; way &= ways - 1; diff --git a/tests/qemu-iotests/122 b/tests/qemu-iotests/122 new file mode 100755 index 0000000000..350ca9c466 --- /dev/null +++ b/tests/qemu-iotests/122 @@ -0,0 +1,223 @@ +#!/bin/bash +# +# Test some qemu-img convert cases +# +# Copyright (C) 2015 Red Hat, Inc. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# + +# creator +owner=kwolf@redhat.com + +seq="$(basename $0)" +echo "QA output created by $seq" + +here="$PWD" +tmp=/tmp/$$ +status=1 # failure is the default! + +_cleanup() +{ + rm -f "$TEST_IMG".[123] + _cleanup_test_img +} +trap "_cleanup; exit \$status" 0 1 2 3 15 + +# get standard environment, filters and checks +. ./common.rc +. 
./common.filter + +_supported_fmt qcow2 +_supported_proto file +_supported_os Linux + + +TEST_IMG="$TEST_IMG".base _make_test_img 64M +$QEMU_IO -c "write -P 0x11 0 64M" "$TEST_IMG".base 2>&1 | _filter_qemu_io | _filter_testdir + + +echo +echo "=== Check allocation status regression with -B ===" +echo + +_make_test_img -b "$TEST_IMG".base +$QEMU_IO -c "write -P 0x22 0 3M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IMG convert -O $IMGFMT -B "$TEST_IMG".base "$TEST_IMG" "$TEST_IMG".orig +$QEMU_IMG map "$TEST_IMG".orig | _filter_qemu_img_map + + +echo +echo "=== Check that zero clusters are kept in overlay ===" +echo + +_make_test_img -b "$TEST_IMG".base + +$QEMU_IO -c "write -P 0 0 3M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IMG convert -O $IMGFMT -B "$TEST_IMG".base "$TEST_IMG" "$TEST_IMG".orig +$QEMU_IO -c "read -P 0 0 3M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IMG convert -O $IMGFMT -c -B "$TEST_IMG".base "$TEST_IMG" "$TEST_IMG".orig +$QEMU_IO -c "read -P 0 0 3M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir + +$QEMU_IO -c "write -z 0 3M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IMG convert -O $IMGFMT -B "$TEST_IMG".base "$TEST_IMG" "$TEST_IMG".orig +$QEMU_IO -c "read -P 0 0 3M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IMG convert -O $IMGFMT -c -B "$TEST_IMG".base "$TEST_IMG" "$TEST_IMG".orig +$QEMU_IO -c "read -P 0 0 3M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir + + +echo +echo "=== Concatenate multiple source images ===" +echo + +TEST_IMG="$TEST_IMG".1 _make_test_img 4M +TEST_IMG="$TEST_IMG".2 _make_test_img 4M +TEST_IMG="$TEST_IMG".3 _make_test_img 4M + +$QEMU_IO -c "write -P 0x11 0 64k" "$TEST_IMG".1 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "write -P 0x22 0 64k" "$TEST_IMG".2 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "write -P 0x33 0 64k" "$TEST_IMG".3 2>&1 | _filter_qemu_io | _filter_testdir + +$QEMU_IMG convert -O $IMGFMT "$TEST_IMG".[123] "$TEST_IMG" +$QEMU_IMG map "$TEST_IMG" | _filter_qemu_img_map +$QEMU_IO -c "read -P 0x11 0 64k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0x22 4M 64k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0x33 8M 64k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir + +$QEMU_IMG convert -c -O $IMGFMT "$TEST_IMG".[123] "$TEST_IMG" +$QEMU_IMG map --output=json "$TEST_IMG" | _filter_qemu_img_map +$QEMU_IO -c "read -P 0x11 0 64k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0x22 4M 64k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0x33 8M 64k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir + +# -B can't be combined with concatenation +$QEMU_IMG convert -O $IMGFMT -B "$TEST_IMG".base "$TEST_IMG".[123] "$TEST_IMG" +$QEMU_IMG convert -O $IMGFMT -c -B "$TEST_IMG".base "$TEST_IMG".[123] "$TEST_IMG" + + +echo +echo "=== Compression with misaligned allocations and image sizes ===" +echo + +TEST_IMG="$TEST_IMG".1 _make_test_img 1023k -o cluster_size=1024 +TEST_IMG="$TEST_IMG".2 _make_test_img 1023k -o cluster_size=1024 + +$QEMU_IO -c "write -P 0x11 16k 16k" "$TEST_IMG".1 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "write -P 0x22 130k 130k" "$TEST_IMG".1 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "write -P 0x33 1022k 1k" "$TEST_IMG".1 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "write -P 0x44 0k 1k" "$TEST_IMG".2 2>&1 | _filter_qemu_io | _filter_testdir 
+ +$QEMU_IMG convert -c -O $IMGFMT "$TEST_IMG".[12] "$TEST_IMG" +$QEMU_IMG map --output=json "$TEST_IMG" | _filter_qemu_img_map +$QEMU_IO -c "read -P 0 0k 16k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0x11 16k 16k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0 32k 98k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0x22 130k 130k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0 260k 762k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0x33 1022k 1k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0x44 1023k 1k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0 1024k 1022k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir + + +echo +echo "=== Full allocation with -S 0 ===" +echo + +# Standalone image +_make_test_img 64M +$QEMU_IO -c "write -P 0x22 0 3M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "write -P 0 3M 3M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir + +echo +echo convert -S 0: +$QEMU_IMG convert -O $IMGFMT -S 0 "$TEST_IMG" "$TEST_IMG".orig +$QEMU_IO -c "read -P 0x22 0 3M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0 3M 61M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IMG map --output=json "$TEST_IMG".orig | _filter_qemu_img_map + +echo +echo convert -c -S 0: +$QEMU_IMG convert -O $IMGFMT -c -S 0 "$TEST_IMG" "$TEST_IMG".orig +$QEMU_IO -c "read -P 0x22 0 3M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0 3M 61M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IMG map --output=json "$TEST_IMG".orig | _filter_qemu_img_map + +# With backing file +TEST_IMG="$TEST_IMG".base _make_test_img 64M +$QEMU_IO -c "write -P 0x11 0 32M" "$TEST_IMG".base 2>&1 | _filter_qemu_io | _filter_testdir + +_make_test_img -b "$TEST_IMG".base 64M +$QEMU_IO -c "write -P 0x22 0 3M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir + +echo +echo convert -S 0 with source backing file: +$QEMU_IMG convert -O $IMGFMT -S 0 "$TEST_IMG" "$TEST_IMG".orig +$QEMU_IO -c "read -P 0x22 0 3M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0x11 3M 29M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0 32M 32M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IMG map --output=json "$TEST_IMG".orig | _filter_qemu_img_map + +echo +echo convert -c -S 0 with source backing file: +$QEMU_IMG convert -O $IMGFMT -c -S 0 "$TEST_IMG" "$TEST_IMG".orig +$QEMU_IO -c "read -P 0x22 0 3M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0x11 3M 29M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0 32M 32M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IMG map --output=json "$TEST_IMG".orig | _filter_qemu_img_map + +# With keeping the backing file +echo +echo convert -S 0 -B ... +$QEMU_IMG convert -O $IMGFMT -S 0 "$TEST_IMG" "$TEST_IMG".orig +$QEMU_IO -c "read -P 0x22 0 3M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0x11 3M 29M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0 32M 32M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IMG map --output=json "$TEST_IMG".orig | _filter_qemu_img_map + +echo +echo convert -c -S 0 -B ... 
+$QEMU_IMG convert -O $IMGFMT -c -S 0 "$TEST_IMG" "$TEST_IMG".orig +$QEMU_IO -c "read -P 0x22 0 3M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0x11 3M 29M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0 32M 32M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IMG map --output=json "$TEST_IMG".orig | _filter_qemu_img_map + + +echo +echo "=== Non-zero -S ===" +echo + +_make_test_img 64M -o cluster_size=1k +$QEMU_IO -c "write -P 0 0 64k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "write 0 1k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "write 8k 1k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "write 17k 1k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir + +for min_sparse in 4k 8k; do + echo + echo convert -S $min_sparse + $QEMU_IMG convert -O $IMGFMT -o cluster_size=1k -S $min_sparse "$TEST_IMG" "$TEST_IMG".orig + $QEMU_IMG map --output=json "$TEST_IMG".orig | _filter_qemu_img_map + + echo + echo convert -c -S $min_sparse + # For compressed images, -S values other than 0 are ignored + $QEMU_IMG convert -O $IMGFMT -o cluster_size=1k -c -S $min_sparse "$TEST_IMG" "$TEST_IMG".orig + $QEMU_IMG map --output=json "$TEST_IMG".orig | _filter_qemu_img_map +done + +# success, all done +echo '*** done' +rm -f $seq.full +status=0 diff --git a/tests/qemu-iotests/122.out b/tests/qemu-iotests/122.out new file mode 100644 index 0000000000..1f853b9e93 --- /dev/null +++ b/tests/qemu-iotests/122.out @@ -0,0 +1,209 @@ +QA output created by 122 +Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=67108864 +wrote 67108864/67108864 bytes at offset 0 +64 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + +=== Check allocation status regression with -B === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 backing_file='TEST_DIR/t.IMGFMT.base' +wrote 3145728/3145728 bytes at offset 0 +3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +Offset Length File +0 0x300000 TEST_DIR/t.IMGFMT.orig +0x300000 0x3d00000 TEST_DIR/t.IMGFMT.base + +=== Check that zero clusters are kept in overlay === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 backing_file='TEST_DIR/t.IMGFMT.base' +wrote 3145728/3145728 bytes at offset 0 +3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 3145728/3145728 bytes at offset 0 +3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 3145728/3145728 bytes at offset 0 +3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 3145728/3145728 bytes at offset 0 +3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 3145728/3145728 bytes at offset 0 +3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 3145728/3145728 bytes at offset 0 +3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + +=== Concatenate multiple source images === + +Formatting 'TEST_DIR/t.IMGFMT.1', fmt=IMGFMT size=4194304 +Formatting 'TEST_DIR/t.IMGFMT.2', fmt=IMGFMT size=4194304 +Formatting 'TEST_DIR/t.IMGFMT.3', fmt=IMGFMT size=4194304 +wrote 65536/65536 bytes at offset 0 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 65536/65536 bytes at offset 0 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 65536/65536 bytes at offset 0 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +Offset Length File +0 0x10000 TEST_DIR/t.IMGFMT +0x400000 0x10000 TEST_DIR/t.IMGFMT +0x800000 0x10000 TEST_DIR/t.IMGFMT +read 65536/65536 bytes at offset 0 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec 
and XXX ops/sec) +read 65536/65536 bytes at offset 4194304 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 65536/65536 bytes at offset 8388608 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +[{ "start": 0, "length": 65536, "depth": 0, "zero": false, "data": true}, +{ "start": 65536, "length": 4128768, "depth": 0, "zero": true, "data": false}, +{ "start": 4194304, "length": 65536, "depth": 0, "zero": false, "data": true}, +{ "start": 4259840, "length": 4128768, "depth": 0, "zero": true, "data": false}, +{ "start": 8388608, "length": 65536, "depth": 0, "zero": false, "data": true}, +{ "start": 8454144, "length": 4128768, "depth": 0, "zero": true, "data": false}] +read 65536/65536 bytes at offset 0 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 65536/65536 bytes at offset 4194304 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 65536/65536 bytes at offset 8388608 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +qemu-img: -B makes no sense when concatenating multiple input images +qemu-img: -B makes no sense when concatenating multiple input images + +=== Compression with misaligned allocations and image sizes === + +Formatting 'TEST_DIR/t.IMGFMT.1', fmt=IMGFMT size=1047552 +Formatting 'TEST_DIR/t.IMGFMT.2', fmt=IMGFMT size=1047552 +wrote 16384/16384 bytes at offset 16384 +16 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 133120/133120 bytes at offset 133120 +130 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 1024/1024 bytes at offset 1046528 +1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 1024/1024 bytes at offset 0 +1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +[{ "start": 0, "length": 65536, "depth": 0, "zero": false, "data": true}, +{ "start": 65536, "length": 65536, "depth": 0, "zero": true, "data": false}, +{ "start": 131072, "length": 196608, "depth": 0, "zero": false, "data": true}, +{ "start": 327680, "length": 655360, "depth": 0, "zero": true, "data": false}, +{ "start": 983040, "length": 65536, "depth": 0, "zero": false, "data": true}, +{ "start": 1048576, "length": 1046528, "depth": 0, "zero": true, "data": false}] +read 16384/16384 bytes at offset 0 +16 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 16384/16384 bytes at offset 16384 +16 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 100352/100352 bytes at offset 32768 +98 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 133120/133120 bytes at offset 133120 +130 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 780288/780288 bytes at offset 266240 +762 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 1024/1024 bytes at offset 1046528 +1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 1024/1024 bytes at offset 1047552 +1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 1046528/1046528 bytes at offset 1048576 +1022 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + +=== Full allocation with -S 0 === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 +wrote 3145728/3145728 bytes at offset 0 +3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 3145728/3145728 bytes at offset 3145728 +3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + +convert -S 0: +read 3145728/3145728 bytes at offset 0 +3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 63963136/63963136 bytes at offset 3145728 +61 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +[{ "start": 0, "length": 6291456, "depth": 0, "zero": false, "data": true, 
"offset": 327680}, +{ "start": 6291456, "length": 60817408, "depth": 0, "zero": true, "data": false}] + +convert -c -S 0: +read 3145728/3145728 bytes at offset 0 +3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 63963136/63963136 bytes at offset 3145728 +61 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +[{ "start": 0, "length": 6291456, "depth": 0, "zero": false, "data": true}, +{ "start": 6291456, "length": 60817408, "depth": 0, "zero": true, "data": false}] +Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=67108864 +wrote 33554432/33554432 bytes at offset 0 +32 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 backing_file='TEST_DIR/t.IMGFMT.base' +wrote 3145728/3145728 bytes at offset 0 +3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + +convert -S 0 with source backing file: +read 3145728/3145728 bytes at offset 0 +3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 30408704/30408704 bytes at offset 3145728 +29 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 33554432/33554432 bytes at offset 33554432 +32 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +[{ "start": 0, "length": 67108864, "depth": 0, "zero": false, "data": true, "offset": 327680}] + +convert -c -S 0 with source backing file: +read 3145728/3145728 bytes at offset 0 +3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 30408704/30408704 bytes at offset 3145728 +29 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 33554432/33554432 bytes at offset 33554432 +32 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +[{ "start": 0, "length": 67108864, "depth": 0, "zero": false, "data": true}] + +convert -S 0 -B ... +read 3145728/3145728 bytes at offset 0 +3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 30408704/30408704 bytes at offset 3145728 +29 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 33554432/33554432 bytes at offset 33554432 +32 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +[{ "start": 0, "length": 67108864, "depth": 0, "zero": false, "data": true, "offset": 327680}] + +convert -c -S 0 -B ... 
+read 3145728/3145728 bytes at offset 0 +3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 30408704/30408704 bytes at offset 3145728 +29 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 33554432/33554432 bytes at offset 33554432 +32 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +[{ "start": 0, "length": 67108864, "depth": 0, "zero": false, "data": true}] + +=== Non-zero -S === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 +wrote 65536/65536 bytes at offset 0 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 1024/1024 bytes at offset 0 +1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 1024/1024 bytes at offset 8192 +1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 1024/1024 bytes at offset 17408 +1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + +convert -S 4k +[{ "start": 0, "length": 1024, "depth": 0, "zero": false, "data": true, "offset": 8192}, +{ "start": 1024, "length": 7168, "depth": 0, "zero": true, "data": false}, +{ "start": 8192, "length": 1024, "depth": 0, "zero": false, "data": true, "offset": 9216}, +{ "start": 9216, "length": 8192, "depth": 0, "zero": true, "data": false}, +{ "start": 17408, "length": 1024, "depth": 0, "zero": false, "data": true, "offset": 10240}, +{ "start": 18432, "length": 67090432, "depth": 0, "zero": true, "data": false}] + +convert -c -S 4k +[{ "start": 0, "length": 1024, "depth": 0, "zero": false, "data": true}, +{ "start": 1024, "length": 7168, "depth": 0, "zero": true, "data": false}, +{ "start": 8192, "length": 1024, "depth": 0, "zero": false, "data": true}, +{ "start": 9216, "length": 8192, "depth": 0, "zero": true, "data": false}, +{ "start": 17408, "length": 1024, "depth": 0, "zero": false, "data": true}, +{ "start": 18432, "length": 67090432, "depth": 0, "zero": true, "data": false}] + +convert -S 8k +[{ "start": 0, "length": 9216, "depth": 0, "zero": false, "data": true, "offset": 8192}, +{ "start": 9216, "length": 8192, "depth": 0, "zero": true, "data": false}, +{ "start": 17408, "length": 1024, "depth": 0, "zero": false, "data": true, "offset": 17408}, +{ "start": 18432, "length": 67090432, "depth": 0, "zero": true, "data": false}] + +convert -c -S 8k +[{ "start": 0, "length": 1024, "depth": 0, "zero": false, "data": true}, +{ "start": 1024, "length": 7168, "depth": 0, "zero": true, "data": false}, +{ "start": 8192, "length": 1024, "depth": 0, "zero": false, "data": true}, +{ "start": 9216, "length": 8192, "depth": 0, "zero": true, "data": false}, +{ "start": 17408, "length": 1024, "depth": 0, "zero": false, "data": true}, +{ "start": 18432, "length": 67090432, "depth": 0, "zero": true, "data": false}] +*** done diff --git a/tests/qemu-iotests/124 b/tests/qemu-iotests/124 new file mode 100644 index 0000000000..3ee78cd1f1 --- /dev/null +++ b/tests/qemu-iotests/124 @@ -0,0 +1,363 @@ +#!/usr/bin/env python +# +# Tests for incremental drive-backup +# +# Copyright (C) 2015 John Snow for Red Hat, Inc. +# +# Based on 056. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# + +import os +import iotests + + +def io_write_patterns(img, patterns): + for pattern in patterns: + iotests.qemu_io('-c', 'write -P%s %s %s' % pattern, img) + + +def try_remove(img): + try: + os.remove(img) + except OSError: + pass + + +class Bitmap: + def __init__(self, name, drive): + self.name = name + self.drive = drive + self.num = 0 + self.backups = list() + + def base_target(self): + return (self.drive['backup'], None) + + def new_target(self, num=None): + if num is None: + num = self.num + self.num = num + 1 + base = os.path.join(iotests.test_dir, + "%s.%s." % (self.drive['id'], self.name)) + suff = "%i.%s" % (num, self.drive['fmt']) + target = base + "inc" + suff + reference = base + "ref" + suff + self.backups.append((target, reference)) + return (target, reference) + + def last_target(self): + if self.backups: + return self.backups[-1] + return self.base_target() + + def del_target(self): + for image in self.backups.pop(): + try_remove(image) + self.num -= 1 + + def cleanup(self): + for backup in self.backups: + for image in backup: + try_remove(image) + + +class TestIncrementalBackup(iotests.QMPTestCase): + def setUp(self): + self.bitmaps = list() + self.files = list() + self.drives = list() + self.vm = iotests.VM() + self.err_img = os.path.join(iotests.test_dir, 'err.%s' % iotests.imgfmt) + + # Create a base image with a distinctive patterning + drive0 = self.add_node('drive0') + self.img_create(drive0['file'], drive0['fmt']) + self.vm.add_drive(drive0['file']) + io_write_patterns(drive0['file'], (('0x41', 0, 512), + ('0xd5', '1M', '32k'), + ('0xdc', '32M', '124k'))) + self.vm.launch() + + + def add_node(self, node_id, fmt=iotests.imgfmt, path=None, backup=None): + if path is None: + path = os.path.join(iotests.test_dir, '%s.%s' % (node_id, fmt)) + if backup is None: + backup = os.path.join(iotests.test_dir, + '%s.full.backup.%s' % (node_id, fmt)) + + self.drives.append({ + 'id': node_id, + 'file': path, + 'backup': backup, + 'fmt': fmt }) + return self.drives[-1] + + + def img_create(self, img, fmt=iotests.imgfmt, size='64M', + parent=None, parentFormat=None): + if parent: + if parentFormat is None: + parentFormat = fmt + iotests.qemu_img('create', '-f', fmt, img, size, + '-b', parent, '-F', parentFormat) + else: + iotests.qemu_img('create', '-f', fmt, img, size) + self.files.append(img) + + + def do_qmp_backup(self, error='Input/output error', **kwargs): + res = self.vm.qmp('drive-backup', **kwargs) + self.assert_qmp(res, 'return', {}) + + event = self.vm.event_wait(name="BLOCK_JOB_COMPLETED", + match={'data': {'device': kwargs['device']}}) + self.assertIsNotNone(event) + + try: + failure = self.dictpath(event, 'data/error') + except AssertionError: + # Backup succeeded. + self.assert_qmp(event, 'data/offset', event['data']['len']) + return True + else: + # Backup failed. 
+ self.assert_qmp(event, 'data/error', error) + return False + + + def create_anchor_backup(self, drive=None): + if drive is None: + drive = self.drives[-1] + res = self.do_qmp_backup(device=drive['id'], sync='full', + format=drive['fmt'], target=drive['backup']) + self.assertTrue(res) + self.files.append(drive['backup']) + return drive['backup'] + + + def make_reference_backup(self, bitmap=None): + if bitmap is None: + bitmap = self.bitmaps[-1] + _, reference = bitmap.last_target() + res = self.do_qmp_backup(device=bitmap.drive['id'], sync='full', + format=bitmap.drive['fmt'], target=reference) + self.assertTrue(res) + + + def add_bitmap(self, name, drive, **kwargs): + bitmap = Bitmap(name, drive) + self.bitmaps.append(bitmap) + result = self.vm.qmp('block-dirty-bitmap-add', node=drive['id'], + name=bitmap.name, **kwargs) + self.assert_qmp(result, 'return', {}) + return bitmap + + + def prepare_backup(self, bitmap=None, parent=None): + if bitmap is None: + bitmap = self.bitmaps[-1] + if parent is None: + parent, _ = bitmap.last_target() + + target, _ = bitmap.new_target() + self.img_create(target, bitmap.drive['fmt'], parent=parent) + return target + + + def create_incremental(self, bitmap=None, parent=None, + parentFormat=None, validate=True): + if bitmap is None: + bitmap = self.bitmaps[-1] + if parent is None: + parent, _ = bitmap.last_target() + + target = self.prepare_backup(bitmap, parent) + res = self.do_qmp_backup(device=bitmap.drive['id'], + sync='dirty-bitmap', bitmap=bitmap.name, + format=bitmap.drive['fmt'], target=target, + mode='existing') + if not res: + bitmap.del_target(); + self.assertFalse(validate) + else: + self.make_reference_backup(bitmap) + return res + + + def check_backups(self): + for bitmap in self.bitmaps: + for incremental, reference in bitmap.backups: + self.assertTrue(iotests.compare_images(incremental, reference)) + last = bitmap.last_target()[0] + self.assertTrue(iotests.compare_images(last, bitmap.drive['file'])) + + + def hmp_io_writes(self, drive, patterns): + for pattern in patterns: + self.vm.hmp_qemu_io(drive, 'write -P%s %s %s' % pattern) + self.vm.hmp_qemu_io(drive, 'flush') + + + def do_incremental_simple(self, **kwargs): + self.create_anchor_backup() + self.add_bitmap('bitmap0', self.drives[0], **kwargs) + + # Sanity: Create a "hollow" incremental backup + self.create_incremental() + # Three writes: One complete overwrite, one new segment, + # and one partial overlap. + self.hmp_io_writes(self.drives[0]['id'], (('0xab', 0, 512), + ('0xfe', '16M', '256k'), + ('0x64', '32736k', '64k'))) + self.create_incremental() + # Three more writes, one of each kind, like above + self.hmp_io_writes(self.drives[0]['id'], (('0x9a', 0, 512), + ('0x55', '8M', '352k'), + ('0x78', '15872k', '1M'))) + self.create_incremental() + self.vm.shutdown() + self.check_backups() + + + def test_incremental_simple(self): + ''' + Test: Create and verify three incremental backups. + + Create a bitmap and a full backup before VM execution begins, + then create a series of three incremental backups "during execution," + i.e.; after IO requests begin modifying the drive. + ''' + return self.do_incremental_simple() + + + def test_small_granularity(self): + ''' + Test: Create and verify backups made with a small granularity bitmap. + + Perform the same test as test_incremental_simple, but with a granularity + of only 32KiB instead of the present default of 64KiB. 
+ ''' + return self.do_incremental_simple(granularity=32768) + + + def test_large_granularity(self): + ''' + Test: Create and verify backups made with a large granularity bitmap. + + Perform the same test as test_incremental_simple, but with a granularity + of 128KiB instead of the present default of 64KiB. + ''' + return self.do_incremental_simple(granularity=131072) + + + def test_incremental_failure(self): + '''Test: Verify backups made after a failure are correct. + + Simulate a failure during an incremental backup block job, + emulate additional writes, then create another incremental backup + afterwards and verify that the backup created is correct. + ''' + + # Create a blkdebug interface to this img as 'drive1', + # but don't actually create a new image. + drive1 = self.add_node('drive1', self.drives[0]['fmt'], + path=self.drives[0]['file'], + backup=self.drives[0]['backup']) + result = self.vm.qmp('blockdev-add', options={ + 'id': drive1['id'], + 'driver': drive1['fmt'], + 'file': { + 'driver': 'blkdebug', + 'image': { + 'driver': 'file', + 'filename': drive1['file'] + }, + 'set-state': [{ + 'event': 'flush_to_disk', + 'state': 1, + 'new_state': 2 + }], + 'inject-error': [{ + 'event': 'read_aio', + 'errno': 5, + 'state': 2, + 'immediately': False, + 'once': True + }], + } + }) + self.assert_qmp(result, 'return', {}) + + self.create_anchor_backup(self.drives[0]) + self.add_bitmap('bitmap0', drive1) + # Note: at this point, during a normal execution, + # Assume that the VM resumes and begins issuing IO requests here. + + self.hmp_io_writes(drive1['id'], (('0xab', 0, 512), + ('0xfe', '16M', '256k'), + ('0x64', '32736k', '64k'))) + + result = self.create_incremental(validate=False) + self.assertFalse(result) + self.hmp_io_writes(drive1['id'], (('0x9a', 0, 512), + ('0x55', '8M', '352k'), + ('0x78', '15872k', '1M'))) + self.create_incremental() + self.vm.shutdown() + self.check_backups() + + + def test_sync_dirty_bitmap_missing(self): + self.assert_no_active_block_jobs() + self.files.append(self.err_img) + result = self.vm.qmp('drive-backup', device=self.drives[0]['id'], + sync='dirty-bitmap', format=self.drives[0]['fmt'], + target=self.err_img) + self.assert_qmp(result, 'error/class', 'GenericError') + + + def test_sync_dirty_bitmap_not_found(self): + self.assert_no_active_block_jobs() + self.files.append(self.err_img) + result = self.vm.qmp('drive-backup', device=self.drives[0]['id'], + sync='dirty-bitmap', bitmap='unknown', + format=self.drives[0]['fmt'], target=self.err_img) + self.assert_qmp(result, 'error/class', 'GenericError') + + + def test_sync_dirty_bitmap_bad_granularity(self): + ''' + Test: Test what happens if we provide an improper granularity. + + The granularity must always be a power of 2. + ''' + self.assert_no_active_block_jobs() + self.assertRaises(AssertionError, self.add_bitmap, + 'bitmap0', self.drives[0], + granularity=64000) + + + def tearDown(self): + self.vm.shutdown() + for bitmap in self.bitmaps: + bitmap.cleanup() + for filename in self.files: + try_remove(filename) + + +if __name__ == '__main__': + iotests.main(supported_fmts=['qcow2']) diff --git a/tests/qemu-iotests/124.out b/tests/qemu-iotests/124.out new file mode 100644 index 0000000000..2f7d3902f2 --- /dev/null +++ b/tests/qemu-iotests/124.out @@ -0,0 +1,5 @@ +....... 
+---------------------------------------------------------------------- +Ran 7 tests + +OK diff --git a/tests/qemu-iotests/129 b/tests/qemu-iotests/129 new file mode 100644 index 0000000000..9e87e1c8d9 --- /dev/null +++ b/tests/qemu-iotests/129 @@ -0,0 +1,86 @@ +#!/usr/bin/env python +# +# Tests that "bdrv_drain_all" doesn't drain block jobs +# +# Copyright (C) 2015 Red Hat, Inc. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# + +import os +import iotests +import time + +class TestStopWithBlockJob(iotests.QMPTestCase): + test_img = os.path.join(iotests.test_dir, 'test.img') + target_img = os.path.join(iotests.test_dir, 'target.img') + base_img = os.path.join(iotests.test_dir, 'base.img') + + def setUp(self): + iotests.qemu_img('create', '-f', iotests.imgfmt, self.base_img, "1G") + iotests.qemu_img('create', '-f', iotests.imgfmt, self.test_img, "-b", self.base_img) + iotests.qemu_io('-f', iotests.imgfmt, '-c', 'write -P0x5d 1M 128M', self.test_img) + self.vm = iotests.VM().add_drive(self.test_img) + self.vm.launch() + + def tearDown(self): + params = {"device": "drive0", + "bps": 0, + "bps_rd": 0, + "bps_wr": 0, + "iops": 0, + "iops_rd": 0, + "iops_wr": 0, + } + result = self.vm.qmp("block_set_io_throttle", conv_keys=False, + **params) + self.vm.shutdown() + + def do_test_stop(self, cmd, **args): + """Test 'stop' while block job is running on a throttled drive. + The 'stop' command shouldn't drain the job""" + params = {"device": "drive0", + "bps": 1024, + "bps_rd": 0, + "bps_wr": 0, + "iops": 0, + "iops_rd": 0, + "iops_wr": 0, + } + result = self.vm.qmp("block_set_io_throttle", conv_keys=False, + **params) + self.assert_qmp(result, 'return', {}) + result = self.vm.qmp(cmd, **args) + self.assert_qmp(result, 'return', {}) + result = self.vm.qmp("stop") + self.assert_qmp(result, 'return', {}) + result = self.vm.qmp("query-block-jobs") + self.assert_qmp(result, 'return[0]/busy', True) + self.assert_qmp(result, 'return[0]/ready', False) + + def test_drive_mirror(self): + self.do_test_stop("drive-mirror", device="drive0", + target=self.target_img, + sync="full") + + def test_drive_backup(self): + self.do_test_stop("drive-backup", device="drive0", + target=self.target_img, + sync="full") + + def test_block_commit(self): + self.do_test_stop("block-commit", device="drive0") + +if __name__ == '__main__': + iotests.main(supported_fmts=["qcow2"]) diff --git a/tests/qemu-iotests/129.out b/tests/qemu-iotests/129.out new file mode 100644 index 0000000000..8d7e996700 --- /dev/null +++ b/tests/qemu-iotests/129.out @@ -0,0 +1,5 @@ +... 
+---------------------------------------------------------------------- +Ran 3 tests + +OK diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group index bcf25786ab..6ca3466ec5 100644 --- a/tests/qemu-iotests/group +++ b/tests/qemu-iotests/group @@ -122,6 +122,9 @@ 115 rw auto 116 rw auto quick 121 rw auto +122 rw auto 123 rw auto quick +124 rw auto backing 128 rw auto quick +129 rw auto quick 130 rw auto quick diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py index 14028540b3..e93e62387b 100644 --- a/tests/qemu-iotests/iotests.py +++ b/tests/qemu-iotests/iotests.py @@ -78,6 +78,23 @@ def create_image(name, size): i = i + 512 file.close() +# Test if 'match' is a recursive subset of 'event' +def event_match(event, match=None): + if match is None: + return True + + for key in match: + if key in event: + if isinstance(event[key], dict): + if not event_match(event[key], match[key]): + return False + elif event[key] != match[key]: + return False + else: + return False + + return True + class VM(object): '''A QEMU VM''' @@ -92,6 +109,7 @@ class VM(object): '-machine', 'accel=qtest', '-display', 'none', '-vga', 'none'] self._num_drives = 0 + self._events = [] # This can be used to add an unused monitor instance. def add_monitor_telnet(self, ip, port): @@ -202,14 +220,34 @@ class VM(object): def get_qmp_event(self, wait=False): '''Poll for one queued QMP events and return it''' + if len(self._events) > 0: + return self._events.pop(0) return self._qmp.pull_event(wait=wait) def get_qmp_events(self, wait=False): '''Poll for queued QMP events and return a list of dicts''' events = self._qmp.get_events(wait=wait) + events.extend(self._events) + del self._events[:] self._qmp.clear_events() return events + def event_wait(self, name='BLOCK_JOB_COMPLETED', timeout=60.0, match=None): + # Search cached events + for event in self._events: + if (event['event'] == name) and event_match(event, match): + self._events.remove(event) + return event + + # Poll for new events + while True: + event = self._qmp.pull_event(wait=timeout) + if (event['event'] == name) and event_match(event, match): + return event + self._events.append(event) + + return None + index_re = re.compile(r'([^\[]+)\[([^\]]+)\]') class QMPTestCase(unittest.TestCase): diff --git a/tests/test-aio.c b/tests/test-aio.c index a7cb5c9915..4b0cb45d31 100644 --- a/tests/test-aio.c +++ b/tests/test-aio.c @@ -107,6 +107,7 @@ static void test_notify(void) typedef struct { QemuMutex start_lock; + EventNotifier notifier; bool thread_acquired; } AcquireTestData; @@ -118,6 +119,8 @@ static void *test_acquire_thread(void *opaque) qemu_mutex_lock(&data->start_lock); qemu_mutex_unlock(&data->start_lock); + g_usleep(500000); + event_notifier_set(&data->notifier); aio_context_acquire(ctx); aio_context_release(ctx); @@ -126,20 +129,19 @@ static void *test_acquire_thread(void *opaque) return NULL; } -static void dummy_notifier_read(EventNotifier *unused) +static void dummy_notifier_read(EventNotifier *n) { - g_assert(false); /* should never be invoked */ + event_notifier_test_and_clear(n); } static void test_acquire(void) { QemuThread thread; - EventNotifier notifier; AcquireTestData data; /* Dummy event notifier ensures aio_poll() will block */ - event_notifier_init(¬ifier, false); - aio_set_event_notifier(ctx, ¬ifier, dummy_notifier_read); + event_notifier_init(&data.notifier, false); + aio_set_event_notifier(ctx, &data.notifier, dummy_notifier_read); g_assert(!aio_poll(ctx, false)); /* consume aio_notify() */ 
qemu_mutex_init(&data.start_lock); @@ -153,12 +155,13 @@ static void test_acquire(void) /* Block in aio_poll(), let other thread kick us and acquire context */ aio_context_acquire(ctx); qemu_mutex_unlock(&data.start_lock); /* let the thread run */ - g_assert(!aio_poll(ctx, true)); + g_assert(aio_poll(ctx, true)); + g_assert(!data.thread_acquired); aio_context_release(ctx); qemu_thread_join(&thread); - aio_set_event_notifier(ctx, ¬ifier, NULL); - event_notifier_cleanup(¬ifier); + aio_set_event_notifier(ctx, &data.notifier, NULL); + event_notifier_cleanup(&data.notifier); g_assert(data.thread_acquired); } diff --git a/tests/test-hbitmap.c b/tests/test-hbitmap.c index 8c902f2055..9f41b5fd2e 100644 --- a/tests/test-hbitmap.c +++ b/tests/test-hbitmap.c @@ -11,6 +11,8 @@ #include <glib.h> #include <stdarg.h> +#include <string.h> +#include <sys/types.h> #include "qemu/hbitmap.h" #define LOG_BITS_PER_LONG (BITS_PER_LONG == 32 ? 5 : 6) @@ -23,6 +25,7 @@ typedef struct TestHBitmapData { HBitmap *hb; unsigned long *bits; size_t size; + size_t old_size; int granularity; } TestHBitmapData; @@ -91,6 +94,44 @@ static void hbitmap_test_init(TestHBitmapData *data, } } +static inline size_t hbitmap_test_array_size(size_t bits) +{ + size_t n = (bits + BITS_PER_LONG - 1) / BITS_PER_LONG; + return n ? n : 1; +} + +static void hbitmap_test_truncate_impl(TestHBitmapData *data, + size_t size) +{ + size_t n; + size_t m; + data->old_size = data->size; + data->size = size; + + if (data->size == data->old_size) { + return; + } + + n = hbitmap_test_array_size(size); + m = hbitmap_test_array_size(data->old_size); + data->bits = g_realloc(data->bits, sizeof(unsigned long) * n); + if (n > m) { + memset(&data->bits[m], 0x00, sizeof(unsigned long) * (n - m)); + } + + /* If we shrink to an uneven multiple of sizeof(unsigned long), + * scrub the leftover memory. */ + if (data->size < data->old_size) { + m = size % (sizeof(unsigned long) * 8); + if (m) { + unsigned long mask = (1ULL << m) - 1; + data->bits[n-1] &= mask; + } + } + + hbitmap_truncate(data->hb, size); +} + static void hbitmap_test_teardown(TestHBitmapData *data, const void *unused) { @@ -369,6 +410,198 @@ static void test_hbitmap_iter_granularity(TestHBitmapData *data, g_assert_cmpint(hbitmap_iter_next(&hbi), <, 0); } +static void hbitmap_test_set_boundary_bits(TestHBitmapData *data, ssize_t diff) +{ + size_t size = data->size; + + /* First bit */ + hbitmap_test_set(data, 0, 1); + if (diff < 0) { + /* Last bit in new, shortened map */ + hbitmap_test_set(data, size + diff - 1, 1); + + /* First bit to be truncated away */ + hbitmap_test_set(data, size + diff, 1); + } + /* Last bit */ + hbitmap_test_set(data, size - 1, 1); + if (data->granularity == 0) { + hbitmap_test_check_get(data); + } +} + +static void hbitmap_test_check_boundary_bits(TestHBitmapData *data) +{ + size_t size = MIN(data->size, data->old_size); + + if (data->granularity == 0) { + hbitmap_test_check_get(data); + hbitmap_test_check(data, 0); + } else { + /* If a granularity was set, note that every distinct + * (bit >> granularity) value that was set will increase + * the bit pop count by 2^granularity, not just 1. + * + * The hbitmap_test_check facility does not currently tolerate + * non-zero granularities, so test the boundaries and the population + * count manually. + */ + g_assert(hbitmap_get(data->hb, 0)); + g_assert(hbitmap_get(data->hb, size - 1)); + g_assert_cmpint(2 << data->granularity, ==, hbitmap_count(data->hb)); + } +} + +/* Generic truncate test. 
*/ +static void hbitmap_test_truncate(TestHBitmapData *data, + size_t size, + ssize_t diff, + int granularity) +{ + hbitmap_test_init(data, size, granularity); + hbitmap_test_set_boundary_bits(data, diff); + hbitmap_test_truncate_impl(data, size + diff); + hbitmap_test_check_boundary_bits(data); +} + +static void test_hbitmap_truncate_nop(TestHBitmapData *data, + const void *unused) +{ + hbitmap_test_truncate(data, L2, 0, 0); +} + +/** + * Grow by an amount smaller than the granularity, without crossing + * a granularity alignment boundary. Effectively a NOP. + */ +static void test_hbitmap_truncate_grow_negligible(TestHBitmapData *data, + const void *unused) +{ + size_t size = L2 - 1; + size_t diff = 1; + int granularity = 1; + + hbitmap_test_truncate(data, size, diff, granularity); +} + +/** + * Shrink by an amount smaller than the granularity, without crossing + * a granularity alignment boundary. Effectively a NOP. + */ +static void test_hbitmap_truncate_shrink_negligible(TestHBitmapData *data, + const void *unused) +{ + size_t size = L2; + ssize_t diff = -1; + int granularity = 1; + + hbitmap_test_truncate(data, size, diff, granularity); +} + +/** + * Grow by an amount smaller than the granularity, but crossing over + * a granularity alignment boundary. + */ +static void test_hbitmap_truncate_grow_tiny(TestHBitmapData *data, + const void *unused) +{ + size_t size = L2 - 2; + ssize_t diff = 1; + int granularity = 1; + + hbitmap_test_truncate(data, size, diff, granularity); +} + +/** + * Shrink by an amount smaller than the granularity, but crossing over + * a granularity alignment boundary. + */ +static void test_hbitmap_truncate_shrink_tiny(TestHBitmapData *data, + const void *unused) +{ + size_t size = L2 - 1; + ssize_t diff = -1; + int granularity = 1; + + hbitmap_test_truncate(data, size, diff, granularity); +} + +/** + * Grow by an amount smaller than sizeof(long), and not crossing over + * a sizeof(long) alignment boundary. + */ +static void test_hbitmap_truncate_grow_small(TestHBitmapData *data, + const void *unused) +{ + size_t size = L2 + 1; + size_t diff = sizeof(long) / 2; + + hbitmap_test_truncate(data, size, diff, 0); +} + +/** + * Shrink by an amount smaller than sizeof(long), and not crossing over + * a sizeof(long) alignment boundary. + */ +static void test_hbitmap_truncate_shrink_small(TestHBitmapData *data, + const void *unused) +{ + size_t size = L2; + size_t diff = sizeof(long) / 2; + + hbitmap_test_truncate(data, size, -diff, 0); +} + +/** + * Grow by an amount smaller than sizeof(long), while crossing over + * a sizeof(long) alignment boundary. + */ +static void test_hbitmap_truncate_grow_medium(TestHBitmapData *data, + const void *unused) +{ + size_t size = L2 - 1; + size_t diff = sizeof(long) / 2; + + hbitmap_test_truncate(data, size, diff, 0); +} + +/** + * Shrink by an amount smaller than sizeof(long), while crossing over + * a sizeof(long) alignment boundary. + */ +static void test_hbitmap_truncate_shrink_medium(TestHBitmapData *data, + const void *unused) +{ + size_t size = L2 + 1; + size_t diff = sizeof(long) / 2; + + hbitmap_test_truncate(data, size, -diff, 0); +} + +/** + * Grow by an amount larger than sizeof(long). + */ +static void test_hbitmap_truncate_grow_large(TestHBitmapData *data, + const void *unused) +{ + size_t size = L2; + size_t diff = 8 * sizeof(long); + + hbitmap_test_truncate(data, size, diff, 0); +} + +/** + * Shrink by an amount larger than sizeof(long). 
+ */ +static void test_hbitmap_truncate_shrink_large(TestHBitmapData *data, + const void *unused) +{ + size_t size = L2; + size_t diff = 8 * sizeof(long); + + hbitmap_test_truncate(data, size, -diff, 0); +} + static void hbitmap_test_add(const char *testpath, void (*test_func)(TestHBitmapData *data, const void *user_data)) { @@ -395,6 +628,28 @@ int main(int argc, char **argv) hbitmap_test_add("/hbitmap/reset/empty", test_hbitmap_reset_empty); hbitmap_test_add("/hbitmap/reset/general", test_hbitmap_reset); hbitmap_test_add("/hbitmap/granularity", test_hbitmap_granularity); + + hbitmap_test_add("/hbitmap/truncate/nop", test_hbitmap_truncate_nop); + hbitmap_test_add("/hbitmap/truncate/grow/negligible", + test_hbitmap_truncate_grow_negligible); + hbitmap_test_add("/hbitmap/truncate/shrink/negligible", + test_hbitmap_truncate_shrink_negligible); + hbitmap_test_add("/hbitmap/truncate/grow/tiny", + test_hbitmap_truncate_grow_tiny); + hbitmap_test_add("/hbitmap/truncate/shrink/tiny", + test_hbitmap_truncate_shrink_tiny); + hbitmap_test_add("/hbitmap/truncate/grow/small", + test_hbitmap_truncate_grow_small); + hbitmap_test_add("/hbitmap/truncate/shrink/small", + test_hbitmap_truncate_shrink_small); + hbitmap_test_add("/hbitmap/truncate/grow/medium", + test_hbitmap_truncate_grow_medium); + hbitmap_test_add("/hbitmap/truncate/shrink/medium", + test_hbitmap_truncate_shrink_medium); + hbitmap_test_add("/hbitmap/truncate/grow/large", + test_hbitmap_truncate_grow_large); + hbitmap_test_add("/hbitmap/truncate/shrink/large", + test_hbitmap_truncate_shrink_large); g_test_run(); return 0; diff --git a/thread-pool.c b/thread-pool.c index e2cac8e4ff..ac909f4986 100644 --- a/thread-pool.c +++ b/thread-pool.c @@ -170,12 +170,12 @@ restart: if (elem->state != THREAD_DONE) { continue; } - if (elem->state == THREAD_DONE) { - trace_thread_pool_complete(pool, elem, elem->common.opaque, - elem->ret); - } - if (elem->state == THREAD_DONE && elem->common.cb) { - QLIST_REMOVE(elem, all); + + trace_thread_pool_complete(pool, elem, elem->common.opaque, + elem->ret); + QLIST_REMOVE(elem, all); + + if (elem->common.cb) { /* Read state before ret. */ smp_rmb(); @@ -188,8 +188,6 @@ restart: qemu_aio_unref(elem); goto restart; } else { - /* remove the request */ - QLIST_REMOVE(elem, all); qemu_aio_unref(elem); } } diff --git a/util/hbitmap.c b/util/hbitmap.c index ab139717f5..a10c7aeeda 100644 --- a/util/hbitmap.c +++ b/util/hbitmap.c @@ -90,6 +90,9 @@ struct HBitmap { * bitmap will still allocate HBITMAP_LEVELS arrays. */ unsigned long *levels[HBITMAP_LEVELS]; + + /* The length of each levels[] array. */ + uint64_t sizes[HBITMAP_LEVELS]; }; /* Advance hbi to the next nonzero word and return it. hbi->pos @@ -384,6 +387,7 @@ HBitmap *hbitmap_alloc(uint64_t size, int granularity) hb->granularity = granularity; for (i = HBITMAP_LEVELS; i-- > 0; ) { size = MAX((size + BITS_PER_LONG - 1) >> BITS_PER_LEVEL, 1); + hb->sizes[i] = size; hb->levels[i] = g_new0(unsigned long, size); } @@ -395,3 +399,84 @@ HBitmap *hbitmap_alloc(uint64_t size, int granularity) hb->levels[0][0] |= 1UL << (BITS_PER_LONG - 1); return hb; } + +void hbitmap_truncate(HBitmap *hb, uint64_t size) +{ + bool shrink; + unsigned i; + uint64_t num_elements = size; + uint64_t old; + + /* Size comes in as logical elements, adjust for granularity. */ + size = (size + (1ULL << hb->granularity) - 1) >> hb->granularity; + assert(size <= ((uint64_t)1 << HBITMAP_LOG_MAX_SIZE)); + shrink = size < hb->size; + + /* bit sizes are identical; nothing to do. 
*/ + if (size == hb->size) { + return; + } + + /* If we're losing bits, let's clear those bits before we invalidate all of + * our invariants. This helps keep the bitcount consistent, and will prevent + * us from carrying around garbage bits beyond the end of the map. + */ + if (shrink) { + /* Don't clear partial granularity groups; + * start at the first full one. */ + uint64_t start = QEMU_ALIGN_UP(num_elements, 1 << hb->granularity); + uint64_t fix_count = (hb->size << hb->granularity) - start; + + assert(fix_count); + hbitmap_reset(hb, start, fix_count); + } + + hb->size = size; + for (i = HBITMAP_LEVELS; i-- > 0; ) { + size = MAX(BITS_TO_LONGS(size), 1); + if (hb->sizes[i] == size) { + break; + } + old = hb->sizes[i]; + hb->sizes[i] = size; + hb->levels[i] = g_realloc(hb->levels[i], size * sizeof(unsigned long)); + if (!shrink) { + memset(&hb->levels[i][old], 0x00, + (size - old) * sizeof(*hb->levels[i])); + } + } +} + + +/** + * Given HBitmaps A and B, let A := A (BITOR) B. + * Bitmap B will not be modified. + * + * @return true if the merge was successful, + * false if it was not attempted. + */ +bool hbitmap_merge(HBitmap *a, const HBitmap *b) +{ + int i; + uint64_t j; + + if ((a->size != b->size) || (a->granularity != b->granularity)) { + return false; + } + + if (hbitmap_count(b) == 0) { + return true; + } + + /* This merge is O(size), as BITS_PER_LONG and HBITMAP_LEVELS are constant. + * It may be possible to improve running times for sparsely populated maps + * by using hbitmap_iter_next, but this is suboptimal for dense maps. + */ + for (i = HBITMAP_LEVELS - 1; i >= 0; i--) { + for (j = 0; j < a->sizes[i]; j++) { + a->levels[i][j] |= b->levels[i][j]; + } + } + + return true; +} |
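Editor's note on the hbitmap_merge() hunk above: the function refuses to merge bitmaps whose size or granularity differ, and otherwise ORs every level array of B into A, leaving B untouched. The following is a minimal usage sketch, not part of the patch; it relies on hbitmap_alloc(), hbitmap_count() and hbitmap_merge() as they appear in util/hbitmap.c in this diff, and it assumes a setter named hbitmap_set(start, count) with the obvious semantics (that call is not shown in this series).

    #include <assert.h>
    #include "qemu/hbitmap.h"

    /* Illustrative only: merge two bitmaps created with identical size
     * and granularity, then check the combined population count. */
    static void merge_example(void)
    {
        HBitmap *a = hbitmap_alloc(1 << 20, 0);   /* 1M bits, granularity 0 */
        HBitmap *b = hbitmap_alloc(1 << 20, 0);

        hbitmap_set(a, 0, 64);        /* assumed API: mark bits [0, 64) in A      */
        hbitmap_set(b, 4096, 128);    /* assumed API: mark bits [4096, 4224) in B */

        if (hbitmap_merge(a, b)) {    /* A := A | B; B is not modified */
            /* Ranges are disjoint, so the counts simply add up. */
            assert(hbitmap_count(a) == 64 + 128);
        }
    }

Because the merge walks every word of every level, it is O(size) regardless of how sparse B is, which matches the comment in the hunk; callers that expect failure (mismatched geometry) should check the boolean return rather than assume the merge happened.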