diff options
Diffstat (limited to 'block')
-rw-r--r-- | block/Makefile.objs | 3 | ||||
-rw-r--r-- | block/backup.c | 21 | ||||
-rw-r--r-- | block/blkdebug.c | 2 | ||||
-rw-r--r-- | block/commit.c | 70 | ||||
-rw-r--r-- | block/curl.c | 9 | ||||
-rw-r--r-- | block/iscsi.c | 49 | ||||
-rw-r--r-- | block/mirror.c | 119 | ||||
-rw-r--r-- | block/parallels.c | 2 | ||||
-rw-r--r-- | block/qcow2-cluster.c | 144 | ||||
-rw-r--r-- | block/qcow2-refcount.c | 26 | ||||
-rw-r--r-- | block/qcow2-snapshot.c | 2 | ||||
-rw-r--r-- | block/qcow2.c | 202 | ||||
-rw-r--r-- | block/qcow2.h | 7 | ||||
-rw-r--r-- | block/raw-posix.c | 42 | ||||
-rw-r--r-- | block/rbd.c | 15 | ||||
-rw-r--r-- | block/snapshot.c | 4 | ||||
-rw-r--r-- | block/stream.c | 50 |
17 files changed, 558 insertions, 209 deletions
diff --git a/block/Makefile.objs b/block/Makefile.objs index 27911b6b88..04b0e43eb1 100644 --- a/block/Makefile.objs +++ b/block/Makefile.objs @@ -9,7 +9,7 @@ block-obj-y += block-backend.o snapshot.o qapi.o block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o block-obj-$(CONFIG_POSIX) += raw-posix.o block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o -block-obj-y += null.o +block-obj-y += null.o mirror.o block-obj-y += nbd.o nbd-client.o sheepdog.o block-obj-$(CONFIG_LIBISCSI) += iscsi.o @@ -23,7 +23,6 @@ block-obj-y += accounting.o common-obj-y += stream.o common-obj-y += commit.o -common-obj-y += mirror.o common-obj-y += backup.o iscsi.o-cflags := $(LIBISCSI_CFLAGS) diff --git a/block/backup.c b/block/backup.c index e334740161..792e65514b 100644 --- a/block/backup.c +++ b/block/backup.c @@ -227,9 +227,25 @@ static BlockErrorAction backup_error_action(BackupBlockJob *job, } } +typedef struct { + int ret; +} BackupCompleteData; + +static void backup_complete(BlockJob *job, void *opaque) +{ + BackupBlockJob *s = container_of(job, BackupBlockJob, common); + BackupCompleteData *data = opaque; + + bdrv_unref(s->target); + + block_job_completed(job, data->ret); + g_free(data); +} + static void coroutine_fn backup_run(void *opaque) { BackupBlockJob *job = opaque; + BackupCompleteData *data; BlockDriverState *bs = job->common.bs; BlockDriverState *target = job->target; BlockdevOnError on_target_error = job->on_target_error; @@ -344,9 +360,10 @@ static void coroutine_fn backup_run(void *opaque) hbitmap_free(job->bitmap); bdrv_iostatus_disable(target); - bdrv_unref(target); - block_job_completed(&job->common, ret); + data = g_malloc(sizeof(*data)); + data->ret = ret; + block_job_defer_to_main_loop(&job->common, backup_complete, data); } void backup_start(BlockDriverState *bs, BlockDriverState *target, diff --git a/block/blkdebug.c b/block/blkdebug.c index e046b920fb..862d93b59b 100644 --- a/block/blkdebug.c +++ b/block/blkdebug.c @@ -195,6 +195,8 @@ static const char *event_names[BLKDBG_EVENT_MAX] = { [BLKDBG_PWRITEV] = "pwritev", [BLKDBG_PWRITEV_ZERO] = "pwritev_zero", [BLKDBG_PWRITEV_DONE] = "pwritev_done", + + [BLKDBG_EMPTY_IMAGE_PREPARE] = "empty_image_prepare", }; static int get_event_by_name(const char *name, BlkDebugEvent *event) diff --git a/block/commit.c b/block/commit.c index 60a2accf04..cfa2bbebc2 100644 --- a/block/commit.c +++ b/block/commit.c @@ -60,17 +60,50 @@ static int coroutine_fn commit_populate(BlockDriverState *bs, return 0; } -static void coroutine_fn commit_run(void *opaque) +typedef struct { + int ret; +} CommitCompleteData; + +static void commit_complete(BlockJob *job, void *opaque) { - CommitBlockJob *s = opaque; + CommitBlockJob *s = container_of(job, CommitBlockJob, common); + CommitCompleteData *data = opaque; BlockDriverState *active = s->active; BlockDriverState *top = s->top; BlockDriverState *base = s->base; BlockDriverState *overlay_bs; + int ret = data->ret; + + if (!block_job_is_cancelled(&s->common) && ret == 0) { + /* success */ + ret = bdrv_drop_intermediate(active, top, base, s->backing_file_str); + } + + /* restore base open flags here if appropriate (e.g., change the base back + * to r/o). These reopens do not need to be atomic, since we won't abort + * even on failure here */ + if (s->base_flags != bdrv_get_flags(base)) { + bdrv_reopen(base, s->base_flags, NULL); + } + overlay_bs = bdrv_find_overlay(active, top); + if (overlay_bs && s->orig_overlay_flags != bdrv_get_flags(overlay_bs)) { + bdrv_reopen(overlay_bs, s->orig_overlay_flags, NULL); + } + g_free(s->backing_file_str); + block_job_completed(&s->common, ret); + g_free(data); +} + +static void coroutine_fn commit_run(void *opaque) +{ + CommitBlockJob *s = opaque; + CommitCompleteData *data; + BlockDriverState *top = s->top; + BlockDriverState *base = s->base; int64_t sector_num, end; int ret = 0; int n = 0; - void *buf; + void *buf = NULL; int bytes_written = 0; int64_t base_len; @@ -78,18 +111,18 @@ static void coroutine_fn commit_run(void *opaque) if (s->common.len < 0) { - goto exit_restore_reopen; + goto out; } ret = base_len = bdrv_getlength(base); if (base_len < 0) { - goto exit_restore_reopen; + goto out; } if (base_len < s->common.len) { ret = bdrv_truncate(base, s->common.len); if (ret) { - goto exit_restore_reopen; + goto out; } } @@ -128,7 +161,7 @@ wait: if (s->on_error == BLOCKDEV_ON_ERROR_STOP || s->on_error == BLOCKDEV_ON_ERROR_REPORT|| (s->on_error == BLOCKDEV_ON_ERROR_ENOSPC && ret == -ENOSPC)) { - goto exit_free_buf; + goto out; } else { n = 0; continue; @@ -140,27 +173,12 @@ wait: ret = 0; - if (!block_job_is_cancelled(&s->common) && sector_num == end) { - /* success */ - ret = bdrv_drop_intermediate(active, top, base, s->backing_file_str); - } - -exit_free_buf: +out: qemu_vfree(buf); -exit_restore_reopen: - /* restore base open flags here if appropriate (e.g., change the base back - * to r/o). These reopens do not need to be atomic, since we won't abort - * even on failure here */ - if (s->base_flags != bdrv_get_flags(base)) { - bdrv_reopen(base, s->base_flags, NULL); - } - overlay_bs = bdrv_find_overlay(active, top); - if (overlay_bs && s->orig_overlay_flags != bdrv_get_flags(overlay_bs)) { - bdrv_reopen(overlay_bs, s->orig_overlay_flags, NULL); - } - g_free(s->backing_file_str); - block_job_completed(&s->common, ret); + data = g_malloc(sizeof(*data)); + data->ret = ret; + block_job_defer_to_main_loop(&s->common, commit_complete, data); } static void commit_set_speed(BlockJob *job, int64_t speed, Error **errp) diff --git a/block/curl.c b/block/curl.c index b4157cc8b3..bbee3ca179 100644 --- a/block/curl.c +++ b/block/curl.c @@ -64,6 +64,7 @@ static CURLMcode __curl_multi_socket_action(CURLM *multi_handle, #define SECTOR_SIZE 512 #define READ_AHEAD_DEFAULT (256 * 1024) #define CURL_TIMEOUT_DEFAULT 5 +#define CURL_TIMEOUT_MAX 10000 #define FIND_RET_NONE 0 #define FIND_RET_OK 1 @@ -112,7 +113,7 @@ typedef struct BDRVCURLState { char *url; size_t readahead_size; bool sslverify; - int timeout; + uint64_t timeout; char *cookie; bool accept_range; AioContext *aio_context; @@ -390,7 +391,7 @@ static CURLState *curl_init_state(BlockDriverState *bs, BDRVCURLState *s) if (s->cookie) { curl_easy_setopt(state->curl, CURLOPT_COOKIE, s->cookie); } - curl_easy_setopt(state->curl, CURLOPT_TIMEOUT, s->timeout); + curl_easy_setopt(state->curl, CURLOPT_TIMEOUT, (long)s->timeout); curl_easy_setopt(state->curl, CURLOPT_WRITEFUNCTION, (void *)curl_read_cb); curl_easy_setopt(state->curl, CURLOPT_WRITEDATA, (void *)state); @@ -546,6 +547,10 @@ static int curl_open(BlockDriverState *bs, QDict *options, int flags, s->timeout = qemu_opt_get_number(opts, CURL_BLOCK_OPT_TIMEOUT, CURL_TIMEOUT_DEFAULT); + if (s->timeout > CURL_TIMEOUT_MAX) { + error_setg(errp, "timeout parameter is too large or negative"); + goto out_noclean; + } s->sslverify = qemu_opt_get_bool(opts, CURL_BLOCK_OPT_SSLVERIFY, true); diff --git a/block/iscsi.c b/block/iscsi.c index 3485d622ec..ed375fc30e 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -362,6 +362,12 @@ static int coroutine_fn iscsi_co_writev(BlockDriverState *bs, return -EINVAL; } + if (bs->bl.max_transfer_length && nb_sectors > bs->bl.max_transfer_length) { + error_report("iSCSI Error: Write of %d sectors exceeds max_xfer_len " + "of %d sectors", nb_sectors, bs->bl.max_transfer_length); + return -EINVAL; + } + lba = sector_qemu2lun(sector_num, iscsilun); num_sectors = sector_qemu2lun(nb_sectors, iscsilun); iscsi_co_init_iscsitask(iscsilun, &iTask); @@ -529,6 +535,12 @@ static int coroutine_fn iscsi_co_readv(BlockDriverState *bs, return -EINVAL; } + if (bs->bl.max_transfer_length && nb_sectors > bs->bl.max_transfer_length) { + error_report("iSCSI Error: Read of %d sectors exceeds max_xfer_len " + "of %d sectors", nb_sectors, bs->bl.max_transfer_length); + return -EINVAL; + } + if (iscsilun->lbprz && nb_sectors >= ISCSI_CHECKALLOC_THRES && !iscsi_allocationmap_is_allocated(iscsilun, sector_num, nb_sectors)) { int64_t ret; @@ -1489,31 +1501,44 @@ static void iscsi_close(BlockDriverState *bs) memset(iscsilun, 0, sizeof(IscsiLun)); } -static void iscsi_refresh_limits(BlockDriverState *bs, Error **errp) +static int sector_limits_lun2qemu(int64_t sector, IscsiLun *iscsilun) { - IscsiLun *iscsilun = bs->opaque; + return MIN(sector_lun2qemu(sector, iscsilun), INT_MAX / 2 + 1); +} +static void iscsi_refresh_limits(BlockDriverState *bs, Error **errp) +{ /* We don't actually refresh here, but just return data queried in * iscsi_open(): iscsi targets don't change their limits. */ + + IscsiLun *iscsilun = bs->opaque; + uint32_t max_xfer_len = iscsilun->use_16_for_rw ? 0xffffffff : 0xffff; + + if (iscsilun->bl.max_xfer_len) { + max_xfer_len = MIN(max_xfer_len, iscsilun->bl.max_xfer_len); + } + + bs->bl.max_transfer_length = sector_limits_lun2qemu(max_xfer_len, iscsilun); + if (iscsilun->lbp.lbpu) { if (iscsilun->bl.max_unmap < 0xffffffff) { - bs->bl.max_discard = sector_lun2qemu(iscsilun->bl.max_unmap, - iscsilun); + bs->bl.max_discard = + sector_limits_lun2qemu(iscsilun->bl.max_unmap, iscsilun); } - bs->bl.discard_alignment = sector_lun2qemu(iscsilun->bl.opt_unmap_gran, - iscsilun); + bs->bl.discard_alignment = + sector_limits_lun2qemu(iscsilun->bl.opt_unmap_gran, iscsilun); } if (iscsilun->bl.max_ws_len < 0xffffffff) { - bs->bl.max_write_zeroes = sector_lun2qemu(iscsilun->bl.max_ws_len, - iscsilun); + bs->bl.max_write_zeroes = + sector_limits_lun2qemu(iscsilun->bl.max_ws_len, iscsilun); } if (iscsilun->lbp.lbpws) { - bs->bl.write_zeroes_alignment = sector_lun2qemu(iscsilun->bl.opt_unmap_gran, - iscsilun); + bs->bl.write_zeroes_alignment = + sector_limits_lun2qemu(iscsilun->bl.opt_unmap_gran, iscsilun); } - bs->bl.opt_transfer_length = sector_lun2qemu(iscsilun->bl.opt_xfer_len, - iscsilun); + bs->bl.opt_transfer_length = + sector_limits_lun2qemu(iscsilun->bl.opt_xfer_len, iscsilun); } /* Since iscsi_open() ignores bdrv_flags, there is nothing to do here in diff --git a/block/mirror.c b/block/mirror.c index e8a43eb39e..2c6dd2a4c1 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -45,6 +45,7 @@ typedef struct MirrorBlockJob { int64_t sector_num; int64_t granularity; size_t buf_size; + int64_t bdev_length; unsigned long *cow_bitmap; BdrvDirtyBitmap *dirty_bitmap; HBitmapIter hbi; @@ -54,6 +55,7 @@ typedef struct MirrorBlockJob { unsigned long *in_flight_bitmap; int in_flight; + int sectors_in_flight; int ret; } MirrorBlockJob; @@ -87,6 +89,7 @@ static void mirror_iteration_done(MirrorOp *op, int ret) trace_mirror_iteration_done(s, op->sector_num, op->nb_sectors, ret); s->in_flight--; + s->sectors_in_flight -= op->nb_sectors; iov = op->qiov.iov; for (i = 0; i < op->qiov.niov; i++) { MirrorBuffer *buf = (MirrorBuffer *) iov[i].iov_base; @@ -98,8 +101,11 @@ static void mirror_iteration_done(MirrorOp *op, int ret) chunk_num = op->sector_num / sectors_per_chunk; nb_chunks = op->nb_sectors / sectors_per_chunk; bitmap_clear(s->in_flight_bitmap, chunk_num, nb_chunks); - if (s->cow_bitmap && ret >= 0) { - bitmap_set(s->cow_bitmap, chunk_num, nb_chunks); + if (ret >= 0) { + if (s->cow_bitmap) { + bitmap_set(s->cow_bitmap, chunk_num, nb_chunks); + } + s->common.offset += (uint64_t)op->nb_sectors * BDRV_SECTOR_SIZE; } qemu_iovec_destroy(&op->qiov); @@ -172,7 +178,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) hbitmap_next_sector = s->sector_num; sector_num = s->sector_num; sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS; - end = s->common.len >> BDRV_SECTOR_BITS; + end = s->bdev_length / BDRV_SECTOR_SIZE; /* Extend the QEMUIOVector to include all adjacent blocks that will * be copied in this operation. @@ -284,6 +290,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) /* Copy the dirty cluster. */ s->in_flight++; + s->sectors_in_flight += nb_sectors; trace_mirror_one_iteration(s, sector_num, nb_sectors); bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors, mirror_read_complete, op); @@ -314,9 +321,56 @@ static void mirror_drain(MirrorBlockJob *s) } } +typedef struct { + int ret; +} MirrorExitData; + +static void mirror_exit(BlockJob *job, void *opaque) +{ + MirrorBlockJob *s = container_of(job, MirrorBlockJob, common); + MirrorExitData *data = opaque; + AioContext *replace_aio_context = NULL; + + if (s->to_replace) { + replace_aio_context = bdrv_get_aio_context(s->to_replace); + aio_context_acquire(replace_aio_context); + } + + if (s->should_complete && data->ret == 0) { + BlockDriverState *to_replace = s->common.bs; + if (s->to_replace) { + to_replace = s->to_replace; + } + if (bdrv_get_flags(s->target) != bdrv_get_flags(to_replace)) { + bdrv_reopen(s->target, bdrv_get_flags(to_replace), NULL); + } + bdrv_swap(s->target, to_replace); + if (s->common.driver->job_type == BLOCK_JOB_TYPE_COMMIT) { + /* drop the bs loop chain formed by the swap: break the loop then + * trigger the unref from the top one */ + BlockDriverState *p = s->base->backing_hd; + bdrv_set_backing_hd(s->base, NULL); + bdrv_unref(p); + } + } + if (s->to_replace) { + bdrv_op_unblock_all(s->to_replace, s->replace_blocker); + error_free(s->replace_blocker); + bdrv_unref(s->to_replace); + } + if (replace_aio_context) { + aio_context_release(replace_aio_context); + } + g_free(s->replaces); + bdrv_unref(s->target); + block_job_completed(&s->common, data->ret); + g_free(data); +} + static void coroutine_fn mirror_run(void *opaque) { MirrorBlockJob *s = opaque; + MirrorExitData *data; BlockDriverState *bs = s->common.bs; int64_t sector_num, end, sectors_per_chunk, length; uint64_t last_pause_ns; @@ -329,11 +383,11 @@ static void coroutine_fn mirror_run(void *opaque) goto immediate_exit; } - s->common.len = bdrv_getlength(bs); - if (s->common.len < 0) { - ret = s->common.len; + s->bdev_length = bdrv_getlength(bs); + if (s->bdev_length < 0) { + ret = s->bdev_length; goto immediate_exit; - } else if (s->common.len == 0) { + } else if (s->bdev_length == 0) { /* Report BLOCK_JOB_READY and wait for complete. */ block_job_event_ready(&s->common); s->synced = true; @@ -344,7 +398,7 @@ static void coroutine_fn mirror_run(void *opaque) goto immediate_exit; } - length = DIV_ROUND_UP(s->common.len, s->granularity); + length = DIV_ROUND_UP(s->bdev_length, s->granularity); s->in_flight_bitmap = bitmap_new(length); /* If we have no backing file yet in the destination, we cannot let @@ -364,7 +418,7 @@ static void coroutine_fn mirror_run(void *opaque) } } - end = s->common.len >> BDRV_SECTOR_BITS; + end = s->bdev_length / BDRV_SECTOR_SIZE; s->buf = qemu_try_blockalign(bs, s->buf_size); if (s->buf == NULL) { ret = -ENOMEM; @@ -409,6 +463,12 @@ static void coroutine_fn mirror_run(void *opaque) } cnt = bdrv_get_dirty_count(bs, s->dirty_bitmap); + /* s->common.offset contains the number of bytes already processed so + * far, cnt is the number of dirty sectors remaining and + * s->sectors_in_flight is the number of sectors currently being + * processed; together those are the current total operation length */ + s->common.len = s->common.offset + + (cnt + s->sectors_in_flight) * BDRV_SECTOR_SIZE; /* Note that even when no rate limit is applied we need to yield * periodically with no pending I/O so that qemu_aio_flush() returns. @@ -445,7 +505,6 @@ static void coroutine_fn mirror_run(void *opaque) * report completion. This way, block-job-cancel will leave * the target in a consistent state. */ - s->common.offset = end * BDRV_SECTOR_SIZE; if (!s->synced) { block_job_event_ready(&s->common); s->synced = true; @@ -467,15 +526,13 @@ static void coroutine_fn mirror_run(void *opaque) * mirror_populate runs. */ trace_mirror_before_drain(s, cnt); - bdrv_drain_all(); + bdrv_drain(bs); cnt = bdrv_get_dirty_count(bs, s->dirty_bitmap); } ret = 0; trace_mirror_before_sleep(s, cnt, s->synced, delay_ns); if (!s->synced) { - /* Publish progress */ - s->common.offset = (end - cnt) * BDRV_SECTOR_SIZE; block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns); if (block_job_is_cancelled(&s->common)) { break; @@ -510,31 +567,10 @@ immediate_exit: g_free(s->in_flight_bitmap); bdrv_release_dirty_bitmap(bs, s->dirty_bitmap); bdrv_iostatus_disable(s->target); - if (s->should_complete && ret == 0) { - BlockDriverState *to_replace = s->common.bs; - if (s->to_replace) { - to_replace = s->to_replace; - } - if (bdrv_get_flags(s->target) != bdrv_get_flags(to_replace)) { - bdrv_reopen(s->target, bdrv_get_flags(to_replace), NULL); - } - bdrv_swap(s->target, to_replace); - if (s->common.driver->job_type == BLOCK_JOB_TYPE_COMMIT) { - /* drop the bs loop chain formed by the swap: break the loop then - * trigger the unref from the top one */ - BlockDriverState *p = s->base->backing_hd; - bdrv_set_backing_hd(s->base, NULL); - bdrv_unref(p); - } - } - if (s->to_replace) { - bdrv_op_unblock_all(s->to_replace, s->replace_blocker); - error_free(s->replace_blocker); - bdrv_unref(s->to_replace); - } - g_free(s->replaces); - bdrv_unref(s->target); - block_job_completed(&s->common, ret); + + data = g_malloc(sizeof(*data)); + data->ret = ret; + block_job_defer_to_main_loop(&s->common, mirror_exit, data); } static void mirror_set_speed(BlockJob *job, int64_t speed, Error **errp) @@ -574,16 +610,23 @@ static void mirror_complete(BlockJob *job, Error **errp) /* check the target bs is not blocked and block all operations on it */ if (s->replaces) { + AioContext *replace_aio_context; + s->to_replace = check_to_replace_node(s->replaces, &local_err); if (!s->to_replace) { error_propagate(errp, local_err); return; } + replace_aio_context = bdrv_get_aio_context(s->to_replace); + aio_context_acquire(replace_aio_context); + error_setg(&s->replace_blocker, "block device is in use by block-job-complete"); bdrv_op_block_all(s->to_replace, s->replace_blocker); bdrv_ref(s->to_replace); + + aio_context_release(replace_aio_context); } s->should_complete = true; diff --git a/block/parallels.c b/block/parallels.c index 2a814f3db4..4f9cd8dd23 100644 --- a/block/parallels.c +++ b/block/parallels.c @@ -155,7 +155,7 @@ static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num) offset = sector_num % s->tracks; /* not allocated */ - if ((index > s->catalog_size) || (s->catalog_bitmap[index] == 0)) + if ((index >= s->catalog_size) || (s->catalog_bitmap[index] == 0)) return -1; return ((uint64_t)s->catalog_bitmap[index] * s->off_multiplier + offset) * 512; diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 4d888c785f..df0b2c9cec 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -1414,7 +1414,7 @@ int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset) * clusters. */ static int discard_single_l2(BlockDriverState *bs, uint64_t offset, - unsigned int nb_clusters, enum qcow2_discard_type type) + unsigned int nb_clusters, enum qcow2_discard_type type, bool full_discard) { BDRVQcowState *s = bs->opaque; uint64_t *l2_table; @@ -1436,23 +1436,30 @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset, old_l2_entry = be64_to_cpu(l2_table[l2_index + i]); /* - * Make sure that a discarded area reads back as zeroes for v3 images - * (we cannot do it for v2 without actually writing a zero-filled - * buffer). We can skip the operation if the cluster is already marked - * as zero, or if it's unallocated and we don't have a backing file. + * If full_discard is false, make sure that a discarded area reads back + * as zeroes for v3 images (we cannot do it for v2 without actually + * writing a zero-filled buffer). We can skip the operation if the + * cluster is already marked as zero, or if it's unallocated and we + * don't have a backing file. * * TODO We might want to use bdrv_get_block_status(bs) here, but we're * holding s->lock, so that doesn't work today. + * + * If full_discard is true, the sector should not read back as zeroes, + * but rather fall through to the backing file. */ switch (qcow2_get_cluster_type(old_l2_entry)) { case QCOW2_CLUSTER_UNALLOCATED: - if (!bs->backing_hd) { + if (full_discard || !bs->backing_hd) { continue; } break; case QCOW2_CLUSTER_ZERO: - continue; + if (!full_discard) { + continue; + } + break; case QCOW2_CLUSTER_NORMAL: case QCOW2_CLUSTER_COMPRESSED: @@ -1464,7 +1471,7 @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset, /* First remove L2 entries */ qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); - if (s->qcow_version >= 3) { + if (!full_discard && s->qcow_version >= 3) { l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO); } else { l2_table[l2_index + i] = cpu_to_be64(0); @@ -1483,7 +1490,7 @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset, } int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset, - int nb_sectors, enum qcow2_discard_type type) + int nb_sectors, enum qcow2_discard_type type, bool full_discard) { BDRVQcowState *s = bs->opaque; uint64_t end_offset; @@ -1506,7 +1513,7 @@ int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset, /* Each L2 table is handled by its own loop iteration */ while (nb_clusters > 0) { - ret = discard_single_l2(bs, offset, nb_clusters, type); + ret = discard_single_l2(bs, offset, nb_clusters, type, full_discard); if (ret < 0) { goto fail; } @@ -1606,15 +1613,14 @@ fail: * Expands all zero clusters in a specific L1 table (or deallocates them, for * non-backed non-pre-allocated zero clusters). * - * expanded_clusters is a bitmap where every bit corresponds to one cluster in - * the image file; a bit gets set if the corresponding cluster has been used for - * zero expansion (i.e., has been filled with zeroes and is referenced from an - * L2 table). nb_clusters contains the total cluster count of the image file, - * i.e., the number of bits in expanded_clusters. + * l1_entries and *visited_l1_entries are used to keep track of progress for + * status_cb(). l1_entries contains the total number of L1 entries and + * *visited_l1_entries counts all visited L1 entries. */ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table, - int l1_size, uint8_t **expanded_clusters, - uint64_t *nb_clusters) + int l1_size, int64_t *visited_l1_entries, + int64_t l1_entries, + BlockDriverAmendStatusCB *status_cb) { BDRVQcowState *s = bs->opaque; bool is_active_l1 = (l1_table == s->l1_table); @@ -1634,9 +1640,14 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table, for (i = 0; i < l1_size; i++) { uint64_t l2_offset = l1_table[i] & L1E_OFFSET_MASK; bool l2_dirty = false; + int l2_refcount; if (!l2_offset) { /* unallocated */ + (*visited_l1_entries)++; + if (status_cb) { + status_cb(bs, *visited_l1_entries, l1_entries); + } continue; } @@ -1653,33 +1664,19 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table, goto fail; } + l2_refcount = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits); + if (l2_refcount < 0) { + ret = l2_refcount; + goto fail; + } + for (j = 0; j < s->l2_size; j++) { uint64_t l2_entry = be64_to_cpu(l2_table[j]); - int64_t offset = l2_entry & L2E_OFFSET_MASK, cluster_index; + int64_t offset = l2_entry & L2E_OFFSET_MASK; int cluster_type = qcow2_get_cluster_type(l2_entry); bool preallocated = offset != 0; - if (cluster_type == QCOW2_CLUSTER_NORMAL) { - cluster_index = offset >> s->cluster_bits; - assert((cluster_index >= 0) && (cluster_index < *nb_clusters)); - if ((*expanded_clusters)[cluster_index / 8] & - (1 << (cluster_index % 8))) { - /* Probably a shared L2 table; this cluster was a zero - * cluster which has been expanded, its refcount - * therefore most likely requires an update. */ - ret = qcow2_update_cluster_refcount(bs, cluster_index, 1, - QCOW2_DISCARD_NEVER); - if (ret < 0) { - goto fail; - } - /* Since we just increased the refcount, the COPIED flag may - * no longer be set. */ - l2_table[j] = cpu_to_be64(l2_entry & ~QCOW_OFLAG_COPIED); - l2_dirty = true; - } - continue; - } - else if (qcow2_get_cluster_type(l2_entry) != QCOW2_CLUSTER_ZERO) { + if (cluster_type != QCOW2_CLUSTER_ZERO) { continue; } @@ -1697,6 +1694,19 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table, ret = offset; goto fail; } + + if (l2_refcount > 1) { + /* For shared L2 tables, set the refcount accordingly (it is + * already 1 and needs to be l2_refcount) */ + ret = qcow2_update_cluster_refcount(bs, + offset >> s->cluster_bits, l2_refcount - 1, + QCOW2_DISCARD_OTHER); + if (ret < 0) { + qcow2_free_clusters(bs, offset, s->cluster_size, + QCOW2_DISCARD_OTHER); + goto fail; + } + } } ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size); @@ -1718,29 +1728,12 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table, goto fail; } - l2_table[j] = cpu_to_be64(offset | QCOW_OFLAG_COPIED); - l2_dirty = true; - - cluster_index = offset >> s->cluster_bits; - - if (cluster_index >= *nb_clusters) { - uint64_t old_bitmap_size = (*nb_clusters + 7) / 8; - uint64_t new_bitmap_size; - /* The offset may lie beyond the old end of the underlying image - * file for growable files only */ - assert(bs->file->growable); - *nb_clusters = size_to_clusters(s, bs->file->total_sectors * - BDRV_SECTOR_SIZE); - new_bitmap_size = (*nb_clusters + 7) / 8; - *expanded_clusters = g_realloc(*expanded_clusters, - new_bitmap_size); - /* clear the newly allocated space */ - memset(&(*expanded_clusters)[old_bitmap_size], 0, - new_bitmap_size - old_bitmap_size); + if (l2_refcount == 1) { + l2_table[j] = cpu_to_be64(offset | QCOW_OFLAG_COPIED); + } else { + l2_table[j] = cpu_to_be64(offset); } - - assert((cluster_index >= 0) && (cluster_index < *nb_clusters)); - (*expanded_clusters)[cluster_index / 8] |= 1 << (cluster_index % 8); + l2_dirty = true; } if (is_active_l1) { @@ -1769,6 +1762,11 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table, } } } + + (*visited_l1_entries)++; + if (status_cb) { + status_cb(bs, *visited_l1_entries, l1_entries); + } } ret = 0; @@ -1795,25 +1793,25 @@ fail: * allocation for pre-allocated ones). This is important for downgrading to a * qcow2 version which doesn't yet support metadata zero clusters. */ -int qcow2_expand_zero_clusters(BlockDriverState *bs) +int qcow2_expand_zero_clusters(BlockDriverState *bs, + BlockDriverAmendStatusCB *status_cb) { BDRVQcowState *s = bs->opaque; uint64_t *l1_table = NULL; - uint64_t nb_clusters; - uint8_t *expanded_clusters; + int64_t l1_entries = 0, visited_l1_entries = 0; int ret; int i, j; - nb_clusters = size_to_clusters(s, bs->file->total_sectors * - BDRV_SECTOR_SIZE); - expanded_clusters = g_try_malloc0((nb_clusters + 7) / 8); - if (expanded_clusters == NULL) { - ret = -ENOMEM; - goto fail; + if (status_cb) { + l1_entries = s->l1_size; + for (i = 0; i < s->nb_snapshots; i++) { + l1_entries += s->snapshots[i].l1_size; + } } ret = expand_zero_clusters_in_l1(bs, s->l1_table, s->l1_size, - &expanded_clusters, &nb_clusters); + &visited_l1_entries, l1_entries, + status_cb); if (ret < 0) { goto fail; } @@ -1847,7 +1845,8 @@ int qcow2_expand_zero_clusters(BlockDriverState *bs) } ret = expand_zero_clusters_in_l1(bs, l1_table, s->snapshots[i].l1_size, - &expanded_clusters, &nb_clusters); + &visited_l1_entries, l1_entries, + status_cb); if (ret < 0) { goto fail; } @@ -1856,7 +1855,6 @@ int qcow2_expand_zero_clusters(BlockDriverState *bs) ret = 0; fail: - g_free(expanded_clusters); g_free(l1_table); return ret; } diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 1477031206..9afdb40b40 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -91,7 +91,7 @@ static int load_refcount_block(BlockDriverState *bs, * return value is the refcount of the cluster, negative values are -errno * and indicate an error. */ -static int get_refcount(BlockDriverState *bs, int64_t cluster_index) +int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index) { BDRVQcowState *s = bs->opaque; uint64_t refcount_table_index, block_index; @@ -629,8 +629,7 @@ fail: } /* - * Increases or decreases the refcount of a given cluster by one. - * addend must be 1 or -1. + * Increases or decreases the refcount of a given cluster. * * If the return value is non-negative, it is the new refcount of the cluster. * If it is negative, it is -errno and indicates an error. @@ -649,7 +648,7 @@ int qcow2_update_cluster_refcount(BlockDriverState *bs, return ret; } - return get_refcount(bs, cluster_index); + return qcow2_get_refcount(bs, cluster_index); } @@ -670,7 +669,7 @@ static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size) retry: for(i = 0; i < nb_clusters; i++) { uint64_t next_cluster_index = s->free_cluster_index++; - refcount = get_refcount(bs, next_cluster_index); + refcount = qcow2_get_refcount(bs, next_cluster_index); if (refcount < 0) { return refcount; @@ -734,7 +733,7 @@ int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, /* Check how many clusters there are free */ cluster_index = offset >> s->cluster_bits; for(i = 0; i < nb_clusters; i++) { - refcount = get_refcount(bs, cluster_index++); + refcount = qcow2_get_refcount(bs, cluster_index++); if (refcount < 0) { return refcount; @@ -981,7 +980,7 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, cluster_index, addend, QCOW2_DISCARD_SNAPSHOT); } else { - refcount = get_refcount(bs, cluster_index); + refcount = qcow2_get_refcount(bs, cluster_index); } if (refcount < 0) { @@ -1021,7 +1020,7 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, refcount = qcow2_update_cluster_refcount(bs, l2_offset >> s->cluster_bits, addend, QCOW2_DISCARD_SNAPSHOT); } else { - refcount = get_refcount(bs, l2_offset >> s->cluster_bits); + refcount = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits); } if (refcount < 0) { ret = refcount; @@ -1332,8 +1331,8 @@ fail: * Checks the OFLAG_COPIED flag for all L1 and L2 entries. * * This function does not print an error message nor does it increment - * check_errors if get_refcount fails (this is because such an error will have - * been already detected and sufficiently signaled by the calling function + * check_errors if qcow2_get_refcount fails (this is because such an error will + * have been already detected and sufficiently signaled by the calling function * (qcow2_check_refcounts) by the time this function is called). */ static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res, @@ -1354,7 +1353,7 @@ static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res, continue; } - refcount = get_refcount(bs, l2_offset >> s->cluster_bits); + refcount = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits); if (refcount < 0) { /* don't print message nor increment check_errors */ continue; @@ -1396,7 +1395,8 @@ static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res, if ((cluster_type == QCOW2_CLUSTER_NORMAL) || ((cluster_type == QCOW2_CLUSTER_ZERO) && (data_offset != 0))) { - refcount = get_refcount(bs, data_offset >> s->cluster_bits); + refcount = qcow2_get_refcount(bs, + data_offset >> s->cluster_bits); if (refcount < 0) { /* don't print message nor increment check_errors */ continue; @@ -1632,7 +1632,7 @@ static void compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res, int refcount1, refcount2, ret; for (i = 0, *highest_cluster = 0; i < nb_clusters; i++) { - refcount1 = get_refcount(bs, i); + refcount1 = qcow2_get_refcount(bs, i); if (refcount1 < 0) { fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n", i, strerror(-refcount1)); diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c index f52d7fdd22..5b3903cf22 100644 --- a/block/qcow2-snapshot.c +++ b/block/qcow2-snapshot.c @@ -441,7 +441,7 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) qcow2_discard_clusters(bs, qcow2_vm_state_offset(s), align_offset(sn->vm_state_size, s->cluster_size) >> BDRV_SECTOR_BITS, - QCOW2_DISCARD_NEVER); + QCOW2_DISCARD_NEVER, false); #ifdef DEBUG_ALLOC { diff --git a/block/qcow2.c b/block/qcow2.c index d031515838..d12049451a 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -2089,7 +2089,7 @@ static coroutine_fn int qcow2_co_discard(BlockDriverState *bs, qemu_co_mutex_lock(&s->lock); ret = qcow2_discard_clusters(bs, sector_num << BDRV_SECTOR_BITS, - nb_sectors, QCOW2_DISCARD_REQUEST); + nb_sectors, QCOW2_DISCARD_REQUEST, false); qemu_co_mutex_unlock(&s->lock); return ret; } @@ -2230,6 +2230,195 @@ fail: return ret; } +static int make_completely_empty(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + int ret, l1_clusters; + int64_t offset; + uint64_t *new_reftable = NULL; + uint64_t rt_entry, l1_size2; + struct { + uint64_t l1_offset; + uint64_t reftable_offset; + uint32_t reftable_clusters; + } QEMU_PACKED l1_ofs_rt_ofs_cls; + + ret = qcow2_cache_empty(bs, s->l2_table_cache); + if (ret < 0) { + goto fail; + } + + ret = qcow2_cache_empty(bs, s->refcount_block_cache); + if (ret < 0) { + goto fail; + } + + /* Refcounts will be broken utterly */ + ret = qcow2_mark_dirty(bs); + if (ret < 0) { + goto fail; + } + + BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE); + + l1_clusters = DIV_ROUND_UP(s->l1_size, s->cluster_size / sizeof(uint64_t)); + l1_size2 = (uint64_t)s->l1_size * sizeof(uint64_t); + + /* After this call, neither the in-memory nor the on-disk refcount + * information accurately describe the actual references */ + + ret = bdrv_write_zeroes(bs->file, s->l1_table_offset / BDRV_SECTOR_SIZE, + l1_clusters * s->cluster_sectors, 0); + if (ret < 0) { + goto fail_broken_refcounts; + } + memset(s->l1_table, 0, l1_size2); + + BLKDBG_EVENT(bs->file, BLKDBG_EMPTY_IMAGE_PREPARE); + + /* Overwrite enough clusters at the beginning of the sectors to place + * the refcount table, a refcount block and the L1 table in; this may + * overwrite parts of the existing refcount and L1 table, which is not + * an issue because the dirty flag is set, complete data loss is in fact + * desired and partial data loss is consequently fine as well */ + ret = bdrv_write_zeroes(bs->file, s->cluster_size / BDRV_SECTOR_SIZE, + (2 + l1_clusters) * s->cluster_size / + BDRV_SECTOR_SIZE, 0); + /* This call (even if it failed overall) may have overwritten on-disk + * refcount structures; in that case, the in-memory refcount information + * will probably differ from the on-disk information which makes the BDS + * unusable */ + if (ret < 0) { + goto fail_broken_refcounts; + } + + BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE); + BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_UPDATE); + + /* "Create" an empty reftable (one cluster) directly after the image + * header and an empty L1 table three clusters after the image header; + * the cluster between those two will be used as the first refblock */ + cpu_to_be64w(&l1_ofs_rt_ofs_cls.l1_offset, 3 * s->cluster_size); + cpu_to_be64w(&l1_ofs_rt_ofs_cls.reftable_offset, s->cluster_size); + cpu_to_be32w(&l1_ofs_rt_ofs_cls.reftable_clusters, 1); + ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_table_offset), + &l1_ofs_rt_ofs_cls, sizeof(l1_ofs_rt_ofs_cls)); + if (ret < 0) { + goto fail_broken_refcounts; + } + + s->l1_table_offset = 3 * s->cluster_size; + + new_reftable = g_try_new0(uint64_t, s->cluster_size / sizeof(uint64_t)); + if (!new_reftable) { + ret = -ENOMEM; + goto fail_broken_refcounts; + } + + s->refcount_table_offset = s->cluster_size; + s->refcount_table_size = s->cluster_size / sizeof(uint64_t); + + g_free(s->refcount_table); + s->refcount_table = new_reftable; + new_reftable = NULL; + + /* Now the in-memory refcount information again corresponds to the on-disk + * information (reftable is empty and no refblocks (the refblock cache is + * empty)); however, this means some clusters (e.g. the image header) are + * referenced, but not refcounted, but the normal qcow2 code assumes that + * the in-memory information is always correct */ + + BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC); + + /* Enter the first refblock into the reftable */ + rt_entry = cpu_to_be64(2 * s->cluster_size); + ret = bdrv_pwrite_sync(bs->file, s->cluster_size, + &rt_entry, sizeof(rt_entry)); + if (ret < 0) { + goto fail_broken_refcounts; + } + s->refcount_table[0] = 2 * s->cluster_size; + + s->free_cluster_index = 0; + assert(3 + l1_clusters <= s->refcount_block_size); + offset = qcow2_alloc_clusters(bs, 3 * s->cluster_size + l1_size2); + if (offset < 0) { + ret = offset; + goto fail_broken_refcounts; + } else if (offset > 0) { + error_report("First cluster in emptied image is in use"); + abort(); + } + + /* Now finally the in-memory information corresponds to the on-disk + * structures and is correct */ + ret = qcow2_mark_clean(bs); + if (ret < 0) { + goto fail; + } + + ret = bdrv_truncate(bs->file, (3 + l1_clusters) * s->cluster_size); + if (ret < 0) { + goto fail; + } + + return 0; + +fail_broken_refcounts: + /* The BDS is unusable at this point. If we wanted to make it usable, we + * would have to call qcow2_refcount_close(), qcow2_refcount_init(), + * qcow2_check_refcounts(), qcow2_refcount_close() and qcow2_refcount_init() + * again. However, because the functions which could have caused this error + * path to be taken are used by those functions as well, it's very likely + * that that sequence will fail as well. Therefore, just eject the BDS. */ + bs->drv = NULL; + +fail: + g_free(new_reftable); + return ret; +} + +static int qcow2_make_empty(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + uint64_t start_sector; + int sector_step = INT_MAX / BDRV_SECTOR_SIZE; + int l1_clusters, ret = 0; + + l1_clusters = DIV_ROUND_UP(s->l1_size, s->cluster_size / sizeof(uint64_t)); + + if (s->qcow_version >= 3 && !s->snapshots && + 3 + l1_clusters <= s->refcount_block_size) { + /* The following function only works for qcow2 v3 images (it requires + * the dirty flag) and only as long as there are no snapshots (because + * it completely empties the image). Furthermore, the L1 table and three + * additional clusters (image header, refcount table, one refcount + * block) have to fit inside one refcount block. */ + return make_completely_empty(bs); + } + + /* This fallback code simply discards every active cluster; this is slow, + * but works in all cases */ + for (start_sector = 0; start_sector < bs->total_sectors; + start_sector += sector_step) + { + /* As this function is generally used after committing an external + * snapshot, QCOW2_DISCARD_SNAPSHOT seems appropriate. Also, the + * default action for this kind of discard is to pass the discard, + * which will ideally result in an actually smaller image file, as + * is probably desired. */ + ret = qcow2_discard_clusters(bs, start_sector * BDRV_SECTOR_SIZE, + MIN(sector_step, + bs->total_sectors - start_sector), + QCOW2_DISCARD_SNAPSHOT, true); + if (ret < 0) { + break; + } + } + + return ret; +} + static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs) { BDRVQcowState *s = bs->opaque; @@ -2361,7 +2550,8 @@ static int qcow2_load_vmstate(BlockDriverState *bs, uint8_t *buf, * Downgrades an image's version. To achieve this, any incompatible features * have to be removed. */ -static int qcow2_downgrade(BlockDriverState *bs, int target_version) +static int qcow2_downgrade(BlockDriverState *bs, int target_version, + BlockDriverAmendStatusCB *status_cb) { BDRVQcowState *s = bs->opaque; int current_version = s->qcow_version; @@ -2410,7 +2600,7 @@ static int qcow2_downgrade(BlockDriverState *bs, int target_version) /* clearing autoclear features is trivial */ s->autoclear_features = 0; - ret = qcow2_expand_zero_clusters(bs); + ret = qcow2_expand_zero_clusters(bs, status_cb); if (ret < 0) { return ret; } @@ -2424,7 +2614,8 @@ static int qcow2_downgrade(BlockDriverState *bs, int target_version) return 0; } -static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts) +static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts, + BlockDriverAmendStatusCB *status_cb) { BDRVQcowState *s = bs->opaque; int old_version = s->qcow_version, new_version = old_version; @@ -2502,7 +2693,7 @@ static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts) return ret; } } else { - ret = qcow2_downgrade(bs, new_version); + ret = qcow2_downgrade(bs, new_version, status_cb); if (ret < 0) { return ret; } @@ -2676,6 +2867,7 @@ static BlockDriver bdrv_qcow2 = { .bdrv_co_discard = qcow2_co_discard, .bdrv_truncate = qcow2_truncate, .bdrv_write_compressed = qcow2_write_compressed, + .bdrv_make_empty = qcow2_make_empty, .bdrv_snapshot_create = qcow2_snapshot_create, .bdrv_snapshot_goto = qcow2_snapshot_goto, diff --git a/block/qcow2.h b/block/qcow2.h index 577ccd1bf0..6e39a1b639 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -487,6 +487,8 @@ void qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset, int qcow2_refcount_init(BlockDriverState *bs); void qcow2_refcount_close(BlockDriverState *bs); +int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index); + int qcow2_update_cluster_refcount(BlockDriverState *bs, int64_t cluster_index, int addend, enum qcow2_discard_type type); @@ -534,10 +536,11 @@ uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs, int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m); int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset, - int nb_sectors, enum qcow2_discard_type type); + int nb_sectors, enum qcow2_discard_type type, bool full_discard); int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors); -int qcow2_expand_zero_clusters(BlockDriverState *bs); +int qcow2_expand_zero_clusters(BlockDriverState *bs, + BlockDriverAmendStatusCB *status_cb); /* qcow2-snapshot.c functions */ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info); diff --git a/block/raw-posix.c b/block/raw-posix.c index 475cf74655..e100ae2046 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -1481,12 +1481,12 @@ out: return result; } -static int64_t try_fiemap(BlockDriverState *bs, off_t start, off_t *data, - off_t *hole, int nb_sectors, int *pnum) +static int try_fiemap(BlockDriverState *bs, off_t start, off_t *data, + off_t *hole, int nb_sectors) { #ifdef CONFIG_FIEMAP BDRVRawState *s = bs->opaque; - int64_t ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start; + int ret = 0; struct { struct fiemap fm; struct fiemap_extent fe; @@ -1527,18 +1527,14 @@ static int64_t try_fiemap(BlockDriverState *bs, off_t start, off_t *data, #endif } -static int64_t try_seek_hole(BlockDriverState *bs, off_t start, off_t *data, - off_t *hole, int *pnum) +static int try_seek_hole(BlockDriverState *bs, off_t start, off_t *data, + off_t *hole) { #if defined SEEK_HOLE && defined SEEK_DATA BDRVRawState *s = bs->opaque; *hole = lseek(s->fd, start, SEEK_HOLE); if (*hole == -1) { - /* -ENXIO indicates that sector_num was past the end of the file. - * There is a virtual hole there. */ - assert(errno != -ENXIO); - return -errno; } @@ -1552,7 +1548,7 @@ static int64_t try_seek_hole(BlockDriverState *bs, off_t start, off_t *data, } } - return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start; + return 0; #else return -ENOTSUP; #endif @@ -1578,7 +1574,8 @@ static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs, int nb_sectors, int *pnum) { off_t start, data = 0, hole = 0; - int64_t ret; + int64_t total_size; + int ret; ret = fd_open(bs); if (ret < 0) { @@ -1586,29 +1583,38 @@ static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs, } start = sector_num * BDRV_SECTOR_SIZE; + total_size = bdrv_getlength(bs); + if (total_size < 0) { + return total_size; + } else if (start >= total_size) { + *pnum = 0; + return 0; + } else if (start + nb_sectors * BDRV_SECTOR_SIZE > total_size) { + nb_sectors = DIV_ROUND_UP(total_size - start, BDRV_SECTOR_SIZE); + } - ret = try_seek_hole(bs, start, &data, &hole, pnum); + ret = try_seek_hole(bs, start, &data, &hole); if (ret < 0) { - ret = try_fiemap(bs, start, &data, &hole, nb_sectors, pnum); + ret = try_fiemap(bs, start, &data, &hole, nb_sectors); if (ret < 0) { /* Assume everything is allocated. */ data = 0; hole = start + nb_sectors * BDRV_SECTOR_SIZE; - ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start; + ret = 0; } } + assert(ret >= 0); + if (data <= start) { /* On a data extent, compute sectors to the end of the extent. */ *pnum = MIN(nb_sectors, (hole - start) / BDRV_SECTOR_SIZE); + return ret | BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start; } else { /* On a hole, compute sectors to the beginning of the next extent. */ *pnum = MIN(nb_sectors, (data - start) / BDRV_SECTOR_SIZE); - ret &= ~BDRV_BLOCK_DATA; - ret |= BDRV_BLOCK_ZERO; + return ret | BDRV_BLOCK_ZERO | BDRV_BLOCK_OFFSET_VALID | start; } - - return ret; } static coroutine_fn BlockAIOCB *raw_aio_discard(BlockDriverState *bs, diff --git a/block/rbd.c b/block/rbd.c index 47cab8be94..5b5a64a27a 100644 --- a/block/rbd.c +++ b/block/rbd.c @@ -887,6 +887,18 @@ static BlockAIOCB* qemu_rbd_aio_discard(BlockDriverState *bs, } #endif +#ifdef LIBRBD_SUPPORTS_INVALIDATE +static void qemu_rbd_invalidate_cache(BlockDriverState *bs, + Error **errp) +{ + BDRVRBDState *s = bs->opaque; + int r = rbd_invalidate_cache(s->image); + if (r < 0) { + error_setg_errno(errp, -r, "Failed to invalidate the cache"); + } +} +#endif + static QemuOptsList qemu_rbd_create_opts = { .name = "rbd-create-opts", .head = QTAILQ_HEAD_INITIALIZER(qemu_rbd_create_opts.head), @@ -936,6 +948,9 @@ static BlockDriver bdrv_rbd = { .bdrv_snapshot_delete = qemu_rbd_snap_remove, .bdrv_snapshot_list = qemu_rbd_snap_list, .bdrv_snapshot_goto = qemu_rbd_snap_rollback, +#ifdef LIBRBD_SUPPORTS_INVALIDATE + .bdrv_invalidate_cache = qemu_rbd_invalidate_cache, +#endif }; static void bdrv_rbd_init(void) diff --git a/block/snapshot.c b/block/snapshot.c index 85c52ff455..698e1a1d58 100644 --- a/block/snapshot.c +++ b/block/snapshot.c @@ -236,6 +236,10 @@ int bdrv_snapshot_delete(BlockDriverState *bs, error_setg(errp, "snapshot_id and name are both NULL"); return -EINVAL; } + + /* drain all pending i/o before deleting snapshot */ + bdrv_drain_all(); + if (drv->bdrv_snapshot_delete) { return drv->bdrv_snapshot_delete(bs, snapshot_id, name, errp); } diff --git a/block/stream.c b/block/stream.c index a1dc8da484..a628901f69 100644 --- a/block/stream.c +++ b/block/stream.c @@ -79,9 +79,39 @@ static void close_unused_images(BlockDriverState *top, BlockDriverState *base, bdrv_refresh_limits(top, NULL); } +typedef struct { + int ret; + bool reached_end; +} StreamCompleteData; + +static void stream_complete(BlockJob *job, void *opaque) +{ + StreamBlockJob *s = container_of(job, StreamBlockJob, common); + StreamCompleteData *data = opaque; + BlockDriverState *base = s->base; + + if (!block_job_is_cancelled(&s->common) && data->reached_end && + data->ret == 0) { + const char *base_id = NULL, *base_fmt = NULL; + if (base) { + base_id = s->backing_file_str; + if (base->drv) { + base_fmt = base->drv->format_name; + } + } + data->ret = bdrv_change_backing_file(job->bs, base_id, base_fmt); + close_unused_images(job->bs, base, base_id); + } + + g_free(s->backing_file_str); + block_job_completed(&s->common, data->ret); + g_free(data); +} + static void coroutine_fn stream_run(void *opaque) { StreamBlockJob *s = opaque; + StreamCompleteData *data; BlockDriverState *bs = s->common.bs; BlockDriverState *base = s->base; int64_t sector_num, end; @@ -183,21 +213,13 @@ wait: /* Do not remove the backing file if an error was there but ignored. */ ret = error; - if (!block_job_is_cancelled(&s->common) && sector_num == end && ret == 0) { - const char *base_id = NULL, *base_fmt = NULL; - if (base) { - base_id = s->backing_file_str; - if (base->drv) { - base_fmt = base->drv->format_name; - } - } - ret = bdrv_change_backing_file(bs, base_id, base_fmt); - close_unused_images(bs, base, base_id); - } - qemu_vfree(buf); - g_free(s->backing_file_str); - block_job_completed(&s->common, ret); + + /* Modify backing chain and close BDSes in main loop */ + data = g_malloc(sizeof(*data)); + data->ret = ret; + data->reached_end = sector_num == end; + block_job_defer_to_main_loop(&s->common, stream_complete, data); } static void stream_set_speed(BlockJob *job, int64_t speed, Error **errp) |