Diffstat (limited to 'block')
-rw-r--r--  block/file-posix.c      |   7
-rw-r--r--  block/gluster.c         |  23
-rw-r--r--  block/io.c              |  68
-rw-r--r--  block/iscsi.c           |   3
-rw-r--r--  block/mirror.c          |  25
-rw-r--r--  block/qcow2-cluster.c   |  78
-rw-r--r--  block/qcow2-refcount.c  | 326
-rw-r--r--  block/qcow2.c           |  13
-rw-r--r--  block/qcow2.h           |   7
9 files changed, 381 insertions(+), 169 deletions(-)
diff --git a/block/file-posix.c b/block/file-posix.c
index 1854bfa397..d81e15efa4 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -2744,7 +2744,8 @@ static int find_allocation(BlockDriverState *bs, off_t start,
* the specified offset) that are known to be in the same
* allocated/unallocated state.
*
- * 'bytes' is the max value 'pnum' should be set to.
+ * 'bytes' is a soft cap for 'pnum'. If the information is free, 'pnum' may
+ * well exceed it.
*/
static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
bool want_zero,
@@ -2782,7 +2783,7 @@ static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
} else if (data == offset) {
/* On a data extent, compute bytes to the end of the extent,
* possibly including a partial sector at EOF. */
- *pnum = MIN(bytes, hole - offset);
+ *pnum = hole - offset;
/*
* We are not allowed to return partial sectors, though, so
@@ -2801,7 +2802,7 @@ static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
} else {
/* On a hole, compute bytes to the beginning of the next extent. */
assert(hole == offset);
- *pnum = MIN(bytes, data - offset);
+ *pnum = data - offset;
ret = BDRV_BLOCK_ZERO;
}
*map = offset;
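
The file-posix change works because find_allocation() probes the file with lseek(SEEK_DATA)/lseek(SEEK_HOLE), which reports the full extent around the given offset anyway; clamping *pnum to 'bytes' discarded information the kernel had already provided for free. A minimal standalone sketch of that probing (not QEMU code; the file name is a placeholder, and SEEK_DATA/SEEK_HOLE support in the host kernel and filesystem is assumed):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    int fd = open("test.img", O_RDONLY);
    if (fd < 0) {
        perror("open");
        return 1;
    }

    /* Find the next data region at or after offset 0 ... */
    off_t data = lseek(fd, 0, SEEK_DATA);
    if (data >= 0) {
        /* ... and the hole that terminates it; [data, hole) is one
         * allocated extent, regardless of how much the caller asked for. */
        off_t hole = lseek(fd, data, SEEK_HOLE);
        if (hole >= 0) {
            printf("data extent: [%lld, %lld)\n",
                   (long long)data, (long long)hole);
        }
    }

    close(fd);
    return 0;
}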
diff --git a/block/gluster.c b/block/gluster.c
index e8ee14c8e9..d51938e447 100644
--- a/block/gluster.c
+++ b/block/gluster.c
@@ -1461,7 +1461,8 @@ exit:
* the specified offset) that are known to be in the same
* allocated/unallocated state.
*
- * 'bytes' is the max value 'pnum' should be set to.
+ * 'bytes' is a soft cap for 'pnum'. If the information is free, 'pnum' may
+ * well exceed it.
*
* (Based on raw_co_block_status() from file-posix.c.)
*/
@@ -1477,6 +1478,8 @@ static int coroutine_fn qemu_gluster_co_block_status(BlockDriverState *bs,
off_t data = 0, hole = 0;
int ret = -EINVAL;
+ assert(QEMU_IS_ALIGNED(offset | bytes, bs->bl.request_alignment));
+
if (!s->fd) {
return ret;
}
@@ -1500,12 +1503,26 @@ static int coroutine_fn qemu_gluster_co_block_status(BlockDriverState *bs,
} else if (data == offset) {
/* On a data extent, compute bytes to the end of the extent,
* possibly including a partial sector at EOF. */
- *pnum = MIN(bytes, hole - offset);
+ *pnum = hole - offset;
+
+ /*
+ * We are not allowed to return partial sectors, though, so
+ * round up if necessary.
+ */
+ if (!QEMU_IS_ALIGNED(*pnum, bs->bl.request_alignment)) {
+ int64_t file_length = qemu_gluster_getlength(bs);
+ if (file_length > 0) {
+ /* Ignore errors, this is just a safeguard */
+ assert(hole == file_length);
+ }
+ *pnum = ROUND_UP(*pnum, bs->bl.request_alignment);
+ }
+
ret = BDRV_BLOCK_DATA;
} else {
/* On a hole, compute bytes to the beginning of the next extent. */
assert(hole == offset);
- *pnum = MIN(bytes, data - offset);
+ *pnum = data - offset;
ret = BDRV_BLOCK_ZERO;
}
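
The rounding added here preserves the contract that block status is reported in request_alignment units: the only way *pnum can end up unaligned is a partial sector at EOF, which the qemu_gluster_getlength() safeguard asserts. For reference, QEMU_IS_ALIGNED() and ROUND_UP() behave like the following (equivalent semantics written out as a sketch, assuming a power-of-two alignment, which bs->bl.request_alignment is):

#include <stdbool.h>
#include <stdint.h>

static inline bool is_aligned(uint64_t n, uint64_t align)
{
    return (n & (align - 1)) == 0;
}

static inline uint64_t round_up(uint64_t n, uint64_t align)
{
    return (n + align - 1) & ~(align - 1);
}

/* E.g. a 1000-byte data extent at EOF with 512-byte alignment:
 * is_aligned(1000, 512) is false, round_up(1000, 512) yields 1024. */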
diff --git a/block/io.c b/block/io.c
index a19942718b..99ee182ca4 100644
--- a/block/io.c
+++ b/block/io.c
@@ -1883,6 +1883,9 @@ static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
return -ENOTSUP;
}
+ /* Invalidate the cached block-status data range if this write overlaps */
+ bdrv_bsc_invalidate_range(bs, offset, bytes);
+
assert(alignment % bs->bl.request_alignment == 0);
head = offset % alignment;
tail = (offset + bytes) % alignment;
@@ -2447,9 +2450,65 @@ static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs,
aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset;
if (bs->drv->bdrv_co_block_status) {
- ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset,
- aligned_bytes, pnum, &local_map,
- &local_file);
+ /*
+ * Use the block-status cache only for protocol nodes: Format
+ * drivers are generally quick to inquire the status, but protocol
+ * drivers often need to get information from outside of qemu, so
+ * we do not have control over the actual implementation. There
+ * have been cases where inquiring the status took an unreasonably
+ * long time, and we can do nothing in qemu to fix it.
+ *
+ * This is especially problematic for images with large data areas,
+ * because finding the few holes in them and giving them special
+ * treatment does not gain much performance. Therefore, we try to
+ * cache the last-identified data region.
+ *
+ * Second, limiting ourselves to protocol nodes allows us to assume
+ * the block status for data regions to be DATA | OFFSET_VALID, and
+ * that the host offset is the same as the guest offset.
+ *
+ * Note that it is possible that external writers zero parts of
+ * the cached regions without the cache being invalidated, and so
+ * we may report zeroes as data. This is not catastrophic,
+ * however, because reporting zeroes as data is fine.
+ */
+ if (QLIST_EMPTY(&bs->children) &&
+ bdrv_bsc_is_data(bs, aligned_offset, pnum))
+ {
+ ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
+ local_file = bs;
+ local_map = aligned_offset;
+ } else {
+ ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset,
+ aligned_bytes, pnum, &local_map,
+ &local_file);
+
+ /*
+ * Note that checking QLIST_EMPTY(&bs->children) is also done when
+ * the cache is queried above. Technically, we do not need to check
+ * it here; the worst that can happen is that we fill the cache for
+ * non-protocol nodes, and then it is never used. However, filling
+ * the cache requires an RCU update, so double check here to avoid
+ * such an update if possible.
+ */
+ if (ret == (BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID) &&
+ QLIST_EMPTY(&bs->children))
+ {
+ /*
+ * When a protocol driver reports BLOCK_OFFSET_VALID, the
+ * returned local_map value must be the same as the offset we
+ * have passed (aligned_offset), and local_file must be the node
+ * itself.
+ * Assert this, because we follow this rule when reading from
+ * the cache (see the `local_file = bs` and
+ * `local_map = aligned_offset` assignments above), and the
+ * result the cache delivers must be the same as the driver
+ * would deliver.
+ */
+ assert(local_file == bs);
+ assert(local_map == aligned_offset);
+ bdrv_bsc_fill(bs, aligned_offset, *pnum);
+ }
+ }
} else {
/* Default code for filters */
@@ -3002,6 +3061,9 @@ int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset,
return 0;
}
+ /* Invalidate the cached block-status data range if this discard overlaps */
+ bdrv_bsc_invalidate_range(bs, offset, bytes);
+
/* Discard is advisory, but some devices track and coalesce
* unaligned requests, so we must pass everything down rather than
* round here. Still, most devices will just silently ignore
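
bdrv_bsc_is_data(), bdrv_bsc_fill() and bdrv_bsc_invalidate_range() are the cache helpers this patch builds on, provided elsewhere in this series. A simplified single-range sketch of the idea follows; the type, field, and function names here are illustrative only, and the real implementation protects the structure with RCU so that readers stay lock-free:

#include <stdbool.h>
#include <stdint.h>

/* One cached data range per node; [data_start, data_end) is known
 * to contain data. */
typedef struct BlockStatusCache {
    bool valid;
    int64_t data_start;
    int64_t data_end;      /* exclusive */
} BlockStatusCache;

/* Query: does 'offset' lie in the cached data range?  On a hit,
 * *pnum receives the remaining length of the range. */
static bool bsc_is_data(BlockStatusCache *c, int64_t offset, int64_t *pnum)
{
    if (c->valid && offset >= c->data_start && offset < c->data_end) {
        *pnum = c->data_end - offset;
        return true;
    }
    return false;
}

/* Fill: remember the most recently reported data range. */
static void bsc_fill(BlockStatusCache *c, int64_t offset, int64_t bytes)
{
    c->data_start = offset;
    c->data_end = offset + bytes;
    c->valid = true;
}

/* Invalidate: drop the cache if a write or discard overlaps it. */
static void bsc_invalidate_range(BlockStatusCache *c,
                                 int64_t offset, int64_t bytes)
{
    if (c->valid && offset < c->data_end && offset + bytes > c->data_start) {
        c->valid = false;
    }
}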
diff --git a/block/iscsi.c b/block/iscsi.c
index 4d2a416ce7..852384086b 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -781,9 +781,6 @@ retry:
iscsi_allocmap_set_allocated(iscsilun, offset, *pnum);
}
- if (*pnum > bytes) {
- *pnum = bytes;
- }
out_unlock:
qemu_mutex_unlock(&iscsilun->mutex);
g_free(iTask.err_str);
diff --git a/block/mirror.c b/block/mirror.c
index 98fc66eabf..85b781bc21 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -160,18 +160,25 @@ static void coroutine_fn mirror_wait_on_conflicts(MirrorOp *self,
if (ranges_overlap(self_start_chunk, self_nb_chunks,
op_start_chunk, op_nb_chunks))
{
- /*
- * If the operation is already (indirectly) waiting for us, or
- * will wait for us as soon as it wakes up, then just go on
- * (instead of producing a deadlock in the former case).
- */
- if (op->waiting_for_op) {
- continue;
+ if (self) {
+ /*
+ * If the operation is already (indirectly) waiting for us,
+ * or will wait for us as soon as it wakes up, then just go
+ * on (instead of producing a deadlock in the former case).
+ */
+ if (op->waiting_for_op) {
+ continue;
+ }
+
+ self->waiting_for_op = op;
}
- self->waiting_for_op = op;
qemu_co_queue_wait(&op->waiting_requests, NULL);
- self->waiting_for_op = NULL;
+
+ if (self) {
+ self->waiting_for_op = NULL;
+ }
+
break;
}
}
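
The mirror_wait_on_conflicts() change allows self == NULL: a waiter without a MirrorOp of its own cannot be part of a wait cycle, so it needs no waiting_for_op bookkeeping and may always block. A toy restatement of that decision (types and names are illustrative, not QEMU code):

#include <stdbool.h>
#include <stddef.h>

typedef struct Op Op;
struct Op {
    Op *waiting_for_op;  /* non-NULL while blocked on another op */
};

/* May 'self' block on 'op' without risking a deadlock?  Without an
 * op of our own, nothing can be waiting on us, so blocking is always
 * safe; otherwise, skip ops that already wait (or will soon wait)
 * on us. */
static bool may_block_on(const Op *self, const Op *op)
{
    return self == NULL || op->waiting_for_op == NULL;
}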
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index bd0597842f..4ebb49a087 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -556,8 +556,7 @@ static int coroutine_fn do_perform_cow_write(BlockDriverState *bs,
* offset needs to be aligned to a cluster boundary.
*
* If the cluster is unallocated then *host_offset will be 0.
- * If the cluster is compressed then *host_offset will contain the
- * complete compressed cluster descriptor.
+ * If the cluster is compressed then *host_offset will contain the l2 entry.
*
* On entry, *bytes is the maximum number of contiguous bytes starting at
* offset that we are interested in.
@@ -660,7 +659,7 @@ int qcow2_get_host_offset(BlockDriverState *bs, uint64_t offset,
ret = -EIO;
goto fail;
}
- *host_offset = l2_entry & L2E_COMPRESSED_OFFSET_SIZE_MASK;
+ *host_offset = l2_entry;
break;
case QCOW2_SUBCLUSTER_ZERO_PLAIN:
case QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN:
@@ -1400,29 +1399,47 @@ static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset,
if (end <= old_start || start >= old_end) {
/* No intersection */
+ continue;
+ }
+
+ if (old_alloc->keep_old_clusters &&
+ (end <= l2meta_cow_start(old_alloc) ||
+ start >= l2meta_cow_end(old_alloc)))
+ {
+ /*
+ * Clusters intersect but COW areas don't, and the cluster itself
+ * is already allocated, so there is no actual conflict.
+ */
+ continue;
+ }
+
+ /* Conflict */
+
+ if (start < old_start) {
+ /* Stop at the start of a running allocation */
+ bytes = old_start - start;
} else {
- if (start < old_start) {
- /* Stop at the start of a running allocation */
- bytes = old_start - start;
- } else {
- bytes = 0;
- }
+ bytes = 0;
+ }
- /* Stop if already an l2meta exists. After yielding, it wouldn't
- * be valid any more, so we'd have to clean up the old L2Metas
- * and deal with requests depending on them before starting to
- * gather new ones. Not worth the trouble. */
- if (bytes == 0 && *m) {
- *cur_bytes = 0;
- return 0;
- }
+ /*
+ * Stop if an l2meta already exists. After yielding, it wouldn't
+ * be valid any more, so we'd have to clean up the old L2Metas
+ * and deal with requests depending on them before starting to
+ * gather new ones. Not worth the trouble.
+ */
+ if (bytes == 0 && *m) {
+ *cur_bytes = 0;
+ return 0;
+ }
- if (bytes == 0) {
- /* Wait for the dependency to complete. We need to recheck
- * the free/allocated clusters when we continue. */
- qemu_co_queue_wait(&old_alloc->dependent_requests, &s->lock);
- return -EAGAIN;
- }
+ if (bytes == 0) {
+ /*
+ * Wait for the dependency to complete. We need to recheck
+ * the free/allocated clusters when we continue.
+ */
+ qemu_co_queue_wait(&old_alloc->dependent_requests, &s->lock);
+ return -EAGAIN;
}
}
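
Both tests in handle_dependencies() are the usual half-open interval overlap predicate, applied first to the whole clusters and then to the COW regions bounded by l2meta_cow_start()/l2meta_cow_end(). For reference, a standalone restatement (not the QEMU helper itself):

#include <stdbool.h>
#include <stdint.h>

/* Half-open ranges [a_start, a_end) and [b_start, b_end) intersect
 * iff neither one lies entirely before the other. */
static bool ranges_intersect(uint64_t a_start, uint64_t a_end,
                             uint64_t b_start, uint64_t b_end)
{
    return !(a_end <= b_start || a_start >= b_end);
}

With keep_old_clusters set, a request that overlaps only the already-allocated middle of an in-flight allocation, missing both of its COW regions, no longer has to wait on it.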
@@ -2463,3 +2480,18 @@ fail:
g_free(l1_table);
return ret;
}
+
+void qcow2_parse_compressed_l2_entry(BlockDriverState *bs, uint64_t l2_entry,
+ uint64_t *coffset, int *csize)
+{
+ BDRVQcow2State *s = bs->opaque;
+ int nb_csectors;
+
+ assert(qcow2_get_cluster_type(bs, l2_entry) == QCOW2_CLUSTER_COMPRESSED);
+
+ *coffset = l2_entry & s->cluster_offset_mask;
+
+ nb_csectors = ((l2_entry >> s->csize_shift) & s->csize_mask) + 1;
+ *csize = nb_csectors * QCOW2_COMPRESSED_SECTOR_SIZE -
+ (*coffset & (QCOW2_COMPRESSED_SECTOR_SIZE - 1));
+}
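
qcow2_parse_compressed_l2_entry() centralizes descriptor arithmetic that was previously duplicated in qcow2-refcount.c and qcow2.c. As a standalone worked example for a 64 KiB-cluster image, using the shift/mask values that the qcow2 driver derives from cluster_bits (the entry value itself is made up):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define QCOW2_COMPRESSED_SECTOR_SIZE 512U

int main(void)
{
    const int cluster_bits = 16;                                   /* 64 KiB */
    const int csize_shift = 62 - (cluster_bits - 8);               /* 54 */
    const uint64_t csize_mask = (1ULL << (cluster_bits - 8)) - 1;  /* 0xff */
    const uint64_t cluster_offset_mask = (1ULL << csize_shift) - 1;

    /* Hypothetical entry: host offset 0x51345, 3 compressed sectors. */
    uint64_t l2_entry = (2ULL << csize_shift) | 0x51345;

    uint64_t coffset = l2_entry & cluster_offset_mask;
    int nb_csectors = (int)((l2_entry >> csize_shift) & csize_mask) + 1;
    int csize = nb_csectors * QCOW2_COMPRESSED_SECTOR_SIZE -
                (int)(coffset & (QCOW2_COMPRESSED_SECTOR_SIZE - 1));

    /* Prints coffset=0x51345 csize=1211: the leading partial sector
     * (0x145 = 325 bytes) is not part of this cluster's data. */
    printf("coffset=0x%" PRIx64 " csize=%d\n", coffset, csize);
    return 0;
}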
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 8e649b008e..4614572252 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -1177,11 +1177,11 @@ void qcow2_free_any_cluster(BlockDriverState *bs, uint64_t l2_entry,
switch (ctype) {
case QCOW2_CLUSTER_COMPRESSED:
{
- int64_t offset = (l2_entry & s->cluster_offset_mask)
- & QCOW2_COMPRESSED_SECTOR_MASK;
- int size = QCOW2_COMPRESSED_SECTOR_SIZE *
- (((l2_entry >> s->csize_shift) & s->csize_mask) + 1);
- qcow2_free_clusters(bs, offset, size, type);
+ uint64_t coffset;
+ int csize;
+
+ qcow2_parse_compressed_l2_entry(bs, l2_entry, &coffset, &csize);
+ qcow2_free_clusters(bs, coffset, csize, type);
}
break;
case QCOW2_CLUSTER_NORMAL:
@@ -1247,7 +1247,7 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
bool l1_allocated = false;
int64_t old_entry, old_l2_offset;
unsigned slice, slice_size2, n_slices;
- int i, j, l1_modified = 0, nb_csectors;
+ int i, j, l1_modified = 0;
int ret;
assert(addend >= -1 && addend <= 1);
@@ -1318,14 +1318,14 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
switch (qcow2_get_cluster_type(bs, entry)) {
case QCOW2_CLUSTER_COMPRESSED:
- nb_csectors = ((entry >> s->csize_shift) &
- s->csize_mask) + 1;
if (addend != 0) {
- uint64_t coffset = (entry & s->cluster_offset_mask)
- & QCOW2_COMPRESSED_SECTOR_MASK;
+ uint64_t coffset;
+ int csize;
+
+ qcow2_parse_compressed_l2_entry(bs, entry,
+ &coffset, &csize);
ret = update_refcount(
- bs, coffset,
- nb_csectors * QCOW2_COMPRESSED_SECTOR_SIZE,
+ bs, coffset, csize,
abs(addend), addend < 0,
QCOW2_DISCARD_SNAPSHOT);
if (ret < 0) {
@@ -1588,6 +1588,66 @@ enum {
};
/*
+ * Fix L2 entry by making it QCOW2_CLUSTER_ZERO_PLAIN (or making all its present
+ * subclusters QCOW2_SUBCLUSTER_ZERO_PLAIN).
+ *
+ * This function decrements res->corruptions on success, so the caller is
+ * responsible for incrementing res->corruptions prior to the call.
+ *
+ * On failure in-memory @l2_table may be modified.
+ */
+static int fix_l2_entry_by_zero(BlockDriverState *bs, BdrvCheckResult *res,
+ uint64_t l2_offset,
+ uint64_t *l2_table, int l2_index, bool active,
+ bool *metadata_overlap)
+{
+ BDRVQcow2State *s = bs->opaque;
+ int ret;
+ int idx = l2_index * (l2_entry_size(s) / sizeof(uint64_t));
+ uint64_t l2e_offset = l2_offset + (uint64_t)l2_index * l2_entry_size(s);
+ int ign = active ? QCOW2_OL_ACTIVE_L2 : QCOW2_OL_INACTIVE_L2;
+
+ if (has_subclusters(s)) {
+ uint64_t l2_bitmap = get_l2_bitmap(s, l2_table, l2_index);
+
+ /* Allocated subclusters become zero */
+ l2_bitmap |= l2_bitmap << 32;
+ l2_bitmap &= QCOW_L2_BITMAP_ALL_ZEROES;
+
+ set_l2_bitmap(s, l2_table, l2_index, l2_bitmap);
+ set_l2_entry(s, l2_table, l2_index, 0);
+ } else {
+ set_l2_entry(s, l2_table, l2_index, QCOW_OFLAG_ZERO);
+ }
+
+ ret = qcow2_pre_write_overlap_check(bs, ign, l2e_offset, l2_entry_size(s),
+ false);
+ if (metadata_overlap) {
+ *metadata_overlap = ret < 0;
+ }
+ if (ret < 0) {
+ fprintf(stderr, "ERROR: Overlap check failed\n");
+ goto fail;
+ }
+
+ ret = bdrv_pwrite_sync(bs->file, l2e_offset, &l2_table[idx],
+ l2_entry_size(s));
+ if (ret < 0) {
+ fprintf(stderr, "ERROR: Failed to overwrite L2 "
+ "table entry: %s\n", strerror(-ret));
+ goto fail;
+ }
+
+ res->corruptions--;
+ res->corruptions_fixed++;
+ return 0;
+
+fail:
+ res->check_errors++;
+ return ret;
+}
+
+/*
* Increases the refcount in the given refcount table for all clusters
* referenced in the L2 table. While doing so, performs some checks on L2
* entries.
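
The bitmap manipulation in fix_l2_entry_by_zero() exploits the extended-L2 layout: the low 32 bits of the bitmap are "allocated" flags and the high 32 bits are "reads as zero" flags, so shifting the allocation half up and masking turns every allocated subcluster into a zero subcluster in one step. A standalone illustration, with the mask value written out to match that layout:

#include <assert.h>
#include <stdint.h>

#define ALL_ZEROES_MASK 0xffffffff00000000ULL

int main(void)
{
    uint64_t l2_bitmap = 0x000000050000000fULL; /* 4 allocated, 2 zero */

    /* Every allocated subcluster gets its zero bit set... */
    l2_bitmap |= l2_bitmap << 32;
    /* ...and the allocation half is cleared. */
    l2_bitmap &= ALL_ZEROES_MASK;

    assert(l2_bitmap == 0x0000000f00000000ULL);
    return 0;
}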
@@ -1601,26 +1661,41 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
int flags, BdrvCheckMode fix, bool active)
{
BDRVQcow2State *s = bs->opaque;
- uint64_t *l2_table, l2_entry;
+ uint64_t l2_entry, l2_bitmap;
uint64_t next_contiguous_offset = 0;
- int i, l2_size, nb_csectors, ret;
+ int i, ret;
+ size_t l2_size_bytes = s->l2_size * l2_entry_size(s);
+ g_autofree uint64_t *l2_table = g_malloc(l2_size_bytes);
+ bool metadata_overlap;
/* Read L2 table from disk */
- l2_size = s->l2_size * l2_entry_size(s);
- l2_table = g_malloc(l2_size);
-
- ret = bdrv_pread(bs->file, l2_offset, l2_table, l2_size);
+ ret = bdrv_pread(bs->file, l2_offset, l2_table, l2_size_bytes);
if (ret < 0) {
fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n");
res->check_errors++;
- goto fail;
+ return ret;
}
/* Do the actual checks */
- for(i = 0; i < s->l2_size; i++) {
+ for (i = 0; i < s->l2_size; i++) {
+ uint64_t coffset;
+ int csize;
+ QCow2ClusterType type;
+
l2_entry = get_l2_entry(s, l2_table, i);
+ l2_bitmap = get_l2_bitmap(s, l2_table, i);
+ type = qcow2_get_cluster_type(bs, l2_entry);
+
+ if (type != QCOW2_CLUSTER_COMPRESSED) {
+ /* Check reserved bits of Standard Cluster Descriptor */
+ if (l2_entry & L2E_STD_RESERVED_MASK) {
+ fprintf(stderr, "ERROR found l2 entry with reserved bits set: "
+ "%" PRIx64 "\n", l2_entry);
+ res->corruptions++;
+ }
+ }
- switch (qcow2_get_cluster_type(bs, l2_entry)) {
+ switch (type) {
case QCOW2_CLUSTER_COMPRESSED:
/* Compressed clusters don't have QCOW_OFLAG_COPIED */
if (l2_entry & QCOW_OFLAG_COPIED) {
@@ -1638,23 +1713,28 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
break;
}
+ if (l2_bitmap) {
+ fprintf(stderr, "ERROR compressed cluster %d with non-zero "
+ "subcluster allocation bitmap, entry=0x%" PRIx64 "\n",
+ i, l2_entry);
+ res->corruptions++;
+ break;
+ }
+
/* Mark cluster as used */
- nb_csectors = ((l2_entry >> s->csize_shift) &
- s->csize_mask) + 1;
- l2_entry &= s->cluster_offset_mask;
+ qcow2_parse_compressed_l2_entry(bs, l2_entry, &coffset, &csize);
ret = qcow2_inc_refcounts_imrt(
- bs, res, refcount_table, refcount_table_size,
- l2_entry & QCOW2_COMPRESSED_SECTOR_MASK,
- nb_csectors * QCOW2_COMPRESSED_SECTOR_SIZE);
+ bs, res, refcount_table, refcount_table_size, coffset, csize);
if (ret < 0) {
- goto fail;
+ return ret;
}
if (flags & CHECK_FRAG_INFO) {
res->bfi.allocated_clusters++;
res->bfi.compressed_clusters++;
- /* Compressed clusters are fragmented by nature. Since they
+ /*
+ * Compressed clusters are fragmented by nature. Since they
* take up sub-sector space but we only have sector granularity
* I/O we need to re-read the same sectors even for adjacent
* compressed clusters.
@@ -1668,13 +1748,19 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
{
uint64_t offset = l2_entry & L2E_OFFSET_MASK;
+ if ((l2_bitmap >> 32) & l2_bitmap) {
+ res->corruptions++;
+ fprintf(stderr, "ERROR offset=%" PRIx64 ": Allocated "
+ "cluster has corrupted subcluster allocation bitmap\n",
+ offset);
+ }
+
/* Correct offsets are cluster aligned */
if (offset_into_cluster(s, offset)) {
bool contains_data;
res->corruptions++;
if (has_subclusters(s)) {
- uint64_t l2_bitmap = get_l2_bitmap(s, l2_table, i);
contains_data = (l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC);
} else {
contains_data = !(l2_entry & QCOW_OFLAG_ZERO);
@@ -1687,40 +1773,30 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR",
offset);
if (fix & BDRV_FIX_ERRORS) {
- int idx = i * (l2_entry_size(s) / sizeof(uint64_t));
- uint64_t l2e_offset =
- l2_offset + (uint64_t)i * l2_entry_size(s);
- int ign = active ? QCOW2_OL_ACTIVE_L2 :
- QCOW2_OL_INACTIVE_L2;
-
- l2_entry = has_subclusters(s) ? 0 : QCOW_OFLAG_ZERO;
- set_l2_entry(s, l2_table, i, l2_entry);
- ret = qcow2_pre_write_overlap_check(bs, ign,
- l2e_offset, l2_entry_size(s), false);
- if (ret < 0) {
- fprintf(stderr, "ERROR: Overlap check failed\n");
- res->check_errors++;
- /* Something is seriously wrong, so abort checking
- * this L2 table */
- goto fail;
+ ret = fix_l2_entry_by_zero(bs, res, l2_offset,
+ l2_table, i, active,
+ &metadata_overlap);
+ if (metadata_overlap) {
+ /*
+ * Something is seriously wrong, so abort checking
+ * this L2 table.
+ */
+ return ret;
}
- ret = bdrv_pwrite_sync(bs->file, l2e_offset,
- &l2_table[idx],
- l2_entry_size(s));
- if (ret < 0) {
- fprintf(stderr, "ERROR: Failed to overwrite L2 "
- "table entry: %s\n", strerror(-ret));
- res->check_errors++;
- /* Do not abort, continue checking the rest of this
- * L2 table's entries */
- } else {
- res->corruptions--;
- res->corruptions_fixed++;
- /* Skip marking the cluster as used
- * (it is unused now) */
+ if (ret == 0) {
+ /*
+ * Skip marking the cluster as used
+ * (it is unused now).
+ */
continue;
}
+
+ /*
+ * Failed to fix.
+ * Do not abort, continue checking the rest of this
+ * L2 table's entries.
+ */
}
} else {
fprintf(stderr, "ERROR offset=%" PRIx64 ": Data cluster is "
@@ -1743,14 +1819,23 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
refcount_table_size,
offset, s->cluster_size);
if (ret < 0) {
- goto fail;
+ return ret;
}
}
break;
}
case QCOW2_CLUSTER_ZERO_PLAIN:
+ /* Impossible when image has subclusters */
+ assert(!l2_bitmap);
+ break;
+
case QCOW2_CLUSTER_UNALLOCATED:
+ if (l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC) {
+ res->corruptions++;
+ fprintf(stderr, "ERROR: Unallocated "
+ "cluster has non-zero subcluster allocation map\n");
+ }
break;
default:
@@ -1758,12 +1843,7 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
}
}
- g_free(l2_table);
return 0;
-
-fail:
- g_free(l2_table);
- return ret;
}
/*
@@ -1782,71 +1862,79 @@ static int check_refcounts_l1(BlockDriverState *bs,
int flags, BdrvCheckMode fix, bool active)
{
BDRVQcow2State *s = bs->opaque;
- uint64_t *l1_table = NULL, l2_offset, l1_size2;
+ size_t l1_size_bytes = l1_size * L1E_SIZE;
+ g_autofree uint64_t *l1_table = NULL;
+ uint64_t l2_offset;
int i, ret;
- l1_size2 = l1_size * L1E_SIZE;
+ if (!l1_size) {
+ return 0;
+ }
/* Mark L1 table as used */
ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, refcount_table_size,
- l1_table_offset, l1_size2);
+ l1_table_offset, l1_size_bytes);
if (ret < 0) {
- goto fail;
+ return ret;
+ }
+
+ l1_table = g_try_malloc(l1_size_bytes);
+ if (l1_table == NULL) {
+ res->check_errors++;
+ return -ENOMEM;
}
/* Read L1 table entries from disk */
- if (l1_size2 > 0) {
- l1_table = g_try_malloc(l1_size2);
- if (l1_table == NULL) {
- ret = -ENOMEM;
- res->check_errors++;
- goto fail;
- }
- ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
- if (ret < 0) {
- fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
- res->check_errors++;
- goto fail;
- }
- for(i = 0;i < l1_size; i++)
- be64_to_cpus(&l1_table[i]);
+ ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size_bytes);
+ if (ret < 0) {
+ fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
+ res->check_errors++;
+ return ret;
+ }
+
+ for (i = 0; i < l1_size; i++) {
+ be64_to_cpus(&l1_table[i]);
}
/* Do the actual checks */
- for(i = 0; i < l1_size; i++) {
- l2_offset = l1_table[i];
- if (l2_offset) {
- /* Mark L2 table as used */
- l2_offset &= L1E_OFFSET_MASK;
- ret = qcow2_inc_refcounts_imrt(bs, res,
- refcount_table, refcount_table_size,
- l2_offset, s->cluster_size);
- if (ret < 0) {
- goto fail;
- }
+ for (i = 0; i < l1_size; i++) {
+ if (!l1_table[i]) {
+ continue;
+ }
- /* L2 tables are cluster aligned */
- if (offset_into_cluster(s, l2_offset)) {
- fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
- "cluster aligned; L1 entry corrupted\n", l2_offset);
- res->corruptions++;
- }
+ if (l1_table[i] & L1E_RESERVED_MASK) {
+ fprintf(stderr, "ERROR found L1 entry with reserved bits set: "
+ "%" PRIx64 "\n", l1_table[i]);
+ res->corruptions++;
+ }
- /* Process and check L2 entries */
- ret = check_refcounts_l2(bs, res, refcount_table,
- refcount_table_size, l2_offset, flags,
- fix, active);
- if (ret < 0) {
- goto fail;
- }
+ l2_offset = l1_table[i] & L1E_OFFSET_MASK;
+
+ /* Mark L2 table as used */
+ ret = qcow2_inc_refcounts_imrt(bs, res,
+ refcount_table, refcount_table_size,
+ l2_offset, s->cluster_size);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* L2 tables are cluster aligned */
+ if (offset_into_cluster(s, l2_offset)) {
+ fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
+ "cluster aligned; L1 entry corrupted\n", l2_offset);
+ res->corruptions++;
+ }
+
+ /* Process and check L2 entries */
+ ret = check_refcounts_l2(bs, res, refcount_table,
+ refcount_table_size, l2_offset, flags,
+ fix, active);
+ if (ret < 0) {
+ return ret;
}
}
- g_free(l1_table);
- return 0;
-fail:
- g_free(l1_table);
- return ret;
+ return 0;
}
/*
@@ -2001,9 +2089,17 @@ static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res,
for(i = 0; i < s->refcount_table_size; i++) {
uint64_t offset, cluster;
- offset = s->refcount_table[i];
+ offset = s->refcount_table[i] & REFT_OFFSET_MASK;
cluster = offset >> s->cluster_bits;
+ if (s->refcount_table[i] & REFT_RESERVED_MASK) {
+ fprintf(stderr, "ERROR refcount table entry %" PRId64 " has "
+ "reserved bits set\n", i);
+ res->corruptions++;
+ *rebuild = true;
+ continue;
+ }
+
/* Refcount blocks are cluster aligned */
if (offset_into_cluster(s, offset)) {
fprintf(stderr, "ERROR refcount block %" PRId64 " is not "
diff --git a/block/qcow2.c b/block/qcow2.c
index 9f1b6461c8..02f9f3e636 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -74,7 +74,7 @@ typedef struct {
static int coroutine_fn
qcow2_co_preadv_compressed(BlockDriverState *bs,
- uint64_t cluster_descriptor,
+ uint64_t l2_entry,
uint64_t offset,
uint64_t bytes,
QEMUIOVector *qiov,
@@ -2205,7 +2205,7 @@ typedef struct Qcow2AioTask {
BlockDriverState *bs;
QCow2SubclusterType subcluster_type; /* only for read */
- uint64_t host_offset; /* or full descriptor in compressed clusters */
+ uint64_t host_offset; /* or l2_entry for compressed read */
uint64_t offset;
uint64_t bytes;
QEMUIOVector *qiov;
@@ -4693,22 +4693,19 @@ qcow2_co_pwritev_compressed_part(BlockDriverState *bs,
static int coroutine_fn
qcow2_co_preadv_compressed(BlockDriverState *bs,
- uint64_t cluster_descriptor,
+ uint64_t l2_entry,
uint64_t offset,
uint64_t bytes,
QEMUIOVector *qiov,
size_t qiov_offset)
{
BDRVQcow2State *s = bs->opaque;
- int ret = 0, csize, nb_csectors;
+ int ret = 0, csize;
uint64_t coffset;
uint8_t *buf, *out_buf;
int offset_in_cluster = offset_into_cluster(s, offset);
- coffset = cluster_descriptor & s->cluster_offset_mask;
- nb_csectors = ((cluster_descriptor >> s->csize_shift) & s->csize_mask) + 1;
- csize = nb_csectors * QCOW2_COMPRESSED_SECTOR_SIZE -
- (coffset & ~QCOW2_COMPRESSED_SECTOR_MASK);
+ qcow2_parse_compressed_l2_entry(bs, l2_entry, &coffset, &csize);
buf = g_try_malloc(csize);
if (!buf) {
diff --git a/block/qcow2.h b/block/qcow2.h
index 0fe5f74ed3..fd48a89d45 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -110,7 +110,6 @@
/* Defined in the qcow2 spec (compressed cluster descriptor) */
#define QCOW2_COMPRESSED_SECTOR_SIZE 512U
-#define QCOW2_COMPRESSED_SECTOR_MASK (~(QCOW2_COMPRESSED_SECTOR_SIZE - 1ULL))
/* Must be at least 2 to cover COW */
#define MIN_L2_CACHE_SIZE 2 /* cache entries */
@@ -587,10 +586,12 @@ typedef enum QCow2MetadataOverlap {
(QCOW2_OL_CACHED | QCOW2_OL_INACTIVE_L2)
#define L1E_OFFSET_MASK 0x00fffffffffffe00ULL
+#define L1E_RESERVED_MASK 0x7f000000000001ffULL
#define L2E_OFFSET_MASK 0x00fffffffffffe00ULL
-#define L2E_COMPRESSED_OFFSET_SIZE_MASK 0x3fffffffffffffffULL
+#define L2E_STD_RESERVED_MASK 0x3f000000000001feULL
#define REFT_OFFSET_MASK 0xfffffffffffffe00ULL
+#define REFT_RESERVED_MASK 0x1ffULL
#define INV_OFFSET (-1ULL)
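
Each new *_RESERVED_MASK covers exactly the bits that are neither part of the offset nor a defined flag for that entry type, so validating an entry is a single AND; the check_refblocks(), check_refcounts_l1() and check_refcounts_l2() changes above use them this way. For instance, a standalone restatement of the refcount-table check (the helper name is hypothetical):

#include <stdbool.h>
#include <stdint.h>

#define REFT_OFFSET_MASK   0xfffffffffffffe00ULL
#define REFT_RESERVED_MASK 0x1ffULL  /* low 9 bits: must be zero */

static bool reft_entry_is_valid(uint64_t entry)
{
    /* Any reserved bit set marks the entry (and the table) as corrupt. */
    return (entry & REFT_RESERVED_MASK) == 0;
}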
@@ -914,6 +915,8 @@ int qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
uint64_t offset,
int compressed_size,
uint64_t *host_offset);
+void qcow2_parse_compressed_l2_entry(BlockDriverState *bs, uint64_t l2_entry,
+ uint64_t *coffset, int *csize);
int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m);
void qcow2_alloc_cluster_abort(BlockDriverState *bs, QCowL2Meta *m);