diff options
author | Peter Maydell <peter.maydell@linaro.org> | 2015-03-10 14:01:22 +0000 |
---|---|---|
committer | Peter Maydell <peter.maydell@linaro.org> | 2015-03-10 14:01:22 +0000 |
commit | 1976058109890892db8ec88bfd3273f79c459f6b (patch) | |
tree | 3819ee5d4406491751f99828f961a4b938f1e180 /block | |
parent | 8437f7be3b1c49631e435c652707f2cee477149d (diff) | |
parent | 280458a34abcca2ba70843a089a35468c81e3740 (diff) |
Merge remote-tracking branch 'remotes/kevin/tags/for-upstream' into staging
Block patches for 2.3
# gpg: Signature made Tue Mar 10 13:03:17 2015 GMT using RSA key ID C88F2FD6
# gpg: Good signature from "Kevin Wolf <kwolf@redhat.com>"
* remotes/kevin/tags/for-upstream: (73 commits)
MAINTAINERS: Add jcody as blockjobs, block devices maintainer
iotests: add O_DIRECT alignment probing test
block/raw-posix: fix launching with failed disks
MAINTAINERS: Add jsnow as IDE maintainer
sheepdog: Fix misleading error messages in sd_snapshot_create()
Add testcase for scsi-hd devices without drive property
scsi-hd: fix property unset case
block/vdi: Add locking for parallel requests
iotests: Drop vpc from 004's and 104's format list
iotests: Remove 006
iotests: Fix 051's reference output
virtio-blk: Remove the stale FIXME comment
tests: Check QVIRTIO_F_ANY_LAYOUT flag in virtio-blk test
libqos: Solve bug in interrupt checking when using MSIX in virtio-pci.c
sheepdog: fix confused return values
qtest/ahci: add fragmented dma test
qtest/ahci: Add PIO and LBA48 tests
qtest/ahci: Add DMA test variants
libqos/ahci: add ahci command helpers
qtest/ahci: Add a macro bootup routine
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Diffstat (limited to 'block')
-rw-r--r-- | block/blkdebug.c | 6 |
-rw-r--r-- | block/block-backend.c | 10 |
-rw-r--r-- | block/qcow2-cluster.c | 11 |
-rw-r--r-- | block/qcow2-refcount.c | 545 |
-rw-r--r-- | block/qcow2.c | 111 |
-rw-r--r-- | block/qcow2.h | 32 |
-rw-r--r-- | block/raw-posix.c | 185 |
-rw-r--r-- | block/raw_bsd.c | 12 |
-rw-r--r-- | block/sheepdog.c | 171 |
-rw-r--r-- | block/vdi.c | 25 |
-rw-r--r-- | block/vpc.c | 60 |
11 files changed, 893 insertions, 275 deletions
diff --git a/block/blkdebug.c b/block/blkdebug.c index 9ce35cd035..63611e0a33 100644 --- a/block/blkdebug.c +++ b/block/blkdebug.c @@ -472,12 +472,14 @@ static BlockAIOCB *inject_error(BlockDriverState *bs, int error = rule->options.inject.error; struct BlkdebugAIOCB *acb; QEMUBH *bh; + bool immediately = rule->options.inject.immediately; if (rule->options.inject.once) { - QSIMPLEQ_INIT(&s->active_rules); + QSIMPLEQ_REMOVE(&s->active_rules, rule, BlkdebugRule, active_next); + remove_rule(rule); } - if (rule->options.inject.immediately) { + if (immediately) { return NULL; } diff --git a/block/block-backend.c b/block/block-backend.c index bfb041823e..48b6e4c05c 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -892,3 +892,13 @@ int blk_load_vmstate(BlockBackend *blk, uint8_t *buf, int64_t pos, int size) { return bdrv_load_vmstate(blk->bs, buf, pos, size); } + +int blk_probe_blocksizes(BlockBackend *blk, BlockSizes *bsz) +{ + return bdrv_probe_blocksizes(blk->bs, bsz); +} + +int blk_probe_geometry(BlockBackend *blk, HDGeometry *geo) +{ + return bdrv_probe_geometry(blk->bs, geo); +} diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 183177d518..ed2b44d291 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -1640,7 +1640,7 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table, for (i = 0; i < l1_size; i++) { uint64_t l2_offset = l1_table[i] & L1E_OFFSET_MASK; bool l2_dirty = false; - int l2_refcount; + uint64_t l2_refcount; if (!l2_offset) { /* unallocated */ @@ -1672,9 +1672,9 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table, goto fail; } - l2_refcount = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits); - if (l2_refcount < 0) { - ret = l2_refcount; + ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits, + &l2_refcount); + if (ret < 0) { goto fail; } @@ -1707,7 +1707,8 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table, /* For 
shared L2 tables, set the refcount accordingly (it is * already 1 and needs to be l2_refcount) */ ret = qcow2_update_cluster_refcount(bs, - offset >> s->cluster_bits, l2_refcount - 1, + offset >> s->cluster_bits, + refcount_diff(1, l2_refcount), false, QCOW2_DISCARD_OTHER); if (ret < 0) { qcow2_free_clusters(bs, offset, s->cluster_size, diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 9b80ca79ea..dc8d186a82 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -29,8 +29,52 @@ static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size); static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, - int64_t offset, int64_t length, - int addend, enum qcow2_discard_type type); + int64_t offset, int64_t length, uint64_t addend, + bool decrease, enum qcow2_discard_type type); + +static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index); +static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index); +static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index); +static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index); +static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index); +static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index); +static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index); + +static void set_refcount_ro0(void *refcount_array, uint64_t index, + uint64_t value); +static void set_refcount_ro1(void *refcount_array, uint64_t index, + uint64_t value); +static void set_refcount_ro2(void *refcount_array, uint64_t index, + uint64_t value); +static void set_refcount_ro3(void *refcount_array, uint64_t index, + uint64_t value); +static void set_refcount_ro4(void *refcount_array, uint64_t index, + uint64_t value); +static void set_refcount_ro5(void *refcount_array, uint64_t index, + uint64_t value); +static void set_refcount_ro6(void *refcount_array, uint64_t index, + uint64_t 
value); + + +static Qcow2GetRefcountFunc *const get_refcount_funcs[] = { + &get_refcount_ro0, + &get_refcount_ro1, + &get_refcount_ro2, + &get_refcount_ro3, + &get_refcount_ro4, + &get_refcount_ro5, + &get_refcount_ro6 +}; + +static Qcow2SetRefcountFunc *const set_refcount_funcs[] = { + &set_refcount_ro0, + &set_refcount_ro1, + &set_refcount_ro2, + &set_refcount_ro3, + &set_refcount_ro4, + &set_refcount_ro5, + &set_refcount_ro6 +}; /*********************************************************/ @@ -42,6 +86,11 @@ int qcow2_refcount_init(BlockDriverState *bs) unsigned int refcount_table_size2, i; int ret; + assert(s->refcount_order >= 0 && s->refcount_order <= 6); + + s->get_refcount = get_refcount_funcs[s->refcount_order]; + s->set_refcount = set_refcount_funcs[s->refcount_order]; + assert(s->refcount_table_size <= INT_MAX / sizeof(uint64_t)); refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t); s->refcount_table = g_try_malloc(refcount_table_size2); @@ -72,6 +121,95 @@ void qcow2_refcount_close(BlockDriverState *bs) } +static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index) +{ + return (((const uint8_t *)refcount_array)[index / 8] >> (index % 8)) & 0x1; +} + +static void set_refcount_ro0(void *refcount_array, uint64_t index, + uint64_t value) +{ + assert(!(value >> 1)); + ((uint8_t *)refcount_array)[index / 8] &= ~(0x1 << (index % 8)); + ((uint8_t *)refcount_array)[index / 8] |= value << (index % 8); +} + +static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index) +{ + return (((const uint8_t *)refcount_array)[index / 4] >> (2 * (index % 4))) + & 0x3; +} + +static void set_refcount_ro1(void *refcount_array, uint64_t index, + uint64_t value) +{ + assert(!(value >> 2)); + ((uint8_t *)refcount_array)[index / 4] &= ~(0x3 << (2 * (index % 4))); + ((uint8_t *)refcount_array)[index / 4] |= value << (2 * (index % 4)); +} + +static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index) +{ + return (((const 
uint8_t *)refcount_array)[index / 2] >> (4 * (index % 2))) + & 0xf; +} + +static void set_refcount_ro2(void *refcount_array, uint64_t index, + uint64_t value) +{ + assert(!(value >> 4)); + ((uint8_t *)refcount_array)[index / 2] &= ~(0xf << (4 * (index % 2))); + ((uint8_t *)refcount_array)[index / 2] |= value << (4 * (index % 2)); +} + +static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index) +{ + return ((const uint8_t *)refcount_array)[index]; +} + +static void set_refcount_ro3(void *refcount_array, uint64_t index, + uint64_t value) +{ + assert(!(value >> 8)); + ((uint8_t *)refcount_array)[index] = value; +} + +static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index) +{ + return be16_to_cpu(((const uint16_t *)refcount_array)[index]); +} + +static void set_refcount_ro4(void *refcount_array, uint64_t index, + uint64_t value) +{ + assert(!(value >> 16)); + ((uint16_t *)refcount_array)[index] = cpu_to_be16(value); +} + +static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index) +{ + return be32_to_cpu(((const uint32_t *)refcount_array)[index]); +} + +static void set_refcount_ro5(void *refcount_array, uint64_t index, + uint64_t value) +{ + assert(!(value >> 32)); + ((uint32_t *)refcount_array)[index] = cpu_to_be32(value); +} + +static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index) +{ + return be64_to_cpu(((const uint64_t *)refcount_array)[index]); +} + +static void set_refcount_ro6(void *refcount_array, uint64_t index, + uint64_t value) +{ + ((uint64_t *)refcount_array)[index] = cpu_to_be64(value); +} + + static int load_refcount_block(BlockDriverState *bs, int64_t refcount_block_offset, void **refcount_block) @@ -87,26 +225,29 @@ static int load_refcount_block(BlockDriverState *bs, } /* - * Returns the refcount of the cluster given by its index. Any non-negative - * return value is the refcount of the cluster, negative values are -errno - * and indicate an error. 
+ * Retrieves the refcount of the cluster given by its index and stores it in + * *refcount. Returns 0 on success and -errno on failure. */ -int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index) +int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index, + uint64_t *refcount) { BDRVQcowState *s = bs->opaque; uint64_t refcount_table_index, block_index; int64_t refcount_block_offset; int ret; - uint16_t *refcount_block; - uint16_t refcount; + void *refcount_block; refcount_table_index = cluster_index >> s->refcount_block_bits; - if (refcount_table_index >= s->refcount_table_size) + if (refcount_table_index >= s->refcount_table_size) { + *refcount = 0; return 0; + } refcount_block_offset = s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK; - if (!refcount_block_offset) + if (!refcount_block_offset) { + *refcount = 0; return 0; + } if (offset_into_cluster(s, refcount_block_offset)) { qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#" PRIx64 @@ -116,21 +257,20 @@ int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index) } ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset, - (void**) &refcount_block); + &refcount_block); if (ret < 0) { return ret; } block_index = cluster_index & (s->refcount_block_size - 1); - refcount = be16_to_cpu(refcount_block[block_index]); + *refcount = s->get_refcount(refcount_block, block_index); - ret = qcow2_cache_put(bs, s->refcount_block_cache, - (void**) &refcount_block); + ret = qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block); if (ret < 0) { return ret; } - return refcount; + return 0; } /* @@ -169,7 +309,7 @@ static int in_same_refcount_block(BDRVQcowState *s, uint64_t offset_a, * Returns 0 on success or -errno in error case */ static int alloc_refcount_block(BlockDriverState *bs, - int64_t cluster_index, uint16_t **refcount_block) + int64_t cluster_index, void **refcount_block) { BDRVQcowState *s = bs->opaque; unsigned int refcount_table_index; @@ 
-196,7 +336,7 @@ static int alloc_refcount_block(BlockDriverState *bs, } return load_refcount_block(bs, refcount_block_offset, - (void**) refcount_block); + refcount_block); } } @@ -246,7 +386,7 @@ static int alloc_refcount_block(BlockDriverState *bs, if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) { /* Zero the new refcount block before updating it */ ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block, - (void**) refcount_block); + refcount_block); if (ret < 0) { goto fail_block; } @@ -256,11 +396,11 @@ static int alloc_refcount_block(BlockDriverState *bs, /* The block describes itself, need to update the cache */ int block_index = (new_block >> s->cluster_bits) & (s->refcount_block_size - 1); - (*refcount_block)[block_index] = cpu_to_be16(1); + s->set_refcount(*refcount_block, block_index, 1); } else { /* Described somewhere else. This can recurse at most twice before we * arrive at a block that describes itself. */ - ret = update_refcount(bs, new_block, s->cluster_size, 1, + ret = update_refcount(bs, new_block, s->cluster_size, 1, false, QCOW2_DISCARD_NEVER); if (ret < 0) { goto fail_block; @@ -274,7 +414,7 @@ static int alloc_refcount_block(BlockDriverState *bs, /* Initialize the new refcount block only after updating its refcount, * update_refcount uses the refcount cache itself */ ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block, - (void**) refcount_block); + refcount_block); if (ret < 0) { goto fail_block; } @@ -308,7 +448,7 @@ static int alloc_refcount_block(BlockDriverState *bs, return -EAGAIN; } - ret = qcow2_cache_put(bs, s->refcount_block_cache, (void**) refcount_block); + ret = qcow2_cache_put(bs, s->refcount_block_cache, refcount_block); if (ret < 0) { goto fail_block; } @@ -362,7 +502,7 @@ static int alloc_refcount_block(BlockDriverState *bs, s->cluster_size; uint64_t table_offset = meta_offset + blocks_clusters * s->cluster_size; uint64_t *new_table = g_try_new0(uint64_t, table_size); - 
uint16_t *new_blocks = g_try_malloc0(blocks_clusters * s->cluster_size); + void *new_blocks = g_try_malloc0(blocks_clusters * s->cluster_size); assert(table_size > 0 && blocks_clusters > 0); if (new_table == NULL || new_blocks == NULL) { @@ -384,7 +524,7 @@ static int alloc_refcount_block(BlockDriverState *bs, uint64_t table_clusters = size_to_clusters(s, table_size * sizeof(uint64_t)); int block = 0; for (i = 0; i < table_clusters + blocks_clusters; i++) { - new_blocks[block++] = cpu_to_be16(1); + s->set_refcount(new_blocks, block++, 1); } /* Write refcount blocks to disk */ @@ -437,7 +577,7 @@ static int alloc_refcount_block(BlockDriverState *bs, qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t), QCOW2_DISCARD_OTHER); - ret = load_refcount_block(bs, new_block, (void**) refcount_block); + ret = load_refcount_block(bs, new_block, refcount_block); if (ret < 0) { return ret; } @@ -452,7 +592,7 @@ fail_table: g_free(new_table); fail_block: if (*refcount_block != NULL) { - qcow2_cache_put(bs, s->refcount_block_cache, (void**) refcount_block); + qcow2_cache_put(bs, s->refcount_block_cache, refcount_block); } return ret; } @@ -527,18 +667,25 @@ found: } /* XXX: cache several refcount block clusters ? 
*/ +/* @addend is the absolute value of the addend; if @decrease is set, @addend + * will be subtracted from the current refcount, otherwise it will be added */ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, - int64_t offset, int64_t length, int addend, enum qcow2_discard_type type) + int64_t offset, + int64_t length, + uint64_t addend, + bool decrease, + enum qcow2_discard_type type) { BDRVQcowState *s = bs->opaque; int64_t start, last, cluster_offset; - uint16_t *refcount_block = NULL; + void *refcount_block = NULL; int64_t old_table_index = -1; int ret; #ifdef DEBUG_ALLOC2 - fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64 " addend=%d\n", - offset, length, addend); + fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64 + " addend=%s%" PRIu64 "\n", offset, length, decrease ? "-" : "", + addend); #endif if (length < 0) { return -EINVAL; @@ -546,7 +693,7 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, return 0; } - if (addend < 0) { + if (decrease) { qcow2_cache_set_dependency(bs, s->refcount_block_cache, s->l2_table_cache); } @@ -556,7 +703,8 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, for(cluster_offset = start; cluster_offset <= last; cluster_offset += s->cluster_size) { - int block_index, refcount; + int block_index; + uint64_t refcount; int64_t cluster_index = cluster_offset >> s->cluster_bits; int64_t table_index = cluster_index >> s->refcount_block_bits; @@ -564,7 +712,7 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, if (table_index != old_table_index) { if (refcount_block) { ret = qcow2_cache_put(bs, s->refcount_block_cache, - (void**) &refcount_block); + &refcount_block); if (ret < 0) { goto fail; } @@ -582,16 +730,23 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, /* we can update the count and save it */ block_index = cluster_index & (s->refcount_block_size - 1); - refcount = 
be16_to_cpu(refcount_block[block_index]); - refcount += addend; - if (refcount < 0 || refcount > 0xffff) { + refcount = s->get_refcount(refcount_block, block_index); + if (decrease ? (refcount - addend > refcount) + : (refcount + addend < refcount || + refcount + addend > s->refcount_max)) + { ret = -EINVAL; goto fail; } + if (decrease) { + refcount -= addend; + } else { + refcount += addend; + } if (refcount == 0 && cluster_index < s->free_cluster_index) { s->free_cluster_index = cluster_index; } - refcount_block[block_index] = cpu_to_be16(refcount); + s->set_refcount(refcount_block, block_index, refcount); if (refcount == 0 && s->discard_passthrough[type]) { update_refcount_discard(bs, cluster_offset, s->cluster_size); @@ -607,8 +762,7 @@ fail: /* Write last changed block to disk */ if (refcount_block) { int wret; - wret = qcow2_cache_put(bs, s->refcount_block_cache, - (void**) &refcount_block); + wret = qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block); if (wret < 0) { return ret < 0 ? ret : wret; } @@ -620,8 +774,8 @@ fail: */ if (ret < 0) { int dummy; - dummy = update_refcount(bs, offset, cluster_offset - offset, -addend, - QCOW2_DISCARD_NEVER); + dummy = update_refcount(bs, offset, cluster_offset - offset, addend, + !decrease, QCOW2_DISCARD_NEVER); (void)dummy; } @@ -631,24 +785,26 @@ fail: /* * Increases or decreases the refcount of a given cluster. * - * If the return value is non-negative, it is the new refcount of the cluster. - * If it is negative, it is -errno and indicates an error. + * @addend is the absolute value of the addend; if @decrease is set, @addend + * will be subtracted from the current refcount, otherwise it will be added. + * + * On success 0 is returned; on failure -errno is returned. 
*/ int qcow2_update_cluster_refcount(BlockDriverState *bs, int64_t cluster_index, - int addend, + uint64_t addend, bool decrease, enum qcow2_discard_type type) { BDRVQcowState *s = bs->opaque; int ret; ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend, - type); + decrease, type); if (ret < 0) { return ret; } - return qcow2_get_refcount(bs, cluster_index); + return 0; } @@ -662,17 +818,17 @@ int qcow2_update_cluster_refcount(BlockDriverState *bs, static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size) { BDRVQcowState *s = bs->opaque; - uint64_t i, nb_clusters; - int refcount; + uint64_t i, nb_clusters, refcount; + int ret; nb_clusters = size_to_clusters(s, size); retry: for(i = 0; i < nb_clusters; i++) { uint64_t next_cluster_index = s->free_cluster_index++; - refcount = qcow2_get_refcount(bs, next_cluster_index); + ret = qcow2_get_refcount(bs, next_cluster_index, &refcount); - if (refcount < 0) { - return refcount; + if (ret < 0) { + return ret; } else if (refcount != 0) { goto retry; } @@ -706,7 +862,7 @@ int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size) return offset; } - ret = update_refcount(bs, offset, size, 1, QCOW2_DISCARD_NEVER); + ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER); } while (ret == -EAGAIN); if (ret < 0) { @@ -720,9 +876,9 @@ int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, int nb_clusters) { BDRVQcowState *s = bs->opaque; - uint64_t cluster_index; + uint64_t cluster_index, refcount; uint64_t i; - int refcount, ret; + int ret; assert(nb_clusters >= 0); if (nb_clusters == 0) { @@ -733,17 +889,16 @@ int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, /* Check how many clusters there are free */ cluster_index = offset >> s->cluster_bits; for(i = 0; i < nb_clusters; i++) { - refcount = qcow2_get_refcount(bs, cluster_index++); - - if (refcount < 0) { - return refcount; + ret = qcow2_get_refcount(bs, cluster_index++, &refcount); + if (ret < 
0) { + return ret; } else if (refcount != 0) { break; } } /* And then allocate them */ - ret = update_refcount(bs, offset, i << s->cluster_bits, 1, + ret = update_refcount(bs, offset, i << s->cluster_bits, 1, false, QCOW2_DISCARD_NEVER); } while (ret == -EAGAIN); @@ -770,12 +925,13 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size) offset = s->free_byte_offset; if (offset) { - int refcount = qcow2_get_refcount(bs, offset >> s->cluster_bits); - if (refcount < 0) { - return refcount; + uint64_t refcount; + ret = qcow2_get_refcount(bs, offset >> s->cluster_bits, &refcount); + if (ret < 0) { + return ret; } - if (refcount == 0xffff) { + if (refcount == s->refcount_max) { offset = 0; } } @@ -793,7 +949,7 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size) } assert(offset); - ret = update_refcount(bs, offset, size, 1, QCOW2_DISCARD_NEVER); + ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER); if (ret < 0) { return ret; } @@ -817,7 +973,7 @@ void qcow2_free_clusters(BlockDriverState *bs, int ret; BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE); - ret = update_refcount(bs, offset, size, -1, type); + ret = update_refcount(bs, offset, size, 1, true, type); if (ret < 0) { fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret)); /* TODO Remember the clusters to free them later and avoid leaking */ @@ -876,12 +1032,14 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, int64_t l1_table_offset, int l1_size, int addend) { BDRVQcowState *s = bs->opaque; - uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2; + uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, refcount; bool l1_allocated = false; int64_t old_offset, old_l2_offset; - int i, j, l1_modified = 0, nb_csectors, refcount; + int i, j, l1_modified = 0, nb_csectors; int ret; + assert(addend >= -1 && addend <= 1); + l2_table = NULL; l1_table = NULL; l1_size2 = l1_size * sizeof(uint64_t); @@ -946,7 +1104,7 @@ int 
qcow2_update_snapshot_refcount(BlockDriverState *bs, if (addend != 0) { ret = update_refcount(bs, (offset & s->cluster_offset_mask) & ~511, - nb_csectors * 512, addend, + nb_csectors * 512, abs(addend), addend < 0, QCOW2_DISCARD_SNAPSHOT); if (ret < 0) { goto fail; @@ -976,15 +1134,16 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, break; } if (addend != 0) { - refcount = qcow2_update_cluster_refcount(bs, - cluster_index, addend, + ret = qcow2_update_cluster_refcount(bs, + cluster_index, abs(addend), addend < 0, QCOW2_DISCARD_SNAPSHOT); - } else { - refcount = qcow2_get_refcount(bs, cluster_index); + if (ret < 0) { + goto fail; + } } - if (refcount < 0) { - ret = refcount; + ret = qcow2_get_refcount(bs, cluster_index, &refcount); + if (ret < 0) { goto fail; } break; @@ -1017,13 +1176,17 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, if (addend != 0) { - refcount = qcow2_update_cluster_refcount(bs, l2_offset >> - s->cluster_bits, addend, QCOW2_DISCARD_SNAPSHOT); - } else { - refcount = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits); + ret = qcow2_update_cluster_refcount(bs, l2_offset >> + s->cluster_bits, + abs(addend), addend < 0, + QCOW2_DISCARD_SNAPSHOT); + if (ret < 0) { + goto fail; + } } - if (refcount < 0) { - ret = refcount; + ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits, + &refcount); + if (ret < 0) { goto fail; } else if (refcount == 1) { l2_offset |= QCOW_OFLAG_COPIED; @@ -1068,6 +1231,63 @@ fail: /* refcount checking functions */ +static size_t refcount_array_byte_size(BDRVQcowState *s, uint64_t entries) +{ + /* This assertion holds because there is no way we can address more than + * 2^(64 - 9) clusters at once (with cluster size 512 = 2^9, and because + * offsets have to be representable in bytes); due to every cluster + * corresponding to one refcount entry, we are well below that limit */ + assert(entries < (UINT64_C(1) << (64 - 9))); + + /* Thanks to the assertion this will not overflow, because + * 
s->refcount_order < 7. + * (note: x << s->refcount_order == x * s->refcount_bits) */ + return DIV_ROUND_UP(entries << s->refcount_order, 8); +} + +/** + * Reallocates *array so that it can hold new_size entries. *size must contain + * the current number of entries in *array. If the reallocation fails, *array + * and *size will not be modified and -errno will be returned. If the + * reallocation is successful, *array will be set to the new buffer, *size + * will be set to new_size and 0 will be returned. The size of the reallocated + * refcount array buffer will be aligned to a cluster boundary, and the newly + * allocated area will be zeroed. + */ +static int realloc_refcount_array(BDRVQcowState *s, void **array, + int64_t *size, int64_t new_size) +{ + size_t old_byte_size, new_byte_size; + void *new_ptr; + + /* Round to clusters so the array can be directly written to disk */ + old_byte_size = size_to_clusters(s, refcount_array_byte_size(s, *size)) + * s->cluster_size; + new_byte_size = size_to_clusters(s, refcount_array_byte_size(s, new_size)) + * s->cluster_size; + + if (new_byte_size == old_byte_size) { + *size = new_size; + return 0; + } + + assert(new_byte_size > 0); + + new_ptr = g_try_realloc(*array, new_byte_size); + if (!new_ptr) { + return -ENOMEM; + } + + if (new_byte_size > old_byte_size) { + memset((void *)((uintptr_t)new_ptr + old_byte_size), 0, + new_byte_size - old_byte_size); + } + + *array = new_ptr; + *size = new_size; + + return 0; +} /* * Increases the refcount for a range of clusters in a given refcount table. 
@@ -1078,12 +1298,13 @@ fail: */ static int inc_refcounts(BlockDriverState *bs, BdrvCheckResult *res, - uint16_t **refcount_table, + void **refcount_table, int64_t *refcount_table_size, int64_t offset, int64_t size) { BDRVQcowState *s = bs->opaque; - uint64_t start, last, cluster_offset, k; + uint64_t start, last, cluster_offset, k, refcount; + int ret; if (size <= 0) { return 0; @@ -1095,30 +1316,22 @@ static int inc_refcounts(BlockDriverState *bs, cluster_offset += s->cluster_size) { k = cluster_offset >> s->cluster_bits; if (k >= *refcount_table_size) { - int64_t old_refcount_table_size = *refcount_table_size; - uint16_t *new_refcount_table; - - *refcount_table_size = k + 1; - new_refcount_table = g_try_realloc(*refcount_table, - *refcount_table_size * - sizeof(**refcount_table)); - if (!new_refcount_table) { - *refcount_table_size = old_refcount_table_size; + ret = realloc_refcount_array(s, refcount_table, + refcount_table_size, k + 1); + if (ret < 0) { res->check_errors++; - return -ENOMEM; + return ret; } - *refcount_table = new_refcount_table; - - memset(*refcount_table + old_refcount_table_size, 0, - (*refcount_table_size - old_refcount_table_size) * - sizeof(**refcount_table)); } - if (++(*refcount_table)[k] == 0) { + refcount = s->get_refcount(*refcount_table, k); + if (refcount == s->refcount_max) { fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64 "\n", cluster_offset); res->corruptions++; + continue; } + s->set_refcount(*refcount_table, k, refcount + 1); } return 0; @@ -1138,8 +1351,9 @@ enum { * error occurred. 
*/ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, - uint16_t **refcount_table, int64_t *refcount_table_size, int64_t l2_offset, - int flags) + void **refcount_table, + int64_t *refcount_table_size, int64_t l2_offset, + int flags) { BDRVQcowState *s = bs->opaque; uint64_t *l2_table, l2_entry; @@ -1256,7 +1470,7 @@ fail: */ static int check_refcounts_l1(BlockDriverState *bs, BdrvCheckResult *res, - uint16_t **refcount_table, + void **refcount_table, int64_t *refcount_table_size, int64_t l1_table_offset, int l1_size, int flags) @@ -1341,7 +1555,7 @@ static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res, BDRVQcowState *s = bs->opaque; uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size); int ret; - int refcount; + uint64_t refcount; int i, j; for (i = 0; i < s->l1_size; i++) { @@ -1353,14 +1567,15 @@ static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res, continue; } - refcount = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits); - if (refcount < 0) { + ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits, + &refcount); + if (ret < 0) { /* don't print message nor increment check_errors */ continue; } if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) { fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_index=%d " - "l1_entry=%" PRIx64 " refcount=%d\n", + "l1_entry=%" PRIx64 " refcount=%" PRIu64 "\n", fix & BDRV_FIX_ERRORS ? 
"Repairing" : "ERROR", i, l1_entry, refcount); @@ -1395,15 +1610,16 @@ static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res, if ((cluster_type == QCOW2_CLUSTER_NORMAL) || ((cluster_type == QCOW2_CLUSTER_ZERO) && (data_offset != 0))) { - refcount = qcow2_get_refcount(bs, - data_offset >> s->cluster_bits); - if (refcount < 0) { + ret = qcow2_get_refcount(bs, + data_offset >> s->cluster_bits, + &refcount); + if (ret < 0) { /* don't print message nor increment check_errors */ continue; } if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) { fprintf(stderr, "%s OFLAG_COPIED data cluster: " - "l2_entry=%" PRIx64 " refcount=%d\n", + "l2_entry=%" PRIx64 " refcount=%" PRIu64 "\n", fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", l2_entry, refcount); @@ -1453,7 +1669,7 @@ fail: */ static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix, bool *rebuild, - uint16_t **refcount_table, int64_t *nb_clusters) + void **refcount_table, int64_t *nb_clusters) { BDRVQcowState *s = bs->opaque; int64_t i, size; @@ -1478,8 +1694,7 @@ static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res, fix & BDRV_FIX_ERRORS ? 
"Repairing" : "ERROR", i); if (fix & BDRV_FIX_ERRORS) { - int64_t old_nb_clusters = *nb_clusters; - uint16_t *new_refcount_table; + int64_t new_nb_clusters; if (offset > INT64_MAX - s->cluster_size) { ret = -EINVAL; @@ -1496,22 +1711,15 @@ static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res, goto resize_fail; } - *nb_clusters = size_to_clusters(s, size); - assert(*nb_clusters >= old_nb_clusters); + new_nb_clusters = size_to_clusters(s, size); + assert(new_nb_clusters >= *nb_clusters); - new_refcount_table = g_try_realloc(*refcount_table, - *nb_clusters * - sizeof(**refcount_table)); - if (!new_refcount_table) { - *nb_clusters = old_nb_clusters; + ret = realloc_refcount_array(s, refcount_table, + nb_clusters, new_nb_clusters); + if (ret < 0) { res->check_errors++; - return -ENOMEM; + return ret; } - *refcount_table = new_refcount_table; - - memset(*refcount_table + old_nb_clusters, 0, - (*nb_clusters - old_nb_clusters) * - sizeof(**refcount_table)); if (cluster >= *nb_clusters) { ret = -EINVAL; @@ -1546,9 +1754,10 @@ resize_fail: if (ret < 0) { return ret; } - if ((*refcount_table)[cluster] != 1) { + if (s->get_refcount(*refcount_table, cluster) != 1) { fprintf(stderr, "ERROR refcount block %" PRId64 - " refcount=%d\n", i, (*refcount_table)[cluster]); + " refcount=%" PRIu64 "\n", i, + s->get_refcount(*refcount_table, cluster)); res->corruptions++; *rebuild = true; } @@ -1563,7 +1772,7 @@ resize_fail: */ static int calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix, bool *rebuild, - uint16_t **refcount_table, int64_t *nb_clusters) + void **refcount_table, int64_t *nb_clusters) { BDRVQcowState *s = bs->opaque; int64_t i; @@ -1571,10 +1780,12 @@ static int calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res, int ret; if (!*refcount_table) { - *refcount_table = g_try_new0(uint16_t, *nb_clusters); - if (*nb_clusters && *refcount_table == NULL) { + int64_t old_size = 0; + ret = realloc_refcount_array(s, 
refcount_table, + &old_size, *nb_clusters); + if (ret < 0) { res->check_errors++; - return -ENOMEM; + return ret; } } @@ -1625,22 +1836,23 @@ static int calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res, static void compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix, bool *rebuild, int64_t *highest_cluster, - uint16_t *refcount_table, int64_t nb_clusters) + void *refcount_table, int64_t nb_clusters) { BDRVQcowState *s = bs->opaque; int64_t i; - int refcount1, refcount2, ret; + uint64_t refcount1, refcount2; + int ret; for (i = 0, *highest_cluster = 0; i < nb_clusters; i++) { - refcount1 = qcow2_get_refcount(bs, i); - if (refcount1 < 0) { + ret = qcow2_get_refcount(bs, i, &refcount1); + if (ret < 0) { fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n", - i, strerror(-refcount1)); + i, strerror(-ret)); res->check_errors++; continue; } - refcount2 = refcount_table[i]; + refcount2 = s->get_refcount(refcount_table, i); if (refcount1 > 0 || refcount2 > 0) { *highest_cluster = i; @@ -1657,7 +1869,8 @@ static void compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res, num_fixed = &res->corruptions_fixed; } - fprintf(stderr, "%s cluster %" PRId64 " refcount=%d reference=%d\n", + fprintf(stderr, "%s cluster %" PRId64 " refcount=%" PRIu64 + " reference=%" PRIu64 "\n", num_fixed != NULL ? "Repairing" : refcount1 < refcount2 ? 
"ERROR" : "Leaked", @@ -1665,7 +1878,8 @@ static void compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res, if (num_fixed) { ret = update_refcount(bs, i << s->cluster_bits, 1, - refcount2 - refcount1, + refcount_diff(refcount1, refcount2), + refcount1 > refcount2, QCOW2_DISCARD_ALWAYS); if (ret >= 0) { (*num_fixed)++; @@ -1697,7 +1911,7 @@ static void compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res, */ static int64_t alloc_clusters_imrt(BlockDriverState *bs, int cluster_count, - uint16_t **refcount_table, + void **refcount_table, int64_t *imrt_nb_clusters, int64_t *first_free_cluster) { @@ -1705,6 +1919,7 @@ static int64_t alloc_clusters_imrt(BlockDriverState *bs, int64_t cluster = *first_free_cluster, i; bool first_gap = true; int contiguous_free_clusters; + int ret; /* Starting at *first_free_cluster, find a range of at least cluster_count * continuously free clusters */ @@ -1713,7 +1928,7 @@ static int64_t alloc_clusters_imrt(BlockDriverState *bs, contiguous_free_clusters < cluster_count; cluster++) { - if (!(*refcount_table)[cluster]) { + if (!s->get_refcount(*refcount_table, cluster)) { contiguous_free_clusters++; if (first_gap) { /* If this is the first free cluster found, update @@ -1734,34 +1949,24 @@ static int64_t alloc_clusters_imrt(BlockDriverState *bs, /* If no such range could be found, grow the in-memory refcount table * accordingly to append free clusters at the end of the image */ if (contiguous_free_clusters < cluster_count) { - int64_t old_imrt_nb_clusters = *imrt_nb_clusters; - uint16_t *new_refcount_table; - /* contiguous_free_clusters clusters are already empty at the image end; * we need cluster_count clusters; therefore, we have to allocate * cluster_count - contiguous_free_clusters new clusters at the end of * the image (which is the current value of cluster; note that cluster * may exceed old_imrt_nb_clusters if *first_free_cluster pointed beyond * the image end) */ - *imrt_nb_clusters = cluster + cluster_count - 
contiguous_free_clusters; - new_refcount_table = g_try_realloc(*refcount_table, - *imrt_nb_clusters * - sizeof(**refcount_table)); - if (!new_refcount_table) { - *imrt_nb_clusters = old_imrt_nb_clusters; - return -ENOMEM; + ret = realloc_refcount_array(s, refcount_table, imrt_nb_clusters, + cluster + cluster_count + - contiguous_free_clusters); + if (ret < 0) { + return ret; } - *refcount_table = new_refcount_table; - - memset(*refcount_table + old_imrt_nb_clusters, 0, - (*imrt_nb_clusters - old_imrt_nb_clusters) * - sizeof(**refcount_table)); } /* Go back to the first free cluster */ cluster -= contiguous_free_clusters; for (i = 0; i < cluster_count; i++) { - (*refcount_table)[cluster + i] = 1; + s->set_refcount(*refcount_table, cluster + i, 1); } return cluster << s->cluster_bits; @@ -1777,7 +1982,7 @@ static int64_t alloc_clusters_imrt(BlockDriverState *bs, */ static int rebuild_refcount_structure(BlockDriverState *bs, BdrvCheckResult *res, - uint16_t **refcount_table, + void **refcount_table, int64_t *nb_clusters) { BDRVQcowState *s = bs->opaque; @@ -1785,8 +1990,8 @@ static int rebuild_refcount_structure(BlockDriverState *bs, int64_t refblock_offset, refblock_start, refblock_index; uint32_t reftable_size = 0; uint64_t *on_disk_reftable = NULL; - uint16_t *on_disk_refblock; - int i, ret = 0; + void *on_disk_refblock; + int ret = 0; struct { uint64_t reftable_offset; uint32_t reftable_clusters; @@ -1796,7 +2001,7 @@ static int rebuild_refcount_structure(BlockDriverState *bs, write_refblocks: for (; cluster < *nb_clusters; cluster++) { - if (!(*refcount_table)[cluster]) { + if (!s->get_refcount(*refcount_table, cluster)) { continue; } @@ -1869,17 +2074,13 @@ write_refblocks: goto fail; } - on_disk_refblock = qemu_blockalign0(bs->file, s->cluster_size); - for (i = 0; i < s->refcount_block_size && - refblock_start + i < *nb_clusters; i++) - { - on_disk_refblock[i] = - cpu_to_be16((*refcount_table)[refblock_start + i]); - } + /* The size of *refcount_table is always 
cluster-aligned, therefore the + * write operation will not overflow */ + on_disk_refblock = (void *)((char *) *refcount_table + + refblock_index * s->cluster_size); ret = bdrv_write(bs->file, refblock_offset / BDRV_SECTOR_SIZE, - (void *)on_disk_refblock, s->cluster_sectors); - qemu_vfree(on_disk_refblock); + on_disk_refblock, s->cluster_sectors); if (ret < 0) { fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret)); goto fail; @@ -1974,7 +2175,7 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, BDRVQcowState *s = bs->opaque; BdrvCheckResult pre_compare_res; int64_t size, highest_cluster, nb_clusters; - uint16_t *refcount_table = NULL; + void *refcount_table = NULL; bool rebuild = false; int ret; @@ -2023,7 +2224,7 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, /* Because the old reftable has been exchanged for a new one the * references have to be recalculated */ rebuild = false; - memset(refcount_table, 0, nb_clusters * sizeof(uint16_t)); + memset(refcount_table, 0, refcount_array_byte_size(s, nb_clusters)); ret = calculate_refcounts(bs, res, 0, &rebuild, &refcount_table, &nb_clusters); if (ret < 0) { diff --git a/block/qcow2.c b/block/qcow2.c index 50e0a947df..8bfb094e53 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -677,13 +677,16 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, } /* Check support for various header values */ - if (header.refcount_order != 4) { - report_unsupported(bs, errp, "%d bit reference counts", - 1 << header.refcount_order); - ret = -ENOTSUP; + if (header.refcount_order > 6) { + error_setg(errp, "Reference count entry width too large; may not " + "exceed 64 bits"); + ret = -EINVAL; goto fail; } s->refcount_order = header.refcount_order; + s->refcount_bits = 1 << s->refcount_order; + s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1); + s->refcount_max += s->refcount_max - 1; if (header.crypt_method > QCOW_CRYPT_AES) { error_setg(errp, 
"Unsupported encryption method: %" PRIu32, @@ -1780,7 +1783,7 @@ static int preallocate(BlockDriverState *bs) static int qcow2_create2(const char *filename, int64_t total_size, const char *backing_file, const char *backing_format, int flags, size_t cluster_size, PreallocMode prealloc, - QemuOpts *opts, int version, + QemuOpts *opts, int version, int refcount_order, Error **errp) { /* Calculate cluster_bits */ @@ -1813,9 +1816,21 @@ static int qcow2_create2(const char *filename, int64_t total_size, int ret; if (prealloc == PREALLOC_MODE_FULL || prealloc == PREALLOC_MODE_FALLOC) { + /* Note: The following calculation does not need to be exact; if it is a + * bit off, either some bytes will be "leaked" (which is fine) or we + * will need to increase the file size by some bytes (which is fine, + * too, as long as the bulk is allocated here). Therefore, using + * floating point arithmetic is fine. */ int64_t meta_size = 0; uint64_t nreftablee, nrefblocke, nl1e, nl2e; int64_t aligned_total_size = align_offset(total_size, cluster_size); + int refblock_bits, refblock_size; + /* refcount entry size in bytes */ + double rces = (1 << refcount_order) / 8.; + + /* see qcow2_open() */ + refblock_bits = cluster_bits - (refcount_order - 3); + refblock_size = 1 << refblock_bits; /* header: 1 cluster */ meta_size += cluster_size; @@ -1840,20 +1855,20 @@ static int qcow2_create2(const char *filename, int64_t total_size, * c = cluster size * y1 = number of refcount blocks entries * y2 = meta size including everything + * rces = refcount entry size in bytes * then, * y1 = (y2 + a)/c - * y2 = y1 * sizeof(u16) + y1 * sizeof(u16) * sizeof(u64) / c + m + * y2 = y1 * rces + y1 * rces * sizeof(u64) / c + m * we can get y1: - * y1 = (a + m) / (c - sizeof(u16) - sizeof(u16) * sizeof(u64) / c) + * y1 = (a + m) / (c - rces - rces * sizeof(u64) / c) */ - nrefblocke = (aligned_total_size + meta_size + cluster_size) / - (cluster_size - sizeof(uint16_t) - - 1.0 * sizeof(uint16_t) * sizeof(uint64_t) 
/ cluster_size); - nrefblocke = align_offset(nrefblocke, cluster_size / sizeof(uint16_t)); - meta_size += nrefblocke * sizeof(uint16_t); + nrefblocke = (aligned_total_size + meta_size + cluster_size) + / (cluster_size - rces - rces * sizeof(uint64_t) + / cluster_size); + meta_size += DIV_ROUND_UP(nrefblocke, refblock_size) * cluster_size; /* total size of refcount tables */ - nreftablee = nrefblocke * sizeof(uint16_t) / cluster_size; + nreftablee = nrefblocke / refblock_size; nreftablee = align_offset(nreftablee, cluster_size / sizeof(uint64_t)); meta_size += nreftablee * sizeof(uint64_t); @@ -1889,7 +1904,7 @@ static int qcow2_create2(const char *filename, int64_t total_size, .l1_size = cpu_to_be32(0), .refcount_table_offset = cpu_to_be64(cluster_size), .refcount_table_clusters = cpu_to_be32(1), - .refcount_order = cpu_to_be32(4), + .refcount_order = cpu_to_be32(refcount_order), .header_length = cpu_to_be32(sizeof(*header)), }; @@ -2008,6 +2023,8 @@ static int qcow2_create(const char *filename, QemuOpts *opts, Error **errp) size_t cluster_size = DEFAULT_CLUSTER_SIZE; PreallocMode prealloc; int version = 3; + uint64_t refcount_bits = 16; + int refcount_order; Error *local_err = NULL; int ret; @@ -2062,8 +2079,28 @@ static int qcow2_create(const char *filename, QemuOpts *opts, Error **errp) goto finish; } + refcount_bits = qemu_opt_get_number_del(opts, BLOCK_OPT_REFCOUNT_BITS, + refcount_bits); + if (refcount_bits > 64 || !is_power_of_2(refcount_bits)) { + error_setg(errp, "Refcount width must be a power of two and may not " + "exceed 64 bits"); + ret = -EINVAL; + goto finish; + } + + if (version < 3 && refcount_bits != 16) { + error_setg(errp, "Different refcount widths than 16 bits require " + "compatibility level 1.1 or above (use compat=1.1 or " + "greater)"); + ret = -EINVAL; + goto finish; + } + + refcount_order = ffs(refcount_bits) - 1; + ret = qcow2_create2(filename, size, backing_file, backing_fmt, flags, - cluster_size, prealloc, opts, version, 
&local_err); + cluster_size, prealloc, opts, version, refcount_order, + &local_err); if (local_err) { error_propagate(errp, local_err); } @@ -2479,7 +2516,8 @@ static ImageInfoSpecific *qcow2_get_specific_info(BlockDriverState *bs) }; if (s->qcow_version == 2) { *spec_info->qcow2 = (ImageInfoSpecificQCow2){ - .compat = g_strdup("0.10"), + .compat = g_strdup("0.10"), + .refcount_bits = s->refcount_bits, }; } else if (s->qcow_version == 3) { *spec_info->qcow2 = (ImageInfoSpecificQCow2){ @@ -2490,6 +2528,7 @@ static ImageInfoSpecific *qcow2_get_specific_info(BlockDriverState *bs) .corrupt = s->incompatible_features & QCOW2_INCOMPAT_CORRUPT, .has_corrupt = true, + .refcount_bits = s->refcount_bits, }; } @@ -2642,8 +2681,8 @@ static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts, continue; } - if (!strcmp(desc->name, "compat")) { - compat = qemu_opt_get(opts, "compat"); + if (!strcmp(desc->name, BLOCK_OPT_COMPAT_LEVEL)) { + compat = qemu_opt_get(opts, BLOCK_OPT_COMPAT_LEVEL); if (!compat) { /* preserve default */ } else if (!strcmp(compat, "0.10")) { @@ -2654,33 +2693,37 @@ static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts, fprintf(stderr, "Unknown compatibility level %s.\n", compat); return -EINVAL; } - } else if (!strcmp(desc->name, "preallocation")) { + } else if (!strcmp(desc->name, BLOCK_OPT_PREALLOC)) { fprintf(stderr, "Cannot change preallocation mode.\n"); return -ENOTSUP; - } else if (!strcmp(desc->name, "size")) { - new_size = qemu_opt_get_size(opts, "size", 0); - } else if (!strcmp(desc->name, "backing_file")) { - backing_file = qemu_opt_get(opts, "backing_file"); - } else if (!strcmp(desc->name, "backing_fmt")) { - backing_format = qemu_opt_get(opts, "backing_fmt"); - } else if (!strcmp(desc->name, "encryption")) { - encrypt = qemu_opt_get_bool(opts, "encryption", s->crypt_method); + } else if (!strcmp(desc->name, BLOCK_OPT_SIZE)) { + new_size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0); + } else if (!strcmp(desc->name, 
BLOCK_OPT_BACKING_FILE)) { + backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE); + } else if (!strcmp(desc->name, BLOCK_OPT_BACKING_FMT)) { + backing_format = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT); + } else if (!strcmp(desc->name, BLOCK_OPT_ENCRYPT)) { + encrypt = qemu_opt_get_bool(opts, BLOCK_OPT_ENCRYPT, + s->crypt_method); if (encrypt != !!s->crypt_method) { fprintf(stderr, "Changing the encryption flag is not " "supported.\n"); return -ENOTSUP; } - } else if (!strcmp(desc->name, "cluster_size")) { - cluster_size = qemu_opt_get_size(opts, "cluster_size", + } else if (!strcmp(desc->name, BLOCK_OPT_CLUSTER_SIZE)) { + cluster_size = qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, cluster_size); if (cluster_size != s->cluster_size) { fprintf(stderr, "Changing the cluster size is not " "supported.\n"); return -ENOTSUP; } - } else if (!strcmp(desc->name, "lazy_refcounts")) { - lazy_refcounts = qemu_opt_get_bool(opts, "lazy_refcounts", + } else if (!strcmp(desc->name, BLOCK_OPT_LAZY_REFCOUNTS)) { + lazy_refcounts = qemu_opt_get_bool(opts, BLOCK_OPT_LAZY_REFCOUNTS, lazy_refcounts); + } else if (!strcmp(desc->name, BLOCK_OPT_REFCOUNT_BITS)) { + error_report("Cannot change refcount entry width"); + return -ENOTSUP; } else { /* if this assertion fails, this probably means a new option was * added without having it covered here */ @@ -2850,6 +2893,12 @@ static QemuOptsList qcow2_create_opts = { .help = "Postpone refcount updates", .def_value_str = "off" }, + { + .name = BLOCK_OPT_REFCOUNT_BITS, + .type = QEMU_OPT_NUMBER, + .help = "Width of a reference count entry in bits", + .def_value_str = "16" + }, { /* end of list */ } } }; diff --git a/block/qcow2.h b/block/qcow2.h index 6e39a1b639..aa6d367818 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -213,6 +213,11 @@ typedef struct Qcow2DiscardRegion { QTAILQ_ENTRY(Qcow2DiscardRegion) next; } Qcow2DiscardRegion; +typedef uint64_t Qcow2GetRefcountFunc(const void *refcount_array, + uint64_t index); +typedef void 
Qcow2SetRefcountFunc(void *refcount_array, + uint64_t index, uint64_t value); + typedef struct BDRVQcowState { int cluster_bits; int cluster_size; @@ -258,6 +263,11 @@ typedef struct BDRVQcowState { int qcow_version; bool use_lazy_refcounts; int refcount_order; + int refcount_bits; + uint64_t refcount_max; + + Qcow2GetRefcountFunc *get_refcount; + Qcow2SetRefcountFunc *set_refcount; bool discard_passthrough[QCOW2_DISCARD_MAX]; @@ -275,17 +285,6 @@ typedef struct BDRVQcowState { bool cache_discards; } BDRVQcowState; -/* XXX: use std qcow open function ? */ -typedef struct QCowCreateState { - int cluster_size; - int cluster_bits; - uint16_t *refcount_block; - uint64_t *refcount_table; - int64_t l1_table_offset; - int64_t refcount_table_offset; - int64_t refcount_block_offset; -} QCowCreateState; - struct QCowAIOCB; typedef struct Qcow2COWRegion { @@ -468,6 +467,11 @@ static inline uint64_t l2meta_cow_end(QCowL2Meta *m) + (m->cow_end.nb_sectors << BDRV_SECTOR_BITS); } +static inline uint64_t refcount_diff(uint64_t r1, uint64_t r2) +{ + return r1 > r2 ? 
r1 - r2 : r2 - r1; +} + // FIXME Need qcow2_ prefix to global functions /* qcow2.c functions */ @@ -487,10 +491,12 @@ void qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset, int qcow2_refcount_init(BlockDriverState *bs); void qcow2_refcount_close(BlockDriverState *bs); -int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index); +int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index, + uint64_t *refcount); int qcow2_update_cluster_refcount(BlockDriverState *bs, int64_t cluster_index, - int addend, enum qcow2_discard_type type); + uint64_t addend, bool decrease, + enum qcow2_discard_type type); int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size); int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, diff --git a/block/raw-posix.c b/block/raw-posix.c index b5f077a8f1..f0b4488d1e 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -56,6 +56,10 @@ #include <linux/cdrom.h> #include <linux/fd.h> #include <linux/fs.h> +#include <linux/hdreg.h> +#ifdef __s390__ +#include <asm/dasd.h> +#endif #ifndef FS_NOCOW_FL #define FS_NOCOW_FL 0x00800000 /* Do not cow file */ #endif @@ -218,39 +222,100 @@ static int raw_normalize_devicepath(const char **filename) } #endif -static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp) +/* + * Get logical block size via ioctl. On success store it in @sector_size_p. + */ +static int probe_logical_blocksize(int fd, unsigned int *sector_size_p) { - BDRVRawState *s = bs->opaque; - char *buf; unsigned int sector_size; + bool success = false; - /* For /dev/sg devices the alignment is not really used. - With buffered I/O, we don't have any restrictions. 
*/ - if (bs->sg || !s->needs_alignment) { - bs->request_alignment = 1; - s->buf_align = 1; - return; - } + errno = ENOTSUP; /* Try a few ioctls to get the right size */ - bs->request_alignment = 0; - s->buf_align = 0; - #ifdef BLKSSZGET if (ioctl(fd, BLKSSZGET, &sector_size) >= 0) { - bs->request_alignment = sector_size; + *sector_size_p = sector_size; + success = true; } #endif #ifdef DKIOCGETBLOCKSIZE if (ioctl(fd, DKIOCGETBLOCKSIZE, &sector_size) >= 0) { - bs->request_alignment = sector_size; + *sector_size_p = sector_size; + success = true; } #endif #ifdef DIOCGSECTORSIZE if (ioctl(fd, DIOCGSECTORSIZE, &sector_size) >= 0) { - bs->request_alignment = sector_size; + *sector_size_p = sector_size; + success = true; + } +#endif + + return success ? 0 : -errno; +} + +/** + * Get physical block size of @fd. + * On success, store it in @blk_size and return 0. + * On failure, return -errno. + */ +static int probe_physical_blocksize(int fd, unsigned int *blk_size) +{ +#ifdef BLKPBSZGET + if (ioctl(fd, BLKPBSZGET, blk_size) < 0) { + return -errno; + } + return 0; +#else + return -ENOTSUP; +#endif +} + +/* Check if read is allowed with given memory buffer and length. + * + * This function is used to check O_DIRECT memory buffer and request alignment. + */ +static bool raw_is_io_aligned(int fd, void *buf, size_t len) +{ + ssize_t ret = pread(fd, buf, len, 0); + + if (ret >= 0) { + return true; + } + +#ifdef __linux__ + /* The Linux kernel returns EINVAL for misaligned O_DIRECT reads. Ignore + * other errors (e.g. real I/O error), which could happen on a failed + * drive, since we only care about probing alignment. + */ + if (errno != EINVAL) { + return true; } #endif + + return false; +} + +static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp) +{ + BDRVRawState *s = bs->opaque; + char *buf; + + /* For /dev/sg devices the alignment is not really used. + With buffered I/O, we don't have any restrictions. 
*/ + if (bs->sg || !s->needs_alignment) { + bs->request_alignment = 1; + s->buf_align = 1; + return; + } + + bs->request_alignment = 0; + s->buf_align = 0; + /* Let's try to use the logical blocksize for the alignment. */ + if (probe_logical_blocksize(fd, &bs->request_alignment) < 0) { + bs->request_alignment = 0; + } #ifdef CONFIG_XFS if (s->is_xfs) { struct dioattr da; @@ -267,7 +332,7 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp) size_t align; buf = qemu_memalign(MAX_BLOCKSIZE, 2 * MAX_BLOCKSIZE); for (align = 512; align <= MAX_BLOCKSIZE; align <<= 1) { - if (pread(fd, buf + align, MAX_BLOCKSIZE, 0) >= 0) { + if (raw_is_io_aligned(fd, buf + align, MAX_BLOCKSIZE)) { s->buf_align = align; break; } @@ -279,7 +344,7 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp) size_t align; buf = qemu_memalign(s->buf_align, MAX_BLOCKSIZE); for (align = 512; align <= MAX_BLOCKSIZE; align <<= 1) { - if (pread(fd, buf, align, 0) >= 0) { + if (raw_is_io_aligned(fd, buf, align)) { bs->request_alignment = align; break; } @@ -655,6 +720,86 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp) bs->bl.opt_mem_alignment = s->buf_align; } +static int check_for_dasd(int fd) +{ +#ifdef BIODASDINFO2 + struct dasd_information2_t info = {0}; + + return ioctl(fd, BIODASDINFO2, &info); +#else + return -1; +#endif +} + +/** + * Try to get @bs's logical and physical block size. + * On success, store them in @bsz and return zero. + * On failure, return negative errno. + */ +static int hdev_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz) +{ + BDRVRawState *s = bs->opaque; + int ret; + + /* If DASD, get blocksizes */ + if (check_for_dasd(s->fd) < 0) { + return -ENOTSUP; + } + ret = probe_logical_blocksize(s->fd, &bsz->log); + if (ret < 0) { + return ret; + } + return probe_physical_blocksize(s->fd, &bsz->phys); +} + +/** + * Try to get @bs's geometry: cyls, heads, sectors. 
+ * On success, store them in @geo and return 0. + * On failure return -errno. + * (Allows block driver to assign default geometry values that guest sees) + */ +#ifdef __linux__ +static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo) +{ + BDRVRawState *s = bs->opaque; + struct hd_geometry ioctl_geo = {0}; + uint32_t blksize; + + /* If DASD, get its geometry */ + if (check_for_dasd(s->fd) < 0) { + return -ENOTSUP; + } + if (ioctl(s->fd, HDIO_GETGEO, &ioctl_geo) < 0) { + return -errno; + } + /* HDIO_GETGEO may return success even though geo contains zeros + (e.g. certain multipath setups) */ + if (!ioctl_geo.heads || !ioctl_geo.sectors || !ioctl_geo.cylinders) { + return -ENOTSUP; + } + /* Do not return a geometry for partition */ + if (ioctl_geo.start != 0) { + return -ENOTSUP; + } + geo->heads = ioctl_geo.heads; + geo->sectors = ioctl_geo.sectors; + if (!probe_physical_blocksize(s->fd, &blksize)) { + /* overwrite cyls: HDIO_GETGEO result is incorrect for big drives */ + geo->cylinders = bdrv_nb_sectors(bs) / (blksize / BDRV_SECTOR_SIZE) + / (geo->heads * geo->sectors); + return 0; + } + geo->cylinders = ioctl_geo.cylinders; + + return 0; +} +#else /* __linux__ */ +static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo) +{ + return -ENOTSUP; +} +#endif + static ssize_t handle_aiocb_ioctl(RawPosixAIOData *aiocb) { int ret; @@ -944,7 +1089,9 @@ static ssize_t handle_aiocb_write_zeroes_block(RawPosixAIOData *aiocb) static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb) { +#if defined(CONFIG_FALLOCATE) || defined(CONFIG_XFS) BDRVRawState *s = aiocb->bs->opaque; +#endif if (aiocb->aio_type & QEMU_AIO_BLKDEV) { return handle_aiocb_write_zeroes_block(aiocb); @@ -2194,6 +2341,8 @@ static BlockDriver bdrv_host_device = { .bdrv_get_info = raw_get_info, .bdrv_get_allocated_file_size = raw_get_allocated_file_size, + .bdrv_probe_blocksizes = hdev_probe_blocksizes, + .bdrv_probe_geometry = hdev_probe_geometry, .bdrv_detach_aio_context = 
raw_detach_aio_context, .bdrv_attach_aio_context = raw_attach_aio_context, diff --git a/block/raw_bsd.c b/block/raw_bsd.c index 05b02c76d4..e3d2d04681 100644 --- a/block/raw_bsd.c +++ b/block/raw_bsd.c @@ -235,6 +235,16 @@ static int raw_probe(const uint8_t *buf, int buf_size, const char *filename) return 1; } +static int raw_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz) +{ + return bdrv_probe_blocksizes(bs->file, bsz); +} + +static int raw_probe_geometry(BlockDriverState *bs, HDGeometry *geo) +{ + return bdrv_probe_geometry(bs->file, geo); +} + BlockDriver bdrv_raw = { .format_name = "raw", .bdrv_probe = &raw_probe, @@ -252,6 +262,8 @@ BlockDriver bdrv_raw = { .has_variable_length = true, .bdrv_get_info = &raw_get_info, .bdrv_refresh_limits = &raw_refresh_limits, + .bdrv_probe_blocksizes = &raw_probe_blocksizes, + .bdrv_probe_geometry = &raw_probe_geometry, .bdrv_is_inserted = &raw_is_inserted, .bdrv_media_changed = &raw_media_changed, .bdrv_eject = &raw_eject, diff --git a/block/sheepdog.c b/block/sheepdog.c index d17ee360c5..c14172cfa6 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -37,6 +37,7 @@ #define SD_OP_READ_VDIS 0x15 #define SD_OP_FLUSH_VDI 0x16 #define SD_OP_DEL_VDI 0x17 +#define SD_OP_GET_CLUSTER_DEFAULT 0x18 #define SD_FLAG_CMD_WRITE 0x01 #define SD_FLAG_CMD_COW 0x02 @@ -91,6 +92,7 @@ #define SD_NR_VDIS (1U << 24) #define SD_DATA_OBJ_SIZE (UINT64_C(1) << 22) #define SD_MAX_VDI_SIZE (SD_DATA_OBJ_SIZE * MAX_DATA_OBJS) +#define SD_DEFAULT_BLOCK_SIZE_SHIFT 22 /* * For erasure coding, we use at most SD_EC_MAX_STRIP for data strips and * (SD_EC_MAX_STRIP - 1) for parity strips @@ -167,7 +169,8 @@ typedef struct SheepdogVdiReq { uint32_t base_vdi_id; uint8_t copies; uint8_t copy_policy; - uint8_t reserved[2]; + uint8_t store_policy; + uint8_t block_size_shift; uint32_t snapid; uint32_t type; uint32_t pad[2]; @@ -186,6 +189,21 @@ typedef struct SheepdogVdiRsp { uint32_t pad[5]; } SheepdogVdiRsp; +typedef struct SheepdogClusterRsp { + 
uint8_t proto_ver; + uint8_t opcode; + uint16_t flags; + uint32_t epoch; + uint32_t id; + uint32_t data_length; + uint32_t result; + uint8_t nr_copies; + uint8_t copy_policy; + uint8_t block_size_shift; + uint8_t __pad1; + uint32_t __pad2[6]; +} SheepdogClusterRsp; + typedef struct SheepdogInode { char name[SD_MAX_VDI_LEN]; char tag[SD_MAX_VDI_TAG_LEN]; @@ -527,6 +545,7 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov, return acb; } +/* Return -EIO in case of error, file descriptor on success */ static int connect_to_sdog(BDRVSheepdogState *s, Error **errp) { int fd; @@ -546,11 +565,14 @@ static int connect_to_sdog(BDRVSheepdogState *s, Error **errp) if (fd >= 0) { qemu_set_nonblock(fd); + } else { + fd = -EIO; } return fd; } +/* Return 0 on success and -errno in case of error */ static coroutine_fn int send_co_req(int sockfd, SheepdogReq *hdr, void *data, unsigned int *wlen) { @@ -559,11 +581,13 @@ static coroutine_fn int send_co_req(int sockfd, SheepdogReq *hdr, void *data, ret = qemu_co_send(sockfd, hdr, sizeof(*hdr)); if (ret != sizeof(*hdr)) { error_report("failed to send a req, %s", strerror(errno)); + ret = -socket_error(); return ret; } ret = qemu_co_send(sockfd, data, *wlen); if (ret != *wlen) { + ret = -socket_error(); error_report("failed to send a req, %s", strerror(errno)); } @@ -638,6 +662,11 @@ out: srco->finished = true; } +/* + * Send the request to the sheep in a synchronous manner. + * + * Return 0 on success, -errno in case of error. 
+ */ static int do_req(int sockfd, AioContext *aio_context, SheepdogReq *hdr, void *data, unsigned int *wlen, unsigned int *rlen) { @@ -1541,6 +1570,7 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot, hdr.vdi_size = s->inode.vdi_size; hdr.copy_policy = s->inode.copy_policy; hdr.copies = s->inode.nr_copies; + hdr.block_size_shift = s->inode.block_size_shift; ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen); @@ -1566,9 +1596,12 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot, static int sd_prealloc(const char *filename, Error **errp) { BlockDriverState *bs = NULL; + BDRVSheepdogState *base = NULL; + unsigned long buf_size; uint32_t idx, max_idx; + uint32_t object_size; int64_t vdi_size; - void *buf = g_malloc0(SD_DATA_OBJ_SIZE); + void *buf = NULL; int ret; ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, @@ -1582,18 +1615,24 @@ static int sd_prealloc(const char *filename, Error **errp) ret = vdi_size; goto out; } - max_idx = DIV_ROUND_UP(vdi_size, SD_DATA_OBJ_SIZE); + + base = bs->opaque; + object_size = (UINT32_C(1) << base->inode.block_size_shift); + buf_size = MIN(object_size, SD_DATA_OBJ_SIZE); + buf = g_malloc0(buf_size); + + max_idx = DIV_ROUND_UP(vdi_size, buf_size); for (idx = 0; idx < max_idx; idx++) { /* * The created image can be a cloned image, so we need to read * a data from the source image. 
*/ - ret = bdrv_pread(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE); + ret = bdrv_pread(bs, idx * buf_size, buf, buf_size); if (ret < 0) { goto out; } - ret = bdrv_pwrite(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE); + ret = bdrv_pwrite(bs, idx * buf_size, buf, buf_size); if (ret < 0) { goto out; } @@ -1666,6 +1705,27 @@ static int parse_redundancy(BDRVSheepdogState *s, const char *opt) return 0; } +static int parse_block_size_shift(BDRVSheepdogState *s, QemuOpts *opt) +{ + struct SheepdogInode *inode = &s->inode; + uint64_t object_size; + int obj_order; + + object_size = qemu_opt_get_size_del(opt, BLOCK_OPT_OBJECT_SIZE, 0); + if (object_size) { + if ((object_size - 1) & object_size) { /* not a power of 2? */ + return -EINVAL; + } + obj_order = ffs(object_size) - 1; + if (obj_order < 20 || obj_order > 31) { + return -EINVAL; + } + inode->block_size_shift = (uint8_t)obj_order; + } + + return 0; +} + static int sd_create(const char *filename, QemuOpts *opts, Error **errp) { @@ -1676,6 +1736,7 @@ static int sd_create(const char *filename, QemuOpts *opts, BDRVSheepdogState *s; char tag[SD_MAX_VDI_TAG_LEN]; uint32_t snapid; + uint64_t max_vdi_size; bool prealloc = false; s = g_new0(BDRVSheepdogState, 1); @@ -1714,10 +1775,11 @@ static int sd_create(const char *filename, QemuOpts *opts, goto out; } } - - if (s->inode.vdi_size > SD_MAX_VDI_SIZE) { - error_setg(errp, "too big image size"); - ret = -EINVAL; + ret = parse_block_size_shift(s, opts); + if (ret < 0) { + error_setg(errp, "Invalid object_size." 
+ " obect_size needs to be power of 2" + " and be limited from 2^20 to 2^31"); goto out; } @@ -1754,6 +1816,51 @@ static int sd_create(const char *filename, QemuOpts *opts, } s->aio_context = qemu_get_aio_context(); + + /* if block_size_shift is not specified, get cluster default value */ + if (s->inode.block_size_shift == 0) { + SheepdogVdiReq hdr; + SheepdogClusterRsp *rsp = (SheepdogClusterRsp *)&hdr; + Error *local_err = NULL; + int fd; + unsigned int wlen = 0, rlen = 0; + + fd = connect_to_sdog(s, &local_err); + if (fd < 0) { + error_report("%s", error_get_pretty(local_err)); + error_free(local_err); + ret = -EIO; + goto out; + } + + memset(&hdr, 0, sizeof(hdr)); + hdr.opcode = SD_OP_GET_CLUSTER_DEFAULT; + hdr.proto_ver = SD_PROTO_VER; + + ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, + NULL, &wlen, &rlen); + closesocket(fd); + if (ret) { + error_setg_errno(errp, -ret, "failed to get cluster default"); + goto out; + } + if (rsp->result == SD_RES_SUCCESS) { + s->inode.block_size_shift = rsp->block_size_shift; + } else { + s->inode.block_size_shift = SD_DEFAULT_BLOCK_SIZE_SHIFT; + } + } + + max_vdi_size = (UINT64_C(1) << s->inode.block_size_shift) * MAX_DATA_OBJS; + + if (s->inode.vdi_size > max_vdi_size) { + error_setg(errp, "An image is too large." 
+ " The maximum image size is %"PRIu64 "GB", + max_vdi_size / 1024 / 1024 / 1024); + ret = -EINVAL; + goto out; + } + ret = do_sd_create(s, &vid, 0, errp); if (ret) { goto out; @@ -1823,11 +1930,13 @@ static int sd_truncate(BlockDriverState *bs, int64_t offset) BDRVSheepdogState *s = bs->opaque; int ret, fd; unsigned int datalen; + uint64_t max_vdi_size; + max_vdi_size = (UINT64_C(1) << s->inode.block_size_shift) * MAX_DATA_OBJS; if (offset < s->inode.vdi_size) { error_report("shrinking is not supported"); return -EINVAL; - } else if (offset > SD_MAX_VDI_SIZE) { + } else if (offset > max_vdi_size) { error_report("too big image size"); return -EINVAL; } @@ -2005,9 +2114,10 @@ static int coroutine_fn sd_co_rw_vector(void *p) SheepdogAIOCB *acb = p; int ret = 0; unsigned long len, done = 0, total = acb->nb_sectors * BDRV_SECTOR_SIZE; - unsigned long idx = acb->sector_num * BDRV_SECTOR_SIZE / SD_DATA_OBJ_SIZE; + unsigned long idx; + uint32_t object_size; uint64_t oid; - uint64_t offset = (acb->sector_num * BDRV_SECTOR_SIZE) % SD_DATA_OBJ_SIZE; + uint64_t offset; BDRVSheepdogState *s = acb->common.bs->opaque; SheepdogInode *inode = &s->inode; AIOReq *aio_req; @@ -2024,6 +2134,10 @@ static int coroutine_fn sd_co_rw_vector(void *p) } } + object_size = (UINT32_C(1) << inode->block_size_shift); + idx = acb->sector_num * BDRV_SECTOR_SIZE / object_size; + offset = (acb->sector_num * BDRV_SECTOR_SIZE) % object_size; + /* * Make sure we don't free the aiocb before we are done with all requests. * This additional reference is dropped at the end of this function. 
@@ -2037,7 +2151,7 @@ static int coroutine_fn sd_co_rw_vector(void *p) oid = vid_to_data_oid(inode->data_vdi_id[idx], idx); - len = MIN(total - done, SD_DATA_OBJ_SIZE - offset); + len = MIN(total - done, object_size - offset); switch (acb->aiocb_type) { case AIOCB_READ_UDATA: @@ -2061,7 +2175,7 @@ static int coroutine_fn sd_co_rw_vector(void *p) * We discard the object only when the whole object is * 1) allocated 2) trimmed. Otherwise, simply skip it. */ - if (len != SD_DATA_OBJ_SIZE || inode->data_vdi_id[idx] == 0) { + if (len != object_size || inode->data_vdi_id[idx] == 0) { goto done; } break; @@ -2225,9 +2339,8 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) ret = do_sd_create(s, &new_vid, 1, &local_err); if (ret < 0) { - error_report_err(local_err); - error_report("failed to create inode for snapshot. %s", - strerror(errno)); + error_report("failed to create inode for snapshot: %s", + error_get_pretty(local_err)); goto cleanup; } @@ -2414,6 +2527,7 @@ static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data, uint64_t offset; uint32_t vdi_index; uint32_t vdi_id = load ? 
s->inode.parent_vdi_id : s->inode.vdi_id; + uint32_t object_size = (UINT32_C(1) << s->inode.block_size_shift); fd = connect_to_sdog(s, &local_err); if (fd < 0) { @@ -2422,10 +2536,10 @@ static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data, } while (remaining) { - vdi_index = pos / SD_DATA_OBJ_SIZE; - offset = pos % SD_DATA_OBJ_SIZE; + vdi_index = pos / object_size; + offset = pos % object_size; - data_len = MIN(remaining, SD_DATA_OBJ_SIZE - offset); + data_len = MIN(remaining, object_size - offset); vmstate_oid = vid_to_vmstate_oid(vdi_id, vdi_index); @@ -2512,10 +2626,11 @@ sd_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors, { BDRVSheepdogState *s = bs->opaque; SheepdogInode *inode = &s->inode; + uint32_t object_size = (UINT32_C(1) << inode->block_size_shift); uint64_t offset = sector_num * BDRV_SECTOR_SIZE; - unsigned long start = offset / SD_DATA_OBJ_SIZE, + unsigned long start = offset / object_size, end = DIV_ROUND_UP((sector_num + nb_sectors) * - BDRV_SECTOR_SIZE, SD_DATA_OBJ_SIZE); + BDRV_SECTOR_SIZE, object_size); unsigned long idx; int64_t ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset; @@ -2534,7 +2649,7 @@ sd_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors, } } - *pnum = (idx - start) * SD_DATA_OBJ_SIZE / BDRV_SECTOR_SIZE; + *pnum = (idx - start) * object_size / BDRV_SECTOR_SIZE; if (*pnum > nb_sectors) { *pnum = nb_sectors; } @@ -2545,14 +2660,15 @@ static int64_t sd_get_allocated_file_size(BlockDriverState *bs) { BDRVSheepdogState *s = bs->opaque; SheepdogInode *inode = &s->inode; - unsigned long i, last = DIV_ROUND_UP(inode->vdi_size, SD_DATA_OBJ_SIZE); + uint32_t object_size = (UINT32_C(1) << inode->block_size_shift); + unsigned long i, last = DIV_ROUND_UP(inode->vdi_size, object_size); uint64_t size = 0; for (i = 0; i < last; i++) { if (inode->data_vdi_id[i] == 0) { continue; } - size += SD_DATA_OBJ_SIZE; + size += object_size; } return size; } @@ -2581,6 +2697,11 @@ 
static QemuOptsList sd_create_opts = { .type = QEMU_OPT_STRING, .help = "Redundancy of the image" }, + { + .name = BLOCK_OPT_OBJECT_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Object size of the image" + }, { /* end of list */ } } }; diff --git a/block/vdi.c b/block/vdi.c index 74030c6e30..53bd02fe22 100644 --- a/block/vdi.c +++ b/block/vdi.c @@ -53,6 +53,7 @@ #include "block/block_int.h" #include "qemu/module.h" #include "migration/migration.h" +#include "block/coroutine.h" #if defined(CONFIG_UUID) #include <uuid/uuid.h> @@ -196,6 +197,8 @@ typedef struct { /* VDI header (converted to host endianness). */ VdiHeader header; + CoMutex write_lock; + Error *migration_blocker; } BDRVVdiState; @@ -504,6 +507,8 @@ static int vdi_open(BlockDriverState *bs, QDict *options, int flags, "vdi", bdrv_get_device_name(bs), "live migration"); migrate_add_blocker(s->migration_blocker); + qemu_co_mutex_init(&s->write_lock); + return 0; fail_free_bmap: @@ -639,11 +644,31 @@ static int vdi_co_write(BlockDriverState *bs, buf, n_sectors * SECTOR_SIZE); memset(block + (sector_in_block + n_sectors) * SECTOR_SIZE, 0, (s->block_sectors - n_sectors - sector_in_block) * SECTOR_SIZE); + + /* Note that this coroutine does not yield anywhere from reading the + * bmap entry until here, so in regards to all the coroutines trying + * to write to this cluster, the one doing the allocation will + * always be the first to try to acquire the lock. + * Therefore, it is also the first that will actually be able to + * acquire the lock and thus the padded cluster is written before + * the other coroutines can write to the affected area. 
*/ + qemu_co_mutex_lock(&s->write_lock); ret = bdrv_write(bs->file, offset, block, s->block_sectors); + qemu_co_mutex_unlock(&s->write_lock); } else { uint64_t offset = s->header.offset_data / SECTOR_SIZE + (uint64_t)bmap_entry * s->block_sectors + sector_in_block; + qemu_co_mutex_lock(&s->write_lock); + /* This lock is only used to make sure the following write operation + * is executed after the write issued by the coroutine allocating + * this cluster, therefore we do not need to keep it locked. + * As stated above, the allocating coroutine will always try to lock + * the mutex before all the other concurrent accesses to that + * cluster, therefore at this point we can be absolutely certain + * that that write operation has returned (there may be other writes + * in flight, but they do not concern this very operation). */ + qemu_co_mutex_unlock(&s->write_lock); ret = bdrv_write(bs->file, offset, buf, n_sectors); } diff --git a/block/vpc.c b/block/vpc.c index 46803b14be..1533b6a64d 100644 --- a/block/vpc.c +++ b/block/vpc.c @@ -597,6 +597,51 @@ static coroutine_fn int vpc_co_write(BlockDriverState *bs, int64_t sector_num, return ret; } +static int64_t coroutine_fn vpc_co_get_block_status(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, int *pnum) +{ + BDRVVPCState *s = bs->opaque; + VHDFooter *footer = (VHDFooter*) s->footer_buf; + int64_t start, offset, next; + bool allocated; + int n; + + if (be32_to_cpu(footer->type) == VHD_FIXED) { + *pnum = nb_sectors; + return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID | BDRV_BLOCK_DATA | + (sector_num << BDRV_SECTOR_BITS); + } + + offset = get_sector_offset(bs, sector_num, 0); + start = offset; + allocated = (offset != -1); + *pnum = 0; + + do { + /* All sectors in a block are contiguous (without using the bitmap) */ + n = ROUND_UP(sector_num + 1, s->block_size / BDRV_SECTOR_SIZE) + - sector_num; + n = MIN(n, nb_sectors); + + *pnum += n; + sector_num += n; + nb_sectors -= n; + next = start + (*pnum * 
BDRV_SECTOR_SIZE); + + if (nb_sectors == 0) { + break; + } + + offset = get_sector_offset(bs, sector_num, 0); + } while ((allocated && offset == next) || (!allocated && offset == -1)); + + if (allocated) { + return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start; + } else { + return 0; + } +} + /* * Calculates the number of cylinders, heads and sectors per cylinder * based on a given number of sectors. This is the algorithm described @@ -801,6 +846,7 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp) } total_sectors = (int64_t) cyls * heads * secs_per_cyl; + total_size = total_sectors * BDRV_SECTOR_SIZE; /* Prepare the Hard Disk Footer */ memset(buf, 0, 1024); @@ -822,13 +868,8 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp) /* Version of Virtual PC 2007 */ footer->major = cpu_to_be16(0x0005); footer->minor = cpu_to_be16(0x0003); - if (disk_type == VHD_DYNAMIC) { - footer->orig_size = cpu_to_be64(total_sectors * 512); - footer->size = cpu_to_be64(total_sectors * 512); - } else { - footer->orig_size = cpu_to_be64(total_size); - footer->size = cpu_to_be64(total_size); - } + footer->orig_size = cpu_to_be64(total_size); + footer->size = cpu_to_be64(total_size); footer->cyls = cpu_to_be16(cyls); footer->heads = heads; footer->secs_per_cyl = secs_per_cyl; @@ -907,8 +948,9 @@ static BlockDriver bdrv_vpc = { .bdrv_reopen_prepare = vpc_reopen_prepare, .bdrv_create = vpc_create, - .bdrv_read = vpc_co_read, - .bdrv_write = vpc_co_write, + .bdrv_read = vpc_co_read, + .bdrv_write = vpc_co_write, + .bdrv_co_get_block_status = vpc_co_get_block_status, .bdrv_get_info = vpc_get_info, |