diff options
author | Peter Maydell <peter.maydell@linaro.org> | 2015-02-06 18:06:07 +0000 |
---|---|---|
committer | Peter Maydell <peter.maydell@linaro.org> | 2015-02-06 18:06:07 +0000 |
commit | 3d815ac82b0a3e816fd7a6d2561a73e780c3b685 (patch) | |
tree | bb5af8266e302cbe460b3e5de2cb261a877a6678 /block | |
parent | a2f2d288b5a06e6c680c387c9980d91363f59c61 (diff) | |
parent | 728dacbda817b2ca259e9d337fab06bcf14e94a6 (diff) |
Merge remote-tracking branch 'remotes/kevin/tags/for-upstream' into staging
Block patches for 2.3
# gpg: Signature made Fri 06 Feb 2015 17:14:10 GMT using RSA key ID C88F2FD6
# gpg: Good signature from "Kevin Wolf <kwolf@redhat.com>"
* remotes/kevin/tags/for-upstream: (47 commits)
block/raw-posix.c: Fix raw_getlength() on Mac OS X block devices
block: Eliminate silly QERR_ macros used for encryption keys
block: New bdrv_add_key(), convert monitor to use it
blockdev: Eliminate silly QERR_BLOCK_JOB_NOT_ACTIVE macro
blockdev: Give find_block_job() an Error ** parameter
qcow2: Rewrite qcow2_alloc_bytes()
block: Give always priority to unused entries in the qcow2 L2 cache
nbd: fix max_discard/max_transfer_length
block: introduce BDRV_REQUEST_MAX_SECTORS
nbd: Improve error messages
iotests: Fix 104 for NBD
iotests: Fix 100 for nbd
iotests: Fix 083
block: fix off-by-one error in qcow and qcow2
qemu-iotests: add 116 invalid QED input file tests
qed: check for header size overflow
block/dmg: improve zeroes handling
block/dmg: support bzip2 block entry types
block/dmg: factor out block type check
block/dmg: use SectorNumber from BLKX header
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Diffstat (limited to 'block')
-rw-r--r-- | block/Makefile.objs | 2 | ||||
-rw-r--r-- | block/accounting.c | 7 | ||||
-rw-r--r-- | block/block-backend.c | 5 | ||||
-rw-r--r-- | block/dmg.c | 502 | ||||
-rw-r--r-- | block/nbd-client.c | 4 | ||||
-rw-r--r-- | block/nbd-client.h | 2 | ||||
-rw-r--r-- | block/nbd.c | 11 | ||||
-rw-r--r-- | block/qapi.c | 5 | ||||
-rw-r--r-- | block/qcow.c | 2 | ||||
-rw-r--r-- | block/qcow2-cache.c | 4 | ||||
-rw-r--r-- | block/qcow2-refcount.c | 78 | ||||
-rw-r--r-- | block/qcow2.c | 2 | ||||
-rw-r--r-- | block/qed.c | 5 | ||||
-rw-r--r-- | block/qed.h | 1 | ||||
-rw-r--r-- | block/raw-posix.c | 140 | ||||
-rw-r--r-- | block/write-threshold.c | 125 |
16 files changed, 699 insertions, 196 deletions
diff --git a/block/Makefile.objs b/block/Makefile.objs index 04b0e43eb1..db2933e469 100644 --- a/block/Makefile.objs +++ b/block/Makefile.objs @@ -20,6 +20,7 @@ block-obj-$(CONFIG_GLUSTERFS) += gluster.o block-obj-$(CONFIG_ARCHIPELAGO) += archipelago.o block-obj-$(CONFIG_LIBSSH2) += ssh.o block-obj-y += accounting.o +block-obj-y += write-threshold.o common-obj-y += stream.o common-obj-y += commit.o @@ -36,5 +37,6 @@ gluster.o-libs := $(GLUSTERFS_LIBS) ssh.o-cflags := $(LIBSSH2_CFLAGS) ssh.o-libs := $(LIBSSH2_LIBS) archipelago.o-libs := $(ARCHIPELAGO_LIBS) +dmg.o-libs := $(BZIP2_LIBS) qcow.o-libs := -lz linux-aio.o-libs := -laio diff --git a/block/accounting.c b/block/accounting.c index 18102f015d..01d594ffdc 100644 --- a/block/accounting.c +++ b/block/accounting.c @@ -54,3 +54,10 @@ void block_acct_highest_sector(BlockAcctStats *stats, int64_t sector_num, stats->wr_highest_sector = sector_num + nb_sectors - 1; } } + +void block_acct_merge_done(BlockAcctStats *stats, enum BlockAcctType type, + int num_requests) +{ + assert(type < BLOCK_MAX_IOTYPE); + stats->merged[type] += num_requests; +} diff --git a/block/block-backend.c b/block/block-backend.c index d00c129f15..c28e2402c7 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -580,6 +580,11 @@ int blk_get_flags(BlockBackend *blk) return bdrv_get_flags(blk->bs); } +int blk_get_max_transfer_length(BlockBackend *blk) +{ + return blk->bs->bl.max_transfer_length; +} + void blk_set_guest_block_size(BlockBackend *blk, int align) { bdrv_set_guest_block_size(blk->bs, align); diff --git a/block/dmg.c b/block/dmg.c index e455886d2b..825c49d59a 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -26,6 +26,10 @@ #include "qemu/bswap.h" #include "qemu/module.h" #include <zlib.h> +#ifdef CONFIG_BZIP2 +#include <bzlib.h> +#endif +#include <glib.h> enum { /* Limit chunk sizes to prevent unreasonable amounts of memory being used @@ -55,6 +59,9 @@ typedef struct BDRVDMGState { uint8_t *compressed_chunk; uint8_t *uncompressed_chunk; z_stream zstream; +#ifdef CONFIG_BZIP2 + bz_stream bzstream; +#endif } BDRVDMGState; static int dmg_probe(const uint8_t *buf, int buf_size, const char *filename) @@ -100,6 +107,16 @@ static int read_uint32(BlockDriverState *bs, int64_t offset, uint32_t *result) return 0; } +static inline uint64_t buff_read_uint64(const uint8_t *buffer, int64_t offset) +{ + return be64_to_cpu(*(uint64_t *)&buffer[offset]); +} + +static inline uint32_t buff_read_uint32(const uint8_t *buffer, int64_t offset) +{ + return be32_to_cpu(*(uint32_t *)&buffer[offset]); +} + /* Increase max chunk sizes, if necessary. This function is used to calculate * the buffer sizes needed for compressed/uncompressed chunk I/O. */ @@ -112,6 +129,7 @@ static void update_max_chunk_size(BDRVDMGState *s, uint32_t chunk, switch (s->types[chunk]) { case 0x80000005: /* zlib compressed */ + case 0x80000006: /* bzip2 compressed */ compressed_size = s->lengths[chunk]; uncompressed_sectors = s->sectorcounts[chunk]; break; @@ -119,7 +137,9 @@ static void update_max_chunk_size(BDRVDMGState *s, uint32_t chunk, uncompressed_sectors = (s->lengths[chunk] + 511) / 512; break; case 2: /* zero */ - uncompressed_sectors = s->sectorcounts[chunk]; + /* as the all-zeroes block may be large, it is treated specially: the + * sector is not copied from a large buffer, a simple memset is used + * instead. Therefore uncompressed_sectors does not need to be set. */ break; } @@ -131,163 +151,372 @@ static void update_max_chunk_size(BDRVDMGState *s, uint32_t chunk, } } +static int64_t dmg_find_koly_offset(BlockDriverState *file_bs, Error **errp) +{ + int64_t length; + int64_t offset = 0; + uint8_t buffer[515]; + int i, ret; + + /* bdrv_getlength returns a multiple of block size (512), rounded up. Since + * dmg images can have odd sizes, try to look for the "koly" magic which + * marks the begin of the UDIF trailer (512 bytes). This magic can be found + * in the last 511 bytes of the second-last sector or the first 4 bytes of + * the last sector (search space: 515 bytes) */ + length = bdrv_getlength(file_bs); + if (length < 0) { + error_setg_errno(errp, -length, + "Failed to get file size while reading UDIF trailer"); + return length; + } else if (length < 512) { + error_setg(errp, "dmg file must be at least 512 bytes long"); + return -EINVAL; + } + if (length > 511 + 512) { + offset = length - 511 - 512; + } + length = length < 515 ? length : 515; + ret = bdrv_pread(file_bs, offset, buffer, length); + if (ret < 0) { + error_setg_errno(errp, -ret, "Failed while reading UDIF trailer"); + return ret; + } + for (i = 0; i < length - 3; i++) { + if (buffer[i] == 'k' && buffer[i+1] == 'o' && + buffer[i+2] == 'l' && buffer[i+3] == 'y') { + return offset + i; + } + } + error_setg(errp, "Could not locate UDIF trailer in dmg file"); + return -EINVAL; +} + +/* used when building the sector table */ +typedef struct DmgHeaderState { + /* used internally by dmg_read_mish_block to remember offsets of blocks + * across calls */ + uint64_t data_fork_offset; + /* exported for dmg_open */ + uint32_t max_compressed_size; + uint32_t max_sectors_per_chunk; +} DmgHeaderState; + +static bool dmg_is_known_block_type(uint32_t entry_type) +{ + switch (entry_type) { + case 0x00000001: /* uncompressed */ + case 0x00000002: /* zeroes */ + case 0x80000005: /* zlib */ +#ifdef CONFIG_BZIP2 + case 0x80000006: /* bzip2 */ +#endif + return true; + default: + return false; + } +} + +static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds, + uint8_t *buffer, uint32_t count) +{ + uint32_t type, i; + int ret; + size_t new_size; + uint32_t chunk_count; + int64_t offset = 0; + uint64_t data_offset; + uint64_t in_offset = ds->data_fork_offset; + uint64_t out_offset; + + type = buff_read_uint32(buffer, offset); + /* skip data that is not a valid MISH block (invalid magic or too small) */ + if (type != 0x6d697368 || count < 244) { + /* assume success for now */ + return 0; + } + + /* chunk offsets are relative to this sector number */ + out_offset = buff_read_uint64(buffer, offset + 8); + + /* location in data fork for (compressed) blob (in bytes) */ + data_offset = buff_read_uint64(buffer, offset + 0x18); + in_offset += data_offset; + + /* move to begin of chunk entries */ + offset += 204; + + chunk_count = (count - 204) / 40; + new_size = sizeof(uint64_t) * (s->n_chunks + chunk_count); + s->types = g_realloc(s->types, new_size / 2); + s->offsets = g_realloc(s->offsets, new_size); + s->lengths = g_realloc(s->lengths, new_size); + s->sectors = g_realloc(s->sectors, new_size); + s->sectorcounts = g_realloc(s->sectorcounts, new_size); + + for (i = s->n_chunks; i < s->n_chunks + chunk_count; i++) { + s->types[i] = buff_read_uint32(buffer, offset); + if (!dmg_is_known_block_type(s->types[i])) { + chunk_count--; + i--; + offset += 40; + continue; + } + + /* sector number */ + s->sectors[i] = buff_read_uint64(buffer, offset + 8); + s->sectors[i] += out_offset; + + /* sector count */ + s->sectorcounts[i] = buff_read_uint64(buffer, offset + 0x10); + + /* all-zeroes sector (type 2) does not need to be "uncompressed" and can + * therefore be unbounded. */ + if (s->types[i] != 2 && s->sectorcounts[i] > DMG_SECTORCOUNTS_MAX) { + error_report("sector count %" PRIu64 " for chunk %" PRIu32 + " is larger than max (%u)", + s->sectorcounts[i], i, DMG_SECTORCOUNTS_MAX); + ret = -EINVAL; + goto fail; + } + + /* offset in (compressed) data fork */ + s->offsets[i] = buff_read_uint64(buffer, offset + 0x18); + s->offsets[i] += in_offset; + + /* length in (compressed) data fork */ + s->lengths[i] = buff_read_uint64(buffer, offset + 0x20); + + if (s->lengths[i] > DMG_LENGTHS_MAX) { + error_report("length %" PRIu64 " for chunk %" PRIu32 + " is larger than max (%u)", + s->lengths[i], i, DMG_LENGTHS_MAX); + ret = -EINVAL; + goto fail; + } + + update_max_chunk_size(s, i, &ds->max_compressed_size, + &ds->max_sectors_per_chunk); + offset += 40; + } + s->n_chunks += chunk_count; + return 0; + +fail: + return ret; +} + +static int dmg_read_resource_fork(BlockDriverState *bs, DmgHeaderState *ds, + uint64_t info_begin, uint64_t info_length) +{ + BDRVDMGState *s = bs->opaque; + int ret; + uint32_t count, rsrc_data_offset; + uint8_t *buffer = NULL; + uint64_t info_end; + uint64_t offset; + + /* read offset from begin of resource fork (info_begin) to resource data */ + ret = read_uint32(bs, info_begin, &rsrc_data_offset); + if (ret < 0) { + goto fail; + } else if (rsrc_data_offset > info_length) { + ret = -EINVAL; + goto fail; + } + + /* read length of resource data */ + ret = read_uint32(bs, info_begin + 8, &count); + if (ret < 0) { + goto fail; + } else if (count == 0 || rsrc_data_offset + count > info_length) { + ret = -EINVAL; + goto fail; + } + + /* begin of resource data (consisting of one or more resources) */ + offset = info_begin + rsrc_data_offset; + + /* end of resource data (there is possibly a following resource map + * which will be ignored). */ + info_end = offset + count; + + /* read offsets (mish blocks) from one or more resources in resource data */ + while (offset < info_end) { + /* size of following resource */ + ret = read_uint32(bs, offset, &count); + if (ret < 0) { + goto fail; + } else if (count == 0 || count > info_end - offset) { + ret = -EINVAL; + goto fail; + } + offset += 4; + + buffer = g_realloc(buffer, count); + ret = bdrv_pread(bs->file, offset, buffer, count); + if (ret < 0) { + goto fail; + } + + ret = dmg_read_mish_block(s, ds, buffer, count); + if (ret < 0) { + goto fail; + } + /* advance offset by size of resource */ + offset += count; + } + ret = 0; + +fail: + g_free(buffer); + return ret; +} + +static int dmg_read_plist_xml(BlockDriverState *bs, DmgHeaderState *ds, + uint64_t info_begin, uint64_t info_length) +{ + BDRVDMGState *s = bs->opaque; + int ret; + uint8_t *buffer = NULL; + char *data_begin, *data_end; + + /* Have at least some length to avoid NULL for g_malloc. Attempt to set a + * safe upper cap on the data length. A test sample had a XML length of + * about 1 MiB. */ + if (info_length == 0 || info_length > 16 * 1024 * 1024) { + ret = -EINVAL; + goto fail; + } + + buffer = g_malloc(info_length + 1); + buffer[info_length] = '\0'; + ret = bdrv_pread(bs->file, info_begin, buffer, info_length); + if (ret != info_length) { + ret = -EINVAL; + goto fail; + } + + /* look for <data>...</data>. The data is 284 (0x11c) bytes after base64 + * decode. The actual data element has 431 (0x1af) bytes which includes tabs + * and line feeds. */ + data_end = (char *)buffer; + while ((data_begin = strstr(data_end, "<data>")) != NULL) { + guchar *mish; + gsize out_len = 0; + + data_begin += 6; + data_end = strstr(data_begin, "</data>"); + /* malformed XML? */ + if (data_end == NULL) { + ret = -EINVAL; + goto fail; + } + *data_end++ = '\0'; + mish = g_base64_decode(data_begin, &out_len); + ret = dmg_read_mish_block(s, ds, mish, (uint32_t)out_len); + g_free(mish); + if (ret < 0) { + goto fail; + } + } + ret = 0; + +fail: + g_free(buffer); + return ret; +} + static int dmg_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { BDRVDMGState *s = bs->opaque; - uint64_t info_begin, info_end, last_in_offset, last_out_offset; - uint32_t count, tmp; - uint32_t max_compressed_size = 1, max_sectors_per_chunk = 1, i; + DmgHeaderState ds; + uint64_t rsrc_fork_offset, rsrc_fork_length; + uint64_t plist_xml_offset, plist_xml_length; int64_t offset; int ret; bs->read_only = 1; s->n_chunks = 0; s->offsets = s->lengths = s->sectors = s->sectorcounts = NULL; + /* used by dmg_read_mish_block to keep track of the current I/O position */ + ds.data_fork_offset = 0; + ds.max_compressed_size = 1; + ds.max_sectors_per_chunk = 1; - /* read offset of info blocks */ - offset = bdrv_getlength(bs->file); + /* locate the UDIF trailer */ + offset = dmg_find_koly_offset(bs->file, errp); if (offset < 0) { ret = offset; goto fail; } - offset -= 0x1d8; - ret = read_uint64(bs, offset, &info_begin); + /* offset of data fork (DataForkOffset) */ + ret = read_uint64(bs, offset + 0x18, &ds.data_fork_offset); if (ret < 0) { goto fail; - } else if (info_begin == 0) { + } else if (ds.data_fork_offset > offset) { ret = -EINVAL; goto fail; } - ret = read_uint32(bs, info_begin, &tmp); + /* offset of resource fork (RsrcForkOffset) */ + ret = read_uint64(bs, offset + 0x28, &rsrc_fork_offset); if (ret < 0) { goto fail; - } else if (tmp != 0x100) { + } + ret = read_uint64(bs, offset + 0x30, &rsrc_fork_length); + if (ret < 0) { + goto fail; + } + if (rsrc_fork_offset >= offset || + rsrc_fork_length > offset - rsrc_fork_offset) { ret = -EINVAL; goto fail; } - - ret = read_uint32(bs, info_begin + 4, &count); + /* offset of property list (XMLOffset) */ + ret = read_uint64(bs, offset + 0xd8, &plist_xml_offset); if (ret < 0) { goto fail; - } else if (count == 0) { + } + ret = read_uint64(bs, offset + 0xe0, &plist_xml_length); + if (ret < 0) { + goto fail; + } + if (plist_xml_offset >= offset || + plist_xml_length > offset - plist_xml_offset) { ret = -EINVAL; goto fail; } - info_end = info_begin + count; - - offset = info_begin + 0x100; - - /* read offsets */ - last_in_offset = last_out_offset = 0; - while (offset < info_end) { - uint32_t type; - - ret = read_uint32(bs, offset, &count); + ret = read_uint64(bs, offset + 0x1ec, (uint64_t *)&bs->total_sectors); + if (ret < 0) { + goto fail; + } + if (bs->total_sectors < 0) { + ret = -EINVAL; + goto fail; + } + if (rsrc_fork_length != 0) { + ret = dmg_read_resource_fork(bs, &ds, + rsrc_fork_offset, rsrc_fork_length); if (ret < 0) { goto fail; - } else if (count == 0) { - ret = -EINVAL; - goto fail; } - offset += 4; - - ret = read_uint32(bs, offset, &type); + } else if (plist_xml_length != 0) { + ret = dmg_read_plist_xml(bs, &ds, plist_xml_offset, plist_xml_length); if (ret < 0) { goto fail; } - - if (type == 0x6d697368 && count >= 244) { - size_t new_size; - uint32_t chunk_count; - - offset += 4; - offset += 200; - - chunk_count = (count - 204) / 40; - new_size = sizeof(uint64_t) * (s->n_chunks + chunk_count); - s->types = g_realloc(s->types, new_size / 2); - s->offsets = g_realloc(s->offsets, new_size); - s->lengths = g_realloc(s->lengths, new_size); - s->sectors = g_realloc(s->sectors, new_size); - s->sectorcounts = g_realloc(s->sectorcounts, new_size); - - for (i = s->n_chunks; i < s->n_chunks + chunk_count; i++) { - ret = read_uint32(bs, offset, &s->types[i]); - if (ret < 0) { - goto fail; - } - offset += 4; - if (s->types[i] != 0x80000005 && s->types[i] != 1 && - s->types[i] != 2) { - if (s->types[i] == 0xffffffff && i > 0) { - last_in_offset = s->offsets[i - 1] + s->lengths[i - 1]; - last_out_offset = s->sectors[i - 1] + - s->sectorcounts[i - 1]; - } - chunk_count--; - i--; - offset += 36; - continue; - } - offset += 4; - - ret = read_uint64(bs, offset, &s->sectors[i]); - if (ret < 0) { - goto fail; - } - s->sectors[i] += last_out_offset; - offset += 8; - - ret = read_uint64(bs, offset, &s->sectorcounts[i]); - if (ret < 0) { - goto fail; - } - offset += 8; - - if (s->sectorcounts[i] > DMG_SECTORCOUNTS_MAX) { - error_report("sector count %" PRIu64 " for chunk %" PRIu32 - " is larger than max (%u)", - s->sectorcounts[i], i, DMG_SECTORCOUNTS_MAX); - ret = -EINVAL; - goto fail; - } - - ret = read_uint64(bs, offset, &s->offsets[i]); - if (ret < 0) { - goto fail; - } - s->offsets[i] += last_in_offset; - offset += 8; - - ret = read_uint64(bs, offset, &s->lengths[i]); - if (ret < 0) { - goto fail; - } - offset += 8; - - if (s->lengths[i] > DMG_LENGTHS_MAX) { - error_report("length %" PRIu64 " for chunk %" PRIu32 - " is larger than max (%u)", - s->lengths[i], i, DMG_LENGTHS_MAX); - ret = -EINVAL; - goto fail; - } - - update_max_chunk_size(s, i, &max_compressed_size, - &max_sectors_per_chunk); - } - s->n_chunks += chunk_count; - } + } else { + ret = -EINVAL; + goto fail; } /* initialize zlib engine */ s->compressed_chunk = qemu_try_blockalign(bs->file, - max_compressed_size + 1); + ds.max_compressed_size + 1); s->uncompressed_chunk = qemu_try_blockalign(bs->file, - 512 * max_sectors_per_chunk); + 512 * ds.max_sectors_per_chunk); if (s->compressed_chunk == NULL || s->uncompressed_chunk == NULL) { ret = -ENOMEM; goto fail; @@ -349,13 +578,16 @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num) if (!is_sector_in_chunk(s, s->current_chunk, sector_num)) { int ret; uint32_t chunk = search_chunk(s, sector_num); +#ifdef CONFIG_BZIP2 + uint64_t total_out; +#endif if (chunk >= s->n_chunks) { return -1; } s->current_chunk = s->n_chunks; - switch (s->types[chunk]) { + switch (s->types[chunk]) { /* block entry type */ case 0x80000005: { /* zlib compressed */ /* we need to buffer, because only the chunk as whole can be * inflated. */ @@ -379,6 +611,34 @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num) return -1; } break; } +#ifdef CONFIG_BZIP2 + case 0x80000006: /* bzip2 compressed */ + /* we need to buffer, because only the chunk as whole can be + * inflated. */ + ret = bdrv_pread(bs->file, s->offsets[chunk], + s->compressed_chunk, s->lengths[chunk]); + if (ret != s->lengths[chunk]) { + return -1; + } + + ret = BZ2_bzDecompressInit(&s->bzstream, 0, 0); + if (ret != BZ_OK) { + return -1; + } + s->bzstream.next_in = (char *)s->compressed_chunk; + s->bzstream.avail_in = (unsigned int) s->lengths[chunk]; + s->bzstream.next_out = (char *)s->uncompressed_chunk; + s->bzstream.avail_out = (unsigned int) 512 * s->sectorcounts[chunk]; + ret = BZ2_bzDecompress(&s->bzstream); + total_out = ((uint64_t)s->bzstream.total_out_hi32 << 32) + + s->bzstream.total_out_lo32; + BZ2_bzDecompressEnd(&s->bzstream); + if (ret != BZ_STREAM_END || + total_out != 512 * s->sectorcounts[chunk]) { + return -1; + } + break; +#endif /* CONFIG_BZIP2 */ case 1: /* copy */ ret = bdrv_pread(bs->file, s->offsets[chunk], s->uncompressed_chunk, s->lengths[chunk]); @@ -387,7 +647,8 @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num) } break; case 2: /* zero */ - memset(s->uncompressed_chunk, 0, 512 * s->sectorcounts[chunk]); + /* see dmg_read, it is treated specially. No buffer needs to be + * pre-filled, the zeroes can be set directly. */ break; } s->current_chunk = chunk; @@ -406,6 +667,13 @@ static int dmg_read(BlockDriverState *bs, int64_t sector_num, if (dmg_read_chunk(bs, sector_num + i) != 0) { return -1; } + /* Special case: current chunk is all zeroes. Do not perform a memcpy as + * s->uncompressed_chunk may be too small to cover the large all-zeroes + * section. dmg_read_chunk is called to find s->current_chunk */ + if (s->types[s->current_chunk] == 2) { /* all zeroes block entry */ + memset(buf + i * 512, 0, 512); + continue; + } sector_offset_in_chunk = sector_num + i - s->sectors[s->current_chunk]; memcpy(buf + i * 512, s->uncompressed_chunk + sector_offset_in_chunk * 512, 512); diff --git a/block/nbd-client.c b/block/nbd-client.c index 6e1c97cad0..28bfb62bed 100644 --- a/block/nbd-client.c +++ b/block/nbd-client.c @@ -373,7 +373,7 @@ void nbd_client_session_close(NbdClientSession *client) } int nbd_client_session_init(NbdClientSession *client, BlockDriverState *bs, - int sock, const char *export) + int sock, const char *export, Error **errp) { int ret; @@ -382,7 +382,7 @@ int nbd_client_session_init(NbdClientSession *client, BlockDriverState *bs, qemu_set_block(sock); ret = nbd_receive_negotiate(sock, export, &client->nbdflags, &client->size, - &client->blocksize); + &client->blocksize, errp); if (ret < 0) { logout("Failed to negotiate with the NBD server\n"); closesocket(sock); diff --git a/block/nbd-client.h b/block/nbd-client.h index cd478f3a98..cfeecc2775 100644 --- a/block/nbd-client.h +++ b/block/nbd-client.h @@ -36,7 +36,7 @@ typedef struct NbdClientSession { } NbdClientSession; int nbd_client_session_init(NbdClientSession *client, BlockDriverState *bs, - int sock, const char *export_name); + int sock, const char *export_name, Error **errp); void nbd_client_session_close(NbdClientSession *client); int nbd_client_session_co_discard(NbdClientSession *client, int64_t sector_num, diff --git a/block/nbd.c b/block/nbd.c index 04cc845076..b05d1d0407 100644 --- a/block/nbd.c +++ b/block/nbd.c @@ -271,7 +271,7 @@ static int nbd_open(BlockDriverState *bs, QDict *options, int flags, } /* NBD handshake */ - result = nbd_client_session_init(&s->client, bs, sock, export); + result = nbd_client_session_init(&s->client, bs, sock, export, errp); g_free(export); return result; } @@ -301,6 +301,12 @@ static int nbd_co_flush(BlockDriverState *bs) return nbd_client_session_co_flush(&s->client); } +static void nbd_refresh_limits(BlockDriverState *bs, Error **errp) +{ + bs->bl.max_discard = UINT32_MAX >> BDRV_SECTOR_BITS; + bs->bl.max_transfer_length = UINT32_MAX >> BDRV_SECTOR_BITS; +} + static int nbd_co_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) { @@ -396,6 +402,7 @@ static BlockDriver bdrv_nbd = { .bdrv_close = nbd_close, .bdrv_co_flush_to_os = nbd_co_flush, .bdrv_co_discard = nbd_co_discard, + .bdrv_refresh_limits = nbd_refresh_limits, .bdrv_getlength = nbd_getlength, .bdrv_detach_aio_context = nbd_detach_aio_context, .bdrv_attach_aio_context = nbd_attach_aio_context, @@ -413,6 +420,7 @@ static BlockDriver bdrv_nbd_tcp = { .bdrv_close = nbd_close, .bdrv_co_flush_to_os = nbd_co_flush, .bdrv_co_discard = nbd_co_discard, + .bdrv_refresh_limits = nbd_refresh_limits, .bdrv_getlength = nbd_getlength, .bdrv_detach_aio_context = nbd_detach_aio_context, .bdrv_attach_aio_context = nbd_attach_aio_context, @@ -430,6 +438,7 @@ static BlockDriver bdrv_nbd_unix = { .bdrv_close = nbd_close, .bdrv_co_flush_to_os = nbd_co_flush, .bdrv_co_discard = nbd_co_discard, + .bdrv_refresh_limits = nbd_refresh_limits, .bdrv_getlength = nbd_getlength, .bdrv_detach_aio_context = nbd_detach_aio_context, .bdrv_attach_aio_context = nbd_attach_aio_context, diff --git a/block/qapi.c b/block/qapi.c index 75c388e90b..1808e67336 100644 --- a/block/qapi.c +++ b/block/qapi.c @@ -24,6 +24,7 @@ #include "block/qapi.h" #include "block/block_int.h" +#include "block/write-threshold.h" #include "qmp-commands.h" #include "qapi-visit.h" #include "qapi/qmp-output-visitor.h" @@ -89,6 +90,8 @@ BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs) info->iops_size = cfg.op_size; } + info->write_threshold = bdrv_write_threshold_get(bs); + return info; } @@ -335,6 +338,8 @@ static BlockStats *bdrv_query_stats(const BlockDriverState *bs, s->stats->wr_bytes = bs->stats.nr_bytes[BLOCK_ACCT_WRITE]; s->stats->rd_operations = bs->stats.nr_ops[BLOCK_ACCT_READ]; s->stats->wr_operations = bs->stats.nr_ops[BLOCK_ACCT_WRITE]; + s->stats->rd_merged = bs->stats.merged[BLOCK_ACCT_READ]; + s->stats->wr_merged = bs->stats.merged[BLOCK_ACCT_WRITE]; s->stats->wr_highest_offset = bs->stats.wr_highest_sector * BDRV_SECTOR_SIZE; s->stats->flush_operations = bs->stats.nr_ops[BLOCK_ACCT_FLUSH]; diff --git a/block/qcow.c b/block/qcow.c index ccbe9e0d2c..055896910e 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -215,7 +215,7 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags, /* read the backing file name */ if (header.backing_file_offset != 0) { len = header.backing_file_size; - if (len > 1023 || len > sizeof(bs->backing_file)) { + if (len > 1023 || len >= sizeof(bs->backing_file)) { error_setg(errp, "Backing file name too long"); ret = -EINVAL; goto fail; diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c index 904f6b1f44..b1155492a3 100644 --- a/block/qcow2-cache.c +++ b/block/qcow2-cache.c @@ -253,7 +253,9 @@ static int qcow2_cache_find_entry_to_replace(Qcow2Cache *c) /* Give newer hits priority */ /* TODO Check how to optimize the replacement strategy */ - c->entries[i].cache_hits /= 2; + if (c->entries[i].cache_hits > 1) { + c->entries[i].cache_hits /= 2; + } } if (min_index == -1) { diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 9afdb40b40..9b80ca79ea 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -759,54 +759,54 @@ int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size) { BDRVQcowState *s = bs->opaque; - int64_t offset, cluster_offset; - int free_in_cluster; + int64_t offset; + size_t free_in_cluster; + int ret; BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES); assert(size > 0 && size <= s->cluster_size); - if (s->free_byte_offset == 0) { - offset = qcow2_alloc_clusters(bs, s->cluster_size); - if (offset < 0) { - return offset; + assert(!s->free_byte_offset || offset_into_cluster(s, s->free_byte_offset)); + + offset = s->free_byte_offset; + + if (offset) { + int refcount = qcow2_get_refcount(bs, offset >> s->cluster_bits); + if (refcount < 0) { + return refcount; } - s->free_byte_offset = offset; - } - redo: - free_in_cluster = s->cluster_size - - offset_into_cluster(s, s->free_byte_offset); - if (size <= free_in_cluster) { - /* enough space in current cluster */ - offset = s->free_byte_offset; - s->free_byte_offset += size; - free_in_cluster -= size; - if (free_in_cluster == 0) - s->free_byte_offset = 0; - if (offset_into_cluster(s, offset) != 0) - qcow2_update_cluster_refcount(bs, offset >> s->cluster_bits, 1, - QCOW2_DISCARD_NEVER); - } else { - offset = qcow2_alloc_clusters(bs, s->cluster_size); - if (offset < 0) { - return offset; + + if (refcount == 0xffff) { + offset = 0; } - cluster_offset = start_of_cluster(s, s->free_byte_offset); - if ((cluster_offset + s->cluster_size) == offset) { - /* we are lucky: contiguous data */ - offset = s->free_byte_offset; - qcow2_update_cluster_refcount(bs, offset >> s->cluster_bits, 1, - QCOW2_DISCARD_NEVER); - s->free_byte_offset += size; - } else { - s->free_byte_offset = offset; - goto redo; + } + + free_in_cluster = s->cluster_size - offset_into_cluster(s, offset); + if (!offset || free_in_cluster < size) { + int64_t new_cluster = alloc_clusters_noref(bs, s->cluster_size); + if (new_cluster < 0) { + return new_cluster; + } + + if (!offset || ROUND_UP(offset, s->cluster_size) != new_cluster) { + offset = new_cluster; } } - /* The cluster refcount was incremented, either by qcow2_alloc_clusters() - * or explicitly by qcow2_update_cluster_refcount(). Refcount blocks must - * be flushed before the caller's L2 table updates. - */ + assert(offset); + ret = update_refcount(bs, offset, size, 1, QCOW2_DISCARD_NEVER); + if (ret < 0) { + return ret; + } + + /* The cluster refcount was incremented; refcount blocks must be flushed + * before the caller's L2 table updates. */ qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache); + + s->free_byte_offset = offset + size; + if (!offset_into_cluster(s, s->free_byte_offset)) { + s->free_byte_offset = 0; + } + return offset; } diff --git a/block/qcow2.c b/block/qcow2.c index dbaf016bc7..7e614d76a4 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -869,7 +869,7 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, if (header.backing_file_offset != 0) { len = header.backing_file_size; if (len > MIN(1023, s->cluster_size - header.backing_file_offset) || - len > sizeof(bs->backing_file)) { + len >= sizeof(bs->backing_file)) { error_setg(errp, "Backing file name too long"); ret = -EINVAL; goto fail; diff --git a/block/qed.c b/block/qed.c index 80f18d82e2..892b13c806 100644 --- a/block/qed.c +++ b/block/qed.c @@ -440,6 +440,11 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags, s->l2_mask = s->table_nelems - 1; s->l1_shift = s->l2_shift + ffs(s->table_nelems) - 1; + /* Header size calculation must not overflow uint32_t */ + if (s->header.header_size > UINT32_MAX / s->header.cluster_size) { + return -EINVAL; + } + if ((s->header.features & QED_F_BACKING_FILE)) { if ((uint64_t)s->header.backing_filename_offset + s->header.backing_filename_size > diff --git a/block/qed.h b/block/qed.h index d3934a05cd..615e676fc8 100644 --- a/block/qed.h +++ b/block/qed.h @@ -133,7 +133,6 @@ typedef struct QEDAIOCB { int bh_ret; /* final return status for completion bh */ QSIMPLEQ_ENTRY(QEDAIOCB) next; /* next request */ int flags; /* QED_AIOCB_* bits ORed together */ - bool *finished; /* signal for cancel completion */ uint64_t end_pos; /* request end on block device, in bytes */ /* User scatter-gather list */ diff --git a/block/raw-posix.c b/block/raw-posix.c index e51293a455..e474c17974 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -60,7 +60,7 @@ #define FS_NOCOW_FL 0x00800000 /* Do not cow file */ #endif #endif -#ifdef CONFIG_FALLOCATE_PUNCH_HOLE +#if defined(CONFIG_FALLOCATE_PUNCH_HOLE) || defined(CONFIG_FALLOCATE_ZERO_RANGE) #include <linux/falloc.h> #endif #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__) @@ -147,6 +147,7 @@ typedef struct BDRVRawState { bool has_discard:1; bool has_write_zeroes:1; bool discard_zeroes:1; + bool has_fallocate; bool needs_alignment; } BDRVRawState; @@ -452,6 +453,7 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, } if (S_ISREG(st.st_mode)) { s->discard_zeroes = true; + s->has_fallocate = true; } if (S_ISBLK(st.st_mode)) { #ifdef BLKDISCARDZEROES @@ -893,40 +895,108 @@ static int xfs_discard(BDRVRawState *s, int64_t offset, uint64_t bytes) } #endif -static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb) +static int translate_err(int err) { - int ret = -EOPNOTSUPP; + if (err == -ENODEV || err == -ENOSYS || err == -EOPNOTSUPP || + err == -ENOTTY) { + err = -ENOTSUP; + } + return err; +} + +#ifdef CONFIG_FALLOCATE +static int do_fallocate(int fd, int mode, off_t offset, off_t len) +{ + do { + if (fallocate(fd, mode, offset, len) == 0) { + return 0; + } + } while (errno == EINTR); + return translate_err(-errno); +} +#endif + +static ssize_t handle_aiocb_write_zeroes_block(RawPosixAIOData *aiocb) +{ + int ret = -ENOTSUP; BDRVRawState *s = aiocb->bs->opaque; - if (s->has_write_zeroes == 0) { + if (!s->has_write_zeroes) { return -ENOTSUP; } - if (aiocb->aio_type & QEMU_AIO_BLKDEV) { #ifdef BLKZEROOUT - do { - uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes }; - if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) { - return 0; - } - } while (errno == EINTR); + do { + uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes }; + if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) { + return 0; + } + } while (errno == EINTR); - ret = -errno; + ret = translate_err(-errno); #endif - } else { + + if (ret == -ENOTSUP) { + s->has_write_zeroes = false; + } + return ret; +} + +static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb) +{ + BDRVRawState *s = aiocb->bs->opaque; + + if (aiocb->aio_type & QEMU_AIO_BLKDEV) { + return handle_aiocb_write_zeroes_block(aiocb); + } + #ifdef CONFIG_XFS - if (s->is_xfs) { - return xfs_write_zeroes(s, aiocb->aio_offset, aiocb->aio_nbytes); + if (s->is_xfs) { + return xfs_write_zeroes(s, aiocb->aio_offset, aiocb->aio_nbytes); + } +#endif + +#ifdef CONFIG_FALLOCATE_ZERO_RANGE + if (s->has_write_zeroes) { + int ret = do_fallocate(s->fd, FALLOC_FL_ZERO_RANGE, + aiocb->aio_offset, aiocb->aio_nbytes); + if (ret == 0 || ret != -ENOTSUP) { + return ret; } + s->has_write_zeroes = false; + } #endif + +#ifdef CONFIG_FALLOCATE_PUNCH_HOLE + if (s->has_discard && s->has_fallocate) { + int ret = do_fallocate(s->fd, + FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + aiocb->aio_offset, aiocb->aio_nbytes); + if (ret == 0) { + ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes); + if (ret == 0 || ret != -ENOTSUP) { + return ret; + } + s->has_fallocate = false; + } else if (ret != -ENOTSUP) { + return ret; + } else { + s->has_discard = false; + } } +#endif - if (ret == -ENODEV || ret == -ENOSYS || ret == -EOPNOTSUPP || - ret == -ENOTTY) { - s->has_write_zeroes = false; - ret = -ENOTSUP; +#ifdef CONFIG_FALLOCATE + if (s->has_fallocate && aiocb->aio_offset >= bdrv_getlength(aiocb->bs)) { + int ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes); + if (ret == 0 || ret != -ENOTSUP) { + return ret; + } + s->has_fallocate = false; } - return ret; +#endif + + return -ENOTSUP; } static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb) @@ -957,21 +1027,14 @@ static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb) #endif #ifdef CONFIG_FALLOCATE_PUNCH_HOLE - do { - if (fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, - aiocb->aio_offset, aiocb->aio_nbytes) == 0) { - return 0; - } - } while (errno == EINTR); - - ret = -errno; + ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + aiocb->aio_offset, aiocb->aio_nbytes); #endif } - if (ret == -ENODEV || ret == -ENOSYS || ret == -EOPNOTSUPP || - ret == -ENOTTY) { + ret = translate_err(ret); + if (ret == -ENOTSUP) { s->has_discard = false; - ret = -ENOTSUP; } return ret; } @@ -1312,7 +1375,20 @@ again: if (size == 0) #endif #if defined(__APPLE__) && defined(__MACH__) - size = LLONG_MAX; + { + uint64_t sectors = 0; + uint32_t sector_size = 0; + + if (ioctl(fd, DKIOCGETBLOCKCOUNT, §ors) == 0 + && ioctl(fd, DKIOCGETBLOCKSIZE, §or_size) == 0) { + size = sectors * sector_size; + } else { + size = lseek(fd, 0LL, SEEK_END); + if (size < 0) { + return -errno; + } + } + } #else size = lseek(fd, 0LL, SEEK_END); if (size < 0) { diff --git a/block/write-threshold.c b/block/write-threshold.c new file mode 100644 index 0000000000..c2cd517716 --- /dev/null +++ b/block/write-threshold.c @@ -0,0 +1,125 @@ +/* + * QEMU System Emulator block write threshold notification + * + * Copyright Red Hat, Inc. 2014 + * + * Authors: + * Francesco Romani <fromani@redhat.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + */ + +#include "block/block_int.h" +#include "block/coroutine.h" +#include "block/write-threshold.h" +#include "qemu/notify.h" +#include "qapi-event.h" +#include "qmp-commands.h" + + +uint64_t bdrv_write_threshold_get(const BlockDriverState *bs) +{ + return bs->write_threshold_offset; +} + +bool bdrv_write_threshold_is_set(const BlockDriverState *bs) +{ + return bs->write_threshold_offset > 0; +} + +static void write_threshold_disable(BlockDriverState *bs) +{ + if (bdrv_write_threshold_is_set(bs)) { + notifier_with_return_remove(&bs->write_threshold_notifier); + bs->write_threshold_offset = 0; + } +} + +uint64_t bdrv_write_threshold_exceeded(const BlockDriverState *bs, + const BdrvTrackedRequest *req) +{ + if (bdrv_write_threshold_is_set(bs)) { + if (req->offset > bs->write_threshold_offset) { + return (req->offset - bs->write_threshold_offset) + req->bytes; + } + if ((req->offset + req->bytes) > bs->write_threshold_offset) { + return (req->offset + req->bytes) - bs->write_threshold_offset; + } + } + return 0; +} + +static int coroutine_fn before_write_notify(NotifierWithReturn *notifier, + void *opaque) +{ + BdrvTrackedRequest *req = opaque; + BlockDriverState *bs = req->bs; + uint64_t amount = 0; + + amount = bdrv_write_threshold_exceeded(bs, req); + if (amount > 0) { + qapi_event_send_block_write_threshold( + bs->node_name, + amount, + bs->write_threshold_offset, + &error_abort); + + /* autodisable to avoid flooding the monitor */ + write_threshold_disable(bs); + } + + return 0; /* should always let other notifiers run */ +} + +static void write_threshold_register_notifier(BlockDriverState *bs) +{ + bs->write_threshold_notifier.notify = before_write_notify; + notifier_with_return_list_add(&bs->before_write_notifiers, + &bs->write_threshold_notifier); +} + +static void write_threshold_update(BlockDriverState *bs, + int64_t threshold_bytes) +{ + bs->write_threshold_offset = threshold_bytes; +} + +void bdrv_write_threshold_set(BlockDriverState *bs, uint64_t threshold_bytes) +{ + if (bdrv_write_threshold_is_set(bs)) { + if (threshold_bytes > 0) { + write_threshold_update(bs, threshold_bytes); + } else { + write_threshold_disable(bs); + } + } else { + if (threshold_bytes > 0) { + /* avoid multiple registration */ + write_threshold_register_notifier(bs); + write_threshold_update(bs, threshold_bytes); + } + /* discard bogus disable request */ + } +} + +void qmp_block_set_write_threshold(const char *node_name, + uint64_t threshold_bytes, + Error **errp) +{ + BlockDriverState *bs; + AioContext *aio_context; + + bs = bdrv_find_node(node_name); + if (!bs) { + error_set(errp, QERR_DEVICE_NOT_FOUND, node_name); + return; + } + + aio_context = bdrv_get_aio_context(bs); + aio_context_acquire(aio_context); + + bdrv_write_threshold_set(bs, threshold_bytes); + + aio_context_release(aio_context); +} |