diff options
author | Nir Soffer <nirsof@gmail.com> | 2019-08-27 04:05:27 +0300 |
---|---|---|
committer | Max Reitz <mreitz@redhat.com> | 2019-09-03 14:55:35 +0200 |
commit | 3a20013fbb26d2a1bd11ef148eefdb1508783787 (patch) | |
tree | 73c38b63a74eeb27ccc0a8daaa67cad3ee6cb917 /block/file-posix.c | |
parent | b503de619ed462cd433187db60719f98fab470c2 (diff) |
block: posix: Always allocate the first block
When creating an image with preallocation "off" or "falloc", the first
block of the image is typically not allocated. When using Gluster
storage backed by XFS filesystem, reading this block using direct I/O
succeeds regardless of request length, fooling alignment detection.
In this case we fallback to a safe value (4096) instead of the optimal
value (512), which may lead to unneeded data copying when aligning
requests. Allocating the first block avoids the fallback.
Since we allocate the first block even with preallocation=off, we no
longer create images with zero disk size:
$ ./qemu-img create -f raw test.raw 1g
Formatting 'test.raw', fmt=raw size=1073741824
$ ls -lhs test.raw
4.0K -rw-r--r--. 1 nsoffer nsoffer 1.0G Aug 16 23:48 test.raw
And converting the image requires additional cluster:
$ ./qemu-img measure -f raw -O qcow2 test.raw
required size: 458752
fully allocated size: 1074135040
When using format like vmdk with multiple files per image, we allocate
one block per file:
$ ./qemu-img create -f vmdk -o subformat=twoGbMaxExtentFlat test.vmdk 4g
Formatting 'test.vmdk', fmt=vmdk size=4294967296 compat6=off hwversion=undefined subformat=twoGbMaxExtentFlat
$ ls -lhs test*.vmdk
4.0K -rw-r--r--. 1 nsoffer nsoffer 2.0G Aug 27 03:23 test-f001.vmdk
4.0K -rw-r--r--. 1 nsoffer nsoffer 2.0G Aug 27 03:23 test-f002.vmdk
4.0K -rw-r--r--. 1 nsoffer nsoffer 353 Aug 27 03:23 test.vmdk
I did quick performance test for copying disks with qemu-img convert to
new raw target image to Gluster storage with sector size of 512 bytes:
for i in $(seq 10); do
rm -f dst.raw
sleep 10
time ./qemu-img convert -f raw -O raw -t none -T none src.raw dst.raw
done
Here is a table comparing the total time spent:
Type Before(s) After(s) Diff(%)
---------------------------------------
real 530.028 469.123 -11.4
user 17.204 10.768 -37.4
sys 17.881 7.011 -60.7
We can see very clear improvement in CPU usage.
Signed-off-by: Nir Soffer <nsoffer@redhat.com>
Message-id: 20190827010528.8818-2-nsoffer@redhat.com
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
Diffstat (limited to 'block/file-posix.c')
-rw-r--r-- | block/file-posix.c | 51 |
1 files changed, 51 insertions, 0 deletions
diff --git a/block/file-posix.c b/block/file-posix.c index fbeb0068db..447f937aa1 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -1749,6 +1749,43 @@ static int handle_aiocb_discard(void *opaque) return ret; } +/* + * Help alignment probing by allocating the first block. + * + * When reading with direct I/O from unallocated area on Gluster backed by XFS, + * reading succeeds regardless of request length. In this case we fallback to + * safe alignment which is not optimal. Allocating the first block avoids this + * fallback. + * + * fd may be opened with O_DIRECT, but we don't know the buffer alignment or + * request alignment, so we use safe values. + * + * Returns: 0 on success, -errno on failure. Since this is an optimization, + * caller may ignore failures. + */ +static int allocate_first_block(int fd, size_t max_size) +{ + size_t write_size = (max_size < MAX_BLOCKSIZE) + ? BDRV_SECTOR_SIZE + : MAX_BLOCKSIZE; + size_t max_align = MAX(MAX_BLOCKSIZE, getpagesize()); + void *buf; + ssize_t n; + int ret; + + buf = qemu_memalign(max_align, write_size); + memset(buf, 0, write_size); + + do { + n = pwrite(fd, buf, write_size, 0); + } while (n == -1 && errno == EINTR); + + ret = (n == -1) ? -errno : 0; + + qemu_vfree(buf); + return ret; +} + static int handle_aiocb_truncate(void *opaque) { RawPosixAIOData *aiocb = opaque; @@ -1788,6 +1825,17 @@ static int handle_aiocb_truncate(void *opaque) /* posix_fallocate() doesn't set errno. */ error_setg_errno(errp, -result, "Could not preallocate new data"); + } else if (current_length == 0) { + /* + * posix_fallocate() uses fallocate() if the filesystem + * supports it, or fallback to manually writing zeroes. If + * fallocate() was used, unaligned reads from the fallocated + * area in raw_probe_alignment() will succeed, hence we need to + * allocate the first block. + * + * Optimize future alignment probing; ignore failures. + */ + allocate_first_block(fd, offset); } } else { result = 0; @@ -1849,6 +1897,9 @@ static int handle_aiocb_truncate(void *opaque) if (ftruncate(fd, offset) != 0) { result = -errno; error_setg_errno(errp, -result, "Could not resize file"); + } else if (current_length == 0 && offset > current_length) { + /* Optimize future alignment probing; ignore failures. */ + allocate_first_block(fd, offset); } return result; default: |