diff options
author | Kevin Wolf <kwolf@redhat.com> | 2020-07-07 16:23:29 +0200 |
---|---|---|
committer | Kevin Wolf <kwolf@redhat.com> | 2020-07-14 15:18:59 +0200 |
commit | ffa244c84a1a30dff69ecc80b0137a2b6d428ecb (patch) | |
tree | abc0ddb51c762daad5f27ec4fb890ebb8f86e272 | |
parent | 046e07ca556b3eb44ac5c64911b5bda204403113 (diff) |
file-posix: Mitigate file fragmentation with extent size hints
Especially when O_DIRECT is used with image files so that the page cache
indirection can't cause a merge of allocating requests, the file will
fragment on the file system layer, with a potentially very small
fragment size (this depends on the requests the guest sent).
On Linux, fragmentation can be reduced by setting an extent size hint
when creating the file (at least on XFS, it can't be set any more after
the first extent has been allocated), basically giving raw files a
"cluster size" for allocation.
This adds a create option to set the extent size hint, and changes the
default from not setting a hint to setting it to 1 MiB. The main reason
why qcow2 defaults to smaller cluster sizes is that COW becomes more
expensive, which is not an issue with raw files, so we can choose a
larger size. The tradeoff here is only potentially wasted disk space.
For qcow2 (or other image formats) over file-posix, the advantage should
be even greater because they grow sequentially without leaving holes, so
there won't be wasted space. Setting even larger extent size hints for
such images may make sense. This can be done with the new option, but
let's keep the default conservative for now.
The effect is very visible with a test that intentionally creates a
badly fragmented file with qemu-img bench (the time difference while
creating the file is already remarkable) and then looks at the number of
extents and the time a simple "qemu-img map" takes.
Without an extent size hint:
$ ./qemu-img create -f raw -o extent_size_hint=0 ~/tmp/test.raw 10G
Formatting '/home/kwolf/tmp/test.raw', fmt=raw size=10737418240 extent_size_hint=0
$ ./qemu-img bench -f raw -t none -n -w ~/tmp/test.raw -c 1000000 -S 8192 -o 0
Sending 1000000 write requests, 4096 bytes each, 64 in parallel (starting at offset 0, step size 8192)
Run completed in 25.848 seconds.
$ ./qemu-img bench -f raw -t none -n -w ~/tmp/test.raw -c 1000000 -S 8192 -o 4096
Sending 1000000 write requests, 4096 bytes each, 64 in parallel (starting at offset 4096, step size 8192)
Run completed in 19.616 seconds.
$ filefrag ~/tmp/test.raw
/home/kwolf/tmp/test.raw: 2000000 extents found
$ time ./qemu-img map ~/tmp/test.raw
Offset Length Mapped to File
0 0x1e8480000 0 /home/kwolf/tmp/test.raw
real 0m1,279s
user 0m0,043s
sys 0m1,226s
With the new default extent size hint of 1 MiB (1048576 bytes):
$ ./qemu-img create -f raw -o extent_size_hint=1M ~/tmp/test.raw 10G
Formatting '/home/kwolf/tmp/test.raw', fmt=raw size=10737418240 extent_size_hint=1048576
$ ./qemu-img bench -f raw -t none -n -w ~/tmp/test.raw -c 1000000 -S 8192 -o 0
Sending 1000000 write requests, 4096 bytes each, 64 in parallel (starting at offset 0, step size 8192)
Run completed in 11.833 seconds.
$ ./qemu-img bench -f raw -t none -n -w ~/tmp/test.raw -c 1000000 -S 8192 -o 4096
Sending 1000000 write requests, 4096 bytes each, 64 in parallel (starting at offset 4096, step size 8192)
Run completed in 10.155 seconds.
$ filefrag ~/tmp/test.raw
/home/kwolf/tmp/test.raw: 178 extents found
$ time ./qemu-img map ~/tmp/test.raw
Offset Length Mapped to File
0 0x1e8480000 0 /home/kwolf/tmp/test.raw
real 0m0,061s
user 0m0,040s
sys 0m0,014s
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Message-Id: <20200707142329.48303-1-kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
-rw-r--r-- | block/file-posix.c | 44 | ||||
-rw-r--r-- | include/block/block_int.h | 1 | ||||
-rw-r--r-- | qapi/block-core.json | 11 | ||||
-rw-r--r-- | tests/qemu-iotests/082.out | 16 | ||||
-rwxr-xr-x | tests/qemu-iotests/106 | 7 | ||||
-rwxr-xr-x | tests/qemu-iotests/175 | 6 | ||||
-rwxr-xr-x | tests/qemu-iotests/243 | 7 |
7 files changed, 80 insertions, 12 deletions
diff --git a/block/file-posix.c b/block/file-posix.c index 1989eae85f..8067e238cb 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -30,6 +30,7 @@ #include "block/block_int.h" #include "qemu/module.h" #include "qemu/option.h" +#include "qemu/units.h" #include "trace.h" #include "block/thread-pool.h" #include "qemu/iov.h" @@ -2318,6 +2319,14 @@ raw_co_create(BlockdevCreateOptions *options, Error **errp) if (!file_opts->has_preallocation) { file_opts->preallocation = PREALLOC_MODE_OFF; } + if (!file_opts->has_extent_size_hint) { + file_opts->extent_size_hint = 1 * MiB; + } + if (file_opts->extent_size_hint > UINT32_MAX) { + result = -EINVAL; + error_setg(errp, "Extent size hint is too large"); + goto out; + } /* Create file */ fd = qemu_open(file_opts->filename, O_RDWR | O_CREAT | O_BINARY, 0644); @@ -2375,6 +2384,27 @@ raw_co_create(BlockdevCreateOptions *options, Error **errp) } #endif } +#ifdef FS_IOC_FSSETXATTR + /* + * Try to set the extent size hint. Failure is not fatal, and a warning is + * only printed if the option was explicitly specified. 
+ */ + { + struct fsxattr attr; + result = ioctl(fd, FS_IOC_FSGETXATTR, &attr); + if (result == 0) { + attr.fsx_xflags |= FS_XFLAG_EXTSIZE; + attr.fsx_extsize = file_opts->extent_size_hint; + result = ioctl(fd, FS_IOC_FSSETXATTR, &attr); + } + if (result < 0 && file_opts->has_extent_size_hint && + file_opts->extent_size_hint) + { + warn_report("Failed to set extent size hint: %s", + strerror(errno)); + } + } +#endif /* Resize and potentially preallocate the file to the desired * final size */ @@ -2410,6 +2440,8 @@ static int coroutine_fn raw_co_create_opts(BlockDriver *drv, { BlockdevCreateOptions options; int64_t total_size = 0; + int64_t extent_size_hint = 0; + bool has_extent_size_hint = false; bool nocow = false; PreallocMode prealloc; char *buf = NULL; @@ -2421,6 +2453,11 @@ static int coroutine_fn raw_co_create_opts(BlockDriver *drv, /* Read out options */ total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), BDRV_SECTOR_SIZE); + if (qemu_opt_get(opts, BLOCK_OPT_EXTENT_SIZE_HINT)) { + has_extent_size_hint = true; + extent_size_hint = + qemu_opt_get_size_del(opts, BLOCK_OPT_EXTENT_SIZE_HINT, -1); + } nocow = qemu_opt_get_bool(opts, BLOCK_OPT_NOCOW, false); buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC); prealloc = qapi_enum_parse(&PreallocMode_lookup, buf, @@ -2440,6 +2477,8 @@ static int coroutine_fn raw_co_create_opts(BlockDriver *drv, .preallocation = prealloc, .has_nocow = true, .nocow = nocow, + .has_extent_size_hint = has_extent_size_hint, + .extent_size_hint = extent_size_hint, }, }; return raw_co_create(&options, errp); @@ -2930,6 +2969,11 @@ static QemuOptsList raw_create_opts = { #endif ", full)" }, + { + .name = BLOCK_OPT_EXTENT_SIZE_HINT, + .type = QEMU_OPT_SIZE, + .help = "Extent size hint for the image file, 0 to disable" + }, { /* end of list */ } } }; diff --git a/include/block/block_int.h b/include/block/block_int.h index 3d6cf88592..38dec0275b 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -53,6 
+53,7 @@ #define BLOCK_OPT_ADAPTER_TYPE "adapter_type" #define BLOCK_OPT_REDUNDANCY "redundancy" #define BLOCK_OPT_NOCOW "nocow" +#define BLOCK_OPT_EXTENT_SIZE_HINT "extent_size_hint" #define BLOCK_OPT_OBJECT_SIZE "object_size" #define BLOCK_OPT_REFCOUNT_BITS "refcount_bits" #define BLOCK_OPT_DATA_FILE "data_file" diff --git a/qapi/block-core.json b/qapi/block-core.json index b20332e592..463ffd83da 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -4185,14 +4185,17 @@ # falloc (if defined CONFIG_POSIX_FALLOCATE), # full (if defined CONFIG_POSIX)) # @nocow: Turn off copy-on-write (valid only on btrfs; default: off) +# @extent-size-hint: Extent size hint to add to the image file; 0 for not +# adding an extent size hint (default: 1 MB, since 5.1) # # Since: 2.12 ## { 'struct': 'BlockdevCreateOptionsFile', - 'data': { 'filename': 'str', - 'size': 'size', - '*preallocation': 'PreallocMode', - '*nocow': 'bool' } } + 'data': { 'filename': 'str', + 'size': 'size', + '*preallocation': 'PreallocMode', + '*nocow': 'bool', + '*extent-size-hint': 'size'} } ## # @BlockdevCreateOptionsGluster: diff --git a/tests/qemu-iotests/082.out b/tests/qemu-iotests/082.out index a4a2b69030..f7b3d54b28 100644 --- a/tests/qemu-iotests/082.out +++ b/tests/qemu-iotests/082.out @@ -62,6 +62,7 @@ Supported options: encrypt.ivgen-hash-alg=<str> - Name of IV generator hash algorithm encrypt.key-secret=<str> - ID of secret providing qcow AES key or LUKS passphrase encryption=<bool (on/off)> - Encrypt the image with format 'aes'. 
(Deprecated in favor of encrypt.format=aes) + extent_size_hint=<size> - Extent size hint for the image file, 0 to disable lazy_refcounts=<bool (on/off)> - Postpone refcount updates nocow=<bool (on/off)> - Turn off copy-on-write (valid only on btrfs) preallocation=<str> - Preallocation mode (allowed values: off, metadata, falloc, full) @@ -86,6 +87,7 @@ Supported options: encrypt.ivgen-hash-alg=<str> - Name of IV generator hash algorithm encrypt.key-secret=<str> - ID of secret providing qcow AES key or LUKS passphrase encryption=<bool (on/off)> - Encrypt the image with format 'aes'. (Deprecated in favor of encrypt.format=aes) + extent_size_hint=<size> - Extent size hint for the image file, 0 to disable lazy_refcounts=<bool (on/off)> - Postpone refcount updates nocow=<bool (on/off)> - Turn off copy-on-write (valid only on btrfs) preallocation=<str> - Preallocation mode (allowed values: off, metadata, falloc, full) @@ -110,6 +112,7 @@ Supported options: encrypt.ivgen-hash-alg=<str> - Name of IV generator hash algorithm encrypt.key-secret=<str> - ID of secret providing qcow AES key or LUKS passphrase encryption=<bool (on/off)> - Encrypt the image with format 'aes'. (Deprecated in favor of encrypt.format=aes) + extent_size_hint=<size> - Extent size hint for the image file, 0 to disable lazy_refcounts=<bool (on/off)> - Postpone refcount updates nocow=<bool (on/off)> - Turn off copy-on-write (valid only on btrfs) preallocation=<str> - Preallocation mode (allowed values: off, metadata, falloc, full) @@ -134,6 +137,7 @@ Supported options: encrypt.ivgen-hash-alg=<str> - Name of IV generator hash algorithm encrypt.key-secret=<str> - ID of secret providing qcow AES key or LUKS passphrase encryption=<bool (on/off)> - Encrypt the image with format 'aes'. 
(Deprecated in favor of encrypt.format=aes) + extent_size_hint=<size> - Extent size hint for the image file, 0 to disable lazy_refcounts=<bool (on/off)> - Postpone refcount updates nocow=<bool (on/off)> - Turn off copy-on-write (valid only on btrfs) preallocation=<str> - Preallocation mode (allowed values: off, metadata, falloc, full) @@ -158,6 +162,7 @@ Supported options: encrypt.ivgen-hash-alg=<str> - Name of IV generator hash algorithm encrypt.key-secret=<str> - ID of secret providing qcow AES key or LUKS passphrase encryption=<bool (on/off)> - Encrypt the image with format 'aes'. (Deprecated in favor of encrypt.format=aes) + extent_size_hint=<size> - Extent size hint for the image file, 0 to disable lazy_refcounts=<bool (on/off)> - Postpone refcount updates nocow=<bool (on/off)> - Turn off copy-on-write (valid only on btrfs) preallocation=<str> - Preallocation mode (allowed values: off, metadata, falloc, full) @@ -182,6 +187,7 @@ Supported options: encrypt.ivgen-hash-alg=<str> - Name of IV generator hash algorithm encrypt.key-secret=<str> - ID of secret providing qcow AES key or LUKS passphrase encryption=<bool (on/off)> - Encrypt the image with format 'aes'. (Deprecated in favor of encrypt.format=aes) + extent_size_hint=<size> - Extent size hint for the image file, 0 to disable lazy_refcounts=<bool (on/off)> - Postpone refcount updates nocow=<bool (on/off)> - Turn off copy-on-write (valid only on btrfs) preallocation=<str> - Preallocation mode (allowed values: off, metadata, falloc, full) @@ -206,6 +212,7 @@ Supported options: encrypt.ivgen-hash-alg=<str> - Name of IV generator hash algorithm encrypt.key-secret=<str> - ID of secret providing qcow AES key or LUKS passphrase encryption=<bool (on/off)> - Encrypt the image with format 'aes'. 
(Deprecated in favor of encrypt.format=aes) + extent_size_hint=<size> - Extent size hint for the image file, 0 to disable lazy_refcounts=<bool (on/off)> - Postpone refcount updates nocow=<bool (on/off)> - Turn off copy-on-write (valid only on btrfs) preallocation=<str> - Preallocation mode (allowed values: off, metadata, falloc, full) @@ -230,6 +237,7 @@ Supported options: encrypt.ivgen-hash-alg=<str> - Name of IV generator hash algorithm encrypt.key-secret=<str> - ID of secret providing qcow AES key or LUKS passphrase encryption=<bool (on/off)> - Encrypt the image with format 'aes'. (Deprecated in favor of encrypt.format=aes) + extent_size_hint=<size> - Extent size hint for the image file, 0 to disable lazy_refcounts=<bool (on/off)> - Postpone refcount updates nocow=<bool (on/off)> - Turn off copy-on-write (valid only on btrfs) preallocation=<str> - Preallocation mode (allowed values: off, metadata, falloc, full) @@ -353,6 +361,7 @@ Supported options: encrypt.ivgen-hash-alg=<str> - Name of IV generator hash algorithm encrypt.key-secret=<str> - ID of secret providing qcow AES key or LUKS passphrase encryption=<bool (on/off)> - Encrypt the image with format 'aes'. (Deprecated in favor of encrypt.format=aes) + extent_size_hint=<size> - Extent size hint for the image file, 0 to disable lazy_refcounts=<bool (on/off)> - Postpone refcount updates nocow=<bool (on/off)> - Turn off copy-on-write (valid only on btrfs) preallocation=<str> - Preallocation mode (allowed values: off, metadata, falloc, full) @@ -377,6 +386,7 @@ Supported options: encrypt.ivgen-hash-alg=<str> - Name of IV generator hash algorithm encrypt.key-secret=<str> - ID of secret providing qcow AES key or LUKS passphrase encryption=<bool (on/off)> - Encrypt the image with format 'aes'. 
(Deprecated in favor of encrypt.format=aes) + extent_size_hint=<size> - Extent size hint for the image file, 0 to disable lazy_refcounts=<bool (on/off)> - Postpone refcount updates nocow=<bool (on/off)> - Turn off copy-on-write (valid only on btrfs) preallocation=<str> - Preallocation mode (allowed values: off, metadata, falloc, full) @@ -401,6 +411,7 @@ Supported options: encrypt.ivgen-hash-alg=<str> - Name of IV generator hash algorithm encrypt.key-secret=<str> - ID of secret providing qcow AES key or LUKS passphrase encryption=<bool (on/off)> - Encrypt the image with format 'aes'. (Deprecated in favor of encrypt.format=aes) + extent_size_hint=<size> - Extent size hint for the image file, 0 to disable lazy_refcounts=<bool (on/off)> - Postpone refcount updates nocow=<bool (on/off)> - Turn off copy-on-write (valid only on btrfs) preallocation=<str> - Preallocation mode (allowed values: off, metadata, falloc, full) @@ -425,6 +436,7 @@ Supported options: encrypt.ivgen-hash-alg=<str> - Name of IV generator hash algorithm encrypt.key-secret=<str> - ID of secret providing qcow AES key or LUKS passphrase encryption=<bool (on/off)> - Encrypt the image with format 'aes'. (Deprecated in favor of encrypt.format=aes) + extent_size_hint=<size> - Extent size hint for the image file, 0 to disable lazy_refcounts=<bool (on/off)> - Postpone refcount updates nocow=<bool (on/off)> - Turn off copy-on-write (valid only on btrfs) preallocation=<str> - Preallocation mode (allowed values: off, metadata, falloc, full) @@ -449,6 +461,7 @@ Supported options: encrypt.ivgen-hash-alg=<str> - Name of IV generator hash algorithm encrypt.key-secret=<str> - ID of secret providing qcow AES key or LUKS passphrase encryption=<bool (on/off)> - Encrypt the image with format 'aes'. 
(Deprecated in favor of encrypt.format=aes) + extent_size_hint=<size> - Extent size hint for the image file, 0 to disable lazy_refcounts=<bool (on/off)> - Postpone refcount updates nocow=<bool (on/off)> - Turn off copy-on-write (valid only on btrfs) preallocation=<str> - Preallocation mode (allowed values: off, metadata, falloc, full) @@ -473,6 +486,7 @@ Supported options: encrypt.ivgen-hash-alg=<str> - Name of IV generator hash algorithm encrypt.key-secret=<str> - ID of secret providing qcow AES key or LUKS passphrase encryption=<bool (on/off)> - Encrypt the image with format 'aes'. (Deprecated in favor of encrypt.format=aes) + extent_size_hint=<size> - Extent size hint for the image file, 0 to disable lazy_refcounts=<bool (on/off)> - Postpone refcount updates nocow=<bool (on/off)> - Turn off copy-on-write (valid only on btrfs) preallocation=<str> - Preallocation mode (allowed values: off, metadata, falloc, full) @@ -497,6 +511,7 @@ Supported options: encrypt.ivgen-hash-alg=<str> - Name of IV generator hash algorithm encrypt.key-secret=<str> - ID of secret providing qcow AES key or LUKS passphrase encryption=<bool (on/off)> - Encrypt the image with format 'aes'. (Deprecated in favor of encrypt.format=aes) + extent_size_hint=<size> - Extent size hint for the image file, 0 to disable lazy_refcounts=<bool (on/off)> - Postpone refcount updates nocow=<bool (on/off)> - Turn off copy-on-write (valid only on btrfs) preallocation=<str> - Preallocation mode (allowed values: off, metadata, falloc, full) @@ -521,6 +536,7 @@ Supported options: encrypt.ivgen-hash-alg=<str> - Name of IV generator hash algorithm encrypt.key-secret=<str> - ID of secret providing qcow AES key or LUKS passphrase encryption=<bool (on/off)> - Encrypt the image with format 'aes'. 
(Deprecated in favor of encrypt.format=aes) + extent_size_hint=<size> - Extent size hint for the image file, 0 to disable lazy_refcounts=<bool (on/off)> - Postpone refcount updates nocow=<bool (on/off)> - Turn off copy-on-write (valid only on btrfs) preallocation=<str> - Preallocation mode (allowed values: off, metadata, falloc, full) diff --git a/tests/qemu-iotests/106 b/tests/qemu-iotests/106 index b5d1ec4078..a20659d443 100755 --- a/tests/qemu-iotests/106 +++ b/tests/qemu-iotests/106 @@ -51,7 +51,10 @@ for create_mode in off falloc full; do echo echo "--- create_mode=$create_mode growth_mode=$growth_mode ---" - _make_test_img -o "preallocation=$create_mode" ${CREATION_SIZE}K + # Our calculation below assumes kilobytes as unit for the actual size. + # Disable the extent size hint because it would give us a result in + # megabytes. + _make_test_img -o "preallocation=$create_mode,extent_size_hint=0" ${CREATION_SIZE}K $QEMU_IMG resize -f "$IMGFMT" --preallocation=$growth_mode "$TEST_IMG" +${GROWTH_SIZE}K expected_size=0 @@ -98,7 +101,7 @@ for growth_mode in falloc full; do # plain int. We should use the correct type for the result, and # this tests we do. 
- _make_test_img 2G + _make_test_img -o "extent_size_hint=0" 2G $QEMU_IMG resize -f "$IMGFMT" --preallocation=$growth_mode "$TEST_IMG" +${GROWTH_SIZE}K actual_size=$($QEMU_IMG info -f "$IMGFMT" "$TEST_IMG" | grep 'disk size') diff --git a/tests/qemu-iotests/175 b/tests/qemu-iotests/175 index 020ed8e61f..00a626aa63 100755 --- a/tests/qemu-iotests/175 +++ b/tests/qemu-iotests/175 @@ -89,20 +89,20 @@ min_blocks=$(stat -c '%b' "$TEST_DIR/empty") echo echo "== creating image with default preallocation ==" -_make_test_img $size | _filter_imgfmt +_make_test_img -o extent_size_hint=0 $size | _filter_imgfmt stat -c "size=%s, blocks=%b" $TEST_IMG | _filter_blocks $extra_blocks $min_blocks $size for mode in off full falloc; do echo echo "== creating image with preallocation $mode ==" - _make_test_img -o preallocation=$mode $size | _filter_imgfmt + _make_test_img -o preallocation=$mode,extent_size_hint=0 $size | _filter_imgfmt stat -c "size=%s, blocks=%b" $TEST_IMG | _filter_blocks $extra_blocks $min_blocks $size done for new_size in 4096 1048576; do echo echo "== resize empty image with block_resize ==" - _make_test_img 0 | _filter_imgfmt + _make_test_img -o extent_size_hint=0 0 | _filter_imgfmt _block_resize $TEST_IMG $new_size >/dev/null stat -c "size=%s, blocks=%b" $TEST_IMG | _filter_blocks $extra_blocks $min_blocks $new_size done diff --git a/tests/qemu-iotests/243 b/tests/qemu-iotests/243 index a61852f6d9..17388a4644 100755 --- a/tests/qemu-iotests/243 +++ b/tests/qemu-iotests/243 @@ -51,7 +51,7 @@ for mode in off metadata falloc full; do echo "=== preallocation=$mode ===" echo - _make_test_img -o "preallocation=$mode" 64M + _make_test_img -o "preallocation=$mode,extent_size_hint=0" 64M printf "File size: " du -b $TEST_IMG | cut -f1 @@ -68,7 +68,8 @@ for mode in off metadata falloc full; do echo "=== External data file: preallocation=$mode ===" echo - _make_test_img -o "data_file=$TEST_IMG.data,preallocation=$mode" 64M + _make_test_img \ + -o 
"data_file=$TEST_IMG.data,preallocation=$mode,extent_size_hint=0" 64M echo -n "qcow2 file size: " du -b $TEST_IMG | cut -f1 @@ -79,7 +80,7 @@ for mode in off metadata falloc full; do echo -n "qcow2 disk usage: " [ $(du -B1 $TEST_IMG | cut -f1) -lt 1048576 ] && echo "low" || echo "high" echo -n "data disk usage: " - [ $(du -B1 $TEST_IMG.data | cut -f1) -lt 1048576 ] && echo "low" || echo "high" + [ $(du -B1 $TEST_IMG.data | cut -f1) -lt 2097152 ] && echo "low" || echo "high" done |