diff options
author | Peter Maydell <peter.maydell@linaro.org> | 2016-03-09 01:07:16 +0000 |
---|---|---|
committer | Peter Maydell <peter.maydell@linaro.org> | 2016-03-09 01:07:16 +0000 |
commit | 8519c8e0730039e2925ddb7cc8cfa588a1ef9d13 (patch) | |
tree | 018507e35db6cfd73f69341c94da93e7de8109c5 | |
parent | 3293680dc79c9a4fbbc2bbdd9395a886825a87a2 (diff) | |
parent | 28b90d9c19d368645f475e36297ca21c53c38799 (diff) |
Merge remote-tracking branch 'remotes/amit-migration/tags/migration-for-2.6-6' into staging
migration:
* add avx2 instruction optimization, speeds up zero-page checking on
compatible architectures and compilers (gcc 4.9+)
* add additional postcopy stats to 'info migrate' output
# gpg: Signature made Tue 08 Mar 2016 11:29:48 GMT using RSA key ID 854083B6
# gpg: Good signature from "Amit Shah <amit@amitshah.net>"
# gpg: aka "Amit Shah <amit@kernel.org>"
# gpg: aka "Amit Shah <amitshah@gmx.net>"
* remotes/amit-migration/tags/migration-for-2.6-6:
cutils: add avx2 instruction optimization
configure: detect ifunc and avx2 attribute
Postcopy: Fix sync count in info migrate
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
-rwxr-xr-x | configure | 21 | ||||
-rw-r--r-- | include/qemu-common.h | 8 | ||||
-rw-r--r-- | migration/migration.c | 1 | ||||
-rw-r--r-- | util/cutils.c | 124 |
4 files changed, 143 insertions, 11 deletions
@@ -280,6 +280,7 @@ libusb="" usb_redir="" opengl="" opengl_dmabuf="no" +avx2_opt="no" zlib="yes" lzo="" snappy="" @@ -1773,6 +1774,21 @@ EOF fi ########################################## +# avx2 optimization requirement check + +cat > $TMPC << EOF +static void bar(void) {} +static void *bar_ifunc(void) {return (void*) bar;} +static void foo(void) __attribute__((ifunc("bar_ifunc"))); +int main(void) { foo(); return 0; } +EOF +if compile_prog "-mavx2" "" ; then + if readelf --syms $TMPE |grep "IFUNC.*foo" >/dev/null 2>&1; then + avx2_opt="yes" + fi +fi + +######################################### # zlib check if test "$zlib" != "no" ; then @@ -4790,6 +4806,7 @@ echo "bzip2 support $bzip2" echo "NUMA host support $numa" echo "tcmalloc support $tcmalloc" echo "jemalloc support $jemalloc" +echo "avx2 optimization $avx2_opt" if test "$sdl_too_old" = "yes"; then echo "-> Your SDL version is too old - please upgrade to have SDL support" @@ -5178,6 +5195,10 @@ if test "$opengl" = "yes" ; then fi fi +if test "$avx2_opt" = "yes" ; then + echo "CONFIG_AVX2_OPT=y" >> $config_host_mak +fi + if test "$lzo" = "yes" ; then echo "CONFIG_LZO=y" >> $config_host_mak fi diff --git a/include/qemu-common.h b/include/qemu-common.h index ced2994402..887ca71c8a 100644 --- a/include/qemu-common.h +++ b/include/qemu-common.h @@ -476,13 +476,7 @@ void qemu_hexdump(const char *buf, FILE *fp, const char *prefix, size_t size); #endif #define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR 8 -static inline bool -can_use_buffer_find_nonzero_offset(const void *buf, size_t len) -{ - return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR - * sizeof(VECTYPE)) == 0 - && ((uintptr_t) buf) % sizeof(VECTYPE) == 0); -} +bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len); size_t buffer_find_nonzero_offset(const void *buf, size_t len); /* diff --git a/migration/migration.c b/migration/migration.c index 0129d9f420..7d13377b8e 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -635,6 +635,7 @@ MigrationInfo *qmp_query_migrate(Error **errp) info->ram->normal_bytes = norm_mig_bytes_transferred(); info->ram->dirty_pages_rate = s->dirty_pages_rate; info->ram->mbps = s->mbps; + info->ram->dirty_sync_count = s->dirty_sync_count; if (blk_mig_active()) { info->has_disk = true; diff --git a/util/cutils.c b/util/cutils.c index 59e1f70d5f..c3dd53453a 100644 --- a/util/cutils.c +++ b/util/cutils.c @@ -160,6 +160,14 @@ int qemu_fdatasync(int fd) #endif } +static bool +can_use_buffer_find_nonzero_offset_inner(const void *buf, size_t len) +{ + return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR + * sizeof(VECTYPE)) == 0 + && ((uintptr_t) buf) % sizeof(VECTYPE) == 0); +} + /* * Searches for an area with non-zero content in a buffer * @@ -168,8 +176,8 @@ int qemu_fdatasync(int fd) * and addr must be a multiple of sizeof(VECTYPE) due to * restriction of optimizations in this function. * - * can_use_buffer_find_nonzero_offset() can be used to check - * these requirements. + * can_use_buffer_find_nonzero_offset_inner() can be used to + * check these requirements. * * The return value is the offset of the non-zero area rounded * down to a multiple of sizeof(VECTYPE) for the first @@ -180,13 +188,13 @@ int qemu_fdatasync(int fd) * If the buffer is all zero the return value is equal to len. */ -size_t buffer_find_nonzero_offset(const void *buf, size_t len) +static size_t buffer_find_nonzero_offset_inner(const void *buf, size_t len) { const VECTYPE *p = buf; const VECTYPE zero = (VECTYPE){0}; size_t i; - assert(can_use_buffer_find_nonzero_offset(buf, len)); + assert(can_use_buffer_find_nonzero_offset_inner(buf, len)); if (!len) { return 0; @@ -216,6 +224,114 @@ size_t buffer_find_nonzero_offset(const void *buf, size_t len) } /* + * GCC before version 4.9 has a bug which will cause the target + * attribute work incorrectly and failed to compile in some case, + * restrict the gcc version to 4.9+ to prevent the failure. + */ + +#if defined CONFIG_AVX2_OPT && QEMU_GNUC_PREREQ(4, 9) +#pragma GCC push_options +#pragma GCC target("avx2") +#include <cpuid.h> +#include <immintrin.h> + +#define AVX2_VECTYPE __m256i +#define AVX2_SPLAT(p) _mm256_set1_epi8(*(p)) +#define AVX2_ALL_EQ(v1, v2) \ + (_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, v2)) == 0xFFFFFFFF) +#define AVX2_VEC_OR(v1, v2) (_mm256_or_si256(v1, v2)) + +static bool +can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len) +{ + return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR + * sizeof(AVX2_VECTYPE)) == 0 + && ((uintptr_t) buf) % sizeof(AVX2_VECTYPE) == 0); +} + +static size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len) +{ + const AVX2_VECTYPE *p = buf; + const AVX2_VECTYPE zero = (AVX2_VECTYPE){0}; + size_t i; + + assert(can_use_buffer_find_nonzero_offset_avx2(buf, len)); + + if (!len) { + return 0; + } + + for (i = 0; i < BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; i++) { + if (!AVX2_ALL_EQ(p[i], zero)) { + return i * sizeof(AVX2_VECTYPE); + } + } + + for (i = BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; + i < len / sizeof(AVX2_VECTYPE); + i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) { + AVX2_VECTYPE tmp0 = AVX2_VEC_OR(p[i + 0], p[i + 1]); + AVX2_VECTYPE tmp1 = AVX2_VEC_OR(p[i + 2], p[i + 3]); + AVX2_VECTYPE tmp2 = AVX2_VEC_OR(p[i + 4], p[i + 5]); + AVX2_VECTYPE tmp3 = AVX2_VEC_OR(p[i + 6], p[i + 7]); + AVX2_VECTYPE tmp01 = AVX2_VEC_OR(tmp0, tmp1); + AVX2_VECTYPE tmp23 = AVX2_VEC_OR(tmp2, tmp3); + if (!AVX2_ALL_EQ(AVX2_VEC_OR(tmp01, tmp23), zero)) { + break; + } + } + + return i * sizeof(AVX2_VECTYPE); +} + +static bool avx2_support(void) +{ + int a, b, c, d; + + if (__get_cpuid_max(0, NULL) < 7) { + return false; + } + + __cpuid_count(7, 0, a, b, c, d); + + return b & bit_AVX2; +} + +bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len) \ + __attribute__ ((ifunc("can_use_buffer_find_nonzero_offset_ifunc"))); +size_t buffer_find_nonzero_offset(const void *buf, size_t len) \ + __attribute__ ((ifunc("buffer_find_nonzero_offset_ifunc"))); + +static void *buffer_find_nonzero_offset_ifunc(void) +{ + typeof(buffer_find_nonzero_offset) *func = (avx2_support()) ? + buffer_find_nonzero_offset_avx2 : buffer_find_nonzero_offset_inner; + + return func; +} + +static void *can_use_buffer_find_nonzero_offset_ifunc(void) +{ + typeof(can_use_buffer_find_nonzero_offset) *func = (avx2_support()) ? + can_use_buffer_find_nonzero_offset_avx2 : + can_use_buffer_find_nonzero_offset_inner; + + return func; +} +#pragma GCC pop_options +#else +bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len) +{ + return can_use_buffer_find_nonzero_offset_inner(buf, len); +} + +size_t buffer_find_nonzero_offset(const void *buf, size_t len) +{ + return buffer_find_nonzero_offset_inner(buf, len); +} +#endif + +/* * Checks if a buffer is all zeroes * * Attention! The len must be a multiple of 4 * sizeof(long) due to |