Merge remote-tracking branch 'remotes/amit-migration/tags/migration-for-2.6-6' into staging

migration: * add avx2 instruction optimization, speeds up zero-page checking on compatible architectures and compilers (gcc 4.9+) * add additional postcopy stats to 'info migrate' output # gpg: Signature made Tue 08 Mar 2016 11:29:48 GMT using RSA key ID 854083B6 # gpg: Good signature from "Amit Shah <amit@amitshah.net>" # gpg: aka "Amit Shah <amit@kernel.org>" # gpg: aka "Amit Shah <amitshah@gmx.net>" * remotes/amit-migration/tags/migration-for-2.6-6: cutils: add avx2 instruction optimization configure: detect ifunc and avx2 attribute Postcopy: Fix sync count in info migrate Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
author: Peter Maydell <peter.maydell@linaro.org> 2016-03-09 01:07:16 +0000
committer: Peter Maydell <peter.maydell@linaro.org> 2016-03-09 01:07:16 +0000
commit: 8519c8e0730039e2925ddb7cc8cfa588a1ef9d13 (patch)
tree: 018507e35db6cfd73f69341c94da93e7de8109c5
parent: 3293680dc79c9a4fbbc2bbdd9395a886825a87a2 (diff)
parent: 28b90d9c19d368645f475e36297ca21c53c38799 (diff)
4 files changed, 143 insertions, 11 deletions
diff --git a/configure b/configure
index 0c0472a7da..2b3287641a 100755
--- a/configure
+++ b/configure
@@ -280,6 +280,7 @@ libusb=""
 usb_redir=""
 opengl=""
 opengl_dmabuf="no"
+avx2_opt="no"
 zlib="yes"
 lzo=""
 snappy=""
@@ -1773,6 +1774,21 @@ EOF
 fi
 
 ##########################################
+# avx2 optimization requirement check
+
+cat > $TMPC << EOF
+static void bar(void) {}
+static void *bar_ifunc(void) {return (void*) bar;}
+static void foo(void) __attribute__((ifunc("bar_ifunc")));
+int main(void) { foo(); return 0; }
+EOF
+if compile_prog "-mavx2" "" ; then
+    if readelf --syms $TMPE |grep "IFUNC.*foo" >/dev/null 2>&1; then
+        avx2_opt="yes"
+    fi
+fi
+
+#########################################
 # zlib check
 
 if test "$zlib" != "no" ; then
@@ -4790,6 +4806,7 @@ echo "bzip2 support     $bzip2"
 echo "NUMA host support $numa"
 echo "tcmalloc support  $tcmalloc"
 echo "jemalloc support  $jemalloc"
+echo "avx2 optimization $avx2_opt"
 
 if test "$sdl_too_old" = "yes"; then
 echo "-> Your SDL version is too old - please upgrade to have SDL support"
@@ -5178,6 +5195,10 @@ if test "$opengl" = "yes" ; then
   fi
 fi
 
+if test "$avx2_opt" = "yes" ; then
+  echo "CONFIG_AVX2_OPT=y" >> $config_host_mak
+fi
+
 if test "$lzo" = "yes" ; then
   echo "CONFIG_LZO=y" >> $config_host_mak
 fi
diff --git a/include/qemu-common.h b/include/qemu-common.h
index ced2994402..887ca71c8a 100644
--- a/include/qemu-common.h
+++ b/include/qemu-common.h
@@ -476,13 +476,7 @@ void qemu_hexdump(const char *buf, FILE *fp, const char *prefix, size_t size);
 #endif
 
 #define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR 8
-static inline bool
-can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
-{
-    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
-                   * sizeof(VECTYPE)) == 0
-            && ((uintptr_t) buf) % sizeof(VECTYPE) == 0);
-}
+bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len);
 size_t buffer_find_nonzero_offset(const void *buf, size_t len);
 
 /*
diff --git a/migration/migration.c b/migration/migration.c
index 0129d9f420..7d13377b8e 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -635,6 +635,7 @@ MigrationInfo *qmp_query_migrate(Error **errp)
         info->ram->normal_bytes = norm_mig_bytes_transferred();
         info->ram->dirty_pages_rate = s->dirty_pages_rate;
         info->ram->mbps = s->mbps;
+        info->ram->dirty_sync_count = s->dirty_sync_count;
 
         if (blk_mig_active()) {
             info->has_disk = true;
diff --git a/util/cutils.c b/util/cutils.c
index 59e1f70d5f..c3dd53453a 100644
--- a/util/cutils.c
+++ b/util/cutils.c
@@ -160,6 +160,14 @@ int qemu_fdatasync(int fd)
 #endif
 }
 
+static bool
+can_use_buffer_find_nonzero_offset_inner(const void *buf, size_t len)
+{
+    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
+                   * sizeof(VECTYPE)) == 0
+            && ((uintptr_t) buf) % sizeof(VECTYPE) == 0);
+}
+
 /*
  * Searches for an area with non-zero content in a buffer
  *
@@ -168,8 +176,8 @@ int qemu_fdatasync(int fd)
  * and addr must be a multiple of sizeof(VECTYPE) due to
  * restriction of optimizations in this function.
  *
- * can_use_buffer_find_nonzero_offset() can be used to check
- * these requirements.
+ * can_use_buffer_find_nonzero_offset_inner() can be used to
+ * check these requirements.
  *
  * The return value is the offset of the non-zero area rounded
  * down to a multiple of sizeof(VECTYPE) for the first
@@ -180,13 +188,13 @@ int qemu_fdatasync(int fd)
  * If the buffer is all zero the return value is equal to len.
  */
 
-size_t buffer_find_nonzero_offset(const void *buf, size_t len)
+static size_t buffer_find_nonzero_offset_inner(const void *buf, size_t len)
 {
     const VECTYPE *p = buf;
     const VECTYPE zero = (VECTYPE){0};
     size_t i;
 
-    assert(can_use_buffer_find_nonzero_offset(buf, len));
+    assert(can_use_buffer_find_nonzero_offset_inner(buf, len));
 
     if (!len) {
         return 0;
@@ -216,6 +224,114 @@ size_t buffer_find_nonzero_offset(const void *buf, size_t len)
 }
 
 /*
+ * GCC before version 4.9 has a bug which will cause the target
+ * attribute work incorrectly and failed to compile in some case,
+ * restrict the gcc version to 4.9+ to prevent the failure.
+ */
+
+#if defined CONFIG_AVX2_OPT && QEMU_GNUC_PREREQ(4, 9)
+#pragma GCC push_options
+#pragma GCC target("avx2")
+#include <cpuid.h>
+#include <immintrin.h>
+
+#define AVX2_VECTYPE        __m256i
+#define AVX2_SPLAT(p)       _mm256_set1_epi8(*(p))
+#define AVX2_ALL_EQ(v1, v2) \
+    (_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, v2)) == 0xFFFFFFFF)
+#define AVX2_VEC_OR(v1, v2) (_mm256_or_si256(v1, v2))
+
+static bool
+can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
+{
+    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
+                   * sizeof(AVX2_VECTYPE)) == 0
+            && ((uintptr_t) buf) % sizeof(AVX2_VECTYPE) == 0);
+}
+
+static size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
+{
+    const AVX2_VECTYPE *p = buf;
+    const AVX2_VECTYPE zero = (AVX2_VECTYPE){0};
+    size_t i;
+
+    assert(can_use_buffer_find_nonzero_offset_avx2(buf, len));
+
+    if (!len) {
+        return 0;
+    }
+
+    for (i = 0; i < BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; i++) {
+        if (!AVX2_ALL_EQ(p[i], zero)) {
+            return i * sizeof(AVX2_VECTYPE);
+        }
+    }
+
+    for (i = BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR;
+         i < len / sizeof(AVX2_VECTYPE);
+         i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) {
+        AVX2_VECTYPE tmp0 = AVX2_VEC_OR(p[i + 0], p[i + 1]);
+        AVX2_VECTYPE tmp1 = AVX2_VEC_OR(p[i + 2], p[i + 3]);
+        AVX2_VECTYPE tmp2 = AVX2_VEC_OR(p[i + 4], p[i + 5]);
+        AVX2_VECTYPE tmp3 = AVX2_VEC_OR(p[i + 6], p[i + 7]);
+        AVX2_VECTYPE tmp01 = AVX2_VEC_OR(tmp0, tmp1);
+        AVX2_VECTYPE tmp23 = AVX2_VEC_OR(tmp2, tmp3);
+        if (!AVX2_ALL_EQ(AVX2_VEC_OR(tmp01, tmp23), zero)) {
+            break;
+        }
+    }
+
+    return i * sizeof(AVX2_VECTYPE);
+}
+
+static bool avx2_support(void)
+{
+    int a, b, c, d;
+
+    if (__get_cpuid_max(0, NULL) < 7) {
+        return false;
+    }
+
+    __cpuid_count(7, 0, a, b, c, d);
+
+    return b & bit_AVX2;
+}
+
+bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len) \
+         __attribute__ ((ifunc("can_use_buffer_find_nonzero_offset_ifunc")));
+size_t buffer_find_nonzero_offset(const void *buf, size_t len) \
+         __attribute__ ((ifunc("buffer_find_nonzero_offset_ifunc")));
+
+static void *buffer_find_nonzero_offset_ifunc(void)
+{
+    typeof(buffer_find_nonzero_offset) *func = (avx2_support()) ?
+        buffer_find_nonzero_offset_avx2 : buffer_find_nonzero_offset_inner;
+
+    return func;
+}
+
+static void *can_use_buffer_find_nonzero_offset_ifunc(void)
+{
+    typeof(can_use_buffer_find_nonzero_offset) *func = (avx2_support()) ?
+        can_use_buffer_find_nonzero_offset_avx2 :
+        can_use_buffer_find_nonzero_offset_inner;
+
+    return func;
+}
+#pragma GCC pop_options
+#else
+bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
+{
+    return can_use_buffer_find_nonzero_offset_inner(buf, len);
+}
+
+size_t buffer_find_nonzero_offset(const void *buf, size_t len)
+{
+    return buffer_find_nonzero_offset_inner(buf, len);
+}
+#endif
+
+/*
  * Checks if a buffer is all zeroes
  *
  * Attention! The len must be a multiple of 4 * sizeof(long) due to
author	Peter Maydell <peter.maydell@linaro.org>	2016-03-09 01:07:16 +0000
committer	Peter Maydell <peter.maydell@linaro.org>	2016-03-09 01:07:16 +0000
commit	8519c8e0730039e2925ddb7cc8cfa588a1ef9d13 (patch)
tree	018507e35db6cfd73f69341c94da93e7de8109c5
parent	3293680dc79c9a4fbbc2bbdd9395a886825a87a2 (diff)
parent	28b90d9c19d368645f475e36297ca21c53c38799 (diff)