diff options
author | Wladimir J. van der Laan <laanwj@gmail.com> | 2018-07-09 20:33:41 +0200 |
---|---|---|
committer | Wladimir J. van der Laan <laanwj@gmail.com> | 2018-07-09 21:17:18 +0200 |
commit | 3a3eabef40979b5b136b8bd81a65c228c8b8895d (patch) | |
tree | f4afb0a04507acf4b330965b8f65f27b39eea77e /configure.ac | |
parent | 7e74c54fed364a2974b6033da12de65abc07df93 (diff) | |
parent | 66b2cf1ccfad545a8ec3f2a854e23f647322bf30 (diff) |
Merge #13386: SHA256 implementations based on Intel SHA Extensions
66b2cf1ccfad545a8ec3f2a854e23f647322bf30 Use immintrin.h everywhere for intrinsics (Pieter Wuille)
4c935e2eee456ff66cdfb908b0edffdd1e8a6c04 Add SHA256 implementation using using Intel SHA intrinsics (Pieter Wuille)
268400d3188200c9e3dcd3482c4853354388a721 [Refactor] CPU feature detection logic for SHA256 (Pieter Wuille)
Pull request description:
Based on #13191.
This adds SHA256 implementations that use Intel's SHA Extension instructions (using intrinsics). This needs GCC 4.9 or Clang 3.4.
In addition to #13191, two extra implementations are provided:
* (a) A variable-length SHA256 implementation using SHA extensions.
* (b) A 2-way 64-byte input double-SHA256 implementation using SHA extensions.
Benchmarks for 9001-element Merkle tree root computation on an AMD Ryzen 1800X system:
* Using generic C++ code (pre-#10821): 6.1ms
* Using SSE4 (master, #10821): 4.6ms
* Using 4-way SSE4 specialized for 64-byte inputs (#13191): 2.8ms
* Using 8-way AVX2 specialized for 64-byte inputs (#13191): 2.1ms
* Using 2-way SHA-NI specialized for 64-byte inputs (this PR): 0.56ms
Benchmarks for 32-byte SHA256 on the same system:
* Using SSE4 (master, #10821): 190ns
* Using SHA-NI (this PR): 53ns
Benchmarks for 1000000-byte SHA256 on the same system:
* Using SSE4 (master, #10821): 2.5ms
* Using SHA-NI (this PR): 0.51ms
Tree-SHA512: 2b319e33b22579f815d91f9daf7994a5e1e799c4f73c13e15070dd54ba71f3f6438ccf77ae9cbd1ce76f972d9cbeb5f0edfea3d86f101bbc1055db70e42743b7
Diffstat (limited to 'configure.ac')
-rw-r--r-- | configure.ac | 28 |
1 files changed, 20 insertions, 8 deletions
diff --git a/configure.ac b/configure.ac index 9fba9d0851..490b638ee1 100644 --- a/configure.ac +++ b/configure.ac @@ -320,6 +320,7 @@ fi AX_CHECK_COMPILE_FLAG([-msse4.2],[[SSE42_CXXFLAGS="-msse4.2"]],,[[$CXXFLAG_WERROR]]) AX_CHECK_COMPILE_FLAG([-msse4.1],[[SSE41_CXXFLAGS="-msse4.1"]],,[[$CXXFLAG_WERROR]]) AX_CHECK_COMPILE_FLAG([-mavx -mavx2],[[AVX2_CXXFLAGS="-mavx -mavx2"]],,[[$CXXFLAG_WERROR]]) +AX_CHECK_COMPILE_FLAG([-msse4 -msha],[[SHANI_CXXFLAGS="-msse4 -msha"]],,[[$CXXFLAG_WERROR]]) TEMP_CXXFLAGS="$CXXFLAGS" CXXFLAGS="$CXXFLAGS $SSE42_CXXFLAGS" @@ -348,11 +349,7 @@ CXXFLAGS="$CXXFLAGS $SSE41_CXXFLAGS" AC_MSG_CHECKING(for SSE4.1 intrinsics) AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ #include <stdint.h> - #if defined(_MSC_VER) #include <immintrin.h> - #elif defined(__GNUC__) - #include <x86intrin.h> - #endif ]],[[ __m128i l = _mm_set1_epi32(0); return _mm_extract_epi32(l, 3); @@ -367,11 +364,7 @@ CXXFLAGS="$CXXFLAGS $AVX2_CXXFLAGS" AC_MSG_CHECKING(for AVX2 intrinsics) AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ #include <stdint.h> - #if defined(_MSC_VER) #include <immintrin.h> - #elif defined(__GNUC__) && defined(__AVX2__) - #include <x86intrin.h> - #endif ]],[[ __m256i l = _mm256_set1_epi32(0); return _mm256_extract_epi32(l, 7); @@ -381,6 +374,23 @@ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ ) CXXFLAGS="$TEMP_CXXFLAGS" +TEMP_CXXFLAGS="$CXXFLAGS" +CXXFLAGS="$CXXFLAGS $SHANI_CXXFLAGS" +AC_MSG_CHECKING(for SHA-NI intrinsics) +AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ + #include <stdint.h> + #include <immintrin.h> + ]],[[ + __m128i i = _mm_set1_epi32(0); + __m128i j = _mm_set1_epi32(1); + __m128i k = _mm_set1_epi32(2); + return _mm_extract_epi32(_mm_sha256rnds2_epu32(i, i, k), 0); + ]])], + [ AC_MSG_RESULT(yes); enable_shani=yes; AC_DEFINE(ENABLE_SHANI, 1, [Define this symbol to build code that uses SHA-NI intrinsics]) ], + [ AC_MSG_RESULT(no)] +) +CXXFLAGS="$TEMP_CXXFLAGS" + CPPFLAGS="$CPPFLAGS -DHAVE_BUILD_INFO -D__STDC_FORMAT_MACROS" AC_ARG_WITH([utils], @@ -1309,6 +1319,7 @@ AM_CONDITIONAL([HARDEN],[test x$use_hardening = xyes]) AM_CONDITIONAL([ENABLE_HWCRC32],[test x$enable_hwcrc32 = xyes]) AM_CONDITIONAL([ENABLE_SSE41],[test x$enable_sse41 = xyes]) AM_CONDITIONAL([ENABLE_AVX2],[test x$enable_avx2 = xyes]) +AM_CONDITIONAL([ENABLE_SHANI],[test x$enable_shani = xyes]) AM_CONDITIONAL([USE_ASM],[test x$use_asm = xyes]) AC_DEFINE(CLIENT_VERSION_MAJOR, _CLIENT_VERSION_MAJOR, [Major version]) @@ -1353,6 +1364,7 @@ AC_SUBST(SANITIZER_LDFLAGS) AC_SUBST(SSE42_CXXFLAGS) AC_SUBST(SSE41_CXXFLAGS) AC_SUBST(AVX2_CXXFLAGS) +AC_SUBST(SHANI_CXXFLAGS) AC_SUBST(LIBTOOL_APP_LDFLAGS) AC_SUBST(USE_UPNP) AC_SUBST(USE_QRCODE) |