diff options
author | MacroFake <falke.marco@gmail.com> | 2022-05-09 13:56:32 +0200 |
---|---|---|
committer | MacroFake <falke.marco@gmail.com> | 2022-05-09 13:56:36 +0200 |
commit | dab18f03f725d8a0fc3abb3d5af5f6b8facb5f6b (patch) | |
tree | c1f0600d51d81822713849033cf51d05236de522 /src | |
parent | 8abe79aedd0ba129e0fd3bcd971e8733d22fb3c4 (diff) | |
parent | 81c09ee45caecf8d9daf6766b94cebf54f3f08cd (diff) | |
download | bitcoin-dab18f03f725d8a0fc3abb3d5af5f6b8facb5f6b.tar.xz |
Merge bitcoin/bitcoin#24946: Unroll the ChaCha20 inner loop for performance
81c09ee45caecf8d9daf6766b94cebf54f3f08cd Unroll the ChaCha20 inner loop for performance (Pieter Wuille)
Pull request description:
Unrolling the inner ChaCha20 loop gives a ~15% speedup for me in the CHACHA20_* benchmarks. It's a simple change; the performance gain helps with RNG generation and will matter more for BIP324.
ACKs for top commit:
martinus:
tested ACK 81c09ee with clang++ 13.0.1, test `CHACHA20_1MB`:
MarcoFalke:
ACK 81c09ee45caecf8d9daf6766b94cebf54f3f08cd 🍟
Tree-SHA512: 108bd0ba573bb08de92d611e7be7c09a2c2700f9655f44129b87f9b71f7e101dfc6bd345783e7b4b9b40f0b003913cf59187f422da8cdb5b20887f7855b2611a
Diffstat (limited to 'src')
-rw-r--r-- | src/crypto/chacha20.cpp | 48 |
1 file changed, 28 insertions, 20 deletions
diff --git a/src/crypto/chacha20.cpp b/src/crypto/chacha20.cpp index f3ff4268ee..c7e12b0612 100644 --- a/src/crypto/chacha20.cpp +++ b/src/crypto/chacha20.cpp @@ -18,6 +18,8 @@ constexpr static inline uint32_t rotl32(uint32_t v, int c) { return (v << c) | ( a += b; d = rotl32(d ^ a, 8); \ c += d; b = rotl32(b ^ c, 7); +#define REPEAT10(a) do { {a}; {a}; {a}; {a}; {a}; {a}; {a}; {a}; {a}; {a}; } while(0) + static const unsigned char sigma[] = "expand 32-byte k"; static const unsigned char tau[] = "expand 16-byte k"; @@ -119,16 +121,19 @@ void ChaCha20::Keystream(unsigned char* c, size_t bytes) x13 = j13; x14 = j14; x15 = j15; - for (i = 20;i > 0;i -= 2) { - QUARTERROUND( x0, x4, x8,x12) - QUARTERROUND( x1, x5, x9,x13) - QUARTERROUND( x2, x6,x10,x14) - QUARTERROUND( x3, x7,x11,x15) - QUARTERROUND( x0, x5,x10,x15) - QUARTERROUND( x1, x6,x11,x12) - QUARTERROUND( x2, x7, x8,x13) - QUARTERROUND( x3, x4, x9,x14) - } + + // The 20 inner ChaCha20 rounds are unrolled here for performance. + REPEAT10( + QUARTERROUND( x0, x4, x8,x12); + QUARTERROUND( x1, x5, x9,x13); + QUARTERROUND( x2, x6,x10,x14); + QUARTERROUND( x3, x7,x11,x15); + QUARTERROUND( x0, x5,x10,x15); + QUARTERROUND( x1, x6,x11,x12); + QUARTERROUND( x2, x7, x8,x13); + QUARTERROUND( x3, x4, x9,x14); + ); + x0 += j0; x1 += j1; x2 += j2; @@ -231,16 +236,19 @@ void ChaCha20::Crypt(const unsigned char* m, unsigned char* c, size_t bytes) x13 = j13; x14 = j14; x15 = j15; - for (i = 20;i > 0;i -= 2) { - QUARTERROUND( x0, x4, x8,x12) - QUARTERROUND( x1, x5, x9,x13) - QUARTERROUND( x2, x6,x10,x14) - QUARTERROUND( x3, x7,x11,x15) - QUARTERROUND( x0, x5,x10,x15) - QUARTERROUND( x1, x6,x11,x12) - QUARTERROUND( x2, x7, x8,x13) - QUARTERROUND( x3, x4, x9,x14) - } + + // The 20 inner ChaCha20 rounds are unrolled here for performance. 
+ REPEAT10( + QUARTERROUND( x0, x4, x8,x12); + QUARTERROUND( x1, x5, x9,x13); + QUARTERROUND( x2, x6,x10,x14); + QUARTERROUND( x3, x7,x11,x15); + QUARTERROUND( x0, x5,x10,x15); + QUARTERROUND( x1, x6,x11,x12); + QUARTERROUND( x2, x7, x8,x13); + QUARTERROUND( x3, x4, x9,x14); + ); + x0 += j0; x1 += j1; x2 += j2; |