Diffstat (limited to 'src/crypto')
-rw-r--r-- | src/crypto/aes.cpp | 216
-rw-r--r-- | src/crypto/aes.h | 118
-rw-r--r-- | src/crypto/chacha20.cpp | 180
-rw-r--r-- | src/crypto/chacha20.h | 26
-rw-r--r-- | src/crypto/common.h | 103
-rw-r--r-- | src/crypto/ctaes/COPYING | 21
-rw-r--r-- | src/crypto/ctaes/README.md | 41
-rw-r--r-- | src/crypto/ctaes/bench.c | 170
-rw-r--r-- | src/crypto/ctaes/ctaes.c | 556
-rw-r--r-- | src/crypto/ctaes/ctaes.h | 41
-rw-r--r-- | src/crypto/ctaes/test.c | 110
-rw-r--r-- | src/crypto/hmac_sha256.cpp | 34
-rw-r--r-- | src/crypto/hmac_sha256.h | 32
-rw-r--r-- | src/crypto/hmac_sha512.cpp | 34
-rw-r--r-- | src/crypto/hmac_sha512.h | 32
-rw-r--r-- | src/crypto/ripemd160.cpp | 292
-rw-r--r-- | src/crypto/ripemd160.h | 28
-rw-r--r-- | src/crypto/sha1.cpp | 199
-rw-r--r-- | src/crypto/sha1.h | 28
-rw-r--r-- | src/crypto/sha256.cpp | 249
-rw-r--r-- | src/crypto/sha256.h | 34
-rw-r--r-- | src/crypto/sha256_sse4.cpp | 1506
-rw-r--r-- | src/crypto/sha512.cpp | 207
-rw-r--r-- | src/crypto/sha512.h | 28
24 files changed, 4285 insertions, 0 deletions
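Before the file diffs, a minimal round-trip sketch (not part of the patch) of how the CBC wrapper classes added in src/crypto/aes.h and src/crypto/aes.cpp below are driven. The all-zero key and IV and the sample message are placeholders for illustration; with padding enabled, Encrypt emits up to one extra block of PKCS#7-style padding, and Decrypt returns the unpadded length, or 0 if the input or padding is malformed.

    #include <crypto/aes.h>

    #include <cassert>
    #include <cstring>

    int main()
    {
        unsigned char key[AES256_KEYSIZE] = {0}; // placeholder key
        unsigned char iv[AES_BLOCKSIZE] = {0};   // placeholder IV
        const unsigned char msg[] = "attack at dawn";
        unsigned char cipher[sizeof(msg) + AES_BLOCKSIZE];
        unsigned char plain[sizeof(msg) + AES_BLOCKSIZE];

        AES256CBCEncrypt enc(key, iv, true /* pad */);
        int clen = enc.Encrypt(msg, sizeof(msg), cipher); // always a multiple of 16
        assert(clen > 0);

        AES256CBCDecrypt dec(key, iv, true /* pad */);
        int plen = dec.Decrypt(cipher, clen, plain); // strips the padding again
        assert(plen == (int)sizeof(msg));
        assert(memcmp(plain, msg, plen) == 0);
        return 0;
    }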
diff --git a/src/crypto/aes.cpp b/src/crypto/aes.cpp new file mode 100644 index 0000000000..bf7a252349 --- /dev/null +++ b/src/crypto/aes.cpp @@ -0,0 +1,216 @@ +// Copyright (c) 2016-2017 The Bitcoin Core developers +// Distributed under the MIT software license, see the accompanying +// file COPYING or http://www.opensource.org/licenses/mit-license.php. + +#include <crypto/aes.h> +#include <crypto/common.h> + +#include <assert.h> +#include <string.h> + +extern "C" { +#include <crypto/ctaes/ctaes.c> +} + +AES128Encrypt::AES128Encrypt(const unsigned char key[16]) +{ + AES128_init(&ctx, key); +} + +AES128Encrypt::~AES128Encrypt() +{ + memset(&ctx, 0, sizeof(ctx)); +} + +void AES128Encrypt::Encrypt(unsigned char ciphertext[16], const unsigned char plaintext[16]) const +{ + AES128_encrypt(&ctx, 1, ciphertext, plaintext); +} + +AES128Decrypt::AES128Decrypt(const unsigned char key[16]) +{ + AES128_init(&ctx, key); +} + +AES128Decrypt::~AES128Decrypt() +{ + memset(&ctx, 0, sizeof(ctx)); +} + +void AES128Decrypt::Decrypt(unsigned char plaintext[16], const unsigned char ciphertext[16]) const +{ + AES128_decrypt(&ctx, 1, plaintext, ciphertext); +} + +AES256Encrypt::AES256Encrypt(const unsigned char key[32]) +{ + AES256_init(&ctx, key); +} + +AES256Encrypt::~AES256Encrypt() +{ + memset(&ctx, 0, sizeof(ctx)); +} + +void AES256Encrypt::Encrypt(unsigned char ciphertext[16], const unsigned char plaintext[16]) const +{ + AES256_encrypt(&ctx, 1, ciphertext, plaintext); +} + +AES256Decrypt::AES256Decrypt(const unsigned char key[32]) +{ + AES256_init(&ctx, key); +} + +AES256Decrypt::~AES256Decrypt() +{ + memset(&ctx, 0, sizeof(ctx)); +} + +void AES256Decrypt::Decrypt(unsigned char plaintext[16], const unsigned char ciphertext[16]) const +{ + AES256_decrypt(&ctx, 1, plaintext, ciphertext); +} + + +template <typename T> +static int CBCEncrypt(const T& enc, const unsigned char iv[AES_BLOCKSIZE], const unsigned char* data, int size, bool pad, unsigned char* out) +{ + int written = 0; + int padsize = size % AES_BLOCKSIZE; + unsigned char mixed[AES_BLOCKSIZE]; + + if (!data || !size || !out) + return 0; + + if (!pad && padsize != 0) + return 0; + + memcpy(mixed, iv, AES_BLOCKSIZE); + + // Write all but the last block + while (written + AES_BLOCKSIZE <= size) { + for (int i = 0; i != AES_BLOCKSIZE; i++) + mixed[i] ^= *data++; + enc.Encrypt(out + written, mixed); + memcpy(mixed, out + written, AES_BLOCKSIZE); + written += AES_BLOCKSIZE; + } + if (pad) { + // For all that remains, pad each byte with the value of the remaining + // space. If there is none, pad by a full block. + for (int i = 0; i != padsize; i++) + mixed[i] ^= *data++; + for (int i = padsize; i != AES_BLOCKSIZE; i++) + mixed[i] ^= AES_BLOCKSIZE - padsize; + enc.Encrypt(out + written, mixed); + written += AES_BLOCKSIZE; + } + return written; +} + +template <typename T> +static int CBCDecrypt(const T& dec, const unsigned char iv[AES_BLOCKSIZE], const unsigned char* data, int size, bool pad, unsigned char* out) +{ + int written = 0; + bool fail = false; + const unsigned char* prev = iv; + + if (!data || !size || !out) + return 0; + + if (size % AES_BLOCKSIZE != 0) + return 0; + + // Decrypt all data. Padding will be checked in the output. 
+ while (written != size) { + dec.Decrypt(out, data + written); + for (int i = 0; i != AES_BLOCKSIZE; i++) + *out++ ^= prev[i]; + prev = data + written; + written += AES_BLOCKSIZE; + } + + // When decrypting padding, attempt to run in constant-time + if (pad) { + // If used, padding size is the value of the last decrypted byte. For + // it to be valid, it must be between 1 and AES_BLOCKSIZE. + unsigned char padsize = *--out; + fail = !padsize | (padsize > AES_BLOCKSIZE); + + // If not well-formed, treat it as though there's no padding. + padsize *= !fail; + + // All padding must equal the last byte, otherwise it's not well-formed + for (int i = AES_BLOCKSIZE; i != 0; i--) + fail |= ((i > AES_BLOCKSIZE - padsize) & (*out-- != padsize)); + + written -= padsize; + } + return written * !fail; +} + +AES256CBCEncrypt::AES256CBCEncrypt(const unsigned char key[AES256_KEYSIZE], const unsigned char ivIn[AES_BLOCKSIZE], bool padIn) + : enc(key), pad(padIn) +{ + memcpy(iv, ivIn, AES_BLOCKSIZE); +} + +int AES256CBCEncrypt::Encrypt(const unsigned char* data, int size, unsigned char* out) const +{ + return CBCEncrypt(enc, iv, data, size, pad, out); +} + +AES256CBCEncrypt::~AES256CBCEncrypt() +{ + memset(iv, 0, sizeof(iv)); +} + +AES256CBCDecrypt::AES256CBCDecrypt(const unsigned char key[AES256_KEYSIZE], const unsigned char ivIn[AES_BLOCKSIZE], bool padIn) + : dec(key), pad(padIn) +{ + memcpy(iv, ivIn, AES_BLOCKSIZE); +} + + +int AES256CBCDecrypt::Decrypt(const unsigned char* data, int size, unsigned char* out) const +{ + return CBCDecrypt(dec, iv, data, size, pad, out); +} + +AES256CBCDecrypt::~AES256CBCDecrypt() +{ + memset(iv, 0, sizeof(iv)); +} + +AES128CBCEncrypt::AES128CBCEncrypt(const unsigned char key[AES128_KEYSIZE], const unsigned char ivIn[AES_BLOCKSIZE], bool padIn) + : enc(key), pad(padIn) +{ + memcpy(iv, ivIn, AES_BLOCKSIZE); +} + +AES128CBCEncrypt::~AES128CBCEncrypt() +{ + memset(iv, 0, AES_BLOCKSIZE); +} + +int AES128CBCEncrypt::Encrypt(const unsigned char* data, int size, unsigned char* out) const +{ + return CBCEncrypt(enc, iv, data, size, pad, out); +} + +AES128CBCDecrypt::AES128CBCDecrypt(const unsigned char key[AES128_KEYSIZE], const unsigned char ivIn[AES_BLOCKSIZE], bool padIn) + : dec(key), pad(padIn) +{ + memcpy(iv, ivIn, AES_BLOCKSIZE); +} + +AES128CBCDecrypt::~AES128CBCDecrypt() +{ + memset(iv, 0, AES_BLOCKSIZE); +} + +int AES128CBCDecrypt::Decrypt(const unsigned char* data, int size, unsigned char* out) const +{ + return CBCDecrypt(dec, iv, data, size, pad, out); +} diff --git a/src/crypto/aes.h b/src/crypto/aes.h new file mode 100644 index 0000000000..2dec8d9558 --- /dev/null +++ b/src/crypto/aes.h @@ -0,0 +1,118 @@ +// Copyright (c) 2015-2017 The Bitcoin Core developers +// Distributed under the MIT software license, see the accompanying +// file COPYING or http://www.opensource.org/licenses/mit-license.php. +// +// C++ wrapper around ctaes, a constant-time AES implementation + +#ifndef BITCOIN_CRYPTO_AES_H +#define BITCOIN_CRYPTO_AES_H + +extern "C" { +#include <crypto/ctaes/ctaes.h> +} + +static const int AES_BLOCKSIZE = 16; +static const int AES128_KEYSIZE = 16; +static const int AES256_KEYSIZE = 32; + +/** An encryption class for AES-128. */ +class AES128Encrypt +{ +private: + AES128_ctx ctx; + +public: + explicit AES128Encrypt(const unsigned char key[16]); + ~AES128Encrypt(); + void Encrypt(unsigned char ciphertext[16], const unsigned char plaintext[16]) const; +}; + +/** A decryption class for AES-128.
*/ +class AES128Decrypt +{ +private: + AES128_ctx ctx; + +public: + explicit AES128Decrypt(const unsigned char key[16]); + ~AES128Decrypt(); + void Decrypt(unsigned char plaintext[16], const unsigned char ciphertext[16]) const; +}; + +/** An encryption class for AES-256. */ +class AES256Encrypt +{ +private: + AES256_ctx ctx; + +public: + explicit AES256Encrypt(const unsigned char key[32]); + ~AES256Encrypt(); + void Encrypt(unsigned char ciphertext[16], const unsigned char plaintext[16]) const; +}; + +/** A decryption class for AES-256. */ +class AES256Decrypt +{ +private: + AES256_ctx ctx; + +public: + explicit AES256Decrypt(const unsigned char key[32]); + ~AES256Decrypt(); + void Decrypt(unsigned char plaintext[16], const unsigned char ciphertext[16]) const; +}; + +class AES256CBCEncrypt +{ +public: + AES256CBCEncrypt(const unsigned char key[AES256_KEYSIZE], const unsigned char ivIn[AES_BLOCKSIZE], bool padIn); + ~AES256CBCEncrypt(); + int Encrypt(const unsigned char* data, int size, unsigned char* out) const; + +private: + const AES256Encrypt enc; + const bool pad; + unsigned char iv[AES_BLOCKSIZE]; +}; + +class AES256CBCDecrypt +{ +public: + AES256CBCDecrypt(const unsigned char key[AES256_KEYSIZE], const unsigned char ivIn[AES_BLOCKSIZE], bool padIn); + ~AES256CBCDecrypt(); + int Decrypt(const unsigned char* data, int size, unsigned char* out) const; + +private: + const AES256Decrypt dec; + const bool pad; + unsigned char iv[AES_BLOCKSIZE]; +}; + +class AES128CBCEncrypt +{ +public: + AES128CBCEncrypt(const unsigned char key[AES128_KEYSIZE], const unsigned char ivIn[AES_BLOCKSIZE], bool padIn); + ~AES128CBCEncrypt(); + int Encrypt(const unsigned char* data, int size, unsigned char* out) const; + +private: + const AES128Encrypt enc; + const bool pad; + unsigned char iv[AES_BLOCKSIZE]; +}; + +class AES128CBCDecrypt +{ +public: + AES128CBCDecrypt(const unsigned char key[AES128_KEYSIZE], const unsigned char ivIn[AES_BLOCKSIZE], bool padIn); + ~AES128CBCDecrypt(); + int Decrypt(const unsigned char* data, int size, unsigned char* out) const; + +private: + const AES128Decrypt dec; + const bool pad; + unsigned char iv[AES_BLOCKSIZE]; +}; + +#endif // BITCOIN_CRYPTO_AES_H diff --git a/src/crypto/chacha20.cpp b/src/crypto/chacha20.cpp new file mode 100644 index 0000000000..ac4470f04f --- /dev/null +++ b/src/crypto/chacha20.cpp @@ -0,0 +1,180 @@ +// Copyright (c) 2017 The Bitcoin Core developers +// Distributed under the MIT software license, see the accompanying +// file COPYING or http://www.opensource.org/licenses/mit-license.php. + +// Based on the public domain implementation 'merged' by D. J. Bernstein +// See https://cr.yp.to/chacha.html. 
+ +#include <crypto/common.h> +#include <crypto/chacha20.h> + +#include <string.h> + +constexpr static inline uint32_t rotl32(uint32_t v, int c) { return (v << c) | (v >> (32 - c)); } + +#define QUARTERROUND(a,b,c,d) \ + a += b; d = rotl32(d ^ a, 16); \ + c += d; b = rotl32(b ^ c, 12); \ + a += b; d = rotl32(d ^ a, 8); \ + c += d; b = rotl32(b ^ c, 7); + +static const unsigned char sigma[] = "expand 32-byte k"; +static const unsigned char tau[] = "expand 16-byte k"; + +void ChaCha20::SetKey(const unsigned char* k, size_t keylen) +{ + const unsigned char *constants; + + input[4] = ReadLE32(k + 0); + input[5] = ReadLE32(k + 4); + input[6] = ReadLE32(k + 8); + input[7] = ReadLE32(k + 12); + if (keylen == 32) { /* recommended */ + k += 16; + constants = sigma; + } else { /* keylen == 16 */ + constants = tau; + } + input[8] = ReadLE32(k + 0); + input[9] = ReadLE32(k + 4); + input[10] = ReadLE32(k + 8); + input[11] = ReadLE32(k + 12); + input[0] = ReadLE32(constants + 0); + input[1] = ReadLE32(constants + 4); + input[2] = ReadLE32(constants + 8); + input[3] = ReadLE32(constants + 12); + input[12] = 0; + input[13] = 0; + input[14] = 0; + input[15] = 0; +} + +ChaCha20::ChaCha20() +{ + memset(input, 0, sizeof(input)); +} + +ChaCha20::ChaCha20(const unsigned char* k, size_t keylen) +{ + SetKey(k, keylen); +} + +void ChaCha20::SetIV(uint64_t iv) +{ + input[14] = iv; + input[15] = iv >> 32; +} + +void ChaCha20::Seek(uint64_t pos) +{ + input[12] = pos; + input[13] = pos >> 32; +} + +void ChaCha20::Output(unsigned char* c, size_t bytes) +{ + uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; + uint32_t j0, j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11, j12, j13, j14, j15; + unsigned char *ctarget = nullptr; + unsigned char tmp[64]; + unsigned int i; + + if (!bytes) return; + + j0 = input[0]; + j1 = input[1]; + j2 = input[2]; + j3 = input[3]; + j4 = input[4]; + j5 = input[5]; + j6 = input[6]; + j7 = input[7]; + j8 = input[8]; + j9 = input[9]; + j10 = input[10]; + j11 = input[11]; + j12 = input[12]; + j13 = input[13]; + j14 = input[14]; + j15 = input[15]; + + for (;;) { + if (bytes < 64) { + ctarget = c; + c = tmp; + } + x0 = j0; + x1 = j1; + x2 = j2; + x3 = j3; + x4 = j4; + x5 = j5; + x6 = j6; + x7 = j7; + x8 = j8; + x9 = j9; + x10 = j10; + x11 = j11; + x12 = j12; + x13 = j13; + x14 = j14; + x15 = j15; + for (i = 20;i > 0;i -= 2) { + QUARTERROUND( x0, x4, x8,x12) + QUARTERROUND( x1, x5, x9,x13) + QUARTERROUND( x2, x6,x10,x14) + QUARTERROUND( x3, x7,x11,x15) + QUARTERROUND( x0, x5,x10,x15) + QUARTERROUND( x1, x6,x11,x12) + QUARTERROUND( x2, x7, x8,x13) + QUARTERROUND( x3, x4, x9,x14) + } + x0 += j0; + x1 += j1; + x2 += j2; + x3 += j3; + x4 += j4; + x5 += j5; + x6 += j6; + x7 += j7; + x8 += j8; + x9 += j9; + x10 += j10; + x11 += j11; + x12 += j12; + x13 += j13; + x14 += j14; + x15 += j15; + + ++j12; + if (!j12) ++j13; + + WriteLE32(c + 0, x0); + WriteLE32(c + 4, x1); + WriteLE32(c + 8, x2); + WriteLE32(c + 12, x3); + WriteLE32(c + 16, x4); + WriteLE32(c + 20, x5); + WriteLE32(c + 24, x6); + WriteLE32(c + 28, x7); + WriteLE32(c + 32, x8); + WriteLE32(c + 36, x9); + WriteLE32(c + 40, x10); + WriteLE32(c + 44, x11); + WriteLE32(c + 48, x12); + WriteLE32(c + 52, x13); + WriteLE32(c + 56, x14); + WriteLE32(c + 60, x15); + + if (bytes <= 64) { + if (bytes < 64) { + for (i = 0;i < bytes;++i) ctarget[i] = c[i]; + } + input[12] = j12; + input[13] = j13; + return; + } + bytes -= 64; + c += 64; + } +} diff --git a/src/crypto/chacha20.h b/src/crypto/chacha20.h new file mode 100644 index 
0000000000..a305977bcd --- /dev/null +++ b/src/crypto/chacha20.h @@ -0,0 +1,26 @@ +// Copyright (c) 2017 The Bitcoin Core developers +// Distributed under the MIT software license, see the accompanying +// file COPYING or http://www.opensource.org/licenses/mit-license.php. + +#ifndef BITCOIN_CRYPTO_CHACHA20_H +#define BITCOIN_CRYPTO_CHACHA20_H + +#include <stdint.h> +#include <stdlib.h> + +/** A PRNG class for ChaCha20. */ +class ChaCha20 +{ +private: + uint32_t input[16]; + +public: + ChaCha20(); + ChaCha20(const unsigned char* key, size_t keylen); + void SetKey(const unsigned char* key, size_t keylen); + void SetIV(uint64_t iv); + void Seek(uint64_t pos); + void Output(unsigned char* output, size_t bytes); +}; + +#endif // BITCOIN_CRYPTO_CHACHA20_H diff --git a/src/crypto/common.h b/src/crypto/common.h new file mode 100644 index 0000000000..825b430978 --- /dev/null +++ b/src/crypto/common.h @@ -0,0 +1,103 @@ +// Copyright (c) 2014-2017 The Bitcoin Core developers +// Distributed under the MIT software license, see the accompanying +// file COPYING or http://www.opensource.org/licenses/mit-license.php. + +#ifndef BITCOIN_CRYPTO_COMMON_H +#define BITCOIN_CRYPTO_COMMON_H + +#if defined(HAVE_CONFIG_H) +#include <config/bitcoin-config.h> +#endif + +#include <stdint.h> +#include <string.h> + +#include <compat/endian.h> + +uint16_t static inline ReadLE16(const unsigned char* ptr) +{ + uint16_t x; + memcpy((char*)&x, ptr, 2); + return le16toh(x); +} + +uint32_t static inline ReadLE32(const unsigned char* ptr) +{ + uint32_t x; + memcpy((char*)&x, ptr, 4); + return le32toh(x); +} + +uint64_t static inline ReadLE64(const unsigned char* ptr) +{ + uint64_t x; + memcpy((char*)&x, ptr, 8); + return le64toh(x); +} + +void static inline WriteLE16(unsigned char* ptr, uint16_t x) +{ + uint16_t v = htole16(x); + memcpy(ptr, (char*)&v, 2); +} + +void static inline WriteLE32(unsigned char* ptr, uint32_t x) +{ + uint32_t v = htole32(x); + memcpy(ptr, (char*)&v, 4); +} + +void static inline WriteLE64(unsigned char* ptr, uint64_t x) +{ + uint64_t v = htole64(x); + memcpy(ptr, (char*)&v, 8); +} + +uint32_t static inline ReadBE32(const unsigned char* ptr) +{ + uint32_t x; + memcpy((char*)&x, ptr, 4); + return be32toh(x); +} + +uint64_t static inline ReadBE64(const unsigned char* ptr) +{ + uint64_t x; + memcpy((char*)&x, ptr, 8); + return be64toh(x); +} + +void static inline WriteBE32(unsigned char* ptr, uint32_t x) +{ + uint32_t v = htobe32(x); + memcpy(ptr, (char*)&v, 4); +} + +void static inline WriteBE64(unsigned char* ptr, uint64_t x) +{ + uint64_t v = htobe64(x); + memcpy(ptr, (char*)&v, 8); +} + +/** Return the smallest number n such that (x >> n) == 0 (or 64 if the highest bit in x is set). */ +uint64_t static inline CountBits(uint64_t x) +{ +#ifdef HAVE_DECL___BUILTIN_CLZL + if (sizeof(unsigned long) >= sizeof(uint64_t)) { + return x ? 8 * sizeof(unsigned long) - __builtin_clzl(x) : 0; + } +#endif +#ifdef HAVE_DECL___BUILTIN_CLZLL + if (sizeof(unsigned long long) >= sizeof(uint64_t)) { + return x ? 8 * sizeof(unsigned long long) - __builtin_clzll(x) : 0; + } +#endif + int ret = 0; + while (x) { + x >>= 1; + ++ret; + } + return ret; +} + +#endif // BITCOIN_CRYPTO_COMMON_H diff --git a/src/crypto/ctaes/COPYING b/src/crypto/ctaes/COPYING new file mode 100644 index 0000000000..415b202a2a --- /dev/null +++ b/src/crypto/ctaes/COPYING @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2016 Pieter Wuille + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/src/crypto/ctaes/README.md b/src/crypto/ctaes/README.md new file mode 100644 index 0000000000..0e7fe17751 --- /dev/null +++ b/src/crypto/ctaes/README.md @@ -0,0 +1,41 @@ +ctaes +===== + +Simple C module for constant-time AES encryption and decryption. + +Features: +* Simple, pure C code without any dependencies. +* No tables or data-dependent branches whatsoever, but using a bitsliced approach from https://eprint.iacr.org/2009/129.pdf. +* Very small object code: slightly over 4k of executable code when compiled with -Os. +* Slower than implementations based on precomputed tables or specialized instructions, but can do ~15 MB/s on modern CPUs.
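A minimal usage sketch (an illustration based on the declarations in ctaes.h, not taken from the README itself; the all-zero key and block are placeholders): the key is expanded once with AES128_init, after which any number of 16-byte blocks can be processed:

    #include <string.h>

    #include "ctaes.h"

    int main(void) {
        static const unsigned char key[16] = {0}; /* placeholder key */
        unsigned char block[16] = {0};            /* one 16-byte AES block */
        unsigned char ct[16], pt[16];

        AES128_ctx ctx;
        AES128_init(&ctx, key);             /* constant-time key schedule */
        AES128_encrypt(&ctx, 1, ct, block); /* 1 block: ct = E_key(block) */
        AES128_decrypt(&ctx, 1, pt, ct);    /* invert: pt == block again */
        return memcmp(block, pt, 16) != 0;  /* 0 on success */
    }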
+ +Performance +----------- + +Compiled with GCC 5.3.1 with -O3, on an Intel(R) Core(TM) i7-4800MQ CPU, numbers in CPU cycles: + +| Algorithm | Key schedule | Encryption per byte | Decryption per byte | +| --------- | ------------:| -------------------:| -------------------:| +| AES-128 | 2.8k | 154 | 161 | +| AES-192 | 3.1k | 169 | 181 | +| AES-256 | 4.0k | 191 | 203 | + +Build steps +----------- + +Object code: + + $ gcc -O3 ctaes.c -c -o ctaes.o + +Tests: + + $ gcc -O3 ctaes.c test.c -o test + +Benchmark: + + $ gcc -O3 ctaes.c bench.c -o bench + +Review +------ + +Results of a formal review of the code can be found in http://bitcoin.sipa.be/ctaes/review.zip diff --git a/src/crypto/ctaes/bench.c b/src/crypto/ctaes/bench.c new file mode 100644 index 0000000000..a86df496c8 --- /dev/null +++ b/src/crypto/ctaes/bench.c @@ -0,0 +1,170 @@ +#include <stdio.h> +#include <math.h> +#include "sys/time.h" + +#include "ctaes.h" + +static double gettimedouble(void) { + struct timeval tv; + gettimeofday(&tv, NULL); + return tv.tv_usec * 0.000001 + tv.tv_sec; +} + +static void print_number(double x) { + double y = x; + int c = 0; + if (y < 0.0) { + y = -y; + } + while (y < 100.0) { + y *= 10.0; + c++; + } + printf("%.*f", c, x); +} + +static void run_benchmark(char *name, void (*benchmark)(void*), void (*setup)(void*), void (*teardown)(void*), void* data, int count, int iter) { + int i; + double min = HUGE_VAL; + double sum = 0.0; + double max = 0.0; + for (i = 0; i < count; i++) { + double begin, total; + if (setup != NULL) { + setup(data); + } + begin = gettimedouble(); + benchmark(data); + total = gettimedouble() - begin; + if (teardown != NULL) { + teardown(data); + } + if (total < min) { + min = total; + } + if (total > max) { + max = total; + } + sum += total; + } + printf("%s: min ", name); + print_number(min * 1000000000.0 / iter); + printf("ns / avg "); + print_number((sum / count) * 1000000000.0 / iter); + printf("ns / max "); + print_number(max * 1000000000.0 / iter); + printf("ns\n"); +} + +static void bench_AES128_init(void* data) { + AES128_ctx* ctx = (AES128_ctx*)data; + int i; + for (i = 0; i < 50000; i++) { + AES128_init(ctx, (unsigned char*)ctx); + } +} + +static void bench_AES128_encrypt_setup(void* data) { + AES128_ctx* ctx = (AES128_ctx*)data; + static const unsigned char key[16] = {0}; + AES128_init(ctx, key); +} + +static void bench_AES128_encrypt(void* data) { + const AES128_ctx* ctx = (const AES128_ctx*)data; + unsigned char scratch[16] = {0}; + int i; + for (i = 0; i < 4000000 / 16; i++) { + AES128_encrypt(ctx, 1, scratch, scratch); + } +} + +static void bench_AES128_decrypt(void* data) { + const AES128_ctx* ctx = (const AES128_ctx*)data; + unsigned char scratch[16] = {0}; + int i; + for (i = 0; i < 4000000 / 16; i++) { + AES128_decrypt(ctx, 1, scratch, scratch); + } +} + +static void bench_AES192_init(void* data) { + AES192_ctx* ctx = (AES192_ctx*)data; + int i; + for (i = 0; i < 50000; i++) { + AES192_init(ctx, (unsigned char*)ctx); + } +} + +static void bench_AES192_encrypt_setup(void* data) { + AES192_ctx* ctx = (AES192_ctx*)data; + static const unsigned char key[24] = {0}; + AES192_init(ctx, key); +} + +static void bench_AES192_encrypt(void* data) { + const AES192_ctx* ctx = (const AES192_ctx*)data; + unsigned char scratch[16] = {0}; + int i; + for (i = 0; i < 4000000 / 16; i++) { + AES192_encrypt(ctx, 1, scratch, scratch); + } +} + +static void bench_AES192_decrypt(void* data) { + const AES192_ctx* ctx = (const AES192_ctx*)data; + unsigned char scratch[16] = {0}; + int i; + for (i = 0; i < 4000000 / 16; i++) { + AES192_decrypt(ctx, 1, scratch, scratch); + } +} + +static void bench_AES256_init(void* data) { + AES256_ctx* ctx = (AES256_ctx*)data; + int i; + for (i = 0; i < 50000; i++) { + AES256_init(ctx, (unsigned char*)ctx); + } +} + + +static void bench_AES256_encrypt_setup(void* data) { + AES256_ctx* ctx = (AES256_ctx*)data; + static const unsigned char key[32] = {0}; + AES256_init(ctx, key); +} + +static void bench_AES256_encrypt(void* data) { + const AES256_ctx* ctx = (const AES256_ctx*)data; + unsigned char scratch[16] = {0}; + int i; + for (i = 0; i < 4000000 / 16; i++) { + AES256_encrypt(ctx, 1, scratch, scratch); + } +} + +static void bench_AES256_decrypt(void* data) { + const AES256_ctx* ctx = (const AES256_ctx*)data; + unsigned char scratch[16] = {0}; + int i; + for (i = 0; i < 4000000 / 16; i++) { + AES256_decrypt(ctx, 1, scratch, scratch); + } +} + +int main(void) { + AES128_ctx ctx128; + AES192_ctx ctx192; + AES256_ctx ctx256; + run_benchmark("aes128_init", bench_AES128_init, NULL, NULL, &ctx128, 20, 50000); + run_benchmark("aes128_encrypt_byte", bench_AES128_encrypt, bench_AES128_encrypt_setup, NULL, &ctx128, 20, 4000000); + run_benchmark("aes128_decrypt_byte", bench_AES128_decrypt, bench_AES128_encrypt_setup, NULL, &ctx128, 20, 4000000); + run_benchmark("aes192_init", bench_AES192_init, NULL, NULL, &ctx192, 20, 50000); + run_benchmark("aes192_encrypt_byte", bench_AES192_encrypt, bench_AES192_encrypt_setup, NULL, &ctx192, 20, 4000000); + run_benchmark("aes192_decrypt_byte", bench_AES192_decrypt, bench_AES192_encrypt_setup, NULL, &ctx192, 20, 4000000); + run_benchmark("aes256_init", bench_AES256_init, NULL, NULL, &ctx256, 20, 50000); + run_benchmark("aes256_encrypt_byte", bench_AES256_encrypt, bench_AES256_encrypt_setup, NULL, &ctx256, 20, 4000000); + run_benchmark("aes256_decrypt_byte", bench_AES256_decrypt, bench_AES256_encrypt_setup, NULL, &ctx256, 20, 4000000); + return 0; +} diff --git a/src/crypto/ctaes/ctaes.c b/src/crypto/ctaes/ctaes.c new file mode 100644 index 0000000000..55962bf252 --- /dev/null +++ b/src/crypto/ctaes/ctaes.c @@ -0,0 +1,556 @@ + /********************************************************************* + * Copyright (c) 2016 Pieter Wuille * + * Distributed under the MIT software license, see the accompanying * + * file COPYING or http://www.opensource.org/licenses/mit-license.php.* + **********************************************************************/ + +/* Constant time, unoptimized, concise, plain C, AES implementation + * Based On: + * Emilia Kasper and Peter Schwabe, Faster and Timing-Attack Resistant AES-GCM + * http://www.iacr.org/archive/ches2009/57470001/57470001.pdf + * But using 8 16-bit integers representing a single AES state rather than 8 128-bit + * integers representing 8 AES states.
+ */ + +#include "ctaes.h" + +/* Slice variable slice_i contains the i'th bit of the 16 state variables in this order: + * 0 1 2 3 + * 4 5 6 7 + * 8 9 10 11 + * 12 13 14 15 + */ + +/** Convert a byte to sliced form, storing it corresponding to given row and column in s */ +static void LoadByte(AES_state* s, unsigned char byte, int r, int c) { + int i; + for (i = 0; i < 8; i++) { + s->slice[i] |= (byte & 1) << (r * 4 + c); + byte >>= 1; + } +} + +/** Load 16 bytes of data into 8 sliced integers */ +static void LoadBytes(AES_state *s, const unsigned char* data16) { + int c; + for (c = 0; c < 4; c++) { + int r; + for (r = 0; r < 4; r++) { + LoadByte(s, *(data16++), r, c); + } + } +} + +/** Convert 8 sliced integers into 16 bytes of data */ +static void SaveBytes(unsigned char* data16, const AES_state *s) { + int c; + for (c = 0; c < 4; c++) { + int r; + for (r = 0; r < 4; r++) { + int b; + uint8_t v = 0; + for (b = 0; b < 8; b++) { + v |= ((s->slice[b] >> (r * 4 + c)) & 1) << b; + } + *(data16++) = v; + } + } +} + +/* S-box implementation based on the gate logic from: + * Joan Boyar and Rene Peralta, A depth-16 circuit for the AES S-box. + * https://eprint.iacr.org/2011/332.pdf +*/ +static void SubBytes(AES_state *s, int inv) { + /* Load the bit slices */ + uint16_t U0 = s->slice[7], U1 = s->slice[6], U2 = s->slice[5], U3 = s->slice[4]; + uint16_t U4 = s->slice[3], U5 = s->slice[2], U6 = s->slice[1], U7 = s->slice[0]; + + uint16_t T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16; + uint16_t T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, D; + uint16_t M1, M6, M11, M13, M15, M20, M21, M22, M23, M25, M37, M38, M39, M40; + uint16_t M41, M42, M43, M44, M45, M46, M47, M48, M49, M50, M51, M52, M53, M54; + uint16_t M55, M56, M57, M58, M59, M60, M61, M62, M63; + + if (inv) { + uint16_t R5, R13, R17, R18, R19; + /* Undo linear postprocessing */ + T23 = U0 ^ U3; + T22 = ~(U1 ^ U3); + T2 = ~(U0 ^ U1); + T1 = U3 ^ U4; + T24 = ~(U4 ^ U7); + R5 = U6 ^ U7; + T8 = ~(U1 ^ T23); + T19 = T22 ^ R5; + T9 = ~(U7 ^ T1); + T10 = T2 ^ T24; + T13 = T2 ^ R5; + T3 = T1 ^ R5; + T25 = ~(U2 ^ T1); + R13 = U1 ^ U6; + T17 = ~(U2 ^ T19); + T20 = T24 ^ R13; + T4 = U4 ^ T8; + R17 = ~(U2 ^ U5); + R18 = ~(U5 ^ U6); + R19 = ~(U2 ^ U4); + D = U0 ^ R17; + T6 = T22 ^ R17; + T16 = R13 ^ R19; + T27 = T1 ^ R18; + T15 = T10 ^ T27; + T14 = T10 ^ R18; + T26 = T3 ^ T16; + } else { + /* Linear preprocessing. 
*/ + T1 = U0 ^ U3; + T2 = U0 ^ U5; + T3 = U0 ^ U6; + T4 = U3 ^ U5; + T5 = U4 ^ U6; + T6 = T1 ^ T5; + T7 = U1 ^ U2; + T8 = U7 ^ T6; + T9 = U7 ^ T7; + T10 = T6 ^ T7; + T11 = U1 ^ U5; + T12 = U2 ^ U5; + T13 = T3 ^ T4; + T14 = T6 ^ T11; + T15 = T5 ^ T11; + T16 = T5 ^ T12; + T17 = T9 ^ T16; + T18 = U3 ^ U7; + T19 = T7 ^ T18; + T20 = T1 ^ T19; + T21 = U6 ^ U7; + T22 = T7 ^ T21; + T23 = T2 ^ T22; + T24 = T2 ^ T10; + T25 = T20 ^ T17; + T26 = T3 ^ T16; + T27 = T1 ^ T12; + D = U7; + } + + /* Non-linear transformation (shared between the forward and backward case) */ + M1 = T13 & T6; + M6 = T3 & T16; + M11 = T1 & T15; + M13 = (T4 & T27) ^ M11; + M15 = (T2 & T10) ^ M11; + M20 = T14 ^ M1 ^ (T23 & T8) ^ M13; + M21 = (T19 & D) ^ M1 ^ T24 ^ M15; + M22 = T26 ^ M6 ^ (T22 & T9) ^ M13; + M23 = (T20 & T17) ^ M6 ^ M15 ^ T25; + M25 = M22 & M20; + M37 = M21 ^ ((M20 ^ M21) & (M23 ^ M25)); + M38 = M20 ^ M25 ^ (M21 | (M20 & M23)); + M39 = M23 ^ ((M22 ^ M23) & (M21 ^ M25)); + M40 = M22 ^ M25 ^ (M23 | (M21 & M22)); + M41 = M38 ^ M40; + M42 = M37 ^ M39; + M43 = M37 ^ M38; + M44 = M39 ^ M40; + M45 = M42 ^ M41; + M46 = M44 & T6; + M47 = M40 & T8; + M48 = M39 & D; + M49 = M43 & T16; + M50 = M38 & T9; + M51 = M37 & T17; + M52 = M42 & T15; + M53 = M45 & T27; + M54 = M41 & T10; + M55 = M44 & T13; + M56 = M40 & T23; + M57 = M39 & T19; + M58 = M43 & T3; + M59 = M38 & T22; + M60 = M37 & T20; + M61 = M42 & T1; + M62 = M45 & T4; + M63 = M41 & T2; + + if (inv){ + /* Undo linear preprocessing */ + uint16_t P0 = M52 ^ M61; + uint16_t P1 = M58 ^ M59; + uint16_t P2 = M54 ^ M62; + uint16_t P3 = M47 ^ M50; + uint16_t P4 = M48 ^ M56; + uint16_t P5 = M46 ^ M51; + uint16_t P6 = M49 ^ M60; + uint16_t P7 = P0 ^ P1; + uint16_t P8 = M50 ^ M53; + uint16_t P9 = M55 ^ M63; + uint16_t P10 = M57 ^ P4; + uint16_t P11 = P0 ^ P3; + uint16_t P12 = M46 ^ M48; + uint16_t P13 = M49 ^ M51; + uint16_t P14 = M49 ^ M62; + uint16_t P15 = M54 ^ M59; + uint16_t P16 = M57 ^ M61; + uint16_t P17 = M58 ^ P2; + uint16_t P18 = M63 ^ P5; + uint16_t P19 = P2 ^ P3; + uint16_t P20 = P4 ^ P6; + uint16_t P22 = P2 ^ P7; + uint16_t P23 = P7 ^ P8; + uint16_t P24 = P5 ^ P7; + uint16_t P25 = P6 ^ P10; + uint16_t P26 = P9 ^ P11; + uint16_t P27 = P10 ^ P18; + uint16_t P28 = P11 ^ P25; + uint16_t P29 = P15 ^ P20; + s->slice[7] = P13 ^ P22; + s->slice[6] = P26 ^ P29; + s->slice[5] = P17 ^ P28; + s->slice[4] = P12 ^ P22; + s->slice[3] = P23 ^ P27; + s->slice[2] = P19 ^ P24; + s->slice[1] = P14 ^ P23; + s->slice[0] = P9 ^ P16; + } else { + /* Linear postprocessing */ + uint16_t L0 = M61 ^ M62; + uint16_t L1 = M50 ^ M56; + uint16_t L2 = M46 ^ M48; + uint16_t L3 = M47 ^ M55; + uint16_t L4 = M54 ^ M58; + uint16_t L5 = M49 ^ M61; + uint16_t L6 = M62 ^ L5; + uint16_t L7 = M46 ^ L3; + uint16_t L8 = M51 ^ M59; + uint16_t L9 = M52 ^ M53; + uint16_t L10 = M53 ^ L4; + uint16_t L11 = M60 ^ L2; + uint16_t L12 = M48 ^ M51; + uint16_t L13 = M50 ^ L0; + uint16_t L14 = M52 ^ M61; + uint16_t L15 = M55 ^ L1; + uint16_t L16 = M56 ^ L0; + uint16_t L17 = M57 ^ L1; + uint16_t L18 = M58 ^ L8; + uint16_t L19 = M63 ^ L4; + uint16_t L20 = L0 ^ L1; + uint16_t L21 = L1 ^ L7; + uint16_t L22 = L3 ^ L12; + uint16_t L23 = L18 ^ L2; + uint16_t L24 = L15 ^ L9; + uint16_t L25 = L6 ^ L10; + uint16_t L26 = L7 ^ L9; + uint16_t L27 = L8 ^ L10; + uint16_t L28 = L11 ^ L14; + uint16_t L29 = L11 ^ L17; + s->slice[7] = L6 ^ L24; + s->slice[6] = ~(L16 ^ L26); + s->slice[5] = ~(L19 ^ L28); + s->slice[4] = L6 ^ L21; + s->slice[3] = L20 ^ L22; + s->slice[2] = L25 ^ L29; + s->slice[1] = ~(L13 ^ L27); + s->slice[0] = ~(L6 ^ L23); + } 
+} + +#define BIT_RANGE(from,to) (((1 << ((to) - (from))) - 1) << (from)) + +#define BIT_RANGE_LEFT(x,from,to,shift) (((x) & BIT_RANGE((from), (to))) << (shift)) +#define BIT_RANGE_RIGHT(x,from,to,shift) (((x) & BIT_RANGE((from), (to))) >> (shift)) + +static void ShiftRows(AES_state* s) { + int i; + for (i = 0; i < 8; i++) { + uint16_t v = s->slice[i]; + s->slice[i] = + (v & BIT_RANGE(0, 4)) | + BIT_RANGE_LEFT(v, 4, 5, 3) | BIT_RANGE_RIGHT(v, 5, 8, 1) | + BIT_RANGE_LEFT(v, 8, 10, 2) | BIT_RANGE_RIGHT(v, 10, 12, 2) | + BIT_RANGE_LEFT(v, 12, 15, 1) | BIT_RANGE_RIGHT(v, 15, 16, 3); + } +} + +static void InvShiftRows(AES_state* s) { + int i; + for (i = 0; i < 8; i++) { + uint16_t v = s->slice[i]; + s->slice[i] = + (v & BIT_RANGE(0, 4)) | + BIT_RANGE_LEFT(v, 4, 7, 1) | BIT_RANGE_RIGHT(v, 7, 8, 3) | + BIT_RANGE_LEFT(v, 8, 10, 2) | BIT_RANGE_RIGHT(v, 10, 12, 2) | + BIT_RANGE_LEFT(v, 12, 13, 3) | BIT_RANGE_RIGHT(v, 13, 16, 1); + } +} + +#define ROT(x,b) (((x) >> ((b) * 4)) | ((x) << ((4-(b)) * 4))) + +static void MixColumns(AES_state* s, int inv) { + /* The MixColumns transform treats the bytes of the columns of the state as + * coefficients of a 3rd degree polynomial over GF(2^8) and multiplies them + * by the fixed polynomial a(x) = {03}x^3 + {01}x^2 + {01}x + {02}, modulo + * x^4 + {01}. + * + * In the inverse transform, we multiply by the inverse of a(x), + * a^-1(x) = {0b}x^3 + {0d}x^2 + {09}x + {0e}. This is equal to + * a(x) * ({04}x^2 + {05}), so we can reuse the forward transform's code + * (found in OpenSSL's bsaes-x86_64.pl, attributed to Jussi Kivilinna) + * + * In the bitsliced representation, a multiplication of every column by x + * mod x^4 + 1 is simply a right rotation. + */ + + /* Shared for both directions is a multiplication by a(x), which can be + * rewritten as (x^3 + x^2 + x) + {02}*(x^3 + {01}). + * + * First compute s into the s? variables, (x^3 + {01}) * s into the s?_01 + * variables and (x^3 + x^2 + x)*s into the s?_123 variables. + */ + uint16_t s0 = s->slice[0], s1 = s->slice[1], s2 = s->slice[2], s3 = s->slice[3]; + uint16_t s4 = s->slice[4], s5 = s->slice[5], s6 = s->slice[6], s7 = s->slice[7]; + uint16_t s0_01 = s0 ^ ROT(s0, 1), s0_123 = ROT(s0_01, 1) ^ ROT(s0, 3); + uint16_t s1_01 = s1 ^ ROT(s1, 1), s1_123 = ROT(s1_01, 1) ^ ROT(s1, 3); + uint16_t s2_01 = s2 ^ ROT(s2, 1), s2_123 = ROT(s2_01, 1) ^ ROT(s2, 3); + uint16_t s3_01 = s3 ^ ROT(s3, 1), s3_123 = ROT(s3_01, 1) ^ ROT(s3, 3); + uint16_t s4_01 = s4 ^ ROT(s4, 1), s4_123 = ROT(s4_01, 1) ^ ROT(s4, 3); + uint16_t s5_01 = s5 ^ ROT(s5, 1), s5_123 = ROT(s5_01, 1) ^ ROT(s5, 3); + uint16_t s6_01 = s6 ^ ROT(s6, 1), s6_123 = ROT(s6_01, 1) ^ ROT(s6, 3); + uint16_t s7_01 = s7 ^ ROT(s7, 1), s7_123 = ROT(s7_01, 1) ^ ROT(s7, 3); + /* Now compute s = s?_123 + {02} * s?_01. */ + s->slice[0] = s7_01 ^ s0_123; + s->slice[1] = s7_01 ^ s0_01 ^ s1_123; + s->slice[2] = s1_01 ^ s2_123; + s->slice[3] = s7_01 ^ s2_01 ^ s3_123; + s->slice[4] = s7_01 ^ s3_01 ^ s4_123; + s->slice[5] = s4_01 ^ s5_123; + s->slice[6] = s5_01 ^ s6_123; + s->slice[7] = s6_01 ^ s7_123; + if (inv) { + /* In the reverse direction, we further need to multiply by + * {04}x^2 + {05}, which can be written as {04} * (x^2 + {01}) + {01}. 
+ * + * First compute (x^2 + {01}) * s into the t?_02 variables: */ + uint16_t t0_02 = s->slice[0] ^ ROT(s->slice[0], 2); + uint16_t t1_02 = s->slice[1] ^ ROT(s->slice[1], 2); + uint16_t t2_02 = s->slice[2] ^ ROT(s->slice[2], 2); + uint16_t t3_02 = s->slice[3] ^ ROT(s->slice[3], 2); + uint16_t t4_02 = s->slice[4] ^ ROT(s->slice[4], 2); + uint16_t t5_02 = s->slice[5] ^ ROT(s->slice[5], 2); + uint16_t t6_02 = s->slice[6] ^ ROT(s->slice[6], 2); + uint16_t t7_02 = s->slice[7] ^ ROT(s->slice[7], 2); + /* And then update s += {04} * t?_02 */ + s->slice[0] ^= t6_02; + s->slice[1] ^= t6_02 ^ t7_02; + s->slice[2] ^= t0_02 ^ t7_02; + s->slice[3] ^= t1_02 ^ t6_02; + s->slice[4] ^= t2_02 ^ t6_02 ^ t7_02; + s->slice[5] ^= t3_02 ^ t7_02; + s->slice[6] ^= t4_02; + s->slice[7] ^= t5_02; + } +} + +static void AddRoundKey(AES_state* s, const AES_state* round) { + int b; + for (b = 0; b < 8; b++) { + s->slice[b] ^= round->slice[b]; + } +} + +/** column_0(s) = column_c(a) */ +static void GetOneColumn(AES_state* s, const AES_state* a, int c) { + int b; + for (b = 0; b < 8; b++) { + s->slice[b] = (a->slice[b] >> c) & 0x1111; + } +} + +/** column_c1(r) |= (column_0(s) ^= column_c2(a)) */ +static void KeySetupColumnMix(AES_state* s, AES_state* r, const AES_state* a, int c1, int c2) { + int b; + for (b = 0; b < 8; b++) { + r->slice[b] |= ((s->slice[b] ^= ((a->slice[b] >> c2) & 0x1111)) & 0x1111) << c1; + } +} + +/** Rotate the rows in s one position upwards, and xor in r */ +static void KeySetupTransform(AES_state* s, const AES_state* r) { + int b; + for (b = 0; b < 8; b++) { + s->slice[b] = ((s->slice[b] >> 4) | (s->slice[b] << 12)) ^ r->slice[b]; + } +} + +/* Multiply the cells in s by x, as polynomials over GF(2) mod x^8 + x^4 + x^3 + x + 1 */ +static void MultX(AES_state* s) { + uint16_t top = s->slice[7]; + s->slice[7] = s->slice[6]; + s->slice[6] = s->slice[5]; + s->slice[5] = s->slice[4]; + s->slice[4] = s->slice[3] ^ top; + s->slice[3] = s->slice[2] ^ top; + s->slice[2] = s->slice[1]; + s->slice[1] = s->slice[0] ^ top; + s->slice[0] = top; +} + +/** Expand the cipher key into the key schedule. + * + * state must be a pointer to an array of size nrounds + 1. + * key must be a pointer to 4 * nkeywords bytes. + * + * AES128 uses nkeywords = 4, nrounds = 10 + * AES192 uses nkeywords = 6, nrounds = 12 + * AES256 uses nkeywords = 8, nrounds = 14 + */ +static void AES_setup(AES_state* rounds, const uint8_t* key, int nkeywords, int nrounds) +{ + int i; + + /* The one-byte round constant */ + AES_state rcon = {{1,0,0,0,0,0,0,0}}; + /* The number of the word being generated, modulo nkeywords */ + int pos = 0; + /* The column representing the word currently being processed */ + AES_state column; + + for (i = 0; i < nrounds + 1; i++) { + int b; + for (b = 0; b < 8; b++) { + rounds[i].slice[b] = 0; + } + } + + /* The first nkeywords round columns are just taken from the key directly. 
*/ + for (i = 0; i < nkeywords; i++) { + int r; + for (r = 0; r < 4; r++) { + LoadByte(&rounds[i >> 2], *(key++), r, i & 3); + } + } + + GetOneColumn(&column, &rounds[(nkeywords - 1) >> 2], (nkeywords - 1) & 3); + + for (i = nkeywords; i < 4 * (nrounds + 1); i++) { + /* Transform column */ + if (pos == 0) { + SubBytes(&column, 0); + KeySetupTransform(&column, &rcon); + MultX(&rcon); + } else if (nkeywords > 6 && pos == 4) { + SubBytes(&column, 0); + } + if (++pos == nkeywords) pos = 0; + KeySetupColumnMix(&column, &rounds[i >> 2], &rounds[(i - nkeywords) >> 2], i & 3, (i - nkeywords) & 3); + } +} + +static void AES_encrypt(const AES_state* rounds, int nrounds, unsigned char* cipher16, const unsigned char* plain16) { + AES_state s = {{0}}; + int round; + + LoadBytes(&s, plain16); + AddRoundKey(&s, rounds++); + + for (round = 1; round < nrounds; round++) { + SubBytes(&s, 0); + ShiftRows(&s); + MixColumns(&s, 0); + AddRoundKey(&s, rounds++); + } + + SubBytes(&s, 0); + ShiftRows(&s); + AddRoundKey(&s, rounds); + + SaveBytes(cipher16, &s); +} + +static void AES_decrypt(const AES_state* rounds, int nrounds, unsigned char* plain16, const unsigned char* cipher16) { + /* Most AES decryption implementations use the alternate scheme + * (the Equivalent Inverse Cipher), which allows for more code reuse between + * the encryption and decryption code, but requires separate setup for both. + */ + AES_state s = {{0}}; + int round; + + rounds += nrounds; + + LoadBytes(&s, cipher16); + AddRoundKey(&s, rounds--); + + for (round = 1; round < nrounds; round++) { + InvShiftRows(&s); + SubBytes(&s, 1); + AddRoundKey(&s, rounds--); + MixColumns(&s, 1); + } + + InvShiftRows(&s); + SubBytes(&s, 1); + AddRoundKey(&s, rounds); + + SaveBytes(plain16, &s); +} + +void AES128_init(AES128_ctx* ctx, const unsigned char* key16) { + AES_setup(ctx->rk, key16, 4, 10); +} + +void AES128_encrypt(const AES128_ctx* ctx, size_t blocks, unsigned char* cipher16, const unsigned char* plain16) { + while (blocks--) { + AES_encrypt(ctx->rk, 10, cipher16, plain16); + cipher16 += 16; + plain16 += 16; + } +} + +void AES128_decrypt(const AES128_ctx* ctx, size_t blocks, unsigned char* plain16, const unsigned char* cipher16) { + while (blocks--) { + AES_decrypt(ctx->rk, 10, plain16, cipher16); + cipher16 += 16; + plain16 += 16; + } +} + +void AES192_init(AES192_ctx* ctx, const unsigned char* key24) { + AES_setup(ctx->rk, key24, 6, 12); +} + +void AES192_encrypt(const AES192_ctx* ctx, size_t blocks, unsigned char* cipher16, const unsigned char* plain16) { + while (blocks--) { + AES_encrypt(ctx->rk, 12, cipher16, plain16); + cipher16 += 16; + plain16 += 16; + } + +} + +void AES192_decrypt(const AES192_ctx* ctx, size_t blocks, unsigned char* plain16, const unsigned char* cipher16) { + while (blocks--) { + AES_decrypt(ctx->rk, 12, plain16, cipher16); + cipher16 += 16; + plain16 += 16; + } +} + +void AES256_init(AES256_ctx* ctx, const unsigned char* key32) { + AES_setup(ctx->rk, key32, 8, 14); +} + +void AES256_encrypt(const AES256_ctx* ctx, size_t blocks, unsigned char* cipher16, const unsigned char* plain16) { + while (blocks--) { + AES_encrypt(ctx->rk, 14, cipher16, plain16); + cipher16 += 16; + plain16 += 16; + } +} + +void AES256_decrypt(const AES256_ctx* ctx, size_t blocks, unsigned char* plain16, const unsigned char* cipher16) { + while (blocks--) { + AES_decrypt(ctx->rk, 14, plain16, cipher16); + cipher16 += 16; + plain16 += 16; + } +} diff --git a/src/crypto/ctaes/ctaes.h b/src/crypto/ctaes/ctaes.h new file mode 100644 index 
0000000000..2f0af04216 --- /dev/null +++ b/src/crypto/ctaes/ctaes.h @@ -0,0 +1,41 @@ + /********************************************************************* + * Copyright (c) 2016 Pieter Wuille * + * Distributed under the MIT software license, see the accompanying * + * file COPYING or http://www.opensource.org/licenses/mit-license.php.* + **********************************************************************/ + +#ifndef _CTAES_H_ +#define _CTAES_H_ 1 + +#include <stdint.h> +#include <stdlib.h> + +typedef struct { + uint16_t slice[8]; +} AES_state; + +typedef struct { + AES_state rk[11]; +} AES128_ctx; + +typedef struct { + AES_state rk[13]; +} AES192_ctx; + +typedef struct { + AES_state rk[15]; +} AES256_ctx; + +void AES128_init(AES128_ctx* ctx, const unsigned char* key16); +void AES128_encrypt(const AES128_ctx* ctx, size_t blocks, unsigned char* cipher16, const unsigned char* plain16); +void AES128_decrypt(const AES128_ctx* ctx, size_t blocks, unsigned char* plain16, const unsigned char* cipher16); + +void AES192_init(AES192_ctx* ctx, const unsigned char* key24); +void AES192_encrypt(const AES192_ctx* ctx, size_t blocks, unsigned char* cipher16, const unsigned char* plain16); +void AES192_decrypt(const AES192_ctx* ctx, size_t blocks, unsigned char* plain16, const unsigned char* cipher16); + +void AES256_init(AES256_ctx* ctx, const unsigned char* key32); +void AES256_encrypt(const AES256_ctx* ctx, size_t blocks, unsigned char* cipher16, const unsigned char* plain16); +void AES256_decrypt(const AES256_ctx* ctx, size_t blocks, unsigned char* plain16, const unsigned char* cipher16); + +#endif diff --git a/src/crypto/ctaes/test.c b/src/crypto/ctaes/test.c new file mode 100644 index 0000000000..21439a16f1 --- /dev/null +++ b/src/crypto/ctaes/test.c @@ -0,0 +1,110 @@ + /********************************************************************* + * Copyright (c) 2016 Pieter Wuille * + * Distributed under the MIT software license, see the accompanying * + * file COPYING or http://www.opensource.org/licenses/mit-license.php.* + **********************************************************************/ + +#include "ctaes.h" + +#include <stdio.h> +#include <string.h> +#include <assert.h> + +typedef struct { + int keysize; + const char* key; + const char* plain; + const char* cipher; +} ctaes_test; + +static const ctaes_test ctaes_tests[] = { + /* AES test vectors from FIPS 197. */ + {128, "000102030405060708090a0b0c0d0e0f", "00112233445566778899aabbccddeeff", "69c4e0d86a7b0430d8cdb78070b4c55a"}, + {192, "000102030405060708090a0b0c0d0e0f1011121314151617", "00112233445566778899aabbccddeeff", "dda97ca4864cdfe06eaf70a0ec0d7191"}, + {256, "000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f", "00112233445566778899aabbccddeeff", "8ea2b7ca516745bfeafc49904b496089"}, + + /* AES-ECB test vectors from NIST sp800-38a. 
*/ + {128, "2b7e151628aed2a6abf7158809cf4f3c", "6bc1bee22e409f96e93d7e117393172a", "3ad77bb40d7a3660a89ecaf32466ef97"}, + {128, "2b7e151628aed2a6abf7158809cf4f3c", "ae2d8a571e03ac9c9eb76fac45af8e51", "f5d3d58503b9699de785895a96fdbaaf"}, + {128, "2b7e151628aed2a6abf7158809cf4f3c", "30c81c46a35ce411e5fbc1191a0a52ef", "43b1cd7f598ece23881b00e3ed030688"}, + {128, "2b7e151628aed2a6abf7158809cf4f3c", "f69f2445df4f9b17ad2b417be66c3710", "7b0c785e27e8ad3f8223207104725dd4"}, + {192, "8e73b0f7da0e6452c810f32b809079e562f8ead2522c6b7b", "6bc1bee22e409f96e93d7e117393172a", "bd334f1d6e45f25ff712a214571fa5cc"}, + {192, "8e73b0f7da0e6452c810f32b809079e562f8ead2522c6b7b", "ae2d8a571e03ac9c9eb76fac45af8e51", "974104846d0ad3ad7734ecb3ecee4eef"}, + {192, "8e73b0f7da0e6452c810f32b809079e562f8ead2522c6b7b", "30c81c46a35ce411e5fbc1191a0a52ef", "ef7afd2270e2e60adce0ba2face6444e"}, + {192, "8e73b0f7da0e6452c810f32b809079e562f8ead2522c6b7b", "f69f2445df4f9b17ad2b417be66c3710", "9a4b41ba738d6c72fb16691603c18e0e"}, + {256, "603deb1015ca71be2b73aef0857d77811f352c073b6108d72d9810a30914dff4", "6bc1bee22e409f96e93d7e117393172a", "f3eed1bdb5d2a03c064b5a7e3db181f8"}, + {256, "603deb1015ca71be2b73aef0857d77811f352c073b6108d72d9810a30914dff4", "ae2d8a571e03ac9c9eb76fac45af8e51", "591ccb10d410ed26dc5ba74a31362870"}, + {256, "603deb1015ca71be2b73aef0857d77811f352c073b6108d72d9810a30914dff4", "30c81c46a35ce411e5fbc1191a0a52ef", "b6ed21b99ca6f4f9f153e7b1beafed1d"}, + {256, "603deb1015ca71be2b73aef0857d77811f352c073b6108d72d9810a30914dff4", "f69f2445df4f9b17ad2b417be66c3710", "23304b7a39f9f3ff067d8d8f9e24ecc7"} +}; + +static void from_hex(unsigned char* data, int len, const char* hex) { + int p; + for (p = 0; p < len; p++) { + int v = 0; + int n; + for (n = 0; n < 2; n++) { + assert((*hex >= '0' && *hex <= '9') || (*hex >= 'a' && *hex <= 'f')); + if (*hex >= '0' && *hex <= '9') { + v |= (*hex - '0') << (4 * (1 - n)); + } else { + v |= (*hex - 'a' + 10) << (4 * (1 - n)); + } + hex++; + } + *(data++) = v; + } + assert(*hex == 0); +} + +int main(void) { + int i; + int fail = 0; + for (i = 0; i < sizeof(ctaes_tests) / sizeof(ctaes_tests[0]); i++) { + unsigned char key[32], plain[16], cipher[16], ciphered[16], deciphered[16]; + const ctaes_test* test = &ctaes_tests[i]; + assert(test->keysize == 128 || test->keysize == 192 || test->keysize == 256); + from_hex(plain, 16, test->plain); + from_hex(cipher, 16, test->cipher); + switch (test->keysize) { + case 128: { + AES128_ctx ctx; + from_hex(key, 16, test->key); + AES128_init(&ctx, key); + AES128_encrypt(&ctx, 1, ciphered, plain); + AES128_decrypt(&ctx, 1, deciphered, cipher); + break; + } + case 192: { + AES192_ctx ctx; + from_hex(key, 24, test->key); + AES192_init(&ctx, key); + AES192_encrypt(&ctx, 1, ciphered, plain); + AES192_decrypt(&ctx, 1, deciphered, cipher); + break; + } + case 256: { + AES256_ctx ctx; + from_hex(key, 32, test->key); + AES256_init(&ctx, key); + AES256_encrypt(&ctx, 1, ciphered, plain); + AES256_decrypt(&ctx, 1, deciphered, cipher); + break; + } + } + if (memcmp(cipher, ciphered, 16)) { + fprintf(stderr, "E(key=\"%s\", plain=\"%s\") != \"%s\"\n", test->key, test->plain, test->cipher); + fail++; + } + if (memcmp(plain, deciphered, 16)) { + fprintf(stderr, "D(key=\"%s\", cipher=\"%s\") != \"%s\"\n", test->key, test->cipher, test->plain); + fail++; + } + } + if (fail == 0) { + fprintf(stderr, "All tests successful\n"); + } else { + fprintf(stderr, "%i tests failed\n", fail); + } + return (fail != 0); +} diff --git a/src/crypto/hmac_sha256.cpp 
b/src/crypto/hmac_sha256.cpp new file mode 100644 index 0000000000..d4afe1439f --- /dev/null +++ b/src/crypto/hmac_sha256.cpp @@ -0,0 +1,34 @@ +// Copyright (c) 2014-2017 The Bitcoin Core developers +// Distributed under the MIT software license, see the accompanying +// file COPYING or http://www.opensource.org/licenses/mit-license.php. + +#include <crypto/hmac_sha256.h> + +#include <string.h> + +CHMAC_SHA256::CHMAC_SHA256(const unsigned char* key, size_t keylen) +{ + unsigned char rkey[64]; + if (keylen <= 64) { + memcpy(rkey, key, keylen); + memset(rkey + keylen, 0, 64 - keylen); + } else { + CSHA256().Write(key, keylen).Finalize(rkey); + memset(rkey + 32, 0, 32); + } + + for (int n = 0; n < 64; n++) + rkey[n] ^= 0x5c; + outer.Write(rkey, 64); + + for (int n = 0; n < 64; n++) + rkey[n] ^= 0x5c ^ 0x36; + inner.Write(rkey, 64); +} + +void CHMAC_SHA256::Finalize(unsigned char hash[OUTPUT_SIZE]) +{ + unsigned char temp[32]; + inner.Finalize(temp); + outer.Write(temp, 32).Finalize(hash); +} diff --git a/src/crypto/hmac_sha256.h b/src/crypto/hmac_sha256.h new file mode 100644 index 0000000000..4fb30b7ac0 --- /dev/null +++ b/src/crypto/hmac_sha256.h @@ -0,0 +1,32 @@ +// Copyright (c) 2014-2017 The Bitcoin Core developers +// Distributed under the MIT software license, see the accompanying +// file COPYING or http://www.opensource.org/licenses/mit-license.php. + +#ifndef BITCOIN_CRYPTO_HMAC_SHA256_H +#define BITCOIN_CRYPTO_HMAC_SHA256_H + +#include <crypto/sha256.h> + +#include <stdint.h> +#include <stdlib.h> + +/** A hasher class for HMAC-SHA-256. */ +class CHMAC_SHA256 +{ +private: + CSHA256 outer; + CSHA256 inner; + +public: + static const size_t OUTPUT_SIZE = 32; + + CHMAC_SHA256(const unsigned char* key, size_t keylen); + CHMAC_SHA256& Write(const unsigned char* data, size_t len) + { + inner.Write(data, len); + return *this; + } + void Finalize(unsigned char hash[OUTPUT_SIZE]); +}; + +#endif // BITCOIN_CRYPTO_HMAC_SHA256_H diff --git a/src/crypto/hmac_sha512.cpp b/src/crypto/hmac_sha512.cpp new file mode 100644 index 0000000000..d9c4d04100 --- /dev/null +++ b/src/crypto/hmac_sha512.cpp @@ -0,0 +1,34 @@ +// Copyright (c) 2014-2017 The Bitcoin Core developers +// Distributed under the MIT software license, see the accompanying +// file COPYING or http://www.opensource.org/licenses/mit-license.php. + +#include <crypto/hmac_sha512.h> + +#include <string.h> + +CHMAC_SHA512::CHMAC_SHA512(const unsigned char* key, size_t keylen) +{ + unsigned char rkey[128]; + if (keylen <= 128) { + memcpy(rkey, key, keylen); + memset(rkey + keylen, 0, 128 - keylen); + } else { + CSHA512().Write(key, keylen).Finalize(rkey); + memset(rkey + 64, 0, 64); + } + + for (int n = 0; n < 128; n++) + rkey[n] ^= 0x5c; + outer.Write(rkey, 128); + + for (int n = 0; n < 128; n++) + rkey[n] ^= 0x5c ^ 0x36; + inner.Write(rkey, 128); +} + +void CHMAC_SHA512::Finalize(unsigned char hash[OUTPUT_SIZE]) +{ + unsigned char temp[64]; + inner.Finalize(temp); + outer.Write(temp, 64).Finalize(hash); +} diff --git a/src/crypto/hmac_sha512.h b/src/crypto/hmac_sha512.h new file mode 100644 index 0000000000..ab84ee7652 --- /dev/null +++ b/src/crypto/hmac_sha512.h @@ -0,0 +1,32 @@ +// Copyright (c) 2014-2017 The Bitcoin Core developers +// Distributed under the MIT software license, see the accompanying +// file COPYING or http://www.opensource.org/licenses/mit-license.php. 
+ +#ifndef BITCOIN_CRYPTO_HMAC_SHA512_H +#define BITCOIN_CRYPTO_HMAC_SHA512_H + +#include <crypto/sha512.h> + +#include <stdint.h> +#include <stdlib.h> + +/** A hasher class for HMAC-SHA-512. */ +class CHMAC_SHA512 +{ +private: + CSHA512 outer; + CSHA512 inner; + +public: + static const size_t OUTPUT_SIZE = 64; + + CHMAC_SHA512(const unsigned char* key, size_t keylen); + CHMAC_SHA512& Write(const unsigned char* data, size_t len) + { + inner.Write(data, len); + return *this; + } + void Finalize(unsigned char hash[OUTPUT_SIZE]); +}; + +#endif // BITCOIN_CRYPTO_HMAC_SHA512_H diff --git a/src/crypto/ripemd160.cpp b/src/crypto/ripemd160.cpp new file mode 100644 index 0000000000..51468ec8d0 --- /dev/null +++ b/src/crypto/ripemd160.cpp @@ -0,0 +1,292 @@ +// Copyright (c) 2014-2017 The Bitcoin Core developers +// Distributed under the MIT software license, see the accompanying +// file COPYING or http://www.opensource.org/licenses/mit-license.php. + +#include <crypto/ripemd160.h> + +#include <crypto/common.h> + +#include <string.h> + +// Internal implementation code. +namespace +{ +/// Internal RIPEMD-160 implementation. +namespace ripemd160 +{ +uint32_t inline f1(uint32_t x, uint32_t y, uint32_t z) { return x ^ y ^ z; } +uint32_t inline f2(uint32_t x, uint32_t y, uint32_t z) { return (x & y) | (~x & z); } +uint32_t inline f3(uint32_t x, uint32_t y, uint32_t z) { return (x | ~y) ^ z; } +uint32_t inline f4(uint32_t x, uint32_t y, uint32_t z) { return (x & z) | (y & ~z); } +uint32_t inline f5(uint32_t x, uint32_t y, uint32_t z) { return x ^ (y | ~z); } + +/** Initialize RIPEMD-160 state. */ +void inline Initialize(uint32_t* s) +{ + s[0] = 0x67452301ul; + s[1] = 0xEFCDAB89ul; + s[2] = 0x98BADCFEul; + s[3] = 0x10325476ul; + s[4] = 0xC3D2E1F0ul; +} + +uint32_t inline rol(uint32_t x, int i) { return (x << i) | (x >> (32 - i)); } + +void inline Round(uint32_t& a, uint32_t b, uint32_t& c, uint32_t d, uint32_t e, uint32_t f, uint32_t x, uint32_t k, int r) +{ + a = rol(a + f + x + k, r) + e; + c = rol(c, 10); +} + +void inline R11(uint32_t& a, uint32_t b, uint32_t& c, uint32_t d, uint32_t e, uint32_t x, int r) { Round(a, b, c, d, e, f1(b, c, d), x, 0, r); } +void inline R21(uint32_t& a, uint32_t b, uint32_t& c, uint32_t d, uint32_t e, uint32_t x, int r) { Round(a, b, c, d, e, f2(b, c, d), x, 0x5A827999ul, r); } +void inline R31(uint32_t& a, uint32_t b, uint32_t& c, uint32_t d, uint32_t e, uint32_t x, int r) { Round(a, b, c, d, e, f3(b, c, d), x, 0x6ED9EBA1ul, r); } +void inline R41(uint32_t& a, uint32_t b, uint32_t& c, uint32_t d, uint32_t e, uint32_t x, int r) { Round(a, b, c, d, e, f4(b, c, d), x, 0x8F1BBCDCul, r); } +void inline R51(uint32_t& a, uint32_t b, uint32_t& c, uint32_t d, uint32_t e, uint32_t x, int r) { Round(a, b, c, d, e, f5(b, c, d), x, 0xA953FD4Eul, r); } + +void inline R12(uint32_t& a, uint32_t b, uint32_t& c, uint32_t d, uint32_t e, uint32_t x, int r) { Round(a, b, c, d, e, f5(b, c, d), x, 0x50A28BE6ul, r); } +void inline R22(uint32_t& a, uint32_t b, uint32_t& c, uint32_t d, uint32_t e, uint32_t x, int r) { Round(a, b, c, d, e, f4(b, c, d), x, 0x5C4DD124ul, r); } +void inline R32(uint32_t& a, uint32_t b, uint32_t& c, uint32_t d, uint32_t e, uint32_t x, int r) { Round(a, b, c, d, e, f3(b, c, d), x, 0x6D703EF3ul, r); } +void inline R42(uint32_t& a, uint32_t b, uint32_t& c, uint32_t d, uint32_t e, uint32_t x, int r) { Round(a, b, c, d, e, f2(b, c, d), x, 0x7A6D76E9ul, r); } +void inline R52(uint32_t& a, uint32_t b, uint32_t& c, uint32_t d, uint32_t e, uint32_t x, int r) { Round(a, b, c, d, 
e, f1(b, c, d), x, 0, r); } + +/** Perform a RIPEMD-160 transformation, processing a 64-byte chunk. */ +void Transform(uint32_t* s, const unsigned char* chunk) +{ + uint32_t a1 = s[0], b1 = s[1], c1 = s[2], d1 = s[3], e1 = s[4]; + uint32_t a2 = a1, b2 = b1, c2 = c1, d2 = d1, e2 = e1; + uint32_t w0 = ReadLE32(chunk + 0), w1 = ReadLE32(chunk + 4), w2 = ReadLE32(chunk + 8), w3 = ReadLE32(chunk + 12); + uint32_t w4 = ReadLE32(chunk + 16), w5 = ReadLE32(chunk + 20), w6 = ReadLE32(chunk + 24), w7 = ReadLE32(chunk + 28); + uint32_t w8 = ReadLE32(chunk + 32), w9 = ReadLE32(chunk + 36), w10 = ReadLE32(chunk + 40), w11 = ReadLE32(chunk + 44); + uint32_t w12 = ReadLE32(chunk + 48), w13 = ReadLE32(chunk + 52), w14 = ReadLE32(chunk + 56), w15 = ReadLE32(chunk + 60); + + R11(a1, b1, c1, d1, e1, w0, 11); + R12(a2, b2, c2, d2, e2, w5, 8); + R11(e1, a1, b1, c1, d1, w1, 14); + R12(e2, a2, b2, c2, d2, w14, 9); + R11(d1, e1, a1, b1, c1, w2, 15); + R12(d2, e2, a2, b2, c2, w7, 9); + R11(c1, d1, e1, a1, b1, w3, 12); + R12(c2, d2, e2, a2, b2, w0, 11); + R11(b1, c1, d1, e1, a1, w4, 5); + R12(b2, c2, d2, e2, a2, w9, 13); + R11(a1, b1, c1, d1, e1, w5, 8); + R12(a2, b2, c2, d2, e2, w2, 15); + R11(e1, a1, b1, c1, d1, w6, 7); + R12(e2, a2, b2, c2, d2, w11, 15); + R11(d1, e1, a1, b1, c1, w7, 9); + R12(d2, e2, a2, b2, c2, w4, 5); + R11(c1, d1, e1, a1, b1, w8, 11); + R12(c2, d2, e2, a2, b2, w13, 7); + R11(b1, c1, d1, e1, a1, w9, 13); + R12(b2, c2, d2, e2, a2, w6, 7); + R11(a1, b1, c1, d1, e1, w10, 14); + R12(a2, b2, c2, d2, e2, w15, 8); + R11(e1, a1, b1, c1, d1, w11, 15); + R12(e2, a2, b2, c2, d2, w8, 11); + R11(d1, e1, a1, b1, c1, w12, 6); + R12(d2, e2, a2, b2, c2, w1, 14); + R11(c1, d1, e1, a1, b1, w13, 7); + R12(c2, d2, e2, a2, b2, w10, 14); + R11(b1, c1, d1, e1, a1, w14, 9); + R12(b2, c2, d2, e2, a2, w3, 12); + R11(a1, b1, c1, d1, e1, w15, 8); + R12(a2, b2, c2, d2, e2, w12, 6); + + R21(e1, a1, b1, c1, d1, w7, 7); + R22(e2, a2, b2, c2, d2, w6, 9); + R21(d1, e1, a1, b1, c1, w4, 6); + R22(d2, e2, a2, b2, c2, w11, 13); + R21(c1, d1, e1, a1, b1, w13, 8); + R22(c2, d2, e2, a2, b2, w3, 15); + R21(b1, c1, d1, e1, a1, w1, 13); + R22(b2, c2, d2, e2, a2, w7, 7); + R21(a1, b1, c1, d1, e1, w10, 11); + R22(a2, b2, c2, d2, e2, w0, 12); + R21(e1, a1, b1, c1, d1, w6, 9); + R22(e2, a2, b2, c2, d2, w13, 8); + R21(d1, e1, a1, b1, c1, w15, 7); + R22(d2, e2, a2, b2, c2, w5, 9); + R21(c1, d1, e1, a1, b1, w3, 15); + R22(c2, d2, e2, a2, b2, w10, 11); + R21(b1, c1, d1, e1, a1, w12, 7); + R22(b2, c2, d2, e2, a2, w14, 7); + R21(a1, b1, c1, d1, e1, w0, 12); + R22(a2, b2, c2, d2, e2, w15, 7); + R21(e1, a1, b1, c1, d1, w9, 15); + R22(e2, a2, b2, c2, d2, w8, 12); + R21(d1, e1, a1, b1, c1, w5, 9); + R22(d2, e2, a2, b2, c2, w12, 7); + R21(c1, d1, e1, a1, b1, w2, 11); + R22(c2, d2, e2, a2, b2, w4, 6); + R21(b1, c1, d1, e1, a1, w14, 7); + R22(b2, c2, d2, e2, a2, w9, 15); + R21(a1, b1, c1, d1, e1, w11, 13); + R22(a2, b2, c2, d2, e2, w1, 13); + R21(e1, a1, b1, c1, d1, w8, 12); + R22(e2, a2, b2, c2, d2, w2, 11); + + R31(d1, e1, a1, b1, c1, w3, 11); + R32(d2, e2, a2, b2, c2, w15, 9); + R31(c1, d1, e1, a1, b1, w10, 13); + R32(c2, d2, e2, a2, b2, w5, 7); + R31(b1, c1, d1, e1, a1, w14, 6); + R32(b2, c2, d2, e2, a2, w1, 15); + R31(a1, b1, c1, d1, e1, w4, 7); + R32(a2, b2, c2, d2, e2, w3, 11); + R31(e1, a1, b1, c1, d1, w9, 14); + R32(e2, a2, b2, c2, d2, w7, 8); + R31(d1, e1, a1, b1, c1, w15, 9); + R32(d2, e2, a2, b2, c2, w14, 6); + R31(c1, d1, e1, a1, b1, w8, 13); + R32(c2, d2, e2, a2, b2, w6, 6); + R31(b1, c1, d1, e1, a1, w1, 15); + R32(b2, c2, d2, e2, a2, w9, 
14); + R31(a1, b1, c1, d1, e1, w2, 14); + R32(a2, b2, c2, d2, e2, w11, 12); + R31(e1, a1, b1, c1, d1, w7, 8); + R32(e2, a2, b2, c2, d2, w8, 13); + R31(d1, e1, a1, b1, c1, w0, 13); + R32(d2, e2, a2, b2, c2, w12, 5); + R31(c1, d1, e1, a1, b1, w6, 6); + R32(c2, d2, e2, a2, b2, w2, 14); + R31(b1, c1, d1, e1, a1, w13, 5); + R32(b2, c2, d2, e2, a2, w10, 13); + R31(a1, b1, c1, d1, e1, w11, 12); + R32(a2, b2, c2, d2, e2, w0, 13); + R31(e1, a1, b1, c1, d1, w5, 7); + R32(e2, a2, b2, c2, d2, w4, 7); + R31(d1, e1, a1, b1, c1, w12, 5); + R32(d2, e2, a2, b2, c2, w13, 5); + + R41(c1, d1, e1, a1, b1, w1, 11); + R42(c2, d2, e2, a2, b2, w8, 15); + R41(b1, c1, d1, e1, a1, w9, 12); + R42(b2, c2, d2, e2, a2, w6, 5); + R41(a1, b1, c1, d1, e1, w11, 14); + R42(a2, b2, c2, d2, e2, w4, 8); + R41(e1, a1, b1, c1, d1, w10, 15); + R42(e2, a2, b2, c2, d2, w1, 11); + R41(d1, e1, a1, b1, c1, w0, 14); + R42(d2, e2, a2, b2, c2, w3, 14); + R41(c1, d1, e1, a1, b1, w8, 15); + R42(c2, d2, e2, a2, b2, w11, 14); + R41(b1, c1, d1, e1, a1, w12, 9); + R42(b2, c2, d2, e2, a2, w15, 6); + R41(a1, b1, c1, d1, e1, w4, 8); + R42(a2, b2, c2, d2, e2, w0, 14); + R41(e1, a1, b1, c1, d1, w13, 9); + R42(e2, a2, b2, c2, d2, w5, 6); + R41(d1, e1, a1, b1, c1, w3, 14); + R42(d2, e2, a2, b2, c2, w12, 9); + R41(c1, d1, e1, a1, b1, w7, 5); + R42(c2, d2, e2, a2, b2, w2, 12); + R41(b1, c1, d1, e1, a1, w15, 6); + R42(b2, c2, d2, e2, a2, w13, 9); + R41(a1, b1, c1, d1, e1, w14, 8); + R42(a2, b2, c2, d2, e2, w9, 12); + R41(e1, a1, b1, c1, d1, w5, 6); + R42(e2, a2, b2, c2, d2, w7, 5); + R41(d1, e1, a1, b1, c1, w6, 5); + R42(d2, e2, a2, b2, c2, w10, 15); + R41(c1, d1, e1, a1, b1, w2, 12); + R42(c2, d2, e2, a2, b2, w14, 8); + + R51(b1, c1, d1, e1, a1, w4, 9); + R52(b2, c2, d2, e2, a2, w12, 8); + R51(a1, b1, c1, d1, e1, w0, 15); + R52(a2, b2, c2, d2, e2, w15, 5); + R51(e1, a1, b1, c1, d1, w5, 5); + R52(e2, a2, b2, c2, d2, w10, 12); + R51(d1, e1, a1, b1, c1, w9, 11); + R52(d2, e2, a2, b2, c2, w4, 9); + R51(c1, d1, e1, a1, b1, w7, 6); + R52(c2, d2, e2, a2, b2, w1, 12); + R51(b1, c1, d1, e1, a1, w12, 8); + R52(b2, c2, d2, e2, a2, w5, 5); + R51(a1, b1, c1, d1, e1, w2, 13); + R52(a2, b2, c2, d2, e2, w8, 14); + R51(e1, a1, b1, c1, d1, w10, 12); + R52(e2, a2, b2, c2, d2, w7, 6); + R51(d1, e1, a1, b1, c1, w14, 5); + R52(d2, e2, a2, b2, c2, w6, 8); + R51(c1, d1, e1, a1, b1, w1, 12); + R52(c2, d2, e2, a2, b2, w2, 13); + R51(b1, c1, d1, e1, a1, w3, 13); + R52(b2, c2, d2, e2, a2, w13, 6); + R51(a1, b1, c1, d1, e1, w8, 14); + R52(a2, b2, c2, d2, e2, w14, 5); + R51(e1, a1, b1, c1, d1, w11, 11); + R52(e2, a2, b2, c2, d2, w0, 15); + R51(d1, e1, a1, b1, c1, w6, 8); + R52(d2, e2, a2, b2, c2, w3, 13); + R51(c1, d1, e1, a1, b1, w15, 5); + R52(c2, d2, e2, a2, b2, w9, 11); + R51(b1, c1, d1, e1, a1, w13, 6); + R52(b2, c2, d2, e2, a2, w11, 11); + + uint32_t t = s[0]; + s[0] = s[1] + c1 + d2; + s[1] = s[2] + d1 + e2; + s[2] = s[3] + e1 + a2; + s[3] = s[4] + a1 + b2; + s[4] = t + b1 + c2; +} + +} // namespace ripemd160 + +} // namespace + +////// RIPEMD160 + +CRIPEMD160::CRIPEMD160() : bytes(0) +{ + ripemd160::Initialize(s); +} + +CRIPEMD160& CRIPEMD160::Write(const unsigned char* data, size_t len) +{ + const unsigned char* end = data + len; + size_t bufsize = bytes % 64; + if (bufsize && bufsize + len >= 64) { + // Fill the buffer, and process it. + memcpy(buf + bufsize, data, 64 - bufsize); + bytes += 64 - bufsize; + data += 64 - bufsize; + ripemd160::Transform(s, buf); + bufsize = 0; + } + while (end >= data + 64) { + // Process full chunks directly from the source. 
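+ // (The buffer is necessarily empty at this point: it was either flushed
+ // just above, or this Write() began on a block boundary. Whole 64-byte
+ // chunks can therefore be hashed straight from the caller's data,
+ // avoiding an extra copy; only a trailing partial block is buffered.)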
+ ripemd160::Transform(s, data); + bytes += 64; + data += 64; + } + if (end > data) { + // Fill the buffer with what remains. + memcpy(buf + bufsize, data, end - data); + bytes += end - data; + } + return *this; +} + +void CRIPEMD160::Finalize(unsigned char hash[OUTPUT_SIZE]) +{ + static const unsigned char pad[64] = {0x80}; + unsigned char sizedesc[8]; + WriteLE64(sizedesc, bytes << 3); + Write(pad, 1 + ((119 - (bytes % 64)) % 64)); + Write(sizedesc, 8); + WriteLE32(hash, s[0]); + WriteLE32(hash + 4, s[1]); + WriteLE32(hash + 8, s[2]); + WriteLE32(hash + 12, s[3]); + WriteLE32(hash + 16, s[4]); +} + +CRIPEMD160& CRIPEMD160::Reset() +{ + bytes = 0; + ripemd160::Initialize(s); + return *this; +} diff --git a/src/crypto/ripemd160.h b/src/crypto/ripemd160.h new file mode 100644 index 0000000000..38ea375c1f --- /dev/null +++ b/src/crypto/ripemd160.h @@ -0,0 +1,28 @@ +// Copyright (c) 2014-2016 The Bitcoin Core developers +// Distributed under the MIT software license, see the accompanying +// file COPYING or http://www.opensource.org/licenses/mit-license.php. + +#ifndef BITCOIN_CRYPTO_RIPEMD160_H +#define BITCOIN_CRYPTO_RIPEMD160_H + +#include <stdint.h> +#include <stdlib.h> + +/** A hasher class for RIPEMD-160. */ +class CRIPEMD160 +{ +private: + uint32_t s[5]; + unsigned char buf[64]; + uint64_t bytes; + +public: + static const size_t OUTPUT_SIZE = 20; + + CRIPEMD160(); + CRIPEMD160& Write(const unsigned char* data, size_t len); + void Finalize(unsigned char hash[OUTPUT_SIZE]); + CRIPEMD160& Reset(); +}; + +#endif // BITCOIN_CRYPTO_RIPEMD160_H diff --git a/src/crypto/sha1.cpp b/src/crypto/sha1.cpp new file mode 100644 index 0000000000..dc96ac507a --- /dev/null +++ b/src/crypto/sha1.cpp @@ -0,0 +1,199 @@ +// Copyright (c) 2014-2017 The Bitcoin Core developers +// Distributed under the MIT software license, see the accompanying +// file COPYING or http://www.opensource.org/licenses/mit-license.php. + +#include <crypto/sha1.h> + +#include <crypto/common.h> + +#include <string.h> + +// Internal implementation code. +namespace +{ +/// Internal SHA-1 implementation. +namespace sha1 +{ +/** One round of SHA-1. */ +void inline Round(uint32_t a, uint32_t& b, uint32_t c, uint32_t d, uint32_t& e, uint32_t f, uint32_t k, uint32_t w) +{ + e += ((a << 5) | (a >> 27)) + f + k + w; + b = (b << 30) | (b >> 2); +} + +uint32_t inline f1(uint32_t b, uint32_t c, uint32_t d) { return d ^ (b & (c ^ d)); } +uint32_t inline f2(uint32_t b, uint32_t c, uint32_t d) { return b ^ c ^ d; } +uint32_t inline f3(uint32_t b, uint32_t c, uint32_t d) { return (b & c) | (d & (b | c)); } + +uint32_t inline left(uint32_t x) { return (x << 1) | (x >> 31); } + +/** Initialize SHA-1 state. */ +void inline Initialize(uint32_t* s) +{ + s[0] = 0x67452301ul; + s[1] = 0xEFCDAB89ul; + s[2] = 0x98BADCFEul; + s[3] = 0x10325476ul; + s[4] = 0xC3D2E1F0ul; +} + +const uint32_t k1 = 0x5A827999ul; +const uint32_t k2 = 0x6ED9EBA1ul; +const uint32_t k3 = 0x8F1BBCDCul; +const uint32_t k4 = 0xCA62C1D6ul; + +/** Perform a SHA-1 transformation, processing a 64-byte chunk. 
*/ +void Transform(uint32_t* s, const unsigned char* chunk) +{ + uint32_t a = s[0], b = s[1], c = s[2], d = s[3], e = s[4]; + uint32_t w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15; + + Round(a, b, c, d, e, f1(b, c, d), k1, w0 = ReadBE32(chunk + 0)); + Round(e, a, b, c, d, f1(a, b, c), k1, w1 = ReadBE32(chunk + 4)); + Round(d, e, a, b, c, f1(e, a, b), k1, w2 = ReadBE32(chunk + 8)); + Round(c, d, e, a, b, f1(d, e, a), k1, w3 = ReadBE32(chunk + 12)); + Round(b, c, d, e, a, f1(c, d, e), k1, w4 = ReadBE32(chunk + 16)); + Round(a, b, c, d, e, f1(b, c, d), k1, w5 = ReadBE32(chunk + 20)); + Round(e, a, b, c, d, f1(a, b, c), k1, w6 = ReadBE32(chunk + 24)); + Round(d, e, a, b, c, f1(e, a, b), k1, w7 = ReadBE32(chunk + 28)); + Round(c, d, e, a, b, f1(d, e, a), k1, w8 = ReadBE32(chunk + 32)); + Round(b, c, d, e, a, f1(c, d, e), k1, w9 = ReadBE32(chunk + 36)); + Round(a, b, c, d, e, f1(b, c, d), k1, w10 = ReadBE32(chunk + 40)); + Round(e, a, b, c, d, f1(a, b, c), k1, w11 = ReadBE32(chunk + 44)); + Round(d, e, a, b, c, f1(e, a, b), k1, w12 = ReadBE32(chunk + 48)); + Round(c, d, e, a, b, f1(d, e, a), k1, w13 = ReadBE32(chunk + 52)); + Round(b, c, d, e, a, f1(c, d, e), k1, w14 = ReadBE32(chunk + 56)); + Round(a, b, c, d, e, f1(b, c, d), k1, w15 = ReadBE32(chunk + 60)); + + Round(e, a, b, c, d, f1(a, b, c), k1, w0 = left(w0 ^ w13 ^ w8 ^ w2)); + Round(d, e, a, b, c, f1(e, a, b), k1, w1 = left(w1 ^ w14 ^ w9 ^ w3)); + Round(c, d, e, a, b, f1(d, e, a), k1, w2 = left(w2 ^ w15 ^ w10 ^ w4)); + Round(b, c, d, e, a, f1(c, d, e), k1, w3 = left(w3 ^ w0 ^ w11 ^ w5)); + Round(a, b, c, d, e, f2(b, c, d), k2, w4 = left(w4 ^ w1 ^ w12 ^ w6)); + Round(e, a, b, c, d, f2(a, b, c), k2, w5 = left(w5 ^ w2 ^ w13 ^ w7)); + Round(d, e, a, b, c, f2(e, a, b), k2, w6 = left(w6 ^ w3 ^ w14 ^ w8)); + Round(c, d, e, a, b, f2(d, e, a), k2, w7 = left(w7 ^ w4 ^ w15 ^ w9)); + Round(b, c, d, e, a, f2(c, d, e), k2, w8 = left(w8 ^ w5 ^ w0 ^ w10)); + Round(a, b, c, d, e, f2(b, c, d), k2, w9 = left(w9 ^ w6 ^ w1 ^ w11)); + Round(e, a, b, c, d, f2(a, b, c), k2, w10 = left(w10 ^ w7 ^ w2 ^ w12)); + Round(d, e, a, b, c, f2(e, a, b), k2, w11 = left(w11 ^ w8 ^ w3 ^ w13)); + Round(c, d, e, a, b, f2(d, e, a), k2, w12 = left(w12 ^ w9 ^ w4 ^ w14)); + Round(b, c, d, e, a, f2(c, d, e), k2, w13 = left(w13 ^ w10 ^ w5 ^ w15)); + Round(a, b, c, d, e, f2(b, c, d), k2, w14 = left(w14 ^ w11 ^ w6 ^ w0)); + Round(e, a, b, c, d, f2(a, b, c), k2, w15 = left(w15 ^ w12 ^ w7 ^ w1)); + + Round(d, e, a, b, c, f2(e, a, b), k2, w0 = left(w0 ^ w13 ^ w8 ^ w2)); + Round(c, d, e, a, b, f2(d, e, a), k2, w1 = left(w1 ^ w14 ^ w9 ^ w3)); + Round(b, c, d, e, a, f2(c, d, e), k2, w2 = left(w2 ^ w15 ^ w10 ^ w4)); + Round(a, b, c, d, e, f2(b, c, d), k2, w3 = left(w3 ^ w0 ^ w11 ^ w5)); + Round(e, a, b, c, d, f2(a, b, c), k2, w4 = left(w4 ^ w1 ^ w12 ^ w6)); + Round(d, e, a, b, c, f2(e, a, b), k2, w5 = left(w5 ^ w2 ^ w13 ^ w7)); + Round(c, d, e, a, b, f2(d, e, a), k2, w6 = left(w6 ^ w3 ^ w14 ^ w8)); + Round(b, c, d, e, a, f2(c, d, e), k2, w7 = left(w7 ^ w4 ^ w15 ^ w9)); + Round(a, b, c, d, e, f3(b, c, d), k3, w8 = left(w8 ^ w5 ^ w0 ^ w10)); + Round(e, a, b, c, d, f3(a, b, c), k3, w9 = left(w9 ^ w6 ^ w1 ^ w11)); + Round(d, e, a, b, c, f3(e, a, b), k3, w10 = left(w10 ^ w7 ^ w2 ^ w12)); + Round(c, d, e, a, b, f3(d, e, a), k3, w11 = left(w11 ^ w8 ^ w3 ^ w13)); + Round(b, c, d, e, a, f3(c, d, e), k3, w12 = left(w12 ^ w9 ^ w4 ^ w14)); + Round(a, b, c, d, e, f3(b, c, d), k3, w13 = left(w13 ^ w10 ^ w5 ^ w15)); + Round(e, a, b, c, d, f3(a, b, c), k3, w14 = left(w14 ^ w11 ^ w6 ^ 
w0)); + Round(d, e, a, b, c, f3(e, a, b), k3, w15 = left(w15 ^ w12 ^ w7 ^ w1)); + + Round(c, d, e, a, b, f3(d, e, a), k3, w0 = left(w0 ^ w13 ^ w8 ^ w2)); + Round(b, c, d, e, a, f3(c, d, e), k3, w1 = left(w1 ^ w14 ^ w9 ^ w3)); + Round(a, b, c, d, e, f3(b, c, d), k3, w2 = left(w2 ^ w15 ^ w10 ^ w4)); + Round(e, a, b, c, d, f3(a, b, c), k3, w3 = left(w3 ^ w0 ^ w11 ^ w5)); + Round(d, e, a, b, c, f3(e, a, b), k3, w4 = left(w4 ^ w1 ^ w12 ^ w6)); + Round(c, d, e, a, b, f3(d, e, a), k3, w5 = left(w5 ^ w2 ^ w13 ^ w7)); + Round(b, c, d, e, a, f3(c, d, e), k3, w6 = left(w6 ^ w3 ^ w14 ^ w8)); + Round(a, b, c, d, e, f3(b, c, d), k3, w7 = left(w7 ^ w4 ^ w15 ^ w9)); + Round(e, a, b, c, d, f3(a, b, c), k3, w8 = left(w8 ^ w5 ^ w0 ^ w10)); + Round(d, e, a, b, c, f3(e, a, b), k3, w9 = left(w9 ^ w6 ^ w1 ^ w11)); + Round(c, d, e, a, b, f3(d, e, a), k3, w10 = left(w10 ^ w7 ^ w2 ^ w12)); + Round(b, c, d, e, a, f3(c, d, e), k3, w11 = left(w11 ^ w8 ^ w3 ^ w13)); + Round(a, b, c, d, e, f2(b, c, d), k4, w12 = left(w12 ^ w9 ^ w4 ^ w14)); + Round(e, a, b, c, d, f2(a, b, c), k4, w13 = left(w13 ^ w10 ^ w5 ^ w15)); + Round(d, e, a, b, c, f2(e, a, b), k4, w14 = left(w14 ^ w11 ^ w6 ^ w0)); + Round(c, d, e, a, b, f2(d, e, a), k4, w15 = left(w15 ^ w12 ^ w7 ^ w1)); + + Round(b, c, d, e, a, f2(c, d, e), k4, w0 = left(w0 ^ w13 ^ w8 ^ w2)); + Round(a, b, c, d, e, f2(b, c, d), k4, w1 = left(w1 ^ w14 ^ w9 ^ w3)); + Round(e, a, b, c, d, f2(a, b, c), k4, w2 = left(w2 ^ w15 ^ w10 ^ w4)); + Round(d, e, a, b, c, f2(e, a, b), k4, w3 = left(w3 ^ w0 ^ w11 ^ w5)); + Round(c, d, e, a, b, f2(d, e, a), k4, w4 = left(w4 ^ w1 ^ w12 ^ w6)); + Round(b, c, d, e, a, f2(c, d, e), k4, w5 = left(w5 ^ w2 ^ w13 ^ w7)); + Round(a, b, c, d, e, f2(b, c, d), k4, w6 = left(w6 ^ w3 ^ w14 ^ w8)); + Round(e, a, b, c, d, f2(a, b, c), k4, w7 = left(w7 ^ w4 ^ w15 ^ w9)); + Round(d, e, a, b, c, f2(e, a, b), k4, w8 = left(w8 ^ w5 ^ w0 ^ w10)); + Round(c, d, e, a, b, f2(d, e, a), k4, w9 = left(w9 ^ w6 ^ w1 ^ w11)); + Round(b, c, d, e, a, f2(c, d, e), k4, w10 = left(w10 ^ w7 ^ w2 ^ w12)); + Round(a, b, c, d, e, f2(b, c, d), k4, w11 = left(w11 ^ w8 ^ w3 ^ w13)); + Round(e, a, b, c, d, f2(a, b, c), k4, w12 = left(w12 ^ w9 ^ w4 ^ w14)); + Round(d, e, a, b, c, f2(e, a, b), k4, left(w13 ^ w10 ^ w5 ^ w15)); + Round(c, d, e, a, b, f2(d, e, a), k4, left(w14 ^ w11 ^ w6 ^ w0)); + Round(b, c, d, e, a, f2(c, d, e), k4, left(w15 ^ w12 ^ w7 ^ w1)); + + s[0] += a; + s[1] += b; + s[2] += c; + s[3] += d; + s[4] += e; +} + +} // namespace sha1 + +} // namespace + +////// SHA1 + +CSHA1::CSHA1() : bytes(0) +{ + sha1::Initialize(s); +} + +CSHA1& CSHA1::Write(const unsigned char* data, size_t len) +{ + const unsigned char* end = data + len; + size_t bufsize = bytes % 64; + if (bufsize && bufsize + len >= 64) { + // Fill the buffer, and process it. + memcpy(buf + bufsize, data, 64 - bufsize); + bytes += 64 - bufsize; + data += 64 - bufsize; + sha1::Transform(s, buf); + bufsize = 0; + } + while (end >= data + 64) { + // Process full chunks directly from the source. + sha1::Transform(s, data); + bytes += 64; + data += 64; + } + if (end > data) { + // Fill the buffer with what remains. 
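+ // (Fewer than 64 bytes are left, so they cannot be compressed yet;
+ // they wait in buf for the next Write() or for the padding that
+ // Finalize() appends.)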
+ memcpy(buf + bufsize, data, end - data); + bytes += end - data; + } + return *this; +} + +void CSHA1::Finalize(unsigned char hash[OUTPUT_SIZE]) +{ + static const unsigned char pad[64] = {0x80}; + unsigned char sizedesc[8]; + WriteBE64(sizedesc, bytes << 3); + Write(pad, 1 + ((119 - (bytes % 64)) % 64)); + Write(sizedesc, 8); + WriteBE32(hash, s[0]); + WriteBE32(hash + 4, s[1]); + WriteBE32(hash + 8, s[2]); + WriteBE32(hash + 12, s[3]); + WriteBE32(hash + 16, s[4]); +} + +CSHA1& CSHA1::Reset() +{ + bytes = 0; + sha1::Initialize(s); + return *this; +} diff --git a/src/crypto/sha1.h b/src/crypto/sha1.h new file mode 100644 index 0000000000..8b4568ee12 --- /dev/null +++ b/src/crypto/sha1.h @@ -0,0 +1,28 @@ +// Copyright (c) 2014-2016 The Bitcoin Core developers +// Distributed under the MIT software license, see the accompanying +// file COPYING or http://www.opensource.org/licenses/mit-license.php. + +#ifndef BITCOIN_CRYPTO_SHA1_H +#define BITCOIN_CRYPTO_SHA1_H + +#include <stdint.h> +#include <stdlib.h> + +/** A hasher class for SHA1. */ +class CSHA1 +{ +private: + uint32_t s[5]; + unsigned char buf[64]; + uint64_t bytes; + +public: + static const size_t OUTPUT_SIZE = 20; + + CSHA1(); + CSHA1& Write(const unsigned char* data, size_t len); + void Finalize(unsigned char hash[OUTPUT_SIZE]); + CSHA1& Reset(); +}; + +#endif // BITCOIN_CRYPTO_SHA1_H diff --git a/src/crypto/sha256.cpp b/src/crypto/sha256.cpp new file mode 100644 index 0000000000..f3245b8dea --- /dev/null +++ b/src/crypto/sha256.cpp @@ -0,0 +1,249 @@ +// Copyright (c) 2014-2017 The Bitcoin Core developers +// Distributed under the MIT software license, see the accompanying +// file COPYING or http://www.opensource.org/licenses/mit-license.php. + +#include <crypto/sha256.h> +#include <crypto/common.h> + +#include <assert.h> +#include <string.h> +#include <atomic> + +#if defined(__x86_64__) || defined(__amd64__) +#if defined(USE_ASM) +#include <cpuid.h> +namespace sha256_sse4 +{ +void Transform(uint32_t* s, const unsigned char* chunk, size_t blocks); +} +#endif +#endif + +// Internal implementation code. +namespace +{ +/// Internal SHA-256 implementation. +namespace sha256 +{ +uint32_t inline Ch(uint32_t x, uint32_t y, uint32_t z) { return z ^ (x & (y ^ z)); } +uint32_t inline Maj(uint32_t x, uint32_t y, uint32_t z) { return (x & y) | (z & (x | y)); } +uint32_t inline Sigma0(uint32_t x) { return (x >> 2 | x << 30) ^ (x >> 13 | x << 19) ^ (x >> 22 | x << 10); } +uint32_t inline Sigma1(uint32_t x) { return (x >> 6 | x << 26) ^ (x >> 11 | x << 21) ^ (x >> 25 | x << 7); } +uint32_t inline sigma0(uint32_t x) { return (x >> 7 | x << 25) ^ (x >> 18 | x << 14) ^ (x >> 3); } +uint32_t inline sigma1(uint32_t x) { return (x >> 17 | x << 15) ^ (x >> 19 | x << 13) ^ (x >> 10); } + +/** One round of SHA-256. */ +void inline Round(uint32_t a, uint32_t b, uint32_t c, uint32_t& d, uint32_t e, uint32_t f, uint32_t g, uint32_t& h, uint32_t k, uint32_t w) +{ + uint32_t t1 = h + Sigma1(e) + Ch(e, f, g) + k + w; + uint32_t t2 = Sigma0(a) + Maj(a, b, c); + d += t1; + h = t1 + t2; +} + +/** Initialize SHA-256 state. */ +void inline Initialize(uint32_t* s) +{ + s[0] = 0x6a09e667ul; + s[1] = 0xbb67ae85ul; + s[2] = 0x3c6ef372ul; + s[3] = 0xa54ff53aul; + s[4] = 0x510e527ful; + s[5] = 0x9b05688cul; + s[6] = 0x1f83d9abul; + s[7] = 0x5be0cd19ul; +} + +/** Perform a number of SHA-256 transformations, processing 64-byte chunks. 
*/ +void Transform(uint32_t* s, const unsigned char* chunk, size_t blocks) +{ + while (blocks--) { + uint32_t a = s[0], b = s[1], c = s[2], d = s[3], e = s[4], f = s[5], g = s[6], h = s[7]; + uint32_t w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15; + + Round(a, b, c, d, e, f, g, h, 0x428a2f98, w0 = ReadBE32(chunk + 0)); + Round(h, a, b, c, d, e, f, g, 0x71374491, w1 = ReadBE32(chunk + 4)); + Round(g, h, a, b, c, d, e, f, 0xb5c0fbcf, w2 = ReadBE32(chunk + 8)); + Round(f, g, h, a, b, c, d, e, 0xe9b5dba5, w3 = ReadBE32(chunk + 12)); + Round(e, f, g, h, a, b, c, d, 0x3956c25b, w4 = ReadBE32(chunk + 16)); + Round(d, e, f, g, h, a, b, c, 0x59f111f1, w5 = ReadBE32(chunk + 20)); + Round(c, d, e, f, g, h, a, b, 0x923f82a4, w6 = ReadBE32(chunk + 24)); + Round(b, c, d, e, f, g, h, a, 0xab1c5ed5, w7 = ReadBE32(chunk + 28)); + Round(a, b, c, d, e, f, g, h, 0xd807aa98, w8 = ReadBE32(chunk + 32)); + Round(h, a, b, c, d, e, f, g, 0x12835b01, w9 = ReadBE32(chunk + 36)); + Round(g, h, a, b, c, d, e, f, 0x243185be, w10 = ReadBE32(chunk + 40)); + Round(f, g, h, a, b, c, d, e, 0x550c7dc3, w11 = ReadBE32(chunk + 44)); + Round(e, f, g, h, a, b, c, d, 0x72be5d74, w12 = ReadBE32(chunk + 48)); + Round(d, e, f, g, h, a, b, c, 0x80deb1fe, w13 = ReadBE32(chunk + 52)); + Round(c, d, e, f, g, h, a, b, 0x9bdc06a7, w14 = ReadBE32(chunk + 56)); + Round(b, c, d, e, f, g, h, a, 0xc19bf174, w15 = ReadBE32(chunk + 60)); + + Round(a, b, c, d, e, f, g, h, 0xe49b69c1, w0 += sigma1(w14) + w9 + sigma0(w1)); + Round(h, a, b, c, d, e, f, g, 0xefbe4786, w1 += sigma1(w15) + w10 + sigma0(w2)); + Round(g, h, a, b, c, d, e, f, 0x0fc19dc6, w2 += sigma1(w0) + w11 + sigma0(w3)); + Round(f, g, h, a, b, c, d, e, 0x240ca1cc, w3 += sigma1(w1) + w12 + sigma0(w4)); + Round(e, f, g, h, a, b, c, d, 0x2de92c6f, w4 += sigma1(w2) + w13 + sigma0(w5)); + Round(d, e, f, g, h, a, b, c, 0x4a7484aa, w5 += sigma1(w3) + w14 + sigma0(w6)); + Round(c, d, e, f, g, h, a, b, 0x5cb0a9dc, w6 += sigma1(w4) + w15 + sigma0(w7)); + Round(b, c, d, e, f, g, h, a, 0x76f988da, w7 += sigma1(w5) + w0 + sigma0(w8)); + Round(a, b, c, d, e, f, g, h, 0x983e5152, w8 += sigma1(w6) + w1 + sigma0(w9)); + Round(h, a, b, c, d, e, f, g, 0xa831c66d, w9 += sigma1(w7) + w2 + sigma0(w10)); + Round(g, h, a, b, c, d, e, f, 0xb00327c8, w10 += sigma1(w8) + w3 + sigma0(w11)); + Round(f, g, h, a, b, c, d, e, 0xbf597fc7, w11 += sigma1(w9) + w4 + sigma0(w12)); + Round(e, f, g, h, a, b, c, d, 0xc6e00bf3, w12 += sigma1(w10) + w5 + sigma0(w13)); + Round(d, e, f, g, h, a, b, c, 0xd5a79147, w13 += sigma1(w11) + w6 + sigma0(w14)); + Round(c, d, e, f, g, h, a, b, 0x06ca6351, w14 += sigma1(w12) + w7 + sigma0(w15)); + Round(b, c, d, e, f, g, h, a, 0x14292967, w15 += sigma1(w13) + w8 + sigma0(w0)); + + Round(a, b, c, d, e, f, g, h, 0x27b70a85, w0 += sigma1(w14) + w9 + sigma0(w1)); + Round(h, a, b, c, d, e, f, g, 0x2e1b2138, w1 += sigma1(w15) + w10 + sigma0(w2)); + Round(g, h, a, b, c, d, e, f, 0x4d2c6dfc, w2 += sigma1(w0) + w11 + sigma0(w3)); + Round(f, g, h, a, b, c, d, e, 0x53380d13, w3 += sigma1(w1) + w12 + sigma0(w4)); + Round(e, f, g, h, a, b, c, d, 0x650a7354, w4 += sigma1(w2) + w13 + sigma0(w5)); + Round(d, e, f, g, h, a, b, c, 0x766a0abb, w5 += sigma1(w3) + w14 + sigma0(w6)); + Round(c, d, e, f, g, h, a, b, 0x81c2c92e, w6 += sigma1(w4) + w15 + sigma0(w7)); + Round(b, c, d, e, f, g, h, a, 0x92722c85, w7 += sigma1(w5) + w0 + sigma0(w8)); + Round(a, b, c, d, e, f, g, h, 0xa2bfe8a1, w8 += sigma1(w6) + w1 + sigma0(w9)); + Round(h, a, b, c, d, e, f, g, 0xa81a664b, w9 += sigma1(w7) + w2 
+ sigma0(w10)); + Round(g, h, a, b, c, d, e, f, 0xc24b8b70, w10 += sigma1(w8) + w3 + sigma0(w11)); + Round(f, g, h, a, b, c, d, e, 0xc76c51a3, w11 += sigma1(w9) + w4 + sigma0(w12)); + Round(e, f, g, h, a, b, c, d, 0xd192e819, w12 += sigma1(w10) + w5 + sigma0(w13)); + Round(d, e, f, g, h, a, b, c, 0xd6990624, w13 += sigma1(w11) + w6 + sigma0(w14)); + Round(c, d, e, f, g, h, a, b, 0xf40e3585, w14 += sigma1(w12) + w7 + sigma0(w15)); + Round(b, c, d, e, f, g, h, a, 0x106aa070, w15 += sigma1(w13) + w8 + sigma0(w0)); + + Round(a, b, c, d, e, f, g, h, 0x19a4c116, w0 += sigma1(w14) + w9 + sigma0(w1)); + Round(h, a, b, c, d, e, f, g, 0x1e376c08, w1 += sigma1(w15) + w10 + sigma0(w2)); + Round(g, h, a, b, c, d, e, f, 0x2748774c, w2 += sigma1(w0) + w11 + sigma0(w3)); + Round(f, g, h, a, b, c, d, e, 0x34b0bcb5, w3 += sigma1(w1) + w12 + sigma0(w4)); + Round(e, f, g, h, a, b, c, d, 0x391c0cb3, w4 += sigma1(w2) + w13 + sigma0(w5)); + Round(d, e, f, g, h, a, b, c, 0x4ed8aa4a, w5 += sigma1(w3) + w14 + sigma0(w6)); + Round(c, d, e, f, g, h, a, b, 0x5b9cca4f, w6 += sigma1(w4) + w15 + sigma0(w7)); + Round(b, c, d, e, f, g, h, a, 0x682e6ff3, w7 += sigma1(w5) + w0 + sigma0(w8)); + Round(a, b, c, d, e, f, g, h, 0x748f82ee, w8 += sigma1(w6) + w1 + sigma0(w9)); + Round(h, a, b, c, d, e, f, g, 0x78a5636f, w9 += sigma1(w7) + w2 + sigma0(w10)); + Round(g, h, a, b, c, d, e, f, 0x84c87814, w10 += sigma1(w8) + w3 + sigma0(w11)); + Round(f, g, h, a, b, c, d, e, 0x8cc70208, w11 += sigma1(w9) + w4 + sigma0(w12)); + Round(e, f, g, h, a, b, c, d, 0x90befffa, w12 += sigma1(w10) + w5 + sigma0(w13)); + Round(d, e, f, g, h, a, b, c, 0xa4506ceb, w13 += sigma1(w11) + w6 + sigma0(w14)); + Round(c, d, e, f, g, h, a, b, 0xbef9a3f7, w14 + sigma1(w12) + w7 + sigma0(w15)); + Round(b, c, d, e, f, g, h, a, 0xc67178f2, w15 + sigma1(w13) + w8 + sigma0(w0)); + + s[0] += a; + s[1] += b; + s[2] += c; + s[3] += d; + s[4] += e; + s[5] += f; + s[6] += g; + s[7] += h; + chunk += 64; + } +} + +} // namespace sha256 + +typedef void (*TransformType)(uint32_t*, const unsigned char*, size_t); + +bool SelfTest(TransformType tr) { + static const unsigned char in1[65] = {0, 0x80}; + static const unsigned char in2[129] = { + 0, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0 + }; + static const uint32_t init[8] = {0x6a09e667ul, 0xbb67ae85ul, 0x3c6ef372ul, 0xa54ff53aul, 0x510e527ful, 0x9b05688cul, 0x1f83d9abul, 0x5be0cd19ul}; + static const uint32_t out1[8] = {0xe3b0c442ul, 0x98fc1c14ul, 0x9afbf4c8ul, 0x996fb924ul, 0x27ae41e4ul, 0x649b934cul, 0xa495991bul, 0x7852b855ul}; + static const uint32_t out2[8] = {0xce4153b0ul, 0x147c2a86ul, 0x3ed4298eul, 0xe0676bc8ul, 0x79fc77a1ul, 0x2abe1f49ul, 0xb2b055dful, 0x1069523eul}; + uint32_t buf[8]; + memcpy(buf, init, sizeof(buf)); + // Process nothing, and check we remain in the initial state. 
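+ // (With blocks == 0 the chunk pointer is never dereferenced, so the
+ // nullptr below is safe for any Transform. The out1 state checked next
+ // is simply SHA256("") split into state words: the familiar digest
+ // beginning e3b0c442... and ending ...7852b855.)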
+ tr(buf, nullptr, 0); + if (memcmp(buf, init, sizeof(buf))) return false; + // Process the padded empty string (unaligned) + tr(buf, in1 + 1, 1); + if (memcmp(buf, out1, sizeof(buf))) return false; + // Process 64 spaces (unaligned) + memcpy(buf, init, sizeof(buf)); + tr(buf, in2 + 1, 2); + if (memcmp(buf, out2, sizeof(buf))) return false; + return true; +} + +TransformType Transform = sha256::Transform; + +} // namespace + +std::string SHA256AutoDetect() +{ +#if defined(USE_ASM) && (defined(__x86_64__) || defined(__amd64__)) + uint32_t eax, ebx, ecx, edx; + if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) && (ecx >> 19) & 1) { + Transform = sha256_sse4::Transform; + assert(SelfTest(Transform)); + return "sse4"; + } +#endif + + assert(SelfTest(Transform)); + return "standard"; +} + +////// SHA-256 + +CSHA256::CSHA256() : bytes(0) +{ + sha256::Initialize(s); +} + +CSHA256& CSHA256::Write(const unsigned char* data, size_t len) +{ + const unsigned char* end = data + len; + size_t bufsize = bytes % 64; + if (bufsize && bufsize + len >= 64) { + // Fill the buffer, and process it. + memcpy(buf + bufsize, data, 64 - bufsize); + bytes += 64 - bufsize; + data += 64 - bufsize; + Transform(s, buf, 1); + bufsize = 0; + } + if (end - data >= 64) { + size_t blocks = (end - data) / 64; + Transform(s, data, blocks); + data += 64 * blocks; + bytes += 64 * blocks; + } + if (end > data) { + // Fill the buffer with what remains. + memcpy(buf + bufsize, data, end - data); + bytes += end - data; + } + return *this; +} + +void CSHA256::Finalize(unsigned char hash[OUTPUT_SIZE]) +{ + static const unsigned char pad[64] = {0x80}; + unsigned char sizedesc[8]; + WriteBE64(sizedesc, bytes << 3); + Write(pad, 1 + ((119 - (bytes % 64)) % 64)); + Write(sizedesc, 8); + WriteBE32(hash, s[0]); + WriteBE32(hash + 4, s[1]); + WriteBE32(hash + 8, s[2]); + WriteBE32(hash + 12, s[3]); + WriteBE32(hash + 16, s[4]); + WriteBE32(hash + 20, s[5]); + WriteBE32(hash + 24, s[6]); + WriteBE32(hash + 28, s[7]); +} + +CSHA256& CSHA256::Reset() +{ + bytes = 0; + sha256::Initialize(s); + return *this; +} diff --git a/src/crypto/sha256.h b/src/crypto/sha256.h new file mode 100644 index 0000000000..dd30fe396f --- /dev/null +++ b/src/crypto/sha256.h @@ -0,0 +1,34 @@ +// Copyright (c) 2014-2017 The Bitcoin Core developers +// Distributed under the MIT software license, see the accompanying +// file COPYING or http://www.opensource.org/licenses/mit-license.php. + +#ifndef BITCOIN_CRYPTO_SHA256_H +#define BITCOIN_CRYPTO_SHA256_H + +#include <stdint.h> +#include <stdlib.h> +#include <string> + +/** A hasher class for SHA-256. */ +class CSHA256 +{ +private: + uint32_t s[8]; + unsigned char buf[64]; + uint64_t bytes; + +public: + static const size_t OUTPUT_SIZE = 32; + + CSHA256(); + CSHA256& Write(const unsigned char* data, size_t len); + void Finalize(unsigned char hash[OUTPUT_SIZE]); + CSHA256& Reset(); +}; + +/** Autodetect the best available SHA256 implementation. + * Returns the name of the implementation. + */ +std::string SHA256AutoDetect(); + +#endif // BITCOIN_CRYPTO_SHA256_H diff --git a/src/crypto/sha256_sse4.cpp b/src/crypto/sha256_sse4.cpp new file mode 100644 index 0000000000..89f529a3ab --- /dev/null +++ b/src/crypto/sha256_sse4.cpp @@ -0,0 +1,1506 @@ +// Copyright (c) 2017 The Bitcoin Core developers +// Distributed under the MIT software license, see the accompanying +// file COPYING or http://www.opensource.org/licenses/mit-license.php. 
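+//
+// Runtime dispatch note: this routine is only reached through the
+// Transform function pointer, which SHA256AutoDetect() points here when
+// CPUID reports SSE4.1 support (bit 19 of ECX for leaf 1) and the
+// implementation passes SelfTest().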
+// +// This is a translation to GCC extended asm syntax from YASM code by Intel +// (available at the bottom of this file). + +#include <stdint.h> +#include <stdlib.h> + +#if defined(__x86_64__) || defined(__amd64__) + +namespace sha256_sse4 +{ +void Transform(uint32_t* s, const unsigned char* chunk, size_t blocks) +{ + static const uint32_t K256 alignas(16) [] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, + }; + static const uint32_t FLIP_MASK alignas(16) [] = {0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f}; + static const uint32_t SHUF_00BA alignas(16) [] = {0x03020100, 0x0b0a0908, 0xffffffff, 0xffffffff}; + static const uint32_t SHUF_DC00 alignas(16) [] = {0xffffffff, 0xffffffff, 0x03020100, 0x0b0a0908}; + uint32_t a, b, c, d, f, g, h, y0, y1, y2; + uint64_t tbl; + uint64_t inp_end, inp; + uint32_t xfer alignas(16) [4]; + + __asm__ __volatile__( + "shl $0x6,%2;" + "je Ldone_hash_%=;" + "add %1,%2;" + "mov %2,%14;" + "mov (%0),%3;" + "mov 0x4(%0),%4;" + "mov 0x8(%0),%5;" + "mov 0xc(%0),%6;" + "mov 0x10(%0),%k2;" + "mov 0x14(%0),%7;" + "mov 0x18(%0),%8;" + "mov 0x1c(%0),%9;" + "movdqa %18,%%xmm12;" + "movdqa %19,%%xmm10;" + "movdqa %20,%%xmm11;" + + "Lloop0_%=:" + "lea %17,%13;" + "movdqu (%1),%%xmm4;" + "pshufb %%xmm12,%%xmm4;" + "movdqu 0x10(%1),%%xmm5;" + "pshufb %%xmm12,%%xmm5;" + "movdqu 0x20(%1),%%xmm6;" + "pshufb %%xmm12,%%xmm6;" + "movdqu 0x30(%1),%%xmm7;" + "pshufb %%xmm12,%%xmm7;" + "mov %1,%15;" + "mov $3,%1;" + + "Lloop1_%=:" + "movdqa 0x0(%13),%%xmm9;" + "paddd %%xmm4,%%xmm9;" + "movdqa %%xmm9,%16;" + "movdqa %%xmm7,%%xmm0;" + "mov %k2,%10;" + "ror $0xe,%10;" + "mov %3,%11;" + "palignr $0x4,%%xmm6,%%xmm0;" + "ror $0x9,%11;" + "xor %k2,%10;" + "mov %7,%12;" + "ror $0x5,%10;" + "movdqa %%xmm5,%%xmm1;" + "xor %3,%11;" + "xor %8,%12;" + "paddd %%xmm4,%%xmm0;" + "xor %k2,%10;" + "and %k2,%12;" + "ror $0xb,%11;" + "palignr $0x4,%%xmm4,%%xmm1;" + "xor %3,%11;" + "ror $0x6,%10;" + "xor %8,%12;" + "movdqa %%xmm1,%%xmm2;" + "ror $0x2,%11;" + "add %10,%12;" + "add %16,%12;" + "movdqa %%xmm1,%%xmm3;" + "mov %3,%10;" + "add %12,%9;" + "mov %3,%12;" + "pslld $0x19,%%xmm1;" + "or %5,%10;" + "add %9,%6;" + "and %5,%12;" + "psrld $0x7,%%xmm2;" + "and %4,%10;" + "add %11,%9;" + "por %%xmm2,%%xmm1;" + "or %12,%10;" + "add %10,%9;" + "movdqa %%xmm3,%%xmm2;" + "mov %6,%10;" + "mov %9,%11;" + "movdqa %%xmm3,%%xmm8;" + "ror $0xe,%10;" + "xor %6,%10;" + "mov %k2,%12;" + "ror $0x9,%11;" + "pslld $0xe,%%xmm3;" + "xor %9,%11;" + "ror $0x5,%10;" + "xor %7,%12;" + "psrld $0x12,%%xmm2;" + "ror $0xb,%11;" + "xor %6,%10;" + "and %6,%12;" + "ror $0x6,%10;" + "pxor %%xmm3,%%xmm1;" + "xor %9,%11;" + "xor %7,%12;" + "psrld $0x3,%%xmm8;" + "add %10,%12;" + "add 4+%16,%12;" + "ror $0x2,%11;" + "pxor %%xmm2,%%xmm1;" + "mov %9,%10;" + "add %12,%8;" + "mov %9,%12;" + 
"pxor %%xmm8,%%xmm1;" + "or %4,%10;" + "add %8,%5;" + "and %4,%12;" + "pshufd $0xfa,%%xmm7,%%xmm2;" + "and %3,%10;" + "add %11,%8;" + "paddd %%xmm1,%%xmm0;" + "or %12,%10;" + "add %10,%8;" + "movdqa %%xmm2,%%xmm3;" + "mov %5,%10;" + "mov %8,%11;" + "ror $0xe,%10;" + "movdqa %%xmm2,%%xmm8;" + "xor %5,%10;" + "ror $0x9,%11;" + "mov %6,%12;" + "xor %8,%11;" + "ror $0x5,%10;" + "psrlq $0x11,%%xmm2;" + "xor %k2,%12;" + "psrlq $0x13,%%xmm3;" + "xor %5,%10;" + "and %5,%12;" + "psrld $0xa,%%xmm8;" + "ror $0xb,%11;" + "xor %8,%11;" + "xor %k2,%12;" + "ror $0x6,%10;" + "pxor %%xmm3,%%xmm2;" + "add %10,%12;" + "ror $0x2,%11;" + "add 8+%16,%12;" + "pxor %%xmm2,%%xmm8;" + "mov %8,%10;" + "add %12,%7;" + "mov %8,%12;" + "pshufb %%xmm10,%%xmm8;" + "or %3,%10;" + "add %7,%4;" + "and %3,%12;" + "paddd %%xmm8,%%xmm0;" + "and %9,%10;" + "add %11,%7;" + "pshufd $0x50,%%xmm0,%%xmm2;" + "or %12,%10;" + "add %10,%7;" + "movdqa %%xmm2,%%xmm3;" + "mov %4,%10;" + "ror $0xe,%10;" + "mov %7,%11;" + "movdqa %%xmm2,%%xmm4;" + "ror $0x9,%11;" + "xor %4,%10;" + "mov %5,%12;" + "ror $0x5,%10;" + "psrlq $0x11,%%xmm2;" + "xor %7,%11;" + "xor %6,%12;" + "psrlq $0x13,%%xmm3;" + "xor %4,%10;" + "and %4,%12;" + "ror $0xb,%11;" + "psrld $0xa,%%xmm4;" + "xor %7,%11;" + "ror $0x6,%10;" + "xor %6,%12;" + "pxor %%xmm3,%%xmm2;" + "ror $0x2,%11;" + "add %10,%12;" + "add 12+%16,%12;" + "pxor %%xmm2,%%xmm4;" + "mov %7,%10;" + "add %12,%k2;" + "mov %7,%12;" + "pshufb %%xmm11,%%xmm4;" + "or %9,%10;" + "add %k2,%3;" + "and %9,%12;" + "paddd %%xmm0,%%xmm4;" + "and %8,%10;" + "add %11,%k2;" + "or %12,%10;" + "add %10,%k2;" + "movdqa 0x10(%13),%%xmm9;" + "paddd %%xmm5,%%xmm9;" + "movdqa %%xmm9,%16;" + "movdqa %%xmm4,%%xmm0;" + "mov %3,%10;" + "ror $0xe,%10;" + "mov %k2,%11;" + "palignr $0x4,%%xmm7,%%xmm0;" + "ror $0x9,%11;" + "xor %3,%10;" + "mov %4,%12;" + "ror $0x5,%10;" + "movdqa %%xmm6,%%xmm1;" + "xor %k2,%11;" + "xor %5,%12;" + "paddd %%xmm5,%%xmm0;" + "xor %3,%10;" + "and %3,%12;" + "ror $0xb,%11;" + "palignr $0x4,%%xmm5,%%xmm1;" + "xor %k2,%11;" + "ror $0x6,%10;" + "xor %5,%12;" + "movdqa %%xmm1,%%xmm2;" + "ror $0x2,%11;" + "add %10,%12;" + "add %16,%12;" + "movdqa %%xmm1,%%xmm3;" + "mov %k2,%10;" + "add %12,%6;" + "mov %k2,%12;" + "pslld $0x19,%%xmm1;" + "or %8,%10;" + "add %6,%9;" + "and %8,%12;" + "psrld $0x7,%%xmm2;" + "and %7,%10;" + "add %11,%6;" + "por %%xmm2,%%xmm1;" + "or %12,%10;" + "add %10,%6;" + "movdqa %%xmm3,%%xmm2;" + "mov %9,%10;" + "mov %6,%11;" + "movdqa %%xmm3,%%xmm8;" + "ror $0xe,%10;" + "xor %9,%10;" + "mov %3,%12;" + "ror $0x9,%11;" + "pslld $0xe,%%xmm3;" + "xor %6,%11;" + "ror $0x5,%10;" + "xor %4,%12;" + "psrld $0x12,%%xmm2;" + "ror $0xb,%11;" + "xor %9,%10;" + "and %9,%12;" + "ror $0x6,%10;" + "pxor %%xmm3,%%xmm1;" + "xor %6,%11;" + "xor %4,%12;" + "psrld $0x3,%%xmm8;" + "add %10,%12;" + "add 4+%16,%12;" + "ror $0x2,%11;" + "pxor %%xmm2,%%xmm1;" + "mov %6,%10;" + "add %12,%5;" + "mov %6,%12;" + "pxor %%xmm8,%%xmm1;" + "or %7,%10;" + "add %5,%8;" + "and %7,%12;" + "pshufd $0xfa,%%xmm4,%%xmm2;" + "and %k2,%10;" + "add %11,%5;" + "paddd %%xmm1,%%xmm0;" + "or %12,%10;" + "add %10,%5;" + "movdqa %%xmm2,%%xmm3;" + "mov %8,%10;" + "mov %5,%11;" + "ror $0xe,%10;" + "movdqa %%xmm2,%%xmm8;" + "xor %8,%10;" + "ror $0x9,%11;" + "mov %9,%12;" + "xor %5,%11;" + "ror $0x5,%10;" + "psrlq $0x11,%%xmm2;" + "xor %3,%12;" + "psrlq $0x13,%%xmm3;" + "xor %8,%10;" + "and %8,%12;" + "psrld $0xa,%%xmm8;" + "ror $0xb,%11;" + "xor %5,%11;" + "xor %3,%12;" + "ror $0x6,%10;" + "pxor %%xmm3,%%xmm2;" + "add %10,%12;" + "ror $0x2,%11;" + 
"add 8+%16,%12;" + "pxor %%xmm2,%%xmm8;" + "mov %5,%10;" + "add %12,%4;" + "mov %5,%12;" + "pshufb %%xmm10,%%xmm8;" + "or %k2,%10;" + "add %4,%7;" + "and %k2,%12;" + "paddd %%xmm8,%%xmm0;" + "and %6,%10;" + "add %11,%4;" + "pshufd $0x50,%%xmm0,%%xmm2;" + "or %12,%10;" + "add %10,%4;" + "movdqa %%xmm2,%%xmm3;" + "mov %7,%10;" + "ror $0xe,%10;" + "mov %4,%11;" + "movdqa %%xmm2,%%xmm5;" + "ror $0x9,%11;" + "xor %7,%10;" + "mov %8,%12;" + "ror $0x5,%10;" + "psrlq $0x11,%%xmm2;" + "xor %4,%11;" + "xor %9,%12;" + "psrlq $0x13,%%xmm3;" + "xor %7,%10;" + "and %7,%12;" + "ror $0xb,%11;" + "psrld $0xa,%%xmm5;" + "xor %4,%11;" + "ror $0x6,%10;" + "xor %9,%12;" + "pxor %%xmm3,%%xmm2;" + "ror $0x2,%11;" + "add %10,%12;" + "add 12+%16,%12;" + "pxor %%xmm2,%%xmm5;" + "mov %4,%10;" + "add %12,%3;" + "mov %4,%12;" + "pshufb %%xmm11,%%xmm5;" + "or %6,%10;" + "add %3,%k2;" + "and %6,%12;" + "paddd %%xmm0,%%xmm5;" + "and %5,%10;" + "add %11,%3;" + "or %12,%10;" + "add %10,%3;" + "movdqa 0x20(%13),%%xmm9;" + "paddd %%xmm6,%%xmm9;" + "movdqa %%xmm9,%16;" + "movdqa %%xmm5,%%xmm0;" + "mov %k2,%10;" + "ror $0xe,%10;" + "mov %3,%11;" + "palignr $0x4,%%xmm4,%%xmm0;" + "ror $0x9,%11;" + "xor %k2,%10;" + "mov %7,%12;" + "ror $0x5,%10;" + "movdqa %%xmm7,%%xmm1;" + "xor %3,%11;" + "xor %8,%12;" + "paddd %%xmm6,%%xmm0;" + "xor %k2,%10;" + "and %k2,%12;" + "ror $0xb,%11;" + "palignr $0x4,%%xmm6,%%xmm1;" + "xor %3,%11;" + "ror $0x6,%10;" + "xor %8,%12;" + "movdqa %%xmm1,%%xmm2;" + "ror $0x2,%11;" + "add %10,%12;" + "add %16,%12;" + "movdqa %%xmm1,%%xmm3;" + "mov %3,%10;" + "add %12,%9;" + "mov %3,%12;" + "pslld $0x19,%%xmm1;" + "or %5,%10;" + "add %9,%6;" + "and %5,%12;" + "psrld $0x7,%%xmm2;" + "and %4,%10;" + "add %11,%9;" + "por %%xmm2,%%xmm1;" + "or %12,%10;" + "add %10,%9;" + "movdqa %%xmm3,%%xmm2;" + "mov %6,%10;" + "mov %9,%11;" + "movdqa %%xmm3,%%xmm8;" + "ror $0xe,%10;" + "xor %6,%10;" + "mov %k2,%12;" + "ror $0x9,%11;" + "pslld $0xe,%%xmm3;" + "xor %9,%11;" + "ror $0x5,%10;" + "xor %7,%12;" + "psrld $0x12,%%xmm2;" + "ror $0xb,%11;" + "xor %6,%10;" + "and %6,%12;" + "ror $0x6,%10;" + "pxor %%xmm3,%%xmm1;" + "xor %9,%11;" + "xor %7,%12;" + "psrld $0x3,%%xmm8;" + "add %10,%12;" + "add 4+%16,%12;" + "ror $0x2,%11;" + "pxor %%xmm2,%%xmm1;" + "mov %9,%10;" + "add %12,%8;" + "mov %9,%12;" + "pxor %%xmm8,%%xmm1;" + "or %4,%10;" + "add %8,%5;" + "and %4,%12;" + "pshufd $0xfa,%%xmm5,%%xmm2;" + "and %3,%10;" + "add %11,%8;" + "paddd %%xmm1,%%xmm0;" + "or %12,%10;" + "add %10,%8;" + "movdqa %%xmm2,%%xmm3;" + "mov %5,%10;" + "mov %8,%11;" + "ror $0xe,%10;" + "movdqa %%xmm2,%%xmm8;" + "xor %5,%10;" + "ror $0x9,%11;" + "mov %6,%12;" + "xor %8,%11;" + "ror $0x5,%10;" + "psrlq $0x11,%%xmm2;" + "xor %k2,%12;" + "psrlq $0x13,%%xmm3;" + "xor %5,%10;" + "and %5,%12;" + "psrld $0xa,%%xmm8;" + "ror $0xb,%11;" + "xor %8,%11;" + "xor %k2,%12;" + "ror $0x6,%10;" + "pxor %%xmm3,%%xmm2;" + "add %10,%12;" + "ror $0x2,%11;" + "add 8+%16,%12;" + "pxor %%xmm2,%%xmm8;" + "mov %8,%10;" + "add %12,%7;" + "mov %8,%12;" + "pshufb %%xmm10,%%xmm8;" + "or %3,%10;" + "add %7,%4;" + "and %3,%12;" + "paddd %%xmm8,%%xmm0;" + "and %9,%10;" + "add %11,%7;" + "pshufd $0x50,%%xmm0,%%xmm2;" + "or %12,%10;" + "add %10,%7;" + "movdqa %%xmm2,%%xmm3;" + "mov %4,%10;" + "ror $0xe,%10;" + "mov %7,%11;" + "movdqa %%xmm2,%%xmm6;" + "ror $0x9,%11;" + "xor %4,%10;" + "mov %5,%12;" + "ror $0x5,%10;" + "psrlq $0x11,%%xmm2;" + "xor %7,%11;" + "xor %6,%12;" + "psrlq $0x13,%%xmm3;" + "xor %4,%10;" + "and %4,%12;" + "ror $0xb,%11;" + "psrld $0xa,%%xmm6;" + "xor %7,%11;" + "ror 
$0x6,%10;" + "xor %6,%12;" + "pxor %%xmm3,%%xmm2;" + "ror $0x2,%11;" + "add %10,%12;" + "add 12+%16,%12;" + "pxor %%xmm2,%%xmm6;" + "mov %7,%10;" + "add %12,%k2;" + "mov %7,%12;" + "pshufb %%xmm11,%%xmm6;" + "or %9,%10;" + "add %k2,%3;" + "and %9,%12;" + "paddd %%xmm0,%%xmm6;" + "and %8,%10;" + "add %11,%k2;" + "or %12,%10;" + "add %10,%k2;" + "movdqa 0x30(%13),%%xmm9;" + "paddd %%xmm7,%%xmm9;" + "movdqa %%xmm9,%16;" + "add $0x40,%13;" + "movdqa %%xmm6,%%xmm0;" + "mov %3,%10;" + "ror $0xe,%10;" + "mov %k2,%11;" + "palignr $0x4,%%xmm5,%%xmm0;" + "ror $0x9,%11;" + "xor %3,%10;" + "mov %4,%12;" + "ror $0x5,%10;" + "movdqa %%xmm4,%%xmm1;" + "xor %k2,%11;" + "xor %5,%12;" + "paddd %%xmm7,%%xmm0;" + "xor %3,%10;" + "and %3,%12;" + "ror $0xb,%11;" + "palignr $0x4,%%xmm7,%%xmm1;" + "xor %k2,%11;" + "ror $0x6,%10;" + "xor %5,%12;" + "movdqa %%xmm1,%%xmm2;" + "ror $0x2,%11;" + "add %10,%12;" + "add %16,%12;" + "movdqa %%xmm1,%%xmm3;" + "mov %k2,%10;" + "add %12,%6;" + "mov %k2,%12;" + "pslld $0x19,%%xmm1;" + "or %8,%10;" + "add %6,%9;" + "and %8,%12;" + "psrld $0x7,%%xmm2;" + "and %7,%10;" + "add %11,%6;" + "por %%xmm2,%%xmm1;" + "or %12,%10;" + "add %10,%6;" + "movdqa %%xmm3,%%xmm2;" + "mov %9,%10;" + "mov %6,%11;" + "movdqa %%xmm3,%%xmm8;" + "ror $0xe,%10;" + "xor %9,%10;" + "mov %3,%12;" + "ror $0x9,%11;" + "pslld $0xe,%%xmm3;" + "xor %6,%11;" + "ror $0x5,%10;" + "xor %4,%12;" + "psrld $0x12,%%xmm2;" + "ror $0xb,%11;" + "xor %9,%10;" + "and %9,%12;" + "ror $0x6,%10;" + "pxor %%xmm3,%%xmm1;" + "xor %6,%11;" + "xor %4,%12;" + "psrld $0x3,%%xmm8;" + "add %10,%12;" + "add 4+%16,%12;" + "ror $0x2,%11;" + "pxor %%xmm2,%%xmm1;" + "mov %6,%10;" + "add %12,%5;" + "mov %6,%12;" + "pxor %%xmm8,%%xmm1;" + "or %7,%10;" + "add %5,%8;" + "and %7,%12;" + "pshufd $0xfa,%%xmm6,%%xmm2;" + "and %k2,%10;" + "add %11,%5;" + "paddd %%xmm1,%%xmm0;" + "or %12,%10;" + "add %10,%5;" + "movdqa %%xmm2,%%xmm3;" + "mov %8,%10;" + "mov %5,%11;" + "ror $0xe,%10;" + "movdqa %%xmm2,%%xmm8;" + "xor %8,%10;" + "ror $0x9,%11;" + "mov %9,%12;" + "xor %5,%11;" + "ror $0x5,%10;" + "psrlq $0x11,%%xmm2;" + "xor %3,%12;" + "psrlq $0x13,%%xmm3;" + "xor %8,%10;" + "and %8,%12;" + "psrld $0xa,%%xmm8;" + "ror $0xb,%11;" + "xor %5,%11;" + "xor %3,%12;" + "ror $0x6,%10;" + "pxor %%xmm3,%%xmm2;" + "add %10,%12;" + "ror $0x2,%11;" + "add 8+%16,%12;" + "pxor %%xmm2,%%xmm8;" + "mov %5,%10;" + "add %12,%4;" + "mov %5,%12;" + "pshufb %%xmm10,%%xmm8;" + "or %k2,%10;" + "add %4,%7;" + "and %k2,%12;" + "paddd %%xmm8,%%xmm0;" + "and %6,%10;" + "add %11,%4;" + "pshufd $0x50,%%xmm0,%%xmm2;" + "or %12,%10;" + "add %10,%4;" + "movdqa %%xmm2,%%xmm3;" + "mov %7,%10;" + "ror $0xe,%10;" + "mov %4,%11;" + "movdqa %%xmm2,%%xmm7;" + "ror $0x9,%11;" + "xor %7,%10;" + "mov %8,%12;" + "ror $0x5,%10;" + "psrlq $0x11,%%xmm2;" + "xor %4,%11;" + "xor %9,%12;" + "psrlq $0x13,%%xmm3;" + "xor %7,%10;" + "and %7,%12;" + "ror $0xb,%11;" + "psrld $0xa,%%xmm7;" + "xor %4,%11;" + "ror $0x6,%10;" + "xor %9,%12;" + "pxor %%xmm3,%%xmm2;" + "ror $0x2,%11;" + "add %10,%12;" + "add 12+%16,%12;" + "pxor %%xmm2,%%xmm7;" + "mov %4,%10;" + "add %12,%3;" + "mov %4,%12;" + "pshufb %%xmm11,%%xmm7;" + "or %6,%10;" + "add %3,%k2;" + "and %6,%12;" + "paddd %%xmm0,%%xmm7;" + "and %5,%10;" + "add %11,%3;" + "or %12,%10;" + "add %10,%3;" + "sub $0x1,%1;" + "jne Lloop1_%=;" + "mov $0x2,%1;" + + "Lloop2_%=:" + "paddd 0x0(%13),%%xmm4;" + "movdqa %%xmm4,%16;" + "mov %k2,%10;" + "ror $0xe,%10;" + "mov %3,%11;" + "xor %k2,%10;" + "ror $0x9,%11;" + "mov %7,%12;" + "xor %3,%11;" + "ror $0x5,%10;" + "xor 
%8,%12;" + "xor %k2,%10;" + "ror $0xb,%11;" + "and %k2,%12;" + "xor %3,%11;" + "ror $0x6,%10;" + "xor %8,%12;" + "add %10,%12;" + "ror $0x2,%11;" + "add %16,%12;" + "mov %3,%10;" + "add %12,%9;" + "mov %3,%12;" + "or %5,%10;" + "add %9,%6;" + "and %5,%12;" + "and %4,%10;" + "add %11,%9;" + "or %12,%10;" + "add %10,%9;" + "mov %6,%10;" + "ror $0xe,%10;" + "mov %9,%11;" + "xor %6,%10;" + "ror $0x9,%11;" + "mov %k2,%12;" + "xor %9,%11;" + "ror $0x5,%10;" + "xor %7,%12;" + "xor %6,%10;" + "ror $0xb,%11;" + "and %6,%12;" + "xor %9,%11;" + "ror $0x6,%10;" + "xor %7,%12;" + "add %10,%12;" + "ror $0x2,%11;" + "add 4+%16,%12;" + "mov %9,%10;" + "add %12,%8;" + "mov %9,%12;" + "or %4,%10;" + "add %8,%5;" + "and %4,%12;" + "and %3,%10;" + "add %11,%8;" + "or %12,%10;" + "add %10,%8;" + "mov %5,%10;" + "ror $0xe,%10;" + "mov %8,%11;" + "xor %5,%10;" + "ror $0x9,%11;" + "mov %6,%12;" + "xor %8,%11;" + "ror $0x5,%10;" + "xor %k2,%12;" + "xor %5,%10;" + "ror $0xb,%11;" + "and %5,%12;" + "xor %8,%11;" + "ror $0x6,%10;" + "xor %k2,%12;" + "add %10,%12;" + "ror $0x2,%11;" + "add 8+%16,%12;" + "mov %8,%10;" + "add %12,%7;" + "mov %8,%12;" + "or %3,%10;" + "add %7,%4;" + "and %3,%12;" + "and %9,%10;" + "add %11,%7;" + "or %12,%10;" + "add %10,%7;" + "mov %4,%10;" + "ror $0xe,%10;" + "mov %7,%11;" + "xor %4,%10;" + "ror $0x9,%11;" + "mov %5,%12;" + "xor %7,%11;" + "ror $0x5,%10;" + "xor %6,%12;" + "xor %4,%10;" + "ror $0xb,%11;" + "and %4,%12;" + "xor %7,%11;" + "ror $0x6,%10;" + "xor %6,%12;" + "add %10,%12;" + "ror $0x2,%11;" + "add 12+%16,%12;" + "mov %7,%10;" + "add %12,%k2;" + "mov %7,%12;" + "or %9,%10;" + "add %k2,%3;" + "and %9,%12;" + "and %8,%10;" + "add %11,%k2;" + "or %12,%10;" + "add %10,%k2;" + "paddd 0x10(%13),%%xmm5;" + "movdqa %%xmm5,%16;" + "add $0x20,%13;" + "mov %3,%10;" + "ror $0xe,%10;" + "mov %k2,%11;" + "xor %3,%10;" + "ror $0x9,%11;" + "mov %4,%12;" + "xor %k2,%11;" + "ror $0x5,%10;" + "xor %5,%12;" + "xor %3,%10;" + "ror $0xb,%11;" + "and %3,%12;" + "xor %k2,%11;" + "ror $0x6,%10;" + "xor %5,%12;" + "add %10,%12;" + "ror $0x2,%11;" + "add %16,%12;" + "mov %k2,%10;" + "add %12,%6;" + "mov %k2,%12;" + "or %8,%10;" + "add %6,%9;" + "and %8,%12;" + "and %7,%10;" + "add %11,%6;" + "or %12,%10;" + "add %10,%6;" + "mov %9,%10;" + "ror $0xe,%10;" + "mov %6,%11;" + "xor %9,%10;" + "ror $0x9,%11;" + "mov %3,%12;" + "xor %6,%11;" + "ror $0x5,%10;" + "xor %4,%12;" + "xor %9,%10;" + "ror $0xb,%11;" + "and %9,%12;" + "xor %6,%11;" + "ror $0x6,%10;" + "xor %4,%12;" + "add %10,%12;" + "ror $0x2,%11;" + "add 4+%16,%12;" + "mov %6,%10;" + "add %12,%5;" + "mov %6,%12;" + "or %7,%10;" + "add %5,%8;" + "and %7,%12;" + "and %k2,%10;" + "add %11,%5;" + "or %12,%10;" + "add %10,%5;" + "mov %8,%10;" + "ror $0xe,%10;" + "mov %5,%11;" + "xor %8,%10;" + "ror $0x9,%11;" + "mov %9,%12;" + "xor %5,%11;" + "ror $0x5,%10;" + "xor %3,%12;" + "xor %8,%10;" + "ror $0xb,%11;" + "and %8,%12;" + "xor %5,%11;" + "ror $0x6,%10;" + "xor %3,%12;" + "add %10,%12;" + "ror $0x2,%11;" + "add 8+%16,%12;" + "mov %5,%10;" + "add %12,%4;" + "mov %5,%12;" + "or %k2,%10;" + "add %4,%7;" + "and %k2,%12;" + "and %6,%10;" + "add %11,%4;" + "or %12,%10;" + "add %10,%4;" + "mov %7,%10;" + "ror $0xe,%10;" + "mov %4,%11;" + "xor %7,%10;" + "ror $0x9,%11;" + "mov %8,%12;" + "xor %4,%11;" + "ror $0x5,%10;" + "xor %9,%12;" + "xor %7,%10;" + "ror $0xb,%11;" + "and %7,%12;" + "xor %4,%11;" + "ror $0x6,%10;" + "xor %9,%12;" + "add %10,%12;" + "ror $0x2,%11;" + "add 12+%16,%12;" + "mov %4,%10;" + "add %12,%3;" + "mov %4,%12;" + "or %6,%10;" + "add 
%3,%k2;" + "and %6,%12;" + "and %5,%10;" + "add %11,%3;" + "or %12,%10;" + "add %10,%3;" + "movdqa %%xmm6,%%xmm4;" + "movdqa %%xmm7,%%xmm5;" + "sub $0x1,%1;" + "jne Lloop2_%=;" + "add (%0),%3;" + "mov %3,(%0);" + "add 0x4(%0),%4;" + "mov %4,0x4(%0);" + "add 0x8(%0),%5;" + "mov %5,0x8(%0);" + "add 0xc(%0),%6;" + "mov %6,0xc(%0);" + "add 0x10(%0),%k2;" + "mov %k2,0x10(%0);" + "add 0x14(%0),%7;" + "mov %7,0x14(%0);" + "add 0x18(%0),%8;" + "mov %8,0x18(%0);" + "add 0x1c(%0),%9;" + "mov %9,0x1c(%0);" + "mov %15,%1;" + "add $0x40,%1;" + "cmp %14,%1;" + "jne Lloop0_%=;" + + "Ldone_hash_%=:" + + : "+r"(s), "+r"(chunk), "+r"(blocks), "=r"(a), "=r"(b), "=r"(c), "=r"(d), /* e = chunk */ "=r"(f), "=r"(g), "=r"(h), "=r"(y0), "=r"(y1), "=r"(y2), "=r"(tbl), "+m"(inp_end), "+m"(inp), "+m"(xfer) + : "m"(K256), "m"(FLIP_MASK), "m"(SHUF_00BA), "m"(SHUF_DC00) + : "cc", "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12" + ); +} +} + +/* +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright (c) 2012, Intel Corporation +; +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are +; met: +; +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in the +; documentation and/or other materials provided with the +; distribution. +; +; * Neither the name of the Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived from +; this software without specific prior written permission. +; +; +; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY +; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR +; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; +; Example YASM command lines: +; Windows: yasm -Xvc -f x64 -rnasm -pnasm -o sha256_sse4.obj -g cv8 sha256_sse4.asm +; Linux: yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_sse4.o sha256_sse4.asm +; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; +; This code is described in an Intel White-Paper: +; "Fast SHA-256 Implementations on Intel Architecture Processors" +; +; To find it, surf to http://www.intel.com/p/en_US/embedded +; and search for that title. 
+; The paper is expected to be released roughly at the end of April, 2012 +; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; This code schedules 1 blocks at a time, with 4 lanes per block +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define MOVDQ movdqu ;; assume buffers not aligned + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros + +; addm [mem], reg +; Add reg to mem using reg-mem add and store +%macro addm 2 + add %2, %1 + mov %1, %2 +%endm + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask +; Load xmm with mem and byte swap each dword +%macro COPY_XMM_AND_BSWAP 3 + MOVDQ %1, %2 + pshufb %1, %3 +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define X0 xmm4 +%define X1 xmm5 +%define X2 xmm6 +%define X3 xmm7 + +%define XTMP0 xmm0 +%define XTMP1 xmm1 +%define XTMP2 xmm2 +%define XTMP3 xmm3 +%define XTMP4 xmm8 +%define XFER xmm9 + +%define SHUF_00BA xmm10 ; shuffle xBxA -> 00BA +%define SHUF_DC00 xmm11 ; shuffle xDxC -> DC00 +%define BYTE_FLIP_MASK xmm12 + +%ifdef LINUX +%define NUM_BLKS rdx ; 3rd arg +%define CTX rsi ; 2nd arg +%define INP rdi ; 1st arg + +%define SRND rdi ; clobbers INP +%define c ecx +%define d r8d +%define e edx +%else +%define NUM_BLKS r8 ; 3rd arg +%define CTX rdx ; 2nd arg +%define INP rcx ; 1st arg + +%define SRND rcx ; clobbers INP +%define c edi +%define d esi +%define e r8d + +%endif +%define TBL rbp +%define a eax +%define b ebx + +%define f r9d +%define g r10d +%define h r11d + +%define y0 r13d +%define y1 r14d +%define y2 r15d + + + +_INP_END_SIZE equ 8 +_INP_SIZE equ 8 +_XFER_SIZE equ 8 +%ifdef LINUX +_XMM_SAVE_SIZE equ 0 +%else +_XMM_SAVE_SIZE equ 7*16 +%endif +; STACK_SIZE plus pushes must be an odd multiple of 8 +_ALIGN_SIZE equ 8 + +_INP_END equ 0 +_INP equ _INP_END + _INP_END_SIZE +_XFER equ _INP + _INP_SIZE +_XMM_SAVE equ _XFER + _XFER_SIZE + _ALIGN_SIZE +STACK_SIZE equ _XMM_SAVE + _XMM_SAVE_SIZE + +; rotate_Xs +; Rotate values of symbols X0...X3 +%macro rotate_Xs 0 +%xdefine X_ X0 +%xdefine X0 X1 +%xdefine X1 X2 +%xdefine X2 X3 +%xdefine X3 X_ +%endm + +; ROTATE_ARGS +; Rotate values of symbols a...h +%macro ROTATE_ARGS 0 +%xdefine TMP_ h +%xdefine h g +%xdefine g f +%xdefine f e +%xdefine e d +%xdefine d c +%xdefine c b +%xdefine b a +%xdefine a TMP_ +%endm + +%macro FOUR_ROUNDS_AND_SCHED 0 + ;; compute s0 four at a time and s1 two at a time + ;; compute W[-16] + W[-7] 4 at a time + movdqa XTMP0, X3 + mov y0, e ; y0 = e + ror y0, (25-11) ; y0 = e >> (25-11) + mov y1, a ; y1 = a + palignr XTMP0, X2, 4 ; XTMP0 = W[-7] + ror y1, (22-13) ; y1 = a >> (22-13) + xor y0, e ; y0 = e ^ (e >> (25-11)) + mov y2, f ; y2 = f + ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) + movdqa XTMP1, X1 + xor y1, a ; y1 = a ^ (a >> (22-13) + xor y2, g ; y2 = f^g + paddd XTMP0, X0 ; XTMP0 = W[-7] + W[-16] + xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and y2, e ; y2 = (f^g)&e + ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) + ;; compute s0 + palignr XTMP1, X0, 4 ; XTMP1 = W[-15] + xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + xor y2, g ; y2 = CH = ((f^g)&e)^g + movdqa XTMP2, XTMP1 ; XTMP2 = W[-15] + ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + add y2, y0 ; y2 = S1 + CH + add y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH + movdqa XTMP3, XTMP1 ; XTMP3 = W[-15] + mov y0, a ; y0 = a + add h, y2 ; h = h + S1 + CH + k + w + mov y2, a ; y2 = a + pslld XTMP1, (32-7) + or y0, c ; y0 = a|c + add d, h ; d = 
d + h + S1 + CH + k + w + and y2, c ; y2 = a&c + psrld XTMP2, 7 + and y0, b ; y0 = (a|c)&b + add h, y1 ; h = h + S1 + CH + k + w + S0 + por XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7 + or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) + add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ + +ROTATE_ARGS + movdqa XTMP2, XTMP3 ; XTMP2 = W[-15] + mov y0, e ; y0 = e + mov y1, a ; y1 = a + movdqa XTMP4, XTMP3 ; XTMP4 = W[-15] + ror y0, (25-11) ; y0 = e >> (25-11) + xor y0, e ; y0 = e ^ (e >> (25-11)) + mov y2, f ; y2 = f + ror y1, (22-13) ; y1 = a >> (22-13) + pslld XTMP3, (32-18) + xor y1, a ; y1 = a ^ (a >> (22-13) + ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) + xor y2, g ; y2 = f^g + psrld XTMP2, 18 + ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) + xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and y2, e ; y2 = (f^g)&e + ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + pxor XTMP1, XTMP3 + xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + xor y2, g ; y2 = CH = ((f^g)&e)^g + psrld XTMP4, 3 ; XTMP4 = W[-15] >> 3 + add y2, y0 ; y2 = S1 + CH + add y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH + ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + pxor XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 + mov y0, a ; y0 = a + add h, y2 ; h = h + S1 + CH + k + w + mov y2, a ; y2 = a + pxor XTMP1, XTMP4 ; XTMP1 = s0 + or y0, c ; y0 = a|c + add d, h ; d = d + h + S1 + CH + k + w + and y2, c ; y2 = a&c + ;; compute low s1 + pshufd XTMP2, X3, 11111010b ; XTMP2 = W[-2] {BBAA} + and y0, b ; y0 = (a|c)&b + add h, y1 ; h = h + S1 + CH + k + w + S0 + paddd XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0 + or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) + add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ + +ROTATE_ARGS + movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {BBAA} + mov y0, e ; y0 = e + mov y1, a ; y1 = a + ror y0, (25-11) ; y0 = e >> (25-11) + movdqa XTMP4, XTMP2 ; XTMP4 = W[-2] {BBAA} + xor y0, e ; y0 = e ^ (e >> (25-11)) + ror y1, (22-13) ; y1 = a >> (22-13) + mov y2, f ; y2 = f + xor y1, a ; y1 = a ^ (a >> (22-13) + ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) + psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xBxA} + xor y2, g ; y2 = f^g + psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xBxA} + xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and y2, e ; y2 = (f^g)&e + psrld XTMP4, 10 ; XTMP4 = W[-2] >> 10 {BBAA} + ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) + xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + xor y2, g ; y2 = CH = ((f^g)&e)^g + ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + pxor XTMP2, XTMP3 + add y2, y0 ; y2 = S1 + CH + ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + add y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH + pxor XTMP4, XTMP2 ; XTMP4 = s1 {xBxA} + mov y0, a ; y0 = a + add h, y2 ; h = h + S1 + CH + k + w + mov y2, a ; y2 = a + pshufb XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA} + or y0, c ; y0 = a|c + add d, h ; d = d + h + S1 + CH + k + w + and y2, c ; y2 = a&c + paddd XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]} + and y0, b ; y0 = (a|c)&b + add h, y1 ; h = h + S1 + CH + k + w + S0 + ;; compute high s1 + pshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC} + or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) + add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ + +ROTATE_ARGS + movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {DDCC} + mov y0, e ; y0 = e + ror y0, (25-11) ; y0 = e >> (25-11) + mov y1, a ; y1 = a + movdqa X0, XTMP2 ; X0 = W[-2] {DDCC} + ror y1, (22-13) ; y1 = a >> (22-13) + xor y0, e ; y0 = e ^ (e >> (25-11)) + mov y2, f ; y2 = f + ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) + psrlq XTMP2, 17 ; 
XTMP2 = W[-2] ror 17 {xDxC} + xor y1, a ; y1 = a ^ (a >> (22-13) + xor y2, g ; y2 = f^g + psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xDxC} + xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and y2, e ; y2 = (f^g)&e + ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) + psrld X0, 10 ; X0 = W[-2] >> 10 {DDCC} + xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + xor y2, g ; y2 = CH = ((f^g)&e)^g + pxor XTMP2, XTMP3 + ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + add y2, y0 ; y2 = S1 + CH + add y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH + pxor X0, XTMP2 ; X0 = s1 {xDxC} + mov y0, a ; y0 = a + add h, y2 ; h = h + S1 + CH + k + w + mov y2, a ; y2 = a + pshufb X0, SHUF_DC00 ; X0 = s1 {DC00} + or y0, c ; y0 = a|c + add d, h ; d = d + h + S1 + CH + k + w + and y2, c ; y2 = a&c + paddd X0, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]} + and y0, b ; y0 = (a|c)&b + add h, y1 ; h = h + S1 + CH + k + w + S0 + or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) + add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ + +ROTATE_ARGS +rotate_Xs +%endm + +;; input is [rsp + _XFER + %1 * 4] +%macro DO_ROUND 1 + mov y0, e ; y0 = e + ror y0, (25-11) ; y0 = e >> (25-11) + mov y1, a ; y1 = a + xor y0, e ; y0 = e ^ (e >> (25-11)) + ror y1, (22-13) ; y1 = a >> (22-13) + mov y2, f ; y2 = f + xor y1, a ; y1 = a ^ (a >> (22-13) + ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) + xor y2, g ; y2 = f^g + xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) + and y2, e ; y2 = (f^g)&e + xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + xor y2, g ; y2 = CH = ((f^g)&e)^g + add y2, y0 ; y2 = S1 + CH + ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + add y2, [rsp + _XFER + %1 * 4] ; y2 = k + w + S1 + CH + mov y0, a ; y0 = a + add h, y2 ; h = h + S1 + CH + k + w + mov y2, a ; y2 = a + or y0, c ; y0 = a|c + add d, h ; d = d + h + S1 + CH + k + w + and y2, c ; y2 = a&c + and y0, b ; y0 = (a|c)&b + add h, y1 ; h = h + S1 + CH + k + w + S0 + or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) + add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ + ROTATE_ARGS +%endm + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void sha256_sse4(void *input_data, UINT32 digest[8], UINT64 num_blks) +;; arg 1 : pointer to input data +;; arg 2 : pointer to digest +;; arg 3 : Num blocks +section .text +global sha256_sse4 +align 32 +sha256_sse4: + push rbx +%ifndef LINUX + push rsi + push rdi +%endif + push rbp + push r13 + push r14 + push r15 + + sub rsp,STACK_SIZE +%ifndef LINUX + movdqa [rsp + _XMM_SAVE + 0*16],xmm6 + movdqa [rsp + _XMM_SAVE + 1*16],xmm7 + movdqa [rsp + _XMM_SAVE + 2*16],xmm8 + movdqa [rsp + _XMM_SAVE + 3*16],xmm9 + movdqa [rsp + _XMM_SAVE + 4*16],xmm10 + movdqa [rsp + _XMM_SAVE + 5*16],xmm11 + movdqa [rsp + _XMM_SAVE + 6*16],xmm12 +%endif + + shl NUM_BLKS, 6 ; convert to bytes + jz done_hash + add NUM_BLKS, INP ; pointer to end of data + mov [rsp + _INP_END], NUM_BLKS + + ;; load initial digest + mov a,[4*0 + CTX] + mov b,[4*1 + CTX] + mov c,[4*2 + CTX] + mov d,[4*3 + CTX] + mov e,[4*4 + CTX] + mov f,[4*5 + CTX] + mov g,[4*6 + CTX] + mov h,[4*7 + CTX] + + movdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip] + movdqa SHUF_00BA, [_SHUF_00BA wrt rip] + movdqa SHUF_DC00, [_SHUF_DC00 wrt rip] + +loop0: + lea TBL,[K256 wrt rip] + + ;; byte swap first 16 dwords + COPY_XMM_AND_BSWAP X0, [INP + 0*16], 
+    COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK
+    COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK
+    COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK
+
+    mov [rsp + _INP], INP
+
+    ;; schedule 48 input dwords, by doing 3 rounds of 16 each
+    mov SRND, 3
+align 16
+loop1:
+    movdqa XFER, [TBL + 0*16]
+    paddd XFER, X0
+    movdqa [rsp + _XFER], XFER
+    FOUR_ROUNDS_AND_SCHED
+
+    movdqa XFER, [TBL + 1*16]
+    paddd XFER, X0
+    movdqa [rsp + _XFER], XFER
+    FOUR_ROUNDS_AND_SCHED
+
+    movdqa XFER, [TBL + 2*16]
+    paddd XFER, X0
+    movdqa [rsp + _XFER], XFER
+    FOUR_ROUNDS_AND_SCHED
+
+    movdqa XFER, [TBL + 3*16]
+    paddd XFER, X0
+    movdqa [rsp + _XFER], XFER
+    add TBL, 4*16
+    FOUR_ROUNDS_AND_SCHED
+
+    sub SRND, 1
+    jne loop1
+
+    mov SRND, 2
+loop2:
+    paddd X0, [TBL + 0*16]
+    movdqa [rsp + _XFER], X0
+    DO_ROUND 0
+    DO_ROUND 1
+    DO_ROUND 2
+    DO_ROUND 3
+    paddd X1, [TBL + 1*16]
+    movdqa [rsp + _XFER], X1
+    add TBL, 2*16
+    DO_ROUND 0
+    DO_ROUND 1
+    DO_ROUND 2
+    DO_ROUND 3
+
+    movdqa X0, X2
+    movdqa X1, X3
+
+    sub SRND, 1
+    jne loop2
+
+    addm [4*0 + CTX],a
+    addm [4*1 + CTX],b
+    addm [4*2 + CTX],c
+    addm [4*3 + CTX],d
+    addm [4*4 + CTX],e
+    addm [4*5 + CTX],f
+    addm [4*6 + CTX],g
+    addm [4*7 + CTX],h
+
+    mov INP, [rsp + _INP]
+    add INP, 64
+    cmp INP, [rsp + _INP_END]
+    jne loop0
+
+done_hash:
+%ifndef LINUX
+    movdqa xmm6,[rsp + _XMM_SAVE + 0*16]
+    movdqa xmm7,[rsp + _XMM_SAVE + 1*16]
+    movdqa xmm8,[rsp + _XMM_SAVE + 2*16]
+    movdqa xmm9,[rsp + _XMM_SAVE + 3*16]
+    movdqa xmm10,[rsp + _XMM_SAVE + 4*16]
+    movdqa xmm11,[rsp + _XMM_SAVE + 5*16]
+    movdqa xmm12,[rsp + _XMM_SAVE + 6*16]
+%endif
+
+    add rsp, STACK_SIZE
+
+    pop r15
+    pop r14
+    pop r13
+    pop rbp
+%ifndef LINUX
+    pop rdi
+    pop rsi
+%endif
+    pop rbx
+
+    ret
+
+
+section .data
+align 64
+K256:
+    dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+    dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+    dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+    dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+    dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+    dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+    dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+    dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+    dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+    dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+    dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+    dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+    dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+    dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+    dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+    dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+PSHUFFLE_BYTE_FLIP_MASK: ddq 0x0c0d0e0f08090a0b0405060700010203
+
+; shuffle xBxA -> 00BA
+_SHUF_00BA: ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100
+
+; shuffle xDxC -> DC00
+_SHUF_DC00: ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF
+*/
+
+#endif
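For context, the banner comment in the listing above documents the calling convention of the entry point. A hedged sketch of how a C++ dispatcher might declare and invoke it follows; the wrapper name TransformBlocks and the mapping of UINT32/UINT64 to fixed-width types are assumptions for illustration, not part of this patch:

    #include <stdint.h>

    // Prototype per the ";; void sha256_sse4(...)" header comment above.
    extern "C" void sha256_sse4(void* input_data, uint32_t digest[8], uint64_t num_blks);

    // Hypothetical wrapper: processes num_blks 64-byte chunks from data,
    // updating the eight 32-bit digest words in place. Zero blocks is a
    // no-op (the assembly jumps straight to done_hash via jz).
    void TransformBlocks(uint32_t state[8], const unsigned char* data, uint64_t blocks)
    {
        sha256_sse4(const_cast<unsigned char*>(data), state, blocks);
    }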
diff --git a/src/crypto/sha512.cpp b/src/crypto/sha512.cpp
new file mode 100644
index 0000000000..dff4d8da1a
--- /dev/null
+++ b/src/crypto/sha512.cpp
@@ -0,0 +1,207 @@
+// Copyright (c) 2014-2017 The Bitcoin Core developers
+// Distributed under the MIT software license, see the accompanying
+// file COPYING or http://www.opensource.org/licenses/mit-license.php.
+
+#include <crypto/sha512.h>
+
+#include <crypto/common.h>
+
+#include <string.h>
+
+// Internal implementation code.
+namespace
+{
+/// Internal SHA-512 implementation.
+namespace sha512
+{
+uint64_t inline Ch(uint64_t x, uint64_t y, uint64_t z) { return z ^ (x & (y ^ z)); }
+uint64_t inline Maj(uint64_t x, uint64_t y, uint64_t z) { return (x & y) | (z & (x | y)); }
+uint64_t inline Sigma0(uint64_t x) { return (x >> 28 | x << 36) ^ (x >> 34 | x << 30) ^ (x >> 39 | x << 25); }
+uint64_t inline Sigma1(uint64_t x) { return (x >> 14 | x << 50) ^ (x >> 18 | x << 46) ^ (x >> 41 | x << 23); }
+uint64_t inline sigma0(uint64_t x) { return (x >> 1 | x << 63) ^ (x >> 8 | x << 56) ^ (x >> 7); }
+uint64_t inline sigma1(uint64_t x) { return (x >> 19 | x << 45) ^ (x >> 61 | x << 3) ^ (x >> 6); }
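+
+// Note: the shift pairs above spell out 64-bit rotations; writing
+// ROTR(x,n) = (x >> n) | (x << (64 - n)), these match the FIPS 180-4
+// definitions, e.g. Sigma0(x) = ROTR(x,28) ^ ROTR(x,34) ^ ROTR(x,39) and
+// sigma1(x) = ROTR(x,19) ^ ROTR(x,61) ^ (x >> 6).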
+
+/** One round of SHA-512. */
+void inline Round(uint64_t a, uint64_t b, uint64_t c, uint64_t& d, uint64_t e, uint64_t f, uint64_t g, uint64_t& h, uint64_t k, uint64_t w)
+{
+    uint64_t t1 = h + Sigma1(e) + Ch(e, f, g) + k + w;
+    uint64_t t2 = Sigma0(a) + Maj(a, b, c);
+    d += t1;
+    h = t1 + t2;
+}
+
+/** Initialize SHA-512 state. */
+void inline Initialize(uint64_t* s)
+{
+    s[0] = 0x6a09e667f3bcc908ull;
+    s[1] = 0xbb67ae8584caa73bull;
+    s[2] = 0x3c6ef372fe94f82bull;
+    s[3] = 0xa54ff53a5f1d36f1ull;
+    s[4] = 0x510e527fade682d1ull;
+    s[5] = 0x9b05688c2b3e6c1full;
+    s[6] = 0x1f83d9abfb41bd6bull;
+    s[7] = 0x5be0cd19137e2179ull;
+}
+
+/** Perform one SHA-512 transformation, processing a 128-byte chunk. */
+void Transform(uint64_t* s, const unsigned char* chunk)
+{
+    uint64_t a = s[0], b = s[1], c = s[2], d = s[3], e = s[4], f = s[5], g = s[6], h = s[7];
+    uint64_t w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
+
+    Round(a, b, c, d, e, f, g, h, 0x428a2f98d728ae22ull, w0 = ReadBE64(chunk + 0));
+    Round(h, a, b, c, d, e, f, g, 0x7137449123ef65cdull, w1 = ReadBE64(chunk + 8));
+    Round(g, h, a, b, c, d, e, f, 0xb5c0fbcfec4d3b2full, w2 = ReadBE64(chunk + 16));
+    Round(f, g, h, a, b, c, d, e, 0xe9b5dba58189dbbcull, w3 = ReadBE64(chunk + 24));
+    Round(e, f, g, h, a, b, c, d, 0x3956c25bf348b538ull, w4 = ReadBE64(chunk + 32));
+    Round(d, e, f, g, h, a, b, c, 0x59f111f1b605d019ull, w5 = ReadBE64(chunk + 40));
+    Round(c, d, e, f, g, h, a, b, 0x923f82a4af194f9bull, w6 = ReadBE64(chunk + 48));
+    Round(b, c, d, e, f, g, h, a, 0xab1c5ed5da6d8118ull, w7 = ReadBE64(chunk + 56));
+    Round(a, b, c, d, e, f, g, h, 0xd807aa98a3030242ull, w8 = ReadBE64(chunk + 64));
+    Round(h, a, b, c, d, e, f, g, 0x12835b0145706fbeull, w9 = ReadBE64(chunk + 72));
+    Round(g, h, a, b, c, d, e, f, 0x243185be4ee4b28cull, w10 = ReadBE64(chunk + 80));
+    Round(f, g, h, a, b, c, d, e, 0x550c7dc3d5ffb4e2ull, w11 = ReadBE64(chunk + 88));
+    Round(e, f, g, h, a, b, c, d, 0x72be5d74f27b896full, w12 = ReadBE64(chunk + 96));
+    Round(d, e, f, g, h, a, b, c, 0x80deb1fe3b1696b1ull, w13 = ReadBE64(chunk + 104));
+    Round(c, d, e, f, g, h, a, b, 0x9bdc06a725c71235ull, w14 = ReadBE64(chunk + 112));
+    Round(b, c, d, e, f, g, h, a, 0xc19bf174cf692694ull, w15 = ReadBE64(chunk + 120));
+
+    Round(a, b, c, d, e, f, g, h, 0xe49b69c19ef14ad2ull, w0 += sigma1(w14) + w9 + sigma0(w1));
+    Round(h, a, b, c, d, e, f, g, 0xefbe4786384f25e3ull, w1 += sigma1(w15) + w10 + sigma0(w2));
+    Round(g, h, a, b, c, d, e, f, 0x0fc19dc68b8cd5b5ull, w2 += sigma1(w0) + w11 + sigma0(w3));
+    Round(f, g, h, a, b, c, d, e, 0x240ca1cc77ac9c65ull, w3 += sigma1(w1) + w12 + sigma0(w4));
+    Round(e, f, g, h, a, b, c, d, 0x2de92c6f592b0275ull, w4 += sigma1(w2) + w13 + sigma0(w5));
+    Round(d, e, f, g, h, a, b, c, 0x4a7484aa6ea6e483ull, w5 += sigma1(w3) + w14 + sigma0(w6));
+    Round(c, d, e, f, g, h, a, b, 0x5cb0a9dcbd41fbd4ull, w6 += sigma1(w4) + w15 + sigma0(w7));
+    Round(b, c, d, e, f, g, h, a, 0x76f988da831153b5ull, w7 += sigma1(w5) + w0 + sigma0(w8));
+    Round(a, b, c, d, e, f, g, h, 0x983e5152ee66dfabull, w8 += sigma1(w6) + w1 + sigma0(w9));
+    Round(h, a, b, c, d, e, f, g, 0xa831c66d2db43210ull, w9 += sigma1(w7) + w2 + sigma0(w10));
+    Round(g, h, a, b, c, d, e, f, 0xb00327c898fb213full, w10 += sigma1(w8) + w3 + sigma0(w11));
+    Round(f, g, h, a, b, c, d, e, 0xbf597fc7beef0ee4ull, w11 += sigma1(w9) + w4 + sigma0(w12));
+    Round(e, f, g, h, a, b, c, d, 0xc6e00bf33da88fc2ull, w12 += sigma1(w10) + w5 + sigma0(w13));
+    Round(d, e, f, g, h, a, b, c, 0xd5a79147930aa725ull, w13 += sigma1(w11) + w6 + sigma0(w14));
+    Round(c, d, e, f, g, h, a, b, 0x06ca6351e003826full, w14 += sigma1(w12) + w7 + sigma0(w15));
+    Round(b, c, d, e, f, g, h, a, 0x142929670a0e6e70ull, w15 += sigma1(w13) + w8 + sigma0(w0));
+
+    Round(a, b, c, d, e, f, g, h, 0x27b70a8546d22ffcull, w0 += sigma1(w14) + w9 + sigma0(w1));
+    Round(h, a, b, c, d, e, f, g, 0x2e1b21385c26c926ull, w1 += sigma1(w15) + w10 + sigma0(w2));
+    Round(g, h, a, b, c, d, e, f, 0x4d2c6dfc5ac42aedull, w2 += sigma1(w0) + w11 + sigma0(w3));
+    Round(f, g, h, a, b, c, d, e, 0x53380d139d95b3dfull, w3 += sigma1(w1) + w12 + sigma0(w4));
+    Round(e, f, g, h, a, b, c, d, 0x650a73548baf63deull, w4 += sigma1(w2) + w13 + sigma0(w5));
+    Round(d, e, f, g, h, a, b, c, 0x766a0abb3c77b2a8ull, w5 += sigma1(w3) + w14 + sigma0(w6));
+    Round(c, d, e, f, g, h, a, b, 0x81c2c92e47edaee6ull, w6 += sigma1(w4) + w15 + sigma0(w7));
+    Round(b, c, d, e, f, g, h, a, 0x92722c851482353bull, w7 += sigma1(w5) + w0 + sigma0(w8));
+    Round(a, b, c, d, e, f, g, h, 0xa2bfe8a14cf10364ull, w8 += sigma1(w6) + w1 + sigma0(w9));
+    Round(h, a, b, c, d, e, f, g, 0xa81a664bbc423001ull, w9 += sigma1(w7) + w2 + sigma0(w10));
+    Round(g, h, a, b, c, d, e, f, 0xc24b8b70d0f89791ull, w10 += sigma1(w8) + w3 + sigma0(w11));
+    Round(f, g, h, a, b, c, d, e, 0xc76c51a30654be30ull, w11 += sigma1(w9) + w4 + sigma0(w12));
+    Round(e, f, g, h, a, b, c, d, 0xd192e819d6ef5218ull, w12 += sigma1(w10) + w5 + sigma0(w13));
+    Round(d, e, f, g, h, a, b, c, 0xd69906245565a910ull, w13 += sigma1(w11) + w6 + sigma0(w14));
+    Round(c, d, e, f, g, h, a, b, 0xf40e35855771202aull, w14 += sigma1(w12) + w7 + sigma0(w15));
+    Round(b, c, d, e, f, g, h, a, 0x106aa07032bbd1b8ull, w15 += sigma1(w13) + w8 + sigma0(w0));
+
+    Round(a, b, c, d, e, f, g, h, 0x19a4c116b8d2d0c8ull, w0 += sigma1(w14) + w9 + sigma0(w1));
+    Round(h, a, b, c, d, e, f, g, 0x1e376c085141ab53ull, w1 += sigma1(w15) + w10 + sigma0(w2));
+    Round(g, h, a, b, c, d, e, f, 0x2748774cdf8eeb99ull, w2 += sigma1(w0) + w11 + sigma0(w3));
+    Round(f, g, h, a, b, c, d, e, 0x34b0bcb5e19b48a8ull, w3 += sigma1(w1) + w12 + sigma0(w4));
+    Round(e, f, g, h, a, b, c, d, 0x391c0cb3c5c95a63ull, w4 += sigma1(w2) + w13 + sigma0(w5));
+    Round(d, e, f, g, h, a, b, c, 0x4ed8aa4ae3418acbull, w5 += sigma1(w3) + w14 + sigma0(w6));
+    Round(c, d, e, f, g, h, a, b, 0x5b9cca4f7763e373ull, w6 += sigma1(w4) + w15 + sigma0(w7));
+    Round(b, c, d, e, f, g, h, a, 0x682e6ff3d6b2b8a3ull, w7 += sigma1(w5) + w0 + sigma0(w8));
+    Round(a, b, c, d, e, f, g, h, 0x748f82ee5defb2fcull, w8 += sigma1(w6) + w1 + sigma0(w9));
+    Round(h, a, b, c, d, e, f, g, 0x78a5636f43172f60ull, w9 += sigma1(w7) + w2 + sigma0(w10));
+    Round(g, h, a, b, c, d, e, f, 0x84c87814a1f0ab72ull, w10 += sigma1(w8) + w3 + sigma0(w11));
+    Round(f, g, h, a, b, c, d, e, 0x8cc702081a6439ecull, w11 += sigma1(w9) + w4 + sigma0(w12));
+    Round(e, f, g, h, a, b, c, d, 0x90befffa23631e28ull, w12 += sigma1(w10) + w5 + sigma0(w13));
+    Round(d, e, f, g, h, a, b, c, 0xa4506cebde82bde9ull, w13 += sigma1(w11) + w6 + sigma0(w14));
+    Round(c, d, e, f, g, h, a, b, 0xbef9a3f7b2c67915ull, w14 += sigma1(w12) + w7 + sigma0(w15));
+    Round(b, c, d, e, f, g, h, a, 0xc67178f2e372532bull, w15 += sigma1(w13) + w8 + sigma0(w0));
+
+    Round(a, b, c, d, e, f, g, h, 0xca273eceea26619cull, w0 += sigma1(w14) + w9 + sigma0(w1));
+    Round(h, a, b, c, d, e, f, g, 0xd186b8c721c0c207ull, w1 += sigma1(w15) + w10 + sigma0(w2));
+    Round(g, h, a, b, c, d, e, f, 0xeada7dd6cde0eb1eull, w2 += sigma1(w0) + w11 + sigma0(w3));
+    Round(f, g, h, a, b, c, d, e, 0xf57d4f7fee6ed178ull, w3 += sigma1(w1) + w12 + sigma0(w4));
+    Round(e, f, g, h, a, b, c, d, 0x06f067aa72176fbaull, w4 += sigma1(w2) + w13 + sigma0(w5));
+    Round(d, e, f, g, h, a, b, c, 0x0a637dc5a2c898a6ull, w5 += sigma1(w3) + w14 + sigma0(w6));
+    Round(c, d, e, f, g, h, a, b, 0x113f9804bef90daeull, w6 += sigma1(w4) + w15 + sigma0(w7));
+    Round(b, c, d, e, f, g, h, a, 0x1b710b35131c471bull, w7 += sigma1(w5) + w0 + sigma0(w8));
+    Round(a, b, c, d, e, f, g, h, 0x28db77f523047d84ull, w8 += sigma1(w6) + w1 + sigma0(w9));
+    Round(h, a, b, c, d, e, f, g, 0x32caab7b40c72493ull, w9 += sigma1(w7) + w2 + sigma0(w10));
+    Round(g, h, a, b, c, d, e, f, 0x3c9ebe0a15c9bebcull, w10 += sigma1(w8) + w3 + sigma0(w11));
+    Round(f, g, h, a, b, c, d, e, 0x431d67c49c100d4cull, w11 += sigma1(w9) + w4 + sigma0(w12));
+    Round(e, f, g, h, a, b, c, d, 0x4cc5d4becb3e42b6ull, w12 += sigma1(w10) + w5 + sigma0(w13));
+    Round(d, e, f, g, h, a, b, c, 0x597f299cfc657e2aull, w13 += sigma1(w11) + w6 + sigma0(w14));
+    Round(c, d, e, f, g, h, a, b, 0x5fcb6fab3ad6faecull, w14 + sigma1(w12) + w7 + sigma0(w15));
+    Round(b, c, d, e, f, g, h, a, 0x6c44198c4a475817ull, w15 + sigma1(w13) + w8 + sigma0(w0));
+
+    s[0] += a;
+    s[1] += b;
+    s[2] += c;
+    s[3] += d;
+    s[4] += e;
+    s[5] += f;
+    s[6] += g;
+    s[7] += h;
+}
+
+} // namespace sha512
+
+} // namespace
+
+
+////// SHA-512
+
+CSHA512::CSHA512() : bytes(0)
+{
+    sha512::Initialize(s);
+}
+
+CSHA512& CSHA512::Write(const unsigned char* data, size_t len)
+{
+    const unsigned char* end = data + len;
+    size_t bufsize = bytes % 128;
+    if (bufsize && bufsize + len >= 128) {
+        // Fill the buffer, and process it.
+        memcpy(buf + bufsize, data, 128 - bufsize);
+        bytes += 128 - bufsize;
+        data += 128 - bufsize;
+        sha512::Transform(s, buf);
+        bufsize = 0;
+    }
+    while (end >= data + 128) {
+        // Process full chunks directly from the source.
+        sha512::Transform(s, data);
+        data += 128;
+        bytes += 128;
+    }
+    if (end > data) {
+        // Fill the buffer with what remains.
+        memcpy(buf + bufsize, data, end - data);
+        bytes += end - data;
+    }
+    return *this;
+}
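+
+// Padding per FIPS 180-4: one 0x80 byte, then zero bytes until the length
+// is congruent to 112 mod 128, then the 128-bit bit count. For example,
+// with bytes == 0 the expression below writes 1 + 111 = 112 pad bytes, and
+// the 16-byte length field completes a single 128-byte block.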
+
+void CSHA512::Finalize(unsigned char hash[OUTPUT_SIZE])
+{
+    static const unsigned char pad[128] = {0x80};
+    unsigned char sizedesc[16] = {0x00};
+    WriteBE64(sizedesc + 8, bytes << 3);
+    Write(pad, 1 + ((239 - (bytes % 128)) % 128));
+    Write(sizedesc, 16);
+    WriteBE64(hash, s[0]);
+    WriteBE64(hash + 8, s[1]);
+    WriteBE64(hash + 16, s[2]);
+    WriteBE64(hash + 24, s[3]);
+    WriteBE64(hash + 32, s[4]);
+    WriteBE64(hash + 40, s[5]);
+    WriteBE64(hash + 48, s[6]);
+    WriteBE64(hash + 56, s[7]);
+}
+
+CSHA512& CSHA512::Reset()
+{
+    bytes = 0;
+    sha512::Initialize(s);
+    return *this;
+}
diff --git a/src/crypto/sha512.h b/src/crypto/sha512.h
new file mode 100644
index 0000000000..cd1023bc85
--- /dev/null
+++ b/src/crypto/sha512.h
@@ -0,0 +1,28 @@
+// Copyright (c) 2014-2016 The Bitcoin Core developers
+// Distributed under the MIT software license, see the accompanying
+// file COPYING or http://www.opensource.org/licenses/mit-license.php.
+
+#ifndef BITCOIN_CRYPTO_SHA512_H
+#define BITCOIN_CRYPTO_SHA512_H
+
+#include <stdint.h>
+#include <stdlib.h>
+
+/** A hasher class for SHA-512. */
+class CSHA512
+{
+private:
+    uint64_t s[8];
+    unsigned char buf[128];
+    uint64_t bytes;
+
+public:
+    static const size_t OUTPUT_SIZE = 64;
+
+    CSHA512();
+    CSHA512& Write(const unsigned char* data, size_t len);
+    void Finalize(unsigned char hash[OUTPUT_SIZE]);
+    CSHA512& Reset();
+};
+
+#endif // BITCOIN_CRYPTO_SHA512_H
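As a usage note, the CSHA512 interface added above supports both one-shot and incremental hashing. A minimal sketch follows; the surrounding main() and variable names are illustrative, not part of the patch:

    #include <crypto/sha512.h>

    #include <cstdio>

    int main()
    {
        const unsigned char msg[] = {'a', 'b', 'c'};
        unsigned char hash[CSHA512::OUTPUT_SIZE];

        // Write() may be called any number of times before Finalize();
        // Reset() afterwards returns the hasher to its initial state.
        CSHA512 hasher;
        hasher.Write(msg, sizeof(msg));
        hasher.Finalize(hash);

        for (size_t i = 0; i < CSHA512::OUTPUT_SIZE; ++i) printf("%02x", hash[i]);
        printf("\n");
        return 0;
    }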