diff options
Diffstat (limited to 'src/crc32c/src/crc32c_sse42.cc')
-rw-r--r-- | src/crc32c/src/crc32c_sse42.cc | 258 |
1 files changed, 258 insertions, 0 deletions
diff --git a/src/crc32c/src/crc32c_sse42.cc b/src/crc32c/src/crc32c_sse42.cc new file mode 100644 index 0000000000..139520428e --- /dev/null +++ b/src/crc32c/src/crc32c_sse42.cc @@ -0,0 +1,258 @@ +// Copyright 2008 The CRC32C Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "./crc32c_sse42.h" + +// In a separate source file to allow this accelerated CRC32C function to be +// compiled with the appropriate compiler flags to enable SSE4.2 instructions. + +// This implementation is loosely based on Intel Pub 323405 from April 2011, +// "Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction". + +#include <cstddef> +#include <cstdint> + +#include "./crc32c_internal.h" +#include "./crc32c_prefetch.h" +#include "./crc32c_read_le.h" +#include "./crc32c_round_up.h" +#ifdef CRC32C_HAVE_CONFIG_H +#include "crc32c/crc32c_config.h" +#endif + +#if HAVE_SSE42 && (defined(_M_X64) || defined(__x86_64__)) + +#if defined(_MSC_VER) +#include <intrin.h> +#else // !defined(_MSC_VER) +#include <nmmintrin.h> +#endif // defined(_MSC_VER) + +namespace crc32c { + +namespace { + +constexpr const ptrdiff_t kGroups = 3; +constexpr const ptrdiff_t kBlock0Size = 16 * 1024 / kGroups / 64 * 64; +constexpr const ptrdiff_t kBlock1Size = 4 * 1024 / kGroups / 8 * 8; +constexpr const ptrdiff_t kBlock2Size = 1024 / kGroups / 8 * 8; + +const uint32_t kBlock0SkipTable[8][16] = { + {0x00000000, 0xff770459, 0xfb027e43, 0x04757a1a, 0xf3e88a77, 0x0c9f8e2e, + 0x08eaf434, 0xf79df06d, 0xe23d621f, 0x1d4a6646, 0x193f1c5c, 0xe6481805, + 0x11d5e868, 0xeea2ec31, 0xead7962b, 0x15a09272}, + {0x00000000, 0xc196b2cf, 0x86c1136f, 0x4757a1a0, 0x086e502f, 0xc9f8e2e0, + 0x8eaf4340, 0x4f39f18f, 0x10dca05e, 0xd14a1291, 0x961db331, 0x578b01fe, + 0x18b2f071, 0xd92442be, 0x9e73e31e, 0x5fe551d1}, + {0x00000000, 0x21b940bc, 0x43728178, 0x62cbc1c4, 0x86e502f0, 0xa75c424c, + 0xc5978388, 0xe42ec334, 0x08267311, 0x299f33ad, 0x4b54f269, 0x6aedb2d5, + 0x8ec371e1, 0xaf7a315d, 0xcdb1f099, 0xec08b025}, + {0x00000000, 0x104ce622, 0x2099cc44, 0x30d52a66, 0x41339888, 0x517f7eaa, + 0x61aa54cc, 0x71e6b2ee, 0x82673110, 0x922bd732, 0xa2fefd54, 0xb2b21b76, + 0xc354a998, 0xd3184fba, 0xe3cd65dc, 0xf38183fe}, + {0x00000000, 0x012214d1, 0x024429a2, 0x03663d73, 0x04885344, 0x05aa4795, + 0x06cc7ae6, 0x07ee6e37, 0x0910a688, 0x0832b259, 0x0b548f2a, 0x0a769bfb, + 0x0d98f5cc, 0x0cbae11d, 0x0fdcdc6e, 0x0efec8bf}, + {0x00000000, 0x12214d10, 0x24429a20, 0x3663d730, 0x48853440, 0x5aa47950, + 0x6cc7ae60, 0x7ee6e370, 0x910a6880, 0x832b2590, 0xb548f2a0, 0xa769bfb0, + 0xd98f5cc0, 0xcbae11d0, 0xfdcdc6e0, 0xefec8bf0}, + {0x00000000, 0x27f8a7f1, 0x4ff14fe2, 0x6809e813, 0x9fe29fc4, 0xb81a3835, + 0xd013d026, 0xf7eb77d7, 0x3a294979, 0x1dd1ee88, 0x75d8069b, 0x5220a16a, + 0xa5cbd6bd, 0x8233714c, 0xea3a995f, 0xcdc23eae}, + {0x00000000, 0x745292f2, 0xe8a525e4, 0x9cf7b716, 0xd4a63d39, 0xa0f4afcb, + 0x3c0318dd, 0x48518a2f, 0xaca00c83, 0xd8f29e71, 0x44052967, 0x3057bb95, + 0x780631ba, 0x0c54a348, 0x90a3145e, 0xe4f186ac}, +}; +const uint32_t kBlock1SkipTable[8][16] = { + {0x00000000, 0x79113270, 0xf22264e0, 0x8b335690, 0xe1a8bf31, 0x98b98d41, + 0x138adbd1, 0x6a9be9a1, 0xc6bd0893, 0xbfac3ae3, 0x349f6c73, 0x4d8e5e03, + 0x2715b7a2, 0x5e0485d2, 0xd537d342, 0xac26e132}, + {0x00000000, 0x889667d7, 0x14c0b95f, 0x9c56de88, 0x298172be, 0xa1171569, + 0x3d41cbe1, 0xb5d7ac36, 0x5302e57c, 0xdb9482ab, 0x47c25c23, 0xcf543bf4, + 0x7a8397c2, 0xf215f015, 0x6e432e9d, 0xe6d5494a}, + {0x00000000, 0xa605caf8, 0x49e7e301, 0xefe229f9, 0x93cfc602, 0x35ca0cfa, + 0xda282503, 0x7c2deffb, 0x2273faf5, 0x8476300d, 0x6b9419f4, 0xcd91d30c, + 0xb1bc3cf7, 0x17b9f60f, 0xf85bdff6, 0x5e5e150e}, + {0x00000000, 0x44e7f5ea, 0x89cfebd4, 0xcd281e3e, 0x1673a159, 0x529454b3, + 0x9fbc4a8d, 0xdb5bbf67, 0x2ce742b2, 0x6800b758, 0xa528a966, 0xe1cf5c8c, + 0x3a94e3eb, 0x7e731601, 0xb35b083f, 0xf7bcfdd5}, + {0x00000000, 0x59ce8564, 0xb39d0ac8, 0xea538fac, 0x62d66361, 0x3b18e605, + 0xd14b69a9, 0x8885eccd, 0xc5acc6c2, 0x9c6243a6, 0x7631cc0a, 0x2fff496e, + 0xa77aa5a3, 0xfeb420c7, 0x14e7af6b, 0x4d292a0f}, + {0x00000000, 0x8eb5fb75, 0x1887801b, 0x96327b6e, 0x310f0036, 0xbfbafb43, + 0x2988802d, 0xa73d7b58, 0x621e006c, 0xecabfb19, 0x7a998077, 0xf42c7b02, + 0x5311005a, 0xdda4fb2f, 0x4b968041, 0xc5237b34}, + {0x00000000, 0xc43c00d8, 0x8d947741, 0x49a87799, 0x1ec49873, 0xdaf898ab, + 0x9350ef32, 0x576cefea, 0x3d8930e6, 0xf9b5303e, 0xb01d47a7, 0x7421477f, + 0x234da895, 0xe771a84d, 0xaed9dfd4, 0x6ae5df0c}, + {0x00000000, 0x7b1261cc, 0xf624c398, 0x8d36a254, 0xe9a5f1c1, 0x92b7900d, + 0x1f813259, 0x64935395, 0xd6a79573, 0xadb5f4bf, 0x208356eb, 0x5b913727, + 0x3f0264b2, 0x4410057e, 0xc926a72a, 0xb234c6e6}, +}; +const uint32_t kBlock2SkipTable[8][16] = { + {0x00000000, 0x8f158014, 0x1bc776d9, 0x94d2f6cd, 0x378eedb2, 0xb89b6da6, + 0x2c499b6b, 0xa35c1b7f, 0x6f1ddb64, 0xe0085b70, 0x74daadbd, 0xfbcf2da9, + 0x589336d6, 0xd786b6c2, 0x4354400f, 0xcc41c01b}, + {0x00000000, 0xde3bb6c8, 0xb99b1b61, 0x67a0ada9, 0x76da4033, 0xa8e1f6fb, + 0xcf415b52, 0x117aed9a, 0xedb48066, 0x338f36ae, 0x542f9b07, 0x8a142dcf, + 0x9b6ec055, 0x4555769d, 0x22f5db34, 0xfcce6dfc}, + {0x00000000, 0xde85763d, 0xb8e69a8b, 0x6663ecb6, 0x742143e7, 0xaaa435da, + 0xccc7d96c, 0x1242af51, 0xe84287ce, 0x36c7f1f3, 0x50a41d45, 0x8e216b78, + 0x9c63c429, 0x42e6b214, 0x24855ea2, 0xfa00289f}, + {0x00000000, 0xd569796d, 0xaf3e842b, 0x7a57fd46, 0x5b917ea7, 0x8ef807ca, + 0xf4affa8c, 0x21c683e1, 0xb722fd4e, 0x624b8423, 0x181c7965, 0xcd750008, + 0xecb383e9, 0x39dafa84, 0x438d07c2, 0x96e47eaf}, + {0x00000000, 0x6ba98c6d, 0xd75318da, 0xbcfa94b7, 0xab4a4745, 0xc0e3cb28, + 0x7c195f9f, 0x17b0d3f2, 0x5378f87b, 0x38d17416, 0x842be0a1, 0xef826ccc, + 0xf832bf3e, 0x939b3353, 0x2f61a7e4, 0x44c82b89}, + {0x00000000, 0xa6f1f0f6, 0x480f971d, 0xeefe67eb, 0x901f2e3a, 0x36eedecc, + 0xd810b927, 0x7ee149d1, 0x25d22a85, 0x8323da73, 0x6dddbd98, 0xcb2c4d6e, + 0xb5cd04bf, 0x133cf449, 0xfdc293a2, 0x5b336354}, + {0x00000000, 0x4ba4550a, 0x9748aa14, 0xdcecff1e, 0x2b7d22d9, 0x60d977d3, + 0xbc3588cd, 0xf791ddc7, 0x56fa45b2, 0x1d5e10b8, 0xc1b2efa6, 0x8a16baac, + 0x7d87676b, 0x36233261, 0xeacfcd7f, 0xa16b9875}, + {0x00000000, 0xadf48b64, 0x5e056039, 0xf3f1eb5d, 0xbc0ac072, 0x11fe4b16, + 0xe20fa04b, 0x4ffb2b2f, 0x7df9f615, 0xd00d7d71, 0x23fc962c, 0x8e081d48, + 0xc1f33667, 0x6c07bd03, 0x9ff6565e, 0x3202dd3a}, +}; + +constexpr const ptrdiff_t kPrefetchHorizon = 256; + +} // namespace + +uint32_t ExtendSse42(uint32_t crc, const uint8_t* data, size_t size) { + const uint8_t* p = data; + const uint8_t* e = data + size; + uint32_t l = crc ^ kCRC32Xor; + +#define STEP1 \ + do { \ + l = _mm_crc32_u8(l, *p++); \ + } while (0) + +#define STEP4(crc) \ + do { \ + crc = _mm_crc32_u32(crc, ReadUint32LE(p)); \ + p += 4; \ + } while (0) + +#define STEP8(crc, data) \ + do { \ + crc = _mm_crc32_u64(crc, ReadUint64LE(data)); \ + data += 8; \ + } while (0) + +#define STEP8BY3(crc0, crc1, crc2, p0, p1, p2) \ + do { \ + STEP8(crc0, p0); \ + STEP8(crc1, p1); \ + STEP8(crc2, p2); \ + } while (0) + +#define STEP8X3(crc0, crc1, crc2, bs) \ + do { \ + crc0 = _mm_crc32_u64(crc0, ReadUint64LE(p)); \ + crc1 = _mm_crc32_u64(crc1, ReadUint64LE(p + bs)); \ + crc2 = _mm_crc32_u64(crc2, ReadUint64LE(p + 2 * bs)); \ + p += 8; \ + } while (0) + +#define SKIP_BLOCK(crc, tab) \ + do { \ + crc = tab[0][crc & 0xf] ^ tab[1][(crc >> 4) & 0xf] ^ \ + tab[2][(crc >> 8) & 0xf] ^ tab[3][(crc >> 12) & 0xf] ^ \ + tab[4][(crc >> 16) & 0xf] ^ tab[5][(crc >> 20) & 0xf] ^ \ + tab[6][(crc >> 24) & 0xf] ^ tab[7][(crc >> 28) & 0xf]; \ + } while (0) + + // Point x at first 8-byte aligned byte in the buffer. This might be past the + // end of the buffer. + const uint8_t* x = RoundUp<8>(p); + if (x <= e) { + // Process bytes p is 8-byte aligned. + while (p != x) { + STEP1; + } + } + + // Proccess the data in predetermined block sizes with tables for quickly + // combining the checksum. Experimentally it's better to use larger block + // sizes where possible so use a hierarchy of decreasing block sizes. + uint64_t l64 = l; + while ((e - p) >= kGroups * kBlock0Size) { + uint64_t l641 = 0; + uint64_t l642 = 0; + for (int i = 0; i < kBlock0Size; i += 8 * 8) { + // Prefetch ahead to hide latency. + RequestPrefetch(p + kPrefetchHorizon); + RequestPrefetch(p + kBlock0Size + kPrefetchHorizon); + RequestPrefetch(p + 2 * kBlock0Size + kPrefetchHorizon); + + // Process 64 bytes at a time. + STEP8X3(l64, l641, l642, kBlock0Size); + STEP8X3(l64, l641, l642, kBlock0Size); + STEP8X3(l64, l641, l642, kBlock0Size); + STEP8X3(l64, l641, l642, kBlock0Size); + STEP8X3(l64, l641, l642, kBlock0Size); + STEP8X3(l64, l641, l642, kBlock0Size); + STEP8X3(l64, l641, l642, kBlock0Size); + STEP8X3(l64, l641, l642, kBlock0Size); + } + + // Combine results. + SKIP_BLOCK(l64, kBlock0SkipTable); + l64 ^= l641; + SKIP_BLOCK(l64, kBlock0SkipTable); + l64 ^= l642; + p += (kGroups - 1) * kBlock0Size; + } + while ((e - p) >= kGroups * kBlock1Size) { + uint64_t l641 = 0; + uint64_t l642 = 0; + for (int i = 0; i < kBlock1Size; i += 8) { + STEP8X3(l64, l641, l642, kBlock1Size); + } + SKIP_BLOCK(l64, kBlock1SkipTable); + l64 ^= l641; + SKIP_BLOCK(l64, kBlock1SkipTable); + l64 ^= l642; + p += (kGroups - 1) * kBlock1Size; + } + while ((e - p) >= kGroups * kBlock2Size) { + uint64_t l641 = 0; + uint64_t l642 = 0; + for (int i = 0; i < kBlock2Size; i += 8) { + STEP8X3(l64, l641, l642, kBlock2Size); + } + SKIP_BLOCK(l64, kBlock2SkipTable); + l64 ^= l641; + SKIP_BLOCK(l64, kBlock2SkipTable); + l64 ^= l642; + p += (kGroups - 1) * kBlock2Size; + } + + // Process bytes 16 at a time + while ((e - p) >= 16) { + STEP8(l64, p); + STEP8(l64, p); + } + + l = static_cast<uint32_t>(l64); + // Process the last few bytes. + while (p != e) { + STEP1; + } +#undef SKIP_BLOCK +#undef STEP8X3 +#undef STEP8BY3 +#undef STEP8 +#undef STEP4 +#undef STEP1 + + return l ^ kCRC32Xor; +} + +} // namespace crc32c + +#endif // HAVE_SSE42 && (defined(_M_X64) || defined(__x86_64__)) |