aboutsummaryrefslogtreecommitdiff
path: root/src/leveldb/port/port_posix_sse.cc
blob: 1e519ba0b64befe92b0a6ca70f290f699151baab (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
// Copyright 2016 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// A portable implementation of crc32c, optimized to handle
// four bytes at a time.
//
// In a separate source file to allow this accelerated CRC32C function to be
// compiled with the appropriate compiler flags to enable x86 SSE 4.2
// instructions.

#include <stdint.h>
#include <string.h>
#include "port/port.h"

#if defined(LEVELDB_PLATFORM_POSIX_SSE)

#if defined(_MSC_VER)
#include <intrin.h>
#elif defined(__GNUC__) && defined(__SSE4_2__)
#include <nmmintrin.h>
#include <cpuid.h>
#endif

#endif  // defined(LEVELDB_PLATFORM_POSIX_SSE)

namespace leveldb {
namespace port {

#if defined(LEVELDB_PLATFORM_POSIX_SSE)

// Used to fetch a naturally-aligned 32-bit word in little endian byte-order
static inline uint32_t LE_LOAD32(const uint8_t *p) {
  // SSE is x86 only, so ensured that |p| is always little-endian.
  uint32_t word;
  memcpy(&word, p, sizeof(word));
  return word;
}

#if defined(_M_X64) || defined(__x86_64__)  // LE_LOAD64 is only used on x64.

// Used to fetch a naturally-aligned 64-bit word in little endian byte-order
static inline uint64_t LE_LOAD64(const uint8_t *p) {
  uint64_t dword;
  memcpy(&dword, p, sizeof(dword));
  return dword;
}

#endif  // defined(_M_X64) || defined(__x86_64__)

static inline bool HaveSSE42() {
#if defined(_MSC_VER)
  int cpu_info[4];
  __cpuid(cpu_info, 1);
  return (cpu_info[2] & (1 << 20)) != 0;
#elif defined(__GNUC__)
  unsigned int eax, ebx, ecx, edx;
  __get_cpuid(1, &eax, &ebx, &ecx, &edx);
  return (ecx & (1 << 20)) != 0;
#else
  return false;
#endif
}

#endif  // defined(LEVELDB_PLATFORM_POSIX_SSE)

// For further improvements see Intel publication at:
// http://download.intel.com/design/intarch/papers/323405.pdf
uint32_t AcceleratedCRC32C(uint32_t crc, const char* buf, size_t size) {
#if !defined(LEVELDB_PLATFORM_POSIX_SSE)
  return 0;
#else
  static bool have = HaveSSE42();
  if (!have) {
    return 0;
  }

  const uint8_t *p = reinterpret_cast<const uint8_t *>(buf);
  const uint8_t *e = p + size;
  uint32_t l = crc ^ 0xffffffffu;

#define STEP1 do {                              \
    l = _mm_crc32_u8(l, *p++);                  \
} while (0)
#define STEP4 do {                              \
    l = _mm_crc32_u32(l, LE_LOAD32(p));         \
    p += 4;                                     \
} while (0)
#define STEP8 do {                              \
    l = _mm_crc32_u64(l, LE_LOAD64(p));         \
    p += 8;                                     \
} while (0)

  if (size > 16) {
    // Process unaligned bytes
    for (unsigned int i = reinterpret_cast<uintptr_t>(p) % 8; i; --i) {
      STEP1;
    }

    // _mm_crc32_u64 is only available on x64.
#if defined(_M_X64) || defined(__x86_64__)
    // Process 8 bytes at a time
    while ((e-p) >= 8) {
      STEP8;
    }
    // Process 4 bytes at a time
    if ((e-p) >= 4) {
      STEP4;
    }
#else  // !(defined(_M_X64) || defined(__x86_64__))
    // Process 4 bytes at a time
    while ((e-p) >= 4) {
      STEP4;
    }
#endif  // defined(_M_X64) || defined(__x86_64__)
  }
  // Process the last few bytes
  while (p != e) {
    STEP1;
  }
#undef STEP8
#undef STEP4
#undef STEP1
  return l ^ 0xffffffffu;
#endif  // defined(LEVELDB_PLATFORM_POSIX_SSE)
}

}  // namespace port
}  // namespace leveldb