1 files changed, 98 insertions, 38 deletions
diff --git a/src/leveldb/util/coding.h b/src/leveldb/util/coding.h
index 3993c4a755..1983ae7173 100644
--- a/src/leveldb/util/coding.h
+++ b/src/leveldb/util/coding.h
@@ -10,87 +10,147 @@
 #ifndef STORAGE_LEVELDB_UTIL_CODING_H_
 #define STORAGE_LEVELDB_UTIL_CODING_H_
 
-#include <stdint.h>
-#include <string.h>
+#include <cstdint>
+#include <cstring>
 #include <string>
+
 #include "leveldb/slice.h"
 #include "port/port.h"
 
 namespace leveldb {
 
 // Standard Put... routines append to a string
-extern void PutFixed32(std::string* dst, uint32_t value);
-extern void PutFixed64(std::string* dst, uint64_t value);
-extern void PutVarint32(std::string* dst, uint32_t value);
-extern void PutVarint64(std::string* dst, uint64_t value);
-extern void PutLengthPrefixedSlice(std::string* dst, const Slice& value);
+void PutFixed32(std::string* dst, uint32_t value);
+void PutFixed64(std::string* dst, uint64_t value);
+void PutVarint32(std::string* dst, uint32_t value);
+void PutVarint64(std::string* dst, uint64_t value);
+void PutLengthPrefixedSlice(std::string* dst, const Slice& value);
 
 // Standard Get... routines parse a value from the beginning of a Slice
 // and advance the slice past the parsed value.
-extern bool GetVarint32(Slice* input, uint32_t* value);
-extern bool GetVarint64(Slice* input, uint64_t* value);
-extern bool GetLengthPrefixedSlice(Slice* input, Slice* result);
+bool GetVarint32(Slice* input, uint32_t* value);
+bool GetVarint64(Slice* input, uint64_t* value);
+bool GetLengthPrefixedSlice(Slice* input, Slice* result);
 
 // Pointer-based variants of GetVarint...  These either store a value
 // in *v and return a pointer just past the parsed value, or return
-// NULL on error.  These routines only look at bytes in the range
+// nullptr on error.  These routines only look at bytes in the range
 // [p..limit-1]
-extern const char* GetVarint32Ptr(const char* p,const char* limit, uint32_t* v);
-extern const char* GetVarint64Ptr(const char* p,const char* limit, uint64_t* v);
+const char* GetVarint32Ptr(const char* p, const char* limit, uint32_t* v);
+const char* GetVarint64Ptr(const char* p, const char* limit, uint64_t* v);
 
 // Returns the length of the varint32 or varint64 encoding of "v"
-extern int VarintLength(uint64_t v);
+int VarintLength(uint64_t v);
 
 // Lower-level versions of Put... that write directly into a character buffer
+// and return a pointer just past the last byte written.
 // REQUIRES: dst has enough space for the value being written
-extern void EncodeFixed32(char* dst, uint32_t value);
-extern void EncodeFixed64(char* dst, uint64_t value);
+char* EncodeVarint32(char* dst, uint32_t value);
+char* EncodeVarint64(char* dst, uint64_t value);
+
+// TODO(costan): Remove port::kLittleEndian and the fast paths based on
+//               std::memcpy when clang learns to optimize the generic code, as
+//               described in https://bugs.llvm.org/show_bug.cgi?id=41761
+//
+// The platform-independent code in DecodeFixed{32,64}() gets optimized to mov
+// on x86 and ldr on ARM64, by both clang and gcc. However, only gcc optimizes
+// the platform-independent code in EncodeFixed{32,64}() to mov / str.
 
 // Lower-level versions of Put... that write directly into a character buffer
-// and return a pointer just past the last byte written.
 // REQUIRES: dst has enough space for the value being written
-extern char* EncodeVarint32(char* dst, uint32_t value);
-extern char* EncodeVarint64(char* dst, uint64_t value);
+
+inline void EncodeFixed32(char* dst, uint32_t value) {
+  uint8_t* const buffer = reinterpret_cast<uint8_t*>(dst);
+
+  if (port::kLittleEndian) {
+    // Fast path for little-endian CPUs. All major compilers optimize this to a
+    // single mov (x86_64) / str (ARM) instruction.
+    std::memcpy(buffer, &value, sizeof(uint32_t));
+    return;
+  }
+
+  // Platform-independent code.
+  // Currently, only gcc optimizes this to a single mov / str instruction.
+  buffer[0] = static_cast<uint8_t>(value);
+  buffer[1] = static_cast<uint8_t>(value >> 8);
+  buffer[2] = static_cast<uint8_t>(value >> 16);
+  buffer[3] = static_cast<uint8_t>(value >> 24);
+}
+
+inline void EncodeFixed64(char* dst, uint64_t value) {
+  uint8_t* const buffer = reinterpret_cast<uint8_t*>(dst);
+
+  if (port::kLittleEndian) {
+    // Fast path for little-endian CPUs. All major compilers optimize this to a
+    // single mov (x86_64) / str (ARM) instruction.
+    std::memcpy(buffer, &value, sizeof(uint64_t));
+    return;
+  }
+
+  // Platform-independent code.
+  // Currently, only gcc optimizes this to a single mov / str instruction.
+  buffer[0] = static_cast<uint8_t>(value);
+  buffer[1] = static_cast<uint8_t>(value >> 8);
+  buffer[2] = static_cast<uint8_t>(value >> 16);
+  buffer[3] = static_cast<uint8_t>(value >> 24);
+  buffer[4] = static_cast<uint8_t>(value >> 32);
+  buffer[5] = static_cast<uint8_t>(value >> 40);
+  buffer[6] = static_cast<uint8_t>(value >> 48);
+  buffer[7] = static_cast<uint8_t>(value >> 56);
+}
 
 // Lower-level versions of Get... that read directly from a character buffer
 // without any bounds checking.
 
 inline uint32_t DecodeFixed32(const char* ptr) {
+  const uint8_t* const buffer = reinterpret_cast<const uint8_t*>(ptr);
+
   if (port::kLittleEndian) {
-    // Load the raw bytes
+    // Fast path for little-endian CPUs. All major compilers optimize this to a
+    // single mov (x86_64) / ldr (ARM) instruction.
     uint32_t result;
-    memcpy(&result, ptr, sizeof(result));  // gcc optimizes this to a plain load
+    std::memcpy(&result, buffer, sizeof(uint32_t));
     return result;
-  } else {
-    return ((static_cast<uint32_t>(static_cast<unsigned char>(ptr[0])))
-        | (static_cast<uint32_t>(static_cast<unsigned char>(ptr[1])) << 8)
-        | (static_cast<uint32_t>(static_cast<unsigned char>(ptr[2])) << 16)
-        | (static_cast<uint32_t>(static_cast<unsigned char>(ptr[3])) << 24));
   }
+
+  // Platform-independent code.
+  // Clang and gcc optimize this to a single mov / ldr instruction.
+  return (static_cast<uint32_t>(buffer[0])) |
+         (static_cast<uint32_t>(buffer[1]) << 8) |
+         (static_cast<uint32_t>(buffer[2]) << 16) |
+         (static_cast<uint32_t>(buffer[3]) << 24);
 }
 
 inline uint64_t DecodeFixed64(const char* ptr) {
+  const uint8_t* const buffer = reinterpret_cast<const uint8_t*>(ptr);
+
   if (port::kLittleEndian) {
-    // Load the raw bytes
+    // Fast path for little-endian CPUs. All major compilers optimize this to a
+    // single mov (x86_64) / ldr (ARM) instruction.
     uint64_t result;
-    memcpy(&result, ptr, sizeof(result));  // gcc optimizes this to a plain load
+    std::memcpy(&result, buffer, sizeof(uint64_t));
     return result;
-  } else {
-    uint64_t lo = DecodeFixed32(ptr);
-    uint64_t hi = DecodeFixed32(ptr + 4);
-    return (hi << 32) | lo;
   }
+
+  // Platform-independent code.
+  // Clang and gcc optimize this to a single mov / ldr instruction.
+  return (static_cast<uint64_t>(buffer[0])) |
+         (static_cast<uint64_t>(buffer[1]) << 8) |
+         (static_cast<uint64_t>(buffer[2]) << 16) |
+         (static_cast<uint64_t>(buffer[3]) << 24) |
+         (static_cast<uint64_t>(buffer[4]) << 32) |
+         (static_cast<uint64_t>(buffer[5]) << 40) |
+         (static_cast<uint64_t>(buffer[6]) << 48) |
+         (static_cast<uint64_t>(buffer[7]) << 56);
 }
 
 // Internal routine for use by fallback path of GetVarint32Ptr
-extern const char* GetVarint32PtrFallback(const char* p,
-                                          const char* limit,
-                                          uint32_t* value);
-inline const char* GetVarint32Ptr(const char* p,
-                                  const char* limit,
+const char* GetVarint32PtrFallback(const char* p, const char* limit,
+                                   uint32_t* value);
+inline const char* GetVarint32Ptr(const char* p, const char* limit,
                                   uint32_t* value) {
   if (p < limit) {
-    uint32_t result = *(reinterpret_cast<const unsigned char*>(p));
+    uint32_t result = *(reinterpret_cast<const uint8_t*>(p));
     if ((result & 128) == 0) {
       *value = result;
       return p + 1;