diff options
author | Pieter Wuille <pieter.wuille@gmail.com> | 2012-06-15 14:19:11 +0200 |
---|---|---|
committer | Pieter Wuille <pieter.wuille@gmail.com> | 2012-10-20 23:08:56 +0200 |
commit | 4d6144f97faf9d2a6c89f41d7d2360f21f0b71e2 (patch) | |
tree | 02f4f8d5cb22c9ca152e1cb55f9d62943ee5458e | |
parent | 43b7905e98d827df45e970dbaf5f08561ecc6cda (diff) |
Compact serialization for variable-length integers
Variable-length integers: bytes are a MSB base-128 encoding of the number.
The high bit in each byte signifies whether another digit follows. To make
the encoding is one-to-one, one is subtracted from all but the last digit.
Thus, the byte sequence a[] with length len, where all but the last byte
has bit 128 set, encodes the number:
(a[len-1] & 0x7F) + sum(i=1..len-1, 128^i*((a[len-i-1] & 0x7F)+1))
Properties:
* Very small (0-127: 1 byte, 128-16511: 2 bytes, 16512-2113663: 3 bytes)
* Every integer has exactly one encoding
* Encoding does not depend on size of original integer type
-rw-r--r-- | src/serialize.h | 94 | ||||
-rw-r--r-- | src/test/serialize_tests.cpp | 45 |
2 files changed, 138 insertions, 1 deletions
diff --git a/src/serialize.h b/src/serialize.h index 63df3160a6..549e46cbc3 100644 --- a/src/serialize.h +++ b/src/serialize.h @@ -238,9 +238,75 @@ uint64 ReadCompactSize(Stream& is) return nSizeRet; } +// Variable-length integers: bytes are a MSB base-128 encoding of the number. +// The high bit in each byte signifies whether another digit follows. To make +// the encoding is one-to-one, one is subtracted from all but the last digit. +// Thus, the byte sequence a[] with length len, where all but the last byte +// has bit 128 set, encodes the number: +// +// (a[len-1] & 0x7F) + sum(i=1..len-1, 128^i*((a[len-i-1] & 0x7F)+1)) +// +// Properties: +// * Very small (0-127: 1 byte, 128-16511: 2 bytes, 16512-2113663: 3 bytes) +// * Every integer has exactly one encoding +// * Encoding does not depend on size of original integer type +// * No redundancy: every (infinite) byte sequence corresponds to a list +// of encoded integers. +// +// 0: [0x00] 256: [0x81 0x00] +// 1: [0x01] 16383: [0xFE 0x7F] +// 127: [0x7F] 16384: [0xFF 0x00] +// 128: [0x80 0x00] 16511: [0x80 0xFF 0x7F] +// 255: [0x80 0x7F] 65535: [0x82 0xFD 0x7F] +// 2^32: [0x8E 0xFE 0xFE 0xFF 0x00] + +template<typename I> +inline unsigned int GetSizeOfVarInt(I n) +{ + int nRet = 0; + while(true) { + nRet++; + if (n <= 0x7F) + break; + n = (n >> 7) - 1; + } + return nRet; +} + +template<typename Stream, typename I> +void WriteVarInt(Stream& os, I n) +{ + unsigned char tmp[(sizeof(n)*8+6)/7]; + int len=0; + while(true) { + tmp[len] = (n & 0x7F) | (len ? 0x80 : 0x00); + if (n <= 0x7F) + break; + n = (n >> 7) - 1; + len++; + } + do { + WRITEDATA(os, tmp[len]); + } while(len--); +} +template<typename Stream, typename I> +I ReadVarInt(Stream& is) +{ + I n = 0; + while(true) { + unsigned char chData; + READDATA(is, chData); + n = (n << 7) | (chData & 0x7F); + if (chData & 0x80) + n++; + else + return n; + } +} -#define FLATDATA(obj) REF(CFlatData((char*)&(obj), (char*)&(obj) + sizeof(obj))) +#define FLATDATA(obj) REF(CFlatData((char*)&(obj), (char*)&(obj) + sizeof(obj))) +#define VARINT(obj) REF(WrapVarInt(REF(obj))) /** Wrapper for serializing arrays and POD. */ @@ -274,6 +340,32 @@ public: } }; +template<typename I> +class CVarInt +{ +protected: + I &n; +public: + CVarInt(I& nIn) : n(nIn) { } + + unsigned int GetSerializeSize(int, int) const { + return GetSizeOfVarInt<I>(n); + } + + template<typename Stream> + void Serialize(Stream &s, int, int) const { + WriteVarInt<Stream,I>(s, n); + } + + template<typename Stream> + void Unserialize(Stream& s, int, int) { + n = ReadVarInt<Stream,I>(s); + } +}; + +template<typename I> +CVarInt<I> WrapVarInt(I& n) { return CVarInt<I>(n); } + // // Forward declarations // diff --git a/src/test/serialize_tests.cpp b/src/test/serialize_tests.cpp new file mode 100644 index 0000000000..90ac89f8c5 --- /dev/null +++ b/src/test/serialize_tests.cpp @@ -0,0 +1,45 @@ +#include <boost/test/unit_test.hpp> + +#include <string> +#include <vector> + +#include "serialize.h" + +using namespace std; + +BOOST_AUTO_TEST_SUITE(serialize_tests) + +BOOST_AUTO_TEST_CASE(varints) +{ + // encode + + CDataStream ss(SER_DISK, 0); + CDataStream::size_type size = 0; + for (int i = 0; i < 100000; i++) { + ss << VARINT(i); + size += ::GetSerializeSize(VARINT(i), 0, 0); + BOOST_CHECK(size == ss.size()); + } + + for (uint64 i = 0; i < 100000000000ULL; i += 999999937) { + ss << VARINT(i); + size += ::GetSerializeSize(VARINT(i), 0, 0); + BOOST_CHECK(size == ss.size()); + } + + // decode + for (int i = 0; i < 100000; i++) { + int j; + ss >> VARINT(j); + BOOST_CHECK_MESSAGE(i == j, "decoded:" << j << " expected:" << i); + } + + for (uint64 i = 0; i < 100000000000ULL; i += 999999937) { + uint64 j; + ss >> VARINT(j); + BOOST_CHECK_MESSAGE(i == j, "decoded:" << j << " expected:" << i); + } + +} + +BOOST_AUTO_TEST_SUITE_END() |