diff options
author | Gavin Andresen <gavinandresen@gmail.com> | 2015-04-24 13:14:45 -0400 |
---|---|---|
committer | Pieter Wuille <pieter.wuille@gmail.com> | 2015-04-30 07:58:29 -0700 |
commit | 69a5f8be0abda1e462f8ef44acadd2cbfaa850fb (patch) | |
tree | 0369fcb002ab0d63aba1522b151ff89248c44818 | |
parent | 8a10000222cb49eb253b41802ecf312adaf79439 (diff) |
Rolling bloom filter class
For when you need to keep track of the last N items
you've seen, and can tolerate some false-positives.
Rebased-by: Pieter Wuille <pieter.wuille@gmail.com>
-rw-r--r-- | src/bloom.cpp | 83 | ||||
-rw-r--r-- | src/bloom.h | 28 | ||||
-rw-r--r-- | src/test/bloom_tests.cpp | 78 |
3 files changed, 173 insertions, 16 deletions
diff --git a/src/bloom.cpp b/src/bloom.cpp index e60576f4b4..36cba491c4 100644 --- a/src/bloom.cpp +++ b/src/bloom.cpp @@ -21,22 +21,33 @@ using namespace std; CBloomFilter::CBloomFilter(unsigned int nElements, double nFPRate, unsigned int nTweakIn, unsigned char nFlagsIn) : -/** - * The ideal size for a bloom filter with a given number of elements and false positive rate is: - * - nElements * log(fp rate) / ln(2)^2 - * We ignore filter parameters which will create a bloom filter larger than the protocol limits - */ -vData(min((unsigned int)(-1 / LN2SQUARED * nElements * log(nFPRate)), MAX_BLOOM_FILTER_SIZE * 8) / 8), -/** - * The ideal number of hash functions is filter size * ln(2) / number of elements - * Again, we ignore filter parameters which will create a bloom filter with more hash functions than the protocol limits - * See https://en.wikipedia.org/wiki/Bloom_filter for an explanation of these formulas - */ -isFull(false), -isEmpty(false), -nHashFuncs(min((unsigned int)(vData.size() * 8 / nElements * LN2), MAX_HASH_FUNCS)), -nTweak(nTweakIn), -nFlags(nFlagsIn) + /** + * The ideal size for a bloom filter with a given number of elements and false positive rate is: + * - nElements * log(fp rate) / ln(2)^2 + * We ignore filter parameters which will create a bloom filter larger than the protocol limits + */ + vData(min((unsigned int)(-1 / LN2SQUARED * nElements * log(nFPRate)), MAX_BLOOM_FILTER_SIZE * 8) / 8), + /** + * The ideal number of hash functions is filter size * ln(2) / number of elements + * Again, we ignore filter parameters which will create a bloom filter with more hash functions than the protocol limits + * See https://en.wikipedia.org/wiki/Bloom_filter for an explanation of these formulas + */ + isFull(false), + isEmpty(false), + nHashFuncs(min((unsigned int)(vData.size() * 8 / nElements * LN2), MAX_HASH_FUNCS)), + nTweak(nTweakIn), + nFlags(nFlagsIn) +{ +} + +// Private constructor used by CRollingBloomFilter +CBloomFilter::CBloomFilter(unsigned int nElements, double nFPRate, unsigned int nTweakIn) : + vData((unsigned int)(-1 / LN2SQUARED * nElements * log(nFPRate)) / 8), + isFull(false), + isEmpty(true), + nHashFuncs((unsigned int)(vData.size() * 8 / nElements * LN2)), + nTweak(nTweakIn), + nFlags(BLOOM_UPDATE_NONE) { } @@ -197,3 +208,43 @@ void CBloomFilter::UpdateEmptyFull() isFull = full; isEmpty = empty; } + +CRollingBloomFilter::CRollingBloomFilter(unsigned int nElements, double fpRate, unsigned int nTweak) : + b1(nElements * 2, fpRate, nTweak), b2(nElements * 2, fpRate, nTweak) +{ + // Implemented using two bloom filters of 2 * nElements each. + // We fill them up, and clear them, staggered, every nElements + // inserted, so at least one always contains the last nElements + // inserted. + nBloomSize = nElements * 2; + nInsertions = 0; +} + +void CRollingBloomFilter::insert(const std::vector<unsigned char>& vKey) +{ + if (nInsertions == 0) { + b1.clear(); + } else if (nInsertions == nBloomSize / 2) { + b2.clear(); + } + b1.insert(vKey); + b2.insert(vKey); + if (++nInsertions == nBloomSize) { + nInsertions = 0; + } +} + +bool CRollingBloomFilter::contains(const std::vector<unsigned char>& vKey) const +{ + if (nInsertions < nBloomSize / 2) { + return b2.contains(vKey); + } + return b1.contains(vKey); +} + +void CRollingBloomFilter::clear() +{ + b1.clear(); + b2.clear(); + nInsertions = 0; +} diff --git a/src/bloom.h b/src/bloom.h index 191ffa19b3..7bab379a39 100644 --- a/src/bloom.h +++ b/src/bloom.h @@ -53,6 +53,10 @@ private: unsigned int Hash(unsigned int nHashNum, const std::vector<unsigned char>& vDataToHash) const; + // Private constructor for CRollingBloomFilter, no restrictions on size + CBloomFilter(unsigned int nElements, double nFPRate, unsigned int nTweak); + friend class CRollingBloomFilter; + public: /** * Creates a new bloom filter which will provide the given fp rate when filled with the given number of elements @@ -97,4 +101,28 @@ public: void UpdateEmptyFull(); }; +/** + * RollingBloomFilter is a probabilistic "keep track of most recently inserted" set. + * Construct it with the number of items to keep track of, and a false-positive rate. + * + * contains(item) will always return true if item was one of the last N things + * insert()'ed ... but may also return true for items that were not inserted. + */ +class CRollingBloomFilter +{ +public: + CRollingBloomFilter(unsigned int nElements, double nFPRate, unsigned int nTweak); + + void insert(const std::vector<unsigned char>& vKey); + bool contains(const std::vector<unsigned char>& vKey) const; + + void clear(); + +private: + unsigned int nBloomSize; + unsigned int nInsertions; + CBloomFilter b1, b2; +}; + + #endif // BITCOIN_BLOOM_H diff --git a/src/test/bloom_tests.cpp b/src/test/bloom_tests.cpp index 73a146f05c..1bda8a7ea1 100644 --- a/src/test/bloom_tests.cpp +++ b/src/test/bloom_tests.cpp @@ -8,6 +8,7 @@ #include "clientversion.h" #include "key.h" #include "merkleblock.h" +#include "random.h" #include "serialize.h" #include "streams.h" #include "uint256.h" @@ -459,4 +460,81 @@ BOOST_AUTO_TEST_CASE(merkle_block_4_test_update_none) BOOST_CHECK(!filter.contains(COutPoint(uint256S("0x02981fa052f0481dbc5868f4fc2166035a10f27a03cfd2de67326471df5bc041"), 0))); } +static std::vector<unsigned char> RandomData() +{ + uint256 r = GetRandHash(); + return std::vector<unsigned char>(r.begin(), r.end()); +} + +BOOST_AUTO_TEST_CASE(rolling_bloom) +{ + // last-100-entry, 1% false positive: + CRollingBloomFilter rb1(100, 0.01, 0); + + // Overfill: + static const int DATASIZE=399; + std::vector<unsigned char> data[DATASIZE]; + for (int i = 0; i < DATASIZE; i++) { + data[i] = RandomData(); + rb1.insert(data[i]); + } + // Last 100 guaranteed to be remembered: + for (int i = 299; i < DATASIZE; i++) { + BOOST_CHECK(rb1.contains(data[i])); + } + + // false positive rate is 1%, so we should get about 100 hits if + // testing 10,000 random keys. We get worst-case false positive + // behavior when the filter is as full as possible, which is + // when we've inserted one minus an integer multiple of nElement*2. + unsigned int nHits = 0; + for (int i = 0; i < 10000; i++) { + if (rb1.contains(RandomData())) + ++nHits; + } + // Run test_bitcoin with --log_level=message to see BOOST_TEST_MESSAGEs: + BOOST_TEST_MESSAGE("RollingBloomFilter got " << nHits << " false positives (~100 expected)"); + + // Insanely unlikely to get a fp count outside this range: + BOOST_CHECK(nHits > 25); + BOOST_CHECK(nHits < 175); + + BOOST_CHECK(rb1.contains(data[DATASIZE-1])); + rb1.clear(); + BOOST_CHECK(!rb1.contains(data[DATASIZE-1])); + + // Now roll through data, make sure last 100 entries + // are always remembered: + for (int i = 0; i < DATASIZE; i++) { + if (i >= 100) + BOOST_CHECK(rb1.contains(data[i-100])); + rb1.insert(data[i]); + } + + // Insert 999 more random entries: + for (int i = 0; i < 999; i++) { + rb1.insert(RandomData()); + } + // Sanity check to make sure the filter isn't just filling up: + nHits = 0; + for (int i = 0; i < DATASIZE; i++) { + if (rb1.contains(data[i])) + ++nHits; + } + // Expect about 5 false positives, more than 100 means + // something is definitely broken. + BOOST_TEST_MESSAGE("RollingBloomFilter got " << nHits << " false positives (~5 expected)"); + BOOST_CHECK(nHits < 100); + + // last-1000-entry, 0.01% false positive: + CRollingBloomFilter rb2(1000, 0.001, 0); + for (int i = 0; i < DATASIZE; i++) { + rb2.insert(data[i]); + } + // ... room for all of them: + for (int i = 0; i < DATASIZE; i++) { + BOOST_CHECK(rb2.contains(data[i])); + } +} + BOOST_AUTO_TEST_SUITE_END() |