From 69a5f8be0abda1e462f8ef44acadd2cbfaa850fb Mon Sep 17 00:00:00 2001 From: Gavin Andresen Date: Fri, 24 Apr 2015 13:14:45 -0400 Subject: Rolling bloom filter class For when you need to keep track of the last N items you've seen, and can tolerate some false-positives. Rebased-by: Pieter Wuille --- src/bloom.cpp | 83 ++++++++++++++++++++++++++++++++++++++---------- src/bloom.h | 28 ++++++++++++++++ src/test/bloom_tests.cpp | 78 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 173 insertions(+), 16 deletions(-) diff --git a/src/bloom.cpp b/src/bloom.cpp index e60576f4b4..36cba491c4 100644 --- a/src/bloom.cpp +++ b/src/bloom.cpp @@ -21,22 +21,33 @@ using namespace std; CBloomFilter::CBloomFilter(unsigned int nElements, double nFPRate, unsigned int nTweakIn, unsigned char nFlagsIn) : -/** - * The ideal size for a bloom filter with a given number of elements and false positive rate is: - * - nElements * log(fp rate) / ln(2)^2 - * We ignore filter parameters which will create a bloom filter larger than the protocol limits - */ -vData(min((unsigned int)(-1 / LN2SQUARED * nElements * log(nFPRate)), MAX_BLOOM_FILTER_SIZE * 8) / 8), -/** - * The ideal number of hash functions is filter size * ln(2) / number of elements - * Again, we ignore filter parameters which will create a bloom filter with more hash functions than the protocol limits - * See https://en.wikipedia.org/wiki/Bloom_filter for an explanation of these formulas - */ -isFull(false), -isEmpty(false), -nHashFuncs(min((unsigned int)(vData.size() * 8 / nElements * LN2), MAX_HASH_FUNCS)), -nTweak(nTweakIn), -nFlags(nFlagsIn) + /** + * The ideal size for a bloom filter with a given number of elements and false positive rate is: + * - nElements * log(fp rate) / ln(2)^2 + * We ignore filter parameters which will create a bloom filter larger than the protocol limits + */ + vData(min((unsigned int)(-1 / LN2SQUARED * nElements * log(nFPRate)), MAX_BLOOM_FILTER_SIZE * 8) / 8), + /** + * The ideal number of 
hash functions is filter size * ln(2) / number of elements + * Again, we ignore filter parameters which will create a bloom filter with more hash functions than the protocol limits + * See https://en.wikipedia.org/wiki/Bloom_filter for an explanation of these formulas + */ + isFull(false), + isEmpty(false), + nHashFuncs(min((unsigned int)(vData.size() * 8 / nElements * LN2), MAX_HASH_FUNCS)), + nTweak(nTweakIn), + nFlags(nFlagsIn) +{ +} + +// Private constructor used by CRollingBloomFilter +CBloomFilter::CBloomFilter(unsigned int nElements, double nFPRate, unsigned int nTweakIn) : + vData((unsigned int)(-1 / LN2SQUARED * nElements * log(nFPRate)) / 8), + isFull(false), + isEmpty(true), + nHashFuncs((unsigned int)(vData.size() * 8 / nElements * LN2)), + nTweak(nTweakIn), + nFlags(BLOOM_UPDATE_NONE) { } @@ -197,3 +208,43 @@ void CBloomFilter::UpdateEmptyFull() isFull = full; isEmpty = empty; } + +CRollingBloomFilter::CRollingBloomFilter(unsigned int nElements, double fpRate, unsigned int nTweak) : + b1(nElements * 2, fpRate, nTweak), b2(nElements * 2, fpRate, nTweak) +{ + // Implemented using two bloom filters of 2 * nElements each. + // We fill them up, and clear them, staggered, every nElements + // inserted, so at least one always contains the last nElements + // inserted. 
+ nBloomSize = nElements * 2; + nInsertions = 0; +} + +void CRollingBloomFilter::insert(const std::vector& vKey) +{ + if (nInsertions == 0) { + b1.clear(); + } else if (nInsertions == nBloomSize / 2) { + b2.clear(); + } + b1.insert(vKey); + b2.insert(vKey); + if (++nInsertions == nBloomSize) { + nInsertions = 0; + } +} + +bool CRollingBloomFilter::contains(const std::vector& vKey) const +{ + if (nInsertions < nBloomSize / 2) { + return b2.contains(vKey); + } + return b1.contains(vKey); +} + +void CRollingBloomFilter::clear() +{ + b1.clear(); + b2.clear(); + nInsertions = 0; +} diff --git a/src/bloom.h b/src/bloom.h index 191ffa19b3..7bab379a39 100644 --- a/src/bloom.h +++ b/src/bloom.h @@ -53,6 +53,10 @@ private: unsigned int Hash(unsigned int nHashNum, const std::vector& vDataToHash) const; + // Private constructor for CRollingBloomFilter, no restrictions on size + CBloomFilter(unsigned int nElements, double nFPRate, unsigned int nTweak); + friend class CRollingBloomFilter; + public: /** * Creates a new bloom filter which will provide the given fp rate when filled with the given number of elements @@ -97,4 +101,28 @@ public: void UpdateEmptyFull(); }; +/** + * RollingBloomFilter is a probabilistic "keep track of most recently inserted" set. + * Construct it with the number of items to keep track of, and a false-positive rate. + * + * contains(item) will always return true if item was one of the last N things + * insert()'ed ... but may also return true for items that were not inserted. 
+ */ +class CRollingBloomFilter +{ +public: + CRollingBloomFilter(unsigned int nElements, double nFPRate, unsigned int nTweak); + + void insert(const std::vector& vKey); + bool contains(const std::vector& vKey) const; + + void clear(); + +private: + unsigned int nBloomSize; + unsigned int nInsertions; + CBloomFilter b1, b2; +}; + + #endif // BITCOIN_BLOOM_H diff --git a/src/test/bloom_tests.cpp b/src/test/bloom_tests.cpp index 73a146f05c..1bda8a7ea1 100644 --- a/src/test/bloom_tests.cpp +++ b/src/test/bloom_tests.cpp @@ -8,6 +8,7 @@ #include "clientversion.h" #include "key.h" #include "merkleblock.h" +#include "random.h" #include "serialize.h" #include "streams.h" #include "uint256.h" @@ -459,4 +460,81 @@ BOOST_AUTO_TEST_CASE(merkle_block_4_test_update_none) BOOST_CHECK(!filter.contains(COutPoint(uint256S("0x02981fa052f0481dbc5868f4fc2166035a10f27a03cfd2de67326471df5bc041"), 0))); } +static std::vector RandomData() +{ + uint256 r = GetRandHash(); + return std::vector(r.begin(), r.end()); +} + +BOOST_AUTO_TEST_CASE(rolling_bloom) +{ + // last-100-entry, 1% false positive: + CRollingBloomFilter rb1(100, 0.01, 0); + + // Overfill: + static const int DATASIZE=399; + std::vector data[DATASIZE]; + for (int i = 0; i < DATASIZE; i++) { + data[i] = RandomData(); + rb1.insert(data[i]); + } + // Last 100 guaranteed to be remembered: + for (int i = 299; i < DATASIZE; i++) { + BOOST_CHECK(rb1.contains(data[i])); + } + + // false positive rate is 1%, so we should get about 100 hits if + // testing 10,000 random keys. We get worst-case false positive + // behavior when the filter is as full as possible, which is + // when we've inserted one less than an integer multiple of nElements*2. 
+ unsigned int nHits = 0; + for (int i = 0; i < 10000; i++) { + if (rb1.contains(RandomData())) + ++nHits; + } + // Run test_bitcoin with --log_level=message to see BOOST_TEST_MESSAGEs: + BOOST_TEST_MESSAGE("RollingBloomFilter got " << nHits << " false positives (~100 expected)"); + + // Insanely unlikely to get a fp count outside this range: + BOOST_CHECK(nHits > 25); + BOOST_CHECK(nHits < 175); + + BOOST_CHECK(rb1.contains(data[DATASIZE-1])); + rb1.clear(); + BOOST_CHECK(!rb1.contains(data[DATASIZE-1])); + + // Now roll through data, make sure last 100 entries + // are always remembered: + for (int i = 0; i < DATASIZE; i++) { + if (i >= 100) + BOOST_CHECK(rb1.contains(data[i-100])); + rb1.insert(data[i]); + } + + // Insert 999 more random entries: + for (int i = 0; i < 999; i++) { + rb1.insert(RandomData()); + } + // Sanity check to make sure the filter isn't just filling up: + nHits = 0; + for (int i = 0; i < DATASIZE; i++) { + if (rb1.contains(data[i])) + ++nHits; + } + // Expect about 5 false positives, more than 100 means + // something is definitely broken. + BOOST_TEST_MESSAGE("RollingBloomFilter got " << nHits << " false positives (~5 expected)"); + BOOST_CHECK(nHits < 100); + + // last-1000-entry, 0.1% false positive: + CRollingBloomFilter rb2(1000, 0.001, 0); + for (int i = 0; i < DATASIZE; i++) { + rb2.insert(data[i]); + } + // ... room for all of them: + for (int i = 0; i < DATASIZE; i++) { + BOOST_CHECK(rb2.contains(data[i])); + } +} + BOOST_AUTO_TEST_SUITE_END() -- cgit v1.2.3 From d81cff32e50fe5f686f985d0af2e74219f328ed0 Mon Sep 17 00:00:00 2001 From: Gavin Andresen Date: Sat, 25 Apr 2015 16:25:44 -0400 Subject: Replace mruset setAddrKnown with CRollingBloomFilter addrKnown Use a probabilistic bloom filter to keep track of which addresses we think we have given our peers, instead of a list. 
This uses much less memory, at the cost of sometimes failing to relay an address to a peer-- worst case if the bloom filter happens to be as full as it gets, 1-in-1,000. Measured memory usage of a full mruset setAddrKnown: 650Kbytes Constant memory usage of CRollingBloomFilter addrKnown: 37Kbytes. This will also help heap fragmentation, because the 37K of storage is allocated when a CNode is created (when a connection to a peer is established) and then there is no per-item-remembered memory allocation. I plan on testing by restarting a full node with an empty peers.dat, running a while with -debug=addrman and -debug=net, and making sure that the 'addr' message traffic out is reasonable. (suggestions for better tests welcome) --- src/main.cpp | 10 +++++----- src/net.cpp | 2 +- src/net.h | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index e6248c6617..a6b717d57f 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -3995,7 +3995,7 @@ bool static ProcessMessage(CNode* pfrom, string strCommand, CDataStream& vRecv, { LOCK(cs_vNodes); // Use deterministic randomness to send to the same nodes for 24 hours - // at a time so the setAddrKnowns of the chosen nodes prevent repeats + // at a time so the addrKnowns of the chosen nodes prevent repeats static uint256 hashSalt; if (hashSalt.IsNull()) hashSalt = GetRandHash(); @@ -4779,9 +4779,9 @@ bool SendMessages(CNode* pto, bool fSendTrickle) LOCK(cs_vNodes); BOOST_FOREACH(CNode* pnode, vNodes) { - // Periodically clear setAddrKnown to allow refresh broadcasts + // Periodically clear addrKnown to allow refresh broadcasts if (nLastRebroadcast) - pnode->setAddrKnown.clear(); + pnode->addrKnown.clear(); // Rebroadcast our address AdvertizeLocal(pnode); @@ -4799,9 +4799,9 @@ bool SendMessages(CNode* pto, bool fSendTrickle) vAddr.reserve(pto->vAddrToSend.size()); BOOST_FOREACH(const CAddress& addr, pto->vAddrToSend) { - // returns true if wasn't already contained in the set - if 
(pto->setAddrKnown.insert(addr).second) + if (!pto->addrKnown.contains(addr.GetKey())) { + pto->addrKnown.insert(addr.GetKey()); vAddr.push_back(addr); // receiver rejects addr messages larger than 1000 if (vAddr.size() >= 1000) diff --git a/src/net.cpp b/src/net.cpp index 731c810935..4648dae11e 100644 --- a/src/net.cpp +++ b/src/net.cpp @@ -1905,7 +1905,7 @@ bool CAddrDB::Read(CAddrMan& addr) unsigned int ReceiveFloodSize() { return 1000*GetArg("-maxreceivebuffer", 5*1000); } unsigned int SendBufferSize() { return 1000*GetArg("-maxsendbuffer", 1*1000); } -CNode::CNode(SOCKET hSocketIn, CAddress addrIn, std::string addrNameIn, bool fInboundIn) : ssSend(SER_NETWORK, INIT_PROTO_VERSION), setAddrKnown(5000) +CNode::CNode(SOCKET hSocketIn, CAddress addrIn, std::string addrNameIn, bool fInboundIn) : ssSend(SER_NETWORK, INIT_PROTO_VERSION), addrKnown(5000, 0.001, insecure_rand()) { nServices = 0; hSocket = hSocketIn; diff --git a/src/net.h b/src/net.h index 9fc6ce68d0..24e927c9f6 100644 --- a/src/net.h +++ b/src/net.h @@ -300,7 +300,7 @@ public: // flood relay std::vector vAddrToSend; - mruset setAddrKnown; + CRollingBloomFilter addrKnown; bool fGetAddr; std::set setKnown; @@ -380,7 +380,7 @@ public: void AddAddressKnown(const CAddress& addr) { - setAddrKnown.insert(addr); + addrKnown.insert(addr.GetKey()); } void PushAddress(const CAddress& addr) @@ -388,7 +388,7 @@ public: // Known checking here is only to save space from duplicates. // SendMessages will filter it again for knowns that were added // after addresses were pushed. 
- if (addr.IsValid() && !setAddrKnown.count(addr)) { + if (addr.IsValid() && !addrKnown.contains(addr.GetKey())) { if (vAddrToSend.size() >= MAX_ADDR_TO_SEND) { vAddrToSend[insecure_rand() % vAddrToSend.size()] = addr; } else { -- cgit v1.2.3 From d4d5022cfc6f1b826e4c644539a2c756a7499198 Mon Sep 17 00:00:00 2001 From: Pieter Wuille Date: Sat, 25 Apr 2015 08:19:57 -0700 Subject: Use ring buffer of set iterators instead of deque of copies in mruset --- src/mruset.h | 36 ++++++++++++++++-------------------- src/net.cpp | 6 ++++-- src/test/mruset_tests.cpp | 2 +- 3 files changed, 21 insertions(+), 23 deletions(-) diff --git a/src/mruset.h b/src/mruset.h index 1969f419cb..398aa173bf 100644 --- a/src/mruset.h +++ b/src/mruset.h @@ -1,12 +1,12 @@ -// Copyright (c) 2012 The Bitcoin Core developers +// Copyright (c) 2012-2015 The Bitcoin Core developers // Distributed under the MIT software license, see the accompanying // file COPYING or http://www.opensource.org/licenses/mit-license.php. #ifndef BITCOIN_MRUSET_H #define BITCOIN_MRUSET_H -#include #include +#include #include /** STL-like set container that only keeps the most recent N elements. 
*/ @@ -22,11 +22,13 @@ public: protected: std::set set; - std::deque queue; - size_type nMaxSize; + std::vector order; + size_type first_used; + size_type first_unused; + const size_type nMaxSize; public: - mruset(size_type nMaxSizeIn = 0) { nMaxSize = nMaxSizeIn; } + mruset(size_type nMaxSizeIn = 1) : nMaxSize(nMaxSizeIn) { clear(); } iterator begin() const { return set.begin(); } iterator end() const { return set.end(); } size_type size() const { return set.size(); } @@ -36,7 +38,9 @@ public: void clear() { set.clear(); - queue.clear(); + order.assign(nMaxSize, set.end()); + first_used = 0; + first_unused = 0; } bool inline friend operator==(const mruset& a, const mruset& b) { return a.set == b.set; } bool inline friend operator==(const mruset& a, const std::set& b) { return a.set == b; } @@ -45,25 +49,17 @@ public: { std::pair ret = set.insert(x); if (ret.second) { - if (nMaxSize && queue.size() == nMaxSize) { - set.erase(queue.front()); - queue.pop_front(); + if (set.size() == nMaxSize + 1) { + set.erase(order[first_used]); + order[first_used] = set.end(); + if (++first_used == nMaxSize) first_used = 0; } - queue.push_back(x); + order[first_unused] = ret.first; + if (++first_unused == nMaxSize) first_unused = 0; } return ret; } size_type max_size() const { return nMaxSize; } - size_type max_size(size_type s) - { - if (s) - while (queue.size() > s) { - set.erase(queue.front()); - queue.pop_front(); - } - nMaxSize = s; - return nMaxSize; - } }; #endif // BITCOIN_MRUSET_H diff --git a/src/net.cpp b/src/net.cpp index 4648dae11e..2de04fc574 100644 --- a/src/net.cpp +++ b/src/net.cpp @@ -1905,7 +1905,10 @@ bool CAddrDB::Read(CAddrMan& addr) unsigned int ReceiveFloodSize() { return 1000*GetArg("-maxreceivebuffer", 5*1000); } unsigned int SendBufferSize() { return 1000*GetArg("-maxsendbuffer", 1*1000); } -CNode::CNode(SOCKET hSocketIn, CAddress addrIn, std::string addrNameIn, bool fInboundIn) : ssSend(SER_NETWORK, INIT_PROTO_VERSION), addrKnown(5000, 0.001, 
insecure_rand()) +CNode::CNode(SOCKET hSocketIn, CAddress addrIn, std::string addrNameIn, bool fInboundIn) : + ssSend(SER_NETWORK, INIT_PROTO_VERSION), + addrKnown(5000, 0.001, insecure_rand()), + setInventoryKnown(SendBufferSize() / 1000) { nServices = 0; hSocket = hSocketIn; @@ -1934,7 +1937,6 @@ CNode::CNode(SOCKET hSocketIn, CAddress addrIn, std::string addrNameIn, bool fIn nStartingHeight = -1; fGetAddr = false; fRelayTxes = false; - setInventoryKnown.max_size(SendBufferSize() / 1000); pfilter = new CBloomFilter(); nPingNonceSent = 0; nPingUsecStart = 0; diff --git a/src/test/mruset_tests.cpp b/src/test/mruset_tests.cpp index bd4e9c1d38..9a9763e27a 100644 --- a/src/test/mruset_tests.cpp +++ b/src/test/mruset_tests.cpp @@ -24,7 +24,7 @@ private: std::set set; public: - mrutester() { mru.max_size(MAX_SIZE); } + mrutester() : mru(MAX_SIZE) {} int size() const { return set.size(); } void insert(int n) -- cgit v1.2.3 From f46a680f423ed1de5316d176e2292edefd916a95 Mon Sep 17 00:00:00 2001 From: Pieter Wuille Date: Sat, 25 Apr 2015 14:45:46 -0700 Subject: Better mruset unit test --- src/test/mruset_tests.cpp | 126 ++++++++++++++++++++-------------------------- 1 file changed, 54 insertions(+), 72 deletions(-) diff --git a/src/test/mruset_tests.cpp b/src/test/mruset_tests.cpp index 9a9763e27a..2b68f8899e 100644 --- a/src/test/mruset_tests.cpp +++ b/src/test/mruset_tests.cpp @@ -17,83 +17,65 @@ using namespace std; -class mrutester -{ -private: - mruset mru; - std::set set; - -public: - mrutester() : mru(MAX_SIZE) {} - int size() const { return set.size(); } - - void insert(int n) - { - mru.insert(n); - set.insert(n); - BOOST_CHECK(mru == set); - } -}; - BOOST_FIXTURE_TEST_SUITE(mruset_tests, BasicTestingSetup) -// Test that an mruset behaves like a set, as long as no more than MAX_SIZE elements are in it -BOOST_AUTO_TEST_CASE(mruset_like_set) -{ - - for (int nTest=0; nTest mru(MAX_SIZE); - for (int nAction=0; nAction<3*MAX_SIZE; nAction++) - { - int n = GetRandInt(2 * 
MAX_SIZE); - mru.insert(n); - BOOST_CHECK(mru.size() <= MAX_SIZE); + // The mruset being tested. + mruset mru(5000); + + // Run the test 10 times. + for (int test = 0; test < 10; test++) { + // Reset mru. + mru.clear(); + + // A deque + set to simulate the mruset. + std::deque rep; + std::set all; + + // Insert 10000 random integers below 15000. + for (int j=0; j<10000; j++) { + int add = GetRandInt(15000); + mru.insert(add); + + // Add the number to rep/all as well. + if (all.count(add) == 0) { + all.insert(add); + rep.push_back(add); + if (all.size() == 5001) { + all.erase(rep.front()); + rep.pop_front(); + } + } + + // Do a full comparison between mru and the simulated mru every 1000 and every 5001 elements. + if (j % 1000 == 0 || j % 5001 == 0) { + mruset mru2 = mru; // Also try making a copy + + // Check that all elements that should be in there, are in there. + BOOST_FOREACH(int x, rep) { + BOOST_CHECK(mru.count(x)); + BOOST_CHECK(mru2.count(x)); + } + + // Check that all elements that are in there, should be in there. + BOOST_FOREACH(int x, mru) { + BOOST_CHECK(all.count(x)); + } + + // Check that all elements that are in there, should be in there. 
+ BOOST_FOREACH(int x, mru2) { + BOOST_CHECK(all.count(x)); + } + + for (int t = 0; t < 10; t++) { + int r = GetRandInt(15000); + BOOST_CHECK(all.count(r) == mru.count(r)); + BOOST_CHECK(all.count(r) == mru2.count(r)); + } + } } } } -// 16-bit permutation function -int static permute(int n) -{ - // hexadecimals of pi; verified to be linearly independent - static const int table[16] = {0x243F, 0x6A88, 0x85A3, 0x08D3, 0x1319, 0x8A2E, 0x0370, 0x7344, - 0xA409, 0x3822, 0x299F, 0x31D0, 0x082E, 0xFA98, 0xEC4E, 0x6C89}; - - int ret = 0; - for (int bit=0; bit<16; bit++) - if (n & (1< mru(MAX_SIZE); - for (int n=0; n<10*MAX_SIZE; n++) - { - mru.insert(permute(n)); - - set tester; - for (int m=max(0,n-MAX_SIZE+1); m<=n; m++) - tester.insert(permute(m)); - - BOOST_CHECK(mru == tester); - } -} - BOOST_AUTO_TEST_SUITE_END() -- cgit v1.2.3