aboutsummaryrefslogtreecommitdiff
path: root/src/net_processing.cpp
diff options
context:
space:
mode:
authorAndrew Chow <github@achow101.com>2023-01-27 01:44:17 -0500
committerAndrew Chow <github@achow101.com>2023-01-27 01:53:21 -0500
commit835212cd1d8f8fc7f19775f5ff8cc21c099122b2 (patch)
tree52e4bff58bddf2c42d1a827791f5a6dd2c60124d /src/net_processing.cpp
parentffc22b7d42c6360223508293b8c1f88b1a1a468b (diff)
parent39b93649c4b98cd82c64b957fd9f6a6fd3c2a359 (diff)
Merge bitcoin/bitcoin#25880: p2p: Make stalling timeout adaptive during IBD
39b93649c4b98cd82c64b957fd9f6a6fd3c2a359 test: add functional test for IBD stalling logic (Martin Zumsande) 0565951f34e6d155dc825964c5d8b1dd00931682 p2p: Make block stalling timeout adaptive (Martin Zumsande) Pull request description: During IBD, there is the following stalling mechanism if we can't proceed with assigning blocks from a 1024 lookahead window because all of these blocks are either already downloaded or in-flight: We'll mark the peer from which we expect the current block that would allow us to advance our tip (and thereby move the 1024 window ahead) as a possible staller. We then give this peer 2 more seconds to deliver a block (`BLOCK_STALLING_TIMEOUT`) and if it doesn't, disconnect it and assign the critical block we need to another peer. Now the problem is that this second peer is immediately marked as a potential staller using the same mechanism and given 2 seconds as well - if our own connection is so slow that it simply takes us more than 2 seconds to download this block, that peer will also be disconnected (and so on...), leading to repeated disconnections and no progress in IBD. This has been described in #9213, and I have observed this when doing IBD on slower connections or with Tor - sometimes there would be several minutes without progress, where all we did was disconnect peers and find new ones. The `2s` stalling timeout was introduced in #4468, when blocks weren't full and before Segwit increased the maximum possible physical size of blocks - so I think it made a lot of sense back then. But it would be good to revisit this timeout now. This PR makes the timout adaptive (idea by sipa): If we disconnect a peer for stalling, we now double the timeout for the next peer (up to a maximum of 64s). If we connect a block, we half it again up to the old value of 2 seconds. That way, peers that are comparatively slower will still get disconnected, but long phases of disconnecting all peers shouldn't happen anymore. Fixes #9213 ACKs for top commit: achow101: ACK 39b93649c4b98cd82c64b957fd9f6a6fd3c2a359 RandyMcMillan: Strong Concept ACK 39b93649c4b98cd82c64b957fd9f6a6fd3c2a359 vasild: ACK 39b93649c4b98cd82c64b957fd9f6a6fd3c2a359 naumenkogs: ACK 39b93649c4b98cd82c64b957fd9f6a6fd3c2a359 Tree-SHA512: 85bc57093b2fb1d28d7409ed8df5a91543909405907bc129de7c6285d0810dd79bc05219e4d5aefcb55c85512b0ad5bed43a4114a17e46c35b9a3f9a983d5754
Diffstat (limited to 'src/net_processing.cpp')
-rw-r--r--src/net_processing.cpp32
1 files changed, 28 insertions, 4 deletions
diff --git a/src/net_processing.cpp b/src/net_processing.cpp
index dc5ba650c1..b5e1222db5 100644
--- a/src/net_processing.cpp
+++ b/src/net_processing.cpp
@@ -113,8 +113,11 @@ static constexpr auto GETDATA_TX_INTERVAL{60s};
static const unsigned int MAX_GETDATA_SZ = 1000;
/** Number of blocks that can be requested at any given time from a single peer. */
static const int MAX_BLOCKS_IN_TRANSIT_PER_PEER = 16;
-/** Time during which a peer must stall block download progress before being disconnected. */
-static constexpr auto BLOCK_STALLING_TIMEOUT{2s};
+/** Default time during which a peer must stall block download progress before being disconnected.
+ * the actual timeout is increased temporarily if peers are disconnected for hitting the timeout */
+static constexpr auto BLOCK_STALLING_TIMEOUT_DEFAULT{2s};
+/** Maximum timeout for stalling block download. */
+static constexpr auto BLOCK_STALLING_TIMEOUT_MAX{64s};
/** Number of headers sent in one getheaders result. We rely on the assumption that if a peer sends
* less than this number, we reached its tip. Changing this value is a protocol upgrade. */
static const unsigned int MAX_HEADERS_RESULTS = 2000;
@@ -774,6 +777,9 @@ private:
/** Number of preferable block download peers. */
int m_num_preferred_download_peers GUARDED_BY(cs_main){0};
+ /** Stalling timeout for blocks in IBD */
+ std::atomic<std::chrono::seconds> m_block_stalling_timeout{BLOCK_STALLING_TIMEOUT_DEFAULT};
+
bool AlreadyHaveTx(const GenTxid& gtxid)
EXCLUSIVE_LOCKS_REQUIRED(cs_main, !m_recent_confirmed_transactions_mutex);
@@ -1812,7 +1818,8 @@ void PeerManagerImpl::StartScheduledTasks(CScheduler& scheduler)
/**
* Evict orphan txn pool entries based on a newly connected
* block, remember the recently confirmed transactions, and delete tracked
- * announcements for them. Also save the time of the last tip update.
+ * announcements for them. Also save the time of the last tip update and
+ * possibly reduce dynamic block stalling timeout.
*/
void PeerManagerImpl::BlockConnected(const std::shared_ptr<const CBlock>& pblock, const CBlockIndex* pindex)
{
@@ -1835,6 +1842,16 @@ void PeerManagerImpl::BlockConnected(const std::shared_ptr<const CBlock>& pblock
m_txrequest.ForgetTxHash(ptx->GetWitnessHash());
}
}
+
+ // In case the dynamic timeout was doubled once or more, reduce it slowly back to its default value
+ auto stalling_timeout = m_block_stalling_timeout.load();
+ Assume(stalling_timeout >= BLOCK_STALLING_TIMEOUT_DEFAULT);
+ if (stalling_timeout != BLOCK_STALLING_TIMEOUT_DEFAULT) {
+ const auto new_timeout = std::max(std::chrono::duration_cast<std::chrono::seconds>(stalling_timeout * 0.85), BLOCK_STALLING_TIMEOUT_DEFAULT);
+ if (m_block_stalling_timeout.compare_exchange_strong(stalling_timeout, new_timeout)) {
+ LogPrint(BCLog::NET, "Decreased stalling timeout to %d seconds\n", new_timeout.count());
+ }
+ }
}
void PeerManagerImpl::BlockDisconnected(const std::shared_ptr<const CBlock> &block, const CBlockIndex* pindex)
@@ -5713,12 +5730,19 @@ bool PeerManagerImpl::SendMessages(CNode* pto)
m_connman.PushMessage(pto, msgMaker.Make(NetMsgType::INV, vInv));
// Detect whether we're stalling
- if (state.m_stalling_since.count() && state.m_stalling_since < current_time - BLOCK_STALLING_TIMEOUT) {
+ auto stalling_timeout = m_block_stalling_timeout.load();
+ if (state.m_stalling_since.count() && state.m_stalling_since < current_time - stalling_timeout) {
// Stalling only triggers when the block download window cannot move. During normal steady state,
// the download window should be much larger than the to-be-downloaded set of blocks, so disconnection
// should only happen during initial block download.
LogPrintf("Peer=%d is stalling block download, disconnecting\n", pto->GetId());
pto->fDisconnect = true;
+ // Increase timeout for the next peer so that we don't disconnect multiple peers if our own
+ // bandwidth is insufficient.
+ const auto new_timeout = std::min(2 * stalling_timeout, BLOCK_STALLING_TIMEOUT_MAX);
+ if (stalling_timeout != new_timeout && m_block_stalling_timeout.compare_exchange_strong(stalling_timeout, new_timeout)) {
+ LogPrint(BCLog::NET, "Increased stalling timeout temporarily to %d seconds\n", m_block_stalling_timeout.load().count());
+ }
return true;
}
// In case there is a block that has been in flight from this peer for block_interval * (1 + 0.5 * N)