diff options
author | James O'Beirne <james.obeirne@pm.me> | 2022-04-28 10:23:33 -0400 |
---|---|---|
committer | James O'Beirne <james.obeirne@pm.me> | 2023-03-07 16:06:17 -0500 |
commit | d96c59cc5cd2f73f1f55c133c52208671fe75ef3 (patch) | |
tree | 1a8031f6e3e087a9f902bc7a253d740dad547ed4 | |
parent | f2a4f3376f1476b38a79a549bd81ba3006225df6 (diff) |
validation: add ChainMan logic for completing UTXO snapshot validation

Trigger completion when a background validation chainstate reaches the
same height as a UTXO snapshot, and handle cleaning up the chainstate
on subsequent startup.
-rw-r--r-- | src/node/chainstate.cpp | 44 |
-rw-r--r-- | src/validation.cpp | 322 |
-rw-r--r-- | src/validation.h | 58 |
3 files changed, 421 insertions, 3 deletions
diff --git a/src/node/chainstate.cpp b/src/node/chainstate.cpp index e9eea90bcb..cd82d8743c 100644 --- a/src/node/chainstate.cpp +++ b/src/node/chainstate.cpp @@ -85,6 +85,9 @@ static ChainstateLoadResult CompleteChainstateInitialization( return options.reindex || options.reindex_chainstate || chainstate->CoinsTip().GetBestBlock().IsNull(); }; + assert(chainman.m_total_coinstip_cache > 0); + assert(chainman.m_total_coinsdb_cache > 0); + // Conservative value which is arbitrarily chosen, as it will ultimately be changed // by a call to `chainman.MaybeRebalanceCaches()`. We just need to make sure // that the sum of the two caches (40%) does not exceed the allowable amount @@ -183,6 +186,47 @@ ChainstateLoadResult LoadChainstate(ChainstateManager& chainman, const CacheSize return {init_status, init_error}; } + // If a snapshot chainstate was fully validated by a background chainstate during + // the last run, detect it here and clean up the now-unneeded background + // chainstate. + // + // Why is this cleanup done here (on subsequent restart) and not just when the + // snapshot is actually validated? Because this entails unusual + // filesystem operations to move leveldb data directories around, and that seems + // too risky to do in the middle of normal runtime. + auto snapshot_completion = chainman.MaybeCompleteSnapshotValidation(); + + if (snapshot_completion == SnapshotCompletionResult::SKIPPED) { + // do nothing; expected case + } else if (snapshot_completion == SnapshotCompletionResult::SUCCESS) { + LogPrintf("[snapshot] cleaning up unneeded background chainstate, then reinitializing\n"); + if (!chainman.ValidatedSnapshotCleanup()) { + AbortNode("Background chainstate cleanup failed unexpectedly."); + } + + // Because ValidatedSnapshotCleanup() has torn down chainstates with + // ChainstateManager::ResetChainstates(), reinitialize them here without + // duplicating the blockindex work above. 
+ assert(chainman.GetAll().empty()); + assert(!chainman.IsSnapshotActive()); + assert(!chainman.IsSnapshotValidated()); + + chainman.InitializeChainstate(options.mempool); + + // A reload of the block index is required to recompute setBlockIndexCandidates + // for the fully validated chainstate. + chainman.ActiveChainstate().UnloadBlockIndex(); + + auto [init_status, init_error] = CompleteChainstateInitialization(chainman, cache_sizes, options); + if (init_status != ChainstateLoadStatus::SUCCESS) { + return {init_status, init_error}; + } + } else { + return {ChainstateLoadStatus::FAILURE, _( + "UTXO snapshot failed to validate. " + "Restart to resume normal initial block download, or try loading a different snapshot.")}; + } + return {ChainstateLoadStatus::SUCCESS, {}}; } diff --git a/src/validation.cpp b/src/validation.cpp index 9b5875319e..e4bc6cb10e 100644 --- a/src/validation.cpp +++ b/src/validation.cpp @@ -2845,6 +2845,14 @@ bool Chainstate::ConnectTip(BlockValidationState& state, CBlockIndex* pindexNew, Ticks<SecondsDouble>(time_total), Ticks<MillisecondsDouble>(time_total) / num_blocks_total); + // If we are the background validation chainstate, check to see if we are done + // validating the snapshot (i.e. our tip has reached the snapshot's base block). + if (this != &m_chainman.ActiveChainstate()) { + // This call may set `m_disabled`, which is referenced immediately afterwards in + // ActivateBestChain, so that we stop connecting blocks past the snapshot base. 
+ m_chainman.MaybeCompleteSnapshotValidation(); + } + connectTrace.BlockConnected(pindexNew, std::move(pthisBlock)); return true; } @@ -3067,6 +3075,14 @@ bool Chainstate::ActivateBestChain(BlockValidationState& state, std::shared_ptr< // we use m_chainstate_mutex to enforce mutual exclusion so that only one caller may execute this function at a time LOCK(m_chainstate_mutex); + // Belt-and-suspenders check that we aren't attempting to advance the background + // chainstate past the snapshot base block. + if (WITH_LOCK(::cs_main, return m_disabled)) { + LogPrintf("m_disabled is set - this chainstate should not be in operation. " /* Continued */ + "Please report this as a bug. %s\n", PACKAGE_BUGREPORT); + return false; + } + CBlockIndex *pindexMostWork = nullptr; CBlockIndex *pindexNewTip = nullptr; int nStopAtHeight = gArgs.GetIntArg("-stopatheight", DEFAULT_STOPATHEIGHT); @@ -3117,6 +3133,15 @@ bool Chainstate::ActivateBestChain(BlockValidationState& state, std::shared_ptr< assert(trace.pblock && trace.pindex); GetMainSignals().BlockConnected(trace.pblock, trace.pindex); } + + // This will have been toggled in + // ActivateBestChainStep -> ConnectTip -> MaybeCompleteSnapshotValidation, + // if at all, so we should catch it here. + // + // Break this do-while to ensure we don't advance past the base snapshot. + if (m_disabled) { + break; + } } while (!m_chain.Tip() || (starting_tip && CBlockIndexWorkComparator()(m_chain.Tip(), starting_tip))); if (!blocks_connected) return true; @@ -3137,6 +3162,11 @@ bool Chainstate::ActivateBestChain(BlockValidationState& state, std::shared_ptr< if (nStopAtHeight && pindexNewTip && pindexNewTip->nHeight >= nStopAtHeight) StartShutdown(); + if (WITH_LOCK(::cs_main, return m_disabled)) { + // Background chainstate has reached the snapshot base block, so exit. 
+ break; + } + // We check shutdown only after giving ActivateBestChainStep a chance to run once so that we // never shutdown before connecting the genesis block during LoadChainTip(). Previously this // caused an assert() failure during shutdown in such cases as the UTXO DB flushing checks @@ -5046,6 +5076,19 @@ static void FlushSnapshotToDisk(CCoinsViewCache& coins_cache, bool snapshot_load coins_cache.Flush(); } +struct StopHashingException : public std::exception +{ + const char* what() const throw() override + { + return "ComputeUTXOStats interrupted by shutdown."; + } +}; + +static void SnapshotUTXOHashBreakpoint() +{ + if (ShutdownRequested()) throw StopHashingException(); +} + bool ChainstateManager::PopulateAndValidateSnapshot( Chainstate& snapshot_chainstate, AutoFile& coins_file, @@ -5169,13 +5212,18 @@ bool ChainstateManager::PopulateAndValidateSnapshot( assert(coins_cache.GetBestBlock() == base_blockhash); - auto breakpoint_fnc = [] { /* TODO insert breakpoint here? */ }; - // As above, okay to immediately release cs_main here since no other context knows // about the snapshot_chainstate. CCoinsViewDB* snapshot_coinsdb = WITH_LOCK(::cs_main, return &snapshot_chainstate.CoinsDB()); - const std::optional<CCoinsStats> maybe_stats = ComputeUTXOStats(CoinStatsHashType::HASH_SERIALIZED, snapshot_coinsdb, m_blockman, breakpoint_fnc); + std::optional<CCoinsStats> maybe_stats; + + try { + maybe_stats = ComputeUTXOStats( + CoinStatsHashType::HASH_SERIALIZED, snapshot_coinsdb, m_blockman, SnapshotUTXOHashBreakpoint); + } catch (StopHashingException const&) { + return false; + } if (!maybe_stats.has_value()) { LogPrintf("[snapshot] failed to generate coins stats\n"); return false; @@ -5243,6 +5291,149 @@ bool ChainstateManager::PopulateAndValidateSnapshot( return true; } +// Currently, this function holds cs_main for its duration, which could be for +// multiple minutes due to the ComputeUTXOStats call. 
This hold is necessary +// because we need to avoid advancing the background validation chainstate +// farther than the snapshot base block - and this function is also invoked +// from within ConnectTip, i.e. from within ActivateBestChain, so cs_main is +// held anyway. +// +// Eventually (TODO), we could somehow separate this function's runtime from +// maintenance of the active chain, but that will either require +// +// (i) setting `m_disabled` immediately and ensuring all chainstate accesses go +// through IsUsable() checks, or +// +// (ii) giving each chainstate its own lock instead of using cs_main for everything. +SnapshotCompletionResult ChainstateManager::MaybeCompleteSnapshotValidation( + std::function<void(bilingual_str)> shutdown_fnc) +{ + AssertLockHeld(cs_main); + if (m_ibd_chainstate.get() == &this->ActiveChainstate() || + !this->IsUsable(m_snapshot_chainstate.get()) || + !this->IsUsable(m_ibd_chainstate.get()) || + !m_ibd_chainstate->m_chain.Tip()) { + // Nothing to do - this function only applies to the background + // validation chainstate. + return SnapshotCompletionResult::SKIPPED; + } + const int snapshot_tip_height = this->ActiveHeight(); + const int snapshot_base_height = *Assert(this->GetSnapshotBaseHeight()); + const CBlockIndex& index_new = *Assert(m_ibd_chainstate->m_chain.Tip()); + + if (index_new.nHeight < snapshot_base_height) { + // Background IBD not complete yet. + return SnapshotCompletionResult::SKIPPED; + } + + assert(SnapshotBlockhash()); + uint256 snapshot_blockhash = *Assert(SnapshotBlockhash()); + + auto handle_invalid_snapshot = [&]() EXCLUSIVE_LOCKS_REQUIRED(::cs_main) { + bilingual_str user_error = strprintf(_( + "%s failed to validate the -assumeutxo snapshot state. " + "This indicates a hardware problem, or a bug in the software, or a " + "bad software modification that allowed an invalid snapshot to be " + "loaded. 
As a result of this, the node will shut down and stop using any " + "state that was built on the snapshot, resetting the chain height " + "from %d to %d. On the next " + "restart, the node will resume syncing from %d " + "without using any snapshot data. " + "Please report this incident to %s, including how you obtained the snapshot. " + "The invalid snapshot chainstate has been left on disk in case it is " + "helpful in diagnosing the issue that caused this error."), + PACKAGE_NAME, snapshot_tip_height, snapshot_base_height, snapshot_base_height, PACKAGE_BUGREPORT + ); + + LogPrintf("[snapshot] !!! %s\n", user_error.original); + LogPrintf("[snapshot] deleting snapshot, reverting to validated chain, and stopping node\n"); + + m_active_chainstate = m_ibd_chainstate.get(); + m_snapshot_chainstate->m_disabled = true; + assert(!this->IsUsable(m_snapshot_chainstate.get())); + assert(this->IsUsable(m_ibd_chainstate.get())); + + m_snapshot_chainstate->InvalidateCoinsDBOnDisk(); + + shutdown_fnc(user_error); + }; + + if (index_new.GetBlockHash() != snapshot_blockhash) { + LogPrintf("[snapshot] supposed base block %s does not match the " /* Continued */ + "snapshot base block %s (height %d). 
Snapshot is not valid.", + index_new.ToString(), snapshot_blockhash.ToString(), snapshot_base_height); + handle_invalid_snapshot(); + return SnapshotCompletionResult::BASE_BLOCKHASH_MISMATCH; + } + + assert(index_new.nHeight == snapshot_base_height); + + int curr_height = m_ibd_chainstate->m_chain.Height(); + + assert(snapshot_base_height == curr_height); + assert(snapshot_base_height == index_new.nHeight); + assert(this->IsUsable(m_snapshot_chainstate.get())); + assert(this->GetAll().size() == 2); + + CCoinsViewDB& ibd_coins_db = m_ibd_chainstate->CoinsDB(); + m_ibd_chainstate->ForceFlushStateToDisk(); + + auto maybe_au_data = ExpectedAssumeutxo(curr_height, ::Params()); + if (!maybe_au_data) { + LogPrintf("[snapshot] assumeutxo data not found for height " /* Continued */ + "(%d) - refusing to validate snapshot\n", curr_height); + handle_invalid_snapshot(); + return SnapshotCompletionResult::MISSING_CHAINPARAMS; + } + + const AssumeutxoData& au_data = *maybe_au_data; + std::optional<CCoinsStats> maybe_ibd_stats; + LogPrintf("[snapshot] computing UTXO stats for background chainstate to validate " /* Continued */ + "snapshot - this could take a few minutes\n"); + try { + maybe_ibd_stats = ComputeUTXOStats( + CoinStatsHashType::HASH_SERIALIZED, + &ibd_coins_db, + m_blockman, + SnapshotUTXOHashBreakpoint); + } catch (StopHashingException const&) { + return SnapshotCompletionResult::STATS_FAILED; + } + + // XXX note that this function is slow and will hold cs_main for potentially minutes. + if (!maybe_ibd_stats) { + LogPrintf("[snapshot] failed to generate stats for validation coins db\n"); + // While this isn't a problem with the snapshot per se, this condition + // prevents us from validating the snapshot, so we should shut down and let the + // user handle the issue manually. 
+ handle_invalid_snapshot(); + return SnapshotCompletionResult::STATS_FAILED; + } + const auto& ibd_stats = *maybe_ibd_stats; + + // Compare the background validation chainstate's UTXO set hash against the hard-coded + // assumeutxo hash we expect. + // + // TODO: For belt-and-suspenders, we could cache the UTXO set + // hash for the snapshot when it's loaded in its chainstate's leveldb. We could then + // reference that here for an additional check. + if (AssumeutxoHash{ibd_stats.hashSerialized} != au_data.hash_serialized) { + LogPrintf("[snapshot] hash mismatch: actual=%s, expected=%s\n", + ibd_stats.hashSerialized.ToString(), + au_data.hash_serialized.ToString()); + handle_invalid_snapshot(); + return SnapshotCompletionResult::HASH_MISMATCH; + } + + LogPrintf("[snapshot] snapshot beginning at %s has been fully validated\n", + snapshot_blockhash.ToString()); + + m_ibd_chainstate->m_disabled = true; + this->MaybeRebalanceCaches(); + + return SnapshotCompletionResult::SUCCESS; +} + Chainstate& ChainstateManager::ActiveChainstate() const { LOCK(::cs_main); @@ -5367,6 +5558,44 @@ bool IsBIP30Unspendable(const CBlockIndex& block_index) (block_index.nHeight==91812 && block_index.GetBlockHash() == uint256S("0x00000000000af0aed4792b1acee3d966af36cf5def14935db8de83d6f9306f2f")); } +void Chainstate::InvalidateCoinsDBOnDisk() +{ + AssertLockHeld(::cs_main); + // Should never be called on a non-snapshot chainstate. + assert(m_from_snapshot_blockhash); + auto storage_path_maybe = this->CoinsDB().StoragePath(); + // Should never be called with a non-existent storage path. + assert(storage_path_maybe); + fs::path snapshot_datadir = *storage_path_maybe; + + // Coins views no longer usable. 
+ m_coins_views.reset(); + + auto invalid_path = snapshot_datadir + "_INVALID"; + std::string dbpath = fs::PathToString(snapshot_datadir); + std::string target = fs::PathToString(invalid_path); + LogPrintf("[snapshot] renaming snapshot datadir %s to %s\n", dbpath, target); + + // The invalid snapshot datadir is simply moved and not deleted because we may + // want to do forensics later during issue investigation. The user is instructed + // accordingly in MaybeCompleteSnapshotValidation(). + try { + fs::rename(snapshot_datadir, invalid_path); + } catch (const fs::filesystem_error& e) { + auto src_str = fs::PathToString(snapshot_datadir); + auto dest_str = fs::PathToString(invalid_path); + + LogPrintf("%s: error renaming file '%s' -> '%s': %s\n", + __func__, src_str, dest_str, e.what()); + AbortNode(strprintf( + "Rename of '%s' -> '%s' failed. " + "You should resolve this by manually moving or deleting the invalid " + "snapshot directory %s, otherwise you will encounter the same error again " + "on the next startup.", + src_str, dest_str, src_str)); + } +} + const CBlockIndex* ChainstateManager::GetSnapshotBaseBlock() const { const auto blockhash_op = this->SnapshotBlockhash(); @@ -5379,3 +5608,90 @@ std::optional<int> ChainstateManager::GetSnapshotBaseHeight() const const CBlockIndex* base = this->GetSnapshotBaseBlock(); return base ? std::make_optional(base->nHeight) : std::nullopt; } + +bool ChainstateManager::ValidatedSnapshotCleanup() +{ + AssertLockHeld(::cs_main); + auto get_storage_path = [](auto& chainstate) EXCLUSIVE_LOCKS_REQUIRED(::cs_main) -> std::optional<fs::path> { + if (!(chainstate && chainstate->HasCoinsViews())) { + return {}; + } + return chainstate->CoinsDB().StoragePath(); + }; + std::optional<fs::path> ibd_chainstate_path_maybe = get_storage_path(m_ibd_chainstate); + std::optional<fs::path> snapshot_chainstate_path_maybe = get_storage_path(m_snapshot_chainstate); + + if (!this->IsSnapshotValidated()) { + // No need to clean up. 
+ return false; + } + // If either path doesn't exist, that means at least one of the chainstates + // is in-memory, in which case we can't do on-disk cleanup. You'd better be + // in a unittest! + if (!ibd_chainstate_path_maybe || !snapshot_chainstate_path_maybe) { + LogPrintf("[snapshot] snapshot chainstate cleanup cannot happen with " /* Continued */ + "in-memory chainstates. You are testing, right?\n"); + return false; + } + + const auto& snapshot_chainstate_path = *snapshot_chainstate_path_maybe; + const auto& ibd_chainstate_path = *ibd_chainstate_path_maybe; + + // Since we're going to be moving around the underlying leveldb filesystem content + // for each chainstate, make sure that the chainstates (and their constituent + // CoinsViews members) have been destructed first. + // + // The caller of this method will be responsible for reinitializing chainstates + // if they want to continue operation. + this->ResetChainstates(); + + // No chainstates should be considered usable. + assert(this->GetAll().size() == 0); + + LogPrintf("[snapshot] deleting background chainstate directory (now unnecessary) (%s)\n", + fs::PathToString(ibd_chainstate_path)); + + fs::path tmp_old{ibd_chainstate_path + "_todelete"}; + + auto rename_failed_abort = []( + fs::path p_old, + fs::path p_new, + const fs::filesystem_error& err) { + LogPrintf("%s: error renaming file (%s): %s\n", + __func__, fs::PathToString(p_old), err.what()); + AbortNode(strprintf( + "Rename of '%s' -> '%s' failed. 
" + "Cannot clean up the background chainstate leveldb directory.", + fs::PathToString(p_old), fs::PathToString(p_new))); + }; + + try { + fs::rename(ibd_chainstate_path, tmp_old); + } catch (const fs::filesystem_error& e) { + rename_failed_abort(ibd_chainstate_path, tmp_old, e); + throw; + } + + LogPrintf("[snapshot] moving snapshot chainstate (%s) to " /* Continued */ + "default chainstate directory (%s)\n", + fs::PathToString(snapshot_chainstate_path), fs::PathToString(ibd_chainstate_path)); + + try { + fs::rename(snapshot_chainstate_path, ibd_chainstate_path); + } catch (const fs::filesystem_error& e) { + rename_failed_abort(snapshot_chainstate_path, ibd_chainstate_path, e); + throw; + } + + if (!DeleteCoinsDBFromDisk(tmp_old, /*is_snapshot=*/false)) { + // No need to AbortNode because once the unneeded bg chainstate data is + // moved, it will not interfere with subsequent initialization. + LogPrintf("Deletion of %s failed. Please remove it manually, as the " /* Continued */ + "directory is now unnecessary.\n", + fs::PathToString(tmp_old)); + } else { + LogPrintf("[snapshot] deleted background chainstate directory (%s)\n", + fs::PathToString(ibd_chainstate_path)); + } + return true; +} diff --git a/src/validation.h b/src/validation.h index f809c728b6..9ba35b538d 100644 --- a/src/validation.h +++ b/src/validation.h @@ -24,6 +24,7 @@ #include <policy/packages.h> #include <policy/policy.h> #include <script/script_error.h> +#include <shutdown.h> #include <sync.h> #include <txdb.h> #include <txmempool.h> // For CTxMemPool::cs @@ -663,6 +664,12 @@ public: * May not be called with cs_main held. May not be called in a * validationinterface callback. 
* + * Note that if this is called while a snapshot chainstate is active, and if + * it is called on a background chainstate whose tip has reached the base block + * of the snapshot, its execution will take *MINUTES* while it hashes the + * background UTXO set to verify the assumeutxo value the snapshot was activated + * with. `cs_main` will be held during this time. + * * @returns true unless a system error occurred */ bool ActivateBestChain( @@ -784,9 +791,37 @@ private: std::chrono::microseconds m_last_write{0}; std::chrono::microseconds m_last_flush{0}; + /** + * In case of an invalid snapshot, rename the coins leveldb directory so + * that it can be examined for issue diagnosis. + */ + void InvalidateCoinsDBOnDisk() EXCLUSIVE_LOCKS_REQUIRED(::cs_main); + friend ChainstateManager; }; + +enum class SnapshotCompletionResult { + SUCCESS, + SKIPPED, + + // Expected assumeutxo configuration data is not found for the height of the + // base block. + MISSING_CHAINPARAMS, + + // Failed to generate UTXO statistics (to check UTXO set hash) for the background + // chainstate. + STATS_FAILED, + + // The UTXO set hash of the background validation chainstate does not match + // the one expected by assumeutxo chainparams. + HASH_MISMATCH, + + // The blockhash of the current tip of the background validation chainstate does + // not match the one expected by the snapshot chainstate. + BASE_BLOCKHASH_MISMATCH, +}; + /** * Provides an interface for creating and interacting with one or two * chainstates: an IBD chainstate generated by downloading blocks, and @@ -984,6 +1019,18 @@ public: [[nodiscard]] bool ActivateSnapshot( AutoFile& coins_file, const node::SnapshotMetadata& metadata, bool in_memory); + //! Once the background validation chainstate has reached the height which + //! is the base of the UTXO snapshot in use, compare its coins to ensure + //! they match those expected by the snapshot. + //! + //! 
If the coins match (expected), then mark the validation chainstate for + //! deletion and continue using the snapshot chainstate as active. + //! Otherwise, revert to using the ibd chainstate and shutdown. + SnapshotCompletionResult MaybeCompleteSnapshotValidation( + std::function<void(bilingual_str)> shutdown_fnc = + [](bilingual_str msg) { AbortNode(msg.original, msg); }) + EXCLUSIVE_LOCKS_REQUIRED(::cs_main); + //! The most-work chain. Chainstate& ActiveChainstate() const; CChain& ActiveChain() const EXCLUSIVE_LOCKS_REQUIRED(GetMutex()) { return ActiveChainstate().m_chain; } @@ -1091,6 +1138,17 @@ public: Chainstate& ActivateExistingSnapshot(CTxMemPool* mempool, uint256 base_blockhash) EXCLUSIVE_LOCKS_REQUIRED(::cs_main); + //! If we have validated a snapshot chain during this runtime, copy its + //! chainstate directory over to the main `chainstate` location, completing + //! validation of the snapshot. + //! + //! If the cleanup succeeds, the caller will need to ensure chainstates are + //! reinitialized, since ResetChainstates() will be called before leveldb + //! directories are moved or deleted. + //! + //! @sa node/chainstate:LoadChainstate() + bool ValidatedSnapshotCleanup() EXCLUSIVE_LOCKS_REQUIRED(::cs_main); + ~ChainstateManager(); }; |