From 1dde4b8d2e9cab5ded83045ccf6e3b20f60a2af8 Mon Sep 17 00:00:00 2001 From: Garand Tyson Date: Wed, 18 Feb 2026 15:29:35 -0800 Subject: [PATCH] Make background apply compatible with RUN_STANDALONE --- docs/apply-load-max-sac-tps.cfg | 1 - src/bucket/test/BucketManagerTests.cpp | 4 ++++ src/herder/test/HerderTests.cpp | 5 +++++ src/herder/test/TransactionQueueTests.cpp | 6 ++++++ src/history/test/HistoryTests.cpp | 14 +++++++++++++ .../test/ConservationOfLumensTests.cpp | 8 +++++++ src/ledger/LedgerManagerImpl.cpp | 11 ++++++++++ .../test/LedgerCloseMetaStreamTests.cpp | 7 +++++++ src/main/ApplicationImpl.cpp | 9 ++++++++ src/main/CommandLine.cpp | 3 +++ src/main/Config.cpp | 21 +++++-------------- src/overlay/test/TCPPeerTests.cpp | 7 +++++-- src/transactions/test/InflationTests.cpp | 4 ++++ src/util/Timer.cpp | 20 +++++++++++++++--- 14 files changed, 98 insertions(+), 22 deletions(-) diff --git a/docs/apply-load-max-sac-tps.cfg b/docs/apply-load-max-sac-tps.cfg index bc937898aa..1f00d6b67a 100644 --- a/docs/apply-load-max-sac-tps.cfg +++ b/docs/apply-load-max-sac-tps.cfg @@ -39,7 +39,6 @@ APPLY_LOAD_BL_LAST_BATCH_LEDGERS = 0 # Minimal core config boilerplate RUN_STANDALONE=true -PARALLEL_LEDGER_APPLY=false NODE_IS_VALIDATOR=true UNSAFE_QUORUM=true NETWORK_PASSPHRASE="Standalone Network ; February 2017" diff --git a/src/bucket/test/BucketManagerTests.cpp b/src/bucket/test/BucketManagerTests.cpp index 281bcd3d5c..68143219d2 100644 --- a/src/bucket/test/BucketManagerTests.cpp +++ b/src/bucket/test/BucketManagerTests.cpp @@ -692,6 +692,10 @@ TEST_CASE_VERSIONS( cfg.MAX_CONCURRENT_SUBPROCESSES = 1; cfg.ARTIFICIALLY_ACCELERATE_TIME_FOR_TESTING = true; cfg.ARTIFICIALLY_PESSIMIZE_MERGES_FOR_TESTING = true; + // Test loop calls forgetUnreferencedBuckets and + // setNextLedgerEntryBatchForBucketTesting while ledgers close + // automatically, which races with background apply. + cfg.PARALLEL_LEDGER_APPLY = false; stellar::historytestutils::TmpDirHistoryConfigurator tcfg; cfg = tcfg.configure(cfg, true); diff --git a/src/herder/test/HerderTests.cpp b/src/herder/test/HerderTests.cpp index 71b2b8b86d..ca857813f2 100644 --- a/src/herder/test/HerderTests.cpp +++ b/src/herder/test/HerderTests.cpp @@ -5282,6 +5282,11 @@ externalize(SecretKey const& sk, LedgerManager& lm, HerderImpl& herder, xdr::xvector{}, sk); herder.getHerderSCPDriver().valueExternalized(ledgerSeq, xdr::xdr_to_opaque(sv)); + // With background apply, crank until the ledger is fully applied + while (lm.getLastClosedLedgerNum() < ledgerSeq) + { + app.getClock().crank(true); + } } TEST_CASE("do not flood invalid transactions", "[herder]") diff --git a/src/herder/test/TransactionQueueTests.cpp b/src/herder/test/TransactionQueueTests.cpp index b5dc409889..ec2b84a062 100644 --- a/src/herder/test/TransactionQueueTests.cpp +++ b/src/herder/test/TransactionQueueTests.cpp @@ -3069,6 +3069,12 @@ TEST_CASE("remove applied", "[herder][transactionqueue]") app->getConfig().NODE_SEED); herder.getHerderSCPDriver().valueExternalized(ledgerSeq, xdr::xdr_to_opaque(sv)); + + // With background apply, crank until the ledger is fully applied + while (lm.getLastClosedLedgerNum() < ledgerSeq) + { + clock.crank(true); + } } REQUIRE(tq.getTransactions({}).size() == 1); diff --git a/src/history/test/HistoryTests.cpp b/src/history/test/HistoryTests.cpp index 305f29ff00..28ec0cf1e1 100644 --- a/src/history/test/HistoryTests.cpp +++ b/src/history/test/HistoryTests.cpp @@ -1599,6 +1599,14 @@ TEST_CASE_VERSIONS( while (hm.getPublishQueueCount() != 1) { + // With background apply, wait for any in-progress + // ledger close to finish before writing to the shared + // test-entry vectors that finalizeLedgerTxnChanges + // reads on the apply thread. + while (lm.isApplying()) + { + clock.crank(true); + } auto lcl = lm.getLastClosedLedgerHeader(); lcl.header.ledgerSeq += 1; // Generate entries excluding soroban types to avoid worrying @@ -1975,6 +1983,12 @@ TEST_CASE("Introduce and fix gap without starting catchup", // Fill in the second gap. All buffered ledgers should be applied, but we // wait for another ledger to close to get in sync catchupSimulation.externalizeLedger(herder, nextLedger + 4); + + // With background apply, crank until all queued ledgers are applied + while (lm.getLastClosedLedgerNum() < nextLedger + 5) + { + app->getClock().crank(true); + } REQUIRE(lm.isSynced()); REQUIRE(lam.getLargestLedgerSeqHeard() == lm.getLastClosedLedgerNum()); REQUIRE(!lam.isCatchupInitialized()); diff --git a/src/invariant/test/ConservationOfLumensTests.cpp b/src/invariant/test/ConservationOfLumensTests.cpp index 93a7142158..b1cf3fb84c 100644 --- a/src/invariant/test/ConservationOfLumensTests.cpp +++ b/src/invariant/test/ConservationOfLumensTests.cpp @@ -315,6 +315,11 @@ TEST_CASE( { auto cfg = getTestConfig(); cfg.INVARIANT_CHECKS = {"ConservationOfLumens"}; + // This test directly modifies LedgerTxnRoot header (totalCoins), which + // creates a hash mismatch between the DB header and what SCP externalized. + // This is incompatible with background apply where the cross-check runs on + // a thread that doesn't have access to the cached LCL header. + cfg.PARALLEL_LEDGER_APPLY = false; SorobanTest test(cfg); @@ -378,6 +383,9 @@ TEST_CASE("ConservationOfLumens snapshot invariant detects bucket corruption", auto cfg = getTestConfig(); cfg.INVARIANT_CHECKS = {}; // Disable automatic invariant checks because we // will invoke it manually + // This test directly modifies LedgerTxnRoot header (totalCoins), which is + // incompatible with background apply (see comment in the test above). + cfg.PARALLEL_LEDGER_APPLY = false; VirtualClock clock; auto app = createTestApplication( diff --git a/src/ledger/LedgerManagerImpl.cpp b/src/ledger/LedgerManagerImpl.cpp index 33800cd267..9b71144657 100644 --- a/src/ledger/LedgerManagerImpl.cpp +++ b/src/ledger/LedgerManagerImpl.cpp @@ -1520,6 +1520,17 @@ LedgerManagerImpl::applyLedger(LedgerCloseData const& ledgerData, CLOG_ERROR(Ledger, "{}", xdrToCerealString(prevHeader, "Full LCL")); CLOG_ERROR(Ledger, "{}", POSSIBLY_CORRUPTED_LOCAL_DATA); +#ifdef BUILD_TESTS + if (!threadIsMain()) + { + throw std::runtime_error( + "txset mismatch on background apply thread. This usually means " + "a test directly modified the LedgerTxnRoot header (e.g. " + "totalCoins). Set cfg.PARALLEL_LEDGER_APPLY = false for such " + "tests."); + } +#endif + throw std::runtime_error("txset mismatch"); } diff --git a/src/ledger/test/LedgerCloseMetaStreamTests.cpp b/src/ledger/test/LedgerCloseMetaStreamTests.cpp index ac6552ec9c..a44374fcac 100644 --- a/src/ledger/test/LedgerCloseMetaStreamTests.cpp +++ b/src/ledger/test/LedgerCloseMetaStreamTests.cpp @@ -248,6 +248,13 @@ TEST_CASE("METADATA_DEBUG_LEDGERS works", "[metadebug]") { // Generate just enough meta to not triggers garbage collection closeLedgers(cfg.METADATA_DEBUG_LEDGERS); + + // Drain any remaining background apply before stopping, so the + // debug tx set file and LCL are consistent when we read them. + while (lm.isApplying()) + { + clock.crank(true); + } app->gracefulStop(); // Verify presence of the latest debug tx set diff --git a/src/main/ApplicationImpl.cpp b/src/main/ApplicationImpl.cpp index b1f9be9245..5562bdcf9a 100644 --- a/src/main/ApplicationImpl.cpp +++ b/src/main/ApplicationImpl.cpp @@ -987,6 +987,15 @@ ApplicationImpl::manualClose(std::optional const& manualLedgerSeq, if (mConfig.RUN_STANDALONE) { + // With background apply, triggerNextLedger posts work to the + // apply thread. Crank until the ledger is fully applied and + // LCL has advanced. + while (getLedgerManager().getLastClosedLedgerNum() < + targetLedgerSeq) + { + getClock().crank(true); + } + auto const newLedgerSeq = getLedgerManager().getLastClosedLedgerNum(); if (newLedgerSeq != targetLedgerSeq) diff --git a/src/main/CommandLine.cpp b/src/main/CommandLine.cpp index b562e71127..dd1ca00f5e 100644 --- a/src/main/CommandLine.cpp +++ b/src/main/CommandLine.cpp @@ -1960,6 +1960,9 @@ runApplyLoad(CommandLineArgs const& args) // Apply Load may exceed TX_SET byte size limits, so ignore them config.IGNORE_MESSAGE_LIMITS_FOR_TESTING = true; + + // Always use background ledger close for max-sac-tps + config.PARALLEL_LEDGER_APPLY = true; } VirtualClock clock(VirtualClock::REAL_TIME); diff --git a/src/main/Config.cpp b/src/main/Config.cpp index 2c7a44d2a2..464999c9f8 100644 --- a/src/main/Config.cpp +++ b/src/main/Config.cpp @@ -1966,19 +1966,10 @@ Config::processConfig(std::shared_ptr t) if (PARALLEL_LEDGER_APPLY && !parallelLedgerClose()) { - if (RUN_STANDALONE) - { - LOG_WARNING(DEFAULT_LOG, "RUN_STANDALONE is enabled, disabling " - "PARALLEL_LEDGER_APPLY"); - PARALLEL_LEDGER_APPLY = false; - } - else - { - std::string msg = - "Invalid configuration: PARALLEL_LEDGER_APPLY " - "does not support in-memory database modes."; - throw std::runtime_error(msg); - } + LOG_WARNING(DEFAULT_LOG, + "PARALLEL_LEDGER_APPLY is not supported with " + "in-memory SQLite, disabling."); + PARALLEL_LEDGER_APPLY = false; } if (INVARIANT_EXTRA_CHECKS && NODE_IS_VALIDATOR) @@ -2547,9 +2538,7 @@ Config::allBucketsInMemory() const bool Config::parallelLedgerClose() const { - // Standalone mode expects synchronous ledger application - return PARALLEL_LEDGER_APPLY && !RUN_STANDALONE && - DATABASE.value != "sqlite3://:memory:"; + return PARALLEL_LEDGER_APPLY && DATABASE.value != "sqlite3://:memory:"; } void diff --git a/src/overlay/test/TCPPeerTests.cpp b/src/overlay/test/TCPPeerTests.cpp index 226ef153fc..5431d9f759 100644 --- a/src/overlay/test/TCPPeerTests.cpp +++ b/src/overlay/test/TCPPeerTests.cpp @@ -163,7 +163,10 @@ TEST_CASE("TCPPeer read malformed messages", "[overlay]") v11SecretKey.getPublicKey()); s->startAllNodes(); s->stopOverlayTick(); - s->crankForAtLeast(std::chrono::seconds(5), false); + // Use generous timeouts: ARTIFICIALLY_SLEEP_MAIN_THREAD_FOR_TESTING + // adds 300ms per postOnMainThread callback. With background ledger + // apply, there are additional callbacks that each incur this sleep. + s->crankForAtLeast(std::chrono::seconds(15), false); auto p0 = n0->getOverlayManager().getConnectedPeer( PeerBareAddress{"127.0.0.1", n1->getConfig().PEER_PORT}); @@ -193,7 +196,7 @@ TEST_CASE("TCPPeer read malformed messages", "[overlay]") return !p0->isConnectedForTesting() && !p1->isConnectedForTesting(); }, - std::chrono::seconds(10), false); + std::chrono::seconds(30), false); REQUIRE(!p0->isConnectedForTesting()); REQUIRE(!p1->isConnectedForTesting()); REQUIRE(p1->getDropReason() == dropReason); diff --git a/src/transactions/test/InflationTests.cpp b/src/transactions/test/InflationTests.cpp index 2eab233559..de7adf2725 100644 --- a/src/transactions/test/InflationTests.cpp +++ b/src/transactions/test/InflationTests.cpp @@ -292,6 +292,10 @@ TEST_CASE_VERSIONS("inflation total coins", "[tx][inflation]") return; } + // This test directly modifies LedgerTxnRoot header (ledgerVersion), which + // creates a hash mismatch incompatible with background apply. + cfg.PARALLEL_LEDGER_APPLY = false; + // The math in this test assumes that every tx will be charged the fee it // specifies, but this isn't true for v11, so start from V0 and update the // version in the header diff --git a/src/util/Timer.cpp b/src/util/Timer.cpp index b9fd0bf986..b0b1e2a855 100644 --- a/src/util/Timer.cpp +++ b/src/util/Timer.cpp @@ -388,12 +388,26 @@ VirtualClock::crank(bool block) // Subtract out any timer cancellations from the above two steps. progressCount -= nRealTimerCancelEvents; + if (mMode == VIRTUAL_TIME && progressCount == 0 && mBackgroundWorkCount.load() == 0) { - // If we did nothing and we're in virtual mode, we're idle and can - // skip time forward, dispatching all timers at the next time-step. - progressCount += advanceToNext(); + // Check if there are pending actions from background threads + // before deciding to advance virtual time. Without this check, + // we might spuriously jump time forward while background apply + // results are waiting in the pending queue. + bool hasPendingActions = false; + { + std::lock_guard guard(mPendingActionQueueMutex); + hasPendingActions = !mPendingActionQueue.empty(); + } + if (!hasPendingActions) + { + // If we did nothing and we're in virtual mode, we're idle + // and can skip time forward, dispatching all timers at the + // next time-step. + progressCount += advanceToNext(); + } } }