From a4cfc02e9d43487a388d4a7996553ae7c4479e57 Mon Sep 17 00:00:00 2001 From: Ruben Bridgewater Date: Tue, 3 Feb 2026 14:02:15 +0100 Subject: [PATCH 1/5] fix(wall): bound CPED freelist with dynamic budget + hysteresis This implements a lower and upper bound to match different user load situations. Refs: https://github.com/DataDog/dd-trace-js/issues/7355 --- bindings/profilers/wall.cc | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/bindings/profilers/wall.cc b/bindings/profilers/wall.cc index 80be444e..4ce88f93 100644 --- a/bindings/profilers/wall.cc +++ b/bindings/profilers/wall.cc @@ -148,6 +148,39 @@ class PersistentContextPtr { void WallProfiler::MarkDeadPersistentContextPtr(PersistentContextPtr* ptr) { deadContextPtrs_.push_back(ptr); liveContextPtrs_.erase(ptr); + // Cap freelist growth by a dynamic byte budget based on live async contexts. + constexpr size_t kMinDeadContextPtrBudgetBytes = 512 * 1024; // 512 KiB + constexpr size_t kMaxDeadContextPtrBudgetBytes = 16 * 1024 * 1024; // 16 MiB + constexpr size_t kDeadContextPtrMultiplier = 2; + const size_t perPtrBytes = sizeof(PersistentContextPtr); + size_t maxDeadContextPtrs = kMaxDeadContextPtrBudgetBytes / perPtrBytes; + size_t minDeadContextPtrs = kMinDeadContextPtrBudgetBytes / perPtrBytes; + if (minDeadContextPtrs > maxDeadContextPtrs) { + minDeadContextPtrs = maxDeadContextPtrs; + } + + const size_t liveCount = liveContextPtrs_.size(); + size_t targetDeadContextPtrs; + if (liveCount >= maxDeadContextPtrs / kDeadContextPtrMultiplier) { + targetDeadContextPtrs = maxDeadContextPtrs; + } else { + targetDeadContextPtrs = liveCount * kDeadContextPtrMultiplier; + if (targetDeadContextPtrs < minDeadContextPtrs) { + targetDeadContextPtrs = minDeadContextPtrs; + } + } + + const size_t shrinkThreshold = + targetDeadContextPtrs + targetDeadContextPtrs / 2; // 1.5x hysteresis + if (deadContextPtrs_.size() <= shrinkThreshold) { + return; + } + + while (deadContextPtrs_.size() > 
targetDeadContextPtrs) { + auto* toDelete = deadContextPtrs_.front(); + deadContextPtrs_.pop_front(); + delete toDelete; + } } // Maximum number of rounds in the GetV8ToEpochOffset From 4ed68b0fe1082bdf536ead498c1625c7a52f3f31 Mon Sep 17 00:00:00 2001 From: Ruben Bridgewater Date: Tue, 3 Feb 2026 14:05:49 +0100 Subject: [PATCH 2/5] fix(wall): add bounded batch trimming + track dead counts per GC --- bindings/profilers/wall.cc | 21 ++++++++++++++++----- bindings/profilers/wall.hh | 5 +++++ 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/bindings/profilers/wall.cc b/bindings/profilers/wall.cc index 4ce88f93..029690ba 100644 --- a/bindings/profilers/wall.cc +++ b/bindings/profilers/wall.cc @@ -176,10 +176,15 @@ void WallProfiler::MarkDeadPersistentContextPtr(PersistentContextPtr* ptr) { return; } - while (deadContextPtrs_.size() > targetDeadContextPtrs) { + size_t toTrim = deadContextPtrs_.size() - targetDeadContextPtrs; + if (toTrim > trimBatch_) { + toTrim = trimBatch_; + } + while (toTrim > 0) { auto* toDelete = deadContextPtrs_.front(); deadContextPtrs_.pop_front(); delete toDelete; + --toTrim; } } @@ -1537,11 +1542,17 @@ void WallProfiler::OnGCStart(v8::Isolate* isolate) { void WallProfiler::OnGCEnd() { auto oldCount = gcCount.fetch_sub(1, std::memory_order_relaxed); - if (oldCount == 1 && useCPED_) { - // Not strictly necessary, as we'll reset it to something else on next GC, - // but why retain it longer than needed? - gcContext_.reset(); + if (oldCount != 1 || !useCPED_) { + return; } + + // Not strictly necessary, as we'll reset it to something else on next GC, + // but why retain it longer than needed? 
+ gcContext_.reset(); + + const size_t deadCount = deadContextPtrs_.size(); + deadCountAtPrevGc_ = deadCountAtLastGc_; + deadCountAtLastGc_ = deadCount; } void WallProfiler::PushContext(int64_t time_from, diff --git a/bindings/profilers/wall.hh b/bindings/profilers/wall.hh index e7c224b2..7c167d90 100644 --- a/bindings/profilers/wall.hh +++ b/bindings/profilers/wall.hh @@ -65,6 +65,11 @@ class WallProfiler : public Nan::ObjectWrap { // Context pointers belonging to GC'd CPED objects register themselves here. // They will be reused. std::deque deadContextPtrs_; + static constexpr size_t kTrimBatchMin = 32; + static constexpr size_t kTrimBatchMax = 1024; + size_t trimBatch_ = kTrimBatchMin; + size_t deadCountAtLastGc_ = 0; + size_t deadCountAtPrevGc_ = 0; std::atomic gcCount = 0; std::atomic setInProgress_ = false; From 034de53922980c19ec2e860490858e9c550e4c46 Mon Sep 17 00:00:00 2001 From: Ruben Bridgewater Date: Tue, 3 Feb 2026 14:07:03 +0100 Subject: [PATCH 3/5] fix(wall): adapt trim batch with growth/decay + emergency override --- bindings/profilers/wall.cc | 28 +++++++++++++++++++++++++++- bindings/profilers/wall.hh | 2 ++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/bindings/profilers/wall.cc b/bindings/profilers/wall.cc index 029690ba..4e9aaec2 100644 --- a/bindings/profilers/wall.cc +++ b/bindings/profilers/wall.cc @@ -176,8 +176,9 @@ void WallProfiler::MarkDeadPersistentContextPtr(PersistentContextPtr* ptr) { return; } + const size_t emergencyThreshold = maxDeadContextPtrs * 2; // 2x max size_t toTrim = deadContextPtrs_.size() - targetDeadContextPtrs; - if (toTrim > trimBatch_) { + if (deadContextPtrs_.size() <= emergencyThreshold && toTrim > trimBatch_) { toTrim = trimBatch_; } while (toTrim > 0) { @@ -1553,6 +1554,31 @@ void WallProfiler::OnGCEnd() { const size_t deadCount = deadContextPtrs_.size(); deadCountAtPrevGc_ = deadCountAtLastGc_; deadCountAtLastGc_ = deadCount; + if (deadCountAtLastGc_ > deadCountAtPrevGc_) { + deadStableCycles_ 
= 0; + if (trimBatch_ < kTrimBatchMax) { + if (++deadGrowthCycles_ >= 2) { + const size_t doubled = trimBatch_ * 2; + trimBatch_ = doubled > kTrimBatchMax ? kTrimBatchMax : doubled; + deadGrowthCycles_ = 0; + } + } else { + deadGrowthCycles_ = 0; + } + } else { + deadGrowthCycles_ = 0; + if (trimBatch_ > kTrimBatchMin) { + if (++deadStableCycles_ >= 3) { + trimBatch_ = trimBatch_ / 2; + if (trimBatch_ < kTrimBatchMin) { + trimBatch_ = kTrimBatchMin; + } + deadStableCycles_ = 0; + } + } else { + deadStableCycles_ = 0; + } + } } void WallProfiler::PushContext(int64_t time_from, diff --git a/bindings/profilers/wall.hh b/bindings/profilers/wall.hh index 7c167d90..d60bcdf7 100644 --- a/bindings/profilers/wall.hh +++ b/bindings/profilers/wall.hh @@ -70,6 +70,8 @@ class WallProfiler : public Nan::ObjectWrap { size_t trimBatch_ = kTrimBatchMin; size_t deadCountAtLastGc_ = 0; size_t deadCountAtPrevGc_ = 0; + unsigned int deadGrowthCycles_ = 0; + unsigned int deadStableCycles_ = 0; std::atomic gcCount = 0; std::atomic setInProgress_ = false; From e3edd0c2da08065a6a20dda4fec44e810b9ea04d Mon Sep 17 00:00:00 2001 From: Ruben Bridgewater Date: Tue, 3 Feb 2026 17:35:38 +0100 Subject: [PATCH 4/5] test: add memory leak test This verifies that it fails before and is fixed with the change. --- ts/test/cped-freelist-regression-child.ts | 121 ++++++++++++++++++++++ ts/test/test-cped-freelist-trimming.ts | 64 ++++++++++++ 2 files changed, 185 insertions(+) create mode 100644 ts/test/cped-freelist-regression-child.ts create mode 100644 ts/test/test-cped-freelist-trimming.ts diff --git a/ts/test/cped-freelist-regression-child.ts b/ts/test/cped-freelist-regression-child.ts new file mode 100644 index 00000000..eb840ea0 --- /dev/null +++ b/ts/test/cped-freelist-regression-child.ts @@ -0,0 +1,121 @@ +/** + * Child process entrypoint for CPED freelist trimming regression test. + * + * This file is intentionally not named `test-*.ts` so mocha won't execute it + * directly. 
It is executed as a standalone Node.js script from the test suite. + */ + +import assert from 'assert'; +import {AsyncLocalStorage} from 'async_hooks'; +import {satisfies} from 'semver'; + +// Require from the built output to match how tests run in CI (out/test/*). +// eslint-disable-next-line @typescript-eslint/no-var-requires +const {time} = require('../src'); + +function isUseCPEDEnabled(): boolean { + return ( + (satisfies(process.versions.node, '>=24.0.0') && + !process.execArgv.includes('--no-async-context-frame')) || + (satisfies(process.versions.node, '>=22.7.0') && + process.execArgv.includes('--experimental-async-context-frame')) + ); +} + +async function main() { + if (process.platform !== 'darwin' && process.platform !== 'linux') { + return; // unsupported in this repo's time profiler tests + } + + // This regression targets the CPED path. + const useCPED = isUseCPEDEnabled(); + if (!useCPED) return; + + const gc = global.gc; + if (typeof gc !== 'function') { + throw new Error('expected --expose-gc'); + } + const runGc = gc as () => void; + + // Ensure an async context frame exists to hold the profiler context. 
+ new AsyncLocalStorage().enterWith(1); + + time.start({ + intervalMicros: 1000, + durationMillis: 10_000, + withContexts: true, + lineNumbers: false, + useCPED: true, + }); + + const als = new AsyncLocalStorage(); + const sharedContext = {v: 1}; + + const testCapCount = Number.parseInt( + process.env.DD_PPROF_TEST_FREELIST_CAP_COUNT || '', + 10 + ); + assert( + Number.isFinite(testCapCount) && testCapCount > 0, + 'DD_PPROF_TEST_FREELIST_CAP_COUNT must be set for this test' + ); + const waveSize = Math.max(testCapCount * 10, 20_000); + const minDelta = Math.max(testCapCount * 4, 5_000); + const debug = process.env.DEBUG_CPED_TEST === '1'; + const log = (...args: unknown[]) => { + if (debug) { + // eslint-disable-next-line no-console + console.error(...args); + } + }; + + async function gcAndYield(times = 3) { + for (let i = 0; i < times; i++) { + runGc(); + await new Promise(resolve => setImmediate(resolve)); + } + } + + async function runWave(count: number): Promise { + const tasks: Array> = []; + for (let i = 0; i < count; i++) { + const value = i; + tasks.push( + als.run(value, async () => { + await new Promise(resolve => setTimeout(resolve, 0)); + time.setContext({v: value}); + }) + ); + } + await Promise.all(tasks); + } + + const baseline = time.getMetrics().totalAsyncContextCount; + await runWave(waveSize); + const metricsBeforeGc = time.getMetrics(); + const totalBeforeGc = metricsBeforeGc.totalAsyncContextCount; + log('baseline', baseline, 'metricsBeforeGc', metricsBeforeGc); + assert( + totalBeforeGc - baseline >= minDelta, + `test did not create enough async contexts (baseline=${baseline}, total=${totalBeforeGc})` + ); + + await gcAndYield(6); + const metricsAfterGc = time.getMetrics(); + const totalAfterGc = metricsAfterGc.totalAsyncContextCount; + log('metricsAfterGc', metricsAfterGc); + const capWithSlack = Math.ceil(testCapCount * 1.6); + assert( + totalAfterGc <= capWithSlack, + `expected trimming; before=${totalBeforeGc}, after=${totalAfterGc}, 
cap=${capWithSlack}` + ); + + time.stop(false); +} + +main().catch(err => { + // Ensure the child exits non-zero on failure. + // eslint-disable-next-line no-console + console.error(err); + process.exit(1); +}); diff --git a/ts/test/test-cped-freelist-trimming.ts b/ts/test/test-cped-freelist-trimming.ts new file mode 100644 index 00000000..cad39b60 --- /dev/null +++ b/ts/test/test-cped-freelist-trimming.ts @@ -0,0 +1,64 @@ +/** + * Regression test for CPED context pointer freelist growth. + * + * Runs the actual workload in a separate Node.js process launched with + * `--expose-gc` so we can force GC deterministically. + */ + +import assert from 'assert'; +import {spawnSync} from 'child_process'; +import path from 'path'; +import {satisfies} from 'semver'; + +describe('CPED freelist trimming (regression)', () => { + it('should plateau total async context pointers after enough churn', function () { + this.timeout(120_000); + + if (process.platform !== 'darwin' && process.platform !== 'linux') { + this.skip(); + } + + const supportsCPED = + satisfies(process.versions.node, '>=24.0.0') || + satisfies(process.versions.node, '>=22.7.0'); + if (!supportsCPED) { + this.skip(); + } + + const gcCheck = spawnSync( + process.execPath, + [ + '--expose-gc', + '-e', + "process.exit(typeof global.gc === 'function' ? 
0 : 1)", + ], + {stdio: 'pipe'} + ); + if (gcCheck.status !== 0) { + this.skip(); + } + + const child = path.join(__dirname, 'cped-freelist-regression-child.js'); + const args = ['--expose-gc', '--max-old-space-size=4096']; + if ( + satisfies(process.versions.node, '>=22.7.0') && + satisfies(process.versions.node, '<24.0.0') + ) { + args.push('--experimental-async-context-frame'); + } + args.push(child); + const res = spawnSync(process.execPath, args, { + stdio: 'inherit', + env: { + ...process.env, + NODE_GYP_BUILD_FROM_SOURCE: '1', + npm_config_build_from_source: '1', + DD_PPROF_TEST_FREELIST_CAP_COUNT: '2000', + }, + }); + + // If the child process exits non-zero, fail with a helpful message. + assert.strictEqual(res.error, undefined); + assert.strictEqual(res.status, 0, `child exited with status ${res.status}`); + }); +}); From b27e227c4de1dd24eb746b52d096c4960c3731aa Mon Sep 17 00:00:00 2001 From: Ruben Bridgewater Date: Tue, 3 Feb 2026 20:13:12 +0100 Subject: [PATCH 5/5] fixup! --- bindings/profilers/wall.cc | 4 +-- ts/test/cped-freelist-regression-child.ts | 36 +++++++++++++---------- ts/test/test-cped-freelist-trimming.ts | 6 ---- 3 files changed, 22 insertions(+), 24 deletions(-) diff --git a/bindings/profilers/wall.cc b/bindings/profilers/wall.cc index 4e9aaec2..e2c24817 100644 --- a/bindings/profilers/wall.cc +++ b/bindings/profilers/wall.cc @@ -149,8 +149,8 @@ void WallProfiler::MarkDeadPersistentContextPtr(PersistentContextPtr* ptr) { deadContextPtrs_.push_back(ptr); liveContextPtrs_.erase(ptr); // Cap freelist growth by a dynamic byte budget based on live async contexts. 
- constexpr size_t kMinDeadContextPtrBudgetBytes = 512 * 1024; // 512 KiB - constexpr size_t kMaxDeadContextPtrBudgetBytes = 16 * 1024 * 1024; // 16 MiB + constexpr size_t kMinDeadContextPtrBudgetBytes = 512 * 1024; // 512 KiB + constexpr size_t kMaxDeadContextPtrBudgetBytes = 16 * 1024 * 1024; // 16 MiB constexpr size_t kDeadContextPtrMultiplier = 2; const size_t perPtrBytes = sizeof(PersistentContextPtr); size_t maxDeadContextPtrs = kMaxDeadContextPtrBudgetBytes / perPtrBytes; diff --git a/ts/test/cped-freelist-regression-child.ts b/ts/test/cped-freelist-regression-child.ts index eb840ea0..988a7a8d 100644 --- a/ts/test/cped-freelist-regression-child.ts +++ b/ts/test/cped-freelist-regression-child.ts @@ -49,18 +49,11 @@ async function main() { }); const als = new AsyncLocalStorage(); - const sharedContext = {v: 1}; - const testCapCount = Number.parseInt( - process.env.DD_PPROF_TEST_FREELIST_CAP_COUNT || '', - 10 - ); - assert( - Number.isFinite(testCapCount) && testCapCount > 0, - 'DD_PPROF_TEST_FREELIST_CAP_COUNT must be set for this test' - ); - const waveSize = Math.max(testCapCount * 10, 20_000); - const minDelta = Math.max(testCapCount * 4, 5_000); + const waveSize = 20_000; + const maxWaves = 6; + const minDelta = 5_000; + const minTotalBeforeGc = 40_000; const debug = process.env.DEBUG_CPED_TEST === '1'; const log = (...args: unknown[]) => { if (debug) { @@ -91,23 +84,33 @@ async function main() { } const baseline = time.getMetrics().totalAsyncContextCount; - await runWave(waveSize); + let totalBeforeGc = baseline; + let wavesRun = 0; + while (wavesRun < maxWaves && totalBeforeGc < minTotalBeforeGc) { + await runWave(waveSize); + totalBeforeGc = time.getMetrics().totalAsyncContextCount; + wavesRun++; + log('wave', wavesRun, 'totalBeforeGc', totalBeforeGc); + } const metricsBeforeGc = time.getMetrics(); - const totalBeforeGc = metricsBeforeGc.totalAsyncContextCount; log('baseline', baseline, 'metricsBeforeGc', metricsBeforeGc); assert( totalBeforeGc - 
baseline >= minDelta, `test did not create enough async contexts (baseline=${baseline}, total=${totalBeforeGc})` ); + assert( + totalBeforeGc >= minTotalBeforeGc, + `test did not reach target async context count (total=${totalBeforeGc})` + ); await gcAndYield(6); const metricsAfterGc = time.getMetrics(); const totalAfterGc = metricsAfterGc.totalAsyncContextCount; log('metricsAfterGc', metricsAfterGc); - const capWithSlack = Math.ceil(testCapCount * 1.6); + const maxAllowed = Math.floor(totalBeforeGc * 0.75); assert( - totalAfterGc <= capWithSlack, - `expected trimming; before=${totalBeforeGc}, after=${totalAfterGc}, cap=${capWithSlack}` + totalAfterGc <= maxAllowed, + `expected trimming; before=${totalBeforeGc}, after=${totalAfterGc}, max=${maxAllowed}` ); time.stop(false); @@ -117,5 +120,6 @@ main().catch(err => { // Ensure the child exits non-zero on failure. // eslint-disable-next-line no-console console.error(err); + // eslint-disable-next-line no-process-exit process.exit(1); }); diff --git a/ts/test/test-cped-freelist-trimming.ts b/ts/test/test-cped-freelist-trimming.ts index cad39b60..f8863c25 100644 --- a/ts/test/test-cped-freelist-trimming.ts +++ b/ts/test/test-cped-freelist-trimming.ts @@ -49,12 +49,6 @@ describe('CPED freelist trimming (regression)', () => { args.push(child); const res = spawnSync(process.execPath, args, { stdio: 'inherit', - env: { - ...process.env, - NODE_GYP_BUILD_FROM_SOURCE: '1', - npm_config_build_from_source: '1', - DD_PPROF_TEST_FREELIST_CAP_COUNT: '2000', - }, }); // If the child process exits non-zero, fail with a helpful message.