From 24540580b566ceb6ff4bc86e96de812712023e7f Mon Sep 17 00:00:00 2001 From: Jussi Maki Date: Thu, 15 Jan 2026 10:58:42 +0100 Subject: [PATCH 1/3] statedb: Add commit-only benchmarks Add 1 and 100 table commit-only benchmark to test for impact of lots of registered tables on the cost of [dbRoot] cloning. Signed-off-by: Jussi Maki --- benchmarks_test.go | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/benchmarks_test.go b/benchmarks_test.go index 8b1b04d..336dcde 100644 --- a/benchmarks_test.go +++ b/benchmarks_test.go @@ -95,6 +95,29 @@ func BenchmarkDB_WriteTxn_100_SecondaryIndex(b *testing.B) { b.ReportMetric(float64(b.N)/b.Elapsed().Seconds(), "objects/sec") } +func BenchmarkDB_WriteTxn_CommitOnly_100Tables(b *testing.B) { + db := New(WithMetrics(&NopMetrics{})) + for i := range 99 { + newTestObjectTable(b, db, fmt.Sprintf("other%d", i)) + } + table := newTestObjectTable(b, db, "test", tagsIndex) + b.ResetTimer() + + for b.Loop() { + db.WriteTxn(table).Commit() + } +} + +func BenchmarkDB_WriteTxn_CommitOnly_1Table(b *testing.B) { + db := New(WithMetrics(&NopMetrics{})) + table := newTestObjectTable(b, db, "test", tagsIndex) + b.ResetTimer() + + for b.Loop() { + db.WriteTxn(table).Commit() + } +} + func BenchmarkDB_NewWriteTxn(b *testing.B) { db, table := newTestDBWithMetrics(b, &NopMetrics{}, tagsIndex) for b.Loop() { From 0bd6f943203903aca284bc6fe164c6f2d05930ac Mon Sep 17 00:00:00 2001 From: Jussi Maki Date: Thu, 15 Jan 2026 09:38:33 +0100 Subject: [PATCH 2/3] part: Pass Tree around by value This avoids allocating [part.Tree] in the heap and saves an allocation on the write path. Signed-off-by: Jussi Maki --- benchmarks_test.go | 8 -------- deletetracker.go | 3 ++- part/map.go | 36 ++++++++++++++++++++++-------------- part/map_test.go | 16 ++++++++-------- part/quick_test.go | 12 ++++++------ part/set.go | 30 +++++++++++++++++------------- part/tree.go | 16 +++++++++------- part/txn.go | 16 ++++++++++------ part_index.go | 26 +++++++++++++++----------- table.go | 3 ++- write_txn.go | 3 ++- 11 files changed, 93 insertions(+), 76 deletions(-) diff --git a/benchmarks_test.go b/benchmarks_test.go index 336dcde..317a566 100644 --- a/benchmarks_test.go +++ b/benchmarks_test.go @@ -101,7 +101,6 @@ func BenchmarkDB_WriteTxn_CommitOnly_100Tables(b *testing.B) { newTestObjectTable(b, db, fmt.Sprintf("other%d", i)) } table := newTestObjectTable(b, db, "test", tagsIndex) - b.ResetTimer() for b.Loop() { db.WriteTxn(table).Commit() @@ -111,7 +110,6 @@ func BenchmarkDB_WriteTxn_CommitOnly_100Tables(b *testing.B) { func BenchmarkDB_WriteTxn_CommitOnly_1Table(b *testing.B) { db := New(WithMetrics(&NopMetrics{})) table := newTestObjectTable(b, db, "test", tagsIndex) - b.ResetTimer() for b.Loop() { db.WriteTxn(table).Commit() @@ -131,7 +129,6 @@ func BenchmarkDB_WriteTxnCommit100(b *testing.B) { for i := range len(tables) { tables[i], _ = NewTable(db, fmt.Sprintf("test%d", i), idIndex) } - b.ResetTimer() for b.Loop() { db.WriteTxn(tables[len(tables)-1]).Commit() @@ -473,7 +470,6 @@ func BenchmarkDB_FullIteration_All(b *testing.B) { require.NoError(b, err) } wtxn.Commit() - b.ResetTimer() for b.Loop() { txn := db.ReadTxn() @@ -499,7 +495,6 @@ func BenchmarkDB_FullIteration_Prefix(b *testing.B) { require.NoError(b, err) } wtxn.Commit() - b.ResetTimer() query := Query[*testObject]{index: idIndex.indexName()} @@ -529,7 +524,6 @@ func BenchmarkDB_FullIteration_Get(b *testing.B) { require.NoError(b, err) } wtxn.Commit() - b.ResetTimer() txn := db.ReadTxn() for b.Loop() { @@ -554,7 +548,6 @@ func BenchmarkDB_FullIteration_Get_Secondary(b *testing.B) { require.NoError(b, err) } wtxn.Commit() - b.ResetTimer() txn := db.ReadTxn() for b.Loop() { @@ -580,7 +573,6 @@ func BenchmarkDB_FullIteration_ReadTxnGet(b *testing.B) { require.NoError(b, err) } wtxn.Commit() - b.ResetTimer() for b.Loop() { for _, q := range queries { diff --git a/deletetracker.go b/deletetracker.go index f0bc75e..8ced835 100644 --- a/deletetracker.go +++ b/deletetracker.go @@ -66,7 +66,8 @@ func (dt *deleteTracker[Obj]) close() { if !table.locked { panic("BUG: Table not locked") } - _, _, table.deleteTrackers = table.deleteTrackers.Delete([]byte(dt.trackerName)) + _, _, updated := table.deleteTrackers.Delete([]byte(dt.trackerName)) + table.deleteTrackers = &updated wtxn.Commit() db.metrics.DeleteTrackerCount(dt.table.Name(), table.deleteTrackers.Len()) diff --git a/part/map.go b/part/map.go index 9ab85a0..e975563 100644 --- a/part/map.go +++ b/part/map.go @@ -20,8 +20,9 @@ import ( // keys that are not []byte. type Map[K, V any] struct { bytesFromKeyFunc func(K) []byte - tree *Tree[mapKVPair[K, V]] singleton *mapKVPair[K, V] + tree Tree[mapKVPair[K, V]] + hasTree bool } type mapKVPair[K, V any] struct { @@ -59,8 +60,9 @@ func FromMap[K comparable, V any](m Map[K, V], hm map[K]V) Map[K, V] { // it is. The whole nil tree thing is to make sure that creating // an empty map does not allocate anything. func (m *Map[K, V]) ensureTree() { - if m.tree == nil { + if !m.hasTree { m.tree = New[mapKVPair[K, V]](RootOnlyWatch) + m.hasTree = true } } @@ -70,7 +72,7 @@ func (m Map[K, V]) Get(key K) (value V, found bool) { return m.singleton.Value, true } - if m.tree == nil { + if !m.hasTree { return } kv, _, found := m.tree.Get(m.keyToBytes(key)) @@ -81,7 +83,7 @@ func (m Map[K, V]) Get(key K) (value V, found bool) { // Original map is unchanged. func (m Map[K, V]) Set(key K, value V) Map[K, V] { keyBytes := m.keyToBytes(key) - if m.tree == nil && m.singleton == nil || m.singleton != nil && bytes.Equal(keyBytes, m.keyToBytes(m.singleton.Key)) { + if !m.hasTree && m.singleton == nil || m.singleton != nil && bytes.Equal(keyBytes, m.keyToBytes(m.singleton.Key)) { m.singleton = &mapKVPair[K, V]{key, value} return m } @@ -113,16 +115,18 @@ func (m Map[K, V]) Delete(key K) Map[K, V] { } return m } - if m.tree != nil { + if m.hasTree { txn := m.tree.Txn() txn.Delete(m.keyToBytes(key)) switch txn.Len() { case 0: - m.tree = nil + m.tree = Tree[mapKVPair[K, V]]{} + m.hasTree = false case 1: for _, v := range txn.Iterator().All { m.singleton = &v - m.tree = nil + m.tree = Tree[mapKVPair[K, V]]{} + m.hasTree = false } default: m.tree = txn.Commit() @@ -149,7 +153,7 @@ func (m Map[K, V]) LowerBound(from K) iter.Seq2[K, V] { return m.singletonIter() } } - if m.tree == nil { + if !m.hasTree { return toSeq2[K, V](Iterator[mapKVPair[K, V]]{}) } return toSeq2(m.tree.LowerBound(m.keyToBytes(from))) @@ -171,7 +175,7 @@ func (m Map[K, V]) Prefix(prefix K) iter.Seq2[K, V] { return m.singletonIter() } } - if m.tree == nil { + if !m.hasTree { return toSeq2[K, V](Iterator[mapKVPair[K, V]]{}) } iter, _ := m.tree.Prefix(m.keyToBytes(prefix)) @@ -185,7 +189,7 @@ func (m Map[K, V]) All() iter.Seq2[K, V] { if m.singleton != nil { return m.singletonIter() } - if m.tree == nil { + if !m.hasTree { return toSeq2[K, V](Iterator[mapKVPair[K, V]]{}) } return toSeq2(m.tree.Iterator()) @@ -198,7 +202,7 @@ func (m Map[K, V]) EqualKeys(other Map[K, V]) bool { return false case m.singleton != nil && other.singleton != nil: return bytes.Equal(m.keyToBytes(m.singleton.Key), other.keyToBytes(other.singleton.Key)) - case m.tree == nil && other.tree == nil: + case !m.hasTree && !other.hasTree: return true default: iter1 := m.tree.Iterator() @@ -228,7 +232,7 @@ func (m Map[K, V]) SlowEqual(other Map[K, V]) bool { case m.singleton != nil && other.singleton != nil: return bytes.Equal(m.keyToBytes(m.singleton.Key), other.keyToBytes(other.singleton.Key)) && reflect.DeepEqual(m.singleton.Value, other.singleton.Value) - case m.tree == nil && other.tree == nil: + case !m.hasTree && !other.hasTree: return true default: iter1 := m.tree.Iterator() @@ -253,14 +257,14 @@ func (m Map[K, V]) Len() int { if m.singleton != nil { return 1 } - if m.tree == nil { + if !m.hasTree { return 0 } return m.tree.size } func (m Map[K, V]) MarshalJSON() ([]byte, error) { - if m.tree == nil && m.singleton == nil { + if !m.hasTree && m.singleton == nil { return []byte("[]"), nil } @@ -416,6 +420,10 @@ func (txn MapTxn[K, V]) Commit() (m Map[K, V]) { m.singleton = &kv default: m.tree = txn.txn.Commit() + m.hasTree = true + } + if m.singleton != nil { + m.hasTree = false } return } diff --git a/part/map_test.go b/part/map_test.go index 83ae3a9..55e23fe 100644 --- a/part/map_test.go +++ b/part/map_test.go @@ -147,18 +147,18 @@ func TestSingletonMap(t *testing.T) { switch m.Len() { case 0: require.Nil(t, m.singleton) - require.Nil(t, m.tree) + require.False(t, m.hasTree) case 1: require.NotNil(t, m.singleton) - require.Nil(t, m.tree) + require.False(t, m.hasTree) default: require.Nil(t, m.singleton) - require.NotNil(t, m.tree) + require.True(t, m.hasTree) } if m.singleton != nil { - require.Nil(t, m.tree, "Tree should not be set if singleton set") + require.False(t, m.hasTree, "Tree should not be set if singleton set") } - if m.tree != nil { + if m.hasTree { require.Nil(t, m.singleton, "Singleton should not be set if tree set") } } @@ -290,7 +290,7 @@ func TestMapTxn(t *testing.T) { tree := txn.Commit() assert.Equal(t, 0, tree.Len()) - assert.Nil(t, tree.tree) + assert.False(t, tree.hasTree) assert.Nil(t, tree.singleton) // Add foo=>42 @@ -306,7 +306,7 @@ func TestMapTxn(t *testing.T) { tree = txn.Commit() assert.Equal(t, 1, tree.Len()) - assert.Nil(t, tree.tree) + assert.False(t, tree.hasTree) assert.NotNil(t, tree.singleton) v, found = tree.Get("foo") assert.True(t, found) @@ -350,7 +350,7 @@ func TestMapTxn(t *testing.T) { tree = txn.Commit() assert.Equal(t, 2, tree.Len()) - assert.NotNil(t, tree.tree) + assert.True(t, tree.hasTree) assert.Nil(t, tree.singleton) mp = maps.Collect(tree.All()) assert.Len(t, mp, 2) diff --git a/part/quick_test.go b/part/quick_test.go index 17dfe7d..f796b76 100644 --- a/part/quick_test.go +++ b/part/quick_test.go @@ -25,7 +25,7 @@ var quickConfig = &quick.Config{ func TestQuick_InsertGetPrefix(t *testing.T) { t.Parallel() - var tree *Tree[string] + var tree Tree[string] insert := func(key, value string) any { watchChannels := []<-chan struct{}{} // Add all possible watch channels for prefixes of the key @@ -260,18 +260,18 @@ func TestQuick_Map(t *testing.T) { switch m.Len() { case 0: require.Nil(t, m.singleton) - require.Nil(t, m.tree) + require.False(t, m.hasTree) case 1: require.NotNil(t, m.singleton) - require.Nil(t, m.tree) + require.False(t, m.hasTree) default: require.Nil(t, m.singleton) - require.NotNil(t, m.tree) + require.True(t, m.hasTree) } if m.singleton != nil { - require.Nil(t, m.tree, "Tree should not be set if singleton set") + require.False(t, m.hasTree, "Tree should not be set if singleton set") } - if m.tree != nil { + if m.hasTree { require.Nil(t, m.singleton, "Singleton should not be set if tree set") } } diff --git a/part/set.go b/part/set.go index b717eed..c78b56c 100644 --- a/part/set.go +++ b/part/set.go @@ -21,7 +21,8 @@ import ( // For Set-only use only [bytesFromKey] needs to be defined. type Set[T any] struct { toBytes func(T) []byte - tree *Tree[T] + tree Tree[T] + hasTree bool } // NewSet creates a new set of T. @@ -41,8 +42,9 @@ func NewSet[T any](values ...T) Set[T] { } func (s *Set[T]) ensureTree() { - if s.tree == nil { + if !s.hasTree { s.tree = New[T](RootOnlyWatch) + s.hasTree = true } s.toBytes = lookupKeyType[T]() } @@ -59,21 +61,22 @@ func (s Set[T]) Set(v T) Set[T] { // Delete returns a new set without the value. The original // set is unchanged. func (s Set[T]) Delete(v T) Set[T] { - if s.tree == nil { + if !s.hasTree { return s } txn := s.tree.Txn() txn.Delete(s.toBytes(v)) s.tree = txn.Commit() if s.tree.Len() == 0 { - s.tree = nil + s.tree = Tree[T]{} + s.hasTree = false } return s } // Has returns true if the set has the value. func (s Set[T]) Has(v T) bool { - if s.tree == nil { + if !s.hasTree { return false } _, _, found := s.tree.Get(s.toBytes(v)) @@ -85,7 +88,7 @@ func emptySeq[T any](yield func(T) bool) { // All returns an iterator for all values. func (s Set[T]) All() iter.Seq[T] { - if s.tree == nil { + if !s.hasTree { return emptySeq[T] } return s.yieldAll @@ -100,10 +103,10 @@ func (s Set[T]) yieldAll(yield func(v T) bool) { // Union returns a set that is the union of the values // in the input sets. func (s Set[T]) Union(s2 Set[T]) Set[T] { - if s2.tree == nil { + if !s2.hasTree { return s } - if s.tree == nil { + if !s.hasTree { return s2 } txn := s.tree.Txn() @@ -118,7 +121,7 @@ func (s Set[T]) Union(s2 Set[T]) Set[T] { // Difference returns a set with values that only // appear in the first set. func (s Set[T]) Difference(s2 Set[T]) Set[T] { - if s.tree == nil || s2.tree == nil { + if !s.hasTree || !s2.hasTree { return s } @@ -133,7 +136,7 @@ func (s Set[T]) Difference(s2 Set[T]) Set[T] { // Len returns the number of values in the set. func (s Set[T]) Len() int { - if s.tree == nil { + if !s.hasTree { return 0 } return s.tree.size @@ -142,7 +145,7 @@ func (s Set[T]) Len() int { // Equal returns true if the two sets contain the equal keys. func (s Set[T]) Equal(other Set[T]) bool { switch { - case s.tree == nil && other.tree == nil: + case !s.hasTree && !other.hasTree: return true case s.Len() != other.Len(): return false @@ -172,7 +175,7 @@ func (s Set[T]) ToBytesFunc() func(T) []byte { } func (s Set[T]) MarshalJSON() ([]byte, error) { - if s.tree == nil { + if !s.hasTree { return []byte("[]"), nil } var b bytes.Buffer @@ -218,7 +221,8 @@ func (s *Set[T]) UnmarshalJSON(data []byte) error { } s.tree = txn.Commit() if s.tree.Len() == 0 { - s.tree = nil + s.tree = Tree[T]{} + s.hasTree = false } t, err = dec.Token() diff --git a/part/tree.go b/part/tree.go index 81db713..7134549 100644 --- a/part/tree.go +++ b/part/tree.go @@ -18,21 +18,22 @@ type Tree[T any] struct { rootWatch chan struct{} size int // the number of objects in the tree opts options - prevTxn atomic.Pointer[Txn[T]] // the previous txn for reusing the allocation - prevTxnID uint64 // the transaction ID that produced this tree + prevTxn *atomic.Pointer[Txn[T]] // the previous txn for reusing the allocation + prevTxnID uint64 // the transaction ID that produced this tree } // New constructs a new tree. -func New[T any](opts ...Option) *Tree[T] { +func New[T any](opts ...Option) Tree[T] { var o options for _, opt := range opts { opt(&o) } - t := &Tree[T]{ + t := Tree[T]{ root: nil, rootWatch: make(chan struct{}), size: 0, opts: o, + prevTxn: &atomic.Pointer[Txn[T]]{}, } return t } @@ -72,6 +73,7 @@ func (t *Tree[T]) Txn() *Txn[T] { txn.oldRoot = t.root txn.rootWatch = t.rootWatch txn.size = t.size + txn.prevTxn = t.prevTxn txn.txnID = t.prevTxnID + 1 return txn } @@ -112,7 +114,7 @@ func (t *Tree[T]) LowerBound(key []byte) Iterator[T] { // Insert inserts the key into the tree with the given value. // Returns the old value if it exists and a new tree. -func (t *Tree[T]) Insert(key []byte, value T) (old T, hadOld bool, tree *Tree[T]) { +func (t *Tree[T]) Insert(key []byte, value T) (old T, hadOld bool, tree Tree[T]) { txn := t.Txn() old, hadOld = txn.Insert(key, value) tree = txn.CommitAndNotify() @@ -123,7 +125,7 @@ func (t *Tree[T]) Insert(key []byte, value T) (old T, hadOld bool, tree *Tree[T] // function is called with the zero value for T. It is up to the // caller to not mutate the value in-place and to return a clone. // Returns the old value if it exists. -func (t *Tree[T]) Modify(key []byte, value T, mod func(T, T) T) (old T, hadOld bool, tree *Tree[T]) { +func (t *Tree[T]) Modify(key []byte, value T, mod func(T, T) T) (old T, hadOld bool, tree Tree[T]) { txn := t.Txn() old, hadOld = txn.Modify(key, value, mod) tree = txn.CommitAndNotify() @@ -132,7 +134,7 @@ func (t *Tree[T]) Modify(key []byte, value T, mod func(T, T) T) (old T, hadOld b // Delete the given key from the tree. // Returns the old value if it exists and the new tree. -func (t *Tree[T]) Delete(key []byte) (old T, hadOld bool, tree *Tree[T]) { +func (t *Tree[T]) Delete(key []byte) (old T, hadOld bool, tree Tree[T]) { txn := t.Txn() old, hadOld = txn.Delete(key) tree = txn.CommitAndNotify() diff --git a/part/txn.go b/part/txn.go index 81c469d..b6f484e 100644 --- a/part/txn.go +++ b/part/txn.go @@ -8,6 +8,7 @@ import ( "fmt" "os" "slices" + "sync/atomic" ) // Txn is a transaction against a tree. It allows doing efficient @@ -16,6 +17,7 @@ type Txn[T any] struct { root *header[T] oldRoot *header[T] rootWatch chan struct{} + prevTxn *atomic.Pointer[Txn[T]] dirty bool @@ -53,15 +55,16 @@ func (txn *Txn[T]) All(yield func([]byte, T) bool) { // Clone returns a clone of the transaction for reading. The clone is unaffected // by any future changes done with the original transaction. -func (txn *Txn[T]) Clone() *Tree[T] { +func (txn *Txn[T]) Clone() Tree[T] { // Invalidate in-place mutations so the returned clone won't be changed by // further modifications in this transaction. txn.txnID++ - return &Tree[T]{ + return Tree[T]{ opts: txn.opts, root: txn.root, rootWatch: txn.rootWatch, size: txn.size, + prevTxn: txn.prevTxn, prevTxnID: txn.txnID, } } @@ -166,7 +169,7 @@ func (txn *Txn[T]) Iterator() Iterator[T] { // CommitAndNotify commits the transaction and notifies by // closing the watch channels of all modified nodes. -func (txn *Txn[T]) CommitAndNotify() *Tree[T] { +func (txn *Txn[T]) CommitAndNotify() Tree[T] { txn.Notify() return txn.Commit() } @@ -175,20 +178,21 @@ func (txn *Txn[T]) CommitAndNotify() *Tree[T] { // watch channels. Returns the new tree. // To close the watch channels call Notify(). You must call Notify() before // Tree.Txn(). -func (txn *Txn[T]) Commit() *Tree[T] { +func (txn *Txn[T]) Commit() Tree[T] { newRootWatch := txn.rootWatch if txn.dirty { newRootWatch = make(chan struct{}) validateTree(txn.oldRoot, nil, nil, txn.txnID) validateTree(txn.root, nil, txn.watches, txn.txnID) } - t := &Tree[T]{ + t := Tree[T]{ opts: txn.opts, root: txn.root, rootWatch: newRootWatch, size: txn.size, + prevTxn: txn.prevTxn, + prevTxnID: txn.txnID, } - t.prevTxnID = txn.txnID // Store this txn in the tree to reuse the allocation next time. t.prevTxn.Store(txn) return t diff --git a/part_index.go b/part_index.go index 3f7972b..7bde0da 100644 --- a/part_index.go +++ b/part_index.go @@ -106,7 +106,7 @@ func (i Index[Obj, Key]) newTableIndex() tableIndex { // partIndex indexes objects in a [part.Tree], e.g. an adaptive radix tree. type partIndex struct { - tree *part.Tree[object] + tree part.Tree[object] // partIndexTxn is the current transaction against the index. It's embedded // here to avoid heap allocations. @@ -115,7 +115,7 @@ type partIndex struct { // list implements tableIndex. func (r *partIndex) list(key index.Key) (tableIndexIterator, <-chan struct{}) { - return partList(r.unique, r.tree, key) + return partList(r.unique, &r.tree, key) } var emptyTableIndexIterator = &singletonTableIndexIterator{} @@ -156,7 +156,7 @@ func (r *partIndex) commit() (tableIndex, tableIndexTxnNotify) { // get implements tableIndex. func (r *partIndex) get(ikey index.Key) (iobj object, watch <-chan struct{}, found bool) { - return partGet(r.unique, r.tree, ikey) + return partGet(r.unique, &r.tree, ikey) } func partGet(unique bool, tree part.Ops[object], ikey index.Key) (iobj object, watch <-chan struct{}, found bool) { @@ -191,12 +191,12 @@ func (r *partIndex) len() int { } func (r *partIndex) all() (tableIndexIterator, <-chan struct{}) { - return r.tree, r.rootWatch() + return &r.tree, r.rootWatch() } // prefix implements tableIndex. func (r *partIndex) prefix(ikey index.Key) (tableIndexIterator, <-chan struct{}) { - return partPrefix(r.unique, r.tree, ikey) + return partPrefix(r.unique, &r.tree, ikey) } func partPrefix(unique bool, tree part.Ops[object], key index.Key) (tableIndexIterator, <-chan struct{}) { @@ -212,7 +212,7 @@ func partPrefix(unique bool, tree part.Ops[object], key index.Key) (tableIndexIt // lowerBound implements tableIndexTxn. func (r *partIndex) lowerBound(ikey index.Key) (tableIndexIterator, <-chan struct{}) { - return partLowerBound(r.unique, r.tree, ikey), r.rootWatch() + return partLowerBound(r.unique, &r.tree, ikey), r.rootWatch() } // lowerBoundNext implements tableIndexTxn. @@ -255,18 +255,20 @@ type partIndexTxn struct { // all implements tableIndexTxn. func (r *partIndexTxn) all() (tableIndexIterator, <-chan struct{}) { - return r.tx.Clone(), r.rootWatch() + snapshot := r.tx.Clone() + return &snapshot, r.rootWatch() } // list implements tableIndexTxn. func (r *partIndexTxn) list(ikey index.Key) (tableIndexIterator, <-chan struct{}) { - return partList(r.unique, r.tx.Clone(), ikey) + snapshot := r.tx.Clone() + return partList(r.unique, &snapshot, ikey) } // lowerBound implements tableIndexTxn. func (r *partIndexTxn) lowerBound(ikey index.Key) (tableIndexIterator, <-chan struct{}) { snapshot := r.tx.Clone() - return partLowerBound(r.unique, snapshot, ikey), r.rootWatch() + return partLowerBound(r.unique, &snapshot, ikey), r.rootWatch() } // lowerBoundNext implements tableIndexTxn. @@ -274,7 +276,8 @@ func (r *partIndexTxn) lowerBoundNext(key index.Key) (func() ([]byte, object, bo if !r.unique { key = encodeNonUniqueBytes(key) } - iter := r.tx.Clone().LowerBound(key) + snapshot := r.tx.Clone() + iter := snapshot.LowerBound(key) if r.unique { return iter.Next, r.rootWatch() } @@ -332,7 +335,8 @@ func (r *partIndexTxn) notify() { // prefix implements tableIndexTxn. func (r *partIndexTxn) prefix(ikey index.Key) (tableIndexIterator, <-chan struct{}) { - return partPrefix(r.unique, r.tx.Clone(), ikey) + snapshot := r.tx.Clone() + return partPrefix(r.unique, &snapshot, ikey) } func (r *partIndexTxn) objectToKey(obj object) index.Key { diff --git a/table.go b/table.go index beceb8a..dbf542f 100644 --- a/table.go +++ b/table.go @@ -236,7 +236,8 @@ func (t *genTable[Obj]) getAcquiredInfo() string { func (t *genTable[Obj]) tableEntry() tableEntry { var entry tableEntry entry.meta = t - entry.deleteTrackers = part.New[anyDeleteTracker]() + deleteTrackers := part.New[anyDeleteTracker]() + entry.deleteTrackers = &deleteTrackers // A new table is initialized, as no initializers are registered. entry.indexes = make([]tableIndex, len(t.indexPositions)) diff --git a/write_txn.go b/write_txn.go index ed58a6b..2b98add 100644 --- a/write_txn.go +++ b/write_txn.go @@ -216,7 +216,8 @@ func (txn *writeTxnState) addDeleteTracker(meta TableMeta, trackerName string, d return tableError(meta.Name(), ErrTableNotLockedForWriting) } - _, _, table.deleteTrackers = table.deleteTrackers.Insert([]byte(trackerName), dt) + _, _, updated := table.deleteTrackers.Insert([]byte(trackerName), dt) + table.deleteTrackers = &updated txn.db.metrics.DeleteTrackerCount(meta.Name(), table.deleteTrackers.Len()) return nil From b12825073d0b610a8a79a00ed7de2ef091ec1569 Mon Sep 17 00:00:00 2001 From: Jussi Maki Date: Thu, 15 Jan 2026 10:53:03 +0100 Subject: [PATCH 3/3] statedb: Store tableEntry by pointer As StateDB usage grows in Cilium we start to reach the point where cloning the [dbRoot] is becoming increasingly expensive. At the cost of an additional heap allocation change to storing the tableEntry by pointer. Before: BenchmarkDB_WriteTxn_1-8 1222287 981.1 ns/op 1019299 objects/sec 944 B/op 15 allocs/op BenchmarkDB_WriteTxn_10-8 2851748 418.0 ns/op 2392425 objects/sec 499 B/op 8 allocs/op BenchmarkDB_WriteTxn_100-8 3561373 334.3 ns/op 2991648 objects/sec 485 B/op 7 allocs/op BenchmarkDB_WriteTxn_1000-8 3045235 393.8 ns/op 2539482 objects/sec 437 B/op 7 allocs/op BenchmarkDB_WriteTxn_100_SecondaryIndex-8 1502962 796.5 ns/op 1255539 objects/sec 1004 B/op 20 allocs/op BenchmarkDB_WriteTxn_CommitOnly_100Tables-8 542452 2213 ns/op 8332 B/op 4 allocs/op BenchmarkDB_WriteTxn_CommitOnly_1Table-8 2411512 498.2 ns/op 216 B/op 4 allocs/op After: BenchmarkDB_WriteTxn_1-8 1205449 994.8 ns/op 1005270 objects/sec 952 B/op 16 allocs/op BenchmarkDB_WriteTxn_10-8 2862157 416.8 ns/op 2399386 objects/sec 500 B/op 8 allocs/op BenchmarkDB_WriteTxn_100-8 3564074 333.1 ns/op 3001994 objects/sec 485 B/op 7 allocs/op BenchmarkDB_WriteTxn_1000-8 2716454 392.7 ns/op 2546569 objects/sec 437 B/op 7 allocs/op BenchmarkDB_WriteTxn_100_SecondaryIndex-8 1507063 798.2 ns/op 1252773 objects/sec 1004 B/op 20 allocs/op BenchmarkDB_WriteTxn_CommitOnly_100Tables-8 1427040 846.3 ns/op 1112 B/op 5 allocs/op BenchmarkDB_WriteTxn_CommitOnly_1Table-8 2386636 505.4 ns/op 224 B/op 5 allocs/op Signed-off-by: Jussi Maki --- db.go | 12 ++++++------ deletetracker.go | 2 +- iterator.go | 2 +- read_txn.go | 6 +++--- table.go | 8 ++++---- types.go | 2 +- write_txn.go | 18 +++++++++--------- 7 files changed, 25 insertions(+), 25 deletions(-) diff --git a/db.go b/db.go index 00baa05..6999076 100644 --- a/db.go +++ b/db.go @@ -99,7 +99,7 @@ type dbState struct { writeTxnPool sync.Pool } -type dbRoot = []tableEntry +type dbRoot = []*tableEntry type Option func(*opts) @@ -145,7 +145,7 @@ func (db *DB) updateWriteTxnPoolLocked(numTables int) { func() any { return &writeTxnState{ db: db, - tableEntries: make([]tableEntry, 0, numTables), + tableEntries: make([]*tableEntry, 0, numTables), smus: make(internal.SortableMutexes, 0, defaultNumTables), tableNames: make([]string, 0, defaultNumTables), } @@ -212,10 +212,10 @@ func (db *DB) WriteTxn(tables ...TableMeta) WriteTxn { txn.tableNames = reuseSlice(txn.tableNames, len(tables)) for i, table := range tables { pos := table.tablePos() - tableEntry := &txn.tableEntries[pos] - tableEntry.indexes = slices.Clone(tableEntry.indexes) - tableEntry.locked = true - + tableEntryCopy := *txn.tableEntries[pos] + tableEntryCopy.indexes = slices.Clone(tableEntryCopy.indexes) + tableEntryCopy.locked = true + txn.tableEntries[pos] = &tableEntryCopy name := table.Name() txn.tableNames[i] = name diff --git a/deletetracker.go b/deletetracker.go index 8ced835..b134a48 100644 --- a/deletetracker.go +++ b/deletetracker.go @@ -62,7 +62,7 @@ func (dt *deleteTracker[Obj]) close() { txn := wtxn.unwrap() dt.db = nil db := txn.db - table := &txn.tableEntries[dt.table.tablePos()] + table := txn.tableEntries[dt.table.tablePos()] if !table.locked { panic("BUG: Table not locked") } diff --git a/iterator.go b/iterator.go index 49ab57c..db7ff8c 100644 --- a/iterator.go +++ b/iterator.go @@ -166,7 +166,7 @@ type changeIterator[Obj any] struct { } func (it *changeIterator[Obj]) refresh(txn ReadTxn) { - tableEntry := &txn.root()[it.table.tablePos()] + tableEntry := txn.root()[it.table.tablePos()] if it.iter != nil && tableEntry.locked { var obj Obj panic(fmt.Sprintf("Table[%T].Changes().Next() called with the target table locked. This is not supported.", obj)) diff --git a/read_txn.go b/read_txn.go index 16ea1d5..11fe3b9 100644 --- a/read_txn.go +++ b/read_txn.go @@ -8,10 +8,10 @@ import ( "slices" ) -type readTxn []tableEntry +type readTxn []*tableEntry func (r *readTxn) getTableEntry(meta TableMeta) *tableEntry { - return &(*r)[meta.tablePos()] + return (*r)[meta.tablePos()] } // indexReadTxn implements ReadTxn. @@ -52,7 +52,7 @@ func (r *readTxn) WriteJSON(w io.Writer, tables ...string) error { first = false } - if err := writeTableAsJSON(buf, r, &table); err != nil { + if err := writeTableAsJSON(buf, r, table); err != nil { return err } } diff --git a/table.go b/table.go index dbf542f..2e01cf6 100644 --- a/table.go +++ b/table.go @@ -233,7 +233,7 @@ func (t *genTable[Obj]) getAcquiredInfo() string { return fmt.Sprintf("%s (%s ago, locked for %s)", info.handle, since, internal.PrettyDuration(dur)) } -func (t *genTable[Obj]) tableEntry() tableEntry { +func (t *genTable[Obj]) tableEntry() *tableEntry { var entry tableEntry entry.meta = t deleteTrackers := part.New[anyDeleteTracker]() @@ -253,7 +253,7 @@ func (t *genTable[Obj]) tableEntry() tableEntry { entry.indexes[GraveyardRevisionIndexPos] = newRevisionIndex() entry.indexes[GraveyardIndexPos] = newGraveyardIndex(primaryIndex) - return entry + return &entry } // newRevisionIndex constructs an index for storing objects by revision. @@ -352,7 +352,7 @@ func (t *genTable[Obj]) PendingInitializers(txn ReadTxn) []string { } func (t *genTable[Obj]) RegisterInitializer(txn WriteTxn, name string) func(WriteTxn) { - table := &txn.unwrap().tableEntries[t.pos] + table := txn.unwrap().tableEntries[t.pos] if !table.locked { panic(fmt.Sprintf("RegisterInitializer: Table %q not locked for writing", t.table)) } @@ -379,7 +379,7 @@ func (t *genTable[Obj]) RegisterInitializer(txn WriteTxn, name string) func(Writ var once sync.Once return func(txn WriteTxn) { once.Do(func() { - table := &txn.unwrap().tableEntries[t.pos] + table := txn.unwrap().tableEntries[t.pos] if !table.locked { panic(fmt.Sprintf("RegisterInitializer/MarkDone: Table %q not locked for writing", t.table)) } diff --git a/types.go b/types.go index 1384a87..15e3ac6 100644 --- a/types.go +++ b/types.go @@ -361,7 +361,7 @@ type anyDeleteTracker interface { } type tableInternal interface { - tableEntry() tableEntry + tableEntry() *tableEntry tablePos() int setTablePos(int) indexPos(string) int diff --git a/write_txn.go b/write_txn.go index 2b98add..6aae11d 100644 --- a/write_txn.go +++ b/write_txn.go @@ -32,7 +32,7 @@ type writeTxnState struct { acquiredAt time.Time // the time at which the transaction acquired the locks duration atomic.Uint64 // the transaction duration after it finished - tableEntries []tableEntry // table entries being modified + tableEntries []*tableEntry // table entries being modified numTxns int // number of index transactions opened smus internal.SortableMutexes // the (sorted) table locks tableNames []string @@ -59,7 +59,7 @@ func txnFinalizer(handle *writeTxnHandle) { } func (txn *writeTxnState) getTableEntry(meta TableMeta) *tableEntry { - return &txn.tableEntries[meta.tablePos()] + return txn.tableEntries[meta.tablePos()] } // indexReadTxn returns a transaction to read from the specific index. @@ -74,7 +74,7 @@ func (txn *writeTxnState) indexReadTxn(meta TableMeta, indexPos int) (tableIndex // indexWriteTxn returns a transaction to read/write to a specific index. // The created transaction is memoized and used for subsequent reads and/or writes. func (txn *writeTxnState) indexWriteTxn(meta TableMeta, indexPos int) (tableIndexTxn, error) { - table := &txn.tableEntries[meta.tablePos()] + table := txn.tableEntries[meta.tablePos()] if !table.locked { return nil, tableError(meta.Name(), ErrTableNotLockedForWriting) } @@ -118,7 +118,7 @@ func (txn *writeTxnState) modify(meta TableMeta, guardRevision Revision, newData // Look up table and allocate a new revision. tableName := meta.Name() - table := &txn.tableEntries[meta.tablePos()] + table := txn.tableEntries[meta.tablePos()] if !table.locked { return object{}, false, nil, tableError(tableName, ErrTableNotLockedForWriting) } @@ -203,7 +203,7 @@ func (txn *writeTxnState) modify(meta TableMeta, guardRevision Revision, newData } func (txn *writeTxnState) hasDeleteTrackers(meta TableMeta) bool { - table := &txn.tableEntries[meta.tablePos()] + table := txn.tableEntries[meta.tablePos()] return table.deleteTrackers.Len() > 0 } @@ -211,7 +211,7 @@ func (txn *writeTxnState) addDeleteTracker(meta TableMeta, trackerName string, d if txn == nil { return ErrTransactionClosed } - table := &txn.tableEntries[meta.tablePos()] + table := txn.tableEntries[meta.tablePos()] if !table.locked { return tableError(meta.Name(), ErrTableNotLockedForWriting) } @@ -230,7 +230,7 @@ func (txn *writeTxnState) delete(meta TableMeta, guardRevision Revision, data an // Look up table and allocate a new revision. tableName := meta.Name() - table := &txn.tableEntries[meta.tablePos()] + table := txn.tableEntries[meta.tablePos()] if !table.locked { return object{}, false, tableError(tableName, ErrTableNotLockedForWriting) } @@ -368,7 +368,7 @@ func (handle *writeTxnHandle) Commit() ReadTxn { // first as otherwise readers would wake up too early. txnToNotify := make([]tableIndexTxnNotify, 0, txn.numTxns) for pos := range txn.tableEntries { - table := &txn.tableEntries[pos] + table := txn.tableEntries[pos] if !table.locked { continue } @@ -401,7 +401,7 @@ func (handle *writeTxnHandle) Commit() ReadTxn { // Insert the modified tables into the root tree of tables. for pos := range txn.tableEntries { - table := &txn.tableEntries[pos] + table := txn.tableEntries[pos] if !table.locked { // Table was not locked so it might have changed. // Update the entry from the current root.