From f0ea79b11797f8052153de405d30380658ffc66a Mon Sep 17 00:00:00 2001 From: Marcin Rataj Date: Tue, 9 Dec 2025 23:27:04 +0100 Subject: [PATCH] fix(cli): deduplicate dag stat blocks by multihash since Kubo v0.12.0, blocks are stored by multihash, so identical data with different CIDs (e.g., CIDv0 vs CIDv1) is stored once. dag stat now reflects actual storage by using multihash-based deduplication instead of CID-based. updated help text to clarify deduplication behavior and note that dag export uses CID-based keying and may include duplicates. added regression test for multihash deduplication. --- core/commands/dag/dag.go | 5 ++++- core/commands/dag/stat.go | 14 +++++++++----- docs/changelogs/v0.40.md | 7 ++++++- test/cli/dag_test.go | 21 +++++++++++++++++++++ 4 files changed, 40 insertions(+), 7 deletions(-) diff --git a/core/commands/dag/dag.go b/core/commands/dag/dag.go index 6827e46fab1..d8034d67104 100644 --- a/core/commands/dag/dag.go +++ b/core/commands/dag/dag.go @@ -377,7 +377,10 @@ var DagStatCmd = &cmds.Command{ 'ipfs dag stat' fetches a DAG and returns various statistics about it. Statistics include size and number of blocks. -Note: This command skips duplicate blocks in reporting both size and the number of blocks +Note: Duplicate blocks are identified by content hash (multihash) to reflect +actual disk usage. Identical data referenced via different CIDs is counted +once. 'dag export' uses CID-based keying and may include the same data +multiple times if referenced by different CIDs. 
`, }, Arguments: []cmds.Argument{ diff --git a/core/commands/dag/stat.go b/core/commands/dag/stat.go index bb9be7e0d90..72641ba07d5 100644 --- a/core/commands/dag/stat.go +++ b/core/commands/dag/stat.go @@ -7,8 +7,9 @@ import ( mdag "github.com/ipfs/boxo/ipld/merkledag" "github.com/ipfs/boxo/ipld/merkledag/traverse" - cid "github.com/ipfs/go-cid" cmds "github.com/ipfs/go-ipfs-cmds" + mh "github.com/multiformats/go-multihash" + "github.com/ipfs/kubo/core/commands/cmdenv" "github.com/ipfs/kubo/core/commands/cmdutils" "github.com/ipfs/kubo/core/commands/e" @@ -26,7 +27,10 @@ func dagStat(req *cmds.Request, res cmds.ResponseEmitter, env cmds.Environment) } nodeGetter := mdag.NewSession(req.Context, api.Dag()) - cidSet := cid.NewSet() + // Use multihash set for deduplication to reflect actual storage. + // Since Kubo v0.12.0, blocks are stored by multihash, so identical + // data with different CIDs (e.g., CIDv0 vs CIDv1) is stored once. + mhSet := mh.NewSet() dagStatSummary := &DagStatSummary{DagStatsArray: []*DagStat{}} for _, a := range req.Arguments { p, err := cmdutils.PathOrCidPath(a) @@ -54,11 +58,11 @@ func dagStat(req *cmds.Request, res cmds.ResponseEmitter, env cmds.Environment) currentNodeSize := uint64(len(current.Node.RawData())) dagstats.Size += currentNodeSize dagstats.NumBlocks++ - if !cidSet.Has(current.Node.Cid()) { + // Visit returns true if this multihash was not seen before + if mhSet.Visit(current.Node.Cid().Hash()) { dagStatSummary.incrementTotalSize(currentNodeSize) } dagStatSummary.incrementRedundantSize(currentNodeSize) - cidSet.Add(current.Node.Cid()) if progressive { if err := res.Emit(dagStatSummary); err != nil { return err @@ -74,7 +78,7 @@ func dagStat(req *cmds.Request, res cmds.ResponseEmitter, env cmds.Environment) } } - dagStatSummary.UniqueBlocks = cidSet.Len() + dagStatSummary.UniqueBlocks = mhSet.Len() dagStatSummary.calculateSummary() if err := res.Emit(dagStatSummary); err != nil { diff --git a/docs/changelogs/v0.40.md 
b/docs/changelogs/v0.40.md index 29780937f4b..4eb318b9469 100644 --- a/docs/changelogs/v0.40.md +++ b/docs/changelogs/v0.40.md @@ -11,7 +11,8 @@ This release was brought to you by the [Shipyard](https://ipshipyard.com/) team. - [Overview](#overview) - [🔦 Highlights](#-highlights) - [Routing V1 HTTP API now exposed by default](#routing-v1-http-api-now-exposed-by-default) - - [Track total size when adding pins](#track-total-size-when-adding-pins] + - [Track total size when adding pins](#track-total-size-when-adding-pins) + - [Fixed `ipfs dag stat` block counting](#fixed-ipfs-dag-stat-block-counting) - [📝 Changelog](#-changelog) - [👨‍👩‍👧‍👦 Contributors](#-contributors) @@ -32,6 +33,10 @@ Example output: Fetched/Processed 336 nodes (83 MB) ``` +#### Fixed `ipfs dag stat` block counting + +Since Kubo v0.12.0, blocks are stored by multihash, so the same data is stored only once regardless of which CID references it. The `dag stat` command now reflects actual storage by deduplicating blocks by content hash (e.g., data referenced via both CIDv0 and CIDv1 is counted once). See `ipfs dag stat --help` for more details. 
+ ### 📝 Changelog ### 👨‍👩‍👧‍👦 Contributors diff --git a/test/cli/dag_test.go b/test/cli/dag_test.go index f6758a71037..ff05a401bbb 100644 --- a/test/cli/dag_test.go +++ b/test/cli/dag_test.go @@ -104,6 +104,27 @@ func TestDag(t *testing.T) { stat := node.RunIPFS("dag", "stat", "--progress=false", node1Cid, node2Cid) assert.Equal(t, content, stat.Stdout.Bytes()) }) + + t.Run("dag stat deduplicates by multihash", func(t *testing.T) { + t.Parallel() + node := harness.NewT(t).NewNode().Init().StartDaemon() + + // Add content and get CIDv0 with dag-pb (not raw leaves) + cidV0 := node.IPFSAddStr("hello world", "--cid-version=0", "--raw-leaves=false") + + // Convert to CIDv1 (same multihash, different CID) + cidV1 := node.IPFS("cid", "format", "-v", "1", "-b", "base32", cidV0).Stdout.Trimmed() + + // Run dag stat with both CIDs - should deduplicate by multihash + stat := node.RunIPFS("dag", "stat", "--progress=false", "--enc=json", cidV0, cidV1) + var data Data + err := json.Unmarshal(stat.Stdout.Bytes(), &data) + require.NoError(t, err) + + // Same block referenced via CIDv0 and CIDv1 should be counted once + assert.Equal(t, 1, data.UniqueBlocks, "same data via different CIDs should be 1 unique block") + assert.Equal(t, 2.0, data.Ratio, "ratio should be 2.0 (2 refs to 1 block)") + }) } func TestDagImportFastProvide(t *testing.T) {