From a9a5cea691991da0cc47df97aed5c2aa9c8b2b30 Mon Sep 17 00:00:00 2001
From: Xinhao Yuan
Date: Wed, 5 Nov 2025 12:59:11 -0800
Subject: [PATCH] Add various weight computation methods.

PiperOrigin-RevId: 828592996
---
 centipede/centipede.cc        |  14 +++-
 centipede/centipede_flags.inc |   3 +
 centipede/corpus.cc           |  49 ++++++++++--
 centipede/corpus.h            |  21 ++++-
 centipede/corpus_test.cc      | 141 +++++++++++++++++++++++++++++++++-
 centipede/feature_set.cc      |   5 +-
 centipede/feature_set.h       |   2 +-
 centipede/feature_set_test.cc |   8 +-
 8 files changed, 216 insertions(+), 27 deletions(-)

diff --git a/centipede/centipede.cc b/centipede/centipede.cc
index 2c2070203..fd1175191 100644
--- a/centipede/centipede.cc
+++ b/centipede/centipede.cc
@@ -76,6 +76,7 @@
 #include "./centipede/centipede_callbacks.h"
 #include "./centipede/command.h"
 #include "./centipede/control_flow.h"
+#include "./centipede/corpus.h"
 #include "./centipede/corpus_io.h"
 #include "./centipede/coverage.h"
 #include "./centipede/environment.h"
@@ -98,14 +99,21 @@
 
 namespace fuzztest::internal {
 
-Centipede::Centipede(const Environment &env, CentipedeCallbacks &user_callbacks,
-                     const BinaryInfo &binary_info,
-                     CoverageLogger &coverage_logger, std::atomic<Stats> &stats)
+Centipede::Centipede(const Environment& env, CentipedeCallbacks& user_callbacks,
+                     const BinaryInfo& binary_info,
+                     CoverageLogger& coverage_logger, std::atomic<Stats>& stats)
     : env_(env),
       user_callbacks_(user_callbacks),
       rng_(env_.seed),
       // TODO(kcc): [impl] find a better way to compute frequency_threshold.
       fs_(env_.feature_frequency_threshold, env_.MakeDomainDiscardMask()),
+      corpus_([this] {
+        const auto parsed_weight_method =
+            Corpus::ParseWeightMethod(env_.corpus_weight_method);
+        FUZZTEST_CHECK(parsed_weight_method.has_value())
+            << "Unknown corpus weight method " << env_.corpus_weight_method;
+        return parsed_weight_method.value();
+      }()),
       coverage_frontier_(binary_info),
       binary_info_(binary_info),
       pc_table_(binary_info_.pc_table),
diff --git a/centipede/centipede_flags.inc b/centipede/centipede_flags.inc
index 572e56d93..1ab484b36 100644
--- a/centipede/centipede_flags.inc
+++ b/centipede/centipede_flags.inc
@@ -192,6 +192,9 @@ CENTIPEDE_FLAG(
     bool, use_corpus_weights, true,
     "If true, use weighted distribution when choosing the corpus element "
    "to mutate. This flag is mostly for Centipede developers.")
+CENTIPEDE_FLAG(std::string, corpus_weight_method, "feature_rarity",
+               "The weight method to use for the corpus. Available options "
+               "are `uniform`, `recency`, and `feature_rarity` (default).")
 CENTIPEDE_FLAG(
     bool, exec_time_weight_scaling, true,
     "If true, scale the corpus weight by the execution time of each input.")
diff --git a/centipede/corpus.cc b/centipede/corpus.cc
index c2746910a..f3bc0019d 100644
--- a/centipede/corpus.cc
+++ b/centipede/corpus.cc
@@ -17,6 +17,7 @@
 #include
 #include
 #include
+#include <optional>
 #include
 #include
 #include
@@ -45,13 +46,13 @@ namespace fuzztest::internal {
 // Corpus
 //------------------------------------------------------------------------------
 
-// Returns the weight of `fv` computed using `fs` and `coverage_frontier`.
-static size_t ComputeWeight(const FeatureVec &fv, const FeatureSet &fs,
-                            const CoverageFrontier &coverage_frontier) {
-  size_t weight = fs.ComputeWeight(fv);
+// Computes the frontier weight component of an input from its features `fv`
+// and the overall `coverage_frontier`.
+static size_t ComputeFrontierWeight(const FeatureVec& fv,
+                                    const CoverageFrontier& coverage_frontier) {
   // The following is checking for the cases where PCTable is not present. In
   // such cases, we cannot use any ControlFlow related features.
-  if (coverage_frontier.MaxPcIndex() == 0) return weight;
+  if (coverage_frontier.MaxPcIndex() == 0) return 1;
   size_t frontier_weights_sum = 0;
   for (const auto feature : fv) {
     if (!feature_domains::kPCs.Contains(feature)) continue;
@@ -63,7 +64,19 @@ static size_t ComputeWeight(const FeatureVec &fv, const FeatureSet &fs,
       frontier_weights_sum += coverage_frontier.FrontierWeight(pc_index);
     }
   }
-  return weight * (frontier_weights_sum + 1);  // Multiply by at least 1.
+  return frontier_weights_sum + 1;  // The returned factor is at least 1.
+}
+
+std::optional<Corpus::WeightMethod> Corpus::ParseWeightMethod(
+    std::string_view method_string) {
+  if (method_string == "uniform") {
+    return WeightMethod::Uniform;
+  } else if (method_string == "recency") {
+    return WeightMethod::Recency;
+  } else if (method_string == "feature_rarity") {
+    return WeightMethod::FeatureRarity;
+  }
+  return std::nullopt;
 }
 
 std::pair<size_t, size_t> Corpus::MaxAndAvgSize() const {
@@ -86,7 +99,26 @@ void Corpus::UpdateWeights(const FeatureSet& fs,
     auto& record = records_[i];
     const size_t unseen = fs.PruneFeaturesAndCountUnseen(record.features);
     FUZZTEST_CHECK_EQ(unseen, 0);
-    weights[i] = fs.ComputeWeight(record.features);
+    if (record.features.empty()) {
+      weights[i] = 0;
+      continue;
+    }
+    double base_weight = 0;
+    switch (method_) {
+      case WeightMethod::Uniform:
+        base_weight = 1;
+        break;
+      case WeightMethod::Recency:
+        base_weight = i + 1;
+        break;
+      case WeightMethod::FeatureRarity:
+        base_weight = fs.ComputeRarityWeight(record.features);
+        break;
+      default:
+        FUZZTEST_LOG(FATAL) << "Unknown corpus weight method";
+    }
+    weights[i] =
+        base_weight * ComputeFrontierWeight(record.features, coverage_frontier);
   }
   if (scale_by_exec_time) {
     double total_exec_time_usec = 0;
@@ -206,7 +238,8 @@ void Corpus::Add(const ByteArray& data, const FeatureVec& fv,
       << "Got request to add empty element to corpus: ignoring";
   FUZZTEST_CHECK_EQ(records_.size(), weighted_distribution_.size());
   records_.push_back({data, fv, metadata, stats});
-  weighted_distribution_.AddWeight(ComputeWeight(fv, fs, coverage_frontier));
+  // Placeholder; the actual weight is computed by `UpdateWeights`.
+  weighted_distribution_.AddWeight(0);
 }
 
 const CorpusRecord& Corpus::WeightedRandom(absl::BitGenRef rng) const {
diff --git a/centipede/corpus.h b/centipede/corpus.h
index 07164663e..1a1c4c1da 100644
--- a/centipede/corpus.h
+++ b/centipede/corpus.h
@@ -17,6 +17,7 @@
 #include
 #include
+#include <optional>
 #include
 #include
 #include
 
@@ -98,7 +99,17 @@ struct CorpusRecord {
 // Allows to prune (forget) inputs that become uninteresting.
 class Corpus {
  public:
-  Corpus() = default;
+  enum class WeightMethod {
+    Uniform,
+    Recency,
+    FeatureRarity,
+  };
+
+  static std::optional<WeightMethod> ParseWeightMethod(
+      std::string_view method_string);
+
+  Corpus() : Corpus(WeightMethod::FeatureRarity) {}
+  explicit Corpus(WeightMethod method) : method_(method) {}
 
   Corpus(const Corpus &) = default;
   Corpus(Corpus &&) noexcept = default;
@@ -120,9 +131,9 @@ class Corpus {
   // Returns the number of removed elements.
   size_t Prune(const FeatureSet &fs, const CoverageFrontier &coverage_frontier,
                size_t max_corpus_size, Rng &rng);
-  // Updates the corpus weights according to `fs` and `coverage_frontier`. If
-  // `scale_by_exec_time` is set, scales the weights by the corpus execution
-  // time relative to the average.
+  // Updates the corpus weights according to `fs` and `coverage_frontier`,
+  // using the configured weight method. If `scale_by_exec_time` is set,
+  // scales the weights by the corpus execution time relative to the average.
   void UpdateWeights(const FeatureSet& fs,
                      const CoverageFrontier& coverage_frontier,
                      bool scale_by_exec_time);
@@ -164,6 +175,8 @@ class Corpus {
   // Maintains weights for elements of records_.
   WeightedDistribution weighted_distribution_;
   size_t num_pruned_ = 0;
+  // Method for weighting the corpus elements.
+  WeightMethod method_;
 };
 
 // Coverage frontier is a set of PCs that are themselves covered, but some of
diff --git a/centipede/corpus_test.cc b/centipede/corpus_test.cc
index f816f46ea..4b47d5c6d 100644
--- a/centipede/corpus_test.cc
+++ b/centipede/corpus_test.cc
@@ -114,7 +114,8 @@ TEST(Corpus, Prune) {
   Add({{2}, {30, 40}});
   Add({{3}, {40, 50}});
   Add({{4}, {10, 20}});
-  corpus.UpdateWeights(fs, coverage_frontier, /*scale_by_exec_time=*/false);
+  corpus.UpdateWeights(fs, coverage_frontier,
+                       /*scale_by_exec_time=*/false);
 
   // Prune. Features 20 and 40 are frequent => input {0} will be removed.
   EXPECT_EQ(corpus.NumActive(), 5);
@@ -124,7 +125,8 @@ TEST(Corpus, Prune) {
   VerifyActiveInputs({{1}, {2}, {3}, {4}});
 
   Add({{5}, {30, 60}});
-  corpus.UpdateWeights(fs, coverage_frontier, /*scale_by_exec_time=*/false);
+  corpus.UpdateWeights(fs, coverage_frontier,
+                       /*scale_by_exec_time=*/false);
   EXPECT_EQ(corpus.NumTotal(), 6);
 
   // Prune. Feature 30 is now frequent => inputs {1} and {2} will be removed.
@@ -145,6 +147,131 @@ TEST(Corpus, Prune) {
   EXPECT_EQ(corpus.NumTotal(), 6);
 }
 
+TEST(Corpus, UniformWeightMethodWorksAsExpected) {
+  PCTable pc_table(100);
+  CFTable cf_table(100);
+  BinaryInfo bin_info{pc_table, {}, cf_table, {}, {}, {}};
+  CoverageFrontier coverage_frontier(bin_info);
+  FeatureSet fs(3, {});
+  Corpus corpus(Corpus::WeightMethod::Uniform);
+
+  auto Add = [&](const CorpusRecord& record) {
+    fs.MergeFeatures(record.features);
+    corpus.Add(record.data, record.features, /*metadata=*/{}, /*stats=*/{}, fs,
+               coverage_frontier);
+  };
+
+  Add({/*data=*/{0}, /*features=*/{30, 20}});
+  Add({/*data=*/{1}, /*features=*/{10, 20}});
+  Add({/*data=*/{2}, /*features=*/{10}});
+
+  constexpr int kNumIter = 10000;
+  std::vector<size_t> freq;
+
+  Rng rng;
+  auto ComputeFreq = [&]() {
+    freq.clear();
+    freq.resize(corpus.NumActive());
+    for (int i = 0; i < kNumIter; i++) {
+      const auto& record = corpus.WeightedRandom(rng);
+      const auto id = record.data[0];
+      ASSERT_LT(id, freq.size());
+      freq[id]++;
+    }
+  };
+
+  // The weights should be equal with the uniform method.
+  corpus.UpdateWeights(fs, coverage_frontier,
+                       /*scale_by_exec_time=*/false);
+  ComputeFreq();
+  EXPECT_NEAR(freq[0], kNumIter / 3, 100);
+  EXPECT_NEAR(freq[1], kNumIter / 3, 100);
+  EXPECT_NEAR(freq[2], kNumIter / 3, 100);
+}
+
+TEST(Corpus, RecencyWeightMethodWorksAsExpected) {
+  PCTable pc_table(100);
+  CFTable cf_table(100);
+  BinaryInfo bin_info{pc_table, {}, cf_table, {}, {}, {}};
+  CoverageFrontier coverage_frontier(bin_info);
+  FeatureSet fs(3, {});
+  Corpus corpus(Corpus::WeightMethod::Recency);
+
+  auto Add = [&](const CorpusRecord& record) {
+    fs.MergeFeatures(record.features);
+    corpus.Add(record.data, record.features, /*metadata=*/{}, /*stats=*/{}, fs,
+               coverage_frontier);
+  };
+
+  Add({/*data=*/{0}, /*features=*/{30, 20}});
+  Add({/*data=*/{1}, /*features=*/{10, 20}});
+  Add({/*data=*/{2}, /*features=*/{10}});
+
+  constexpr int kNumIter = 10000;
+  std::vector<size_t> freq;
+
+  Rng rng;
+  auto ComputeFreq = [&]() {
+    freq.clear();
+    freq.resize(corpus.NumActive());
+    for (int i = 0; i < kNumIter; i++) {
+      const auto& record = corpus.WeightedRandom(rng);
+      const auto id = record.data[0];
+      ASSERT_LT(id, freq.size());
+      freq[id]++;
+    }
+  };
+
+  // The weights should favor {2} over {1} over {0} with the recency method.
+  corpus.UpdateWeights(fs, coverage_frontier,
+                       /*scale_by_exec_time=*/false);
+  ComputeFreq();
+  EXPECT_GT(freq[2], freq[1] + 100);
+  EXPECT_GT(freq[1], freq[0] + 100);
+}
+
+TEST(Corpus, FeatureRarityWeightMethodWorksAsExpected) {
+  PCTable pc_table(100);
+  CFTable cf_table(100);
+  BinaryInfo bin_info{pc_table, {}, cf_table, {}, {}, {}};
+  CoverageFrontier coverage_frontier(bin_info);
+  FeatureSet fs(3, {});
+  Corpus corpus(Corpus::WeightMethod::FeatureRarity);
+
+  auto Add = [&](const CorpusRecord& record) {
+    fs.MergeFeatures(record.features);
+    corpus.Add(record.data, record.features, /*metadata=*/{}, /*stats=*/{}, fs,
+               coverage_frontier);
+  };
+
+  Add({/*data=*/{0}, /*features=*/{30, 20}});
+  Add({/*data=*/{1}, /*features=*/{10, 20}});
+  Add({/*data=*/{2}, /*features=*/{10}});
+
+  constexpr int kNumIter = 10000;
+  std::vector<size_t> freq;
+
+  Rng rng;
+  auto ComputeFreq = [&]() {
+    freq.clear();
+    freq.resize(corpus.NumActive());
+    for (int i = 0; i < kNumIter; i++) {
+      const auto& record = corpus.WeightedRandom(rng);
+      const auto id = record.data[0];
+      ASSERT_LT(id, freq.size());
+      freq[id]++;
+    }
+  };
+
+  // The weights should favor {0} over {1} over {2} with the feature rarity
+  // method.
+  corpus.UpdateWeights(fs, coverage_frontier,
+                       /*scale_by_exec_time=*/false);
+  ComputeFreq();
+  EXPECT_GT(freq[0], freq[1] + 100);
+  EXPECT_GT(freq[1], freq[2] + 100);
+}
+
 TEST(Corpus, ScalesWeightsWithExecTime) {
   PCTable pc_table(100);
   CFTable cf_table(100);
@@ -181,14 +308,16 @@ TEST(Corpus, ScalesWeightsWithExecTime) {
   };
 
   // The weights should be equal without exec time scaling.
-  corpus.UpdateWeights(fs, coverage_frontier, /*scale_by_exec_time=*/false);
+  corpus.UpdateWeights(fs, coverage_frontier,
+                       /*scale_by_exec_time=*/false);
   ComputeFreq();
   EXPECT_NEAR(freq[0], kNumIter / 3, 100);
   EXPECT_NEAR(freq[1], kNumIter / 3, 100);
   EXPECT_NEAR(freq[2], kNumIter / 3, 100);
 
   // The weights should favor {0} over {1} over {2} with exec time scaling.
-  corpus.UpdateWeights(fs, coverage_frontier, /*scale_by_exec_time=*/true);
+  corpus.UpdateWeights(fs, coverage_frontier,
+                       /*scale_by_exec_time=*/true);
   ComputeFreq();
   EXPECT_GT(freq[0], freq[1] + 100);
   EXPECT_GT(freq[1], freq[2] + 100);
@@ -208,6 +337,8 @@ TEST(Corpus, PruneCorpusWithAllEmptyFeatureInputs) {
                coverage_frontier);
   corpus.Add(/*data=*/{2}, /*fv=*/{}, /*metadata=*/{}, /*stats=*/{}, fs,
              coverage_frontier);
+  corpus.UpdateWeights(fs, coverage_frontier,
+                       /*scale_by_exec_time=*/false);
   // Should not crash.
   corpus.Prune(fs, coverage_frontier, max_corpus_size, rng);
 }
@@ -231,6 +362,8 @@ TEST(Corpus, PruneRegressionTest1) {
 
   Add({{1}, {10, 20}});
   Add({{2}, {10}});
+  corpus.UpdateWeights(fs, coverage_frontier,
+                       /*scale_by_exec_time=*/false);
   corpus.Prune(fs, coverage_frontier, max_corpus_size, rng);
 }
 
diff --git a/centipede/feature_set.cc b/centipede/feature_set.cc
index 6806b2f94..86b741d11 100644
--- a/centipede/feature_set.cc
+++ b/centipede/feature_set.cc
@@ -139,9 +139,8 @@ void FeatureSet::MergeFeatures(const FeatureVec& features) {
 }
 
 __attribute__((noinline))  // to see it in profile.
-uint64_t
-FeatureSet::ComputeWeight(const FeatureVec &features) const {
-  uint64_t weight = 0;
+double FeatureSet::ComputeRarityWeight(const FeatureVec& features) const {
+  double weight = 0;
   for (auto feature : features) {
     // The less frequent is the feature, the more valuable it is.
     // (frequency == 1) => (weight == 256)
diff --git a/centipede/feature_set.h b/centipede/feature_set.h
index 7e85dfef4..484d7e0d3 100644
--- a/centipede/feature_set.h
+++ b/centipede/feature_set.h
@@ -97,7 +97,7 @@ class FeatureSet {
   // Computes combined weight of `features`.
   // The less frequent the feature is, the bigger its weight.
   // The weight of a FeatureVec is a sum of individual feature weights.
-  uint64_t ComputeWeight(const FeatureVec &features) const;
+  double ComputeRarityWeight(const FeatureVec& features) const;
 
   // Returns a debug string representing the state of *this.
   std::string DebugString() const;
diff --git a/centipede/feature_set_test.cc b/centipede/feature_set_test.cc
index 9a81d2d73..8a796cdcb 100644
--- a/centipede/feature_set_test.cc
+++ b/centipede/feature_set_test.cc
@@ -27,8 +27,8 @@ namespace {
 TEST(FeatureSet, ComputeWeight) {
   FeatureSet feature_set(10, {});
 
-  auto W = [&](const FeatureVec &features) -> uint64_t {
-    return feature_set.ComputeWeight(features);
+  auto W = [&](const FeatureVec& features) {
+    return feature_set.ComputeRarityWeight(features);
   };
 
   feature_set.MergeFeatures({1, 2, 3});
@@ -60,8 +60,8 @@ TEST(FeatureSet, ComputeWeightWithDifferentDomains) {
                             /* three features from domain #3 */ f3, f3 + 1,
                             f3 + 2});
 
-  auto weight = [&](const FeatureVec &features) -> uint64_t {
-    return feature_set.ComputeWeight(features);
+  auto weight = [&](const FeatureVec& features) {
+    return feature_set.ComputeRarityWeight(features);
   };
 
   // Test that features from a less frequent domain have more weight.
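
For review convenience, the weighting scheme introduced in `Corpus::UpdateWeights` can be summarized with a minimal standalone sketch. `RecordInfo` and its fields are hypothetical stand-ins for the real `FeatureSet`/`CoverageFrontier` queries; only the control flow mirrors the patch.

  #include <cstddef>
  #include <vector>

  // Hypothetical, simplified view of one corpus record: whether it still has
  // features, its rarity weight (FeatureSet::ComputeRarityWeight), and its
  // frontier factor (ComputeFrontierWeight, always >= 1).
  struct RecordInfo {
    bool has_features;
    double rarity_weight;
    double frontier_factor;
  };

  enum class WeightMethod { Uniform, Recency, FeatureRarity };

  // Mirrors Corpus::UpdateWeights before exec-time scaling: a per-method base
  // weight, multiplied by the frontier factor; featureless records get 0.
  std::vector<double> ComputeWeights(const std::vector<RecordInfo>& records,
                                     WeightMethod method) {
    std::vector<double> weights(records.size());
    for (size_t i = 0; i < records.size(); ++i) {
      if (!records[i].has_features) {
        weights[i] = 0;
        continue;
      }
      double base_weight = 0;
      switch (method) {
        case WeightMethod::Uniform:
          base_weight = 1;  // Every active record is equally likely.
          break;
        case WeightMethod::Recency:
          base_weight = i + 1;  // Later (more recent) records weigh more.
          break;
        case WeightMethod::FeatureRarity:
          base_weight = records[i].rarity_weight;  // Rare features weigh more.
          break;
      }
      weights[i] = base_weight * records[i].frontier_factor;
    }
    return weights;
  }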
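
The context comments in feature_set.cc ("(frequency == 1) => (weight == 256)") suggest a per-feature rarity term inversely proportional to the feature's observed frequency, and the uint64_t-to-double change means fractional terms (e.g. 256/3) are no longer truncated. A one-line sketch of that term, assuming the 256/frequency form the comments imply:

  #include <cstdint>

  // Hypothetical per-feature rarity term implied by the comments:
  // frequency 1 -> 256, frequency 2 -> 128, frequency 3 -> ~85.3, ...
  double RarityTerm(uint64_t frequency) { return 256.0 / frequency; }

Per the header comment, the rarity weight of a FeatureVec is then the sum of these per-feature terms.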
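
At run time the method is selected via the new `corpus_weight_method` flag; an unknown value fails fast through the FUZZTEST_CHECK in Centipede's constructor. A hypothetical invocation (the target path is a placeholder):

  ./centipede --binary=./fuzz_target --corpus_weight_method=recency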