From a9a5cea691991da0cc47df97aed5c2aa9c8b2b30 Mon Sep 17 00:00:00 2001
From: Xinhao Yuan
Date: Wed, 5 Nov 2025 12:59:11 -0800
Subject: [PATCH] Add various weight computation methods.

PiperOrigin-RevId: 828592996
---
 centipede/centipede.cc        |  14 +++-
 centipede/centipede_flags.inc |   3 +
 centipede/corpus.cc           |  49 ++++++++++--
 centipede/corpus.h            |  21 ++++-
 centipede/corpus_test.cc      | 141 +++++++++++++++++++++++++++++++++-
 centipede/feature_set.cc      |   5 +-
 centipede/feature_set.h       |   2 +-
 centipede/feature_set_test.cc |   8 +-
 8 files changed, 216 insertions(+), 27 deletions(-)

diff --git a/centipede/centipede.cc b/centipede/centipede.cc
index 2c2070203..fd1175191 100644
--- a/centipede/centipede.cc
+++ b/centipede/centipede.cc
@@ -76,6 +76,7 @@
 #include "./centipede/centipede_callbacks.h"
 #include "./centipede/command.h"
 #include "./centipede/control_flow.h"
+#include "./centipede/corpus.h"
 #include "./centipede/corpus_io.h"
 #include "./centipede/coverage.h"
 #include "./centipede/environment.h"
@@ -98,14 +99,21 @@
 
 namespace fuzztest::internal {
 
-Centipede::Centipede(const Environment &env, CentipedeCallbacks &user_callbacks,
-                     const BinaryInfo &binary_info,
-                     CoverageLogger &coverage_logger, std::atomic<Stats> &stats)
+Centipede::Centipede(const Environment& env, CentipedeCallbacks& user_callbacks,
+                     const BinaryInfo& binary_info,
+                     CoverageLogger& coverage_logger, std::atomic<Stats>& stats)
     : env_(env),
       user_callbacks_(user_callbacks),
       rng_(env_.seed),
       // TODO(kcc): [impl] find a better way to compute frequency_threshold.
       fs_(env_.feature_frequency_threshold, env_.MakeDomainDiscardMask()),
+      corpus_([this] {
+        const auto parsed_weight_method =
+            Corpus::ParseWeightMethod(env_.corpus_weight_method);
+        FUZZTEST_CHECK(parsed_weight_method.has_value())
+            << "Unknown corpus weight method " << env_.corpus_weight_method;
+        return parsed_weight_method.value();
+      }()),
       coverage_frontier_(binary_info),
       binary_info_(binary_info),
       pc_table_(binary_info_.pc_table),
diff --git a/centipede/centipede_flags.inc b/centipede/centipede_flags.inc
index 572e56d93..1ab484b36 100644
--- a/centipede/centipede_flags.inc
+++ b/centipede/centipede_flags.inc
@@ -192,6 +192,9 @@ CENTIPEDE_FLAG(
     bool, use_corpus_weights, true,
     "If true, use weighted distribution when choosing the corpus element "
    "to mutate. This flag is mostly for Centipede developers.")
+CENTIPEDE_FLAG(std::string, corpus_weight_method, "feature_rarity",
+               "The weight method to use for the corpus. Available options "
+               "are `uniform`, `recency`, and `feature_rarity` (default).")
 CENTIPEDE_FLAG(
     bool, exec_time_weight_scaling, true,
     "If true, scale the corpus weight by the execution time of each input.")
diff --git a/centipede/corpus.cc b/centipede/corpus.cc
index c2746910a..f3bc0019d 100644
--- a/centipede/corpus.cc
+++ b/centipede/corpus.cc
@@ -17,6 +17,7 @@
 #include
 #include
 #include
+#include <optional>
 #include
 #include
 #include
@@ -45,13 +46,13 @@ namespace fuzztest::internal {
 // Corpus
 //------------------------------------------------------------------------------
 
-// Returns the weight of `fv` computed using `fs` and `coverage_frontier`.
-static size_t ComputeWeight(const FeatureVec &fv, const FeatureSet &fs,
-                            const CoverageFrontier &coverage_frontier) {
-  size_t weight = fs.ComputeWeight(fv);
+// Computes the frontier weight component of an input from its features `fv`
+// and the overall `coverage_frontier`.
+static size_t ComputeFrontierWeight(const FeatureVec& fv,
+                                    const CoverageFrontier& coverage_frontier) {
   // The following is checking for the cases where PCTable is not present. In
   // such cases, we cannot use any ControlFlow related features.
-  if (coverage_frontier.MaxPcIndex() == 0) return weight;
+  if (coverage_frontier.MaxPcIndex() == 0) return 1;
   size_t frontier_weights_sum = 0;
   for (const auto feature : fv) {
     if (!feature_domains::kPCs.Contains(feature)) continue;
@@ -63,7 +64,19 @@ static size_t ComputeWeight(const FeatureVec &fv, const FeatureSet &fs,
       frontier_weights_sum += coverage_frontier.FrontierWeight(pc_index);
     }
   }
-  return weight * (frontier_weights_sum + 1);  // Multiply by at least 1.
+  return frontier_weights_sum + 1;  // The returned factor is at least 1.
+}
+
+std::optional<Corpus::WeightMethod> Corpus::ParseWeightMethod(
+    std::string_view method_string) {
+  if (method_string == "uniform") {
+    return WeightMethod::Uniform;
+  } else if (method_string == "recency") {
+    return WeightMethod::Recency;
+  } else if (method_string == "feature_rarity") {
+    return WeightMethod::FeatureRarity;
+  }
+  return std::nullopt;
 }
 
 std::pair<size_t, size_t> Corpus::MaxAndAvgSize() const {
@@ -86,7 +99,26 @@ void Corpus::UpdateWeights(const FeatureSet& fs,
     auto& record = records_[i];
     const size_t unseen = fs.PruneFeaturesAndCountUnseen(record.features);
     FUZZTEST_CHECK_EQ(unseen, 0);
-    weights[i] = fs.ComputeWeight(record.features);
+    if (record.features.empty()) {
+      weights[i] = 0;
+      continue;
+    }
+    double base_weight = 0;
+    switch (method_) {
+      case WeightMethod::Uniform:
+        base_weight = 1;
+        break;
+      case WeightMethod::Recency:
+        base_weight = i + 1;
+        break;
+      case WeightMethod::FeatureRarity:
+        base_weight = fs.ComputeRarityWeight(record.features);
+        break;
+      default:
+        FUZZTEST_LOG(FATAL) << "Unknown corpus weight method";
+    }
+    weights[i] =
+        base_weight * ComputeFrontierWeight(record.features, coverage_frontier);
   }
   if (scale_by_exec_time) {
     double total_exec_time_usec = 0;
@@ -206,7 +238,8 @@ void Corpus::Add(const ByteArray& data, const FeatureVec& fv,
       << "Got request to add empty element to corpus: ignoring";
   FUZZTEST_CHECK_EQ(records_.size(), weighted_distribution_.size());
   records_.push_back({data, fv, metadata, stats});
-  weighted_distribution_.AddWeight(ComputeWeight(fv, fs, coverage_frontier));
+  // Placeholder; the actual weight is computed by `UpdateWeights`.
+  weighted_distribution_.AddWeight(0);
 }
 
 const CorpusRecord& Corpus::WeightedRandom(absl::BitGenRef rng) const {
diff --git a/centipede/corpus.h b/centipede/corpus.h
index 07164663e..1a1c4c1da 100644
--- a/centipede/corpus.h
+++ b/centipede/corpus.h
@@ -17,6 +17,7 @@
 #include
 #include
+#include <optional>
 #include
 #include
 #include
 
@@ -98,7 +99,17 @@ struct CorpusRecord {
 // Allows to prune (forget) inputs that become uninteresting.
 class Corpus {
  public:
-  Corpus() = default;
+  enum class WeightMethod {
+    Uniform,
+    Recency,
+    FeatureRarity,
+  };
+
+  static std::optional<WeightMethod> ParseWeightMethod(
+      std::string_view method_string);
+
+  Corpus() : Corpus(WeightMethod::FeatureRarity) {}
+  explicit Corpus(WeightMethod method) : method_(method) {}
 
   Corpus(const Corpus &) = default;
   Corpus(Corpus &&) noexcept = default;
@@ -120,9 +131,9 @@ class Corpus {
   // Returns the number of removed elements.
   size_t Prune(const FeatureSet &fs, const CoverageFrontier &coverage_frontier,
                size_t max_corpus_size, Rng &rng);
-  // Updates the corpus weights according to `fs` and `coverage_frontier`. If
-  // `scale_by_exec_time` is set, scales the weights by the corpus execution
-  // time relative to the average.
+  // Updates the corpus weights according to `fs` and `coverage_frontier`,
+  // using the configured weight method. If `scale_by_exec_time` is set,
+  // scales the weights by the corpus execution time relative to the average.
   void UpdateWeights(const FeatureSet& fs,
                      const CoverageFrontier& coverage_frontier,
                      bool scale_by_exec_time);
@@ -164,6 +175,8 @@ class Corpus {
   // Maintains weights for elements of records_.
   WeightedDistribution weighted_distribution_;
   size_t num_pruned_ = 0;
+  // Method for weighting the corpus elements.
+  WeightMethod method_;
 };
 
 // Coverage frontier is a set of PCs that are themselves covered, but some of
diff --git a/centipede/corpus_test.cc b/centipede/corpus_test.cc
index f816f46ea..4b47d5c6d 100644
--- a/centipede/corpus_test.cc
+++ b/centipede/corpus_test.cc
@@ -114,7 +114,8 @@ TEST(Corpus, Prune) {
   Add({{2}, {30, 40}});
   Add({{3}, {40, 50}});
   Add({{4}, {10, 20}});
-  corpus.UpdateWeights(fs, coverage_frontier, /*scale_by_exec_time=*/false);
+  corpus.UpdateWeights(fs, coverage_frontier,
+                       /*scale_by_exec_time=*/false);
 
   // Prune. Features 20 and 40 are frequent => input {0} will be removed.
   EXPECT_EQ(corpus.NumActive(), 5);
@@ -124,7 +125,8 @@ TEST(Corpus, Prune) {
   VerifyActiveInputs({{1}, {2}, {3}, {4}});
 
   Add({{5}, {30, 60}});
-  corpus.UpdateWeights(fs, coverage_frontier, /*scale_by_exec_time=*/false);
+  corpus.UpdateWeights(fs, coverage_frontier,
+                       /*scale_by_exec_time=*/false);
   EXPECT_EQ(corpus.NumTotal(), 6);
 
   // Prune. Feature 30 is now frequent => inputs {1} and {2} will be removed.
@@ -145,6 +147,131 @@ TEST(Corpus, Prune) {
   EXPECT_EQ(corpus.NumTotal(), 6);
 }
 
+TEST(Corpus, UniformWeightMethodWorksAsExpected) {
+  PCTable pc_table(100);
+  CFTable cf_table(100);
+  BinaryInfo bin_info{pc_table, {}, cf_table, {}, {}, {}};
+  CoverageFrontier coverage_frontier(bin_info);
+  FeatureSet fs(3, {});
+  Corpus corpus(Corpus::WeightMethod::Uniform);
+
+  auto Add = [&](const CorpusRecord& record) {
+    fs.MergeFeatures(record.features);
+    corpus.Add(record.data, record.features, /*metadata=*/{}, /*stats=*/{}, fs,
+               coverage_frontier);
+  };
+
+  Add({/*data=*/{0}, /*features=*/{30, 20}});
+  Add({/*data=*/{1}, /*features=*/{10, 20}});
+  Add({/*data=*/{2}, /*features=*/{10}});
+
+  constexpr int kNumIter = 10000;
+  std::vector<size_t> freq;
+
+  Rng rng;
+  auto ComputeFreq = [&]() {
+    freq.clear();
+    freq.resize(corpus.NumActive());
+    for (int i = 0; i < kNumIter; i++) {
+      const auto& record = corpus.WeightedRandom(rng);
+      const auto id = record.data[0];
+      ASSERT_LT(id, freq.size());
+      freq[id]++;
+    }
+  };
+
+  // The weights should be equal with the uniform method.
+  corpus.UpdateWeights(fs, coverage_frontier,
+                       /*scale_by_exec_time=*/false);
+  ComputeFreq();
+  EXPECT_NEAR(freq[0], kNumIter / 3, 100);
+  EXPECT_NEAR(freq[1], kNumIter / 3, 100);
+  EXPECT_NEAR(freq[2], kNumIter / 3, 100);
+}
+
+TEST(Corpus, RecencyWeightMethodWorksAsExpected) {
+  PCTable pc_table(100);
+  CFTable cf_table(100);
+  BinaryInfo bin_info{pc_table, {}, cf_table, {}, {}, {}};
+  CoverageFrontier coverage_frontier(bin_info);
+  FeatureSet fs(3, {});
+  Corpus corpus(Corpus::WeightMethod::Recency);
+
+  auto Add = [&](const CorpusRecord& record) {
+    fs.MergeFeatures(record.features);
+    corpus.Add(record.data, record.features, /*metadata=*/{}, /*stats=*/{}, fs,
+               coverage_frontier);
+  };
+
+  Add({/*data=*/{0}, /*features=*/{30, 20}});
+  Add({/*data=*/{1}, /*features=*/{10, 20}});
+  Add({/*data=*/{2}, /*features=*/{10}});
+
+  constexpr int kNumIter = 10000;
+  std::vector<size_t> freq;
+
+  Rng rng;
+  auto ComputeFreq = [&]() {
+    freq.clear();
+    freq.resize(corpus.NumActive());
+    for (int i = 0; i < kNumIter; i++) {
+      const auto& record = corpus.WeightedRandom(rng);
+      const auto id = record.data[0];
+      ASSERT_LT(id, freq.size());
+      freq[id]++;
+    }
+  };
+
+  // The weights should favor {2} over {1} over {0} with the recency method.
+  corpus.UpdateWeights(fs, coverage_frontier,
+                       /*scale_by_exec_time=*/false);
+  ComputeFreq();
+  EXPECT_GT(freq[2], freq[1] + 100);
+  EXPECT_GT(freq[1], freq[0] + 100);
+}
+
+TEST(Corpus, FeatureRarityWeightMethodWorksAsExpected) {
+  PCTable pc_table(100);
+  CFTable cf_table(100);
+  BinaryInfo bin_info{pc_table, {}, cf_table, {}, {}, {}};
+  CoverageFrontier coverage_frontier(bin_info);
+  FeatureSet fs(3, {});
+  Corpus corpus(Corpus::WeightMethod::FeatureRarity);
+
+  auto Add = [&](const CorpusRecord& record) {
+    fs.MergeFeatures(record.features);
+    corpus.Add(record.data, record.features, /*metadata=*/{}, /*stats=*/{}, fs,
+               coverage_frontier);
+  };
+
+  Add({/*data=*/{0}, /*features=*/{30, 20}});
+  Add({/*data=*/{1}, /*features=*/{10, 20}});
+  Add({/*data=*/{2}, /*features=*/{10}});
+
+  constexpr int kNumIter = 10000;
+  std::vector<size_t> freq;
+
+  Rng rng;
+  auto ComputeFreq = [&]() {
+    freq.clear();
+    freq.resize(corpus.NumActive());
+    for (int i = 0; i < kNumIter; i++) {
+      const auto& record = corpus.WeightedRandom(rng);
+      const auto id = record.data[0];
+      ASSERT_LT(id, freq.size());
+      freq[id]++;
+    }
+  };
+
+  // The weights should favor {0} over {1} over {2} with the feature rarity
+  // method.
+  corpus.UpdateWeights(fs, coverage_frontier,
+                       /*scale_by_exec_time=*/false);
+  ComputeFreq();
+  EXPECT_GT(freq[0], freq[1] + 100);
+  EXPECT_GT(freq[1], freq[2] + 100);
+}
+
 TEST(Corpus, ScalesWeightsWithExecTime) {
   PCTable pc_table(100);
   CFTable cf_table(100);
@@ -181,14 +308,16 @@ TEST(Corpus, ScalesWeightsWithExecTime) {
   };
 
   // The weights should be equal without exec time scaling.
-  corpus.UpdateWeights(fs, coverage_frontier, /*scale_by_exec_time=*/false);
+  corpus.UpdateWeights(fs, coverage_frontier,
+                       /*scale_by_exec_time=*/false);
   ComputeFreq();
   EXPECT_NEAR(freq[0], kNumIter / 3, 100);
   EXPECT_NEAR(freq[1], kNumIter / 3, 100);
   EXPECT_NEAR(freq[2], kNumIter / 3, 100);
 
   // The weights should favor {0} over {1} over {2} with exec time scaling.
-  corpus.UpdateWeights(fs, coverage_frontier, /*scale_by_exec_time=*/true);
+  corpus.UpdateWeights(fs, coverage_frontier,
+                       /*scale_by_exec_time=*/true);
   ComputeFreq();
   EXPECT_GT(freq[0], freq[1] + 100);
   EXPECT_GT(freq[1], freq[2] + 100);
@@ -208,6 +337,8 @@ TEST(Corpus, PruneCorpusWithAllEmptyFeatureInputs) {
                coverage_frontier);
   corpus.Add(/*data=*/{2}, /*fv=*/{}, /*metadata=*/{}, /*stats=*/{}, fs,
              coverage_frontier);
+  corpus.UpdateWeights(fs, coverage_frontier,
+                       /*scale_by_exec_time=*/false);
   // Should not crash.
   corpus.Prune(fs, coverage_frontier, max_corpus_size, rng);
 }
@@ -231,6 +362,8 @@ TEST(Corpus, PruneRegressionTest1) {
 
   Add({{1}, {10, 20}});
   Add({{2}, {10}});
+  corpus.UpdateWeights(fs, coverage_frontier,
+                       /*scale_by_exec_time=*/false);
   corpus.Prune(fs, coverage_frontier, max_corpus_size, rng);
 }
 
diff --git a/centipede/feature_set.cc b/centipede/feature_set.cc
index 6806b2f94..86b741d11 100644
--- a/centipede/feature_set.cc
+++ b/centipede/feature_set.cc
@@ -139,9 +139,8 @@ void FeatureSet::MergeFeatures(const FeatureVec& features) {
 }
 
 __attribute__((noinline))  // to see it in profile.
-uint64_t
-FeatureSet::ComputeWeight(const FeatureVec &features) const {
-  uint64_t weight = 0;
+double FeatureSet::ComputeRarityWeight(const FeatureVec& features) const {
+  double weight = 0;
   for (auto feature : features) {
     // The less frequent is the feature, the more valuable it is.
     // (frequency == 1) => (weight == 256)
diff --git a/centipede/feature_set.h b/centipede/feature_set.h
index 7e85dfef4..484d7e0d3 100644
--- a/centipede/feature_set.h
+++ b/centipede/feature_set.h
@@ -97,7 +97,7 @@ class FeatureSet {
   // Computes combined weight of `features`.
   // The less frequent the feature is, the bigger its weight.
   // The weight of a FeatureVec is a sum of individual feature weights.
-  uint64_t ComputeWeight(const FeatureVec &features) const;
+  double ComputeRarityWeight(const FeatureVec& features) const;
 
   // Returns a debug string representing the state of *this.
   std::string DebugString() const;
diff --git a/centipede/feature_set_test.cc b/centipede/feature_set_test.cc
index 9a81d2d73..8a796cdcb 100644
--- a/centipede/feature_set_test.cc
+++ b/centipede/feature_set_test.cc
@@ -27,8 +27,8 @@ namespace {
 TEST(FeatureSet, ComputeWeight) {
   FeatureSet feature_set(10, {});
 
-  auto W = [&](const FeatureVec &features) -> uint64_t {
-    return feature_set.ComputeWeight(features);
+  auto W = [&](const FeatureVec& features) {
+    return feature_set.ComputeRarityWeight(features);
   };
 
   feature_set.MergeFeatures({1, 2, 3});
@@ -60,8 +60,8 @@ TEST(FeatureSet, ComputeWeightWithDifferentDomains) {
                             /* three features from domain #3 */ f3, f3 + 1,
                             f3 + 2});
 
-  auto weight = [&](const FeatureVec &features) -> uint64_t {
-    return feature_set.ComputeWeight(features);
+  auto weight = [&](const FeatureVec& features) {
+    return feature_set.ComputeRarityWeight(features);
   };
 
   // Test that features from a less frequent domain have more weight.
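
For review convenience, the weighting scheme introduced in `Corpus::UpdateWeights` can be summarized with a minimal standalone sketch. `RecordInfo` and its fields are hypothetical stand-ins for the real `FeatureSet`/`CoverageFrontier` queries; only the control flow mirrors the patch.

  #include <cstddef>
  #include <vector>

  // Hypothetical, simplified view of one corpus record: whether it still has
  // features, its rarity weight (FeatureSet::ComputeRarityWeight), and its
  // frontier factor (ComputeFrontierWeight, always >= 1).
  struct RecordInfo {
    bool has_features;
    double rarity_weight;
    double frontier_factor;
  };

  enum class WeightMethod { Uniform, Recency, FeatureRarity };

  // Mirrors Corpus::UpdateWeights before exec-time scaling: a per-method base
  // weight, multiplied by the frontier factor; featureless records get 0.
  std::vector<double> ComputeWeights(const std::vector<RecordInfo>& records,
                                     WeightMethod method) {
    std::vector<double> weights(records.size());
    for (size_t i = 0; i < records.size(); ++i) {
      if (!records[i].has_features) {
        weights[i] = 0;
        continue;
      }
      double base_weight = 0;
      switch (method) {
        case WeightMethod::Uniform:
          base_weight = 1;  // Every active record is equally likely.
          break;
        case WeightMethod::Recency:
          base_weight = i + 1;  // Later (more recent) records weigh more.
          break;
        case WeightMethod::FeatureRarity:
          base_weight = records[i].rarity_weight;  // Rare features weigh more.
          break;
      }
      weights[i] = base_weight * records[i].frontier_factor;
    }
    return weights;
  }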
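
The context comments in feature_set.cc ("(frequency == 1) => (weight == 256)") suggest a per-feature rarity term inversely proportional to the feature's observed frequency, and the uint64_t-to-double change means fractional terms (e.g. 256/3) are no longer truncated. A one-line sketch of that term, assuming the 256/frequency form the comments imply:

  #include <cstdint>

  // Hypothetical per-feature rarity term implied by the comments:
  // frequency 1 -> 256, frequency 2 -> 128, frequency 3 -> ~85.3, ...
  double RarityTerm(uint64_t frequency) { return 256.0 / frequency; }

Per the header comment, the rarity weight of a FeatureVec is then the sum of these per-feature terms.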
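
At run time the method is selected via the new `corpus_weight_method` flag; an unknown value fails fast through the FUZZTEST_CHECK in Centipede's constructor. A hypothetical invocation (the target path is a placeholder):

  ./centipede --binary=./fuzz_target --corpus_weight_method=recency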