diff --git a/crates/driver/src/domain/competition/bad_orders/metrics.rs b/crates/driver/src/domain/competition/bad_orders/metrics.rs new file mode 100644 index 0000000000..56753cc939 --- /dev/null +++ b/crates/driver/src/domain/competition/bad_orders/metrics.rs @@ -0,0 +1,205 @@ +use { + super::Quality, + crate::{ + domain::competition::order, + infra::{observe::metrics, solver}, + }, + dashmap::DashMap, + std::{ + sync::Arc, + time::{Duration, Instant}, + }, +}; + +#[derive(Default, Debug)] +struct OrderStatistics { + attempts: u32, + fails: u32, + flagged_unsupported_at: Option, +} + +/// Monitors orders to determine whether they are considered "unsupported" based +/// on the ratio of failing to total settlement encoding attempts. An order must +/// have participated in at least `REQUIRED_MEASUREMENTS` settlement attempts to +/// be evaluated. If, at that point, the ratio of failures is greater than or +/// equal to `FAILURE_RATIO`, the order is considered unsupported. +/// +/// This detector tracks settlement simulation failures at the order level +/// rather than the token level, avoiding the problem of banning good tokens due +/// to solver-specific issues or bad solutions. +#[derive(Clone)] +pub struct Detector { + failure_ratio: f64, + required_measurements: u32, + counter: Arc>, + log_only: bool, + order_freeze_time: Duration, + solver: solver::Name, +} + +impl Detector { + pub fn new( + failure_ratio: f64, + required_measurements: u32, + log_only: bool, + order_freeze_time: Duration, + solver: solver::Name, + ) -> Self { + Self { + failure_ratio, + required_measurements, + counter: Default::default(), + log_only, + order_freeze_time, + solver, + } + } + + pub fn get_quality(&self, uid: &order::Uid, now: Instant) -> Quality { + let Some(stats) = self.counter.get(uid) else { + return Quality::Unknown; + }; + + if stats + .flagged_unsupported_at + .is_some_and(|t| now.duration_since(t) > self.order_freeze_time) + { + // Sometimes orders only cause issues temporarily (e.g., insufficient balance + // that gets topped up later). If the order's freeze period expired we pretend + // we don't have enough information to give it another chance. If it still + // behaves badly it will get frozen immediately. + return Quality::Unknown; + } + + match self.log_only { + true => Quality::Supported, + false => self.quality_based_on_stats(&stats), + } + } + + fn quality_based_on_stats(&self, stats: &OrderStatistics) -> Quality { + if stats.attempts < self.required_measurements { + return Quality::Unknown; + } + let order_failure_ratio = f64::from(stats.fails) / f64::from(stats.attempts); + match order_failure_ratio >= self.failure_ratio { + true => Quality::Unsupported, + false => Quality::Supported, + } + } + + /// Updates the orders that participated in settlements by + /// incrementing their attempt count. + /// `failure` indicates whether the settlement encoding/simulation was + /// successful or not. + pub fn update_orders(&self, order_uids: &[order::Uid], failure: bool) { + let now = Instant::now(); + let mut new_unsupported_orders = vec![]; + + for uid in order_uids { + let mut stats = self + .counter + .entry(*uid) + .and_modify(|counter| { + counter.attempts += 1; + counter.fails += u32::from(failure); + }) + .or_insert_with(|| OrderStatistics { + attempts: 1, + fails: u32::from(failure), + flagged_unsupported_at: None, + }); + + // order needs to be frozen as unsupported for a while + if self.quality_based_on_stats(&stats) == Quality::Unsupported + && stats + .flagged_unsupported_at + .is_none_or(|t| now.duration_since(t) > self.order_freeze_time) + { + new_unsupported_orders.push(*uid); + stats.flagged_unsupported_at = Some(now); + } + } + + if !new_unsupported_orders.is_empty() { + tracing::debug!( + orders = ?new_unsupported_orders, + "mark orders as unsupported" + ); + metrics::get() + .bad_orders_detected + .with_label_values(&[&self.solver.0, "metrics"]) + .inc_by(new_unsupported_orders.len() as u64); + } + } +} + +#[cfg(test)] +mod tests { + use {super::*, crate::util::Bytes}; + + fn test_uid(value: u8) -> order::Uid { + order::Uid(Bytes([value; 56])) + } + + /// Tests that an order only gets marked temporarily as unsupported. + /// After the freeze period it will be allowed again. + #[tokio::test] + async fn unfreeze_bad_orders() { + const FREEZE_DURATION: Duration = Duration::from_millis(50); + let detector = Detector::new( + 0.5, + 2, + false, + FREEZE_DURATION, + solver::Name("mysolver".to_string()), + ); + + let order_a = test_uid(1); + let order_b = test_uid(2); + let order_quality = || detector.get_quality(&order_a, Instant::now()); + + // order is reported as unknown while we don't have enough measurements + assert_eq!(order_quality(), Quality::Unknown); + detector.update_orders(&[order_a, order_b], true); + assert_eq!(order_quality(), Quality::Unknown); + detector.update_orders(&[order_a, order_b], true); + + // after we got enough measurements the order gets marked as bad + assert_eq!(order_quality(), Quality::Unsupported); + + // after the freeze period is over the order gets reported as unknown again + tokio::time::sleep(FREEZE_DURATION).await; + assert_eq!(order_quality(), Quality::Unknown); + + // after an unfreeze another bad measurement is enough to freeze it again + detector.update_orders(&[order_a, order_b], true); + assert_eq!(order_quality(), Quality::Unsupported); + } + + #[test] + fn different_orders_tracked_independently() { + let detector = Detector::new( + 0.5, + 2, + false, + Duration::from_secs(60), + solver::Name("mysolver".to_string()), + ); + + let order_a = test_uid(1); + let order_b = test_uid(2); + + // order_a fails twice + detector.update_orders(&[order_a], true); + detector.update_orders(&[order_a], true); + + // order_b succeeds twice + detector.update_orders(&[order_b], false); + detector.update_orders(&[order_b], false); + + let now = Instant::now(); + assert_eq!(detector.get_quality(&order_a, now), Quality::Unsupported); + assert_eq!(detector.get_quality(&order_b, now), Quality::Supported); + } +} diff --git a/crates/driver/src/domain/competition/bad_orders/mod.rs b/crates/driver/src/domain/competition/bad_orders/mod.rs new file mode 100644 index 0000000000..08f89f1ab0 --- /dev/null +++ b/crates/driver/src/domain/competition/bad_orders/mod.rs @@ -0,0 +1,99 @@ +use { + crate::domain::competition::{Auction, order}, + std::fmt, +}; + +pub mod metrics; + +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub enum Quality { + /// Order is likely to produce working solutions when included. + Supported, + /// Order will likely produce failing solutions when included. + /// This can have many reasons: + /// * order-specific issues (bad pre/post interactions, signature problems) + /// * insufficient balance or approval + /// * order targeting problematic tokens + /// * malicious or buggy order parameters + Unsupported, + /// The detection strategy does not have enough data to make an informed + /// decision. + Unknown, +} + +#[derive(Default)] +pub struct Detector { + metrics: Option, +} + +impl Detector { + /// Creates a new detector without any detection mechanisms enabled. + pub fn new() -> Self { + Self { metrics: None } + } + + /// Enables detection of unsupported orders based on settlement simulation + /// failure heuristics. + pub fn with_metrics_detector(&mut self, detector: metrics::Detector) -> &mut Self { + self.metrics = Some(detector); + self + } + + /// Removes all unsupported orders from the auction. + pub fn filter_unsupported_orders_in_auction(&self, mut auction: Auction) -> Auction { + let now = std::time::Instant::now(); + + // reuse the original allocation + let all_orders = std::mem::take(&mut auction.orders); + let mut removed_uids = Vec::new(); + + let supported_orders: Vec<_> = all_orders + .into_iter() + .filter_map(|order| { + let quality = self.get_order_quality(&order.uid, now); + match quality { + Quality::Supported | Quality::Unknown => Some(order), + Quality::Unsupported => { + removed_uids.push(order.uid); + None + } + } + }) + .collect(); + + auction.orders = supported_orders; + if !removed_uids.is_empty() { + tracing::debug!(orders = ?removed_uids, "ignored orders flagged as unsupported"); + } + + auction + } + + /// Updates the order quality metric for successful settlements. + pub fn encoding_succeeded(&self, order_uids: &[order::Uid]) { + if let Some(metrics) = &self.metrics { + metrics.update_orders(order_uids, false); + } + } + + /// Updates the order quality metric for failed settlements. + pub fn encoding_failed(&self, order_uids: &[order::Uid]) { + if let Some(metrics) = &self.metrics { + metrics.update_orders(order_uids, true); + } + } + + fn get_order_quality(&self, uid: &order::Uid, now: std::time::Instant) -> Quality { + if let Some(Quality::Unsupported) = self.metrics.as_ref().map(|m| m.get_quality(uid, now)) { + return Quality::Unsupported; + } + + Quality::Unknown + } +} + +impl fmt::Debug for Detector { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("Detector").finish() + } +} diff --git a/crates/driver/src/domain/competition/mod.rs b/crates/driver/src/domain/competition/mod.rs index 1fb751b9ca..37eef1713a 100644 --- a/crates/driver/src/domain/competition/mod.rs +++ b/crates/driver/src/domain/competition/mod.rs @@ -39,6 +39,7 @@ use { }; pub mod auction; +pub mod bad_orders; pub mod bad_tokens; pub mod order; mod pre_processing; @@ -68,6 +69,7 @@ pub struct Competition { /// Cached solutions with the most recent solutions at the front. pub settlements: Mutex>, pub bad_tokens: Arc, + pub bad_orders: Arc, fetcher: Arc, settle_queue: mpsc::Sender, order_sorting_strategies: Vec>, @@ -83,6 +85,7 @@ impl Competition { simulator: Simulator, mempools: Mempools, bad_tokens: Arc, + bad_orders: Arc, fetcher: Arc, order_sorting_strategies: Vec>, ) -> Arc { @@ -98,6 +101,7 @@ impl Competition { settlements: Default::default(), settle_queue: settle_sender, bad_tokens, + bad_orders, fetcher, order_sorting_strategies, }); @@ -242,6 +246,7 @@ impl Competition { .map(|solution| async move { let id = solution.id().clone(); let token_pairs = solution.token_pairs(); + let order_uids = solution.order_uids(); observe::encoding(&id); let settlement = solution .encode( @@ -251,19 +256,21 @@ impl Competition { self.solver.solver_native_token(), ) .await; - (id, token_pairs, settlement) + (id, token_pairs, order_uids, settlement) }) .collect::>() - .filter_map(|(id, token_pairs, result)| async move { + .filter_map(|(id, token_pairs, order_uids, result)| async move { match result { Ok(solution) => { self.bad_tokens.encoding_succeeded(&token_pairs); + self.bad_orders.encoding_succeeded(&order_uids); Some(solution) } // don't report on errors coming from solution merging Err(_err) if id.solutions().len() > 1 => None, Err(err) => { self.bad_tokens.encoding_failed(&token_pairs); + self.bad_orders.encoding_failed(&order_uids); observe::encoding_failed(self.solver.name(), &id, &err); notify::encoding_failed(&self.solver, auction.id(), &id, &err); None @@ -767,9 +774,16 @@ impl Competition { if !self.solver.config().flashloans_enabled { auction.orders.retain(|o| o.app_data.flashloan().is_none()); } - self.bad_tokens + // First filter by bad tokens (simulation-based detection) + auction = self + .bad_tokens .filter_unsupported_orders_in_auction(auction) - .await + .await; + // Then filter by bad orders (metrics-based detection) + auction = self + .bad_orders + .filter_unsupported_orders_in_auction(auction); + auction } } diff --git a/crates/driver/src/domain/competition/solution/mod.rs b/crates/driver/src/domain/competition/solution/mod.rs index 327dfcc808..c9acb4e0bf 100644 --- a/crates/driver/src/domain/competition/solution/mod.rs +++ b/crates/driver/src/domain/competition/solution/mod.rs @@ -199,6 +199,17 @@ impl Solution { .collect() } + /// Returns all the order UIDs involved in the solution. + pub fn order_uids(&self) -> Vec { + self.trades + .iter() + .map(|trade| match trade { + Trade::Fulfillment(fulfillment) => fulfillment.order().uid, + Trade::Jit(jit) => jit.order().uid, + }) + .collect() + } + /// Interactions executed by this solution. pub fn interactions(&self) -> &[Interaction] { &self.interactions diff --git a/crates/driver/src/infra/api/mod.rs b/crates/driver/src/infra/api/mod.rs index fbfa63ade5..4ab5c55aab 100644 --- a/crates/driver/src/infra/api/mod.rs +++ b/crates/driver/src/infra/api/mod.rs @@ -3,7 +3,7 @@ use { domain::{ self, Mempools, - competition::{bad_tokens, order::app_data::AppDataRetriever, sorting}, + competition::{bad_orders, bad_tokens, order::app_data::AppDataRetriever, sorting}, }, infra::{ self, @@ -111,6 +111,18 @@ impl Api { )); } + let bad_order_config = solver.bad_order_detection(); + let mut bad_orders = bad_orders::Detector::new(); + if bad_order_config.enable_metrics_strategy { + bad_orders.with_metrics_detector(bad_orders::metrics::Detector::new( + bad_order_config.metrics_strategy_failure_ratio, + bad_order_config.metrics_strategy_required_measurements, + bad_order_config.metrics_strategy_log_only, + bad_order_config.metrics_strategy_order_freeze_time, + name.clone(), + )); + } + let router = router.with_state(State(Arc::new(Inner { eth: self.eth.clone(), solver: solver.clone(), @@ -122,6 +134,7 @@ impl Api { self.simulator.clone(), self.mempools.clone(), Arc::new(bad_tokens), + Arc::new(bad_orders), fetcher.clone(), order_sorting_strategies.clone(), ), diff --git a/crates/driver/src/infra/config/file/load.rs b/crates/driver/src/infra/config/file/load.rs index ffd20f214e..53b265239d 100644 --- a/crates/driver/src/infra/config/file/load.rs +++ b/crates/driver/src/infra/config/file/load.rs @@ -9,7 +9,7 @@ use { mempool, notify, simulator, - solver::{self, BadTokenDetection, SolutionMerging}, + solver::{self, BadOrderDetection, BadTokenDetection, SolutionMerging}, }, }, chain::Chain, @@ -136,6 +136,23 @@ pub async fn load(chain: Chain, path: &Path) -> infra::Config { .bad_token_detection .metrics_strategy_token_freeze_time, }, + bad_order_detection: BadOrderDetection { + enable_metrics_strategy: solver_config + .bad_order_detection + .enable_metrics_strategy, + metrics_strategy_failure_ratio: solver_config + .bad_order_detection + .metrics_strategy_failure_ratio, + metrics_strategy_required_measurements: solver_config + .bad_order_detection + .metrics_strategy_required_measurements, + metrics_strategy_log_only: solver_config + .bad_order_detection + .metrics_strategy_log_only, + metrics_strategy_order_freeze_time: solver_config + .bad_order_detection + .metrics_strategy_order_freeze_time, + }, settle_queue_size: solver_config.settle_queue_size, flashloans_enabled: config.flashloans_enabled, fetch_liquidity_at_block: match config.liquidity.fetch_at_block { diff --git a/crates/driver/src/infra/config/file/mod.rs b/crates/driver/src/infra/config/file/mod.rs index 109113f6d1..7b678292da 100644 --- a/crates/driver/src/infra/config/file/mod.rs +++ b/crates/driver/src/infra/config/file/mod.rs @@ -314,6 +314,10 @@ struct SolverConfig { #[serde(default, flatten)] bad_token_detection: BadTokenDetectionConfig, + /// Configuration for bad order detection. + #[serde(default, flatten)] + bad_order_detection: BadOrderDetectionConfig, + /// The maximum number of `/settle` requests that can be queued up /// before the driver starts dropping new `/solve` requests. #[serde(default = "default_settle_queue_size")] @@ -883,6 +887,54 @@ pub struct BadTokenDetectionConfig { pub metrics_strategy_token_freeze_time: Duration, } +#[derive(Clone, Debug, Deserialize)] +#[serde(rename_all = "kebab-case", deny_unknown_fields)] +pub struct BadOrderDetectionConfig { + /// Whether the solver opted into detecting problematic + /// orders with metrics-based detection. + #[serde(default, rename = "enable-metrics-bad-order-detection")] + pub enable_metrics_strategy: bool, + + /// The ratio of failures to attempts that qualifies an order as + /// problematic. + #[serde( + default = "default_metrics_bad_order_detector_failure_ratio", + rename = "metrics-bad-order-detection-failure-ratio" + )] + pub metrics_strategy_failure_ratio: f64, + + /// The minimum number of attempts required before evaluating an order's + /// quality. + #[serde( + default = "default_metrics_bad_order_detector_required_measurements", + rename = "metrics-bad-order-detection-required-measurements" + )] + pub metrics_strategy_required_measurements: u32, + + /// Controls whether the metrics based detection strategy should only log + /// problematic orders or actually filter them out. + #[serde( + default = "default_metrics_bad_order_detector_log_only", + rename = "metrics-bad-order-detection-log-only" + )] + pub metrics_strategy_log_only: bool, + + /// How long the metrics based bad order detection should flag an order as + /// problematic before it allows to solve for that order again. + #[serde( + default = "default_metrics_bad_order_detector_freeze_time", + rename = "metrics-bad-order-detection-order-freeze-time", + with = "humantime_serde" + )] + pub metrics_strategy_order_freeze_time: Duration, +} + +impl Default for BadOrderDetectionConfig { + fn default() -> Self { + serde_json::from_str("{}").expect("BadOrderDetectionConfig uses default values") + } +} + impl Default for BadTokenDetectionConfig { fn default() -> Self { serde_json::from_str("{}").expect("MetricsBadTokenDetectorConfig uses default values") @@ -959,6 +1011,22 @@ fn default_metrics_bad_token_detector_freeze_time() -> Duration { Duration::from_secs(60 * 10) } +fn default_metrics_bad_order_detector_failure_ratio() -> f64 { + 0.9 +} + +fn default_metrics_bad_order_detector_required_measurements() -> u32 { + 5 +} + +fn default_metrics_bad_order_detector_log_only() -> bool { + false +} + +fn default_metrics_bad_order_detector_freeze_time() -> Duration { + Duration::from_secs(60 * 60) // 1 hour +} + /// According to statistics, the average size of the app-data is ~800 bytes. /// With this default, the approximate size of the cache will be ~1.6 MB. fn default_app_data_cache_size() -> u64 { diff --git a/crates/driver/src/infra/observe/metrics.rs b/crates/driver/src/infra/observe/metrics.rs index 6098c9bf2f..ccde93e2f0 100644 --- a/crates/driver/src/infra/observe/metrics.rs +++ b/crates/driver/src/infra/observe/metrics.rs @@ -28,6 +28,9 @@ pub struct Metrics { /// How many tokens detected by specific solver and strategy. #[metric(labels("solver", "strategy"))] pub bad_tokens_detected: prometheus::IntCounterVec, + /// How many orders detected as problematic by specific solver and strategy. + #[metric(labels("solver", "strategy"))] + pub bad_orders_detected: prometheus::IntCounterVec, /// Time spent in the auction preprocessing stage. #[metric( labels("stage"), diff --git a/crates/driver/src/infra/solver/mod.rs b/crates/driver/src/infra/solver/mod.rs index 3c526f2cfa..1385ba256b 100644 --- a/crates/driver/src/infra/solver/mod.rs +++ b/crates/driver/src/infra/solver/mod.rs @@ -131,6 +131,7 @@ pub struct Config { pub quote_tx_origin: Option, pub response_size_limit_max_bytes: usize, pub bad_token_detection: BadTokenDetection, + pub bad_order_detection: BadOrderDetection, /// Max size of the pending settlements queue. pub settle_queue_size: usize, /// Whether flashloan hints should be sent to the solver. @@ -170,6 +171,10 @@ impl Solver { &self.config.bad_token_detection } + pub fn bad_order_detection(&self) -> &BadOrderDetection { + &self.config.bad_order_detection + } + pub fn persistence(&self) -> Persistence { self.persistence.clone() } @@ -436,3 +441,12 @@ pub struct BadTokenDetection { pub metrics_strategy_log_only: bool, pub metrics_strategy_token_freeze_time: Duration, } + +#[derive(Debug, Clone)] +pub struct BadOrderDetection { + pub enable_metrics_strategy: bool, + pub metrics_strategy_failure_ratio: f64, + pub metrics_strategy_required_measurements: u32, + pub metrics_strategy_log_only: bool, + pub metrics_strategy_order_freeze_time: Duration, +}