21 changes: 21 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions Cargo.toml
@@ -79,6 +79,7 @@ log = "0.4"
env_logger = { version = "0.11", optional = true }
chrono = {version = "0.4", default-features = false, features = ["serde"]}
rand = "0.8"
pin-project = "1"

[[bin]]
name = "llm"
32 changes: 32 additions & 0 deletions src/builder.rs
@@ -198,6 +198,8 @@ pub struct LLMBuilder {
resilient_max_delay_ms: Option<u64>,
/// Resilience: jitter toggle
resilient_jitter: Option<bool>,
/// Enable metrics collection (timing, usage) for non-streaming calls
enable_metrics: Option<bool>,
}

impl LLMBuilder {
@@ -476,6 +478,31 @@ impl LLMBuilder {
self
}

/// Enables metrics collection for non-streaming chat calls.
///
/// When enabled, `ChatResponse::metrics()` will return timing and usage
/// information. For streaming calls, use the `Tracked::new()` wrapper instead.
///
/// # Example
///
/// ```rust,ignore
/// let llm = LLMBuilder::new()
/// .backend(LLMBackend::OpenAI)
/// .api_key("...")
/// .enable_metrics(true)
/// .build()?;
///
/// let response = llm.chat("Hello").await?;
/// if let Some(metrics) = response.metrics() {
/// println!("Duration: {:?}", metrics.duration);
/// println!("Tokens/sec: {:?}", metrics.tokens_per_second());
/// }
/// ```
pub fn enable_metrics(mut self, enable: bool) -> Self {
self.enable_metrics = Some(enable);
self
}

#[deprecated(note = "Renamed to `xai_search_mode`.")]
pub fn search_mode(self, mode: impl Into<String>) -> Self {
self.xai_search_mode(mode)
@@ -1112,6 +1139,11 @@ impl LLMBuilder {
final_provider = Box::new(crate::resilient_llm::ResilientLLM::new(final_provider, cfg));
}

// Wrap with metrics collection if enabled
if self.enable_metrics.unwrap_or(false) {
final_provider = Box::new(crate::metrics::MetricsProvider::new(final_provider));
}

// Wrap with memory capabilities if memory is configured
if let Some(memory) = self.memory {
let memory_arc = Arc::new(RwLock::new(memory));
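The actual timing logic lives in `crate::metrics::MetricsProvider` (`src/metrics.rs`), which is not shown in this diff. Below is a minimal, hypothetical sketch of the decorator pattern being wired up here — a wrapper that times the inner provider's call and hands back the elapsed `Duration` — using a simplified synchronous trait rather than the crate's real async one; names other than `MetricsProvider::new` are illustrative assumptions.

```rust
use std::time::{Duration, Instant};

// Simplified stand-in for the crate's provider trait (the real one is async
// and returns a boxed ChatResponse); used here only to illustrate the wrapper.
trait Provider {
    fn chat(&self, prompt: &str) -> String;
}

// Decorator that forwards to the wrapped provider and measures wall-clock time.
struct MetricsProvider<P> {
    inner: P,
}

impl<P: Provider> MetricsProvider<P> {
    fn new(inner: P) -> Self {
        Self { inner }
    }

    // Returns the inner response together with the elapsed duration,
    // mirroring what `ChatMetrics::duration` would report.
    fn chat_timed(&self, prompt: &str) -> (String, Duration) {
        let start = Instant::now();
        let response = self.inner.chat(prompt);
        (response, start.elapsed())
    }
}
```

The real wrapper presumably returns a `ChatResponse` whose `metrics()` method exposes the recorded `ChatMetrics` rather than a bare tuple.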
73 changes: 73 additions & 0 deletions src/chat/mod.rs
@@ -1,6 +1,7 @@
use std::collections::HashMap;
use std::fmt;
use std::pin::Pin;
use std::time::Duration;

use async_trait::async_trait;
use futures::stream::{Stream, StreamExt};
@@ -9,6 +10,9 @@ use serde_json::Value;

use crate::{error::LLMError, ToolCall};

mod tracked;
pub use tracked::{Trackable, Tracked};

/// Usage metadata for a chat response.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Usage {
@@ -127,6 +131,54 @@ pub struct PromptTokensDetails {
pub audio_tokens: Option<u32>,
}

/// Comprehensive metrics for a chat request including timing and token usage.
///
/// This struct is returned by `ChatResponse::metrics()` when metrics collection
/// is enabled, or by `Tracked::finalize()` for streaming responses.
///
/// # Example
///
/// ```rust,ignore
/// // Non-streaming with metrics enabled
/// let response = llm.chat("Hello").await?;
/// if let Some(metrics) = response.metrics() {
/// println!("Duration: {:?}", metrics.duration);
/// println!("Tokens: {:?}", metrics.usage);
/// println!("Tokens/sec: {:?}", metrics.tokens_per_second());
/// }
///
/// // Streaming with Tracked wrapper
/// let stream = llm.chat_stream_with_tools(messages, None).await?;
/// let mut tracked = Tracked::new(stream);
/// while let Some(chunk) = tracked.next().await { /* ... */ }
/// let metrics = tracked.finalize();
/// println!("TTFT: {:?}", metrics.time_to_first_token);
/// ```
#[derive(Debug, Clone, Default)]
pub struct ChatMetrics {
/// Token usage (prompt, completion, total)
pub usage: Option<Usage>,
/// Total wall-clock duration of the request
pub duration: Duration,
/// Time to first token (streaming only, None for non-streaming)
pub time_to_first_token: Option<Duration>,
}

impl ChatMetrics {
/// Calculate tokens per second (completion tokens / duration).
///
/// Returns `None` if usage data is unavailable or duration is zero.
pub fn tokens_per_second(&self) -> Option<f64> {
let usage = self.usage.as_ref()?;
let secs = self.duration.as_secs_f64();
if secs > 0.0 {
Some(usage.completion_tokens as f64 / secs)
} else {
None
}
}
}

/// Role of a participant in a chat conversation.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ChatRole {
@@ -356,15 +408,36 @@ impl Serialize for ToolChoice {
}
}

/// Trait for chat response types returned by providers.
///
/// Provides access to the response content, tool calls, usage statistics,
/// and optional metrics when enabled.
pub trait ChatResponse: std::fmt::Debug + std::fmt::Display + Send + Sync {
/// Returns the text content of the response, if any.
fn text(&self) -> Option<String>;

/// Returns tool calls requested by the model, if any.
fn tool_calls(&self) -> Option<Vec<ToolCall>>;

/// Returns the model's thinking/reasoning output, if available.
fn thinking(&self) -> Option<String> {
None
}

/// Returns token usage statistics, if available.
fn usage(&self) -> Option<Usage> {
None
}

/// Returns comprehensive metrics including timing and usage.
///
/// This method returns `Some` only when metrics collection is enabled
/// via `.enable_metrics(true)` on the builder. Otherwise returns `None`.
///
/// For streaming responses, use `Tracked::finalize()` instead.
fn metrics(&self) -> Option<ChatMetrics> {
None
}
}

/// Trait for providers that support chat-style interactions.
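The `Tracked`/`Trackable` types re-exported above live in `src/chat/tracked.rs`, which is not included in this excerpt; the new `pin-project` dependency suggests a pinned stream wrapper. Below is a minimal, hypothetical sketch (names and fields are assumptions, not the PR's actual code) of how such a wrapper could record time-to-first-token while delegating to the inner stream.

```rust
use std::pin::Pin;
use std::task::{Context, Poll};
use std::time::{Duration, Instant};

use futures::stream::Stream;
use pin_project::pin_project;

// Hypothetical stream wrapper; the real `Tracked` type may differ.
#[pin_project]
struct TrackedSketch<S> {
    #[pin]
    inner: S,
    started: Instant,
    first_item_at: Option<Instant>,
}

impl<S> TrackedSketch<S> {
    fn new(inner: S) -> Self {
        Self {
            inner,
            started: Instant::now(),
            first_item_at: None,
        }
    }

    // Time from construction to the first yielded chunk, if one arrived.
    fn time_to_first_token(&self) -> Option<Duration> {
        self.first_item_at.map(|t| t.duration_since(self.started))
    }
}

impl<S: Stream> Stream for TrackedSketch<S> {
    type Item = S::Item;

    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
        let this = self.project();
        let poll = this.inner.poll_next(cx);
        // Record the instant the first chunk becomes ready.
        if this.first_item_at.is_none() && matches!(poll, Poll::Ready(Some(_))) {
            *this.first_item_at = Some(Instant::now());
        }
        poll
    }
}
```

The crate's actual implementation presumably also accumulates `Usage` from the streamed chunks so that `finalize()` can return a complete `ChatMetrics`; that bookkeeping is omitted from this sketch.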