Add prompt cache statistics tracking to /stats command

- Extend Usage struct with cache_creation_tokens and cache_read_tokens fields - Parse Anthropic cache_creation_input_tokens and cache_read_input_tokens - Parse OpenAI prompt_tokens_details.cached_tokens for automatic prefix caching - Add CacheStats struct to Agent for cumulative tracking across API calls - Add "Prompt Cache Statistics" section to /stats output showing: - API call count and cache hit count - Hit rate percentage - Total input tokens and cache read/creation tokens - Cache efficiency (% of input served from cache) - Update all provider implementations and test files
2026-01-27 11:32:45 +11:00
parent 96899230a4
commit 5b4079e861
13 changed files with 214 additions and 2 deletions
--- a/crates/g3-core/src/lib.rs
+++ b/crates/g3-core/src/lib.rs
@@ -74,6 +74,22 @@ pub struct ToolCall {
    pub args: serde_json::Value, // Should be a JSON object with tool-specific arguments
 }

+/// Cumulative cache statistics for prompt caching efficacy tracking.
+/// Tracks both Anthropic-style (cache_creation + cache_read) and OpenAI-style (cached_tokens) caching.
+#[derive(Debug, Clone, Default, Serialize, Deserialize)]
+pub struct CacheStats {
+    /// Total tokens written to cache across all API calls
+    pub total_cache_creation_tokens: u64,
+    /// Total tokens read from cache across all API calls
+    pub total_cache_read_tokens: u64,
+    /// Total input tokens (for calculating cache hit rate)
+    pub total_input_tokens: u64,
+    /// Number of API calls that had cache hits
+    pub cache_hit_calls: u32,
+    /// Total number of API calls
+    pub total_calls: u32,
+}
+
 // Re-export WebDriverSession from its own module
 pub use webdriver_session::WebDriverSession;

@@ -103,6 +119,8 @@ pub struct Agent<W: UiWriter> {
    auto_compact: bool,               // whether to auto-compact at 90% before tool calls
    compaction_events: Vec<usize>,    // chars saved per compaction event
    first_token_times: Vec<Duration>, // time to first token for each completion
+    /// Cumulative cache statistics across all API calls
+    cache_stats: CacheStats,
    config: Config,
    session_id: Option<String>,
    tool_call_metrics: Vec<(String, Duration, bool)>, // (tool_name, duration, success)
@@ -211,6 +229,7 @@ impl<W: UiWriter> Agent<W> {
            thinning_events: Vec::new(),
            compaction_events: Vec::new(),
            first_token_times: Vec::new(),
+            cache_stats: CacheStats::default(),
            config,
            session_id: None,
            tool_call_metrics: Vec::new(),
@@ -272,6 +291,7 @@ impl<W: UiWriter> Agent<W> {
            thinning_events: Vec::new(),
            compaction_events: Vec::new(),
            first_token_times: Vec::new(),
+            cache_stats: CacheStats::default(),
            config,
            session_id: None,
            tool_call_metrics: Vec::new(),
@@ -387,6 +407,7 @@ impl<W: UiWriter> Agent<W> {
            thinning_events: Vec::new(),
            compaction_events: Vec::new(),
            first_token_times: Vec::new(),
+            cache_stats: CacheStats::default(),
            config,
            session_id: None,
            tool_call_metrics: Vec::new(),
@@ -986,6 +1007,8 @@ impl<W: UiWriter> Agent<W> {
            prompt_tokens: 100,                                   // Estimate
            completion_tokens: response_content.len() as u32 / 4, // Rough estimate
            total_tokens: 100 + (response_content.len() as u32 / 4),
+            cache_creation_tokens: 0,
+            cache_read_tokens: 0,
        };

        // Update context window with estimated token usage
@@ -1408,6 +1431,7 @@ impl<W: UiWriter> Agent<W> {
            first_token_times: &self.first_token_times,
            tool_call_metrics: &self.tool_call_metrics,
            provider_info: self.get_provider_info().ok(),
+            cache_stats: &self.cache_stats,
        };

        snapshot.format()
@@ -2111,6 +2135,17 @@ Skip if nothing new. Be brief."#;
                        if let Some(ref usage) = chunk.usage {
                            iter.accumulated_usage = Some(usage.clone());
                            state.turn_accumulated_usage = Some(usage.clone());
+                            
+                            // Update cumulative cache statistics
+                            self.cache_stats.total_calls += 1;
+                            self.cache_stats.total_input_tokens += usage.prompt_tokens as u64;
+                            self.cache_stats.total_cache_creation_tokens +=
+                                usage.cache_creation_tokens as u64;
+                            self.cache_stats.total_cache_read_tokens +=
+                                usage.cache_read_tokens as u64;
+                            if usage.cache_read_tokens > 0 {
+                                self.cache_stats.cache_hit_calls += 1;
+                            }
                            debug!(
                                "Received usage data - prompt: {}, completion: {}, total: {}",
                                usage.prompt_tokens, usage.completion_tokens, usage.total_tokens