Add prompt cache statistics tracking to /stats command

- Extend Usage struct with cache_creation_tokens and cache_read_tokens fields
- Parse Anthropic cache_creation_input_tokens and cache_read_input_tokens
- Parse OpenAI prompt_tokens_details.cached_tokens for automatic prefix caching (a mapping sketch follows the commit metadata below)
- Add CacheStats struct to Agent for cumulative tracking across API calls
- Add "Prompt Cache Statistics" section to /stats output showing:
  - API call count and cache hit count
  - Hit rate percentage
  - Total input tokens and cache read/creation tokens
  - Cache efficiency (% of input served from cache)
- Update all provider implementations and test files
Dhanji R. Prasanna
2026-01-27 11:32:45 +11:00
parent 96899230a4
commit 5b4079e861
13 changed files with 214 additions and 2 deletions
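
For context, here is a minimal sketch of how the provider-specific usage payloads named in the commit message might map into the extended Usage struct. The wire structs and the from_anthropic helper are illustrative only; the actual parsing lives in the provider modules, which this diff view does not show.

use serde::Deserialize;

// Illustrative wire structs. Field names follow the provider APIs cited in
// the commit message; everything else here is an assumption.
#[derive(Deserialize)]
struct AnthropicUsage {
    input_tokens: u32,
    output_tokens: u32,
    #[serde(default)]
    cache_creation_input_tokens: u32,
    #[serde(default)]
    cache_read_input_tokens: u32,
}

#[derive(Deserialize)]
struct OpenAiPromptTokensDetails {
    #[serde(default)]
    cached_tokens: u32,
}

fn from_anthropic(u: AnthropicUsage) -> Usage {
    Usage {
        // Whether input_tokens already includes cached tokens varies by
        // provider and API version; treat this as a shape sketch, not
        // exact accounting.
        prompt_tokens: u.input_tokens,
        completion_tokens: u.output_tokens,
        total_tokens: u.input_tokens + u.output_tokens,
        cache_creation_tokens: u.cache_creation_input_tokens,
        cache_read_tokens: u.cache_read_input_tokens,
    }
}

On the OpenAI path, automatic prefix caching reports only reads, so cache_read_tokens would come from prompt_tokens_details.cached_tokens while cache_creation_tokens stays 0.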

View File

@@ -74,6 +74,22 @@ pub struct ToolCall {
pub args: serde_json::Value, // Should be a JSON object with tool-specific arguments
}
/// Cumulative cache statistics for prompt caching efficacy tracking.
/// Tracks both Anthropic-style (cache_creation + cache_read) and OpenAI-style (cached_tokens) caching.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct CacheStats {
/// Total tokens written to cache across all API calls
pub total_cache_creation_tokens: u64,
/// Total tokens read from cache across all API calls
pub total_cache_read_tokens: u64,
/// Total input tokens (for calculating cache hit rate)
pub total_input_tokens: u64,
/// Number of API calls that had cache hits
pub cache_hit_calls: u32,
/// Total number of API calls
pub total_calls: u32,
}
// Re-export WebDriverSession from its own module
pub use webdriver_session::WebDriverSession;
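
The accumulation itself happens inline in the agent's streaming loop (see the hunk at -2111 further down). A purely illustrative equivalent, written as a method on CacheStats, would look like this:

impl CacheStats {
    /// Fold one API call's usage into the cumulative counters.
    /// Mirrors the inline accumulation in the agent's streaming loop;
    /// the commit itself does not define this method.
    pub fn record(&mut self, usage: &Usage) {
        self.total_calls += 1;
        self.total_input_tokens += usage.prompt_tokens as u64;
        self.total_cache_creation_tokens += usage.cache_creation_tokens as u64;
        self.total_cache_read_tokens += usage.cache_read_tokens as u64;
        // A call counts as a hit only when prompt tokens were actually
        // served from cache.
        if usage.cache_read_tokens > 0 {
            self.cache_hit_calls += 1;
        }
    }
}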
@@ -103,6 +119,8 @@ pub struct Agent<W: UiWriter> {
auto_compact: bool, // whether to auto-compact at 90% before tool calls
compaction_events: Vec<usize>, // chars saved per compaction event
first_token_times: Vec<Duration>, // time to first token for each completion
/// Cumulative cache statistics across all API calls
cache_stats: CacheStats,
config: Config,
session_id: Option<String>,
tool_call_metrics: Vec<(String, Duration, bool)>, // (tool_name, duration, success)
@@ -211,6 +229,7 @@ impl<W: UiWriter> Agent<W> {
thinning_events: Vec::new(),
compaction_events: Vec::new(),
first_token_times: Vec::new(),
cache_stats: CacheStats::default(),
config,
session_id: None,
tool_call_metrics: Vec::new(),
@@ -272,6 +291,7 @@ impl<W: UiWriter> Agent<W> {
thinning_events: Vec::new(),
compaction_events: Vec::new(),
first_token_times: Vec::new(),
cache_stats: CacheStats::default(),
config,
session_id: None,
tool_call_metrics: Vec::new(),
@@ -387,6 +407,7 @@ impl<W: UiWriter> Agent<W> {
thinning_events: Vec::new(),
compaction_events: Vec::new(),
first_token_times: Vec::new(),
cache_stats: CacheStats::default(),
config,
session_id: None,
tool_call_metrics: Vec::new(),
@@ -986,6 +1007,8 @@ impl<W: UiWriter> Agent<W> {
prompt_tokens: 100, // Estimate
completion_tokens: response_content.len() as u32 / 4, // Rough estimate
total_tokens: 100 + (response_content.len() as u32 / 4),
cache_creation_tokens: 0,
cache_read_tokens: 0,
};
// Update context window with estimated token usage
@@ -1408,6 +1431,7 @@ impl<W: UiWriter> Agent<W> {
first_token_times: &self.first_token_times,
tool_call_metrics: &self.tool_call_metrics,
provider_info: self.get_provider_info().ok(),
cache_stats: &self.cache_stats,
};
snapshot.format()
@@ -2111,6 +2135,17 @@ Skip if nothing new. Be brief."#;
if let Some(ref usage) = chunk.usage {
iter.accumulated_usage = Some(usage.clone());
state.turn_accumulated_usage = Some(usage.clone());
// Update cumulative cache statistics
self.cache_stats.total_calls += 1;
self.cache_stats.total_input_tokens += usage.prompt_tokens as u64;
self.cache_stats.total_cache_creation_tokens +=
usage.cache_creation_tokens as u64;
self.cache_stats.total_cache_read_tokens +=
usage.cache_read_tokens as u64;
if usage.cache_read_tokens > 0 {
self.cache_stats.cache_hit_calls += 1;
}
debug!(
"Received usage data - prompt: {}, completion: {}, total: {}",
usage.prompt_tokens, usage.completion_tokens, usage.total_tokens

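To make the counting rule concrete — a call is a hit only when cache_read_tokens > 0, so a cache-warming call does not count — here is a hypothetical two-call sequence using the record() sketch from earlier:

let mut stats = CacheStats::default();
// First call warms the cache: a write, not a hit.
stats.record(&Usage {
    prompt_tokens: 1000,
    completion_tokens: 50,
    total_tokens: 1050,
    cache_creation_tokens: 900,
    cache_read_tokens: 0,
});
// Second call reads the cached prefix back.
stats.record(&Usage {
    prompt_tokens: 1000,
    completion_tokens: 60,
    total_tokens: 1060,
    cache_creation_tokens: 0,
    cache_read_tokens: 900,
});
assert_eq!(stats.total_calls, 2);
assert_eq!(stats.cache_hit_calls, 1); // reported as a 50.0% hit rate by /stats
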
View File

@@ -7,6 +7,7 @@ use g3_providers::MessageRole;
use std::time::Duration;
use crate::context_window::ContextWindow;
use crate::CacheStats;
/// Data required to format agent statistics.
/// This struct captures a snapshot of agent state for formatting.
@@ -17,6 +18,7 @@ pub struct AgentStatsSnapshot<'a> {
pub first_token_times: &'a [Duration],
pub tool_call_metrics: &'a [(String, Duration, bool)],
pub provider_info: Option<(String, String)>,
pub cache_stats: &'a CacheStats,
}
impl<'a> AgentStatsSnapshot<'a> {
@@ -33,6 +35,7 @@ impl<'a> AgentStatsSnapshot<'a> {
self.format_performance_metrics(&mut stats);
self.format_conversation_history(&mut stats);
self.format_tool_call_metrics(&mut stats);
self.format_cache_stats(&mut stats);
self.format_provider_info(&mut stats);
stats.push_str(&"=".repeat(60));
@@ -184,6 +187,53 @@ impl<'a> AgentStatsSnapshot<'a> {
stats.push('\n');
}
fn format_cache_stats(&self, stats: &mut String) {
stats.push_str("💾 Prompt Cache Statistics:\n");
stats.push_str(&format!(
" • API Calls: {:>10}\n",
self.cache_stats.total_calls
));
stats.push_str(&format!(
" • Cache Hits: {:>10}\n",
self.cache_stats.cache_hit_calls
));
// Calculate hit rate
let hit_rate = if self.cache_stats.total_calls > 0 {
(self.cache_stats.cache_hit_calls as f64 / self.cache_stats.total_calls as f64) * 100.0
} else {
0.0
};
stats.push_str(&format!(" • Hit Rate: {:>9.1}%\n", hit_rate));
stats.push_str(&format!(
" • Total Input Tokens:{:>10}\n",
self.cache_stats.total_input_tokens
));
stats.push_str(&format!(
" • Cache Created: {:>10} tokens\n",
self.cache_stats.total_cache_creation_tokens
));
stats.push_str(&format!(
" • Cache Read: {:>10} tokens\n",
self.cache_stats.total_cache_read_tokens
));
// Calculate cache read percentage of total input
let cache_read_pct = if self.cache_stats.total_input_tokens > 0 {
(self.cache_stats.total_cache_read_tokens as f64
/ self.cache_stats.total_input_tokens as f64)
* 100.0
} else {
0.0
};
stats.push_str(&format!(
" • Cache Efficiency: {:>9.1}% of input from cache\n",
cache_read_pct
));
stats.push('\n');
}
fn format_provider_info(&self, stats: &mut String) {
stats.push_str("🔌 Provider:\n");
if let Some((provider, model)) = &self.provider_info {
@@ -201,6 +251,7 @@ mod tests {
#[test]
fn test_format_stats_empty() {
let context_window = ContextWindow::new(100000);
let cache_stats = CacheStats::default();
let snapshot = AgentStatsSnapshot {
context_window: &context_window,
thinning_events: &[],
@@ -208,6 +259,7 @@ mod tests {
first_token_times: &[],
tool_call_metrics: &[],
provider_info: None,
cache_stats: &cache_stats,
};
let stats = snapshot.format();
@@ -215,6 +267,7 @@ mod tests {
assert!(stats.contains("Used Tokens"));
assert!(stats.contains("Thinning Events"));
assert!(stats.contains("Tool Call Metrics"));
assert!(stats.contains("Prompt Cache Statistics"));
}
#[test]
@@ -222,6 +275,13 @@ mod tests {
let context_window = ContextWindow::new(100000);
let thinning_events = vec![1000, 2000, 1500];
let compaction_events = vec![5000];
let cache_stats = CacheStats {
total_calls: 5,
cache_hit_calls: 3,
total_input_tokens: 10000,
total_cache_creation_tokens: 2000,
total_cache_read_tokens: 6000,
};
let first_token_times = vec![
Duration::from_millis(100),
Duration::from_millis(150),
@@ -240,6 +300,7 @@ mod tests {
first_token_times: &first_token_times,
tool_call_metrics: &tool_call_metrics,
provider_info: Some(("anthropic".to_string(), "claude-3".to_string())),
cache_stats: &cache_stats,
};
let stats = snapshot.format();
@@ -259,5 +320,12 @@ mod tests {
// Check provider info
assert!(stats.contains("Provider: anthropic"));
assert!(stats.contains("Model: claude-3"));
// Check cache stats
assert!(stats.contains("Prompt Cache Statistics"));
assert!(stats.contains("API Calls: 5"));
assert!(stats.contains("Cache Hits: 3"));
assert!(stats.contains("Hit Rate:") && stats.contains("60.0%"));
assert!(stats.contains("Cache Efficiency:"));
}
}
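
Sanity-checking the fixture: 3 hits over 5 calls gives the asserted hit rate of 3 / 5 = 60.0%, and 6,000 of 10,000 input tokens read from cache likewise yields a cache efficiency of 6000 / 10000 = 60.0%.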

View File

@@ -57,6 +57,8 @@ fn finished_chunk() -> CompletionChunk {
prompt_tokens: 100,
completion_tokens: 50,
total_tokens: 150,
cache_creation_tokens: 0,
cache_read_tokens: 0,
}),
}
}
@@ -697,6 +699,8 @@ async fn test_agent_json_fallback_executes() {
prompt_tokens: 100,
completion_tokens: 50,
total_tokens: 150,
cache_creation_tokens: 0,
cache_read_tokens: 0,
},
))
.with_default_response(MockResponse::text("Done."));
@@ -800,6 +804,8 @@ async fn test_tool_result_with_json_not_parsed() {
prompt_tokens: 100,
completion_tokens: 50,
total_tokens: 150,
cache_creation_tokens: 0,
cache_read_tokens: 0,
},
))
// Second response: LLM acknowledges the file content

View File

@@ -674,6 +674,8 @@ async fn test_multiple_tools_in_single_response_all_executed() {
prompt_tokens: 100,
completion_tokens: 100,
total_tokens: 200,
cache_creation_tokens: 0,
cache_read_tokens: 0,
},
))
.with_default_response(MockResponse::text("Both commands executed."));
@@ -774,6 +776,8 @@ async fn test_llm_repeats_text_before_each_tool_call() {
prompt_tokens: 100,
completion_tokens: 50,
total_tokens: 150,
cache_creation_tokens: 0,
cache_read_tokens: 0,
},
))
// Second response: SAME preamble + tool call 2
@@ -789,6 +793,8 @@ async fn test_llm_repeats_text_before_each_tool_call() {
prompt_tokens: 150,
completion_tokens: 50,
total_tokens: 200,
cache_creation_tokens: 0,
cache_read_tokens: 0,
},
))
// Third response: final acknowledgment

View File

@@ -60,6 +60,8 @@ fn default_usage() -> Usage {
prompt_tokens: 0,
completion_tokens: 0,
total_tokens: 0,
cache_creation_tokens: 0,
cache_read_tokens: 0,
}
}
@@ -169,6 +171,8 @@ impl LLMProvider for MockStreamingProvider {
prompt_tokens: 100,
completion_tokens: 50,
total_tokens: 150,
cache_creation_tokens: 0,
cache_read_tokens: 0,
}),
stop_reason: Some("end_turn".to_string()),
tool_call_streaming: None,
@@ -201,6 +205,8 @@ impl LLMProvider for MockStreamingProvider {
prompt_tokens: 50,
completion_tokens: 10,
total_tokens: 60,
cache_creation_tokens: 0,
cache_read_tokens: 0,
}),
stop_reason: Some("end_turn".to_string()),
tool_call_streaming: None,
@@ -407,6 +413,8 @@ async fn test_finished_signal_terminates_stream() {
prompt_tokens: 0,
completion_tokens: 0,
total_tokens: 0,
cache_creation_tokens: 0,
cache_read_tokens: 0,
},
model: "simple".to_string(),
})
@@ -439,6 +447,8 @@ async fn test_finished_signal_terminates_stream() {
prompt_tokens: 10,
completion_tokens: 10,
total_tokens: 20,
cache_creation_tokens: 0,
cache_read_tokens: 0,
}),
stop_reason: Some("end_turn".to_string()),
tool_call_streaming: None,

View File

@@ -38,6 +38,8 @@ fn test_update_usage_only_affects_cumulative() {
prompt_tokens: 100,
completion_tokens: 50,
total_tokens: 150,
cache_creation_tokens: 0,
cache_read_tokens: 0,
};
window.update_usage_from_response(&usage);
@@ -52,6 +54,8 @@ fn test_update_usage_only_affects_cumulative() {
prompt_tokens: 200,
completion_tokens: 75,
total_tokens: 275,
cache_creation_tokens: 0,
cache_read_tokens: 0,
};
window.update_usage_from_response(&usage2);
@@ -156,6 +160,8 @@ fn test_cumulative_vs_used_independence() {
prompt_tokens: 500,
completion_tokens: 200,
total_tokens: 700,
cache_creation_tokens: 0,
cache_read_tokens: 0,
};
window.update_usage_from_response(&usage);
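
Every mock and fixture above gains the same two zeroed fields. A hypothetical test helper (not part of this commit) would confine any future Usage field additions to one place:

// Hypothetical helper for test fixtures; not in the commit.
fn zero_cache_usage(prompt: u32, completion: u32) -> Usage {
    Usage {
        prompt_tokens: prompt,
        completion_tokens: completion,
        total_tokens: prompt + completion,
        cache_creation_tokens: 0,
        cache_read_tokens: 0,
    }
}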