From 5b4079e86185f4566523293be2d1d6c81c75469f Mon Sep 17 00:00:00 2001 From: "Dhanji R. Prasanna" Date: Tue, 27 Jan 2026 11:32:45 +1100 Subject: [PATCH] Add prompt cache statistics tracking to /stats command - Extend Usage struct with cache_creation_tokens and cache_read_tokens fields - Parse Anthropic cache_creation_input_tokens and cache_read_input_tokens - Parse OpenAI prompt_tokens_details.cached_tokens for automatic prefix caching - Add CacheStats struct to Agent for cumulative tracking across API calls - Add "Prompt Cache Statistics" section to /stats output showing: - API call count and cache hit count - Hit rate percentage - Total input tokens and cache read/creation tokens - Cache efficiency (% of input served from cache) - Update all provider implementations and test files --- analysis/memory.md | 26 ++++++- crates/g3-core/src/lib.rs | 35 ++++++++++ crates/g3-core/src/stats.rs | 68 +++++++++++++++++++ .../g3-core/tests/json_parsing_stress_test.rs | 6 ++ .../tests/mock_provider_integration_test.rs | 6 ++ .../tests/streaming_completion_test.rs | 10 +++ crates/g3-core/tests/test_token_counting.rs | 6 ++ crates/g3-providers/src/anthropic.rs | 12 ++++ crates/g3-providers/src/databricks.rs | 2 + crates/g3-providers/src/embedded.rs | 2 + crates/g3-providers/src/lib.rs | 6 ++ crates/g3-providers/src/mock.rs | 14 ++++ crates/g3-providers/src/openai.rs | 23 +++++++ 13 files changed, 214 insertions(+), 2 deletions(-) diff --git a/analysis/memory.md b/analysis/memory.md index a816201..b24b559 100644 --- a/analysis/memory.md +++ b/analysis/memory.md @@ -1,5 +1,5 @@ # Workspace Memory -> Updated: 2026-01-20T10:16:13Z | Size: 18.3k chars +> Updated: 2026-01-27T00:12:18Z | Size: 19.5k chars ### Remember Tool Wiring - `crates/g3-core/src/tools/memory.rs` [0..5000] - `execute_remember()`, `get_memory_path()`, `merge_memory()` @@ -324,4 +324,26 @@ Centralized logic for determining how to display tool execution results. - `is_compact_tool()` [147..162] - checks if tool uses one-line summaries (read_file, write_file, str_replace, etc.) - `is_self_handled_tool()` [164..167] - checks if tool handles own output (todo_read, todo_write) - `format_compact_tool_summary()` [169..185] - dispatches to format_*_summary() based on tool name - - `parse_diff_stats()` [187..210] - parses "+N insertions | -M deletions" from str_replace result \ No newline at end of file + - `parse_diff_stats()` [187..210] - parses "+N insertions | -M deletions" from str_replace result + +### Prompt Cache Statistics Tracking +Tracks prompt/prefix caching efficacy across Anthropic and OpenAI providers. 
+ +- `crates/g3-providers/src/lib.rs` + - `Usage` [195..210] - added `cache_creation_tokens` and `cache_read_tokens` fields with `#[serde(default)]` + +- `crates/g3-providers/src/anthropic.rs` + - `AnthropicUsage` [944..956] - parses `cache_creation_input_tokens` and `cache_read_input_tokens` + +- `crates/g3-providers/src/openai.rs` + - `OpenAIUsage` [494..510] - parses `prompt_tokens_details.cached_tokens` + - `OpenAIPromptTokensDetails` [504..510] - nested struct for prompt token details + +- `crates/g3-core/src/lib.rs` + - `CacheStats` [75..90] - cumulative cache statistics struct with `total_cache_creation_tokens`, `total_cache_read_tokens`, `total_input_tokens`, `cache_hit_calls`, `total_calls` + - `Agent.cache_stats` [106] - field tracking cumulative cache stats + - Cache stats updated in `stream_completion_with_tools()` [2140..2150] when usage data received + +- `crates/g3-core/src/stats.rs` + - `AgentStatsSnapshot.cache_stats` [20] - reference to cache stats for formatting + - `format_cache_stats()` [189..230] - formats cache statistics section with hit rate and efficiency metrics \ No newline at end of file diff --git a/crates/g3-core/src/lib.rs b/crates/g3-core/src/lib.rs index 78d7e54..616a015 100644 --- a/crates/g3-core/src/lib.rs +++ b/crates/g3-core/src/lib.rs @@ -74,6 +74,22 @@ pub struct ToolCall { pub args: serde_json::Value, // Should be a JSON object with tool-specific arguments } +/// Cumulative cache statistics for prompt caching efficacy tracking. +/// Tracks both Anthropic-style (cache_creation + cache_read) and OpenAI-style (cached_tokens) caching. +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +pub struct CacheStats { + /// Total tokens written to cache across all API calls + pub total_cache_creation_tokens: u64, + /// Total tokens read from cache across all API calls + pub total_cache_read_tokens: u64, + /// Total input tokens (for calculating cache hit rate) + pub total_input_tokens: u64, + /// Number of API calls that had cache hits + pub cache_hit_calls: u32, + /// Total number of API calls + pub total_calls: u32, +} + // Re-export WebDriverSession from its own module pub use webdriver_session::WebDriverSession; @@ -103,6 +119,8 @@ pub struct Agent { auto_compact: bool, // whether to auto-compact at 90% before tool calls compaction_events: Vec, // chars saved per compaction event first_token_times: Vec, // time to first token for each completion + /// Cumulative cache statistics across all API calls + cache_stats: CacheStats, config: Config, session_id: Option, tool_call_metrics: Vec<(String, Duration, bool)>, // (tool_name, duration, success) @@ -211,6 +229,7 @@ impl Agent { thinning_events: Vec::new(), compaction_events: Vec::new(), first_token_times: Vec::new(), + cache_stats: CacheStats::default(), config, session_id: None, tool_call_metrics: Vec::new(), @@ -272,6 +291,7 @@ impl Agent { thinning_events: Vec::new(), compaction_events: Vec::new(), first_token_times: Vec::new(), + cache_stats: CacheStats::default(), config, session_id: None, tool_call_metrics: Vec::new(), @@ -387,6 +407,7 @@ impl Agent { thinning_events: Vec::new(), compaction_events: Vec::new(), first_token_times: Vec::new(), + cache_stats: CacheStats::default(), config, session_id: None, tool_call_metrics: Vec::new(), @@ -986,6 +1007,8 @@ impl Agent { prompt_tokens: 100, // Estimate completion_tokens: response_content.len() as u32 / 4, // Rough estimate total_tokens: 100 + (response_content.len() as u32 / 4), + cache_creation_tokens: 0, + cache_read_tokens: 0, }; // Update context 
window with estimated token usage @@ -1408,6 +1431,7 @@ impl Agent { first_token_times: &self.first_token_times, tool_call_metrics: &self.tool_call_metrics, provider_info: self.get_provider_info().ok(), + cache_stats: &self.cache_stats, }; snapshot.format() @@ -2111,6 +2135,17 @@ Skip if nothing new. Be brief."#; if let Some(ref usage) = chunk.usage { iter.accumulated_usage = Some(usage.clone()); state.turn_accumulated_usage = Some(usage.clone()); + + // Update cumulative cache statistics + self.cache_stats.total_calls += 1; + self.cache_stats.total_input_tokens += usage.prompt_tokens as u64; + self.cache_stats.total_cache_creation_tokens += + usage.cache_creation_tokens as u64; + self.cache_stats.total_cache_read_tokens += + usage.cache_read_tokens as u64; + if usage.cache_read_tokens > 0 { + self.cache_stats.cache_hit_calls += 1; + } debug!( "Received usage data - prompt: {}, completion: {}, total: {}", usage.prompt_tokens, usage.completion_tokens, usage.total_tokens diff --git a/crates/g3-core/src/stats.rs b/crates/g3-core/src/stats.rs index c78d384..2cef307 100644 --- a/crates/g3-core/src/stats.rs +++ b/crates/g3-core/src/stats.rs @@ -7,6 +7,7 @@ use g3_providers::MessageRole; use std::time::Duration; use crate::context_window::ContextWindow; +use crate::CacheStats; /// Data required to format agent statistics. /// This struct captures a snapshot of agent state for formatting. @@ -17,6 +18,7 @@ pub struct AgentStatsSnapshot<'a> { pub first_token_times: &'a [Duration], pub tool_call_metrics: &'a [(String, Duration, bool)], pub provider_info: Option<(String, String)>, + pub cache_stats: &'a CacheStats, } impl<'a> AgentStatsSnapshot<'a> { @@ -33,6 +35,7 @@ impl<'a> AgentStatsSnapshot<'a> { self.format_performance_metrics(&mut stats); self.format_conversation_history(&mut stats); self.format_tool_call_metrics(&mut stats); + self.format_cache_stats(&mut stats); self.format_provider_info(&mut stats); stats.push_str(&"=".repeat(60)); @@ -184,6 +187,53 @@ impl<'a> AgentStatsSnapshot<'a> { stats.push('\n'); } + fn format_cache_stats(&self, stats: &mut String) { + stats.push_str("💾 Prompt Cache Statistics:\n"); + stats.push_str(&format!( + " • API Calls: {:>10}\n", + self.cache_stats.total_calls + )); + stats.push_str(&format!( + " • Cache Hits: {:>10}\n", + self.cache_stats.cache_hit_calls + )); + + // Calculate hit rate + let hit_rate = if self.cache_stats.total_calls > 0 { + (self.cache_stats.cache_hit_calls as f64 / self.cache_stats.total_calls as f64) * 100.0 + } else { + 0.0 + }; + stats.push_str(&format!(" • Hit Rate: {:>9.1}%\n", hit_rate)); + + stats.push_str(&format!( + " • Total Input Tokens:{:>10}\n", + self.cache_stats.total_input_tokens + )); + stats.push_str(&format!( + " • Cache Created: {:>10} tokens\n", + self.cache_stats.total_cache_creation_tokens + )); + stats.push_str(&format!( + " • Cache Read: {:>10} tokens\n", + self.cache_stats.total_cache_read_tokens + )); + + // Calculate cache read percentage of total input + let cache_read_pct = if self.cache_stats.total_input_tokens > 0 { + (self.cache_stats.total_cache_read_tokens as f64 + / self.cache_stats.total_input_tokens as f64) + * 100.0 + } else { + 0.0 + }; + stats.push_str(&format!( + " • Cache Efficiency: {:>9.1}% of input from cache\n", + cache_read_pct + )); + stats.push('\n'); + } + fn format_provider_info(&self, stats: &mut String) { stats.push_str("🔌 Provider:\n"); if let Some((provider, model)) = &self.provider_info { @@ -201,6 +251,7 @@ mod tests { #[test] fn test_format_stats_empty() { let context_window = 
ContextWindow::new(100000); + let cache_stats = CacheStats::default(); let snapshot = AgentStatsSnapshot { context_window: &context_window, thinning_events: &[], @@ -208,6 +259,7 @@ mod tests { first_token_times: &[], tool_call_metrics: &[], provider_info: None, + cache_stats: &cache_stats, }; let stats = snapshot.format(); @@ -215,6 +267,7 @@ mod tests { assert!(stats.contains("Used Tokens")); assert!(stats.contains("Thinning Events")); assert!(stats.contains("Tool Call Metrics")); + assert!(stats.contains("Prompt Cache Statistics")); } #[test] @@ -222,6 +275,13 @@ mod tests { let context_window = ContextWindow::new(100000); let thinning_events = vec![1000, 2000, 1500]; let compaction_events = vec![5000]; + let cache_stats = CacheStats { + total_calls: 5, + cache_hit_calls: 3, + total_input_tokens: 10000, + total_cache_creation_tokens: 2000, + total_cache_read_tokens: 6000, + }; let first_token_times = vec![ Duration::from_millis(100), Duration::from_millis(150), @@ -240,6 +300,7 @@ mod tests { first_token_times: &first_token_times, tool_call_metrics: &tool_call_metrics, provider_info: Some(("anthropic".to_string(), "claude-3".to_string())), + cache_stats: &cache_stats, }; let stats = snapshot.format(); @@ -259,5 +320,12 @@ mod tests { // Check provider info assert!(stats.contains("Provider: anthropic")); assert!(stats.contains("Model: claude-3")); + + // Check cache stats + assert!(stats.contains("Prompt Cache Statistics")); + assert!(stats.contains("API Calls: 5")); + assert!(stats.contains("Cache Hits: 3")); + assert!(stats.contains("Hit Rate:") && stats.contains("60.0%")); + assert!(stats.contains("Cache Efficiency:")); } } diff --git a/crates/g3-core/tests/json_parsing_stress_test.rs b/crates/g3-core/tests/json_parsing_stress_test.rs index 939db4d..4e048b3 100644 --- a/crates/g3-core/tests/json_parsing_stress_test.rs +++ b/crates/g3-core/tests/json_parsing_stress_test.rs @@ -57,6 +57,8 @@ fn finished_chunk() -> CompletionChunk { prompt_tokens: 100, completion_tokens: 50, total_tokens: 150, + cache_creation_tokens: 0, + cache_read_tokens: 0, }), } } @@ -697,6 +699,8 @@ async fn test_agent_json_fallback_executes() { prompt_tokens: 100, completion_tokens: 50, total_tokens: 150, + cache_creation_tokens: 0, + cache_read_tokens: 0, }, )) .with_default_response(MockResponse::text("Done.")); @@ -800,6 +804,8 @@ async fn test_tool_result_with_json_not_parsed() { prompt_tokens: 100, completion_tokens: 50, total_tokens: 150, + cache_creation_tokens: 0, + cache_read_tokens: 0, }, )) // Second response: LLM acknowledges the file content diff --git a/crates/g3-core/tests/mock_provider_integration_test.rs b/crates/g3-core/tests/mock_provider_integration_test.rs index 97b3336..cd9a8b7 100644 --- a/crates/g3-core/tests/mock_provider_integration_test.rs +++ b/crates/g3-core/tests/mock_provider_integration_test.rs @@ -674,6 +674,8 @@ async fn test_multiple_tools_in_single_response_all_executed() { prompt_tokens: 100, completion_tokens: 100, total_tokens: 200, + cache_creation_tokens: 0, + cache_read_tokens: 0, }, )) .with_default_response(MockResponse::text("Both commands executed.")); @@ -774,6 +776,8 @@ async fn test_llm_repeats_text_before_each_tool_call() { prompt_tokens: 100, completion_tokens: 50, total_tokens: 150, + cache_creation_tokens: 0, + cache_read_tokens: 0, }, )) // Second response: SAME preamble + tool call 2 @@ -789,6 +793,8 @@ async fn test_llm_repeats_text_before_each_tool_call() { prompt_tokens: 150, completion_tokens: 50, total_tokens: 200, + cache_creation_tokens: 0, + 
cache_read_tokens: 0, }, )) // Third response: final acknowledgment diff --git a/crates/g3-core/tests/streaming_completion_test.rs b/crates/g3-core/tests/streaming_completion_test.rs index 9279478..28037e5 100644 --- a/crates/g3-core/tests/streaming_completion_test.rs +++ b/crates/g3-core/tests/streaming_completion_test.rs @@ -60,6 +60,8 @@ fn default_usage() -> Usage { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0, + cache_creation_tokens: 0, + cache_read_tokens: 0, } } @@ -169,6 +171,8 @@ impl LLMProvider for MockStreamingProvider { prompt_tokens: 100, completion_tokens: 50, total_tokens: 150, + cache_creation_tokens: 0, + cache_read_tokens: 0, }), stop_reason: Some("end_turn".to_string()), tool_call_streaming: None, @@ -201,6 +205,8 @@ impl LLMProvider for MockStreamingProvider { prompt_tokens: 50, completion_tokens: 10, total_tokens: 60, + cache_creation_tokens: 0, + cache_read_tokens: 0, }), stop_reason: Some("end_turn".to_string()), tool_call_streaming: None, @@ -407,6 +413,8 @@ async fn test_finished_signal_terminates_stream() { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0, + cache_creation_tokens: 0, + cache_read_tokens: 0, }, model: "simple".to_string(), }) @@ -439,6 +447,8 @@ async fn test_finished_signal_terminates_stream() { prompt_tokens: 10, completion_tokens: 10, total_tokens: 20, + cache_creation_tokens: 0, + cache_read_tokens: 0, }), stop_reason: Some("end_turn".to_string()), tool_call_streaming: None, diff --git a/crates/g3-core/tests/test_token_counting.rs b/crates/g3-core/tests/test_token_counting.rs index 654d41e..062283d 100644 --- a/crates/g3-core/tests/test_token_counting.rs +++ b/crates/g3-core/tests/test_token_counting.rs @@ -38,6 +38,8 @@ fn test_update_usage_only_affects_cumulative() { prompt_tokens: 100, completion_tokens: 50, total_tokens: 150, + cache_creation_tokens: 0, + cache_read_tokens: 0, }; window.update_usage_from_response(&usage); @@ -52,6 +54,8 @@ fn test_update_usage_only_affects_cumulative() { prompt_tokens: 200, completion_tokens: 75, total_tokens: 275, + cache_creation_tokens: 0, + cache_read_tokens: 0, }; window.update_usage_from_response(&usage2); @@ -156,6 +160,8 @@ fn test_cumulative_vs_used_independence() { prompt_tokens: 500, completion_tokens: 200, total_tokens: 700, + cache_creation_tokens: 0, + cache_read_tokens: 0, }; window.update_usage_from_response(&usage); diff --git a/crates/g3-providers/src/anthropic.rs b/crates/g3-providers/src/anthropic.rs index 9835ed1..780cbfe 100644 --- a/crates/g3-providers/src/anthropic.rs +++ b/crates/g3-providers/src/anthropic.rs @@ -464,6 +464,10 @@ impl AnthropicProvider { completion_tokens: usage.output_tokens, total_tokens: usage.input_tokens + usage.output_tokens, + cache_creation_tokens: usage + .cache_creation_input_tokens, + cache_read_tokens: usage + .cache_read_input_tokens, }); debug!( "Captured usage from message_start: {:?}", @@ -739,6 +743,8 @@ impl LLMProvider for AnthropicProvider { completion_tokens: anthropic_response.usage.output_tokens, total_tokens: anthropic_response.usage.input_tokens + anthropic_response.usage.output_tokens, + cache_creation_tokens: anthropic_response.usage.cache_creation_input_tokens, + cache_read_tokens: anthropic_response.usage.cache_read_input_tokens, }; debug!( @@ -945,6 +951,12 @@ struct AnthropicResponse { struct AnthropicUsage { input_tokens: u32, output_tokens: u32, + /// Tokens written to cache when creating a new cache entry + #[serde(default)] + cache_creation_input_tokens: u32, + /// Tokens retrieved from cache (cache hit) + 
#[serde(default)] + cache_read_input_tokens: u32, } // Streaming response structures diff --git a/crates/g3-providers/src/databricks.rs b/crates/g3-providers/src/databricks.rs index c309a78..161a5e8 100644 --- a/crates/g3-providers/src/databricks.rs +++ b/crates/g3-providers/src/databricks.rs @@ -763,6 +763,8 @@ impl LLMProvider for DatabricksProvider { prompt_tokens: databricks_response.usage.prompt_tokens, completion_tokens: databricks_response.usage.completion_tokens, total_tokens: databricks_response.usage.total_tokens, + cache_creation_tokens: 0, // Databricks doesn't support prompt caching + cache_read_tokens: 0, }; debug!( diff --git a/crates/g3-providers/src/embedded.rs b/crates/g3-providers/src/embedded.rs index 9ac9882..9a196d2 100644 --- a/crates/g3-providers/src/embedded.rs +++ b/crates/g3-providers/src/embedded.rs @@ -531,6 +531,8 @@ impl LLMProvider for EmbeddedProvider { prompt_tokens, completion_tokens, total_tokens: prompt_tokens + completion_tokens, + cache_creation_tokens: 0, // Embedded models don't support prompt caching + cache_read_tokens: 0, }, model: self.model_name.clone(), }) diff --git a/crates/g3-providers/src/lib.rs b/crates/g3-providers/src/lib.rs index f34860b..ab5c38a 100644 --- a/crates/g3-providers/src/lib.rs +++ b/crates/g3-providers/src/lib.rs @@ -196,6 +196,12 @@ pub struct Usage { pub prompt_tokens: u32, pub completion_tokens: u32, pub total_tokens: u32, + /// Tokens written to cache (Anthropic: cache_creation_input_tokens) + #[serde(default)] + pub cache_creation_tokens: u32, + /// Tokens read from cache (Anthropic: cache_read_input_tokens, OpenAI: cached_tokens) + #[serde(default)] + pub cache_read_tokens: u32, } pub type CompletionStream = tokio_stream::wrappers::ReceiverStream>; diff --git a/crates/g3-providers/src/mock.rs b/crates/g3-providers/src/mock.rs index 9007f84..bfccb83 100644 --- a/crates/g3-providers/src/mock.rs +++ b/crates/g3-providers/src/mock.rs @@ -120,6 +120,8 @@ impl MockResponse { prompt_tokens: 100, completion_tokens: content.len() as u32 / 4, total_tokens: 100 + content.len() as u32 / 4, + cache_creation_tokens: 0, + cache_read_tokens: 0, }, } } @@ -139,6 +141,8 @@ impl MockResponse { prompt_tokens: 100, completion_tokens: total_content.len() as u32 / 4, total_tokens: 100 + total_content.len() as u32 / 4, + cache_creation_tokens: 0, + cache_read_tokens: 0, }, } } @@ -155,6 +159,8 @@ impl MockResponse { prompt_tokens: 100, completion_tokens: 50, total_tokens: 150, + cache_creation_tokens: 0, + cache_read_tokens: 0, }, } } @@ -172,6 +178,8 @@ impl MockResponse { prompt_tokens: 100, completion_tokens: 50 + text.len() as u32 / 4, total_tokens: 150 + text.len() as u32 / 4, + cache_creation_tokens: 0, + cache_read_tokens: 0, }, } } @@ -192,6 +200,8 @@ impl MockResponse { prompt_tokens: 100, completion_tokens: 100, total_tokens: 200, + cache_creation_tokens: 0, + cache_read_tokens: 0, }, } } @@ -215,6 +225,8 @@ impl MockResponse { prompt_tokens: 100, completion_tokens: full_content.len() as u32 / 4, total_tokens: 100 + full_content.len() as u32 / 4, + cache_creation_tokens: 0, + cache_read_tokens: 0, }, } } @@ -230,6 +242,8 @@ impl MockResponse { prompt_tokens: 100, completion_tokens: content.len() as u32 / 4, total_tokens: 100 + content.len() as u32 / 4, + cache_creation_tokens: 0, + cache_read_tokens: 0, }, } } diff --git a/crates/g3-providers/src/openai.rs b/crates/g3-providers/src/openai.rs index a60b333..19d6610 100644 --- a/crates/g3-providers/src/openai.rs +++ b/crates/g3-providers/src/openai.rs @@ -220,6 +220,12 @@ impl 
OpenAIProvider {
             prompt_tokens: usage.prompt_tokens,
             completion_tokens: usage.completion_tokens,
             total_tokens: usage.total_tokens,
+            cache_creation_tokens: 0, // OpenAI doesn't report cache creation
+            cache_read_tokens: usage
+                .prompt_tokens_details
+                .as_ref()
+                .map(|d| d.cached_tokens)
+                .unwrap_or(0),
         });
     }
 }
@@ -306,6 +312,13 @@ impl LLMProvider for OpenAIProvider {
             prompt_tokens: openai_response.usage.prompt_tokens,
             completion_tokens: openai_response.usage.completion_tokens,
             total_tokens: openai_response.usage.total_tokens,
+            cache_creation_tokens: 0, // OpenAI doesn't report cache creation
+            cache_read_tokens: openai_response
+                .usage
+                .prompt_tokens_details
+                .as_ref()
+                .map(|d| d.cached_tokens)
+                .unwrap_or(0),
         };
 
         debug!(
@@ -495,6 +508,16 @@ struct OpenAIUsage {
     prompt_tokens: u32,
     completion_tokens: u32,
     total_tokens: u32,
+    /// Detailed breakdown of prompt tokens including cache info
+    #[serde(default)]
+    prompt_tokens_details: Option<OpenAIPromptTokensDetails>,
+}
+
+#[derive(Debug, Deserialize, Default)]
+struct OpenAIPromptTokensDetails {
+    /// Tokens retrieved from cache (cache hit)
+    #[serde(default)]
+    cached_tokens: u32,
 }
 
 // Streaming response structures
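
Illustration (not part of the patch): a self-contained sketch of how the new CacheStats counters accumulate per API call and how the hit-rate and cache-efficiency percentages shown by /stats are derived. It mirrors the accumulation added in stream_completion_with_tools() and the arithmetic in format_cache_stats(); the struct is re-declared locally and the token counts are invented for illustration.

// Standalone sketch -- mirrors the patch's logic; token counts are made up.
#[derive(Debug, Default)]
struct CacheStats {
    total_cache_creation_tokens: u64,
    total_cache_read_tokens: u64,
    total_input_tokens: u64,
    cache_hit_calls: u32,
    total_calls: u32,
}

// Same update rule as stream_completion_with_tools(): every usage payload bumps
// the totals, and any nonzero cache_read count marks the call as a cache hit.
fn record(stats: &mut CacheStats, prompt_tokens: u32, cache_creation: u32, cache_read: u32) {
    stats.total_calls += 1;
    stats.total_input_tokens += prompt_tokens as u64;
    stats.total_cache_creation_tokens += cache_creation as u64;
    stats.total_cache_read_tokens += cache_read as u64;
    if cache_read > 0 {
        stats.cache_hit_calls += 1;
    }
}

fn main() {
    let mut s = CacheStats::default();
    record(&mut s, 4000, 3500, 0); // first call writes the cache, reads nothing
    record(&mut s, 4200, 0, 3500); // later calls serve the shared prefix from cache
    record(&mut s, 4500, 0, 3500);

    // Same percentages format_cache_stats() reports (the patch additionally
    // guards against division by zero when no calls have been made).
    let hit_rate = s.cache_hit_calls as f64 / s.total_calls as f64 * 100.0; // 66.7%
    let efficiency =
        s.total_cache_read_tokens as f64 / s.total_input_tokens as f64 * 100.0; // ~55.1%
    println!("Hit Rate: {:.1}%  Cache Efficiency: {:.1}%", hit_rate, efficiency);
}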