From 5b4079e86185f4566523293be2d1d6c81c75469f Mon Sep 17 00:00:00 2001 From: "Dhanji R. Prasanna" Date: Tue, 27 Jan 2026 11:32:45 +1100 Subject: [PATCH] Add prompt cache statistics tracking to /stats command - Extend Usage struct with cache_creation_tokens and cache_read_tokens fields - Parse Anthropic cache_creation_input_tokens and cache_read_input_tokens - Parse OpenAI prompt_tokens_details.cached_tokens for automatic prefix caching - Add CacheStats struct to Agent for cumulative tracking across API calls - Add "Prompt Cache Statistics" section to /stats output showing: - API call count and cache hit count - Hit rate percentage - Total input tokens and cache read/creation tokens - Cache efficiency (% of input served from cache) - Update all provider implementations and test files --- analysis/memory.md | 26 ++++++- crates/g3-core/src/lib.rs | 35 ++++++++++ crates/g3-core/src/stats.rs | 68 +++++++++++++++++++ .../g3-core/tests/json_parsing_stress_test.rs | 6 ++ .../tests/mock_provider_integration_test.rs | 6 ++ .../tests/streaming_completion_test.rs | 10 +++ crates/g3-core/tests/test_token_counting.rs | 6 ++ crates/g3-providers/src/anthropic.rs | 12 ++++ crates/g3-providers/src/databricks.rs | 2 + crates/g3-providers/src/embedded.rs | 2 + crates/g3-providers/src/lib.rs | 6 ++ crates/g3-providers/src/mock.rs | 14 ++++ crates/g3-providers/src/openai.rs | 23 +++++++ 13 files changed, 214 insertions(+), 2 deletions(-) diff --git a/analysis/memory.md b/analysis/memory.md index a816201..b24b559 100644 --- a/analysis/memory.md +++ b/analysis/memory.md @@ -1,5 +1,5 @@ # Workspace Memory -> Updated: 2026-01-20T10:16:13Z | Size: 18.3k chars +> Updated: 2026-01-27T00:12:18Z | Size: 19.5k chars ### Remember Tool Wiring - `crates/g3-core/src/tools/memory.rs` [0..5000] - `execute_remember()`, `get_memory_path()`, `merge_memory()` @@ -324,4 +324,26 @@ Centralized logic for determining how to display tool execution results. - `is_compact_tool()` [147..162] - checks if tool uses one-line summaries (read_file, write_file, str_replace, etc.) - `is_self_handled_tool()` [164..167] - checks if tool handles own output (todo_read, todo_write) - `format_compact_tool_summary()` [169..185] - dispatches to format_*_summary() based on tool name - - `parse_diff_stats()` [187..210] - parses "+N insertions | -M deletions" from str_replace result \ No newline at end of file + - `parse_diff_stats()` [187..210] - parses "+N insertions | -M deletions" from str_replace result + +### Prompt Cache Statistics Tracking +Tracks prompt/prefix caching efficacy across Anthropic and OpenAI providers. 
+ +- `crates/g3-providers/src/lib.rs` + - `Usage` [195..210] - added `cache_creation_tokens` and `cache_read_tokens` fields with `#[serde(default)]` + +- `crates/g3-providers/src/anthropic.rs` + - `AnthropicUsage` [944..956] - parses `cache_creation_input_tokens` and `cache_read_input_tokens` + +- `crates/g3-providers/src/openai.rs` + - `OpenAIUsage` [494..510] - parses `prompt_tokens_details.cached_tokens` + - `OpenAIPromptTokensDetails` [504..510] - nested struct for prompt token details + +- `crates/g3-core/src/lib.rs` + - `CacheStats` [75..90] - cumulative cache statistics struct with `total_cache_creation_tokens`, `total_cache_read_tokens`, `total_input_tokens`, `cache_hit_calls`, `total_calls` + - `Agent.cache_stats` [106] - field tracking cumulative cache stats + - Cache stats updated in `stream_completion_with_tools()` [2140..2150] when usage data received + +- `crates/g3-core/src/stats.rs` + - `AgentStatsSnapshot.cache_stats` [20] - reference to cache stats for formatting + - `format_cache_stats()` [189..230] - formats cache statistics section with hit rate and efficiency metrics \ No newline at end of file diff --git a/crates/g3-core/src/lib.rs b/crates/g3-core/src/lib.rs index 78d7e54..616a015 100644 --- a/crates/g3-core/src/lib.rs +++ b/crates/g3-core/src/lib.rs @@ -74,6 +74,22 @@ pub struct ToolCall { pub args: serde_json::Value, // Should be a JSON object with tool-specific arguments } +/// Cumulative cache statistics for prompt caching efficacy tracking. +/// Tracks both Anthropic-style (cache_creation + cache_read) and OpenAI-style (cached_tokens) caching. +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +pub struct CacheStats { + /// Total tokens written to cache across all API calls + pub total_cache_creation_tokens: u64, + /// Total tokens read from cache across all API calls + pub total_cache_read_tokens: u64, + /// Total input tokens (for calculating cache hit rate) + pub total_input_tokens: u64, + /// Number of API calls that had cache hits + pub cache_hit_calls: u32, + /// Total number of API calls + pub total_calls: u32, +} + // Re-export WebDriverSession from its own module pub use webdriver_session::WebDriverSession; @@ -103,6 +119,8 @@ pub struct Agent { auto_compact: bool, // whether to auto-compact at 90% before tool calls compaction_events: Vec, // chars saved per compaction event first_token_times: Vec, // time to first token for each completion + /// Cumulative cache statistics across all API calls + cache_stats: CacheStats, config: Config, session_id: Option, tool_call_metrics: Vec<(String, Duration, bool)>, // (tool_name, duration, success) @@ -211,6 +229,7 @@ impl Agent { thinning_events: Vec::new(), compaction_events: Vec::new(), first_token_times: Vec::new(), + cache_stats: CacheStats::default(), config, session_id: None, tool_call_metrics: Vec::new(), @@ -272,6 +291,7 @@ impl Agent { thinning_events: Vec::new(), compaction_events: Vec::new(), first_token_times: Vec::new(), + cache_stats: CacheStats::default(), config, session_id: None, tool_call_metrics: Vec::new(), @@ -387,6 +407,7 @@ impl Agent { thinning_events: Vec::new(), compaction_events: Vec::new(), first_token_times: Vec::new(), + cache_stats: CacheStats::default(), config, session_id: None, tool_call_metrics: Vec::new(), @@ -986,6 +1007,8 @@ impl Agent { prompt_tokens: 100, // Estimate completion_tokens: response_content.len() as u32 / 4, // Rough estimate total_tokens: 100 + (response_content.len() as u32 / 4), + cache_creation_tokens: 0, + cache_read_tokens: 0, }; // Update context 
window with estimated token usage @@ -1408,6 +1431,7 @@ impl Agent { first_token_times: &self.first_token_times, tool_call_metrics: &self.tool_call_metrics, provider_info: self.get_provider_info().ok(), + cache_stats: &self.cache_stats, }; snapshot.format() @@ -2111,6 +2135,17 @@ Skip if nothing new. Be brief."#; if let Some(ref usage) = chunk.usage { iter.accumulated_usage = Some(usage.clone()); state.turn_accumulated_usage = Some(usage.clone()); + + // Update cumulative cache statistics + self.cache_stats.total_calls += 1; + self.cache_stats.total_input_tokens += usage.prompt_tokens as u64; + self.cache_stats.total_cache_creation_tokens += + usage.cache_creation_tokens as u64; + self.cache_stats.total_cache_read_tokens += + usage.cache_read_tokens as u64; + if usage.cache_read_tokens > 0 { + self.cache_stats.cache_hit_calls += 1; + } debug!( "Received usage data - prompt: {}, completion: {}, total: {}", usage.prompt_tokens, usage.completion_tokens, usage.total_tokens diff --git a/crates/g3-core/src/stats.rs b/crates/g3-core/src/stats.rs index c78d384..2cef307 100644 --- a/crates/g3-core/src/stats.rs +++ b/crates/g3-core/src/stats.rs @@ -7,6 +7,7 @@ use g3_providers::MessageRole; use std::time::Duration; use crate::context_window::ContextWindow; +use crate::CacheStats; /// Data required to format agent statistics. /// This struct captures a snapshot of agent state for formatting. @@ -17,6 +18,7 @@ pub struct AgentStatsSnapshot<'a> { pub first_token_times: &'a [Duration], pub tool_call_metrics: &'a [(String, Duration, bool)], pub provider_info: Option<(String, String)>, + pub cache_stats: &'a CacheStats, } impl<'a> AgentStatsSnapshot<'a> { @@ -33,6 +35,7 @@ impl<'a> AgentStatsSnapshot<'a> { self.format_performance_metrics(&mut stats); self.format_conversation_history(&mut stats); self.format_tool_call_metrics(&mut stats); + self.format_cache_stats(&mut stats); self.format_provider_info(&mut stats); stats.push_str(&"=".repeat(60)); @@ -184,6 +187,53 @@ impl<'a> AgentStatsSnapshot<'a> { stats.push('\n'); } + fn format_cache_stats(&self, stats: &mut String) { + stats.push_str("💾 Prompt Cache Statistics:\n"); + stats.push_str(&format!( + " • API Calls: {:>10}\n", + self.cache_stats.total_calls + )); + stats.push_str(&format!( + " • Cache Hits: {:>10}\n", + self.cache_stats.cache_hit_calls + )); + + // Calculate hit rate + let hit_rate = if self.cache_stats.total_calls > 0 { + (self.cache_stats.cache_hit_calls as f64 / self.cache_stats.total_calls as f64) * 100.0 + } else { + 0.0 + }; + stats.push_str(&format!(" • Hit Rate: {:>9.1}%\n", hit_rate)); + + stats.push_str(&format!( + " • Total Input Tokens:{:>10}\n", + self.cache_stats.total_input_tokens + )); + stats.push_str(&format!( + " • Cache Created: {:>10} tokens\n", + self.cache_stats.total_cache_creation_tokens + )); + stats.push_str(&format!( + " • Cache Read: {:>10} tokens\n", + self.cache_stats.total_cache_read_tokens + )); + + // Calculate cache read percentage of total input + let cache_read_pct = if self.cache_stats.total_input_tokens > 0 { + (self.cache_stats.total_cache_read_tokens as f64 + / self.cache_stats.total_input_tokens as f64) + * 100.0 + } else { + 0.0 + }; + stats.push_str(&format!( + " • Cache Efficiency: {:>9.1}% of input from cache\n", + cache_read_pct + )); + stats.push('\n'); + } + fn format_provider_info(&self, stats: &mut String) { stats.push_str("🔌 Provider:\n"); if let Some((provider, model)) = &self.provider_info { @@ -201,6 +251,7 @@ mod tests { #[test] fn test_format_stats_empty() { let context_window = 
ContextWindow::new(100000); + let cache_stats = CacheStats::default(); let snapshot = AgentStatsSnapshot { context_window: &context_window, thinning_events: &[], @@ -208,6 +259,7 @@ mod tests { first_token_times: &[], tool_call_metrics: &[], provider_info: None, + cache_stats: &cache_stats, }; let stats = snapshot.format(); @@ -215,6 +267,7 @@ mod tests { assert!(stats.contains("Used Tokens")); assert!(stats.contains("Thinning Events")); assert!(stats.contains("Tool Call Metrics")); + assert!(stats.contains("Prompt Cache Statistics")); } #[test] @@ -222,6 +275,13 @@ mod tests { let context_window = ContextWindow::new(100000); let thinning_events = vec![1000, 2000, 1500]; let compaction_events = vec![5000]; + let cache_stats = CacheStats { + total_calls: 5, + cache_hit_calls: 3, + total_input_tokens: 10000, + total_cache_creation_tokens: 2000, + total_cache_read_tokens: 6000, + }; let first_token_times = vec![ Duration::from_millis(100), Duration::from_millis(150), @@ -240,6 +300,7 @@ mod tests { first_token_times: &first_token_times, tool_call_metrics: &tool_call_metrics, provider_info: Some(("anthropic".to_string(), "claude-3".to_string())), + cache_stats: &cache_stats, }; let stats = snapshot.format(); @@ -259,5 +320,12 @@ mod tests { // Check provider info assert!(stats.contains("Provider: anthropic")); assert!(stats.contains("Model: claude-3")); + + // Check cache stats + assert!(stats.contains("Prompt Cache Statistics")); + assert!(stats.contains("API Calls: 5")); + assert!(stats.contains("Cache Hits: 3")); + assert!(stats.contains("Hit Rate:") && stats.contains("60.0%")); + assert!(stats.contains("Cache Efficiency:")); } } diff --git a/crates/g3-core/tests/json_parsing_stress_test.rs b/crates/g3-core/tests/json_parsing_stress_test.rs index 939db4d..4e048b3 100644 --- a/crates/g3-core/tests/json_parsing_stress_test.rs +++ b/crates/g3-core/tests/json_parsing_stress_test.rs @@ -57,6 +57,8 @@ fn finished_chunk() -> CompletionChunk { prompt_tokens: 100, completion_tokens: 50, total_tokens: 150, + cache_creation_tokens: 0, + cache_read_tokens: 0, }), } } @@ -697,6 +699,8 @@ async fn test_agent_json_fallback_executes() { prompt_tokens: 100, completion_tokens: 50, total_tokens: 150, + cache_creation_tokens: 0, + cache_read_tokens: 0, }, )) .with_default_response(MockResponse::text("Done.")); @@ -800,6 +804,8 @@ async fn test_tool_result_with_json_not_parsed() { prompt_tokens: 100, completion_tokens: 50, total_tokens: 150, + cache_creation_tokens: 0, + cache_read_tokens: 0, }, )) // Second response: LLM acknowledges the file content diff --git a/crates/g3-core/tests/mock_provider_integration_test.rs b/crates/g3-core/tests/mock_provider_integration_test.rs index 97b3336..cd9a8b7 100644 --- a/crates/g3-core/tests/mock_provider_integration_test.rs +++ b/crates/g3-core/tests/mock_provider_integration_test.rs @@ -674,6 +674,8 @@ async fn test_multiple_tools_in_single_response_all_executed() { prompt_tokens: 100, completion_tokens: 100, total_tokens: 200, + cache_creation_tokens: 0, + cache_read_tokens: 0, }, )) .with_default_response(MockResponse::text("Both commands executed.")); @@ -774,6 +776,8 @@ async fn test_llm_repeats_text_before_each_tool_call() { prompt_tokens: 100, completion_tokens: 50, total_tokens: 150, + cache_creation_tokens: 0, + cache_read_tokens: 0, }, )) // Second response: SAME preamble + tool call 2 @@ -789,6 +793,8 @@ async fn test_llm_repeats_text_before_each_tool_call() { prompt_tokens: 150, completion_tokens: 50, total_tokens: 200, + cache_creation_tokens: 0, + 
cache_read_tokens: 0, }, )) // Third response: final acknowledgment diff --git a/crates/g3-core/tests/streaming_completion_test.rs b/crates/g3-core/tests/streaming_completion_test.rs index 9279478..28037e5 100644 --- a/crates/g3-core/tests/streaming_completion_test.rs +++ b/crates/g3-core/tests/streaming_completion_test.rs @@ -60,6 +60,8 @@ fn default_usage() -> Usage { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0, + cache_creation_tokens: 0, + cache_read_tokens: 0, } } @@ -169,6 +171,8 @@ impl LLMProvider for MockStreamingProvider { prompt_tokens: 100, completion_tokens: 50, total_tokens: 150, + cache_creation_tokens: 0, + cache_read_tokens: 0, }), stop_reason: Some("end_turn".to_string()), tool_call_streaming: None, @@ -201,6 +205,8 @@ impl LLMProvider for MockStreamingProvider { prompt_tokens: 50, completion_tokens: 10, total_tokens: 60, + cache_creation_tokens: 0, + cache_read_tokens: 0, }), stop_reason: Some("end_turn".to_string()), tool_call_streaming: None, @@ -407,6 +413,8 @@ async fn test_finished_signal_terminates_stream() { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0, + cache_creation_tokens: 0, + cache_read_tokens: 0, }, model: "simple".to_string(), }) @@ -439,6 +447,8 @@ async fn test_finished_signal_terminates_stream() { prompt_tokens: 10, completion_tokens: 10, total_tokens: 20, + cache_creation_tokens: 0, + cache_read_tokens: 0, }), stop_reason: Some("end_turn".to_string()), tool_call_streaming: None, diff --git a/crates/g3-core/tests/test_token_counting.rs b/crates/g3-core/tests/test_token_counting.rs index 654d41e..062283d 100644 --- a/crates/g3-core/tests/test_token_counting.rs +++ b/crates/g3-core/tests/test_token_counting.rs @@ -38,6 +38,8 @@ fn test_update_usage_only_affects_cumulative() { prompt_tokens: 100, completion_tokens: 50, total_tokens: 150, + cache_creation_tokens: 0, + cache_read_tokens: 0, }; window.update_usage_from_response(&usage); @@ -52,6 +54,8 @@ fn test_update_usage_only_affects_cumulative() { prompt_tokens: 200, completion_tokens: 75, total_tokens: 275, + cache_creation_tokens: 0, + cache_read_tokens: 0, }; window.update_usage_from_response(&usage2); @@ -156,6 +160,8 @@ fn test_cumulative_vs_used_independence() { prompt_tokens: 500, completion_tokens: 200, total_tokens: 700, + cache_creation_tokens: 0, + cache_read_tokens: 0, }; window.update_usage_from_response(&usage); diff --git a/crates/g3-providers/src/anthropic.rs b/crates/g3-providers/src/anthropic.rs index 9835ed1..780cbfe 100644 --- a/crates/g3-providers/src/anthropic.rs +++ b/crates/g3-providers/src/anthropic.rs @@ -464,6 +464,10 @@ impl AnthropicProvider { completion_tokens: usage.output_tokens, total_tokens: usage.input_tokens + usage.output_tokens, + cache_creation_tokens: usage + .cache_creation_input_tokens, + cache_read_tokens: usage + .cache_read_input_tokens, }); debug!( "Captured usage from message_start: {:?}", @@ -739,6 +743,8 @@ impl LLMProvider for AnthropicProvider { completion_tokens: anthropic_response.usage.output_tokens, total_tokens: anthropic_response.usage.input_tokens + anthropic_response.usage.output_tokens, + cache_creation_tokens: anthropic_response.usage.cache_creation_input_tokens, + cache_read_tokens: anthropic_response.usage.cache_read_input_tokens, }; debug!( @@ -945,6 +951,12 @@ struct AnthropicResponse { struct AnthropicUsage { input_tokens: u32, output_tokens: u32, + /// Tokens written to cache when creating a new cache entry + #[serde(default)] + cache_creation_input_tokens: u32, + /// Tokens retrieved from cache (cache hit) + 
#[serde(default)] + cache_read_input_tokens: u32, } // Streaming response structures diff --git a/crates/g3-providers/src/databricks.rs b/crates/g3-providers/src/databricks.rs index c309a78..161a5e8 100644 --- a/crates/g3-providers/src/databricks.rs +++ b/crates/g3-providers/src/databricks.rs @@ -763,6 +763,8 @@ impl LLMProvider for DatabricksProvider { prompt_tokens: databricks_response.usage.prompt_tokens, completion_tokens: databricks_response.usage.completion_tokens, total_tokens: databricks_response.usage.total_tokens, + cache_creation_tokens: 0, // Databricks doesn't support prompt caching + cache_read_tokens: 0, }; debug!( diff --git a/crates/g3-providers/src/embedded.rs b/crates/g3-providers/src/embedded.rs index 9ac9882..9a196d2 100644 --- a/crates/g3-providers/src/embedded.rs +++ b/crates/g3-providers/src/embedded.rs @@ -531,6 +531,8 @@ impl LLMProvider for EmbeddedProvider { prompt_tokens, completion_tokens, total_tokens: prompt_tokens + completion_tokens, + cache_creation_tokens: 0, // Embedded models don't support prompt caching + cache_read_tokens: 0, }, model: self.model_name.clone(), }) diff --git a/crates/g3-providers/src/lib.rs b/crates/g3-providers/src/lib.rs index f34860b..ab5c38a 100644 --- a/crates/g3-providers/src/lib.rs +++ b/crates/g3-providers/src/lib.rs @@ -196,6 +196,12 @@ pub struct Usage { pub prompt_tokens: u32, pub completion_tokens: u32, pub total_tokens: u32, + /// Tokens written to cache (Anthropic: cache_creation_input_tokens) + #[serde(default)] + pub cache_creation_tokens: u32, + /// Tokens read from cache (Anthropic: cache_read_input_tokens, OpenAI: cached_tokens) + #[serde(default)] + pub cache_read_tokens: u32, } pub type CompletionStream = tokio_stream::wrappers::ReceiverStream>; diff --git a/crates/g3-providers/src/mock.rs b/crates/g3-providers/src/mock.rs index 9007f84..bfccb83 100644 --- a/crates/g3-providers/src/mock.rs +++ b/crates/g3-providers/src/mock.rs @@ -120,6 +120,8 @@ impl MockResponse { prompt_tokens: 100, completion_tokens: content.len() as u32 / 4, total_tokens: 100 + content.len() as u32 / 4, + cache_creation_tokens: 0, + cache_read_tokens: 0, }, } } @@ -139,6 +141,8 @@ impl MockResponse { prompt_tokens: 100, completion_tokens: total_content.len() as u32 / 4, total_tokens: 100 + total_content.len() as u32 / 4, + cache_creation_tokens: 0, + cache_read_tokens: 0, }, } } @@ -155,6 +159,8 @@ impl MockResponse { prompt_tokens: 100, completion_tokens: 50, total_tokens: 150, + cache_creation_tokens: 0, + cache_read_tokens: 0, }, } } @@ -172,6 +178,8 @@ impl MockResponse { prompt_tokens: 100, completion_tokens: 50 + text.len() as u32 / 4, total_tokens: 150 + text.len() as u32 / 4, + cache_creation_tokens: 0, + cache_read_tokens: 0, }, } } @@ -192,6 +200,8 @@ impl MockResponse { prompt_tokens: 100, completion_tokens: 100, total_tokens: 200, + cache_creation_tokens: 0, + cache_read_tokens: 0, }, } } @@ -215,6 +225,8 @@ impl MockResponse { prompt_tokens: 100, completion_tokens: full_content.len() as u32 / 4, total_tokens: 100 + full_content.len() as u32 / 4, + cache_creation_tokens: 0, + cache_read_tokens: 0, }, } } @@ -230,6 +242,8 @@ impl MockResponse { prompt_tokens: 100, completion_tokens: content.len() as u32 / 4, total_tokens: 100 + content.len() as u32 / 4, + cache_creation_tokens: 0, + cache_read_tokens: 0, }, } } diff --git a/crates/g3-providers/src/openai.rs b/crates/g3-providers/src/openai.rs index a60b333..19d6610 100644 --- a/crates/g3-providers/src/openai.rs +++ b/crates/g3-providers/src/openai.rs @@ -220,6 +220,12 @@ impl 
OpenAIProvider {
             prompt_tokens: usage.prompt_tokens,
             completion_tokens: usage.completion_tokens,
             total_tokens: usage.total_tokens,
+            cache_creation_tokens: 0, // OpenAI doesn't report cache creation
+            cache_read_tokens: usage
+                .prompt_tokens_details
+                .as_ref()
+                .map(|d| d.cached_tokens)
+                .unwrap_or(0),
         });
     }
 }
@@ -306,6 +312,13 @@ impl LLMProvider for OpenAIProvider {
             prompt_tokens: openai_response.usage.prompt_tokens,
             completion_tokens: openai_response.usage.completion_tokens,
             total_tokens: openai_response.usage.total_tokens,
+            cache_creation_tokens: 0, // OpenAI doesn't report cache creation
+            cache_read_tokens: openai_response
+                .usage
+                .prompt_tokens_details
+                .as_ref()
+                .map(|d| d.cached_tokens)
+                .unwrap_or(0),
         };
 
         debug!(
@@ -495,6 +508,16 @@ struct OpenAIUsage {
     prompt_tokens: u32,
     completion_tokens: u32,
     total_tokens: u32,
+    /// Detailed breakdown of prompt tokens including cache info
+    #[serde(default)]
+    prompt_tokens_details: Option<OpenAIPromptTokensDetails>,
+}
+
+#[derive(Debug, Deserialize, Default)]
+struct OpenAIPromptTokensDetails {
+    /// Tokens retrieved from cache (cache hit)
+    #[serde(default)]
+    cached_tokens: u32,
 }
 
 // Streaming response structures
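
Illustration (not part of the patch): a self-contained sketch of how the new CacheStats counters accumulate per API call and how the hit-rate and cache-efficiency percentages shown by /stats are derived. It mirrors the accumulation added in stream_completion_with_tools() and the arithmetic in format_cache_stats(); the struct is re-declared locally and the token counts are invented for illustration.

// Standalone sketch -- mirrors the patch's logic; token counts are made up.
#[derive(Debug, Default)]
struct CacheStats {
    total_cache_creation_tokens: u64,
    total_cache_read_tokens: u64,
    total_input_tokens: u64,
    cache_hit_calls: u32,
    total_calls: u32,
}

// Same update rule as stream_completion_with_tools(): every usage payload bumps
// the totals, and any nonzero cache_read count marks the call as a cache hit.
fn record(stats: &mut CacheStats, prompt_tokens: u32, cache_creation: u32, cache_read: u32) {
    stats.total_calls += 1;
    stats.total_input_tokens += prompt_tokens as u64;
    stats.total_cache_creation_tokens += cache_creation as u64;
    stats.total_cache_read_tokens += cache_read as u64;
    if cache_read > 0 {
        stats.cache_hit_calls += 1;
    }
}

fn main() {
    let mut s = CacheStats::default();
    record(&mut s, 4000, 3500, 0); // first call writes the cache, reads nothing
    record(&mut s, 4200, 0, 3500); // later calls serve the shared prefix from cache
    record(&mut s, 4500, 0, 3500);

    // Same percentages format_cache_stats() reports (the patch additionally
    // guards against division by zero when no calls have been made).
    let hit_rate = s.cache_hit_calls as f64 / s.total_calls as f64 * 100.0; // 66.7%
    let efficiency =
        s.total_cache_read_tokens as f64 / s.total_input_tokens as f64 * 100.0; // ~55.1%
    println!("Hit Rate: {:.1}%  Cache Efficiency: {:.1}%", hit_rate, efficiency);
}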