Add prompt cache statistics tracking to /stats command

- Extend Usage struct with cache_creation_tokens and cache_read_tokens fields
- Parse Anthropic cache_creation_input_tokens and cache_read_input_tokens
- Parse OpenAI prompt_tokens_details.cached_tokens for automatic prefix caching
- Add CacheStats struct to Agent for cumulative tracking across API calls
- Add "Prompt Cache Statistics" section to /stats output showing:
  - API call count and cache hit count
  - Hit rate percentage
  - Total input tokens and cache read/creation tokens
  - Cache efficiency (% of input served from cache)
- Update all provider implementations and test files
2026-01-27 11:32:45 +11:00
parent 96899230a4
commit 5b4079e861
13 changed files with 214 additions and 2 deletions
--- a/crates/g3-providers/src/anthropic.rs
+++ b/crates/g3-providers/src/anthropic.rs
@@ -464,6 +464,10 @@ impl AnthropicProvider {
                                                        completion_tokens: usage.output_tokens,
                                                        total_tokens: usage.input_tokens
                                                            + usage.output_tokens,
+                                                        cache_creation_tokens: usage
+                                                            .cache_creation_input_tokens,
+                                                        cache_read_tokens: usage
+                                                            .cache_read_input_tokens,
                                                    });
                                                    debug!(
                                                        "Captured usage from message_start: {:?}",
@@ -739,6 +743,8 @@ impl LLMProvider for AnthropicProvider {
            completion_tokens: anthropic_response.usage.output_tokens,
            total_tokens: anthropic_response.usage.input_tokens
                + anthropic_response.usage.output_tokens,
+            cache_creation_tokens: anthropic_response.usage.cache_creation_input_tokens,
+            cache_read_tokens: anthropic_response.usage.cache_read_input_tokens,
        };

        debug!(
@@ -945,6 +951,12 @@ struct AnthropicResponse {
 struct AnthropicUsage {
    input_tokens: u32,
    output_tokens: u32,
+    /// Tokens written to cache when creating a new cache entry
+    #[serde(default)]
+    cache_creation_input_tokens: u32,
+    /// Tokens retrieved from cache (cache hit)
+    #[serde(default)]
+    cache_read_input_tokens: u32,
 }

 // Streaming response structures