Add prompt cache statistics tracking to /stats command
- Extend Usage struct with cache_creation_tokens and cache_read_tokens fields
- Parse Anthropic cache_creation_input_tokens and cache_read_input_tokens
- Parse OpenAI prompt_tokens_details.cached_tokens for automatic prefix caching
- Add CacheStats struct to Agent for cumulative tracking across API calls
- Add "Prompt Cache Statistics" section to /stats output showing:
  - API call count and cache hit count
  - Hit rate percentage
  - Total input tokens and cache read/creation tokens
  - Cache efficiency (% of input served from cache)
- Update all provider implementations and test files
This commit is contained in:
@@ -1,5 +1,5 @@
|
|||||||
# Workspace Memory
|
# Workspace Memory
|
||||||
> Updated: 2026-01-20T10:16:13Z | Size: 18.3k chars
|
> Updated: 2026-01-27T00:12:18Z | Size: 19.5k chars
|
||||||
|
|
||||||
### Remember Tool Wiring
|
### Remember Tool Wiring
|
||||||
- `crates/g3-core/src/tools/memory.rs` [0..5000] - `execute_remember()`, `get_memory_path()`, `merge_memory()`
|
- `crates/g3-core/src/tools/memory.rs` [0..5000] - `execute_remember()`, `get_memory_path()`, `merge_memory()`
|
||||||
@@ -324,4 +324,26 @@ Centralized logic for determining how to display tool execution results.
|
|||||||
- `is_compact_tool()` [147..162] - checks if tool uses one-line summaries (read_file, write_file, str_replace, etc.)
|
- `is_compact_tool()` [147..162] - checks if tool uses one-line summaries (read_file, write_file, str_replace, etc.)
|
||||||
- `is_self_handled_tool()` [164..167] - checks if tool handles own output (todo_read, todo_write)
|
- `is_self_handled_tool()` [164..167] - checks if tool handles own output (todo_read, todo_write)
|
||||||
- `format_compact_tool_summary()` [169..185] - dispatches to format_*_summary() based on tool name
|
- `format_compact_tool_summary()` [169..185] - dispatches to format_*_summary() based on tool name
|
||||||
- `parse_diff_stats()` [187..210] - parses "+N insertions | -M deletions" from str_replace result
|
- `parse_diff_stats()` [187..210] - parses "+N insertions | -M deletions" from str_replace result
|
||||||
|
|
||||||
|
### Prompt Cache Statistics Tracking
|
||||||
|
Tracks prompt/prefix caching efficacy across Anthropic and OpenAI providers.
|
||||||
|
|
||||||
|
- `crates/g3-providers/src/lib.rs`
|
||||||
|
- `Usage` [195..210] - added `cache_creation_tokens` and `cache_read_tokens` fields with `#[serde(default)]`
|
||||||
|
|
||||||
|
- `crates/g3-providers/src/anthropic.rs`
|
||||||
|
- `AnthropicUsage` [944..956] - parses `cache_creation_input_tokens` and `cache_read_input_tokens`
|
||||||
|
|
||||||
|
- `crates/g3-providers/src/openai.rs`
|
||||||
|
- `OpenAIUsage` [494..510] - parses `prompt_tokens_details.cached_tokens`
|
||||||
|
- `OpenAIPromptTokensDetails` [504..510] - nested struct for prompt token details
|
||||||
|
|
||||||
|
- `crates/g3-core/src/lib.rs`
|
||||||
|
- `CacheStats` [75..90] - cumulative cache statistics struct with `total_cache_creation_tokens`, `total_cache_read_tokens`, `total_input_tokens`, `cache_hit_calls`, `total_calls`
|
||||||
|
- `Agent.cache_stats` [106] - field tracking cumulative cache stats
|
||||||
|
- Cache stats updated in `stream_completion_with_tools()` [2140..2150] when usage data received
|
||||||
|
|
||||||
|
- `crates/g3-core/src/stats.rs`
|
||||||
|
- `AgentStatsSnapshot.cache_stats` [20] - reference to cache stats for formatting
|
||||||
|
- `format_cache_stats()` [189..230] - formats cache statistics section with hit rate and efficiency metrics
|
||||||
@@ -74,6 +74,22 @@ pub struct ToolCall {
|
|||||||
pub args: serde_json::Value, // Should be a JSON object with tool-specific arguments
|
pub args: serde_json::Value, // Should be a JSON object with tool-specific arguments
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Cumulative cache statistics for prompt caching efficacy tracking.
|
||||||
|
/// Tracks both Anthropic-style (cache_creation + cache_read) and OpenAI-style (cached_tokens) caching.
|
||||||
|
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
|
||||||
|
pub struct CacheStats {
|
||||||
|
/// Total tokens written to cache across all API calls
|
||||||
|
pub total_cache_creation_tokens: u64,
|
||||||
|
/// Total tokens read from cache across all API calls
|
||||||
|
pub total_cache_read_tokens: u64,
|
||||||
|
/// Total input tokens (for calculating cache hit rate)
|
||||||
|
pub total_input_tokens: u64,
|
||||||
|
/// Number of API calls that had cache hits
|
||||||
|
pub cache_hit_calls: u32,
|
||||||
|
/// Total number of API calls
|
||||||
|
pub total_calls: u32,
|
||||||
|
}
|
||||||
|
|
||||||
// Re-export WebDriverSession from its own module
|
// Re-export WebDriverSession from its own module
|
||||||
pub use webdriver_session::WebDriverSession;
|
pub use webdriver_session::WebDriverSession;
|
||||||
|
|
||||||
@@ -103,6 +119,8 @@ pub struct Agent<W: UiWriter> {
|
|||||||
auto_compact: bool, // whether to auto-compact at 90% before tool calls
|
auto_compact: bool, // whether to auto-compact at 90% before tool calls
|
||||||
compaction_events: Vec<usize>, // chars saved per compaction event
|
compaction_events: Vec<usize>, // chars saved per compaction event
|
||||||
first_token_times: Vec<Duration>, // time to first token for each completion
|
first_token_times: Vec<Duration>, // time to first token for each completion
|
||||||
|
/// Cumulative cache statistics across all API calls
|
||||||
|
cache_stats: CacheStats,
|
||||||
config: Config,
|
config: Config,
|
||||||
session_id: Option<String>,
|
session_id: Option<String>,
|
||||||
tool_call_metrics: Vec<(String, Duration, bool)>, // (tool_name, duration, success)
|
tool_call_metrics: Vec<(String, Duration, bool)>, // (tool_name, duration, success)
|
||||||
@@ -211,6 +229,7 @@ impl<W: UiWriter> Agent<W> {
|
|||||||
thinning_events: Vec::new(),
|
thinning_events: Vec::new(),
|
||||||
compaction_events: Vec::new(),
|
compaction_events: Vec::new(),
|
||||||
first_token_times: Vec::new(),
|
first_token_times: Vec::new(),
|
||||||
|
cache_stats: CacheStats::default(),
|
||||||
config,
|
config,
|
||||||
session_id: None,
|
session_id: None,
|
||||||
tool_call_metrics: Vec::new(),
|
tool_call_metrics: Vec::new(),
|
||||||
@@ -272,6 +291,7 @@ impl<W: UiWriter> Agent<W> {
|
|||||||
thinning_events: Vec::new(),
|
thinning_events: Vec::new(),
|
||||||
compaction_events: Vec::new(),
|
compaction_events: Vec::new(),
|
||||||
first_token_times: Vec::new(),
|
first_token_times: Vec::new(),
|
||||||
|
cache_stats: CacheStats::default(),
|
||||||
config,
|
config,
|
||||||
session_id: None,
|
session_id: None,
|
||||||
tool_call_metrics: Vec::new(),
|
tool_call_metrics: Vec::new(),
|
||||||
@@ -387,6 +407,7 @@ impl<W: UiWriter> Agent<W> {
|
|||||||
thinning_events: Vec::new(),
|
thinning_events: Vec::new(),
|
||||||
compaction_events: Vec::new(),
|
compaction_events: Vec::new(),
|
||||||
first_token_times: Vec::new(),
|
first_token_times: Vec::new(),
|
||||||
|
cache_stats: CacheStats::default(),
|
||||||
config,
|
config,
|
||||||
session_id: None,
|
session_id: None,
|
||||||
tool_call_metrics: Vec::new(),
|
tool_call_metrics: Vec::new(),
|
||||||
@@ -986,6 +1007,8 @@ impl<W: UiWriter> Agent<W> {
|
|||||||
prompt_tokens: 100, // Estimate
|
prompt_tokens: 100, // Estimate
|
||||||
completion_tokens: response_content.len() as u32 / 4, // Rough estimate
|
completion_tokens: response_content.len() as u32 / 4, // Rough estimate
|
||||||
total_tokens: 100 + (response_content.len() as u32 / 4),
|
total_tokens: 100 + (response_content.len() as u32 / 4),
|
||||||
|
cache_creation_tokens: 0,
|
||||||
|
cache_read_tokens: 0,
|
||||||
};
|
};
|
||||||
|
|
||||||
// Update context window with estimated token usage
|
// Update context window with estimated token usage
|
||||||
@@ -1408,6 +1431,7 @@ impl<W: UiWriter> Agent<W> {
|
|||||||
first_token_times: &self.first_token_times,
|
first_token_times: &self.first_token_times,
|
||||||
tool_call_metrics: &self.tool_call_metrics,
|
tool_call_metrics: &self.tool_call_metrics,
|
||||||
provider_info: self.get_provider_info().ok(),
|
provider_info: self.get_provider_info().ok(),
|
||||||
|
cache_stats: &self.cache_stats,
|
||||||
};
|
};
|
||||||
|
|
||||||
snapshot.format()
|
snapshot.format()
|
||||||
@@ -2111,6 +2135,17 @@ Skip if nothing new. Be brief."#;
|
|||||||
if let Some(ref usage) = chunk.usage {
|
if let Some(ref usage) = chunk.usage {
|
||||||
iter.accumulated_usage = Some(usage.clone());
|
iter.accumulated_usage = Some(usage.clone());
|
||||||
state.turn_accumulated_usage = Some(usage.clone());
|
state.turn_accumulated_usage = Some(usage.clone());
|
||||||
|
|
||||||
|
// Update cumulative cache statistics
|
||||||
|
self.cache_stats.total_calls += 1;
|
||||||
|
self.cache_stats.total_input_tokens += usage.prompt_tokens as u64;
|
||||||
|
self.cache_stats.total_cache_creation_tokens +=
|
||||||
|
usage.cache_creation_tokens as u64;
|
||||||
|
self.cache_stats.total_cache_read_tokens +=
|
||||||
|
usage.cache_read_tokens as u64;
|
||||||
|
if usage.cache_read_tokens > 0 {
|
||||||
|
self.cache_stats.cache_hit_calls += 1;
|
||||||
|
}
|
||||||
debug!(
|
debug!(
|
||||||
"Received usage data - prompt: {}, completion: {}, total: {}",
|
"Received usage data - prompt: {}, completion: {}, total: {}",
|
||||||
usage.prompt_tokens, usage.completion_tokens, usage.total_tokens
|
usage.prompt_tokens, usage.completion_tokens, usage.total_tokens
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ use g3_providers::MessageRole;
|
|||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
|
|
||||||
use crate::context_window::ContextWindow;
|
use crate::context_window::ContextWindow;
|
||||||
|
use crate::CacheStats;
|
||||||
|
|
||||||
/// Data required to format agent statistics.
|
/// Data required to format agent statistics.
|
||||||
/// This struct captures a snapshot of agent state for formatting.
|
/// This struct captures a snapshot of agent state for formatting.
|
||||||
@@ -17,6 +18,7 @@ pub struct AgentStatsSnapshot<'a> {
|
|||||||
pub first_token_times: &'a [Duration],
|
pub first_token_times: &'a [Duration],
|
||||||
pub tool_call_metrics: &'a [(String, Duration, bool)],
|
pub tool_call_metrics: &'a [(String, Duration, bool)],
|
||||||
pub provider_info: Option<(String, String)>,
|
pub provider_info: Option<(String, String)>,
|
||||||
|
pub cache_stats: &'a CacheStats,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> AgentStatsSnapshot<'a> {
|
impl<'a> AgentStatsSnapshot<'a> {
|
||||||
@@ -33,6 +35,7 @@ impl<'a> AgentStatsSnapshot<'a> {
|
|||||||
self.format_performance_metrics(&mut stats);
|
self.format_performance_metrics(&mut stats);
|
||||||
self.format_conversation_history(&mut stats);
|
self.format_conversation_history(&mut stats);
|
||||||
self.format_tool_call_metrics(&mut stats);
|
self.format_tool_call_metrics(&mut stats);
|
||||||
|
self.format_cache_stats(&mut stats);
|
||||||
self.format_provider_info(&mut stats);
|
self.format_provider_info(&mut stats);
|
||||||
|
|
||||||
stats.push_str(&"=".repeat(60));
|
stats.push_str(&"=".repeat(60));
|
||||||
@@ -184,6 +187,53 @@ impl<'a> AgentStatsSnapshot<'a> {
|
|||||||
stats.push('\n');
|
stats.push('\n');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn format_cache_stats(&self, stats: &mut String) {
|
||||||
|
stats.push_str("💾 Prompt Cache Statistics:\n");
|
||||||
|
stats.push_str(&format!(
|
||||||
|
" • API Calls: {:>10}\n",
|
||||||
|
self.cache_stats.total_calls
|
||||||
|
));
|
||||||
|
stats.push_str(&format!(
|
||||||
|
" • Cache Hits: {:>10}\n",
|
||||||
|
self.cache_stats.cache_hit_calls
|
||||||
|
));
|
||||||
|
|
||||||
|
// Calculate hit rate
|
||||||
|
let hit_rate = if self.cache_stats.total_calls > 0 {
|
||||||
|
(self.cache_stats.cache_hit_calls as f64 / self.cache_stats.total_calls as f64) * 100.0
|
||||||
|
} else {
|
||||||
|
0.0
|
||||||
|
};
|
||||||
|
stats.push_str(&format!(" • Hit Rate: {:>9.1}%\n", hit_rate));
|
||||||
|
|
||||||
|
stats.push_str(&format!(
|
||||||
|
" • Total Input Tokens:{:>10}\n",
|
||||||
|
self.cache_stats.total_input_tokens
|
||||||
|
));
|
||||||
|
stats.push_str(&format!(
|
||||||
|
" • Cache Created: {:>10} tokens\n",
|
||||||
|
self.cache_stats.total_cache_creation_tokens
|
||||||
|
));
|
||||||
|
stats.push_str(&format!(
|
||||||
|
" • Cache Read: {:>10} tokens\n",
|
||||||
|
self.cache_stats.total_cache_read_tokens
|
||||||
|
));
|
||||||
|
|
||||||
|
// Calculate cache read percentage of total input
|
||||||
|
let cache_read_pct = if self.cache_stats.total_input_tokens > 0 {
|
||||||
|
(self.cache_stats.total_cache_read_tokens as f64
|
||||||
|
/ self.cache_stats.total_input_tokens as f64)
|
||||||
|
* 100.0
|
||||||
|
} else {
|
||||||
|
0.0
|
||||||
|
};
|
||||||
|
stats.push_str(&format!(
|
||||||
|
" • Cache Efficiency: {:>9.1}% of input from cache\n",
|
||||||
|
cache_read_pct
|
||||||
|
));
|
||||||
|
stats.push('\n');
|
||||||
|
}
|
||||||
|
|
||||||
fn format_provider_info(&self, stats: &mut String) {
|
fn format_provider_info(&self, stats: &mut String) {
|
||||||
stats.push_str("🔌 Provider:\n");
|
stats.push_str("🔌 Provider:\n");
|
||||||
if let Some((provider, model)) = &self.provider_info {
|
if let Some((provider, model)) = &self.provider_info {
|
||||||
@@ -201,6 +251,7 @@ mod tests {
|
|||||||
#[test]
|
#[test]
|
||||||
fn test_format_stats_empty() {
|
fn test_format_stats_empty() {
|
||||||
let context_window = ContextWindow::new(100000);
|
let context_window = ContextWindow::new(100000);
|
||||||
|
let cache_stats = CacheStats::default();
|
||||||
let snapshot = AgentStatsSnapshot {
|
let snapshot = AgentStatsSnapshot {
|
||||||
context_window: &context_window,
|
context_window: &context_window,
|
||||||
thinning_events: &[],
|
thinning_events: &[],
|
||||||
@@ -208,6 +259,7 @@ mod tests {
|
|||||||
first_token_times: &[],
|
first_token_times: &[],
|
||||||
tool_call_metrics: &[],
|
tool_call_metrics: &[],
|
||||||
provider_info: None,
|
provider_info: None,
|
||||||
|
cache_stats: &cache_stats,
|
||||||
};
|
};
|
||||||
|
|
||||||
let stats = snapshot.format();
|
let stats = snapshot.format();
|
||||||
@@ -215,6 +267,7 @@ mod tests {
|
|||||||
assert!(stats.contains("Used Tokens"));
|
assert!(stats.contains("Used Tokens"));
|
||||||
assert!(stats.contains("Thinning Events"));
|
assert!(stats.contains("Thinning Events"));
|
||||||
assert!(stats.contains("Tool Call Metrics"));
|
assert!(stats.contains("Tool Call Metrics"));
|
||||||
|
assert!(stats.contains("Prompt Cache Statistics"));
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@@ -222,6 +275,13 @@ mod tests {
|
|||||||
let context_window = ContextWindow::new(100000);
|
let context_window = ContextWindow::new(100000);
|
||||||
let thinning_events = vec![1000, 2000, 1500];
|
let thinning_events = vec![1000, 2000, 1500];
|
||||||
let compaction_events = vec![5000];
|
let compaction_events = vec![5000];
|
||||||
|
let cache_stats = CacheStats {
|
||||||
|
total_calls: 5,
|
||||||
|
cache_hit_calls: 3,
|
||||||
|
total_input_tokens: 10000,
|
||||||
|
total_cache_creation_tokens: 2000,
|
||||||
|
total_cache_read_tokens: 6000,
|
||||||
|
};
|
||||||
let first_token_times = vec![
|
let first_token_times = vec![
|
||||||
Duration::from_millis(100),
|
Duration::from_millis(100),
|
||||||
Duration::from_millis(150),
|
Duration::from_millis(150),
|
||||||
@@ -240,6 +300,7 @@ mod tests {
|
|||||||
first_token_times: &first_token_times,
|
first_token_times: &first_token_times,
|
||||||
tool_call_metrics: &tool_call_metrics,
|
tool_call_metrics: &tool_call_metrics,
|
||||||
provider_info: Some(("anthropic".to_string(), "claude-3".to_string())),
|
provider_info: Some(("anthropic".to_string(), "claude-3".to_string())),
|
||||||
|
cache_stats: &cache_stats,
|
||||||
};
|
};
|
||||||
|
|
||||||
let stats = snapshot.format();
|
let stats = snapshot.format();
|
||||||
@@ -259,5 +320,12 @@ mod tests {
|
|||||||
// Check provider info
|
// Check provider info
|
||||||
assert!(stats.contains("Provider: anthropic"));
|
assert!(stats.contains("Provider: anthropic"));
|
||||||
assert!(stats.contains("Model: claude-3"));
|
assert!(stats.contains("Model: claude-3"));
|
||||||
|
|
||||||
|
// Check cache stats
|
||||||
|
assert!(stats.contains("Prompt Cache Statistics"));
|
||||||
|
assert!(stats.contains("API Calls: 5"));
|
||||||
|
assert!(stats.contains("Cache Hits: 3"));
|
||||||
|
assert!(stats.contains("Hit Rate:") && stats.contains("60.0%"));
|
||||||
|
assert!(stats.contains("Cache Efficiency:"));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -57,6 +57,8 @@ fn finished_chunk() -> CompletionChunk {
|
|||||||
prompt_tokens: 100,
|
prompt_tokens: 100,
|
||||||
completion_tokens: 50,
|
completion_tokens: 50,
|
||||||
total_tokens: 150,
|
total_tokens: 150,
|
||||||
|
cache_creation_tokens: 0,
|
||||||
|
cache_read_tokens: 0,
|
||||||
}),
|
}),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -697,6 +699,8 @@ async fn test_agent_json_fallback_executes() {
|
|||||||
prompt_tokens: 100,
|
prompt_tokens: 100,
|
||||||
completion_tokens: 50,
|
completion_tokens: 50,
|
||||||
total_tokens: 150,
|
total_tokens: 150,
|
||||||
|
cache_creation_tokens: 0,
|
||||||
|
cache_read_tokens: 0,
|
||||||
},
|
},
|
||||||
))
|
))
|
||||||
.with_default_response(MockResponse::text("Done."));
|
.with_default_response(MockResponse::text("Done."));
|
||||||
@@ -800,6 +804,8 @@ async fn test_tool_result_with_json_not_parsed() {
|
|||||||
prompt_tokens: 100,
|
prompt_tokens: 100,
|
||||||
completion_tokens: 50,
|
completion_tokens: 50,
|
||||||
total_tokens: 150,
|
total_tokens: 150,
|
||||||
|
cache_creation_tokens: 0,
|
||||||
|
cache_read_tokens: 0,
|
||||||
},
|
},
|
||||||
))
|
))
|
||||||
// Second response: LLM acknowledges the file content
|
// Second response: LLM acknowledges the file content
|
||||||
|
|||||||
@@ -674,6 +674,8 @@ async fn test_multiple_tools_in_single_response_all_executed() {
|
|||||||
prompt_tokens: 100,
|
prompt_tokens: 100,
|
||||||
completion_tokens: 100,
|
completion_tokens: 100,
|
||||||
total_tokens: 200,
|
total_tokens: 200,
|
||||||
|
cache_creation_tokens: 0,
|
||||||
|
cache_read_tokens: 0,
|
||||||
},
|
},
|
||||||
))
|
))
|
||||||
.with_default_response(MockResponse::text("Both commands executed."));
|
.with_default_response(MockResponse::text("Both commands executed."));
|
||||||
@@ -774,6 +776,8 @@ async fn test_llm_repeats_text_before_each_tool_call() {
|
|||||||
prompt_tokens: 100,
|
prompt_tokens: 100,
|
||||||
completion_tokens: 50,
|
completion_tokens: 50,
|
||||||
total_tokens: 150,
|
total_tokens: 150,
|
||||||
|
cache_creation_tokens: 0,
|
||||||
|
cache_read_tokens: 0,
|
||||||
},
|
},
|
||||||
))
|
))
|
||||||
// Second response: SAME preamble + tool call 2
|
// Second response: SAME preamble + tool call 2
|
||||||
@@ -789,6 +793,8 @@ async fn test_llm_repeats_text_before_each_tool_call() {
|
|||||||
prompt_tokens: 150,
|
prompt_tokens: 150,
|
||||||
completion_tokens: 50,
|
completion_tokens: 50,
|
||||||
total_tokens: 200,
|
total_tokens: 200,
|
||||||
|
cache_creation_tokens: 0,
|
||||||
|
cache_read_tokens: 0,
|
||||||
},
|
},
|
||||||
))
|
))
|
||||||
// Third response: final acknowledgment
|
// Third response: final acknowledgment
|
||||||
|
|||||||
@@ -60,6 +60,8 @@ fn default_usage() -> Usage {
|
|||||||
prompt_tokens: 0,
|
prompt_tokens: 0,
|
||||||
completion_tokens: 0,
|
completion_tokens: 0,
|
||||||
total_tokens: 0,
|
total_tokens: 0,
|
||||||
|
cache_creation_tokens: 0,
|
||||||
|
cache_read_tokens: 0,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -169,6 +171,8 @@ impl LLMProvider for MockStreamingProvider {
|
|||||||
prompt_tokens: 100,
|
prompt_tokens: 100,
|
||||||
completion_tokens: 50,
|
completion_tokens: 50,
|
||||||
total_tokens: 150,
|
total_tokens: 150,
|
||||||
|
cache_creation_tokens: 0,
|
||||||
|
cache_read_tokens: 0,
|
||||||
}),
|
}),
|
||||||
stop_reason: Some("end_turn".to_string()),
|
stop_reason: Some("end_turn".to_string()),
|
||||||
tool_call_streaming: None,
|
tool_call_streaming: None,
|
||||||
@@ -201,6 +205,8 @@ impl LLMProvider for MockStreamingProvider {
|
|||||||
prompt_tokens: 50,
|
prompt_tokens: 50,
|
||||||
completion_tokens: 10,
|
completion_tokens: 10,
|
||||||
total_tokens: 60,
|
total_tokens: 60,
|
||||||
|
cache_creation_tokens: 0,
|
||||||
|
cache_read_tokens: 0,
|
||||||
}),
|
}),
|
||||||
stop_reason: Some("end_turn".to_string()),
|
stop_reason: Some("end_turn".to_string()),
|
||||||
tool_call_streaming: None,
|
tool_call_streaming: None,
|
||||||
@@ -407,6 +413,8 @@ async fn test_finished_signal_terminates_stream() {
|
|||||||
prompt_tokens: 0,
|
prompt_tokens: 0,
|
||||||
completion_tokens: 0,
|
completion_tokens: 0,
|
||||||
total_tokens: 0,
|
total_tokens: 0,
|
||||||
|
cache_creation_tokens: 0,
|
||||||
|
cache_read_tokens: 0,
|
||||||
},
|
},
|
||||||
model: "simple".to_string(),
|
model: "simple".to_string(),
|
||||||
})
|
})
|
||||||
@@ -439,6 +447,8 @@ async fn test_finished_signal_terminates_stream() {
|
|||||||
prompt_tokens: 10,
|
prompt_tokens: 10,
|
||||||
completion_tokens: 10,
|
completion_tokens: 10,
|
||||||
total_tokens: 20,
|
total_tokens: 20,
|
||||||
|
cache_creation_tokens: 0,
|
||||||
|
cache_read_tokens: 0,
|
||||||
}),
|
}),
|
||||||
stop_reason: Some("end_turn".to_string()),
|
stop_reason: Some("end_turn".to_string()),
|
||||||
tool_call_streaming: None,
|
tool_call_streaming: None,
|
||||||
|
|||||||
@@ -38,6 +38,8 @@ fn test_update_usage_only_affects_cumulative() {
|
|||||||
prompt_tokens: 100,
|
prompt_tokens: 100,
|
||||||
completion_tokens: 50,
|
completion_tokens: 50,
|
||||||
total_tokens: 150,
|
total_tokens: 150,
|
||||||
|
cache_creation_tokens: 0,
|
||||||
|
cache_read_tokens: 0,
|
||||||
};
|
};
|
||||||
window.update_usage_from_response(&usage);
|
window.update_usage_from_response(&usage);
|
||||||
|
|
||||||
@@ -52,6 +54,8 @@ fn test_update_usage_only_affects_cumulative() {
|
|||||||
prompt_tokens: 200,
|
prompt_tokens: 200,
|
||||||
completion_tokens: 75,
|
completion_tokens: 75,
|
||||||
total_tokens: 275,
|
total_tokens: 275,
|
||||||
|
cache_creation_tokens: 0,
|
||||||
|
cache_read_tokens: 0,
|
||||||
};
|
};
|
||||||
window.update_usage_from_response(&usage2);
|
window.update_usage_from_response(&usage2);
|
||||||
|
|
||||||
@@ -156,6 +160,8 @@ fn test_cumulative_vs_used_independence() {
|
|||||||
prompt_tokens: 500,
|
prompt_tokens: 500,
|
||||||
completion_tokens: 200,
|
completion_tokens: 200,
|
||||||
total_tokens: 700,
|
total_tokens: 700,
|
||||||
|
cache_creation_tokens: 0,
|
||||||
|
cache_read_tokens: 0,
|
||||||
};
|
};
|
||||||
window.update_usage_from_response(&usage);
|
window.update_usage_from_response(&usage);
|
||||||
|
|
||||||
|
|||||||
@@ -464,6 +464,10 @@ impl AnthropicProvider {
|
|||||||
completion_tokens: usage.output_tokens,
|
completion_tokens: usage.output_tokens,
|
||||||
total_tokens: usage.input_tokens
|
total_tokens: usage.input_tokens
|
||||||
+ usage.output_tokens,
|
+ usage.output_tokens,
|
||||||
|
cache_creation_tokens: usage
|
||||||
|
.cache_creation_input_tokens,
|
||||||
|
cache_read_tokens: usage
|
||||||
|
.cache_read_input_tokens,
|
||||||
});
|
});
|
||||||
debug!(
|
debug!(
|
||||||
"Captured usage from message_start: {:?}",
|
"Captured usage from message_start: {:?}",
|
||||||
@@ -739,6 +743,8 @@ impl LLMProvider for AnthropicProvider {
|
|||||||
completion_tokens: anthropic_response.usage.output_tokens,
|
completion_tokens: anthropic_response.usage.output_tokens,
|
||||||
total_tokens: anthropic_response.usage.input_tokens
|
total_tokens: anthropic_response.usage.input_tokens
|
||||||
+ anthropic_response.usage.output_tokens,
|
+ anthropic_response.usage.output_tokens,
|
||||||
|
cache_creation_tokens: anthropic_response.usage.cache_creation_input_tokens,
|
||||||
|
cache_read_tokens: anthropic_response.usage.cache_read_input_tokens,
|
||||||
};
|
};
|
||||||
|
|
||||||
debug!(
|
debug!(
|
||||||
@@ -945,6 +951,12 @@ struct AnthropicResponse {
|
|||||||
struct AnthropicUsage {
|
struct AnthropicUsage {
|
||||||
input_tokens: u32,
|
input_tokens: u32,
|
||||||
output_tokens: u32,
|
output_tokens: u32,
|
||||||
|
/// Tokens written to cache when creating a new cache entry
|
||||||
|
#[serde(default)]
|
||||||
|
cache_creation_input_tokens: u32,
|
||||||
|
/// Tokens retrieved from cache (cache hit)
|
||||||
|
#[serde(default)]
|
||||||
|
cache_read_input_tokens: u32,
|
||||||
}
|
}
|
||||||
|
|
||||||
// Streaming response structures
|
// Streaming response structures
|
||||||
|
|||||||
@@ -763,6 +763,8 @@ impl LLMProvider for DatabricksProvider {
|
|||||||
prompt_tokens: databricks_response.usage.prompt_tokens,
|
prompt_tokens: databricks_response.usage.prompt_tokens,
|
||||||
completion_tokens: databricks_response.usage.completion_tokens,
|
completion_tokens: databricks_response.usage.completion_tokens,
|
||||||
total_tokens: databricks_response.usage.total_tokens,
|
total_tokens: databricks_response.usage.total_tokens,
|
||||||
|
cache_creation_tokens: 0, // Databricks doesn't support prompt caching
|
||||||
|
cache_read_tokens: 0,
|
||||||
};
|
};
|
||||||
|
|
||||||
debug!(
|
debug!(
|
||||||
|
|||||||
@@ -531,6 +531,8 @@ impl LLMProvider for EmbeddedProvider {
|
|||||||
prompt_tokens,
|
prompt_tokens,
|
||||||
completion_tokens,
|
completion_tokens,
|
||||||
total_tokens: prompt_tokens + completion_tokens,
|
total_tokens: prompt_tokens + completion_tokens,
|
||||||
|
cache_creation_tokens: 0, // Embedded models don't support prompt caching
|
||||||
|
cache_read_tokens: 0,
|
||||||
},
|
},
|
||||||
model: self.model_name.clone(),
|
model: self.model_name.clone(),
|
||||||
})
|
})
|
||||||
|
|||||||
@@ -196,6 +196,12 @@ pub struct Usage {
|
|||||||
pub prompt_tokens: u32,
|
pub prompt_tokens: u32,
|
||||||
pub completion_tokens: u32,
|
pub completion_tokens: u32,
|
||||||
pub total_tokens: u32,
|
pub total_tokens: u32,
|
||||||
|
/// Tokens written to cache (Anthropic: cache_creation_input_tokens)
|
||||||
|
#[serde(default)]
|
||||||
|
pub cache_creation_tokens: u32,
|
||||||
|
/// Tokens read from cache (Anthropic: cache_read_input_tokens, OpenAI: cached_tokens)
|
||||||
|
#[serde(default)]
|
||||||
|
pub cache_read_tokens: u32,
|
||||||
}
|
}
|
||||||
|
|
||||||
pub type CompletionStream = tokio_stream::wrappers::ReceiverStream<Result<CompletionChunk>>;
|
pub type CompletionStream = tokio_stream::wrappers::ReceiverStream<Result<CompletionChunk>>;
|
||||||
|
|||||||
@@ -120,6 +120,8 @@ impl MockResponse {
|
|||||||
prompt_tokens: 100,
|
prompt_tokens: 100,
|
||||||
completion_tokens: content.len() as u32 / 4,
|
completion_tokens: content.len() as u32 / 4,
|
||||||
total_tokens: 100 + content.len() as u32 / 4,
|
total_tokens: 100 + content.len() as u32 / 4,
|
||||||
|
cache_creation_tokens: 0,
|
||||||
|
cache_read_tokens: 0,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -139,6 +141,8 @@ impl MockResponse {
|
|||||||
prompt_tokens: 100,
|
prompt_tokens: 100,
|
||||||
completion_tokens: total_content.len() as u32 / 4,
|
completion_tokens: total_content.len() as u32 / 4,
|
||||||
total_tokens: 100 + total_content.len() as u32 / 4,
|
total_tokens: 100 + total_content.len() as u32 / 4,
|
||||||
|
cache_creation_tokens: 0,
|
||||||
|
cache_read_tokens: 0,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -155,6 +159,8 @@ impl MockResponse {
|
|||||||
prompt_tokens: 100,
|
prompt_tokens: 100,
|
||||||
completion_tokens: 50,
|
completion_tokens: 50,
|
||||||
total_tokens: 150,
|
total_tokens: 150,
|
||||||
|
cache_creation_tokens: 0,
|
||||||
|
cache_read_tokens: 0,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -172,6 +178,8 @@ impl MockResponse {
|
|||||||
prompt_tokens: 100,
|
prompt_tokens: 100,
|
||||||
completion_tokens: 50 + text.len() as u32 / 4,
|
completion_tokens: 50 + text.len() as u32 / 4,
|
||||||
total_tokens: 150 + text.len() as u32 / 4,
|
total_tokens: 150 + text.len() as u32 / 4,
|
||||||
|
cache_creation_tokens: 0,
|
||||||
|
cache_read_tokens: 0,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -192,6 +200,8 @@ impl MockResponse {
|
|||||||
prompt_tokens: 100,
|
prompt_tokens: 100,
|
||||||
completion_tokens: 100,
|
completion_tokens: 100,
|
||||||
total_tokens: 200,
|
total_tokens: 200,
|
||||||
|
cache_creation_tokens: 0,
|
||||||
|
cache_read_tokens: 0,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -215,6 +225,8 @@ impl MockResponse {
|
|||||||
prompt_tokens: 100,
|
prompt_tokens: 100,
|
||||||
completion_tokens: full_content.len() as u32 / 4,
|
completion_tokens: full_content.len() as u32 / 4,
|
||||||
total_tokens: 100 + full_content.len() as u32 / 4,
|
total_tokens: 100 + full_content.len() as u32 / 4,
|
||||||
|
cache_creation_tokens: 0,
|
||||||
|
cache_read_tokens: 0,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -230,6 +242,8 @@ impl MockResponse {
|
|||||||
prompt_tokens: 100,
|
prompt_tokens: 100,
|
||||||
completion_tokens: content.len() as u32 / 4,
|
completion_tokens: content.len() as u32 / 4,
|
||||||
total_tokens: 100 + content.len() as u32 / 4,
|
total_tokens: 100 + content.len() as u32 / 4,
|
||||||
|
cache_creation_tokens: 0,
|
||||||
|
cache_read_tokens: 0,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -220,6 +220,12 @@ impl OpenAIProvider {
|
|||||||
prompt_tokens: usage.prompt_tokens,
|
prompt_tokens: usage.prompt_tokens,
|
||||||
completion_tokens: usage.completion_tokens,
|
completion_tokens: usage.completion_tokens,
|
||||||
total_tokens: usage.total_tokens,
|
total_tokens: usage.total_tokens,
|
||||||
|
cache_creation_tokens: 0, // OpenAI doesn't report cache creation
|
||||||
|
cache_read_tokens: usage
|
||||||
|
.prompt_tokens_details
|
||||||
|
.as_ref()
|
||||||
|
.map(|d| d.cached_tokens)
|
||||||
|
.unwrap_or(0),
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -306,6 +312,13 @@ impl LLMProvider for OpenAIProvider {
|
|||||||
prompt_tokens: openai_response.usage.prompt_tokens,
|
prompt_tokens: openai_response.usage.prompt_tokens,
|
||||||
completion_tokens: openai_response.usage.completion_tokens,
|
completion_tokens: openai_response.usage.completion_tokens,
|
||||||
total_tokens: openai_response.usage.total_tokens,
|
total_tokens: openai_response.usage.total_tokens,
|
||||||
|
cache_creation_tokens: 0, // OpenAI doesn't report cache creation
|
||||||
|
cache_read_tokens: openai_response
|
||||||
|
.usage
|
||||||
|
.prompt_tokens_details
|
||||||
|
.as_ref()
|
||||||
|
.map(|d| d.cached_tokens)
|
||||||
|
.unwrap_or(0),
|
||||||
};
|
};
|
||||||
|
|
||||||
debug!(
|
debug!(
|
||||||
@@ -495,6 +508,16 @@ struct OpenAIUsage {
|
|||||||
prompt_tokens: u32,
|
prompt_tokens: u32,
|
||||||
completion_tokens: u32,
|
completion_tokens: u32,
|
||||||
total_tokens: u32,
|
total_tokens: u32,
|
||||||
|
/// Detailed breakdown of prompt tokens including cache info
|
||||||
|
#[serde(default)]
|
||||||
|
prompt_tokens_details: Option<OpenAIPromptTokensDetails>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Deserialize, Default)]
|
||||||
|
struct OpenAIPromptTokensDetails {
|
||||||
|
/// Tokens retrieved from cache (cache hit)
|
||||||
|
#[serde(default)]
|
||||||
|
cached_tokens: u32,
|
||||||
}
|
}
|
||||||
|
|
||||||
// Streaming response structures
|
// Streaming response structures
|
||||||
|
|||||||
Reference in New Issue
Block a user