Add prompt cache statistics tracking to /stats command
- Extend Usage struct with cache_creation_tokens and cache_read_tokens fields
- Parse Anthropic cache_creation_input_tokens and cache_read_input_tokens
- Parse OpenAI prompt_tokens_details.cached_tokens for automatic prefix caching
- Add CacheStats struct to Agent for cumulative tracking across API calls
- Add "Prompt Cache Statistics" section to /stats output showing:
  - API call count and cache hit count
  - Hit rate percentage
  - Total input tokens and cache read/creation tokens
  - Cache efficiency (% of input served from cache)
- Update all provider implementations and test files
This commit is contained in:
@@ -57,6 +57,8 @@ fn finished_chunk() -> CompletionChunk {
|
||||
prompt_tokens: 100,
|
||||
completion_tokens: 50,
|
||||
total_tokens: 150,
|
||||
cache_creation_tokens: 0,
|
||||
cache_read_tokens: 0,
|
||||
}),
|
||||
}
|
||||
}
|
||||
@@ -697,6 +699,8 @@ async fn test_agent_json_fallback_executes() {
|
||||
prompt_tokens: 100,
|
||||
completion_tokens: 50,
|
||||
total_tokens: 150,
|
||||
cache_creation_tokens: 0,
|
||||
cache_read_tokens: 0,
|
||||
},
|
||||
))
|
||||
.with_default_response(MockResponse::text("Done."));
|
||||
@@ -800,6 +804,8 @@ async fn test_tool_result_with_json_not_parsed() {
|
||||
prompt_tokens: 100,
|
||||
completion_tokens: 50,
|
||||
total_tokens: 150,
|
||||
cache_creation_tokens: 0,
|
||||
cache_read_tokens: 0,
|
||||
},
|
||||
))
|
||||
// Second response: LLM acknowledges the file content
|
||||
|
||||
@@ -674,6 +674,8 @@ async fn test_multiple_tools_in_single_response_all_executed() {
|
||||
prompt_tokens: 100,
|
||||
completion_tokens: 100,
|
||||
total_tokens: 200,
|
||||
cache_creation_tokens: 0,
|
||||
cache_read_tokens: 0,
|
||||
},
|
||||
))
|
||||
.with_default_response(MockResponse::text("Both commands executed."));
|
||||
@@ -774,6 +776,8 @@ async fn test_llm_repeats_text_before_each_tool_call() {
|
||||
prompt_tokens: 100,
|
||||
completion_tokens: 50,
|
||||
total_tokens: 150,
|
||||
cache_creation_tokens: 0,
|
||||
cache_read_tokens: 0,
|
||||
},
|
||||
))
|
||||
// Second response: SAME preamble + tool call 2
|
||||
@@ -789,6 +793,8 @@ async fn test_llm_repeats_text_before_each_tool_call() {
|
||||
prompt_tokens: 150,
|
||||
completion_tokens: 50,
|
||||
total_tokens: 200,
|
||||
cache_creation_tokens: 0,
|
||||
cache_read_tokens: 0,
|
||||
},
|
||||
))
|
||||
// Third response: final acknowledgment
|
||||
|
||||
@@ -60,6 +60,8 @@ fn default_usage() -> Usage {
|
||||
prompt_tokens: 0,
|
||||
completion_tokens: 0,
|
||||
total_tokens: 0,
|
||||
cache_creation_tokens: 0,
|
||||
cache_read_tokens: 0,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -169,6 +171,8 @@ impl LLMProvider for MockStreamingProvider {
|
||||
prompt_tokens: 100,
|
||||
completion_tokens: 50,
|
||||
total_tokens: 150,
|
||||
cache_creation_tokens: 0,
|
||||
cache_read_tokens: 0,
|
||||
}),
|
||||
stop_reason: Some("end_turn".to_string()),
|
||||
tool_call_streaming: None,
|
||||
@@ -201,6 +205,8 @@ impl LLMProvider for MockStreamingProvider {
|
||||
prompt_tokens: 50,
|
||||
completion_tokens: 10,
|
||||
total_tokens: 60,
|
||||
cache_creation_tokens: 0,
|
||||
cache_read_tokens: 0,
|
||||
}),
|
||||
stop_reason: Some("end_turn".to_string()),
|
||||
tool_call_streaming: None,
|
||||
@@ -407,6 +413,8 @@ async fn test_finished_signal_terminates_stream() {
|
||||
prompt_tokens: 0,
|
||||
completion_tokens: 0,
|
||||
total_tokens: 0,
|
||||
cache_creation_tokens: 0,
|
||||
cache_read_tokens: 0,
|
||||
},
|
||||
model: "simple".to_string(),
|
||||
})
|
||||
@@ -439,6 +447,8 @@ async fn test_finished_signal_terminates_stream() {
|
||||
prompt_tokens: 10,
|
||||
completion_tokens: 10,
|
||||
total_tokens: 20,
|
||||
cache_creation_tokens: 0,
|
||||
cache_read_tokens: 0,
|
||||
}),
|
||||
stop_reason: Some("end_turn".to_string()),
|
||||
tool_call_streaming: None,
|
||||
|
||||
@@ -38,6 +38,8 @@ fn test_update_usage_only_affects_cumulative() {
|
||||
prompt_tokens: 100,
|
||||
completion_tokens: 50,
|
||||
total_tokens: 150,
|
||||
cache_creation_tokens: 0,
|
||||
cache_read_tokens: 0,
|
||||
};
|
||||
window.update_usage_from_response(&usage);
|
||||
|
||||
@@ -52,6 +54,8 @@ fn test_update_usage_only_affects_cumulative() {
|
||||
prompt_tokens: 200,
|
||||
completion_tokens: 75,
|
||||
total_tokens: 275,
|
||||
cache_creation_tokens: 0,
|
||||
cache_read_tokens: 0,
|
||||
};
|
||||
window.update_usage_from_response(&usage2);
|
||||
|
||||
@@ -156,6 +160,8 @@ fn test_cumulative_vs_used_independence() {
|
||||
prompt_tokens: 500,
|
||||
completion_tokens: 200,
|
||||
total_tokens: 700,
|
||||
cache_creation_tokens: 0,
|
||||
cache_read_tokens: 0,
|
||||
};
|
||||
window.update_usage_from_response(&usage);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user