Add prompt cache statistics tracking to /stats command
- Extend Usage struct with cache_creation_tokens and cache_read_tokens fields
- Parse Anthropic cache_creation_input_tokens and cache_read_input_tokens
- Parse OpenAI prompt_tokens_details.cached_tokens for automatic prefix caching
- Add CacheStats struct to Agent for cumulative tracking across API calls (a sketch follows below)
- Add "Prompt Cache Statistics" section to /stats output showing:
  - API call count and cache hit count
  - Hit rate percentage
  - Total input tokens and cache read/creation tokens
  - Cache efficiency (% of input served from cache)
- Update all provider implementations and test files
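The CacheStats struct and the /stats arithmetic described above are not visible in the hunks below. A minimal sketch of what the cumulative tracking could look like, assuming hypothetical names (CacheStats, record_usage, hit_rate, efficiency) on top of the Usage fields this commit adds; this is an illustration, not the actual implementation:

/// Hypothetical cumulative prompt-cache statistics kept on the Agent.
/// Names and layout are assumptions, not the committed code.
#[derive(Debug, Default, Clone)]
pub struct CacheStats {
    pub api_calls: u64,
    pub cache_hits: u64,
    pub total_input_tokens: u64,
    pub cache_read_tokens: u64,
    pub cache_creation_tokens: u64,
}

impl CacheStats {
    /// Fold one completion's Usage into the running totals.
    pub fn record_usage(&mut self, usage: &Usage) {
        self.api_calls += 1;
        if usage.cache_read_tokens > 0 {
            self.cache_hits += 1;
        }
        self.total_input_tokens += u64::from(usage.prompt_tokens);
        self.cache_read_tokens += u64::from(usage.cache_read_tokens);
        self.cache_creation_tokens += u64::from(usage.cache_creation_tokens);
    }

    /// Hit rate: percentage of API calls that read anything from cache.
    pub fn hit_rate(&self) -> f64 {
        if self.api_calls == 0 {
            return 0.0;
        }
        self.cache_hits as f64 / self.api_calls as f64 * 100.0
    }

    /// Cache efficiency: percentage of input tokens served from cache.
    pub fn efficiency(&self) -> f64 {
        if self.total_input_tokens == 0 {
            return 0.0;
        }
        self.cache_read_tokens as f64 / self.total_input_tokens as f64 * 100.0
    }
}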
@@ -464,6 +464,10 @@ impl AnthropicProvider {
     completion_tokens: usage.output_tokens,
     total_tokens: usage.input_tokens
         + usage.output_tokens,
+    cache_creation_tokens: usage
+        .cache_creation_input_tokens,
+    cache_read_tokens: usage
+        .cache_read_input_tokens,
 });
 debug!(
     "Captured usage from message_start: {:?}",
@@ -739,6 +743,8 @@ impl LLMProvider for AnthropicProvider {
     completion_tokens: anthropic_response.usage.output_tokens,
     total_tokens: anthropic_response.usage.input_tokens
         + anthropic_response.usage.output_tokens,
+    cache_creation_tokens: anthropic_response.usage.cache_creation_input_tokens,
+    cache_read_tokens: anthropic_response.usage.cache_read_input_tokens,
 };

 debug!(
@@ -945,6 +951,12 @@ struct AnthropicResponse {
 struct AnthropicUsage {
     input_tokens: u32,
     output_tokens: u32,
+    /// Tokens written to cache when creating a new cache entry
+    #[serde(default)]
+    cache_creation_input_tokens: u32,
+    /// Tokens retrieved from cache (cache hit)
+    #[serde(default)]
+    cache_read_input_tokens: u32,
 }

 // Streaming response structures
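The #[serde(default)] attributes matter because Anthropic only includes the cache fields when prompt caching is in play. An illustrative check (field names follow the Messages API usage object, values are invented) of how both shapes deserialize:

// Both payloads parse into AnthropicUsage; missing cache fields default to 0.
let with_cache = r#"{
    "input_tokens": 11,
    "output_tokens": 256,
    "cache_creation_input_tokens": 0,
    "cache_read_input_tokens": 2048
}"#;
let without_cache = r#"{"input_tokens": 11, "output_tokens": 256}"#;

let hit: AnthropicUsage = serde_json::from_str(with_cache).unwrap();
let miss: AnthropicUsage = serde_json::from_str(without_cache).unwrap();
assert_eq!(hit.cache_read_input_tokens, 2048);
assert_eq!(miss.cache_read_input_tokens, 0);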
@@ -763,6 +763,8 @@ impl LLMProvider for DatabricksProvider {
     prompt_tokens: databricks_response.usage.prompt_tokens,
     completion_tokens: databricks_response.usage.completion_tokens,
     total_tokens: databricks_response.usage.total_tokens,
+    cache_creation_tokens: 0, // Databricks doesn't support prompt caching
+    cache_read_tokens: 0,
 };

 debug!(
@@ -531,6 +531,8 @@ impl LLMProvider for EmbeddedProvider {
         prompt_tokens,
         completion_tokens,
         total_tokens: prompt_tokens + completion_tokens,
+        cache_creation_tokens: 0, // Embedded models don't support prompt caching
+        cache_read_tokens: 0,
     },
     model: self.model_name.clone(),
 })
@@ -196,6 +196,12 @@ pub struct Usage {
     pub prompt_tokens: u32,
     pub completion_tokens: u32,
     pub total_tokens: u32,
+    /// Tokens written to cache (Anthropic: cache_creation_input_tokens)
+    #[serde(default)]
+    pub cache_creation_tokens: u32,
+    /// Tokens read from cache (Anthropic: cache_read_input_tokens, OpenAI: cached_tokens)
+    #[serde(default)]
+    pub cache_read_tokens: u32,
 }

 pub type CompletionStream = tokio_stream::wrappers::ReceiverStream<Result<CompletionChunk>>;
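The "Prompt Cache Statistics" section of /stats is likewise not in these hunks; a rough sketch of how it might be rendered from the CacheStats sketched earlier (labels and layout are assumed, only the tracked quantities come from this commit):

// Hypothetical rendering of the /stats section; the real output may differ.
fn format_cache_stats(stats: &CacheStats) -> String {
    format!(
        "Prompt Cache Statistics\n\
         API calls: {}\n\
         Cache hits: {} ({:.1}% hit rate)\n\
         Input tokens: {}\n\
         Cache read tokens: {}\n\
         Cache creation tokens: {}\n\
         Cache efficiency: {:.1}% of input served from cache",
        stats.api_calls,
        stats.cache_hits,
        stats.hit_rate(),
        stats.total_input_tokens,
        stats.cache_read_tokens,
        stats.cache_creation_tokens,
        stats.efficiency(),
    )
}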
@@ -120,6 +120,8 @@ impl MockResponse {
             prompt_tokens: 100,
             completion_tokens: content.len() as u32 / 4,
             total_tokens: 100 + content.len() as u32 / 4,
+            cache_creation_tokens: 0,
+            cache_read_tokens: 0,
         },
     }
 }

@@ -139,6 +141,8 @@ impl MockResponse {
             prompt_tokens: 100,
             completion_tokens: total_content.len() as u32 / 4,
             total_tokens: 100 + total_content.len() as u32 / 4,
+            cache_creation_tokens: 0,
+            cache_read_tokens: 0,
         },
     }
 }

@@ -155,6 +159,8 @@ impl MockResponse {
             prompt_tokens: 100,
             completion_tokens: 50,
             total_tokens: 150,
+            cache_creation_tokens: 0,
+            cache_read_tokens: 0,
         },
     }
 }

@@ -172,6 +178,8 @@ impl MockResponse {
             prompt_tokens: 100,
             completion_tokens: 50 + text.len() as u32 / 4,
             total_tokens: 150 + text.len() as u32 / 4,
+            cache_creation_tokens: 0,
+            cache_read_tokens: 0,
         },
     }
 }

@@ -192,6 +200,8 @@ impl MockResponse {
             prompt_tokens: 100,
             completion_tokens: 100,
             total_tokens: 200,
+            cache_creation_tokens: 0,
+            cache_read_tokens: 0,
         },
     }
 }

@@ -215,6 +225,8 @@ impl MockResponse {
             prompt_tokens: 100,
             completion_tokens: full_content.len() as u32 / 4,
             total_tokens: 100 + full_content.len() as u32 / 4,
+            cache_creation_tokens: 0,
+            cache_read_tokens: 0,
         },
     }
 }

@@ -230,6 +242,8 @@ impl MockResponse {
             prompt_tokens: 100,
             completion_tokens: content.len() as u32 / 4,
             total_tokens: 100 + content.len() as u32 / 4,
+            cache_creation_tokens: 0,
+            cache_read_tokens: 0,
         },
     }
 }
@@ -220,6 +220,12 @@ impl OpenAIProvider {
             prompt_tokens: usage.prompt_tokens,
             completion_tokens: usage.completion_tokens,
             total_tokens: usage.total_tokens,
+            cache_creation_tokens: 0, // OpenAI doesn't report cache creation
+            cache_read_tokens: usage
+                .prompt_tokens_details
+                .as_ref()
+                .map(|d| d.cached_tokens)
+                .unwrap_or(0),
         });
     }
 }
@@ -306,6 +312,13 @@ impl LLMProvider for OpenAIProvider {
         prompt_tokens: openai_response.usage.prompt_tokens,
         completion_tokens: openai_response.usage.completion_tokens,
         total_tokens: openai_response.usage.total_tokens,
+        cache_creation_tokens: 0, // OpenAI doesn't report cache creation
+        cache_read_tokens: openai_response
+            .usage
+            .prompt_tokens_details
+            .as_ref()
+            .map(|d| d.cached_tokens)
+            .unwrap_or(0),
     };

     debug!(
@@ -495,6 +508,16 @@ struct OpenAIUsage {
     prompt_tokens: u32,
     completion_tokens: u32,
     total_tokens: u32,
+    /// Detailed breakdown of prompt tokens including cache info
+    #[serde(default)]
+    prompt_tokens_details: Option<OpenAIPromptTokensDetails>,
 }

+#[derive(Debug, Deserialize, Default)]
+struct OpenAIPromptTokensDetails {
+    /// Tokens retrieved from cache (cache hit)
+    #[serde(default)]
+    cached_tokens: u32,
+}
+
 // Streaming response structures
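As on the Anthropic side, prompt_tokens_details is optional: OpenAI only reports cached_tokens once automatic prefix caching applies, so the field deserializes to None otherwise. An illustrative check (values invented):

// Usage block with and without the cached_tokens detail; both deserialize.
let cached = r#"{
    "prompt_tokens": 2200,
    "completion_tokens": 80,
    "total_tokens": 2280,
    "prompt_tokens_details": { "cached_tokens": 2048 }
}"#;
let uncached = r#"{"prompt_tokens": 2200, "completion_tokens": 80, "total_tokens": 2280}"#;

let a: OpenAIUsage = serde_json::from_str(cached).unwrap();
let b: OpenAIUsage = serde_json::from_str(uncached).unwrap();
assert_eq!(a.prompt_tokens_details.map(|d| d.cached_tokens), Some(2048));
assert!(b.prompt_tokens_details.is_none());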