respect context length for anthropic

Use the context length from the config rather than hard-coded values, both when determining the provider context window and when budgeting tokens for summarization.
Jochen
2025-11-06 15:07:46 +11:00
parent cef234d91a
commit af20c93c61
2 changed files with 157 additions and 14 deletions
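
For orientation, the diff below reads fields like config.providers.anthropic.as_ref()?.max_tokens and config.agent.max_context_length. A minimal sketch of the config shape those accesses imply follows; only the field names that appear in the diff are known, everything else (struct names, visibility, exact types) is an assumption:

// Hypothetical config shape implied by the field accesses in this diff.
pub struct Config {
    pub providers: ProvidersConfig,
    pub agent: AgentConfig,
}

pub struct ProvidersConfig {
    pub anthropic: Option<ProviderConfig>,
    pub openai: Option<ProviderConfig>,
    pub databricks: Option<ProviderConfig>,
    pub embedded: Option<ProviderConfig>,
}

pub struct ProviderConfig {
    // Optional per-provider context-length override, in tokens.
    pub max_tokens: Option<u32>,
}

pub struct AgentConfig {
    // Agent-level fallback used when no provider-specific value applies.
    pub max_context_length: usize,
}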


@@ -921,11 +921,28 @@ impl<W: UiWriter> Agent<W> {
     }
 
     fn determine_context_length(config: &Config, providers: &ProviderRegistry) -> Result<u32> {
+        // Get the configured max_tokens for the current provider
+        fn get_provider_max_tokens(config: &Config, provider_name: &str) -> Option<u32> {
+            match provider_name {
+                "anthropic" => config.providers.anthropic.as_ref()?.max_tokens,
+                "openai" => config.providers.openai.as_ref()?.max_tokens,
+                "databricks" => config.providers.databricks.as_ref()?.max_tokens,
+                "embedded" => config.providers.embedded.as_ref()?.max_tokens,
+                _ => None,
+            }
+        }
+
         // Get the active provider to determine context length
         let provider = providers.get(None)?;
         let provider_name = provider.name();
         let model_name = provider.model();
 
+        // Check if there's a configured context length override first
+        if let Some(max_tokens) = get_provider_max_tokens(config, provider_name) {
+            debug!("Using configured max_tokens for {}: {}", provider_name, max_tokens);
+            return Ok(max_tokens);
+        }
+
         // Use provider-specific context length if available, otherwise fall back to agent config
         let context_length = match provider_name {
             "embedded" => {
@@ -950,17 +967,21 @@ impl<W: UiWriter> Agent<W> {
             }
             "anthropic" => {
                 // Claude models have large context windows
-                200000 // Default for Claude models
+                // Use configured max_tokens or fall back to default
+                get_provider_max_tokens(config, "anthropic").unwrap_or(200000)
             }
             "databricks" => {
                 // Databricks models have varying context windows depending on the model
-                if model_name.contains("claude") {
-                    200000 // Claude models on Databricks have large context windows
-                } else if model_name.contains("llama") || model_name.contains("dbrx") {
-                    32768 // DBRX supports 32k context
-                } else {
-                    16384 // Conservative default for other Databricks models
-                }
+                // Use configured max_tokens or fall back to model-specific defaults
+                get_provider_max_tokens(config, "databricks").unwrap_or_else(|| {
+                    if model_name.contains("claude") {
+                        200000 // Claude models on Databricks have large context windows
+                    } else if model_name.contains("llama") || model_name.contains("dbrx") {
+                        32768 // DBRX supports 32k context
+                    } else {
+                        16384 // Conservative default for other Databricks models
+                    }
+                })
             }
             _ => config.agent.max_context_length as u32,
         };
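
Taken together, the two hunks above give determine_context_length a three-step precedence: an explicitly configured max_tokens for the active provider wins, then the provider- and model-specific defaults, then the agent-level fallback. A condensed editor's sketch of that ladder (not part of the commit; the "embedded" arm is elided because its body is cut off in this view, and the match is evaluated eagerly here for brevity):

// Resolution order after this commit, condensed from the diff above.
let context_length = get_provider_max_tokens(config, provider_name) // 1. explicit config
    .unwrap_or(match provider_name {
        "anthropic" => 200_000,                                      // 2. provider default
        "databricks" if model_name.contains("claude") => 200_000,
        "databricks" if model_name.contains("llama")
            || model_name.contains("dbrx") => 32_768,
        "databricks" => 16_384,
        _ => config.agent.max_context_length as u32,                 // 3. agent-level fallback
    });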
@@ -1511,7 +1532,7 @@ Template:
         // Dynamically calculate max_tokens for summary based on what's left
         let summary_max_tokens = match provider.name() {
             "databricks" | "anthropic" => {
-                let model_limit = 200_000u32;
+                let model_limit = self.context_window.total_tokens;
                 let current_usage = self.context_window.used_tokens;
                 let available = model_limit
                     .saturating_sub(current_usage)
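
The effect of this one-line change is that the summary budget now tracks the configured window instead of assuming 200k. A quick worked example (illustrative numbers only, before any buffer is subtracted):

// Illustrative: a configured 100k window with 60k tokens already used.
let model_limit = 100_000u32;               // was hard-coded 200_000
let current_usage = 60_000u32;
let available = model_limit.saturating_sub(current_usage); // 40_000, not 140_000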
@@ -2394,6 +2415,28 @@ Template:
         // Check if we need to summarize before starting
         if self.context_window.should_summarize() {
+            // First try thinning if we haven't reached 90% yet
+            if self.context_window.percentage_used() < 90.0 && self.context_window.should_thin() {
+                self.ui_writer.print_context_status(&format!(
+                    "\n🥒 Context window at {}%. Trying thinning first...",
+                    self.context_window.percentage_used() as u32
+                ));
+
+                let (thin_summary, chars_saved) = self.context_window.thin_context();
+                self.thinning_events.push(chars_saved);
+                self.ui_writer.print_context_thinning(&thin_summary);
+
+                // Check if thinning was sufficient
+                if !self.context_window.should_summarize() {
+                    self.ui_writer.print_context_status("✅ Thinning resolved capacity issue. Continuing...\n");
+                    // Continue with the original request without summarization
+                } else {
+                    self.ui_writer.print_context_status("⚠️ Thinning insufficient. Proceeding with summarization...\n");
+                }
+            }
+
+            // Only proceed with summarization if still needed after thinning
+            if self.context_window.should_summarize() {
                 // Notify user about summarization
                 self.ui_writer.print_context_status(&format!(
                     "\n🗜️ Context window reaching capacity ({}%). Creating summary...",
@@ -2433,14 +2476,22 @@ Template:
                 // We need to ensure: used_tokens + max_tokens <= total_context_limit
                 let summary_max_tokens = match provider.name() {
                     "databricks" | "anthropic" => {
-                        // Claude models have 200k context
-                        // Calculate how much room we have left
-                        let model_limit = 200_000u32;
+                        // Use the actual configured context window size
+                        let model_limit = self.context_window.total_tokens;
                         let current_usage = self.context_window.used_tokens;
-                        // Leave some buffer (5k tokens) for safety
+
+                        // Check if we have enough capacity for summarization
+                        if current_usage >= model_limit.saturating_sub(1000) {
+                            error!("Context window at capacity ({}%), cannot summarize. Current: {}, Limit: {}",
+                                self.context_window.percentage_used(), current_usage, model_limit);
+                            return Err(anyhow::anyhow!("Context window at capacity. Try using /thinnify or /compact commands to reduce context size, or start a new session."));
+                        }
+
+                        // Leave buffer proportional to model size (min 1k, max 10k)
+                        let buffer = (model_limit / 40).clamp(1000, 10000); // 2.5% buffer
                         let available = model_limit
                             .saturating_sub(current_usage)
-                            .saturating_sub(5000);
+                            .saturating_sub(buffer);
                         // Cap at a reasonable summary size (10k tokens max)
                         Some(available.min(10_000))
                     }
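
The proportional buffer replaces the old flat 5k. Worked through for a few window sizes (editor's examples, illustrative values only):

// The 2.5% buffer rule from the hunk above, at three window sizes.
assert_eq!((200_000u32 / 40).clamp(1000, 10000), 5_000);    // 200k window: same as the old flat 5k
assert_eq!((32_768u32 / 40).clamp(1000, 10000), 1_000);     // 32k window: 819 clamped up to the 1k floor
assert_eq!((1_000_000u32 / 40).clamp(1000, 10000), 10_000); // very large window: capped at 10k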
@@ -2448,6 +2499,13 @@ Template:
                         // For smaller context models, be more conservative
                         let model_limit = self.context_window.total_tokens;
                         let current_usage = self.context_window.used_tokens;
+
+                        // Check capacity for embedded models too
+                        if current_usage >= model_limit.saturating_sub(500) {
+                            error!("Embedded model context window at capacity ({}%)", self.context_window.percentage_used());
+                            return Err(anyhow::anyhow!("Context window at capacity. Try using /thinnify command to reduce context size, or start a new session."));
+                        }
+
                         // Leave 1k buffer
                         let available = model_limit
                             .saturating_sub(current_usage)
@@ -2457,6 +2515,14 @@ Template:
                     }
                     _ => {
                         // Default: conservative approach
+                        let model_limit = self.context_window.total_tokens;
+                        let current_usage = self.context_window.used_tokens;
+
+                        if current_usage >= model_limit.saturating_sub(1000) {
+                            error!("Context window at capacity ({}%)", self.context_window.percentage_used());
+                            return Err(anyhow::anyhow!("Context window at capacity. Try using /thinnify or /compact commands, or start a new session."));
+                        }
+
                         let available = self.context_window.remaining_tokens().saturating_sub(2000);
                         Some(available.min(5000))
                     }
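
Recap of the three arms above; they differ only in their constants (the embedded arm's final cap is not visible in this view):

Arm                      refuses within       buffer                     summary cap
databricks | anthropic   1_000 of limit       limit/40, clamped 1k-10k   10_000
embedded                 500 of limit         1_000                      (cut off here)
_ (default)              1_000 of limit       2_000                      5_000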
@@ -2466,6 +2532,12 @@ Template:
                     "Requesting summary with max_tokens: {:?} (current usage: {} tokens)",
                     summary_max_tokens, self.context_window.used_tokens
                 );
+
+                // Final safety check
+                if summary_max_tokens.unwrap_or(0) == 0 {
+                    error!("No tokens available for summarization");
+                    return Err(anyhow::anyhow!("No context window capacity left for summarization. Use /thinnify to reduce context size or start a new session."));
+                }
 
                 let summary_request = CompletionRequest {
                     messages: summary_messages,
@@ -2507,6 +2579,7 @@ Template:
                     }
                 }
+            }
         }
 
         loop {
             iteration_count += 1;