Validate max_tokens for API calls, with fallbacks for summary requests

When the context window is full, max_tokens is often passed as 0 or a tiny value, and the LLM call fails. For Anthropic with extended thinking, max_tokens must also exceed the thinking budget (thinking.budget_tokens).
This can also happen during summary attempts; in that case, first try thinnify, then skinnify, and only then fall back to a hard-coded minimum.
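
A minimal sketch of the preflight rule this commit enforces (illustrative only; the function name, test values, and main() harness are made up for the example, while the 1024-token output buffer matches the implementation below):

// Sketch: when the context window is nearly full, the proposed max_tokens
// collapses toward 0; with Anthropic extended thinking it must instead stay
// strictly above thinking.budget_tokens, so we bump it up and flag that the
// context needs reduction (thinnify -> skinnify -> hard-coded minimum).

/// Returns (adjusted_max_tokens, needs_context_reduction).
fn preflight(proposed_max_tokens: u32, thinking_budget: Option<u32>) -> (u32, bool) {
    let Some(budget) = thinking_budget else {
        return (proposed_max_tokens, false); // no extended thinking configured
    };
    // Require >= budget + 1024 so the model has room for actual output.
    let minimum_required = budget + 1024;
    if proposed_max_tokens >= minimum_required {
        (proposed_max_tokens, false)
    } else {
        (minimum_required, true) // caller must reduce context, then retry
    }
}

fn main() {
    // Near-full context window: only 500 tokens of headroom were proposed.
    assert_eq!(preflight(500, Some(8_000)), (9_024, true));
    // Plenty of headroom: the proposal passes through unchanged.
    assert_eq!(preflight(16_000, Some(8_000)), (16_000, false));
}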
This commit is contained in:
Jochen
2025-12-09 10:15:32 +11:00
parent 48e6d594bc
commit 696c441a47
2 changed files with 434 additions and 106 deletions

@@ -1418,6 +1418,204 @@ impl<W: UiWriter> Agent<W> {
        }
    }
    /// Get the thinking budget tokens for the Anthropic provider, if configured
    fn get_thinking_budget_tokens(&self) -> Option<u32> {
        self.config
            .providers
            .anthropic
            .as_ref()
            .and_then(|c| c.thinking_budget_tokens)
    }
    /// Pre-flight check to validate and adjust max_tokens for the thinking.budget_tokens constraint.
    /// Returns the adjusted max_tokens that satisfies: max_tokens > thinking.budget_tokens.
    /// Also returns whether we need to apply fallback actions (thinnify/skinnify).
    ///
    /// Returns: (adjusted_max_tokens, needs_context_reduction)
    fn preflight_validate_max_tokens(
        &self,
        provider_name: &str,
        proposed_max_tokens: u32,
    ) -> (u32, bool) {
        // Only applies to the Anthropic provider with thinking enabled
        if provider_name != "anthropic" {
            return (proposed_max_tokens, false);
        }

        let budget_tokens = match self.get_thinking_budget_tokens() {
            Some(budget) => budget,
            None => return (proposed_max_tokens, false), // No thinking enabled
        };

        // Anthropic requires: max_tokens > budget_tokens.
        // We add a minimum output buffer of 1024 tokens for actual response content.
        let minimum_required = budget_tokens + 1024;

        if proposed_max_tokens >= minimum_required {
            // We have enough headroom
            (proposed_max_tokens, false)
        } else {
            // max_tokens is too low - we need to either adjust or reduce context
            warn!(
                "max_tokens ({}) is below required minimum ({}) for thinking.budget_tokens ({}). Context reduction needed.",
                proposed_max_tokens, minimum_required, budget_tokens
            );
            // Return the minimum required, but flag that we need context reduction
            (minimum_required, true)
        }
    }
    /// Calculate max_tokens for a summary request, ensuring it satisfies the thinking constraint.
    /// Returns (max_tokens, needs_context_reduction); when reduction is needed, the caller
    /// applies the fallback sequence: thinnify -> skinnify -> hard-coded minimum.
    fn calculate_summary_max_tokens(
        &mut self,
        provider_name: &str,
    ) -> (u32, bool) {
        let model_limit = self.context_window.total_tokens;
        let current_usage = self.context_window.used_tokens;

        // Calculate available tokens with buffer
        let buffer = (model_limit / 40).clamp(1000, 10000); // 2.5% buffer
        let available = model_limit
            .saturating_sub(current_usage)
            .saturating_sub(buffer);
        let proposed_max_tokens = available.min(10_000);

        // Validate against the thinking budget constraint
        let (adjusted, needs_reduction) =
            self.preflight_validate_max_tokens(provider_name, proposed_max_tokens);
        if !needs_reduction {
            return (adjusted, false);
        }

        // We need more headroom - the context is too full.
        // Return the adjusted value but flag that fallbacks are needed.
        (adjusted, true)
    }
    /// Apply the fallback sequence to free up context space for the thinking budget.
    /// Sequence: thinnify (first third) → skinnify (all) → hard-coded minimum.
    /// Returns the validated max_tokens that satisfies the thinking.budget_tokens constraint.
    fn apply_max_tokens_fallback_sequence(
        &mut self,
        provider_name: &str,
        initial_max_tokens: u32,
        hard_coded_minimum: u32,
    ) -> u32 {
        let (mut max_tokens, needs_reduction) =
            self.preflight_validate_max_tokens(provider_name, initial_max_tokens);
        if !needs_reduction {
            return max_tokens;
        }

        self.ui_writer.print_context_status(
            "⚠️ Context window too full for thinking budget. Applying fallback sequence...\n",
        );

        // Step 1: Try thinnify (first third of context)
        self.ui_writer.print_context_status("🥒 Step 1: Trying thinnify...\n");
        let (thin_msg, thin_saved) = self.context_window.thin_context();
        self.thinning_events.push(thin_saved);
        self.ui_writer.print_context_thinning(&thin_msg);

        // Recalculate max_tokens after thinnify
        let recalc_max = self.resolve_max_tokens(provider_name);
        let (new_max, still_needs_reduction) =
            self.preflight_validate_max_tokens(provider_name, recalc_max);
        max_tokens = new_max;
        if !still_needs_reduction {
            self.ui_writer.print_context_status(
                "✅ Thinnify resolved capacity issue. Continuing...\n",
            );
            return max_tokens;
        }

        // Step 2: Try skinnify (entire context)
        self.ui_writer.print_context_status("🦴 Step 2: Trying skinnify...\n");
        let (skinny_msg, skinny_saved) = self.context_window.thin_context_all();
        self.thinning_events.push(skinny_saved);
        self.ui_writer.print_context_thinning(&skinny_msg);

        // Recalculate max_tokens after skinnify
        let recalc_max = self.resolve_max_tokens(provider_name);
        let (final_max, final_needs_reduction) =
            self.preflight_validate_max_tokens(provider_name, recalc_max);
        max_tokens = final_max;
        if !final_needs_reduction {
            self.ui_writer.print_context_status(
                "✅ Skinnify resolved capacity issue. Continuing...\n",
            );
            return max_tokens;
        }

        // Step 3: Nothing worked, use hard-coded minimum as a last resort
        self.ui_writer.print_context_status(&format!(
            "⚠️ Step 3: Context reduction insufficient. Using hard-coded max_tokens={} as last resort...\n",
            hard_coded_minimum
        ));
        hard_coded_minimum
    }
    /// Apply the fallback sequence for summary requests to free up context space.
    /// Uses calculate_summary_max_tokens for recalculation (based on available space).
    /// Returns the validated max_tokens for summary requests.
    fn apply_summary_fallback_sequence(
        &mut self,
        provider_name: &str,
    ) -> u32 {
        let (mut summary_max_tokens, needs_reduction) =
            self.calculate_summary_max_tokens(provider_name);
        if !needs_reduction {
            return summary_max_tokens;
        }

        self.ui_writer.print_context_status(
            "⚠️ Context window too full for thinking budget. Applying fallback sequence...\n",
        );

        // Step 1: Try thinnify (first third of context)
        self.ui_writer.print_context_status("🥒 Step 1: Trying thinnify...\n");
        let (thin_msg, thin_saved) = self.context_window.thin_context();
        self.thinning_events.push(thin_saved);
        self.ui_writer.print_context_thinning(&thin_msg);

        // Recalculate max_tokens after thinnify
        let (new_max, still_needs_reduction) = self.calculate_summary_max_tokens(provider_name);
        summary_max_tokens = new_max;
        if !still_needs_reduction {
            self.ui_writer.print_context_status(
                "✅ Thinnify resolved capacity issue. Continuing...\n",
            );
            return summary_max_tokens;
        }

        // Step 2: Try skinnify (entire context)
        self.ui_writer.print_context_status("🦴 Step 2: Trying skinnify...\n");
        let (skinny_msg, skinny_saved) = self.context_window.thin_context_all();
        self.thinning_events.push(skinny_saved);
        self.ui_writer.print_context_thinning(&skinny_msg);

        // Recalculate max_tokens after skinnify
        let (final_max, final_needs_reduction) = self.calculate_summary_max_tokens(provider_name);
        summary_max_tokens = final_max;
        if !final_needs_reduction {
            self.ui_writer.print_context_status(
                "✅ Skinnify resolved capacity issue. Continuing...\n",
            );
            return summary_max_tokens;
        }

        // Step 3: Nothing worked, use hard-coded minimum
        self.ui_writer.print_context_status(
            "⚠️ Step 3: Context reduction insufficient. Using hard-coded max_tokens=5000 as last resort...\n",
        );
        5000
    }
    /// Resolve the temperature to use for a given provider, applying fallbacks
    fn resolve_temperature(&self, provider_name: &str) -> f32 {
        match provider_name {
@@ -1805,8 +2003,14 @@ impl<W: UiWriter> Agent<W> {
        };
        let _ = provider; // Drop the provider reference to avoid borrowing issues

        // Get max_tokens from provider configuration, falling back to sensible defaults
        let max_tokens = Some(self.resolve_max_tokens(&provider_name));

        // Get max_tokens from provider configuration with preflight validation.
        // This ensures max_tokens > thinking.budget_tokens for Anthropic with extended thinking.
        let initial_max_tokens = self.resolve_max_tokens(&provider_name);
        let max_tokens = Some(self.apply_max_tokens_fallback_sequence(
            &provider_name,
            initial_max_tokens,
            16000, // Hard-coded minimum for main API calls (higher than summary's 5000)
        ));

        let request = CompletionRequest {
            messages,
@@ -2211,6 +2415,25 @@ impl<W: UiWriter> Agent<W> {
            self.context_window.percentage_used() as u32
        ));

        let provider = self.providers.get(None)?;
        let provider_name = provider.name().to_string();
        let _ = provider; // Release borrow early

        // Apply fallback sequence: thinnify -> skinnify -> hard-coded 5000
        let mut summary_max_tokens = self.apply_summary_fallback_sequence(&provider_name);

        // Apply provider-specific caps
        summary_max_tokens = match provider_name.as_str() {
            "databricks" | "anthropic" => summary_max_tokens.min(10_000),
            "embedded" => summary_max_tokens.min(3000),
            _ => summary_max_tokens.min(5000),
        };

        debug!(
            "Requesting summary with max_tokens: {} (current usage: {} tokens)",
            summary_max_tokens, self.context_window.used_tokens
        );

        // Create summary request with FULL history
        let summary_prompt = self.context_window.create_summary_prompt();
@@ -2239,38 +2462,9 @@ impl<W: UiWriter> Agent<W> {
        let provider = self.providers.get(None)?;

        // Dynamically calculate max_tokens for summary based on what's left
        let summary_max_tokens = match provider.name() {
            "databricks" | "anthropic" => {
                let model_limit = self.context_window.total_tokens;
                let current_usage = self.context_window.used_tokens;
                let available = model_limit
                    .saturating_sub(current_usage)
                    .saturating_sub(5000);
                Some(available.min(10_000))
            }
            "embedded" => {
                let model_limit = self.context_window.total_tokens;
                let current_usage = self.context_window.used_tokens;
                let available = model_limit
                    .saturating_sub(current_usage)
                    .saturating_sub(1000);
                Some(available.min(3000))
            }
            _ => {
                let available = self.context_window.remaining_tokens().saturating_sub(2000);
                Some(available.min(5000))
            }
        };

        debug!(
            "Requesting summary with max_tokens: {:?} (current usage: {} tokens)",
            summary_max_tokens, self.context_window.used_tokens
        );

        let summary_request = CompletionRequest {
            messages: summary_messages,
            max_tokens: summary_max_tokens,
            max_tokens: Some(summary_max_tokens),
            temperature: Some(self.resolve_temperature(provider.name())),
            stream: false,
            tools: None,
@@ -3234,6 +3428,25 @@ impl<W: UiWriter> Agent<W> {
            self.context_window.percentage_used() as u32
        ));

        let provider = self.providers.get(None)?;
        let provider_name = provider.name().to_string();
        let _ = provider; // Release borrow early

        // Apply fallback sequence: thinnify -> skinnify -> hard-coded 5000
        let mut summary_max_tokens = self.apply_summary_fallback_sequence(&provider_name);

        // Apply provider-specific caps
        summary_max_tokens = match provider_name.as_str() {
            "databricks" | "anthropic" => summary_max_tokens.min(10_000),
            "embedded" => summary_max_tokens.min(3000),
            _ => summary_max_tokens.min(5000),
        };

        debug!(
            "Requesting summary with max_tokens: {} (current usage: {} tokens)",
            summary_max_tokens, self.context_window.used_tokens
        );

        // Create summary request with FULL history
        let summary_prompt = self.context_window.create_summary_prompt();
@@ -3262,82 +3475,9 @@ impl<W: UiWriter> Agent<W> {
        let provider = self.providers.get(None)?;

        // Dynamically calculate max_tokens for summary based on what's left.
        // We need to ensure: used_tokens + max_tokens <= total_context_limit
        let summary_max_tokens = match provider.name() {
            "databricks" | "anthropic" => {
                // Use the actual configured context window size
                let model_limit = self.context_window.total_tokens;
                let current_usage = self.context_window.used_tokens;

                // Check if we have enough capacity for summarization
                if current_usage >= model_limit.saturating_sub(1000) {
                    error!("Context window at capacity ({}%), cannot summarize. Current: {}, Limit: {}",
                        self.context_window.percentage_used(), current_usage, model_limit);
                    return Err(anyhow::anyhow!("Context window at capacity. Try using /thinnify or /compact commands to reduce context size, or start a new session."));
                }

                // Leave a buffer proportional to model size (min 1k, max 10k)
                let buffer = (model_limit / 40).clamp(1000, 10000); // 2.5% buffer
                let available = model_limit
                    .saturating_sub(current_usage)
                    .saturating_sub(buffer);

                // Cap at a reasonable summary size (10k tokens max)
                Some(available.min(10_000))
            }
            "embedded" => {
                // For smaller context models, be more conservative
                let model_limit = self.context_window.total_tokens;
                let current_usage = self.context_window.used_tokens;

                // Check capacity for embedded models too
                if current_usage >= model_limit.saturating_sub(500) {
                    error!(
                        "Embedded model context window at capacity ({}%)",
                        self.context_window.percentage_used()
                    );
                    return Err(anyhow::anyhow!("Context window at capacity. Try using /thinnify command to reduce context size, or start a new session."));
                }

                // Leave a 1k buffer
                let available = model_limit
                    .saturating_sub(current_usage)
                    .saturating_sub(1000);

                // Cap at 3k for embedded models
                Some(available.min(3000))
            }
            _ => {
                // Default: conservative approach
                let model_limit = self.context_window.total_tokens;
                let current_usage = self.context_window.used_tokens;
                if current_usage >= model_limit.saturating_sub(1000) {
                    error!(
                        "Context window at capacity ({}%)",
                        self.context_window.percentage_used()
                    );
                    return Err(anyhow::anyhow!("Context window at capacity. Try using /thinnify or /compact commands, or start a new session."));
                }
                let available = self.context_window.remaining_tokens().saturating_sub(2000);
                Some(available.min(5000))
            }
        };

        debug!(
            "Requesting summary with max_tokens: {:?} (current usage: {} tokens)",
            summary_max_tokens, self.context_window.used_tokens
        );

        // Final safety check
        if summary_max_tokens.unwrap_or(0) == 0 {
            error!("No tokens available for summarization");
            return Err(anyhow::anyhow!("No context window capacity left for summarization. Use /thinnify to reduce context size or start a new session."));
        }

        let summary_request = CompletionRequest {
            messages: summary_messages,
            max_tokens: summary_max_tokens,
            max_tokens: Some(summary_max_tokens),
            temperature: Some(self.resolve_temperature(provider.name())),
            stream: false,
            tools: None,