Merge pull request #41 from dhanji/jochen-fix-max_tokens

Fix bugs where insufficient max_tokens were passed to LLM
This commit is contained in:
Jochen
2025-12-11 16:02:04 +11:00
committed by GitHub
6 changed files with 611 additions and 110 deletions

View File

@@ -1408,14 +1408,237 @@ impl<W: UiWriter> Agent<W> {
/// Resolve the max_tokens to use for a given provider, applying fallbacks /// Resolve the max_tokens to use for a given provider, applying fallbacks
fn resolve_max_tokens(&self, provider_name: &str) -> u32 { fn resolve_max_tokens(&self, provider_name: &str) -> u32 {
match provider_name { let base = match provider_name {
"databricks" => Self::provider_max_tokens(&self.config, "databricks") "databricks" => Self::provider_max_tokens(&self.config, "databricks")
.or(Some(self.config.agent.fallback_default_max_tokens as u32)) .or(Some(self.config.agent.fallback_default_max_tokens as u32))
.unwrap_or(32000), .unwrap_or(32000),
other => Self::provider_max_tokens(&self.config, other) other => Self::provider_max_tokens(&self.config, other)
.or(Some(self.config.agent.fallback_default_max_tokens as u32)) .or(Some(self.config.agent.fallback_default_max_tokens as u32))
.unwrap_or(16000), .unwrap_or(16000),
};
// For Anthropic with thinking enabled, ensure max_tokens is sufficient
// Anthropic requires: max_tokens > thinking.budget_tokens
if provider_name == "anthropic" {
if let Some(budget) = self.get_thinking_budget_tokens() {
let minimum_for_thinking = budget + 1024;
return base.max(minimum_for_thinking);
}
} }
base
}
/// Return the configured Anthropic thinking budget in tokens, if any.
///
/// Yields `None` when no Anthropic provider is configured, or when the
/// provider config leaves `thinking_budget_tokens` unset.
fn get_thinking_budget_tokens(&self) -> Option<u32> {
    match self.config.providers.anthropic.as_ref() {
        Some(anthropic_cfg) => anthropic_cfg.thinking_budget_tokens,
        None => None,
    }
}
/// Pre-flight check to validate and adjust max_tokens for the thinking.budget_tokens constraint.
/// Returns the adjusted max_tokens that satisfies: max_tokens > thinking.budget_tokens
/// Also returns whether we need to apply fallback actions (thinnify/skinnify).
///
/// Returns: (adjusted_max_tokens, needs_context_reduction)
fn preflight_validate_max_tokens(
    &self,
    provider_name: &str,
    proposed_max_tokens: u32,
) -> (u32, bool) {
    // The thinking constraint exists only for Anthropic with thinking enabled.
    if provider_name != "anthropic" {
        return (proposed_max_tokens, false);
    }
    let Some(budget_tokens) = self.get_thinking_budget_tokens() else {
        // No thinking configured — nothing to validate.
        return (proposed_max_tokens, false);
    };
    // Anthropic requires max_tokens > budget_tokens; reserve 1024 tokens of
    // output headroom on top of the thinking budget for actual response text.
    let minimum_required = budget_tokens + 1024;
    if proposed_max_tokens < minimum_required {
        // Too low — raise to the floor and signal that context reduction is needed.
        warn!(
            "max_tokens ({}) is below required minimum ({}) for thinking.budget_tokens ({}). Context reduction needed.",
            proposed_max_tokens, minimum_required, budget_tokens
        );
        (minimum_required, true)
    } else {
        // Sufficient headroom already.
        (proposed_max_tokens, false)
    }
}
/// Calculate max_tokens for a summary request, ensuring it satisfies the
/// thinking constraint (for Anthropic: max_tokens > thinking.budget_tokens).
///
/// Applies fallback sequence: thinnify -> skinnify -> hard-coded minimum
/// Returns (max_tokens, whether_fallback_was_used)
fn calculate_summary_max_tokens(
    &mut self,
    provider_name: &str,
) -> (u32, bool) {
    let model_limit = self.context_window.total_tokens;
    let current_usage = self.context_window.used_tokens;
    // Provider-configured ceiling for max_tokens.
    let configured_max_tokens = self.resolve_max_tokens(provider_name);
    // Leave a buffer proportional to the model size (2.5%, clamped 1k..10k).
    let buffer = (model_limit / 40).clamp(1000, 10000);
    let available = model_limit
        .saturating_sub(current_usage)
        .saturating_sub(buffer);
    // Use the smaller of the space actually left and the configured cap.
    let proposed_max_tokens = available.min(configured_max_tokens);
    // BUG FIX: previously, for Anthropic, the proposal was pre-raised here to
    // `budget + 1024` BEFORE calling preflight_validate_max_tokens. That made
    // the preflight check always report "no reduction needed" for Anthropic,
    // so the thinnify/skinnify fallback sequence could never trigger even when
    // the context window had no room left — and the request could claim more
    // tokens than actually fit. Let the preflight both adjust the value and
    // flag whether context reduction is required.
    self.preflight_validate_max_tokens(provider_name, proposed_max_tokens)
}
/// Apply the fallback sequence to free up context space for thinking budget.
/// Sequence: thinnify (first third) → skinnify (all) → hard-coded minimum
/// Returns the validated max_tokens that satisfies thinking.budget_tokens constraint.
///
/// `initial_max_tokens` is the caller's proposed value; `hard_coded_minimum`
/// is the last-resort value used when context reduction cannot create enough
/// headroom for the thinking budget.
fn apply_max_tokens_fallback_sequence(
    &mut self,
    provider_name: &str,
    initial_max_tokens: u32,
    hard_coded_minimum: u32,
) -> u32 {
    // Preflight: for Anthropic with thinking enabled this enforces
    // max_tokens > budget_tokens + 1024 and flags whether reduction is needed.
    let (mut max_tokens, needs_reduction) = self.preflight_validate_max_tokens(provider_name, initial_max_tokens);
    if !needs_reduction {
        return max_tokens;
    }
    self.ui_writer.print_context_status(
        "⚠️ Context window too full for thinking budget. Applying fallback sequence...\n",
    );
    // Step 1: Try thinnify (first third of context)
    self.ui_writer.print_context_status("🥒 Step 1: Trying thinnify...\n");
    let (thin_msg, thin_saved) = self.context_window.thin_context();
    self.thinning_events.push(thin_saved);
    self.ui_writer.print_context_thinning(&thin_msg);
    // Recalculate max_tokens after thinnify
    // NOTE(review): resolve_max_tokens appears to derive only from config,
    // not from current context usage, so this recalculation may return the
    // same value regardless of thinning — TODO confirm whether a
    // context-aware recalculation (like calculate_summary_max_tokens) was
    // intended here.
    let recalc_max = self.resolve_max_tokens(provider_name);
    let (new_max, still_needs_reduction) = self.preflight_validate_max_tokens(provider_name, recalc_max);
    max_tokens = new_max;
    if !still_needs_reduction {
        self.ui_writer.print_context_status(
            "✅ Thinnify resolved capacity issue. Continuing...\n",
        );
        return max_tokens;
    }
    // Step 2: Try skinnify (entire context)
    self.ui_writer.print_context_status("🦴 Step 2: Trying skinnify...\n");
    let (skinny_msg, skinny_saved) = self.context_window.thin_context_all();
    self.thinning_events.push(skinny_saved);
    self.ui_writer.print_context_thinning(&skinny_msg);
    // Recalculate max_tokens after skinnify
    // NOTE(review): same concern as above — the recalculation is config-derived.
    let recalc_max = self.resolve_max_tokens(provider_name);
    let (final_max, final_needs_reduction) = self.preflight_validate_max_tokens(provider_name, recalc_max);
    max_tokens = final_max;
    if !final_needs_reduction {
        self.ui_writer.print_context_status(
            "✅ Skinnify resolved capacity issue. Continuing...\n",
        );
        return max_tokens;
    }
    // Step 3: Nothing worked, use hard-coded minimum as last resort.
    // This value may still violate the thinking constraint; presumably the
    // provider layer then disables thinking for the request — TODO confirm.
    self.ui_writer.print_context_status(&format!(
        "⚠️ Step 3: Context reduction insufficient. Using hard-coded max_tokens={} as last resort...\n",
        hard_coded_minimum
    ));
    hard_coded_minimum
}
/// Apply the fallback sequence for summary requests to free up context space.
/// Uses calculate_summary_max_tokens for recalculation (based on available space).
/// Returns the validated max_tokens for summary requests.
fn apply_summary_fallback_sequence(
&mut self,
provider_name: &str,
) -> u32 {
let (mut summary_max_tokens, needs_reduction) = self.calculate_summary_max_tokens(provider_name);
if !needs_reduction {
return summary_max_tokens;
}
self.ui_writer.print_context_status(
"⚠️ Context window too full for thinking budget. Applying fallback sequence...\n",
);
// Step 1: Try thinnify (first third of context)
self.ui_writer.print_context_status("🥒 Step 1: Trying thinnify...\n");
let (thin_msg, thin_saved) = self.context_window.thin_context();
self.thinning_events.push(thin_saved);
self.ui_writer.print_context_thinning(&thin_msg);
// Recalculate max_tokens after thinnify
let (new_max, still_needs_reduction) = self.calculate_summary_max_tokens(provider_name);
summary_max_tokens = new_max;
if !still_needs_reduction {
self.ui_writer.print_context_status(
"✅ Thinnify resolved capacity issue. Continuing...\n",
);
return summary_max_tokens;
}
// Step 2: Try skinnify (entire context)
self.ui_writer.print_context_status("🦴 Step 2: Trying skinnify...\n");
let (skinny_msg, skinny_saved) = self.context_window.thin_context_all();
self.thinning_events.push(skinny_saved);
self.ui_writer.print_context_thinning(&skinny_msg);
// Recalculate max_tokens after skinnify
let (final_max, final_needs_reduction) = self.calculate_summary_max_tokens(provider_name);
summary_max_tokens = final_max;
if !final_needs_reduction {
self.ui_writer.print_context_status(
"✅ Skinnify resolved capacity issue. Continuing...\n",
);
return summary_max_tokens;
}
// Step 3: Nothing worked, use hard-coded minimum
self.ui_writer.print_context_status(
"⚠️ Step 3: Context reduction insufficient. Using hard-coded max_tokens=5000 as last resort...\n",
);
5000
} }
/// Resolve the temperature to use for a given provider, applying fallbacks /// Resolve the temperature to use for a given provider, applying fallbacks
@@ -1805,8 +2028,14 @@ impl<W: UiWriter> Agent<W> {
}; };
let _ = provider; // Drop the provider reference to avoid borrowing issues let _ = provider; // Drop the provider reference to avoid borrowing issues
// Get max_tokens from provider configuration, falling back to sensible defaults // Get max_tokens from provider configuration with preflight validation
let max_tokens = Some(self.resolve_max_tokens(&provider_name)); // This ensures max_tokens > thinking.budget_tokens for Anthropic with extended thinking
let initial_max_tokens = self.resolve_max_tokens(&provider_name);
let max_tokens = Some(self.apply_max_tokens_fallback_sequence(
&provider_name,
initial_max_tokens,
16000, // Hard-coded minimum for main API calls (higher than summary's 5000)
));
let request = CompletionRequest { let request = CompletionRequest {
messages, messages,
@@ -1814,6 +2043,7 @@ impl<W: UiWriter> Agent<W> {
temperature: Some(self.resolve_temperature(&provider_name)), temperature: Some(self.resolve_temperature(&provider_name)),
stream: true, // Enable streaming stream: true, // Enable streaming
tools, tools,
disable_thinking: false,
}; };
// Time the LLM call with cancellation support and streaming // Time the LLM call with cancellation support and streaming
@@ -2211,6 +2441,32 @@ impl<W: UiWriter> Agent<W> {
self.context_window.percentage_used() as u32 self.context_window.percentage_used() as u32
)); ));
let provider = self.providers.get(None)?;
let provider_name = provider.name().to_string();
let _ = provider; // Release borrow early
// Apply fallback sequence: thinnify -> skinnify -> hard-coded 5000
let mut summary_max_tokens = self.apply_summary_fallback_sequence(&provider_name);
// Apply provider-specific caps
// For Anthropic with thinking enabled, we need max_tokens > thinking.budget_tokens
// So we set a higher cap when thinking is configured
let anthropic_cap = match self.get_thinking_budget_tokens() {
Some(budget) => (budget + 2000).max(10_000), // At least budget + 2000 for response
None => 10_000,
};
summary_max_tokens = match provider_name.as_str() {
"anthropic" => summary_max_tokens.min(anthropic_cap),
"databricks" => summary_max_tokens.min(10_000),
"embedded" => summary_max_tokens.min(3000),
_ => summary_max_tokens.min(5000),
};
debug!(
"Requesting summary with max_tokens: {} (current usage: {} tokens)",
summary_max_tokens, self.context_window.used_tokens
);
// Create summary request with FULL history // Create summary request with FULL history
let summary_prompt = self.context_window.create_summary_prompt(); let summary_prompt = self.context_window.create_summary_prompt();
@@ -2239,41 +2495,26 @@ impl<W: UiWriter> Agent<W> {
let provider = self.providers.get(None)?; let provider = self.providers.get(None)?;
// Dynamically calculate max_tokens for summary based on what's left // Determine if we need to disable thinking mode for this request
let summary_max_tokens = match provider.name() { // Anthropic requires: max_tokens > thinking.budget_tokens + 1024
"databricks" | "anthropic" => { let disable_thinking = self.get_thinking_budget_tokens().map_or(false, |budget| {
let model_limit = self.context_window.total_tokens; let minimum_for_thinking = budget + 1024;
let current_usage = self.context_window.used_tokens; let should_disable = summary_max_tokens <= minimum_for_thinking;
let available = model_limit if should_disable {
.saturating_sub(current_usage) tracing::warn!("Disabling thinking mode for summary: max_tokens ({}) <= minimum_for_thinking ({})", summary_max_tokens, minimum_for_thinking);
.saturating_sub(5000);
Some(available.min(10_000))
} }
"embedded" => { should_disable
let model_limit = self.context_window.total_tokens; });
let current_usage = self.context_window.used_tokens;
let available = model_limit
.saturating_sub(current_usage)
.saturating_sub(1000);
Some(available.min(3000))
}
_ => {
let available = self.context_window.remaining_tokens().saturating_sub(2000);
Some(available.min(5000))
}
};
debug!( tracing::debug!("Creating summary request: max_tokens={}, disable_thinking={}", summary_max_tokens, disable_thinking);
"Requesting summary with max_tokens: {:?} (current usage: {} tokens)",
summary_max_tokens, self.context_window.used_tokens
);
let summary_request = CompletionRequest { let summary_request = CompletionRequest {
messages: summary_messages, messages: summary_messages,
max_tokens: summary_max_tokens, max_tokens: Some(summary_max_tokens),
temperature: Some(self.resolve_temperature(provider.name())), temperature: Some(self.resolve_temperature(provider.name())),
stream: false, stream: false,
tools: None, tools: None,
disable_thinking,
}; };
// Get the summary // Get the summary
@@ -3234,6 +3475,32 @@ impl<W: UiWriter> Agent<W> {
self.context_window.percentage_used() as u32 self.context_window.percentage_used() as u32
)); ));
let provider = self.providers.get(None)?;
let provider_name = provider.name().to_string();
let _ = provider; // Release borrow early
// Apply fallback sequence: thinnify -> skinnify -> hard-coded 5000
let mut summary_max_tokens = self.apply_summary_fallback_sequence(&provider_name);
// Apply provider-specific caps
// For Anthropic with thinking enabled, we need max_tokens > thinking.budget_tokens
// So we set a higher cap when thinking is configured
let anthropic_cap = match self.get_thinking_budget_tokens() {
Some(budget) => (budget + 2000).max(10_000), // At least budget + 2000 for response
None => 10_000,
};
summary_max_tokens = match provider_name.as_str() {
"anthropic" => summary_max_tokens.min(anthropic_cap),
"databricks" => summary_max_tokens.min(10_000),
"embedded" => summary_max_tokens.min(3000),
_ => summary_max_tokens.min(5000),
};
debug!(
"Requesting summary with max_tokens: {} (current usage: {} tokens)",
summary_max_tokens, self.context_window.used_tokens
);
// Create summary request with FULL history // Create summary request with FULL history
let summary_prompt = self.context_window.create_summary_prompt(); let summary_prompt = self.context_window.create_summary_prompt();
@@ -3262,85 +3529,26 @@ impl<W: UiWriter> Agent<W> {
let provider = self.providers.get(None)?; let provider = self.providers.get(None)?;
// Dynamically calculate max_tokens for summary based on what's left // Determine if we need to disable thinking mode for this request
// We need to ensure: used_tokens + max_tokens <= total_context_limit // Anthropic requires: max_tokens > thinking.budget_tokens + 1024
let summary_max_tokens = match provider.name() { let disable_thinking = self.get_thinking_budget_tokens().map_or(false, |budget| {
"databricks" | "anthropic" => { let minimum_for_thinking = budget + 1024;
// Use the actual configured context window size let should_disable = summary_max_tokens <= minimum_for_thinking;
let model_limit = self.context_window.total_tokens; if should_disable {
let current_usage = self.context_window.used_tokens; tracing::warn!("Disabling thinking mode for summary: max_tokens ({}) <= minimum_for_thinking ({})", summary_max_tokens, minimum_for_thinking);
// Check if we have enough capacity for summarization
if current_usage >= model_limit.saturating_sub(1000) {
error!("Context window at capacity ({}%), cannot summarize. Current: {}, Limit: {}",
self.context_window.percentage_used(), current_usage, model_limit);
return Err(anyhow::anyhow!("Context window at capacity. Try using /thinnify or /compact commands to reduce context size, or start a new session."));
}
// Leave buffer proportional to model size (min 1k, max 10k)
let buffer = (model_limit / 40).clamp(1000, 10000); // 2.5% buffer
let available = model_limit
.saturating_sub(current_usage)
.saturating_sub(buffer);
// Cap at a reasonable summary size (10k tokens max)
Some(available.min(10_000))
} }
"embedded" => { should_disable
// For smaller context models, be more conservative });
let model_limit = self.context_window.total_tokens;
let current_usage = self.context_window.used_tokens;
// Check capacity for embedded models too tracing::debug!("Creating auto-summary request: max_tokens={}, disable_thinking={}", summary_max_tokens, disable_thinking);
if current_usage >= model_limit.saturating_sub(500) {
error!(
"Embedded model context window at capacity ({}%)",
self.context_window.percentage_used()
);
return Err(anyhow::anyhow!("Context window at capacity. Try using /thinnify command to reduce context size, or start a new session."));
}
// Leave 1k buffer
let available = model_limit
.saturating_sub(current_usage)
.saturating_sub(1000);
// Cap at 3k for embedded models
Some(available.min(3000))
}
_ => {
// Default: conservative approach
let model_limit = self.context_window.total_tokens;
let current_usage = self.context_window.used_tokens;
if current_usage >= model_limit.saturating_sub(1000) {
error!(
"Context window at capacity ({}%)",
self.context_window.percentage_used()
);
return Err(anyhow::anyhow!("Context window at capacity. Try using /thinnify or /compact commands, or start a new session."));
}
let available = self.context_window.remaining_tokens().saturating_sub(2000);
Some(available.min(5000))
}
};
debug!(
"Requesting summary with max_tokens: {:?} (current usage: {} tokens)",
summary_max_tokens, self.context_window.used_tokens
);
// Final safety check
if summary_max_tokens.unwrap_or(0) == 0 {
error!("No tokens available for summarization");
return Err(anyhow::anyhow!("No context window capacity left for summarization. Use /thinnify to reduce context size or start a new session."));
}
let summary_request = CompletionRequest { let summary_request = CompletionRequest {
messages: summary_messages, messages: summary_messages,
max_tokens: summary_max_tokens, max_tokens: Some(summary_max_tokens),
temperature: Some(self.resolve_temperature(provider.name())), temperature: Some(self.resolve_temperature(provider.name())),
stream: false, stream: false,
tools: None, tools: None,
disable_thinking,
}; };
// Get the summary // Get the summary

View File

@@ -0,0 +1,188 @@
//! Tests for the pre-flight max_tokens validation with thinking.budget_tokens constraint
//!
//! These tests verify that when using Anthropic with extended thinking enabled,
//! the max_tokens calculation properly accounts for the budget_tokens constraint.
use g3_config::Config;
use g3_core::ContextWindow;
/// Build a minimal test `Config` with an Anthropic provider; `thinking_budget`
/// optionally enables extended thinking for that provider.
fn create_test_config_with_thinking(thinking_budget: Option<u32>) -> Config {
    let anthropic = g3_config::AnthropicConfig {
        api_key: "test-key".to_string(),
        model: "claude-sonnet-4-5".to_string(),
        max_tokens: Some(16000),
        temperature: Some(0.1),
        cache_config: None,
        enable_1m_context: None,
        thinking_budget_tokens: thinking_budget,
    };
    let mut config = Config::default();
    config.providers.anthropic = Some(anthropic);
    config.providers.default_provider = "anthropic".to_string();
    config
}
/// Test that when thinking is disabled, max_tokens passes through unchanged
#[test]
fn test_no_thinking_budget_passes_through() {
    let config = create_test_config_with_thinking(None);
    // With no thinking_budget_tokens configured, the preflight constraint
    // check would return (proposed_max, false) for any proposed value.
    let anthropic = config.providers.anthropic.as_ref().unwrap();
    assert!(anthropic.thinking_budget_tokens.is_none());
}
/// Test that when max_tokens > budget_tokens + buffer, no reduction is needed
#[test]
fn test_sufficient_max_tokens_no_reduction_needed() {
    let config = create_test_config_with_thinking(Some(10000));
    let budget = config
        .providers
        .anthropic
        .as_ref()
        .unwrap()
        .thinking_budget_tokens
        .unwrap();
    // Required floor: budget + 1024-token output buffer = 11024.
    let floor = budget + 1024;
    // A proposal of 15000 exceeds the floor, so no reduction is required.
    let proposed = 15000;
    assert!(proposed >= floor);
}
/// Test that when max_tokens < budget_tokens + buffer, reduction is needed
#[test]
fn test_insufficient_max_tokens_needs_reduction() {
    let config = create_test_config_with_thinking(Some(10000));
    let budget = config
        .providers
        .anthropic
        .as_ref()
        .unwrap()
        .thinking_budget_tokens
        .unwrap();
    // Required floor: budget + 1024-token output buffer = 11024.
    let floor = budget + 1024;
    // A proposal of 5000 falls short of the floor, so reduction IS required.
    let proposed = 5000;
    assert!(proposed < floor);
}
/// The minimum required max_tokens is the thinking budget plus a fixed
/// 1024-token output buffer.
#[test]
fn test_minimum_required_calculation() {
    const OUTPUT_BUFFER: u32 = 1024;
    // (budget, expected minimum) pairs for small and large budgets.
    for (budget, expected) in [(10000u32, 11024u32), (32000, 33024)] {
        assert_eq!(budget + OUTPUT_BUFFER, expected);
    }
}
/// Verify available-token math for a heavily used 200k context window.
#[test]
fn test_context_window_available_tokens() {
    let mut context = ContextWindow::new(200000); // 200k context window
    context.used_tokens = 180000; // 90% used
    // Buffer is 2.5% of the window, clamped to [1000, 10000].
    let buffer = (context.total_tokens / 40).clamp(1000, 10000);
    assert_eq!(buffer, 5000); // 200000 / 40 = 5000
    let available = context
        .total_tokens
        .saturating_sub(context.used_tokens)
        .saturating_sub(buffer);
    // 200000 - 180000 - 5000 = 15000
    assert_eq!(available, 15000);
    // Summary requests are capped at 10k tokens.
    assert_eq!(available.min(10_000), 10000);
}
/// A nearly full context can leave zero available tokens, which is
/// necessarily below any thinking-budget floor.
#[test]
fn test_context_nearly_full_triggers_reduction() {
    let mut context = ContextWindow::new(200000);
    context.used_tokens = 196000; // 98% used
    let buffer = (context.total_tokens / 40).clamp(1000, 10000); // 5000
    let available = context
        .total_tokens
        .saturating_sub(context.used_tokens)
        .saturating_sub(buffer);
    // 200000 - 196000 - 5000 would be negative; saturating_sub yields 0.
    assert_eq!(available, 0);
    // With a 10000-token thinking budget, the floor is 11024 — well above 0,
    // so context reduction would definitely be needed.
    let floor = 10000u32 + 1024;
    assert!(available < floor);
}
/// Document the last-resort hard-coded fallback of 5000 tokens.
#[test]
fn test_hardcoded_fallback_value() {
    let last_resort: u32 = 5000;
    // Must be a positive value Anthropic will accept even with thinking
    // enabled (output is limited, but the request remains valid).
    assert!(last_resort > 0);
    // NOTE: with a 10000-token thinking budget the floor is 11024, so 5000 is
    // still below it — it is sent anyway as a best-effort last resort, hoping
    // the API might still work for basic operations.
}
/// Provider-specific summary caps: Anthropic/Databricks 10k, embedded 3k,
/// default 5k.
#[test]
fn test_provider_specific_caps() {
    // (proposed, cap, expected) triples covering each provider tier.
    let cases = [
        (15000u32, 10000u32, 10000u32), // anthropic / databricks
        (5000, 3000, 3000),             // embedded
        (8000, 5000, 5000),             // default
    ];
    for (proposed, cap, expected) in cases {
        assert_eq!(proposed.min(cap), expected);
    }
}
/// The reduction warning should name the proposed, required, and budget values.
#[test]
fn test_error_message_content() {
    let proposed = 5000u32;
    let budget = 10000u32;
    let required = budget + 1024;
    let warning = format!(
        "max_tokens ({}) is below required minimum ({}) for thinking.budget_tokens ({}). Context reduction needed.",
        proposed, required, budget
    );
    for needle in ["5000", "11024", "10000", "Context reduction needed"] {
        assert!(warning.contains(needle));
    }
}

View File

@@ -85,6 +85,7 @@ pub async fn get_initial_discovery_messages(
temperature: Some(provider.temperature()), temperature: Some(provider.temperature()),
stream: false, stream: false,
tools: None, tools: None,
disable_thinking: false,
}; };
status("🤖 Calling LLM for discovery commands..."); status("🤖 Calling LLM for discovery commands...");

View File

@@ -39,6 +39,7 @@
//! temperature: Some(0.7), //! temperature: Some(0.7),
//! stream: false, //! stream: false,
//! tools: None, //! tools: None,
//! disable_thinking: false,
//! }; //! };
//! //!
//! // Get a completion //! // Get a completion
@@ -75,6 +76,7 @@
//! temperature: Some(0.7), //! temperature: Some(0.7),
//! stream: true, //! stream: true,
//! tools: None, //! tools: None,
//! disable_thinking: false,
//! }; //! };
//! //!
//! let mut stream = provider.stream(request).await?; //! let mut stream = provider.stream(request).await?;
@@ -272,6 +274,7 @@ impl AnthropicProvider {
streaming: bool, streaming: bool,
max_tokens: u32, max_tokens: u32,
temperature: f32, temperature: f32,
disable_thinking: bool,
) -> Result<AnthropicRequest> { ) -> Result<AnthropicRequest> {
let (system, anthropic_messages) = self.convert_messages(messages)?; let (system, anthropic_messages) = self.convert_messages(messages)?;
@@ -284,10 +287,32 @@ impl AnthropicProvider {
// Convert tools if provided // Convert tools if provided
let anthropic_tools = tools.map(|t| self.convert_tools(t)); let anthropic_tools = tools.map(|t| self.convert_tools(t));
// Add thinking configuration if budget_tokens is set // Add thinking configuration if budget_tokens is set AND max_tokens is sufficient AND not explicitly disabled
let thinking = self.thinking_budget_tokens.map(|budget| { // Anthropic requires: max_tokens > thinking.budget_tokens
ThinkingConfig::enabled(budget) // We add 1024 as minimum buffer for actual response content
}); tracing::debug!("create_request_body called: max_tokens={}, disable_thinking={}, thinking_budget_tokens={:?}", max_tokens, disable_thinking, self.thinking_budget_tokens);
let thinking = if disable_thinking {
tracing::info!(
"Thinking mode explicitly disabled for this request (max_tokens={})",
max_tokens
);
None
} else {
self.thinking_budget_tokens.and_then(|budget| {
let min_required = budget + 1024;
if max_tokens > min_required {
Some(ThinkingConfig::enabled(budget))
} else {
tracing::warn!(
"Disabling thinking mode: max_tokens ({}) is not greater than thinking.budget_tokens ({}) + 1024 buffer. \
Required: max_tokens > {}",
max_tokens, budget, min_required
);
None
}
})
};
let request = AnthropicRequest { let request = AnthropicRequest {
model: self.model.clone(), model: self.model.clone(),
@@ -637,6 +662,7 @@ impl LLMProvider for AnthropicProvider {
false, false,
max_tokens, max_tokens,
temperature, temperature,
request.disable_thinking,
)?; )?;
debug!( debug!(
@@ -710,6 +736,7 @@ impl LLMProvider for AnthropicProvider {
true, true,
max_tokens, max_tokens,
temperature, temperature,
request.disable_thinking,
)?; )?;
debug!( debug!(
@@ -847,6 +874,12 @@ enum AnthropicContent {
#[serde(skip_serializing_if = "Option::is_none")] #[serde(skip_serializing_if = "Option::is_none")]
cache_control: Option<crate::CacheControl>, cache_control: Option<crate::CacheControl>,
}, },
#[serde(rename = "thinking")]
Thinking {
thinking: String,
#[serde(default)]
signature: Option<String>,
},
#[serde(rename = "tool_use")] #[serde(rename = "tool_use")]
ToolUse { ToolUse {
id: String, id: String,
@@ -947,7 +980,7 @@ mod tests {
let messages = vec![Message::new(MessageRole::User, "Test message".to_string())]; let messages = vec![Message::new(MessageRole::User, "Test message".to_string())];
let request_body = provider let request_body = provider
.create_request_body(&messages, None, false, 1000, 0.5) .create_request_body(&messages, None, false, 1000, 0.5, false)
.unwrap(); .unwrap();
assert_eq!(request_body.model, "claude-3-haiku-20240307"); assert_eq!(request_body.model, "claude-3-haiku-20240307");
@@ -1053,16 +1086,17 @@ mod tests {
let messages = vec![Message::new(MessageRole::User, "Test message".to_string())]; let messages = vec![Message::new(MessageRole::User, "Test message".to_string())];
let request_without = provider_without let request_without = provider_without
.create_request_body(&messages, None, false, 1000, 0.5) .create_request_body(&messages, None, false, 1000, 0.5, false)
.unwrap(); .unwrap();
let json_without = serde_json::to_string(&request_without).unwrap(); let json_without = serde_json::to_string(&request_without).unwrap();
assert!(!json_without.contains("thinking"), "JSON should not contain 'thinking' field when not configured"); assert!(!json_without.contains("thinking"), "JSON should not contain 'thinking' field when not configured");
// Test WITH thinking parameter // Test WITH thinking parameter - max_tokens must be > budget_tokens + 1024
// Using budget=10000 requires max_tokens > 11024
let provider_with = AnthropicProvider::new( let provider_with = AnthropicProvider::new(
"test-key".to_string(), "test-key".to_string(),
Some("claude-sonnet-4-5".to_string()), Some("claude-sonnet-4-5".to_string()),
Some(1000), Some(20000), // Sufficient for thinking budget
Some(0.5), Some(0.5),
None, None,
None, None,
@@ -1071,11 +1105,78 @@ mod tests {
.unwrap(); .unwrap();
let request_with = provider_with let request_with = provider_with
.create_request_body(&messages, None, false, 1000, 0.5) .create_request_body(&messages, None, false, 20000, 0.5, false)
.unwrap(); .unwrap();
let json_with = serde_json::to_string(&request_with).unwrap(); let json_with = serde_json::to_string(&request_with).unwrap();
assert!(json_with.contains("thinking"), "JSON should contain 'thinking' field when configured"); assert!(json_with.contains("thinking"), "JSON should contain 'thinking' field when configured");
assert!(json_with.contains("\"type\":\"enabled\""), "JSON should contain type: enabled"); assert!(json_with.contains("\"type\":\"enabled\""), "JSON should contain type: enabled");
assert!(json_with.contains("\"budget_tokens\":10000"), "JSON should contain budget_tokens: 10000"); assert!(json_with.contains("\"budget_tokens\":10000"), "JSON should contain budget_tokens: 10000");
// Test WITH thinking parameter but INSUFFICIENT max_tokens - thinking should be disabled
let request_insufficient = provider_with
.create_request_body(&messages, None, false, 5000, 0.5, false) // Less than budget + 1024
.unwrap();
let json_insufficient = serde_json::to_string(&request_insufficient).unwrap();
assert!(!json_insufficient.contains("thinking"), "JSON should NOT contain 'thinking' field when max_tokens is insufficient");
}
#[test]
fn test_disable_thinking_flag() {
    // disable_thinking=true must win even when max_tokens would allow thinking.
    let provider = AnthropicProvider::new(
        "test-key".to_string(),
        Some("claude-sonnet-4-5".to_string()),
        Some(20000),
        Some(0.5),
        None,
        None,
        Some(10000), // thinking budget configured
    )
    .unwrap();
    let messages = vec![Message::new(MessageRole::User, "Test message".to_string())];
    // Not disabled + sufficient headroom => thinking config present.
    let enabled = provider
        .create_request_body(&messages, None, false, 20000, 0.5, false)
        .unwrap();
    let enabled_json = serde_json::to_string(&enabled).unwrap();
    assert!(enabled_json.contains("thinking"), "JSON should contain 'thinking' field when not disabled");
    // Explicitly disabled => thinking config absent despite sufficient headroom.
    let disabled = provider
        .create_request_body(&messages, None, false, 20000, 0.5, true)
        .unwrap();
    let disabled_json = serde_json::to_string(&disabled).unwrap();
    assert!(!disabled_json.contains("thinking"), "JSON should NOT contain 'thinking' field when explicitly disabled");
}
#[test]
fn test_thinking_content_block_deserialization() {
    // Anthropic returns a "thinking" content block when extended thinking is
    // enabled; the response must deserialize, and text extraction must skip
    // the thinking block.
    let json_response = r#"{
        "content": [
            {"type": "thinking", "thinking": "Let me analyze this...", "signature": "abc123"},
            {"type": "text", "text": "Here is my response."}
        ],
        "model": "claude-sonnet-4-5",
        "usage": {"input_tokens": 100, "output_tokens": 50}
    }"#;
    let response: AnthropicResponse = serde_json::from_str(json_response)
        .expect("Should be able to deserialize response with thinking block");
    assert_eq!(response.content.len(), 2);
    assert_eq!(response.model, "claude-sonnet-4-5");
    // Collect only text content; the thinking block is filtered out.
    let mut texts = Vec::new();
    for block in &response.content {
        if let AnthropicContent::Text { text, .. } = block {
            texts.push(text.as_str());
        }
    }
    assert_eq!(texts, vec!["Here is my response."]);
}
} }

View File

@@ -45,6 +45,7 @@
//! temperature: Some(0.7), //! temperature: Some(0.7),
//! stream: false, //! stream: false,
//! tools: None, //! tools: None,
//! disable_thinking: false,
//! }; //! };
//! //!
//! // Get a completion //! // Get a completion

View File

@@ -42,6 +42,8 @@ pub struct CompletionRequest {
pub temperature: Option<f32>, pub temperature: Option<f32>,
pub stream: bool, pub stream: bool,
pub tools: Option<Vec<Tool>>, pub tools: Option<Vec<Tool>>,
/// Force disable thinking mode for this request (used when max_tokens is too low)
pub disable_thinking: bool,
} }
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]