From 696c441a478ab73c48ca00a16c0f09ef99e30c00 Mon Sep 17 00:00:00 2001
From: Jochen
Date: Tue, 9 Dec 2025 10:15:32 +1100
Subject: [PATCH 1/4] validate max_tokens for calls, plus fallbacks for summary

When the context window is full, max_tokens is often passed as 0 or a tiny
value, and the LLM call fails. For Anthropic with thinking enabled, the
thinking budget (thinking.budget_tokens) must also fit under max_tokens.
This can happen during summary attempts; in that case, first try thinnify,
then skinnify, before falling back to a hard-coded minimum.

---
 crates/g3-core/src/lib.rs | 352 ++++++++++++------
 .../tests/test_preflight_max_tokens.rs | 188 ++++++++++
 2 files changed, 434 insertions(+), 106 deletions(-)
 create mode 100644 crates/g3-core/tests/test_preflight_max_tokens.rs

diff --git a/crates/g3-core/src/lib.rs b/crates/g3-core/src/lib.rs
index e9367f3..f95746d 100644
--- a/crates/g3-core/src/lib.rs
+++ b/crates/g3-core/src/lib.rs
@@ -1418,6 +1418,204 @@ impl Agent {
         }
     }
 
+    /// Get the thinking budget tokens for the Anthropic provider, if configured
+    fn get_thinking_budget_tokens(&self) -> Option<u32> {
+        self.config
+            .providers
+            .anthropic
+            .as_ref()
+            .and_then(|c| c.thinking_budget_tokens)
+    }
+
+    /// Pre-flight check to validate and adjust max_tokens for the thinking.budget_tokens constraint.
+    /// Returns the adjusted max_tokens that satisfies: max_tokens > thinking.budget_tokens.
+    /// Also returns whether we need to apply fallback actions (thinnify/skinnify).
+    ///
+    /// Returns: (adjusted_max_tokens, needs_context_reduction)
+    fn preflight_validate_max_tokens(
+        &self,
+        provider_name: &str,
+        proposed_max_tokens: u32,
+    ) -> (u32, bool) {
+        // Only applies to the Anthropic provider with thinking enabled
+        if provider_name != "anthropic" {
+            return (proposed_max_tokens, false);
+        }
+
+        let budget_tokens = match self.get_thinking_budget_tokens() {
+            Some(budget) => budget,
+            None => return (proposed_max_tokens, false), // No thinking enabled
+        };
+
+        // Anthropic requires: max_tokens > budget_tokens
+        // We add a minimum output buffer of 1024 tokens for actual response content
+        let minimum_required = budget_tokens + 1024;
+
+        if proposed_max_tokens >= minimum_required {
+            // We have enough headroom
+            (proposed_max_tokens, false)
+        } else {
+            // max_tokens is too low - need to either adjust or reduce context
+            warn!(
+                "max_tokens ({}) is below required minimum ({}) for thinking.budget_tokens ({}). Context reduction needed.",
+                proposed_max_tokens, minimum_required, budget_tokens
+            );
+            // Return the minimum required, but flag that we need context reduction
+            (minimum_required, true)
+        }
+    }
+
+    /// Calculate max_tokens for a summary request, ensuring it satisfies the thinking constraint.
+ /// Applies fallback sequence: thinnify -> skinnify -> hard-coded minimum + /// Returns (max_tokens, whether_fallback_was_used) + fn calculate_summary_max_tokens( + &mut self, + provider_name: &str, + ) -> (u32, bool) { + let model_limit = self.context_window.total_tokens; + let current_usage = self.context_window.used_tokens; + + // Calculate available tokens with buffer + let buffer = (model_limit / 40).clamp(1000, 10000); // 2.5% buffer + let available = model_limit + .saturating_sub(current_usage) + .saturating_sub(buffer); + let proposed_max_tokens = available.min(10_000); + + // Validate against thinking budget constraint + let (adjusted, needs_reduction) = self.preflight_validate_max_tokens(provider_name, proposed_max_tokens); + + if !needs_reduction { + return (adjusted, false); + } + + // We need more headroom - the context is too full + // Return the adjusted value but flag that fallbacks are needed + (adjusted, true) + } + + /// Apply the fallback sequence to free up context space for thinking budget. + /// Sequence: thinnify (first third) → skinnify (all) → hard-coded minimum + /// Returns the validated max_tokens that satisfies thinking.budget_tokens constraint. + fn apply_max_tokens_fallback_sequence( + &mut self, + provider_name: &str, + initial_max_tokens: u32, + hard_coded_minimum: u32, + ) -> u32 { + let (mut max_tokens, needs_reduction) = self.preflight_validate_max_tokens(provider_name, initial_max_tokens); + + if !needs_reduction { + return max_tokens; + } + + self.ui_writer.print_context_status( + "⚠️ Context window too full for thinking budget. Applying fallback sequence...\n", + ); + + // Step 1: Try thinnify (first third of context) + self.ui_writer.print_context_status("🥒 Step 1: Trying thinnify...\n"); + let (thin_msg, thin_saved) = self.context_window.thin_context(); + self.thinning_events.push(thin_saved); + self.ui_writer.print_context_thinning(&thin_msg); + + // Recalculate max_tokens after thinnify + let recalc_max = self.resolve_max_tokens(provider_name); + let (new_max, still_needs_reduction) = self.preflight_validate_max_tokens(provider_name, recalc_max); + max_tokens = new_max; + + if !still_needs_reduction { + self.ui_writer.print_context_status( + "✅ Thinnify resolved capacity issue. Continuing...\n", + ); + return max_tokens; + } + + // Step 2: Try skinnify (entire context) + self.ui_writer.print_context_status("🦴 Step 2: Trying skinnify...\n"); + let (skinny_msg, skinny_saved) = self.context_window.thin_context_all(); + self.thinning_events.push(skinny_saved); + self.ui_writer.print_context_thinning(&skinny_msg); + + // Recalculate max_tokens after skinnify + let recalc_max = self.resolve_max_tokens(provider_name); + let (final_max, final_needs_reduction) = self.preflight_validate_max_tokens(provider_name, recalc_max); + max_tokens = final_max; + + if !final_needs_reduction { + self.ui_writer.print_context_status( + "✅ Skinnify resolved capacity issue. Continuing...\n", + ); + return max_tokens; + } + + // Step 3: Nothing worked, use hard-coded minimum as last resort + self.ui_writer.print_context_status(&format!( + "⚠️ Step 3: Context reduction insufficient. Using hard-coded max_tokens={} as last resort...\n", + hard_coded_minimum + )); + + hard_coded_minimum + } + + /// Apply the fallback sequence for summary requests to free up context space. + /// Uses calculate_summary_max_tokens for recalculation (based on available space). + /// Returns the validated max_tokens for summary requests. 
+ fn apply_summary_fallback_sequence( + &mut self, + provider_name: &str, + ) -> u32 { + let (mut summary_max_tokens, needs_reduction) = self.calculate_summary_max_tokens(provider_name); + + if !needs_reduction { + return summary_max_tokens; + } + + self.ui_writer.print_context_status( + "⚠️ Context window too full for thinking budget. Applying fallback sequence...\n", + ); + + // Step 1: Try thinnify (first third of context) + self.ui_writer.print_context_status("🥒 Step 1: Trying thinnify...\n"); + let (thin_msg, thin_saved) = self.context_window.thin_context(); + self.thinning_events.push(thin_saved); + self.ui_writer.print_context_thinning(&thin_msg); + + // Recalculate max_tokens after thinnify + let (new_max, still_needs_reduction) = self.calculate_summary_max_tokens(provider_name); + summary_max_tokens = new_max; + + if !still_needs_reduction { + self.ui_writer.print_context_status( + "✅ Thinnify resolved capacity issue. Continuing...\n", + ); + return summary_max_tokens; + } + + // Step 2: Try skinnify (entire context) + self.ui_writer.print_context_status("🦴 Step 2: Trying skinnify...\n"); + let (skinny_msg, skinny_saved) = self.context_window.thin_context_all(); + self.thinning_events.push(skinny_saved); + self.ui_writer.print_context_thinning(&skinny_msg); + + // Recalculate max_tokens after skinnify + let (final_max, final_needs_reduction) = self.calculate_summary_max_tokens(provider_name); + summary_max_tokens = final_max; + + if !final_needs_reduction { + self.ui_writer.print_context_status( + "✅ Skinnify resolved capacity issue. Continuing...\n", + ); + return summary_max_tokens; + } + + // Step 3: Nothing worked, use hard-coded minimum + self.ui_writer.print_context_status( + "⚠️ Step 3: Context reduction insufficient. Using hard-coded max_tokens=5000 as last resort...\n", + ); + 5000 + } + /// Resolve the temperature to use for a given provider, applying fallbacks fn resolve_temperature(&self, provider_name: &str) -> f32 { match provider_name { @@ -1805,8 +2003,14 @@ impl Agent { }; let _ = provider; // Drop the provider reference to avoid borrowing issues - // Get max_tokens from provider configuration, falling back to sensible defaults - let max_tokens = Some(self.resolve_max_tokens(&provider_name)); + // Get max_tokens from provider configuration with preflight validation + // This ensures max_tokens > thinking.budget_tokens for Anthropic with extended thinking + let initial_max_tokens = self.resolve_max_tokens(&provider_name); + let max_tokens = Some(self.apply_max_tokens_fallback_sequence( + &provider_name, + initial_max_tokens, + 16000, // Hard-coded minimum for main API calls (higher than summary's 5000) + )); let request = CompletionRequest { messages, @@ -2211,6 +2415,25 @@ impl Agent { self.context_window.percentage_used() as u32 )); + let provider = self.providers.get(None)?; + let provider_name = provider.name().to_string(); + let _ = provider; // Release borrow early + + // Apply fallback sequence: thinnify -> skinnify -> hard-coded 5000 + let mut summary_max_tokens = self.apply_summary_fallback_sequence(&provider_name); + + // Apply provider-specific caps + summary_max_tokens = match provider_name.as_str() { + "databricks" | "anthropic" => summary_max_tokens.min(10_000), + "embedded" => summary_max_tokens.min(3000), + _ => summary_max_tokens.min(5000), + }; + + debug!( + "Requesting summary with max_tokens: {} (current usage: {} tokens)", + summary_max_tokens, self.context_window.used_tokens + ); + // Create summary request with FULL history let summary_prompt 
= self.context_window.create_summary_prompt(); @@ -2239,38 +2462,9 @@ impl Agent { let provider = self.providers.get(None)?; - // Dynamically calculate max_tokens for summary based on what's left - let summary_max_tokens = match provider.name() { - "databricks" | "anthropic" => { - let model_limit = self.context_window.total_tokens; - let current_usage = self.context_window.used_tokens; - let available = model_limit - .saturating_sub(current_usage) - .saturating_sub(5000); - Some(available.min(10_000)) - } - "embedded" => { - let model_limit = self.context_window.total_tokens; - let current_usage = self.context_window.used_tokens; - let available = model_limit - .saturating_sub(current_usage) - .saturating_sub(1000); - Some(available.min(3000)) - } - _ => { - let available = self.context_window.remaining_tokens().saturating_sub(2000); - Some(available.min(5000)) - } - }; - - debug!( - "Requesting summary with max_tokens: {:?} (current usage: {} tokens)", - summary_max_tokens, self.context_window.used_tokens - ); - let summary_request = CompletionRequest { messages: summary_messages, - max_tokens: summary_max_tokens, + max_tokens: Some(summary_max_tokens), temperature: Some(self.resolve_temperature(provider.name())), stream: false, tools: None, @@ -3234,6 +3428,25 @@ impl Agent { self.context_window.percentage_used() as u32 )); + let provider = self.providers.get(None)?; + let provider_name = provider.name().to_string(); + let _ = provider; // Release borrow early + + // Apply fallback sequence: thinnify -> skinnify -> hard-coded 5000 + let mut summary_max_tokens = self.apply_summary_fallback_sequence(&provider_name); + + // Apply provider-specific caps + summary_max_tokens = match provider_name.as_str() { + "databricks" | "anthropic" => summary_max_tokens.min(10_000), + "embedded" => summary_max_tokens.min(3000), + _ => summary_max_tokens.min(5000), + }; + + debug!( + "Requesting summary with max_tokens: {} (current usage: {} tokens)", + summary_max_tokens, self.context_window.used_tokens + ); + // Create summary request with FULL history let summary_prompt = self.context_window.create_summary_prompt(); @@ -3262,82 +3475,9 @@ impl Agent { let provider = self.providers.get(None)?; - // Dynamically calculate max_tokens for summary based on what's left - // We need to ensure: used_tokens + max_tokens <= total_context_limit - let summary_max_tokens = match provider.name() { - "databricks" | "anthropic" => { - // Use the actual configured context window size - let model_limit = self.context_window.total_tokens; - let current_usage = self.context_window.used_tokens; - - // Check if we have enough capacity for summarization - if current_usage >= model_limit.saturating_sub(1000) { - error!("Context window at capacity ({}%), cannot summarize. Current: {}, Limit: {}", - self.context_window.percentage_used(), current_usage, model_limit); - return Err(anyhow::anyhow!("Context window at capacity. 
Try using /thinnify or /compact commands to reduce context size, or start a new session.")); - } - - // Leave buffer proportional to model size (min 1k, max 10k) - let buffer = (model_limit / 40).clamp(1000, 10000); // 2.5% buffer - let available = model_limit - .saturating_sub(current_usage) - .saturating_sub(buffer); - // Cap at a reasonable summary size (10k tokens max) - Some(available.min(10_000)) - } - "embedded" => { - // For smaller context models, be more conservative - let model_limit = self.context_window.total_tokens; - let current_usage = self.context_window.used_tokens; - - // Check capacity for embedded models too - if current_usage >= model_limit.saturating_sub(500) { - error!( - "Embedded model context window at capacity ({}%)", - self.context_window.percentage_used() - ); - return Err(anyhow::anyhow!("Context window at capacity. Try using /thinnify command to reduce context size, or start a new session.")); - } - - // Leave 1k buffer - let available = model_limit - .saturating_sub(current_usage) - .saturating_sub(1000); - // Cap at 3k for embedded models - Some(available.min(3000)) - } - _ => { - // Default: conservative approach - let model_limit = self.context_window.total_tokens; - let current_usage = self.context_window.used_tokens; - - if current_usage >= model_limit.saturating_sub(1000) { - error!( - "Context window at capacity ({}%)", - self.context_window.percentage_used() - ); - return Err(anyhow::anyhow!("Context window at capacity. Try using /thinnify or /compact commands, or start a new session.")); - } - - let available = self.context_window.remaining_tokens().saturating_sub(2000); - Some(available.min(5000)) - } - }; - - debug!( - "Requesting summary with max_tokens: {:?} (current usage: {} tokens)", - summary_max_tokens, self.context_window.used_tokens - ); - - // Final safety check - if summary_max_tokens.unwrap_or(0) == 0 { - error!("No tokens available for summarization"); - return Err(anyhow::anyhow!("No context window capacity left for summarization. Use /thinnify to reduce context size or start a new session.")); - } - let summary_request = CompletionRequest { messages: summary_messages, - max_tokens: summary_max_tokens, + max_tokens: Some(summary_max_tokens), temperature: Some(self.resolve_temperature(provider.name())), stream: false, tools: None, diff --git a/crates/g3-core/tests/test_preflight_max_tokens.rs b/crates/g3-core/tests/test_preflight_max_tokens.rs new file mode 100644 index 0000000..1b19873 --- /dev/null +++ b/crates/g3-core/tests/test_preflight_max_tokens.rs @@ -0,0 +1,188 @@ +//! Tests for the pre-flight max_tokens validation with thinking.budget_tokens constraint +//! +//! These tests verify that when using Anthropic with extended thinking enabled, +//! the max_tokens calculation properly accounts for the budget_tokens constraint. 
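+//!
+//! A sketch of the constraint under test, using the same 1024-token output
+//! buffer as `preflight_validate_max_tokens` (the names below are only
+//! illustrative, not part of the public API):
+//!
+//!   minimum_required = thinking.budget_tokens + 1024
+//!   sufficient       = proposed_max_tokens >= minimum_required
+//!
+//! e.g. a thinking budget of 10_000 tokens requires max_tokens of at least 11_024.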
+ +use g3_config::Config; +use g3_core::ContextWindow; + +/// Helper function to create a minimal config for testing +fn create_test_config_with_thinking(thinking_budget: Option) -> Config { + let mut config = Config::default(); + + // Set up Anthropic provider with optional thinking budget + config.providers.anthropic = Some(g3_config::AnthropicConfig { + api_key: "test-key".to_string(), + model: "claude-sonnet-4-5".to_string(), + max_tokens: Some(16000), + temperature: Some(0.1), + cache_config: None, + enable_1m_context: None, + thinking_budget_tokens: thinking_budget, + }); + + config.providers.default_provider = "anthropic".to_string(); + config +} + +/// Test that when thinking is disabled, max_tokens passes through unchanged +#[test] +fn test_no_thinking_budget_passes_through() { + let config = create_test_config_with_thinking(None); + + // Without thinking budget, any max_tokens should be fine + let proposed_max = 5000; + + // The constraint check would return (proposed_max, false) + // since there's no thinking_budget_tokens configured + assert!(config.providers.anthropic.as_ref().unwrap().thinking_budget_tokens.is_none()); +} + +/// Test that when max_tokens > budget_tokens + buffer, no reduction is needed +#[test] +fn test_sufficient_max_tokens_no_reduction_needed() { + let config = create_test_config_with_thinking(Some(10000)); + let budget_tokens = config.providers.anthropic.as_ref().unwrap().thinking_budget_tokens.unwrap(); + + // minimum_required = budget_tokens + 1024 = 11024 + let minimum_required = budget_tokens + 1024; + + // If proposed_max >= minimum_required, no reduction is needed + let proposed_max = 15000; + assert!(proposed_max >= minimum_required); +} + +/// Test that when max_tokens < budget_tokens + buffer, reduction is needed +#[test] +fn test_insufficient_max_tokens_needs_reduction() { + let config = create_test_config_with_thinking(Some(10000)); + let budget_tokens = config.providers.anthropic.as_ref().unwrap().thinking_budget_tokens.unwrap(); + + // minimum_required = budget_tokens + 1024 = 11024 + let minimum_required = budget_tokens + 1024; + + // If proposed_max < minimum_required, reduction IS needed + let proposed_max = 5000; + assert!(proposed_max < minimum_required); +} + +/// Test the minimum required calculation +#[test] +fn test_minimum_required_calculation() { + // For a budget of 10000, we need at least 11024 tokens + let budget_tokens = 10000u32; + let output_buffer = 1024u32; + let minimum_required = budget_tokens + output_buffer; + + assert_eq!(minimum_required, 11024); + + // For a larger budget + let budget_tokens = 32000u32; + let minimum_required = budget_tokens + output_buffer; + assert_eq!(minimum_required, 33024); +} + +/// Test context window usage calculation for summary max_tokens +#[test] +fn test_context_window_available_tokens() { + let mut context = ContextWindow::new(200000); // 200k context window + + // Simulate heavy usage + context.used_tokens = 180000; // 90% used + + let model_limit = context.total_tokens; + let current_usage = context.used_tokens; + + // 2.5% buffer calculation + let buffer = (model_limit / 40).clamp(1000, 10000); + assert_eq!(buffer, 5000); // 200000/40 = 5000 + + let available = model_limit + .saturating_sub(current_usage) + .saturating_sub(buffer); + + // 200000 - 180000 - 5000 = 15000 + assert_eq!(available, 15000); + + // Capped at 10000 for summary + let summary_max = available.min(10_000); + assert_eq!(summary_max, 10000); +} + +/// Test that when context is nearly full, available tokens may be 
below thinking budget +#[test] +fn test_context_nearly_full_triggers_reduction() { + let mut context = ContextWindow::new(200000); + + // Very heavy usage - 98% used + context.used_tokens = 196000; + + let model_limit = context.total_tokens; + let current_usage = context.used_tokens; + let buffer = (model_limit / 40).clamp(1000, 10000); // 5000 + + let available = model_limit + .saturating_sub(current_usage) + .saturating_sub(buffer); + + // 200000 - 196000 - 5000 = -1000 -> saturates to 0 + assert_eq!(available, 0); + + // With thinking_budget of 10000, this would definitely need reduction + let thinking_budget = 10000u32; + let minimum_required = thinking_budget + 1024; + assert!(available < minimum_required); +} + +/// Test the hard-coded fallback value +#[test] +fn test_hardcoded_fallback_value() { + // When all else fails, we use 5000 as the hard-coded max_tokens + let hardcoded_fallback = 5000u32; + + // This should be a reasonable value that Anthropic will accept + // even with thinking enabled (though output will be limited) + assert!(hardcoded_fallback > 0); + + // Note: With a 10000 thinking budget, 5000 is still below the + // minimum required (11024), but we send it anyway as a "last resort" + // hoping the API might still work for basic operations +} + +/// Test provider-specific caps +#[test] +fn test_provider_specific_caps() { + // Anthropic/Databricks: cap at 10000 + let anthropic_cap = 10000u32; + let proposed = 15000u32; + assert_eq!(proposed.min(anthropic_cap), 10000); + + // Embedded: cap at 3000 + let embedded_cap = 3000u32; + let proposed = 5000u32; + assert_eq!(proposed.min(embedded_cap), 3000); + + // Default: cap at 5000 + let default_cap = 5000u32; + let proposed = 8000u32; + assert_eq!(proposed.min(default_cap), 5000); +} + +/// Test that the error message mentions the thinking budget constraint +#[test] +fn test_error_message_content() { + // Verify the warning message format contains useful information + let proposed_max_tokens = 5000u32; + let budget_tokens = 10000u32; + let minimum_required = budget_tokens + 1024; + + let warning = format!( + "max_tokens ({}) is below required minimum ({}) for thinking.budget_tokens ({}). 
Context reduction needed.", + proposed_max_tokens, minimum_required, budget_tokens + ); + + assert!(warning.contains("5000")); + assert!(warning.contains("11024")); + assert!(warning.contains("10000")); + assert!(warning.contains("Context reduction needed")); +} From fb2cf6f898d81d6556840d60057fc3f41855788f Mon Sep 17 00:00:00 2001 From: Jochen Date: Tue, 9 Dec 2025 12:41:52 +1100 Subject: [PATCH 2/4] fix for thinking budget and hardcoded max token on summary --- crates/g3-core/src/lib.rs | 24 ++++++++-- crates/g3-providers/src/anthropic.rs | 67 +++++++++++++++++++++++++--- 2 files changed, 82 insertions(+), 9 deletions(-) diff --git a/crates/g3-core/src/lib.rs b/crates/g3-core/src/lib.rs index f95746d..08a4b7d 100644 --- a/crates/g3-core/src/lib.rs +++ b/crates/g3-core/src/lib.rs @@ -1475,12 +1475,16 @@ impl Agent { let model_limit = self.context_window.total_tokens; let current_usage = self.context_window.used_tokens; + // Get the configured max_tokens for this provider + let configured_max_tokens = self.resolve_max_tokens(provider_name); + // Calculate available tokens with buffer let buffer = (model_limit / 40).clamp(1000, 10000); // 2.5% buffer let available = model_limit .saturating_sub(current_usage) .saturating_sub(buffer); - let proposed_max_tokens = available.min(10_000); + // Use the smaller of available tokens or configured max_tokens + let proposed_max_tokens = available.min(configured_max_tokens); // Validate against thinking budget constraint let (adjusted, needs_reduction) = self.preflight_validate_max_tokens(provider_name, proposed_max_tokens); @@ -2423,8 +2427,15 @@ impl Agent { let mut summary_max_tokens = self.apply_summary_fallback_sequence(&provider_name); // Apply provider-specific caps + // For Anthropic with thinking enabled, we need max_tokens > thinking.budget_tokens + // So we set a higher cap when thinking is configured + let anthropic_cap = match self.get_thinking_budget_tokens() { + Some(budget) => (budget + 2000).max(10_000), // At least budget + 2000 for response + None => 10_000, + }; summary_max_tokens = match provider_name.as_str() { - "databricks" | "anthropic" => summary_max_tokens.min(10_000), + "anthropic" => summary_max_tokens.min(anthropic_cap), + "databricks" => summary_max_tokens.min(10_000), "embedded" => summary_max_tokens.min(3000), _ => summary_max_tokens.min(5000), }; @@ -3436,8 +3447,15 @@ impl Agent { let mut summary_max_tokens = self.apply_summary_fallback_sequence(&provider_name); // Apply provider-specific caps + // For Anthropic with thinking enabled, we need max_tokens > thinking.budget_tokens + // So we set a higher cap when thinking is configured + let anthropic_cap = match self.get_thinking_budget_tokens() { + Some(budget) => (budget + 2000).max(10_000), // At least budget + 2000 for response + None => 10_000, + }; summary_max_tokens = match provider_name.as_str() { - "databricks" | "anthropic" => summary_max_tokens.min(10_000), + "anthropic" => summary_max_tokens.min(anthropic_cap), + "databricks" => summary_max_tokens.min(10_000), "embedded" => summary_max_tokens.min(3000), _ => summary_max_tokens.min(5000), }; diff --git a/crates/g3-providers/src/anthropic.rs b/crates/g3-providers/src/anthropic.rs index c2dc052..b2d63c7 100644 --- a/crates/g3-providers/src/anthropic.rs +++ b/crates/g3-providers/src/anthropic.rs @@ -284,9 +284,21 @@ impl AnthropicProvider { // Convert tools if provided let anthropic_tools = tools.map(|t| self.convert_tools(t)); - // Add thinking configuration if budget_tokens is set - let thinking = 
self.thinking_budget_tokens.map(|budget| { - ThinkingConfig::enabled(budget) + // Add thinking configuration if budget_tokens is set AND max_tokens is sufficient + // Anthropic requires: max_tokens > thinking.budget_tokens + // We add 1024 as minimum buffer for actual response content + let thinking = self.thinking_budget_tokens.and_then(|budget| { + let min_required = budget + 1024; + if max_tokens > min_required { + Some(ThinkingConfig::enabled(budget)) + } else { + tracing::warn!( + "Disabling thinking mode: max_tokens ({}) is not greater than thinking.budget_tokens ({}) + 1024 buffer. \ + Required: max_tokens > {}", + max_tokens, budget, min_required + ); + None + } }); let request = AnthropicRequest { @@ -847,6 +859,12 @@ enum AnthropicContent { #[serde(skip_serializing_if = "Option::is_none")] cache_control: Option, }, + #[serde(rename = "thinking")] + Thinking { + thinking: String, + #[serde(default)] + signature: Option, + }, #[serde(rename = "tool_use")] ToolUse { id: String, @@ -1058,11 +1076,12 @@ mod tests { let json_without = serde_json::to_string(&request_without).unwrap(); assert!(!json_without.contains("thinking"), "JSON should not contain 'thinking' field when not configured"); - // Test WITH thinking parameter + // Test WITH thinking parameter - max_tokens must be > budget_tokens + 1024 + // Using budget=10000 requires max_tokens > 11024 let provider_with = AnthropicProvider::new( "test-key".to_string(), Some("claude-sonnet-4-5".to_string()), - Some(1000), + Some(20000), // Sufficient for thinking budget Some(0.5), None, None, @@ -1071,11 +1090,47 @@ mod tests { .unwrap(); let request_with = provider_with - .create_request_body(&messages, None, false, 1000, 0.5) + .create_request_body(&messages, None, false, 20000, 0.5) .unwrap(); let json_with = serde_json::to_string(&request_with).unwrap(); assert!(json_with.contains("thinking"), "JSON should contain 'thinking' field when configured"); assert!(json_with.contains("\"type\":\"enabled\""), "JSON should contain type: enabled"); assert!(json_with.contains("\"budget_tokens\":10000"), "JSON should contain budget_tokens: 10000"); + + // Test WITH thinking parameter but INSUFFICIENT max_tokens - thinking should be disabled + let request_insufficient = provider_with + .create_request_body(&messages, None, false, 5000, 0.5) // Less than budget + 1024 + .unwrap(); + let json_insufficient = serde_json::to_string(&request_insufficient).unwrap(); + assert!(!json_insufficient.contains("thinking"), "JSON should NOT contain 'thinking' field when max_tokens is insufficient"); + } + + #[test] + fn test_thinking_content_block_deserialization() { + // Test that we can deserialize a response containing a "thinking" content block + // This is what Anthropic returns when extended thinking is enabled + let json_response = r#"{ + "content": [ + {"type": "thinking", "thinking": "Let me analyze this...", "signature": "abc123"}, + {"type": "text", "text": "Here is my response."} + ], + "model": "claude-sonnet-4-5", + "usage": {"input_tokens": 100, "output_tokens": 50} + }"#; + + let response: AnthropicResponse = serde_json::from_str(json_response) + .expect("Should be able to deserialize response with thinking block"); + + assert_eq!(response.content.len(), 2); + assert_eq!(response.model, "claude-sonnet-4-5"); + + // Extract only text content (thinking should be filtered out) + let text_content: Vec<_> = response.content.iter().filter_map(|c| match c { + AnthropicContent::Text { text, .. 
} => Some(text.as_str()), + _ => None, + }).collect(); + + assert_eq!(text_content.len(), 1); + assert_eq!(text_content[0], "Here is my response."); } } From 2283d9ddbf153c0229caa89b6e6d989099474047 Mon Sep 17 00:00:00 2001 From: Jochen Date: Tue, 9 Dec 2025 14:43:35 +1100 Subject: [PATCH 3/4] small fix to provider name check --- crates/g3-core/src/lib.rs | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/crates/g3-core/src/lib.rs b/crates/g3-core/src/lib.rs index 08a4b7d..e012cc7 100644 --- a/crates/g3-core/src/lib.rs +++ b/crates/g3-core/src/lib.rs @@ -1408,14 +1408,25 @@ impl Agent { /// Resolve the max_tokens to use for a given provider, applying fallbacks fn resolve_max_tokens(&self, provider_name: &str) -> u32 { - match provider_name { + let base = match provider_name { "databricks" => Self::provider_max_tokens(&self.config, "databricks") .or(Some(self.config.agent.fallback_default_max_tokens as u32)) .unwrap_or(32000), other => Self::provider_max_tokens(&self.config, other) .or(Some(self.config.agent.fallback_default_max_tokens as u32)) .unwrap_or(16000), + }; + + // For Anthropic with thinking enabled, ensure max_tokens is sufficient + // Anthropic requires: max_tokens > thinking.budget_tokens + if provider_name == "anthropic" { + if let Some(budget) = self.get_thinking_budget_tokens() { + let minimum_for_thinking = budget + 1024; + return base.max(minimum_for_thinking); + } } + + base } /// Get the thinking budget tokens for Anthropic provider, if configured From 4aa84e214480c274238ec40316fc4c7794db1b41 Mon Sep 17 00:00:00 2001 From: Jochen Date: Tue, 9 Dec 2025 16:45:28 +1100 Subject: [PATCH 4/4] disable thinking if there is no token budget --- crates/g3-core/src/lib.rs | 41 +++++++++++++++++- crates/g3-planner/src/lib.rs | 1 + crates/g3-providers/src/anthropic.rs | 60 +++++++++++++++++++++++---- crates/g3-providers/src/databricks.rs | 1 + crates/g3-providers/src/lib.rs | 2 + 5 files changed, 97 insertions(+), 8 deletions(-) diff --git a/crates/g3-core/src/lib.rs b/crates/g3-core/src/lib.rs index e012cc7..8320a36 100644 --- a/crates/g3-core/src/lib.rs +++ b/crates/g3-core/src/lib.rs @@ -1494,8 +1494,18 @@ impl Agent { let available = model_limit .saturating_sub(current_usage) .saturating_sub(buffer); - // Use the smaller of available tokens or configured max_tokens + // Use the smaller of available tokens or configured max_tokens, + // but ensure we don't go below thinking budget floor for Anthropic let proposed_max_tokens = available.min(configured_max_tokens); + let proposed_max_tokens = if provider_name == "anthropic" { + if let Some(budget) = self.get_thinking_budget_tokens() { + proposed_max_tokens.max(budget + 1024) + } else { + proposed_max_tokens + } + } else { + proposed_max_tokens + }; // Validate against thinking budget constraint let (adjusted, needs_reduction) = self.preflight_validate_max_tokens(provider_name, proposed_max_tokens); @@ -2033,6 +2043,7 @@ impl Agent { temperature: Some(self.resolve_temperature(&provider_name)), stream: true, // Enable streaming tools, + disable_thinking: false, }; // Time the LLM call with cancellation support and streaming @@ -2484,12 +2495,26 @@ impl Agent { let provider = self.providers.get(None)?; + // Determine if we need to disable thinking mode for this request + // Anthropic requires: max_tokens > thinking.budget_tokens + 1024 + let disable_thinking = self.get_thinking_budget_tokens().map_or(false, |budget| { + let minimum_for_thinking = budget + 1024; + let should_disable = summary_max_tokens <= 
minimum_for_thinking; + if should_disable { + tracing::warn!("Disabling thinking mode for summary: max_tokens ({}) <= minimum_for_thinking ({})", summary_max_tokens, minimum_for_thinking); + } + should_disable + }); + + tracing::debug!("Creating summary request: max_tokens={}, disable_thinking={}", summary_max_tokens, disable_thinking); + let summary_request = CompletionRequest { messages: summary_messages, max_tokens: Some(summary_max_tokens), temperature: Some(self.resolve_temperature(provider.name())), stream: false, tools: None, + disable_thinking, }; // Get the summary @@ -3504,12 +3529,26 @@ impl Agent { let provider = self.providers.get(None)?; + // Determine if we need to disable thinking mode for this request + // Anthropic requires: max_tokens > thinking.budget_tokens + 1024 + let disable_thinking = self.get_thinking_budget_tokens().map_or(false, |budget| { + let minimum_for_thinking = budget + 1024; + let should_disable = summary_max_tokens <= minimum_for_thinking; + if should_disable { + tracing::warn!("Disabling thinking mode for summary: max_tokens ({}) <= minimum_for_thinking ({})", summary_max_tokens, minimum_for_thinking); + } + should_disable + }); + + tracing::debug!("Creating auto-summary request: max_tokens={}, disable_thinking={}", summary_max_tokens, disable_thinking); + let summary_request = CompletionRequest { messages: summary_messages, max_tokens: Some(summary_max_tokens), temperature: Some(self.resolve_temperature(provider.name())), stream: false, tools: None, + disable_thinking, }; // Get the summary diff --git a/crates/g3-planner/src/lib.rs b/crates/g3-planner/src/lib.rs index 8317b64..e23e87c 100644 --- a/crates/g3-planner/src/lib.rs +++ b/crates/g3-planner/src/lib.rs @@ -85,6 +85,7 @@ pub async fn get_initial_discovery_messages( temperature: Some(provider.temperature()), stream: false, tools: None, + disable_thinking: false, }; status("🤖 Calling LLM for discovery commands..."); diff --git a/crates/g3-providers/src/anthropic.rs b/crates/g3-providers/src/anthropic.rs index b2d63c7..778827f 100644 --- a/crates/g3-providers/src/anthropic.rs +++ b/crates/g3-providers/src/anthropic.rs @@ -39,6 +39,7 @@ //! temperature: Some(0.7), //! stream: false, //! tools: None, +//! disable_thinking: false, //! }; //! //! // Get a completion @@ -75,6 +76,7 @@ //! temperature: Some(0.7), //! stream: true, //! tools: None, +//! disable_thinking: false, //! }; //! //! 
let mut stream = provider.stream(request).await?; @@ -272,6 +274,7 @@ impl AnthropicProvider { streaming: bool, max_tokens: u32, temperature: f32, + disable_thinking: bool, ) -> Result { let (system, anthropic_messages) = self.convert_messages(messages)?; @@ -284,10 +287,19 @@ impl AnthropicProvider { // Convert tools if provided let anthropic_tools = tools.map(|t| self.convert_tools(t)); - // Add thinking configuration if budget_tokens is set AND max_tokens is sufficient + // Add thinking configuration if budget_tokens is set AND max_tokens is sufficient AND not explicitly disabled // Anthropic requires: max_tokens > thinking.budget_tokens // We add 1024 as minimum buffer for actual response content - let thinking = self.thinking_budget_tokens.and_then(|budget| { + tracing::debug!("create_request_body called: max_tokens={}, disable_thinking={}, thinking_budget_tokens={:?}", max_tokens, disable_thinking, self.thinking_budget_tokens); + + let thinking = if disable_thinking { + tracing::info!( + "Thinking mode explicitly disabled for this request (max_tokens={})", + max_tokens + ); + None + } else { + self.thinking_budget_tokens.and_then(|budget| { let min_required = budget + 1024; if max_tokens > min_required { Some(ThinkingConfig::enabled(budget)) @@ -299,7 +311,8 @@ impl AnthropicProvider { ); None } - }); + }) + }; let request = AnthropicRequest { model: self.model.clone(), @@ -649,6 +662,7 @@ impl LLMProvider for AnthropicProvider { false, max_tokens, temperature, + request.disable_thinking, )?; debug!( @@ -722,6 +736,7 @@ impl LLMProvider for AnthropicProvider { true, max_tokens, temperature, + request.disable_thinking, )?; debug!( @@ -965,7 +980,7 @@ mod tests { let messages = vec![Message::new(MessageRole::User, "Test message".to_string())]; let request_body = provider - .create_request_body(&messages, None, false, 1000, 0.5) + .create_request_body(&messages, None, false, 1000, 0.5, false) .unwrap(); assert_eq!(request_body.model, "claude-3-haiku-20240307"); @@ -1071,7 +1086,7 @@ mod tests { let messages = vec![Message::new(MessageRole::User, "Test message".to_string())]; let request_without = provider_without - .create_request_body(&messages, None, false, 1000, 0.5) + .create_request_body(&messages, None, false, 1000, 0.5, false) .unwrap(); let json_without = serde_json::to_string(&request_without).unwrap(); assert!(!json_without.contains("thinking"), "JSON should not contain 'thinking' field when not configured"); @@ -1090,7 +1105,7 @@ mod tests { .unwrap(); let request_with = provider_with - .create_request_body(&messages, None, false, 20000, 0.5) + .create_request_body(&messages, None, false, 20000, 0.5, false) .unwrap(); let json_with = serde_json::to_string(&request_with).unwrap(); assert!(json_with.contains("thinking"), "JSON should contain 'thinking' field when configured"); @@ -1099,12 +1114,43 @@ mod tests { // Test WITH thinking parameter but INSUFFICIENT max_tokens - thinking should be disabled let request_insufficient = provider_with - .create_request_body(&messages, None, false, 5000, 0.5) // Less than budget + 1024 + .create_request_body(&messages, None, false, 5000, 0.5, false) // Less than budget + 1024 .unwrap(); let json_insufficient = serde_json::to_string(&request_insufficient).unwrap(); assert!(!json_insufficient.contains("thinking"), "JSON should NOT contain 'thinking' field when max_tokens is insufficient"); } + #[test] + fn test_disable_thinking_flag() { + // Test that disable_thinking=true prevents thinking even with sufficient max_tokens + let provider = 
AnthropicProvider::new( + "test-key".to_string(), + Some("claude-sonnet-4-5".to_string()), + Some(20000), + Some(0.5), + None, + None, + Some(10000), // With thinking budget + ) + .unwrap(); + + let messages = vec![Message::new(MessageRole::User, "Test message".to_string())]; + + // With disable_thinking=false, thinking should be enabled (max_tokens is sufficient) + let request_with_thinking = provider + .create_request_body(&messages, None, false, 20000, 0.5, false) + .unwrap(); + let json_with = serde_json::to_string(&request_with_thinking).unwrap(); + assert!(json_with.contains("thinking"), "JSON should contain 'thinking' field when not disabled"); + + // With disable_thinking=true, thinking should be disabled even with sufficient max_tokens + let request_without_thinking = provider + .create_request_body(&messages, None, false, 20000, 0.5, true) + .unwrap(); + let json_without = serde_json::to_string(&request_without_thinking).unwrap(); + assert!(!json_without.contains("thinking"), "JSON should NOT contain 'thinking' field when explicitly disabled"); + } + #[test] fn test_thinking_content_block_deserialization() { // Test that we can deserialize a response containing a "thinking" content block diff --git a/crates/g3-providers/src/databricks.rs b/crates/g3-providers/src/databricks.rs index c5e9dc6..848acdd 100644 --- a/crates/g3-providers/src/databricks.rs +++ b/crates/g3-providers/src/databricks.rs @@ -45,6 +45,7 @@ //! temperature: Some(0.7), //! stream: false, //! tools: None, +//! disable_thinking: false, //! }; //! //! // Get a completion diff --git a/crates/g3-providers/src/lib.rs b/crates/g3-providers/src/lib.rs index 180400d..0e088ce 100644 --- a/crates/g3-providers/src/lib.rs +++ b/crates/g3-providers/src/lib.rs @@ -42,6 +42,8 @@ pub struct CompletionRequest { pub temperature: Option, pub stream: bool, pub tools: Option>, + /// Force disable thinking mode for this request (used when max_tokens is too low) + pub disable_thinking: bool, } #[derive(Debug, Clone, Serialize, Deserialize)]
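For reference, a minimal sketch of how a caller populates the new `disable_thinking` field introduced by this series. The field names come from the `CompletionRequest` change above; the `g3_providers` imports and the `Message::new(MessageRole::User, ...)` constructor are assumed to match the crate's existing doc examples, so treat this as illustrative rather than exact API.

    use g3_providers::{CompletionRequest, Message, MessageRole};

    fn summary_request(summary_max_tokens: u32, disable_thinking: bool) -> CompletionRequest {
        CompletionRequest {
            messages: vec![Message::new(
                MessageRole::User,
                "Summarize the session so far.".to_string(),
            )],
            max_tokens: Some(summary_max_tokens),
            temperature: Some(0.1),
            stream: false,
            tools: None,
            // Set to true when summary_max_tokens <= thinking.budget_tokens + 1024,
            // mirroring the check in the diff, so the Anthropic provider skips
            // extended thinking for this request instead of failing.
            disable_thinking,
        }
    }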