diff --git a/crates/g3-core/src/lib.rs b/crates/g3-core/src/lib.rs index e9367f3..8320a36 100644 --- a/crates/g3-core/src/lib.rs +++ b/crates/g3-core/src/lib.rs @@ -1408,14 +1408,237 @@ impl Agent { /// Resolve the max_tokens to use for a given provider, applying fallbacks fn resolve_max_tokens(&self, provider_name: &str) -> u32 { - match provider_name { + let base = match provider_name { "databricks" => Self::provider_max_tokens(&self.config, "databricks") .or(Some(self.config.agent.fallback_default_max_tokens as u32)) .unwrap_or(32000), other => Self::provider_max_tokens(&self.config, other) .or(Some(self.config.agent.fallback_default_max_tokens as u32)) .unwrap_or(16000), + }; + + // For Anthropic with thinking enabled, ensure max_tokens is sufficient + // Anthropic requires: max_tokens > thinking.budget_tokens + if provider_name == "anthropic" { + if let Some(budget) = self.get_thinking_budget_tokens() { + let minimum_for_thinking = budget + 1024; + return base.max(minimum_for_thinking); + } } + + base + } + + /// Get the thinking budget tokens for Anthropic provider, if configured + fn get_thinking_budget_tokens(&self) -> Option<u32> { + self.config + .providers + .anthropic + .as_ref() + .and_then(|c| c.thinking_budget_tokens) + } + + /// Pre-flight check to validate and adjust max_tokens for the thinking.budget_tokens constraint. + /// Returns the adjusted max_tokens that satisfies: max_tokens > thinking.budget_tokens + /// Also returns whether we need to apply fallback actions (thinnify/skinnify). 
+ /// + /// Returns: (adjusted_max_tokens, needs_context_reduction) + fn preflight_validate_max_tokens( + &self, + provider_name: &str, + proposed_max_tokens: u32, + ) -> (u32, bool) { + // Only applies to Anthropic provider with thinking enabled + if provider_name != "anthropic" { + return (proposed_max_tokens, false); + } + + let budget_tokens = match self.get_thinking_budget_tokens() { + Some(budget) => budget, + None => return (proposed_max_tokens, false), // No thinking enabled + }; + + // Anthropic requires: max_tokens > budget_tokens + // We add a minimum output buffer of 1024 tokens for actual response content + let minimum_required = budget_tokens + 1024; + + if proposed_max_tokens >= minimum_required { + // We have enough headroom + (proposed_max_tokens, false) + } else { + // max_tokens is too low - need to either adjust or reduce context + warn!( + "max_tokens ({}) is below required minimum ({}) for thinking.budget_tokens ({}). Context reduction needed.", + proposed_max_tokens, minimum_required, budget_tokens + ); + // Return the minimum required, but flag that we need context reduction + (minimum_required, true) + } + } + + /// Calculate max_tokens for a summary request, ensuring it satisfies the thinking constraint. 
+ /// Applies fallback sequence: thinnify -> skinnify -> hard-coded minimum + /// Returns (max_tokens, whether_fallback_was_used) + fn calculate_summary_max_tokens( + &mut self, + provider_name: &str, + ) -> (u32, bool) { + let model_limit = self.context_window.total_tokens; + let current_usage = self.context_window.used_tokens; + + // Get the configured max_tokens for this provider + let configured_max_tokens = self.resolve_max_tokens(provider_name); + + // Calculate available tokens with buffer + let buffer = (model_limit / 40).clamp(1000, 10000); // 2.5% buffer + let available = model_limit + .saturating_sub(current_usage) + .saturating_sub(buffer); + // Use the smaller of available tokens or configured max_tokens, + // but ensure we don't go below thinking budget floor for Anthropic + let proposed_max_tokens = available.min(configured_max_tokens); + let proposed_max_tokens = if provider_name == "anthropic" { + if let Some(budget) = self.get_thinking_budget_tokens() { + proposed_max_tokens.max(budget + 1024) + } else { + proposed_max_tokens + } + } else { + proposed_max_tokens + }; + + // Validate against thinking budget constraint + let (adjusted, needs_reduction) = self.preflight_validate_max_tokens(provider_name, proposed_max_tokens); + + if !needs_reduction { + return (adjusted, false); + } + + // We need more headroom - the context is too full + // Return the adjusted value but flag that fallbacks are needed + (adjusted, true) + } + + /// Apply the fallback sequence to free up context space for thinking budget. + /// Sequence: thinnify (first third) → skinnify (all) → hard-coded minimum + /// Returns the validated max_tokens that satisfies thinking.budget_tokens constraint. 
+ fn apply_max_tokens_fallback_sequence( + &mut self, + provider_name: &str, + initial_max_tokens: u32, + hard_coded_minimum: u32, + ) -> u32 { + let (mut max_tokens, needs_reduction) = self.preflight_validate_max_tokens(provider_name, initial_max_tokens); + + if !needs_reduction { + return max_tokens; + } + + self.ui_writer.print_context_status( + "⚠️ Context window too full for thinking budget. Applying fallback sequence...\n", + ); + + // Step 1: Try thinnify (first third of context) + self.ui_writer.print_context_status("🥒 Step 1: Trying thinnify...\n"); + let (thin_msg, thin_saved) = self.context_window.thin_context(); + self.thinning_events.push(thin_saved); + self.ui_writer.print_context_thinning(&thin_msg); + + // Recalculate max_tokens after thinnify + let recalc_max = self.resolve_max_tokens(provider_name); + let (new_max, still_needs_reduction) = self.preflight_validate_max_tokens(provider_name, recalc_max); + max_tokens = new_max; + + if !still_needs_reduction { + self.ui_writer.print_context_status( + "✅ Thinnify resolved capacity issue. Continuing...\n", + ); + return max_tokens; + } + + // Step 2: Try skinnify (entire context) + self.ui_writer.print_context_status("🦴 Step 2: Trying skinnify...\n"); + let (skinny_msg, skinny_saved) = self.context_window.thin_context_all(); + self.thinning_events.push(skinny_saved); + self.ui_writer.print_context_thinning(&skinny_msg); + + // Recalculate max_tokens after skinnify + let recalc_max = self.resolve_max_tokens(provider_name); + let (final_max, final_needs_reduction) = self.preflight_validate_max_tokens(provider_name, recalc_max); + max_tokens = final_max; + + if !final_needs_reduction { + self.ui_writer.print_context_status( + "✅ Skinnify resolved capacity issue. Continuing...\n", + ); + return max_tokens; + } + + // Step 3: Nothing worked, use hard-coded minimum as last resort + self.ui_writer.print_context_status(&format!( + "⚠️ Step 3: Context reduction insufficient. 
Using hard-coded max_tokens={} as last resort...\n", + hard_coded_minimum + )); + + hard_coded_minimum + } + + /// Apply the fallback sequence for summary requests to free up context space. + /// Uses calculate_summary_max_tokens for recalculation (based on available space). + /// Returns the validated max_tokens for summary requests. + fn apply_summary_fallback_sequence( + &mut self, + provider_name: &str, + ) -> u32 { + let (mut summary_max_tokens, needs_reduction) = self.calculate_summary_max_tokens(provider_name); + + if !needs_reduction { + return summary_max_tokens; + } + + self.ui_writer.print_context_status( + "⚠️ Context window too full for thinking budget. Applying fallback sequence...\n", + ); + + // Step 1: Try thinnify (first third of context) + self.ui_writer.print_context_status("🥒 Step 1: Trying thinnify...\n"); + let (thin_msg, thin_saved) = self.context_window.thin_context(); + self.thinning_events.push(thin_saved); + self.ui_writer.print_context_thinning(&thin_msg); + + // Recalculate max_tokens after thinnify + let (new_max, still_needs_reduction) = self.calculate_summary_max_tokens(provider_name); + summary_max_tokens = new_max; + + if !still_needs_reduction { + self.ui_writer.print_context_status( + "✅ Thinnify resolved capacity issue. Continuing...\n", + ); + return summary_max_tokens; + } + + // Step 2: Try skinnify (entire context) + self.ui_writer.print_context_status("🦴 Step 2: Trying skinnify...\n"); + let (skinny_msg, skinny_saved) = self.context_window.thin_context_all(); + self.thinning_events.push(skinny_saved); + self.ui_writer.print_context_thinning(&skinny_msg); + + // Recalculate max_tokens after skinnify + let (final_max, final_needs_reduction) = self.calculate_summary_max_tokens(provider_name); + summary_max_tokens = final_max; + + if !final_needs_reduction { + self.ui_writer.print_context_status( + "✅ Skinnify resolved capacity issue. 
Continuing...\n", + ); + return summary_max_tokens; + } + + // Step 3: Nothing worked, use hard-coded minimum + self.ui_writer.print_context_status( + "⚠️ Step 3: Context reduction insufficient. Using hard-coded max_tokens=5000 as last resort...\n", + ); + 5000 } /// Resolve the temperature to use for a given provider, applying fallbacks @@ -1805,8 +2028,14 @@ impl Agent { }; let _ = provider; // Drop the provider reference to avoid borrowing issues - // Get max_tokens from provider configuration, falling back to sensible defaults - let max_tokens = Some(self.resolve_max_tokens(&provider_name)); + // Get max_tokens from provider configuration with preflight validation + // This ensures max_tokens > thinking.budget_tokens for Anthropic with extended thinking + let initial_max_tokens = self.resolve_max_tokens(&provider_name); + let max_tokens = Some(self.apply_max_tokens_fallback_sequence( + &provider_name, + initial_max_tokens, + 16000, // Hard-coded minimum for main API calls (higher than summary's 5000) + )); let request = CompletionRequest { messages, @@ -1814,6 +2043,7 @@ impl Agent { temperature: Some(self.resolve_temperature(&provider_name)), stream: true, // Enable streaming tools, + disable_thinking: false, }; // Time the LLM call with cancellation support and streaming @@ -2211,6 +2441,32 @@ impl Agent { self.context_window.percentage_used() as u32 )); + let provider = self.providers.get(None)?; + let provider_name = provider.name().to_string(); + let _ = provider; // Release borrow early + + // Apply fallback sequence: thinnify -> skinnify -> hard-coded 5000 + let mut summary_max_tokens = self.apply_summary_fallback_sequence(&provider_name); + + // Apply provider-specific caps + // For Anthropic with thinking enabled, we need max_tokens > thinking.budget_tokens + // So we set a higher cap when thinking is configured + let anthropic_cap = match self.get_thinking_budget_tokens() { + Some(budget) => (budget + 2000).max(10_000), // At least budget + 2000 for 
response + None => 10_000, + }; + summary_max_tokens = match provider_name.as_str() { + "anthropic" => summary_max_tokens.min(anthropic_cap), + "databricks" => summary_max_tokens.min(10_000), + "embedded" => summary_max_tokens.min(3000), + _ => summary_max_tokens.min(5000), + }; + + debug!( + "Requesting summary with max_tokens: {} (current usage: {} tokens)", + summary_max_tokens, self.context_window.used_tokens + ); + // Create summary request with FULL history let summary_prompt = self.context_window.create_summary_prompt(); @@ -2239,41 +2495,26 @@ impl Agent { let provider = self.providers.get(None)?; - // Dynamically calculate max_tokens for summary based on what's left - let summary_max_tokens = match provider.name() { - "databricks" | "anthropic" => { - let model_limit = self.context_window.total_tokens; - let current_usage = self.context_window.used_tokens; - let available = model_limit - .saturating_sub(current_usage) - .saturating_sub(5000); - Some(available.min(10_000)) + // Determine if we need to disable thinking mode for this request + // Anthropic requires: max_tokens > thinking.budget_tokens + 1024 + let disable_thinking = self.get_thinking_budget_tokens().map_or(false, |budget| { + let minimum_for_thinking = budget + 1024; + let should_disable = summary_max_tokens <= minimum_for_thinking; + if should_disable { + tracing::warn!("Disabling thinking mode for summary: max_tokens ({}) <= minimum_for_thinking ({})", summary_max_tokens, minimum_for_thinking); } - "embedded" => { - let model_limit = self.context_window.total_tokens; - let current_usage = self.context_window.used_tokens; - let available = model_limit - .saturating_sub(current_usage) - .saturating_sub(1000); - Some(available.min(3000)) - } - _ => { - let available = self.context_window.remaining_tokens().saturating_sub(2000); - Some(available.min(5000)) - } - }; + should_disable + }); - debug!( - "Requesting summary with max_tokens: {:?} (current usage: {} tokens)", - summary_max_tokens, 
self.context_window.used_tokens - ); + tracing::debug!("Creating summary request: max_tokens={}, disable_thinking={}", summary_max_tokens, disable_thinking); let summary_request = CompletionRequest { messages: summary_messages, - max_tokens: summary_max_tokens, + max_tokens: Some(summary_max_tokens), temperature: Some(self.resolve_temperature(provider.name())), stream: false, tools: None, + disable_thinking, }; // Get the summary @@ -3234,6 +3475,32 @@ impl Agent { self.context_window.percentage_used() as u32 )); + let provider = self.providers.get(None)?; + let provider_name = provider.name().to_string(); + let _ = provider; // Release borrow early + + // Apply fallback sequence: thinnify -> skinnify -> hard-coded 5000 + let mut summary_max_tokens = self.apply_summary_fallback_sequence(&provider_name); + + // Apply provider-specific caps + // For Anthropic with thinking enabled, we need max_tokens > thinking.budget_tokens + // So we set a higher cap when thinking is configured + let anthropic_cap = match self.get_thinking_budget_tokens() { + Some(budget) => (budget + 2000).max(10_000), // At least budget + 2000 for response + None => 10_000, + }; + summary_max_tokens = match provider_name.as_str() { + "anthropic" => summary_max_tokens.min(anthropic_cap), + "databricks" => summary_max_tokens.min(10_000), + "embedded" => summary_max_tokens.min(3000), + _ => summary_max_tokens.min(5000), + }; + + debug!( + "Requesting summary with max_tokens: {} (current usage: {} tokens)", + summary_max_tokens, self.context_window.used_tokens + ); + // Create summary request with FULL history let summary_prompt = self.context_window.create_summary_prompt(); @@ -3262,85 +3529,26 @@ impl Agent { let provider = self.providers.get(None)?; - // Dynamically calculate max_tokens for summary based on what's left - // We need to ensure: used_tokens + max_tokens <= total_context_limit - let summary_max_tokens = match provider.name() { - "databricks" | "anthropic" => { - // Use the actual 
configured context window size - let model_limit = self.context_window.total_tokens; - let current_usage = self.context_window.used_tokens; - - // Check if we have enough capacity for summarization - if current_usage >= model_limit.saturating_sub(1000) { - error!("Context window at capacity ({}%), cannot summarize. Current: {}, Limit: {}", - self.context_window.percentage_used(), current_usage, model_limit); - return Err(anyhow::anyhow!("Context window at capacity. Try using /thinnify or /compact commands to reduce context size, or start a new session.")); - } - - // Leave buffer proportional to model size (min 1k, max 10k) - let buffer = (model_limit / 40).clamp(1000, 10000); // 2.5% buffer - let available = model_limit - .saturating_sub(current_usage) - .saturating_sub(buffer); - // Cap at a reasonable summary size (10k tokens max) - Some(available.min(10_000)) + // Determine if we need to disable thinking mode for this request + // Anthropic requires: max_tokens > thinking.budget_tokens + 1024 + let disable_thinking = self.get_thinking_budget_tokens().map_or(false, |budget| { + let minimum_for_thinking = budget + 1024; + let should_disable = summary_max_tokens <= minimum_for_thinking; + if should_disable { + tracing::warn!("Disabling thinking mode for summary: max_tokens ({}) <= minimum_for_thinking ({})", summary_max_tokens, minimum_for_thinking); } - "embedded" => { - // For smaller context models, be more conservative - let model_limit = self.context_window.total_tokens; - let current_usage = self.context_window.used_tokens; + should_disable + }); - // Check capacity for embedded models too - if current_usage >= model_limit.saturating_sub(500) { - error!( - "Embedded model context window at capacity ({}%)", - self.context_window.percentage_used() - ); - return Err(anyhow::anyhow!("Context window at capacity. 
Try using /thinnify command to reduce context size, or start a new session.")); - } - - // Leave 1k buffer - let available = model_limit - .saturating_sub(current_usage) - .saturating_sub(1000); - // Cap at 3k for embedded models - Some(available.min(3000)) - } - _ => { - // Default: conservative approach - let model_limit = self.context_window.total_tokens; - let current_usage = self.context_window.used_tokens; - - if current_usage >= model_limit.saturating_sub(1000) { - error!( - "Context window at capacity ({}%)", - self.context_window.percentage_used() - ); - return Err(anyhow::anyhow!("Context window at capacity. Try using /thinnify or /compact commands, or start a new session.")); - } - - let available = self.context_window.remaining_tokens().saturating_sub(2000); - Some(available.min(5000)) - } - }; - - debug!( - "Requesting summary with max_tokens: {:?} (current usage: {} tokens)", - summary_max_tokens, self.context_window.used_tokens - ); - - // Final safety check - if summary_max_tokens.unwrap_or(0) == 0 { - error!("No tokens available for summarization"); - return Err(anyhow::anyhow!("No context window capacity left for summarization. Use /thinnify to reduce context size or start a new session.")); - } + tracing::debug!("Creating auto-summary request: max_tokens={}, disable_thinking={}", summary_max_tokens, disable_thinking); let summary_request = CompletionRequest { messages: summary_messages, - max_tokens: summary_max_tokens, + max_tokens: Some(summary_max_tokens), temperature: Some(self.resolve_temperature(provider.name())), stream: false, tools: None, + disable_thinking, }; // Get the summary diff --git a/crates/g3-core/tests/test_preflight_max_tokens.rs b/crates/g3-core/tests/test_preflight_max_tokens.rs new file mode 100644 index 0000000..1b19873 --- /dev/null +++ b/crates/g3-core/tests/test_preflight_max_tokens.rs @@ -0,0 +1,188 @@ +//! Tests for the pre-flight max_tokens validation with thinking.budget_tokens constraint +//! +//! 
These tests verify that when using Anthropic with extended thinking enabled, +//! the max_tokens calculation properly accounts for the budget_tokens constraint. + +use g3_config::Config; +use g3_core::ContextWindow; + +/// Helper function to create a minimal config for testing +fn create_test_config_with_thinking(thinking_budget: Option<u32>) -> Config { + let mut config = Config::default(); + + // Set up Anthropic provider with optional thinking budget + config.providers.anthropic = Some(g3_config::AnthropicConfig { + api_key: "test-key".to_string(), + model: "claude-sonnet-4-5".to_string(), + max_tokens: Some(16000), + temperature: Some(0.1), + cache_config: None, + enable_1m_context: None, + thinking_budget_tokens: thinking_budget, + }); + + config.providers.default_provider = "anthropic".to_string(); + config +} + +/// Test that when thinking is disabled, max_tokens passes through unchanged +#[test] +fn test_no_thinking_budget_passes_through() { + let config = create_test_config_with_thinking(None); + + // Without thinking budget, any max_tokens should be fine + let proposed_max = 5000; + + // The constraint check would return (proposed_max, false) + // since there's no thinking_budget_tokens configured + assert!(config.providers.anthropic.as_ref().unwrap().thinking_budget_tokens.is_none()); +} + +/// Test that when max_tokens > budget_tokens + buffer, no reduction is needed +#[test] +fn test_sufficient_max_tokens_no_reduction_needed() { + let config = create_test_config_with_thinking(Some(10000)); + let budget_tokens = config.providers.anthropic.as_ref().unwrap().thinking_budget_tokens.unwrap(); + + // minimum_required = budget_tokens + 1024 = 11024 + let minimum_required = budget_tokens + 1024; + + // If proposed_max >= minimum_required, no reduction is needed + let proposed_max = 15000; + assert!(proposed_max >= minimum_required); +} + +/// Test that when max_tokens < budget_tokens + buffer, reduction is needed +#[test] +fn 
test_insufficient_max_tokens_needs_reduction() { + let config = create_test_config_with_thinking(Some(10000)); + let budget_tokens = config.providers.anthropic.as_ref().unwrap().thinking_budget_tokens.unwrap(); + + // minimum_required = budget_tokens + 1024 = 11024 + let minimum_required = budget_tokens + 1024; + + // If proposed_max < minimum_required, reduction IS needed + let proposed_max = 5000; + assert!(proposed_max < minimum_required); +} + +/// Test the minimum required calculation +#[test] +fn test_minimum_required_calculation() { + // For a budget of 10000, we need at least 11024 tokens + let budget_tokens = 10000u32; + let output_buffer = 1024u32; + let minimum_required = budget_tokens + output_buffer; + + assert_eq!(minimum_required, 11024); + + // For a larger budget + let budget_tokens = 32000u32; + let minimum_required = budget_tokens + output_buffer; + assert_eq!(minimum_required, 33024); +} + +/// Test context window usage calculation for summary max_tokens +#[test] +fn test_context_window_available_tokens() { + let mut context = ContextWindow::new(200000); // 200k context window + + // Simulate heavy usage + context.used_tokens = 180000; // 90% used + + let model_limit = context.total_tokens; + let current_usage = context.used_tokens; + + // 2.5% buffer calculation + let buffer = (model_limit / 40).clamp(1000, 10000); + assert_eq!(buffer, 5000); // 200000/40 = 5000 + + let available = model_limit + .saturating_sub(current_usage) + .saturating_sub(buffer); + + // 200000 - 180000 - 5000 = 15000 + assert_eq!(available, 15000); + + // Capped at 10000 for summary + let summary_max = available.min(10_000); + assert_eq!(summary_max, 10000); +} + +/// Test that when context is nearly full, available tokens may be below thinking budget +#[test] +fn test_context_nearly_full_triggers_reduction() { + let mut context = ContextWindow::new(200000); + + // Very heavy usage - 98% used + context.used_tokens = 196000; + + let model_limit = context.total_tokens; + 
let current_usage = context.used_tokens; + let buffer = (model_limit / 40).clamp(1000, 10000); // 5000 + + let available = model_limit + .saturating_sub(current_usage) + .saturating_sub(buffer); + + // 200000 - 196000 - 5000 = -1000 -> saturates to 0 + assert_eq!(available, 0); + + // With thinking_budget of 10000, this would definitely need reduction + let thinking_budget = 10000u32; + let minimum_required = thinking_budget + 1024; + assert!(available < minimum_required); +} + +/// Test the hard-coded fallback value +#[test] +fn test_hardcoded_fallback_value() { + // When all else fails, we use 5000 as the hard-coded max_tokens + let hardcoded_fallback = 5000u32; + + // This should be a reasonable value that Anthropic will accept + // even with thinking enabled (though output will be limited) + assert!(hardcoded_fallback > 0); + + // Note: With a 10000 thinking budget, 5000 is still below the + // minimum required (11024), but we send it anyway as a "last resort" + // hoping the API might still work for basic operations +} + +/// Test provider-specific caps +#[test] +fn test_provider_specific_caps() { + // Anthropic/Databricks: cap at 10000 + let anthropic_cap = 10000u32; + let proposed = 15000u32; + assert_eq!(proposed.min(anthropic_cap), 10000); + + // Embedded: cap at 3000 + let embedded_cap = 3000u32; + let proposed = 5000u32; + assert_eq!(proposed.min(embedded_cap), 3000); + + // Default: cap at 5000 + let default_cap = 5000u32; + let proposed = 8000u32; + assert_eq!(proposed.min(default_cap), 5000); +} + +/// Test that the error message mentions the thinking budget constraint +#[test] +fn test_error_message_content() { + // Verify the warning message format contains useful information + let proposed_max_tokens = 5000u32; + let budget_tokens = 10000u32; + let minimum_required = budget_tokens + 1024; + + let warning = format!( + "max_tokens ({}) is below required minimum ({}) for thinking.budget_tokens ({}). 
Context reduction needed.", + proposed_max_tokens, minimum_required, budget_tokens + ); + + assert!(warning.contains("5000")); + assert!(warning.contains("11024")); + assert!(warning.contains("10000")); + assert!(warning.contains("Context reduction needed")); +} diff --git a/crates/g3-planner/src/lib.rs b/crates/g3-planner/src/lib.rs index 8317b64..e23e87c 100644 --- a/crates/g3-planner/src/lib.rs +++ b/crates/g3-planner/src/lib.rs @@ -85,6 +85,7 @@ pub async fn get_initial_discovery_messages( temperature: Some(provider.temperature()), stream: false, tools: None, + disable_thinking: false, }; status("🤖 Calling LLM for discovery commands..."); diff --git a/crates/g3-providers/src/anthropic.rs b/crates/g3-providers/src/anthropic.rs index c2dc052..778827f 100644 --- a/crates/g3-providers/src/anthropic.rs +++ b/crates/g3-providers/src/anthropic.rs @@ -39,6 +39,7 @@ //! temperature: Some(0.7), //! stream: false, //! tools: None, +//! disable_thinking: false, //! }; //! //! // Get a completion @@ -75,6 +76,7 @@ //! temperature: Some(0.7), //! stream: true, //! tools: None, +//! disable_thinking: false, //! }; //! //! 
let mut stream = provider.stream(request).await?; @@ -272,6 +274,7 @@ impl AnthropicProvider { streaming: bool, max_tokens: u32, temperature: f32, + disable_thinking: bool, ) -> Result<AnthropicRequest> { let (system, anthropic_messages) = self.convert_messages(messages)?; @@ -284,10 +287,32 @@ impl AnthropicProvider { // Convert tools if provided let anthropic_tools = tools.map(|t| self.convert_tools(t)); - // Add thinking configuration if budget_tokens is set - let thinking = self.thinking_budget_tokens.map(|budget| { - ThinkingConfig::enabled(budget) - }); + // Add thinking configuration if budget_tokens is set AND max_tokens is sufficient AND not explicitly disabled + // Anthropic requires: max_tokens > thinking.budget_tokens + // We add 1024 as minimum buffer for actual response content + tracing::debug!("create_request_body called: max_tokens={}, disable_thinking={}, thinking_budget_tokens={:?}", max_tokens, disable_thinking, self.thinking_budget_tokens); + + let thinking = if disable_thinking { + tracing::info!( + "Thinking mode explicitly disabled for this request (max_tokens={})", + max_tokens + ); + None + } else { + self.thinking_budget_tokens.and_then(|budget| { + let min_required = budget + 1024; + if max_tokens > min_required { + Some(ThinkingConfig::enabled(budget)) + } else { + tracing::warn!( + "Disabling thinking mode: max_tokens ({}) is not greater than thinking.budget_tokens ({}) + 1024 buffer. 
\ + Required: max_tokens > {}", + max_tokens, budget, min_required + ); + None + } + }) + }; let request = AnthropicRequest { model: self.model.clone(), @@ -637,6 +662,7 @@ impl LLMProvider for AnthropicProvider { false, max_tokens, temperature, + request.disable_thinking, )?; debug!( @@ -710,6 +736,7 @@ impl LLMProvider for AnthropicProvider { true, max_tokens, temperature, + request.disable_thinking, )?; debug!( @@ -847,6 +874,12 @@ enum AnthropicContent { #[serde(skip_serializing_if = "Option::is_none")] cache_control: Option, }, + #[serde(rename = "thinking")] + Thinking { + thinking: String, + #[serde(default)] + signature: Option<String>, + }, #[serde(rename = "tool_use")] ToolUse { id: String, @@ -947,7 +980,7 @@ mod tests { let messages = vec![Message::new(MessageRole::User, "Test message".to_string())]; let request_body = provider - .create_request_body(&messages, None, false, 1000, 0.5) + .create_request_body(&messages, None, false, 1000, 0.5, false) .unwrap(); assert_eq!(request_body.model, "claude-3-haiku-20240307"); @@ -1053,16 +1086,17 @@ mod tests { let messages = vec![Message::new(MessageRole::User, "Test message".to_string())]; let request_without = provider_without - .create_request_body(&messages, None, false, 1000, 0.5) + .create_request_body(&messages, None, false, 1000, 0.5, false) .unwrap(); let json_without = serde_json::to_string(&request_without).unwrap(); assert!(!json_without.contains("thinking"), "JSON should not contain 'thinking' field when not configured"); - // Test WITH thinking parameter + // Test WITH thinking parameter - max_tokens must be > budget_tokens + 1024 + // Using budget=10000 requires max_tokens > 11024 let provider_with = AnthropicProvider::new( "test-key".to_string(), Some("claude-sonnet-4-5".to_string()), - Some(1000), + Some(20000), // Sufficient for thinking budget Some(0.5), None, None, @@ -1071,11 +1105,78 @@ mod tests { let request_with = provider_with - .create_request_body(&messages, None, false, 1000, 
0.5) + .create_request_body(&messages, None, false, 20000, 0.5, false) .unwrap(); let json_with = serde_json::to_string(&request_with).unwrap(); assert!(json_with.contains("thinking"), "JSON should contain 'thinking' field when configured"); assert!(json_with.contains("\"type\":\"enabled\""), "JSON should contain type: enabled"); assert!(json_with.contains("\"budget_tokens\":10000"), "JSON should contain budget_tokens: 10000"); + + // Test WITH thinking parameter but INSUFFICIENT max_tokens - thinking should be disabled + let request_insufficient = provider_with + .create_request_body(&messages, None, false, 5000, 0.5, false) // Less than budget + 1024 + .unwrap(); + let json_insufficient = serde_json::to_string(&request_insufficient).unwrap(); + assert!(!json_insufficient.contains("thinking"), "JSON should NOT contain 'thinking' field when max_tokens is insufficient"); + } + + #[test] + fn test_disable_thinking_flag() { + // Test that disable_thinking=true prevents thinking even with sufficient max_tokens + let provider = AnthropicProvider::new( + "test-key".to_string(), + Some("claude-sonnet-4-5".to_string()), + Some(20000), + Some(0.5), + None, + None, + Some(10000), // With thinking budget + ) + .unwrap(); + + let messages = vec![Message::new(MessageRole::User, "Test message".to_string())]; + + // With disable_thinking=false, thinking should be enabled (max_tokens is sufficient) + let request_with_thinking = provider + .create_request_body(&messages, None, false, 20000, 0.5, false) + .unwrap(); + let json_with = serde_json::to_string(&request_with_thinking).unwrap(); + assert!(json_with.contains("thinking"), "JSON should contain 'thinking' field when not disabled"); + + // With disable_thinking=true, thinking should be disabled even with sufficient max_tokens + let request_without_thinking = provider + .create_request_body(&messages, None, false, 20000, 0.5, true) + .unwrap(); + let json_without = serde_json::to_string(&request_without_thinking).unwrap(); + 
assert!(!json_without.contains("thinking"), "JSON should NOT contain 'thinking' field when explicitly disabled"); + } + + #[test] + fn test_thinking_content_block_deserialization() { + // Test that we can deserialize a response containing a "thinking" content block + // This is what Anthropic returns when extended thinking is enabled + let json_response = r#"{ + "content": [ + {"type": "thinking", "thinking": "Let me analyze this...", "signature": "abc123"}, + {"type": "text", "text": "Here is my response."} + ], + "model": "claude-sonnet-4-5", + "usage": {"input_tokens": 100, "output_tokens": 50} + }"#; + + let response: AnthropicResponse = serde_json::from_str(json_response) + .expect("Should be able to deserialize response with thinking block"); + + assert_eq!(response.content.len(), 2); + assert_eq!(response.model, "claude-sonnet-4-5"); + + // Extract only text content (thinking should be filtered out) + let text_content: Vec<_> = response.content.iter().filter_map(|c| match c { + AnthropicContent::Text { text, .. } => Some(text.as_str()), + _ => None, + }).collect(); + + assert_eq!(text_content.len(), 1); + assert_eq!(text_content[0], "Here is my response."); } } diff --git a/crates/g3-providers/src/databricks.rs b/crates/g3-providers/src/databricks.rs index c5e9dc6..848acdd 100644 --- a/crates/g3-providers/src/databricks.rs +++ b/crates/g3-providers/src/databricks.rs @@ -45,6 +45,7 @@ //! temperature: Some(0.7), //! stream: false, //! tools: None, +//! disable_thinking: false, //! }; //! //! // Get a completion diff --git a/crates/g3-providers/src/lib.rs b/crates/g3-providers/src/lib.rs index 180400d..0e088ce 100644 --- a/crates/g3-providers/src/lib.rs +++ b/crates/g3-providers/src/lib.rs @@ -42,6 +42,8 @@ pub struct CompletionRequest { pub temperature: Option<f32>, pub stream: bool, pub tools: Option<Vec<ToolDefinition>>, + /// Force disable thinking mode for this request (used when max_tokens is too low) + pub disable_thinking: bool, } #[derive(Debug, Clone, Serialize, Deserialize)]