Merge pull request #41 from dhanji/jochen-fix-max_tokens

Fix bugs where insufficient max_tokens were passed to LLM
This commit is contained in:
Jochen
2025-12-11 16:02:04 +11:00
committed by GitHub
6 changed files with 611 additions and 110 deletions

View File

@@ -1408,14 +1408,237 @@ impl<W: UiWriter> Agent<W> {
/// Resolve the max_tokens to use for a given provider, applying fallbacks /// Resolve the max_tokens to use for a given provider, applying fallbacks
fn resolve_max_tokens(&self, provider_name: &str) -> u32 { fn resolve_max_tokens(&self, provider_name: &str) -> u32 {
match provider_name { let base = match provider_name {
"databricks" => Self::provider_max_tokens(&self.config, "databricks") "databricks" => Self::provider_max_tokens(&self.config, "databricks")
.or(Some(self.config.agent.fallback_default_max_tokens as u32)) .or(Some(self.config.agent.fallback_default_max_tokens as u32))
.unwrap_or(32000), .unwrap_or(32000),
other => Self::provider_max_tokens(&self.config, other) other => Self::provider_max_tokens(&self.config, other)
.or(Some(self.config.agent.fallback_default_max_tokens as u32)) .or(Some(self.config.agent.fallback_default_max_tokens as u32))
.unwrap_or(16000), .unwrap_or(16000),
};
// For Anthropic with thinking enabled, ensure max_tokens is sufficient
// Anthropic requires: max_tokens > thinking.budget_tokens
if provider_name == "anthropic" {
if let Some(budget) = self.get_thinking_budget_tokens() {
let minimum_for_thinking = budget + 1024;
return base.max(minimum_for_thinking);
}
} }
base
}
/// Return the configured Anthropic thinking budget in tokens, if any.
///
/// Yields `None` when no Anthropic provider is configured, or when the
/// provider config leaves `thinking_budget_tokens` unset.
fn get_thinking_budget_tokens(&self) -> Option<u32> {
    match self.config.providers.anthropic.as_ref() {
        Some(anthropic_cfg) => anthropic_cfg.thinking_budget_tokens,
        None => None,
    }
}
/// Pre-flight check to validate and adjust max_tokens for the thinking.budget_tokens constraint.
/// Returns the adjusted max_tokens that satisfies: max_tokens > thinking.budget_tokens
/// Also returns whether we need to apply fallback actions (thinnify/skinnify).
///
/// Returns: (adjusted_max_tokens, needs_context_reduction)
fn preflight_validate_max_tokens(
    &self,
    provider_name: &str,
    proposed_max_tokens: u32,
) -> (u32, bool) {
    // The thinking constraint exists only for Anthropic with thinking enabled.
    if provider_name != "anthropic" {
        return (proposed_max_tokens, false);
    }
    let Some(budget_tokens) = self.get_thinking_budget_tokens() else {
        // No thinking configured — nothing to validate.
        return (proposed_max_tokens, false);
    };
    // Anthropic requires max_tokens > budget_tokens; reserve 1024 tokens of
    // output headroom on top of the thinking budget for actual response text.
    let minimum_required = budget_tokens + 1024;
    if proposed_max_tokens < minimum_required {
        // Too low — raise to the floor and signal that context reduction is needed.
        warn!(
            "max_tokens ({}) is below required minimum ({}) for thinking.budget_tokens ({}). Context reduction needed.",
            proposed_max_tokens, minimum_required, budget_tokens
        );
        (minimum_required, true)
    } else {
        // Sufficient headroom already.
        (proposed_max_tokens, false)
    }
}
/// Calculate max_tokens for a summary request, ensuring it satisfies the
/// thinking constraint (for Anthropic: max_tokens > thinking.budget_tokens).
///
/// Applies fallback sequence: thinnify -> skinnify -> hard-coded minimum
/// Returns (max_tokens, whether_fallback_was_used)
fn calculate_summary_max_tokens(
    &mut self,
    provider_name: &str,
) -> (u32, bool) {
    let model_limit = self.context_window.total_tokens;
    let current_usage = self.context_window.used_tokens;
    // Provider-configured ceiling for max_tokens.
    let configured_max_tokens = self.resolve_max_tokens(provider_name);
    // Leave a buffer proportional to the model size (2.5%, clamped 1k..10k).
    let buffer = (model_limit / 40).clamp(1000, 10000);
    let available = model_limit
        .saturating_sub(current_usage)
        .saturating_sub(buffer);
    // Use the smaller of the space actually left and the configured cap.
    let proposed_max_tokens = available.min(configured_max_tokens);
    // BUG FIX: previously, for Anthropic, the proposal was pre-raised here to
    // `budget + 1024` BEFORE calling preflight_validate_max_tokens. That made
    // the preflight check always report "no reduction needed" for Anthropic,
    // so the thinnify/skinnify fallback sequence could never trigger even when
    // the context window had no room left — and the request could claim more
    // tokens than actually fit. Let the preflight both adjust the value and
    // flag whether context reduction is required.
    self.preflight_validate_max_tokens(provider_name, proposed_max_tokens)
}
/// Apply the fallback sequence to free up context space for thinking budget.
/// Sequence: thinnify (first third) → skinnify (all) → hard-coded minimum
/// Returns the validated max_tokens that satisfies thinking.budget_tokens constraint.
///
/// `initial_max_tokens` is the caller's proposed value; `hard_coded_minimum`
/// is the last-resort value used when context reduction cannot create enough
/// headroom for the thinking budget.
fn apply_max_tokens_fallback_sequence(
    &mut self,
    provider_name: &str,
    initial_max_tokens: u32,
    hard_coded_minimum: u32,
) -> u32 {
    // Preflight: for Anthropic with thinking enabled this enforces
    // max_tokens > budget_tokens + 1024 and flags whether reduction is needed.
    let (mut max_tokens, needs_reduction) = self.preflight_validate_max_tokens(provider_name, initial_max_tokens);
    if !needs_reduction {
        return max_tokens;
    }
    self.ui_writer.print_context_status(
        "⚠️ Context window too full for thinking budget. Applying fallback sequence...\n",
    );
    // Step 1: Try thinnify (first third of context)
    self.ui_writer.print_context_status("🥒 Step 1: Trying thinnify...\n");
    let (thin_msg, thin_saved) = self.context_window.thin_context();
    self.thinning_events.push(thin_saved);
    self.ui_writer.print_context_thinning(&thin_msg);
    // Recalculate max_tokens after thinnify
    // NOTE(review): resolve_max_tokens appears to derive only from config,
    // not from current context usage, so this recalculation may return the
    // same value regardless of thinning — TODO confirm whether a
    // context-aware recalculation (like calculate_summary_max_tokens) was
    // intended here.
    let recalc_max = self.resolve_max_tokens(provider_name);
    let (new_max, still_needs_reduction) = self.preflight_validate_max_tokens(provider_name, recalc_max);
    max_tokens = new_max;
    if !still_needs_reduction {
        self.ui_writer.print_context_status(
            "✅ Thinnify resolved capacity issue. Continuing...\n",
        );
        return max_tokens;
    }
    // Step 2: Try skinnify (entire context)
    self.ui_writer.print_context_status("🦴 Step 2: Trying skinnify...\n");
    let (skinny_msg, skinny_saved) = self.context_window.thin_context_all();
    self.thinning_events.push(skinny_saved);
    self.ui_writer.print_context_thinning(&skinny_msg);
    // Recalculate max_tokens after skinnify
    // NOTE(review): same concern as above — the recalculation is config-derived.
    let recalc_max = self.resolve_max_tokens(provider_name);
    let (final_max, final_needs_reduction) = self.preflight_validate_max_tokens(provider_name, recalc_max);
    max_tokens = final_max;
    if !final_needs_reduction {
        self.ui_writer.print_context_status(
            "✅ Skinnify resolved capacity issue. Continuing...\n",
        );
        return max_tokens;
    }
    // Step 3: Nothing worked, use hard-coded minimum as last resort.
    // This value may still violate the thinking constraint; presumably the
    // provider layer then disables thinking for the request — TODO confirm.
    self.ui_writer.print_context_status(&format!(
        "⚠️ Step 3: Context reduction insufficient. Using hard-coded max_tokens={} as last resort...\n",
        hard_coded_minimum
    ));
    hard_coded_minimum
}
/// Apply the fallback sequence for summary requests to free up context space.
/// Uses calculate_summary_max_tokens for recalculation (based on available space).
/// Returns the validated max_tokens for summary requests.
fn apply_summary_fallback_sequence(
&mut self,
provider_name: &str,
) -> u32 {
let (mut summary_max_tokens, needs_reduction) = self.calculate_summary_max_tokens(provider_name);
if !needs_reduction {
return summary_max_tokens;
}
self.ui_writer.print_context_status(
"⚠️ Context window too full for thinking budget. Applying fallback sequence...\n",
);
// Step 1: Try thinnify (first third of context)
self.ui_writer.print_context_status("🥒 Step 1: Trying thinnify...\n");
let (thin_msg, thin_saved) = self.context_window.thin_context();
self.thinning_events.push(thin_saved);
self.ui_writer.print_context_thinning(&thin_msg);
// Recalculate max_tokens after thinnify
let (new_max, still_needs_reduction) = self.calculate_summary_max_tokens(provider_name);
summary_max_tokens = new_max;
if !still_needs_reduction {
self.ui_writer.print_context_status(
"✅ Thinnify resolved capacity issue. Continuing...\n",
);
return summary_max_tokens;
}
// Step 2: Try skinnify (entire context)
self.ui_writer.print_context_status("🦴 Step 2: Trying skinnify...\n");
let (skinny_msg, skinny_saved) = self.context_window.thin_context_all();
self.thinning_events.push(skinny_saved);
self.ui_writer.print_context_thinning(&skinny_msg);
// Recalculate max_tokens after skinnify
let (final_max, final_needs_reduction) = self.calculate_summary_max_tokens(provider_name);
summary_max_tokens = final_max;
if !final_needs_reduction {
self.ui_writer.print_context_status(
"✅ Skinnify resolved capacity issue. Continuing...\n",
);
return summary_max_tokens;
}
// Step 3: Nothing worked, use hard-coded minimum
self.ui_writer.print_context_status(
"⚠️ Step 3: Context reduction insufficient. Using hard-coded max_tokens=5000 as last resort...\n",
);
5000
} }
/// Resolve the temperature to use for a given provider, applying fallbacks /// Resolve the temperature to use for a given provider, applying fallbacks
@@ -1805,8 +2028,14 @@ impl<W: UiWriter> Agent<W> {
}; };
let _ = provider; // Drop the provider reference to avoid borrowing issues let _ = provider; // Drop the provider reference to avoid borrowing issues
// Get max_tokens from provider configuration, falling back to sensible defaults // Get max_tokens from provider configuration with preflight validation
let max_tokens = Some(self.resolve_max_tokens(&provider_name)); // This ensures max_tokens > thinking.budget_tokens for Anthropic with extended thinking
let initial_max_tokens = self.resolve_max_tokens(&provider_name);
let max_tokens = Some(self.apply_max_tokens_fallback_sequence(
&provider_name,
initial_max_tokens,
16000, // Hard-coded minimum for main API calls (higher than summary's 5000)
));
let request = CompletionRequest { let request = CompletionRequest {
messages, messages,
@@ -1814,6 +2043,7 @@ impl<W: UiWriter> Agent<W> {
temperature: Some(self.resolve_temperature(&provider_name)), temperature: Some(self.resolve_temperature(&provider_name)),
stream: true, // Enable streaming stream: true, // Enable streaming
tools, tools,
disable_thinking: false,
}; };
// Time the LLM call with cancellation support and streaming // Time the LLM call with cancellation support and streaming
@@ -2211,6 +2441,32 @@ impl<W: UiWriter> Agent<W> {
self.context_window.percentage_used() as u32 self.context_window.percentage_used() as u32
)); ));
let provider = self.providers.get(None)?;
let provider_name = provider.name().to_string();
let _ = provider; // Release borrow early
// Apply fallback sequence: thinnify -> skinnify -> hard-coded 5000
let mut summary_max_tokens = self.apply_summary_fallback_sequence(&provider_name);
// Apply provider-specific caps
// For Anthropic with thinking enabled, we need max_tokens > thinking.budget_tokens
// So we set a higher cap when thinking is configured
let anthropic_cap = match self.get_thinking_budget_tokens() {
Some(budget) => (budget + 2000).max(10_000), // At least budget + 2000 for response
None => 10_000,
};
summary_max_tokens = match provider_name.as_str() {
"anthropic" => summary_max_tokens.min(anthropic_cap),
"databricks" => summary_max_tokens.min(10_000),
"embedded" => summary_max_tokens.min(3000),
_ => summary_max_tokens.min(5000),
};
debug!(
"Requesting summary with max_tokens: {} (current usage: {} tokens)",
summary_max_tokens, self.context_window.used_tokens
);
// Create summary request with FULL history // Create summary request with FULL history
let summary_prompt = self.context_window.create_summary_prompt(); let summary_prompt = self.context_window.create_summary_prompt();
@@ -2239,41 +2495,26 @@ impl<W: UiWriter> Agent<W> {
let provider = self.providers.get(None)?; let provider = self.providers.get(None)?;
// Dynamically calculate max_tokens for summary based on what's left // Determine if we need to disable thinking mode for this request
let summary_max_tokens = match provider.name() { // Anthropic requires: max_tokens > thinking.budget_tokens + 1024
"databricks" | "anthropic" => { let disable_thinking = self.get_thinking_budget_tokens().map_or(false, |budget| {
let model_limit = self.context_window.total_tokens; let minimum_for_thinking = budget + 1024;
let current_usage = self.context_window.used_tokens; let should_disable = summary_max_tokens <= minimum_for_thinking;
let available = model_limit if should_disable {
.saturating_sub(current_usage) tracing::warn!("Disabling thinking mode for summary: max_tokens ({}) <= minimum_for_thinking ({})", summary_max_tokens, minimum_for_thinking);
.saturating_sub(5000);
Some(available.min(10_000))
} }
"embedded" => { should_disable
let model_limit = self.context_window.total_tokens; });
let current_usage = self.context_window.used_tokens;
let available = model_limit
.saturating_sub(current_usage)
.saturating_sub(1000);
Some(available.min(3000))
}
_ => {
let available = self.context_window.remaining_tokens().saturating_sub(2000);
Some(available.min(5000))
}
};
debug!( tracing::debug!("Creating summary request: max_tokens={}, disable_thinking={}", summary_max_tokens, disable_thinking);
"Requesting summary with max_tokens: {:?} (current usage: {} tokens)",
summary_max_tokens, self.context_window.used_tokens
);
let summary_request = CompletionRequest { let summary_request = CompletionRequest {
messages: summary_messages, messages: summary_messages,
max_tokens: summary_max_tokens, max_tokens: Some(summary_max_tokens),
temperature: Some(self.resolve_temperature(provider.name())), temperature: Some(self.resolve_temperature(provider.name())),
stream: false, stream: false,
tools: None, tools: None,
disable_thinking,
}; };
// Get the summary // Get the summary
@@ -3234,6 +3475,32 @@ impl<W: UiWriter> Agent<W> {
self.context_window.percentage_used() as u32 self.context_window.percentage_used() as u32
)); ));
let provider = self.providers.get(None)?;
let provider_name = provider.name().to_string();
let _ = provider; // Release borrow early
// Apply fallback sequence: thinnify -> skinnify -> hard-coded 5000
let mut summary_max_tokens = self.apply_summary_fallback_sequence(&provider_name);
// Apply provider-specific caps
// For Anthropic with thinking enabled, we need max_tokens > thinking.budget_tokens
// So we set a higher cap when thinking is configured
let anthropic_cap = match self.get_thinking_budget_tokens() {
Some(budget) => (budget + 2000).max(10_000), // At least budget + 2000 for response
None => 10_000,
};
summary_max_tokens = match provider_name.as_str() {
"anthropic" => summary_max_tokens.min(anthropic_cap),
"databricks" => summary_max_tokens.min(10_000),
"embedded" => summary_max_tokens.min(3000),
_ => summary_max_tokens.min(5000),
};
debug!(
"Requesting summary with max_tokens: {} (current usage: {} tokens)",
summary_max_tokens, self.context_window.used_tokens
);
// Create summary request with FULL history // Create summary request with FULL history
let summary_prompt = self.context_window.create_summary_prompt(); let summary_prompt = self.context_window.create_summary_prompt();
@@ -3262,85 +3529,26 @@ impl<W: UiWriter> Agent<W> {
let provider = self.providers.get(None)?; let provider = self.providers.get(None)?;
// Dynamically calculate max_tokens for summary based on what's left // Determine if we need to disable thinking mode for this request
// We need to ensure: used_tokens + max_tokens <= total_context_limit // Anthropic requires: max_tokens > thinking.budget_tokens + 1024
let summary_max_tokens = match provider.name() { let disable_thinking = self.get_thinking_budget_tokens().map_or(false, |budget| {
"databricks" | "anthropic" => { let minimum_for_thinking = budget + 1024;
// Use the actual configured context window size let should_disable = summary_max_tokens <= minimum_for_thinking;
let model_limit = self.context_window.total_tokens; if should_disable {
let current_usage = self.context_window.used_tokens; tracing::warn!("Disabling thinking mode for summary: max_tokens ({}) <= minimum_for_thinking ({})", summary_max_tokens, minimum_for_thinking);
// Check if we have enough capacity for summarization
if current_usage >= model_limit.saturating_sub(1000) {
error!("Context window at capacity ({}%), cannot summarize. Current: {}, Limit: {}",
self.context_window.percentage_used(), current_usage, model_limit);
return Err(anyhow::anyhow!("Context window at capacity. Try using /thinnify or /compact commands to reduce context size, or start a new session."));
}
// Leave buffer proportional to model size (min 1k, max 10k)
let buffer = (model_limit / 40).clamp(1000, 10000); // 2.5% buffer
let available = model_limit
.saturating_sub(current_usage)
.saturating_sub(buffer);
// Cap at a reasonable summary size (10k tokens max)
Some(available.min(10_000))
} }
"embedded" => { should_disable
// For smaller context models, be more conservative });
let model_limit = self.context_window.total_tokens;
let current_usage = self.context_window.used_tokens;
// Check capacity for embedded models too tracing::debug!("Creating auto-summary request: max_tokens={}, disable_thinking={}", summary_max_tokens, disable_thinking);
if current_usage >= model_limit.saturating_sub(500) {
error!(
"Embedded model context window at capacity ({}%)",
self.context_window.percentage_used()
);
return Err(anyhow::anyhow!("Context window at capacity. Try using /thinnify command to reduce context size, or start a new session."));
}
// Leave 1k buffer
let available = model_limit
.saturating_sub(current_usage)
.saturating_sub(1000);
// Cap at 3k for embedded models
Some(available.min(3000))
}
_ => {
// Default: conservative approach
let model_limit = self.context_window.total_tokens;
let current_usage = self.context_window.used_tokens;
if current_usage >= model_limit.saturating_sub(1000) {
error!(
"Context window at capacity ({}%)",
self.context_window.percentage_used()
);
return Err(anyhow::anyhow!("Context window at capacity. Try using /thinnify or /compact commands, or start a new session."));
}
let available = self.context_window.remaining_tokens().saturating_sub(2000);
Some(available.min(5000))
}
};
debug!(
"Requesting summary with max_tokens: {:?} (current usage: {} tokens)",
summary_max_tokens, self.context_window.used_tokens
);
// Final safety check
if summary_max_tokens.unwrap_or(0) == 0 {
error!("No tokens available for summarization");
return Err(anyhow::anyhow!("No context window capacity left for summarization. Use /thinnify to reduce context size or start a new session."));
}
let summary_request = CompletionRequest { let summary_request = CompletionRequest {
messages: summary_messages, messages: summary_messages,
max_tokens: summary_max_tokens, max_tokens: Some(summary_max_tokens),
temperature: Some(self.resolve_temperature(provider.name())), temperature: Some(self.resolve_temperature(provider.name())),
stream: false, stream: false,
tools: None, tools: None,
disable_thinking,
}; };
// Get the summary // Get the summary

View File

@@ -0,0 +1,188 @@
//! Tests for the pre-flight max_tokens validation with thinking.budget_tokens constraint
//!
//! These tests verify that when using Anthropic with extended thinking enabled,
//! the max_tokens calculation properly accounts for the budget_tokens constraint.
use g3_config::Config;
use g3_core::ContextWindow;
/// Build a minimal test `Config` with an Anthropic provider; `thinking_budget`
/// optionally enables extended thinking for that provider.
fn create_test_config_with_thinking(thinking_budget: Option<u32>) -> Config {
    let anthropic = g3_config::AnthropicConfig {
        api_key: "test-key".to_string(),
        model: "claude-sonnet-4-5".to_string(),
        max_tokens: Some(16000),
        temperature: Some(0.1),
        cache_config: None,
        enable_1m_context: None,
        thinking_budget_tokens: thinking_budget,
    };
    let mut config = Config::default();
    config.providers.anthropic = Some(anthropic);
    config.providers.default_provider = "anthropic".to_string();
    config
}
/// Test that when thinking is disabled, max_tokens passes through unchanged
#[test]
fn test_no_thinking_budget_passes_through() {
    let config = create_test_config_with_thinking(None);
    // With no thinking_budget_tokens configured, the preflight constraint
    // check would return (proposed_max, false) for any proposed value.
    let anthropic = config.providers.anthropic.as_ref().unwrap();
    assert!(anthropic.thinking_budget_tokens.is_none());
}
/// Test that when max_tokens > budget_tokens + buffer, no reduction is needed
#[test]
fn test_sufficient_max_tokens_no_reduction_needed() {
    let config = create_test_config_with_thinking(Some(10000));
    let budget = config
        .providers
        .anthropic
        .as_ref()
        .unwrap()
        .thinking_budget_tokens
        .unwrap();
    // Required floor: budget + 1024-token output buffer = 11024.
    let floor = budget + 1024;
    // A proposal of 15000 exceeds the floor, so no reduction is required.
    let proposed = 15000;
    assert!(proposed >= floor);
}
/// Test that when max_tokens < budget_tokens + buffer, reduction is needed
#[test]
fn test_insufficient_max_tokens_needs_reduction() {
    let config = create_test_config_with_thinking(Some(10000));
    let budget = config
        .providers
        .anthropic
        .as_ref()
        .unwrap()
        .thinking_budget_tokens
        .unwrap();
    // Required floor: budget + 1024-token output buffer = 11024.
    let floor = budget + 1024;
    // A proposal of 5000 falls short of the floor, so reduction IS required.
    let proposed = 5000;
    assert!(proposed < floor);
}
/// The minimum required max_tokens is the thinking budget plus a fixed
/// 1024-token output buffer.
#[test]
fn test_minimum_required_calculation() {
    const OUTPUT_BUFFER: u32 = 1024;
    // (budget, expected minimum) pairs for small and large budgets.
    for (budget, expected) in [(10000u32, 11024u32), (32000, 33024)] {
        assert_eq!(budget + OUTPUT_BUFFER, expected);
    }
}
/// Verify available-token math for a heavily used 200k context window.
#[test]
fn test_context_window_available_tokens() {
    let mut context = ContextWindow::new(200000); // 200k context window
    context.used_tokens = 180000; // 90% used
    // Buffer is 2.5% of the window, clamped to [1000, 10000].
    let buffer = (context.total_tokens / 40).clamp(1000, 10000);
    assert_eq!(buffer, 5000); // 200000 / 40 = 5000
    let available = context
        .total_tokens
        .saturating_sub(context.used_tokens)
        .saturating_sub(buffer);
    // 200000 - 180000 - 5000 = 15000
    assert_eq!(available, 15000);
    // Summary requests are capped at 10k tokens.
    assert_eq!(available.min(10_000), 10000);
}
/// A nearly full context can leave zero available tokens, which is
/// necessarily below any thinking-budget floor.
#[test]
fn test_context_nearly_full_triggers_reduction() {
    let mut context = ContextWindow::new(200000);
    context.used_tokens = 196000; // 98% used
    let buffer = (context.total_tokens / 40).clamp(1000, 10000); // 5000
    let available = context
        .total_tokens
        .saturating_sub(context.used_tokens)
        .saturating_sub(buffer);
    // 200000 - 196000 - 5000 would be negative; saturating_sub yields 0.
    assert_eq!(available, 0);
    // With a 10000-token thinking budget, the floor is 11024 — well above 0,
    // so context reduction would definitely be needed.
    let floor = 10000u32 + 1024;
    assert!(available < floor);
}
/// Document the last-resort hard-coded fallback of 5000 tokens.
#[test]
fn test_hardcoded_fallback_value() {
    let last_resort: u32 = 5000;
    // Must be a positive value Anthropic will accept even with thinking
    // enabled (output is limited, but the request remains valid).
    assert!(last_resort > 0);
    // NOTE: with a 10000-token thinking budget the floor is 11024, so 5000 is
    // still below it — it is sent anyway as a best-effort last resort, hoping
    // the API might still work for basic operations.
}
/// Provider-specific summary caps: Anthropic/Databricks 10k, embedded 3k,
/// default 5k.
#[test]
fn test_provider_specific_caps() {
    // (proposed, cap, expected) triples covering each provider tier.
    let cases = [
        (15000u32, 10000u32, 10000u32), // anthropic / databricks
        (5000, 3000, 3000),             // embedded
        (8000, 5000, 5000),             // default
    ];
    for (proposed, cap, expected) in cases {
        assert_eq!(proposed.min(cap), expected);
    }
}
/// The reduction warning should name the proposed, required, and budget values.
#[test]
fn test_error_message_content() {
    let proposed = 5000u32;
    let budget = 10000u32;
    let required = budget + 1024;
    let warning = format!(
        "max_tokens ({}) is below required minimum ({}) for thinking.budget_tokens ({}). Context reduction needed.",
        proposed, required, budget
    );
    for needle in ["5000", "11024", "10000", "Context reduction needed"] {
        assert!(warning.contains(needle));
    }
}

View File

@@ -85,6 +85,7 @@ pub async fn get_initial_discovery_messages(
temperature: Some(provider.temperature()), temperature: Some(provider.temperature()),
stream: false, stream: false,
tools: None, tools: None,
disable_thinking: false,
}; };
status("🤖 Calling LLM for discovery commands..."); status("🤖 Calling LLM for discovery commands...");

View File

@@ -39,6 +39,7 @@
//! temperature: Some(0.7), //! temperature: Some(0.7),
//! stream: false, //! stream: false,
//! tools: None, //! tools: None,
//! disable_thinking: false,
//! }; //! };
//! //!
//! // Get a completion //! // Get a completion
@@ -75,6 +76,7 @@
//! temperature: Some(0.7), //! temperature: Some(0.7),
//! stream: true, //! stream: true,
//! tools: None, //! tools: None,
//! disable_thinking: false,
//! }; //! };
//! //!
//! let mut stream = provider.stream(request).await?; //! let mut stream = provider.stream(request).await?;
@@ -272,6 +274,7 @@ impl AnthropicProvider {
streaming: bool, streaming: bool,
max_tokens: u32, max_tokens: u32,
temperature: f32, temperature: f32,
disable_thinking: bool,
) -> Result<AnthropicRequest> { ) -> Result<AnthropicRequest> {
let (system, anthropic_messages) = self.convert_messages(messages)?; let (system, anthropic_messages) = self.convert_messages(messages)?;
@@ -284,10 +287,32 @@ impl AnthropicProvider {
// Convert tools if provided // Convert tools if provided
let anthropic_tools = tools.map(|t| self.convert_tools(t)); let anthropic_tools = tools.map(|t| self.convert_tools(t));
// Add thinking configuration if budget_tokens is set // Add thinking configuration if budget_tokens is set AND max_tokens is sufficient AND not explicitly disabled
let thinking = self.thinking_budget_tokens.map(|budget| { // Anthropic requires: max_tokens > thinking.budget_tokens
ThinkingConfig::enabled(budget) // We add 1024 as minimum buffer for actual response content
}); tracing::debug!("create_request_body called: max_tokens={}, disable_thinking={}, thinking_budget_tokens={:?}", max_tokens, disable_thinking, self.thinking_budget_tokens);
let thinking = if disable_thinking {
tracing::info!(
"Thinking mode explicitly disabled for this request (max_tokens={})",
max_tokens
);
None
} else {
self.thinking_budget_tokens.and_then(|budget| {
let min_required = budget + 1024;
if max_tokens > min_required {
Some(ThinkingConfig::enabled(budget))
} else {
tracing::warn!(
"Disabling thinking mode: max_tokens ({}) is not greater than thinking.budget_tokens ({}) + 1024 buffer. \
Required: max_tokens > {}",
max_tokens, budget, min_required
);
None
}
})
};
let request = AnthropicRequest { let request = AnthropicRequest {
model: self.model.clone(), model: self.model.clone(),
@@ -637,6 +662,7 @@ impl LLMProvider for AnthropicProvider {
false, false,
max_tokens, max_tokens,
temperature, temperature,
request.disable_thinking,
)?; )?;
debug!( debug!(
@@ -710,6 +736,7 @@ impl LLMProvider for AnthropicProvider {
true, true,
max_tokens, max_tokens,
temperature, temperature,
request.disable_thinking,
)?; )?;
debug!( debug!(
@@ -847,6 +874,12 @@ enum AnthropicContent {
#[serde(skip_serializing_if = "Option::is_none")] #[serde(skip_serializing_if = "Option::is_none")]
cache_control: Option<crate::CacheControl>, cache_control: Option<crate::CacheControl>,
}, },
#[serde(rename = "thinking")]
Thinking {
thinking: String,
#[serde(default)]
signature: Option<String>,
},
#[serde(rename = "tool_use")] #[serde(rename = "tool_use")]
ToolUse { ToolUse {
id: String, id: String,
@@ -947,7 +980,7 @@ mod tests {
let messages = vec![Message::new(MessageRole::User, "Test message".to_string())]; let messages = vec![Message::new(MessageRole::User, "Test message".to_string())];
let request_body = provider let request_body = provider
.create_request_body(&messages, None, false, 1000, 0.5) .create_request_body(&messages, None, false, 1000, 0.5, false)
.unwrap(); .unwrap();
assert_eq!(request_body.model, "claude-3-haiku-20240307"); assert_eq!(request_body.model, "claude-3-haiku-20240307");
@@ -1053,16 +1086,17 @@ mod tests {
let messages = vec![Message::new(MessageRole::User, "Test message".to_string())]; let messages = vec![Message::new(MessageRole::User, "Test message".to_string())];
let request_without = provider_without let request_without = provider_without
.create_request_body(&messages, None, false, 1000, 0.5) .create_request_body(&messages, None, false, 1000, 0.5, false)
.unwrap(); .unwrap();
let json_without = serde_json::to_string(&request_without).unwrap(); let json_without = serde_json::to_string(&request_without).unwrap();
assert!(!json_without.contains("thinking"), "JSON should not contain 'thinking' field when not configured"); assert!(!json_without.contains("thinking"), "JSON should not contain 'thinking' field when not configured");
// Test WITH thinking parameter // Test WITH thinking parameter - max_tokens must be > budget_tokens + 1024
// Using budget=10000 requires max_tokens > 11024
let provider_with = AnthropicProvider::new( let provider_with = AnthropicProvider::new(
"test-key".to_string(), "test-key".to_string(),
Some("claude-sonnet-4-5".to_string()), Some("claude-sonnet-4-5".to_string()),
Some(1000), Some(20000), // Sufficient for thinking budget
Some(0.5), Some(0.5),
None, None,
None, None,
@@ -1071,11 +1105,78 @@ mod tests {
.unwrap(); .unwrap();
let request_with = provider_with let request_with = provider_with
.create_request_body(&messages, None, false, 1000, 0.5) .create_request_body(&messages, None, false, 20000, 0.5, false)
.unwrap(); .unwrap();
let json_with = serde_json::to_string(&request_with).unwrap(); let json_with = serde_json::to_string(&request_with).unwrap();
assert!(json_with.contains("thinking"), "JSON should contain 'thinking' field when configured"); assert!(json_with.contains("thinking"), "JSON should contain 'thinking' field when configured");
assert!(json_with.contains("\"type\":\"enabled\""), "JSON should contain type: enabled"); assert!(json_with.contains("\"type\":\"enabled\""), "JSON should contain type: enabled");
assert!(json_with.contains("\"budget_tokens\":10000"), "JSON should contain budget_tokens: 10000"); assert!(json_with.contains("\"budget_tokens\":10000"), "JSON should contain budget_tokens: 10000");
// Test WITH thinking parameter but INSUFFICIENT max_tokens - thinking should be disabled
let request_insufficient = provider_with
.create_request_body(&messages, None, false, 5000, 0.5, false) // Less than budget + 1024
.unwrap();
let json_insufficient = serde_json::to_string(&request_insufficient).unwrap();
assert!(!json_insufficient.contains("thinking"), "JSON should NOT contain 'thinking' field when max_tokens is insufficient");
}
#[test]
fn test_disable_thinking_flag() {
    // disable_thinking=true must win even when max_tokens would allow thinking.
    let provider = AnthropicProvider::new(
        "test-key".to_string(),
        Some("claude-sonnet-4-5".to_string()),
        Some(20000),
        Some(0.5),
        None,
        None,
        Some(10000), // thinking budget configured
    )
    .unwrap();
    let messages = vec![Message::new(MessageRole::User, "Test message".to_string())];
    // Not disabled + sufficient headroom => thinking config present.
    let enabled = provider
        .create_request_body(&messages, None, false, 20000, 0.5, false)
        .unwrap();
    let enabled_json = serde_json::to_string(&enabled).unwrap();
    assert!(enabled_json.contains("thinking"), "JSON should contain 'thinking' field when not disabled");
    // Explicitly disabled => thinking config absent despite sufficient headroom.
    let disabled = provider
        .create_request_body(&messages, None, false, 20000, 0.5, true)
        .unwrap();
    let disabled_json = serde_json::to_string(&disabled).unwrap();
    assert!(!disabled_json.contains("thinking"), "JSON should NOT contain 'thinking' field when explicitly disabled");
}
#[test]
fn test_thinking_content_block_deserialization() {
    // Anthropic returns a "thinking" content block when extended thinking is
    // enabled; the response must deserialize, and text extraction must skip
    // the thinking block.
    let json_response = r#"{
        "content": [
            {"type": "thinking", "thinking": "Let me analyze this...", "signature": "abc123"},
            {"type": "text", "text": "Here is my response."}
        ],
        "model": "claude-sonnet-4-5",
        "usage": {"input_tokens": 100, "output_tokens": 50}
    }"#;
    let response: AnthropicResponse = serde_json::from_str(json_response)
        .expect("Should be able to deserialize response with thinking block");
    assert_eq!(response.content.len(), 2);
    assert_eq!(response.model, "claude-sonnet-4-5");
    // Collect only text content; the thinking block is filtered out.
    let mut texts = Vec::new();
    for block in &response.content {
        if let AnthropicContent::Text { text, .. } = block {
            texts.push(text.as_str());
        }
    }
    assert_eq!(texts, vec!["Here is my response."]);
}
} }

View File

@@ -45,6 +45,7 @@
//! temperature: Some(0.7), //! temperature: Some(0.7),
//! stream: false, //! stream: false,
//! tools: None, //! tools: None,
//! disable_thinking: false,
//! }; //! };
//! //!
//! // Get a completion //! // Get a completion

View File

@@ -42,6 +42,8 @@ pub struct CompletionRequest {
pub temperature: Option<f32>, pub temperature: Option<f32>,
pub stream: bool, pub stream: bool,
pub tools: Option<Vec<Tool>>, pub tools: Option<Vec<Tool>>,
/// Force disable thinking mode for this request (used when max_tokens is too low)
pub disable_thinking: bool,
} }
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]