From 4aa84e214480c274238ec40316fc4c7794db1b41 Mon Sep 17 00:00:00 2001
From: Jochen <jochenx@bekmann.com>
Date: Tue, 9 Dec 2025 16:45:28 +1100
Subject: [PATCH] disable thinking if there is no token budget

---
 crates/g3-core/src/lib.rs             | 41 +++++++++++++++++-
 crates/g3-planner/src/lib.rs          |  1 +
 crates/g3-providers/src/anthropic.rs  | 60 +++++++++++++++++++++++----
 crates/g3-providers/src/databricks.rs |  1 +
 crates/g3-providers/src/lib.rs        |  2 +
 5 files changed, 97 insertions(+), 8 deletions(-)
diff --git a/crates/g3-core/src/lib.rs b/crates/g3-core/src/lib.rs
index e012cc7..8320a36 100644
--- a/crates/g3-core/src/lib.rs
+++ b/crates/g3-core/src/lib.rs
@@ -1494,8 +1494,18 @@ impl<W: UiWriter> Agent<W> {
         let available = model_limit
             .saturating_sub(current_usage)
             .saturating_sub(buffer);
-        // Use the smaller of available tokens or configured max_tokens
+        // Use the smaller of available tokens or configured max_tokens,
+        // but ensure we don't go below thinking budget floor for Anthropic
         let proposed_max_tokens = available.min(configured_max_tokens);
+        let proposed_max_tokens = if provider_name == "anthropic" {
+            if let Some(budget) = self.get_thinking_budget_tokens() {
+                proposed_max_tokens.max(budget + 1024)
+            } else {
+                proposed_max_tokens
+            }
+        } else {
+            proposed_max_tokens
+        };
 
         // Validate against thinking budget constraint
         let (adjusted, needs_reduction) = self.preflight_validate_max_tokens(provider_name, proposed_max_tokens);
@@ -2033,6 +2043,7 @@ impl<W: UiWriter> Agent<W> {
             temperature: Some(self.resolve_temperature(&provider_name)),
             stream: true, // Enable streaming
             tools,
+            disable_thinking: false,
         };
 
         // Time the LLM call with cancellation support and streaming
@@ -2484,12 +2495,26 @@ impl<W: UiWriter> Agent<W> {
 
         let provider = self.providers.get(None)?;
 
+        // Determine if we need to disable thinking mode for this request
+        // Anthropic requires: max_tokens > thinking.budget_tokens + 1024
+        let disable_thinking = self.get_thinking_budget_tokens().map_or(false, |budget| {
+            let minimum_for_thinking = budget + 1024;
+            let should_disable = summary_max_tokens <= minimum_for_thinking;
+            if should_disable {
+                tracing::warn!("Disabling thinking mode for summary: max_tokens ({}) <= minimum_for_thinking ({})", summary_max_tokens, minimum_for_thinking);
+            }
+            should_disable
+        });
+
+        tracing::debug!("Creating summary request: max_tokens={}, disable_thinking={}", summary_max_tokens, disable_thinking);
+
         let summary_request = CompletionRequest {
             messages: summary_messages,
             max_tokens: Some(summary_max_tokens),
             temperature: Some(self.resolve_temperature(provider.name())),
             stream: false,
             tools: None,
+            disable_thinking,
         };
 
         // Get the summary
@@ -3504,12 +3529,26 @@ impl<W: UiWriter> Agent<W> {
 
                 let provider = self.providers.get(None)?;
 
+                // Determine if we need to disable thinking mode for this request
+                // Anthropic requires: max_tokens > thinking.budget_tokens + 1024
+                let disable_thinking = self.get_thinking_budget_tokens().map_or(false, |budget| {
+                    let minimum_for_thinking = budget + 1024;
+                    let should_disable = summary_max_tokens <= minimum_for_thinking;
+                    if should_disable {
+                        tracing::warn!("Disabling thinking mode for summary: max_tokens ({}) <= minimum_for_thinking ({})", summary_max_tokens, minimum_for_thinking);
+                    }
+                    should_disable
+                });
+
+                tracing::debug!("Creating auto-summary request: max_tokens={}, disable_thinking={}", summary_max_tokens, disable_thinking);
+
                 let summary_request = CompletionRequest {
                     messages: summary_messages,
                     max_tokens: Some(summary_max_tokens),
                     temperature: Some(self.resolve_temperature(provider.name())),
                     stream: false,
                     tools: None,
+                    disable_thinking,
                 };
 
                 // Get the summary
diff --git a/crates/g3-planner/src/lib.rs b/crates/g3-planner/src/lib.rs
index 8317b64..e23e87c 100644
--- a/crates/g3-planner/src/lib.rs
+++ b/crates/g3-planner/src/lib.rs
@@ -85,6 +85,7 @@ pub async fn get_initial_discovery_messages(
         temperature: Some(provider.temperature()),
         stream: false,
         tools: None,
+        disable_thinking: false,
     };
 
     status("🤖 Calling LLM for discovery commands...");
diff --git a/crates/g3-providers/src/anthropic.rs b/crates/g3-providers/src/anthropic.rs
index b2d63c7..778827f 100644
--- a/crates/g3-providers/src/anthropic.rs
+++ b/crates/g3-providers/src/anthropic.rs
@@ -39,6 +39,7 @@
 //!         temperature: Some(0.7),
 //!         stream: false,
 //!         tools: None,
+//!         disable_thinking: false,
 //!     };
 //!
 //!     // Get a completion
@@ -75,6 +76,7 @@
 //!         temperature: Some(0.7),
 //!         stream: true,
 //!         tools: None,
+//!         disable_thinking: false,
 //!     };
 //!
 //!     let mut stream = provider.stream(request).await?;
@@ -272,6 +274,7 @@ impl AnthropicProvider {
         streaming: bool,
         max_tokens: u32,
         temperature: f32,
+        disable_thinking: bool,
     ) -> Result<AnthropicRequest> {
         let (system, anthropic_messages) = self.convert_messages(messages)?;
 
@@ -284,10 +287,19 @@ impl AnthropicProvider {
         // Convert tools if provided
         let anthropic_tools = tools.map(|t| self.convert_tools(t));
 
-        // Add thinking configuration if budget_tokens is set AND max_tokens is sufficient
+        // Add thinking configuration if budget_tokens is set AND max_tokens is sufficient AND not explicitly disabled
         // Anthropic requires: max_tokens > thinking.budget_tokens
         // We add 1024 as minimum buffer for actual response content
-        let thinking = self.thinking_budget_tokens.and_then(|budget| {
+        tracing::debug!("create_request_body called: max_tokens={}, disable_thinking={}, thinking_budget_tokens={:?}", max_tokens, disable_thinking, self.thinking_budget_tokens);
+
+        let thinking = if disable_thinking {
+            tracing::info!(
+                "Thinking mode explicitly disabled for this request (max_tokens={})",
+                max_tokens
+            );
+            None
+        } else {
+            self.thinking_budget_tokens.and_then(|budget| {
             let min_required = budget + 1024;
             if max_tokens > min_required {
                 Some(ThinkingConfig::enabled(budget))
@@ -299,7 +311,8 @@ impl AnthropicProvider {
                 );
                 None
             }
-        });
+            })
+        };
 
         let request = AnthropicRequest {
             model: self.model.clone(),
@@ -649,6 +662,7 @@ impl LLMProvider for AnthropicProvider {
             false,
             max_tokens,
             temperature,
+            request.disable_thinking,
         )?;
 
         debug!(
@@ -722,6 +736,7 @@ impl LLMProvider for AnthropicProvider {
             true,
             max_tokens,
             temperature,
+            request.disable_thinking,
         )?;
 
         debug!(
@@ -965,7 +980,7 @@ mod tests {
         let messages = vec![Message::new(MessageRole::User, "Test message".to_string())];
 
         let request_body = provider
-            .create_request_body(&messages, None, false, 1000, 0.5)
+            .create_request_body(&messages, None, false, 1000, 0.5, false)
             .unwrap();
 
         assert_eq!(request_body.model, "claude-3-haiku-20240307");
@@ -1071,7 +1086,7 @@ mod tests {
 
         let messages = vec![Message::new(MessageRole::User, "Test message".to_string())];
         let request_without = provider_without
-            .create_request_body(&messages, None, false, 1000, 0.5)
+            .create_request_body(&messages, None, false, 1000, 0.5, false)
             .unwrap();
         let json_without = serde_json::to_string(&request_without).unwrap();
         assert!(!json_without.contains("thinking"), "JSON should not contain 'thinking' field when not configured");
@@ -1090,7 +1105,7 @@ mod tests {
         .unwrap();
 
         let request_with = provider_with
-            .create_request_body(&messages, None, false, 20000, 0.5)
+            .create_request_body(&messages, None, false, 20000, 0.5, false)
             .unwrap();
         let json_with = serde_json::to_string(&request_with).unwrap();
         assert!(json_with.contains("thinking"), "JSON should contain 'thinking' field when configured");
@@ -1099,12 +1114,43 @@ mod tests {
 
         // Test WITH thinking parameter but INSUFFICIENT max_tokens - thinking should be disabled
         let request_insufficient = provider_with
-            .create_request_body(&messages, None, false, 5000, 0.5)  // Less than budget + 1024
+            .create_request_body(&messages, None, false, 5000, 0.5, false)  // Less than budget + 1024
             .unwrap();
         let json_insufficient = serde_json::to_string(&request_insufficient).unwrap();
         assert!(!json_insufficient.contains("thinking"), "JSON should NOT contain 'thinking' field when max_tokens is insufficient");
     }
 
+    #[test]
+    fn test_disable_thinking_flag() {
+        // Test that disable_thinking=true prevents thinking even with sufficient max_tokens
+        let provider = AnthropicProvider::new(
+            "test-key".to_string(),
+            Some("claude-sonnet-4-5".to_string()),
+            Some(20000),
+            Some(0.5),
+            None,
+            None,
+            Some(10000), // With thinking budget
+        )
+        .unwrap();
+
+        let messages = vec![Message::new(MessageRole::User, "Test message".to_string())];
+        
+        // With disable_thinking=false, thinking should be enabled (max_tokens is sufficient)
+        let request_with_thinking = provider
+            .create_request_body(&messages, None, false, 20000, 0.5, false)
+            .unwrap();
+        let json_with = serde_json::to_string(&request_with_thinking).unwrap();
+        assert!(json_with.contains("thinking"), "JSON should contain 'thinking' field when not disabled");
+
+        // With disable_thinking=true, thinking should be disabled even with sufficient max_tokens
+        let request_without_thinking = provider
+            .create_request_body(&messages, None, false, 20000, 0.5, true)
+            .unwrap();
+        let json_without = serde_json::to_string(&request_without_thinking).unwrap();
+        assert!(!json_without.contains("thinking"), "JSON should NOT contain 'thinking' field when explicitly disabled");
+    }
+
     #[test]
     fn test_thinking_content_block_deserialization() {
         // Test that we can deserialize a response containing a "thinking" content block
diff --git a/crates/g3-providers/src/databricks.rs b/crates/g3-providers/src/databricks.rs
index c5e9dc6..848acdd 100644
--- a/crates/g3-providers/src/databricks.rs
+++ b/crates/g3-providers/src/databricks.rs
@@ -45,6 +45,7 @@
 //!         temperature: Some(0.7),
 //!         stream: false,
 //!         tools: None,
+//!         disable_thinking: false,
 //!     };
 //!
 //!     // Get a completion
diff --git a/crates/g3-providers/src/lib.rs b/crates/g3-providers/src/lib.rs
index 180400d..0e088ce 100644
--- a/crates/g3-providers/src/lib.rs
+++ b/crates/g3-providers/src/lib.rs
@@ -42,6 +42,8 @@ pub struct CompletionRequest {
     pub temperature: Option<f32>,
     pub stream: bool,
     pub tools: Option<Vec<Tool>>,
+    /// Force disable thinking mode for this request (used when max_tokens is too low)
+    pub disable_thinking: bool,
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize)]