From 4aa84e214480c274238ec40316fc4c7794db1b41 Mon Sep 17 00:00:00 2001 From: Jochen Date: Tue, 9 Dec 2025 16:45:28 +1100 Subject: [PATCH] disable thinking if there is no token budget --- crates/g3-core/src/lib.rs | 41 +++++++++++++++++- crates/g3-planner/src/lib.rs | 1 + crates/g3-providers/src/anthropic.rs | 60 +++++++++++++++++++++++---- crates/g3-providers/src/databricks.rs | 1 + crates/g3-providers/src/lib.rs | 2 + 5 files changed, 97 insertions(+), 8 deletions(-) diff --git a/crates/g3-core/src/lib.rs b/crates/g3-core/src/lib.rs index e012cc7..8320a36 100644 --- a/crates/g3-core/src/lib.rs +++ b/crates/g3-core/src/lib.rs @@ -1494,8 +1494,18 @@ impl Agent { let available = model_limit .saturating_sub(current_usage) .saturating_sub(buffer); - // Use the smaller of available tokens or configured max_tokens + // Use the smaller of available tokens or configured max_tokens, + // but ensure we don't go below thinking budget floor for Anthropic let proposed_max_tokens = available.min(configured_max_tokens); + let proposed_max_tokens = if provider_name == "anthropic" { + if let Some(budget) = self.get_thinking_budget_tokens() { + proposed_max_tokens.max(budget + 1024) + } else { + proposed_max_tokens + } + } else { + proposed_max_tokens + }; // Validate against thinking budget constraint let (adjusted, needs_reduction) = self.preflight_validate_max_tokens(provider_name, proposed_max_tokens); @@ -2033,6 +2043,7 @@ impl Agent { temperature: Some(self.resolve_temperature(&provider_name)), stream: true, // Enable streaming tools, + disable_thinking: false, }; // Time the LLM call with cancellation support and streaming @@ -2484,12 +2495,26 @@ impl Agent { let provider = self.providers.get(None)?; + // Determine if we need to disable thinking mode for this request + // Anthropic requires: max_tokens > thinking.budget_tokens + 1024 + let disable_thinking = self.get_thinking_budget_tokens().map_or(false, |budget| { + let minimum_for_thinking = budget + 1024; + let should_disable = summary_max_tokens <= minimum_for_thinking; + if should_disable { + tracing::warn!("Disabling thinking mode for summary: max_tokens ({}) <= minimum_for_thinking ({})", summary_max_tokens, minimum_for_thinking); + } + should_disable + }); + + tracing::debug!("Creating summary request: max_tokens={}, disable_thinking={}", summary_max_tokens, disable_thinking); + let summary_request = CompletionRequest { messages: summary_messages, max_tokens: Some(summary_max_tokens), temperature: Some(self.resolve_temperature(provider.name())), stream: false, tools: None, + disable_thinking, }; // Get the summary @@ -3504,12 +3529,26 @@ impl Agent { let provider = self.providers.get(None)?; + // Determine if we need to disable thinking mode for this request + // Anthropic requires: max_tokens > thinking.budget_tokens + 1024 + let disable_thinking = self.get_thinking_budget_tokens().map_or(false, |budget| { + let minimum_for_thinking = budget + 1024; + let should_disable = summary_max_tokens <= minimum_for_thinking; + if should_disable { + tracing::warn!("Disabling thinking mode for summary: max_tokens ({}) <= minimum_for_thinking ({})", summary_max_tokens, minimum_for_thinking); + } + should_disable + }); + + tracing::debug!("Creating auto-summary request: max_tokens={}, disable_thinking={}", summary_max_tokens, disable_thinking); + let summary_request = CompletionRequest { messages: summary_messages, max_tokens: Some(summary_max_tokens), temperature: Some(self.resolve_temperature(provider.name())), stream: false, tools: None, + disable_thinking, }; // Get the summary diff --git a/crates/g3-planner/src/lib.rs b/crates/g3-planner/src/lib.rs index 8317b64..e23e87c 100644 --- a/crates/g3-planner/src/lib.rs +++ b/crates/g3-planner/src/lib.rs @@ -85,6 +85,7 @@ pub async fn get_initial_discovery_messages( temperature: Some(provider.temperature()), stream: false, tools: None, + disable_thinking: false, }; status("🤖 Calling LLM for discovery commands..."); diff --git a/crates/g3-providers/src/anthropic.rs b/crates/g3-providers/src/anthropic.rs index b2d63c7..778827f 100644 --- a/crates/g3-providers/src/anthropic.rs +++ b/crates/g3-providers/src/anthropic.rs @@ -39,6 +39,7 @@ //! temperature: Some(0.7), //! stream: false, //! tools: None, +//! disable_thinking: false, //! }; //! //! // Get a completion @@ -75,6 +76,7 @@ //! temperature: Some(0.7), //! stream: true, //! tools: None, +//! disable_thinking: false, //! }; //! //! let mut stream = provider.stream(request).await?; @@ -272,6 +274,7 @@ impl AnthropicProvider { streaming: bool, max_tokens: u32, temperature: f32, + disable_thinking: bool, ) -> Result { let (system, anthropic_messages) = self.convert_messages(messages)?; @@ -284,10 +287,19 @@ impl AnthropicProvider { // Convert tools if provided let anthropic_tools = tools.map(|t| self.convert_tools(t)); - // Add thinking configuration if budget_tokens is set AND max_tokens is sufficient + // Add thinking configuration if budget_tokens is set AND max_tokens is sufficient AND not explicitly disabled // Anthropic requires: max_tokens > thinking.budget_tokens // We add 1024 as minimum buffer for actual response content - let thinking = self.thinking_budget_tokens.and_then(|budget| { + tracing::debug!("create_request_body called: max_tokens={}, disable_thinking={}, thinking_budget_tokens={:?}", max_tokens, disable_thinking, self.thinking_budget_tokens); + + let thinking = if disable_thinking { + tracing::info!( + "Thinking mode explicitly disabled for this request (max_tokens={})", + max_tokens + ); + None + } else { + self.thinking_budget_tokens.and_then(|budget| { let min_required = budget + 1024; if max_tokens > min_required { Some(ThinkingConfig::enabled(budget)) @@ -299,7 +311,8 @@ impl AnthropicProvider { ); None } - }); + }) + }; let request = AnthropicRequest { model: self.model.clone(), @@ -649,6 +662,7 @@ impl LLMProvider for AnthropicProvider { false, max_tokens, temperature, + request.disable_thinking, )?; debug!( @@ -722,6 +736,7 @@ impl LLMProvider for AnthropicProvider { true, max_tokens, temperature, + request.disable_thinking, )?; debug!( @@ -965,7 +980,7 @@ mod tests { let messages = vec![Message::new(MessageRole::User, "Test message".to_string())]; let request_body = provider - .create_request_body(&messages, None, false, 1000, 0.5) + .create_request_body(&messages, None, false, 1000, 0.5, false) .unwrap(); assert_eq!(request_body.model, "claude-3-haiku-20240307"); @@ -1071,7 +1086,7 @@ mod tests { let messages = vec![Message::new(MessageRole::User, "Test message".to_string())]; let request_without = provider_without - .create_request_body(&messages, None, false, 1000, 0.5) + .create_request_body(&messages, None, false, 1000, 0.5, false) .unwrap(); let json_without = serde_json::to_string(&request_without).unwrap(); assert!(!json_without.contains("thinking"), "JSON should not contain 'thinking' field when not configured"); @@ -1090,7 +1105,7 @@ mod tests { .unwrap(); let request_with = provider_with - .create_request_body(&messages, None, false, 20000, 0.5) + .create_request_body(&messages, None, false, 20000, 0.5, false) .unwrap(); let json_with = serde_json::to_string(&request_with).unwrap(); assert!(json_with.contains("thinking"), "JSON should contain 'thinking' field when configured"); @@ -1099,12 +1114,43 @@ mod tests { // Test WITH thinking parameter but INSUFFICIENT max_tokens - thinking should be disabled let request_insufficient = provider_with - .create_request_body(&messages, None, false, 5000, 0.5) // Less than budget + 1024 + .create_request_body(&messages, None, false, 5000, 0.5, false) // Less than budget + 1024 .unwrap(); let json_insufficient = serde_json::to_string(&request_insufficient).unwrap(); assert!(!json_insufficient.contains("thinking"), "JSON should NOT contain 'thinking' field when max_tokens is insufficient"); } + #[test] + fn test_disable_thinking_flag() { + // Test that disable_thinking=true prevents thinking even with sufficient max_tokens + let provider = AnthropicProvider::new( + "test-key".to_string(), + Some("claude-sonnet-4-5".to_string()), + Some(20000), + Some(0.5), + None, + None, + Some(10000), // With thinking budget + ) + .unwrap(); + + let messages = vec![Message::new(MessageRole::User, "Test message".to_string())]; + + // With disable_thinking=false, thinking should be enabled (max_tokens is sufficient) + let request_with_thinking = provider + .create_request_body(&messages, None, false, 20000, 0.5, false) + .unwrap(); + let json_with = serde_json::to_string(&request_with_thinking).unwrap(); + assert!(json_with.contains("thinking"), "JSON should contain 'thinking' field when not disabled"); + + // With disable_thinking=true, thinking should be disabled even with sufficient max_tokens + let request_without_thinking = provider + .create_request_body(&messages, None, false, 20000, 0.5, true) + .unwrap(); + let json_without = serde_json::to_string(&request_without_thinking).unwrap(); + assert!(!json_without.contains("thinking"), "JSON should NOT contain 'thinking' field when explicitly disabled"); + } + #[test] fn test_thinking_content_block_deserialization() { // Test that we can deserialize a response containing a "thinking" content block diff --git a/crates/g3-providers/src/databricks.rs b/crates/g3-providers/src/databricks.rs index c5e9dc6..848acdd 100644 --- a/crates/g3-providers/src/databricks.rs +++ b/crates/g3-providers/src/databricks.rs @@ -45,6 +45,7 @@ //! temperature: Some(0.7), //! stream: false, //! tools: None, +//! disable_thinking: false, //! }; //! //! // Get a completion diff --git a/crates/g3-providers/src/lib.rs b/crates/g3-providers/src/lib.rs index 180400d..0e088ce 100644 --- a/crates/g3-providers/src/lib.rs +++ b/crates/g3-providers/src/lib.rs @@ -42,6 +42,8 @@ pub struct CompletionRequest { pub temperature: Option, pub stream: bool, pub tools: Option>, + /// Force disable thinking mode for this request (used when max_tokens is too low) + pub disable_thinking: bool, } #[derive(Debug, Clone, Serialize, Deserialize)]