disable thinking if there is no token budget

2025-12-09 16:45:28 +11:00
parent 2283d9ddbf
commit 4aa84e2144
5 changed files with 97 additions and 8 deletions
--- a/crates/g3-core/src/lib.rs
+++ b/crates/g3-core/src/lib.rs
@@ -1494,8 +1494,18 @@ impl<W: UiWriter> Agent<W> {
        let available = model_limit
            .saturating_sub(current_usage)
            .saturating_sub(buffer);
-        // Use the smaller of available tokens or configured max_tokens
+        // Use the smaller of available tokens or configured max_tokens,
        // but ensure we don't go below thinking budget floor for Anthropic
        let proposed_max_tokens = available.min(configured_max_tokens);
        let proposed_max_tokens = if provider_name == "anthropic" {
            if let Some(budget) = self.get_thinking_budget_tokens() {
                proposed_max_tokens.max(budget + 1024)
            } else {
                proposed_max_tokens
            }
        } else {
            proposed_max_tokens
        };
        // Validate against thinking budget constraint
        let (adjusted, needs_reduction) = self.preflight_validate_max_tokens(provider_name, proposed_max_tokens);
@@ -2033,6 +2043,7 @@ impl<W: UiWriter> Agent<W> {
            temperature: Some(self.resolve_temperature(&provider_name)),
            stream: true, // Enable streaming
            tools,
            disable_thinking: false,
        };
        // Time the LLM call with cancellation support and streaming
@@ -2484,12 +2495,26 @@ impl<W: UiWriter> Agent<W> {
        let provider = self.providers.get(None)?;
        // Determine if we need to disable thinking mode for this request
        // Anthropic requires: max_tokens > thinking.budget_tokens + 1024
        let disable_thinking = self.get_thinking_budget_tokens().map_or(false, |budget| {
            let minimum_for_thinking = budget + 1024;
            let should_disable = summary_max_tokens <= minimum_for_thinking;
            if should_disable {
                tracing::warn!("Disabling thinking mode for summary: max_tokens ({}) <= minimum_for_thinking ({})", summary_max_tokens, minimum_for_thinking);
            }
            should_disable
        });
        tracing::debug!("Creating summary request: max_tokens={}, disable_thinking={}", summary_max_tokens, disable_thinking);
        let summary_request = CompletionRequest {
            messages: summary_messages,
            max_tokens: Some(summary_max_tokens),
            temperature: Some(self.resolve_temperature(provider.name())),
            stream: false,
            tools: None,
            disable_thinking,
        };
        // Get the summary
@@ -3504,12 +3529,26 @@ impl<W: UiWriter> Agent<W> {
                let provider = self.providers.get(None)?;
                // Determine if we need to disable thinking mode for this request
                // Anthropic requires: max_tokens > thinking.budget_tokens + 1024
                let disable_thinking = self.get_thinking_budget_tokens().map_or(false, |budget| {
                    let minimum_for_thinking = budget + 1024;
                    let should_disable = summary_max_tokens <= minimum_for_thinking;
                    if should_disable {
                        tracing::warn!("Disabling thinking mode for summary: max_tokens ({}) <= minimum_for_thinking ({})", summary_max_tokens, minimum_for_thinking);
                    }
                    should_disable
                });
                tracing::debug!("Creating auto-summary request: max_tokens={}, disable_thinking={}", summary_max_tokens, disable_thinking);
                let summary_request = CompletionRequest {
                    messages: summary_messages,
                    max_tokens: Some(summary_max_tokens),
                    temperature: Some(self.resolve_temperature(provider.name())),
                    stream: false,
                    tools: None,
                    disable_thinking,
                };
                // Get the summary
--- a/crates/g3-planner/src/lib.rs
+++ b/crates/g3-planner/src/lib.rs
@@ -85,6 +85,7 @@ pub async fn get_initial_discovery_messages(
        temperature: Some(provider.temperature()),
        stream: false,
        tools: None,
        disable_thinking: false,
    };
    status("🤖 Calling LLM for discovery commands...");
--- a/crates/g3-providers/src/anthropic.rs
+++ b/crates/g3-providers/src/anthropic.rs
@@ -39,6 +39,7 @@
 //!         temperature: Some(0.7),
 //!         stream: false,
 //!         tools: None,
 //!         disable_thinking: false,
 //!     };
 //!
 //!     // Get a completion
@@ -75,6 +76,7 @@
 //!         temperature: Some(0.7),
 //!         stream: true,
 //!         tools: None,
 //!         disable_thinking: false,
 //!     };
 //!
 //!     let mut stream = provider.stream(request).await?;
@@ -272,6 +274,7 @@ impl AnthropicProvider {
        streaming: bool,
        max_tokens: u32,
        temperature: f32,
        disable_thinking: bool,
    ) -> Result<AnthropicRequest> {
        let (system, anthropic_messages) = self.convert_messages(messages)?;
@@ -284,10 +287,19 @@ impl AnthropicProvider {
        // Convert tools if provided
        let anthropic_tools = tools.map(|t| self.convert_tools(t));
-        // Add thinking configuration if budget_tokens is set AND max_tokens is sufficient
+        // Add thinking configuration if budget_tokens is set AND max_tokens is sufficient AND not explicitly disabled
        // Anthropic requires: max_tokens > thinking.budget_tokens
        // We add 1024 as minimum buffer for actual response content
-        let thinking = self.thinking_budget_tokens.and_then(|budget| {
+        tracing::debug!("create_request_body called: max_tokens={}, disable_thinking={}, thinking_budget_tokens={:?}", max_tokens, disable_thinking, self.thinking_budget_tokens);
        let thinking = if disable_thinking {
            tracing::info!(
                "Thinking mode explicitly disabled for this request (max_tokens={})",
                max_tokens
            );
            None
        } else {
            self.thinking_budget_tokens.and_then(|budget| {
            let min_required = budget + 1024;
            if max_tokens > min_required {
                Some(ThinkingConfig::enabled(budget))
@@ -299,7 +311,8 @@ impl AnthropicProvider {
                );
                None
            }
-        });
+            })
        };
        let request = AnthropicRequest {
            model: self.model.clone(),
@@ -649,6 +662,7 @@ impl LLMProvider for AnthropicProvider {
            false,
            max_tokens,
            temperature,
            request.disable_thinking,
        )?;
        debug!(
@@ -722,6 +736,7 @@ impl LLMProvider for AnthropicProvider {
            true,
            max_tokens,
            temperature,
            request.disable_thinking,
        )?;
        debug!(
@@ -965,7 +980,7 @@ mod tests {
        let messages = vec![Message::new(MessageRole::User, "Test message".to_string())];
        let request_body = provider
-            .create_request_body(&messages, None, false, 1000, 0.5)
+            .create_request_body(&messages, None, false, 1000, 0.5, false)
            .unwrap();
        assert_eq!(request_body.model, "claude-3-haiku-20240307");
@@ -1071,7 +1086,7 @@ mod tests {
        let messages = vec![Message::new(MessageRole::User, "Test message".to_string())];
        let request_without = provider_without
-            .create_request_body(&messages, None, false, 1000, 0.5)
+            .create_request_body(&messages, None, false, 1000, 0.5, false)
            .unwrap();
        let json_without = serde_json::to_string(&request_without).unwrap();
        assert!(!json_without.contains("thinking"), "JSON should not contain 'thinking' field when not configured");
@@ -1090,7 +1105,7 @@ mod tests {
        .unwrap();
        let request_with = provider_with
-            .create_request_body(&messages, None, false, 20000, 0.5)
+            .create_request_body(&messages, None, false, 20000, 0.5, false)
            .unwrap();
        let json_with = serde_json::to_string(&request_with).unwrap();
        assert!(json_with.contains("thinking"), "JSON should contain 'thinking' field when configured");
@@ -1099,12 +1114,43 @@ mod tests {
        // Test WITH thinking parameter but INSUFFICIENT max_tokens - thinking should be disabled
        let request_insufficient = provider_with
-            .create_request_body(&messages, None, false, 5000, 0.5)  // Less than budget + 1024
+            .create_request_body(&messages, None, false, 5000, 0.5, false)  // Less than budget + 1024
            .unwrap();
        let json_insufficient = serde_json::to_string(&request_insufficient).unwrap();
        assert!(!json_insufficient.contains("thinking"), "JSON should NOT contain 'thinking' field when max_tokens is insufficient");
    }
    #[test]
    fn test_disable_thinking_flag() {
        // Test that disable_thinking=true prevents thinking even with sufficient max_tokens
        let provider = AnthropicProvider::new(
            "test-key".to_string(),
            Some("claude-sonnet-4-5".to_string()),
            Some(20000),
            Some(0.5),
            None,
            None,
            Some(10000), // With thinking budget
        )
        .unwrap();
        let messages = vec![Message::new(MessageRole::User, "Test message".to_string())];
        // With disable_thinking=false, thinking should be enabled (max_tokens is sufficient)
        let request_with_thinking = provider
            .create_request_body(&messages, None, false, 20000, 0.5, false)
            .unwrap();
        let json_with = serde_json::to_string(&request_with_thinking).unwrap();
        assert!(json_with.contains("thinking"), "JSON should contain 'thinking' field when not disabled");
        // With disable_thinking=true, thinking should be disabled even with sufficient max_tokens
        let request_without_thinking = provider
            .create_request_body(&messages, None, false, 20000, 0.5, true)
            .unwrap();
        let json_without = serde_json::to_string(&request_without_thinking).unwrap();
        assert!(!json_without.contains("thinking"), "JSON should NOT contain 'thinking' field when explicitly disabled");
    }
    #[test]
    fn test_thinking_content_block_deserialization() {
        // Test that we can deserialize a response containing a "thinking" content block
--- a/crates/g3-providers/src/databricks.rs
+++ b/crates/g3-providers/src/databricks.rs
@@ -45,6 +45,7 @@
 //!         temperature: Some(0.7),
 //!         stream: false,
 //!         tools: None,
 //!         disable_thinking: false,
 //!     };
 //!
 //!     // Get a completion
--- a/crates/g3-providers/src/lib.rs
+++ b/crates/g3-providers/src/lib.rs
@@ -42,6 +42,8 @@ pub struct CompletionRequest {
    pub temperature: Option<f32>,
    pub stream: bool,
    pub tools: Option<Vec<Tool>>,
    /// Force disable thinking mode for this request (used when max_tokens is too low)
    pub disable_thinking: bool,
 }
 #[derive(Debug, Clone, Serialize, Deserialize)]