fix for thinking budget and hardcoded max token on summary

2025-12-09 12:41:52 +11:00
parent 696c441a47
commit fb2cf6f898
2 changed files with 82 additions and 9 deletions
--- a/crates/g3-core/src/lib.rs
+++ b/crates/g3-core/src/lib.rs
@@ -1475,12 +1475,16 @@ impl<W: UiWriter> Agent<W> {
        let model_limit = self.context_window.total_tokens;
        let current_usage = self.context_window.used_tokens;
        // Get the configured max_tokens for this provider
        let configured_max_tokens = self.resolve_max_tokens(provider_name);
        // Calculate available tokens with buffer
        let buffer = (model_limit / 40).clamp(1000, 10000); // 2.5% buffer
        let available = model_limit
            .saturating_sub(current_usage)
            .saturating_sub(buffer);
-        let proposed_max_tokens = available.min(10_000);
+        // Use the smaller of available tokens or configured max_tokens
        let proposed_max_tokens = available.min(configured_max_tokens);
        // Validate against thinking budget constraint
        let (adjusted, needs_reduction) = self.preflight_validate_max_tokens(provider_name, proposed_max_tokens);
@@ -2423,8 +2427,15 @@ impl<W: UiWriter> Agent<W> {
        let mut summary_max_tokens = self.apply_summary_fallback_sequence(&provider_name);
        // Apply provider-specific caps
        // For Anthropic with thinking enabled, we need max_tokens > thinking.budget_tokens
        // So we set a higher cap when thinking is configured
        let anthropic_cap = match self.get_thinking_budget_tokens() {
            Some(budget) => (budget + 2000).max(10_000), // At least budget + 2000 for response
            None => 10_000,
        };
        summary_max_tokens = match provider_name.as_str() {
-            "databricks" | "anthropic" => summary_max_tokens.min(10_000),
+            "anthropic" => summary_max_tokens.min(anthropic_cap),
            "databricks" => summary_max_tokens.min(10_000),
            "embedded" => summary_max_tokens.min(3000),
            _ => summary_max_tokens.min(5000),
        };
@@ -3436,8 +3447,15 @@ impl<W: UiWriter> Agent<W> {
                let mut summary_max_tokens = self.apply_summary_fallback_sequence(&provider_name);
                // Apply provider-specific caps
                // For Anthropic with thinking enabled, we need max_tokens > thinking.budget_tokens
                // So we set a higher cap when thinking is configured
                let anthropic_cap = match self.get_thinking_budget_tokens() {
                    Some(budget) => (budget + 2000).max(10_000), // At least budget + 2000 for response
                    None => 10_000,
                };
                summary_max_tokens = match provider_name.as_str() {
-                    "databricks" | "anthropic" => summary_max_tokens.min(10_000),
+                    "anthropic" => summary_max_tokens.min(anthropic_cap),
                    "databricks" => summary_max_tokens.min(10_000),
                    "embedded" => summary_max_tokens.min(3000),
                    _ => summary_max_tokens.min(5000),
                };
--- a/crates/g3-providers/src/anthropic.rs
+++ b/crates/g3-providers/src/anthropic.rs
@@ -284,9 +284,21 @@ impl AnthropicProvider {
        // Convert tools if provided
        let anthropic_tools = tools.map(|t| self.convert_tools(t));
-        // Add thinking configuration if budget_tokens is set
+        // Add thinking configuration if budget_tokens is set AND max_tokens is sufficient
-        let thinking = self.thinking_budget_tokens.map(|budget| {
+        // Anthropic requires: max_tokens > thinking.budget_tokens
-            ThinkingConfig::enabled(budget)
+        // We add 1024 as minimum buffer for actual response content
        let thinking = self.thinking_budget_tokens.and_then(|budget| {
            let min_required = budget + 1024;
            if max_tokens > min_required {
                Some(ThinkingConfig::enabled(budget))
            } else {
                tracing::warn!(
                    "Disabling thinking mode: max_tokens ({}) is not greater than thinking.budget_tokens ({}) + 1024 buffer. \
                     Required: max_tokens > {}",
                    max_tokens, budget, min_required
                );
                None
            }
        });
        let request = AnthropicRequest {
@@ -847,6 +859,12 @@ enum AnthropicContent {
        #[serde(skip_serializing_if = "Option::is_none")]
        cache_control: Option<crate::CacheControl>,
    },
    #[serde(rename = "thinking")]
    Thinking {
        thinking: String,
        #[serde(default)]
        signature: Option<String>,
    },
    #[serde(rename = "tool_use")]
    ToolUse {
        id: String,
@@ -1058,11 +1076,12 @@ mod tests {
        let json_without = serde_json::to_string(&request_without).unwrap();
        assert!(!json_without.contains("thinking"), "JSON should not contain 'thinking' field when not configured");
-        // Test WITH thinking parameter
+        // Test WITH thinking parameter - max_tokens must be > budget_tokens + 1024
        // Using budget=10000 requires max_tokens > 11024
        let provider_with = AnthropicProvider::new(
            "test-key".to_string(),
            Some("claude-sonnet-4-5".to_string()),
-            Some(1000),
+            Some(20000),  // Sufficient for thinking budget
            Some(0.5),
            None,
            None,
@@ -1071,11 +1090,47 @@ mod tests {
        .unwrap();
        let request_with = provider_with
-            .create_request_body(&messages, None, false, 1000, 0.5)
+            .create_request_body(&messages, None, false, 20000, 0.5)
            .unwrap();
        let json_with = serde_json::to_string(&request_with).unwrap();
        assert!(json_with.contains("thinking"), "JSON should contain 'thinking' field when configured");
        assert!(json_with.contains("\"type\":\"enabled\""), "JSON should contain type: enabled");
        assert!(json_with.contains("\"budget_tokens\":10000"), "JSON should contain budget_tokens: 10000");
        // Test WITH thinking parameter but INSUFFICIENT max_tokens - thinking should be disabled
        let request_insufficient = provider_with
            .create_request_body(&messages, None, false, 5000, 0.5)  // Less than budget + 1024
            .unwrap();
        let json_insufficient = serde_json::to_string(&request_insufficient).unwrap();
        assert!(!json_insufficient.contains("thinking"), "JSON should NOT contain 'thinking' field when max_tokens is insufficient");
    }
    #[test]
    fn test_thinking_content_block_deserialization() {
        // Test that we can deserialize a response containing a "thinking" content block
        // This is what Anthropic returns when extended thinking is enabled
        let json_response = r#"{
            "content": [
                {"type": "thinking", "thinking": "Let me analyze this...", "signature": "abc123"},
                {"type": "text", "text": "Here is my response."}
            ],
            "model": "claude-sonnet-4-5",
            "usage": {"input_tokens": 100, "output_tokens": 50}
        }"#;
        let response: AnthropicResponse = serde_json::from_str(json_response)
            .expect("Should be able to deserialize response with thinking block");
        assert_eq!(response.content.len(), 2);
        assert_eq!(response.model, "claude-sonnet-4-5");
        // Extract only text content (thinking should be filtered out)
        let text_content: Vec<_> = response.content.iter().filter_map(|c| match c {
            AnthropicContent::Text { text, .. } => Some(text.as_str()),
            _ => None,
        }).collect();
        assert_eq!(text_content.len(), 1);
        assert_eq!(text_content[0], "Here is my response.");
    }
 }