Fix thinking-budget validation and the hardcoded max_tokens cap on summary requests
This commit is contained in:
@@ -1475,12 +1475,16 @@ impl<W: UiWriter> Agent<W> {
|
|||||||
let model_limit = self.context_window.total_tokens;
|
let model_limit = self.context_window.total_tokens;
|
||||||
let current_usage = self.context_window.used_tokens;
|
let current_usage = self.context_window.used_tokens;
|
||||||
|
|
||||||
|
// Get the configured max_tokens for this provider
|
||||||
|
let configured_max_tokens = self.resolve_max_tokens(provider_name);
|
||||||
|
|
||||||
// Calculate available tokens with buffer
|
// Calculate available tokens with buffer
|
||||||
let buffer = (model_limit / 40).clamp(1000, 10000); // 2.5% buffer
|
let buffer = (model_limit / 40).clamp(1000, 10000); // 2.5% buffer
|
||||||
let available = model_limit
|
let available = model_limit
|
||||||
.saturating_sub(current_usage)
|
.saturating_sub(current_usage)
|
||||||
.saturating_sub(buffer);
|
.saturating_sub(buffer);
|
||||||
let proposed_max_tokens = available.min(10_000);
|
// Use the smaller of available tokens or configured max_tokens
|
||||||
|
let proposed_max_tokens = available.min(configured_max_tokens);
|
||||||
|
|
||||||
// Validate against thinking budget constraint
|
// Validate against thinking budget constraint
|
||||||
let (adjusted, needs_reduction) = self.preflight_validate_max_tokens(provider_name, proposed_max_tokens);
|
let (adjusted, needs_reduction) = self.preflight_validate_max_tokens(provider_name, proposed_max_tokens);
|
||||||
@@ -2423,8 +2427,15 @@ impl<W: UiWriter> Agent<W> {
|
|||||||
let mut summary_max_tokens = self.apply_summary_fallback_sequence(&provider_name);
|
let mut summary_max_tokens = self.apply_summary_fallback_sequence(&provider_name);
|
||||||
|
|
||||||
// Apply provider-specific caps
|
// Apply provider-specific caps
|
||||||
|
// For Anthropic with thinking enabled, we need max_tokens > thinking.budget_tokens
|
||||||
|
// So we set a higher cap when thinking is configured
|
||||||
|
let anthropic_cap = match self.get_thinking_budget_tokens() {
|
||||||
|
Some(budget) => (budget + 2000).max(10_000), // At least budget + 2000 for response
|
||||||
|
None => 10_000,
|
||||||
|
};
|
||||||
summary_max_tokens = match provider_name.as_str() {
|
summary_max_tokens = match provider_name.as_str() {
|
||||||
"databricks" | "anthropic" => summary_max_tokens.min(10_000),
|
"anthropic" => summary_max_tokens.min(anthropic_cap),
|
||||||
|
"databricks" => summary_max_tokens.min(10_000),
|
||||||
"embedded" => summary_max_tokens.min(3000),
|
"embedded" => summary_max_tokens.min(3000),
|
||||||
_ => summary_max_tokens.min(5000),
|
_ => summary_max_tokens.min(5000),
|
||||||
};
|
};
|
||||||
@@ -3436,8 +3447,15 @@ impl<W: UiWriter> Agent<W> {
|
|||||||
let mut summary_max_tokens = self.apply_summary_fallback_sequence(&provider_name);
|
let mut summary_max_tokens = self.apply_summary_fallback_sequence(&provider_name);
|
||||||
|
|
||||||
// Apply provider-specific caps
|
// Apply provider-specific caps
|
||||||
|
// For Anthropic with thinking enabled, we need max_tokens > thinking.budget_tokens
|
||||||
|
// So we set a higher cap when thinking is configured
|
||||||
|
let anthropic_cap = match self.get_thinking_budget_tokens() {
|
||||||
|
Some(budget) => (budget + 2000).max(10_000), // At least budget + 2000 for response
|
||||||
|
None => 10_000,
|
||||||
|
};
|
||||||
summary_max_tokens = match provider_name.as_str() {
|
summary_max_tokens = match provider_name.as_str() {
|
||||||
"databricks" | "anthropic" => summary_max_tokens.min(10_000),
|
"anthropic" => summary_max_tokens.min(anthropic_cap),
|
||||||
|
"databricks" => summary_max_tokens.min(10_000),
|
||||||
"embedded" => summary_max_tokens.min(3000),
|
"embedded" => summary_max_tokens.min(3000),
|
||||||
_ => summary_max_tokens.min(5000),
|
_ => summary_max_tokens.min(5000),
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -284,9 +284,21 @@ impl AnthropicProvider {
|
|||||||
// Convert tools if provided
|
// Convert tools if provided
|
||||||
let anthropic_tools = tools.map(|t| self.convert_tools(t));
|
let anthropic_tools = tools.map(|t| self.convert_tools(t));
|
||||||
|
|
||||||
// Add thinking configuration if budget_tokens is set
|
// Add thinking configuration if budget_tokens is set AND max_tokens is sufficient
|
||||||
let thinking = self.thinking_budget_tokens.map(|budget| {
|
// Anthropic requires: max_tokens > thinking.budget_tokens
|
||||||
ThinkingConfig::enabled(budget)
|
// We add 1024 as minimum buffer for actual response content
|
||||||
|
let thinking = self.thinking_budget_tokens.and_then(|budget| {
|
||||||
|
let min_required = budget + 1024;
|
||||||
|
if max_tokens > min_required {
|
||||||
|
Some(ThinkingConfig::enabled(budget))
|
||||||
|
} else {
|
||||||
|
tracing::warn!(
|
||||||
|
"Disabling thinking mode: max_tokens ({}) is not greater than thinking.budget_tokens ({}) + 1024 buffer. \
|
||||||
|
Required: max_tokens > {}",
|
||||||
|
max_tokens, budget, min_required
|
||||||
|
);
|
||||||
|
None
|
||||||
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
let request = AnthropicRequest {
|
let request = AnthropicRequest {
|
||||||
@@ -847,6 +859,12 @@ enum AnthropicContent {
|
|||||||
#[serde(skip_serializing_if = "Option::is_none")]
|
#[serde(skip_serializing_if = "Option::is_none")]
|
||||||
cache_control: Option<crate::CacheControl>,
|
cache_control: Option<crate::CacheControl>,
|
||||||
},
|
},
|
||||||
|
#[serde(rename = "thinking")]
|
||||||
|
Thinking {
|
||||||
|
thinking: String,
|
||||||
|
#[serde(default)]
|
||||||
|
signature: Option<String>,
|
||||||
|
},
|
||||||
#[serde(rename = "tool_use")]
|
#[serde(rename = "tool_use")]
|
||||||
ToolUse {
|
ToolUse {
|
||||||
id: String,
|
id: String,
|
||||||
@@ -1058,11 +1076,12 @@ mod tests {
|
|||||||
let json_without = serde_json::to_string(&request_without).unwrap();
|
let json_without = serde_json::to_string(&request_without).unwrap();
|
||||||
assert!(!json_without.contains("thinking"), "JSON should not contain 'thinking' field when not configured");
|
assert!(!json_without.contains("thinking"), "JSON should not contain 'thinking' field when not configured");
|
||||||
|
|
||||||
// Test WITH thinking parameter
|
// Test WITH thinking parameter - max_tokens must be > budget_tokens + 1024
|
||||||
|
// Using budget=10000 requires max_tokens > 11024
|
||||||
let provider_with = AnthropicProvider::new(
|
let provider_with = AnthropicProvider::new(
|
||||||
"test-key".to_string(),
|
"test-key".to_string(),
|
||||||
Some("claude-sonnet-4-5".to_string()),
|
Some("claude-sonnet-4-5".to_string()),
|
||||||
Some(1000),
|
Some(20000), // Sufficient for thinking budget
|
||||||
Some(0.5),
|
Some(0.5),
|
||||||
None,
|
None,
|
||||||
None,
|
None,
|
||||||
@@ -1071,11 +1090,47 @@ mod tests {
|
|||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
let request_with = provider_with
|
let request_with = provider_with
|
||||||
.create_request_body(&messages, None, false, 1000, 0.5)
|
.create_request_body(&messages, None, false, 20000, 0.5)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
let json_with = serde_json::to_string(&request_with).unwrap();
|
let json_with = serde_json::to_string(&request_with).unwrap();
|
||||||
assert!(json_with.contains("thinking"), "JSON should contain 'thinking' field when configured");
|
assert!(json_with.contains("thinking"), "JSON should contain 'thinking' field when configured");
|
||||||
assert!(json_with.contains("\"type\":\"enabled\""), "JSON should contain type: enabled");
|
assert!(json_with.contains("\"type\":\"enabled\""), "JSON should contain type: enabled");
|
||||||
assert!(json_with.contains("\"budget_tokens\":10000"), "JSON should contain budget_tokens: 10000");
|
assert!(json_with.contains("\"budget_tokens\":10000"), "JSON should contain budget_tokens: 10000");
|
||||||
|
|
||||||
|
// Test WITH thinking parameter but INSUFFICIENT max_tokens - thinking should be disabled
|
||||||
|
let request_insufficient = provider_with
|
||||||
|
.create_request_body(&messages, None, false, 5000, 0.5) // Less than budget + 1024
|
||||||
|
.unwrap();
|
||||||
|
let json_insufficient = serde_json::to_string(&request_insufficient).unwrap();
|
||||||
|
assert!(!json_insufficient.contains("thinking"), "JSON should NOT contain 'thinking' field when max_tokens is insufficient");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_thinking_content_block_deserialization() {
|
||||||
|
// Test that we can deserialize a response containing a "thinking" content block
|
||||||
|
// This is what Anthropic returns when extended thinking is enabled
|
||||||
|
let json_response = r#"{
|
||||||
|
"content": [
|
||||||
|
{"type": "thinking", "thinking": "Let me analyze this...", "signature": "abc123"},
|
||||||
|
{"type": "text", "text": "Here is my response."}
|
||||||
|
],
|
||||||
|
"model": "claude-sonnet-4-5",
|
||||||
|
"usage": {"input_tokens": 100, "output_tokens": 50}
|
||||||
|
}"#;
|
||||||
|
|
||||||
|
let response: AnthropicResponse = serde_json::from_str(json_response)
|
||||||
|
.expect("Should be able to deserialize response with thinking block");
|
||||||
|
|
||||||
|
assert_eq!(response.content.len(), 2);
|
||||||
|
assert_eq!(response.model, "claude-sonnet-4-5");
|
||||||
|
|
||||||
|
// Extract only text content (thinking should be filtered out)
|
||||||
|
let text_content: Vec<_> = response.content.iter().filter_map(|c| match c {
|
||||||
|
AnthropicContent::Text { text, .. } => Some(text.as_str()),
|
||||||
|
_ => None,
|
||||||
|
}).collect();
|
||||||
|
|
||||||
|
assert_eq!(text_content.len(), 1);
|
||||||
|
assert_eq!(text_content[0], "Here is my response.");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user