disable thinking if there is no token budget
This commit is contained in:
@@ -1494,8 +1494,18 @@ impl<W: UiWriter> Agent<W> {
|
|||||||
let available = model_limit
|
let available = model_limit
|
||||||
.saturating_sub(current_usage)
|
.saturating_sub(current_usage)
|
||||||
.saturating_sub(buffer);
|
.saturating_sub(buffer);
|
||||||
// Use the smaller of available tokens or configured max_tokens
|
// Use the smaller of available tokens or configured max_tokens,
|
||||||
|
// but ensure we don't go below thinking budget floor for Anthropic
|
||||||
let proposed_max_tokens = available.min(configured_max_tokens);
|
let proposed_max_tokens = available.min(configured_max_tokens);
|
||||||
|
let proposed_max_tokens = if provider_name == "anthropic" {
|
||||||
|
if let Some(budget) = self.get_thinking_budget_tokens() {
|
||||||
|
proposed_max_tokens.max(budget + 1024)
|
||||||
|
} else {
|
||||||
|
proposed_max_tokens
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
proposed_max_tokens
|
||||||
|
};
|
||||||
|
|
||||||
// Validate against thinking budget constraint
|
// Validate against thinking budget constraint
|
||||||
let (adjusted, needs_reduction) = self.preflight_validate_max_tokens(provider_name, proposed_max_tokens);
|
let (adjusted, needs_reduction) = self.preflight_validate_max_tokens(provider_name, proposed_max_tokens);
|
||||||
@@ -2033,6 +2043,7 @@ impl<W: UiWriter> Agent<W> {
|
|||||||
temperature: Some(self.resolve_temperature(&provider_name)),
|
temperature: Some(self.resolve_temperature(&provider_name)),
|
||||||
stream: true, // Enable streaming
|
stream: true, // Enable streaming
|
||||||
tools,
|
tools,
|
||||||
|
disable_thinking: false,
|
||||||
};
|
};
|
||||||
|
|
||||||
// Time the LLM call with cancellation support and streaming
|
// Time the LLM call with cancellation support and streaming
|
||||||
@@ -2484,12 +2495,26 @@ impl<W: UiWriter> Agent<W> {
|
|||||||
|
|
||||||
let provider = self.providers.get(None)?;
|
let provider = self.providers.get(None)?;
|
||||||
|
|
||||||
|
// Determine if we need to disable thinking mode for this request
|
||||||
|
// Anthropic requires max_tokens > thinking.budget_tokens; we additionally reserve 1024 tokens as a minimum buffer for the response itself
|
||||||
|
let disable_thinking = self.get_thinking_budget_tokens().map_or(false, |budget| {
|
||||||
|
let minimum_for_thinking = budget + 1024;
|
||||||
|
let should_disable = summary_max_tokens <= minimum_for_thinking;
|
||||||
|
if should_disable {
|
||||||
|
tracing::warn!("Disabling thinking mode for summary: max_tokens ({}) <= minimum_for_thinking ({})", summary_max_tokens, minimum_for_thinking);
|
||||||
|
}
|
||||||
|
should_disable
|
||||||
|
});
|
||||||
|
|
||||||
|
tracing::debug!("Creating summary request: max_tokens={}, disable_thinking={}", summary_max_tokens, disable_thinking);
|
||||||
|
|
||||||
let summary_request = CompletionRequest {
|
let summary_request = CompletionRequest {
|
||||||
messages: summary_messages,
|
messages: summary_messages,
|
||||||
max_tokens: Some(summary_max_tokens),
|
max_tokens: Some(summary_max_tokens),
|
||||||
temperature: Some(self.resolve_temperature(provider.name())),
|
temperature: Some(self.resolve_temperature(provider.name())),
|
||||||
stream: false,
|
stream: false,
|
||||||
tools: None,
|
tools: None,
|
||||||
|
disable_thinking,
|
||||||
};
|
};
|
||||||
|
|
||||||
// Get the summary
|
// Get the summary
|
||||||
@@ -3504,12 +3529,26 @@ impl<W: UiWriter> Agent<W> {
|
|||||||
|
|
||||||
let provider = self.providers.get(None)?;
|
let provider = self.providers.get(None)?;
|
||||||
|
|
||||||
|
// Determine if we need to disable thinking mode for this request
|
||||||
|
// Anthropic requires max_tokens > thinking.budget_tokens; we additionally reserve 1024 tokens as a minimum buffer for the response itself
|
||||||
|
let disable_thinking = self.get_thinking_budget_tokens().map_or(false, |budget| {
|
||||||
|
let minimum_for_thinking = budget + 1024;
|
||||||
|
let should_disable = summary_max_tokens <= minimum_for_thinking;
|
||||||
|
if should_disable {
|
||||||
|
tracing::warn!("Disabling thinking mode for summary: max_tokens ({}) <= minimum_for_thinking ({})", summary_max_tokens, minimum_for_thinking);
|
||||||
|
}
|
||||||
|
should_disable
|
||||||
|
});
|
||||||
|
|
||||||
|
tracing::debug!("Creating auto-summary request: max_tokens={}, disable_thinking={}", summary_max_tokens, disable_thinking);
|
||||||
|
|
||||||
let summary_request = CompletionRequest {
|
let summary_request = CompletionRequest {
|
||||||
messages: summary_messages,
|
messages: summary_messages,
|
||||||
max_tokens: Some(summary_max_tokens),
|
max_tokens: Some(summary_max_tokens),
|
||||||
temperature: Some(self.resolve_temperature(provider.name())),
|
temperature: Some(self.resolve_temperature(provider.name())),
|
||||||
stream: false,
|
stream: false,
|
||||||
tools: None,
|
tools: None,
|
||||||
|
disable_thinking,
|
||||||
};
|
};
|
||||||
|
|
||||||
// Get the summary
|
// Get the summary
|
||||||
|
|||||||
@@ -85,6 +85,7 @@ pub async fn get_initial_discovery_messages(
|
|||||||
temperature: Some(provider.temperature()),
|
temperature: Some(provider.temperature()),
|
||||||
stream: false,
|
stream: false,
|
||||||
tools: None,
|
tools: None,
|
||||||
|
disable_thinking: false,
|
||||||
};
|
};
|
||||||
|
|
||||||
status("🤖 Calling LLM for discovery commands...");
|
status("🤖 Calling LLM for discovery commands...");
|
||||||
|
|||||||
@@ -39,6 +39,7 @@
|
|||||||
//! temperature: Some(0.7),
|
//! temperature: Some(0.7),
|
||||||
//! stream: false,
|
//! stream: false,
|
||||||
//! tools: None,
|
//! tools: None,
|
||||||
|
//! disable_thinking: false,
|
||||||
//! };
|
//! };
|
||||||
//!
|
//!
|
||||||
//! // Get a completion
|
//! // Get a completion
|
||||||
@@ -75,6 +76,7 @@
|
|||||||
//! temperature: Some(0.7),
|
//! temperature: Some(0.7),
|
||||||
//! stream: true,
|
//! stream: true,
|
||||||
//! tools: None,
|
//! tools: None,
|
||||||
|
//! disable_thinking: false,
|
||||||
//! };
|
//! };
|
||||||
//!
|
//!
|
||||||
//! let mut stream = provider.stream(request).await?;
|
//! let mut stream = provider.stream(request).await?;
|
||||||
@@ -272,6 +274,7 @@ impl AnthropicProvider {
|
|||||||
streaming: bool,
|
streaming: bool,
|
||||||
max_tokens: u32,
|
max_tokens: u32,
|
||||||
temperature: f32,
|
temperature: f32,
|
||||||
|
disable_thinking: bool,
|
||||||
) -> Result<AnthropicRequest> {
|
) -> Result<AnthropicRequest> {
|
||||||
let (system, anthropic_messages) = self.convert_messages(messages)?;
|
let (system, anthropic_messages) = self.convert_messages(messages)?;
|
||||||
|
|
||||||
@@ -284,10 +287,19 @@ impl AnthropicProvider {
|
|||||||
// Convert tools if provided
|
// Convert tools if provided
|
||||||
let anthropic_tools = tools.map(|t| self.convert_tools(t));
|
let anthropic_tools = tools.map(|t| self.convert_tools(t));
|
||||||
|
|
||||||
// Add thinking configuration if budget_tokens is set AND max_tokens is sufficient
|
// Add thinking configuration if budget_tokens is set AND max_tokens is sufficient AND not explicitly disabled
|
||||||
// Anthropic requires: max_tokens > thinking.budget_tokens
|
// Anthropic requires: max_tokens > thinking.budget_tokens
|
||||||
// We add 1024 as minimum buffer for actual response content
|
// We add 1024 as minimum buffer for actual response content
|
||||||
let thinking = self.thinking_budget_tokens.and_then(|budget| {
|
tracing::debug!("create_request_body called: max_tokens={}, disable_thinking={}, thinking_budget_tokens={:?}", max_tokens, disable_thinking, self.thinking_budget_tokens);
|
||||||
|
|
||||||
|
let thinking = if disable_thinking {
|
||||||
|
tracing::info!(
|
||||||
|
"Thinking mode explicitly disabled for this request (max_tokens={})",
|
||||||
|
max_tokens
|
||||||
|
);
|
||||||
|
None
|
||||||
|
} else {
|
||||||
|
self.thinking_budget_tokens.and_then(|budget| {
|
||||||
let min_required = budget + 1024;
|
let min_required = budget + 1024;
|
||||||
if max_tokens > min_required {
|
if max_tokens > min_required {
|
||||||
Some(ThinkingConfig::enabled(budget))
|
Some(ThinkingConfig::enabled(budget))
|
||||||
@@ -299,7 +311,8 @@ impl AnthropicProvider {
|
|||||||
);
|
);
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
});
|
})
|
||||||
|
};
|
||||||
|
|
||||||
let request = AnthropicRequest {
|
let request = AnthropicRequest {
|
||||||
model: self.model.clone(),
|
model: self.model.clone(),
|
||||||
@@ -649,6 +662,7 @@ impl LLMProvider for AnthropicProvider {
|
|||||||
false,
|
false,
|
||||||
max_tokens,
|
max_tokens,
|
||||||
temperature,
|
temperature,
|
||||||
|
request.disable_thinking,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
debug!(
|
debug!(
|
||||||
@@ -722,6 +736,7 @@ impl LLMProvider for AnthropicProvider {
|
|||||||
true,
|
true,
|
||||||
max_tokens,
|
max_tokens,
|
||||||
temperature,
|
temperature,
|
||||||
|
request.disable_thinking,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
debug!(
|
debug!(
|
||||||
@@ -965,7 +980,7 @@ mod tests {
|
|||||||
let messages = vec![Message::new(MessageRole::User, "Test message".to_string())];
|
let messages = vec![Message::new(MessageRole::User, "Test message".to_string())];
|
||||||
|
|
||||||
let request_body = provider
|
let request_body = provider
|
||||||
.create_request_body(&messages, None, false, 1000, 0.5)
|
.create_request_body(&messages, None, false, 1000, 0.5, false)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
assert_eq!(request_body.model, "claude-3-haiku-20240307");
|
assert_eq!(request_body.model, "claude-3-haiku-20240307");
|
||||||
@@ -1071,7 +1086,7 @@ mod tests {
|
|||||||
|
|
||||||
let messages = vec![Message::new(MessageRole::User, "Test message".to_string())];
|
let messages = vec![Message::new(MessageRole::User, "Test message".to_string())];
|
||||||
let request_without = provider_without
|
let request_without = provider_without
|
||||||
.create_request_body(&messages, None, false, 1000, 0.5)
|
.create_request_body(&messages, None, false, 1000, 0.5, false)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
let json_without = serde_json::to_string(&request_without).unwrap();
|
let json_without = serde_json::to_string(&request_without).unwrap();
|
||||||
assert!(!json_without.contains("thinking"), "JSON should not contain 'thinking' field when not configured");
|
assert!(!json_without.contains("thinking"), "JSON should not contain 'thinking' field when not configured");
|
||||||
@@ -1090,7 +1105,7 @@ mod tests {
|
|||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
let request_with = provider_with
|
let request_with = provider_with
|
||||||
.create_request_body(&messages, None, false, 20000, 0.5)
|
.create_request_body(&messages, None, false, 20000, 0.5, false)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
let json_with = serde_json::to_string(&request_with).unwrap();
|
let json_with = serde_json::to_string(&request_with).unwrap();
|
||||||
assert!(json_with.contains("thinking"), "JSON should contain 'thinking' field when configured");
|
assert!(json_with.contains("thinking"), "JSON should contain 'thinking' field when configured");
|
||||||
@@ -1099,12 +1114,43 @@ mod tests {
|
|||||||
|
|
||||||
// Test WITH thinking parameter but INSUFFICIENT max_tokens - thinking should be disabled
|
// Test WITH thinking parameter but INSUFFICIENT max_tokens - thinking should be disabled
|
||||||
let request_insufficient = provider_with
|
let request_insufficient = provider_with
|
||||||
.create_request_body(&messages, None, false, 5000, 0.5) // Less than budget + 1024
|
.create_request_body(&messages, None, false, 5000, 0.5, false) // Less than budget + 1024
|
||||||
.unwrap();
|
.unwrap();
|
||||||
let json_insufficient = serde_json::to_string(&request_insufficient).unwrap();
|
let json_insufficient = serde_json::to_string(&request_insufficient).unwrap();
|
||||||
assert!(!json_insufficient.contains("thinking"), "JSON should NOT contain 'thinking' field when max_tokens is insufficient");
|
assert!(!json_insufficient.contains("thinking"), "JSON should NOT contain 'thinking' field when max_tokens is insufficient");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_disable_thinking_flag() {
|
||||||
|
// Test that disable_thinking=true prevents thinking even with sufficient max_tokens
|
||||||
|
let provider = AnthropicProvider::new(
|
||||||
|
"test-key".to_string(),
|
||||||
|
Some("claude-sonnet-4-5".to_string()),
|
||||||
|
Some(20000),
|
||||||
|
Some(0.5),
|
||||||
|
None,
|
||||||
|
None,
|
||||||
|
Some(10000), // With thinking budget
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let messages = vec![Message::new(MessageRole::User, "Test message".to_string())];
|
||||||
|
|
||||||
|
// With disable_thinking=false, thinking should be enabled (max_tokens is sufficient)
|
||||||
|
let request_with_thinking = provider
|
||||||
|
.create_request_body(&messages, None, false, 20000, 0.5, false)
|
||||||
|
.unwrap();
|
||||||
|
let json_with = serde_json::to_string(&request_with_thinking).unwrap();
|
||||||
|
assert!(json_with.contains("thinking"), "JSON should contain 'thinking' field when not disabled");
|
||||||
|
|
||||||
|
// With disable_thinking=true, thinking should be disabled even with sufficient max_tokens
|
||||||
|
let request_without_thinking = provider
|
||||||
|
.create_request_body(&messages, None, false, 20000, 0.5, true)
|
||||||
|
.unwrap();
|
||||||
|
let json_without = serde_json::to_string(&request_without_thinking).unwrap();
|
||||||
|
assert!(!json_without.contains("thinking"), "JSON should NOT contain 'thinking' field when explicitly disabled");
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_thinking_content_block_deserialization() {
|
fn test_thinking_content_block_deserialization() {
|
||||||
// Test that we can deserialize a response containing a "thinking" content block
|
// Test that we can deserialize a response containing a "thinking" content block
|
||||||
|
|||||||
@@ -45,6 +45,7 @@
|
|||||||
//! temperature: Some(0.7),
|
//! temperature: Some(0.7),
|
||||||
//! stream: false,
|
//! stream: false,
|
||||||
//! tools: None,
|
//! tools: None,
|
||||||
|
//! disable_thinking: false,
|
||||||
//! };
|
//! };
|
||||||
//!
|
//!
|
||||||
//! // Get a completion
|
//! // Get a completion
|
||||||
|
|||||||
@@ -42,6 +42,8 @@ pub struct CompletionRequest {
|
|||||||
pub temperature: Option<f32>,
|
pub temperature: Option<f32>,
|
||||||
pub stream: bool,
|
pub stream: bool,
|
||||||
pub tools: Option<Vec<Tool>>,
|
pub tools: Option<Vec<Tool>>,
|
||||||
|
/// Force disable thinking mode for this request (used when max_tokens is too low)
|
||||||
|
pub disable_thinking: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
|||||||
Reference in New Issue
Block a user