validate max_tokens for call, also fallbacks for summary
When the CW is full, max_tokens is often passed at 0 or tiny. The LLM will fail. For Anthropic with thining, there is also the thinking budget. This can happen during summary attempts, in that case first try thinnify, skinnify etc..
This commit is contained in:
@@ -1418,6 +1418,204 @@ impl<W: UiWriter> Agent<W> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the thinking budget tokens for Anthropic provider, if configured
|
||||
fn get_thinking_budget_tokens(&self) -> Option<u32> {
|
||||
self.config
|
||||
.providers
|
||||
.anthropic
|
||||
.as_ref()
|
||||
.and_then(|c| c.thinking_budget_tokens)
|
||||
}
|
||||
|
||||
/// Pre-flight check to validate and adjust max_tokens for the thinking.budget_tokens constraint.
|
||||
/// Returns the adjusted max_tokens that satisfies: max_tokens > thinking.budget_tokens
|
||||
/// Also returns whether we need to apply fallback actions (thinnify/skinnify).
|
||||
///
|
||||
/// Returns: (adjusted_max_tokens, needs_context_reduction)
|
||||
fn preflight_validate_max_tokens(
|
||||
&self,
|
||||
provider_name: &str,
|
||||
proposed_max_tokens: u32,
|
||||
) -> (u32, bool) {
|
||||
// Only applies to Anthropic provider with thinking enabled
|
||||
if provider_name != "anthropic" {
|
||||
return (proposed_max_tokens, false);
|
||||
}
|
||||
|
||||
let budget_tokens = match self.get_thinking_budget_tokens() {
|
||||
Some(budget) => budget,
|
||||
None => return (proposed_max_tokens, false), // No thinking enabled
|
||||
};
|
||||
|
||||
// Anthropic requires: max_tokens > budget_tokens
|
||||
// We add a minimum output buffer of 1024 tokens for actual response content
|
||||
let minimum_required = budget_tokens + 1024;
|
||||
|
||||
if proposed_max_tokens >= minimum_required {
|
||||
// We have enough headroom
|
||||
(proposed_max_tokens, false)
|
||||
} else {
|
||||
// max_tokens is too low - need to either adjust or reduce context
|
||||
warn!(
|
||||
"max_tokens ({}) is below required minimum ({}) for thinking.budget_tokens ({}). Context reduction needed.",
|
||||
proposed_max_tokens, minimum_required, budget_tokens
|
||||
);
|
||||
// Return the minimum required, but flag that we need context reduction
|
||||
(minimum_required, true)
|
||||
}
|
||||
}
|
||||
|
||||
/// Calculate max_tokens for a summary request, ensuring it satisfies the thinking constraint.
|
||||
/// Applies fallback sequence: thinnify -> skinnify -> hard-coded minimum
|
||||
/// Returns (max_tokens, whether_fallback_was_used)
|
||||
fn calculate_summary_max_tokens(
|
||||
&mut self,
|
||||
provider_name: &str,
|
||||
) -> (u32, bool) {
|
||||
let model_limit = self.context_window.total_tokens;
|
||||
let current_usage = self.context_window.used_tokens;
|
||||
|
||||
// Calculate available tokens with buffer
|
||||
let buffer = (model_limit / 40).clamp(1000, 10000); // 2.5% buffer
|
||||
let available = model_limit
|
||||
.saturating_sub(current_usage)
|
||||
.saturating_sub(buffer);
|
||||
let proposed_max_tokens = available.min(10_000);
|
||||
|
||||
// Validate against thinking budget constraint
|
||||
let (adjusted, needs_reduction) = self.preflight_validate_max_tokens(provider_name, proposed_max_tokens);
|
||||
|
||||
if !needs_reduction {
|
||||
return (adjusted, false);
|
||||
}
|
||||
|
||||
// We need more headroom - the context is too full
|
||||
// Return the adjusted value but flag that fallbacks are needed
|
||||
(adjusted, true)
|
||||
}
|
||||
|
||||
/// Apply the fallback sequence to free up context space for thinking budget.
|
||||
/// Sequence: thinnify (first third) → skinnify (all) → hard-coded minimum
|
||||
/// Returns the validated max_tokens that satisfies thinking.budget_tokens constraint.
|
||||
fn apply_max_tokens_fallback_sequence(
|
||||
&mut self,
|
||||
provider_name: &str,
|
||||
initial_max_tokens: u32,
|
||||
hard_coded_minimum: u32,
|
||||
) -> u32 {
|
||||
let (mut max_tokens, needs_reduction) = self.preflight_validate_max_tokens(provider_name, initial_max_tokens);
|
||||
|
||||
if !needs_reduction {
|
||||
return max_tokens;
|
||||
}
|
||||
|
||||
self.ui_writer.print_context_status(
|
||||
"⚠️ Context window too full for thinking budget. Applying fallback sequence...\n",
|
||||
);
|
||||
|
||||
// Step 1: Try thinnify (first third of context)
|
||||
self.ui_writer.print_context_status("🥒 Step 1: Trying thinnify...\n");
|
||||
let (thin_msg, thin_saved) = self.context_window.thin_context();
|
||||
self.thinning_events.push(thin_saved);
|
||||
self.ui_writer.print_context_thinning(&thin_msg);
|
||||
|
||||
// Recalculate max_tokens after thinnify
|
||||
let recalc_max = self.resolve_max_tokens(provider_name);
|
||||
let (new_max, still_needs_reduction) = self.preflight_validate_max_tokens(provider_name, recalc_max);
|
||||
max_tokens = new_max;
|
||||
|
||||
if !still_needs_reduction {
|
||||
self.ui_writer.print_context_status(
|
||||
"✅ Thinnify resolved capacity issue. Continuing...\n",
|
||||
);
|
||||
return max_tokens;
|
||||
}
|
||||
|
||||
// Step 2: Try skinnify (entire context)
|
||||
self.ui_writer.print_context_status("🦴 Step 2: Trying skinnify...\n");
|
||||
let (skinny_msg, skinny_saved) = self.context_window.thin_context_all();
|
||||
self.thinning_events.push(skinny_saved);
|
||||
self.ui_writer.print_context_thinning(&skinny_msg);
|
||||
|
||||
// Recalculate max_tokens after skinnify
|
||||
let recalc_max = self.resolve_max_tokens(provider_name);
|
||||
let (final_max, final_needs_reduction) = self.preflight_validate_max_tokens(provider_name, recalc_max);
|
||||
max_tokens = final_max;
|
||||
|
||||
if !final_needs_reduction {
|
||||
self.ui_writer.print_context_status(
|
||||
"✅ Skinnify resolved capacity issue. Continuing...\n",
|
||||
);
|
||||
return max_tokens;
|
||||
}
|
||||
|
||||
// Step 3: Nothing worked, use hard-coded minimum as last resort
|
||||
self.ui_writer.print_context_status(&format!(
|
||||
"⚠️ Step 3: Context reduction insufficient. Using hard-coded max_tokens={} as last resort...\n",
|
||||
hard_coded_minimum
|
||||
));
|
||||
|
||||
hard_coded_minimum
|
||||
}
|
||||
|
||||
/// Apply the fallback sequence for summary requests to free up context space.
|
||||
/// Uses calculate_summary_max_tokens for recalculation (based on available space).
|
||||
/// Returns the validated max_tokens for summary requests.
|
||||
fn apply_summary_fallback_sequence(
|
||||
&mut self,
|
||||
provider_name: &str,
|
||||
) -> u32 {
|
||||
let (mut summary_max_tokens, needs_reduction) = self.calculate_summary_max_tokens(provider_name);
|
||||
|
||||
if !needs_reduction {
|
||||
return summary_max_tokens;
|
||||
}
|
||||
|
||||
self.ui_writer.print_context_status(
|
||||
"⚠️ Context window too full for thinking budget. Applying fallback sequence...\n",
|
||||
);
|
||||
|
||||
// Step 1: Try thinnify (first third of context)
|
||||
self.ui_writer.print_context_status("🥒 Step 1: Trying thinnify...\n");
|
||||
let (thin_msg, thin_saved) = self.context_window.thin_context();
|
||||
self.thinning_events.push(thin_saved);
|
||||
self.ui_writer.print_context_thinning(&thin_msg);
|
||||
|
||||
// Recalculate max_tokens after thinnify
|
||||
let (new_max, still_needs_reduction) = self.calculate_summary_max_tokens(provider_name);
|
||||
summary_max_tokens = new_max;
|
||||
|
||||
if !still_needs_reduction {
|
||||
self.ui_writer.print_context_status(
|
||||
"✅ Thinnify resolved capacity issue. Continuing...\n",
|
||||
);
|
||||
return summary_max_tokens;
|
||||
}
|
||||
|
||||
// Step 2: Try skinnify (entire context)
|
||||
self.ui_writer.print_context_status("🦴 Step 2: Trying skinnify...\n");
|
||||
let (skinny_msg, skinny_saved) = self.context_window.thin_context_all();
|
||||
self.thinning_events.push(skinny_saved);
|
||||
self.ui_writer.print_context_thinning(&skinny_msg);
|
||||
|
||||
// Recalculate max_tokens after skinnify
|
||||
let (final_max, final_needs_reduction) = self.calculate_summary_max_tokens(provider_name);
|
||||
summary_max_tokens = final_max;
|
||||
|
||||
if !final_needs_reduction {
|
||||
self.ui_writer.print_context_status(
|
||||
"✅ Skinnify resolved capacity issue. Continuing...\n",
|
||||
);
|
||||
return summary_max_tokens;
|
||||
}
|
||||
|
||||
// Step 3: Nothing worked, use hard-coded minimum
|
||||
self.ui_writer.print_context_status(
|
||||
"⚠️ Step 3: Context reduction insufficient. Using hard-coded max_tokens=5000 as last resort...\n",
|
||||
);
|
||||
5000
|
||||
}
|
||||
|
||||
/// Resolve the temperature to use for a given provider, applying fallbacks
|
||||
fn resolve_temperature(&self, provider_name: &str) -> f32 {
|
||||
match provider_name {
|
||||
@@ -1805,8 +2003,14 @@ impl<W: UiWriter> Agent<W> {
|
||||
};
|
||||
let _ = provider; // Drop the provider reference to avoid borrowing issues
|
||||
|
||||
// Get max_tokens from provider configuration, falling back to sensible defaults
|
||||
let max_tokens = Some(self.resolve_max_tokens(&provider_name));
|
||||
// Get max_tokens from provider configuration with preflight validation
|
||||
// This ensures max_tokens > thinking.budget_tokens for Anthropic with extended thinking
|
||||
let initial_max_tokens = self.resolve_max_tokens(&provider_name);
|
||||
let max_tokens = Some(self.apply_max_tokens_fallback_sequence(
|
||||
&provider_name,
|
||||
initial_max_tokens,
|
||||
16000, // Hard-coded minimum for main API calls (higher than summary's 5000)
|
||||
));
|
||||
|
||||
let request = CompletionRequest {
|
||||
messages,
|
||||
@@ -2211,6 +2415,25 @@ impl<W: UiWriter> Agent<W> {
|
||||
self.context_window.percentage_used() as u32
|
||||
));
|
||||
|
||||
let provider = self.providers.get(None)?;
|
||||
let provider_name = provider.name().to_string();
|
||||
let _ = provider; // Release borrow early
|
||||
|
||||
// Apply fallback sequence: thinnify -> skinnify -> hard-coded 5000
|
||||
let mut summary_max_tokens = self.apply_summary_fallback_sequence(&provider_name);
|
||||
|
||||
// Apply provider-specific caps
|
||||
summary_max_tokens = match provider_name.as_str() {
|
||||
"databricks" | "anthropic" => summary_max_tokens.min(10_000),
|
||||
"embedded" => summary_max_tokens.min(3000),
|
||||
_ => summary_max_tokens.min(5000),
|
||||
};
|
||||
|
||||
debug!(
|
||||
"Requesting summary with max_tokens: {} (current usage: {} tokens)",
|
||||
summary_max_tokens, self.context_window.used_tokens
|
||||
);
|
||||
|
||||
// Create summary request with FULL history
|
||||
let summary_prompt = self.context_window.create_summary_prompt();
|
||||
|
||||
@@ -2239,38 +2462,9 @@ impl<W: UiWriter> Agent<W> {
|
||||
|
||||
let provider = self.providers.get(None)?;
|
||||
|
||||
// Dynamically calculate max_tokens for summary based on what's left
|
||||
let summary_max_tokens = match provider.name() {
|
||||
"databricks" | "anthropic" => {
|
||||
let model_limit = self.context_window.total_tokens;
|
||||
let current_usage = self.context_window.used_tokens;
|
||||
let available = model_limit
|
||||
.saturating_sub(current_usage)
|
||||
.saturating_sub(5000);
|
||||
Some(available.min(10_000))
|
||||
}
|
||||
"embedded" => {
|
||||
let model_limit = self.context_window.total_tokens;
|
||||
let current_usage = self.context_window.used_tokens;
|
||||
let available = model_limit
|
||||
.saturating_sub(current_usage)
|
||||
.saturating_sub(1000);
|
||||
Some(available.min(3000))
|
||||
}
|
||||
_ => {
|
||||
let available = self.context_window.remaining_tokens().saturating_sub(2000);
|
||||
Some(available.min(5000))
|
||||
}
|
||||
};
|
||||
|
||||
debug!(
|
||||
"Requesting summary with max_tokens: {:?} (current usage: {} tokens)",
|
||||
summary_max_tokens, self.context_window.used_tokens
|
||||
);
|
||||
|
||||
let summary_request = CompletionRequest {
|
||||
messages: summary_messages,
|
||||
max_tokens: summary_max_tokens,
|
||||
max_tokens: Some(summary_max_tokens),
|
||||
temperature: Some(self.resolve_temperature(provider.name())),
|
||||
stream: false,
|
||||
tools: None,
|
||||
@@ -3234,6 +3428,25 @@ impl<W: UiWriter> Agent<W> {
|
||||
self.context_window.percentage_used() as u32
|
||||
));
|
||||
|
||||
let provider = self.providers.get(None)?;
|
||||
let provider_name = provider.name().to_string();
|
||||
let _ = provider; // Release borrow early
|
||||
|
||||
// Apply fallback sequence: thinnify -> skinnify -> hard-coded 5000
|
||||
let mut summary_max_tokens = self.apply_summary_fallback_sequence(&provider_name);
|
||||
|
||||
// Apply provider-specific caps
|
||||
summary_max_tokens = match provider_name.as_str() {
|
||||
"databricks" | "anthropic" => summary_max_tokens.min(10_000),
|
||||
"embedded" => summary_max_tokens.min(3000),
|
||||
_ => summary_max_tokens.min(5000),
|
||||
};
|
||||
|
||||
debug!(
|
||||
"Requesting summary with max_tokens: {} (current usage: {} tokens)",
|
||||
summary_max_tokens, self.context_window.used_tokens
|
||||
);
|
||||
|
||||
// Create summary request with FULL history
|
||||
let summary_prompt = self.context_window.create_summary_prompt();
|
||||
|
||||
@@ -3262,82 +3475,9 @@ impl<W: UiWriter> Agent<W> {
|
||||
|
||||
let provider = self.providers.get(None)?;
|
||||
|
||||
// Dynamically calculate max_tokens for summary based on what's left
|
||||
// We need to ensure: used_tokens + max_tokens <= total_context_limit
|
||||
let summary_max_tokens = match provider.name() {
|
||||
"databricks" | "anthropic" => {
|
||||
// Use the actual configured context window size
|
||||
let model_limit = self.context_window.total_tokens;
|
||||
let current_usage = self.context_window.used_tokens;
|
||||
|
||||
// Check if we have enough capacity for summarization
|
||||
if current_usage >= model_limit.saturating_sub(1000) {
|
||||
error!("Context window at capacity ({}%), cannot summarize. Current: {}, Limit: {}",
|
||||
self.context_window.percentage_used(), current_usage, model_limit);
|
||||
return Err(anyhow::anyhow!("Context window at capacity. Try using /thinnify or /compact commands to reduce context size, or start a new session."));
|
||||
}
|
||||
|
||||
// Leave buffer proportional to model size (min 1k, max 10k)
|
||||
let buffer = (model_limit / 40).clamp(1000, 10000); // 2.5% buffer
|
||||
let available = model_limit
|
||||
.saturating_sub(current_usage)
|
||||
.saturating_sub(buffer);
|
||||
// Cap at a reasonable summary size (10k tokens max)
|
||||
Some(available.min(10_000))
|
||||
}
|
||||
"embedded" => {
|
||||
// For smaller context models, be more conservative
|
||||
let model_limit = self.context_window.total_tokens;
|
||||
let current_usage = self.context_window.used_tokens;
|
||||
|
||||
// Check capacity for embedded models too
|
||||
if current_usage >= model_limit.saturating_sub(500) {
|
||||
error!(
|
||||
"Embedded model context window at capacity ({}%)",
|
||||
self.context_window.percentage_used()
|
||||
);
|
||||
return Err(anyhow::anyhow!("Context window at capacity. Try using /thinnify command to reduce context size, or start a new session."));
|
||||
}
|
||||
|
||||
// Leave 1k buffer
|
||||
let available = model_limit
|
||||
.saturating_sub(current_usage)
|
||||
.saturating_sub(1000);
|
||||
// Cap at 3k for embedded models
|
||||
Some(available.min(3000))
|
||||
}
|
||||
_ => {
|
||||
// Default: conservative approach
|
||||
let model_limit = self.context_window.total_tokens;
|
||||
let current_usage = self.context_window.used_tokens;
|
||||
|
||||
if current_usage >= model_limit.saturating_sub(1000) {
|
||||
error!(
|
||||
"Context window at capacity ({}%)",
|
||||
self.context_window.percentage_used()
|
||||
);
|
||||
return Err(anyhow::anyhow!("Context window at capacity. Try using /thinnify or /compact commands, or start a new session."));
|
||||
}
|
||||
|
||||
let available = self.context_window.remaining_tokens().saturating_sub(2000);
|
||||
Some(available.min(5000))
|
||||
}
|
||||
};
|
||||
|
||||
debug!(
|
||||
"Requesting summary with max_tokens: {:?} (current usage: {} tokens)",
|
||||
summary_max_tokens, self.context_window.used_tokens
|
||||
);
|
||||
|
||||
// Final safety check
|
||||
if summary_max_tokens.unwrap_or(0) == 0 {
|
||||
error!("No tokens available for summarization");
|
||||
return Err(anyhow::anyhow!("No context window capacity left for summarization. Use /thinnify to reduce context size or start a new session."));
|
||||
}
|
||||
|
||||
let summary_request = CompletionRequest {
|
||||
messages: summary_messages,
|
||||
max_tokens: summary_max_tokens,
|
||||
max_tokens: Some(summary_max_tokens),
|
||||
temperature: Some(self.resolve_temperature(provider.name())),
|
||||
stream: false,
|
||||
tools: None,
|
||||
|
||||
188
crates/g3-core/tests/test_preflight_max_tokens.rs
Normal file
188
crates/g3-core/tests/test_preflight_max_tokens.rs
Normal file
@@ -0,0 +1,188 @@
|
||||
//! Tests for the pre-flight max_tokens validation with thinking.budget_tokens constraint
|
||||
//!
|
||||
//! These tests verify that when using Anthropic with extended thinking enabled,
|
||||
//! the max_tokens calculation properly accounts for the budget_tokens constraint.
|
||||
|
||||
use g3_config::Config;
|
||||
use g3_core::ContextWindow;
|
||||
|
||||
/// Helper function to create a minimal config for testing
|
||||
fn create_test_config_with_thinking(thinking_budget: Option<u32>) -> Config {
|
||||
let mut config = Config::default();
|
||||
|
||||
// Set up Anthropic provider with optional thinking budget
|
||||
config.providers.anthropic = Some(g3_config::AnthropicConfig {
|
||||
api_key: "test-key".to_string(),
|
||||
model: "claude-sonnet-4-5".to_string(),
|
||||
max_tokens: Some(16000),
|
||||
temperature: Some(0.1),
|
||||
cache_config: None,
|
||||
enable_1m_context: None,
|
||||
thinking_budget_tokens: thinking_budget,
|
||||
});
|
||||
|
||||
config.providers.default_provider = "anthropic".to_string();
|
||||
config
|
||||
}
|
||||
|
||||
/// Test that when thinking is disabled, max_tokens passes through unchanged
|
||||
#[test]
|
||||
fn test_no_thinking_budget_passes_through() {
|
||||
let config = create_test_config_with_thinking(None);
|
||||
|
||||
// Without thinking budget, any max_tokens should be fine
|
||||
let proposed_max = 5000;
|
||||
|
||||
// The constraint check would return (proposed_max, false)
|
||||
// since there's no thinking_budget_tokens configured
|
||||
assert!(config.providers.anthropic.as_ref().unwrap().thinking_budget_tokens.is_none());
|
||||
}
|
||||
|
||||
/// Test that when max_tokens > budget_tokens + buffer, no reduction is needed
|
||||
#[test]
|
||||
fn test_sufficient_max_tokens_no_reduction_needed() {
|
||||
let config = create_test_config_with_thinking(Some(10000));
|
||||
let budget_tokens = config.providers.anthropic.as_ref().unwrap().thinking_budget_tokens.unwrap();
|
||||
|
||||
// minimum_required = budget_tokens + 1024 = 11024
|
||||
let minimum_required = budget_tokens + 1024;
|
||||
|
||||
// If proposed_max >= minimum_required, no reduction is needed
|
||||
let proposed_max = 15000;
|
||||
assert!(proposed_max >= minimum_required);
|
||||
}
|
||||
|
||||
/// Test that when max_tokens < budget_tokens + buffer, reduction is needed
|
||||
#[test]
|
||||
fn test_insufficient_max_tokens_needs_reduction() {
|
||||
let config = create_test_config_with_thinking(Some(10000));
|
||||
let budget_tokens = config.providers.anthropic.as_ref().unwrap().thinking_budget_tokens.unwrap();
|
||||
|
||||
// minimum_required = budget_tokens + 1024 = 11024
|
||||
let minimum_required = budget_tokens + 1024;
|
||||
|
||||
// If proposed_max < minimum_required, reduction IS needed
|
||||
let proposed_max = 5000;
|
||||
assert!(proposed_max < minimum_required);
|
||||
}
|
||||
|
||||
/// Test the minimum required calculation
|
||||
#[test]
|
||||
fn test_minimum_required_calculation() {
|
||||
// For a budget of 10000, we need at least 11024 tokens
|
||||
let budget_tokens = 10000u32;
|
||||
let output_buffer = 1024u32;
|
||||
let minimum_required = budget_tokens + output_buffer;
|
||||
|
||||
assert_eq!(minimum_required, 11024);
|
||||
|
||||
// For a larger budget
|
||||
let budget_tokens = 32000u32;
|
||||
let minimum_required = budget_tokens + output_buffer;
|
||||
assert_eq!(minimum_required, 33024);
|
||||
}
|
||||
|
||||
/// Test context window usage calculation for summary max_tokens
|
||||
#[test]
|
||||
fn test_context_window_available_tokens() {
|
||||
let mut context = ContextWindow::new(200000); // 200k context window
|
||||
|
||||
// Simulate heavy usage
|
||||
context.used_tokens = 180000; // 90% used
|
||||
|
||||
let model_limit = context.total_tokens;
|
||||
let current_usage = context.used_tokens;
|
||||
|
||||
// 2.5% buffer calculation
|
||||
let buffer = (model_limit / 40).clamp(1000, 10000);
|
||||
assert_eq!(buffer, 5000); // 200000/40 = 5000
|
||||
|
||||
let available = model_limit
|
||||
.saturating_sub(current_usage)
|
||||
.saturating_sub(buffer);
|
||||
|
||||
// 200000 - 180000 - 5000 = 15000
|
||||
assert_eq!(available, 15000);
|
||||
|
||||
// Capped at 10000 for summary
|
||||
let summary_max = available.min(10_000);
|
||||
assert_eq!(summary_max, 10000);
|
||||
}
|
||||
|
||||
/// Test that when context is nearly full, available tokens may be below thinking budget
|
||||
#[test]
|
||||
fn test_context_nearly_full_triggers_reduction() {
|
||||
let mut context = ContextWindow::new(200000);
|
||||
|
||||
// Very heavy usage - 98% used
|
||||
context.used_tokens = 196000;
|
||||
|
||||
let model_limit = context.total_tokens;
|
||||
let current_usage = context.used_tokens;
|
||||
let buffer = (model_limit / 40).clamp(1000, 10000); // 5000
|
||||
|
||||
let available = model_limit
|
||||
.saturating_sub(current_usage)
|
||||
.saturating_sub(buffer);
|
||||
|
||||
// 200000 - 196000 - 5000 = -1000 -> saturates to 0
|
||||
assert_eq!(available, 0);
|
||||
|
||||
// With thinking_budget of 10000, this would definitely need reduction
|
||||
let thinking_budget = 10000u32;
|
||||
let minimum_required = thinking_budget + 1024;
|
||||
assert!(available < minimum_required);
|
||||
}
|
||||
|
||||
/// Test the hard-coded fallback value
|
||||
#[test]
|
||||
fn test_hardcoded_fallback_value() {
|
||||
// When all else fails, we use 5000 as the hard-coded max_tokens
|
||||
let hardcoded_fallback = 5000u32;
|
||||
|
||||
// This should be a reasonable value that Anthropic will accept
|
||||
// even with thinking enabled (though output will be limited)
|
||||
assert!(hardcoded_fallback > 0);
|
||||
|
||||
// Note: With a 10000 thinking budget, 5000 is still below the
|
||||
// minimum required (11024), but we send it anyway as a "last resort"
|
||||
// hoping the API might still work for basic operations
|
||||
}
|
||||
|
||||
/// Test provider-specific caps
|
||||
#[test]
|
||||
fn test_provider_specific_caps() {
|
||||
// Anthropic/Databricks: cap at 10000
|
||||
let anthropic_cap = 10000u32;
|
||||
let proposed = 15000u32;
|
||||
assert_eq!(proposed.min(anthropic_cap), 10000);
|
||||
|
||||
// Embedded: cap at 3000
|
||||
let embedded_cap = 3000u32;
|
||||
let proposed = 5000u32;
|
||||
assert_eq!(proposed.min(embedded_cap), 3000);
|
||||
|
||||
// Default: cap at 5000
|
||||
let default_cap = 5000u32;
|
||||
let proposed = 8000u32;
|
||||
assert_eq!(proposed.min(default_cap), 5000);
|
||||
}
|
||||
|
||||
/// Test that the error message mentions the thinking budget constraint
|
||||
#[test]
|
||||
fn test_error_message_content() {
|
||||
// Verify the warning message format contains useful information
|
||||
let proposed_max_tokens = 5000u32;
|
||||
let budget_tokens = 10000u32;
|
||||
let minimum_required = budget_tokens + 1024;
|
||||
|
||||
let warning = format!(
|
||||
"max_tokens ({}) is below required minimum ({}) for thinking.budget_tokens ({}). Context reduction needed.",
|
||||
proposed_max_tokens, minimum_required, budget_tokens
|
||||
);
|
||||
|
||||
assert!(warning.contains("5000"));
|
||||
assert!(warning.contains("11024"));
|
||||
assert!(warning.contains("10000"));
|
||||
assert!(warning.contains("Context reduction needed"));
|
||||
}
|
||||
Reference in New Issue
Block a user