validate max_tokens for call, also fallbacks for summary

When the CW is full, max_tokens is often passed at 0 or tiny. The LLM will fail. For Anthropic with thining, there is also the thinking budget.
This can happen during summary attempts, in that case
first try thinnify, skinnify etc..
This commit is contained in:
Jochen
2025-12-09 10:15:32 +11:00
parent 48e6d594bc
commit 696c441a47
2 changed files with 434 additions and 106 deletions

View File

@@ -0,0 +1,188 @@
//! Tests for the pre-flight max_tokens validation with thinking.budget_tokens constraint
//!
//! These tests verify that when using Anthropic with extended thinking enabled,
//! the max_tokens calculation properly accounts for the budget_tokens constraint.
use g3_config::Config;
use g3_core::ContextWindow;
/// Helper function to create a minimal config for testing
fn create_test_config_with_thinking(thinking_budget: Option<u32>) -> Config {
let mut config = Config::default();
// Set up Anthropic provider with optional thinking budget
config.providers.anthropic = Some(g3_config::AnthropicConfig {
api_key: "test-key".to_string(),
model: "claude-sonnet-4-5".to_string(),
max_tokens: Some(16000),
temperature: Some(0.1),
cache_config: None,
enable_1m_context: None,
thinking_budget_tokens: thinking_budget,
});
config.providers.default_provider = "anthropic".to_string();
config
}
/// Test that when thinking is disabled, max_tokens passes through unchanged
#[test]
fn test_no_thinking_budget_passes_through() {
let config = create_test_config_with_thinking(None);
// Without thinking budget, any max_tokens should be fine
let proposed_max = 5000;
// The constraint check would return (proposed_max, false)
// since there's no thinking_budget_tokens configured
assert!(config.providers.anthropic.as_ref().unwrap().thinking_budget_tokens.is_none());
}
/// Test that when max_tokens > budget_tokens + buffer, no reduction is needed
#[test]
fn test_sufficient_max_tokens_no_reduction_needed() {
let config = create_test_config_with_thinking(Some(10000));
let budget_tokens = config.providers.anthropic.as_ref().unwrap().thinking_budget_tokens.unwrap();
// minimum_required = budget_tokens + 1024 = 11024
let minimum_required = budget_tokens + 1024;
// If proposed_max >= minimum_required, no reduction is needed
let proposed_max = 15000;
assert!(proposed_max >= minimum_required);
}
/// Test that when max_tokens < budget_tokens + buffer, reduction is needed
#[test]
fn test_insufficient_max_tokens_needs_reduction() {
let config = create_test_config_with_thinking(Some(10000));
let budget_tokens = config.providers.anthropic.as_ref().unwrap().thinking_budget_tokens.unwrap();
// minimum_required = budget_tokens + 1024 = 11024
let minimum_required = budget_tokens + 1024;
// If proposed_max < minimum_required, reduction IS needed
let proposed_max = 5000;
assert!(proposed_max < minimum_required);
}
/// Test the minimum required calculation
#[test]
fn test_minimum_required_calculation() {
// For a budget of 10000, we need at least 11024 tokens
let budget_tokens = 10000u32;
let output_buffer = 1024u32;
let minimum_required = budget_tokens + output_buffer;
assert_eq!(minimum_required, 11024);
// For a larger budget
let budget_tokens = 32000u32;
let minimum_required = budget_tokens + output_buffer;
assert_eq!(minimum_required, 33024);
}
/// Test context window usage calculation for summary max_tokens
#[test]
fn test_context_window_available_tokens() {
let mut context = ContextWindow::new(200000); // 200k context window
// Simulate heavy usage
context.used_tokens = 180000; // 90% used
let model_limit = context.total_tokens;
let current_usage = context.used_tokens;
// 2.5% buffer calculation
let buffer = (model_limit / 40).clamp(1000, 10000);
assert_eq!(buffer, 5000); // 200000/40 = 5000
let available = model_limit
.saturating_sub(current_usage)
.saturating_sub(buffer);
// 200000 - 180000 - 5000 = 15000
assert_eq!(available, 15000);
// Capped at 10000 for summary
let summary_max = available.min(10_000);
assert_eq!(summary_max, 10000);
}
/// Test that when context is nearly full, available tokens may be below thinking budget
#[test]
fn test_context_nearly_full_triggers_reduction() {
let mut context = ContextWindow::new(200000);
// Very heavy usage - 98% used
context.used_tokens = 196000;
let model_limit = context.total_tokens;
let current_usage = context.used_tokens;
let buffer = (model_limit / 40).clamp(1000, 10000); // 5000
let available = model_limit
.saturating_sub(current_usage)
.saturating_sub(buffer);
// 200000 - 196000 - 5000 = -1000 -> saturates to 0
assert_eq!(available, 0);
// With thinking_budget of 10000, this would definitely need reduction
let thinking_budget = 10000u32;
let minimum_required = thinking_budget + 1024;
assert!(available < minimum_required);
}
/// Test the hard-coded fallback value
#[test]
fn test_hardcoded_fallback_value() {
// When all else fails, we use 5000 as the hard-coded max_tokens
let hardcoded_fallback = 5000u32;
// This should be a reasonable value that Anthropic will accept
// even with thinking enabled (though output will be limited)
assert!(hardcoded_fallback > 0);
// Note: With a 10000 thinking budget, 5000 is still below the
// minimum required (11024), but we send it anyway as a "last resort"
// hoping the API might still work for basic operations
}
/// Test provider-specific caps
#[test]
fn test_provider_specific_caps() {
// Anthropic/Databricks: cap at 10000
let anthropic_cap = 10000u32;
let proposed = 15000u32;
assert_eq!(proposed.min(anthropic_cap), 10000);
// Embedded: cap at 3000
let embedded_cap = 3000u32;
let proposed = 5000u32;
assert_eq!(proposed.min(embedded_cap), 3000);
// Default: cap at 5000
let default_cap = 5000u32;
let proposed = 8000u32;
assert_eq!(proposed.min(default_cap), 5000);
}
/// Test that the error message mentions the thinking budget constraint
#[test]
fn test_error_message_content() {
// Verify the warning message format contains useful information
let proposed_max_tokens = 5000u32;
let budget_tokens = 10000u32;
let minimum_required = budget_tokens + 1024;
let warning = format!(
"max_tokens ({}) is below required minimum ({}) for thinking.budget_tokens ({}). Context reduction needed.",
proposed_max_tokens, minimum_required, budget_tokens
);
assert!(warning.contains("5000"));
assert!(warning.contains("11024"));
assert!(warning.contains("10000"));
assert!(warning.contains("Context reduction needed"));
}