From af20c93c61eed3135104ceaa10748d55c68ea395 Mon Sep 17 00:00:00 2001
From: Jochen
Date: Thu, 6 Nov 2025 15:07:46 +1100
Subject: [PATCH 1/2] respect context length for anthropic

use the context length as per the config, rather than just hard-coded values.
---
 crates/g3-core/src/lib.rs | 101 ++++++++++++++++++++++++++++++++------
 test_anthropic_fix.md     |  70 ++++++++++++++++++++++++++
 2 files changed, 157 insertions(+), 14 deletions(-)
 create mode 100644 test_anthropic_fix.md

diff --git a/crates/g3-core/src/lib.rs b/crates/g3-core/src/lib.rs
index 937d704..1c754fa 100644
--- a/crates/g3-core/src/lib.rs
+++ b/crates/g3-core/src/lib.rs
@@ -921,11 +921,28 @@ impl Agent {
     }
 
     fn determine_context_length(config: &Config, providers: &ProviderRegistry) -> Result<u32> {
+        // Get the configured max_tokens for the current provider
+        fn get_provider_max_tokens(config: &Config, provider_name: &str) -> Option<u32> {
+            match provider_name {
+                "anthropic" => config.providers.anthropic.as_ref()?.max_tokens,
+                "openai" => config.providers.openai.as_ref()?.max_tokens,
+                "databricks" => config.providers.databricks.as_ref()?.max_tokens,
+                "embedded" => config.providers.embedded.as_ref()?.max_tokens,
+                _ => None,
+            }
+        }
+
         // Get the active provider to determine context length
         let provider = providers.get(None)?;
         let provider_name = provider.name();
         let model_name = provider.model();
 
+        // Check if there's a configured context length override first
+        if let Some(max_tokens) = get_provider_max_tokens(config, provider_name) {
+            debug!("Using configured max_tokens for {}: {}", provider_name, max_tokens);
+            return Ok(max_tokens);
+        }
+
         // Use provider-specific context length if available, otherwise fall back to agent config
         let context_length = match provider_name {
             "embedded" => {
@@ -950,17 +967,21 @@ impl Agent {
             }
             "anthropic" => {
                 // Claude models have large context windows
-                200000 // Default for Claude models
+                // Use configured max_tokens or fall back to default
+                get_provider_max_tokens(config, "anthropic").unwrap_or(200000)
             }
             "databricks" => {
                 // Databricks models have varying context windows depending on the model
-                if model_name.contains("claude") {
-                    200000 // Claude models on Databricks have large context windows
-                } else if model_name.contains("llama") || model_name.contains("dbrx") {
-                    32768 // DBRX supports 32k context
-                } else {
-                    16384 // Conservative default for other Databricks models
-                }
+                // Use configured max_tokens or fall back to model-specific defaults
+                get_provider_max_tokens(config, "databricks").unwrap_or_else(|| {
+                    if model_name.contains("claude") {
+                        200000 // Claude models on Databricks have large context windows
+                    } else if model_name.contains("llama") || model_name.contains("dbrx") {
+                        32768 // DBRX supports 32k context
+                    } else {
+                        16384 // Conservative default for other Databricks models
+                    }
+                })
             }
             _ => config.agent.max_context_length as u32,
         };
@@ -1511,7 +1532,7 @@ Template:
         // Dynamically calculate max_tokens for summary based on what's left
         let summary_max_tokens = match provider.name() {
             "databricks" | "anthropic" => {
-                let model_limit = 200_000u32;
+                let model_limit = self.context_window.total_tokens;
                 let current_usage = self.context_window.used_tokens;
                 let available = model_limit
                     .saturating_sub(current_usage)
@@ -2394,6 +2415,28 @@ Template:
         // Check if we need to summarize before starting
         if self.context_window.should_summarize() {
+            // First try thinning if we haven't reached 90% yet
+            if self.context_window.percentage_used() < 90.0 && self.context_window.should_thin() {
+                self.ui_writer.print_context_status(&format!(
+                    "\n🥒 Context window at {}%. Trying thinning first...",
+                    self.context_window.percentage_used() as u32
+                ));
+
+                let (thin_summary, chars_saved) = self.context_window.thin_context();
+                self.thinning_events.push(chars_saved);
+                self.ui_writer.print_context_thinning(&thin_summary);
+
+                // Check if thinning was sufficient
+                if !self.context_window.should_summarize() {
+                    self.ui_writer.print_context_status("✅ Thinning resolved capacity issue. Continuing...\n");
+                    // Continue with the original request without summarization
+                } else {
+                    self.ui_writer.print_context_status("⚠️ Thinning insufficient. Proceeding with summarization...\n");
+                }
+            }
+
+            // Only proceed with summarization if still needed after thinning
+            if self.context_window.should_summarize() {
             // Notify user about summarization
             self.ui_writer.print_context_status(&format!(
                 "\n🗜️ Context window reaching capacity ({}%). Creating summary...",
                 self.context_window.percentage_used() as u32
@@ -2433,14 +2476,22 @@ Template:
             // We need to ensure: used_tokens + max_tokens <= total_context_limit
             let summary_max_tokens = match provider.name() {
                 "databricks" | "anthropic" => {
-                    // Claude models have 200k context
-                    // Calculate how much room we have left
-                    let model_limit = 200_000u32;
+                    // Use the actual configured context window size
+                    let model_limit = self.context_window.total_tokens;
                     let current_usage = self.context_window.used_tokens;
-                    // Leave some buffer (5k tokens) for safety
+
+                    // Check if we have enough capacity for summarization
+                    if current_usage >= model_limit.saturating_sub(1000) {
+                        error!("Context window at capacity ({}%), cannot summarize. Current: {}, Limit: {}",
+                            self.context_window.percentage_used(), current_usage, model_limit);
+                        return Err(anyhow::anyhow!("Context window at capacity. Try using /thinnify or /compact commands to reduce context size, or start a new session."));
+                    }
+
+                    // Leave buffer proportional to model size (min 1k, max 10k)
+                    let buffer = (model_limit / 40).clamp(1000, 10000); // 2.5% buffer
                     let available = model_limit
                         .saturating_sub(current_usage)
-                        .saturating_sub(5000);
+                        .saturating_sub(buffer);
                     // Cap at a reasonable summary size (10k tokens max)
                     Some(available.min(10_000))
                 }
@@ -2448,6 +2499,13 @@ Template:
                     // For smaller context models, be more conservative
                     let model_limit = self.context_window.total_tokens;
                     let current_usage = self.context_window.used_tokens;
+
+                    // Check capacity for embedded models too
+                    if current_usage >= model_limit.saturating_sub(500) {
+                        error!("Embedded model context window at capacity ({}%)", self.context_window.percentage_used());
+                        return Err(anyhow::anyhow!("Context window at capacity. Try using /thinnify command to reduce context size, or start a new session."));
+                    }
+
                     // Leave 1k buffer
                     let available = model_limit
                         .saturating_sub(current_usage)
@@ -2457,6 +2515,14 @@ Template:
                 }
                 _ => {
                     // Default: conservative approach
+                    let model_limit = self.context_window.total_tokens;
+                    let current_usage = self.context_window.used_tokens;
+
+                    if current_usage >= model_limit.saturating_sub(1000) {
+                        error!("Context window at capacity ({}%)", self.context_window.percentage_used());
+                        return Err(anyhow::anyhow!("Context window at capacity. Try using /thinnify or /compact commands, or start a new session."));
+                    }
+
                     let available = self.context_window.remaining_tokens().saturating_sub(2000);
                     Some(available.min(5000))
                 }
@@ -2466,6 +2532,12 @@ Template:
                 "Requesting summary with max_tokens: {:?} (current usage: {} tokens)",
                 summary_max_tokens, self.context_window.used_tokens
             );
+
+            // Final safety check
+            if summary_max_tokens.unwrap_or(0) == 0 {
+                error!("No tokens available for summarization");
+                return Err(anyhow::anyhow!("No context window capacity left for summarization. Use /thinnify to reduce context size or start a new session."));
+            }

             let summary_request = CompletionRequest {
                 messages: summary_messages,
@@ -2507,6 +2579,7 @@ Template:
                     }
                 }
             }
+            }

         loop {
             iteration_count += 1;
diff --git a/test_anthropic_fix.md b/test_anthropic_fix.md
new file mode 100644
index 0000000..116bd5d
--- /dev/null
+++ b/test_anthropic_fix.md
@@ -0,0 +1,70 @@
+# Anthropic max_tokens Error Fix - Test Plan
+
+## Changes Made
+
+### 1. Fixed Context Window Size Detection
+- **Problem**: Code used hardcoded 200k limit for Anthropic instead of configured max_tokens
+- **Fix**: Modified `determine_context_length()` to check configured max_tokens first before falling back to defaults
+- **Files**: `crates/g3-core/src/lib.rs` lines 923-945, 967-985
+
+### 2. Added Thinning Before Summarization
+- **Problem**: Code attempted summarization even when context window was nearly full
+- **Fix**: Added logic to try thinning first when context usage is between 80-90%
+- **Files**: `crates/g3-core/src/lib.rs` lines 2415-2439
+
+### 3. Added Capacity Checks Before Summarization
+- **Problem**: No validation that sufficient tokens remained for summarization
+- **Fix**: Added capacity checks for all provider types with helpful error messages
+- **Files**: `crates/g3-core/src/lib.rs` lines 2480-2520
+
+### 4. Improved Error Messages
+- **Problem**: Generic errors when summarization failed
+- **Fix**: Specific error messages suggesting `/thinnify` and `/compact` commands
+- **Files**: Multiple locations in summarization logic
+
+### 5. Dynamic Buffer Calculation
+- **Problem**: Fixed 5k buffer regardless of model size
+- **Fix**: Proportional buffer (2.5% of model limit, min 1k, max 10k)
+- **Files**: `crates/g3-core/src/lib.rs` line 2487
+
+## Test Cases
+
+### Test 1: Configured max_tokens Respected
+```toml
+# In g3.toml
+[providers.anthropic]
+api_key = "your-key"
+model = "claude-3-5-sonnet-20241022"
+max_tokens = 50000  # Should use this instead of 200k default
+```
+
+### Test 2: Thinning Before Summarization
+- Fill context to 85% capacity
+- Verify thinning is attempted before summarization
+- Check that summarization is skipped if thinning resolves the issue
+
+### Test 3: Capacity Error Handling
+- Fill context to 98% capacity
+- Verify helpful error message is shown instead of API error
+- Check that `/thinnify` and `/compact` commands are suggested
+
+### Test 4: Provider-Specific Handling
+- Test with different providers (anthropic, databricks, embedded)
+- Verify each uses appropriate capacity checks and buffers
+
+## Expected Behavior
+
+1. **No more max_tokens API errors** from Anthropic when context window is full
+2. **Automatic thinning** when approaching capacity (80-90%)
+3. **Clear error messages** with actionable suggestions when at capacity
+4. **Respect configured limits** instead of hardcoded defaults
+5. **Graceful degradation** with helpful user guidance
+
+## Manual Testing Commands
+
+```bash
+# Test with small max_tokens to trigger the issue quickly
+g3 --chat
+# Then paste large amounts of text to fill context window
+# Verify thinning and error handling work correctly
+```

From 0e1f9dbf9a5b2028d2b169910ff6975139d4fb3e Mon Sep 17 00:00:00 2001
From: Jochen
Date: Thu, 6 Nov 2025 19:47:02 +1100
Subject: [PATCH 2/2] rename max_context_length to fallback_default_max_tokens

---
 config.coach-player.example.toml |  2 +-
 config.example.toml              |  2 +-
 crates/g3-config/src/lib.rs      |  6 +++---
 crates/g3-config/src/tests.rs    |  6 +++---
 crates/g3-core/src/lib.rs        | 12 ++++++------
 5 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/config.coach-player.example.toml b/config.coach-player.example.toml
index 2101564..999b674 100644
--- a/config.coach-player.example.toml
+++ b/config.coach-player.example.toml
@@ -19,6 +19,6 @@ max_tokens = 4096
 temperature = 0.3  # Slightly higher temperature for more creative implementations
 
 [agent]
-max_context_length = 8192
+fallback_default_max_tokens = 8192
 enable_streaming = true
 timeout_seconds = 60
\ No newline at end of file
diff --git a/config.example.toml b/config.example.toml
index b58ae3f..56954f9 100644
--- a/config.example.toml
+++ b/config.example.toml
@@ -15,7 +15,7 @@ temperature = 0.1
 use_oauth = true
 
 [agent]
-max_context_length = 8192
+fallback_default_max_tokens = 8192
 enable_streaming = true
 timeout_seconds = 60
 
diff --git a/crates/g3-config/src/lib.rs b/crates/g3-config/src/lib.rs
index d9f0602..ba578e9 100644
--- a/crates/g3-config/src/lib.rs
+++ b/crates/g3-config/src/lib.rs
@@ -62,7 +62,7 @@ pub struct EmbeddedConfig {
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct AgentConfig {
-    pub max_context_length: usize,
+    pub fallback_default_max_tokens: usize,
     pub enable_streaming: bool,
     pub timeout_seconds: u64,
     pub auto_compact: bool,
@@ -133,7 +133,7 @@ impl Default for Config {
                 player: None, // Will use default_provider if not specified
             },
             agent: AgentConfig {
-                max_context_length: 8192,
+                fallback_default_max_tokens: 8192,
                 enable_streaming: true,
                 timeout_seconds: 60,
                 auto_compact: true,
@@ -249,7 +249,7 @@ impl Config {
                 player: None, // Will use default_provider if not specified
             },
             agent: AgentConfig {
-                max_context_length: 8192,
+                fallback_default_max_tokens: 8192,
                 enable_streaming: true,
                 timeout_seconds: 60,
                 auto_compact: true,
diff --git a/crates/g3-config/src/tests.rs b/crates/g3-config/src/tests.rs
index a1e1e9f..6899a8b 100644
--- a/crates/g3-config/src/tests.rs
+++ b/crates/g3-config/src/tests.rs
@@ -31,7 +31,7 @@ model_path = "test.gguf"
 model_type = "llama"
 
 [agent]
-max_context_length = 8192
+fallback_default_max_tokens = 8192
 enable_streaming = true
 timeout_seconds = 60
 "#;
@@ -72,7 +72,7 @@ token = "test-token"
 model = "test-model"
 
 [agent]
-max_context_length = 8192
+fallback_default_max_tokens = 8192
 enable_streaming = true
 timeout_seconds = 60
 "#;
@@ -113,7 +113,7 @@ token = "test-token"
 model = "test-model"
 
 [agent]
-max_context_length = 8192
+fallback_default_max_tokens = 8192
 enable_streaming = true
 timeout_seconds = 60
 "#;
diff --git a/crates/g3-core/src/lib.rs b/crates/g3-core/src/lib.rs
index 1c754fa..b338852 100644
--- a/crates/g3-core/src/lib.rs
+++ b/crates/g3-core/src/lib.rs
@@ -865,7 +865,7 @@ impl Agent {
         debug!("Default provider set successfully");
 
         // Determine context window size based on active provider
-        let context_length = Self::determine_context_length(&config, &providers)?;
+        let context_length = Self::get_configured_context_length(&config, &providers)?;
         let mut context_window = ContextWindow::new(context_length);
 
         // If README content is provided, add it as the first system message
@@ -920,7 +920,7 @@ impl Agent {
         })
     }
 
-    fn determine_context_length(config: &Config, providers: &ProviderRegistry) -> Result<u32> {
+    fn get_configured_context_length(config: &Config, providers: &ProviderRegistry) -> Result<u32> {
         // Get the configured max_tokens for the current provider
         fn get_provider_max_tokens(config: &Config, provider_name: &str) -> Option<u32> {
             match provider_name {
@@ -959,7 +959,7 @@ impl Agent {
                     }
                 })
             } else {
-                config.agent.max_context_length as u32
+                config.agent.fallback_default_max_tokens as u32
             }
         }
         "openai" => {
@@ -983,7 +983,7 @@ impl Agent {
                 }
             })
         }
-            _ => config.agent.max_context_length as u32,
+            _ => config.agent.fallback_default_max_tokens as u32,
         };
 
         debug!(
@@ -2415,8 +2415,8 @@ Template:
         // Check if we need to summarize before starting
         if self.context_window.should_summarize() {
-            // First try thinning if we haven't reached 90% yet
-            if self.context_window.percentage_used() < 90.0 && self.context_window.should_thin() {
+            // First try thinning if we are at capacity, don't call the LLM for a summary (might fail)
+            if self.context_window.percentage_used() > 90.0 && self.context_window.should_thin() {
                 self.ui_writer.print_context_status(&format!(
                     "\n🥒 Context window at {}%. Trying thinning first...",
                     self.context_window.percentage_used() as u32
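Reviewer note, not part of the patch series: a minimal `g3.toml` sketch of how the two settings touched by these patches interact after the rename. The key names are taken from the diffs above; the `50000` value and the API key are purely illustrative.

```toml
# Per-provider override: checked first by get_configured_context_length()
[providers.anthropic]
api_key = "your-key"
model = "claude-3-5-sonnet-20241022"
max_tokens = 50000   # illustrative value; replaces the hard-coded 200k assumption

# Agent-wide fallback, renamed in PATCH 2/2
[agent]
fallback_default_max_tokens = 8192
enable_streaming = true
timeout_seconds = 60
```

If the provider-level `max_tokens` is omitted, Anthropic falls back to the 200000 default inside `get_configured_context_length()`; `fallback_default_max_tokens` only applies to providers without a model-specific default.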