respect context length for anthropic

Use the context length from the config rather than hard-coded values.
Jochen
2025-11-06 15:07:46 +11:00
parent cef234d91a
commit af20c93c61
2 changed files with 157 additions and 14 deletions

crates/g3-core/src/lib.rs

@@ -921,11 +921,28 @@ impl<W: UiWriter> Agent<W> {
     }
 
     fn determine_context_length(config: &Config, providers: &ProviderRegistry) -> Result<u32> {
+        // Get the configured max_tokens for the current provider
+        fn get_provider_max_tokens(config: &Config, provider_name: &str) -> Option<u32> {
+            match provider_name {
+                "anthropic" => config.providers.anthropic.as_ref()?.max_tokens,
+                "openai" => config.providers.openai.as_ref()?.max_tokens,
+                "databricks" => config.providers.databricks.as_ref()?.max_tokens,
+                "embedded" => config.providers.embedded.as_ref()?.max_tokens,
+                _ => None,
+            }
+        }
+
         // Get the active provider to determine context length
         let provider = providers.get(None)?;
         let provider_name = provider.name();
         let model_name = provider.model();
 
+        // Check if there's a configured context length override first
+        if let Some(max_tokens) = get_provider_max_tokens(config, provider_name) {
+            debug!("Using configured max_tokens for {}: {}", provider_name, max_tokens);
+            return Ok(max_tokens);
+        }
+
         // Use provider-specific context length if available, otherwise fall back to agent config
         let context_length = match provider_name {
             "embedded" => {
@@ -950,10 +967,13 @@ impl<W: UiWriter> Agent<W> {
             }
             "anthropic" => {
                 // Claude models have large context windows
-                200000 // Default for Claude models
+                // Use configured max_tokens or fall back to default
+                get_provider_max_tokens(config, "anthropic").unwrap_or(200000)
             }
             "databricks" => {
                 // Databricks models have varying context windows depending on the model
+                // Use configured max_tokens or fall back to model-specific defaults
+                get_provider_max_tokens(config, "databricks").unwrap_or_else(|| {
                 if model_name.contains("claude") {
                     200000 // Claude models on Databricks have large context windows
                 } else if model_name.contains("llama") || model_name.contains("dbrx") {
@@ -961,6 +981,7 @@ impl<W: UiWriter> Agent<W> {
                 } else {
                     16384 // Conservative default for other Databricks models
                 }
+                })
             }
             _ => config.agent.max_context_length as u32,
         };
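
For reference, the helper introduced above assumes each provider section of the config exposes an optional `max_tokens` field. A minimal sketch of the shape implied by the field accesses in this diff (the real structs in the config crate may differ):

```rust
// Hypothetical reconstruction, inferred from `config.providers.<name>.as_ref()?.max_tokens`
// and `config.agent.max_context_length`; only the fields this diff touches are shown.
pub struct Config {
    pub providers: Providers,
    pub agent: AgentConfig,
}

pub struct Providers {
    pub anthropic: Option<ProviderConfig>,
    pub openai: Option<ProviderConfig>,
    pub databricks: Option<ProviderConfig>,
    pub embedded: Option<ProviderConfig>,
}

pub struct ProviderConfig {
    // None means "not configured": fall through to the provider-specific defaults
    pub max_tokens: Option<u32>,
}

pub struct AgentConfig {
    pub max_context_length: usize,
}
```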
@@ -1511,7 +1532,7 @@ Template:
         // Dynamically calculate max_tokens for summary based on what's left
         let summary_max_tokens = match provider.name() {
             "databricks" | "anthropic" => {
-                let model_limit = 200_000u32;
+                let model_limit = self.context_window.total_tokens;
                 let current_usage = self.context_window.used_tokens;
                 let available = model_limit
                     .saturating_sub(current_usage)
@@ -2393,6 +2414,28 @@ Template:
         let mut response_started = false;
 
         // Check if we need to summarize before starting
+        if self.context_window.should_summarize() {
+            // First try thinning if we haven't reached 90% yet
+            if self.context_window.percentage_used() < 90.0 && self.context_window.should_thin() {
+                self.ui_writer.print_context_status(&format!(
+                    "\n🥒 Context window at {}%. Trying thinning first...",
+                    self.context_window.percentage_used() as u32
+                ));
+
+                let (thin_summary, chars_saved) = self.context_window.thin_context();
+                self.thinning_events.push(chars_saved);
+                self.ui_writer.print_context_thinning(&thin_summary);
+
+                // Check if thinning was sufficient
+                if !self.context_window.should_summarize() {
+                    self.ui_writer.print_context_status("✅ Thinning resolved capacity issue. Continuing...\n");
+                    // Continue with the original request without summarization
+                } else {
+                    self.ui_writer.print_context_status("⚠️ Thinning insufficient. Proceeding with summarization...\n");
+                }
+            }
+
+            // Only proceed with summarization if still needed after thinning
         if self.context_window.should_summarize() {
             // Notify user about summarization
             self.ui_writer.print_context_status(&format!(
@@ -2433,14 +2476,22 @@ Template:
             // We need to ensure: used_tokens + max_tokens <= total_context_limit
             let summary_max_tokens = match provider.name() {
                 "databricks" | "anthropic" => {
-                    // Claude models have 200k context
-                    // Calculate how much room we have left
-                    let model_limit = 200_000u32;
+                    // Use the actual configured context window size
+                    let model_limit = self.context_window.total_tokens;
                     let current_usage = self.context_window.used_tokens;
-                    // Leave some buffer (5k tokens) for safety
+
+                    // Check if we have enough capacity for summarization
+                    if current_usage >= model_limit.saturating_sub(1000) {
+                        error!("Context window at capacity ({}%), cannot summarize. Current: {}, Limit: {}",
+                            self.context_window.percentage_used(), current_usage, model_limit);
+                        return Err(anyhow::anyhow!("Context window at capacity. Try using /thinnify or /compact commands to reduce context size, or start a new session."));
+                    }
+
+                    // Leave buffer proportional to model size (min 1k, max 10k)
+                    let buffer = (model_limit / 40).clamp(1000, 10000); // 2.5% buffer
                     let available = model_limit
                         .saturating_sub(current_usage)
-                        .saturating_sub(5000);
+                        .saturating_sub(buffer);
                     // Cap at a reasonable summary size (10k tokens max)
                     Some(available.min(10_000))
                 }
@@ -2448,6 +2499,13 @@ Template:
                     // For smaller context models, be more conservative
                     let model_limit = self.context_window.total_tokens;
                     let current_usage = self.context_window.used_tokens;
+
+                    // Check capacity for embedded models too
+                    if current_usage >= model_limit.saturating_sub(500) {
+                        error!("Embedded model context window at capacity ({}%)", self.context_window.percentage_used());
+                        return Err(anyhow::anyhow!("Context window at capacity. Try using /thinnify command to reduce context size, or start a new session."));
+                    }
+
                     // Leave 1k buffer
                     let available = model_limit
                         .saturating_sub(current_usage)
@@ -2457,6 +2515,14 @@ Template:
                 }
                 _ => {
                     // Default: conservative approach
+                    let model_limit = self.context_window.total_tokens;
+                    let current_usage = self.context_window.used_tokens;
+
+                    if current_usage >= model_limit.saturating_sub(1000) {
+                        error!("Context window at capacity ({}%)", self.context_window.percentage_used());
+                        return Err(anyhow::anyhow!("Context window at capacity. Try using /thinnify or /compact commands, or start a new session."));
+                    }
+
                     let available = self.context_window.remaining_tokens().saturating_sub(2000);
                     Some(available.min(5000))
                 }
@@ -2467,6 +2533,12 @@ Template:
                 summary_max_tokens, self.context_window.used_tokens
             );
 
+            // Final safety check
+            if summary_max_tokens.unwrap_or(0) == 0 {
+                error!("No tokens available for summarization");
+                return Err(anyhow::anyhow!("No context window capacity left for summarization. Use /thinnify to reduce context size or start a new session."));
+            }
+
             let summary_request = CompletionRequest {
                 messages: summary_messages,
                 max_tokens: summary_max_tokens,
@@ -2507,6 +2579,7 @@ Template:
                 }
             }
         }
+        }
 
         loop {
             iteration_count += 1;

test_anthropic_fix.md (new file, 70 lines)

@@ -0,0 +1,70 @@
# Anthropic max_tokens Error Fix - Test Plan
## Changes Made
### 1. Fixed Context Window Size Detection
- **Problem**: Code used a hardcoded 200k limit for Anthropic instead of the configured max_tokens
- **Fix**: Modified `determine_context_length()` to check configured max_tokens first before falling back to defaults
- **Files**: `crates/g3-core/src/lib.rs` lines 923-945, 967-985
### 2. Added Thinning Before Summarization
- **Problem**: Code attempted summarization even when the context window was nearly full
- **Fix**: Added logic to try thinning first when context usage is between 80% and 90%
- **Files**: `crates/g3-core/src/lib.rs` lines 2415-2439
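
The thresholds described in this plan (summarize from roughly 80% usage, thin only below 90%) distill to something like the following sketch; this is an illustration of the decision order, not the actual `ContextWindow` implementation:

```rust
/// Illustrative only: the real checks live in ContextWindow's
/// should_summarize()/should_thin() methods.
enum ContextAction {
    Continue,   // plenty of room left
    ThinFirst,  // try cheap thinning; summarize only if still needed
    Summarize,  // too full for thinning to be worth attempting
}

fn choose_action(percentage_used: f64, can_thin: bool) -> ContextAction {
    if percentage_used < 80.0 {
        ContextAction::Continue
    } else if percentage_used < 90.0 && can_thin {
        ContextAction::ThinFirst
    } else {
        ContextAction::Summarize
    }
}
```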
### 3. Added Capacity Checks Before Summarization
- **Problem**: No validation that sufficient tokens remained for summarization
- **Fix**: Added capacity checks for all provider types with helpful error messages
- **Files**: `crates/g3-core/src/lib.rs` lines 2480-2520
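
As a worked instance of the new guard (numbers chosen for illustration): with a configured 50k window that is already 49.5k full, the check fires and the user gets the actionable error instead of a doomed API request:

```rust
// Mirrors the anthropic/databricks capacity check added in this commit.
let model_limit: u32 = 50_000;   // from configured max_tokens
let current_usage: u32 = 49_500; // tokens already in the window
// 49_500 >= 50_000 - 1_000, so summarization is refused up front
assert!(current_usage >= model_limit.saturating_sub(1_000));
```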
### 4. Improved Error Messages
- **Problem**: Generic errors when summarization failed
- **Fix**: Specific error messages suggesting `/thinnify` and `/compact` commands
- **Files**: Multiple locations in summarization logic
### 5. Dynamic Buffer Calculation
- **Problem**: Fixed 5k buffer regardless of model size
- **Fix**: Proportional buffer (2.5% of model limit, min 1k, max 10k)
- **Files**: `crates/g3-core/src/lib.rs` line 2487
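
Worked values for the formula `(model_limit / 40).clamp(1000, 10000)`:

```rust
fn buffer_for(model_limit: u32) -> u32 {
    (model_limit / 40).clamp(1_000, 10_000) // 2.5%, clamped to [1k, 10k]
}

assert_eq!(buffer_for(200_000), 5_000);  // Claude-sized window: same 5k as the old fixed buffer
assert_eq!(buffer_for(16_384), 1_000);   // small window: 16_384 / 40 = 409, raised to the 1k floor
assert_eq!(buffer_for(500_000), 10_000); // very large window: capped at the 10k ceiling
```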
## Test Cases
### Test 1: Configured max_tokens Respected
```toml
# In g3.toml
[providers.anthropic]
api_key = "your-key"
model = "claude-3-5-sonnet-20241022"
max_tokens = 50000 # Should use this instead of 200k default
```
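
If the override is picked up, the `debug!` call added in `determine_context_length()` should log:

```
Using configured max_tokens for anthropic: 50000
```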
### Test 2: Thinning Before Summarization
- Fill context to 85% capacity
- Verify thinning is attempted before summarization
- Check that summarization is skipped if thinning resolves the issue
### Test 3: Capacity Error Handling
- Fill context to 98% capacity
- Verify a helpful error message is shown instead of an API error
- Check that `/thinnify` and `/compact` commands are suggested
### Test 4: Provider-Specific Handling
- Test with different providers (anthropic, databricks, embedded)
- Verify each uses appropriate capacity checks and buffers
## Expected Behavior
1. **No more max_tokens API errors** from Anthropic when the context window is full
2. **Automatic thinning** when approaching capacity (80-90%)
3. **Clear error messages** with actionable suggestions when at capacity
4. **Respect configured limits** instead of hardcoded defaults
5. **Graceful degradation** with helpful user guidance
## Manual Testing Commands
```bash
# Test with small max_tokens to trigger the issue quickly
g3 --chat
# Then paste large amounts of text to fill the context window
# Verify thinning and error handling work correctly
```