From af20c93c61eed3135104ceaa10748d55c68ea395 Mon Sep 17 00:00:00 2001
From: Jochen
Date: Thu, 6 Nov 2025 15:07:46 +1100
Subject: [PATCH 1/2] respect context length for anthropic

use the context length as per the config, rather than just hard-coded values.
---
 crates/g3-core/src/lib.rs | 101 ++++++++++++++++++++++++++++++++------
 test_anthropic_fix.md     |  70 ++++++++++++++++++++++++++
 2 files changed, 157 insertions(+), 14 deletions(-)
 create mode 100644 test_anthropic_fix.md

diff --git a/crates/g3-core/src/lib.rs b/crates/g3-core/src/lib.rs
index 937d704..1c754fa 100644
--- a/crates/g3-core/src/lib.rs
+++ b/crates/g3-core/src/lib.rs
@@ -921,11 +921,28 @@ impl Agent {
     }
 
     fn determine_context_length(config: &Config, providers: &ProviderRegistry) -> Result<u32> {
+        // Get the configured max_tokens for the current provider
+        fn get_provider_max_tokens(config: &Config, provider_name: &str) -> Option<u32> {
+            match provider_name {
+                "anthropic" => config.providers.anthropic.as_ref()?.max_tokens,
+                "openai" => config.providers.openai.as_ref()?.max_tokens,
+                "databricks" => config.providers.databricks.as_ref()?.max_tokens,
+                "embedded" => config.providers.embedded.as_ref()?.max_tokens,
+                _ => None,
+            }
+        }
+
         // Get the active provider to determine context length
         let provider = providers.get(None)?;
         let provider_name = provider.name();
         let model_name = provider.model();
 
+        // Check if there's a configured context length override first
+        if let Some(max_tokens) = get_provider_max_tokens(config, provider_name) {
+            debug!("Using configured max_tokens for {}: {}", provider_name, max_tokens);
+            return Ok(max_tokens);
+        }
+
         // Use provider-specific context length if available, otherwise fall back to agent config
         let context_length = match provider_name {
             "embedded" => {
@@ -950,17 +967,21 @@ impl Agent {
             }
             "anthropic" => {
                 // Claude models have large context windows
-                200000 // Default for Claude models
+                // Use configured max_tokens or fall back to default
+                get_provider_max_tokens(config, "anthropic").unwrap_or(200000)
             }
             "databricks" => {
                 // Databricks models have varying context windows depending on the model
-                if model_name.contains("claude") {
-                    200000 // Claude models on Databricks have large context windows
-                } else if model_name.contains("llama") || model_name.contains("dbrx") {
-                    32768 // DBRX supports 32k context
-                } else {
-                    16384 // Conservative default for other Databricks models
-                }
+                // Use configured max_tokens or fall back to model-specific defaults
+                get_provider_max_tokens(config, "databricks").unwrap_or_else(|| {
+                    if model_name.contains("claude") {
+                        200000 // Claude models on Databricks have large context windows
+                    } else if model_name.contains("llama") || model_name.contains("dbrx") {
+                        32768 // DBRX supports 32k context
+                    } else {
+                        16384 // Conservative default for other Databricks models
+                    }
+                })
             }
             _ => config.agent.max_context_length as u32,
         };
@@ -1511,7 +1532,7 @@ Template:
         // Dynamically calculate max_tokens for summary based on what's left
         let summary_max_tokens = match provider.name() {
             "databricks" | "anthropic" => {
-                let model_limit = 200_000u32;
+                let model_limit = self.context_window.total_tokens;
                 let current_usage = self.context_window.used_tokens;
                 let available = model_limit
                     .saturating_sub(current_usage)
@@ -2394,6 +2415,28 @@ Template:
         // Check if we need to summarize before starting
         if self.context_window.should_summarize() {
+            // First try thinning if we haven't reached 90% yet
+            if self.context_window.percentage_used() < 90.0 && self.context_window.should_thin() {
+                self.ui_writer.print_context_status(&format!(
+                    "\n🥒 Context window at {}%. Trying thinning first...",
+                    self.context_window.percentage_used() as u32
+                ));
+
+                let (thin_summary, chars_saved) = self.context_window.thin_context();
+                self.thinning_events.push(chars_saved);
+                self.ui_writer.print_context_thinning(&thin_summary);
+
+                // Check if thinning was sufficient
+                if !self.context_window.should_summarize() {
+                    self.ui_writer.print_context_status("✅ Thinning resolved capacity issue. Continuing...\n");
+                    // Continue with the original request without summarization
+                } else {
+                    self.ui_writer.print_context_status("⚠️ Thinning insufficient. Proceeding with summarization...\n");
+                }
+            }
+
+            // Only proceed with summarization if still needed after thinning
+            if self.context_window.should_summarize() {
             // Notify user about summarization
             self.ui_writer.print_context_status(&format!(
                 "\n🗜️ Context window reaching capacity ({}%). Creating summary...",
                 self.context_window.percentage_used() as u32
@@ -2433,14 +2476,22 @@ Template:
             // We need to ensure: used_tokens + max_tokens <= total_context_limit
             let summary_max_tokens = match provider.name() {
                 "databricks" | "anthropic" => {
-                    // Claude models have 200k context
-                    // Calculate how much room we have left
-                    let model_limit = 200_000u32;
+                    // Use the actual configured context window size
+                    let model_limit = self.context_window.total_tokens;
                     let current_usage = self.context_window.used_tokens;
-                    // Leave some buffer (5k tokens) for safety
+
+                    // Check if we have enough capacity for summarization
+                    if current_usage >= model_limit.saturating_sub(1000) {
+                        error!("Context window at capacity ({}%), cannot summarize. Current: {}, Limit: {}",
+                            self.context_window.percentage_used(), current_usage, model_limit);
+                        return Err(anyhow::anyhow!("Context window at capacity. Try using /thinnify or /compact commands to reduce context size, or start a new session."));
+                    }
+
+                    // Leave buffer proportional to model size (min 1k, max 10k)
+                    let buffer = (model_limit / 40).clamp(1000, 10000); // 2.5% buffer
                     let available = model_limit
                         .saturating_sub(current_usage)
-                        .saturating_sub(5000);
+                        .saturating_sub(buffer);
                     // Cap at a reasonable summary size (10k tokens max)
                     Some(available.min(10_000))
                 }
@@ -2448,6 +2499,13 @@ Template:
                     // For smaller context models, be more conservative
                     let model_limit = self.context_window.total_tokens;
                     let current_usage = self.context_window.used_tokens;
+
+                    // Check capacity for embedded models too
+                    if current_usage >= model_limit.saturating_sub(500) {
+                        error!("Embedded model context window at capacity ({}%)", self.context_window.percentage_used());
+                        return Err(anyhow::anyhow!("Context window at capacity. Try using /thinnify command to reduce context size, or start a new session."));
+                    }
+
                     // Leave 1k buffer
                     let available = model_limit
                         .saturating_sub(current_usage)
@@ -2457,6 +2515,14 @@ Template:
                 }
                 _ => {
                     // Default: conservative approach
+                    let model_limit = self.context_window.total_tokens;
+                    let current_usage = self.context_window.used_tokens;
+
+                    if current_usage >= model_limit.saturating_sub(1000) {
+                        error!("Context window at capacity ({}%)", self.context_window.percentage_used());
+                        return Err(anyhow::anyhow!("Context window at capacity. Try using /thinnify or /compact commands, or start a new session."));
+                    }
+
                     let available = self.context_window.remaining_tokens().saturating_sub(2000);
                     Some(available.min(5000))
                 }
@@ -2466,6 +2532,12 @@ Template:
                 "Requesting summary with max_tokens: {:?} (current usage: {} tokens)",
                 summary_max_tokens, self.context_window.used_tokens
             );
+
+            // Final safety check
+            if summary_max_tokens.unwrap_or(0) == 0 {
+                error!("No tokens available for summarization");
+                return Err(anyhow::anyhow!("No context window capacity left for summarization. Use /thinnify to reduce context size or start a new session."));
+            }

             let summary_request = CompletionRequest {
                 messages: summary_messages,
@@ -2507,6 +2579,7 @@ Template:
                     }
                 }
             }
+            }

         loop {
             iteration_count += 1;
diff --git a/test_anthropic_fix.md b/test_anthropic_fix.md
new file mode 100644
index 0000000..116bd5d
--- /dev/null
+++ b/test_anthropic_fix.md
@@ -0,0 +1,70 @@
+# Anthropic max_tokens Error Fix - Test Plan
+
+## Changes Made
+
+### 1. Fixed Context Window Size Detection
+- **Problem**: Code used hardcoded 200k limit for Anthropic instead of configured max_tokens
+- **Fix**: Modified `determine_context_length()` to check configured max_tokens first before falling back to defaults
+- **Files**: `crates/g3-core/src/lib.rs` lines 923-945, 967-985
+
+### 2. Added Thinning Before Summarization
+- **Problem**: Code attempted summarization even when context window was nearly full
+- **Fix**: Added logic to try thinning first when context usage is between 80-90%
+- **Files**: `crates/g3-core/src/lib.rs` lines 2415-2439
+
+### 3. Added Capacity Checks Before Summarization
+- **Problem**: No validation that sufficient tokens remained for summarization
+- **Fix**: Added capacity checks for all provider types with helpful error messages
+- **Files**: `crates/g3-core/src/lib.rs` lines 2480-2520
+
+### 4. Improved Error Messages
+- **Problem**: Generic errors when summarization failed
+- **Fix**: Specific error messages suggesting `/thinnify` and `/compact` commands
+- **Files**: Multiple locations in summarization logic
+
+### 5. Dynamic Buffer Calculation
+- **Problem**: Fixed 5k buffer regardless of model size
+- **Fix**: Proportional buffer (2.5% of model limit, min 1k, max 10k)
+- **Files**: `crates/g3-core/src/lib.rs` line 2487
+
+## Test Cases
+
+### Test 1: Configured max_tokens Respected
+```toml
+# In g3.toml
+[providers.anthropic]
+api_key = "your-key"
+model = "claude-3-5-sonnet-20241022"
+max_tokens = 50000  # Should use this instead of 200k default
+```
+
+### Test 2: Thinning Before Summarization
+- Fill context to 85% capacity
+- Verify thinning is attempted before summarization
+- Check that summarization is skipped if thinning resolves the issue
+
+### Test 3: Capacity Error Handling
+- Fill context to 98% capacity
+- Verify helpful error message is shown instead of API error
+- Check that `/thinnify` and `/compact` commands are suggested
+
+### Test 4: Provider-Specific Handling
+- Test with different providers (anthropic, databricks, embedded)
+- Verify each uses appropriate capacity checks and buffers
+
+## Expected Behavior
+
+1. **No more max_tokens API errors** from Anthropic when context window is full
+2. **Automatic thinning** when approaching capacity (80-90%)
+3. **Clear error messages** with actionable suggestions when at capacity
+4. **Respect configured limits** instead of hardcoded defaults
+5. **Graceful degradation** with helpful user guidance
+
+## Manual Testing Commands
+
+```bash
+# Test with small max_tokens to trigger the issue quickly
+g3 --chat
+# Then paste large amounts of text to fill context window
+# Verify thinning and error handling work correctly
+```

From 0e1f9dbf9a5b2028d2b169910ff6975139d4fb3e Mon Sep 17 00:00:00 2001
From: Jochen
Date: Thu, 6 Nov 2025 19:47:02 +1100
Subject: [PATCH 2/2] rename max_context_length to fallback_default_max_tokens

---
 config.coach-player.example.toml |  2 +-
 config.example.toml              |  2 +-
 crates/g3-config/src/lib.rs      |  6 +++---
 crates/g3-config/src/tests.rs    |  6 +++---
 crates/g3-core/src/lib.rs        | 12 ++++++------
 5 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/config.coach-player.example.toml b/config.coach-player.example.toml
index 2101564..999b674 100644
--- a/config.coach-player.example.toml
+++ b/config.coach-player.example.toml
@@ -19,6 +19,6 @@ max_tokens = 4096
 temperature = 0.3  # Slightly higher temperature for more creative implementations
 
 [agent]
-max_context_length = 8192
+fallback_default_max_tokens = 8192
 enable_streaming = true
 timeout_seconds = 60
\ No newline at end of file
diff --git a/config.example.toml b/config.example.toml
index b58ae3f..56954f9 100644
--- a/config.example.toml
+++ b/config.example.toml
@@ -15,7 +15,7 @@ temperature = 0.1
 use_oauth = true
 
 [agent]
-max_context_length = 8192
+fallback_default_max_tokens = 8192
 enable_streaming = true
 timeout_seconds = 60
 
diff --git a/crates/g3-config/src/lib.rs b/crates/g3-config/src/lib.rs
index d9f0602..ba578e9 100644
--- a/crates/g3-config/src/lib.rs
+++ b/crates/g3-config/src/lib.rs
@@ -62,7 +62,7 @@ pub struct EmbeddedConfig {
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct AgentConfig {
-    pub max_context_length: usize,
+    pub fallback_default_max_tokens: usize,
     pub enable_streaming: bool,
     pub timeout_seconds: u64,
     pub auto_compact: bool,
@@ -133,7 +133,7 @@ impl Default for Config {
                 player: None, // Will use default_provider if not specified
             },
             agent: AgentConfig {
-                max_context_length: 8192,
+                fallback_default_max_tokens: 8192,
                 enable_streaming: true,
                 timeout_seconds: 60,
                 auto_compact: true,
@@ -249,7 +249,7 @@ impl Config {
                 player: None, // Will use default_provider if not specified
             },
             agent: AgentConfig {
-                max_context_length: 8192,
+                fallback_default_max_tokens: 8192,
                 enable_streaming: true,
                 timeout_seconds: 60,
                 auto_compact: true,
diff --git a/crates/g3-config/src/tests.rs b/crates/g3-config/src/tests.rs
index a1e1e9f..6899a8b 100644
--- a/crates/g3-config/src/tests.rs
+++ b/crates/g3-config/src/tests.rs
@@ -31,7 +31,7 @@ model_path = "test.gguf"
 model_type = "llama"
 
 [agent]
-max_context_length = 8192
+fallback_default_max_tokens = 8192
 enable_streaming = true
 timeout_seconds = 60
 "#;
@@ -72,7 +72,7 @@ token = "test-token"
 model = "test-model"
 
 [agent]
-max_context_length = 8192
+fallback_default_max_tokens = 8192
 enable_streaming = true
 timeout_seconds = 60
 "#;
@@ -113,7 +113,7 @@ token = "test-token"
 model = "test-model"
 
 [agent]
-max_context_length = 8192
+fallback_default_max_tokens = 8192
 enable_streaming = true
 timeout_seconds = 60
 "#;
diff --git a/crates/g3-core/src/lib.rs b/crates/g3-core/src/lib.rs
index 1c754fa..b338852 100644
--- a/crates/g3-core/src/lib.rs
+++ b/crates/g3-core/src/lib.rs
@@ -865,7 +865,7 @@ impl Agent {
         debug!("Default provider set successfully");
 
         // Determine context window size based on active provider
-        let context_length = Self::determine_context_length(&config, &providers)?;
+        let context_length = Self::get_configured_context_length(&config, &providers)?;
         let mut context_window = ContextWindow::new(context_length);
 
         // If README content is provided, add it as the first system message
@@ -920,7 +920,7 @@ impl Agent {
         })
     }
 
-    fn determine_context_length(config: &Config, providers: &ProviderRegistry) -> Result<u32> {
+    fn get_configured_context_length(config: &Config, providers: &ProviderRegistry) -> Result<u32> {
         // Get the configured max_tokens for the current provider
         fn get_provider_max_tokens(config: &Config, provider_name: &str) -> Option<u32> {
             match provider_name {
@@ -959,7 +959,7 @@ impl Agent {
                     }
                 })
             } else {
-                config.agent.max_context_length as u32
+                config.agent.fallback_default_max_tokens as u32
             }
         }
         "openai" => {
@@ -983,7 +983,7 @@ impl Agent {
                 }
             })
         }
-            _ => config.agent.max_context_length as u32,
+            _ => config.agent.fallback_default_max_tokens as u32,
         };
 
         debug!(
@@ -2415,8 +2415,8 @@ Template:
         // Check if we need to summarize before starting
         if self.context_window.should_summarize() {
-            // First try thinning if we haven't reached 90% yet
-            if self.context_window.percentage_used() < 90.0 && self.context_window.should_thin() {
+            // First try thinning if we are at capacity, don't call the LLM for a summary (might fail)
+            if self.context_window.percentage_used() > 90.0 && self.context_window.should_thin() {
                 self.ui_writer.print_context_status(&format!(
                     "\n🥒 Context window at {}%. Trying thinning first...",
                     self.context_window.percentage_used() as u32
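Reviewer note, not part of the patch series: a minimal `g3.toml` sketch of how the two settings touched by these patches interact after the rename. The key names are taken from the diffs above; the `50000` value and the API key are purely illustrative.

```toml
# Per-provider override: checked first by get_configured_context_length()
[providers.anthropic]
api_key = "your-key"
model = "claude-3-5-sonnet-20241022"
max_tokens = 50000   # illustrative value; replaces the hard-coded 200k assumption

# Agent-wide fallback, renamed in PATCH 2/2
[agent]
fallback_default_max_tokens = 8192
enable_streaming = true
timeout_seconds = 60
```

If the provider-level `max_tokens` is omitted, Anthropic falls back to the 200000 default inside `get_configured_context_length()`; `fallback_default_max_tokens` only applies to providers without a model-specific default.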