Merge pull request #19 from dhanji/jochen-fix-anthropic-context
Fix context window exhaustion
This commit is contained in:
@@ -19,6 +19,6 @@ max_tokens = 4096
|
|||||||
temperature = 0.3 # Slightly higher temperature for more creative implementations
|
temperature = 0.3 # Slightly higher temperature for more creative implementations
|
||||||
|
|
||||||
[agent]
|
[agent]
|
||||||
max_context_length = 8192
|
fallback_default_max_tokens = 8192
|
||||||
enable_streaming = true
|
enable_streaming = true
|
||||||
timeout_seconds = 60
|
timeout_seconds = 60
|
||||||
@@ -15,7 +15,7 @@ temperature = 0.1
|
|||||||
use_oauth = true
|
use_oauth = true
|
||||||
|
|
||||||
[agent]
|
[agent]
|
||||||
max_context_length = 8192
|
fallback_default_max_tokens = 8192
|
||||||
enable_streaming = true
|
enable_streaming = true
|
||||||
timeout_seconds = 60
|
timeout_seconds = 60
|
||||||
|
|
||||||
|
|||||||
@@ -62,7 +62,7 @@ pub struct EmbeddedConfig {
|
|||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
pub struct AgentConfig {
|
pub struct AgentConfig {
|
||||||
pub max_context_length: usize,
|
pub fallback_default_max_tokens: usize,
|
||||||
pub enable_streaming: bool,
|
pub enable_streaming: bool,
|
||||||
pub timeout_seconds: u64,
|
pub timeout_seconds: u64,
|
||||||
pub auto_compact: bool,
|
pub auto_compact: bool,
|
||||||
@@ -133,7 +133,7 @@ impl Default for Config {
|
|||||||
player: None, // Will use default_provider if not specified
|
player: None, // Will use default_provider if not specified
|
||||||
},
|
},
|
||||||
agent: AgentConfig {
|
agent: AgentConfig {
|
||||||
max_context_length: 8192,
|
fallback_default_max_tokens: 8192,
|
||||||
enable_streaming: true,
|
enable_streaming: true,
|
||||||
timeout_seconds: 60,
|
timeout_seconds: 60,
|
||||||
auto_compact: true,
|
auto_compact: true,
|
||||||
@@ -249,7 +249,7 @@ impl Config {
|
|||||||
player: None, // Will use default_provider if not specified
|
player: None, // Will use default_provider if not specified
|
||||||
},
|
},
|
||||||
agent: AgentConfig {
|
agent: AgentConfig {
|
||||||
max_context_length: 8192,
|
fallback_default_max_tokens: 8192,
|
||||||
enable_streaming: true,
|
enable_streaming: true,
|
||||||
timeout_seconds: 60,
|
timeout_seconds: 60,
|
||||||
auto_compact: true,
|
auto_compact: true,
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ model_path = "test.gguf"
|
|||||||
model_type = "llama"
|
model_type = "llama"
|
||||||
|
|
||||||
[agent]
|
[agent]
|
||||||
max_context_length = 8192
|
fallback_default_max_tokens = 8192
|
||||||
enable_streaming = true
|
enable_streaming = true
|
||||||
timeout_seconds = 60
|
timeout_seconds = 60
|
||||||
"#;
|
"#;
|
||||||
@@ -72,7 +72,7 @@ token = "test-token"
|
|||||||
model = "test-model"
|
model = "test-model"
|
||||||
|
|
||||||
[agent]
|
[agent]
|
||||||
max_context_length = 8192
|
fallback_default_max_tokens = 8192
|
||||||
enable_streaming = true
|
enable_streaming = true
|
||||||
timeout_seconds = 60
|
timeout_seconds = 60
|
||||||
"#;
|
"#;
|
||||||
@@ -113,7 +113,7 @@ token = "test-token"
|
|||||||
model = "test-model"
|
model = "test-model"
|
||||||
|
|
||||||
[agent]
|
[agent]
|
||||||
max_context_length = 8192
|
fallback_default_max_tokens = 8192
|
||||||
enable_streaming = true
|
enable_streaming = true
|
||||||
timeout_seconds = 60
|
timeout_seconds = 60
|
||||||
"#;
|
"#;
|
||||||
|
|||||||
@@ -877,7 +877,7 @@ impl<W: UiWriter> Agent<W> {
|
|||||||
debug!("Default provider set successfully");
|
debug!("Default provider set successfully");
|
||||||
|
|
||||||
// Determine context window size based on active provider
|
// Determine context window size based on active provider
|
||||||
let context_length = Self::determine_context_length(&config, &providers)?;
|
let context_length = Self::get_configured_context_length(&config, &providers)?;
|
||||||
let mut context_window = ContextWindow::new(context_length);
|
let mut context_window = ContextWindow::new(context_length);
|
||||||
|
|
||||||
// If README content is provided, add it as the first system message
|
// If README content is provided, add it as the first system message
|
||||||
@@ -943,12 +943,29 @@ impl<W: UiWriter> Agent<W> {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
fn determine_context_length(config: &Config, providers: &ProviderRegistry) -> Result<u32> {
|
fn get_configured_context_length(config: &Config, providers: &ProviderRegistry) -> Result<u32> {
|
||||||
|
// Get the configured max_tokens for the current provider
|
||||||
|
fn get_provider_max_tokens(config: &Config, provider_name: &str) -> Option<u32> {
|
||||||
|
match provider_name {
|
||||||
|
"anthropic" => config.providers.anthropic.as_ref()?.max_tokens,
|
||||||
|
"openai" => config.providers.openai.as_ref()?.max_tokens,
|
||||||
|
"databricks" => config.providers.databricks.as_ref()?.max_tokens,
|
||||||
|
"embedded" => config.providers.embedded.as_ref()?.max_tokens,
|
||||||
|
_ => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Get the active provider to determine context length
|
// Get the active provider to determine context length
|
||||||
let provider = providers.get(None)?;
|
let provider = providers.get(None)?;
|
||||||
let provider_name = provider.name();
|
let provider_name = provider.name();
|
||||||
let model_name = provider.model();
|
let model_name = provider.model();
|
||||||
|
|
||||||
|
// Check if there's a configured context length override first
|
||||||
|
if let Some(max_tokens) = get_provider_max_tokens(config, provider_name) {
|
||||||
|
debug!("Using configured max_tokens for {}: {}", provider_name, max_tokens);
|
||||||
|
return Ok(max_tokens);
|
||||||
|
}
|
||||||
|
|
||||||
// Use provider-specific context length if available, otherwise fall back to agent config
|
// Use provider-specific context length if available, otherwise fall back to agent config
|
||||||
let context_length = match provider_name {
|
let context_length = match provider_name {
|
||||||
"embedded" => {
|
"embedded" => {
|
||||||
@@ -965,7 +982,7 @@ impl<W: UiWriter> Agent<W> {
|
|||||||
}
|
}
|
||||||
})
|
})
|
||||||
} else {
|
} else {
|
||||||
config.agent.max_context_length as u32
|
config.agent.fallback_default_max_tokens as u32
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
"openai" => {
|
"openai" => {
|
||||||
@@ -973,19 +990,23 @@ impl<W: UiWriter> Agent<W> {
|
|||||||
}
|
}
|
||||||
"anthropic" => {
|
"anthropic" => {
|
||||||
// Claude models have large context windows
|
// Claude models have large context windows
|
||||||
200000 // Default for Claude models
|
// Use configured max_tokens or fall back to default
|
||||||
|
get_provider_max_tokens(config, "anthropic").unwrap_or(200000)
|
||||||
}
|
}
|
||||||
"databricks" => {
|
"databricks" => {
|
||||||
// Databricks models have varying context windows depending on the model
|
// Databricks models have varying context windows depending on the model
|
||||||
if model_name.contains("claude") {
|
// Use configured max_tokens or fall back to model-specific defaults
|
||||||
200000 // Claude models on Databricks have large context windows
|
get_provider_max_tokens(config, "databricks").unwrap_or_else(|| {
|
||||||
} else if model_name.contains("llama") || model_name.contains("dbrx") {
|
if model_name.contains("claude") {
|
||||||
32768 // DBRX supports 32k context
|
200000 // Claude models on Databricks have large context windows
|
||||||
} else {
|
} else if model_name.contains("llama") || model_name.contains("dbrx") {
|
||||||
16384 // Conservative default for other Databricks models
|
32768 // DBRX supports 32k context
|
||||||
}
|
} else {
|
||||||
|
16384 // Conservative default for other Databricks models
|
||||||
|
}
|
||||||
|
})
|
||||||
}
|
}
|
||||||
_ => config.agent.max_context_length as u32,
|
_ => config.agent.fallback_default_max_tokens as u32,
|
||||||
};
|
};
|
||||||
|
|
||||||
debug!(
|
debug!(
|
||||||
@@ -1641,7 +1662,7 @@ If you can complete it with 1-2 tool calls, skip TODO.
|
|||||||
// Dynamically calculate max_tokens for summary based on what's left
|
// Dynamically calculate max_tokens for summary based on what's left
|
||||||
let summary_max_tokens = match provider.name() {
|
let summary_max_tokens = match provider.name() {
|
||||||
"databricks" | "anthropic" => {
|
"databricks" | "anthropic" => {
|
||||||
let model_limit = 200_000u32;
|
let model_limit = self.context_window.total_tokens;
|
||||||
let current_usage = self.context_window.used_tokens;
|
let current_usage = self.context_window.used_tokens;
|
||||||
let available = model_limit
|
let available = model_limit
|
||||||
.saturating_sub(current_usage)
|
.saturating_sub(current_usage)
|
||||||
@@ -2524,6 +2545,28 @@ If you can complete it with 1-2 tool calls, skip TODO.
|
|||||||
|
|
||||||
// Check if we need to summarize before starting
|
// Check if we need to summarize before starting
|
||||||
if self.context_window.should_summarize() {
|
if self.context_window.should_summarize() {
|
||||||
|
// First try thinning if we are at capacity, don't call the LLM for a summary (might fail)
|
||||||
|
if self.context_window.percentage_used() > 90.0 && self.context_window.should_thin() {
|
||||||
|
self.ui_writer.print_context_status(&format!(
|
||||||
|
"\n🥒 Context window at {}%. Trying thinning first...",
|
||||||
|
self.context_window.percentage_used() as u32
|
||||||
|
));
|
||||||
|
|
||||||
|
let (thin_summary, chars_saved) = self.context_window.thin_context();
|
||||||
|
self.thinning_events.push(chars_saved);
|
||||||
|
self.ui_writer.print_context_thinning(&thin_summary);
|
||||||
|
|
||||||
|
// Check if thinning was sufficient
|
||||||
|
if !self.context_window.should_summarize() {
|
||||||
|
self.ui_writer.print_context_status("✅ Thinning resolved capacity issue. Continuing...\n");
|
||||||
|
// Continue with the original request without summarization
|
||||||
|
} else {
|
||||||
|
self.ui_writer.print_context_status("⚠️ Thinning insufficient. Proceeding with summarization...\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Only proceed with summarization if still needed after thinning
|
||||||
|
if self.context_window.should_summarize() {
|
||||||
// Notify user about summarization
|
// Notify user about summarization
|
||||||
self.ui_writer.print_context_status(&format!(
|
self.ui_writer.print_context_status(&format!(
|
||||||
"\n🗜️ Context window reaching capacity ({}%). Creating summary...",
|
"\n🗜️ Context window reaching capacity ({}%). Creating summary...",
|
||||||
@@ -2563,14 +2606,22 @@ If you can complete it with 1-2 tool calls, skip TODO.
|
|||||||
// We need to ensure: used_tokens + max_tokens <= total_context_limit
|
// We need to ensure: used_tokens + max_tokens <= total_context_limit
|
||||||
let summary_max_tokens = match provider.name() {
|
let summary_max_tokens = match provider.name() {
|
||||||
"databricks" | "anthropic" => {
|
"databricks" | "anthropic" => {
|
||||||
// Claude models have 200k context
|
// Use the actual configured context window size
|
||||||
// Calculate how much room we have left
|
let model_limit = self.context_window.total_tokens;
|
||||||
let model_limit = 200_000u32;
|
|
||||||
let current_usage = self.context_window.used_tokens;
|
let current_usage = self.context_window.used_tokens;
|
||||||
// Leave some buffer (5k tokens) for safety
|
|
||||||
|
// Check if we have enough capacity for summarization
|
||||||
|
if current_usage >= model_limit.saturating_sub(1000) {
|
||||||
|
error!("Context window at capacity ({}%), cannot summarize. Current: {}, Limit: {}",
|
||||||
|
self.context_window.percentage_used(), current_usage, model_limit);
|
||||||
|
return Err(anyhow::anyhow!("Context window at capacity. Try using /thinnify or /compact commands to reduce context size, or start a new session."));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Leave buffer proportional to model size (min 1k, max 10k)
|
||||||
|
let buffer = (model_limit / 40).clamp(1000, 10000); // 2.5% buffer
|
||||||
let available = model_limit
|
let available = model_limit
|
||||||
.saturating_sub(current_usage)
|
.saturating_sub(current_usage)
|
||||||
.saturating_sub(5000);
|
.saturating_sub(buffer);
|
||||||
// Cap at a reasonable summary size (10k tokens max)
|
// Cap at a reasonable summary size (10k tokens max)
|
||||||
Some(available.min(10_000))
|
Some(available.min(10_000))
|
||||||
}
|
}
|
||||||
@@ -2578,6 +2629,13 @@ If you can complete it with 1-2 tool calls, skip TODO.
|
|||||||
// For smaller context models, be more conservative
|
// For smaller context models, be more conservative
|
||||||
let model_limit = self.context_window.total_tokens;
|
let model_limit = self.context_window.total_tokens;
|
||||||
let current_usage = self.context_window.used_tokens;
|
let current_usage = self.context_window.used_tokens;
|
||||||
|
|
||||||
|
// Check capacity for embedded models too
|
||||||
|
if current_usage >= model_limit.saturating_sub(500) {
|
||||||
|
error!("Embedded model context window at capacity ({}%)", self.context_window.percentage_used());
|
||||||
|
return Err(anyhow::anyhow!("Context window at capacity. Try using /thinnify command to reduce context size, or start a new session."));
|
||||||
|
}
|
||||||
|
|
||||||
// Leave 1k buffer
|
// Leave 1k buffer
|
||||||
let available = model_limit
|
let available = model_limit
|
||||||
.saturating_sub(current_usage)
|
.saturating_sub(current_usage)
|
||||||
@@ -2587,6 +2645,14 @@ If you can complete it with 1-2 tool calls, skip TODO.
|
|||||||
}
|
}
|
||||||
_ => {
|
_ => {
|
||||||
// Default: conservative approach
|
// Default: conservative approach
|
||||||
|
let model_limit = self.context_window.total_tokens;
|
||||||
|
let current_usage = self.context_window.used_tokens;
|
||||||
|
|
||||||
|
if current_usage >= model_limit.saturating_sub(1000) {
|
||||||
|
error!("Context window at capacity ({}%)", self.context_window.percentage_used());
|
||||||
|
return Err(anyhow::anyhow!("Context window at capacity. Try using /thinnify or /compact commands, or start a new session."));
|
||||||
|
}
|
||||||
|
|
||||||
let available = self.context_window.remaining_tokens().saturating_sub(2000);
|
let available = self.context_window.remaining_tokens().saturating_sub(2000);
|
||||||
Some(available.min(5000))
|
Some(available.min(5000))
|
||||||
}
|
}
|
||||||
@@ -2596,6 +2662,12 @@ If you can complete it with 1-2 tool calls, skip TODO.
|
|||||||
"Requesting summary with max_tokens: {:?} (current usage: {} tokens)",
|
"Requesting summary with max_tokens: {:?} (current usage: {} tokens)",
|
||||||
summary_max_tokens, self.context_window.used_tokens
|
summary_max_tokens, self.context_window.used_tokens
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// Final safety check
|
||||||
|
if summary_max_tokens.unwrap_or(0) == 0 {
|
||||||
|
error!("No tokens available for summarization");
|
||||||
|
return Err(anyhow::anyhow!("No context window capacity left for summarization. Use /thinnify to reduce context size or start a new session."));
|
||||||
|
}
|
||||||
|
|
||||||
let summary_request = CompletionRequest {
|
let summary_request = CompletionRequest {
|
||||||
messages: summary_messages,
|
messages: summary_messages,
|
||||||
@@ -2637,6 +2709,7 @@ If you can complete it with 1-2 tool calls, skip TODO.
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
iteration_count += 1;
|
iteration_count += 1;
|
||||||
|
|||||||
70
test_anthropic_fix.md
Normal file
70
test_anthropic_fix.md
Normal file
@@ -0,0 +1,70 @@
|
|||||||
|
# Anthropic max_tokens Error Fix - Test Plan
|
||||||
|
|
||||||
|
## Changes Made
|
||||||
|
|
||||||
|
### 1. Fixed Context Window Size Detection
|
||||||
|
- **Problem**: Code used hardcoded 200k limit for Anthropic instead of configured max_tokens
|
||||||
|
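- **Fix**: Modified the context-length lookup (renamed from `determine_context_length()` to `get_configured_context_length()`) to check the active provider's configured max_tokens first before falling back to defaults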
|
||||||
|
- **Files**: `crates/g3-core/src/lib.rs` lines 923-945, 967-985
|
||||||
|
|
||||||
|
### 2. Added Thinning Before Summarization
|
||||||
|
- **Problem**: Code attempted summarization even when context window was nearly full
|
||||||
|
- **Fix**: Added logic to try thinning first when summarization is due and context usage is above 90%; summarization only runs if thinning did not free enough space
|
||||||
|
- **Files**: `crates/g3-core/src/lib.rs` lines 2415-2439
|
||||||
|
|
||||||
|
### 3. Added Capacity Checks Before Summarization
|
||||||
|
- **Problem**: No validation that sufficient tokens remained for summarization
|
||||||
|
- **Fix**: Added capacity checks for all provider types with helpful error messages
|
||||||
|
- **Files**: `crates/g3-core/src/lib.rs` lines 2480-2520
|
||||||
|
|
||||||
|
### 4. Improved Error Messages
|
||||||
|
- **Problem**: Generic errors when summarization failed
|
||||||
|
- **Fix**: Specific error messages suggesting `/thinnify` and `/compact` commands
|
||||||
|
- **Files**: Multiple locations in summarization logic
|
||||||
|
|
||||||
|
### 5. Dynamic Buffer Calculation
|
||||||
|
- **Problem**: Fixed 5k buffer regardless of model size
|
||||||
|
- **Fix**: Proportional buffer (2.5% of model limit, min 1k, max 10k)
|
||||||
|
- **Files**: `crates/g3-core/src/lib.rs` line 2487
|
||||||
|
|
||||||
|
## Test Cases
|
||||||
|
|
||||||
|
### Test 1: Configured max_tokens Respected
|
||||||
|
```toml
|
||||||
|
# In g3.toml
|
||||||
|
[providers.anthropic]
|
||||||
|
api_key = "your-key"
|
||||||
|
model = "claude-3-5-sonnet-20241022"
|
||||||
|
max_tokens = 50000 # Should use this instead of 200k default
|
||||||
|
```
|
||||||
|
|
||||||
|
### Test 2: Thinning Before Summarization
|
||||||
|
- Fill context to just above 90% capacity (the thinning-first path only triggers when usage exceeds 90%)
|
||||||
|
- Verify thinning is attempted before summarization
|
||||||
|
- Check that summarization is skipped if thinning resolves the issue
|
||||||
|
|
||||||
|
### Test 3: Capacity Error Handling
|
||||||
|
- Fill context to 98% capacity
|
||||||
|
- Verify helpful error message is shown instead of API error
|
||||||
|
- Check that `/thinnify` and `/compact` commands are suggested
|
||||||
|
|
||||||
|
### Test 4: Provider-Specific Handling
|
||||||
|
- Test with different providers (anthropic, databricks, embedded)
|
||||||
|
- Verify each uses appropriate capacity checks and buffers
|
||||||
|
|
||||||
|
## Expected Behavior
|
||||||
|
|
||||||
|
1. **No more max_tokens API errors** from Anthropic when context window is full
|
||||||
|
2. **Automatic thinning** before summarization when context usage exceeds 90%
|
||||||
|
3. **Clear error messages** with actionable suggestions when at capacity
|
||||||
|
4. **Respect configured limits** instead of hardcoded defaults
|
||||||
|
5. **Graceful degradation** with helpful user guidance
|
||||||
|
|
||||||
|
## Manual Testing Commands
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Test with small max_tokens to trigger the issue quickly
|
||||||
|
g3 --chat
|
||||||
|
# Then paste large amounts of text to fill context window
|
||||||
|
# Verify thinning and error handling work correctly
|
||||||
|
```
|
||||||
Reference in New Issue
Block a user