Add 1% safety buffer to context window to prevent API token limit errors
Our token estimation heuristic (chars/3 × 1.1 for code, chars/4 × 1.1 for text) slightly undercounts over long sessions with hundreds of tool calls. This accumulated drift of ~89 tokens caused Anthropic API 400 errors: "prompt is too long: 200089 tokens > 200000 maximum". Fix: `ContextWindow::new()` now applies a 1% safety buffer, setting `total_tokens` to 99% of the provider-reported limit. For a 200k window this yields 198k — a 2000-token safety margin that absorbs estimation drift. All percentage calculations, compaction thresholds, and thinning triggers operate against the buffered limit, so compaction fires earlier and we never send a request the API will reject.
This commit is contained in:
@@ -84,9 +84,15 @@ pub struct ContextWindow {
|
|||||||
|
|
||||||
impl ContextWindow {
|
impl ContextWindow {
|
||||||
pub fn new(total_tokens: u32) -> Self {
|
pub fn new(total_tokens: u32) -> Self {
|
||||||
|
// Apply a 1% safety buffer to absorb token estimation drift.
|
||||||
|
// Our heuristic (chars/3 * 1.1 for code, chars/4 * 1.1 for text) slightly
|
||||||
|
// undercounts over long sessions with hundreds of tool calls. Without this
|
||||||
|
// buffer, accumulated drift of ~89 tokens caused API 400 errors:
|
||||||
|
// "prompt is too long: 200089 tokens > 200000 maximum"
|
||||||
|
let buffered_tokens = (total_tokens as f64 * 0.99) as u32;
|
||||||
Self {
|
Self {
|
||||||
used_tokens: 0,
|
used_tokens: 0,
|
||||||
total_tokens,
|
total_tokens: buffered_tokens,
|
||||||
cumulative_tokens: 0,
|
cumulative_tokens: 0,
|
||||||
conversation_history: Vec::new(),
|
conversation_history: Vec::new(),
|
||||||
last_thinning_percentage: 0,
|
last_thinning_percentage: 0,
|
||||||
@@ -783,23 +789,65 @@ mod tests {
|
|||||||
fn test_new_context_window() {
|
fn test_new_context_window() {
|
||||||
let cw = ContextWindow::new(100_000);
|
let cw = ContextWindow::new(100_000);
|
||||||
assert_eq!(cw.used_tokens, 0);
|
assert_eq!(cw.used_tokens, 0);
|
||||||
assert_eq!(cw.total_tokens, 100_000);
|
assert_eq!(cw.total_tokens, 99_000); // 1% buffer: 100_000 * 0.99
|
||||||
assert_eq!(cw.cumulative_tokens, 0);
|
assert_eq!(cw.cumulative_tokens, 0);
|
||||||
assert!(cw.conversation_history.is_empty());
|
assert!(cw.conversation_history.is_empty());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn test_1pct_buffer_200k() {
    // The exact production scenario: a 200k Anthropic context window.
    // The 1% safety buffer should leave 198k usable tokens.
    let window = ContextWindow::new(200_000);
    assert_eq!(window.total_tokens, 198_000, "200k * 0.99 = 198k");
}
|
||||||
|
|
||||||
|
#[test]
fn test_1pct_buffer_zero() {
    // Edge case: a zero-token window must not underflow when the
    // 1% buffer is applied (0 * 0.99 = 0).
    let window = ContextWindow::new(0);
    assert_eq!(window.total_tokens, 0);
}
|
||||||
|
|
||||||
|
#[test]
fn test_1pct_buffer_small() {
    // Small context window: the buffer truncates toward zero,
    // so 100 * 0.99 = 99.
    let window = ContextWindow::new(100);
    assert_eq!(window.total_tokens, 99);
}
|
||||||
|
|
||||||
|
#[test]
fn test_1pct_buffer_percentage_uses_buffered_total() {
    // percentage_used() must report against the buffered limit,
    // not the raw provider-reported limit.
    let mut window = ContextWindow::new(200_000);
    assert_eq!(window.total_tokens, 198_000);

    // Fill to exactly the buffered limit (100% of buffered, 99% of raw).
    window.used_tokens = 198_000;
    let pct = window.percentage_used();
    assert!(
        (pct - 100.0).abs() < 0.01,
        "Should be ~100% of buffered limit, got {:.2}%",
        pct,
    );

    // Consequently, compaction triggers well before the raw API limit.
    assert!(window.should_compact());
}
|
||||||
|
|
||||||
#[test]
fn test_percentage_used() {
    let mut window = ContextWindow::new(100);
    // total_tokens is 99 after the 1% safety buffer is applied,
    // so 50 used tokens is 50/99 of the window, not 50%.
    window.used_tokens = 50;
    let expected = (50.0 / 99.0) * 100.0;
    assert!((window.percentage_used() - expected).abs() < 0.01);
}
|
||||||
|
|
||||||
#[test]
fn test_remaining_tokens() {
    let mut window = ContextWindow::new(100);
    // total_tokens is 99 after the 1% safety buffer is applied.
    window.used_tokens = 30;
    assert_eq!(window.remaining_tokens(), 69); // 99 - 30
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
|||||||
@@ -22,7 +22,7 @@ fn test_task_result_basic_functionality() {
|
|||||||
// Test basic properties
|
// Test basic properties
|
||||||
assert_eq!(result.response, response);
|
assert_eq!(result.response, response);
|
||||||
assert_eq!(result.context_window.conversation_history.len(), 2);
|
assert_eq!(result.context_window.conversation_history.len(), 2);
|
||||||
assert_eq!(result.context_window.total_tokens, 10000);
|
assert_eq!(result.context_window.total_tokens, 9900); // 10000 * 0.99 (1% buffer)
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@@ -122,7 +122,7 @@ fn test_context_window_preservation() {
|
|||||||
let result = TaskResult::new("Response".to_string(), context.clone());
|
let result = TaskResult::new("Response".to_string(), context.clone());
|
||||||
|
|
||||||
// Verify context is preserved
|
// Verify context is preserved
|
||||||
assert_eq!(result.context_window.total_tokens, 5000);
|
assert_eq!(result.context_window.total_tokens, 4950); // 5000 * 0.99 (1% buffer)
|
||||||
assert!(result.context_window.used_tokens > 1234); // Should have increased
|
assert!(result.context_window.used_tokens > 1234); // Should have increased
|
||||||
assert_eq!(result.context_window.conversation_history.len(), 5);
|
assert_eq!(result.context_window.conversation_history.len(), 5);
|
||||||
|
|
||||||
|
|||||||
@@ -1476,3 +1476,145 @@ async fn test_tool_call_input_tokens_tracked_in_context_window() {
|
|||||||
"recalculate_tokens() should agree with incrementally tracked used_tokens"
|
"recalculate_tokens() should agree with incrementally tracked used_tokens"
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Test: 1% safety buffer prevents "prompt is too long" API errors
|
||||||
|
///
|
||||||
|
/// Exact reproduction of the failure from the screenshot:
|
||||||
|
/// "prompt is too long: 200089 tokens > 200000 maximum"
|
||||||
|
///
|
||||||
|
/// Our token estimation slightly undercounts (by ~0.05%) because:
|
||||||
|
/// - Tool call overhead (name, id, JSON structure) is approximated at 20 tokens
|
||||||
|
/// - The chars/3 * 1.1 heuristic for code/JSON can drift on certain content
|
||||||
|
/// - Message framing tokens (role markers, separators) aren't fully counted
|
||||||
|
///
|
||||||
|
/// Over a long session with hundreds of tool calls, these small errors accumulate
|
||||||
|
/// to ~89 tokens over the 200k limit. The 1% buffer (2000 tokens on a 200k window)
|
||||||
|
/// absorbs this drift so we never send a request the API will reject.
|
||||||
|
///
|
||||||
|
/// This test fills a context window to near-capacity and verifies:
|
||||||
|
/// 1. The buffered total_tokens is 99% of the requested size
|
||||||
|
/// 2. percentage_used() reports against the buffered limit (not the raw provider limit)
|
||||||
|
/// 3. A session that would be at 99.95% of the raw limit is at >100% of the buffered
|
||||||
|
/// limit, meaning compaction/thinning would have already triggered
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_1pct_buffer_prevents_prompt_too_long_error() {
|
||||||
|
use g3_core::context_window::ContextWindow;
|
||||||
|
use g3_providers::MessageToolCall;
|
||||||
|
|
||||||
|
// Create a 200k context window (the Anthropic default)
|
||||||
|
let cw = ContextWindow::new(200_000);
|
||||||
|
|
||||||
|
// The buffer should reduce total_tokens by 1%
|
||||||
|
let expected_buffered = (200_000_f64 * 0.99) as u32; // 198_000
|
||||||
|
assert_eq!(
|
||||||
|
cw.total_tokens, expected_buffered,
|
||||||
|
"ContextWindow should apply 1% safety buffer: expected {}, got {}",
|
||||||
|
expected_buffered, cw.total_tokens,
|
||||||
|
);
|
||||||
|
|
||||||
|
// Now simulate the exact scenario from the screenshot:
|
||||||
|
// Fill the context to ~199,900 estimated tokens (99.95% of raw 200k)
|
||||||
|
// which is ~100.96% of the buffered 198k limit.
|
||||||
|
let mut cw = ContextWindow::new(200_000);
|
||||||
|
|
||||||
|
// Add system prompt (~6k tokens)
|
||||||
|
cw.add_message(Message::new(
|
||||||
|
MessageRole::System,
|
||||||
|
"You are G3, an AI programming agent. ".repeat(500), // ~18.5k chars → ~5k tokens
|
||||||
|
));
|
||||||
|
|
||||||
|
// Add many tool call messages to accumulate tokens.
|
||||||
|
// Each tool call pair (assistant + tool result) adds ~800-1200 estimated tokens.
|
||||||
|
// We need ~194k more tokens to reach 99.95% of raw 200k.
|
||||||
|
let mut _total_messages = 1; // system message
|
||||||
|
let mut last_percentage = 0.0_f32;
|
||||||
|
|
||||||
|
for i in 0..500 {
|
||||||
|
// Assistant message with a tool call containing ~2k chars of JSON input
|
||||||
|
let large_input = serde_json::json!({
|
||||||
|
"file_path": format!("src/module_{}/recognizer.rs", i),
|
||||||
|
"diff": format!(
|
||||||
|
"@@ -1,10 +1,50 @@\n-old code\n+{}\n context\n",
|
||||||
|
format!(" pub fn process_form_{i}(&mut self) -> Result<(), Error> {{\n // Implementation with detailed logic\n let token = self.next_token()?;\n match token {{\n Token::Open => self.handle_open()?,\n Token::Close => self.handle_close()?,\n _ => return Err(Error::Unexpected(token)),\n }}\n Ok(())\n }}\n").repeat(8)
|
||||||
|
),
|
||||||
|
});
|
||||||
|
|
||||||
|
let mut assistant = Message::new(
|
||||||
|
MessageRole::Assistant,
|
||||||
|
format!("Applying changes to module {}.", i),
|
||||||
|
);
|
||||||
|
assistant.tool_calls.push(MessageToolCall {
|
||||||
|
id: format!("toolu_{:04}", i),
|
||||||
|
name: "str_replace".to_string(),
|
||||||
|
input: large_input,
|
||||||
|
});
|
||||||
|
cw.add_message(assistant);
|
||||||
|
_total_messages += 1;
|
||||||
|
|
||||||
|
// Tool result
|
||||||
|
let mut result = Message::new(
|
||||||
|
MessageRole::User,
|
||||||
|
format!("Tool result: Applied 1 hunk to src/module_{}/recognizer.rs", i),
|
||||||
|
);
|
||||||
|
result.tool_result_id = Some(format!("toolu_{:04}", i));
|
||||||
|
cw.add_message(result);
|
||||||
|
_total_messages += 1;
|
||||||
|
|
||||||
|
let pct = cw.percentage_used();
|
||||||
|
|
||||||
|
// Check: did we cross 100% of the BUFFERED limit?
|
||||||
|
// If so, the buffer is working — compaction would have triggered at 80%.
|
||||||
|
if pct >= 100.0 && last_percentage < 100.0 {
|
||||||
|
// Calculate what percentage of the RAW 200k limit we're at
|
||||||
|
let raw_percentage = (cw.used_tokens as f64 / 200_000.0) * 100.0;
|
||||||
|
|
||||||
|
// We should be UNDER the raw 200k limit even though we're over the buffered limit
|
||||||
|
assert!(
|
||||||
|
raw_percentage < 100.0,
|
||||||
|
"When crossing 100% of buffered limit, should still be under raw 200k. \
|
||||||
|
Buffered: {:.2}%, Raw: {:.2}%, used: {}, buffered_total: {}, raw_total: 200000",
|
||||||
|
pct, raw_percentage, cw.used_tokens, cw.total_tokens,
|
||||||
|
);
|
||||||
|
|
||||||
|
// The gap between raw and buffered should be the ~1% buffer
|
||||||
|
let gap = 100.0 - raw_percentage;
|
||||||
|
assert!(
|
||||||
|
gap > 0.0 && gap < 2.0,
|
||||||
|
"Gap between raw limit and current usage should be 0-2% (the buffer). Got {:.2}%",
|
||||||
|
gap,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
last_percentage = pct;
|
||||||
|
|
||||||
|
// Stop once we've exceeded the buffered limit
|
||||||
|
if pct > 101.0 {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Final assertions
|
||||||
|
assert!(
|
||||||
|
cw.percentage_used() > 100.0,
|
||||||
|
"Should have exceeded the buffered limit. Percentage: {:.1}%, used: {}, total: {}",
|
||||||
|
cw.percentage_used(), cw.used_tokens, cw.total_tokens,
|
||||||
|
);
|
||||||
|
|
||||||
|
// But we should NOT have exceeded the raw 200k limit by much (if at all)
|
||||||
|
// The ~89 token overshoot from the screenshot would be absorbed by the 2000-token buffer
|
||||||
|
let raw_overshoot = cw.used_tokens as i64 - 200_000;
|
||||||
|
assert!(
|
||||||
|
raw_overshoot < 2000,
|
||||||
|
"Should not overshoot raw 200k by more than the buffer size. Overshoot: {} tokens",
|
||||||
|
raw_overshoot,
|
||||||
|
);
|
||||||
|
|
||||||
|
// Compaction would have triggered at 80% of the buffered limit (158,400 tokens)
|
||||||
|
// which is 79.2% of the raw limit — well before any API error
|
||||||
|
let compaction_threshold_tokens = (cw.total_tokens as f64 * 0.80) as u32;
|
||||||
|
assert!(
|
||||||
|
compaction_threshold_tokens < 200_000,
|
||||||
|
"Compaction threshold ({} tokens) must be well under raw 200k limit",
|
||||||
|
compaction_threshold_tokens,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|||||||
@@ -98,14 +98,15 @@ fn test_context_window_available_tokens() {
|
|||||||
|
|
||||||
// 2.5% buffer calculation
|
// 2.5% buffer calculation
|
||||||
let buffer = (model_limit / 40).clamp(1000, 10000);
|
let buffer = (model_limit / 40).clamp(1000, 10000);
|
||||||
assert_eq!(buffer, 5000); // 200000/40 = 5000
|
// After 1% safety buffer: total_tokens = 198000, so 198000/40 = 4950
|
||||||
|
assert_eq!(buffer, 4950);
|
||||||
|
|
||||||
let available = model_limit
|
let available = model_limit
|
||||||
.saturating_sub(current_usage)
|
.saturating_sub(current_usage)
|
||||||
.saturating_sub(buffer);
|
.saturating_sub(buffer);
|
||||||
|
|
||||||
// 200000 - 180000 - 5000 = 15000
|
// 198000 - 180000 - 4950 = 13050
|
||||||
assert_eq!(available, 15000);
|
assert_eq!(available, 13050);
|
||||||
|
|
||||||
// Capped at 10000 for summary
|
// Capped at 10000 for summary
|
||||||
let summary_max = available.min(10_000);
|
let summary_max = available.min(10_000);
|
||||||
|
|||||||
@@ -94,7 +94,8 @@ fn test_percentage_based_on_used_tokens() {
|
|||||||
|
|
||||||
// Initially 0%
|
// Initially 0%
|
||||||
assert_eq!(window.percentage_used(), 0.0);
|
assert_eq!(window.percentage_used(), 0.0);
|
||||||
assert_eq!(window.remaining_tokens(), 1000);
|
// After 1% buffer: total_tokens = 990
|
||||||
|
assert_eq!(window.remaining_tokens(), 990);
|
||||||
|
|
||||||
// Add messages to increase used_tokens
|
// Add messages to increase used_tokens
|
||||||
// A message with ~100 chars should be roughly 25-30 tokens
|
// A message with ~100 chars should be roughly 25-30 tokens
|
||||||
@@ -107,7 +108,7 @@ fn test_percentage_based_on_used_tokens() {
|
|||||||
assert!(percentage < 100.0, "percentage should be < 100");
|
assert!(percentage < 100.0, "percentage should be < 100");
|
||||||
|
|
||||||
// remaining_tokens should decrease
|
// remaining_tokens should decrease
|
||||||
assert!(window.remaining_tokens() < 1000, "remaining tokens should decrease");
|
assert!(window.remaining_tokens() < 990, "remaining tokens should decrease");
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Test that the 80% compaction threshold works correctly.
|
/// Test that the 80% compaction threshold works correctly.
|
||||||
|
|||||||
Reference in New Issue
Block a user