Add 1% safety buffer to context window to prevent API token limit errors

Our token estimation heuristic (chars/3 * 1.1 for code, chars/4 * 1.1 for text)
slightly undercounts over long sessions with hundreds of tool calls. This
accumulated drift of ~89 tokens caused Anthropic API 400 errors:
  'prompt is too long: 200089 tokens > 200000 maximum'

Fix: ContextWindow::new() now applies a 1% buffer, setting total_tokens to 99%
of the provider-reported limit. For a 200k window this gives 198k, providing a
2000-token safety margin that absorbs estimation drift.

All percentage calculations, compaction thresholds, and thinning triggers
operate against the buffered limit, so compaction fires earlier and we never
send a request the API will reject.
This commit is contained in:
Dhanji R. Prasanna
2026-02-13 15:46:53 +11:00
parent a7e0b0ef9e
commit 0410efd41b
5 changed files with 203 additions and 11 deletions

View File

@@ -84,9 +84,15 @@ pub struct ContextWindow {
impl ContextWindow {
pub fn new(total_tokens: u32) -> Self {
// Apply a 1% safety buffer to absorb token estimation drift.
// Our heuristic (chars/3 * 1.1 for code, chars/4 * 1.1 for text) slightly
// undercounts over long sessions with hundreds of tool calls. Without this
// buffer, accumulated drift of ~89 tokens caused API 400 errors:
// "prompt is too long: 200089 tokens > 200000 maximum"
let buffered_tokens = (total_tokens as f64 * 0.99) as u32;
Self {
used_tokens: 0,
total_tokens,
total_tokens: buffered_tokens,
cumulative_tokens: 0,
conversation_history: Vec::new(),
last_thinning_percentage: 0,
@@ -783,23 +789,65 @@ mod tests {
fn test_new_context_window() {
    // new() applies a 1% safety buffer, so the stored limit is 99% of the
    // provider-reported window: 100_000 * 0.99 = 99_000.
    // (The stale pre-buffer assertion expecting 100_000 was diff residue
    // contradicting the line below; removed.)
    let cw = ContextWindow::new(100_000);
    assert_eq!(cw.used_tokens, 0);
    assert_eq!(cw.total_tokens, 99_000); // 1% buffer: 100_000 * 0.99
    assert_eq!(cw.cumulative_tokens, 0);
    assert!(cw.conversation_history.is_empty());
}
#[test]
fn test_1pct_buffer_200k() {
    // Mirrors the failing production setup: a 200k-token Anthropic window
    // should be stored as 99% of the raw limit.
    let window = ContextWindow::new(200_000);
    let expected = 198_000;
    assert_eq!(window.total_tokens, expected, "200k * 0.99 = 198k");
}
#[test]
fn test_1pct_buffer_zero() {
    // Degenerate input: a zero-sized window must remain zero — the buffer
    // math must not underflow.
    let window = ContextWindow::new(0);
    assert_eq!(window.total_tokens, 0);
}
#[test]
fn test_1pct_buffer_small() {
    // Tiny window: the cast truncates 100 * 0.99 = 99.0 down to 99.
    let window = ContextWindow::new(100);
    assert_eq!(window.total_tokens, 99);
}
#[test]
fn test_1pct_buffer_percentage_uses_buffered_total() {
    // percentage_used() must be computed against the buffered total, not the
    // raw provider limit: filling to 198k (99% of raw 200k) reads as 100%.
    let mut window = ContextWindow::new(200_000);
    assert_eq!(window.total_tokens, 198_000);
    window.used_tokens = 198_000;
    let reported = window.percentage_used();
    let delta = (reported - 100.0).abs();
    assert!(
        delta < 0.01,
        "Should be ~100% of buffered limit, got {:.2}%",
        reported,
    );
    // At 100% of the buffered limit, compaction is already due — well before
    // the raw API limit would ever be reached.
    assert!(window.should_compact());
}
#[test]
fn test_percentage_used() {
    // With the 1% buffer, new(100) stores total_tokens = 99, so 50 used
    // tokens is 50/99 ≈ 50.51%, not a flat 50%.
    // (The stale pre-buffer assertion expecting exactly 50.0 was diff
    // residue contradicting the buffered math; removed.)
    let mut cw = ContextWindow::new(100);
    cw.used_tokens = 50;
    let expected = (50.0 / 99.0) * 100.0;
    assert!((cw.percentage_used() - expected).abs() < 0.01);
}
#[test]
fn test_remaining_tokens() {
    // Buffered total is 99 (100 * 0.99), so 30 used leaves 69 remaining.
    // (The stale pre-buffer assertion expecting 70 was diff residue
    // contradicting the line below; removed.)
    let mut cw = ContextWindow::new(100);
    cw.used_tokens = 30;
    assert_eq!(cw.remaining_tokens(), 69); // 99 - 30
}
#[test]

View File

@@ -22,7 +22,7 @@ fn test_task_result_basic_functionality() {
// Test basic properties
assert_eq!(result.response, response);
assert_eq!(result.context_window.conversation_history.len(), 2);
assert_eq!(result.context_window.total_tokens, 10000);
assert_eq!(result.context_window.total_tokens, 9900); // 10000 * 0.99 (1% buffer)
}
#[test]
@@ -122,7 +122,7 @@ fn test_context_window_preservation() {
let result = TaskResult::new("Response".to_string(), context.clone());
// Verify context is preserved
assert_eq!(result.context_window.total_tokens, 5000);
assert_eq!(result.context_window.total_tokens, 4950); // 5000 * 0.99 (1% buffer)
assert!(result.context_window.used_tokens > 1234); // Should have increased
assert_eq!(result.context_window.conversation_history.len(), 5);

View File

@@ -1476,3 +1476,145 @@ async fn test_tool_call_input_tokens_tracked_in_context_window() {
"recalculate_tokens() should agree with incrementally tracked used_tokens"
);
}
/// Test: 1% safety buffer prevents "prompt is too long" API errors
///
/// Exact reproduction of the failure from the screenshot:
/// "prompt is too long: 200089 tokens > 200000 maximum"
///
/// Our token estimation slightly undercounts (by ~0.05%) because:
/// - Tool call overhead (name, id, JSON structure) is approximated at 20 tokens
/// - The chars/3 * 1.1 heuristic for code/JSON can drift on certain content
/// - Message framing tokens (role markers, separators) aren't fully counted
///
/// Over a long session with hundreds of tool calls, these small errors accumulate
/// to ~89 tokens over the 200k limit. The 1% buffer (2000 tokens on a 200k window)
/// absorbs this drift so we never send a request the API will reject.
///
/// This test fills a context window to near-capacity and verifies:
/// 1. The buffered total_tokens is 99% of the requested size
/// 2. percentage_used() reports against the buffered limit (not the raw provider limit)
/// 3. A session that would be at 99.95% of the raw limit is at >100% of the buffered
///    limit, meaning compaction/thinning would have already triggered
#[tokio::test]
async fn test_1pct_buffer_prevents_prompt_too_long_error() {
    use g3_core::context_window::ContextWindow;
    use g3_providers::MessageToolCall;

    // Create a 200k context window (the Anthropic default)
    let cw = ContextWindow::new(200_000);

    // The buffer should reduce total_tokens by 1%
    let expected_buffered = (200_000_f64 * 0.99) as u32; // 198_000
    assert_eq!(
        cw.total_tokens, expected_buffered,
        "ContextWindow should apply 1% safety buffer: expected {}, got {}",
        expected_buffered, cw.total_tokens,
    );

    // Now simulate the exact scenario from the screenshot:
    // Fill the context to ~199,900 estimated tokens (99.95% of raw 200k)
    // which is ~100.96% of the buffered 198k limit.
    let mut cw = ContextWindow::new(200_000);

    // Add system prompt (~5k tokens)
    cw.add_message(Message::new(
        MessageRole::System,
        "You are G3, an AI programming agent. ".repeat(500), // ~18.5k chars → ~5k tokens
    ));

    // Add many tool call messages to accumulate tokens.
    // Each tool call pair (assistant + tool result) adds ~800-1200 estimated tokens.
    // We need ~194k more tokens to reach 99.95% of raw 200k.
    // (A dead `_total_messages` counter that was incremented but never read
    // has been removed.)
    let mut last_percentage = 0.0_f32;
    for i in 0..500 {
        // Assistant message with a tool call containing ~2k chars of JSON input
        let large_input = serde_json::json!({
            "file_path": format!("src/module_{}/recognizer.rs", i),
            "diff": format!(
                "@@ -1,10 +1,50 @@\n-old code\n+{}\n context\n",
                format!(" pub fn process_form_{i}(&mut self) -> Result<(), Error> {{\n // Implementation with detailed logic\n let token = self.next_token()?;\n match token {{\n Token::Open => self.handle_open()?,\n Token::Close => self.handle_close()?,\n _ => return Err(Error::Unexpected(token)),\n }}\n Ok(())\n }}\n").repeat(8)
            ),
        });
        let mut assistant = Message::new(
            MessageRole::Assistant,
            format!("Applying changes to module {}.", i),
        );
        assistant.tool_calls.push(MessageToolCall {
            id: format!("toolu_{:04}", i),
            name: "str_replace".to_string(),
            input: large_input,
        });
        cw.add_message(assistant);

        // Tool result paired to the call above via tool_result_id
        let mut result = Message::new(
            MessageRole::User,
            format!("Tool result: Applied 1 hunk to src/module_{}/recognizer.rs", i),
        );
        result.tool_result_id = Some(format!("toolu_{:04}", i));
        cw.add_message(result);

        let pct = cw.percentage_used();

        // Check: did we cross 100% of the BUFFERED limit?
        // If so, the buffer is working — compaction would have triggered at 80%.
        if pct >= 100.0 && last_percentage < 100.0 {
            // Calculate what percentage of the RAW 200k limit we're at
            let raw_percentage = (cw.used_tokens as f64 / 200_000.0) * 100.0;

            // We should be UNDER the raw 200k limit even though we're over the buffered limit
            assert!(
                raw_percentage < 100.0,
                "When crossing 100% of buffered limit, should still be under raw 200k. \
                 Buffered: {:.2}%, Raw: {:.2}%, used: {}, buffered_total: {}, raw_total: 200000",
                pct, raw_percentage, cw.used_tokens, cw.total_tokens,
            );

            // The gap between raw and buffered should be the ~1% buffer
            let gap = 100.0 - raw_percentage;
            assert!(
                gap > 0.0 && gap < 2.0,
                "Gap between raw limit and current usage should be 0-2% (the buffer). Got {:.2}%",
                gap,
            );
        }
        last_percentage = pct;

        // Stop once we've exceeded the buffered limit
        if pct > 101.0 {
            break;
        }
    }

    // Final assertions
    assert!(
        cw.percentage_used() > 100.0,
        "Should have exceeded the buffered limit. Percentage: {:.1}%, used: {}, total: {}",
        cw.percentage_used(), cw.used_tokens, cw.total_tokens,
    );

    // But we should NOT have exceeded the raw 200k limit by much (if at all)
    // The ~89 token overshoot from the screenshot would be absorbed by the 2000-token buffer
    let raw_overshoot = cw.used_tokens as i64 - 200_000;
    assert!(
        raw_overshoot < 2000,
        "Should not overshoot raw 200k by more than the buffer size. Overshoot: {} tokens",
        raw_overshoot,
    );

    // Compaction would have triggered at 80% of the buffered limit (158,400 tokens)
    // which is 79.2% of the raw limit — well before any API error
    let compaction_threshold_tokens = (cw.total_tokens as f64 * 0.80) as u32;
    assert!(
        compaction_threshold_tokens < 200_000,
        "Compaction threshold ({} tokens) must be well under raw 200k limit",
        compaction_threshold_tokens,
    );
}

View File

@@ -98,14 +98,15 @@ fn test_context_window_available_tokens() {
// 2.5% buffer calculation
let buffer = (model_limit / 40).clamp(1000, 10000);
assert_eq!(buffer, 5000); // 200000/40 = 5000
// After 1% safety buffer: total_tokens = 198000, so 198000/40 = 4950
assert_eq!(buffer, 4950);
let available = model_limit
.saturating_sub(current_usage)
.saturating_sub(buffer);
// 200000 - 180000 - 5000 = 15000
assert_eq!(available, 15000);
// 198000 - 180000 - 4950 = 13050
assert_eq!(available, 13050);
// Capped at 10000 for summary
let summary_max = available.min(10_000);

View File

@@ -94,7 +94,8 @@ fn test_percentage_based_on_used_tokens() {
// Initially 0%
assert_eq!(window.percentage_used(), 0.0);
assert_eq!(window.remaining_tokens(), 1000);
// After 1% buffer: total_tokens = 990
assert_eq!(window.remaining_tokens(), 990);
// Add messages to increase used_tokens
// A message with ~100 chars should be roughly 25-30 tokens
@@ -107,7 +108,7 @@ fn test_percentage_based_on_used_tokens() {
assert!(percentage < 100.0, "percentage should be < 100");
// remaining_tokens should decrease
assert!(window.remaining_tokens() < 1000, "remaining tokens should decrease");
assert!(window.remaining_tokens() < 990, "remaining tokens should decrease");
}
/// Test that the 80% compaction threshold works correctly.