Add 1% safety buffer to context window to prevent API token limit errors

Our token estimation heuristic (chars/3 * 1.1 for code, chars/4 * 1.1 for text)
slightly undercounts over long sessions with hundreds of tool calls. This
accumulated drift of ~89 tokens caused Anthropic API 400 errors:
  'prompt is too long: 200089 tokens > 200000 maximum'

Fix: ContextWindow::new() now applies a 1% buffer, setting total_tokens to 99%
of the provider-reported limit. For a 200k window this gives 198k, providing a
2000-token safety margin that absorbs estimation drift.

All percentage calculations, compaction thresholds, and thinning triggers
operate against the buffered limit, so compaction fires earlier and we never
send a request the API will reject.
This commit is contained in:
Dhanji R. Prasanna
2026-02-13 15:46:53 +11:00
parent a7e0b0ef9e
commit 0410efd41b
5 changed files with 203 additions and 11 deletions

View File

@@ -84,9 +84,15 @@ pub struct ContextWindow {
impl ContextWindow {
pub fn new(total_tokens: u32) -> Self {
// Apply a 1% safety buffer to absorb token estimation drift.
// Our heuristic (chars/3 * 1.1 for code, chars/4 * 1.1 for text) slightly
// undercounts over long sessions with hundreds of tool calls. Without this
// buffer, accumulated drift of ~89 tokens caused API 400 errors:
// "prompt is too long: 200089 tokens > 200000 maximum"
let buffered_tokens = (total_tokens as f64 * 0.99) as u32;
Self {
used_tokens: 0,
total_tokens,
total_tokens: buffered_tokens,
cumulative_tokens: 0,
conversation_history: Vec::new(),
last_thinning_percentage: 0,
@@ -783,23 +789,65 @@ mod tests {
fn test_new_context_window() {
    // new() applies a 1% safety buffer, so the stored limit is 99% of the
    // provider-reported window: 100_000 * 0.99 = 99_000.
    // (The stale pre-buffer assertion expecting 100_000 was diff residue
    // contradicting the line below; removed.)
    let cw = ContextWindow::new(100_000);
    assert_eq!(cw.used_tokens, 0);
    assert_eq!(cw.total_tokens, 99_000); // 1% buffer: 100_000 * 0.99
    assert_eq!(cw.cumulative_tokens, 0);
    assert!(cw.conversation_history.is_empty());
}
#[test]
fn test_1pct_buffer_200k() {
    // Mirrors the failing production setup: a 200k-token Anthropic window
    // should be stored as 99% of the raw limit.
    let window = ContextWindow::new(200_000);
    let expected = 198_000;
    assert_eq!(window.total_tokens, expected, "200k * 0.99 = 198k");
}
#[test]
fn test_1pct_buffer_zero() {
    // Degenerate input: a zero-sized window must remain zero — the buffer
    // math must not underflow.
    let window = ContextWindow::new(0);
    assert_eq!(window.total_tokens, 0);
}
#[test]
fn test_1pct_buffer_small() {
    // Tiny window: the cast truncates 100 * 0.99 = 99.0 down to 99.
    let window = ContextWindow::new(100);
    assert_eq!(window.total_tokens, 99);
}
#[test]
fn test_1pct_buffer_percentage_uses_buffered_total() {
    // percentage_used() must be computed against the buffered total, not the
    // raw provider limit: filling to 198k (99% of raw 200k) reads as 100%.
    let mut window = ContextWindow::new(200_000);
    assert_eq!(window.total_tokens, 198_000);
    window.used_tokens = 198_000;
    let reported = window.percentage_used();
    let delta = (reported - 100.0).abs();
    assert!(
        delta < 0.01,
        "Should be ~100% of buffered limit, got {:.2}%",
        reported,
    );
    // At 100% of the buffered limit, compaction is already due — well before
    // the raw API limit would ever be reached.
    assert!(window.should_compact());
}
#[test]
fn test_percentage_used() {
    // With the 1% buffer, new(100) stores total_tokens = 99, so 50 used
    // tokens is 50/99 ≈ 50.51%, not a flat 50%.
    // (The stale pre-buffer assertion expecting exactly 50.0 was diff
    // residue contradicting the buffered math; removed.)
    let mut cw = ContextWindow::new(100);
    cw.used_tokens = 50;
    let expected = (50.0 / 99.0) * 100.0;
    assert!((cw.percentage_used() - expected).abs() < 0.01);
}
#[test]
fn test_remaining_tokens() {
    // Buffered total is 99 (100 * 0.99), so 30 used leaves 69 remaining.
    // (The stale pre-buffer assertion expecting 70 was diff residue
    // contradicting the line below; removed.)
    let mut cw = ContextWindow::new(100);
    cw.used_tokens = 30;
    assert_eq!(cw.remaining_tokens(), 69); // 99 - 30
}
#[test]

View File

@@ -22,7 +22,7 @@ fn test_task_result_basic_functionality() {
// Test basic properties
assert_eq!(result.response, response);
assert_eq!(result.context_window.conversation_history.len(), 2);
assert_eq!(result.context_window.total_tokens, 10000);
assert_eq!(result.context_window.total_tokens, 9900); // 10000 * 0.99 (1% buffer)
}
#[test]
@@ -122,7 +122,7 @@ fn test_context_window_preservation() {
let result = TaskResult::new("Response".to_string(), context.clone());
// Verify context is preserved
assert_eq!(result.context_window.total_tokens, 5000);
assert_eq!(result.context_window.total_tokens, 4950); // 5000 * 0.99 (1% buffer)
assert!(result.context_window.used_tokens > 1234); // Should have increased
assert_eq!(result.context_window.conversation_history.len(), 5);

View File

@@ -1476,3 +1476,145 @@ async fn test_tool_call_input_tokens_tracked_in_context_window() {
"recalculate_tokens() should agree with incrementally tracked used_tokens"
);
}
/// Test: 1% safety buffer prevents "prompt is too long" API errors
///
/// Exact reproduction of the failure from the screenshot:
/// "prompt is too long: 200089 tokens > 200000 maximum"
///
/// Our token estimation slightly undercounts (by ~0.05%) because:
/// - Tool call overhead (name, id, JSON structure) is approximated at 20 tokens
/// - The chars/3 * 1.1 heuristic for code/JSON can drift on certain content
/// - Message framing tokens (role markers, separators) aren't fully counted
///
/// Over a long session with hundreds of tool calls, these small errors accumulate
/// to ~89 tokens over the 200k limit. The 1% buffer (2000 tokens on a 200k window)
/// absorbs this drift so we never send a request the API will reject.
///
/// This test fills a context window to near-capacity and verifies:
/// 1. The buffered total_tokens is 99% of the requested size
/// 2. percentage_used() reports against the buffered limit (not the raw provider limit)
/// 3. A session that would be at 99.95% of the raw limit is at >100% of the buffered
///    limit, meaning compaction/thinning would have already triggered
#[tokio::test]
async fn test_1pct_buffer_prevents_prompt_too_long_error() {
    use g3_core::context_window::ContextWindow;
    use g3_providers::MessageToolCall;

    // Create a 200k context window (the Anthropic default)
    let cw = ContextWindow::new(200_000);

    // The buffer should reduce total_tokens by 1%
    let expected_buffered = (200_000_f64 * 0.99) as u32; // 198_000
    assert_eq!(
        cw.total_tokens, expected_buffered,
        "ContextWindow should apply 1% safety buffer: expected {}, got {}",
        expected_buffered, cw.total_tokens,
    );

    // Now simulate the exact scenario from the screenshot:
    // Fill the context to ~199,900 estimated tokens (99.95% of raw 200k)
    // which is ~100.96% of the buffered 198k limit.
    let mut cw = ContextWindow::new(200_000);

    // Add system prompt (~5k tokens)
    cw.add_message(Message::new(
        MessageRole::System,
        "You are G3, an AI programming agent. ".repeat(500), // ~18.5k chars → ~5k tokens
    ));

    // Add many tool call messages to accumulate tokens.
    // Each tool call pair (assistant + tool result) adds ~800-1200 estimated tokens.
    // We need ~194k more tokens to reach 99.95% of raw 200k.
    // (A dead `_total_messages` counter that was incremented but never read
    // has been removed.)
    let mut last_percentage = 0.0_f32;
    for i in 0..500 {
        // Assistant message with a tool call containing ~2k chars of JSON input
        let large_input = serde_json::json!({
            "file_path": format!("src/module_{}/recognizer.rs", i),
            "diff": format!(
                "@@ -1,10 +1,50 @@\n-old code\n+{}\n context\n",
                format!(" pub fn process_form_{i}(&mut self) -> Result<(), Error> {{\n // Implementation with detailed logic\n let token = self.next_token()?;\n match token {{\n Token::Open => self.handle_open()?,\n Token::Close => self.handle_close()?,\n _ => return Err(Error::Unexpected(token)),\n }}\n Ok(())\n }}\n").repeat(8)
            ),
        });
        let mut assistant = Message::new(
            MessageRole::Assistant,
            format!("Applying changes to module {}.", i),
        );
        assistant.tool_calls.push(MessageToolCall {
            id: format!("toolu_{:04}", i),
            name: "str_replace".to_string(),
            input: large_input,
        });
        cw.add_message(assistant);

        // Tool result paired to the call above via tool_result_id
        let mut result = Message::new(
            MessageRole::User,
            format!("Tool result: Applied 1 hunk to src/module_{}/recognizer.rs", i),
        );
        result.tool_result_id = Some(format!("toolu_{:04}", i));
        cw.add_message(result);

        let pct = cw.percentage_used();

        // Check: did we cross 100% of the BUFFERED limit?
        // If so, the buffer is working — compaction would have triggered at 80%.
        if pct >= 100.0 && last_percentage < 100.0 {
            // Calculate what percentage of the RAW 200k limit we're at
            let raw_percentage = (cw.used_tokens as f64 / 200_000.0) * 100.0;

            // We should be UNDER the raw 200k limit even though we're over the buffered limit
            assert!(
                raw_percentage < 100.0,
                "When crossing 100% of buffered limit, should still be under raw 200k. \
                 Buffered: {:.2}%, Raw: {:.2}%, used: {}, buffered_total: {}, raw_total: 200000",
                pct, raw_percentage, cw.used_tokens, cw.total_tokens,
            );

            // The gap between raw and buffered should be the ~1% buffer
            let gap = 100.0 - raw_percentage;
            assert!(
                gap > 0.0 && gap < 2.0,
                "Gap between raw limit and current usage should be 0-2% (the buffer). Got {:.2}%",
                gap,
            );
        }
        last_percentage = pct;

        // Stop once we've exceeded the buffered limit
        if pct > 101.0 {
            break;
        }
    }

    // Final assertions
    assert!(
        cw.percentage_used() > 100.0,
        "Should have exceeded the buffered limit. Percentage: {:.1}%, used: {}, total: {}",
        cw.percentage_used(), cw.used_tokens, cw.total_tokens,
    );

    // But we should NOT have exceeded the raw 200k limit by much (if at all)
    // The ~89 token overshoot from the screenshot would be absorbed by the 2000-token buffer
    let raw_overshoot = cw.used_tokens as i64 - 200_000;
    assert!(
        raw_overshoot < 2000,
        "Should not overshoot raw 200k by more than the buffer size. Overshoot: {} tokens",
        raw_overshoot,
    );

    // Compaction would have triggered at 80% of the buffered limit (158,400 tokens)
    // which is 79.2% of the raw limit — well before any API error
    let compaction_threshold_tokens = (cw.total_tokens as f64 * 0.80) as u32;
    assert!(
        compaction_threshold_tokens < 200_000,
        "Compaction threshold ({} tokens) must be well under raw 200k limit",
        compaction_threshold_tokens,
    );
}

View File

@@ -98,14 +98,15 @@ fn test_context_window_available_tokens() {
// 2.5% buffer calculation
let buffer = (model_limit / 40).clamp(1000, 10000);
assert_eq!(buffer, 5000); // 200000/40 = 5000
// After 1% safety buffer: total_tokens = 198000, so 198000/40 = 4950
assert_eq!(buffer, 4950);
let available = model_limit
.saturating_sub(current_usage)
.saturating_sub(buffer);
// 200000 - 180000 - 5000 = 15000
assert_eq!(available, 15000);
// 198000 - 180000 - 4950 = 13050
assert_eq!(available, 13050);
// Capped at 10000 for summary
let summary_max = available.min(10_000);

View File

@@ -94,7 +94,8 @@ fn test_percentage_based_on_used_tokens() {
// Initially 0%
assert_eq!(window.percentage_used(), 0.0);
assert_eq!(window.remaining_tokens(), 1000);
// After 1% buffer: total_tokens = 990
assert_eq!(window.remaining_tokens(), 990);
// Add messages to increase used_tokens
// A message with ~100 chars should be roughly 25-30 tokens
@@ -107,7 +108,7 @@ fn test_percentage_based_on_used_tokens() {
assert!(percentage < 100.0, "percentage should be < 100");
// remaining_tokens should decrease
assert!(window.remaining_tokens() < 1000, "remaining tokens should decrease");
assert!(window.remaining_tokens() < 990, "remaining tokens should decrease");
}
/// Test that the 80% compaction threshold works correctly.