Add 1% safety buffer to context window to prevent API token limit errors
Our token estimation heuristic (chars/3 × 1.1 for code, chars/4 × 1.1 for text) slightly undercounts over long sessions with hundreds of tool calls. This accumulated drift of ~89 tokens caused Anthropic API 400 errors: "prompt is too long: 200089 tokens > 200000 maximum". Fix: `ContextWindow::new()` now applies a 1% safety buffer, setting `total_tokens` to 99% of the provider-reported limit. For a 200k window this yields 198k — a 2000-token safety margin that absorbs estimation drift. All percentage calculations, compaction thresholds, and thinning triggers operate against the buffered limit, so compaction fires earlier and we never send a request the API will reject.
This commit is contained in:
@@ -84,9 +84,15 @@ pub struct ContextWindow {
|
|||||||
|
|
||||||
impl ContextWindow {
|
impl ContextWindow {
|
||||||
pub fn new(total_tokens: u32) -> Self {
|
pub fn new(total_tokens: u32) -> Self {
|
||||||
|
// Apply a 1% safety buffer to absorb token estimation drift.
|
||||||
|
// Our heuristic (chars/3 * 1.1 for code, chars/4 * 1.1 for text) slightly
|
||||||
|
// undercounts over long sessions with hundreds of tool calls. Without this
|
||||||
|
// buffer, accumulated drift of ~89 tokens caused API 400 errors:
|
||||||
|
// "prompt is too long: 200089 tokens > 200000 maximum"
|
||||||
|
let buffered_tokens = (total_tokens as f64 * 0.99) as u32;
|
||||||
Self {
|
Self {
|
||||||
used_tokens: 0,
|
used_tokens: 0,
|
||||||
total_tokens,
|
total_tokens: buffered_tokens,
|
||||||
cumulative_tokens: 0,
|
cumulative_tokens: 0,
|
||||||
conversation_history: Vec::new(),
|
conversation_history: Vec::new(),
|
||||||
last_thinning_percentage: 0,
|
last_thinning_percentage: 0,
|
||||||
@@ -783,23 +789,65 @@ mod tests {
|
|||||||
fn test_new_context_window() {
|
fn test_new_context_window() {
|
||||||
let cw = ContextWindow::new(100_000);
|
let cw = ContextWindow::new(100_000);
|
||||||
assert_eq!(cw.used_tokens, 0);
|
assert_eq!(cw.used_tokens, 0);
|
||||||
assert_eq!(cw.total_tokens, 100_000);
|
assert_eq!(cw.total_tokens, 99_000); // 1% buffer: 100_000 * 0.99
|
||||||
assert_eq!(cw.cumulative_tokens, 0);
|
assert_eq!(cw.cumulative_tokens, 0);
|
||||||
assert!(cw.conversation_history.is_empty());
|
assert!(cw.conversation_history.is_empty());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn test_1pct_buffer_200k() {
    // The exact production scenario: a 200k Anthropic context window.
    // The 1% safety buffer should leave 198k usable tokens.
    let window = ContextWindow::new(200_000);
    assert_eq!(window.total_tokens, 198_000, "200k * 0.99 = 198k");
}
|
||||||
|
|
||||||
|
#[test]
fn test_1pct_buffer_zero() {
    // Edge case: a zero-token window must not underflow when the
    // 1% buffer is applied (0 * 0.99 = 0).
    let window = ContextWindow::new(0);
    assert_eq!(window.total_tokens, 0);
}
|
||||||
|
|
||||||
|
#[test]
fn test_1pct_buffer_small() {
    // Small context window: the buffer truncates toward zero,
    // so 100 * 0.99 = 99.
    let window = ContextWindow::new(100);
    assert_eq!(window.total_tokens, 99);
}
|
||||||
|
|
||||||
|
#[test]
fn test_1pct_buffer_percentage_uses_buffered_total() {
    // percentage_used() must report against the buffered limit,
    // not the raw provider-reported limit.
    let mut window = ContextWindow::new(200_000);
    assert_eq!(window.total_tokens, 198_000);

    // Fill to exactly the buffered limit (100% of buffered, 99% of raw).
    window.used_tokens = 198_000;
    let pct = window.percentage_used();
    assert!(
        (pct - 100.0).abs() < 0.01,
        "Should be ~100% of buffered limit, got {:.2}%",
        pct,
    );

    // Consequently, compaction triggers well before the raw API limit.
    assert!(window.should_compact());
}
|
||||||
|
|
||||||
#[test]
fn test_percentage_used() {
    let mut window = ContextWindow::new(100);
    // total_tokens is 99 after the 1% safety buffer is applied,
    // so 50 used tokens is 50/99 of the window, not 50%.
    window.used_tokens = 50;
    let expected = (50.0 / 99.0) * 100.0;
    assert!((window.percentage_used() - expected).abs() < 0.01);
}
|
||||||
|
|
||||||
#[test]
fn test_remaining_tokens() {
    let mut window = ContextWindow::new(100);
    // total_tokens is 99 after the 1% safety buffer is applied.
    window.used_tokens = 30;
    assert_eq!(window.remaining_tokens(), 69); // 99 - 30
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
|||||||
@@ -22,7 +22,7 @@ fn test_task_result_basic_functionality() {
|
|||||||
// Test basic properties
|
// Test basic properties
|
||||||
assert_eq!(result.response, response);
|
assert_eq!(result.response, response);
|
||||||
assert_eq!(result.context_window.conversation_history.len(), 2);
|
assert_eq!(result.context_window.conversation_history.len(), 2);
|
||||||
assert_eq!(result.context_window.total_tokens, 10000);
|
assert_eq!(result.context_window.total_tokens, 9900); // 10000 * 0.99 (1% buffer)
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@@ -122,7 +122,7 @@ fn test_context_window_preservation() {
|
|||||||
let result = TaskResult::new("Response".to_string(), context.clone());
|
let result = TaskResult::new("Response".to_string(), context.clone());
|
||||||
|
|
||||||
// Verify context is preserved
|
// Verify context is preserved
|
||||||
assert_eq!(result.context_window.total_tokens, 5000);
|
assert_eq!(result.context_window.total_tokens, 4950); // 5000 * 0.99 (1% buffer)
|
||||||
assert!(result.context_window.used_tokens > 1234); // Should have increased
|
assert!(result.context_window.used_tokens > 1234); // Should have increased
|
||||||
assert_eq!(result.context_window.conversation_history.len(), 5);
|
assert_eq!(result.context_window.conversation_history.len(), 5);
|
||||||
|
|
||||||
|
|||||||
@@ -1476,3 +1476,145 @@ async fn test_tool_call_input_tokens_tracked_in_context_window() {
|
|||||||
"recalculate_tokens() should agree with incrementally tracked used_tokens"
|
"recalculate_tokens() should agree with incrementally tracked used_tokens"
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Test: 1% safety buffer prevents "prompt is too long" API errors
|
||||||
|
///
|
||||||
|
/// Exact reproduction of the failure from the screenshot:
|
||||||
|
/// "prompt is too long: 200089 tokens > 200000 maximum"
|
||||||
|
///
|
||||||
|
/// Our token estimation slightly undercounts (by ~0.05%) because:
|
||||||
|
/// - Tool call overhead (name, id, JSON structure) is approximated at 20 tokens
|
||||||
|
/// - The chars/3 * 1.1 heuristic for code/JSON can drift on certain content
|
||||||
|
/// - Message framing tokens (role markers, separators) aren't fully counted
|
||||||
|
///
|
||||||
|
/// Over a long session with hundreds of tool calls, these small errors accumulate
|
||||||
|
/// to ~89 tokens over the 200k limit. The 1% buffer (2000 tokens on a 200k window)
|
||||||
|
/// absorbs this drift so we never send a request the API will reject.
|
||||||
|
///
|
||||||
|
/// This test fills a context window to near-capacity and verifies:
|
||||||
|
/// 1. The buffered total_tokens is 99% of the requested size
|
||||||
|
/// 2. percentage_used() reports against the buffered limit (not the raw provider limit)
|
||||||
|
/// 3. A session that would be at 99.95% of the raw limit is at >100% of the buffered
|
||||||
|
/// limit, meaning compaction/thinning would have already triggered
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_1pct_buffer_prevents_prompt_too_long_error() {
|
||||||
|
use g3_core::context_window::ContextWindow;
|
||||||
|
use g3_providers::MessageToolCall;
|
||||||
|
|
||||||
|
// Create a 200k context window (the Anthropic default)
|
||||||
|
let cw = ContextWindow::new(200_000);
|
||||||
|
|
||||||
|
// The buffer should reduce total_tokens by 1%
|
||||||
|
let expected_buffered = (200_000_f64 * 0.99) as u32; // 198_000
|
||||||
|
assert_eq!(
|
||||||
|
cw.total_tokens, expected_buffered,
|
||||||
|
"ContextWindow should apply 1% safety buffer: expected {}, got {}",
|
||||||
|
expected_buffered, cw.total_tokens,
|
||||||
|
);
|
||||||
|
|
||||||
|
// Now simulate the exact scenario from the screenshot:
|
||||||
|
// Fill the context to ~199,900 estimated tokens (99.95% of raw 200k)
|
||||||
|
// which is ~100.96% of the buffered 198k limit.
|
||||||
|
let mut cw = ContextWindow::new(200_000);
|
||||||
|
|
||||||
|
// Add system prompt (~6k tokens)
|
||||||
|
cw.add_message(Message::new(
|
||||||
|
MessageRole::System,
|
||||||
|
"You are G3, an AI programming agent. ".repeat(500), // ~18.5k chars → ~5k tokens
|
||||||
|
));
|
||||||
|
|
||||||
|
// Add many tool call messages to accumulate tokens.
|
||||||
|
// Each tool call pair (assistant + tool result) adds ~800-1200 estimated tokens.
|
||||||
|
// We need ~194k more tokens to reach 99.95% of raw 200k.
|
||||||
|
let mut _total_messages = 1; // system message
|
||||||
|
let mut last_percentage = 0.0_f32;
|
||||||
|
|
||||||
|
for i in 0..500 {
|
||||||
|
// Assistant message with a tool call containing ~2k chars of JSON input
|
||||||
|
let large_input = serde_json::json!({
|
||||||
|
"file_path": format!("src/module_{}/recognizer.rs", i),
|
||||||
|
"diff": format!(
|
||||||
|
"@@ -1,10 +1,50 @@\n-old code\n+{}\n context\n",
|
||||||
|
format!(" pub fn process_form_{i}(&mut self) -> Result<(), Error> {{\n // Implementation with detailed logic\n let token = self.next_token()?;\n match token {{\n Token::Open => self.handle_open()?,\n Token::Close => self.handle_close()?,\n _ => return Err(Error::Unexpected(token)),\n }}\n Ok(())\n }}\n").repeat(8)
|
||||||
|
),
|
||||||
|
});
|
||||||
|
|
||||||
|
let mut assistant = Message::new(
|
||||||
|
MessageRole::Assistant,
|
||||||
|
format!("Applying changes to module {}.", i),
|
||||||
|
);
|
||||||
|
assistant.tool_calls.push(MessageToolCall {
|
||||||
|
id: format!("toolu_{:04}", i),
|
||||||
|
name: "str_replace".to_string(),
|
||||||
|
input: large_input,
|
||||||
|
});
|
||||||
|
cw.add_message(assistant);
|
||||||
|
_total_messages += 1;
|
||||||
|
|
||||||
|
// Tool result
|
||||||
|
let mut result = Message::new(
|
||||||
|
MessageRole::User,
|
||||||
|
format!("Tool result: Applied 1 hunk to src/module_{}/recognizer.rs", i),
|
||||||
|
);
|
||||||
|
result.tool_result_id = Some(format!("toolu_{:04}", i));
|
||||||
|
cw.add_message(result);
|
||||||
|
_total_messages += 1;
|
||||||
|
|
||||||
|
let pct = cw.percentage_used();
|
||||||
|
|
||||||
|
// Check: did we cross 100% of the BUFFERED limit?
|
||||||
|
// If so, the buffer is working — compaction would have triggered at 80%.
|
||||||
|
if pct >= 100.0 && last_percentage < 100.0 {
|
||||||
|
// Calculate what percentage of the RAW 200k limit we're at
|
||||||
|
let raw_percentage = (cw.used_tokens as f64 / 200_000.0) * 100.0;
|
||||||
|
|
||||||
|
// We should be UNDER the raw 200k limit even though we're over the buffered limit
|
||||||
|
assert!(
|
||||||
|
raw_percentage < 100.0,
|
||||||
|
"When crossing 100% of buffered limit, should still be under raw 200k. \
|
||||||
|
Buffered: {:.2}%, Raw: {:.2}%, used: {}, buffered_total: {}, raw_total: 200000",
|
||||||
|
pct, raw_percentage, cw.used_tokens, cw.total_tokens,
|
||||||
|
);
|
||||||
|
|
||||||
|
// The gap between raw and buffered should be the ~1% buffer
|
||||||
|
let gap = 100.0 - raw_percentage;
|
||||||
|
assert!(
|
||||||
|
gap > 0.0 && gap < 2.0,
|
||||||
|
"Gap between raw limit and current usage should be 0-2% (the buffer). Got {:.2}%",
|
||||||
|
gap,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
last_percentage = pct;
|
||||||
|
|
||||||
|
// Stop once we've exceeded the buffered limit
|
||||||
|
if pct > 101.0 {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Final assertions
|
||||||
|
assert!(
|
||||||
|
cw.percentage_used() > 100.0,
|
||||||
|
"Should have exceeded the buffered limit. Percentage: {:.1}%, used: {}, total: {}",
|
||||||
|
cw.percentage_used(), cw.used_tokens, cw.total_tokens,
|
||||||
|
);
|
||||||
|
|
||||||
|
// But we should NOT have exceeded the raw 200k limit by much (if at all)
|
||||||
|
// The ~89 token overshoot from the screenshot would be absorbed by the 2000-token buffer
|
||||||
|
let raw_overshoot = cw.used_tokens as i64 - 200_000;
|
||||||
|
assert!(
|
||||||
|
raw_overshoot < 2000,
|
||||||
|
"Should not overshoot raw 200k by more than the buffer size. Overshoot: {} tokens",
|
||||||
|
raw_overshoot,
|
||||||
|
);
|
||||||
|
|
||||||
|
// Compaction would have triggered at 80% of the buffered limit (158,400 tokens)
|
||||||
|
// which is 79.2% of the raw limit — well before any API error
|
||||||
|
let compaction_threshold_tokens = (cw.total_tokens as f64 * 0.80) as u32;
|
||||||
|
assert!(
|
||||||
|
compaction_threshold_tokens < 200_000,
|
||||||
|
"Compaction threshold ({} tokens) must be well under raw 200k limit",
|
||||||
|
compaction_threshold_tokens,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|||||||
@@ -98,14 +98,15 @@ fn test_context_window_available_tokens() {
|
|||||||
|
|
||||||
// 2.5% buffer calculation
|
// 2.5% buffer calculation
|
||||||
let buffer = (model_limit / 40).clamp(1000, 10000);
|
let buffer = (model_limit / 40).clamp(1000, 10000);
|
||||||
assert_eq!(buffer, 5000); // 200000/40 = 5000
|
// After 1% safety buffer: total_tokens = 198000, so 198000/40 = 4950
|
||||||
|
assert_eq!(buffer, 4950);
|
||||||
|
|
||||||
let available = model_limit
|
let available = model_limit
|
||||||
.saturating_sub(current_usage)
|
.saturating_sub(current_usage)
|
||||||
.saturating_sub(buffer);
|
.saturating_sub(buffer);
|
||||||
|
|
||||||
// 200000 - 180000 - 5000 = 15000
|
// 198000 - 180000 - 4950 = 13050
|
||||||
assert_eq!(available, 15000);
|
assert_eq!(available, 13050);
|
||||||
|
|
||||||
// Capped at 10000 for summary
|
// Capped at 10000 for summary
|
||||||
let summary_max = available.min(10_000);
|
let summary_max = available.min(10_000);
|
||||||
|
|||||||
@@ -94,7 +94,8 @@ fn test_percentage_based_on_used_tokens() {
|
|||||||
|
|
||||||
// Initially 0%
|
// Initially 0%
|
||||||
assert_eq!(window.percentage_used(), 0.0);
|
assert_eq!(window.percentage_used(), 0.0);
|
||||||
assert_eq!(window.remaining_tokens(), 1000);
|
// After 1% buffer: total_tokens = 990
|
||||||
|
assert_eq!(window.remaining_tokens(), 990);
|
||||||
|
|
||||||
// Add messages to increase used_tokens
|
// Add messages to increase used_tokens
|
||||||
// A message with ~100 chars should be roughly 25-30 tokens
|
// A message with ~100 chars should be roughly 25-30 tokens
|
||||||
@@ -107,7 +108,7 @@ fn test_percentage_based_on_used_tokens() {
|
|||||||
assert!(percentage < 100.0, "percentage should be < 100");
|
assert!(percentage < 100.0, "percentage should be < 100");
|
||||||
|
|
||||||
// remaining_tokens should decrease
|
// remaining_tokens should decrease
|
||||||
assert!(window.remaining_tokens() < 1000, "remaining tokens should decrease");
|
assert!(window.remaining_tokens() < 990, "remaining tokens should decrease");
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Test that the 80% compaction threshold works correctly.
|
/// Test that the 80% compaction threshold works correctly.
|
||||||
|
|||||||
Reference in New Issue
Block a user