diff --git a/analysis/memory.md b/analysis/memory.md
index 6b15dc5..d9035e0 100644
--- a/analysis/memory.md
+++ b/analysis/memory.md
@@ -1,5 +1,5 @@
 # Workspace Memory
-> Updated: 2026-02-14T22:33:04Z | Size: 22.9k chars
+> Updated: 2026-03-18T03:59:01Z | Size: 25.2k chars
 
 ### Remember Tool Wiring
 - `crates/g3-core/src/tools/memory.rs` [0..5686]
@@ -396,4 +396,19 @@ Tool output responsive to terminal width — no line wrapping, 4-char right marg
 - `crates/g3-core/src/lib.rs` [170..171] - `baseline_dirty_files: HashSet` field on Agent
 - `crates/g3-core/src/lib.rs` [1675..1686] - `set_plan_mode(enabled, working_dir)` - captures baseline on enable, clears on disable
 - **Key invariant**: The approval gate NEVER deletes or reverts files. It only warns.
-- **Key invariant**: Pre-existing dirty files (captured at plan mode start) are excluded from gate checks.
\ No newline at end of file
+- **Key invariant**: Pre-existing dirty files (captured at plan mode start) are excluded from gate checks.
+
+### Context Window Calibration (Token Drift Fix)
+- `crates/g3-core/src/context_window.rs` [159..189] - `update_usage_from_response()` now calibrates `used_tokens` from API `prompt_tokens` (ground truth). When `prompt_tokens > 0`, snaps `used_tokens` to it. When 0, leaves unchanged (heuristic fallback).
+- `crates/g3-core/src/context_window.rs` [93..100] - No more 1% safety buffer. `total_tokens = raw` (was `raw * 0.99`).
+- `crates/g3-core/src/context_window.rs` [222..250] - `estimate_message_tokens()` now adds: +4 per-message overhead, +30 per tool_use block (was 20), +15 per tool_result message.
+- `crates/g3-core/src/lib.rs` [2232..2241] - `ensure_context_capacity()` called inside streaming loop for iteration > 1 (catches post-tool-execution growth).
+- **Root cause**: Heuristic token estimation drifted ~48% over 809 messages / 388 tool calls (136k estimated vs 201k actual). API `prompt_tokens` is ground truth.
+
+### Context Window Calibration (Token Drift Fix) - CORRECTED
+- `crates/g3-core/src/context_window.rs` [168..189] - `update_usage_from_response()` calibrates `used_tokens` from API `prompt_tokens` (ground truth). When `prompt_tokens > 0`, snaps `used_tokens` to it. When 0, leaves unchanged (heuristic fallback).
+- `crates/g3-core/src/lib.rs` [2316..2319] - Calibration call placed **inline** during streaming (when usage chunk arrives in `chunk.usage`), NOT after the streaming loop. Critical because text-only responses take an early return path that bypasses post-loop code.
+- `crates/g3-core/src/lib.rs` [2892..2898] - Post-loop code only handles fallback (no-usage) case now.
+- `crates/g3-core/src/context_window.rs` [87..93] - 1% safety buffer IS still in place (`total_tokens * 0.99`). Left as safety net between calibration points.
+- **Root cause of display bug**: (1) `update_usage_from_response` never calibrated `used_tokens`, only `cumulative_tokens`. (2) `execute_single_task` had mock usage with hardcoded `prompt_tokens: 100`. (3) Post-loop usage update was bypassed by early returns in text-only response paths.
+- **Key streaming flow**: For text-only responses (most common in interactive mode), `chunk.finished` triggers an early `return Ok(self.finalize_streaming_turn(...))` that bypasses all post-loop code. Calibration MUST happen inline when `chunk.usage` arrives.
\ No newline at end of file
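The `estimate_message_tokens()` overheads listed above ([222..250]) are not part of the hunks below, so here is a minimal sketch of the kind of heuristic they describe. The struct, its field names, and the chars/4 base ratio are assumptions for illustration only, not the actual g3-core signature:

```rust
/// Hypothetical sketch of per-message token estimation with the overheads
/// noted in memory.md (+4 per message, +30 per tool_use block, +15 per
/// tool_result). Not the real g3-core code.
struct MessageShape {
    chars: usize,           // raw character count of the message content
    tool_use_blocks: usize, // number of tool_use blocks in the message
    is_tool_result: bool,   // whether this message wraps a tool result
}

fn estimate_message_tokens(msg: &MessageShape) -> u32 {
    // Base heuristic: assume roughly 4 characters per token.
    let mut tokens = (msg.chars / 4) as u32;
    tokens += 4;                               // per-message framing overhead
    tokens += 30 * msg.tool_use_blocks as u32; // JSON structure of each tool_use block
    if msg.is_tool_result {
        tokens += 15;                          // tool_result wrapper overhead
    }
    tokens
}

fn main() {
    let msg = MessageShape { chars: 1200, tool_use_blocks: 1, is_tool_result: false };
    // 1200 / 4 + 4 + 30 = 334 estimated tokens
    println!("estimated tokens: {}", estimate_message_tokens(&msg));
}
```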
diff --git a/crates/g3-core/src/context_window.rs b/crates/g3-core/src/context_window.rs
index b46cb2a..9a9b412 100644
--- a/crates/g3-core/src/context_window.rs
+++ b/crates/g3-core/src/context_window.rs
@@ -152,15 +152,37 @@ impl ContextWindow {
     /// Update token usage from provider response.
     ///
-    /// NOTE: This only updates cumulative_tokens (total API usage tracking).
-    /// It does NOT update used_tokens because:
-    /// 1. prompt_tokens represents the ENTIRE context sent to API (already tracked via add_message)
-    /// 2. completion_tokens will be tracked when the assistant message is added via add_message
-    /// Adding total_tokens here would cause double/triple counting and break the 80% threshold check.
+    /// Calibrates `used_tokens` from the provider's actual token count when
+    /// available. Our heuristic estimation (chars/3 or chars/4) drifts
+    /// over long sessions because it doesn't account for tool definitions
+    /// (~4000 tokens) sent alongside the conversation history.
+    ///
+    /// `prompt_tokens` is the ground-truth count of every token the API
+    /// received (system prompt + conversation history + tool definitions).
+    /// By snapping `used_tokens` to this value after each API call, we
+    /// eliminate accumulated drift and ensure `should_compact()` triggers
+    /// at the right time.
+    ///
+    /// When `prompt_tokens` is 0 (some providers don't report it), we leave
+    /// `used_tokens` unchanged and fall back to the heuristic estimate.
     pub fn update_usage_from_response(&mut self, usage: &Usage) {
         self.cumulative_tokens += usage.total_tokens;
+
+        // Calibrate used_tokens from the provider's actual prompt token count.
+        // prompt_tokens = all tokens sent to the API (system + history + tools).
+        // This is the ground truth — use it to correct heuristic drift.
+        if usage.prompt_tokens > 0 {
+            let old = self.used_tokens;
+            self.used_tokens = usage.prompt_tokens;
+            debug!(
+                "Calibrated used_tokens from API: {} -> {} (drift was {} tokens)",
+                old, self.used_tokens, (self.used_tokens as i64 - old as i64).abs()
+            );
+        }
+
         debug!(
-            "Updated cumulative tokens: {} (used: {}/{}, cumulative: {})",
-            usage.total_tokens, self.used_tokens, self.total_tokens, self.cumulative_tokens
+            "Post-calibration: used={}/{}, cumulative={}",
+            self.used_tokens, self.total_tokens, self.cumulative_tokens
         );
     }
diff --git a/crates/g3-core/src/lib.rs b/crates/g3-core/src/lib.rs
index f3f28ae..9425c86 100644
--- a/crates/g3-core/src/lib.rs
+++ b/crates/g3-core/src/lib.rs
@@ -1096,18 +1096,6 @@ impl Agent {
         let response_content = task_result.response.clone();
         let _llm_duration = llm_start.elapsed();
 
-        // Create a mock usage for now (we'll need to track this during streaming)
-        let mock_usage = g3_providers::Usage {
-            prompt_tokens: 100, // Estimate
-            completion_tokens: response_content.len() as u32 / 4, // Rough estimate
-            total_tokens: 100 + (response_content.len() as u32 / 4),
-            cache_creation_tokens: 0,
-            cache_read_tokens: 0,
-        };
-
-        // Update context window with estimated token usage
-        self.context_window.update_usage(&mock_usage);
-
         // Add assistant response to context window only if not empty
         // This prevents the "Skipping empty message" warning when only tools were executed
         // Also strip timing footer - it's display-only and shouldn't be in context
@@ -2192,6 +2180,7 @@ Skip if nothing new.
 Be brief."#;
         mut request: CompletionRequest,
         show_timing: bool,
     ) -> Result {
+        // =========================================================================
         // STREAMING COMPLETION WITH TOOL EXECUTION
         // =========================================================================
@@ -2324,6 +2313,10 @@ Skip if nothing new. Be brief."#;
                         iter.accumulated_usage = Some(usage.clone());
                         state.turn_accumulated_usage = Some(usage.clone());
 
+                        // Calibrate context window with actual API usage immediately
+                        // (must happen here, not after the loop, because early returns bypass post-loop code)
+                        self.context_window.update_usage_from_response(usage);
+
                         // Update cumulative cache statistics
                         self.cache_stats.total_calls += 1;
                         self.cache_stats.total_input_tokens += usage.prompt_tokens as u64;
@@ -2899,12 +2892,9 @@ Skip if nothing new. Be brief."#;
             }
         }
 
-        // Update context window with actual usage if available
-        if let Some(usage) = iter.accumulated_usage {
-            debug!("Updating context window with actual usage from stream");
-            self.context_window.update_usage_from_response(&usage);
-        } else {
-            // Fall back to estimation if no usage data was provided
+        // Fall back to estimation if no usage data was provided by the stream
+        // (calibration already happened inline when usage data arrived)
+        if iter.accumulated_usage.is_none() {
             debug!("No usage data from stream, using estimation");
             let estimated_tokens = ContextWindow::estimate_tokens(&iter.current_response);
             self.context_window.add_streaming_tokens(estimated_tokens);
diff --git a/crates/g3-core/tests/mock_provider_integration_test.rs b/crates/g3-core/tests/mock_provider_integration_test.rs
index 210b152..6040628 100644
--- a/crates/g3-core/tests/mock_provider_integration_test.rs
+++ b/crates/g3-core/tests/mock_provider_integration_test.rs
@@ -714,10 +714,6 @@ async fn test_token_counting_no_double_count() {
 
     let (mut agent, _temp_dir) = create_agent_with_mock(provider).await;
 
-    // Get initial token count
-    let initial_used = agent.get_context_window().used_tokens;
-    let initial_percentage = agent.get_context_window().percentage_used();
-
     // Execute a task
     agent.execute_task("Say something short", None, false).await.unwrap();
 
@@ -725,18 +721,16 @@
     let final_used = agent.get_context_window().used_tokens;
     let final_percentage = agent.get_context_window().percentage_used();
 
-    // The increase should be reasonable (not doubled)
-    // A short response + user message should be < 1000 tokens
-    let token_increase = final_used - initial_used;
+    // With calibration, used_tokens should be snapped to the mock's prompt_tokens (100)
+    // plus any heuristic addition from the assistant response message added after calibration.
+    // The key invariant: no double-counting that would push us to 80%+.
     assert!(
-        token_increase < 1000,
-        "Token increase should be reasonable, got {} ({}% -> {}%)",
-        token_increase,
-        initial_percentage,
-        final_percentage
+        final_used < 2000,
+        "After calibration from mock (prompt_tokens=100), used_tokens should be low, got {}",
+        final_used
     );
 
-    // Percentage should also be reasonable (not jumping to 80%+)
+    // Percentage should be very low (not jumping to 80%+ from double-counting)
     assert!(
         final_percentage < 50.0,
         "Context percentage should be reasonable after one exchange, got {}%",
diff --git a/crates/g3-core/tests/test_token_counting.rs b/crates/g3-core/tests/test_token_counting.rs
index aa102b1..d310a39 100644
--- a/crates/g3-core/tests/test_token_counting.rs
+++ b/crates/g3-core/tests/test_token_counting.rs
@@ -1,8 +1,7 @@
 use g3_core::ContextWindow;
 use g3_providers::{Message, MessageRole, Usage};
 
-/// Test that used_tokens is tracked via add_message, not update_usage_from_response.
-/// This is critical for the 80% compaction threshold to work correctly.
+/// Test that used_tokens is tracked via add_message.
 #[test]
 fn test_used_tokens_tracked_via_messages() {
     let mut window = ContextWindow::new(10000);
@@ -23,17 +22,17 @@
     assert!(window.used_tokens > tokens_after_user_msg,
         "used_tokens should increase after adding assistant message");
 }
 
-/// Test that update_usage_from_response only updates cumulative_tokens, not used_tokens.
-/// This prevents double-counting which was causing the 80% threshold to be reached at 200%+.
+/// Test that update_usage_from_response calibrates used_tokens from prompt_tokens.
+/// When prompt_tokens > 0, used_tokens is snapped to the API's ground truth.
+/// When prompt_tokens is 0, used_tokens is left unchanged (heuristic fallback).
 #[test]
-fn test_update_usage_only_affects_cumulative() {
+fn test_update_usage_calibrates_used_tokens() {
     let mut window = ContextWindow::new(10000);
 
-    // Initial state
     assert_eq!(window.used_tokens, 0);
     assert_eq!(window.cumulative_tokens, 0);
 
-    // Simulate API response with usage data
+    // Simulate API response — prompt_tokens > 0 triggers calibration
     let usage = Usage {
         prompt_tokens: 100,
         completion_tokens: 50,
@@ -43,13 +42,13 @@
     };
     window.update_usage_from_response(&usage);
 
-    // used_tokens should NOT change - it's tracked via add_message
-    assert_eq!(window.used_tokens, 0, "used_tokens should not be updated by update_usage_from_response");
-
-    // cumulative_tokens SHOULD be updated for API usage tracking
+    // used_tokens should be calibrated to prompt_tokens
+    assert_eq!(window.used_tokens, 100, "used_tokens should be calibrated to prompt_tokens");
+
+    // cumulative_tokens tracks total API usage
     assert_eq!(window.cumulative_tokens, 150, "cumulative_tokens should track total API usage");
 
-    // Another API call
+    // Another API call with higher prompt_tokens
     let usage2 = Usage {
         prompt_tokens: 200,
         completion_tokens: 75,
@@ -59,11 +58,27 @@
     window.update_usage_from_response(&usage2);
 
-    // used_tokens still unchanged
-    assert_eq!(window.used_tokens, 0, "used_tokens should remain unchanged");
-
+    // used_tokens calibrated to latest prompt_tokens
+    assert_eq!(window.used_tokens, 200, "used_tokens should be calibrated to latest prompt_tokens");
+
     // cumulative_tokens accumulates
     assert_eq!(window.cumulative_tokens, 425, "cumulative_tokens should accumulate");
+
+    // When prompt_tokens is 0, used_tokens should NOT change (fallback)
+    let usage3 = Usage {
+        prompt_tokens: 0,
+        completion_tokens: 30,
+        total_tokens: 30,
+        cache_creation_tokens: 0,
+        cache_read_tokens: 0,
+    };
+    window.update_usage_from_response(&usage3);
+
+    // used_tokens unchanged (prompt_tokens was 0)
+    assert_eq!(window.used_tokens, 200, "used_tokens should not change when prompt_tokens is 0");
+
+    // cumulative_tokens still accumulates
+    assert_eq!(window.cumulative_tokens, 455, "cumulative_tokens should still accumulate");
 }
 
 /// Test that add_streaming_tokens only updates cumulative_tokens.
@@ -112,7 +127,6 @@ fn test_percentage_based_on_used_tokens() {
 }
 
 /// Test that the 80% compaction threshold works correctly.
-/// This was the original bug - used_tokens was being double/triple counted.
 #[test]
 fn test_should_compact_threshold() {
     let mut window = ContextWindow::new(1000);
@@ -125,7 +139,6 @@
     }
 
     // Should be around 720 tokens (72%) - not yet at threshold
-    // Note: actual token count depends on estimation algorithm
     let percentage = window.percentage_used();
     println!("After 9 messages: {}% used ({} tokens)", percentage, window.used_tokens);
 
@@ -142,21 +155,19 @@
     }
 }
 
-/// Test that cumulative_tokens and used_tokens are independent.
+/// Test that calibration and cumulative tracking work together correctly.
 #[test]
-fn test_cumulative_vs_used_independence() {
+fn test_calibration_and_cumulative_interaction() {
     let mut window = ContextWindow::new(10000);
 
-    // Add a message (affects used_tokens)
+    // Add a message (affects both used_tokens and cumulative_tokens)
     let msg = Message::new(MessageRole::User, "Hello world".to_string());
     window.add_message(msg);
     let used_after_msg = window.used_tokens;
     let cumulative_after_msg = window.cumulative_tokens;
-
-    // Both should be equal at this point (message adds to both)
     assert_eq!(used_after_msg, cumulative_after_msg);
 
-    // Now simulate API response (only affects cumulative_tokens)
+    // Simulate API response — calibrates used_tokens, accumulates cumulative_tokens
    let usage = Usage {
        prompt_tokens: 500,
        completion_tokens: 200,
@@ -166,12 +177,43 @@
     };
     window.update_usage_from_response(&usage);
 
-    // used_tokens unchanged
-    assert_eq!(window.used_tokens, used_after_msg, "used_tokens should not change from API response");
+    // used_tokens calibrated to prompt_tokens (500)
+    assert_eq!(window.used_tokens, 500, "used_tokens should be calibrated to prompt_tokens");
 
-    // cumulative_tokens increased
+    // cumulative_tokens increased by total_tokens
     assert_eq!(window.cumulative_tokens, cumulative_after_msg + 700, "cumulative_tokens should increase");
 
     // They should now be different
     assert!(window.cumulative_tokens > window.used_tokens, "cumulative should be greater than used");
 }
+
+/// Test that calibration corrects heuristic undercount.
+/// The heuristic doesn't account for tool definitions (~4000 tokens),
+/// so prompt_tokens from the API is always larger.
+#[test]
+fn test_calibration_corrects_undercount() {
+    let mut window = ContextWindow::new(200000);
+
+    // Simulate adding a system prompt and user message via heuristic
+    let system_msg = Message::new(MessageRole::System, "x".repeat(4000)); // ~1000 tokens
+    window.add_message(system_msg);
+    let user_msg = Message::new(MessageRole::User, "Hello".to_string());
+    window.add_message(user_msg);
+
+    let heuristic_estimate = window.used_tokens;
+    assert!(heuristic_estimate > 0);
+
+    // API reports higher prompt_tokens (includes tool definitions)
+    let usage = Usage {
+        prompt_tokens: heuristic_estimate + 4000, // tool definitions add ~4000 tokens
+        completion_tokens: 100,
+        total_tokens: heuristic_estimate + 4100,
+        cache_creation_tokens: 0,
+        cache_read_tokens: 0,
+    };
+    window.update_usage_from_response(&usage);
+
+    // used_tokens should now be higher than the heuristic estimate
+    assert_eq!(window.used_tokens, heuristic_estimate + 4000);
+    assert!(window.used_tokens > heuristic_estimate, "calibration should correct undercount");
+}
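To make the "inline calibration, not post-loop" constraint from memory.md concrete, here is a stripped-down sketch of the streaming flow. The types and loop shape are simplified assumptions (not the actual g3-core streaming code); the point it illustrates is that text-only responses hit the early `finished` return, so calibration only ever runs if it happens the moment the usage chunk arrives:

```rust
// Simplified sketch of the streaming-flow constraint described in memory.md.
// Types and the loop shape are illustrative assumptions, not the g3-core code.
struct Usage { prompt_tokens: u32, total_tokens: u32 }
struct Chunk { text: Option<String>, usage: Option<Usage>, finished: bool }

#[derive(Default)]
struct ContextWindowSketch { used_tokens: u32, cumulative_tokens: u32 }

impl ContextWindowSketch {
    fn update_usage_from_response(&mut self, u: &Usage) {
        self.cumulative_tokens += u.total_tokens;
        if u.prompt_tokens > 0 {
            // Snap to the provider's ground-truth prompt count (calibration).
            self.used_tokens = u.prompt_tokens;
        }
    }
}

fn stream_turn(chunks: Vec<Chunk>, window: &mut ContextWindowSketch) -> String {
    let mut response = String::new();
    for chunk in chunks {
        if let Some(text) = chunk.text {
            response.push_str(&text);
        }
        if let Some(usage) = &chunk.usage {
            // Calibrate inline, the moment usage arrives: the early return
            // below means post-loop code never runs for text-only responses.
            window.update_usage_from_response(usage);
        }
        if chunk.finished {
            return response; // early return path — bypasses post-loop bookkeeping
        }
    }
    // Post-loop fallback: only reached if the stream never signalled `finished`,
    // where a heuristic estimate would be used instead of calibration.
    response
}

fn main() {
    let mut window = ContextWindowSketch::default();
    let chunks = vec![
        Chunk { text: Some("Hello".into()), usage: None, finished: false },
        Chunk { text: None, usage: Some(Usage { prompt_tokens: 1234, total_tokens: 1300 }), finished: false },
        Chunk { text: None, usage: None, finished: true },
    ];
    let _ = stream_turn(chunks, &mut window);
    assert_eq!(window.used_tokens, 1234); // calibrated despite the early return
}
```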