diff --git a/analysis/memory.md b/analysis/memory.md index 611c2ce..3b408c3 100644 --- a/analysis/memory.md +++ b/analysis/memory.md @@ -1,5 +1,5 @@ # Workspace Memory -> Updated: 2026-02-11T03:39:03Z | Size: 29.0k chars +> Updated: 2026-02-11T05:07:33Z | Size: 30.0k chars ### Remember Tool Wiring - `crates/g3-core/src/tools/memory.rs` [0..5000] - `execute_remember()`, `get_memory_path()`, `merge_memory()` @@ -452,4 +452,11 @@ Makes tool output responsive to terminal width - no line wrapping, with 4-char r - `crates/g3-providers/src/anthropic.rs` [369..435] - `strip_orphaned_tool_use()` defense-in-depth - Post-processing pass in `convert_messages()` detects orphaned `tool_use` blocks (no matching `tool_result` in next message) - Strips orphaned blocks with warning, adds placeholder text if message becomes empty -- Tests: `test_compaction_strips_tool_calls_from_last_assistant`, `test_compaction_drops_assistant_with_only_tool_calls_no_text`, `test_compaction_preserves_normal_assistant_message` (unit), `test_strip_orphaned_tool_use_*` (anthropic), `test_compaction_strips_structured_tool_calls` (integration) \ No newline at end of file +- Tests: `test_compaction_strips_tool_calls_from_last_assistant`, `test_compaction_drops_assistant_with_only_tool_calls_no_text`, `test_compaction_preserves_normal_assistant_message` (unit), `test_strip_orphaned_tool_use_*` (anthropic), `test_compaction_strips_structured_tool_calls` (integration) + +### Tool Call Token Tracking Fix (2026-02-11) +- `crates/g3-core/src/context_window.rs` [199..220] - `estimate_message_tokens()` accounts for both `message.content` and `message.tool_calls[].input` +- **Root cause**: `estimate_tokens()` only counted `message.content` chars, ignoring `tool_calls[].input` JSON. When sent to API, `tool_use` blocks include full input, causing massive undercount. +- **Impact**: In a real session, 303k chars of tool input (101k tokens) were invisible to the tracker. Context showed 39% but actual was >100%. Compaction never triggered → API 400 error. +- **Fix**: Added `estimate_message_tokens(message)` that sums content tokens + per-tool-call input tokens (chars/3 * 1.1 + 20 overhead). Updated `add_message_with_tokens()`, `recalculate_tokens()`, `clear_conversation()` to use it. +- **Tests**: 7 unit tests in `context_window.rs`, 1 integration test in `mock_provider_integration_test.rs::test_tool_call_input_tokens_tracked_in_context_window` \ No newline at end of file diff --git a/crates/g3-core/src/context_window.rs b/crates/g3-core/src/context_window.rs index 8009a74..8ae0d1d 100644 --- a/crates/g3-core/src/context_window.rs +++ b/crates/g3-core/src/context_window.rs @@ -109,7 +109,7 @@ impl ContextWindow { return; } - let token_count = tokens.unwrap_or_else(|| Self::estimate_tokens(&message.content)); + let token_count = tokens.unwrap_or_else(|| Self::estimate_message_tokens(&message)); self.used_tokens += token_count; self.cumulative_tokens += token_count; self.conversation_history.push(message); @@ -134,7 +134,7 @@ impl ContextWindow { self.used_tokens = self .conversation_history .iter() - .map(|m| Self::estimate_tokens(&m.content)) + .map(|m| Self::estimate_message_tokens(m)) .sum(); self.last_thinning_percentage = 0; } @@ -178,7 +178,7 @@ impl ContextWindow { self.used_tokens = self .conversation_history .iter() - .map(|m| Self::estimate_tokens(&m.content)) + .map(|m| Self::estimate_message_tokens(m)) .sum(); debug!("Recalculated tokens after thinning: {} tokens", self.used_tokens); } @@ -197,6 +197,29 @@ impl ContextWindow { (base_estimate as f32 * 1.1).ceil() as u32 } + /// Estimate tokens for a full message, including structured tool_calls. + /// + /// When the message is sent to the API, tool_calls are serialized as + /// structured blocks (e.g. Anthropic `tool_use`) whose input JSON counts + /// toward the prompt token budget. `estimate_tokens()` only looks at + /// `message.content`, so tool_call inputs were previously invisible to + /// the token tracker — causing used_tokens to massively undercount and + /// compaction to never trigger. + pub fn estimate_message_tokens(message: &Message) -> u32 { + let mut total = Self::estimate_tokens(&message.content); + for tc in &message.tool_calls { + // Serialize the input Value to a string for size estimation. + // Tool call inputs are always JSON/structured, so use the + // code/JSON heuristic (chars/3 * 1.1). + let input_str = tc.input.to_string(); + let base = (input_str.len() as f32 / 3.0).ceil() as u32; + let tc_tokens = (base as f32 * 1.1).ceil() as u32; + // Also count the tool name + id overhead (~20 tokens) + total += tc_tokens + 20; + } + total + } + // ======================================================================== // Capacity Queries // ======================================================================== @@ -1002,4 +1025,149 @@ mod tests { assert!(assistant_msgs[0].tool_calls.is_empty()); assert!(assistant_msgs[0].content.contains("Hello! How can I help you today?")); } + + // ==================================================================== + // Tool-call token tracking tests + // ==================================================================== + + #[test] + fn test_estimate_message_tokens_content_only() { + // Message without tool_calls should behave like estimate_tokens + let msg = Message::new(MessageRole::Assistant, "Hello world".to_string()); + let msg_tokens = ContextWindow::estimate_message_tokens(&msg); + let text_tokens = ContextWindow::estimate_tokens("Hello world"); + assert_eq!(msg_tokens, text_tokens); + } + + #[test] + fn test_estimate_message_tokens_with_tool_calls() { + // Message with tool_calls should count both content and tool input + let mut msg = Message::new(MessageRole::Assistant, "Let me read that.".to_string()); + msg.tool_calls.push(MessageToolCall { + id: "toolu_abc".to_string(), + name: "shell".to_string(), + input: serde_json::json!({"command": "echo hello world this is a moderately long command string for testing purposes"}), + }); + + let msg_tokens = ContextWindow::estimate_message_tokens(&msg); + let text_only_tokens = ContextWindow::estimate_tokens("Let me read that."); + + // Must be strictly greater than text-only estimate + assert!( + msg_tokens > text_only_tokens, + "estimate_message_tokens ({}) should be > text-only estimate ({})", + msg_tokens, text_only_tokens + ); + + // The tool input is ~90 chars of JSON → ~30 tokens + 20 overhead = ~50 extra + assert!( + msg_tokens >= text_only_tokens + 20, + "tool_call should add at least 20 tokens overhead, got delta={}", + msg_tokens - text_only_tokens + ); + } + + #[test] + fn test_estimate_message_tokens_empty_content_with_tool_calls() { + // Message with empty content but tool_calls should still count tool input + let mut msg = Message::new(MessageRole::Assistant, "".to_string()); + msg.tool_calls.push(MessageToolCall { + id: "toolu_xyz".to_string(), + name: "write_envelope".to_string(), + input: serde_json::json!({"facts": "a]".repeat(1000)}), + }); + + let tokens = ContextWindow::estimate_message_tokens(&msg); + assert!(tokens > 100, "Large tool input should produce significant token count, got {}", tokens); + } + + #[test] + fn test_estimate_message_tokens_large_tool_input() { + // Simulate the write_envelope case: 3751 chars of YAML in tool input + let large_yaml = "a: b\n".repeat(750); // ~3750 chars + let mut msg = Message::new(MessageRole::Assistant, "Writing envelope.".to_string()); + msg.tool_calls.push(MessageToolCall { + id: "toolu_env".to_string(), + name: "write_envelope".to_string(), + input: serde_json::json!({"facts": large_yaml}), + }); + + let tokens = ContextWindow::estimate_message_tokens(&msg); + // 3750 chars of JSON / 3 * 1.1 ≈ 1375 tokens + 20 overhead + content tokens + assert!(tokens > 1000, "Large tool input should produce >1000 tokens, got {}", tokens); + } + + #[test] + fn test_add_message_counts_tool_call_tokens() { + let mut cw = ContextWindow::new(200_000); + + // Add a message with tool_calls + let mut msg = Message::new(MessageRole::Assistant, "Running command.".to_string()); + msg.tool_calls.push(MessageToolCall { + id: "toolu_1".to_string(), + name: "shell".to_string(), + input: serde_json::json!({"command": "x]".repeat(500)}), + }); + + cw.add_message(msg); + + // used_tokens should reflect the tool_call input, not just the content + let content_only = ContextWindow::estimate_tokens("Running command."); + assert!( + cw.used_tokens > content_only, + "used_tokens ({}) should be > content-only estimate ({})", + cw.used_tokens, content_only + ); + } + + #[test] + fn test_should_compact_triggers_with_tool_call_tokens() { + // Reproduce the core bug: tool_calls push real usage past 80% but + // the old code would have tracked only content tokens (staying low). + let mut cw = ContextWindow::new(1000); + + // Add a message with small content but large tool input + // Content: ~5 tokens. Tool input: ~1000 chars → ~367 tokens + 20 = ~387 + // Total: ~392 tokens → 39% of 1000. Not enough alone. + // Add several to push past 80%. + for i in 0..3 { + let mut msg = Message::new(MessageRole::Assistant, "ok".to_string()); + msg.tool_calls.push(MessageToolCall { + id: format!("toolu_{}", i), + name: "shell".to_string(), + input: serde_json::json!({"command": "x".repeat(800)}), + }); + cw.add_message(msg); + // Also add a tool result + let mut result = Message::new(MessageRole::User, "Tool result: done".to_string()); + result.tool_result_id = Some(format!("toolu_{}", i)); + cw.add_message(result); + } + + // With tool_call tracking, should_compact should trigger + assert!( + cw.should_compact(), + "should_compact should trigger when tool_calls push past 80%, percentage={}%", + cw.percentage_used() + ); + } + + #[test] + fn test_recalculate_tokens_includes_tool_calls() { + let mut cw = ContextWindow::new(200_000); + + let mut msg = Message::new(MessageRole::Assistant, "hi".to_string()); + msg.tool_calls.push(MessageToolCall { + id: "toolu_r".to_string(), + name: "shell".to_string(), + input: serde_json::json!({"command": "x".repeat(600)}), + }); + cw.add_message(msg); + + let tokens_after_add = cw.used_tokens; + cw.recalculate_tokens(); + + assert_eq!(cw.used_tokens, tokens_after_add, + "recalculate_tokens should produce same result as add_message for tool_call messages"); + } } diff --git a/crates/g3-core/tests/mock_provider_integration_test.rs b/crates/g3-core/tests/mock_provider_integration_test.rs index a122db5..037f934 100644 --- a/crates/g3-core/tests/mock_provider_integration_test.rs +++ b/crates/g3-core/tests/mock_provider_integration_test.rs @@ -1182,3 +1182,238 @@ async fn test_triple_stuttered_tool_calls() { } } } + +/// Test: Tool call input tokens are tracked in context window +/// +/// Exact reproduction of the session trace bug from h3 session +/// create_a_plan_every_time_b38f28e2d722d6da: +/// +/// - 590 messages, 289 with tool_calls containing 303,046 chars of input +/// - Context window reported 39% (78,739 tokens) based on content only +/// - Actual API usage was 200,230 tokens (>100%) +/// - Compaction never triggered because should_compact() saw 39% +/// - Next API call got 400 "prompt is too long: 200230 tokens > 200000 maximum" +/// +/// This test replays a scaled-down version of that message pattern and verifies +/// that should_compact() triggers when tool_call inputs push past 80%. +#[tokio::test] +async fn test_tool_call_input_tokens_tracked_in_context_window() { + use g3_core::context_window::ContextWindow; + use g3_providers::MessageToolCall; + + // Use 200k tokens like the real session + let mut cw = ContextWindow::new(200_000); + + // Add system messages (~18k chars like the real session) + cw.add_message(Message::new( + MessageRole::System, + "You are G3, an AI programming agent. ".repeat(140), // ~5.2k chars + )); + cw.add_message(Message::new( + MessageRole::System, + "Workspace memory and project context. ".repeat(350), // ~13.3k chars + )); + + // Add a compaction summary (simulating prior compaction) + cw.add_message(Message::new( + MessageRole::User, + "Previous conversation summary: Building a training metrics dashboard...".repeat(10), // ~700 chars + )); + cw.add_message(Message::new( + MessageRole::Assistant, + "Continuing work on the recognizer.".to_string(), + )); + + // Now simulate the core pattern: many tool calls with large inputs. + // The real session had 289 tool calls with avg ~1048 chars of input each. + // We scale inputs to produce ~500k chars of tool input total (matching + // the real session's ratio where tool inputs were ~57% of all chars). + // + // Key tool types from the session: + // - plan_write: ~10-13k chars input each (6 calls) + // - str_replace: ~500-9k chars input each (50+ calls) + // - shell: ~700-28k chars input each (30+ calls) + // - write_envelope: ~3.9k chars input (1 call, the final straw) + + let mut compaction_triggered_at_msg: Option = None; + let mut msg_count = 4; // system + summary messages above + + // Simulate plan_write calls (large inputs) + for i in 0..5 { + let plan_yaml = format!( + "plan_id: test-plan\nrevision: {}\nitems:\n{}", + i + 1, + " - id: I1\n description: Test item with lots of detail about the recognizer implementation including token types and obligation handling\n state: doing\n touches: [src/recognize.rs, src/token.rs, src/obligation.rs, src/grammar.rs]\n checks:\n happy:\n desc: All forms recognized correctly\n target: recognize::tests\n".repeat(60) + ); + let mut msg = Message::new(MessageRole::Assistant, format!("Updating plan revision {}.", i + 1)); + msg.tool_calls.push(MessageToolCall { + id: format!("toolu_plan_{}", i), + name: "plan_write".to_string(), + input: serde_json::json!({"plan": plan_yaml}), + }); + cw.add_message(msg); + msg_count += 1; + + let mut result = Message::new(MessageRole::User, "Tool result: Plan updated.".to_string()); + result.tool_result_id = Some(format!("toolu_plan_{}", i)); + cw.add_message(result); + msg_count += 1; + + if compaction_triggered_at_msg.is_none() && cw.should_compact() { + compaction_triggered_at_msg = Some(msg_count); + } + } + + // Simulate str_replace calls (medium inputs) + for i in 0..40 { + let diff_content = format!( + "@@ -1,5 +1,50 @@\n-old line\n+{}\n context line\n+{}\n", + format!(" pub fn recognize_form_{i}(&mut self, token: Token) -> Result {{\n match token {{\n Token::StartBegin => self.push_obligation(NeedBeginBodyOrClose),\n Token::StartSetBang => self.push_obligation(NeedSetBangName),\n _ => Err(RecognizeError::UnexpectedToken(token)),\n }}\n }}\n").repeat(6), + format!(" #[test]\n fn test_recognize_form_{i}() {{\n let mut r = Recognizer::new();\n assert!(r.recognize_form_{i}(Token::StartBegin).is_ok());\n }}\n").repeat(6), + ); + let mut msg = Message::new(MessageRole::Assistant, "Applying diff.".to_string()); + msg.tool_calls.push(MessageToolCall { + id: format!("toolu_str_{}", i), + name: "str_replace".to_string(), + input: serde_json::json!({ + "file_path": "src/recognize.rs", + "diff": diff_content + }), + }); + cw.add_message(msg); + msg_count += 1; + + let mut result = Message::new(MessageRole::User, "Tool result: Applied diff.".to_string()); + result.tool_result_id = Some(format!("toolu_str_{}", i)); + cw.add_message(result); + msg_count += 1; + + if compaction_triggered_at_msg.is_none() && cw.should_compact() { + compaction_triggered_at_msg = Some(msg_count); + } + } + + // Simulate shell calls (variable size inputs, some very large) + for i in 0..30 { + let command = if i % 3 == 0 { + // Large shell commands (like Python scripts generating corpus files) + format!( + "python3 << 'EOF'\nimport os\nfor i in range(100):\n with open(f'corpus/{{i:03d}}.scm', 'w') as f:\n f.write('(define (func-{{}} x) (+ x 1))'.format(i))\n{}\nEOF", + " f.write('(define (helper-{i} x y) (if (> x y) (- x y) (+ x y)))\\n')\n".repeat(250) + ) + } else { + format!("cargo test --test test_{}", i) + }; + let mut msg = Message::new(MessageRole::Assistant, "".to_string()); + msg.tool_calls.push(MessageToolCall { + id: format!("toolu_sh_{}", i), + name: "shell".to_string(), + input: serde_json::json!({"command": command}), + }); + // Force-add messages with empty content but tool_calls + cw.conversation_history.push(msg.clone()); + let tc_tokens = ContextWindow::estimate_message_tokens(&msg); + cw.used_tokens += tc_tokens; + cw.cumulative_tokens += tc_tokens; + msg_count += 1; + + let mut result = Message::new( + MessageRole::User, + format!( + "Tool result: Finished `dev` profile target(s) in 0.02s\n Running `target/debug/hcube -t corpus/`\n\nTraining complete: {} observations, {} unique keys, hit_rate={:.3}\n{}", + i * 1000 + 500, i * 100 + 50, 0.696, + " form recognized: (define ...)\n".repeat(20) + ), + ); + result.tool_result_id = Some(format!("toolu_sh_{}", i)); + cw.add_message(result); + msg_count += 1; + + if compaction_triggered_at_msg.is_none() && cw.should_compact() { + compaction_triggered_at_msg = Some(msg_count); + } + } + + // Simulate the write_envelope call (the final straw in the real session) + let envelope_yaml = format!( + "type: code_change\nfacts:\n recognizer_expansion:\n new_special_forms: {}\n new_tokens: [\"StartBegin\", \"StartSetBang\", \"StartLetStar\", \"StartLetrec\", \"CloseLetStar\", \"CloseLetrec\", \"StartCase\", \"StartDo\"]\n new_binding_roles: [\"LetStar\", \"Letrec\", \"Do\"]\n new_obligations:\n{}\n files_touched:\n{}", + "[\"begin\", \"set!\", \"let*\", \"letrec\", \"case\", \"do\"]", + " - \"NeedBeginBodyOrClose\"\n - \"NeedSetBangName\"\n - \"NeedSetBangExpr\"\n".repeat(10), + " - \"src/token.rs\"\n - \"src/obligation.rs\"\n - \"src/grammar.rs\"\n - \"src/recognize.rs\"\n".repeat(10) + ); + let mut msg = Message::new(MessageRole::Assistant, "Writing envelope.".to_string()); + msg.tool_calls.push(MessageToolCall { + id: "toolu_envelope".to_string(), + name: "write_envelope".to_string(), + input: serde_json::json!({"facts": envelope_yaml}), + }); + cw.add_message(msg); + + // ==================================================================== + // Assertions + // ==================================================================== + + // 1. should_compact MUST have triggered before we reached 100% + assert!( + compaction_triggered_at_msg.is_some(), + "should_compact() should have triggered during the session! \ + Final percentage: {:.1}%, used_tokens: {}, total_tokens: {}", + cw.percentage_used(), + cw.used_tokens, + cw.total_tokens, + ); + + let trigger_msg = compaction_triggered_at_msg.unwrap(); + assert!( + trigger_msg < msg_count, + "Compaction should trigger well before the last message (triggered at msg {}, total {})", + trigger_msg, + msg_count, + ); + + // 2. Verify the OLD behavior would have MISSED compaction. + // Calculate what used_tokens would be with content-only estimation. + let content_only_tokens: u32 = cw + .conversation_history + .iter() + .map(|m| ContextWindow::estimate_tokens(&m.content)) + .sum(); + let content_only_percentage = (content_only_tokens as f32 / 200_000.0) * 100.0; + + // The content-only estimate should be well below 80% (the compaction threshold) + // In the real session it was 39%. + assert!( + content_only_percentage < 80.0, + "Content-only token estimate ({:.1}%) should be below 80% compaction threshold \ + (this proves the old code would have missed compaction). \ + Content-only tokens: {}", + content_only_percentage, + content_only_tokens, + ); + + // 3. The actual tracked percentage (with tool_calls) should be >= 80% + assert!( + cw.percentage_used() >= 80.0, + "Actual percentage with tool_call tracking ({:.1}%) should be >= 80%", + cw.percentage_used(), + ); + + // 4. The gap between content-only and actual should be significant + let gap = cw.percentage_used() - content_only_percentage; + assert!( + gap > 20.0, + "Gap between actual ({:.1}%) and content-only ({:.1}%) should be >20% \ + (tool_call inputs are a major portion of real token usage). Gap: {:.1}%", + cw.percentage_used(), + content_only_percentage, + gap, + ); + + // 5. recalculate_tokens should agree with the tracked count + let tracked = cw.used_tokens; + cw.recalculate_tokens(); + assert_eq!( + cw.used_tokens, tracked, + "recalculate_tokens() should agree with incrementally tracked used_tokens" + ); +}