Fix tool_call input tokens invisible to context window tracker

estimate_tokens() only counted message.content chars, completely
ignoring message.tool_calls[].input JSON. When sent to the API,
tool_use blocks include full input, so the token tracker massively
undercounted — in one session, 303k chars (101k tokens) of tool
input were invisible, showing 39% usage when actual was >100%.
Compaction never triggered, causing an API 400 error.

Added estimate_message_tokens() that accounts for both content and
tool_call input. Updated add_message_with_tokens(), recalculate_tokens(),
and clear_conversation() to use it.

7 unit tests + 1 integration test reproducing the exact session trace.
This commit is contained in:
Dhanji R. Prasanna
2026-02-11 16:12:13 +11:00
parent d61be719c2
commit 88d2b9592b
3 changed files with 415 additions and 5 deletions

View File

@@ -109,7 +109,7 @@ impl ContextWindow {
return;
}
let token_count = tokens.unwrap_or_else(|| Self::estimate_tokens(&message.content));
let token_count = tokens.unwrap_or_else(|| Self::estimate_message_tokens(&message));
self.used_tokens += token_count;
self.cumulative_tokens += token_count;
self.conversation_history.push(message);
@@ -134,7 +134,7 @@ impl ContextWindow {
self.used_tokens = self
.conversation_history
.iter()
.map(|m| Self::estimate_tokens(&m.content))
.map(|m| Self::estimate_message_tokens(m))
.sum();
self.last_thinning_percentage = 0;
}
@@ -178,7 +178,7 @@ impl ContextWindow {
self.used_tokens = self
.conversation_history
.iter()
.map(|m| Self::estimate_tokens(&m.content))
.map(|m| Self::estimate_message_tokens(m))
.sum();
debug!("Recalculated tokens after thinning: {} tokens", self.used_tokens);
}
@@ -197,6 +197,29 @@ impl ContextWindow {
(base_estimate as f32 * 1.1).ceil() as u32
}
/// Estimate tokens for a full message, including structured tool_calls.
///
/// When the message is sent to the API, tool_calls are serialized as
/// structured blocks (e.g. Anthropic `tool_use`) whose input JSON counts
/// toward the prompt token budget. `estimate_tokens()` only looks at
/// `message.content`, so tool_call inputs were previously invisible to
/// the token tracker — causing used_tokens to massively undercount and
/// compaction to never trigger.
pub fn estimate_message_tokens(message: &Message) -> u32 {
    let content_tokens = Self::estimate_tokens(&message.content);
    let tool_call_tokens: u32 = message
        .tool_calls
        .iter()
        .map(|call| {
            // Render the structured input as JSON text so its size can be
            // measured. Tool inputs are always JSON/structured, so apply
            // the denser code/JSON heuristic: ceil(chars / 3) plus a 10%
            // safety margin.
            let json_len = call.input.to_string().len();
            let base = (json_len as f32 / 3.0).ceil() as u32;
            let estimated = (base as f32 * 1.1).ceil() as u32;
            // Flat ~20-token allowance for the tool name + id framing.
            estimated + 20
        })
        .sum();
    content_tokens + tool_call_tokens
}
// ========================================================================
// Capacity Queries
// ========================================================================
@@ -1002,4 +1025,149 @@ mod tests {
assert!(assistant_msgs[0].tool_calls.is_empty());
assert!(assistant_msgs[0].content.contains("Hello! How can I help you today?"));
}
// ====================================================================
// Tool-call token tracking tests
// ====================================================================
#[test]
fn test_estimate_message_tokens_content_only() {
    // With no tool_calls attached, the message-level estimate must
    // collapse to the plain-text estimate of the content alone.
    let message = Message::new(MessageRole::Assistant, "Hello world".to_string());
    assert_eq!(
        ContextWindow::estimate_message_tokens(&message),
        ContextWindow::estimate_tokens("Hello world"),
    );
}
#[test]
fn test_estimate_message_tokens_with_tool_calls() {
    // Attaching a tool_call must raise the estimate above what the
    // content text alone would produce.
    let mut message = Message::new(MessageRole::Assistant, "Let me read that.".to_string());
    message.tool_calls.push(MessageToolCall {
        id: "toolu_abc".to_string(),
        name: "shell".to_string(),
        input: serde_json::json!({"command": "echo hello world this is a moderately long command string for testing purposes"}),
    });
    let with_tool = ContextWindow::estimate_message_tokens(&message);
    let content_only = ContextWindow::estimate_tokens("Let me read that.");
    // The tool input must contribute — strictly greater than text-only.
    assert!(
        with_tool > content_only,
        "estimate_message_tokens ({}) should be > text-only estimate ({})",
        with_tool, content_only
    );
    // ~90 chars of JSON input is ~30+ tokens, plus the flat 20-token
    // name/id overhead — so the delta is at least 20.
    assert!(
        with_tool >= content_only + 20,
        "tool_call should add at least 20 tokens overhead, got delta={}",
        with_tool - content_only
    );
}
#[test]
fn test_estimate_message_tokens_empty_content_with_tool_calls() {
    // Even with zero content text, a large tool input alone should
    // register a meaningful token count.
    let mut message = Message::new(MessageRole::Assistant, "".to_string());
    message.tool_calls.push(MessageToolCall {
        id: "toolu_xyz".to_string(),
        name: "write_envelope".to_string(),
        input: serde_json::json!({"facts": "a]".repeat(1000)}),
    });
    let estimated = ContextWindow::estimate_message_tokens(&message);
    assert!(estimated > 100, "Large tool input should produce significant token count, got {}", estimated);
}
#[test]
fn test_estimate_message_tokens_large_tool_input() {
    // Simulate the write_envelope failure mode from the original bug
    // report: several thousand chars of YAML riding inside tool input.
    let large_yaml = "a: b\n".repeat(750); // ~3750 chars
    let mut message = Message::new(MessageRole::Assistant, "Writing envelope.".to_string());
    message.tool_calls.push(MessageToolCall {
        id: "toolu_env".to_string(),
        name: "write_envelope".to_string(),
        input: serde_json::json!({"facts": large_yaml}),
    });
    // ceil(3750 / 3) * 1.1 ≈ 1375 tokens, plus overhead and content —
    // comfortably above the 1000-token floor asserted here.
    let estimated = ContextWindow::estimate_message_tokens(&message);
    assert!(estimated > 1000, "Large tool input should produce >1000 tokens, got {}", estimated);
}
#[test]
fn test_add_message_counts_tool_call_tokens() {
    // Adding a message through the normal path must charge the window
    // for the tool_call input, not just the content string.
    let mut window = ContextWindow::new(200_000);
    let mut message = Message::new(MessageRole::Assistant, "Running command.".to_string());
    message.tool_calls.push(MessageToolCall {
        id: "toolu_1".to_string(),
        name: "shell".to_string(),
        input: serde_json::json!({"command": "x]".repeat(500)}),
    });
    window.add_message(message);
    let content_only = ContextWindow::estimate_tokens("Running command.");
    assert!(
        window.used_tokens > content_only,
        "used_tokens ({}) should be > content-only estimate ({})",
        window.used_tokens, content_only
    );
}
#[test]
fn test_should_compact_triggers_with_tool_call_tokens() {
    // Reproduce the core bug: tool_call inputs push real usage past the
    // 80% threshold while content-only tracking would have stayed low.
    let mut window = ContextWindow::new(1000);
    // Each assistant turn carries tiny content ("ok") but an 800-char
    // tool input: the serialized JSON is ~814 chars → ~300 tokens plus
    // the 20-token overhead, so roughly 320 tokens per call. Three such
    // turns (plus small tool results) land well past 80% of 1000.
    for i in 0..3 {
        let mut assistant = Message::new(MessageRole::Assistant, "ok".to_string());
        assistant.tool_calls.push(MessageToolCall {
            id: format!("toolu_{}", i),
            name: "shell".to_string(),
            input: serde_json::json!({"command": "x".repeat(800)}),
        });
        window.add_message(assistant);
        // Pair each call with its tool result, as a real session would.
        let mut result = Message::new(MessageRole::User, "Tool result: done".to_string());
        result.tool_result_id = Some(format!("toolu_{}", i));
        window.add_message(result);
    }
    // With tool_call input tracked, compaction must now trigger.
    assert!(
        window.should_compact(),
        "should_compact should trigger when tool_calls push past 80%, percentage={}%",
        window.percentage_used()
    );
}
#[test]
fn test_recalculate_tokens_includes_tool_calls() {
    // Recounting from history must agree with the incremental count
    // taken at add time — both paths go through estimate_message_tokens.
    let mut window = ContextWindow::new(200_000);
    let mut message = Message::new(MessageRole::Assistant, "hi".to_string());
    message.tool_calls.push(MessageToolCall {
        id: "toolu_r".to_string(),
        name: "shell".to_string(),
        input: serde_json::json!({"command": "x".repeat(600)}),
    });
    window.add_message(message);
    let incremental = window.used_tokens;
    window.recalculate_tokens();
    assert_eq!(window.used_tokens, incremental,
        "recalculate_tokens should produce same result as add_message for tool_call messages");
}
}