From 1b4ea93ba40b7c102c4eb12ed7256d32ed920d1f Mon Sep 17 00:00:00 2001 From: "Dhanji R. Prasanna" Date: Mon, 1 Dec 2025 14:52:10 +1100 Subject: [PATCH] token counting bugfix --- crates/g3-core/src/lib.rs | 20 +- crates/g3-core/tests/test_token_counting.rs | 201 +++++++++++++------- 2 files changed, 147 insertions(+), 74 deletions(-) diff --git a/crates/g3-core/src/lib.rs b/crates/g3-core/src/lib.rs index baacecc..b6bf619 100644 --- a/crates/g3-core/src/lib.rs +++ b/crates/g3-core/src/lib.rs @@ -340,14 +340,18 @@ impl ContextWindow { } /// Update token usage from provider response + /// NOTE: This only updates cumulative_tokens (total API usage tracking). + /// It does NOT update used_tokens because: + /// 1. prompt_tokens represents the ENTIRE context sent to API (already tracked via add_message) + /// 2. completion_tokens will be tracked when the assistant message is added via add_message + /// Adding total_tokens here would cause double/triple counting and break the 80% threshold check. pub fn update_usage_from_response(&mut self, usage: &g3_providers::Usage) { - // Add the tokens from this response to our running total - // The usage.total_tokens represents tokens used in this single API call - self.used_tokens += usage.total_tokens; + // Only update cumulative tokens for API usage tracking + // Do NOT update used_tokens - that's tracked via add_message to avoid double counting self.cumulative_tokens += usage.total_tokens; debug!( - "Added {} tokens from provider response (used: {}/{}, cumulative: {})", + "Updated cumulative tokens: {} (used: {}/{}, cumulative: {})", usage.total_tokens, self.used_tokens, self.total_tokens, self.cumulative_tokens ); } @@ -371,12 +375,14 @@ impl ContextWindow { self.update_usage_from_response(usage); } - /// Update cumulative token usage (for streaming) + /// Update cumulative token usage (for streaming) when no provider usage data is available + /// NOTE: This only updates cumulative_tokens, not used_tokens. + /// The assistant message will be added via add_message which tracks used_tokens. pub fn add_streaming_tokens(&mut self, new_tokens: u32) { - self.used_tokens += new_tokens; + // Only update cumulative tokens - used_tokens is tracked via add_message self.cumulative_tokens += new_tokens; debug!( - "Added {} streaming tokens (used: {}/{}, cumulative: {})", + "Updated cumulative streaming tokens: {} (used: {}/{}, cumulative: {})", new_tokens, self.used_tokens, self.total_tokens, self.cumulative_tokens ); } diff --git a/crates/g3-core/tests/test_token_counting.rs b/crates/g3-core/tests/test_token_counting.rs index 02646fa..21f62dd 100644 --- a/crates/g3-core/tests/test_token_counting.rs +++ b/crates/g3-core/tests/test_token_counting.rs @@ -1,103 +1,170 @@ use g3_core::ContextWindow; -use g3_providers::Usage; +use g3_providers::{Message, MessageRole, Usage}; +/// Test that used_tokens is tracked via add_message, not update_usage_from_response. +/// This is critical for the 80% summarization threshold to work correctly. #[test] -fn test_token_accumulation() { +fn test_used_tokens_tracked_via_messages() { let mut window = ContextWindow::new(10000); - // First API call: 100 prompt + 50 completion = 150 total - let usage1 = Usage { + // Add a user message - this should update used_tokens + let user_msg = Message::new(MessageRole::User, "Hello, how are you?".to_string()); + window.add_message(user_msg); + + // used_tokens should be non-zero after adding a message + assert!(window.used_tokens > 0, "used_tokens should increase after add_message"); + let tokens_after_user_msg = window.used_tokens; + + // Add an assistant message + let assistant_msg = Message::new(MessageRole::Assistant, "I'm doing well, thank you!".to_string()); + window.add_message(assistant_msg); + + // used_tokens should increase further + assert!(window.used_tokens > tokens_after_user_msg, "used_tokens should increase after adding assistant message"); +} + +/// Test that update_usage_from_response only updates cumulative_tokens, not used_tokens. +/// This prevents double-counting which was causing the 80% threshold to be reached at 200%+. +#[test] +fn test_update_usage_only_affects_cumulative() { + let mut window = ContextWindow::new(10000); + + // Initial state + assert_eq!(window.used_tokens, 0); + assert_eq!(window.cumulative_tokens, 0); + + // Simulate API response with usage data + let usage = Usage { prompt_tokens: 100, completion_tokens: 50, total_tokens: 150, }; - window.update_usage_from_response(&usage1); - assert_eq!(window.used_tokens, 150, "First call should have 150 tokens"); - assert_eq!(window.cumulative_tokens, 150, "Cumulative should be 150"); + window.update_usage_from_response(&usage); - // Second API call: 200 prompt + 75 completion = 275 total + // used_tokens should NOT change - it's tracked via add_message + assert_eq!(window.used_tokens, 0, "used_tokens should not be updated by update_usage_from_response"); + + // cumulative_tokens SHOULD be updated for API usage tracking + assert_eq!(window.cumulative_tokens, 150, "cumulative_tokens should track total API usage"); + + // Another API call let usage2 = Usage { prompt_tokens: 200, completion_tokens: 75, total_tokens: 275, }; window.update_usage_from_response(&usage2); - assert_eq!( - window.used_tokens, 425, - "Second call should accumulate to 425 tokens" - ); - assert_eq!(window.cumulative_tokens, 425, "Cumulative should be 425"); - // Third API call with SMALLER token count: 50 prompt + 25 completion = 75 total - let usage3 = Usage { - prompt_tokens: 50, - completion_tokens: 25, - total_tokens: 75, - }; - window.update_usage_from_response(&usage3); - assert_eq!( - window.used_tokens, 500, - "Third call should accumulate to 500 tokens" - ); - assert_eq!(window.cumulative_tokens, 500, "Cumulative should be 500"); - - // Verify tokens never decrease - assert!( - window.used_tokens >= 425, - "Token count should never decrease!" - ); + // used_tokens still unchanged + assert_eq!(window.used_tokens, 0, "used_tokens should remain unchanged"); + + // cumulative_tokens accumulates + assert_eq!(window.cumulative_tokens, 425, "cumulative_tokens should accumulate"); } +/// Test that add_streaming_tokens only updates cumulative_tokens. +/// The assistant message will be added via add_message which tracks used_tokens. #[test] -fn test_add_streaming_tokens() { +fn test_add_streaming_tokens_only_affects_cumulative() { let mut window = ContextWindow::new(10000); - // Add some streaming tokens + // Add streaming tokens (fallback when no usage data available) window.add_streaming_tokens(100); - assert_eq!(window.used_tokens, 100); - assert_eq!(window.cumulative_tokens, 100); + + // used_tokens should NOT change + assert_eq!(window.used_tokens, 0, "used_tokens should not be updated by add_streaming_tokens"); + + // cumulative_tokens SHOULD be updated + assert_eq!(window.cumulative_tokens, 100, "cumulative_tokens should be updated"); - // Add more + // Add more streaming tokens window.add_streaming_tokens(50); - assert_eq!(window.used_tokens, 150); + assert_eq!(window.used_tokens, 0); assert_eq!(window.cumulative_tokens, 150); - - // Now update from provider response - let usage = Usage { - prompt_tokens: 80, - completion_tokens: 40, - total_tokens: 120, - }; - window.update_usage_from_response(&usage); - - // Should ADD to existing, not replace - assert_eq!(window.used_tokens, 270, "Should add 120 to existing 150"); - assert_eq!(window.cumulative_tokens, 270); } +/// Test percentage calculation is based on used_tokens (actual context content). #[test] -fn test_percentage_calculation() { +fn test_percentage_based_on_used_tokens() { let mut window = ContextWindow::new(1000); - // Add tokens via provider response + // Initially 0% + assert_eq!(window.percentage_used(), 0.0); + assert_eq!(window.remaining_tokens(), 1000); + + // Add messages to increase used_tokens + // A message with ~100 chars should be roughly 25-30 tokens + let msg = Message::new(MessageRole::User, "x".repeat(400)); // ~100 tokens estimated + window.add_message(msg); + + // Percentage should be based on used_tokens + let percentage = window.percentage_used(); + assert!(percentage > 0.0, "percentage should be > 0 after adding message"); + assert!(percentage < 100.0, "percentage should be < 100"); + + // remaining_tokens should decrease + assert!(window.remaining_tokens() < 1000, "remaining tokens should decrease"); +} + +/// Test that the 80% summarization threshold works correctly. +/// This was the original bug - used_tokens was being double/triple counted. +#[test] +fn test_should_summarize_threshold() { + let mut window = ContextWindow::new(1000); + + // Add messages until we approach 80% + // Each message of ~320 chars is roughly 80 tokens (at 4 chars/token) + for _ in 0..9 { + let msg = Message::new(MessageRole::User, "x".repeat(320)); + window.add_message(msg); + } + + // Should be around 720 tokens (72%) - not yet at threshold + // Note: actual token count depends on estimation algorithm + let percentage = window.percentage_used(); + println!("After 9 messages: {}% used ({} tokens)", percentage, window.used_tokens); + + // Add one more message to push over 80% + let msg = Message::new(MessageRole::User, "x".repeat(320)); + window.add_message(msg); + + let percentage_after = window.percentage_used(); + println!("After 10 messages: {}% used ({} tokens)", percentage_after, window.used_tokens); + + // Now should_summarize should return true if we're at 80%+ + if percentage_after >= 80.0 { + assert!(window.should_summarize(), "should_summarize should be true at 80%+"); + } +} + +/// Test that cumulative_tokens and used_tokens are independent. +#[test] +fn test_cumulative_vs_used_independence() { + let mut window = ContextWindow::new(10000); + + // Add a message (affects used_tokens) + let msg = Message::new(MessageRole::User, "Hello world".to_string()); + window.add_message(msg); + let used_after_msg = window.used_tokens; + let cumulative_after_msg = window.cumulative_tokens; + + // Both should be equal at this point (message adds to both) + assert_eq!(used_after_msg, cumulative_after_msg); + + // Now simulate API response (only affects cumulative_tokens) let usage = Usage { - prompt_tokens: 150, - completion_tokens: 100, - total_tokens: 250, + prompt_tokens: 500, + completion_tokens: 200, + total_tokens: 700, }; window.update_usage_from_response(&usage); - assert_eq!(window.percentage_used(), 25.0); - assert_eq!(window.remaining_tokens(), 750); - - // Add more tokens - let usage2 = Usage { - prompt_tokens: 300, - completion_tokens: 200, - total_tokens: 500, - }; - window.update_usage_from_response(&usage2); - - assert_eq!(window.percentage_used(), 75.0); - assert_eq!(window.remaining_tokens(), 250); + // used_tokens unchanged + assert_eq!(window.used_tokens, used_after_msg, "used_tokens should not change from API response"); + + // cumulative_tokens increased + assert_eq!(window.cumulative_tokens, cumulative_after_msg + 700, "cumulative_tokens should increase"); + + // They should now be different + assert!(window.cumulative_tokens > window.used_tokens, "cumulative should be greater than used"); }