Add test documenting LLM duplicate text behavior

Adds test_llm_repeats_text_before_each_tool_call(), which documents the scenario where the LLM re-outputs the same preamble text before each tool call in a multi-tool response. Analysis showed this is LLM behavior, not a g3 bug: (1) each assistant message is correctly stored with a different tool call; (2) the duplicate display is the LLM choosing to repeat context; (3) storage is correct and the display accurately reflects the LLM output. Decision: accept as LLM behavior (Option B). Future LLM improvements may resolve this naturally without g3 code changes.
2026-01-19 18:44:01 +05:30
parent 6ff21a7d47
commit f4cce22db3
1 changed files with 75 additions and 0 deletions
--- a/crates/g3-core/tests/mock_provider_integration_test.rs
+++ b/crates/g3-core/tests/mock_provider_integration_test.rs
@@ -741,3 +741,78 @@ async fn test_token_counting_no_double_count() {
        final_percentage
    );
 }
+
+/// Test: an LLM that repeats the same preamble before each tool call has both copies stored.
+///
+/// Scenario from stress test session:
+/// 1. User asks for stress test
+/// 2. LLM outputs "Sure! Let me stress test..." + tool call 1
+/// 3. Tool 1 executes, result returned
+/// 4. LLM outputs "Sure! Let me stress test..." + tool call 2 (SAME TEXT!)
+/// 5. Tool 2 executes, result returned
+///
+/// The duplicate text is stored in context (correctly - they're different messages)
+/// and so shows up twice on screen; this is accepted as LLM behavior, not a g3 bug.
+///
+/// This test pins the storage behavior only; it makes no assertion about display output.
+#[tokio::test]
+async fn test_llm_repeats_text_before_each_tool_call() {
+    // Shared preamble that the mocked LLM emits verbatim at the start of both tool-call turns
+    let preamble = "Sure! Let me run some commands for you.\n\nHere's what I'll do:";
+    
+    let provider = MockProvider::new()
+        // First response: preamble + tool call 1 (the call is sent as JSON text in a content chunk)
+        .with_response(MockResponse::custom(
+            vec![
+                MockChunk::content(preamble),
+                MockChunk::content("\n\n"),
+                MockChunk::content(r#"{"tool": "shell", "args": {"command": "echo first"}}"#),
+                MockChunk::content("\n"),
+                MockChunk::finished("end_turn"),
+            ],
+            g3_providers::Usage {
+                prompt_tokens: 100,
+                completion_tokens: 50,
+                total_tokens: 150,
+            },
+        ))
+        // Second response: the SAME preamble, followed by a different tool call (tool call 2)
+        .with_response(MockResponse::custom(
+            vec![
+                MockChunk::content(preamble),  // Identical text to the first response's preamble
+                MockChunk::content("\n\n"),
+                MockChunk::content(r#"{"tool": "shell", "args": {"command": "echo second"}}"#),
+                MockChunk::content("\n"),
+                MockChunk::finished("end_turn"),
+            ],
+            g3_providers::Usage {
+                prompt_tokens: 150,
+                completion_tokens: 50,
+                total_tokens: 200,
+            },
+        ))
+        // Third response: plain-text final acknowledgment with no tool call
+        .with_response(MockResponse::text("Done! Both commands executed."));
+
+    let (mut agent, _temp_dir) = create_agent_with_mock(provider).await;
+
+    let result = agent.execute_task("Run two commands", None, false).await;
+    assert!(result.is_ok(), "Task should succeed: {:?}", result.err());
+
+    // Inspect the stored conversation history (not the rendered output) for the repeated preamble
+    let history = &agent.get_context_window().conversation_history;
+    
+    // Count assistant messages whose content contains the opening words of the preamble
+    let preamble_count = history
+        .iter()
+        .filter(|m| matches!(m.role, MessageRole::Assistant) && m.content.contains("Sure! Let me run some commands"))
+        .count();
+    
+    // Expect exactly 2: one stored assistant message per tool-call response, each with the
+    // preamble. This is the accepted behavior; display-side deduplication is not checked here.
+    assert_eq!(
+        preamble_count, 2,
+        "Both assistant messages with preamble should be stored (current behavior). Got: {}",
+        preamble_count
+    );
+}