From 2a4cd1f4d6518a007e67895fdcd2b6dae88d6032 Mon Sep 17 00:00:00 2001 From: "Dhanji R. Prasanna" Date: Tue, 10 Feb 2026 19:53:11 +1100 Subject: [PATCH] fix: strip duplicate tool call JSON from assistant messages when LLM stutters When the LLM emits identical JSON tool calls as text content (JSON fallback mode), the raw duplicate JSON was being stored in the assistant message in conversation history. This confused the model on subsequent turns, causing it to stall or repeat itself. Root cause: raw_content_for_log used get_text_content() which returns the full parser buffer including all duplicate tool call JSONs. Fix: Added get_text_before_tool_calls() to StreamingToolParser that returns only the text before the first JSON tool call. Changed raw_content_for_log to use this method so the assistant message only contains the preamble text + the single executed tool call. Added 5 integration tests covering stuttered duplicates, triple stutter, cross-turn dedup, and different-args boundary case. Added MockResponse helpers for simulating LLM stutter patterns. --- crates/g3-core/src/lib.rs | 5 +- crates/g3-core/src/streaming_parser.rs | 14 ++ .../tests/mock_provider_integration_test.rs | 231 ++++++++++++++++++ crates/g3-providers/src/mock.rs | 54 +++- 4 files changed, 302 insertions(+), 2 deletions(-) diff --git a/crates/g3-core/src/lib.rs b/crates/g3-core/src/lib.rs index 9cf9da1..834780f 100644 --- a/crates/g3-core/src/lib.rs +++ b/crates/g3-core/src/lib.rs @@ -2368,7 +2368,10 @@ Skip if nothing new. Be brief."#; let already_displayed_chars = iter.current_response.chars().count(); let text_content = iter.parser.get_text_content(); let clean_content = streaming::clean_llm_tokens(&text_content); - let raw_content_for_log = clean_content.clone(); + // Use only the text before tool calls for the log message. + // This prevents duplicate tool call JSON from being stored + // in the assistant message when the LLM stutters. + let raw_content_for_log = streaming::clean_llm_tokens(iter.parser.get_text_before_tool_calls()); let filtered_content = self.ui_writer.filter_json_tool_calls(&clean_content); let final_display_content = filtered_content.trim(); diff --git a/crates/g3-core/src/streaming_parser.rs b/crates/g3-core/src/streaming_parser.rs index a9712bd..fc3f3ec 100644 --- a/crates/g3-core/src/streaming_parser.rs +++ b/crates/g3-core/src/streaming_parser.rs @@ -481,6 +481,20 @@ impl StreamingToolParser { &self.text_buffer } + /// Get only the text content before the first JSON tool call. + /// Returns the full buffer if no tool calls are found. + /// This is used to save the "preamble" text (e.g. "Let me run that.") + /// without including raw duplicate tool call JSON in the assistant message. + pub fn get_text_before_tool_calls(&self) -> &str { + let fence_ranges = find_code_fence_ranges(&self.text_buffer); + if let Some(pos) = find_first_tool_call_start(&self.text_buffer) { + if !is_position_in_fence_ranges(pos, &fence_ranges) { + return self.text_buffer[..pos].trim_end(); + } + } + &self.text_buffer + } + pub fn get_content_before_position(&self, pos: usize) -> String { if pos <= self.text_buffer.len() { self.text_buffer[..pos].to_string() diff --git a/crates/g3-core/tests/mock_provider_integration_test.rs b/crates/g3-core/tests/mock_provider_integration_test.rs index 6ebedee..a122db5 100644 --- a/crates/g3-core/tests/mock_provider_integration_test.rs +++ b/crates/g3-core/tests/mock_provider_integration_test.rs @@ -951,3 +951,234 @@ async fn test_plan_approval_gate_blocks_unapproved_changes() { assert!(has_blocking_message, "Should have blocking message in context"); } + +/// Test: JSON fallback stuttered duplicate tool calls - first executes, second deduped +/// +/// Reproduces the exact failure from session create_a_plan_generate_a_4ef735ceedecfdd: +/// The LLM emits two identical JSON tool calls as text content (non-native). +/// The first should be executed, the second should be deduplicated. +/// After execution, the follow-up response should work normally. +#[tokio::test] +async fn test_json_fallback_stuttered_duplicate_tool_calls() { + // Provider WITHOUT native tool calling - uses JSON fallback + // Response 1: Two identical JSON tool calls in text (the stutter) + // Response 2: Follow-up text after tool execution + let provider = MockProvider::new() + .with_native_tool_calling(false) + .with_response(MockResponse::text_with_duplicate_json_tools( + "shell", + serde_json::json!({"command": "echo stutter_test"}), + )) + .with_default_response(MockResponse::text("The command completed successfully.")); + + let (mut agent, _temp_dir) = create_agent_with_mock(provider).await; + + let result = agent.execute_task("Run a command", None, false).await; + assert!(result.is_ok(), "Task should succeed: {:?}", result.err()); + + let history = &agent.get_context_window().conversation_history; + + // Should have exactly one tool result (not two) + let tool_result_count = history + .iter() + .filter(|m| matches!(m.role, MessageRole::User) && m.content.contains("Tool result:") && m.content.contains("stutter_test")) + .count(); + assert_eq!(tool_result_count, 1, "Should have exactly 1 tool result, got {}", tool_result_count); + + // Should have the follow-up response + let has_followup = history + .iter() + .any(|m| matches!(m.role, MessageRole::Assistant) && m.content.contains("completed successfully")); + assert!(has_followup, "Follow-up response should be in context"); + + // CRITICAL: Verify the assistant message does NOT contain raw duplicate JSON tool calls + // This is the actual bug - the model's stuttered response gets stored as raw JSON content + // which confuses the model on subsequent turns. + for (i, msg) in history.iter().enumerate() { + if matches!(msg.role, MessageRole::Assistant) { + // Count how many times the tool call JSON pattern appears in this message + let tool_pattern = r#""tool": "shell""#; + let count = msg.content.matches(tool_pattern).count(); + assert!( + count <= 1, + "Assistant message [{}] contains {} duplicate tool call JSON patterns (should be 0 or 1): {}", + i, count, &msg.content[..msg.content.len().min(200)] + ); + } + } +} + +/// Test: JSON fallback stuttered duplicate with text prefix +/// +/// Same as above but with text before the tool calls: +/// "Let me run that.\n\n{tool...}\n\n{tool...}" +#[tokio::test] +async fn test_json_fallback_stuttered_duplicate_with_text_prefix() { + let provider = MockProvider::new() + .with_native_tool_calling(false) + .with_response(MockResponse::text_with_duplicate_json_tools_prefixed( + "Let me run that command.", + "shell", + serde_json::json!({"command": "echo prefixed_stutter"}), + )) + .with_default_response(MockResponse::text("Done.")); + + let (mut agent, _temp_dir) = create_agent_with_mock(provider).await; + + let result = agent.execute_task("Run a command", None, false).await; + assert!(result.is_ok(), "Task should succeed: {:?}", result.err()); + + let history = &agent.get_context_window().conversation_history; + + // Should have exactly one tool result + let tool_result_count = history + .iter() + .filter(|m| matches!(m.role, MessageRole::User) && m.content.contains("Tool result:") && m.content.contains("prefixed_stutter")) + .count(); + assert_eq!(tool_result_count, 1, "Should have exactly 1 tool result, got {}", tool_result_count); +} + +/// Test: Two different JSON tool calls in one response should BOTH execute +/// +/// When the LLM emits two different tool calls (not duplicates), both should execute. +/// This is the boundary case - different args means not a duplicate. +#[tokio::test] +async fn test_json_fallback_two_different_tool_calls_both_execute() { + // Manually construct two different tool calls as text chunks + let tool1 = r#"{"tool": "shell", "args": {"command": "echo first_call"}}"#; + let tool2 = r#"{"tool": "shell", "args": {"command": "echo second_call"}}"#; + + let provider = MockProvider::new() + .with_native_tool_calling(false) + .with_response(MockResponse::custom( + vec![ + MockChunk::content(tool1), + MockChunk::content("\n\n"), + MockChunk::content(tool2), + MockChunk::finished("end_turn"), + ], + g3_providers::Usage { + prompt_tokens: 100, + completion_tokens: 50, + total_tokens: 150, + cache_creation_tokens: 0, + cache_read_tokens: 0, + }, + )) + .with_default_response(MockResponse::text("Both commands ran.")); + + let (mut agent, _temp_dir) = create_agent_with_mock(provider).await; + + let result = agent.execute_task("Run two commands", None, false).await; + assert!(result.is_ok(), "Task should succeed: {:?}", result.err()); + + let history = &agent.get_context_window().conversation_history; + + // Should have tool results for both calls + let first_result = history + .iter() + .any(|m| matches!(m.role, MessageRole::User) && m.content.contains("Tool result:") && m.content.contains("first_call")); + let second_result = history + .iter() + .any(|m| matches!(m.role, MessageRole::User) && m.content.contains("Tool result:") && m.content.contains("second_call")); + + assert!(first_result, "First tool call should have been executed"); + assert!(second_result, "Second (different) tool call should also have been executed"); +} + +/// Test: Cross-turn same tool call should still execute (not be deduped as DUP IN MSG) +/// +/// Scenario: Turn 1 - model emits tool call A (executed, result returned). +/// Turn 2 (new user message) - model emits tool call A again (stuttered x2). +/// The first should execute (it's a new turn with a new user message between), +/// the second should be deduped as DUP IN CHUNK. +#[tokio::test] +async fn test_cross_turn_same_tool_call_executes() { + let provider = MockProvider::new() + .with_native_tool_calling(false) + .with_response(MockResponse::text_with_json_tool( + "Running the command.", + "shell", + serde_json::json!({"command": "echo cross_turn_test"}), + )) + .with_response(MockResponse::text("First run complete.")) + // Second task will get the stuttered duplicate + .with_response(MockResponse::text_with_duplicate_json_tools( + "shell", + serde_json::json!({"command": "echo cross_turn_test"}), + )) + .with_default_response(MockResponse::text("Second run complete.")); + + let (mut agent, _temp_dir) = create_agent_with_mock(provider).await; + + // First task + let result1 = agent.execute_task("Run the command", None, false).await; + assert!(result1.is_ok(), "First task should succeed: {:?}", result1.err()); + + // Second task - same tool call but new user message + let result2 = agent.execute_task("Run it again", None, false).await; + assert!(result2.is_ok(), "Second task should succeed: {:?}", result2.err()); + + let history = &agent.get_context_window().conversation_history; + + // Should have two tool results (one per task) + let tool_result_count = history + .iter() + .filter(|m| matches!(m.role, MessageRole::User) && m.content.contains("Tool result:") && m.content.contains("cross_turn_test")) + .count(); + assert_eq!(tool_result_count, 2, "Should have 2 tool results (one per turn), got {}", tool_result_count); +} + +/// Test: Three identical tool calls in one response - first executes, rest deduped +/// +/// Boundary case: model stutters three times. +#[tokio::test] +async fn test_triple_stuttered_tool_calls() { + let tool_str = r#"{"tool": "shell", "args": {"command": "echo triple"}}"#; + let provider = MockProvider::new() + .with_native_tool_calling(false) + .with_response(MockResponse::custom( + vec![ + MockChunk::content(tool_str), + MockChunk::content("\n\n"), + MockChunk::content(tool_str), + MockChunk::content("\n\n"), + MockChunk::content(tool_str), + MockChunk::finished("end_turn"), + ], + g3_providers::Usage { + prompt_tokens: 100, + completion_tokens: 50, + total_tokens: 150, + cache_creation_tokens: 0, + cache_read_tokens: 0, + }, + )) + .with_default_response(MockResponse::text("Done.")); + + let (mut agent, _temp_dir) = create_agent_with_mock(provider).await; + + let result = agent.execute_task("Run command", None, false).await; + assert!(result.is_ok(), "Task should succeed: {:?}", result.err()); + + let history = &agent.get_context_window().conversation_history; + + // Should have exactly one tool result + let tool_result_count = history + .iter() + .filter(|m| matches!(m.role, MessageRole::User) && m.content.contains("Tool result:") && m.content.contains("triple")) + .count(); + assert_eq!(tool_result_count, 1, "Should have exactly 1 tool result, got {}", tool_result_count); + + // Verify no assistant message has more than 1 tool call JSON pattern + for (i, msg) in history.iter().enumerate() { + if matches!(msg.role, MessageRole::Assistant) { + let count = msg.content.matches(r#""tool": "shell""#).count(); + assert!( + count <= 1, + "Assistant message [{}] contains {} tool call patterns (should be 0 or 1)", + i, count + ); + } + } +} diff --git a/crates/g3-providers/src/mock.rs b/crates/g3-providers/src/mock.rs index bfccb83..67ebe0a 100644 --- a/crates/g3-providers/src/mock.rs +++ b/crates/g3-providers/src/mock.rs @@ -231,7 +231,59 @@ impl MockResponse { } } - /// Create a response that gets cut off by max_tokens + /// Create a response with duplicate JSON tool calls in text content (non-native). + /// Mimics the LLM stuttering pattern where it emits the same tool call twice. + pub fn text_with_duplicate_json_tools(tool: &str, args: serde_json::Value) -> Self { + let args_str = serde_json::to_string(&args).unwrap(); + let tool_str = format!(r#"{{"tool": "{}", "args": {}}}"#, tool, args_str); + let full_content = format!("{}\n\n{}", &tool_str, &tool_str); + + Self { + chunks: vec![ + MockChunk::content(&tool_str), + MockChunk::content("\n\n"), + MockChunk::content(&tool_str), + MockChunk::finished("end_turn"), + ], + usage: Usage { + prompt_tokens: 100, + completion_tokens: full_content.len() as u32 / 4, + total_tokens: 100 + full_content.len() as u32 / 4, + cache_creation_tokens: 0, + cache_read_tokens: 0, + }, + } + } + + /// Create a response with text followed by duplicate JSON tool calls (non-native). + /// Mimics the pattern: "Let me run that.\n\n{tool...}\n\n{tool...}" + pub fn text_with_duplicate_json_tools_prefixed( + text: &str, + tool: &str, + args: serde_json::Value, + ) -> Self { + let args_str = serde_json::to_string(&args).unwrap(); + let tool_str = format!(r#"{{"tool": "{}", "args": {}}}"#, tool, args_str); + let full_content = format!("{}\n\n{}\n\n{}", text, &tool_str, &tool_str); + + Self { + chunks: vec![ + MockChunk::content(text), + MockChunk::content("\n\n"), + MockChunk::content(&tool_str), + MockChunk::content("\n\n"), + MockChunk::content(&tool_str), + MockChunk::finished("end_turn"), + ], + usage: Usage { + prompt_tokens: 100, + completion_tokens: full_content.len() as u32 / 4, + total_tokens: 100 + full_content.len() as u32 / 4, + cache_creation_tokens: 0, + cache_read_tokens: 0, + }, + } + } pub fn truncated(content: &str) -> Self { Self { chunks: vec![