diff --git a/crates/g3-core/src/lib.rs b/crates/g3-core/src/lib.rs index 9cf9da1..834780f 100644 --- a/crates/g3-core/src/lib.rs +++ b/crates/g3-core/src/lib.rs @@ -2368,7 +2368,10 @@ Skip if nothing new. Be brief."#; let already_displayed_chars = iter.current_response.chars().count(); let text_content = iter.parser.get_text_content(); let clean_content = streaming::clean_llm_tokens(&text_content); - let raw_content_for_log = clean_content.clone(); + // Use only the text before tool calls for the log message. + // This prevents duplicate tool call JSON from being stored + // in the assistant message when the LLM stutters. + let raw_content_for_log = streaming::clean_llm_tokens(iter.parser.get_text_before_tool_calls()); let filtered_content = self.ui_writer.filter_json_tool_calls(&clean_content); let final_display_content = filtered_content.trim(); diff --git a/crates/g3-core/src/streaming_parser.rs b/crates/g3-core/src/streaming_parser.rs index a9712bd..fc3f3ec 100644 --- a/crates/g3-core/src/streaming_parser.rs +++ b/crates/g3-core/src/streaming_parser.rs @@ -481,6 +481,20 @@ impl StreamingToolParser { &self.text_buffer } + /// Get only the text content before the first JSON tool call. + /// Returns the full buffer if no tool calls are found. + /// This is used to save the "preamble" text (e.g. "Let me run that.") + /// without including raw duplicate tool call JSON in the assistant message. + pub fn get_text_before_tool_calls(&self) -> &str { + let fence_ranges = find_code_fence_ranges(&self.text_buffer); + if let Some(pos) = find_first_tool_call_start(&self.text_buffer) { + if !is_position_in_fence_ranges(pos, &fence_ranges) { + return self.text_buffer[..pos].trim_end(); + } + } + &self.text_buffer + } + pub fn get_content_before_position(&self, pos: usize) -> String { if pos <= self.text_buffer.len() { self.text_buffer[..pos].to_string() diff --git a/crates/g3-core/tests/mock_provider_integration_test.rs b/crates/g3-core/tests/mock_provider_integration_test.rs index 6ebedee..a122db5 100644 --- a/crates/g3-core/tests/mock_provider_integration_test.rs +++ b/crates/g3-core/tests/mock_provider_integration_test.rs @@ -951,3 +951,234 @@ async fn test_plan_approval_gate_blocks_unapproved_changes() { assert!(has_blocking_message, "Should have blocking message in context"); } + +/// Test: JSON fallback stuttered duplicate tool calls - first executes, second deduped +/// +/// Reproduces the exact failure from session create_a_plan_generate_a_4ef735ceedecfdd: +/// The LLM emits two identical JSON tool calls as text content (non-native). +/// The first should be executed, the second should be deduplicated. +/// After execution, the follow-up response should work normally. +#[tokio::test] +async fn test_json_fallback_stuttered_duplicate_tool_calls() { + // Provider WITHOUT native tool calling - uses JSON fallback + // Response 1: Two identical JSON tool calls in text (the stutter) + // Response 2: Follow-up text after tool execution + let provider = MockProvider::new() + .with_native_tool_calling(false) + .with_response(MockResponse::text_with_duplicate_json_tools( + "shell", + serde_json::json!({"command": "echo stutter_test"}), + )) + .with_default_response(MockResponse::text("The command completed successfully.")); + + let (mut agent, _temp_dir) = create_agent_with_mock(provider).await; + + let result = agent.execute_task("Run a command", None, false).await; + assert!(result.is_ok(), "Task should succeed: {:?}", result.err()); + + let history = &agent.get_context_window().conversation_history; + + // Should have exactly one tool result (not two) + let tool_result_count = history + .iter() + .filter(|m| matches!(m.role, MessageRole::User) && m.content.contains("Tool result:") && m.content.contains("stutter_test")) + .count(); + assert_eq!(tool_result_count, 1, "Should have exactly 1 tool result, got {}", tool_result_count); + + // Should have the follow-up response + let has_followup = history + .iter() + .any(|m| matches!(m.role, MessageRole::Assistant) && m.content.contains("completed successfully")); + assert!(has_followup, "Follow-up response should be in context"); + + // CRITICAL: Verify the assistant message does NOT contain raw duplicate JSON tool calls + // This is the actual bug - the model's stuttered response gets stored as raw JSON content + // which confuses the model on subsequent turns. + for (i, msg) in history.iter().enumerate() { + if matches!(msg.role, MessageRole::Assistant) { + // Count how many times the tool call JSON pattern appears in this message + let tool_pattern = r#""tool": "shell""#; + let count = msg.content.matches(tool_pattern).count(); + assert!( + count <= 1, + "Assistant message [{}] contains {} duplicate tool call JSON patterns (should be 0 or 1): {}", + i, count, &msg.content[..msg.content.len().min(200)] + ); + } + } +} + +/// Test: JSON fallback stuttered duplicate with text prefix +/// +/// Same as above but with text before the tool calls: +/// "Let me run that.\n\n{tool...}\n\n{tool...}" +#[tokio::test] +async fn test_json_fallback_stuttered_duplicate_with_text_prefix() { + let provider = MockProvider::new() + .with_native_tool_calling(false) + .with_response(MockResponse::text_with_duplicate_json_tools_prefixed( + "Let me run that command.", + "shell", + serde_json::json!({"command": "echo prefixed_stutter"}), + )) + .with_default_response(MockResponse::text("Done.")); + + let (mut agent, _temp_dir) = create_agent_with_mock(provider).await; + + let result = agent.execute_task("Run a command", None, false).await; + assert!(result.is_ok(), "Task should succeed: {:?}", result.err()); + + let history = &agent.get_context_window().conversation_history; + + // Should have exactly one tool result + let tool_result_count = history + .iter() + .filter(|m| matches!(m.role, MessageRole::User) && m.content.contains("Tool result:") && m.content.contains("prefixed_stutter")) + .count(); + assert_eq!(tool_result_count, 1, "Should have exactly 1 tool result, got {}", tool_result_count); +} + +/// Test: Two different JSON tool calls in one response should BOTH execute +/// +/// When the LLM emits two different tool calls (not duplicates), both should execute. +/// This is the boundary case - different args means not a duplicate. +#[tokio::test] +async fn test_json_fallback_two_different_tool_calls_both_execute() { + // Manually construct two different tool calls as text chunks + let tool1 = r#"{"tool": "shell", "args": {"command": "echo first_call"}}"#; + let tool2 = r#"{"tool": "shell", "args": {"command": "echo second_call"}}"#; + + let provider = MockProvider::new() + .with_native_tool_calling(false) + .with_response(MockResponse::custom( + vec![ + MockChunk::content(tool1), + MockChunk::content("\n\n"), + MockChunk::content(tool2), + MockChunk::finished("end_turn"), + ], + g3_providers::Usage { + prompt_tokens: 100, + completion_tokens: 50, + total_tokens: 150, + cache_creation_tokens: 0, + cache_read_tokens: 0, + }, + )) + .with_default_response(MockResponse::text("Both commands ran.")); + + let (mut agent, _temp_dir) = create_agent_with_mock(provider).await; + + let result = agent.execute_task("Run two commands", None, false).await; + assert!(result.is_ok(), "Task should succeed: {:?}", result.err()); + + let history = &agent.get_context_window().conversation_history; + + // Should have tool results for both calls + let first_result = history + .iter() + .any(|m| matches!(m.role, MessageRole::User) && m.content.contains("Tool result:") && m.content.contains("first_call")); + let second_result = history + .iter() + .any(|m| matches!(m.role, MessageRole::User) && m.content.contains("Tool result:") && m.content.contains("second_call")); + + assert!(first_result, "First tool call should have been executed"); + assert!(second_result, "Second (different) tool call should also have been executed"); +} + +/// Test: Cross-turn same tool call should still execute (not be deduped as DUP IN MSG) +/// +/// Scenario: Turn 1 - model emits tool call A (executed, result returned). +/// Turn 2 (new user message) - model emits tool call A again (stuttered x2). +/// The first should execute (it's a new turn with a new user message between), +/// the second should be deduped as DUP IN CHUNK. +#[tokio::test] +async fn test_cross_turn_same_tool_call_executes() { + let provider = MockProvider::new() + .with_native_tool_calling(false) + .with_response(MockResponse::text_with_json_tool( + "Running the command.", + "shell", + serde_json::json!({"command": "echo cross_turn_test"}), + )) + .with_response(MockResponse::text("First run complete.")) + // Second task will get the stuttered duplicate + .with_response(MockResponse::text_with_duplicate_json_tools( + "shell", + serde_json::json!({"command": "echo cross_turn_test"}), + )) + .with_default_response(MockResponse::text("Second run complete.")); + + let (mut agent, _temp_dir) = create_agent_with_mock(provider).await; + + // First task + let result1 = agent.execute_task("Run the command", None, false).await; + assert!(result1.is_ok(), "First task should succeed: {:?}", result1.err()); + + // Second task - same tool call but new user message + let result2 = agent.execute_task("Run it again", None, false).await; + assert!(result2.is_ok(), "Second task should succeed: {:?}", result2.err()); + + let history = &agent.get_context_window().conversation_history; + + // Should have two tool results (one per task) + let tool_result_count = history + .iter() + .filter(|m| matches!(m.role, MessageRole::User) && m.content.contains("Tool result:") && m.content.contains("cross_turn_test")) + .count(); + assert_eq!(tool_result_count, 2, "Should have 2 tool results (one per turn), got {}", tool_result_count); +} + +/// Test: Three identical tool calls in one response - first executes, rest deduped +/// +/// Boundary case: model stutters three times. +#[tokio::test] +async fn test_triple_stuttered_tool_calls() { + let tool_str = r#"{"tool": "shell", "args": {"command": "echo triple"}}"#; + let provider = MockProvider::new() + .with_native_tool_calling(false) + .with_response(MockResponse::custom( + vec![ + MockChunk::content(tool_str), + MockChunk::content("\n\n"), + MockChunk::content(tool_str), + MockChunk::content("\n\n"), + MockChunk::content(tool_str), + MockChunk::finished("end_turn"), + ], + g3_providers::Usage { + prompt_tokens: 100, + completion_tokens: 50, + total_tokens: 150, + cache_creation_tokens: 0, + cache_read_tokens: 0, + }, + )) + .with_default_response(MockResponse::text("Done.")); + + let (mut agent, _temp_dir) = create_agent_with_mock(provider).await; + + let result = agent.execute_task("Run command", None, false).await; + assert!(result.is_ok(), "Task should succeed: {:?}", result.err()); + + let history = &agent.get_context_window().conversation_history; + + // Should have exactly one tool result + let tool_result_count = history + .iter() + .filter(|m| matches!(m.role, MessageRole::User) && m.content.contains("Tool result:") && m.content.contains("triple")) + .count(); + assert_eq!(tool_result_count, 1, "Should have exactly 1 tool result, got {}", tool_result_count); + + // Verify no assistant message has more than 1 tool call JSON pattern + for (i, msg) in history.iter().enumerate() { + if matches!(msg.role, MessageRole::Assistant) { + let count = msg.content.matches(r#""tool": "shell""#).count(); + assert!( + count <= 1, + "Assistant message [{}] contains {} tool call patterns (should be 0 or 1)", + i, count + ); + } + } +} diff --git a/crates/g3-providers/src/mock.rs b/crates/g3-providers/src/mock.rs index bfccb83..67ebe0a 100644 --- a/crates/g3-providers/src/mock.rs +++ b/crates/g3-providers/src/mock.rs @@ -231,7 +231,59 @@ impl MockResponse { } } - /// Create a response that gets cut off by max_tokens + /// Create a response with duplicate JSON tool calls in text content (non-native). + /// Mimics the LLM stuttering pattern where it emits the same tool call twice. + pub fn text_with_duplicate_json_tools(tool: &str, args: serde_json::Value) -> Self { + let args_str = serde_json::to_string(&args).unwrap(); + let tool_str = format!(r#"{{"tool": "{}", "args": {}}}"#, tool, args_str); + let full_content = format!("{}\n\n{}", &tool_str, &tool_str); + + Self { + chunks: vec![ + MockChunk::content(&tool_str), + MockChunk::content("\n\n"), + MockChunk::content(&tool_str), + MockChunk::finished("end_turn"), + ], + usage: Usage { + prompt_tokens: 100, + completion_tokens: full_content.len() as u32 / 4, + total_tokens: 100 + full_content.len() as u32 / 4, + cache_creation_tokens: 0, + cache_read_tokens: 0, + }, + } + } + + /// Create a response with text followed by duplicate JSON tool calls (non-native). + /// Mimics the pattern: "Let me run that.\n\n{tool...}\n\n{tool...}" + pub fn text_with_duplicate_json_tools_prefixed( + text: &str, + tool: &str, + args: serde_json::Value, + ) -> Self { + let args_str = serde_json::to_string(&args).unwrap(); + let tool_str = format!(r#"{{"tool": "{}", "args": {}}}"#, tool, args_str); + let full_content = format!("{}\n\n{}\n\n{}", text, &tool_str, &tool_str); + + Self { + chunks: vec![ + MockChunk::content(text), + MockChunk::content("\n\n"), + MockChunk::content(&tool_str), + MockChunk::content("\n\n"), + MockChunk::content(&tool_str), + MockChunk::finished("end_turn"), + ], + usage: Usage { + prompt_tokens: 100, + completion_tokens: full_content.len() as u32 / 4, + total_tokens: 100 + full_content.len() as u32 / 4, + cache_creation_tokens: 0, + cache_read_tokens: 0, + }, + } + } pub fn truncated(content: &str) -> Self { Self { chunks: vec![