diff --git a/crates/g3-core/src/lib.rs b/crates/g3-core/src/lib.rs index e5457f8..f6fa290 100644 --- a/crates/g3-core/src/lib.rs +++ b/crates/g3-core/src/lib.rs @@ -1376,9 +1376,12 @@ impl Agent { /// Check if a tool call is a duplicate of the last tool call in the previous assistant message. /// Returns Some("DUP IN MSG") if it's a duplicate, None otherwise. - fn check_duplicate_in_previous_message(&self, tool_call: &ToolCall) -> Option { - // Find the most recent assistant message - for msg in self.context_window.conversation_history.iter().rev() { + fn check_duplicate_in_previous_message(&self, tool_call: &ToolCall, history_cutoff: usize) -> Option { + // Find the most recent assistant message, but only look at messages that + // existed before the current streaming iteration started. This prevents + // tool calls within the same response from being marked as DUP IN MSG + // against messages added during the current iteration's tool execution. + for msg in self.context_window.conversation_history[..history_cutoff].iter().rev() { if !matches!(msg.role, MessageRole::Assistant) { continue; } @@ -1386,13 +1389,27 @@ impl Agent { // Check structured tool_calls first (native tool calling) if !msg.tool_calls.is_empty() { if let Some(last_tc) = msg.tool_calls.last() { - let prev = ToolCall { - tool: last_tc.name.clone(), - args: last_tc.input.clone(), - id: last_tc.id.clone(), - }; - if streaming::are_tool_calls_duplicate(&prev, tool_call) { - return Some("DUP IN MSG".to_string()); + // When both tool calls have non-empty IDs from native providers + // (e.g. Anthropic "toolu_*"), each API invocation gets a unique ID. + // Different IDs mean distinct invocations — not duplicates. + // This is critical for polling tools like research_status that are + // called repeatedly with identical arguments across turns. + if !last_tc.id.is_empty() && !tool_call.id.is_empty() { + if last_tc.id == tool_call.id { + return Some("DUP IN MSG".to_string()); + } + // Different IDs = different invocations, not a duplicate + } else { + // Fallback for JSON-based tool calls (embedded models) where + // IDs are auto-generated and not meaningful for dedup. + let prev = ToolCall { + tool: last_tc.name.clone(), + args: last_tc.input.clone(), + id: last_tc.id.clone(), + }; + if streaming::are_tool_calls_duplicate(&prev, tool_call) { + return Some("DUP IN MSG".to_string()); + } } } // Only check the most recent assistant message @@ -2280,6 +2297,12 @@ Skip if nothing new. Be brief."#; // Create fresh iteration state for this streaming iteration let mut iter = streaming::IterationState::new(); + // Snapshot history length before this iteration. Used by DUP IN MSG + // detection to avoid comparing against messages added during this + // iteration's own tool executions (which would false-positive on + // multi-tool responses where the model issues the same tool twice). + let history_len_before_iteration = self.context_window.conversation_history.len(); + while let Some(chunk_result) = stream.next().await { match chunk_result { Ok(chunk) => { @@ -2356,9 +2379,19 @@ Skip if nothing new. Be brief."#; // Always process all tool calls - they will be executed after stream ends // De-duplicate tool calls (sequential duplicates in chunk + duplicates from previous message) + let last_executed_in_iter = iter.last_executed_tool.clone(); let deduplicated_tools = streaming::deduplicate_tool_calls(completed_tools, |tc| { - self.check_duplicate_in_previous_message(tc) + // First check against the last tool executed in this + // iteration (catches duplicates across chunks within + // the same streaming response). + if let Some(ref last) = last_executed_in_iter { + if streaming::are_tool_calls_duplicate(last, tc) { + return Some("DUP IN ITER".to_string()); + } + } + // Then check against messages from before this iteration + self.check_duplicate_in_previous_message(tc, history_len_before_iteration) }); // Process each tool call @@ -2640,6 +2673,7 @@ Skip if nothing new. Be brief."#; // 2. At the end when no tools were executed (handled in the "no tool executed" branch) iter.tool_executed = true; + iter.last_executed_tool = Some(tool_call.clone()); state.any_tool_executed = true; // Track across all iterations // Reset the JSON tool call filter state after each tool execution diff --git a/crates/g3-core/src/streaming.rs b/crates/g3-core/src/streaming.rs index 6984df2..450e4c1 100644 --- a/crates/g3-core/src/streaming.rs +++ b/crates/g3-core/src/streaming.rs @@ -177,6 +177,8 @@ pub struct IterationState { pub raw_chunks: Vec, pub accumulated_usage: Option, pub stream_stop_reason: Option, + /// Last tool call executed in this iteration (for cross-chunk dedup within same response) + pub last_executed_tool: Option, } impl IterationState { @@ -189,6 +191,7 @@ impl IterationState { raw_chunks: Vec::new(), accumulated_usage: None, stream_stop_reason: None, + last_executed_tool: None, } } diff --git a/crates/g3-core/tests/end_of_turn_behavior_test.rs b/crates/g3-core/tests/end_of_turn_behavior_test.rs index d50338f..32c54f8 100644 --- a/crates/g3-core/tests/end_of_turn_behavior_test.rs +++ b/crates/g3-core/tests/end_of_turn_behavior_test.rs @@ -118,6 +118,13 @@ mod timing_footer { mod duplicate_detection { use super::*; + fn make_tool_call_with_id(tool: &str, args: serde_json::Value, id: &str) -> ToolCall { + ToolCall { + tool: tool.to_string(), + args, + id: id.to_string(), + } + } fn make_tool_call(tool: &str, args: serde_json::Value) -> ToolCall { ToolCall { tool: tool.to_string(), @@ -190,6 +197,48 @@ mod duplicate_detection { assert!(!are_tool_calls_duplicate(&tc1, &tc2)); } + + // ========================================================================= + // ID-ignorant deduplication tests (are_tool_calls_duplicate is name+args only) + // ID-aware cross-message dedup lives in check_duplicate_in_previous_message() + // and is tested via integration tests in mock_provider_integration_test.rs + // ========================================================================= + + /// Test: are_tool_calls_duplicate ignores IDs — same name+args = duplicate + /// regardless of ID. The ID-aware logic is in check_duplicate_in_previous_message. + #[test] + fn test_different_ids_same_args_still_duplicate_at_chunk_level() { + let tc1 = make_tool_call_with_id( + "research_status", + serde_json::json!({}), + "toolu_01ABC", + ); + let tc2 = make_tool_call_with_id( + "research_status", + serde_json::json!({}), + "toolu_01DEF", + ); + + // Same name+args = duplicate (IDs are ignored at this level) + assert!(are_tool_calls_duplicate(&tc1, &tc2)); + } + + /// Test: Different IDs with different args are NOT duplicates. + #[test] + fn test_different_ids_different_args_not_duplicate() { + let tc1 = make_tool_call_with_id( + "research_status", + serde_json::json!({"research_id": "r_123"}), + "toolu_01ABC", + ); + let tc2 = make_tool_call_with_id( + "research_status", + serde_json::json!({"research_id": "r_456"}), + "toolu_01DEF", + ); + + assert!(!are_tool_calls_duplicate(&tc1, &tc2)); + } } // ============================================================================= diff --git a/crates/g3-core/tests/mock_provider_integration_test.rs b/crates/g3-core/tests/mock_provider_integration_test.rs index 037f934..32dad1e 100644 --- a/crates/g3-core/tests/mock_provider_integration_test.rs +++ b/crates/g3-core/tests/mock_provider_integration_test.rs @@ -1129,6 +1129,65 @@ async fn test_cross_turn_same_tool_call_executes() { assert_eq!(tool_result_count, 2, "Should have 2 tool results (one per turn), got {}", tool_result_count); } +/// Test: Native polling tool (research_status) called with identical args across +/// auto-continue iterations must NOT be deduplicated. +/// +/// Reproduces the exact bug: model calls research_status in iteration 1, gets the +/// tool result, auto-continues to iteration 2, and calls research_status again with +/// identical args. Each Anthropic API response assigns a unique tool_use ID (toolu_*). +/// The old dedup logic compared only name+args, ignoring IDs, so the second call was +/// marked DUP IN MSG and skipped. With no tool executed and no text content, the +/// stream errored with "No response received from the model." +#[tokio::test] +async fn test_native_polling_tool_not_deduplicated_across_turns() { + let provider = MockProvider::new() + .with_native_tool_calling(true) + // Iteration 1: model calls research_status (gets unique ID) + .with_response(MockResponse::native_tool_call( + "research_status", + serde_json::json!({}), + )) + // Iteration 2 (auto-continue): model calls research_status AGAIN + // with identical args but a DIFFERENT unique ID + .with_response(MockResponse::native_tool_call( + "research_status", + serde_json::json!({}), + )) + // Iteration 3 (auto-continue): model produces text, ending the turn + .with_default_response(MockResponse::text("Research complete.")); + + let (mut agent, _temp_dir) = create_agent_with_mock(provider).await; + + // Single execute_task: model calls research_status twice via auto-continue, + // then produces text. Before the fix, the second research_status call would + // be falsely deduplicated as DUP IN MSG, causing "No response received". + let result = agent.execute_task("Check research status", None, false).await; + assert!(result.is_ok(), "Task should succeed: {:?}", result.err()); + + let history = &agent.get_context_window().conversation_history; + + // Both research_status calls should have produced a tool result + let tool_result_count = history + .iter() + .filter(|m| matches!(m.role, MessageRole::User) && m.content.contains("Tool result:")) + .count(); + assert_eq!( + tool_result_count, 2, + "Both research_status calls should execute (not deduplicated across iterations). Got {} tool results", + tool_result_count + ); + + // Verify both tool calls were stored with different IDs + let research_tool_calls: Vec<_> = history + .iter() + .filter(|m| matches!(m.role, MessageRole::Assistant)) + .flat_map(|m| m.tool_calls.iter()) + .filter(|tc| tc.name == "research_status") + .collect(); + assert_eq!(research_tool_calls.len(), 2, "Should have 2 research_status tool calls stored"); + assert_ne!(research_tool_calls[0].id, research_tool_calls[1].id, "Tool call IDs should differ"); +} + /// Test: Three identical tool calls in one response - first executes, rest deduped /// /// Boundary case: model stutters three times.