Fix research_status polling tool falsely deduplicated across auto-continue iterations

The dedup logic compared only tool name+args, ignoring the unique tool call IDs that native providers (Anthropic) assign to each invocation. When the model called research_status {} in iteration 1, auto-continued, and called it again in iteration 2 with identical args but a new ID, the second call was marked DUP IN MSG and skipped. With no tool executed and no text, the stream errored with 'No response received from the model.' Three-part fix: - ID-aware DUP IN MSG: check_duplicate_in_previous_message() uses tool call IDs when both are non-empty (different IDs = different invocations) - History cutoff: only checks messages from before the current iteration to prevent within-iteration false positives - DUP IN ITER: last_executed_tool on IterationState catches stuttered duplicates across chunks within the same response Regression test reproduces the exact bug (fails without fix, passes with).
2026-02-12 15:54:24 +11:00
parent 88d2b9592b
commit 68f8f13b38
4 changed files with 156 additions and 11 deletions
--- a/crates/g3-core/src/lib.rs
+++ b/crates/g3-core/src/lib.rs
@@ -1376,9 +1376,12 @@ impl<W: UiWriter> Agent<W> {

    /// Check if a tool call is a duplicate of the last tool call in the previous assistant message.
    /// Returns Some("DUP IN MSG") if it's a duplicate, None otherwise.
-    fn check_duplicate_in_previous_message(&self, tool_call: &ToolCall) -> Option<String> {
-        // Find the most recent assistant message
-        for msg in self.context_window.conversation_history.iter().rev() {
+    fn check_duplicate_in_previous_message(&self, tool_call: &ToolCall, history_cutoff: usize) -> Option<String> {
+        // Find the most recent assistant message, but only look at messages that
+        // existed before the current streaming iteration started. This prevents
+        // tool calls within the same response from being marked as DUP IN MSG
+        // against messages added during the current iteration's tool execution.
+        for msg in self.context_window.conversation_history[..history_cutoff].iter().rev() {
            if !matches!(msg.role, MessageRole::Assistant) {
                continue;
            }
@@ -1386,13 +1389,27 @@ impl<W: UiWriter> Agent<W> {
            // Check structured tool_calls first (native tool calling)
            if !msg.tool_calls.is_empty() {
                if let Some(last_tc) = msg.tool_calls.last() {
-                    let prev = ToolCall {
-                        tool: last_tc.name.clone(),
-                        args: last_tc.input.clone(),
-                        id: last_tc.id.clone(),
-                    };
-                    if streaming::are_tool_calls_duplicate(&prev, tool_call) {
-                        return Some("DUP IN MSG".to_string());
+                    // When both tool calls have non-empty IDs from native providers
+                    // (e.g. Anthropic "toolu_*"), each API invocation gets a unique ID.
+                    // Different IDs mean distinct invocations — not duplicates.
+                    // This is critical for polling tools like research_status that are
+                    // called repeatedly with identical arguments across turns.
+                    if !last_tc.id.is_empty() && !tool_call.id.is_empty() {
+                        if last_tc.id == tool_call.id {
+                            return Some("DUP IN MSG".to_string());
+                        }
+                        // Different IDs = different invocations, not a duplicate
+                    } else {
+                        // Fallback for JSON-based tool calls (embedded models) where
+                        // IDs are auto-generated and not meaningful for dedup.
+                        let prev = ToolCall {
+                            tool: last_tc.name.clone(),
+                            args: last_tc.input.clone(),
+                            id: last_tc.id.clone(),
+                        };
+                        if streaming::are_tool_calls_duplicate(&prev, tool_call) {
+                            return Some("DUP IN MSG".to_string());
+                        }
                    }
                }
                // Only check the most recent assistant message
@@ -2280,6 +2297,12 @@ Skip if nothing new. Be brief."#;
            // Create fresh iteration state for this streaming iteration
            let mut iter = streaming::IterationState::new();

+            // Snapshot history length before this iteration. Used by DUP IN MSG
+            // detection to avoid comparing against messages added during this
+            // iteration's own tool executions (which would false-positive on
+            // multi-tool responses where the model issues the same tool twice).
+            let history_len_before_iteration = self.context_window.conversation_history.len();
+
            while let Some(chunk_result) = stream.next().await {
                match chunk_result {
                    Ok(chunk) => {
@@ -2356,9 +2379,19 @@ Skip if nothing new. Be brief."#;
                        // Always process all tool calls - they will be executed after stream ends

                        // De-duplicate tool calls (sequential duplicates in chunk + duplicates from previous message)
+                        let last_executed_in_iter = iter.last_executed_tool.clone();
                        let deduplicated_tools =
                            streaming::deduplicate_tool_calls(completed_tools, |tc| {
-                                self.check_duplicate_in_previous_message(tc)
+                                // First check against the last tool executed in this
+                                // iteration (catches duplicates across chunks within
+                                // the same streaming response).
+                                if let Some(ref last) = last_executed_in_iter {
+                                    if streaming::are_tool_calls_duplicate(last, tc) {
+                                        return Some("DUP IN ITER".to_string());
+                                    }
+                                }
+                                // Then check against messages from before this iteration
+                                self.check_duplicate_in_previous_message(tc, history_len_before_iteration)
                            });

                        // Process each tool call
@@ -2640,6 +2673,7 @@ Skip if nothing new. Be brief."#;
                            // 2. At the end when no tools were executed (handled in the "no tool executed" branch)

                            iter.tool_executed = true;
+                            iter.last_executed_tool = Some(tool_call.clone());
                            state.any_tool_executed = true; // Track across all iterations

                            // Reset the JSON tool call filter state after each tool execution
--- a/crates/g3-core/src/streaming.rs
+++ b/crates/g3-core/src/streaming.rs
@@ -177,6 +177,8 @@ pub struct IterationState {
    pub raw_chunks: Vec<String>,
    pub accumulated_usage: Option<g3_providers::Usage>,
    pub stream_stop_reason: Option<String>,
+    /// Last tool call executed in this iteration (for cross-chunk dedup within same response)
+    pub last_executed_tool: Option<ToolCall>,
 }

 impl IterationState {
@@ -189,6 +191,7 @@ impl IterationState {
            raw_chunks: Vec::new(),
            accumulated_usage: None,
            stream_stop_reason: None,
+            last_executed_tool: None,
        }
    }

--- a/crates/g3-core/tests/end_of_turn_behavior_test.rs
+++ b/crates/g3-core/tests/end_of_turn_behavior_test.rs
@@ -118,6 +118,13 @@ mod timing_footer {
 mod duplicate_detection {
    use super::*;

+    fn make_tool_call_with_id(tool: &str, args: serde_json::Value, id: &str) -> ToolCall {
+        ToolCall {
+            tool: tool.to_string(),
+            args,
+            id: id.to_string(),
+        }
+    }
    fn make_tool_call(tool: &str, args: serde_json::Value) -> ToolCall {
        ToolCall {
            tool: tool.to_string(),
@@ -190,6 +197,48 @@ mod duplicate_detection {

        assert!(!are_tool_calls_duplicate(&tc1, &tc2));
    }
+
+    // =========================================================================
+    // ID-ignorant deduplication tests (are_tool_calls_duplicate is name+args only)
+    // ID-aware cross-message dedup lives in check_duplicate_in_previous_message()
+    // and is tested via integration tests in mock_provider_integration_test.rs
+    // =========================================================================
+
+    /// Test: are_tool_calls_duplicate ignores IDs — same name+args = duplicate
+    /// regardless of ID. The ID-aware logic is in check_duplicate_in_previous_message.
+    #[test]
+    fn test_different_ids_same_args_still_duplicate_at_chunk_level() {
+        let tc1 = make_tool_call_with_id(
+            "research_status",
+            serde_json::json!({}),
+            "toolu_01ABC",
+        );
+        let tc2 = make_tool_call_with_id(
+            "research_status",
+            serde_json::json!({}),
+            "toolu_01DEF",
+        );
+
+        // Same name+args = duplicate (IDs are ignored at this level)
+        assert!(are_tool_calls_duplicate(&tc1, &tc2));
+    }
+
+    /// Test: Different IDs with different args are NOT duplicates.
+    #[test]
+    fn test_different_ids_different_args_not_duplicate() {
+        let tc1 = make_tool_call_with_id(
+            "research_status",
+            serde_json::json!({"research_id": "r_123"}),
+            "toolu_01ABC",
+        );
+        let tc2 = make_tool_call_with_id(
+            "research_status",
+            serde_json::json!({"research_id": "r_456"}),
+            "toolu_01DEF",
+        );
+
+        assert!(!are_tool_calls_duplicate(&tc1, &tc2));
+    }
 }

 // =============================================================================
--- a/crates/g3-core/tests/mock_provider_integration_test.rs
+++ b/crates/g3-core/tests/mock_provider_integration_test.rs
@@ -1129,6 +1129,65 @@ async fn test_cross_turn_same_tool_call_executes() {
    assert_eq!(tool_result_count, 2, "Should have 2 tool results (one per turn), got {}", tool_result_count);
 }

+/// Test: Native polling tool (research_status) called with identical args across
+/// auto-continue iterations must NOT be deduplicated.
+///
+/// Reproduces the exact bug: model calls research_status in iteration 1, gets the
+/// tool result, auto-continues to iteration 2, and calls research_status again with
+/// identical args. Each Anthropic API response assigns a unique tool_use ID (toolu_*).
+/// The old dedup logic compared only name+args, ignoring IDs, so the second call was
+/// marked DUP IN MSG and skipped. With no tool executed and no text content, the
+/// stream errored with "No response received from the model."
+#[tokio::test]
+async fn test_native_polling_tool_not_deduplicated_across_turns() {
+    let provider = MockProvider::new()
+        .with_native_tool_calling(true)
+        // Iteration 1: model calls research_status (gets unique ID)
+        .with_response(MockResponse::native_tool_call(
+            "research_status",
+            serde_json::json!({}),
+        ))
+        // Iteration 2 (auto-continue): model calls research_status AGAIN
+        // with identical args but a DIFFERENT unique ID
+        .with_response(MockResponse::native_tool_call(
+            "research_status",
+            serde_json::json!({}),
+        ))
+        // Iteration 3 (auto-continue): model produces text, ending the turn
+        .with_default_response(MockResponse::text("Research complete."));
+
+    let (mut agent, _temp_dir) = create_agent_with_mock(provider).await;
+
+    // Single execute_task: model calls research_status twice via auto-continue,
+    // then produces text. Before the fix, the second research_status call would
+    // be falsely deduplicated as DUP IN MSG, causing "No response received".
+    let result = agent.execute_task("Check research status", None, false).await;
+    assert!(result.is_ok(), "Task should succeed: {:?}", result.err());
+
+    let history = &agent.get_context_window().conversation_history;
+
+    // Both research_status calls should have produced a tool result
+    let tool_result_count = history
+        .iter()
+        .filter(|m| matches!(m.role, MessageRole::User) && m.content.contains("Tool result:"))
+        .count();
+    assert_eq!(
+        tool_result_count, 2,
+        "Both research_status calls should execute (not deduplicated across iterations). Got {} tool results",
+        tool_result_count
+    );
+
+    // Verify both tool calls were stored with different IDs
+    let research_tool_calls: Vec<_> = history
+        .iter()
+        .filter(|m| matches!(m.role, MessageRole::Assistant))
+        .flat_map(|m| m.tool_calls.iter())
+        .filter(|tc| tc.name == "research_status")
+        .collect();
+    assert_eq!(research_tool_calls.len(), 2, "Should have 2 research_status tool calls stored");
+    assert_ne!(research_tool_calls[0].id, research_tool_calls[1].id, "Tool call IDs should differ");
+}
+
 /// Test: Three identical tool calls in one response - first executes, rest deduped
 ///
 /// Boundary case: model stutters three times.