Fix research_status polling tool falsely deduplicated across auto-continue iterations
The dedup logic compared only tool name+args, ignoring the unique tool call
IDs that native providers (Anthropic) assign to each invocation. When the
model called research_status {} in iteration 1, auto-continued, and called
it again in iteration 2 with identical args but a new ID, the second call
was marked DUP IN MSG and skipped. With no tool executed and no text, the
stream errored with 'No response received from the model.'
Three-part fix:
- ID-aware DUP IN MSG: check_duplicate_in_previous_message() uses tool call
IDs when both are non-empty (different IDs = different invocations)
- History cutoff: only checks messages from before the current iteration to
prevent within-iteration false positives
- DUP IN ITER: last_executed_tool on IterationState catches stuttered
duplicates across chunks within the same response
Regression test reproduces the exact bug (fails without fix, passes with).
This commit is contained in:
@@ -1376,9 +1376,12 @@ impl<W: UiWriter> Agent<W> {
|
|||||||
|
|
||||||
/// Check if a tool call is a duplicate of the last tool call in the previous assistant message.
|
/// Check if a tool call is a duplicate of the last tool call in the previous assistant message.
|
||||||
/// Returns Some("DUP IN MSG") if it's a duplicate, None otherwise.
|
/// Returns Some("DUP IN MSG") if it's a duplicate, None otherwise.
|
||||||
fn check_duplicate_in_previous_message(&self, tool_call: &ToolCall) -> Option<String> {
|
fn check_duplicate_in_previous_message(&self, tool_call: &ToolCall, history_cutoff: usize) -> Option<String> {
|
||||||
// Find the most recent assistant message
|
// Find the most recent assistant message, but only look at messages that
|
||||||
for msg in self.context_window.conversation_history.iter().rev() {
|
// existed before the current streaming iteration started. This prevents
|
||||||
|
// tool calls within the same response from being marked as DUP IN MSG
|
||||||
|
// against messages added during the current iteration's tool execution.
|
||||||
|
for msg in self.context_window.conversation_history[..history_cutoff].iter().rev() {
|
||||||
if !matches!(msg.role, MessageRole::Assistant) {
|
if !matches!(msg.role, MessageRole::Assistant) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@@ -1386,13 +1389,27 @@ impl<W: UiWriter> Agent<W> {
|
|||||||
// Check structured tool_calls first (native tool calling)
|
// Check structured tool_calls first (native tool calling)
|
||||||
if !msg.tool_calls.is_empty() {
|
if !msg.tool_calls.is_empty() {
|
||||||
if let Some(last_tc) = msg.tool_calls.last() {
|
if let Some(last_tc) = msg.tool_calls.last() {
|
||||||
let prev = ToolCall {
|
// When both tool calls have non-empty IDs from native providers
|
||||||
tool: last_tc.name.clone(),
|
// (e.g. Anthropic "toolu_*"), each API invocation gets a unique ID.
|
||||||
args: last_tc.input.clone(),
|
// Different IDs mean distinct invocations — not duplicates.
|
||||||
id: last_tc.id.clone(),
|
// This is critical for polling tools like research_status that are
|
||||||
};
|
// called repeatedly with identical arguments across turns.
|
||||||
if streaming::are_tool_calls_duplicate(&prev, tool_call) {
|
if !last_tc.id.is_empty() && !tool_call.id.is_empty() {
|
||||||
return Some("DUP IN MSG".to_string());
|
if last_tc.id == tool_call.id {
|
||||||
|
return Some("DUP IN MSG".to_string());
|
||||||
|
}
|
||||||
|
// Different IDs = different invocations, not a duplicate
|
||||||
|
} else {
|
||||||
|
// Fallback for JSON-based tool calls (embedded models) where
|
||||||
|
// IDs are auto-generated and not meaningful for dedup.
|
||||||
|
let prev = ToolCall {
|
||||||
|
tool: last_tc.name.clone(),
|
||||||
|
args: last_tc.input.clone(),
|
||||||
|
id: last_tc.id.clone(),
|
||||||
|
};
|
||||||
|
if streaming::are_tool_calls_duplicate(&prev, tool_call) {
|
||||||
|
return Some("DUP IN MSG".to_string());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Only check the most recent assistant message
|
// Only check the most recent assistant message
|
||||||
@@ -2280,6 +2297,12 @@ Skip if nothing new. Be brief."#;
|
|||||||
// Create fresh iteration state for this streaming iteration
|
// Create fresh iteration state for this streaming iteration
|
||||||
let mut iter = streaming::IterationState::new();
|
let mut iter = streaming::IterationState::new();
|
||||||
|
|
||||||
|
// Snapshot history length before this iteration. Used by DUP IN MSG
|
||||||
|
// detection to avoid comparing against messages added during this
|
||||||
|
// iteration's own tool executions (which would false-positive on
|
||||||
|
// multi-tool responses where the model issues the same tool twice).
|
||||||
|
let history_len_before_iteration = self.context_window.conversation_history.len();
|
||||||
|
|
||||||
while let Some(chunk_result) = stream.next().await {
|
while let Some(chunk_result) = stream.next().await {
|
||||||
match chunk_result {
|
match chunk_result {
|
||||||
Ok(chunk) => {
|
Ok(chunk) => {
|
||||||
@@ -2356,9 +2379,19 @@ Skip if nothing new. Be brief."#;
|
|||||||
// Always process all tool calls - they will be executed after stream ends
|
// Always process all tool calls - they will be executed after stream ends
|
||||||
|
|
||||||
// De-duplicate tool calls (sequential duplicates in chunk + duplicates from previous message)
|
// De-duplicate tool calls (sequential duplicates in chunk + duplicates from previous message)
|
||||||
|
let last_executed_in_iter = iter.last_executed_tool.clone();
|
||||||
let deduplicated_tools =
|
let deduplicated_tools =
|
||||||
streaming::deduplicate_tool_calls(completed_tools, |tc| {
|
streaming::deduplicate_tool_calls(completed_tools, |tc| {
|
||||||
self.check_duplicate_in_previous_message(tc)
|
// First check against the last tool executed in this
|
||||||
|
// iteration (catches duplicates across chunks within
|
||||||
|
// the same streaming response).
|
||||||
|
if let Some(ref last) = last_executed_in_iter {
|
||||||
|
if streaming::are_tool_calls_duplicate(last, tc) {
|
||||||
|
return Some("DUP IN ITER".to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Then check against messages from before this iteration
|
||||||
|
self.check_duplicate_in_previous_message(tc, history_len_before_iteration)
|
||||||
});
|
});
|
||||||
|
|
||||||
// Process each tool call
|
// Process each tool call
|
||||||
@@ -2640,6 +2673,7 @@ Skip if nothing new. Be brief."#;
|
|||||||
// 2. At the end when no tools were executed (handled in the "no tool executed" branch)
|
// 2. At the end when no tools were executed (handled in the "no tool executed" branch)
|
||||||
|
|
||||||
iter.tool_executed = true;
|
iter.tool_executed = true;
|
||||||
|
iter.last_executed_tool = Some(tool_call.clone());
|
||||||
state.any_tool_executed = true; // Track across all iterations
|
state.any_tool_executed = true; // Track across all iterations
|
||||||
|
|
||||||
// Reset the JSON tool call filter state after each tool execution
|
// Reset the JSON tool call filter state after each tool execution
|
||||||
|
|||||||
@@ -177,6 +177,8 @@ pub struct IterationState {
|
|||||||
pub raw_chunks: Vec<String>,
|
pub raw_chunks: Vec<String>,
|
||||||
pub accumulated_usage: Option<g3_providers::Usage>,
|
pub accumulated_usage: Option<g3_providers::Usage>,
|
||||||
pub stream_stop_reason: Option<String>,
|
pub stream_stop_reason: Option<String>,
|
||||||
|
/// Last tool call executed in this iteration (for cross-chunk dedup within same response)
|
||||||
|
pub last_executed_tool: Option<ToolCall>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl IterationState {
|
impl IterationState {
|
||||||
@@ -189,6 +191,7 @@ impl IterationState {
|
|||||||
raw_chunks: Vec::new(),
|
raw_chunks: Vec::new(),
|
||||||
accumulated_usage: None,
|
accumulated_usage: None,
|
||||||
stream_stop_reason: None,
|
stream_stop_reason: None,
|
||||||
|
last_executed_tool: None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -118,6 +118,13 @@ mod timing_footer {
|
|||||||
mod duplicate_detection {
|
mod duplicate_detection {
|
||||||
use super::*;
|
use super::*;
|
||||||
|
|
||||||
|
fn make_tool_call_with_id(tool: &str, args: serde_json::Value, id: &str) -> ToolCall {
|
||||||
|
ToolCall {
|
||||||
|
tool: tool.to_string(),
|
||||||
|
args,
|
||||||
|
id: id.to_string(),
|
||||||
|
}
|
||||||
|
}
|
||||||
fn make_tool_call(tool: &str, args: serde_json::Value) -> ToolCall {
|
fn make_tool_call(tool: &str, args: serde_json::Value) -> ToolCall {
|
||||||
ToolCall {
|
ToolCall {
|
||||||
tool: tool.to_string(),
|
tool: tool.to_string(),
|
||||||
@@ -190,6 +197,48 @@ mod duplicate_detection {
|
|||||||
|
|
||||||
assert!(!are_tool_calls_duplicate(&tc1, &tc2));
|
assert!(!are_tool_calls_duplicate(&tc1, &tc2));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// =========================================================================
|
||||||
|
// ID-ignorant deduplication tests (are_tool_calls_duplicate is name+args only)
|
||||||
|
// ID-aware cross-message dedup lives in check_duplicate_in_previous_message()
|
||||||
|
// and is tested via integration tests in mock_provider_integration_test.rs
|
||||||
|
// =========================================================================
|
||||||
|
|
||||||
|
/// Test: are_tool_calls_duplicate ignores IDs — same name+args = duplicate
|
||||||
|
/// regardless of ID. The ID-aware logic is in check_duplicate_in_previous_message.
|
||||||
|
#[test]
|
||||||
|
fn test_different_ids_same_args_still_duplicate_at_chunk_level() {
|
||||||
|
let tc1 = make_tool_call_with_id(
|
||||||
|
"research_status",
|
||||||
|
serde_json::json!({}),
|
||||||
|
"toolu_01ABC",
|
||||||
|
);
|
||||||
|
let tc2 = make_tool_call_with_id(
|
||||||
|
"research_status",
|
||||||
|
serde_json::json!({}),
|
||||||
|
"toolu_01DEF",
|
||||||
|
);
|
||||||
|
|
||||||
|
// Same name+args = duplicate (IDs are ignored at this level)
|
||||||
|
assert!(are_tool_calls_duplicate(&tc1, &tc2));
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Test: Different IDs with different args are NOT duplicates.
|
||||||
|
#[test]
|
||||||
|
fn test_different_ids_different_args_not_duplicate() {
|
||||||
|
let tc1 = make_tool_call_with_id(
|
||||||
|
"research_status",
|
||||||
|
serde_json::json!({"research_id": "r_123"}),
|
||||||
|
"toolu_01ABC",
|
||||||
|
);
|
||||||
|
let tc2 = make_tool_call_with_id(
|
||||||
|
"research_status",
|
||||||
|
serde_json::json!({"research_id": "r_456"}),
|
||||||
|
"toolu_01DEF",
|
||||||
|
);
|
||||||
|
|
||||||
|
assert!(!are_tool_calls_duplicate(&tc1, &tc2));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// =============================================================================
|
// =============================================================================
|
||||||
|
|||||||
@@ -1129,6 +1129,65 @@ async fn test_cross_turn_same_tool_call_executes() {
|
|||||||
assert_eq!(tool_result_count, 2, "Should have 2 tool results (one per turn), got {}", tool_result_count);
|
assert_eq!(tool_result_count, 2, "Should have 2 tool results (one per turn), got {}", tool_result_count);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Test: Native polling tool (research_status) called with identical args across
|
||||||
|
/// auto-continue iterations must NOT be deduplicated.
|
||||||
|
///
|
||||||
|
/// Reproduces the exact bug: model calls research_status in iteration 1, gets the
|
||||||
|
/// tool result, auto-continues to iteration 2, and calls research_status again with
|
||||||
|
/// identical args. Each Anthropic API response assigns a unique tool_use ID (toolu_*).
|
||||||
|
/// The old dedup logic compared only name+args, ignoring IDs, so the second call was
|
||||||
|
/// marked DUP IN MSG and skipped. With no tool executed and no text content, the
|
||||||
|
/// stream errored with "No response received from the model."
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_native_polling_tool_not_deduplicated_across_turns() {
|
||||||
|
let provider = MockProvider::new()
|
||||||
|
.with_native_tool_calling(true)
|
||||||
|
// Iteration 1: model calls research_status (gets unique ID)
|
||||||
|
.with_response(MockResponse::native_tool_call(
|
||||||
|
"research_status",
|
||||||
|
serde_json::json!({}),
|
||||||
|
))
|
||||||
|
// Iteration 2 (auto-continue): model calls research_status AGAIN
|
||||||
|
// with identical args but a DIFFERENT unique ID
|
||||||
|
.with_response(MockResponse::native_tool_call(
|
||||||
|
"research_status",
|
||||||
|
serde_json::json!({}),
|
||||||
|
))
|
||||||
|
// Iteration 3 (auto-continue): model produces text, ending the turn
|
||||||
|
.with_default_response(MockResponse::text("Research complete."));
|
||||||
|
|
||||||
|
let (mut agent, _temp_dir) = create_agent_with_mock(provider).await;
|
||||||
|
|
||||||
|
// Single execute_task: model calls research_status twice via auto-continue,
|
||||||
|
// then produces text. Before the fix, the second research_status call would
|
||||||
|
// be falsely deduplicated as DUP IN MSG, causing "No response received".
|
||||||
|
let result = agent.execute_task("Check research status", None, false).await;
|
||||||
|
assert!(result.is_ok(), "Task should succeed: {:?}", result.err());
|
||||||
|
|
||||||
|
let history = &agent.get_context_window().conversation_history;
|
||||||
|
|
||||||
|
// Both research_status calls should have produced a tool result
|
||||||
|
let tool_result_count = history
|
||||||
|
.iter()
|
||||||
|
.filter(|m| matches!(m.role, MessageRole::User) && m.content.contains("Tool result:"))
|
||||||
|
.count();
|
||||||
|
assert_eq!(
|
||||||
|
tool_result_count, 2,
|
||||||
|
"Both research_status calls should execute (not deduplicated across iterations). Got {} tool results",
|
||||||
|
tool_result_count
|
||||||
|
);
|
||||||
|
|
||||||
|
// Verify both tool calls were stored with different IDs
|
||||||
|
let research_tool_calls: Vec<_> = history
|
||||||
|
.iter()
|
||||||
|
.filter(|m| matches!(m.role, MessageRole::Assistant))
|
||||||
|
.flat_map(|m| m.tool_calls.iter())
|
||||||
|
.filter(|tc| tc.name == "research_status")
|
||||||
|
.collect();
|
||||||
|
assert_eq!(research_tool_calls.len(), 2, "Should have 2 research_status tool calls stored");
|
||||||
|
assert_ne!(research_tool_calls[0].id, research_tool_calls[1].id, "Tool call IDs should differ");
|
||||||
|
}
|
||||||
|
|
||||||
/// Test: Three identical tool calls in one response - first executes, rest deduped
|
/// Test: Three identical tool calls in one response - first executes, rest deduped
|
||||||
///
|
///
|
||||||
/// Boundary case: model stutters three times.
|
/// Boundary case: model stutters three times.
|
||||||
|
|||||||
Reference in New Issue
Block a user