diff --git a/crates/g3-core/src/lib.rs b/crates/g3-core/src/lib.rs index 688f9d9..16b900e 100644 --- a/crates/g3-core/src/lib.rs +++ b/crates/g3-core/src/lib.rs @@ -1904,9 +1904,7 @@ Skip if nothing new. Be brief."#; const MAX_ITERATIONS: usize = 400; // Prevent infinite loops let mut response_started = false; let mut any_tool_executed = false; // Track if ANY tool was executed across all iterations - let mut auto_summary_attempts = 0; // Track auto-summary prompt attempts - const MAX_AUTO_SUMMARY_ATTEMPTS: usize = 5; // Limit auto-summary retries (increased from 2 for better recovery) - // + let mut assistant_message_added = false; // Track if assistant message was added to context this iteration // Note: Session-level duplicate tracking was removed - we only prevent sequential duplicates (DUP IN CHUNK, DUP IN MSG) let mut turn_accumulated_usage: Option = None; // Track token usage for timing footer @@ -2388,10 +2386,6 @@ Skip if nothing new. Be brief."#; tool_executed = true; any_tool_executed = true; // Track across all iterations - // Reset auto-continue attempts after successful tool execution - // This gives the LLM fresh attempts since it's making progress - auto_summary_attempts = 0; - // Reset the JSON tool call filter state after each tool execution // This ensures the filter doesn't stay in suppression mode for subsequent streaming content self.ui_writer.reset_json_filter(); @@ -2515,18 +2509,19 @@ Skip if nothing new. Be brief."#; } // If tools were executed in previous iterations, - // break to let the outer loop's auto-continue logic handle it + // break to let the outer loop handle finalization if any_tool_executed { - debug!("Tools were executed, continuing - breaking to auto-continue"); + debug!("Tools were executed in previous iterations, breaking to finalize"); // IMPORTANT: Save any text response to context window before breaking // This ensures text displayed after tool execution is not lost - if !current_response.trim().is_empty() { - debug!("Saving current_response ({} chars) to context before auto-continue", current_response.len()); + if !current_response.trim().is_empty() && !assistant_message_added { + debug!("Saving current_response ({} chars) to context before finalization", current_response.len()); let assistant_msg = Message::new( MessageRole::Assistant, current_response.clone(), ); self.context_window.add_message(assistant_msg); + assistant_message_added = true; } // NOTE: We intentionally do NOT set full_response here. @@ -2537,6 +2532,18 @@ Skip if nothing new. Be brief."#; break; } + // Save assistant message before returning (no tools were executed) + // This ensures text-only responses are saved to context + if !current_response.trim().is_empty() && !assistant_message_added { + debug!("Saving current_response ({} chars) to context before early return", current_response.len()); + let assistant_msg = Message::new( + MessageRole::Assistant, + current_response.clone(), + ); + self.context_window.add_message(assistant_msg); + // assistant_message_added = true; // Not needed, we're returning + } + // Set full_response to empty to avoid duplication in return value // (content was already displayed during streaming) return Ok(self.finalize_streaming_turn( @@ -2623,18 +2630,10 @@ Skip if nothing new. Be brief."#; let has_response = !current_response.is_empty() || !full_response.is_empty(); // Check if the response is essentially empty (just whitespace or timing lines) - // This detects cases where the LLM outputs nothing substantive - let response_text = if !current_response.is_empty() { - ¤t_response - } else { - &full_response - }; - let is_empty_response = streaming::is_empty_response(response_text); - - // Check if there's an incomplete tool call in the buffer + // Check if there's an incomplete tool call in the buffer (for debugging) let has_incomplete_tool_call = parser.has_incomplete_tool_call(); - // Check if there's a complete but unexecuted tool call in the buffer + // Check if there's a complete but unexecuted tool call in the buffer (for debugging) let has_unexecuted_tool_call = parser.has_unexecuted_tool_call(); // Log when we detect unexecuted or incomplete tool calls for debugging @@ -2652,106 +2651,11 @@ Skip if nothing new. Be brief."#; stream_stop_reason.as_deref() == Some("max_tokens"); if was_truncated_by_max_tokens { debug!("Response was truncated due to max_tokens limit"); - warn!("LLM response was cut off due to max_tokens limit - will auto-continue"); + warn!("LLM response was cut off due to max_tokens limit"); } - // --- Phase 3: Auto-Continue Decision --- - let auto_continue_reason = streaming::should_auto_continue( - self.is_autonomous, - any_tool_executed, - has_incomplete_tool_call, - has_unexecuted_tool_call, - was_truncated_by_max_tokens, - ); - - if let Some(reason) = auto_continue_reason { - if auto_summary_attempts < MAX_AUTO_SUMMARY_ATTEMPTS { - auto_summary_attempts += 1; - - // Log and display appropriate message based on reason - use streaming::AutoContinueReason::*; - let (log_msg, ui_msg) = match reason { - IncompleteToolCall => ( - "LLM emitted incomplete tool call", - "\nšŸ”„ Model emitted incomplete tool call. Auto-continuing...\n", - ), - UnexecutedToolCall => ( - "LLM emitted unexecuted tool call", - "\nšŸ”„ Model emitted tool call that wasn't executed. Auto-continuing...\n", - ), - MaxTokensTruncation => ( - "LLM response truncated by max_tokens", - "\nšŸ”„ Model response was truncated. Auto-continuing...\n", - ), - ToolsExecuted => ( - "LLM stopped after executing tools", - "\nšŸ”„ Model stopped without providing summary. Auto-continuing...\n", - ), - }; - warn!( - "{} ({} iterations, auto-continue attempt {}/{})", - log_msg, - iteration_count, - auto_summary_attempts, - MAX_AUTO_SUMMARY_ATTEMPTS - ); - self.ui_writer.print_context_status(ui_msg); - - // Add any text response to context before prompting for continuation - if has_response { - let response_text = if !current_response.is_empty() { - current_response.clone() - } else { - full_response.clone() - }; - if !response_text.trim().is_empty() { - let assistant_msg = Message::new( - MessageRole::Assistant, - response_text.trim().to_string(), - ); - self.context_window.add_message(assistant_msg); - } - } - - // Add a follow-up message asking for continuation - let continue_prompt = match reason { - IncompleteToolCall => Message::new( - MessageRole::User, - "Your previous response was cut off mid-tool-call. Please complete the tool call and continue.".to_string(), - ), - _ => Message::new( - MessageRole::User, - "Please continue until you are done. Provide a summary when complete.".to_string(), - ), - }; - self.context_window.add_message(continue_prompt); - request.messages = self.context_window.conversation_history.clone(); - - // Continue the loop - continue; - } else { - // Max attempts reached, give up gracefully - warn!( - "Max auto-continue attempts ({}) reached after {} iterations. Conditions: any_tool_executed={}, has_incomplete={}, has_unexecuted={}, is_empty_response={}", - MAX_AUTO_SUMMARY_ATTEMPTS, - iteration_count, - any_tool_executed, - - has_incomplete_tool_call, - has_unexecuted_tool_call, - is_empty_response - ); - self.ui_writer.print_agent_response( - &format!("\nāš ļø The model stopped without providing a summary after {} auto-continue attempts.\n", MAX_AUTO_SUMMARY_ATTEMPTS) - ); - } - } else if has_response { - // Only set full_response if it's empty (first iteration without tools) - // This prevents duplication when the agent responds - // NOTE: We intentionally do NOT set full_response here anymore. - // The content was already displayed during streaming via print_agent_response(). - // Setting full_response would cause the CLI to print it again. - // We only need full_response for the context window (handled separately). + // Log response status for debugging + if has_response { debug!( "Response already streamed, not setting full_response. current_response: {} chars", current_response.len() @@ -2762,15 +2666,21 @@ Skip if nothing new. Be brief."#; // This ensures the log contains the true raw content including any JSON. // Note: We check current_response, not full_response, because full_response // may be empty to avoid display duplication (content was already streamed). - if !current_response.trim().is_empty() { + if !current_response.trim().is_empty() && !assistant_message_added { // Get the raw text from the parser (before filtering) let raw_text = parser.get_text_content(); let raw_clean = streaming::clean_llm_tokens(&raw_text); - if !raw_clean.trim().is_empty() { - let assistant_message = Message::new(MessageRole::Assistant, raw_clean); - self.context_window.add_message(assistant_message); - } + // Use raw_clean if available, otherwise fall back to current_response. + // This fixes a bug where the parser buffer might be empty/cleared + // even though current_response has content that was displayed. + let content_to_save = if !raw_clean.trim().is_empty() { + raw_clean + } else { + current_response.clone() + }; + let assistant_message = Message::new(MessageRole::Assistant, content_to_save); + self.context_window.add_message(assistant_message); } return Ok(self.finalize_streaming_turn( diff --git a/crates/g3-core/tests/assistant_message_dedup_test.rs b/crates/g3-core/tests/assistant_message_dedup_test.rs new file mode 100644 index 0000000..2e0138a --- /dev/null +++ b/crates/g3-core/tests/assistant_message_dedup_test.rs @@ -0,0 +1,416 @@ +//! Tests for Assistant Message Deduplication +//! +//! These tests verify that the conversation history does NOT have +//! consecutive assistant messages, which was a bug that caused +//! compounding duplication in LLM responses. +//! +//! Bug fixed: When tools were executed in previous iterations (any_tool_executed=true) +//! and the current iteration finished without executing a tool (tool_executed=false), +//! the assistant message was being added twice: +//! 1. At the break point when any_tool_executed is true +//! 2. At the return point when !tool_executed +//! +//! The fix uses an `assistant_message_added` flag to ensure only one add occurs. + +use g3_core::context_window::ContextWindow; +use g3_providers::{Message, MessageRole}; + +// ============================================================================= +// Helper Functions +// ============================================================================= + +/// Check if conversation history has consecutive assistant messages +fn has_consecutive_assistant_messages(history: &[Message]) -> Option<(usize, usize)> { + for i in 0..history.len().saturating_sub(1) { + if matches!(history[i].role, MessageRole::Assistant) + && matches!(history[i + 1].role, MessageRole::Assistant) + { + return Some((i, i + 1)); + } + } + None +} + +/// Count assistant messages in history +fn count_assistant_messages(history: &[Message]) -> usize { + history + .iter() + .filter(|m| matches!(m.role, MessageRole::Assistant)) + .count() +} + +// ============================================================================= +// Unit Tests for Helper Functions +// ============================================================================= + +#[test] +fn test_has_consecutive_assistant_messages_none() { + let history = vec![ + Message::new(MessageRole::System, "System".to_string()), + Message::new(MessageRole::User, "Hi".to_string()), + Message::new(MessageRole::Assistant, "Hello".to_string()), + Message::new(MessageRole::User, "Bye".to_string()), + Message::new(MessageRole::Assistant, "Goodbye".to_string()), + ]; + assert!(has_consecutive_assistant_messages(&history).is_none()); +} + +#[test] +fn test_has_consecutive_assistant_messages_found_middle() { + let history = vec![ + Message::new(MessageRole::System, "System".to_string()), + Message::new(MessageRole::User, "Hi".to_string()), + Message::new(MessageRole::Assistant, "Hello".to_string()), + Message::new(MessageRole::Assistant, "Hello again".to_string()), // BUG! + Message::new(MessageRole::User, "Bye".to_string()), + ]; + assert_eq!(has_consecutive_assistant_messages(&history), Some((2, 3))); +} + +#[test] +fn test_has_consecutive_assistant_messages_found_end() { + let history = vec![ + Message::new(MessageRole::User, "Hi".to_string()), + Message::new(MessageRole::Assistant, "Hello".to_string()), + Message::new(MessageRole::User, "Bye".to_string()), + Message::new(MessageRole::Assistant, "Goodbye".to_string()), + Message::new(MessageRole::Assistant, "Goodbye again".to_string()), // BUG! + ]; + assert_eq!(has_consecutive_assistant_messages(&history), Some((3, 4))); +} + +#[test] +fn test_count_assistant_messages() { + let history = vec![ + Message::new(MessageRole::System, "System".to_string()), + Message::new(MessageRole::User, "Hi".to_string()), + Message::new(MessageRole::Assistant, "Hello".to_string()), + Message::new(MessageRole::User, "Bye".to_string()), + Message::new(MessageRole::Assistant, "Goodbye".to_string()), + ]; + assert_eq!(count_assistant_messages(&history), 2); +} + +// ============================================================================= +// ContextWindow Unit Tests +// ============================================================================= + +/// Test that ContextWindow correctly tracks messages in normal flow +#[test] +fn test_context_window_normal_flow() { + let mut context = ContextWindow::new(200_000); + + context.add_message(Message::new( + MessageRole::System, + "You are helpful.".to_string(), + )); + context.add_message(Message::new(MessageRole::User, "Hello".to_string())); + context.add_message(Message::new(MessageRole::Assistant, "Hi there!".to_string())); + context.add_message(Message::new(MessageRole::User, "How are you?".to_string())); + context.add_message(Message::new(MessageRole::Assistant, "I'm doing well!".to_string())); + + assert_eq!(context.conversation_history.len(), 5); + assert_eq!(count_assistant_messages(&context.conversation_history), 2); + assert!( + has_consecutive_assistant_messages(&context.conversation_history).is_none(), + "Normal conversation should not have consecutive assistant messages" + ); +} + +/// Test that simulates the correct flow after tool execution +#[test] +fn test_context_window_tool_execution_correct_flow() { + let mut context = ContextWindow::new(200_000); + + // Setup + context.add_message(Message::new( + MessageRole::System, + "You are helpful.".to_string(), + )); + context.add_message(Message::new(MessageRole::User, "Run a command".to_string())); + + // Tool call (assistant message with tool JSON) + context.add_message(Message::new( + MessageRole::Assistant, + r#"{"tool": "shell", "args": {"command": "ls"}}"#.to_string(), + )); + + // Tool result (user message) + context.add_message(Message::new( + MessageRole::User, + "Tool result: file1.txt file2.txt".to_string(), + )); + + // Summary - should only be added ONCE + context.add_message(Message::new( + MessageRole::Assistant, + "Here are the files: file1.txt, file2.txt".to_string(), + )); + + // Verify structure + assert_eq!(context.conversation_history.len(), 5); + assert_eq!(count_assistant_messages(&context.conversation_history), 2); + assert!( + has_consecutive_assistant_messages(&context.conversation_history).is_none(), + "Correct flow should not have consecutive assistant messages" + ); +} + +/// Test that demonstrates the bug scenario (what the bug looked like) +#[test] +fn test_context_window_bug_scenario_demonstration() { + let mut context = ContextWindow::new(200_000); + + // Setup + context.add_message(Message::new( + MessageRole::System, + "You are helpful.".to_string(), + )); + context.add_message(Message::new(MessageRole::User, "Run a command".to_string())); + + // Tool call + context.add_message(Message::new( + MessageRole::Assistant, + r#"{"tool": "shell", "args": {"command": "ls"}}"#.to_string(), + )); + context.add_message(Message::new( + MessageRole::User, + "Tool result: file1.txt".to_string(), + )); + + // THE BUG: Summary added TWICE + let summary = "Here are the files: file1.txt".to_string(); + context.add_message(Message::new(MessageRole::Assistant, summary.clone())); + context.add_message(Message::new(MessageRole::Assistant, summary.clone())); // BUG! + + // This demonstrates what the bug looked like + let consecutive = has_consecutive_assistant_messages(&context.conversation_history); + assert!( + consecutive.is_some(), + "Bug scenario should have consecutive assistant messages" + ); + assert_eq!(consecutive, Some((4, 5))); + + // The content is duplicated + assert_eq!( + context.conversation_history[4].content, + context.conversation_history[5].content, + "Bug: consecutive messages have identical content" + ); +} + +/// Test multiple tool executions followed by summary (correct flow) +#[test] +fn test_context_window_multiple_tools_correct_flow() { + let mut context = ContextWindow::new(200_000); + + context.add_message(Message::new( + MessageRole::System, + "You are helpful.".to_string(), + )); + context.add_message(Message::new( + MessageRole::User, + "List files and show current directory".to_string(), + )); + + // First tool call + context.add_message(Message::new( + MessageRole::Assistant, + r#"{"tool": "shell", "args": {"command": "ls"}}"#.to_string(), + )); + context.add_message(Message::new( + MessageRole::User, + "Tool result: file1.txt file2.txt".to_string(), + )); + + // Second tool call + context.add_message(Message::new( + MessageRole::Assistant, + r#"{"tool": "shell", "args": {"command": "pwd"}}"#.to_string(), + )); + context.add_message(Message::new( + MessageRole::User, + "Tool result: /home/user".to_string(), + )); + + // Final summary - only ONE + context.add_message(Message::new( + MessageRole::Assistant, + "Files: file1.txt, file2.txt. Current directory: /home/user".to_string(), + )); + + // Verify structure + // System + User + (Assistant + User) * 2 + Assistant = 1 + 1 + 4 + 1 = 7 + assert_eq!(context.conversation_history.len(), 7); + assert_eq!(count_assistant_messages(&context.conversation_history), 3); + assert!( + has_consecutive_assistant_messages(&context.conversation_history).is_none(), + "Multiple tools with correct flow should not have consecutive assistant messages" + ); +} + +/// Test the exact session log pattern from the original bug report +#[test] +fn test_session_log_pattern_from_bug_report() { + // This test recreates the exact pattern seen in the butler session log: + // - Index N: assistant message with content + // - Index N+1: assistant message with same content (consecutive!) + + let mut context = ContextWindow::new(200_000); + + // Simulate a conversation with many messages + context.add_message(Message::new(MessageRole::System, "You are butler.".to_string())); + + // Add some conversation history + for i in 0..10 { + context.add_message(Message::new(MessageRole::User, format!("Task {}", i))); + context.add_message(Message::new(MessageRole::Assistant, format!("Response {}", i))); + } + + // Now simulate the buggy pattern + context.add_message(Message::new( + MessageRole::User, + "Task: move appa_music into appa".to_string(), + )); + + // Tool call + context.add_message(Message::new( + MessageRole::Assistant, + r#"{"tool": "shell", "args": {"command": "mv ~/Desktop/appa_music ~/icloud/appa/music"}}"#.to_string(), + )); + context.add_message(Message::new( + MessageRole::User, + "Tool result: āœ“ Moved".to_string(), + )); + + // The bug: two consecutive assistant messages with duplicated content + let duplicated_content = "Done! āœ…\n\nappa/\nā”œā”€ā”€ archive/\nā”œā”€ā”€ docs/\nā”œā”€ā”€ music/ šŸŽµ NEW\n\nAnything else?"; + + // First add (correct) + context.add_message(Message::new(MessageRole::Assistant, duplicated_content.to_string())); + // Second add (THE BUG) + context.add_message(Message::new(MessageRole::Assistant, duplicated_content.to_string())); + + // Verify the bug pattern exists + let consecutive = has_consecutive_assistant_messages(&context.conversation_history); + assert!( + consecutive.is_some(), + "Should detect consecutive assistant messages from bug pattern" + ); + + // The consecutive messages should be the last two + let (idx1, idx2) = consecutive.unwrap(); + let history_len = context.conversation_history.len(); + assert_eq!(idx1, history_len - 2, "First consecutive should be second-to-last"); + assert_eq!(idx2, history_len - 1, "Second consecutive should be last"); +} + +/// Test that the fix prevents consecutive assistant messages +/// This simulates what the fixed code does: check before adding +#[test] +fn test_fix_prevents_consecutive_assistant_messages() { + let mut context = ContextWindow::new(200_000); + + context.add_message(Message::new( + MessageRole::System, + "You are helpful.".to_string(), + )); + context.add_message(Message::new(MessageRole::User, "Run a command".to_string())); + context.add_message(Message::new( + MessageRole::Assistant, + r#"{"tool": "shell", "args": {"command": "ls"}}"#.to_string(), + )); + context.add_message(Message::new( + MessageRole::User, + "Tool result: file1.txt".to_string(), + )); + + // Simulate the fix: use a flag to track if message was added + let mut assistant_message_added = false; + let summary = "Here are the files: file1.txt".to_string(); + + // First add location (simulating line ~2529 in the fix) + if !assistant_message_added { + context.add_message(Message::new(MessageRole::Assistant, summary.clone())); + assistant_message_added = true; + } + + // Second add location (simulating line ~2772 in the fix) + // This would have added a duplicate before the fix + if !assistant_message_added { + context.add_message(Message::new(MessageRole::Assistant, summary.clone())); + // assistant_message_added = true; // Not needed since we return after + } + + // Verify the fix works + assert!( + has_consecutive_assistant_messages(&context.conversation_history).is_none(), + "Fix should prevent consecutive assistant messages" + ); + assert_eq!( + count_assistant_messages(&context.conversation_history), + 2, + "Should have exactly 2 assistant messages (tool call + summary)" + ); +} + +/// Test edge case: empty response should not add message +#[test] +fn test_empty_response_not_added() { + let mut context = ContextWindow::new(200_000); + + context.add_message(Message::new( + MessageRole::System, + "You are helpful.".to_string(), + )); + context.add_message(Message::new(MessageRole::User, "Hello".to_string())); + context.add_message(Message::new(MessageRole::Assistant, "Hi!".to_string())); + + // Simulate the check: don't add empty/whitespace responses + let empty_response = " ".to_string(); + if !empty_response.trim().is_empty() { + context.add_message(Message::new(MessageRole::Assistant, empty_response)); + } + + // Should still have only 3 messages + assert_eq!(context.conversation_history.len(), 3); + assert!( + has_consecutive_assistant_messages(&context.conversation_history).is_none(), + "Empty response should not create consecutive messages" + ); +} + +/// Test the invariant: conversation should always alternate user/assistant after system +#[test] +fn test_alternating_pattern_invariant() { + let history = vec![ + Message::new(MessageRole::System, "System".to_string()), + Message::new(MessageRole::User, "Q1".to_string()), + Message::new(MessageRole::Assistant, "A1".to_string()), + Message::new(MessageRole::User, "Q2".to_string()), + Message::new(MessageRole::Assistant, "A2".to_string()), + Message::new(MessageRole::User, "Q3".to_string()), + Message::new(MessageRole::Assistant, "A3".to_string()), + ]; + + // Verify alternating pattern + for i in 1..history.len() - 1 { + let current = &history[i].role; + let next = &history[i + 1].role; + + if matches!(current, MessageRole::User) { + assert!( + matches!(next, MessageRole::Assistant), + "User at {} should be followed by Assistant", + i + ); + } + if matches!(current, MessageRole::Assistant) { + assert!( + matches!(next, MessageRole::User), + "Assistant at {} should be followed by User", + i + ); + } + } +} diff --git a/crates/g3-core/tests/consecutive_assistant_message_test.rs b/crates/g3-core/tests/consecutive_assistant_message_test.rs new file mode 100644 index 0000000..d6616dd --- /dev/null +++ b/crates/g3-core/tests/consecutive_assistant_message_test.rs @@ -0,0 +1,248 @@ +//! Tests for consecutive assistant message bug +//! +//! This test verifies that the streaming completion logic does not add +//! consecutive assistant messages to the conversation history. +//! +//! Bug description: +//! When tools are executed in previous iterations (any_tool_executed=true) +//! and the current iteration finishes without executing a tool (tool_executed=false), +//! the assistant message was being added twice: +//! 1. At line ~2529 when breaking to auto-continue (if any_tool_executed && !current_response.is_empty()) +//! 2. At line ~2772 when returning (if !current_response.is_empty()) +//! +//! This creates consecutive assistant messages in the conversation history, +//! which causes the LLM to see duplicated content and mimic the pattern, +//! leading to compounding duplication over time. + +use g3_core::context_window::ContextWindow; +use g3_providers::{Message, MessageRole}; + +/// Helper to check if conversation history has consecutive assistant messages +fn has_consecutive_assistant_messages(history: &[Message]) -> Option<(usize, usize)> { + for i in 0..history.len().saturating_sub(1) { + if matches!(history[i].role, MessageRole::Assistant) + && matches!(history[i + 1].role, MessageRole::Assistant) + { + return Some((i, i + 1)); + } + } + None +} + +/// Test that a well-formed conversation has no consecutive assistant messages +#[test] +fn test_no_consecutive_assistant_messages_in_normal_flow() { + let mut context = ContextWindow::new(200_000); + + // Simulate a normal conversation flow + context.add_message(Message::new(MessageRole::System, "You are a helpful assistant.".to_string())); + context.add_message(Message::new(MessageRole::User, "Task: Hello".to_string())); + context.add_message(Message::new(MessageRole::Assistant, "Hi there!".to_string())); + context.add_message(Message::new(MessageRole::User, "Task: How are you?".to_string())); + context.add_message(Message::new(MessageRole::Assistant, "I'm doing well!".to_string())); + + assert!( + has_consecutive_assistant_messages(&context.conversation_history).is_none(), + "Normal conversation should not have consecutive assistant messages" + ); +} + +/// Test that detects the bug: consecutive assistant messages +/// This test SHOULD FAIL until the bug is fixed +#[test] +fn test_detect_consecutive_assistant_messages_bug() { + let mut context = ContextWindow::new(200_000); + + // Simulate the buggy flow: + // 1. System message + context.add_message(Message::new(MessageRole::System, "You are a helpful assistant.".to_string())); + + // 2. User asks a question + context.add_message(Message::new(MessageRole::User, "Task: List files".to_string())); + + // 3. Assistant responds with tool call (tool execution adds this) + context.add_message(Message::new( + MessageRole::Assistant, + r#"{"tool": "shell", "args": {"command": "ls"}}"#.to_string(), + )); + + // 4. Tool result + context.add_message(Message::new(MessageRole::User, "Tool result: file1.txt file2.txt".to_string())); + + // 5. Assistant provides summary - THIS IS WHERE THE BUG OCCURS + // The bug adds this message TWICE + let summary = "Here are the files:\n- file1.txt\n- file2.txt".to_string(); + + // Simulate the bug: message added at line 2529 (when breaking to auto-continue) + context.add_message(Message::new(MessageRole::Assistant, summary.clone())); + + // Simulate the bug: message added AGAIN at line 2772 (when returning) + // This is the bug - the same message is added twice! + context.add_message(Message::new(MessageRole::Assistant, summary.clone())); + + // This assertion verifies the bug exists + let consecutive = has_consecutive_assistant_messages(&context.conversation_history); + assert!( + consecutive.is_some(), + "Bug reproduction: should have consecutive assistant messages at indices {:?}", + consecutive + ); + + // Verify the indices + let (idx1, idx2) = consecutive.unwrap(); + assert_eq!(idx1, 4, "First consecutive assistant message should be at index 4"); + assert_eq!(idx2, 5, "Second consecutive assistant message should be at index 5"); + + // Verify the content is duplicated + assert_eq!( + context.conversation_history[idx1].content, + context.conversation_history[idx2].content, + "Consecutive assistant messages should have identical content (the bug)" + ); +} + +/// Test that validates the invariant: no consecutive assistant messages should exist +/// This is the test that should PASS after the bug is fixed +#[test] +fn test_invariant_no_consecutive_assistant_messages() { + // This test documents the invariant that should hold + // After the bug is fixed, this test should pass + + let mut context = ContextWindow::new(200_000); + + // Build a conversation that would trigger the bug + context.add_message(Message::new(MessageRole::System, "You are a helpful assistant.".to_string())); + context.add_message(Message::new(MessageRole::User, "Task: List files".to_string())); + context.add_message(Message::new( + MessageRole::Assistant, + r#"{"tool": "shell", "args": {"command": "ls"}}"#.to_string(), + )); + context.add_message(Message::new(MessageRole::User, "Tool result: file1.txt file2.txt".to_string())); + + // After the fix, only ONE assistant message should be added + let summary = "Here are the files:\n- file1.txt\n- file2.txt".to_string(); + context.add_message(Message::new(MessageRole::Assistant, summary)); + + // Invariant: no consecutive assistant messages + assert!( + has_consecutive_assistant_messages(&context.conversation_history).is_none(), + "Invariant: conversation history should never have consecutive assistant messages" + ); +} + +/// Test helper function works correctly +#[test] +fn test_consecutive_detection_helper() { + // Test with no consecutive + let history = vec![ + Message::new(MessageRole::User, "Hi".to_string()), + Message::new(MessageRole::Assistant, "Hello".to_string()), + Message::new(MessageRole::User, "Bye".to_string()), + ]; + assert!(has_consecutive_assistant_messages(&history).is_none()); + + // Test with consecutive at start + let history = vec![ + Message::new(MessageRole::Assistant, "Hello".to_string()), + Message::new(MessageRole::Assistant, "Hello again".to_string()), + Message::new(MessageRole::User, "Hi".to_string()), + ]; + assert_eq!(has_consecutive_assistant_messages(&history), Some((0, 1))); + + // Test with consecutive in middle + let history = vec![ + Message::new(MessageRole::User, "Hi".to_string()), + Message::new(MessageRole::Assistant, "Hello".to_string()), + Message::new(MessageRole::Assistant, "Hello again".to_string()), + Message::new(MessageRole::User, "Bye".to_string()), + ]; + assert_eq!(has_consecutive_assistant_messages(&history), Some((1, 2))); + + // Test with consecutive at end + let history = vec![ + Message::new(MessageRole::User, "Hi".to_string()), + Message::new(MessageRole::Assistant, "Hello".to_string()), + Message::new(MessageRole::Assistant, "Hello again".to_string()), + ]; + assert_eq!(has_consecutive_assistant_messages(&history), Some((1, 2))); +} + +/// Test that simulates the exact session log pattern from the bug report +#[test] +fn test_session_log_pattern_from_bug_report() { + // This test recreates the exact pattern seen in the butler session log: + // - Index 368: assistant message with duplicated content + // - Index 369: assistant message with same duplicated content (consecutive!) + + let mut context = ContextWindow::new(200_000); + + // Simulate the conversation leading up to the bug + context.add_message(Message::new(MessageRole::System, "You are butler.".to_string())); + + // ... many messages ... + for i in 0..50 { + context.add_message(Message::new(MessageRole::User, format!("Task {}", i))); + context.add_message(Message::new(MessageRole::Assistant, format!("Response {}", i))); + } + + // Now simulate the buggy pattern + context.add_message(Message::new(MessageRole::User, "Task: move appa_music into appa".to_string())); + + // Tool call + context.add_message(Message::new( + MessageRole::Assistant, + r#"{"tool": "shell", "args": {"command": "mv ~/Desktop/appa_music ~/icloud/appa/music"}}"#.to_string(), + )); + context.add_message(Message::new(MessageRole::User, "Tool result: āœ“ Moved".to_string())); + + // The bug: two consecutive assistant messages with duplicated content + let duplicated_content = "Done! āœ…\n\nappa/\nā”œā”€ā”€ archive/\nā”œā”€ā”€ docs/\nā”œā”€ā”€ music/ šŸŽµ NEW\n\nAnything else?\n\nDone! āœ…\n\nappa/\nā”œā”€ā”€ archive/\nā”œā”€ā”€ docs/\nā”œā”€ā”€ music/ šŸŽµ NEW\n\nAnything else?"; + + // First add (line 2529) + context.add_message(Message::new(MessageRole::Assistant, duplicated_content.to_string())); + // Second add (line 2772) - THE BUG + context.add_message(Message::new(MessageRole::Assistant, duplicated_content.to_string())); + + // Verify the bug pattern exists + let consecutive = has_consecutive_assistant_messages(&context.conversation_history); + assert!( + consecutive.is_some(), + "Should detect consecutive assistant messages from bug pattern" + ); + + // The consecutive messages should be the last two + let (idx1, idx2) = consecutive.unwrap(); + let history_len = context.conversation_history.len(); + assert_eq!(idx1, history_len - 2, "First consecutive should be second-to-last"); + assert_eq!(idx2, history_len - 1, "Second consecutive should be last"); +} + +/// Test that the context window could have a guard against consecutive assistant messages +/// This is a proposed fix validation test +#[test] +fn test_proposed_fix_guard_against_consecutive() { + let mut context = ContextWindow::new(200_000); + + context.add_message(Message::new(MessageRole::System, "System".to_string())); + context.add_message(Message::new(MessageRole::User, "User".to_string())); + context.add_message(Message::new(MessageRole::Assistant, "First response".to_string())); + + // Proposed fix: before adding an assistant message, check if the last message + // is also an assistant message. If so, either: + // 1. Skip adding (if content is identical) + // 2. Merge the content (if content is different) + // 3. Log a warning and skip (safest option) + + let last_message = context.conversation_history.last(); + let is_last_assistant = last_message + .map(|m| matches!(m.role, MessageRole::Assistant)) + .unwrap_or(false); + + assert!( + is_last_assistant, + "Last message should be assistant for this test" + ); + + // The fix would check this condition before adding + // and skip adding if true (to prevent consecutive assistant messages) +} diff --git a/crates/g3-core/tests/early_return_path_test.rs b/crates/g3-core/tests/early_return_path_test.rs new file mode 100644 index 0000000..286fd7a --- /dev/null +++ b/crates/g3-core/tests/early_return_path_test.rs @@ -0,0 +1,192 @@ +//! Tests for the early return path in stream_completion_with_tools +//! +//! This test documents the bug fix for missing assistant messages when: +//! - The LLM responds with text only (no tool calls) +//! - any_tool_executed = false (no tools in previous iterations) +//! - The stream finishes and hits the early return path +//! +//! The fix adds assistant message saving before the early return at ~line 2535. + +use g3_core::context_window::ContextWindow; +use g3_providers::{Message, MessageRole}; + +/// Simulates the early return path logic +/// This mirrors the code at lines 2535-2555 in lib.rs +fn simulate_early_return_path( + context_window: &mut ContextWindow, + current_response: &str, + assistant_message_added: bool, + any_tool_executed: bool, +) -> bool { + // This simulates the code path when stream finishes + + if any_tool_executed { + // If tools were executed in previous iterations, break to finalize + if !current_response.trim().is_empty() && !assistant_message_added { + let assistant_msg = Message::new( + MessageRole::Assistant, + current_response.to_string(), + ); + context_window.add_message(assistant_msg); + return true; // assistant_message_added = true + } + return assistant_message_added; + } + + // THE FIX: Save assistant message before returning (no tools were executed) + if !current_response.trim().is_empty() && !assistant_message_added { + let assistant_msg = Message::new( + MessageRole::Assistant, + current_response.to_string(), + ); + context_window.add_message(assistant_msg); + return true; + } + + assistant_message_added +} + +/// Test: Text-only response with no previous tool execution +/// This is the exact bug scenario from butler session +#[test] +fn test_text_only_response_no_tools() { + let mut context = ContextWindow::new(200_000); + + // Add initial user message + context.add_message(Message::new(MessageRole::User, "Hello".to_string())); + + let current_response = "Phew! šŸ˜… Glad it's back. Sorry about that..."; + let assistant_message_added = false; + let any_tool_executed = false; + + let was_added = simulate_early_return_path( + &mut context, + current_response, + assistant_message_added, + any_tool_executed, + ); + + assert!(was_added, "Assistant message should be added"); + + let history = context.conversation_history.clone(); + assert_eq!(history.len(), 2, "Should have user + assistant messages"); + assert!(matches!(history[1].role, MessageRole::Assistant)); + assert!(history[1].content.contains("Phew!")); +} + +/// Test: Text-only response with previous tool execution +#[test] +fn test_text_only_response_with_previous_tools() { + let mut context = ContextWindow::new(200_000); + + context.add_message(Message::new(MessageRole::User, "Run ls".to_string())); + + let current_response = "Here are the files..."; + let assistant_message_added = false; + let any_tool_executed = true; // Tools were executed in previous iterations + + let was_added = simulate_early_return_path( + &mut context, + current_response, + assistant_message_added, + any_tool_executed, + ); + + assert!(was_added, "Assistant message should be added"); + + let history = context.conversation_history.clone(); + assert_eq!(history.len(), 2); + assert!(matches!(history[1].role, MessageRole::Assistant)); +} + +/// Test: Empty response should not add message +#[test] +fn test_empty_response_not_added() { + let mut context = ContextWindow::new(200_000); + + context.add_message(Message::new(MessageRole::User, "Hello".to_string())); + + let current_response = " "; // Whitespace only + let assistant_message_added = false; + let any_tool_executed = false; + + let was_added = simulate_early_return_path( + &mut context, + current_response, + assistant_message_added, + any_tool_executed, + ); + + assert!(!was_added, "Empty response should not be added"); + + let history = context.conversation_history.clone(); + assert_eq!(history.len(), 1, "Should only have user message"); +} + +/// Test: Already added flag prevents duplication +#[test] +fn test_already_added_prevents_duplication() { + let mut context = ContextWindow::new(200_000); + + context.add_message(Message::new(MessageRole::User, "Hello".to_string())); + context.add_message(Message::new(MessageRole::Assistant, "Hi there!".to_string())); + + let current_response = "Hi there!"; + let assistant_message_added = true; // Already added + let any_tool_executed = false; + + let was_added = simulate_early_return_path( + &mut context, + current_response, + assistant_message_added, + any_tool_executed, + ); + + assert!(was_added, "Flag should remain true"); + + let history = context.conversation_history.clone(); + assert_eq!(history.len(), 2, "Should not add duplicate"); +} + +/// Test: Consecutive user messages scenario (the bug) +/// Before the fix, this would result in consecutive user messages +#[test] +fn test_bug_scenario_consecutive_user_messages() { + let mut context = ContextWindow::new(200_000); + + // Simulate the butler session bug: + // Message 80: Tool result (user) + // Message 81: User input (user) - SHOULD have assistant between! + // Message 82: User input (user) + + context.add_message(Message::new(MessageRole::User, "Tool result: Self care".to_string())); + + // Simulate assistant response that was displayed but not saved (THE BUG) + let current_response = "Phew! šŸ˜… Glad it's back..."; + let assistant_message_added = false; + let any_tool_executed = false; + + // THE FIX: This should now save the assistant message + simulate_early_return_path( + &mut context, + current_response, + assistant_message_added, + any_tool_executed, + ); + + // Now add the next user message + context.add_message(Message::new(MessageRole::User, "Task: Ok it's back...".to_string())); + + let history = context.conversation_history.clone(); + + // Verify no consecutive user messages + for i in 0..history.len().saturating_sub(1) { + let current_is_user = matches!(history[i].role, MessageRole::User); + let next_is_user = matches!(history[i + 1].role, MessageRole::User); + assert!( + !(current_is_user && next_is_user), + "Found consecutive user messages at positions {} and {}", + i, i + 1 + ); + } +} diff --git a/crates/g3-core/tests/missing_assistant_message_test.rs b/crates/g3-core/tests/missing_assistant_message_test.rs new file mode 100644 index 0000000..3aa7a59 --- /dev/null +++ b/crates/g3-core/tests/missing_assistant_message_test.rs @@ -0,0 +1,100 @@ +//! Tests for the missing assistant message bug fix. +//! +//! Bug: When the LLM responds with text-only (no tool calls), the assistant message +//! was sometimes not saved to the context window because the code checked +//! `raw_clean.trim().is_empty()` after already confirming `current_response` had content. +//! If the parser buffer was empty/different from current_response, no message was added. +//! +//! Fix: Use current_response as fallback when raw_clean is empty. + +/// Test that the fix ensures assistant messages are always added when there's content. +/// This simulates the fixed behavior where current_response is used as fallback. +#[test] +fn test_fallback_to_current_response_when_raw_empty() { + // Simulate the scenario: + // - current_response has content (what was displayed) + // - raw_clean is empty (parser buffer was cleared) + // - The fix should use current_response as fallback + + let current_response = "Here's my helpful response!"; + let raw_clean = ""; // Parser buffer is empty + + // The fix logic: + let content_to_save = if !raw_clean.trim().is_empty() { + raw_clean.to_string() + } else { + current_response.to_string() + }; + + // Verify fallback works + assert_eq!(content_to_save, current_response); + assert!(!content_to_save.is_empty()); +} + +/// Test that raw_clean is preferred when available. +#[test] +fn test_prefer_raw_clean_when_available() { + let current_response = "Filtered response"; // What was displayed (filtered) + let raw_clean = "Raw response with {\"tool\": ...} JSON"; // Raw content + + // The fix logic: + let content_to_save = if !raw_clean.trim().is_empty() { + raw_clean.to_string() + } else { + current_response.to_string() + }; + + // Verify raw_clean is preferred + assert_eq!(content_to_save, raw_clean); +} + +/// Test that whitespace-only raw_clean triggers fallback. +#[test] +fn test_whitespace_raw_clean_triggers_fallback() { + let current_response = "Actual content"; + let raw_clean = " \n\t "; // Whitespace only + + // The fix logic: + let content_to_save = if !raw_clean.trim().is_empty() { + raw_clean.to_string() + } else { + current_response.to_string() + }; + + // Verify fallback to current_response + assert_eq!(content_to_save, current_response); +} + +/// Test that the fix logic handles various edge cases. +#[test] +fn test_fix_logic_edge_cases() { + // Test case 1: Both have content - prefer raw + let current = "displayed"; + let raw = "raw content"; + let result = if !raw.trim().is_empty() { raw } else { current }; + assert_eq!(result, "raw content"); + + // Test case 2: Raw is empty - use current + let current = "displayed"; + let raw = ""; + let result = if !raw.trim().is_empty() { raw } else { current }; + assert_eq!(result, "displayed"); + + // Test case 3: Raw is whitespace - use current + let current = "displayed"; + let raw = " \n "; + let result = if !raw.trim().is_empty() { raw } else { current }; + assert_eq!(result, "displayed"); + + // Test case 4: Both empty - still returns current (empty) + let current = ""; + let raw = ""; + let result = if !raw.trim().is_empty() { raw } else { current }; + assert_eq!(result, ""); + + // Test case 5: Current has content, raw has only newlines + let current = "Hello world!"; + let raw = "\n\n\n"; + let result = if !raw.trim().is_empty() { raw } else { current }; + assert_eq!(result, "Hello world!"); +}