diff --git a/crates/g3-core/tests/stream_completion_characterization_test.rs b/crates/g3-core/tests/stream_completion_characterization_test.rs new file mode 100644 index 0000000..052b817 --- /dev/null +++ b/crates/g3-core/tests/stream_completion_characterization_test.rs @@ -0,0 +1,713 @@ +//! Characterization Tests for stream_completion_with_tools +//! +//! CHARACTERIZATION: These tests capture the current behavior of the streaming +//! completion loop. They serve as a safety net for refactoring by locking down +//! observable behavior through the stable Agent interface. +//! +//! What these tests protect: +//! - Tool calls are detected, executed, and results added to context +//! - Auto-continue behavior in autonomous mode after tool execution +//! - Duplicate tool call detection (sequential duplicates skipped) +//! - Incomplete tool call detection triggers auto-continue +//! - Context window updates reflect tool execution +//! - Streaming parser behavior for JSON tool call detection +//! +//! What these tests intentionally do NOT assert: +//! - Internal parser state or implementation details +//! - Specific timing values (only that timing info is present when requested) +//! - UI output formatting (uses NullUiWriter) +//! - Provider-specific behavior +//! +//! Surface: Agent::execute_tool() for tool execution, streaming parser for parsing +//! Boundary: Tool execution boundary, streaming parser boundary + +use g3_core::ui_writer::NullUiWriter; +use g3_core::Agent; +use serial_test::serial; +use tempfile::TempDir; + +// ============================================================================= +// Characterization Tests: StreamingToolParser Behavior +// ============================================================================= +// These tests characterize the streaming parser's behavior which is a key +// component of stream_completion_with_tools. + +mod streaming_parser_characterization { + use g3_core::StreamingToolParser; + use g3_providers::CompletionChunk; + + /// CHARACTERIZATION: Parser detects complete JSON tool calls in text + #[test] + fn parser_detects_complete_tool_call() { + let mut parser = StreamingToolParser::new(); + let chunk = CompletionChunk { + content: r#"{"tool": "shell", "args": {"command": "ls"}}"#.to_string(), + finished: false, + tool_calls: None, + usage: None, + stop_reason: None, + }; + + let tools = parser.process_chunk(&chunk); + + // Should detect one tool call + assert_eq!(tools.len(), 1, "Should detect exactly one tool call"); + assert_eq!(tools[0].tool, "shell"); + } + + /// CHARACTERIZATION: Parser accumulates text before tool call + #[test] + fn parser_accumulates_text_before_tool() { + let mut parser = StreamingToolParser::new(); + + // First chunk: just text + let chunk1 = CompletionChunk { + content: "Let me run a command.\n\n".to_string(), + finished: false, + tool_calls: None, + usage: None, + stop_reason: None, + }; + let tools1 = parser.process_chunk(&chunk1); + assert!(tools1.is_empty(), "No tool call yet"); + + // Second chunk: tool call + let chunk2 = CompletionChunk { + content: r#"{"tool": "shell", "args": {"command": "pwd"}}"#.to_string(), + finished: false, + tool_calls: None, + usage: None, + stop_reason: None, + }; + let tools2 = parser.process_chunk(&chunk2); + assert_eq!(tools2.len(), 1, "Should detect tool call"); + + // Text buffer should contain the accumulated text + let text = parser.get_text_content(); + assert!( + text.contains("Let me run a command"), + "Should preserve text before tool call: {}", + text + ); + } + + /// CHARACTERIZATION: Parser detects incomplete tool calls + #[test] + fn parser_detects_incomplete_tool_call() { + let mut parser = StreamingToolParser::new(); + let chunk = CompletionChunk { + content: r#"{"tool": "read_file", "args": {"file_path": "test.txt""#.to_string(), + finished: false, + tool_calls: None, + usage: None, + stop_reason: None, + }; + + parser.process_chunk(&chunk); + + // Should detect incomplete tool call (missing closing braces) + assert!( + parser.has_incomplete_tool_call(), + "Should detect incomplete tool call" + ); + } + + /// CHARACTERIZATION: Parser detects unexecuted complete tool calls + #[test] + fn parser_detects_unexecuted_tool_call() { + let mut parser = StreamingToolParser::new(); + let chunk = CompletionChunk { + content: r#"{"tool": "shell", "args": {"command": "ls"}}"#.to_string(), + finished: false, + tool_calls: None, + usage: None, + stop_reason: None, + }; + + // Process but don't execute + let _tools = parser.process_chunk(&chunk); + + // Should detect unexecuted tool call + assert!( + parser.has_unexecuted_tool_call(), + "Should detect unexecuted tool call" + ); + } + + /// CHARACTERIZATION: Marking tools consumed clears unexecuted state + #[test] + fn marking_consumed_clears_unexecuted() { + let mut parser = StreamingToolParser::new(); + let chunk = CompletionChunk { + content: r#"{"tool": "shell", "args": {"command": "ls"}}"#.to_string(), + finished: false, + tool_calls: None, + usage: None, + stop_reason: None, + }; + + let _tools = parser.process_chunk(&chunk); + assert!(parser.has_unexecuted_tool_call()); + + // Mark as consumed (simulating execution) + parser.mark_tool_calls_consumed(); + + assert!( + !parser.has_unexecuted_tool_call(), + "Should not detect unexecuted after marking consumed" + ); + } + + /// CHARACTERIZATION: Reset clears all parser state + #[test] + fn reset_clears_parser_state() { + let mut parser = StreamingToolParser::new(); + let chunk = CompletionChunk { + content: "Some text\n{\"tool\": \"shell\"".to_string(), + finished: false, + tool_calls: None, + usage: None, + stop_reason: None, + }; + + parser.process_chunk(&chunk); + assert!(parser.has_incomplete_tool_call()); + + parser.reset(); + + assert!(!parser.has_incomplete_tool_call()); + assert!(!parser.has_unexecuted_tool_call()); + assert!(parser.get_text_content().is_empty()); + } +} + +// ============================================================================= +// Characterization Tests: Auto-Continue Logic +// ============================================================================= + +mod auto_continue_characterization { + use g3_core::streaming::{should_auto_continue, AutoContinueReason}; + + /// CHARACTERIZATION: Non-autonomous mode never auto-continues + #[test] + fn non_autonomous_never_continues() { + // Even with all conditions true, non-autonomous should not continue + let result = should_auto_continue( + false, // not autonomous + true, // tools executed + true, // incomplete tool call + true, // unexecuted tool call + true, // was truncated + ); + + assert!(result.is_none(), "Non-autonomous mode should never auto-continue"); + } + + /// CHARACTERIZATION: Autonomous mode continues after tool execution + #[test] + fn autonomous_continues_after_tools() { + let result = should_auto_continue( + true, // autonomous + true, // tools executed + false, // no incomplete + false, // no unexecuted + false, // not truncated + ); + + assert_eq!( + result, + Some(AutoContinueReason::ToolsExecuted), + "Should continue after tool execution" + ); + } + + /// CHARACTERIZATION: Incomplete tool call triggers continue + #[test] + fn incomplete_tool_triggers_continue() { + let result = should_auto_continue( + true, // autonomous + false, // no tools executed + true, // incomplete tool call + false, // no unexecuted + false, // not truncated + ); + + assert_eq!( + result, + Some(AutoContinueReason::IncompleteToolCall), + "Should continue on incomplete tool call" + ); + } + + /// CHARACTERIZATION: Unexecuted tool call triggers continue + #[test] + fn unexecuted_tool_triggers_continue() { + let result = should_auto_continue( + true, // autonomous + false, // no tools executed + false, // no incomplete + true, // unexecuted tool call + false, // not truncated + ); + + assert_eq!( + result, + Some(AutoContinueReason::UnexecutedToolCall), + "Should continue on unexecuted tool call" + ); + } + + /// CHARACTERIZATION: Max tokens truncation triggers continue + #[test] + fn truncation_triggers_continue() { + let result = should_auto_continue( + true, // autonomous + false, // no tools executed + false, // no incomplete + false, // no unexecuted + true, // was truncated + ); + + assert_eq!( + result, + Some(AutoContinueReason::MaxTokensTruncation), + "Should continue on truncation" + ); + } + + /// CHARACTERIZATION: Priority order - tools > incomplete > unexecuted > truncated + #[test] + fn priority_order_is_tools_first() { + // When multiple conditions are true, tools executed takes priority + let result = should_auto_continue( + true, // autonomous + true, // tools executed + true, // incomplete tool call + true, // unexecuted tool call + true, // was truncated + ); + + assert_eq!( + result, + Some(AutoContinueReason::ToolsExecuted), + "Tools executed should have highest priority" + ); + } + + /// CHARACTERIZATION: No conditions means no continue + #[test] + fn no_conditions_no_continue() { + let result = should_auto_continue( + true, // autonomous + false, // no tools executed + false, // no incomplete + false, // no unexecuted + false, // not truncated + ); + + assert!(result.is_none(), "Should not continue when no conditions met"); + } +} + +// ============================================================================= +// Characterization Tests: Duplicate Detection +// ============================================================================= + +mod duplicate_detection_characterization { + use g3_core::streaming::deduplicate_tool_calls; + use g3_core::ToolCall; + + fn make_tool_call(tool: &str, args: &str) -> ToolCall { + ToolCall { + tool: tool.to_string(), + args: serde_json::from_str(args).unwrap(), + } + } + + /// CHARACTERIZATION: Sequential duplicates in chunk are detected + #[test] + fn sequential_duplicates_detected() { + let tools = vec![ + make_tool_call("shell", r#"{"command": "ls"}"#), + make_tool_call("shell", r#"{"command": "ls"}"#), // duplicate + ]; + + let result = deduplicate_tool_calls(tools, |_| None); + + assert_eq!(result.len(), 2); + assert!(result[0].1.is_none(), "First call should not be duplicate"); + assert!( + result[1].1.as_ref().map(|s| s.contains("DUP")).unwrap_or(false), + "Second call should be marked as duplicate" + ); + } + + /// CHARACTERIZATION: Different tools are not duplicates + #[test] + fn different_tools_not_duplicates() { + let tools = vec![ + make_tool_call("shell", r#"{"command": "ls"}"#), + make_tool_call("read_file", r#"{"file_path": "test.txt"}"#), + ]; + + let result = deduplicate_tool_calls(tools, |_| None); + + assert!(result[0].1.is_none()); + assert!(result[1].1.is_none(), "Different tools should not be duplicates"); + } + + /// CHARACTERIZATION: Same tool with different args is not duplicate + #[test] + fn same_tool_different_args_not_duplicate() { + let tools = vec![ + make_tool_call("shell", r#"{"command": "ls"}"#), + make_tool_call("shell", r#"{"command": "pwd"}"#), + ]; + + let result = deduplicate_tool_calls(tools, |_| None); + + assert!(result[0].1.is_none()); + assert!(result[1].1.is_none(), "Same tool with different args should not be duplicate"); + } + + /// CHARACTERIZATION: First tool checked against previous message + #[test] + fn first_tool_checked_against_previous() { + let tools = vec![make_tool_call("shell", r#"{"command": "ls"}"#)]; + + // Simulate previous message had same tool call + let result = deduplicate_tool_calls(tools, |tc| { + if tc.tool == "shell" { + Some("DUP IN MSG".to_string()) + } else { + None + } + }); + + assert!( + result[0].1.as_ref().map(|s| s.contains("MSG")).unwrap_or(false), + "Should detect duplicate against previous message" + ); + } + + /// CHARACTERIZATION: Second tool not checked against previous message + #[test] + fn second_tool_not_checked_against_previous() { + let tools = vec![ + make_tool_call("read_file", r#"{"file_path": "a.txt"}"#), + make_tool_call("shell", r#"{"command": "ls"}"#), + ]; + + // Previous message check would mark shell as duplicate + let result = deduplicate_tool_calls(tools, |tc| { + if tc.tool == "shell" { + Some("DUP IN MSG".to_string()) + } else { + None + } + }); + + // First tool checked against previous (not duplicate) + assert!(result[0].1.is_none()); + // Second tool only checked against first in chunk (not previous message) + assert!(result[1].1.is_none(), "Second tool should only check against first in chunk"); + } +} + +// ============================================================================= +// Characterization Tests: Context Window Integration +// ============================================================================= + +mod context_window_characterization { + use g3_core::ContextWindow; + use g3_providers::{Message, MessageRole}; + + /// CHARACTERIZATION: Context window tracks token usage + #[test] + fn context_tracks_tokens() { + let mut ctx = ContextWindow::new(10000); + + let msg = Message::new(MessageRole::User, "Hello, world!".to_string()); + ctx.add_message(msg); + + assert!(ctx.used_tokens > 0, "Should track token usage"); + assert!(ctx.percentage_used() > 0.0, "Should calculate percentage"); + } + + /// CHARACTERIZATION: Should compact at 80% capacity + #[test] + fn should_compact_at_80_percent() { + let mut ctx = ContextWindow::new(1000); + + // Add messages until we're at ~80% + for i in 0..20 { + let msg = Message::new( + MessageRole::User, + format!("Message {} with some content to use tokens", i), + ); + ctx.add_message(msg); + } + + // Check if should_compact triggers around 80% + // Note: This is a characterization - we're capturing current behavior + let percentage = ctx.percentage_used(); + let should_compact = ctx.should_compact(); + + // The exact threshold may vary, but we're documenting the relationship + if percentage >= 80.0 { + assert!( + should_compact, + "Should compact when at {}% (>= 80%)", + percentage + ); + } + } + + /// CHARACTERIZATION: Conversation history is preserved + #[test] + fn conversation_history_preserved() { + let mut ctx = ContextWindow::new(10000); + + ctx.add_message(Message::new(MessageRole::User, "Question 1".to_string())); + ctx.add_message(Message::new( + MessageRole::Assistant, + "Answer 1".to_string(), + )); + ctx.add_message(Message::new(MessageRole::User, "Question 2".to_string())); + + assert_eq!( + ctx.conversation_history.len(), + 3, + "Should preserve all messages" + ); + assert!(ctx.conversation_history[0].content.contains("Question 1")); + assert!(ctx.conversation_history[1].content.contains("Answer 1")); + assert!(ctx.conversation_history[2].content.contains("Question 2")); + } +} + +// ============================================================================= +// Characterization Tests: Tool Execution Through Agent +// ============================================================================= +// These tests use the real Agent with tool execution to characterize +// the end-to-end behavior of stream_completion_with_tools. + +mod tool_execution_integration { + use super::*; + use g3_core::ToolCall; + use std::fs; + + /// Create a test agent in a temporary directory + async fn create_test_agent(temp_dir: &TempDir) -> Agent { + std::env::set_current_dir(temp_dir.path()).unwrap(); + let config = g3_config::Config::default(); + let ui_writer = NullUiWriter; + Agent::new(config, ui_writer).await.unwrap() + } + + /// CHARACTERIZATION: Tool execution adds result to context + #[tokio::test] + #[serial] + async fn tool_execution_updates_context() { + let temp_dir = TempDir::new().unwrap(); + let test_file = temp_dir.path().join("test.txt"); + fs::write(&test_file, "Hello from test file").unwrap(); + + let mut agent = create_test_agent(&temp_dir).await; + + // Execute a tool directly + let tool_call = ToolCall { + tool: "read_file".to_string(), + args: serde_json::json!({ "file_path": test_file.to_string_lossy() }), + }; + + let result = agent.execute_tool(&tool_call).await.unwrap(); + + // Verify tool executed successfully + assert!( + result.contains("Hello from test file"), + "Tool should return file content: {}", + result + ); + } + + /// CHARACTERIZATION: Shell tool captures output + #[tokio::test] + #[serial] + async fn shell_tool_captures_output() { + let temp_dir = TempDir::new().unwrap(); + let mut agent = create_test_agent(&temp_dir).await; + + let tool_call = ToolCall { + tool: "shell".to_string(), + args: serde_json::json!({ "command": "echo 'test output'" }), + }; + + let result = agent.execute_tool(&tool_call).await.unwrap(); + + assert!( + result.contains("test output"), + "Should capture shell output: {}", + result + ); + } + + /// CHARACTERIZATION: Write file creates file and reports success + #[tokio::test] + #[serial] + async fn write_file_creates_and_reports() { + let temp_dir = TempDir::new().unwrap(); + let new_file = temp_dir.path().join("new.txt"); + + let mut agent = create_test_agent(&temp_dir).await; + + let tool_call = ToolCall { + tool: "write_file".to_string(), + args: serde_json::json!({ + "file_path": new_file.to_string_lossy(), + "content": "New content" + }), + }; + + let result = agent.execute_tool(&tool_call).await.unwrap(); + + // File should exist + assert!(new_file.exists(), "File should be created"); + + // Result should indicate success + assert!( + result.contains("✅") || result.to_lowercase().contains("wrote"), + "Should report success: {}", + result + ); + } + + /// CHARACTERIZATION: TODO tools work through agent + #[tokio::test] + #[serial] + async fn todo_tools_work() { + let temp_dir = TempDir::new().unwrap(); + let mut agent = create_test_agent(&temp_dir).await; + + // Write TODO + let write_call = ToolCall { + tool: "todo_write".to_string(), + args: serde_json::json!({ + "content": "- [ ] Test task\n- [x] Done task" + }), + }; + let write_result = agent.execute_tool(&write_call).await.unwrap(); + assert!( + write_result.contains("✅"), + "Write should succeed: {}", + write_result + ); + + // Read TODO + let read_call = ToolCall { + tool: "todo_read".to_string(), + args: serde_json::json!({}), + }; + let read_result = agent.execute_tool(&read_call).await.unwrap(); + assert!( + read_result.contains("Test task"), + "Should read back TODO: {}", + read_result + ); + } +} + +// ============================================================================= +// Characterization Tests: Streaming Utilities +// ============================================================================= + +mod streaming_utilities_characterization { + use g3_core::streaming::{ + clean_llm_tokens, format_duration, is_empty_response, truncate_for_display, + }; + use std::time::Duration; + + /// CHARACTERIZATION: LLM tokens are cleaned from output + #[test] + fn llm_tokens_cleaned() { + assert_eq!(clean_llm_tokens("hello<|im_end|>"), "hello"); + assert_eq!(clean_llm_tokens("testmore"), "testmore"); + assert_eq!(clean_llm_tokens("[/INST]response"), "response"); + assert_eq!(clean_llm_tokens("normal text"), "normal text"); + } + + /// CHARACTERIZATION: Duration formatting + #[test] + fn duration_formatting() { + assert_eq!(format_duration(Duration::from_millis(500)), "500ms"); + assert_eq!(format_duration(Duration::from_millis(1500)), "1.5s"); + assert_eq!(format_duration(Duration::from_secs(90)), "1m 30.0s"); + } + + /// CHARACTERIZATION: Empty response detection + #[test] + fn empty_response_detection() { + assert!(is_empty_response("")); + assert!(is_empty_response(" \n ")); + assert!(is_empty_response("⏱️ 1.5s")); + assert!(!is_empty_response("Hello")); + assert!(!is_empty_response("Some actual content")); + } + + /// CHARACTERIZATION: Display truncation + #[test] + fn display_truncation() { + assert_eq!(truncate_for_display("short", 10), "short"); + assert_eq!(truncate_for_display("this is long", 5), "this ..."); + // Multi-line uses first line only + assert_eq!( + truncate_for_display("first line\nsecond line", 20), + "first line" + ); + } +} + +// ============================================================================= +// Characterization Tests: Parser Sanitization +// ============================================================================= + +mod parser_sanitization_characterization { + use g3_core::{sanitize_inline_tool_patterns, LBRACE_HOMOGLYPH}; + + /// CHARACTERIZATION: Standalone tool calls are not sanitized + #[test] + fn standalone_tool_calls_preserved() { + let input = r#"{"tool": "shell", "args": {}}"#; + let output = sanitize_inline_tool_patterns(input); + assert_eq!(output, input, "Standalone tool call should be preserved"); + } + + /// CHARACTERIZATION: Inline tool patterns are sanitized + #[test] + fn inline_patterns_sanitized() { + let input = r#"Example: {"tool": "shell"} in text"#; + let output = sanitize_inline_tool_patterns(input); + assert!( + output.contains(LBRACE_HOMOGLYPH), + "Inline pattern should be sanitized: {}", + output + ); + assert!( + !output.starts_with('{'), + "Should not start with regular brace" + ); + } + + /// CHARACTERIZATION: Tool call on its own line is preserved + #[test] + fn tool_call_on_own_line_preserved() { + let input = "Some text\n{\"tool\": \"shell\"}\nMore text"; + let output = sanitize_inline_tool_patterns(input); + // The tool call line should be preserved + assert!( + output.contains("{\"tool\""), + "Tool call on own line should be preserved: {}", + output + ); + } +}