Add characterization tests for stream_completion_with_tools

Add 32 blackbox characterization tests to lock down the behavior of the stream_completion_with_tools function (1067 lines) before refactoring. Tests cover key behaviors through stable boundaries: - StreamingToolParser: tool call detection, incomplete detection, text accumulation - Auto-continue logic: autonomous mode decisions, priority ordering - Duplicate detection: sequential duplicates, cross-message duplicates - Context window: token tracking, compaction threshold, history preservation - Tool execution: read_file, shell, write_file, todo tools through Agent - Streaming utilities: LLM token cleaning, duration formatting, truncation - Parser sanitization: inline tool pattern handling, homoglyph replacement These tests intentionally do NOT assert: - Internal parser state or implementation details - Specific timing values - UI output formatting - Provider-specific behavior Agent: hopper
2026-01-13 16:25:33 +05:30
parent 47e3a88cf6
commit b89d55a9ff
1 changed files with 713 additions and 0 deletions
--- a/crates/g3-core/tests/stream_completion_characterization_test.rs
+++ b/crates/g3-core/tests/stream_completion_characterization_test.rs
@@ -0,0 +1,713 @@
+//! Characterization Tests for stream_completion_with_tools
+//!
+//! CHARACTERIZATION: These tests capture the current behavior of the streaming
+//! completion loop. They serve as a safety net for refactoring by locking down
+//! observable behavior through the stable Agent interface.
+//!
+//! What these tests protect:
+//! - Tool calls are detected, executed, and results added to context
+//! - Auto-continue behavior in autonomous mode after tool execution
+//! - Duplicate tool call detection (sequential duplicates skipped)
+//! - Incomplete tool call detection triggers auto-continue
+//! - Context window updates reflect tool execution
+//! - Streaming parser behavior for JSON tool call detection
+//!
+//! What these tests intentionally do NOT assert:
+//! - Internal parser state or implementation details
+//! - Specific timing values (only that timing info is present when requested)
+//! - UI output formatting (uses NullUiWriter)
+//! - Provider-specific behavior
+//!
+//! Surface: Agent::execute_tool() for tool execution, streaming parser for parsing
+//! Boundary: Tool execution boundary, streaming parser boundary
+
+use g3_core::ui_writer::NullUiWriter;
+use g3_core::Agent;
+use serial_test::serial;
+use tempfile::TempDir;
+
+// =============================================================================
+// Characterization Tests: StreamingToolParser Behavior
+// =============================================================================
+// These tests characterize the streaming parser's behavior which is a key
+// component of stream_completion_with_tools.
+
+mod streaming_parser_characterization {
+    use g3_core::StreamingToolParser;
+    use g3_providers::CompletionChunk;
+
+    /// CHARACTERIZATION: Parser detects complete JSON tool calls in text
+    #[test]
+    fn parser_detects_complete_tool_call() {
+        let mut parser = StreamingToolParser::new();
+        let chunk = CompletionChunk {
+            content: r#"{"tool": "shell", "args": {"command": "ls"}}"#.to_string(),
+            finished: false,
+            tool_calls: None,
+            usage: None,
+            stop_reason: None,
+        };
+
+        let tools = parser.process_chunk(&chunk);
+
+        // Should detect one tool call
+        assert_eq!(tools.len(), 1, "Should detect exactly one tool call");
+        assert_eq!(tools[0].tool, "shell");
+    }
+
+    /// CHARACTERIZATION: Parser accumulates text before tool call
+    #[test]
+    fn parser_accumulates_text_before_tool() {
+        let mut parser = StreamingToolParser::new();
+
+        // First chunk: just text
+        let chunk1 = CompletionChunk {
+            content: "Let me run a command.\n\n".to_string(),
+            finished: false,
+            tool_calls: None,
+            usage: None,
+            stop_reason: None,
+        };
+        let tools1 = parser.process_chunk(&chunk1);
+        assert!(tools1.is_empty(), "No tool call yet");
+
+        // Second chunk: tool call
+        let chunk2 = CompletionChunk {
+            content: r#"{"tool": "shell", "args": {"command": "pwd"}}"#.to_string(),
+            finished: false,
+            tool_calls: None,
+            usage: None,
+            stop_reason: None,
+        };
+        let tools2 = parser.process_chunk(&chunk2);
+        assert_eq!(tools2.len(), 1, "Should detect tool call");
+
+        // Text buffer should contain the accumulated text
+        let text = parser.get_text_content();
+        assert!(
+            text.contains("Let me run a command"),
+            "Should preserve text before tool call: {}",
+            text
+        );
+    }
+
+    /// CHARACTERIZATION: Parser detects incomplete tool calls
+    #[test]
+    fn parser_detects_incomplete_tool_call() {
+        let mut parser = StreamingToolParser::new();
+        let chunk = CompletionChunk {
+            content: r#"{"tool": "read_file", "args": {"file_path": "test.txt""#.to_string(),
+            finished: false,
+            tool_calls: None,
+            usage: None,
+            stop_reason: None,
+        };
+
+        parser.process_chunk(&chunk);
+
+        // Should detect incomplete tool call (missing closing braces)
+        assert!(
+            parser.has_incomplete_tool_call(),
+            "Should detect incomplete tool call"
+        );
+    }
+
+    /// CHARACTERIZATION: Parser detects unexecuted complete tool calls
+    #[test]
+    fn parser_detects_unexecuted_tool_call() {
+        let mut parser = StreamingToolParser::new();
+        let chunk = CompletionChunk {
+            content: r#"{"tool": "shell", "args": {"command": "ls"}}"#.to_string(),
+            finished: false,
+            tool_calls: None,
+            usage: None,
+            stop_reason: None,
+        };
+
+        // Process but don't execute
+        let _tools = parser.process_chunk(&chunk);
+
+        // Should detect unexecuted tool call
+        assert!(
+            parser.has_unexecuted_tool_call(),
+            "Should detect unexecuted tool call"
+        );
+    }
+
+    /// CHARACTERIZATION: Marking tools consumed clears unexecuted state
+    #[test]
+    fn marking_consumed_clears_unexecuted() {
+        let mut parser = StreamingToolParser::new();
+        let chunk = CompletionChunk {
+            content: r#"{"tool": "shell", "args": {"command": "ls"}}"#.to_string(),
+            finished: false,
+            tool_calls: None,
+            usage: None,
+            stop_reason: None,
+        };
+
+        let _tools = parser.process_chunk(&chunk);
+        assert!(parser.has_unexecuted_tool_call());
+
+        // Mark as consumed (simulating execution)
+        parser.mark_tool_calls_consumed();
+
+        assert!(
+            !parser.has_unexecuted_tool_call(),
+            "Should not detect unexecuted after marking consumed"
+        );
+    }
+
+    /// CHARACTERIZATION: Reset clears all parser state
+    #[test]
+    fn reset_clears_parser_state() {
+        let mut parser = StreamingToolParser::new();
+        let chunk = CompletionChunk {
+            content: "Some text\n{\"tool\": \"shell\"".to_string(),
+            finished: false,
+            tool_calls: None,
+            usage: None,
+            stop_reason: None,
+        };
+
+        parser.process_chunk(&chunk);
+        assert!(parser.has_incomplete_tool_call());
+
+        parser.reset();
+
+        assert!(!parser.has_incomplete_tool_call());
+        assert!(!parser.has_unexecuted_tool_call());
+        assert!(parser.get_text_content().is_empty());
+    }
+}
+
+// =============================================================================
+// Characterization Tests: Auto-Continue Logic
+// =============================================================================
+
+mod auto_continue_characterization {
+    use g3_core::streaming::{should_auto_continue, AutoContinueReason};
+
+    /// CHARACTERIZATION: Non-autonomous mode never auto-continues
+    #[test]
+    fn non_autonomous_never_continues() {
+        // Even with all conditions true, non-autonomous should not continue
+        let result = should_auto_continue(
+            false, // not autonomous
+            true,  // tools executed
+            true,  // incomplete tool call
+            true,  // unexecuted tool call
+            true,  // was truncated
+        );
+
+        assert!(result.is_none(), "Non-autonomous mode should never auto-continue");
+    }
+
+    /// CHARACTERIZATION: Autonomous mode continues after tool execution
+    #[test]
+    fn autonomous_continues_after_tools() {
+        let result = should_auto_continue(
+            true,  // autonomous
+            true,  // tools executed
+            false, // no incomplete
+            false, // no unexecuted
+            false, // not truncated
+        );
+
+        assert_eq!(
+            result,
+            Some(AutoContinueReason::ToolsExecuted),
+            "Should continue after tool execution"
+        );
+    }
+
+    /// CHARACTERIZATION: Incomplete tool call triggers continue
+    #[test]
+    fn incomplete_tool_triggers_continue() {
+        let result = should_auto_continue(
+            true,  // autonomous
+            false, // no tools executed
+            true,  // incomplete tool call
+            false, // no unexecuted
+            false, // not truncated
+        );
+
+        assert_eq!(
+            result,
+            Some(AutoContinueReason::IncompleteToolCall),
+            "Should continue on incomplete tool call"
+        );
+    }
+
+    /// CHARACTERIZATION: Unexecuted tool call triggers continue
+    #[test]
+    fn unexecuted_tool_triggers_continue() {
+        let result = should_auto_continue(
+            true,  // autonomous
+            false, // no tools executed
+            false, // no incomplete
+            true,  // unexecuted tool call
+            false, // not truncated
+        );
+
+        assert_eq!(
+            result,
+            Some(AutoContinueReason::UnexecutedToolCall),
+            "Should continue on unexecuted tool call"
+        );
+    }
+
+    /// CHARACTERIZATION: Max tokens truncation triggers continue
+    #[test]
+    fn truncation_triggers_continue() {
+        let result = should_auto_continue(
+            true,  // autonomous
+            false, // no tools executed
+            false, // no incomplete
+            false, // no unexecuted
+            true,  // was truncated
+        );
+
+        assert_eq!(
+            result,
+            Some(AutoContinueReason::MaxTokensTruncation),
+            "Should continue on truncation"
+        );
+    }
+
+    /// CHARACTERIZATION: Priority order - tools > incomplete > unexecuted > truncated
+    #[test]
+    fn priority_order_is_tools_first() {
+        // When multiple conditions are true, tools executed takes priority
+        let result = should_auto_continue(
+            true, // autonomous
+            true, // tools executed
+            true, // incomplete tool call
+            true, // unexecuted tool call
+            true, // was truncated
+        );
+
+        assert_eq!(
+            result,
+            Some(AutoContinueReason::ToolsExecuted),
+            "Tools executed should have highest priority"
+        );
+    }
+
+    /// CHARACTERIZATION: No conditions means no continue
+    #[test]
+    fn no_conditions_no_continue() {
+        let result = should_auto_continue(
+            true,  // autonomous
+            false, // no tools executed
+            false, // no incomplete
+            false, // no unexecuted
+            false, // not truncated
+        );
+
+        assert!(result.is_none(), "Should not continue when no conditions met");
+    }
+}
+
+// =============================================================================
+// Characterization Tests: Duplicate Detection
+// =============================================================================
+
+mod duplicate_detection_characterization {
+    use g3_core::streaming::deduplicate_tool_calls;
+    use g3_core::ToolCall;
+
+    fn make_tool_call(tool: &str, args: &str) -> ToolCall {
+        ToolCall {
+            tool: tool.to_string(),
+            args: serde_json::from_str(args).unwrap(),
+        }
+    }
+
+    /// CHARACTERIZATION: Sequential duplicates in chunk are detected
+    #[test]
+    fn sequential_duplicates_detected() {
+        let tools = vec![
+            make_tool_call("shell", r#"{"command": "ls"}"#),
+            make_tool_call("shell", r#"{"command": "ls"}"#), // duplicate
+        ];
+
+        let result = deduplicate_tool_calls(tools, |_| None);
+
+        assert_eq!(result.len(), 2);
+        assert!(result[0].1.is_none(), "First call should not be duplicate");
+        assert!(
+            result[1].1.as_ref().map(|s| s.contains("DUP")).unwrap_or(false),
+            "Second call should be marked as duplicate"
+        );
+    }
+
+    /// CHARACTERIZATION: Different tools are not duplicates
+    #[test]
+    fn different_tools_not_duplicates() {
+        let tools = vec![
+            make_tool_call("shell", r#"{"command": "ls"}"#),
+            make_tool_call("read_file", r#"{"file_path": "test.txt"}"#),
+        ];
+
+        let result = deduplicate_tool_calls(tools, |_| None);
+
+        assert!(result[0].1.is_none());
+        assert!(result[1].1.is_none(), "Different tools should not be duplicates");
+    }
+
+    /// CHARACTERIZATION: Same tool with different args is not duplicate
+    #[test]
+    fn same_tool_different_args_not_duplicate() {
+        let tools = vec![
+            make_tool_call("shell", r#"{"command": "ls"}"#),
+            make_tool_call("shell", r#"{"command": "pwd"}"#),
+        ];
+
+        let result = deduplicate_tool_calls(tools, |_| None);
+
+        assert!(result[0].1.is_none());
+        assert!(result[1].1.is_none(), "Same tool with different args should not be duplicate");
+    }
+
+    /// CHARACTERIZATION: First tool checked against previous message
+    #[test]
+    fn first_tool_checked_against_previous() {
+        let tools = vec![make_tool_call("shell", r#"{"command": "ls"}"#)];
+
+        // Simulate previous message had same tool call
+        let result = deduplicate_tool_calls(tools, |tc| {
+            if tc.tool == "shell" {
+                Some("DUP IN MSG".to_string())
+            } else {
+                None
+            }
+        });
+
+        assert!(
+            result[0].1.as_ref().map(|s| s.contains("MSG")).unwrap_or(false),
+            "Should detect duplicate against previous message"
+        );
+    }
+
+    /// CHARACTERIZATION: Second tool not checked against previous message
+    #[test]
+    fn second_tool_not_checked_against_previous() {
+        let tools = vec![
+            make_tool_call("read_file", r#"{"file_path": "a.txt"}"#),
+            make_tool_call("shell", r#"{"command": "ls"}"#),
+        ];
+
+        // Previous message check would mark shell as duplicate
+        let result = deduplicate_tool_calls(tools, |tc| {
+            if tc.tool == "shell" {
+                Some("DUP IN MSG".to_string())
+            } else {
+                None
+            }
+        });
+
+        // First tool checked against previous (not duplicate)
+        assert!(result[0].1.is_none());
+        // Second tool only checked against first in chunk (not previous message)
+        assert!(result[1].1.is_none(), "Second tool should only check against first in chunk");
+    }
+}
+
+// =============================================================================
+// Characterization Tests: Context Window Integration
+// =============================================================================
+
+mod context_window_characterization {
+    use g3_core::ContextWindow;
+    use g3_providers::{Message, MessageRole};
+
+    /// CHARACTERIZATION: Context window tracks token usage
+    #[test]
+    fn context_tracks_tokens() {
+        let mut ctx = ContextWindow::new(10000);
+
+        let msg = Message::new(MessageRole::User, "Hello, world!".to_string());
+        ctx.add_message(msg);
+
+        assert!(ctx.used_tokens > 0, "Should track token usage");
+        assert!(ctx.percentage_used() > 0.0, "Should calculate percentage");
+    }
+
+    /// CHARACTERIZATION: Should compact at 80% capacity
+    #[test]
+    fn should_compact_at_80_percent() {
+        let mut ctx = ContextWindow::new(1000);
+
+        // Add messages until we're at ~80%
+        for i in 0..20 {
+            let msg = Message::new(
+                MessageRole::User,
+                format!("Message {} with some content to use tokens", i),
+            );
+            ctx.add_message(msg);
+        }
+
+        // Check if should_compact triggers around 80%
+        // Note: This is a characterization - we're capturing current behavior
+        let percentage = ctx.percentage_used();
+        let should_compact = ctx.should_compact();
+
+        // The exact threshold may vary, but we're documenting the relationship
+        if percentage >= 80.0 {
+            assert!(
+                should_compact,
+                "Should compact when at {}% (>= 80%)",
+                percentage
+            );
+        }
+    }
+
+    /// CHARACTERIZATION: Conversation history is preserved
+    #[test]
+    fn conversation_history_preserved() {
+        let mut ctx = ContextWindow::new(10000);
+
+        ctx.add_message(Message::new(MessageRole::User, "Question 1".to_string()));
+        ctx.add_message(Message::new(
+            MessageRole::Assistant,
+            "Answer 1".to_string(),
+        ));
+        ctx.add_message(Message::new(MessageRole::User, "Question 2".to_string()));
+
+        assert_eq!(
+            ctx.conversation_history.len(),
+            3,
+            "Should preserve all messages"
+        );
+        assert!(ctx.conversation_history[0].content.contains("Question 1"));
+        assert!(ctx.conversation_history[1].content.contains("Answer 1"));
+        assert!(ctx.conversation_history[2].content.contains("Question 2"));
+    }
+}
+
+// =============================================================================
+// Characterization Tests: Tool Execution Through Agent
+// =============================================================================
+// These tests use the real Agent with tool execution to characterize
+// the end-to-end behavior of stream_completion_with_tools.
+
+mod tool_execution_integration {
+    use super::*;
+    use g3_core::ToolCall;
+    use std::fs;
+
+    /// Create a test agent in a temporary directory
+    async fn create_test_agent(temp_dir: &TempDir) -> Agent<NullUiWriter> {
+        std::env::set_current_dir(temp_dir.path()).unwrap();
+        let config = g3_config::Config::default();
+        let ui_writer = NullUiWriter;
+        Agent::new(config, ui_writer).await.unwrap()
+    }
+
+    /// CHARACTERIZATION: Tool execution adds result to context
+    #[tokio::test]
+    #[serial]
+    async fn tool_execution_updates_context() {
+        let temp_dir = TempDir::new().unwrap();
+        let test_file = temp_dir.path().join("test.txt");
+        fs::write(&test_file, "Hello from test file").unwrap();
+
+        let mut agent = create_test_agent(&temp_dir).await;
+
+        // Execute a tool directly
+        let tool_call = ToolCall {
+            tool: "read_file".to_string(),
+            args: serde_json::json!({ "file_path": test_file.to_string_lossy() }),
+        };
+
+        let result = agent.execute_tool(&tool_call).await.unwrap();
+
+        // Verify tool executed successfully
+        assert!(
+            result.contains("Hello from test file"),
+            "Tool should return file content: {}",
+            result
+        );
+    }
+
+    /// CHARACTERIZATION: Shell tool captures output
+    #[tokio::test]
+    #[serial]
+    async fn shell_tool_captures_output() {
+        let temp_dir = TempDir::new().unwrap();
+        let mut agent = create_test_agent(&temp_dir).await;
+
+        let tool_call = ToolCall {
+            tool: "shell".to_string(),
+            args: serde_json::json!({ "command": "echo 'test output'" }),
+        };
+
+        let result = agent.execute_tool(&tool_call).await.unwrap();
+
+        assert!(
+            result.contains("test output"),
+            "Should capture shell output: {}",
+            result
+        );
+    }
+
+    /// CHARACTERIZATION: Write file creates file and reports success
+    #[tokio::test]
+    #[serial]
+    async fn write_file_creates_and_reports() {
+        let temp_dir = TempDir::new().unwrap();
+        let new_file = temp_dir.path().join("new.txt");
+
+        let mut agent = create_test_agent(&temp_dir).await;
+
+        let tool_call = ToolCall {
+            tool: "write_file".to_string(),
+            args: serde_json::json!({
+                "file_path": new_file.to_string_lossy(),
+                "content": "New content"
+            }),
+        };
+
+        let result = agent.execute_tool(&tool_call).await.unwrap();
+
+        // File should exist
+        assert!(new_file.exists(), "File should be created");
+
+        // Result should indicate success
+        assert!(
+            result.contains("✅") || result.to_lowercase().contains("wrote"),
+            "Should report success: {}",
+            result
+        );
+    }
+
+    /// CHARACTERIZATION: TODO tools work through agent
+    #[tokio::test]
+    #[serial]
+    async fn todo_tools_work() {
+        let temp_dir = TempDir::new().unwrap();
+        let mut agent = create_test_agent(&temp_dir).await;
+
+        // Write TODO
+        let write_call = ToolCall {
+            tool: "todo_write".to_string(),
+            args: serde_json::json!({
+                "content": "- [ ] Test task\n- [x] Done task"
+            }),
+        };
+        let write_result = agent.execute_tool(&write_call).await.unwrap();
+        assert!(
+            write_result.contains("✅"),
+            "Write should succeed: {}",
+            write_result
+        );
+
+        // Read TODO
+        let read_call = ToolCall {
+            tool: "todo_read".to_string(),
+            args: serde_json::json!({}),
+        };
+        let read_result = agent.execute_tool(&read_call).await.unwrap();
+        assert!(
+            read_result.contains("Test task"),
+            "Should read back TODO: {}",
+            read_result
+        );
+    }
+}
+
+// =============================================================================
+// Characterization Tests: Streaming Utilities
+// =============================================================================
+
+mod streaming_utilities_characterization {
+    use g3_core::streaming::{
+        clean_llm_tokens, format_duration, is_empty_response, truncate_for_display,
+    };
+    use std::time::Duration;
+
+    /// CHARACTERIZATION: LLM tokens are cleaned from output
+    #[test]
+    fn llm_tokens_cleaned() {
+        assert_eq!(clean_llm_tokens("hello<|im_end|>"), "hello");
+        assert_eq!(clean_llm_tokens("test</s>more"), "testmore");
+        assert_eq!(clean_llm_tokens("[/INST]response"), "response");
+        assert_eq!(clean_llm_tokens("normal text"), "normal text");
+    }
+
+    /// CHARACTERIZATION: Duration formatting
+    #[test]
+    fn duration_formatting() {
+        assert_eq!(format_duration(Duration::from_millis(500)), "500ms");
+        assert_eq!(format_duration(Duration::from_millis(1500)), "1.5s");
+        assert_eq!(format_duration(Duration::from_secs(90)), "1m 30.0s");
+    }
+
+    /// CHARACTERIZATION: Empty response detection
+    #[test]
+    fn empty_response_detection() {
+        assert!(is_empty_response(""));
+        assert!(is_empty_response("   \n  "));
+        assert!(is_empty_response("⏱️ 1.5s"));
+        assert!(!is_empty_response("Hello"));
+        assert!(!is_empty_response("Some actual content"));
+    }
+
+    /// CHARACTERIZATION: Display truncation
+    #[test]
+    fn display_truncation() {
+        assert_eq!(truncate_for_display("short", 10), "short");
+        assert_eq!(truncate_for_display("this is long", 5), "this ...");
+        // Multi-line uses first line only
+        assert_eq!(
+            truncate_for_display("first line\nsecond line", 20),
+            "first line"
+        );
+    }
+}
+
+// =============================================================================
+// Characterization Tests: Parser Sanitization
+// =============================================================================
+
+mod parser_sanitization_characterization {
+    use g3_core::{sanitize_inline_tool_patterns, LBRACE_HOMOGLYPH};
+
+    /// CHARACTERIZATION: Standalone tool calls are not sanitized
+    #[test]
+    fn standalone_tool_calls_preserved() {
+        let input = r#"{"tool": "shell", "args": {}}"#;
+        let output = sanitize_inline_tool_patterns(input);
+        assert_eq!(output, input, "Standalone tool call should be preserved");
+    }
+
+    /// CHARACTERIZATION: Inline tool patterns are sanitized
+    #[test]
+    fn inline_patterns_sanitized() {
+        let input = r#"Example: {"tool": "shell"} in text"#;
+        let output = sanitize_inline_tool_patterns(input);
+        assert!(
+            output.contains(LBRACE_HOMOGLYPH),
+            "Inline pattern should be sanitized: {}",
+            output
+        );
+        assert!(
+            !output.starts_with('{'),
+            "Should not start with regular brace"
+        );
+    }
+
+    /// CHARACTERIZATION: Tool call on its own line is preserved
+    #[test]
+    fn tool_call_on_own_line_preserved() {
+        let input = "Some text\n{\"tool\": \"shell\"}\nMore text";
+        let output = sanitize_inline_tool_patterns(input);
+        // The tool call line should be preserved
+        assert!(
+            output.contains("{\"tool\""),
+            "Tool call on own line should be preserved: {}",
+            output
+        );
+    }
+}