From 0aa1287ca6fec080175611be715d8f2f26dc94e4 Mon Sep 17 00:00:00 2001 From: "Dhanji R. Prasanna" Date: Sat, 10 Jan 2026 13:43:04 +1100 Subject: [PATCH] Remove final_output tool and improve scout report handback final_output removal: - Remove final_output from tool definitions and dispatch - Update system prompts to request summaries as regular text - Remove final_output_called field from StreamingState - Update auto_continue tests to remove final_output_called parameter - Remove final_output test from tool_execution_test.rs - Update planner and flock prompts to not reference final_output - Keep backwards-compat code in feedback_extraction.rs and task_result.rs Scout report handback: - Change from file-based to delimiter-based report extraction - Scout outputs report between ---SCOUT_REPORT_START/END--- markers - Research tool extracts content between markers, strips ANSI codes - Add comprehensive tests for extraction and ANSI stripping 657 tests pass. --- agents/scout.md | 56 +++++- crates/g3-cli/src/lib.rs | 4 +- crates/g3-core/src/streaming.rs | 2 - crates/g3-core/src/tools/research.rs | 198 +++++++++++++++++--- crates/g3-core/tests/auto_continue_test.rs | 54 ++---- crates/g3-core/tests/tool_execution_test.rs | 21 --- crates/g3-ensembles/src/flock.rs | 2 +- crates/g3-planner/src/llm.rs | 2 +- crates/g3-planner/src/planner.rs | 3 +- 9 files changed, 247 insertions(+), 95 deletions(-) diff --git a/agents/scout.md b/agents/scout.md index dfe098e..f87b365 100644 --- a/agents/scout.md +++ b/agents/scout.md @@ -37,7 +37,7 @@ Short bullets comparing the options where it matters. ### Recommendation (Optional) If one option is clearly dominant, state it. -If not, say “No clear default.” +If not, say "No clear default." ### Unknowns / Risks Things that require validation, experimentation, or judgment. @@ -46,7 +46,56 @@ Things that require validation, experimentation, or judgment. Links only (titles + URLs). Brief quotes or snippets if relevant to decision making. No page dumps. -Write this brief out to a temporary file and write out the full path of the filename as your VERY LAST LINE of output. +**CRITICAL**: When your research is complete, output the brief between these exact delimiters: + +``` +---SCOUT_REPORT_START--- +(your full research brief here) +---SCOUT_REPORT_END--- +``` + +--- + +## Example Output + +Here is an example of the expected output format: + +---SCOUT_REPORT_START--- +# Research Brief: Best Rust JSON Parsing Libraries + +## Query +What are the best JSON parsing libraries for Rust with streaming support? + +## Options + +### 1. **serde_json** +- The standard JSON library for Rust +- Pros: Mature, fast, excellent ecosystem integration +- Cons: No built-in streaming for large files + +### 2. **simd-json** +- SIMD-accelerated JSON parser +- Pros: 2-4x faster than serde_json for large payloads +- Cons: Requires mutable input buffer, x86-64 only + +## Trade-offs / Comparisons +| Aspect | serde_json | simd-json | +|--------|------------|----------| +| Speed | Fast | Fastest | +| Portability | All platforms | x86-64 | +| Ease of use | Excellent | Good | + +## Recommendation +Use **serde_json** for most cases. Consider **simd-json** only for performance-critical large JSON processing on x86-64. + +## Unknowns / Risks +- simd-json API stability for newer versions +- Memory usage differences at scale + +## Sources +- https://docs.rs/serde_json +- https://github.com/simd-lite/simd-json +---SCOUT_REPORT_END--- --- @@ -71,7 +120,7 @@ If nothing useful exists, say so explicitly and back this up with evidence. - Optimize for *your user* making a decision, not for completeness. You are allowed to say: -> “This exists but is immature / fragile / not worth it.” +> "This exists but is immature / fragile / not worth it." --- @@ -91,6 +140,7 @@ You succeed if: - The reader can decide what to try or ignore in under 5 minutes. - The brief is calm, bounded, and opinionated where justified. - No context bloat is introduced. +- **The report is wrapped in the exact delimiters shown above.** If nothing meets the bar, saying so is OK. diff --git a/crates/g3-cli/src/lib.rs b/crates/g3-cli/src/lib.rs index f872a06..54bb47d 100644 --- a/crates/g3-cli/src/lib.rs +++ b/crates/g3-cli/src/lib.rs @@ -2789,7 +2789,7 @@ Remember: Be clear in your review and concise in your feedback. APPROVE iff the // We have a valid coach result, process it let coach_result = coach_result_opt.unwrap(); - // Extract the complete coach feedback from final_output + // Extract the complete coach feedback from the response let coach_feedback_text = extract_coach_feedback_from_logs(&coach_result, &coach_agent, &output)?; @@ -2800,7 +2800,7 @@ Remember: Be clear in your review and concise in your feedback. APPROVE iff the coach_result.response.len() ); - // Check if we got empty feedback (this can happen if the coach doesn't call final_output) + // Check if we got empty feedback (this can happen if the coach doesn't provide substantive feedback) if coach_feedback_text.is_empty() { output.print("⚠️ Coach did not provide feedback. This may be a model issue."); coach_feedback = "The implementation needs review. Please ensure all requirements are met and the code compiles without errors.".to_string(); diff --git a/crates/g3-core/src/streaming.rs b/crates/g3-core/src/streaming.rs index e4cb2a9..7428341 100644 --- a/crates/g3-core/src/streaming.rs +++ b/crates/g3-core/src/streaming.rs @@ -22,7 +22,6 @@ pub struct StreamingState { pub response_started: bool, pub any_tool_executed: bool, pub auto_summary_attempts: usize, - pub final_output_called: bool, pub turn_accumulated_usage: Option, } @@ -36,7 +35,6 @@ impl StreamingState { response_started: false, any_tool_executed: false, auto_summary_attempts: 0, - final_output_called: false, turn_accumulated_usage: None, } } diff --git a/crates/g3-core/src/tools/research.rs b/crates/g3-core/src/tools/research.rs index fb6bc02..424aa08 100644 --- a/crates/g3-core/src/tools/research.rs +++ b/crates/g3-core/src/tools/research.rs @@ -10,12 +10,16 @@ use crate::ToolCall; use super::executor::ToolContext; +/// Delimiter markers for scout report extraction +const REPORT_START_MARKER: &str = "---SCOUT_REPORT_START---"; +const REPORT_END_MARKER: &str = "---SCOUT_REPORT_END---"; + /// Execute the research tool by spawning a scout agent. /// /// This tool: /// 1. Spawns `g3 --agent scout` with the query -/// 2. Captures stdout and extracts the last line (file path to report) -/// 3. Reads the report file and returns its contents +/// 2. Captures stdout and extracts the report between delimiter markers +/// 3. Returns the report content directly pub async fn execute_research( tool_call: &ToolCall, ctx: &mut ToolContext<'_, W>, @@ -46,20 +50,20 @@ pub async fn execute_research( .spawn() .map_err(|e| anyhow::anyhow!("Failed to spawn scout agent: {}", e))?; - // Capture stdout to find the report file path + // Capture stdout to find the report content let stdout = child.stdout.take() .ok_or_else(|| anyhow::anyhow!("Failed to capture scout agent stdout"))?; let mut reader = BufReader::new(stdout).lines(); - let mut last_line = String::new(); + let mut all_output = Vec::new(); // Print a header for the scout output - ctx.ui_writer.println("\n📡 Scout agent output:"); + ctx.ui_writer.println("\n📡 Scout agent researching..."); - // Stream all lines to UI, keeping track of the last one for the report path + // Collect all lines while let Some(line) = reader.next_line().await? { ctx.ui_writer.println(&format!(" {}", line)); - last_line = line; + all_output.push(line); } // Wait for the process to complete @@ -70,31 +74,167 @@ pub async fn execute_research( return Ok(format!("❌ Scout agent failed with exit code: {:?}", status.code())); } - // The last line should be the path to the report file - let report_path = last_line.trim(); + // Join all output and extract the report between markers + let full_output = all_output.join("\n"); - if report_path.is_empty() { - return Ok("❌ Scout agent did not output a report file path".to_string()); + extract_report(&full_output) +} + +/// Extract the research report from scout output. +/// +/// Looks for content between SCOUT_REPORT_START and SCOUT_REPORT_END markers. +/// Strips ANSI escape codes from the extracted content. +fn extract_report(output: &str) -> Result { + // Strip ANSI codes from the entire output first + let clean_output = strip_ansi_codes(output); + + // Find the start marker + let start_pos = clean_output.find(REPORT_START_MARKER) + .ok_or_else(|| anyhow::anyhow!( + "Scout agent did not output a properly formatted report. Expected {} marker.", + REPORT_START_MARKER + ))?; + + // Find the end marker + let end_pos = clean_output.find(REPORT_END_MARKER) + .ok_or_else(|| anyhow::anyhow!( + "Scout agent report is incomplete. Expected {} marker.", + REPORT_END_MARKER + ))?; + + if end_pos <= start_pos { + return Err(anyhow::anyhow!("Invalid report format: end marker before start marker")); + } + + // Extract content between markers + let report_start = start_pos + REPORT_START_MARKER.len(); + let report_content = clean_output[report_start..end_pos].trim(); + + if report_content.is_empty() { + return Ok("❌ Scout agent returned an empty report.".to_string()); + } + + Ok(format!("📋 Research Report:\n\n{}", report_content)) +} + +/// Strip ANSI escape codes from a string. +/// +/// Handles common ANSI sequences like: +/// - CSI sequences: \x1b[...m (colors, styles) +/// - OSC sequences: \x1b]...\x07 (terminal titles, etc.) +fn strip_ansi_codes(s: &str) -> String { + let mut result = String::with_capacity(s.len()); + let mut chars = s.chars().peekable(); + + while let Some(c) = chars.next() { + if c == '\x1b' { + // Start of escape sequence + match chars.peek() { + Some('[') => { + // CSI sequence: \x1b[...X where X is a letter + chars.next(); // consume '[' + while let Some(&next) = chars.peek() { + chars.next(); + if next.is_ascii_alphabetic() { + break; + } + } + } + Some(']') => { + // OSC sequence: \x1b]...\x07 + chars.next(); // consume ']' + while let Some(&next) = chars.peek() { + chars.next(); + if next == '\x07' { + break; + } + } + } + _ => { + // Unknown escape, skip just the ESC + } + } + } else { + result.push(c); + } + } + + result +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_strip_ansi_codes() { + // Simple color code + assert_eq!(strip_ansi_codes("\x1b[31mred\x1b[0m"), "red"); + + // RGB color code (like the bug we saw) + assert_eq!( + strip_ansi_codes("\x1b[38;2;216;177;114mtmp/file.md\x1b[0m"), + "tmp/file.md" + ); + + // Multiple codes + assert_eq!( + strip_ansi_codes("\x1b[1m\x1b[32mbold green\x1b[0m normal"), + "bold green normal" + ); + + // No codes + assert_eq!(strip_ansi_codes("plain text"), "plain text"); + + // Empty string + assert_eq!(strip_ansi_codes(""), ""); } - // Expand tilde if present - let expanded_path = if report_path.starts_with('~') { - if let Ok(home) = std::env::var("HOME") { - std::path::PathBuf::from(home).join(&report_path[2..]) // Skip "~/" - } else { - std::path::PathBuf::from(report_path) - } - } else { - std::path::PathBuf::from(report_path) - }; + #[test] + fn test_extract_report_success() { + let output = r#"Some preamble text +---SCOUT_REPORT_START--- +# Research Brief - // Read the report file - match std::fs::read_to_string(&expanded_path) { - Ok(content) => { - Ok(format!("📋 Research Report:\n\n{}", content)) - } - Err(e) => { - Ok(format!("❌ Failed to read report file '{}': {}", report_path, e)) - } +This is the report content. +---SCOUT_REPORT_END--- +Some trailing text"#; + + let result = extract_report(output).unwrap(); + assert!(result.contains("Research Brief")); + assert!(result.contains("This is the report content.")); + assert!(!result.contains("preamble")); + assert!(!result.contains("trailing")); + } + + #[test] + fn test_extract_report_with_ansi_codes() { + let output = "\x1b[32m---SCOUT_REPORT_START---\x1b[0m\n# Report\n\x1b[31m---SCOUT_REPORT_END---\x1b[0m"; + + let result = extract_report(output).unwrap(); + assert!(result.contains("# Report")); + } + + #[test] + fn test_extract_report_missing_start() { + let output = "No markers here"; + let result = extract_report(output); + assert!(result.is_err()); + assert!(result.unwrap_err().to_string().contains("SCOUT_REPORT_START")); + } + + #[test] + fn test_extract_report_missing_end() { + let output = "---SCOUT_REPORT_START---\nContent but no end"; + let result = extract_report(output); + assert!(result.is_err()); + assert!(result.unwrap_err().to_string().contains("SCOUT_REPORT_END")); + } + + #[test] + fn test_extract_report_empty_content() { + let output = "---SCOUT_REPORT_START---\n---SCOUT_REPORT_END---"; + let result = extract_report(output).unwrap(); + assert!(result.contains("empty report")); } } diff --git a/crates/g3-core/tests/auto_continue_test.rs b/crates/g3-core/tests/auto_continue_test.rs index 47caa9a..64b6157 100644 --- a/crates/g3-core/tests/auto_continue_test.rs +++ b/crates/g3-core/tests/auto_continue_test.rs @@ -4,7 +4,6 @@ //! 1. Empty/trivial responses (just timing lines) //! 2. Incomplete tool calls //! 3. Unexecuted tool calls -//! 4. Missing final_output after tool execution /// Helper function to check if a response is considered "empty" or trivial /// This mirrors the logic in lib.rs for detecting empty responses @@ -117,37 +116,26 @@ fn test_max_auto_summary_attempts_is_reasonable() { // ============================================================================= /// Simulates the should_auto_continue logic from lib.rs +/// After removing final_output, the logic is simpler: +/// - Continue if there's an incomplete tool call +/// - Continue if there's an unexecuted tool call +/// - Continue if tool executed but response is empty (LLM stuttered) fn should_auto_continue( any_tool_executed: bool, - final_output_called: bool, has_incomplete_tool_call: bool, has_unexecuted_tool_call: bool, is_empty_response: bool, ) -> bool { - (any_tool_executed && !final_output_called) - || has_incomplete_tool_call + has_incomplete_tool_call || has_unexecuted_tool_call || (any_tool_executed && is_empty_response) } #[test] -fn test_auto_continue_after_tool_no_final_output() { - // Tool executed but no final_output - should continue - assert!(should_auto_continue( - true, // any_tool_executed - false, // final_output_called - false, // has_incomplete_tool_call - false, // has_unexecuted_tool_call - false, // is_empty_response - )); -} - -#[test] -fn test_auto_continue_with_final_output() { - // Tool executed AND final_output called - should NOT continue +fn test_auto_continue_tool_executed_with_response() { + // Tool executed with substantive response - should NOT continue assert!(!should_auto_continue( true, // any_tool_executed - true, // final_output_called false, // has_incomplete_tool_call false, // has_unexecuted_tool_call false, // is_empty_response @@ -159,7 +147,6 @@ fn test_auto_continue_incomplete_tool_call() { // Incomplete tool call - should continue regardless of other flags assert!(should_auto_continue( false, // any_tool_executed - false, // final_output_called true, // has_incomplete_tool_call false, // has_unexecuted_tool_call false, // is_empty_response @@ -171,7 +158,6 @@ fn test_auto_continue_unexecuted_tool_call() { // Unexecuted tool call - should continue assert!(should_auto_continue( false, // any_tool_executed - false, // final_output_called false, // has_incomplete_tool_call true, // has_unexecuted_tool_call false, // is_empty_response @@ -183,7 +169,6 @@ fn test_auto_continue_empty_response_after_tool() { // Empty response after tool execution - should continue assert!(should_auto_continue( true, // any_tool_executed - false, // final_output_called false, // has_incomplete_tool_call false, // has_unexecuted_tool_call true, // is_empty_response @@ -196,7 +181,6 @@ fn test_auto_continue_empty_response_no_tool() { // (This is a normal case where LLM just didn't respond) assert!(!should_auto_continue( false, // any_tool_executed - false, // final_output_called false, // has_incomplete_tool_call false, // has_unexecuted_tool_call true, // is_empty_response @@ -208,7 +192,6 @@ fn test_auto_continue_no_conditions_met() { // No tools, no incomplete calls, substantive response - should NOT continue assert!(!should_auto_continue( false, // any_tool_executed - false, // final_output_called false, // has_incomplete_tool_call false, // has_unexecuted_tool_call false, // is_empty_response @@ -216,19 +199,22 @@ fn test_auto_continue_no_conditions_met() { } // ============================================================================= -// Test: Redundant condition detection +// Test: Edge cases // ============================================================================= #[test] -fn test_redundant_empty_response_condition() { - // This test documents that (any_tool_executed && is_empty_response) is redundant - // when (any_tool_executed && !final_output_called) is already true +fn test_auto_continue_multiple_conditions() { + // Multiple conditions true - should still continue + assert!(should_auto_continue( + true, // any_tool_executed + true, // has_incomplete_tool_call + true, // has_unexecuted_tool_call + true, // is_empty_response + )); - // Case: tool executed, no final_output, empty response - let result_with_empty = should_auto_continue(true, false, false, false, true); - let result_without_empty = should_auto_continue(true, false, false, false, false); + // Only incomplete tool call + assert!(should_auto_continue(false, true, false, false)); - // Both should be true because (any_tool_executed && !final_output_called) is true - assert_eq!(result_with_empty, result_without_empty, - "The is_empty_response condition is redundant when any_tool_executed && !final_output_called"); + // Only unexecuted tool call + assert!(should_auto_continue(false, false, true, false)); } diff --git a/crates/g3-core/tests/tool_execution_test.rs b/crates/g3-core/tests/tool_execution_test.rs index dde973f..044d573 100644 --- a/crates/g3-core/tests/tool_execution_test.rs +++ b/crates/g3-core/tests/tool_execution_test.rs @@ -326,27 +326,6 @@ mod todo_tests { } } -// ============================================================================= -// Test: final_output tool -// ============================================================================= - -mod final_output_tests { - use super::*; - - #[test] - fn test_final_output_tool_call() { - let tool_call = make_tool_call( - "final_output", - json!({ - "summary": "Task completed successfully.\n\n## Changes Made\n- Added feature X" - }), - ); - - assert_eq!(tool_call.tool, "final_output"); - assert!(tool_call.args.get("summary").is_some()); - } -} - // ============================================================================= // Test: code_search tool // ============================================================================= diff --git a/crates/g3-ensembles/src/flock.rs b/crates/g3-ensembles/src/flock.rs index 78802dd..bbee8ae 100644 --- a/crates/g3-ensembles/src/flock.rs +++ b/crates/g3-ensembles/src/flock.rs @@ -236,7 +236,7 @@ impl FlockMode { - The specific requirements that belong to this module\n\ - Any dependencies on other modules\n\n\ 4. Return your final partitioning exactly once, prefixed by the marker '{{PARTITION JSON}}' followed by a fenced code block that starts with \"```json\" and ends with \"```\". Place only the JSON array inside the fence.\n\ - 5. Use the final_output tool to provide your partitioning as a JSON array of objects, where each object has:\n\ + 5. The JSON array should contain objects, where each object has:\n\ - \"module_name\": string\n\ - \"requirements\": string (the requirements text for this module)\n\ - \"dependencies\": array of strings (names of other modules this depends on)\n\n\ diff --git a/crates/g3-planner/src/llm.rs b/crates/g3-planner/src/llm.rs index 7875ebf..4f6c5a3 100644 --- a/crates/g3-planner/src/llm.rs +++ b/crates/g3-planner/src/llm.rs @@ -391,7 +391,7 @@ Before making suggestions, please: After understanding the context, update the `{codepath}/g3-plan/new_requirements.md` file by prepending your refined requirements under the heading `{{{{CURRENT REQUIREMENTS}}}}`. -Use final_output when you are done to indicate completion."#, +When you are done, provide a brief summary of the refinements you made."#, codepath = codepath ) } diff --git a/crates/g3-planner/src/planner.rs b/crates/g3-planner/src/planner.rs index 8c08d78..6393d07 100644 --- a/crates/g3-planner/src/planner.rs +++ b/crates/g3-planner/src/planner.rs @@ -553,7 +553,6 @@ pub fn get_planner_tools() -> Vec<&'static str> { "shell", "code_search", "str_replace", - "final_output", ] } @@ -673,7 +672,7 @@ pub async fn run_coach_player_loop( ).await?; let coach_prompt = format!( - "You are G3 in coach mode. Review the implementation against these requirements:\n\n{}\n\nCheck:\n1. Are requirements implemented correctly?\n2. Does the code compile?\n3. What's missing?\n\nUse the final_output tool to provide your feedback.\nIf implementation is COMPLETE, include 'IMPLEMENTATION_APPROVED' in your feedback.\nOtherwise, provide specific feedback for the player to fix.", + "You are G3 in coach mode. Review the implementation against these requirements:\n\n{}\n\nCheck:\n1. Are requirements implemented correctly?\n2. Does the code compile?\n3. What's missing?\n\nProvide your feedback as a summary.\nIf implementation is COMPLETE, include 'IMPLEMENTATION_APPROVED' in your feedback.\nOtherwise, provide specific feedback for the player to fix.", requirements_content );