From 0aa1287ca6fec080175611be715d8f2f26dc94e4 Mon Sep 17 00:00:00 2001
From: "Dhanji R. Prasanna" <d@wideplay.com>
Date: Sat, 10 Jan 2026 13:43:04 +1100
Subject: [PATCH] Remove final_output tool and improve scout report handback

final_output removal:
- Remove final_output from tool definitions and dispatch
- Update system prompts to request summaries as regular text
- Remove final_output_called field from StreamingState
- Update auto_continue tests to remove final_output_called parameter
- Remove final_output test from tool_execution_test.rs
- Update planner and flock prompts to not reference final_output
- Keep backwards-compat code in feedback_extraction.rs and task_result.rs

Scout report handback:
- Change from file-based to delimiter-based report extraction
- Scout outputs report between ---SCOUT_REPORT_START/END--- markers
- Research tool extracts content between markers, strips ANSI codes
- Add comprehensive tests for extraction and ANSI stripping

657 tests pass.
---
 agents/scout.md                             |  56 +++++-
 crates/g3-cli/src/lib.rs                    |   4 +-
 crates/g3-core/src/streaming.rs             |   2 -
 crates/g3-core/src/tools/research.rs        | 198 +++++++++++++++++---
 crates/g3-core/tests/auto_continue_test.rs  |  54 ++----
 crates/g3-core/tests/tool_execution_test.rs |  21 ---
 crates/g3-ensembles/src/flock.rs            |   2 +-
 crates/g3-planner/src/llm.rs                |   2 +-
 crates/g3-planner/src/planner.rs            |   3 +-
 9 files changed, 247 insertions(+), 95 deletions(-)

diff --git a/agents/scout.md b/agents/scout.md
index dfe098e..f87b365 100644
--- a/agents/scout.md
+++ b/agents/scout.md
@@ -37,7 +37,7 @@ Short bullets comparing the options where it matters.
 
 ### Recommendation (Optional)
 If one option is clearly dominant, state it.
-If not, say “No clear default.”
+If not, say "No clear default."
 
 ### Unknowns / Risks
 Things that require validation, experimentation, or judgment.
@@ -46,7 +46,56 @@ Things that require validation, experimentation, or judgment.
 Links only (titles + URLs).  
 Brief quotes or snippets if relevant to decision making. No page dumps.
 
-Write this brief out to a temporary file and write out the full path of the filename as your VERY LAST LINE of output.
+**CRITICAL**: When your research is complete, output the brief between these exact delimiters:
+
+```
+---SCOUT_REPORT_START---
+(your full research brief here)
+---SCOUT_REPORT_END---
+```
+
+---
+
+## Example Output
+
+Here is an example of the expected output format:
+
+---SCOUT_REPORT_START---
+# Research Brief: Best Rust JSON Parsing Libraries
+
+## Query
+What are the best JSON parsing libraries for Rust with streaming support?
+
+## Options
+
+### 1. **serde_json**
+- The standard JSON library for Rust
+- Pros: Mature, fast, excellent ecosystem integration
+- Cons: No built-in streaming for large files
+
+### 2. **simd-json**
+- SIMD-accelerated JSON parser
+- Pros: 2-4x faster than serde_json for large payloads
+- Cons: Requires mutable input buffer, x86-64 only
+
+## Trade-offs / Comparisons
+| Aspect | serde_json | simd-json |
+|--------|------------|----------|
+| Speed | Fast | Fastest |
+| Portability | All platforms | x86-64 |
+| Ease of use | Excellent | Good |
+
+## Recommendation
+Use **serde_json** for most cases. Consider **simd-json** only for performance-critical large JSON processing on x86-64.
+
+## Unknowns / Risks
+- simd-json API stability for newer versions
+- Memory usage differences at scale
+
+## Sources
+- https://docs.rs/serde_json
+- https://github.com/simd-lite/simd-json
+---SCOUT_REPORT_END---
 
 ---
 
@@ -71,7 +120,7 @@ If nothing useful exists, say so explicitly and back this up with evidence.
 - Optimize for *your user* making a decision, not for completeness.
 
 You are allowed to say:
-> “This exists but is immature / fragile / not worth it.”
+> "This exists but is immature / fragile / not worth it."
 
 ---
 
@@ -91,6 +140,7 @@ You succeed if:
 - The reader can decide what to try or ignore in under 5 minutes.
 - The brief is calm, bounded, and opinionated where justified.
 - No context bloat is introduced.
+- **The report is wrapped in the exact delimiters shown above.**
 
 If nothing meets the bar, saying so is OK.
 
diff --git a/crates/g3-cli/src/lib.rs b/crates/g3-cli/src/lib.rs
index f872a06..54bb47d 100644
--- a/crates/g3-cli/src/lib.rs
+++ b/crates/g3-cli/src/lib.rs
@@ -2789,7 +2789,7 @@ Remember: Be clear in your review and concise in your feedback. APPROVE iff the
         // We have a valid coach result, process it
         let coach_result = coach_result_opt.unwrap();
 
-        // Extract the complete coach feedback from final_output
+        // Extract the complete coach feedback from the response
         let coach_feedback_text =
             extract_coach_feedback_from_logs(&coach_result, &coach_agent, &output)?;
 
@@ -2800,7 +2800,7 @@ Remember: Be clear in your review and concise in your feedback. APPROVE iff the
             coach_result.response.len()
         );
 
-        // Check if we got empty feedback (this can happen if the coach doesn't call final_output)
+        // Check if we got empty feedback (this can happen if the coach doesn't provide substantive feedback)
         if coach_feedback_text.is_empty() {
             output.print("⚠️ Coach did not provide feedback. This may be a model issue.");
             coach_feedback = "The implementation needs review. Please ensure all requirements are met and the code compiles without errors.".to_string();
diff --git a/crates/g3-core/src/streaming.rs b/crates/g3-core/src/streaming.rs
index e4cb2a9..7428341 100644
--- a/crates/g3-core/src/streaming.rs
+++ b/crates/g3-core/src/streaming.rs
@@ -22,7 +22,6 @@ pub struct StreamingState {
     pub response_started: bool,
     pub any_tool_executed: bool,
     pub auto_summary_attempts: usize,
-    pub final_output_called: bool,
     pub turn_accumulated_usage: Option<g3_providers::Usage>,
 }
 
@@ -36,7 +35,6 @@ impl StreamingState {
             response_started: false,
             any_tool_executed: false,
             auto_summary_attempts: 0,
-            final_output_called: false,
             turn_accumulated_usage: None,
         }
     }
diff --git a/crates/g3-core/src/tools/research.rs b/crates/g3-core/src/tools/research.rs
index fb6bc02..424aa08 100644
--- a/crates/g3-core/src/tools/research.rs
+++ b/crates/g3-core/src/tools/research.rs
@@ -10,12 +10,16 @@ use crate::ToolCall;
 
 use super::executor::ToolContext;
 
+/// Delimiter markers for scout report extraction
+const REPORT_START_MARKER: &str = "---SCOUT_REPORT_START---";
+const REPORT_END_MARKER: &str = "---SCOUT_REPORT_END---";
+
 /// Execute the research tool by spawning a scout agent.
 ///
 /// This tool:
 /// 1. Spawns `g3 --agent scout` with the query
-/// 2. Captures stdout and extracts the last line (file path to report)
-/// 3. Reads the report file and returns its contents
+/// 2. Captures stdout and extracts the report between delimiter markers
+/// 3. Returns the report content directly
 pub async fn execute_research<W: UiWriter>(
     tool_call: &ToolCall,
     ctx: &mut ToolContext<'_, W>,
@@ -46,20 +50,20 @@ pub async fn execute_research<W: UiWriter>(
         .spawn()
         .map_err(|e| anyhow::anyhow!("Failed to spawn scout agent: {}", e))?;
 
-    // Capture stdout to find the report file path
+    // Capture stdout to find the report content
     let stdout = child.stdout.take()
         .ok_or_else(|| anyhow::anyhow!("Failed to capture scout agent stdout"))?;
     
     let mut reader = BufReader::new(stdout).lines();
-    let mut last_line = String::new();
+    let mut all_output = Vec::new();
 
     // Print a header for the scout output
-    ctx.ui_writer.println("\n📡 Scout agent output:");
+    ctx.ui_writer.println("\n📡 Scout agent researching...");
     
-    // Stream all lines to UI, keeping track of the last one for the report path
+    // Collect all lines
     while let Some(line) = reader.next_line().await? {
         ctx.ui_writer.println(&format!("  {}", line));
-        last_line = line;
+        all_output.push(line);
     }
 
     // Wait for the process to complete
@@ -70,31 +74,167 @@ pub async fn execute_research<W: UiWriter>(
         return Ok(format!("❌ Scout agent failed with exit code: {:?}", status.code()));
     }
 
-    // The last line should be the path to the report file
-    let report_path = last_line.trim();
+    // Join all output and extract the report between markers
+    let full_output = all_output.join("\n");
     
-    if report_path.is_empty() {
-        return Ok("❌ Scout agent did not output a report file path".to_string());
+    extract_report(&full_output)
+}
+
+/// Extract the research report from scout output.
+/// 
+/// Looks for content between SCOUT_REPORT_START and SCOUT_REPORT_END markers.
+/// Strips ANSI escape codes from the extracted content.
+fn extract_report(output: &str) -> Result<String> {
+    // Strip ANSI codes from the entire output first
+    let clean_output = strip_ansi_codes(output);
+    
+    // Find the start marker
+    let start_pos = clean_output.find(REPORT_START_MARKER)
+        .ok_or_else(|| anyhow::anyhow!(
+            "Scout agent did not output a properly formatted report. Expected {} marker.",
+            REPORT_START_MARKER
+        ))?;
+    
+    // Find the end marker
+    let end_pos = clean_output.find(REPORT_END_MARKER)
+        .ok_or_else(|| anyhow::anyhow!(
+            "Scout agent report is incomplete. Expected {} marker.",
+            REPORT_END_MARKER
+        ))?;
+    
+    if end_pos <= start_pos {
+        return Err(anyhow::anyhow!("Invalid report format: end marker before start marker"));
+    }
+    
+    // Extract content between markers
+    let report_start = start_pos + REPORT_START_MARKER.len();
+    let report_content = clean_output[report_start..end_pos].trim();
+    
+    if report_content.is_empty() {
+        return Ok("❌ Scout agent returned an empty report.".to_string());
+    }
+    
+    Ok(format!("📋 Research Report:\n\n{}", report_content))
+}
+
+/// Strip ANSI escape codes from a string.
+/// 
+/// Handles common ANSI sequences like:
+/// - CSI sequences: \x1b[...m (colors, styles)
+/// - OSC sequences: \x1b]...\x07 (terminal titles, etc.)
+fn strip_ansi_codes(s: &str) -> String {
+    let mut result = String::with_capacity(s.len());
+    let mut chars = s.chars().peekable();
+    
+    while let Some(c) = chars.next() {
+        if c == '\x1b' {
+            // Start of escape sequence
+            match chars.peek() {
+                Some('[') => {
+                    // CSI sequence: \x1b[...X where X is a letter
+                    chars.next(); // consume '['
+                    while let Some(&next) = chars.peek() {
+                        chars.next();
+                        if next.is_ascii_alphabetic() {
+                            break;
+                        }
+                    }
+                }
+                Some(']') => {
+                    // OSC sequence: \x1b]...\x07
+                    chars.next(); // consume ']'
+                    while let Some(&next) = chars.peek() {
+                        chars.next();
+                        if next == '\x07' {
+                            break;
+                        }
+                    }
+                }
+                _ => {
+                    // Unknown escape, skip just the ESC
+                }
+            }
+        } else {
+            result.push(c);
+        }
+    }
+    
+    result
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_strip_ansi_codes() {
+        // Simple color code
+        assert_eq!(strip_ansi_codes("\x1b[31mred\x1b[0m"), "red");
+        
+        // RGB color code (like the bug we saw)
+        assert_eq!(
+            strip_ansi_codes("\x1b[38;2;216;177;114mtmp/file.md\x1b[0m"),
+            "tmp/file.md"
+        );
+        
+        // Multiple codes
+        assert_eq!(
+            strip_ansi_codes("\x1b[1m\x1b[32mbold green\x1b[0m normal"),
+            "bold green normal"
+        );
+        
+        // No codes
+        assert_eq!(strip_ansi_codes("plain text"), "plain text");
+        
+        // Empty string
+        assert_eq!(strip_ansi_codes(""), "");
     }
 
-    // Expand tilde if present
-    let expanded_path = if report_path.starts_with('~') {
-        if let Ok(home) = std::env::var("HOME") {
-            std::path::PathBuf::from(home).join(&report_path[2..])  // Skip "~/"
-        } else {
-            std::path::PathBuf::from(report_path)
-        }
-    } else {
-        std::path::PathBuf::from(report_path)
-    };
+    #[test]
+    fn test_extract_report_success() {
+        let output = r#"Some preamble text
+---SCOUT_REPORT_START---
+# Research Brief
 
-    // Read the report file
-    match std::fs::read_to_string(&expanded_path) {
-        Ok(content) => {
-            Ok(format!("📋 Research Report:\n\n{}", content))
-        }
-        Err(e) => {
-            Ok(format!("❌ Failed to read report file '{}': {}", report_path, e))
-        }
+This is the report content.
+---SCOUT_REPORT_END---
+Some trailing text"#;
+        
+        let result = extract_report(output).unwrap();
+        assert!(result.contains("Research Brief"));
+        assert!(result.contains("This is the report content."));
+        assert!(!result.contains("preamble"));
+        assert!(!result.contains("trailing"));
+    }
+
+    #[test]
+    fn test_extract_report_with_ansi_codes() {
+        let output = "\x1b[32m---SCOUT_REPORT_START---\x1b[0m\n# Report\n\x1b[31m---SCOUT_REPORT_END---\x1b[0m";
+        
+        let result = extract_report(output).unwrap();
+        assert!(result.contains("# Report"));
+    }
+
+    #[test]
+    fn test_extract_report_missing_start() {
+        let output = "No markers here";
+        let result = extract_report(output);
+        assert!(result.is_err());
+        assert!(result.unwrap_err().to_string().contains("SCOUT_REPORT_START"));
+    }
+
+    #[test]
+    fn test_extract_report_missing_end() {
+        let output = "---SCOUT_REPORT_START---\nContent but no end";
+        let result = extract_report(output);
+        assert!(result.is_err());
+        assert!(result.unwrap_err().to_string().contains("SCOUT_REPORT_END"));
+    }
+
+    #[test]
+    fn test_extract_report_empty_content() {
+        let output = "---SCOUT_REPORT_START---\n---SCOUT_REPORT_END---";
+        let result = extract_report(output).unwrap();
+        assert!(result.contains("empty report"));
     }
 }
diff --git a/crates/g3-core/tests/auto_continue_test.rs b/crates/g3-core/tests/auto_continue_test.rs
index 47caa9a..64b6157 100644
--- a/crates/g3-core/tests/auto_continue_test.rs
+++ b/crates/g3-core/tests/auto_continue_test.rs
@@ -4,7 +4,6 @@
 //! 1. Empty/trivial responses (just timing lines)
 //! 2. Incomplete tool calls
 //! 3. Unexecuted tool calls
-//! 4. Missing final_output after tool execution
 
 /// Helper function to check if a response is considered "empty" or trivial
 /// This mirrors the logic in lib.rs for detecting empty responses
@@ -117,37 +116,26 @@ fn test_max_auto_summary_attempts_is_reasonable() {
 // =============================================================================
 
 /// Simulates the should_auto_continue logic from lib.rs
+/// After removing final_output, the logic is simpler:
+/// - Continue if there's an incomplete tool call
+/// - Continue if there's an unexecuted tool call  
+/// - Continue if tool executed but response is empty (LLM stuttered)
 fn should_auto_continue(
     any_tool_executed: bool,
-    final_output_called: bool,
     has_incomplete_tool_call: bool,
     has_unexecuted_tool_call: bool,
     is_empty_response: bool,
 ) -> bool {
-    (any_tool_executed && !final_output_called)
-        || has_incomplete_tool_call
+    has_incomplete_tool_call
         || has_unexecuted_tool_call
         || (any_tool_executed && is_empty_response)
 }
 
 #[test]
-fn test_auto_continue_after_tool_no_final_output() {
-    // Tool executed but no final_output - should continue
-    assert!(should_auto_continue(
-        true,  // any_tool_executed
-        false, // final_output_called
-        false, // has_incomplete_tool_call
-        false, // has_unexecuted_tool_call
-        false, // is_empty_response
-    ));
-}
-
-#[test]
-fn test_auto_continue_with_final_output() {
-    // Tool executed AND final_output called - should NOT continue
+fn test_auto_continue_tool_executed_with_response() {
+    // Tool executed with substantive response - should NOT continue
     assert!(!should_auto_continue(
         true,  // any_tool_executed
-        true,  // final_output_called
         false, // has_incomplete_tool_call
         false, // has_unexecuted_tool_call
         false, // is_empty_response
@@ -159,7 +147,6 @@ fn test_auto_continue_incomplete_tool_call() {
     // Incomplete tool call - should continue regardless of other flags
     assert!(should_auto_continue(
         false, // any_tool_executed
-        false, // final_output_called
         true,  // has_incomplete_tool_call
         false, // has_unexecuted_tool_call
         false, // is_empty_response
@@ -171,7 +158,6 @@ fn test_auto_continue_unexecuted_tool_call() {
     // Unexecuted tool call - should continue
     assert!(should_auto_continue(
         false, // any_tool_executed
-        false, // final_output_called
         false, // has_incomplete_tool_call
         true,  // has_unexecuted_tool_call
         false, // is_empty_response
@@ -183,7 +169,6 @@ fn test_auto_continue_empty_response_after_tool() {
     // Empty response after tool execution - should continue
     assert!(should_auto_continue(
         true,  // any_tool_executed
-        false, // final_output_called
         false, // has_incomplete_tool_call
         false, // has_unexecuted_tool_call
         true,  // is_empty_response
@@ -196,7 +181,6 @@ fn test_auto_continue_empty_response_no_tool() {
     // (This is a normal case where LLM just didn't respond)
     assert!(!should_auto_continue(
         false, // any_tool_executed
-        false, // final_output_called
         false, // has_incomplete_tool_call
         false, // has_unexecuted_tool_call
         true,  // is_empty_response
@@ -208,7 +192,6 @@ fn test_auto_continue_no_conditions_met() {
     // No tools, no incomplete calls, substantive response - should NOT continue
     assert!(!should_auto_continue(
         false, // any_tool_executed
-        false, // final_output_called
         false, // has_incomplete_tool_call
         false, // has_unexecuted_tool_call
         false, // is_empty_response
@@ -216,19 +199,22 @@ fn test_auto_continue_no_conditions_met() {
 }
 
 // =============================================================================
-// Test: Redundant condition detection
+// Test: Edge cases
 // =============================================================================
 
 #[test]
-fn test_redundant_empty_response_condition() {
-    // This test documents that (any_tool_executed && is_empty_response) is redundant
-    // when (any_tool_executed && !final_output_called) is already true
+fn test_auto_continue_multiple_conditions() {
+    // Multiple conditions true - should still continue
+    assert!(should_auto_continue(
+        true,  // any_tool_executed
+        true,  // has_incomplete_tool_call
+        true,  // has_unexecuted_tool_call
+        true,  // is_empty_response
+    ));
     
-    // Case: tool executed, no final_output, empty response
-    let result_with_empty = should_auto_continue(true, false, false, false, true);
-    let result_without_empty = should_auto_continue(true, false, false, false, false);
+    // Only incomplete tool call
+    assert!(should_auto_continue(false, true, false, false));
     
-    // Both should be true because (any_tool_executed && !final_output_called) is true
-    assert_eq!(result_with_empty, result_without_empty, 
-        "The is_empty_response condition is redundant when any_tool_executed && !final_output_called");
+    // Only unexecuted tool call  
+    assert!(should_auto_continue(false, false, true, false));
 }
diff --git a/crates/g3-core/tests/tool_execution_test.rs b/crates/g3-core/tests/tool_execution_test.rs
index dde973f..044d573 100644
--- a/crates/g3-core/tests/tool_execution_test.rs
+++ b/crates/g3-core/tests/tool_execution_test.rs
@@ -326,27 +326,6 @@ mod todo_tests {
     }
 }
 
-// =============================================================================
-// Test: final_output tool
-// =============================================================================
-
-mod final_output_tests {
-    use super::*;
-
-    #[test]
-    fn test_final_output_tool_call() {
-        let tool_call = make_tool_call(
-            "final_output",
-            json!({
-                "summary": "Task completed successfully.\n\n## Changes Made\n- Added feature X"
-            }),
-        );
-
-        assert_eq!(tool_call.tool, "final_output");
-        assert!(tool_call.args.get("summary").is_some());
-    }
-}
-
 // =============================================================================
 // Test: code_search tool
 // =============================================================================
diff --git a/crates/g3-ensembles/src/flock.rs b/crates/g3-ensembles/src/flock.rs
index 78802dd..bbee8ae 100644
--- a/crates/g3-ensembles/src/flock.rs
+++ b/crates/g3-ensembles/src/flock.rs
@@ -236,7 +236,7 @@ impl FlockMode {
                - The specific requirements that belong to this module\n\
                - Any dependencies on other modules\n\n\
             4. Return your final partitioning exactly once, prefixed by the marker '{{PARTITION JSON}}' followed by a fenced code block that starts with \"```json\" and ends with \"```\". Place only the JSON array inside the fence.\n\
-            5. Use the final_output tool to provide your partitioning as a JSON array of objects, where each object has:\n\
+            5. The JSON array should contain objects, where each object has:\n\
                - \"module_name\": string\n\
                - \"requirements\": string (the requirements text for this module)\n\
                - \"dependencies\": array of strings (names of other modules this depends on)\n\n\
diff --git a/crates/g3-planner/src/llm.rs b/crates/g3-planner/src/llm.rs
index 7875ebf..4f6c5a3 100644
--- a/crates/g3-planner/src/llm.rs
+++ b/crates/g3-planner/src/llm.rs
@@ -391,7 +391,7 @@ Before making suggestions, please:
 After understanding the context, update the `{codepath}/g3-plan/new_requirements.md` file by prepending
 your refined requirements under the heading `{{{{CURRENT REQUIREMENTS}}}}`.
 
-Use final_output when you are done to indicate completion."#,
+When you are done, provide a brief summary of the refinements you made."#,
         codepath = codepath
     )
 }
diff --git a/crates/g3-planner/src/planner.rs b/crates/g3-planner/src/planner.rs
index 8c08d78..6393d07 100644
--- a/crates/g3-planner/src/planner.rs
+++ b/crates/g3-planner/src/planner.rs
@@ -553,7 +553,6 @@ pub fn get_planner_tools() -> Vec<&'static str> {
         "shell",
         "code_search",
         "str_replace",
-        "final_output",
     ]
 }
 
@@ -673,7 +672,7 @@ pub async fn run_coach_player_loop(
         ).await?;
         
         let coach_prompt = format!(
-            "You are G3 in coach mode. Review the implementation against these requirements:\n\n{}\n\nCheck:\n1. Are requirements implemented correctly?\n2. Does the code compile?\n3. What's missing?\n\nUse the final_output tool to provide your feedback.\nIf implementation is COMPLETE, include 'IMPLEMENTATION_APPROVED' in your feedback.\nOtherwise, provide specific feedback for the player to fix.",
+            "You are G3 in coach mode. Review the implementation against these requirements:\n\n{}\n\nCheck:\n1. Are requirements implemented correctly?\n2. Does the code compile?\n3. What's missing?\n\nProvide your feedback as a summary.\nIf implementation is COMPLETE, include 'IMPLEMENTATION_APPROVED' in your feedback.\nOtherwise, provide specific feedback for the player to fix.",
             requirements_content
         );