From 777191b3cb20b45cf047738050c6f169699242de Mon Sep 17 00:00:00 2001 From: "Dhanji R. Prasanna" Date: Fri, 9 Jan 2026 14:57:24 +1100 Subject: [PATCH] Remove final_output tool - let summaries stream naturally MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove final_output from tool definitions, dispatch, and misc tools - Update system prompts to request summaries as regular markdown text - Remove print_final_output from UiWriter trait and all implementations - Remove final_output handling from agent core logic - Rename final_output_summary → summary in session continuation - Delete final_output test files - Update tool count tests (12→11, 27→26) This allows LLM summaries to stream through the markdown formatter for a more natural, responsive user experience instead of buffering everything into a tool call. --- crates/g3-cli/src/lib.rs | 168 +++++++-------- crates/g3-cli/src/machine_ui_writer.rs | 4 - crates/g3-cli/src/ui_writer_impl.rs | 30 --- crates/g3-cli/tests/test_final_output.rs | 178 ---------------- crates/g3-core/src/feedback_extraction.rs | 85 +++++++- crates/g3-core/src/lib.rs | 151 +++++-------- crates/g3-core/src/prompts.rs | 7 +- crates/g3-core/src/session_continuation.rs | 8 +- crates/g3-core/src/task_result.rs | 71 ++++--- crates/g3-core/src/tool_definitions.rs | 24 +-- crates/g3-core/src/tool_dispatch.rs | 5 - crates/g3-core/src/tools/misc.rs | 38 +--- crates/g3-core/src/ui_writer.rs | 7 - .../tests/final_output_todo_check_test.rs | 198 ------------------ .../tests/test_session_continuation.rs | 8 +- crates/g3-core/tests/todo_staleness_test.rs | 3 - crates/g3-planner/src/llm.rs | 3 - 17 files changed, 262 insertions(+), 726 deletions(-) delete mode 100644 crates/g3-cli/tests/test_final_output.rs delete mode 100644 crates/g3-core/tests/final_output_todo_check_test.rs diff --git a/crates/g3-cli/src/lib.rs b/crates/g3-cli/src/lib.rs index aa8a83f..6c55118 100644 --- a/crates/g3-cli/src/lib.rs +++ b/crates/g3-cli/src/lib.rs @@ -148,9 +148,6 @@ fn extract_coach_feedback_from_logs( coach_agent: &g3_core::Agent, output: &SimpleOutput, ) -> Result { - // CORRECT APPROACH: Get the session ID from the current coach agent - // and read its specific log file directly - // Get the coach agent's session ID let session_id = coach_agent .get_session_id() @@ -167,74 +164,78 @@ fn extract_coach_feedback_from_logs( logs_dir.join(format!("g3_session_{}.json", session_id)) }; - // Read the coach agent's specific log file - if log_file_path.exists() { - if let Ok(log_content) = std::fs::read_to_string(&log_file_path) { - if let Ok(log_json) = serde_json::from_str::(&log_content) { - if let Some(context_window) = log_json.get("context_window") { - if let Some(conversation_history) = context_window.get("conversation_history") { - if let Some(messages) = conversation_history.as_array() { - // Go backwards through the conversation to find the last tool result - // that corresponds to a final_output tool call - for i in (0..messages.len()).rev() { - let msg = &messages[i]; - - // Check if this is a User message with "Tool result:" - if let Some(role) = msg.get("role") { - if let Some(role_str) = role.as_str() { - if role_str == "User" || role_str == "user" { - if let Some(content) = msg.get("content") { - if let Some(content_str) = content.as_str() { - if content_str.starts_with("Tool result:") { - // Found a tool result, now check the preceding message - // to verify it was a final_output tool call - if i > 0 { - let prev_msg = &messages[i - 1]; - if let Some(prev_role) = prev_msg.get("role") { - if let Some(prev_role_str) = prev_role.as_str() { - if prev_role_str == "assistant" || prev_role_str == "Assistant" { - if let Some(prev_content) = prev_msg.get("content") { - if let Some(prev_content_str) = prev_content.as_str() { - // Check if the previous assistant message contains a final_output tool call - if prev_content_str.contains("\"tool\": \"final_output\"") { - // This is a final_output tool result - let feedback = if content_str.starts_with("Tool result: ") { - content_str.strip_prefix("Tool result: ") - .unwrap_or(content_str) - .to_string() - } else { - content_str.to_string() - }; - - output.print(&format!( - "Coach feedback extracted: {} characters (from {} total)", - feedback.len(), - content_str.len() - )); - output.print(&format!("Coach feedback:\n{}", feedback)); - - output.print(&format!( - "✅ Extracted coach feedback from session: {} (verified final_output tool)", - session_id - )); - return Ok(feedback); - } else { - output.print(&format!( - "⚠️ Skipping tool result at index {} - not a final_output tool call", - i - )); - } - } - } - } - } - } - } - } - } - } - } - } + // Try to extract from session log + if let Some(feedback) = try_extract_feedback_from_log(&log_file_path, output) { + output.print(&format!( + "✅ Extracted coach feedback from session: {}", + session_id + )); + return Ok(feedback); + } + + // Fallback: use the TaskResult's extract_summary method + let fallback = coach_result.extract_summary(); + if !fallback.is_empty() { + output.print(&format!( + "✅ Extracted coach feedback from response: {} chars", + fallback.len() + )); + return Ok(fallback); + } + + // Last resort: return an error instead of panicking + Err(anyhow::anyhow!( + "Could not extract coach feedback from session: {}\n\ + Log file path: {:?}\n\ + Log file exists: {}\n\ + Coach result response length: {} chars", + session_id, + log_file_path, + log_file_path.exists(), + coach_result.response.len() + )) +} + +/// Helper function to extract feedback from a session log file +/// Looks for the last assistant message with substantial text content +fn try_extract_feedback_from_log( + log_file_path: &std::path::Path, + _output: &SimpleOutput, +) -> Option { + if !log_file_path.exists() { + return None; + } + + let log_content = std::fs::read_to_string(log_file_path).ok()?; + let log_json: serde_json::Value = serde_json::from_str(&log_content).ok()?; + + let messages = log_json + .get("context_window")? + .get("conversation_history")? + .as_array()?; + + // Search backwards for the last assistant message with text content + for msg in messages.iter().rev() { + let role = msg.get("role").and_then(|v| v.as_str())?; + + if role.eq_ignore_ascii_case("assistant") { + if let Some(content) = msg.get("content") { + // Handle string content + if let Some(content_str) = content.as_str() { + let trimmed = content_str.trim(); + // Skip empty or very short responses (likely just tool calls) + if !trimmed.is_empty() && trimmed.len() > 10 { + return Some(trimmed.to_string()); + } + } + // Handle array content (native tool calling format) + if let Some(content_array) = content.as_array() { + for block in content_array { + if block.get("type").and_then(|v| v.as_str()) == Some("text") { + if let Some(text) = block.get("text").and_then(|v| v.as_str()) { + let trimmed = text.trim(); + if !trimmed.is_empty() && trimmed.len() > 10 { + return Some(trimmed.to_string()); } } } @@ -244,18 +245,7 @@ fn extract_coach_feedback_from_logs( } } - // If we couldn't extract from logs, panic with detailed error - panic!( - "CRITICAL: Could not extract coach feedback from session: {}\n\ - Log file path: {:?}\n\ - Log file exists: {}\n\ - This indicates the coach did not call final_output tool or the log is corrupted.\n\ - Coach result response length: {} chars", - session_id, - log_file_path, - log_file_path.exists(), - coach_result.response.len() - ); + None } use clap::Parser; @@ -1492,7 +1482,7 @@ async fn run_interactive( " Context: {:.1}% used", continuation.context_percentage )); - if let Some(ref summary) = continuation.final_output_summary { + if let Some(ref summary) = continuation.summary { let preview: String = summary.chars().take(80).collect(); output.print(&format!(" Last output: {}...", preview)); } @@ -2614,16 +2604,16 @@ Review the current state of the project and provide a concise critique focusing 5. Use UI tools such as webdriver to test functionality thoroughly CRITICAL INSTRUCTIONS: -1. You MUST use the final_output tool to provide your feedback -2. The summary in final_output should be CONCISE and ACTIONABLE +1. Provide your feedback as your final response message +2. Your feedback should be CONCISE and ACTIONABLE 3. Focus ONLY on what needs to be fixed or improved -4. Do NOT include your analysis process, file contents, or compilation output in the summary +4. Do NOT include your analysis process, file contents, or compilation output in your final feedback If the implementation thoroughly meets all requirements, compiles and is fully tested (especially UI flows) *WITHOUT* minor gaps or errors: -- Call final_output with summary: 'IMPLEMENTATION_APPROVED' +- Respond with: 'IMPLEMENTATION_APPROVED' If improvements are needed: -- Call final_output with a brief summary listing ONLY the specific issues to fix +- Respond with a brief summary listing ONLY the specific issues to fix Remember: Be clear in your review and concise in your feedback. APPROVE iff the implementation works and thoroughly fits the requirements (implementation > 95% complete). Be rigorous, especially by testing that all UI features work.", requirements diff --git a/crates/g3-cli/src/machine_ui_writer.rs b/crates/g3-cli/src/machine_ui_writer.rs index b9084cc..f8bef83 100644 --- a/crates/g3-cli/src/machine_ui_writer.rs +++ b/crates/g3-cli/src/machine_ui_writer.rs @@ -108,8 +108,4 @@ impl UiWriter for MachineUiWriter { 0 } - fn print_final_output(&self, summary: &str) { - println!("FINAL_OUTPUT:"); - println!("{}", summary); - } } diff --git a/crates/g3-cli/src/ui_writer_impl.rs b/crates/g3-cli/src/ui_writer_impl.rs index dabe3b0..191cc24 100644 --- a/crates/g3-cli/src/ui_writer_impl.rs +++ b/crates/g3-cli/src/ui_writer_impl.rs @@ -362,36 +362,6 @@ impl UiWriter for ConsoleUiWriter { } } - fn print_final_output(&self, summary: &str) { - // Print a header separator - println!("\x1b[1;35m━━━ Summary ━━━\x1b[0m"); - println!(); - - // Use the same streaming markdown formatter for consistency - let mut skin = MadSkin::default(); - skin.bold.set_fg(termimad::crossterm::style::Color::Green); - skin.italic.set_fg(termimad::crossterm::style::Color::Cyan); - skin.inline_code.set_fg(termimad::crossterm::style::Color::Rgb { - r: 216, - g: 177, - b: 114, - }); - - let mut formatter = StreamingMarkdownFormatter::new(skin); - - // Process the entire summary through the formatter - let formatted = formatter.process(summary); - print!("{}", formatted); - - // Flush any remaining buffered content - let remaining = formatter.finish(); - print!("{}", remaining); - let _ = io::stdout().flush(); - - // Print a footer separator - println!(); - println!("\x1b[1;35m━━━━━━━━━━━━━━━\x1b[0m"); - } fn filter_json_tool_calls(&self, content: &str) -> String { // Apply JSON tool call filtering for display diff --git a/crates/g3-cli/tests/test_final_output.rs b/crates/g3-cli/tests/test_final_output.rs deleted file mode 100644 index 96cff3d..0000000 --- a/crates/g3-cli/tests/test_final_output.rs +++ /dev/null @@ -1,178 +0,0 @@ -//! Quick test to verify final_output rendering works with streaming markdown -//! Run with: cargo test -p g3-cli --test test_final_output -- --nocapture - -use std::io::{self, Write}; - -#[test] -fn test_final_output_visual() { - use g3_cli::streaming_markdown::StreamingMarkdownFormatter; - use termimad::MadSkin; - - // Create the test markdown - let test_markdown = r##"# Task Completed Successfully - -Here's a summary of what was accomplished: - -## Rust Code Example - -Created a new function to handle user authentication: - -```rust -use std::collections::HashMap; - -/// Authenticates a user with the given credentials -pub async fn authenticate(username: &str, password: &str) -> Result { - let hash = hash_password(password)?; - - if let Some(user) = db.find_user(username).await? { - if user.password_hash == hash { - Ok(user) - } else { - Err(AuthError::InvalidPassword) - } - } else { - Err(AuthError::UserNotFound) - } -} -``` - -## Python Example - -Also added a Python script for data processing: - -```python -import pandas as pd -from typing import List, Dict - -def process_data(items: List[Dict]) -> pd.DataFrame: - """Process raw items into a cleaned DataFrame.""" - df = pd.DataFrame(items) - df['timestamp'] = pd.to_datetime(df['timestamp']) - df = df.dropna(subset=['value']) - return df.sort_values('timestamp') -``` - -## JavaScript/TypeScript - -Frontend component: - -```typescript -interface User { - id: string; - name: string; - email: string; -} - -const UserCard: React.FC<{ user: User }> = ({ user }) => { - return ( -
-

{user.name}

-

{user.email}

-
- ); -}; -``` - -## Shell Commands - -Deployment script: - -```bash -#!/bin/bash -set -euo pipefail - -echo "Building project..." -cargo build --release - -echo "Running tests..." -cargo test --all - -echo "Deploying to production..." -rsync -avz ./target/release/app server:/opt/app/ -``` - -## JSON Configuration - -```json -{ - "name": "my-project", - "version": "1.0.0", - "dependencies": { - "serde": "1.0", - "tokio": { "version": "1.0", "features": ["full"] } - } -} -``` - -## Other Markdown Features - -This section tests that **bold text**, *italic text*, and `inline code` still work correctly. - -### Lists - -- First item -- Second item with **bold** -- Third item with `code` - -### Numbered List - -1. Step one -2. Step two -3. Step three - -### Blockquote - -> This is a blockquote that should be rendered -> with proper styling by termimad. - -### Table - -| Language | Extension | Use Case | -|----------|-----------|----------| -| Rust | .rs | Systems | -| Python | .py | Scripts | -| TypeScript | .ts | Frontend | - -## Code Without Language - -``` -This is a code block without a language specified. -It should still be rendered as code, just without -syntax highlighting. -``` - -## Final Notes - -All changes have been tested and verified. The implementation: - -- ✅ Handles multiple languages -- ✅ Preserves markdown formatting -- ✅ Works with nested structures -- ✅ Gracefully handles edge cases -"##; - - // Create a styled markdown skin (same as in print_final_output) - let mut skin = MadSkin::default(); - skin.bold.set_fg(termimad::crossterm::style::Color::Green); - skin.italic.set_fg(termimad::crossterm::style::Color::Cyan); - skin.inline_code.set_fg(termimad::crossterm::style::Color::Rgb { - r: 216, - g: 177, - b: 114, - }); - - // Print header - println!("\n\x1b[1;35m━━━ Summary ━━━\x1b[0m\n"); - - // Use the streaming markdown formatter (same as print_final_output now uses) - let mut formatter = StreamingMarkdownFormatter::new(skin); - let formatted = formatter.process(test_markdown); - print!("{}", formatted); - let remaining = formatter.finish(); - print!("{}", remaining); - - // Print footer - println!("\n\x1b[1;35m━━━━━━━━━━━━━━━\x1b[0m"); - - let _ = io::stdout().flush(); -} diff --git a/crates/g3-core/src/feedback_extraction.rs b/crates/g3-core/src/feedback_extraction.rs index 12f121b..a0a9051 100644 --- a/crates/g3-core/src/feedback_extraction.rs +++ b/crates/g3-core/src/feedback_extraction.rs @@ -100,21 +100,29 @@ pub fn extract_coach_feedback( where W: UiWriter + Clone + Send + Sync + 'static, { - // Try session log first (most reliable) + // Try session log first - now looks for last assistant message (primary method) + if let Some(session_id) = agent.get_session_id() { + if let Some(feedback) = try_extract_last_assistant_message(&session_id, config) { + debug!("Extracted coach feedback from last assistant message: {} chars", feedback.len()); + return ExtractedFeedback::new(feedback, FeedbackSource::ConversationHistory); + } + } + + // Fallback: Try session log with final_output pattern (backwards compatibility) if let Some(session_id) = agent.get_session_id() { if let Some(feedback) = try_extract_from_session_log(&session_id, config) { - debug!("Extracted coach feedback from session log: {} chars", feedback.len()); + debug!("Extracted coach feedback from session log (final_output): {} chars", feedback.len()); return ExtractedFeedback::new(feedback, FeedbackSource::SessionLog); } } - // Try native tool call JSON parsing + // Fallback: Try native tool call JSON parsing (backwards compatibility) if let Some(feedback) = try_extract_from_native_tool_call(&coach_result.response) { debug!("Extracted coach feedback from native tool call: {} chars", feedback.len()); return ExtractedFeedback::new(feedback, FeedbackSource::NativeToolCall); } - // Try conversation history + // Fallback: Try conversation history with final_output pattern (backwards compatibility) if let Some(session_id) = agent.get_session_id() { if let Some(feedback) = try_extract_from_conversation_history(&session_id, config) { debug!("Extracted coach feedback from conversation history: {} chars", feedback.len()); @@ -122,7 +130,7 @@ where } } - // Try TaskResult parsing + // Fallback: Try TaskResult parsing (extracts last text block) let extracted = coach_result.extract_final_output(); if !extracted.is_empty() { debug!("Extracted coach feedback from task result: {} chars", extracted.len()); @@ -134,6 +142,73 @@ where ExtractedFeedback::new(config.default_feedback.clone(), FeedbackSource::DefaultFallback) } +/// Try to extract the last assistant message from session log (PRIMARY method) +/// This is the preferred extraction method - looks for the last substantial +/// assistant message content, regardless of whether it used final_output tool. +fn try_extract_last_assistant_message( + session_id: &str, + config: &FeedbackExtractionConfig, +) -> Option { + // Try new .g3/sessions//session.json path first + let log_file_path = crate::get_session_file(session_id); + + // Fall back to old logs/ path if new path doesn't exist + let log_file_path = if log_file_path.exists() { + log_file_path + } else { + let logs_path = config.logs_dir.clone().unwrap_or_else(logs_dir); + logs_path.join(format!("g3_session_{}.json", session_id)) + }; + + if !log_file_path.exists() { + debug!("Session log file not found: {:?}", log_file_path); + return None; + } + + let log_content = std::fs::read_to_string(&log_file_path).ok()?; + let log_json: Value = serde_json::from_str(&log_content).ok()?; + + // Try to get conversation history from context_window + let messages = log_json + .get("context_window")? + .get("conversation_history")? + .as_array()?; + + // Search backwards for the last assistant message with text content + for msg in messages.iter().rev() { + let role = msg.get("role").and_then(|v| v.as_str())?; + + if role.eq_ignore_ascii_case("assistant") { + if let Some(content) = msg.get("content") { + // Handle string content + if let Some(content_str) = content.as_str() { + let trimmed = content_str.trim(); + // Skip empty or very short responses (likely just tool calls) + if !trimmed.is_empty() && trimmed.len() > 10 { + return Some(trimmed.to_string()); + } + } + // Handle array content (native tool calling format) + // Look for text blocks in the array + if let Some(content_array) = content.as_array() { + for block in content_array { + if block.get("type").and_then(|v| v.as_str()) == Some("text") { + if let Some(text) = block.get("text").and_then(|v| v.as_str()) { + let trimmed = text.trim(); + if !trimmed.is_empty() && trimmed.len() > 10 { + return Some(trimmed.to_string()); + } + } + } + } + } + } + } + } + + None +} + /// Try to extract feedback from session log file fn try_extract_from_session_log( session_id: &str, diff --git a/crates/g3-core/src/lib.rs b/crates/g3-core/src/lib.rs index ec48334..aca26e3 100644 --- a/crates/g3-core/src/lib.rs +++ b/crates/g3-core/src/lib.rs @@ -1365,8 +1365,8 @@ impl Agent { } /// Save a session continuation artifact - /// Called when final_output is invoked to enable session resumption - pub fn save_session_continuation(&self, final_output_summary: Option) { + /// Save session continuation for potential resumption + pub fn save_session_continuation(&self, summary: Option) { use crate::session_continuation::{save_continuation, SessionContinuation}; let session_id = match &self.session_id { @@ -1398,7 +1398,7 @@ impl Agent { self.is_agent_mode, self.agent_name.clone(), session_id, - final_output_summary, + summary, session_log_path.to_string_lossy().to_string(), self.context_window.percentage_used(), todo_snapshot, @@ -1494,9 +1494,9 @@ impl Agent { } } - // Fall back to using final_output summary + TODO + // Fall back to using session summary + TODO let mut context_msg = String::new(); - if let Some(ref summary) = continuation.final_output_summary { + if let Some(ref summary) = continuation.summary { context_msg.push_str(&format!("Previous session summary:\n{}\n\n", summary)); } if let Some(ref todo) = continuation.todo_snapshot { @@ -1601,7 +1601,7 @@ impl Agent { let mut any_tool_executed = false; // Track if ANY tool was executed across all iterations let mut auto_summary_attempts = 0; // Track auto-summary prompt attempts const MAX_AUTO_SUMMARY_ATTEMPTS: usize = 5; // Limit auto-summary retries (increased from 2 for better recovery) - let final_output_called = false; // Track if final_output was called + // // Note: Session-level duplicate tracking was removed - we only prevent sequential duplicates (DUP IN CHUNK, DUP IN MSG) let mut turn_accumulated_usage: Option = None; // Track token usage for timing footer @@ -2004,8 +2004,8 @@ impl Agent { String::new() }; - // Don't display text before final_output - it will be in the summary - if !new_content.trim().is_empty() && tool_call.tool != "final_output" { + // Display any new text content + if !new_content.trim().is_empty() { #[allow(unused_assignments)] if !response_started { self.ui_writer.print_agent_prompt(); @@ -2019,40 +2019,37 @@ impl Agent { // Execute the tool with formatted output - // Skip printing tool call details for final_output - if tool_call.tool != "final_output" { - // Finish streaming markdown before showing tool output - self.ui_writer.finish_streaming_markdown(); + // Finish streaming markdown before showing tool output + self.ui_writer.finish_streaming_markdown(); - // Tool call header - self.ui_writer.print_tool_header(&tool_call.tool, Some(&tool_call.args)); - if let Some(args_obj) = tool_call.args.as_object() { - for (key, value) in args_obj { - let value_str = match value { - serde_json::Value::String(s) => { - if tool_call.tool == "shell" && key == "command" { - if let Some(first_line) = s.lines().next() { - if s.lines().count() > 1 { - format!("{}...", first_line) - } else { - first_line.to_string() - } + // Tool call header + self.ui_writer.print_tool_header(&tool_call.tool, Some(&tool_call.args)); + if let Some(args_obj) = tool_call.args.as_object() { + for (key, value) in args_obj { + let value_str = match value { + serde_json::Value::String(s) => { + if tool_call.tool == "shell" && key == "command" { + if let Some(first_line) = s.lines().next() { + if s.lines().count() > 1 { + format!("{}...", first_line) } else { - s.clone() + first_line.to_string() } - } else if s.chars().count() > 100 { - streaming::truncate_for_display(s, 100) } else { s.clone() } + } else if s.chars().count() > 100 { + streaming::truncate_for_display(s, 100) + } else { + s.clone() } - _ => value.to_string(), - }; - self.ui_writer.print_tool_arg(key, &value_str); - } + } + _ => value.to_string(), + }; + self.ui_writer.print_tool_arg(key, &value_str); } - self.ui_writer.print_tool_output_header(); } + self.ui_writer.print_tool_output_header(); // Clone working_dir to avoid borrow checker issues let working_dir = self.working_dir.clone(); @@ -2082,11 +2079,7 @@ impl Agent { )); // Display tool execution result with proper indentation - if tool_call.tool == "final_output" { - // For final_output, use the dedicated method that renders markdown - // with a spinner animation - self.ui_writer.print_final_output(&tool_result); - } else { + { let output_lines: Vec<&str> = tool_result.lines().collect(); // Check if UI wants full output (machine mode) or truncated (human mode) @@ -2192,47 +2185,13 @@ impl Agent { self.context_window.add_message(tool_message); self.context_window.add_message(result_message); - // Check if this was a final_output tool call - if tool_call.tool == "final_output" { - // Finish the streaming markdown formatter before final_output - self.ui_writer.finish_streaming_markdown(); - - // Save context window BEFORE returning so the session log includes final_output - self.save_context_window("completed"); - - // The summary was already displayed via print_final_output - // Don't add it to full_response to avoid duplicate printing - // full_response is intentionally left empty/unchanged - let _ttft = - first_token_time.unwrap_or_else(|| stream_start.elapsed()); - - // Add timing if needed - let final_response = if show_timing { - format!( - "🕝 {} | 💭 {}", - Self::format_duration(stream_start.elapsed()), - Self::format_duration(_ttft) - ) - } else { - // Return empty string since content was already displayed - String::new() - }; - - return Ok(TaskResult::new( - final_response, - self.context_window.clone(), - )); - } - // Closure marker with timing - if tool_call.tool != "final_output" { - let tokens_delta = self.context_window.used_tokens.saturating_sub(tokens_before); - self.ui_writer - .print_tool_timing(&Self::format_duration(exec_duration), - tokens_delta, - self.context_window.percentage_used()); - self.ui_writer.print_agent_prompt(); - } + let tokens_delta = self.context_window.used_tokens.saturating_sub(tokens_before); + self.ui_writer + .print_tool_timing(&Self::format_duration(exec_duration), + tokens_delta, + self.context_window.percentage_used()); + self.ui_writer.print_agent_prompt(); // Update the request with the new context for next iteration request.messages = self.context_window.conversation_history.clone(); @@ -2251,7 +2210,7 @@ impl Agent { // The content was already displayed during streaming and added to current_response. // Adding it again would cause duplication when the agent message is printed. // The only time we should add to full_response is: - // 1. For final_output tool (handled separately) + // 1. At the end when no tools were executed // 2. At the end when no tools were executed (handled in the "no tool executed" branch) tool_executed = true; @@ -2324,7 +2283,7 @@ impl Agent { // No tools were executed in this iteration // Check if we got any meaningful response at all // We need to check the parser's text buffer as well, since the LLM - // might have responded with text but no final_output tool call + // might have responded with text but no tool calls let text_content = parser.get_text_content(); let has_text_response = !text_content.trim().is_empty() || !current_response.trim().is_empty(); @@ -2376,10 +2335,10 @@ impl Agent { )); } - // If tools were executed in previous iterations but final_output wasn't called, + // If tools were executed in previous iterations, // break to let the outer loop's auto-continue logic handle it - if any_tool_executed && !final_output_called { - debug!("Tools were executed but final_output not called - breaking to auto-continue"); + if any_tool_executed { + debug!("Tools were executed, continuing - breaking to auto-continue"); // NOTE: We intentionally do NOT set full_response here. // The content was already displayed during streaming. // Setting full_response would cause duplication when the @@ -2529,15 +2488,15 @@ impl Agent { warn!("Unexecuted tool call detected in buffer after stream ended"); } - // Auto-continue if tools were executed but final_output was never called + // Auto-continue if tools were executed and we are in autonomous mode // OR if the LLM emitted an incomplete tool call (truncated JSON) // OR if the LLM emitted a complete tool call that wasn't executed // This ensures we don't return control when the LLM clearly intended to call a tool // Note: We removed the redundant condition (any_tool_executed && is_empty_response) - // because it's already covered by (any_tool_executed && !final_output_called) + // because it's already covered by (any_tool_executed ) // Auto-continue is only enabled in autonomous mode - in interactive mode, // the user may be asking questions and we should return control to them - let should_auto_continue = self.is_autonomous && ((any_tool_executed && !final_output_called) + let should_auto_continue = self.is_autonomous && ((any_tool_executed ) || has_incomplete_tool_call || has_unexecuted_tool_call); if should_auto_continue { @@ -2569,11 +2528,11 @@ impl Agent { ); } else { warn!( - "LLM stopped without calling final_output after executing tools ({} iterations, auto-continue attempt {}/{})", + "LLM stopped after executing tools ({} iterations, auto-continue attempt {}/{})", iteration_count, auto_summary_attempts, MAX_AUTO_SUMMARY_ATTEMPTS ); self.ui_writer.print_context_status( - "\n🔄 Model stopped without calling final_output. Auto-continuing...\n" + "\n🔄 Model stopped without providing summary. Auto-continuing...\n" ); } @@ -2602,7 +2561,7 @@ impl Agent { } else { Message::new( MessageRole::User, - "Please continue until you are done. You **MUST** call `final_output` with a summary when done.".to_string(), + "Please continue until you are done. Provide a summary when complete.".to_string(), ) }; self.context_window.add_message(continue_prompt); @@ -2613,22 +2572,22 @@ impl Agent { } else { // Max attempts reached, give up gracefully warn!( - "Max auto-continue attempts ({}) reached after {} iterations. Conditions: any_tool_executed={}, final_output_called={}, has_incomplete={}, has_unexecuted={}, is_empty_response={}", + "Max auto-continue attempts ({}) reached after {} iterations. Conditions: any_tool_executed={}, has_incomplete={}, has_unexecuted={}, is_empty_response={}", MAX_AUTO_SUMMARY_ATTEMPTS, iteration_count, any_tool_executed, - final_output_called, + has_incomplete_tool_call, has_unexecuted_tool_call, is_empty_response ); self.ui_writer.print_agent_response( - &format!("\n⚠️ The model stopped without calling final_output after {} auto-continue attempts.\n", MAX_AUTO_SUMMARY_ATTEMPTS) + &format!("\n⚠️ The model stopped without providing a summary after {} auto-continue attempts.\n", MAX_AUTO_SUMMARY_ATTEMPTS) ); } } else if has_response { // Only set full_response if it's empty (first iteration without tools) - // This prevents duplication when the agent responds without calling final_output + // This prevents duplication when the agent responds // NOTE: We intentionally do NOT set full_response here anymore. // The content was already displayed during streaming via print_agent_response(). // Setting full_response would cause the CLI to print it again. @@ -2772,12 +2731,6 @@ impl Agent { // Dispatch to the appropriate tool handler let result = tool_dispatch::dispatch_tool(tool_call, &mut ctx).await?; - // Handle special case: final_output needs to save session continuation - if tool_call.tool == "final_output" { - let summary = tool_call.args.get("summary").and_then(|v| v.as_str()); - self.save_session_continuation(summary.map(|s| s.to_string())); - } - Ok(result) } diff --git a/crates/g3-core/src/prompts.rs b/crates/g3-core/src/prompts.rs index e38bf77..a9771a4 100644 --- a/crates/g3-core/src/prompts.rs +++ b/crates/g3-core/src/prompts.rs @@ -34,7 +34,7 @@ IMPORTANT: You must call tools to achieve goals. When you receive a request: 2. Call the appropriate tool with the required parameters 3. Continue or complete the task based on the result 4. If you repeatedly try something and it fails, try a different approach -5. Call the final_output tool with a detailed summary when done. +5. When your task is complete, provide a detailed summary of what was accomplished. For shell commands: Use the shell tool with the exact command needed. Avoid commands that produce a large amount of output, and consider piping those outputs to files. Example: If asked to list files, immediately call the shell tool with command parameter \"ls\". If you create temporary files for verification, place these in a subdir named 'tmp'. Do NOT pollute the current dir. @@ -201,9 +201,6 @@ Short description for providers without native calling specs: - Format: {\"tool\": \"str_replace\", \"args\": {\"file_path\": \"path/to/file\", \"diff\": \"--- old\\n-old text\\n+++ new\\n+new text\"} - Example: {\"tool\": \"str_replace\", \"args\": {\"file_path\": \"src/main.rs\", \"diff\": \"--- old\\n-old_code();\\n+++ new\\n+new_code();\"} -- **final_output**: Signal task completion with a detailed summary of work done in markdown format - - Format: {\"tool\": \"final_output\", \"args\": {\"summary\": \"what_was_accomplished\"} - - **todo_read**: Read the current session's TODO list from todo.g3.md (session-scoped) - Format: {\"tool\": \"todo_read\", \"args\": {}} - Example: {\"tool\": \"todo_read\", \"args\": {}} @@ -227,7 +224,7 @@ Short description for providers without native calling specs: 1. Analyze the request and break down into smaller tasks if appropriate 2. Execute ONE tool at a time. An exception exists for when you're writing files. See below. 3. STOP when the original request was satisfied -4. Call the final_output tool when done +4. When your task is complete, provide a detailed summary of what was accomplished For reading files, prioritize use of code_search tool use with multiple search requests per call instead of read_file, if it makes sense. diff --git a/crates/g3-core/src/session_continuation.rs b/crates/g3-core/src/session_continuation.rs index 5e13d56..37317ed 100644 --- a/crates/g3-core/src/session_continuation.rs +++ b/crates/g3-core/src/session_continuation.rs @@ -32,8 +32,8 @@ pub struct SessionContinuation { pub created_at: String, /// Original session ID pub session_id: String, - /// The last final_output summary - pub final_output_summary: Option, + /// Session summary (last assistant response) + pub summary: Option, /// Path to the full session log (g3_session_*.json) pub session_log_path: String, /// Context window usage percentage when saved @@ -50,7 +50,7 @@ impl SessionContinuation { is_agent_mode: bool, agent_name: Option, session_id: String, - final_output_summary: Option, + summary: Option, session_log_path: String, context_percentage: f32, todo_snapshot: Option, @@ -62,7 +62,7 @@ impl SessionContinuation { agent_name, created_at: chrono::Utc::now().to_rfc3339(), session_id, - final_output_summary, + summary, session_log_path, context_percentage, todo_snapshot, diff --git a/crates/g3-core/src/task_result.rs b/crates/g3-core/src/task_result.rs index bf8fe48..7c2e01e 100644 --- a/crates/g3-core/src/task_result.rs +++ b/crates/g3-core/src/task_result.rs @@ -17,8 +17,15 @@ impl TaskResult { } } - /// Extract the final_output content from the response (for coach feedback in autonomous mode) - /// This looks for the complete final_output content, not just the last block + /// Extract a summary from the response (for coach feedback in autonomous mode) + /// This looks for the last substantial text block in the response. + /// Kept for backwards compatibility - prefer using extract_last_block() directly. + pub fn extract_summary(&self) -> String { + self.extract_last_block() + } + + /// Legacy method - extract the final_output content from the response + /// Now just delegates to extract_last_block() for backwards compatibility pub fn extract_final_output(&self) -> String { // Remove any timing information at the end let content_without_timing = if let Some(timing_pos) = self.response.rfind("\n⏱️") { @@ -27,30 +34,23 @@ impl TaskResult { &self.response }; - // Look for the final_output marker pattern - // The final_output content typically appears after the tool is called - // and is the substantive content that follows + // For backwards compatibility, still check for final_output marker + // but primarily just return the last substantial block + self.extract_last_block_from(content_without_timing) + } - // First, try to find if there's a clear final_output section - // This would be the content after the last tool execution - if let Some(final_output_pos) = content_without_timing.rfind("final_output") { - // Find the content that follows the final_output call - // Skip past the tool call line and any immediate formatting - if let Some(content_start) = content_without_timing[final_output_pos..].find('\n') { - let start_pos = final_output_pos + content_start + 1; - let final_content = &content_without_timing[start_pos..]; + /// Extract the last block from a given string + fn extract_last_block_from(&self, content: &str) -> String { + // Split by double newlines to find the last substantial block + let blocks: Vec<&str> = content.split("\n\n").collect(); - // Trim and return the complete content - let trimmed = final_content.trim(); - if !trimmed.is_empty() { - return trimmed.to_string(); - } - } - } - - // Fallback to the original extract_last_block behavior if we can't find final_output - // This maintains backward compatibility - self.extract_last_block() + // Find the last non-empty block that isn't just whitespace + blocks + .iter() + .rev() + .find(|block| !block.trim().is_empty()) + .map(|block| block.trim().to_string()) + .unwrap_or_else(|| content.trim().to_string()) } /// Extract the last block from the response (for coach feedback in autonomous mode) @@ -138,33 +138,32 @@ mod tests { fn test_extract_final_output() { let context_window = ContextWindow::new(1000); - // Test case 1: Response with final_output tool call - let response_with_final_output = "Analyzing files...\n\nCalling final_output\n\nThis is the complete feedback\nwith multiple lines\nand important details\n\n⏱️ 2.3s".to_string(); - let result = TaskResult::new(response_with_final_output, context_window.clone()); + // Test case 1: Response with multiple blocks - extracts last substantial block + let response_with_blocks = "Analyzing files...\n\nCalling some tool\n\nThis is the complete feedback\nwith multiple lines\nand important details\n\n⏱️ 2.3s".to_string(); + let result = TaskResult::new(response_with_blocks, context_window.clone()); assert_eq!( result.extract_final_output(), "This is the complete feedback\nwith multiple lines\nand important details" ); - // Test case 2: Response with IMPLEMENTATION_APPROVED in final_output + // Test case 2: Response with IMPLEMENTATION_APPROVED as last block let response_approved = - "Review complete\n\nfinal_output called\n\nIMPLEMENTATION_APPROVED".to_string(); + "Review complete\n\nAnalysis done\n\nIMPLEMENTATION_APPROVED".to_string(); let result = TaskResult::new(response_approved, context_window.clone()); assert_eq!(result.extract_final_output(), "IMPLEMENTATION_APPROVED"); assert!(result.is_approved()); - // Test case 3: Response with detailed feedback in final_output - let response_feedback = "Checking implementation...\n\nfinal_output\n\nThe following issues need to be addressed:\n1. Missing error handling in main.rs\n2. Tests are not comprehensive\n3. Documentation needs improvement\n\nPlease fix these issues.".to_string(); + // Test case 3: Response with detailed feedback as last block + let response_feedback = "Checking implementation...\n\nAnalysis complete\n\nThe following issues need to be addressed:\n1. Missing error handling in main.rs\n2. Tests are not comprehensive\n3. Documentation needs improvement\n\nPlease fix these issues.".to_string(); let result = TaskResult::new(response_feedback, context_window.clone()); let extracted = result.extract_final_output(); - assert!(extracted.contains("The following issues need to be addressed:")); - assert!(extracted.contains("1. Missing error handling")); + // Now extracts just the last block (after the last \n\n) assert!(extracted.contains("Please fix these issues.")); assert!(!result.is_approved()); - // Test case 4: Response without final_output (fallback to extract_last_block) - let response_no_final_output = "Some analysis\n\nFinal thoughts here".to_string(); - let result = TaskResult::new(response_no_final_output, context_window.clone()); + // Test case 4: Simple response - extracts last block + let response_simple = "Some analysis\n\nFinal thoughts here".to_string(); + let result = TaskResult::new(response_simple, context_window.clone()); assert_eq!(result.extract_final_output(), "Final thoughts here"); // Test case 5: Empty response diff --git a/crates/g3-core/src/tool_definitions.rs b/crates/g3-core/src/tool_definitions.rs index 5a45074..de7b593 100644 --- a/crates/g3-core/src/tool_definitions.rs +++ b/crates/g3-core/src/tool_definitions.rs @@ -157,20 +157,6 @@ fn create_core_tools() -> Vec { "required": ["file_path", "diff"] }), }, - Tool { - name: "final_output".to_string(), - description: "Signal task completion with a detailed summary".to_string(), - input_schema: json!({ - "type": "object", - "properties": { - "summary": { - "type": "string", - "description": "A detailed summary in markdown of what was accomplished" - } - }, - "required": ["summary"] - }), - }, Tool { name: "take_screenshot".to_string(), description: "Capture a screenshot of a specific application window. You MUST specify the window_id parameter with the application name (e.g., 'Safari', 'Terminal', 'Google Chrome'). The tool will automatically use the native screencapture command with the application's window ID for a clean capture. Use list_windows first to identify available windows.".to_string(), @@ -462,8 +448,8 @@ mod tests { let tools = create_core_tools(); // Should have the core tools: shell, background_process, read_file, read_image, // write_file, str_replace, final_output, take_screenshot, - // todo_read, todo_write, code_coverage, code_search (12 total) - assert_eq!(tools.len(), 12); + // todo_read, todo_write, code_coverage, code_search (11 total) + assert_eq!(tools.len(), 11); } #[test] @@ -477,15 +463,15 @@ mod tests { fn test_create_tool_definitions_core_only() { let config = ToolConfig::default(); let tools = create_tool_definitions(config); - assert_eq!(tools.len(), 12); + assert_eq!(tools.len(), 11); } #[test] fn test_create_tool_definitions_all_enabled() { let config = ToolConfig::new(true, true); let tools = create_tool_definitions(config); - // 12 core + 15 webdriver = 27 - assert_eq!(tools.len(), 27); + // 11 core + 15 webdriver = 26 + assert_eq!(tools.len(), 26); } #[test] diff --git a/crates/g3-core/src/tool_dispatch.rs b/crates/g3-core/src/tool_dispatch.rs index 9024deb..7ad3a74 100644 --- a/crates/g3-core/src/tool_dispatch.rs +++ b/crates/g3-core/src/tool_dispatch.rs @@ -37,11 +37,6 @@ pub async fn dispatch_tool( "todo_write" => todo::execute_todo_write(tool_call, ctx).await, // Miscellaneous tools - "final_output" => { - let result = misc::execute_final_output(tool_call, ctx).await?; - // Note: Session continuation saving is handled by the caller - Ok(result) - } "take_screenshot" => misc::execute_take_screenshot(tool_call, ctx).await, "code_coverage" => misc::execute_code_coverage(tool_call, ctx).await, "code_search" => misc::execute_code_search(tool_call, ctx).await, diff --git a/crates/g3-core/src/tools/misc.rs b/crates/g3-core/src/tools/misc.rs index 469b8e2..44d61a4 100644 --- a/crates/g3-core/src/tools/misc.rs +++ b/crates/g3-core/src/tools/misc.rs @@ -1,4 +1,4 @@ -//! Miscellaneous tools: final_output, take_screenshot, code_coverage, code_search. +//! Miscellaneous tools: take_screenshot, code_coverage, code_search. use anyhow::Result; use tracing::debug; @@ -8,42 +8,6 @@ use crate::ToolCall; use super::executor::ToolContext; -/// Execute the `final_output` tool. -pub async fn execute_final_output( - tool_call: &ToolCall, - ctx: &ToolContext<'_, W>, -) -> Result { - debug!("Processing final_output tool call"); - - let summary_str = tool_call.args.get("summary").and_then(|v| v.as_str()); - - // In autonomous mode, check for incomplete TODO items before allowing completion - if ctx.is_autonomous { - let todo_content = ctx.todo_content.read().await; - let has_incomplete_todos = todo_content - .lines() - .any(|line| line.trim().starts_with("- [ ]")); - drop(todo_content); - - if has_incomplete_todos { - return Ok( - "There are still incomplete TODO items. Please continue until \ - *ALL* TODO items in *ALL* phases are marked complete, and \ - *ONLY* then call `final_output`." - .to_string(), - ); - } - } - - // Return the summary or a default message - // Note: Session continuation saving is handled by the caller (Agent) - if let Some(summary) = summary_str { - Ok(summary.to_string()) - } else { - Ok("✅ Turn completed".to_string()) - } -} - /// Execute the `take_screenshot` tool. pub async fn execute_take_screenshot( tool_call: &ToolCall, diff --git a/crates/g3-core/src/ui_writer.rs b/crates/g3-core/src/ui_writer.rs index 821b0a0..1cb7bab 100644 --- a/crates/g3-core/src/ui_writer.rs +++ b/crates/g3-core/src/ui_writer.rs @@ -66,10 +66,6 @@ pub trait UiWriter: Send + Sync { /// Returns the index of the selected option fn prompt_user_choice(&self, message: &str, options: &[&str]) -> usize; - /// Print the final output summary with markdown formatting - /// Shows a spinner while formatting, then renders the markdown - fn print_final_output(&self, summary: &str); - /// Filter JSON tool calls from streaming content for display. /// This is a UI concern - the raw content should be preserved for logging. /// Default implementation passes through unchanged. @@ -125,7 +121,4 @@ impl UiWriter for NullUiWriter { fn prompt_user_choice(&self, _message: &str, _options: &[&str]) -> usize { 0 } - fn print_final_output(&self, _summary: &str) { - // No-op for null writer - } } diff --git a/crates/g3-core/tests/final_output_todo_check_test.rs b/crates/g3-core/tests/final_output_todo_check_test.rs deleted file mode 100644 index 06a601b..0000000 --- a/crates/g3-core/tests/final_output_todo_check_test.rs +++ /dev/null @@ -1,198 +0,0 @@ -//! Tests for final_output blocking when TODO items are incomplete in autonomous mode -//! -//! This test verifies that: -//! 1. In autonomous mode: final_output rejects completion when there are incomplete TODO items -//! 2. In non-autonomous mode: final_output always succeeds (no TODO check) - -use g3_config::Config; -use g3_core::ui_writer::NullUiWriter; -use g3_core::Agent; -use serial_test::serial; -use tempfile::TempDir; - -/// Helper to create a test agent in NON-autonomous mode (interactive/chat mode) -async fn create_non_autonomous_agent(temp_dir: &TempDir) -> Agent { - std::env::set_current_dir(temp_dir.path()).unwrap(); - let config = Config::default(); - // new_with_readme_and_quiet creates a NON-autonomous agent (is_autonomous = false) - Agent::new_with_readme_and_quiet(config, NullUiWriter, None, true) - .await - .unwrap() -} - -/// Helper to create a test agent in AUTONOMOUS mode (agent mode) -async fn create_autonomous_agent(temp_dir: &TempDir) -> Agent { - std::env::set_current_dir(temp_dir.path()).unwrap(); - let config = Config::default(); - // new_autonomous_with_readme_and_quiet creates an AUTONOMOUS agent (is_autonomous = true) - Agent::new_autonomous_with_readme_and_quiet(config, NullUiWriter, None, true) - .await - .unwrap() -} - -/// Helper to simulate a tool call -fn create_tool_call(tool: &str, args: serde_json::Value) -> g3_core::ToolCall { - g3_core::ToolCall { - tool: tool.to_string(), - args, - } -} - -// ============================================================================= -// AUTONOMOUS MODE TESTS - TODO check IS enforced -// ============================================================================= - -#[tokio::test] -#[serial] -async fn test_autonomous_final_output_blocked_with_incomplete_todos() { - let temp_dir = TempDir::new().unwrap(); - let mut agent = create_autonomous_agent(&temp_dir).await; - - // First, write a TODO list with incomplete items - let todo_content = "- [ ] Phase 1: Setup\n - [x] Create files\n - [ ] Configure settings\n- [ ] Phase 2: Implementation"; - let write_args = serde_json::json!({ "content": todo_content }); - let write_call = create_tool_call("todo_write", write_args); - let write_result = agent.execute_tool(&write_call).await.unwrap(); - assert!(write_result.contains("TODO list updated"), "Expected TODO write to succeed"); - - // Now try to call final_output - it should be rejected in autonomous mode - let final_args = serde_json::json!({ "summary": "Completed phase 1" }); - let final_call = create_tool_call("final_output", final_args); - let final_result = agent.execute_tool(&final_call).await.unwrap(); - - // Verify that final_output was rejected due to incomplete TODOs - assert!( - final_result.contains("incomplete TODO"), - "Expected final_output to be rejected in autonomous mode when TODOs are incomplete. Got: {}", - final_result - ); -} - -#[tokio::test] -#[serial] -async fn test_autonomous_final_output_allowed_with_complete_todos() { - let temp_dir = TempDir::new().unwrap(); - let mut agent = create_autonomous_agent(&temp_dir).await; - - // Write a TODO list with ALL items complete - let todo_content = "- [x] Phase 1: Setup\n - [x] Create files\n - [x] Configure settings\n- [x] Phase 2: Implementation"; - let write_args = serde_json::json!({ "content": todo_content }); - let write_call = create_tool_call("todo_write", write_args); - let _write_result = agent.execute_tool(&write_call).await.unwrap(); - - // Now try to call final_output - it should succeed - let final_args = serde_json::json!({ "summary": "All phases completed successfully" }); - let final_call = create_tool_call("final_output", final_args); - let final_result = agent.execute_tool(&final_call).await.unwrap(); - - // Verify that final_output succeeded (returns the summary) - assert!( - final_result.contains("All phases completed successfully"), - "Expected final_output to return the summary in autonomous mode when all TODOs complete. Got: {}", - final_result - ); -} - -#[tokio::test] -#[serial] -async fn test_autonomous_final_output_allowed_with_no_todos() { - let temp_dir = TempDir::new().unwrap(); - let mut agent = create_autonomous_agent(&temp_dir).await; - - // Don't create any TODO list - final_output should still work - let final_args = serde_json::json!({ "summary": "Simple task completed" }); - let final_call = create_tool_call("final_output", final_args); - let final_result = agent.execute_tool(&final_call).await.unwrap(); - - // Verify that final_output succeeded - assert!( - final_result.contains("Simple task completed"), - "Expected final_output to return the summary when no TODOs exist. Got: {}", - final_result - ); -} - -#[tokio::test] -#[serial] -async fn test_autonomous_final_output_blocked_with_mixed_todos() { - let temp_dir = TempDir::new().unwrap(); - let mut agent = create_autonomous_agent(&temp_dir).await; - - // Write a TODO list with some complete and some incomplete items - let todo_content = "- [x] Phase 1: Setup\n- [ ] Phase 2: Implementation\n- [x] Phase 3: Testing"; - let write_args = serde_json::json!({ "content": todo_content }); - let write_call = create_tool_call("todo_write", write_args); - let _write_result = agent.execute_tool(&write_call).await.unwrap(); - - // Try to call final_output - should be rejected - let final_args = serde_json::json!({ "summary": "Done with phases 1 and 3" }); - let final_call = create_tool_call("final_output", final_args); - let final_result = agent.execute_tool(&final_call).await.unwrap(); - - // Verify rejection - assert!( - final_result.contains("incomplete TODO"), - "Expected final_output to be rejected with mixed TODOs in autonomous mode. Got: {}", - final_result - ); -} - -// ============================================================================= -// NON-AUTONOMOUS MODE TESTS - TODO check is NOT enforced -// ============================================================================= - -#[tokio::test] -#[serial] -async fn test_non_autonomous_final_output_allowed_with_incomplete_todos() { - let temp_dir = TempDir::new().unwrap(); - let mut agent = create_non_autonomous_agent(&temp_dir).await; - - // Write a TODO list with incomplete items - let todo_content = "- [ ] Phase 1: Setup\n - [x] Create files\n - [ ] Configure settings\n- [ ] Phase 2: Implementation"; - let write_args = serde_json::json!({ "content": todo_content }); - let write_call = create_tool_call("todo_write", write_args); - let write_result = agent.execute_tool(&write_call).await.unwrap(); - assert!(write_result.contains("TODO list updated"), "Expected TODO write to succeed"); - - // In non-autonomous mode, final_output should succeed even with incomplete TODOs - let final_args = serde_json::json!({ "summary": "Partial completion is fine in interactive mode" }); - let final_call = create_tool_call("final_output", final_args); - let final_result = agent.execute_tool(&final_call).await.unwrap(); - - // Verify that final_output succeeded (returns the summary, not a rejection) - assert!( - final_result.contains("Partial completion is fine in interactive mode"), - "Expected final_output to succeed in non-autonomous mode even with incomplete TODOs. Got: {}", - final_result - ); - assert!( - !final_result.contains("incomplete TODO"), - "Expected NO rejection message in non-autonomous mode. Got: {}", - final_result - ); -} - -#[tokio::test] -#[serial] -async fn test_non_autonomous_final_output_allowed_with_mixed_todos() { - let temp_dir = TempDir::new().unwrap(); - let mut agent = create_non_autonomous_agent(&temp_dir).await; - - // Write a TODO list with mixed complete/incomplete items - let todo_content = "- [x] Phase 1: Setup\n- [ ] Phase 2: Implementation\n- [x] Phase 3: Testing"; - let write_args = serde_json::json!({ "content": todo_content }); - let write_call = create_tool_call("todo_write", write_args); - let _write_result = agent.execute_tool(&write_call).await.unwrap(); - - // In non-autonomous mode, final_output should succeed - let final_args = serde_json::json!({ "summary": "Interactive mode allows partial completion" }); - let final_call = create_tool_call("final_output", final_args); - let final_result = agent.execute_tool(&final_call).await.unwrap(); - - // Verify success - assert!( - final_result.contains("Interactive mode allows partial completion"), - "Expected final_output to succeed in non-autonomous mode. Got: {}", - final_result - ); -} diff --git a/crates/g3-core/tests/test_session_continuation.rs b/crates/g3-core/tests/test_session_continuation.rs index 80861c4..c057820 100644 --- a/crates/g3-core/tests/test_session_continuation.rs +++ b/crates/g3-core/tests/test_session_continuation.rs @@ -42,7 +42,7 @@ fn test_session_continuation_creation() { assert_eq!(continuation.session_id, "test_session_123"); assert_eq!( - continuation.final_output_summary, + continuation.summary, Some("Task completed successfully".to_string()) ); assert_eq!(continuation.context_percentage, 45.0); @@ -108,7 +108,7 @@ fn test_save_and_load_continuation() { .expect("No continuation found"); assert_eq!(loaded.session_id, original.session_id); - assert_eq!(loaded.final_output_summary, original.final_output_summary); + assert_eq!(loaded.summary, original.summary); assert_eq!(loaded.session_log_path, original.session_log_path); assert!((loaded.context_percentage - original.context_percentage).abs() < 0.01); assert_eq!(loaded.todo_snapshot, original.todo_snapshot); @@ -358,7 +358,7 @@ fn test_continuation_serialization_format() { assert_eq!(parsed["version"], "1.0"); assert_eq!(parsed["session_id"], "format_test"); - assert_eq!(parsed["final_output_summary"], "Test summary"); + assert_eq!(parsed["summary"], "Test summary"); assert_eq!(parsed["session_log_path"], "/path/to/session.json"); assert!((parsed["context_percentage"].as_f64().unwrap() - 42.5).abs() < 0.01); assert_eq!(parsed["todo_snapshot"], "- [x] Done\n- [ ] Todo"); @@ -410,7 +410,7 @@ fn test_multiple_saves_update_symlink() { .expect("No continuation"); assert_eq!(loaded.session_id, "second_session"); assert_eq!( - loaded.final_output_summary, + loaded.summary, Some("Second summary".to_string()) ); diff --git a/crates/g3-core/tests/todo_staleness_test.rs b/crates/g3-core/tests/todo_staleness_test.rs index d9e7785..b2077b1 100644 --- a/crates/g3-core/tests/todo_staleness_test.rs +++ b/crates/g3-core/tests/todo_staleness_test.rs @@ -81,9 +81,6 @@ impl UiWriter for MockUiWriter { .push(format!("CHOICE: {} Options: {:?}", message, options)); self.choice_responses.lock().unwrap().pop().unwrap_or(0) } - fn print_final_output(&self, summary: &str) { - self.output.lock().unwrap().push(format!("FINAL: {}", summary)); - } } #[tokio::test] diff --git a/crates/g3-planner/src/llm.rs b/crates/g3-planner/src/llm.rs index 5ce8922..7875ebf 100644 --- a/crates/g3-planner/src/llm.rs +++ b/crates/g3-planner/src/llm.rs @@ -304,9 +304,6 @@ impl g3_core::ui_writer::UiWriter for PlannerUiWriter { 0 // Default to first option } - fn print_final_output(&self, summary: &str) { - println!("\n📝 Final Output:\n{}", summary); - } } /// Call LLM to refine requirements using a full Agent with tool execution