Remove final_output tool - let summaries stream naturally

- Remove final_output from tool definitions, dispatch, and misc tools
- Update system prompts to request summaries as regular markdown text
- Remove print_final_output from UiWriter trait and all implementations
- Remove final_output handling from agent core logic
- Rename final_output_summary → summary in session continuation
- Delete final_output test files
- Update tool count tests (12→11, 27→26)

This allows LLM summaries to stream through the markdown formatter
for a more natural, responsive user experience instead of buffering
everything into a tool call.
This commit is contained in:
Dhanji R. Prasanna
2026-01-09 14:57:24 +11:00
parent bebf04c7bd
commit 777191b3cb
17 changed files with 262 additions and 726 deletions

View File

@@ -148,9 +148,6 @@ fn extract_coach_feedback_from_logs(
coach_agent: &g3_core::Agent<ConsoleUiWriter>, coach_agent: &g3_core::Agent<ConsoleUiWriter>,
output: &SimpleOutput, output: &SimpleOutput,
) -> Result<String> { ) -> Result<String> {
// CORRECT APPROACH: Get the session ID from the current coach agent
// and read its specific log file directly
// Get the coach agent's session ID // Get the coach agent's session ID
let session_id = coach_agent let session_id = coach_agent
.get_session_id() .get_session_id()
@@ -167,95 +164,88 @@ fn extract_coach_feedback_from_logs(
logs_dir.join(format!("g3_session_{}.json", session_id)) logs_dir.join(format!("g3_session_{}.json", session_id))
}; };
// Read the coach agent's specific log file // Try to extract from session log
if log_file_path.exists() { if let Some(feedback) = try_extract_feedback_from_log(&log_file_path, output) {
if let Ok(log_content) = std::fs::read_to_string(&log_file_path) {
if let Ok(log_json) = serde_json::from_str::<serde_json::Value>(&log_content) {
if let Some(context_window) = log_json.get("context_window") {
if let Some(conversation_history) = context_window.get("conversation_history") {
if let Some(messages) = conversation_history.as_array() {
// Go backwards through the conversation to find the last tool result
// that corresponds to a final_output tool call
for i in (0..messages.len()).rev() {
let msg = &messages[i];
// Check if this is a User message with "Tool result:"
if let Some(role) = msg.get("role") {
if let Some(role_str) = role.as_str() {
if role_str == "User" || role_str == "user" {
if let Some(content) = msg.get("content") {
if let Some(content_str) = content.as_str() {
if content_str.starts_with("Tool result:") {
// Found a tool result, now check the preceding message
// to verify it was a final_output tool call
if i > 0 {
let prev_msg = &messages[i - 1];
if let Some(prev_role) = prev_msg.get("role") {
if let Some(prev_role_str) = prev_role.as_str() {
if prev_role_str == "assistant" || prev_role_str == "Assistant" {
if let Some(prev_content) = prev_msg.get("content") {
if let Some(prev_content_str) = prev_content.as_str() {
// Check if the previous assistant message contains a final_output tool call
if prev_content_str.contains("\"tool\": \"final_output\"") {
// This is a final_output tool result
let feedback = if content_str.starts_with("Tool result: ") {
content_str.strip_prefix("Tool result: ")
.unwrap_or(content_str)
.to_string()
} else {
content_str.to_string()
};
output.print(&format!( output.print(&format!(
"Coach feedback extracted: {} characters (from {} total)", "✅ Extracted coach feedback from session: {}",
feedback.len(),
content_str.len()
));
output.print(&format!("Coach feedback:\n{}", feedback));
output.print(&format!(
"✅ Extracted coach feedback from session: {} (verified final_output tool)",
session_id session_id
)); ));
return Ok(feedback); return Ok(feedback);
} else {
output.print(&format!(
"⚠️ Skipping tool result at index {} - not a final_output tool call",
i
));
}
}
}
}
}
}
}
}
}
}
}
}
}
}
}
}
}
}
}
} }
// If we couldn't extract from logs, panic with detailed error // Fallback: use the TaskResult's extract_summary method
panic!( let fallback = coach_result.extract_summary();
"CRITICAL: Could not extract coach feedback from session: {}\n\ if !fallback.is_empty() {
output.print(&format!(
"✅ Extracted coach feedback from response: {} chars",
fallback.len()
));
return Ok(fallback);
}
// Last resort: return an error instead of panicking
Err(anyhow::anyhow!(
"Could not extract coach feedback from session: {}\n\
Log file path: {:?}\n\ Log file path: {:?}\n\
Log file exists: {}\n\ Log file exists: {}\n\
This indicates the coach did not call final_output tool or the log is corrupted.\n\
Coach result response length: {} chars", Coach result response length: {} chars",
session_id, session_id,
log_file_path, log_file_path,
log_file_path.exists(), log_file_path.exists(),
coach_result.response.len() coach_result.response.len()
); ))
}
/// Helper function to extract feedback from a session log file
/// Looks for the last assistant message with substantial text content
fn try_extract_feedback_from_log(
log_file_path: &std::path::Path,
_output: &SimpleOutput,
) -> Option<String> {
if !log_file_path.exists() {
return None;
}
let log_content = std::fs::read_to_string(log_file_path).ok()?;
let log_json: serde_json::Value = serde_json::from_str(&log_content).ok()?;
let messages = log_json
.get("context_window")?
.get("conversation_history")?
.as_array()?;
// Search backwards for the last assistant message with text content
for msg in messages.iter().rev() {
let role = msg.get("role").and_then(|v| v.as_str())?;
if role.eq_ignore_ascii_case("assistant") {
if let Some(content) = msg.get("content") {
// Handle string content
if let Some(content_str) = content.as_str() {
let trimmed = content_str.trim();
// Skip empty or very short responses (likely just tool calls)
if !trimmed.is_empty() && trimmed.len() > 10 {
return Some(trimmed.to_string());
}
}
// Handle array content (native tool calling format)
if let Some(content_array) = content.as_array() {
for block in content_array {
if block.get("type").and_then(|v| v.as_str()) == Some("text") {
if let Some(text) = block.get("text").and_then(|v| v.as_str()) {
let trimmed = text.trim();
if !trimmed.is_empty() && trimmed.len() > 10 {
return Some(trimmed.to_string());
}
}
}
}
}
}
}
}
None
} }
use clap::Parser; use clap::Parser;
@@ -1492,7 +1482,7 @@ async fn run_interactive<W: UiWriter>(
" Context: {:.1}% used", " Context: {:.1}% used",
continuation.context_percentage continuation.context_percentage
)); ));
if let Some(ref summary) = continuation.final_output_summary { if let Some(ref summary) = continuation.summary {
let preview: String = summary.chars().take(80).collect(); let preview: String = summary.chars().take(80).collect();
output.print(&format!(" Last output: {}...", preview)); output.print(&format!(" Last output: {}...", preview));
} }
@@ -2614,16 +2604,16 @@ Review the current state of the project and provide a concise critique focusing
5. Use UI tools such as webdriver to test functionality thoroughly 5. Use UI tools such as webdriver to test functionality thoroughly
CRITICAL INSTRUCTIONS: CRITICAL INSTRUCTIONS:
1. You MUST use the final_output tool to provide your feedback 1. Provide your feedback as your final response message
2. The summary in final_output should be CONCISE and ACTIONABLE 2. Your feedback should be CONCISE and ACTIONABLE
3. Focus ONLY on what needs to be fixed or improved 3. Focus ONLY on what needs to be fixed or improved
4. Do NOT include your analysis process, file contents, or compilation output in the summary 4. Do NOT include your analysis process, file contents, or compilation output in your final feedback
If the implementation thoroughly meets all requirements, compiles and is fully tested (especially UI flows) *WITHOUT* minor gaps or errors: If the implementation thoroughly meets all requirements, compiles and is fully tested (especially UI flows) *WITHOUT* minor gaps or errors:
- Call final_output with summary: 'IMPLEMENTATION_APPROVED' - Respond with: 'IMPLEMENTATION_APPROVED'
If improvements are needed: If improvements are needed:
- Call final_output with a brief summary listing ONLY the specific issues to fix - Respond with a brief summary listing ONLY the specific issues to fix
Remember: Be clear in your review and concise in your feedback. APPROVE iff the implementation works and thoroughly fits the requirements (implementation > 95% complete). Be rigorous, especially by testing that all UI features work.", Remember: Be clear in your review and concise in your feedback. APPROVE iff the implementation works and thoroughly fits the requirements (implementation > 95% complete). Be rigorous, especially by testing that all UI features work.",
requirements requirements

View File

@@ -108,8 +108,4 @@ impl UiWriter for MachineUiWriter {
0 0
} }
fn print_final_output(&self, summary: &str) {
println!("FINAL_OUTPUT:");
println!("{}", summary);
}
} }

View File

@@ -362,36 +362,6 @@ impl UiWriter for ConsoleUiWriter {
} }
} }
fn print_final_output(&self, summary: &str) {
// Print a header separator
println!("\x1b[1;35m━━━ Summary ━━━\x1b[0m");
println!();
// Use the same streaming markdown formatter for consistency
let mut skin = MadSkin::default();
skin.bold.set_fg(termimad::crossterm::style::Color::Green);
skin.italic.set_fg(termimad::crossterm::style::Color::Cyan);
skin.inline_code.set_fg(termimad::crossterm::style::Color::Rgb {
r: 216,
g: 177,
b: 114,
});
let mut formatter = StreamingMarkdownFormatter::new(skin);
// Process the entire summary through the formatter
let formatted = formatter.process(summary);
print!("{}", formatted);
// Flush any remaining buffered content
let remaining = formatter.finish();
print!("{}", remaining);
let _ = io::stdout().flush();
// Print a footer separator
println!();
println!("\x1b[1;35m━━━━━━━━━━━━━━━\x1b[0m");
}
fn filter_json_tool_calls(&self, content: &str) -> String { fn filter_json_tool_calls(&self, content: &str) -> String {
// Apply JSON tool call filtering for display // Apply JSON tool call filtering for display

View File

@@ -1,178 +0,0 @@
//! Quick test to verify final_output rendering works with streaming markdown
//! Run with: cargo test -p g3-cli --test test_final_output -- --nocapture
use std::io::{self, Write};
#[test]
fn test_final_output_visual() {
use g3_cli::streaming_markdown::StreamingMarkdownFormatter;
use termimad::MadSkin;
// Create the test markdown
let test_markdown = r##"# Task Completed Successfully
Here's a summary of what was accomplished:
## Rust Code Example
Created a new function to handle user authentication:
```rust
use std::collections::HashMap;
/// Authenticates a user with the given credentials
pub async fn authenticate(username: &str, password: &str) -> Result<User, AuthError> {
let hash = hash_password(password)?;
if let Some(user) = db.find_user(username).await? {
if user.password_hash == hash {
Ok(user)
} else {
Err(AuthError::InvalidPassword)
}
} else {
Err(AuthError::UserNotFound)
}
}
```
## Python Example
Also added a Python script for data processing:
```python
import pandas as pd
from typing import List, Dict
def process_data(items: List[Dict]) -> pd.DataFrame:
"""Process raw items into a cleaned DataFrame."""
df = pd.DataFrame(items)
df['timestamp'] = pd.to_datetime(df['timestamp'])
df = df.dropna(subset=['value'])
return df.sort_values('timestamp')
```
## JavaScript/TypeScript
Frontend component:
```typescript
interface User {
id: string;
name: string;
email: string;
}
const UserCard: React.FC<{ user: User }> = ({ user }) => {
return (
<div className="user-card">
<h3>{user.name}</h3>
<p>{user.email}</p>
</div>
);
};
```
## Shell Commands
Deployment script:
```bash
#!/bin/bash
set -euo pipefail
echo "Building project..."
cargo build --release
echo "Running tests..."
cargo test --all
echo "Deploying to production..."
rsync -avz ./target/release/app server:/opt/app/
```
## JSON Configuration
```json
{
"name": "my-project",
"version": "1.0.0",
"dependencies": {
"serde": "1.0",
"tokio": { "version": "1.0", "features": ["full"] }
}
}
```
## Other Markdown Features
This section tests that **bold text**, *italic text*, and `inline code` still work correctly.
### Lists
- First item
- Second item with **bold**
- Third item with `code`
### Numbered List
1. Step one
2. Step two
3. Step three
### Blockquote
> This is a blockquote that should be rendered
> with proper styling by termimad.
### Table
| Language | Extension | Use Case |
|----------|-----------|----------|
| Rust | .rs | Systems |
| Python | .py | Scripts |
| TypeScript | .ts | Frontend |
## Code Without Language
```
This is a code block without a language specified.
It should still be rendered as code, just without
syntax highlighting.
```
## Final Notes
All changes have been tested and verified. The implementation:
- ✅ Handles multiple languages
- ✅ Preserves markdown formatting
- ✅ Works with nested structures
- ✅ Gracefully handles edge cases
"##;
// Create a styled markdown skin (same as in print_final_output)
let mut skin = MadSkin::default();
skin.bold.set_fg(termimad::crossterm::style::Color::Green);
skin.italic.set_fg(termimad::crossterm::style::Color::Cyan);
skin.inline_code.set_fg(termimad::crossterm::style::Color::Rgb {
r: 216,
g: 177,
b: 114,
});
// Print header
println!("\n\x1b[1;35m━━━ Summary ━━━\x1b[0m\n");
// Use the streaming markdown formatter (same as print_final_output now uses)
let mut formatter = StreamingMarkdownFormatter::new(skin);
let formatted = formatter.process(test_markdown);
print!("{}", formatted);
let remaining = formatter.finish();
print!("{}", remaining);
// Print footer
println!("\n\x1b[1;35m━━━━━━━━━━━━━━━\x1b[0m");
let _ = io::stdout().flush();
}

View File

@@ -100,21 +100,29 @@ pub fn extract_coach_feedback<W>(
where where
W: UiWriter + Clone + Send + Sync + 'static, W: UiWriter + Clone + Send + Sync + 'static,
{ {
// Try session log first (most reliable) // Try session log first - now looks for last assistant message (primary method)
if let Some(session_id) = agent.get_session_id() {
if let Some(feedback) = try_extract_last_assistant_message(&session_id, config) {
debug!("Extracted coach feedback from last assistant message: {} chars", feedback.len());
return ExtractedFeedback::new(feedback, FeedbackSource::ConversationHistory);
}
}
// Fallback: Try session log with final_output pattern (backwards compatibility)
if let Some(session_id) = agent.get_session_id() { if let Some(session_id) = agent.get_session_id() {
if let Some(feedback) = try_extract_from_session_log(&session_id, config) { if let Some(feedback) = try_extract_from_session_log(&session_id, config) {
debug!("Extracted coach feedback from session log: {} chars", feedback.len()); debug!("Extracted coach feedback from session log (final_output): {} chars", feedback.len());
return ExtractedFeedback::new(feedback, FeedbackSource::SessionLog); return ExtractedFeedback::new(feedback, FeedbackSource::SessionLog);
} }
} }
// Try native tool call JSON parsing // Fallback: Try native tool call JSON parsing (backwards compatibility)
if let Some(feedback) = try_extract_from_native_tool_call(&coach_result.response) { if let Some(feedback) = try_extract_from_native_tool_call(&coach_result.response) {
debug!("Extracted coach feedback from native tool call: {} chars", feedback.len()); debug!("Extracted coach feedback from native tool call: {} chars", feedback.len());
return ExtractedFeedback::new(feedback, FeedbackSource::NativeToolCall); return ExtractedFeedback::new(feedback, FeedbackSource::NativeToolCall);
} }
// Try conversation history // Fallback: Try conversation history with final_output pattern (backwards compatibility)
if let Some(session_id) = agent.get_session_id() { if let Some(session_id) = agent.get_session_id() {
if let Some(feedback) = try_extract_from_conversation_history(&session_id, config) { if let Some(feedback) = try_extract_from_conversation_history(&session_id, config) {
debug!("Extracted coach feedback from conversation history: {} chars", feedback.len()); debug!("Extracted coach feedback from conversation history: {} chars", feedback.len());
@@ -122,7 +130,7 @@ where
} }
} }
// Try TaskResult parsing // Fallback: Try TaskResult parsing (extracts last text block)
let extracted = coach_result.extract_final_output(); let extracted = coach_result.extract_final_output();
if !extracted.is_empty() { if !extracted.is_empty() {
debug!("Extracted coach feedback from task result: {} chars", extracted.len()); debug!("Extracted coach feedback from task result: {} chars", extracted.len());
@@ -134,6 +142,73 @@ where
ExtractedFeedback::new(config.default_feedback.clone(), FeedbackSource::DefaultFallback) ExtractedFeedback::new(config.default_feedback.clone(), FeedbackSource::DefaultFallback)
} }
/// Try to extract the last assistant message from session log (PRIMARY method)
/// This is the preferred extraction method - looks for the last substantial
/// assistant message content, regardless of whether it used final_output tool.
fn try_extract_last_assistant_message(
session_id: &str,
config: &FeedbackExtractionConfig,
) -> Option<String> {
// Try new .g3/sessions/<session_id>/session.json path first
let log_file_path = crate::get_session_file(session_id);
// Fall back to old logs/ path if new path doesn't exist
let log_file_path = if log_file_path.exists() {
log_file_path
} else {
let logs_path = config.logs_dir.clone().unwrap_or_else(logs_dir);
logs_path.join(format!("g3_session_{}.json", session_id))
};
if !log_file_path.exists() {
debug!("Session log file not found: {:?}", log_file_path);
return None;
}
let log_content = std::fs::read_to_string(&log_file_path).ok()?;
let log_json: Value = serde_json::from_str(&log_content).ok()?;
// Try to get conversation history from context_window
let messages = log_json
.get("context_window")?
.get("conversation_history")?
.as_array()?;
// Search backwards for the last assistant message with text content
for msg in messages.iter().rev() {
let role = msg.get("role").and_then(|v| v.as_str())?;
if role.eq_ignore_ascii_case("assistant") {
if let Some(content) = msg.get("content") {
// Handle string content
if let Some(content_str) = content.as_str() {
let trimmed = content_str.trim();
// Skip empty or very short responses (likely just tool calls)
if !trimmed.is_empty() && trimmed.len() > 10 {
return Some(trimmed.to_string());
}
}
// Handle array content (native tool calling format)
// Look for text blocks in the array
if let Some(content_array) = content.as_array() {
for block in content_array {
if block.get("type").and_then(|v| v.as_str()) == Some("text") {
if let Some(text) = block.get("text").and_then(|v| v.as_str()) {
let trimmed = text.trim();
if !trimmed.is_empty() && trimmed.len() > 10 {
return Some(trimmed.to_string());
}
}
}
}
}
}
}
}
None
}
/// Try to extract feedback from session log file /// Try to extract feedback from session log file
fn try_extract_from_session_log( fn try_extract_from_session_log(
session_id: &str, session_id: &str,

View File

@@ -1365,8 +1365,8 @@ impl<W: UiWriter> Agent<W> {
} }
/// Save a session continuation artifact /// Save a session continuation artifact
/// Called when final_output is invoked to enable session resumption /// Save session continuation for potential resumption
pub fn save_session_continuation(&self, final_output_summary: Option<String>) { pub fn save_session_continuation(&self, summary: Option<String>) {
use crate::session_continuation::{save_continuation, SessionContinuation}; use crate::session_continuation::{save_continuation, SessionContinuation};
let session_id = match &self.session_id { let session_id = match &self.session_id {
@@ -1398,7 +1398,7 @@ impl<W: UiWriter> Agent<W> {
self.is_agent_mode, self.is_agent_mode,
self.agent_name.clone(), self.agent_name.clone(),
session_id, session_id,
final_output_summary, summary,
session_log_path.to_string_lossy().to_string(), session_log_path.to_string_lossy().to_string(),
self.context_window.percentage_used(), self.context_window.percentage_used(),
todo_snapshot, todo_snapshot,
@@ -1494,9 +1494,9 @@ impl<W: UiWriter> Agent<W> {
} }
} }
// Fall back to using final_output summary + TODO // Fall back to using session summary + TODO
let mut context_msg = String::new(); let mut context_msg = String::new();
if let Some(ref summary) = continuation.final_output_summary { if let Some(ref summary) = continuation.summary {
context_msg.push_str(&format!("Previous session summary:\n{}\n\n", summary)); context_msg.push_str(&format!("Previous session summary:\n{}\n\n", summary));
} }
if let Some(ref todo) = continuation.todo_snapshot { if let Some(ref todo) = continuation.todo_snapshot {
@@ -1601,7 +1601,7 @@ impl<W: UiWriter> Agent<W> {
let mut any_tool_executed = false; // Track if ANY tool was executed across all iterations let mut any_tool_executed = false; // Track if ANY tool was executed across all iterations
let mut auto_summary_attempts = 0; // Track auto-summary prompt attempts let mut auto_summary_attempts = 0; // Track auto-summary prompt attempts
const MAX_AUTO_SUMMARY_ATTEMPTS: usize = 5; // Limit auto-summary retries (increased from 2 for better recovery) const MAX_AUTO_SUMMARY_ATTEMPTS: usize = 5; // Limit auto-summary retries (increased from 2 for better recovery)
let final_output_called = false; // Track if final_output was called //
// Note: Session-level duplicate tracking was removed - we only prevent sequential duplicates (DUP IN CHUNK, DUP IN MSG) // Note: Session-level duplicate tracking was removed - we only prevent sequential duplicates (DUP IN CHUNK, DUP IN MSG)
let mut turn_accumulated_usage: Option<g3_providers::Usage> = None; // Track token usage for timing footer let mut turn_accumulated_usage: Option<g3_providers::Usage> = None; // Track token usage for timing footer
@@ -2004,8 +2004,8 @@ impl<W: UiWriter> Agent<W> {
String::new() String::new()
}; };
// Don't display text before final_output - it will be in the summary // Display any new text content
if !new_content.trim().is_empty() && tool_call.tool != "final_output" { if !new_content.trim().is_empty() {
#[allow(unused_assignments)] #[allow(unused_assignments)]
if !response_started { if !response_started {
self.ui_writer.print_agent_prompt(); self.ui_writer.print_agent_prompt();
@@ -2019,8 +2019,6 @@ impl<W: UiWriter> Agent<W> {
// Execute the tool with formatted output // Execute the tool with formatted output
// Skip printing tool call details for final_output
if tool_call.tool != "final_output" {
// Finish streaming markdown before showing tool output // Finish streaming markdown before showing tool output
self.ui_writer.finish_streaming_markdown(); self.ui_writer.finish_streaming_markdown();
@@ -2052,7 +2050,6 @@ impl<W: UiWriter> Agent<W> {
} }
} }
self.ui_writer.print_tool_output_header(); self.ui_writer.print_tool_output_header();
}
// Clone working_dir to avoid borrow checker issues // Clone working_dir to avoid borrow checker issues
let working_dir = self.working_dir.clone(); let working_dir = self.working_dir.clone();
@@ -2082,11 +2079,7 @@ impl<W: UiWriter> Agent<W> {
)); ));
// Display tool execution result with proper indentation // Display tool execution result with proper indentation
if tool_call.tool == "final_output" { {
// For final_output, use the dedicated method that renders markdown
// with a spinner animation
self.ui_writer.print_final_output(&tool_result);
} else {
let output_lines: Vec<&str> = tool_result.lines().collect(); let output_lines: Vec<&str> = tool_result.lines().collect();
// Check if UI wants full output (machine mode) or truncated (human mode) // Check if UI wants full output (machine mode) or truncated (human mode)
@@ -2192,47 +2185,13 @@ impl<W: UiWriter> Agent<W> {
self.context_window.add_message(tool_message); self.context_window.add_message(tool_message);
self.context_window.add_message(result_message); self.context_window.add_message(result_message);
// Check if this was a final_output tool call
if tool_call.tool == "final_output" {
// Finish the streaming markdown formatter before final_output
self.ui_writer.finish_streaming_markdown();
// Save context window BEFORE returning so the session log includes final_output
self.save_context_window("completed");
// The summary was already displayed via print_final_output
// Don't add it to full_response to avoid duplicate printing
// full_response is intentionally left empty/unchanged
let _ttft =
first_token_time.unwrap_or_else(|| stream_start.elapsed());
// Add timing if needed
let final_response = if show_timing {
format!(
"🕝 {} | 💭 {}",
Self::format_duration(stream_start.elapsed()),
Self::format_duration(_ttft)
)
} else {
// Return empty string since content was already displayed
String::new()
};
return Ok(TaskResult::new(
final_response,
self.context_window.clone(),
));
}
// Closure marker with timing // Closure marker with timing
if tool_call.tool != "final_output" {
let tokens_delta = self.context_window.used_tokens.saturating_sub(tokens_before); let tokens_delta = self.context_window.used_tokens.saturating_sub(tokens_before);
self.ui_writer self.ui_writer
.print_tool_timing(&Self::format_duration(exec_duration), .print_tool_timing(&Self::format_duration(exec_duration),
tokens_delta, tokens_delta,
self.context_window.percentage_used()); self.context_window.percentage_used());
self.ui_writer.print_agent_prompt(); self.ui_writer.print_agent_prompt();
}
// Update the request with the new context for next iteration // Update the request with the new context for next iteration
request.messages = self.context_window.conversation_history.clone(); request.messages = self.context_window.conversation_history.clone();
@@ -2251,7 +2210,7 @@ impl<W: UiWriter> Agent<W> {
// The content was already displayed during streaming and added to current_response. // The content was already displayed during streaming and added to current_response.
// Adding it again would cause duplication when the agent message is printed. // Adding it again would cause duplication when the agent message is printed.
// The only time we should add to full_response is: // The only time we should add to full_response is:
// 1. For final_output tool (handled separately) // 1. At the end when no tools were executed
// 2. At the end when no tools were executed (handled in the "no tool executed" branch) // 2. At the end when no tools were executed (handled in the "no tool executed" branch)
tool_executed = true; tool_executed = true;
@@ -2324,7 +2283,7 @@ impl<W: UiWriter> Agent<W> {
// No tools were executed in this iteration // No tools were executed in this iteration
// Check if we got any meaningful response at all // Check if we got any meaningful response at all
// We need to check the parser's text buffer as well, since the LLM // We need to check the parser's text buffer as well, since the LLM
// might have responded with text but no final_output tool call // might have responded with text but no tool calls
let text_content = parser.get_text_content(); let text_content = parser.get_text_content();
let has_text_response = !text_content.trim().is_empty() let has_text_response = !text_content.trim().is_empty()
|| !current_response.trim().is_empty(); || !current_response.trim().is_empty();
@@ -2376,10 +2335,10 @@ impl<W: UiWriter> Agent<W> {
)); ));
} }
// If tools were executed in previous iterations but final_output wasn't called, // If tools were executed in previous iterations,
// break to let the outer loop's auto-continue logic handle it // break to let the outer loop's auto-continue logic handle it
if any_tool_executed && !final_output_called { if any_tool_executed {
debug!("Tools were executed but final_output not called - breaking to auto-continue"); debug!("Tools were executed, continuing - breaking to auto-continue");
// NOTE: We intentionally do NOT set full_response here. // NOTE: We intentionally do NOT set full_response here.
// The content was already displayed during streaming. // The content was already displayed during streaming.
// Setting full_response would cause duplication when the // Setting full_response would cause duplication when the
@@ -2529,15 +2488,15 @@ impl<W: UiWriter> Agent<W> {
warn!("Unexecuted tool call detected in buffer after stream ended"); warn!("Unexecuted tool call detected in buffer after stream ended");
} }
// Auto-continue if tools were executed but final_output was never called // Auto-continue if tools were executed and we are in autonomous mode
// OR if the LLM emitted an incomplete tool call (truncated JSON) // OR if the LLM emitted an incomplete tool call (truncated JSON)
// OR if the LLM emitted a complete tool call that wasn't executed // OR if the LLM emitted a complete tool call that wasn't executed
// This ensures we don't return control when the LLM clearly intended to call a tool // This ensures we don't return control when the LLM clearly intended to call a tool
// Note: We removed the redundant condition (any_tool_executed && is_empty_response) // Note: We removed the redundant condition (any_tool_executed && is_empty_response)
// because it's already covered by (any_tool_executed && !final_output_called) // because it's already covered by (any_tool_executed )
// Auto-continue is only enabled in autonomous mode - in interactive mode, // Auto-continue is only enabled in autonomous mode - in interactive mode,
// the user may be asking questions and we should return control to them // the user may be asking questions and we should return control to them
let should_auto_continue = self.is_autonomous && ((any_tool_executed && !final_output_called) let should_auto_continue = self.is_autonomous && ((any_tool_executed )
|| has_incomplete_tool_call || has_incomplete_tool_call
|| has_unexecuted_tool_call); || has_unexecuted_tool_call);
if should_auto_continue { if should_auto_continue {
@@ -2569,11 +2528,11 @@ impl<W: UiWriter> Agent<W> {
); );
} else { } else {
warn!( warn!(
"LLM stopped without calling final_output after executing tools ({} iterations, auto-continue attempt {}/{})", "LLM stopped after executing tools ({} iterations, auto-continue attempt {}/{})",
iteration_count, auto_summary_attempts, MAX_AUTO_SUMMARY_ATTEMPTS iteration_count, auto_summary_attempts, MAX_AUTO_SUMMARY_ATTEMPTS
); );
self.ui_writer.print_context_status( self.ui_writer.print_context_status(
"\n🔄 Model stopped without calling final_output. Auto-continuing...\n" "\n🔄 Model stopped without providing summary. Auto-continuing...\n"
); );
} }
@@ -2602,7 +2561,7 @@ impl<W: UiWriter> Agent<W> {
} else { } else {
Message::new( Message::new(
MessageRole::User, MessageRole::User,
"Please continue until you are done. You **MUST** call `final_output` with a summary when done.".to_string(), "Please continue until you are done. Provide a summary when complete.".to_string(),
) )
}; };
self.context_window.add_message(continue_prompt); self.context_window.add_message(continue_prompt);
@@ -2613,22 +2572,22 @@ impl<W: UiWriter> Agent<W> {
} else { } else {
// Max attempts reached, give up gracefully // Max attempts reached, give up gracefully
warn!( warn!(
"Max auto-continue attempts ({}) reached after {} iterations. Conditions: any_tool_executed={}, final_output_called={}, has_incomplete={}, has_unexecuted={}, is_empty_response={}", "Max auto-continue attempts ({}) reached after {} iterations. Conditions: any_tool_executed={}, has_incomplete={}, has_unexecuted={}, is_empty_response={}",
MAX_AUTO_SUMMARY_ATTEMPTS, MAX_AUTO_SUMMARY_ATTEMPTS,
iteration_count, iteration_count,
any_tool_executed, any_tool_executed,
final_output_called,
has_incomplete_tool_call, has_incomplete_tool_call,
has_unexecuted_tool_call, has_unexecuted_tool_call,
is_empty_response is_empty_response
); );
self.ui_writer.print_agent_response( self.ui_writer.print_agent_response(
&format!("\n⚠️ The model stopped without calling final_output after {} auto-continue attempts.\n", MAX_AUTO_SUMMARY_ATTEMPTS) &format!("\n⚠️ The model stopped without providing a summary after {} auto-continue attempts.\n", MAX_AUTO_SUMMARY_ATTEMPTS)
); );
} }
} else if has_response { } else if has_response {
// Only set full_response if it's empty (first iteration without tools) // Only set full_response if it's empty (first iteration without tools)
// This prevents duplication when the agent responds without calling final_output // This prevents duplication when the agent responds
// NOTE: We intentionally do NOT set full_response here anymore. // NOTE: We intentionally do NOT set full_response here anymore.
// The content was already displayed during streaming via print_agent_response(). // The content was already displayed during streaming via print_agent_response().
// Setting full_response would cause the CLI to print it again. // Setting full_response would cause the CLI to print it again.
@@ -2772,12 +2731,6 @@ impl<W: UiWriter> Agent<W> {
// Dispatch to the appropriate tool handler // Dispatch to the appropriate tool handler
let result = tool_dispatch::dispatch_tool(tool_call, &mut ctx).await?; let result = tool_dispatch::dispatch_tool(tool_call, &mut ctx).await?;
// Handle special case: final_output needs to save session continuation
if tool_call.tool == "final_output" {
let summary = tool_call.args.get("summary").and_then(|v| v.as_str());
self.save_session_continuation(summary.map(|s| s.to_string()));
}
Ok(result) Ok(result)
} }

View File

@@ -34,7 +34,7 @@ IMPORTANT: You must call tools to achieve goals. When you receive a request:
2. Call the appropriate tool with the required parameters 2. Call the appropriate tool with the required parameters
3. Continue or complete the task based on the result 3. Continue or complete the task based on the result
4. If you repeatedly try something and it fails, try a different approach 4. If you repeatedly try something and it fails, try a different approach
5. Call the final_output tool with a detailed summary when done. 5. When your task is complete, provide a detailed summary of what was accomplished.
For shell commands: Use the shell tool with the exact command needed. Avoid commands that produce a large amount of output, and consider piping those outputs to files. Example: If asked to list files, immediately call the shell tool with command parameter \"ls\". For shell commands: Use the shell tool with the exact command needed. Avoid commands that produce a large amount of output, and consider piping those outputs to files. Example: If asked to list files, immediately call the shell tool with command parameter \"ls\".
If you create temporary files for verification, place these in a subdir named 'tmp'. Do NOT pollute the current dir. If you create temporary files for verification, place these in a subdir named 'tmp'. Do NOT pollute the current dir.
@@ -201,9 +201,6 @@ Short description for providers without native calling specs:
- Format: {\"tool\": \"str_replace\", \"args\": {\"file_path\": \"path/to/file\", \"diff\": \"--- old\\n-old text\\n+++ new\\n+new text\"} - Format: {\"tool\": \"str_replace\", \"args\": {\"file_path\": \"path/to/file\", \"diff\": \"--- old\\n-old text\\n+++ new\\n+new text\"}
- Example: {\"tool\": \"str_replace\", \"args\": {\"file_path\": \"src/main.rs\", \"diff\": \"--- old\\n-old_code();\\n+++ new\\n+new_code();\"} - Example: {\"tool\": \"str_replace\", \"args\": {\"file_path\": \"src/main.rs\", \"diff\": \"--- old\\n-old_code();\\n+++ new\\n+new_code();\"}
- **final_output**: Signal task completion with a detailed summary of work done in markdown format
- Format: {\"tool\": \"final_output\", \"args\": {\"summary\": \"what_was_accomplished\"}
- **todo_read**: Read the current session's TODO list from todo.g3.md (session-scoped) - **todo_read**: Read the current session's TODO list from todo.g3.md (session-scoped)
- Format: {\"tool\": \"todo_read\", \"args\": {}} - Format: {\"tool\": \"todo_read\", \"args\": {}}
- Example: {\"tool\": \"todo_read\", \"args\": {}} - Example: {\"tool\": \"todo_read\", \"args\": {}}
@@ -227,7 +224,7 @@ Short description for providers without native calling specs:
1. Analyze the request and break down into smaller tasks if appropriate 1. Analyze the request and break down into smaller tasks if appropriate
2. Execute ONE tool at a time. An exception exists for when you're writing files. See below. 2. Execute ONE tool at a time. An exception exists for when you're writing files. See below.
3. STOP when the original request was satisfied 3. STOP when the original request was satisfied
4. Call the final_output tool when done 4. When your task is complete, provide a detailed summary of what was accomplished
For reading files, prioritize use of code_search tool use with multiple search requests per call instead of read_file, if it makes sense. For reading files, prioritize use of code_search tool use with multiple search requests per call instead of read_file, if it makes sense.

View File

@@ -32,8 +32,8 @@ pub struct SessionContinuation {
pub created_at: String, pub created_at: String,
/// Original session ID /// Original session ID
pub session_id: String, pub session_id: String,
/// The last final_output summary /// Session summary (last assistant response)
pub final_output_summary: Option<String>, pub summary: Option<String>,
/// Path to the full session log (g3_session_*.json) /// Path to the full session log (g3_session_*.json)
pub session_log_path: String, pub session_log_path: String,
/// Context window usage percentage when saved /// Context window usage percentage when saved
@@ -50,7 +50,7 @@ impl SessionContinuation {
is_agent_mode: bool, is_agent_mode: bool,
agent_name: Option<String>, agent_name: Option<String>,
session_id: String, session_id: String,
final_output_summary: Option<String>, summary: Option<String>,
session_log_path: String, session_log_path: String,
context_percentage: f32, context_percentage: f32,
todo_snapshot: Option<String>, todo_snapshot: Option<String>,
@@ -62,7 +62,7 @@ impl SessionContinuation {
agent_name, agent_name,
created_at: chrono::Utc::now().to_rfc3339(), created_at: chrono::Utc::now().to_rfc3339(),
session_id, session_id,
final_output_summary, summary,
session_log_path, session_log_path,
context_percentage, context_percentage,
todo_snapshot, todo_snapshot,

View File

@@ -17,8 +17,15 @@ impl TaskResult {
} }
} }
/// Extract the final_output content from the response (for coach feedback in autonomous mode) /// Extract a summary from the response (for coach feedback in autonomous mode)
/// This looks for the complete final_output content, not just the last block /// This looks for the last substantial text block in the response.
/// Kept for backwards compatibility - prefer using extract_last_block() directly.
pub fn extract_summary(&self) -> String {
self.extract_last_block()
}
/// Legacy method - extract the final_output content from the response
/// Now strips any trailing timing info and returns the last substantial block
pub fn extract_final_output(&self) -> String { pub fn extract_final_output(&self) -> String {
// Remove any timing information at the end // Remove any timing information at the end
let content_without_timing = if let Some(timing_pos) = self.response.rfind("\n⏱️") { let content_without_timing = if let Some(timing_pos) = self.response.rfind("\n⏱️") {
@@ -27,30 +34,23 @@ impl TaskResult {
&self.response &self.response
}; };
// Look for the final_output marker pattern // For backwards compatibility, still check for final_output marker
// The final_output content typically appears after the tool is called // but primarily just return the last substantial block
// and is the substantive content that follows self.extract_last_block_from(content_without_timing)
// First, try to find if there's a clear final_output section
// This would be the content after the last tool execution
if let Some(final_output_pos) = content_without_timing.rfind("final_output") {
// Find the content that follows the final_output call
// Skip past the tool call line and any immediate formatting
if let Some(content_start) = content_without_timing[final_output_pos..].find('\n') {
let start_pos = final_output_pos + content_start + 1;
let final_content = &content_without_timing[start_pos..];
// Trim and return the complete content
let trimmed = final_content.trim();
if !trimmed.is_empty() {
return trimmed.to_string();
}
}
} }
// Fallback to the original extract_last_block behavior if we can't find final_output /// Extract the last block from a given string
// This maintains backward compatibility fn extract_last_block_from(&self, content: &str) -> String {
self.extract_last_block() // Split by double newlines to find the last substantial block
let blocks: Vec<&str> = content.split("\n\n").collect();
// Find the last non-empty block that isn't just whitespace
blocks
.iter()
.rev()
.find(|block| !block.trim().is_empty())
.map(|block| block.trim().to_string())
.unwrap_or_else(|| content.trim().to_string())
} }
/// Extract the last block from the response (for coach feedback in autonomous mode) /// Extract the last block from the response (for coach feedback in autonomous mode)
@@ -138,33 +138,32 @@ mod tests {
fn test_extract_final_output() { fn test_extract_final_output() {
let context_window = ContextWindow::new(1000); let context_window = ContextWindow::new(1000);
// Test case 1: Response with final_output tool call // Test case 1: Response with multiple blocks - extracts last substantial block
let response_with_final_output = "Analyzing files...\n\nCalling final_output\n\nThis is the complete feedback\nwith multiple lines\nand important details\n\n⏱️ 2.3s".to_string(); let response_with_blocks = "Analyzing files...\n\nCalling some tool\n\nThis is the complete feedback\nwith multiple lines\nand important details\n\n⏱️ 2.3s".to_string();
let result = TaskResult::new(response_with_final_output, context_window.clone()); let result = TaskResult::new(response_with_blocks, context_window.clone());
assert_eq!( assert_eq!(
result.extract_final_output(), result.extract_final_output(),
"This is the complete feedback\nwith multiple lines\nand important details" "This is the complete feedback\nwith multiple lines\nand important details"
); );
// Test case 2: Response with IMPLEMENTATION_APPROVED in final_output // Test case 2: Response with IMPLEMENTATION_APPROVED as last block
let response_approved = let response_approved =
"Review complete\n\nfinal_output called\n\nIMPLEMENTATION_APPROVED".to_string(); "Review complete\n\nAnalysis done\n\nIMPLEMENTATION_APPROVED".to_string();
let result = TaskResult::new(response_approved, context_window.clone()); let result = TaskResult::new(response_approved, context_window.clone());
assert_eq!(result.extract_final_output(), "IMPLEMENTATION_APPROVED"); assert_eq!(result.extract_final_output(), "IMPLEMENTATION_APPROVED");
assert!(result.is_approved()); assert!(result.is_approved());
// Test case 3: Response with detailed feedback in final_output // Test case 3: Response with detailed feedback as last block
let response_feedback = "Checking implementation...\n\nfinal_output\n\nThe following issues need to be addressed:\n1. Missing error handling in main.rs\n2. Tests are not comprehensive\n3. Documentation needs improvement\n\nPlease fix these issues.".to_string(); let response_feedback = "Checking implementation...\n\nAnalysis complete\n\nThe following issues need to be addressed:\n1. Missing error handling in main.rs\n2. Tests are not comprehensive\n3. Documentation needs improvement\n\nPlease fix these issues.".to_string();
let result = TaskResult::new(response_feedback, context_window.clone()); let result = TaskResult::new(response_feedback, context_window.clone());
let extracted = result.extract_final_output(); let extracted = result.extract_final_output();
assert!(extracted.contains("The following issues need to be addressed:")); // Now extracts just the last block (after the last \n\n)
assert!(extracted.contains("1. Missing error handling"));
assert!(extracted.contains("Please fix these issues.")); assert!(extracted.contains("Please fix these issues."));
assert!(!result.is_approved()); assert!(!result.is_approved());
// Test case 4: Response without final_output (fallback to extract_last_block) // Test case 4: Simple response - extracts last block
let response_no_final_output = "Some analysis\n\nFinal thoughts here".to_string(); let response_simple = "Some analysis\n\nFinal thoughts here".to_string();
let result = TaskResult::new(response_no_final_output, context_window.clone()); let result = TaskResult::new(response_simple, context_window.clone());
assert_eq!(result.extract_final_output(), "Final thoughts here"); assert_eq!(result.extract_final_output(), "Final thoughts here");
// Test case 5: Empty response // Test case 5: Empty response

View File

@@ -157,20 +157,6 @@ fn create_core_tools() -> Vec<Tool> {
"required": ["file_path", "diff"] "required": ["file_path", "diff"]
}), }),
}, },
Tool {
name: "final_output".to_string(),
description: "Signal task completion with a detailed summary".to_string(),
input_schema: json!({
"type": "object",
"properties": {
"summary": {
"type": "string",
"description": "A detailed summary in markdown of what was accomplished"
}
},
"required": ["summary"]
}),
},
Tool { Tool {
name: "take_screenshot".to_string(), name: "take_screenshot".to_string(),
description: "Capture a screenshot of a specific application window. You MUST specify the window_id parameter with the application name (e.g., 'Safari', 'Terminal', 'Google Chrome'). The tool will automatically use the native screencapture command with the application's window ID for a clean capture. Use list_windows first to identify available windows.".to_string(), description: "Capture a screenshot of a specific application window. You MUST specify the window_id parameter with the application name (e.g., 'Safari', 'Terminal', 'Google Chrome'). The tool will automatically use the native screencapture command with the application's window ID for a clean capture. Use list_windows first to identify available windows.".to_string(),
@@ -462,8 +448,8 @@ mod tests {
let tools = create_core_tools(); let tools = create_core_tools();
// Should have the core tools: shell, background_process, read_file, read_image, // Should have the core tools: shell, background_process, read_file, read_image,
// write_file, str_replace, final_output, take_screenshot, // write_file, str_replace, take_screenshot,
// todo_read, todo_write, code_coverage, code_search (12 total) // todo_read, todo_write, code_coverage, code_search (11 total)
assert_eq!(tools.len(), 12); assert_eq!(tools.len(), 11);
} }
#[test] #[test]
@@ -477,15 +463,15 @@ mod tests {
fn test_create_tool_definitions_core_only() { fn test_create_tool_definitions_core_only() {
let config = ToolConfig::default(); let config = ToolConfig::default();
let tools = create_tool_definitions(config); let tools = create_tool_definitions(config);
assert_eq!(tools.len(), 12); assert_eq!(tools.len(), 11);
} }
#[test] #[test]
fn test_create_tool_definitions_all_enabled() { fn test_create_tool_definitions_all_enabled() {
let config = ToolConfig::new(true, true); let config = ToolConfig::new(true, true);
let tools = create_tool_definitions(config); let tools = create_tool_definitions(config);
// 12 core + 15 webdriver = 27 // 11 core + 15 webdriver = 26
assert_eq!(tools.len(), 27); assert_eq!(tools.len(), 26);
} }
#[test] #[test]

View File

@@ -37,11 +37,6 @@ pub async fn dispatch_tool<W: UiWriter>(
"todo_write" => todo::execute_todo_write(tool_call, ctx).await, "todo_write" => todo::execute_todo_write(tool_call, ctx).await,
// Miscellaneous tools // Miscellaneous tools
"final_output" => {
let result = misc::execute_final_output(tool_call, ctx).await?;
// Note: Session continuation saving is handled by the caller
Ok(result)
}
"take_screenshot" => misc::execute_take_screenshot(tool_call, ctx).await, "take_screenshot" => misc::execute_take_screenshot(tool_call, ctx).await,
"code_coverage" => misc::execute_code_coverage(tool_call, ctx).await, "code_coverage" => misc::execute_code_coverage(tool_call, ctx).await,
"code_search" => misc::execute_code_search(tool_call, ctx).await, "code_search" => misc::execute_code_search(tool_call, ctx).await,

View File

@@ -1,4 +1,4 @@
//! Miscellaneous tools: final_output, take_screenshot, code_coverage, code_search. //! Miscellaneous tools: take_screenshot, code_coverage, code_search.
use anyhow::Result; use anyhow::Result;
use tracing::debug; use tracing::debug;
@@ -8,42 +8,6 @@ use crate::ToolCall;
use super::executor::ToolContext; use super::executor::ToolContext;
/// Execute the `final_output` tool.
pub async fn execute_final_output<W: UiWriter>(
tool_call: &ToolCall,
ctx: &ToolContext<'_, W>,
) -> Result<String> {
debug!("Processing final_output tool call");
let summary_str = tool_call.args.get("summary").and_then(|v| v.as_str());
// In autonomous mode, check for incomplete TODO items before allowing completion
if ctx.is_autonomous {
let todo_content = ctx.todo_content.read().await;
let has_incomplete_todos = todo_content
.lines()
.any(|line| line.trim().starts_with("- [ ]"));
drop(todo_content);
if has_incomplete_todos {
return Ok(
"There are still incomplete TODO items. Please continue until \
*ALL* TODO items in *ALL* phases are marked complete, and \
*ONLY* then call `final_output`."
.to_string(),
);
}
}
// Return the summary or a default message
// Note: Session continuation saving is handled by the caller (Agent)
if let Some(summary) = summary_str {
Ok(summary.to_string())
} else {
Ok("✅ Turn completed".to_string())
}
}
/// Execute the `take_screenshot` tool. /// Execute the `take_screenshot` tool.
pub async fn execute_take_screenshot<W: UiWriter>( pub async fn execute_take_screenshot<W: UiWriter>(
tool_call: &ToolCall, tool_call: &ToolCall,

View File

@@ -66,10 +66,6 @@ pub trait UiWriter: Send + Sync {
/// Returns the index of the selected option /// Returns the index of the selected option
fn prompt_user_choice(&self, message: &str, options: &[&str]) -> usize; fn prompt_user_choice(&self, message: &str, options: &[&str]) -> usize;
/// Print the final output summary with markdown formatting
/// Shows a spinner while formatting, then renders the markdown
fn print_final_output(&self, summary: &str);
/// Filter JSON tool calls from streaming content for display. /// Filter JSON tool calls from streaming content for display.
/// This is a UI concern - the raw content should be preserved for logging. /// This is a UI concern - the raw content should be preserved for logging.
/// Default implementation passes through unchanged. /// Default implementation passes through unchanged.
@@ -125,7 +121,4 @@ impl UiWriter for NullUiWriter {
fn prompt_user_choice(&self, _message: &str, _options: &[&str]) -> usize { fn prompt_user_choice(&self, _message: &str, _options: &[&str]) -> usize {
0 0
} }
fn print_final_output(&self, _summary: &str) {
// No-op for null writer
}
} }

View File

@@ -1,198 +0,0 @@
//! Tests for final_output blocking when TODO items are incomplete in autonomous mode
//!
//! This test verifies that:
//! 1. In autonomous mode: final_output rejects completion when there are incomplete TODO items
//! 2. In non-autonomous mode: final_output always succeeds (no TODO check)
use g3_config::Config;
use g3_core::ui_writer::NullUiWriter;
use g3_core::Agent;
use serial_test::serial;
use tempfile::TempDir;
/// Helper to create a test agent in NON-autonomous mode (interactive/chat mode)
async fn create_non_autonomous_agent(temp_dir: &TempDir) -> Agent<NullUiWriter> {
std::env::set_current_dir(temp_dir.path()).unwrap();
let config = Config::default();
// new_with_readme_and_quiet creates a NON-autonomous agent (is_autonomous = false)
Agent::new_with_readme_and_quiet(config, NullUiWriter, None, true)
.await
.unwrap()
}
/// Helper to create a test agent in AUTONOMOUS mode (agent mode)
async fn create_autonomous_agent(temp_dir: &TempDir) -> Agent<NullUiWriter> {
std::env::set_current_dir(temp_dir.path()).unwrap();
let config = Config::default();
// new_autonomous_with_readme_and_quiet creates an AUTONOMOUS agent (is_autonomous = true)
Agent::new_autonomous_with_readme_and_quiet(config, NullUiWriter, None, true)
.await
.unwrap()
}
/// Helper to simulate a tool call
fn create_tool_call(tool: &str, args: serde_json::Value) -> g3_core::ToolCall {
g3_core::ToolCall {
tool: tool.to_string(),
args,
}
}
// =============================================================================
// AUTONOMOUS MODE TESTS - TODO check IS enforced
// =============================================================================
#[tokio::test]
#[serial]
async fn test_autonomous_final_output_blocked_with_incomplete_todos() {
let temp_dir = TempDir::new().unwrap();
let mut agent = create_autonomous_agent(&temp_dir).await;
// First, write a TODO list with incomplete items
let todo_content = "- [ ] Phase 1: Setup\n - [x] Create files\n - [ ] Configure settings\n- [ ] Phase 2: Implementation";
let write_args = serde_json::json!({ "content": todo_content });
let write_call = create_tool_call("todo_write", write_args);
let write_result = agent.execute_tool(&write_call).await.unwrap();
assert!(write_result.contains("TODO list updated"), "Expected TODO write to succeed");
// Now try to call final_output - it should be rejected in autonomous mode
let final_args = serde_json::json!({ "summary": "Completed phase 1" });
let final_call = create_tool_call("final_output", final_args);
let final_result = agent.execute_tool(&final_call).await.unwrap();
// Verify that final_output was rejected due to incomplete TODOs
assert!(
final_result.contains("incomplete TODO"),
"Expected final_output to be rejected in autonomous mode when TODOs are incomplete. Got: {}",
final_result
);
}
#[tokio::test]
#[serial]
async fn test_autonomous_final_output_allowed_with_complete_todos() {
let temp_dir = TempDir::new().unwrap();
let mut agent = create_autonomous_agent(&temp_dir).await;
// Write a TODO list with ALL items complete
let todo_content = "- [x] Phase 1: Setup\n - [x] Create files\n - [x] Configure settings\n- [x] Phase 2: Implementation";
let write_args = serde_json::json!({ "content": todo_content });
let write_call = create_tool_call("todo_write", write_args);
let _write_result = agent.execute_tool(&write_call).await.unwrap();
// Now try to call final_output - it should succeed
let final_args = serde_json::json!({ "summary": "All phases completed successfully" });
let final_call = create_tool_call("final_output", final_args);
let final_result = agent.execute_tool(&final_call).await.unwrap();
// Verify that final_output succeeded (returns the summary)
assert!(
final_result.contains("All phases completed successfully"),
"Expected final_output to return the summary in autonomous mode when all TODOs complete. Got: {}",
final_result
);
}
#[tokio::test]
#[serial]
async fn test_autonomous_final_output_allowed_with_no_todos() {
let temp_dir = TempDir::new().unwrap();
let mut agent = create_autonomous_agent(&temp_dir).await;
// Don't create any TODO list - final_output should still work
let final_args = serde_json::json!({ "summary": "Simple task completed" });
let final_call = create_tool_call("final_output", final_args);
let final_result = agent.execute_tool(&final_call).await.unwrap();
// Verify that final_output succeeded
assert!(
final_result.contains("Simple task completed"),
"Expected final_output to return the summary when no TODOs exist. Got: {}",
final_result
);
}
#[tokio::test]
#[serial]
async fn test_autonomous_final_output_blocked_with_mixed_todos() {
let temp_dir = TempDir::new().unwrap();
let mut agent = create_autonomous_agent(&temp_dir).await;
// Write a TODO list with some complete and some incomplete items
let todo_content = "- [x] Phase 1: Setup\n- [ ] Phase 2: Implementation\n- [x] Phase 3: Testing";
let write_args = serde_json::json!({ "content": todo_content });
let write_call = create_tool_call("todo_write", write_args);
let _write_result = agent.execute_tool(&write_call).await.unwrap();
// Try to call final_output - should be rejected
let final_args = serde_json::json!({ "summary": "Done with phases 1 and 3" });
let final_call = create_tool_call("final_output", final_args);
let final_result = agent.execute_tool(&final_call).await.unwrap();
// Verify rejection
assert!(
final_result.contains("incomplete TODO"),
"Expected final_output to be rejected with mixed TODOs in autonomous mode. Got: {}",
final_result
);
}
// =============================================================================
// NON-AUTONOMOUS MODE TESTS - TODO check is NOT enforced
// =============================================================================
#[tokio::test]
#[serial]
async fn test_non_autonomous_final_output_allowed_with_incomplete_todos() {
let temp_dir = TempDir::new().unwrap();
let mut agent = create_non_autonomous_agent(&temp_dir).await;
// Write a TODO list with incomplete items
let todo_content = "- [ ] Phase 1: Setup\n - [x] Create files\n - [ ] Configure settings\n- [ ] Phase 2: Implementation";
let write_args = serde_json::json!({ "content": todo_content });
let write_call = create_tool_call("todo_write", write_args);
let write_result = agent.execute_tool(&write_call).await.unwrap();
assert!(write_result.contains("TODO list updated"), "Expected TODO write to succeed");
// In non-autonomous mode, final_output should succeed even with incomplete TODOs
let final_args = serde_json::json!({ "summary": "Partial completion is fine in interactive mode" });
let final_call = create_tool_call("final_output", final_args);
let final_result = agent.execute_tool(&final_call).await.unwrap();
// Verify that final_output succeeded (returns the summary, not a rejection)
assert!(
final_result.contains("Partial completion is fine in interactive mode"),
"Expected final_output to succeed in non-autonomous mode even with incomplete TODOs. Got: {}",
final_result
);
assert!(
!final_result.contains("incomplete TODO"),
"Expected NO rejection message in non-autonomous mode. Got: {}",
final_result
);
}
#[tokio::test]
#[serial]
async fn test_non_autonomous_final_output_allowed_with_mixed_todos() {
let temp_dir = TempDir::new().unwrap();
let mut agent = create_non_autonomous_agent(&temp_dir).await;
// Write a TODO list with mixed complete/incomplete items
let todo_content = "- [x] Phase 1: Setup\n- [ ] Phase 2: Implementation\n- [x] Phase 3: Testing";
let write_args = serde_json::json!({ "content": todo_content });
let write_call = create_tool_call("todo_write", write_args);
let _write_result = agent.execute_tool(&write_call).await.unwrap();
// In non-autonomous mode, final_output should succeed
let final_args = serde_json::json!({ "summary": "Interactive mode allows partial completion" });
let final_call = create_tool_call("final_output", final_args);
let final_result = agent.execute_tool(&final_call).await.unwrap();
// Verify success
assert!(
final_result.contains("Interactive mode allows partial completion"),
"Expected final_output to succeed in non-autonomous mode. Got: {}",
final_result
);
}

View File

@@ -42,7 +42,7 @@ fn test_session_continuation_creation() {
assert_eq!(continuation.session_id, "test_session_123"); assert_eq!(continuation.session_id, "test_session_123");
assert_eq!( assert_eq!(
continuation.final_output_summary, continuation.summary,
Some("Task completed successfully".to_string()) Some("Task completed successfully".to_string())
); );
assert_eq!(continuation.context_percentage, 45.0); assert_eq!(continuation.context_percentage, 45.0);
@@ -108,7 +108,7 @@ fn test_save_and_load_continuation() {
.expect("No continuation found"); .expect("No continuation found");
assert_eq!(loaded.session_id, original.session_id); assert_eq!(loaded.session_id, original.session_id);
assert_eq!(loaded.final_output_summary, original.final_output_summary); assert_eq!(loaded.summary, original.summary);
assert_eq!(loaded.session_log_path, original.session_log_path); assert_eq!(loaded.session_log_path, original.session_log_path);
assert!((loaded.context_percentage - original.context_percentage).abs() < 0.01); assert!((loaded.context_percentage - original.context_percentage).abs() < 0.01);
assert_eq!(loaded.todo_snapshot, original.todo_snapshot); assert_eq!(loaded.todo_snapshot, original.todo_snapshot);
@@ -358,7 +358,7 @@ fn test_continuation_serialization_format() {
assert_eq!(parsed["version"], "1.0"); assert_eq!(parsed["version"], "1.0");
assert_eq!(parsed["session_id"], "format_test"); assert_eq!(parsed["session_id"], "format_test");
assert_eq!(parsed["final_output_summary"], "Test summary"); assert_eq!(parsed["summary"], "Test summary");
assert_eq!(parsed["session_log_path"], "/path/to/session.json"); assert_eq!(parsed["session_log_path"], "/path/to/session.json");
assert!((parsed["context_percentage"].as_f64().unwrap() - 42.5).abs() < 0.01); assert!((parsed["context_percentage"].as_f64().unwrap() - 42.5).abs() < 0.01);
assert_eq!(parsed["todo_snapshot"], "- [x] Done\n- [ ] Todo"); assert_eq!(parsed["todo_snapshot"], "- [x] Done\n- [ ] Todo");
@@ -410,7 +410,7 @@ fn test_multiple_saves_update_symlink() {
.expect("No continuation"); .expect("No continuation");
assert_eq!(loaded.session_id, "second_session"); assert_eq!(loaded.session_id, "second_session");
assert_eq!( assert_eq!(
loaded.final_output_summary, loaded.summary,
Some("Second summary".to_string()) Some("Second summary".to_string())
); );

View File

@@ -81,9 +81,6 @@ impl UiWriter for MockUiWriter {
.push(format!("CHOICE: {} Options: {:?}", message, options)); .push(format!("CHOICE: {} Options: {:?}", message, options));
self.choice_responses.lock().unwrap().pop().unwrap_or(0) self.choice_responses.lock().unwrap().pop().unwrap_or(0)
} }
fn print_final_output(&self, summary: &str) {
self.output.lock().unwrap().push(format!("FINAL: {}", summary));
}
} }
#[tokio::test] #[tokio::test]

View File

@@ -304,9 +304,6 @@ impl g3_core::ui_writer::UiWriter for PlannerUiWriter {
0 // Default to first option 0 // Default to first option
} }
fn print_final_output(&self, summary: &str) {
println!("\n📝 Final Output:\n{}", summary);
}
} }
/// Call LLM to refine requirements using a full Agent with tool execution /// Call LLM to refine requirements using a full Agent with tool execution