From 928f2bfa9d1870c3b463528609e9e4d41e86e72f Mon Sep 17 00:00:00 2001 From: Jochen Date: Tue, 2 Dec 2025 21:23:50 +1100 Subject: [PATCH 1/2] actually record coach feedback and use it --- crates/g3-cli/src/lib.rs | 22 ++++++++++- crates/g3-core/src/lib.rs | 77 ++++++++++++++++++++++----------------- 2 files changed, 63 insertions(+), 36 deletions(-) diff --git a/crates/g3-cli/src/lib.rs b/crates/g3-cli/src/lib.rs index 3dd6248..18d6f76 100644 --- a/crates/g3-cli/src/lib.rs +++ b/crates/g3-cli/src/lib.rs @@ -163,15 +163,33 @@ fn extract_coach_feedback_from_logs( if let Some(context_window) = log_json.get("context_window") { if let Some(conversation_history) = context_window.get("conversation_history") { if let Some(messages) = conversation_history.as_array() { - // Simply get the last message content - this is the coach's final feedback + // Get the last message which contains the final_output tool result + // The last message is a USER message with format "Tool result: {summary}" if let Some(last_message) = messages.last() { if let Some(content) = last_message.get("content") { if let Some(content_str) = content.as_str() { + // Strip the "Tool result: " prefix if present + let feedback = if content_str.starts_with("Tool result: ") { + content_str.strip_prefix("Tool result: ") + .unwrap_or(content_str) + .to_string() + } else { + content_str.to_string() + }; + + // Log the extraction for debugging + output.print(&format!( + "Coach feedback extracted: {} characters (from {} total)", + feedback.len(), + content_str.len() + )); + output.print(&format!("Coach feedback:\n{}", feedback)); + output.print(&format!( "✅ Extracted coach feedback from session: {}", session_id )); - return Ok(content_str.to_string()); + return Ok(feedback); } } } diff --git a/crates/g3-core/src/lib.rs b/crates/g3-core/src/lib.rs index 6da2b44..91fd74a 100644 --- a/crates/g3-core/src/lib.rs +++ b/crates/g3-core/src/lib.rs @@ -3604,40 +3604,6 @@ impl Agent { } } - // Check if this was a final_output tool call - if tool_call.tool == "final_output" { - // The summary was already displayed via print_final_output - // Don't add it to full_response to avoid duplicate printing - // full_response is intentionally left empty/unchanged - self.ui_writer.println(""); - let _ttft = - first_token_time.unwrap_or_else(|| stream_start.elapsed()); - - // Add timing if needed - let final_response = if show_timing { - format!( - "🕝 {} | 💭 {}", - Self::format_duration(stream_start.elapsed()), - Self::format_duration(_ttft) - ) - } else { - // Return empty string since content was already displayed - String::new() - }; - - return Ok(TaskResult::new( - final_response, - self.context_window.clone(), - )); - } - - // Closure marker with timing - if tool_call.tool != "final_output" { - self.ui_writer - .print_tool_timing(&Self::format_duration(exec_duration)); - self.ui_writer.print_agent_prompt(); - } - // Add the tool call and result to the context window using RAW unfiltered content // This ensures the log file contains the true raw content including JSON tool calls let tool_message = if !raw_content_for_log.trim().is_empty() { @@ -3701,6 +3667,43 @@ impl Agent { self.context_window.add_message(tool_message); self.context_window.add_message(result_message); + // Check if this was a final_output tool call + if tool_call.tool == "final_output" { + // Save context window BEFORE returning so the session log includes final_output + self.save_context_window("completed"); + + // The summary was already displayed via print_final_output + // Don't add it to full_response to avoid duplicate printing + // full_response is intentionally left empty/unchanged + self.ui_writer.println(""); + let _ttft = + first_token_time.unwrap_or_else(|| stream_start.elapsed()); + + // Add timing if needed + let final_response = if show_timing { + format!( + "🕝 {} | 💭 {}", + Self::format_duration(stream_start.elapsed()), + Self::format_duration(_ttft) + ) + } else { + // Return empty string since content was already displayed + String::new() + }; + + return Ok(TaskResult::new( + final_response, + self.context_window.clone(), + )); + } + + // Closure marker with timing + if tool_call.tool != "final_output" { + self.ui_writer + .print_tool_timing(&Self::format_duration(exec_duration)); + self.ui_writer.print_agent_prompt(); + } + // Update the request with the new context for next iteration request.messages = self.context_window.conversation_history.clone(); @@ -3922,6 +3925,9 @@ impl Agent { full_response = String::new(); self.ui_writer.println(""); + + // Save context window BEFORE returning + self.save_context_window("completed"); let _ttft = first_token_time.unwrap_or_else(|| stream_start.elapsed()); @@ -4060,6 +4066,9 @@ impl Agent { } } + // Save context window BEFORE returning + self.save_context_window("completed"); + // Add timing if needed let final_response = if show_timing { format!( From 0327a6dfdf102f142c33b79f5ab76d54794c30d7 Mon Sep 17 00:00:00 2001 From: Jochen Date: Tue, 2 Dec 2025 22:00:58 +1100 Subject: [PATCH 2/2] make sure coach feedback is extracted. --- Cargo.lock | 1 + crates/g3-cli/Cargo.toml | 3 + crates/g3-cli/src/lib.rs | 89 +++-- .../tests/coach_feedback_extraction_test.rs | 336 ++++++++++++++++++ 4 files changed, 401 insertions(+), 28 deletions(-) create mode 100644 crates/g3-cli/tests/coach_feedback_extraction_test.rs diff --git a/Cargo.lock b/Cargo.lock index 383c54a..55ba454 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1377,6 +1377,7 @@ dependencies = [ "serde", "serde_json", "sha2", + "tempfile", "termimad", "tokio", "tokio-util", diff --git a/crates/g3-cli/Cargo.toml b/crates/g3-cli/Cargo.toml index 2c31d0e..80e8ed3 100644 --- a/crates/g3-cli/Cargo.toml +++ b/crates/g3-cli/Cargo.toml @@ -27,3 +27,6 @@ chrono = { version = "0.4", features = ["serde"] } crossterm = "0.29.0" ratatui = "0.29" termimad = "0.34.0" + +[dev-dependencies] +tempfile = "3.8" diff --git a/crates/g3-cli/src/lib.rs b/crates/g3-cli/src/lib.rs index 18d6f76..b63412f 100644 --- a/crates/g3-cli/src/lib.rs +++ b/crates/g3-cli/src/lib.rs @@ -163,33 +163,66 @@ fn extract_coach_feedback_from_logs( if let Some(context_window) = log_json.get("context_window") { if let Some(conversation_history) = context_window.get("conversation_history") { if let Some(messages) = conversation_history.as_array() { - // Get the last message which contains the final_output tool result - // The last message is a USER message with format "Tool result: {summary}" - if let Some(last_message) = messages.last() { - if let Some(content) = last_message.get("content") { - if let Some(content_str) = content.as_str() { - // Strip the "Tool result: " prefix if present - let feedback = if content_str.starts_with("Tool result: ") { - content_str.strip_prefix("Tool result: ") - .unwrap_or(content_str) - .to_string() - } else { - content_str.to_string() - }; - - // Log the extraction for debugging - output.print(&format!( - "Coach feedback extracted: {} characters (from {} total)", - feedback.len(), - content_str.len() - )); - output.print(&format!("Coach feedback:\n{}", feedback)); - - output.print(&format!( - "✅ Extracted coach feedback from session: {}", - session_id - )); - return Ok(feedback); + // Go backwards through the conversation to find the last tool result + // that corresponds to a final_output tool call + for i in (0..messages.len()).rev() { + let msg = &messages[i]; + + // Check if this is a User message with "Tool result:" + if let Some(role) = msg.get("role") { + if let Some(role_str) = role.as_str() { + if role_str == "User" || role_str == "user" { + if let Some(content) = msg.get("content") { + if let Some(content_str) = content.as_str() { + if content_str.starts_with("Tool result:") { + // Found a tool result, now check the preceding message + // to verify it was a final_output tool call + if i > 0 { + let prev_msg = &messages[i - 1]; + if let Some(prev_role) = prev_msg.get("role") { + if let Some(prev_role_str) = prev_role.as_str() { + if prev_role_str == "assistant" || prev_role_str == "Assistant" { + if let Some(prev_content) = prev_msg.get("content") { + if let Some(prev_content_str) = prev_content.as_str() { + // Check if the previous assistant message contains a final_output tool call + if prev_content_str.contains("\"tool\": \"final_output\"") { + // This is a final_output tool result + let feedback = if content_str.starts_with("Tool result: ") { + content_str.strip_prefix("Tool result: ") + .unwrap_or(content_str) + .to_string() + } else { + content_str.to_string() + }; + + output.print(&format!( + "Coach feedback extracted: {} characters (from {} total)", + feedback.len(), + content_str.len() + )); + output.print(&format!("Coach feedback:\n{}", feedback)); + + output.print(&format!( + "✅ Extracted coach feedback from session: {} (verified final_output tool)", + session_id + )); + return Ok(feedback); + } else { + output.print(&format!( + "⚠️ Skipping tool result at index {} - not a final_output tool call", + i + )); + } + } + } + } + } + } + } + } + } + } + } } } } @@ -205,7 +238,7 @@ fn extract_coach_feedback_from_logs( "CRITICAL: Could not extract coach feedback from session: {}\n\ Log file path: {:?}\n\ Log file exists: {}\n\ - This indicates the coach did not call any tool or the log is corrupted.\n\ + This indicates the coach did not call final_output tool or the log is corrupted.\n\ Coach result response length: {} chars", session_id, log_file_path, diff --git a/crates/g3-cli/tests/coach_feedback_extraction_test.rs b/crates/g3-cli/tests/coach_feedback_extraction_test.rs new file mode 100644 index 0000000..eb51d2f --- /dev/null +++ b/crates/g3-cli/tests/coach_feedback_extraction_test.rs @@ -0,0 +1,336 @@ +use serde_json::json; +use std::fs; +use tempfile::TempDir; + +#[test] +fn test_extract_coach_feedback_with_timing_message() { + // Create a temporary directory for logs + let temp_dir = TempDir::new().unwrap(); + let logs_dir = temp_dir.path().join("logs"); + fs::create_dir(&logs_dir).unwrap(); + + // Create a mock session log with the problematic conversation history + // where timing message appears after the tool result + let session_id = "test_session_123"; + let log_file_path = logs_dir.join(format!("g3_session_{}.json", session_id)); + + let log_content = json!({ + "session_id": session_id, + "context_window": { + "conversation_history": [ + { + "role": "assistant", + "content": "{\"tool\": \"final_output\", \"args\": {\"summary\":\"IMPLEMENTATION_APPROVED\"}}" + }, + { + "role": "user", + "content": "Tool result: IMPLEMENTATION_APPROVED" + }, + { + "role": "assistant", + "content": "🕝 27.7s | 💭 7.5s" + } + ] + } + }); + + fs::write(&log_file_path, serde_json::to_string_pretty(&log_content).unwrap()).unwrap(); + + // Now test the extraction logic + let log_content_str = fs::read_to_string(&log_file_path).unwrap(); + let log_json: serde_json::Value = serde_json::from_str(&log_content_str).unwrap(); + + if let Some(context_window) = log_json.get("context_window") { + if let Some(conversation_history) = context_window.get("conversation_history") { + if let Some(messages) = conversation_history.as_array() { + // This is the key logic we're testing - find the last USER message with "Tool result:" + let last_tool_result = messages.iter().rev().find(|msg| { + if let Some(role) = msg.get("role") { + if let Some(role_str) = role.as_str() { + if role_str == "User" || role_str == "user" { + if let Some(content) = msg.get("content") { + if let Some(content_str) = content.as_str() { + return content_str.starts_with("Tool result:"); + } + } + } + } + } + false + }); + + // Verify we found the correct message + assert!(last_tool_result.is_some(), "Should find the tool result message"); + + if let Some(last_message) = last_tool_result { + if let Some(content) = last_message.get("content") { + if let Some(content_str) = content.as_str() { + let feedback = if content_str.starts_with("Tool result: ") { + content_str.strip_prefix("Tool result: ").unwrap_or(content_str) + } else { + content_str + }; + + // Verify we extracted the correct feedback + assert_eq!(feedback, "IMPLEMENTATION_APPROVED", "Should extract the actual feedback, not timing"); + + // Verify the feedback is NOT the timing message + assert!(!feedback.contains("🕝"), "Feedback should not be the timing message"); + + println!("✅ Successfully extracted coach feedback: {}", feedback); + return; + } + } + } + } + } + } + + panic!("Failed to extract coach feedback"); +} + +#[test] +fn test_extract_only_final_output_tool_results() { + // Test that we only extract tool results from final_output, not from other tools + let temp_dir = TempDir::new().unwrap(); + let logs_dir = temp_dir.path().join("logs"); + fs::create_dir(&logs_dir).unwrap(); + + let session_id = "test_session_final_output_only"; + let log_file_path = logs_dir.join(format!("g3_session_{}.json", session_id)); + + let log_content = json!({ + "session_id": session_id, + "context_window": { + "conversation_history": [ + { + "role": "assistant", + "content": "{\"tool\": \"shell\", \"args\": {\"command\":\"ls\"}}" + }, + { + "role": "user", + "content": "Tool result: file1.txt\nfile2.txt" + }, + { + "role": "assistant", + "content": "{\"tool\": \"read_file\", \"args\": {\"file_path\":\"test.txt\"}}" + }, + { + "role": "user", + "content": "Tool result: This is test content" + }, + { + "role": "assistant", + "content": "{\"tool\": \"final_output\", \"args\": {\"summary\":\"APPROVED_RESULT\"}}" + }, + { + "role": "user", + "content": "Tool result: APPROVED_RESULT" + }, + { + "role": "assistant", + "content": "🕝 20.5s | 💭 5.2s" + } + ] + } + }); + + fs::write(&log_file_path, serde_json::to_string_pretty(&log_content).unwrap()).unwrap(); + + // Test the new extraction logic that verifies the tool is final_output + let log_content_str = fs::read_to_string(&log_file_path).unwrap(); + let log_json: serde_json::Value = serde_json::from_str(&log_content_str).unwrap(); + + if let Some(context_window) = log_json.get("context_window") { + if let Some(conversation_history) = context_window.get("conversation_history") { + if let Some(messages) = conversation_history.as_array() { + // Go backwards through messages to find final_output tool result + for i in (0..messages.len()).rev() { + let msg = &messages[i]; + + if let Some(role) = msg.get("role") { + if let Some(role_str) = role.as_str() { + if role_str == "User" || role_str == "user" { + if let Some(content) = msg.get("content") { + if let Some(content_str) = content.as_str() { + if content_str.starts_with("Tool result:") { + // Check if preceding message was final_output + if i > 0 { + let prev_msg = &messages[i - 1]; + if let Some(prev_content) = prev_msg.get("content") { + if let Some(prev_content_str) = prev_content.as_str() { + if prev_content_str.contains("\"tool\": \"final_output\"") { + let feedback = content_str.strip_prefix("Tool result: ").unwrap_or(content_str); + assert_eq!(feedback, "APPROVED_RESULT", "Should extract only final_output result"); + println!("✅ Correctly extracted only final_output tool result: {}", feedback); + return; + } + } + } + } + } + } + } + } + } + } + } + } + } + } + + panic!("Failed to extract final_output tool result"); +} + +#[test] +fn test_extract_coach_feedback_without_timing_message() { + // Create a temporary directory for logs + let temp_dir = TempDir::new().unwrap(); + let logs_dir = temp_dir.path().join("logs"); + fs::create_dir(&logs_dir).unwrap(); + + // Test the case where there's no timing message (backward compatibility) + let session_id = "test_session_456"; + let log_file_path = logs_dir.join(format!("g3_session_{}.json", session_id)); + + let log_content = json!({ + "session_id": session_id, + "context_window": { + "conversation_history": [ + { + "role": "assistant", + "content": "{\"tool\": \"final_output\", \"args\": {\"summary\":\"TEST_FEEDBACK\"}}" + }, + { + "role": "user", + "content": "Tool result: TEST_FEEDBACK" + } + ] + } + }); + + fs::write(&log_file_path, serde_json::to_string_pretty(&log_content).unwrap()).unwrap(); + + // Test extraction + let log_content_str = fs::read_to_string(&log_file_path).unwrap(); + let log_json: serde_json::Value = serde_json::from_str(&log_content_str).unwrap(); + + if let Some(context_window) = log_json.get("context_window") { + if let Some(conversation_history) = context_window.get("conversation_history") { + if let Some(messages) = conversation_history.as_array() { + let last_tool_result = messages.iter().rev().find(|msg| { + if let Some(role) = msg.get("role") { + if let Some(role_str) = role.as_str() { + if role_str == "User" || role_str == "user" { + if let Some(content) = msg.get("content") { + if let Some(content_str) = content.as_str() { + return content_str.starts_with("Tool result:"); + } + } + } + } + } + false + }); + + assert!(last_tool_result.is_some()); + + if let Some(last_message) = last_tool_result { + if let Some(content) = last_message.get("content") { + if let Some(content_str) = content.as_str() { + let feedback = content_str.strip_prefix("Tool result: ").unwrap_or(content_str); + assert_eq!(feedback, "TEST_FEEDBACK"); + println!("✅ Successfully extracted coach feedback without timing: {}", feedback); + return; + } + } + } + } + } + } + + panic!("Failed to extract coach feedback"); +} + +#[test] +fn test_extract_coach_feedback_with_multiple_tool_results() { + // Test that we get the LAST tool result when there are multiple + let temp_dir = TempDir::new().unwrap(); + let logs_dir = temp_dir.path().join("logs"); + fs::create_dir(&logs_dir).unwrap(); + + let session_id = "test_session_789"; + let log_file_path = logs_dir.join(format!("g3_session_{}.json", session_id)); + + let log_content = json!({ + "session_id": session_id, + "context_window": { + "conversation_history": [ + { + "role": "assistant", + "content": "{\"tool\": \"shell\", \"args\": {\"command\":\"ls\"}}" + }, + { + "role": "user", + "content": "Tool result: file1.txt\nfile2.txt" + }, + { + "role": "assistant", + "content": "{\"tool\": \"final_output\", \"args\": {\"summary\":\"FINAL_RESULT\"}}" + }, + { + "role": "user", + "content": "Tool result: FINAL_RESULT" + }, + { + "role": "assistant", + "content": "🕝 15.2s | 💭 3.1s" + } + ] + } + }); + + fs::write(&log_file_path, serde_json::to_string_pretty(&log_content).unwrap()).unwrap(); + + // Test extraction + let log_content_str = fs::read_to_string(&log_file_path).unwrap(); + let log_json: serde_json::Value = serde_json::from_str(&log_content_str).unwrap(); + + if let Some(context_window) = log_json.get("context_window") { + if let Some(conversation_history) = context_window.get("conversation_history") { + if let Some(messages) = conversation_history.as_array() { + let last_tool_result = messages.iter().rev().find(|msg| { + if let Some(role) = msg.get("role") { + if let Some(role_str) = role.as_str() { + if role_str == "User" || role_str == "user" { + if let Some(content) = msg.get("content") { + if let Some(content_str) = content.as_str() { + return content_str.starts_with("Tool result:"); + } + } + } + } + } + false + }); + + assert!(last_tool_result.is_some()); + + if let Some(last_message) = last_tool_result { + if let Some(content) = last_message.get("content") { + if let Some(content_str) = content.as_str() { + let feedback = content_str.strip_prefix("Tool result: ").unwrap_or(content_str); + // Should get the LAST tool result (final_output), not the first one (shell) + assert_eq!(feedback, "FINAL_RESULT", "Should extract the last tool result"); + assert!(!feedback.contains("file1.txt"), "Should not extract earlier tool results"); + println!("✅ Successfully extracted last tool result: {}", feedback); + return; + } + } + } + } + } + } + + panic!("Failed to extract coach feedback"); +}