Compare commits

..

2 Commits

Author SHA1 Message Date
Jochen
0327a6dfdf make sure coach feedback is extracted. 2025-12-02 22:00:58 +11:00
Jochen
928f2bfa9d actually record coach feedback and use it 2025-12-02 21:23:50 +11:00
5 changed files with 444 additions and 44 deletions

1
Cargo.lock generated
View File

@@ -1377,6 +1377,7 @@ dependencies = [
"serde", "serde",
"serde_json", "serde_json",
"sha2", "sha2",
"tempfile",
"termimad", "termimad",
"tokio", "tokio",
"tokio-util", "tokio-util",

View File

@@ -27,3 +27,6 @@ chrono = { version = "0.4", features = ["serde"] }
crossterm = "0.29.0" crossterm = "0.29.0"
ratatui = "0.29" ratatui = "0.29"
termimad = "0.34.0" termimad = "0.34.0"
[dev-dependencies]
tempfile = "3.8"

View File

@@ -163,15 +163,66 @@ fn extract_coach_feedback_from_logs(
if let Some(context_window) = log_json.get("context_window") { if let Some(context_window) = log_json.get("context_window") {
if let Some(conversation_history) = context_window.get("conversation_history") { if let Some(conversation_history) = context_window.get("conversation_history") {
if let Some(messages) = conversation_history.as_array() { if let Some(messages) = conversation_history.as_array() {
// Simply get the last message content - this is the coach's final feedback // Go backwards through the conversation to find the last tool result
if let Some(last_message) = messages.last() { // that corresponds to a final_output tool call
if let Some(content) = last_message.get("content") { for i in (0..messages.len()).rev() {
if let Some(content_str) = content.as_str() { let msg = &messages[i];
output.print(&format!(
"✅ Extracted coach feedback from session: {}", // Check if this is a User message with "Tool result:"
session_id if let Some(role) = msg.get("role") {
)); if let Some(role_str) = role.as_str() {
return Ok(content_str.to_string()); if role_str == "User" || role_str == "user" {
if let Some(content) = msg.get("content") {
if let Some(content_str) = content.as_str() {
if content_str.starts_with("Tool result:") {
// Found a tool result, now check the preceding message
// to verify it was a final_output tool call
if i > 0 {
let prev_msg = &messages[i - 1];
if let Some(prev_role) = prev_msg.get("role") {
if let Some(prev_role_str) = prev_role.as_str() {
if prev_role_str == "assistant" || prev_role_str == "Assistant" {
if let Some(prev_content) = prev_msg.get("content") {
if let Some(prev_content_str) = prev_content.as_str() {
// Check if the previous assistant message contains a final_output tool call
if prev_content_str.contains("\"tool\": \"final_output\"") {
// This is a final_output tool result
let feedback = if content_str.starts_with("Tool result: ") {
content_str.strip_prefix("Tool result: ")
.unwrap_or(content_str)
.to_string()
} else {
content_str.to_string()
};
output.print(&format!(
"Coach feedback extracted: {} characters (from {} total)",
feedback.len(),
content_str.len()
));
output.print(&format!("Coach feedback:\n{}", feedback));
output.print(&format!(
"✅ Extracted coach feedback from session: {} (verified final_output tool)",
session_id
));
return Ok(feedback);
} else {
output.print(&format!(
"⚠️ Skipping tool result at index {} - not a final_output tool call",
i
));
}
}
}
}
}
}
}
}
}
}
}
} }
} }
} }
@@ -187,7 +238,7 @@ fn extract_coach_feedback_from_logs(
"CRITICAL: Could not extract coach feedback from session: {}\n\ "CRITICAL: Could not extract coach feedback from session: {}\n\
Log file path: {:?}\n\ Log file path: {:?}\n\
Log file exists: {}\n\ Log file exists: {}\n\
This indicates the coach did not call any tool or the log is corrupted.\n\ This indicates the coach did not call final_output tool or the log is corrupted.\n\
Coach result response length: {} chars", Coach result response length: {} chars",
session_id, session_id,
log_file_path, log_file_path,

View File

@@ -0,0 +1,336 @@
use serde_json::json;
use std::fs;
use tempfile::TempDir;
#[test]
fn test_extract_coach_feedback_with_timing_message() {
// Create a temporary directory for logs
let temp_dir = TempDir::new().unwrap();
let logs_dir = temp_dir.path().join("logs");
fs::create_dir(&logs_dir).unwrap();
// Create a mock session log with the problematic conversation history
// where timing message appears after the tool result
let session_id = "test_session_123";
let log_file_path = logs_dir.join(format!("g3_session_{}.json", session_id));
let log_content = json!({
"session_id": session_id,
"context_window": {
"conversation_history": [
{
"role": "assistant",
"content": "{\"tool\": \"final_output\", \"args\": {\"summary\":\"IMPLEMENTATION_APPROVED\"}}"
},
{
"role": "user",
"content": "Tool result: IMPLEMENTATION_APPROVED"
},
{
"role": "assistant",
"content": "🕝 27.7s | 💭 7.5s"
}
]
}
});
fs::write(&log_file_path, serde_json::to_string_pretty(&log_content).unwrap()).unwrap();
// Now test the extraction logic
let log_content_str = fs::read_to_string(&log_file_path).unwrap();
let log_json: serde_json::Value = serde_json::from_str(&log_content_str).unwrap();
if let Some(context_window) = log_json.get("context_window") {
if let Some(conversation_history) = context_window.get("conversation_history") {
if let Some(messages) = conversation_history.as_array() {
// This is the key logic we're testing - find the last USER message with "Tool result:"
let last_tool_result = messages.iter().rev().find(|msg| {
if let Some(role) = msg.get("role") {
if let Some(role_str) = role.as_str() {
if role_str == "User" || role_str == "user" {
if let Some(content) = msg.get("content") {
if let Some(content_str) = content.as_str() {
return content_str.starts_with("Tool result:");
}
}
}
}
}
false
});
// Verify we found the correct message
assert!(last_tool_result.is_some(), "Should find the tool result message");
if let Some(last_message) = last_tool_result {
if let Some(content) = last_message.get("content") {
if let Some(content_str) = content.as_str() {
let feedback = if content_str.starts_with("Tool result: ") {
content_str.strip_prefix("Tool result: ").unwrap_or(content_str)
} else {
content_str
};
// Verify we extracted the correct feedback
assert_eq!(feedback, "IMPLEMENTATION_APPROVED", "Should extract the actual feedback, not timing");
// Verify the feedback is NOT the timing message
assert!(!feedback.contains("🕝"), "Feedback should not be the timing message");
println!("✅ Successfully extracted coach feedback: {}", feedback);
return;
}
}
}
}
}
}
panic!("Failed to extract coach feedback");
}
#[test]
fn test_extract_only_final_output_tool_results() {
// Test that we only extract tool results from final_output, not from other tools
let temp_dir = TempDir::new().unwrap();
let logs_dir = temp_dir.path().join("logs");
fs::create_dir(&logs_dir).unwrap();
let session_id = "test_session_final_output_only";
let log_file_path = logs_dir.join(format!("g3_session_{}.json", session_id));
let log_content = json!({
"session_id": session_id,
"context_window": {
"conversation_history": [
{
"role": "assistant",
"content": "{\"tool\": \"shell\", \"args\": {\"command\":\"ls\"}}"
},
{
"role": "user",
"content": "Tool result: file1.txt\nfile2.txt"
},
{
"role": "assistant",
"content": "{\"tool\": \"read_file\", \"args\": {\"file_path\":\"test.txt\"}}"
},
{
"role": "user",
"content": "Tool result: This is test content"
},
{
"role": "assistant",
"content": "{\"tool\": \"final_output\", \"args\": {\"summary\":\"APPROVED_RESULT\"}}"
},
{
"role": "user",
"content": "Tool result: APPROVED_RESULT"
},
{
"role": "assistant",
"content": "🕝 20.5s | 💭 5.2s"
}
]
}
});
fs::write(&log_file_path, serde_json::to_string_pretty(&log_content).unwrap()).unwrap();
// Test the new extraction logic that verifies the tool is final_output
let log_content_str = fs::read_to_string(&log_file_path).unwrap();
let log_json: serde_json::Value = serde_json::from_str(&log_content_str).unwrap();
if let Some(context_window) = log_json.get("context_window") {
if let Some(conversation_history) = context_window.get("conversation_history") {
if let Some(messages) = conversation_history.as_array() {
// Go backwards through messages to find final_output tool result
for i in (0..messages.len()).rev() {
let msg = &messages[i];
if let Some(role) = msg.get("role") {
if let Some(role_str) = role.as_str() {
if role_str == "User" || role_str == "user" {
if let Some(content) = msg.get("content") {
if let Some(content_str) = content.as_str() {
if content_str.starts_with("Tool result:") {
// Check if preceding message was final_output
if i > 0 {
let prev_msg = &messages[i - 1];
if let Some(prev_content) = prev_msg.get("content") {
if let Some(prev_content_str) = prev_content.as_str() {
if prev_content_str.contains("\"tool\": \"final_output\"") {
let feedback = content_str.strip_prefix("Tool result: ").unwrap_or(content_str);
assert_eq!(feedback, "APPROVED_RESULT", "Should extract only final_output result");
println!("✅ Correctly extracted only final_output tool result: {}", feedback);
return;
}
}
}
}
}
}
}
}
}
}
}
}
}
}
panic!("Failed to extract final_output tool result");
}
#[test]
fn test_extract_coach_feedback_without_timing_message() {
// Create a temporary directory for logs
let temp_dir = TempDir::new().unwrap();
let logs_dir = temp_dir.path().join("logs");
fs::create_dir(&logs_dir).unwrap();
// Test the case where there's no timing message (backward compatibility)
let session_id = "test_session_456";
let log_file_path = logs_dir.join(format!("g3_session_{}.json", session_id));
let log_content = json!({
"session_id": session_id,
"context_window": {
"conversation_history": [
{
"role": "assistant",
"content": "{\"tool\": \"final_output\", \"args\": {\"summary\":\"TEST_FEEDBACK\"}}"
},
{
"role": "user",
"content": "Tool result: TEST_FEEDBACK"
}
]
}
});
fs::write(&log_file_path, serde_json::to_string_pretty(&log_content).unwrap()).unwrap();
// Test extraction
let log_content_str = fs::read_to_string(&log_file_path).unwrap();
let log_json: serde_json::Value = serde_json::from_str(&log_content_str).unwrap();
if let Some(context_window) = log_json.get("context_window") {
if let Some(conversation_history) = context_window.get("conversation_history") {
if let Some(messages) = conversation_history.as_array() {
let last_tool_result = messages.iter().rev().find(|msg| {
if let Some(role) = msg.get("role") {
if let Some(role_str) = role.as_str() {
if role_str == "User" || role_str == "user" {
if let Some(content) = msg.get("content") {
if let Some(content_str) = content.as_str() {
return content_str.starts_with("Tool result:");
}
}
}
}
}
false
});
assert!(last_tool_result.is_some());
if let Some(last_message) = last_tool_result {
if let Some(content) = last_message.get("content") {
if let Some(content_str) = content.as_str() {
let feedback = content_str.strip_prefix("Tool result: ").unwrap_or(content_str);
assert_eq!(feedback, "TEST_FEEDBACK");
println!("✅ Successfully extracted coach feedback without timing: {}", feedback);
return;
}
}
}
}
}
}
panic!("Failed to extract coach feedback");
}
#[test]
fn test_extract_coach_feedback_with_multiple_tool_results() {
// Test that we get the LAST tool result when there are multiple
let temp_dir = TempDir::new().unwrap();
let logs_dir = temp_dir.path().join("logs");
fs::create_dir(&logs_dir).unwrap();
let session_id = "test_session_789";
let log_file_path = logs_dir.join(format!("g3_session_{}.json", session_id));
let log_content = json!({
"session_id": session_id,
"context_window": {
"conversation_history": [
{
"role": "assistant",
"content": "{\"tool\": \"shell\", \"args\": {\"command\":\"ls\"}}"
},
{
"role": "user",
"content": "Tool result: file1.txt\nfile2.txt"
},
{
"role": "assistant",
"content": "{\"tool\": \"final_output\", \"args\": {\"summary\":\"FINAL_RESULT\"}}"
},
{
"role": "user",
"content": "Tool result: FINAL_RESULT"
},
{
"role": "assistant",
"content": "🕝 15.2s | 💭 3.1s"
}
]
}
});
fs::write(&log_file_path, serde_json::to_string_pretty(&log_content).unwrap()).unwrap();
// Test extraction
let log_content_str = fs::read_to_string(&log_file_path).unwrap();
let log_json: serde_json::Value = serde_json::from_str(&log_content_str).unwrap();
if let Some(context_window) = log_json.get("context_window") {
if let Some(conversation_history) = context_window.get("conversation_history") {
if let Some(messages) = conversation_history.as_array() {
let last_tool_result = messages.iter().rev().find(|msg| {
if let Some(role) = msg.get("role") {
if let Some(role_str) = role.as_str() {
if role_str == "User" || role_str == "user" {
if let Some(content) = msg.get("content") {
if let Some(content_str) = content.as_str() {
return content_str.starts_with("Tool result:");
}
}
}
}
}
false
});
assert!(last_tool_result.is_some());
if let Some(last_message) = last_tool_result {
if let Some(content) = last_message.get("content") {
if let Some(content_str) = content.as_str() {
let feedback = content_str.strip_prefix("Tool result: ").unwrap_or(content_str);
// Should get the LAST tool result (final_output), not the first one (shell)
assert_eq!(feedback, "FINAL_RESULT", "Should extract the last tool result");
assert!(!feedback.contains("file1.txt"), "Should not extract earlier tool results");
println!("✅ Successfully extracted last tool result: {}", feedback);
return;
}
}
}
}
}
}
panic!("Failed to extract coach feedback");
}

View File

@@ -3604,40 +3604,6 @@ impl<W: UiWriter> Agent<W> {
} }
} }
// Check if this was a final_output tool call
if tool_call.tool == "final_output" {
// The summary was already displayed via print_final_output
// Don't add it to full_response to avoid duplicate printing
// full_response is intentionally left empty/unchanged
self.ui_writer.println("");
let _ttft =
first_token_time.unwrap_or_else(|| stream_start.elapsed());
// Add timing if needed
let final_response = if show_timing {
format!(
"🕝 {} | 💭 {}",
Self::format_duration(stream_start.elapsed()),
Self::format_duration(_ttft)
)
} else {
// Return empty string since content was already displayed
String::new()
};
return Ok(TaskResult::new(
final_response,
self.context_window.clone(),
));
}
// Closure marker with timing
if tool_call.tool != "final_output" {
self.ui_writer
.print_tool_timing(&Self::format_duration(exec_duration));
self.ui_writer.print_agent_prompt();
}
// Add the tool call and result to the context window using RAW unfiltered content // Add the tool call and result to the context window using RAW unfiltered content
// This ensures the log file contains the true raw content including JSON tool calls // This ensures the log file contains the true raw content including JSON tool calls
let tool_message = if !raw_content_for_log.trim().is_empty() { let tool_message = if !raw_content_for_log.trim().is_empty() {
@@ -3701,6 +3667,43 @@ impl<W: UiWriter> Agent<W> {
self.context_window.add_message(tool_message); self.context_window.add_message(tool_message);
self.context_window.add_message(result_message); self.context_window.add_message(result_message);
// Check if this was a final_output tool call
if tool_call.tool == "final_output" {
// Save context window BEFORE returning so the session log includes final_output
self.save_context_window("completed");
// The summary was already displayed via print_final_output
// Don't add it to full_response to avoid duplicate printing
// full_response is intentionally left empty/unchanged
self.ui_writer.println("");
let _ttft =
first_token_time.unwrap_or_else(|| stream_start.elapsed());
// Add timing if needed
let final_response = if show_timing {
format!(
"🕝 {} | 💭 {}",
Self::format_duration(stream_start.elapsed()),
Self::format_duration(_ttft)
)
} else {
// Return empty string since content was already displayed
String::new()
};
return Ok(TaskResult::new(
final_response,
self.context_window.clone(),
));
}
// Closure marker with timing
if tool_call.tool != "final_output" {
self.ui_writer
.print_tool_timing(&Self::format_duration(exec_duration));
self.ui_writer.print_agent_prompt();
}
// Update the request with the new context for next iteration // Update the request with the new context for next iteration
request.messages = self.context_window.conversation_history.clone(); request.messages = self.context_window.conversation_history.clone();
@@ -3922,6 +3925,9 @@ impl<W: UiWriter> Agent<W> {
full_response = String::new(); full_response = String::new();
self.ui_writer.println(""); self.ui_writer.println("");
// Save context window BEFORE returning
self.save_context_window("completed");
let _ttft = let _ttft =
first_token_time.unwrap_or_else(|| stream_start.elapsed()); first_token_time.unwrap_or_else(|| stream_start.elapsed());
@@ -4060,6 +4066,9 @@ impl<W: UiWriter> Agent<W> {
} }
} }
// Save context window BEFORE returning
self.save_context_window("completed");
// Add timing if needed // Add timing if needed
let final_response = if show_timing { let final_response = if show_timing {
format!( format!(