coach rigor +++

This commit is contained in:
Dhanji Prasanna
2025-10-24 10:11:43 +11:00
parent 0be4829ca9
commit e1e732150a
5 changed files with 278 additions and 59 deletions

View File

@@ -243,6 +243,10 @@ pub struct Cli {
/// Enable macOS Accessibility API tools for native app automation /// Enable macOS Accessibility API tools for native app automation
#[arg(long)] #[arg(long)]
pub macax: bool, pub macax: bool,
/// Enable WebDriver browser automation tools
#[arg(long)]
pub webdriver: bool,
} }
pub async fn run() -> Result<()> { pub async fn run() -> Result<()> {
@@ -451,6 +455,11 @@ Output ONLY the markdown content, no explanations or meta-commentary."#,
} }
} }
// Apply webdriver flag override
if cli.webdriver {
config.webdriver.enabled = true;
}
// Validate provider if specified // Validate provider if specified
if let Some(ref provider) = cli.provider { if let Some(ref provider) = cli.provider {
let valid_providers = ["anthropic", "databricks", "embedded", "openai"]; let valid_providers = ["anthropic", "databricks", "embedded", "openai"];
@@ -1630,6 +1639,7 @@ Review the current state of the project and provide a concise critique focusing
2. Whether the project compiles successfully 2. Whether the project compiles successfully
3. What requirements are missing or incorrect 3. What requirements are missing or incorrect
4. Specific improvements needed to satisfy requirements 4. Specific improvements needed to satisfy requirements
5. Use UI tools such as webdriver to test functionality thoroughly
CRITICAL INSTRUCTIONS: CRITICAL INSTRUCTIONS:
1. You MUST use the final_output tool to provide your feedback 1. You MUST use the final_output tool to provide your feedback
@@ -1637,13 +1647,13 @@ CRITICAL INSTRUCTIONS:
3. Focus ONLY on what needs to be fixed or improved 3. Focus ONLY on what needs to be fixed or improved
4. Do NOT include your analysis process, file contents, or compilation output in the summary 4. Do NOT include your analysis process, file contents, or compilation output in the summary
If the implementation generally meets all requirements and compiles without errors: If the implementation thoroughly meets all requirements, compiles and is fully tested (especially UI flows) *WITHOUT* gaps or errors:
- Call final_output with summary: 'IMPLEMENTATION_APPROVED' - Call final_output with summary: 'IMPLEMENTATION_APPROVED'
If improvements are needed: If improvements are needed:
- Call final_output with a brief summary listing ONLY the specific issues to fix - Call final_output with a brief summary listing ONLY the specific issues to fix
Remember: Be clear in your review and concise in your feedback. APPROVE if the implementation works and generally fits the requirements. Don't be picky.", Remember: Be clear in your review and concise in your feedback. APPROVE if the implementation works and thoroughly fits the requirements (implementation > 95% complete). Be rigorous, especially by testing that all UI features work.",
requirements requirements
); );

View File

@@ -1,7 +1,7 @@
use core_graphics::window::{kCGWindowListOptionOnScreenOnly, kCGNullWindowID, CGWindowListCopyWindowInfo}; use core_graphics::window::{kCGWindowListOptionOnScreenOnly, kCGNullWindowID, CGWindowListCopyWindowInfo};
use core_foundation::dictionary::CFDictionary; use core_foundation::dictionary::CFDictionary;
use core_foundation::string::CFString; use core_foundation::string::CFString;
use core_foundation::base::TCFType; use core_foundation::base::{TCFType, ToVoid};
fn main() { fn main() {
println!("Listing all on-screen windows..."); println!("Listing all on-screen windows...");
@@ -22,7 +22,7 @@ fn main() {
// Get window ID // Get window ID
let window_id_key = CFString::from_static_string("kCGWindowNumber"); let window_id_key = CFString::from_static_string("kCGWindowNumber");
let window_id: i64 = if let Some(value) = dict.find(window_id_key.as_concrete_TypeRef()) { let window_id: i64 = if let Some(value) = dict.find(window_id_key.to_void()) {
let num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*value as *const _); let num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*value as *const _);
num.to_i64().unwrap_or(0) num.to_i64().unwrap_or(0)
} else { } else {
@@ -31,7 +31,7 @@ fn main() {
// Get owner name // Get owner name
let owner_key = CFString::from_static_string("kCGWindowOwnerName"); let owner_key = CFString::from_static_string("kCGWindowOwnerName");
let owner: String = if let Some(value) = dict.find(owner_key.as_concrete_TypeRef()) { let owner: String = if let Some(value) = dict.find(owner_key.to_void()) {
let s: CFString = TCFType::wrap_under_get_rule(*value as *const _); let s: CFString = TCFType::wrap_under_get_rule(*value as *const _);
s.to_string() s.to_string()
} else { } else {
@@ -40,7 +40,7 @@ fn main() {
// Get window name/title // Get window name/title
let name_key = CFString::from_static_string("kCGWindowName"); let name_key = CFString::from_static_string("kCGWindowName");
let title: String = if let Some(value) = dict.find(name_key.as_concrete_TypeRef()) { let title: String = if let Some(value) = dict.find(name_key.to_void()) {
let s: CFString = TCFType::wrap_under_get_rule(*value as *const _); let s: CFString = TCFType::wrap_under_get_rule(*value as *const _);
s.to_string() s.to_string()
} else { } else {

View File

@@ -1,23 +1,5 @@
use g3_computer_control::*; use g3_computer_control::*;
#[tokio::test]
async fn test_mouse_movement() {
let controller = create_controller().expect("Failed to create controller");
// Move mouse to center of screen (assuming 1920x1080)
let result = controller.move_mouse(960, 540).await;
assert!(result.is_ok(), "Failed to move mouse: {:?}", result.err());
}
#[tokio::test]
async fn test_typing() {
let controller = create_controller().expect("Failed to create controller");
// Type some text
let result = controller.type_text("Hello, World!").await;
assert!(result.is_ok(), "Failed to type text: {:?}", result.err());
}
#[tokio::test] #[tokio::test]
async fn test_screenshot() { async fn test_screenshot() {
let controller = create_controller().expect("Failed to create controller"); let controller = create_controller().expect("Failed to create controller");
@@ -33,30 +15,3 @@ async fn test_screenshot() {
// Clean up // Clean up
let _ = std::fs::remove_file(path); let _ = std::fs::remove_file(path);
} }
#[tokio::test]
async fn test_click() {
let controller = create_controller().expect("Failed to create controller");
// Click at a safe location
let result = controller.click(types::MouseButton::Left).await;
assert!(result.is_ok(), "Failed to click: {:?}", result.err());
}
#[tokio::test]
async fn test_double_click() {
let controller = create_controller().expect("Failed to create controller");
// Double click
let result = controller.double_click(types::MouseButton::Left).await;
assert!(result.is_ok(), "Failed to double click: {:?}", result.err());
}
#[tokio::test]
async fn test_press_key() {
let controller = create_controller().expect("Failed to create controller");
// Press escape key
let result = controller.press_key("escape").await;
assert!(result.is_ok(), "Failed to press key: {:?}", result.err());
}

View File

@@ -466,6 +466,7 @@ Format this as a detailed but concise summary that can be used to resume the con
let first_third_end = (total_messages / 3).max(1); let first_third_end = (total_messages / 3).max(1);
let mut leaned_count = 0; let mut leaned_count = 0;
let mut tool_call_leaned_count = 0;
let mut chars_saved = 0; let mut chars_saved = 0;
// Create ~/tmp directory if it doesn't exist // Create ~/tmp directory if it doesn't exist
@@ -478,7 +479,7 @@ Format this as a detailed but concise summary that can be used to resume the con
// Scan the first third of messages // Scan the first third of messages
for i in 0..first_third_end { for i in 0..first_third_end {
if let Some(message) = self.conversation_history.get_mut(i) { if let Some(message) = self.conversation_history.get_mut(i) {
// Only process User messages that look like tool results // Process User messages that look like tool results
if matches!(message.role, MessageRole::User) && message.content.starts_with("Tool result:") { if matches!(message.role, MessageRole::User) && message.content.starts_with("Tool result:") {
let content_len = message.content.len(); let content_len = message.content.len();
@@ -508,6 +509,109 @@ Format this as a detailed but concise summary that can be used to resume the con
debug!("Thinned tool result {} ({} chars) to {}", i, original_len, file_path); debug!("Thinned tool result {} ({} chars) to {}", i, original_len, file_path);
} }
} }
// Process Assistant messages that contain tool calls with large arguments
if matches!(message.role, MessageRole::Assistant) {
// Try to parse the message content as JSON to find tool calls
let content = &message.content;
// Look for JSON tool call patterns
if let Some(tool_call_start) = content.find(r#"{"tool":"#)
.or_else(|| content.find(r#"{ "tool":"#))
.or_else(|| content.find(r#"{"tool" :"#))
.or_else(|| content.find(r#"{ "tool" :"#))
{
// Try to extract and parse the JSON tool call
let json_portion = &content[tool_call_start..];
// Find the end of the JSON object
if let Some(json_end) = Self::find_json_end(json_portion) {
let json_str = &json_portion[..=json_end];
// Try to parse as ToolCall
if let Ok(mut tool_call) = serde_json::from_str::<ToolCall>(json_str) {
let mut modified = false;
// Handle write_file tool calls
if tool_call.tool == "write_file" {
if let Some(args_obj) = tool_call.args.as_object_mut() {
// Extract content to avoid borrow issues
let content_info = args_obj.get("content")
.and_then(|v| v.as_str())
.map(|s| (s.to_string(), s.len()));
if let Some((content_str, content_len)) = content_info {
// Only thin if content is greater than 1000 chars
if content_len > 1000 {
let timestamp = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap_or_default()
.as_secs();
let filename = format!("leaned_write_file_content_{}_{}.txt", timestamp, i);
let file_path = format!("{}/{}", tmp_dir, filename);
if std::fs::write(&file_path, &content_str).is_ok() {
args_obj.insert(
"content".to_string(),
serde_json::Value::String(format!("<content saved to {}>", file_path))
);
modified = true;
chars_saved += content_len;
tool_call_leaned_count += 1;
debug!("Thinned write_file content {} ({} chars) to {}", i, content_len, file_path);
}
}
}
}
}
// Handle str_replace tool calls
if tool_call.tool == "str_replace" {
if let Some(args_obj) = tool_call.args.as_object_mut() {
// Extract diff to avoid borrow issues
let diff_info = args_obj.get("diff")
.and_then(|v| v.as_str())
.map(|s| (s.to_string(), s.len()));
if let Some((diff_str, diff_len)) = diff_info {
// Only thin if diff is greater than 1000 chars
if diff_len > 1000 {
let timestamp = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap_or_default()
.as_secs();
let filename = format!("leaned_str_replace_diff_{}_{}.txt", timestamp, i);
let file_path = format!("{}/{}", tmp_dir, filename);
if std::fs::write(&file_path, &diff_str).is_ok() {
args_obj.insert(
"diff".to_string(),
serde_json::Value::String(format!("<diff saved to {}>", file_path))
);
modified = true;
chars_saved += diff_len;
tool_call_leaned_count += 1;
debug!("Thinned str_replace diff {} ({} chars) to {}", i, diff_len, file_path);
}
}
}
}
}
// If we modified the tool call, reconstruct the message
if modified {
let prefix = &content[..tool_call_start];
let suffix = &content[tool_call_start + json_str.len()..];
// Serialize the modified tool call
if let Ok(new_json) = serde_json::to_string(&tool_call) {
message.content = format!("{}{}{}", prefix, new_json, suffix);
}
}
}
}
}
}
} }
} }
@@ -515,10 +619,18 @@ Format this as a detailed but concise summary that can be used to resume the con
self.recalculate_tokens(); self.recalculate_tokens();
if leaned_count > 0 { if leaned_count > 0 {
if tool_call_leaned_count > 0 {
(format!("🥒 Context thinned at {}%: {} tool results + {} tool calls, ~{} chars saved",
current_threshold, leaned_count, tool_call_leaned_count, chars_saved), chars_saved)
} else {
(format!("🥒 Context thinned at {}%: {} tool results, ~{} chars saved", (format!("🥒 Context thinned at {}%: {} tool results, ~{} chars saved",
current_threshold, leaned_count, chars_saved), chars_saved) current_threshold, leaned_count, chars_saved), chars_saved)
}
} else if tool_call_leaned_count > 0 {
(format!("🥒 Context thinned at {}%: {} tool calls, ~{} chars saved",
current_threshold, tool_call_leaned_count, chars_saved), chars_saved)
} else { } else {
(format!(" Context thinning triggered at {}% but no large tool results found in first third", (format!(" Context thinning triggered at {}% but no large tool results or tool calls found in first third",
current_threshold), 0) current_threshold), 0)
} }
} }
@@ -533,6 +645,35 @@ Format this as a detailed but concise summary that can be used to resume the con
debug!("Recalculated tokens after thinning: {} tokens", total); debug!("Recalculated tokens after thinning: {} tokens", total);
} }
/// Helper function to find the end of a JSON object
fn find_json_end(json_str: &str) -> Option<usize> {
let mut brace_count = 0;
let mut in_string = false;
let mut escape_next = false;
for (i, ch) in json_str.char_indices() {
if escape_next {
escape_next = false;
continue;
}
match ch {
'\\' => escape_next = true,
'"' if !escape_next => in_string = !in_string,
'{' if !in_string => brace_count += 1,
'}' if !in_string => {
brace_count -= 1;
if brace_count == 0 {
return Some(i);
}
}
_ => {}
}
}
None
}
} }
pub struct Agent<W: UiWriter> { pub struct Agent<W: UiWriter> {

View File

@@ -72,7 +72,7 @@ fn test_thin_context_basic() {
// Trigger thinning at 50% // Trigger thinning at 50%
context.used_tokens = 5000; context.used_tokens = 5000;
let summary = context.thin_context(); let (summary, _chars_saved) = context.thin_context();
println!("Thinning summary: {}", summary); println!("Thinning summary: {}", summary);
@@ -93,6 +93,119 @@ fn test_thin_context_basic() {
} }
} }
#[test]
fn test_thin_write_file_tool_calls() {
let mut context = ContextWindow::new(10000);
// Add some messages including a write_file tool call with large content
context.add_message(Message {
role: MessageRole::User,
content: "Please create a large file".to_string(),
});
// Add an assistant message with a write_file tool call containing large content
let large_content = "x".repeat(1500);
let tool_call_json = format!(
r#"{{"tool": "write_file", "args": {{"file_path": "test.txt", "content": "{}"}}}}"#,
large_content
);
context.add_message(Message {
role: MessageRole::Assistant,
content: format!("I'll create that file.\n\n{}", tool_call_json),
});
context.add_message(Message {
role: MessageRole::User,
content: "Tool result: ✅ Successfully wrote 1500 lines".to_string(),
});
// Add more messages to ensure we have enough for "first third" logic
for i in 0..6 {
context.add_message(Message {
role: MessageRole::Assistant,
content: format!("Response {}", i),
});
}
// Trigger thinning at 50%
context.used_tokens = 5000;
let (summary, _chars_saved) = context.thin_context();
println!("Thinning summary: {}", summary);
// Should have thinned the write_file tool call
assert!(summary.contains("tool call") || summary.contains("chars saved"));
// Check that the large content was replaced with a file reference
let first_third_end = context.conversation_history.len() / 3;
for i in 0..first_third_end {
if let Some(msg) = context.conversation_history.get(i) {
if matches!(msg.role, MessageRole::Assistant) && msg.content.contains("write_file") {
// The content should now reference an external file
assert!(msg.content.contains("<content saved to"));
assert!(!msg.content.contains(&large_content));
}
}
}
}
#[test]
fn test_thin_str_replace_tool_calls() {
let mut context = ContextWindow::new(10000);
// Add some messages including a str_replace tool call with large diff
context.add_message(Message {
role: MessageRole::User,
content: "Please update the file".to_string(),
});
// Add an assistant message with a str_replace tool call containing large diff
let large_diff = format!("--- old\n{}\n+++ new\n{}", "-old line\n".repeat(100), "+new line\n".repeat(100));
let tool_call_json = format!(
r#"{{"tool": "str_replace", "args": {{"file_path": "test.txt", "diff": "{}"}}}}"#,
large_diff.replace('\n', "\\n")
);
context.add_message(Message {
role: MessageRole::Assistant,
content: format!("I'll update that file.\n\n{}", tool_call_json),
});
context.add_message(Message {
role: MessageRole::User,
content: "Tool result: ✅ applied unified diff".to_string(),
});
// Add more messages to ensure we have enough for "first third" logic
for i in 0..6 {
context.add_message(Message {
role: MessageRole::Assistant,
content: format!("Response {}", i),
});
}
// Trigger thinning at 50%
context.used_tokens = 5000;
let (summary, _chars_saved) = context.thin_context();
println!("Thinning summary: {}", summary);
// Should have thinned the str_replace tool call
assert!(summary.contains("tool call") || summary.contains("chars saved"));
// Check that the large diff was replaced with a file reference
let first_third_end = context.conversation_history.len() / 3;
for i in 0..first_third_end {
if let Some(msg) = context.conversation_history.get(i) {
if matches!(msg.role, MessageRole::Assistant) && msg.content.contains("str_replace") {
// The diff should now reference an external file
assert!(msg.content.contains("<diff saved to"));
// Should not contain the large diff content
assert!(!msg.content.contains("old line"));
}
}
}
}
#[test] #[test]
fn test_thin_context_no_large_results() { fn test_thin_context_no_large_results() {
let mut context = ContextWindow::new(10000); let mut context = ContextWindow::new(10000);
@@ -106,10 +219,10 @@ fn test_thin_context_no_large_results() {
} }
context.used_tokens = 5000; context.used_tokens = 5000;
let summary = context.thin_context(); let (summary, _chars_saved) = context.thin_context();
// Should report no large results found // Should report no large results found
assert!(summary.contains("no large tool results found")); assert!(summary.contains("no large tool results or tool calls found"));
} }
#[test] #[test]
@@ -135,7 +248,7 @@ fn test_thin_context_only_affects_first_third() {
} }
context.used_tokens = 5000; context.used_tokens = 5000;
let summary = context.thin_context(); let (summary, _chars_saved) = context.thin_context();
// First third is 4 messages (indices 0-3), so only indices 1 and 3 should be thinned // First third is 4 messages (indices 0-3), so only indices 1 and 3 should be thinned
// That's 2 tool results // That's 2 tool results