From e1e732150a1a29e7cd1a4850e512dee38cfbf1bf Mon Sep 17 00:00:00 2001
From: Dhanji Prasanna <dhanji@squareup.com>
Date: Fri, 24 Oct 2025 10:11:43 +1100
Subject: [PATCH] coach rigor +++

---
 crates/g3-cli/src/lib.rs                      |  14 +-
 .../examples/list_windows.rs                  |   8 +-
 .../tests/integration_test.rs                 |  45 ------
 crates/g3-core/src/lib.rs                     | 149 +++++++++++++++++-
 crates/g3-core/tests/test_context_thinning.rs | 121 +++++++++++++-
 5 files changed, 278 insertions(+), 59 deletions(-)

diff --git a/crates/g3-cli/src/lib.rs b/crates/g3-cli/src/lib.rs
index db3b3f3..081c5da 100644
--- a/crates/g3-cli/src/lib.rs
+++ b/crates/g3-cli/src/lib.rs
@@ -243,6 +243,10 @@ pub struct Cli {
     /// Enable macOS Accessibility API tools for native app automation
     #[arg(long)]
     pub macax: bool,
+
+    /// Enable WebDriver browser automation tools
+    #[arg(long)]
+    pub webdriver: bool,
 }
 
 pub async fn run() -> Result<()> {
@@ -451,6 +455,11 @@ Output ONLY the markdown content, no explanations or meta-commentary."#,
         }
     }
 
+    // Apply webdriver flag override
+    if cli.webdriver {
+        config.webdriver.enabled = true;
+    }
+
     // Validate provider if specified
     if let Some(ref provider) = cli.provider {
         let valid_providers = ["anthropic", "databricks", "embedded", "openai"];
@@ -1630,6 +1639,7 @@ Review the current state of the project and provide a concise critique focusing
 2. Whether the project compiles successfully
 3. What requirements are missing or incorrect
 4. Specific improvements needed to satisfy requirements
+5. Use UI tools such as webdriver to test functionality thoroughly
 
 CRITICAL INSTRUCTIONS:
 1. You MUST use the final_output tool to provide your feedback
@@ -1637,13 +1647,13 @@ CRITICAL INSTRUCTIONS:
 3. Focus ONLY on what needs to be fixed or improved
 4. Do NOT include your analysis process, file contents, or compilation output in the summary
 
-If the implementation generally meets all requirements and compiles without errors:
+If the implementation thoroughly meets all requirements, compiles and is fully tested (especially UI flows) *WITHOUT* gaps or errors:
 - Call final_output with summary: 'IMPLEMENTATION_APPROVED'
 
 If improvements are needed:
 - Call final_output with a brief summary listing ONLY the specific issues to fix
 
-Remember: Be clear in your review and concise in your feedback. APPROVE if the implementation works and generally fits the requirements. Don't be picky.",
+Remember: Be clear in your review and concise in your feedback. APPROVE if the implementation works and thoroughly fits the requirements (implementation > 95% complete). Be rigorous, especially by testing that all UI features work.",
             requirements
         );
 
diff --git a/crates/g3-computer-control/examples/list_windows.rs b/crates/g3-computer-control/examples/list_windows.rs
index 5b571d9..e638a19 100644
--- a/crates/g3-computer-control/examples/list_windows.rs
+++ b/crates/g3-computer-control/examples/list_windows.rs
@@ -1,7 +1,7 @@
 use core_graphics::window::{kCGWindowListOptionOnScreenOnly, kCGNullWindowID, CGWindowListCopyWindowInfo};
 use core_foundation::dictionary::CFDictionary;
 use core_foundation::string::CFString;
-use core_foundation::base::TCFType;
+use core_foundation::base::{TCFType, ToVoid};
 
 fn main() {
     println!("Listing all on-screen windows...");
@@ -22,7 +22,7 @@ fn main() {
             
             // Get window ID
             let window_id_key = CFString::from_static_string("kCGWindowNumber");
-            let window_id: i64 = if let Some(value) = dict.find(window_id_key.as_concrete_TypeRef()) {
+            let window_id: i64 = if let Some(value) = dict.find(window_id_key.to_void()) {
                 let num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*value as *const _);
                 num.to_i64().unwrap_or(0)
             } else {
@@ -31,7 +31,7 @@ fn main() {
             
             // Get owner name
             let owner_key = CFString::from_static_string("kCGWindowOwnerName");
-            let owner: String = if let Some(value) = dict.find(owner_key.as_concrete_TypeRef()) {
+            let owner: String = if let Some(value) = dict.find(owner_key.to_void()) {
                 let s: CFString = TCFType::wrap_under_get_rule(*value as *const _);
                 s.to_string()
             } else {
@@ -40,7 +40,7 @@ fn main() {
             
             // Get window name/title
             let name_key = CFString::from_static_string("kCGWindowName");
-            let title: String = if let Some(value) = dict.find(name_key.as_concrete_TypeRef()) {
+            let title: String = if let Some(value) = dict.find(name_key.to_void()) {
                 let s: CFString = TCFType::wrap_under_get_rule(*value as *const _);
                 s.to_string()
             } else {
diff --git a/crates/g3-computer-control/tests/integration_test.rs b/crates/g3-computer-control/tests/integration_test.rs
index 75c884f..87227e5 100644
--- a/crates/g3-computer-control/tests/integration_test.rs
+++ b/crates/g3-computer-control/tests/integration_test.rs
@@ -1,23 +1,5 @@
 use g3_computer_control::*;
 
-#[tokio::test]
-async fn test_mouse_movement() {
-    let controller = create_controller().expect("Failed to create controller");
-    
-    // Move mouse to center of screen (assuming 1920x1080)
-    let result = controller.move_mouse(960, 540).await;
-    assert!(result.is_ok(), "Failed to move mouse: {:?}", result.err());
-}
-
-#[tokio::test]
-async fn test_typing() {
-    let controller = create_controller().expect("Failed to create controller");
-    
-    // Type some text
-    let result = controller.type_text("Hello, World!").await;
-    assert!(result.is_ok(), "Failed to type text: {:?}", result.err());
-}
-
 #[tokio::test]
 async fn test_screenshot() {
     let controller = create_controller().expect("Failed to create controller");
@@ -33,30 +15,3 @@ async fn test_screenshot() {
     // Clean up
     let _ = std::fs::remove_file(path);
 }
-
-#[tokio::test]
-async fn test_click() {
-    let controller = create_controller().expect("Failed to create controller");
-    
-    // Click at a safe location
-    let result = controller.click(types::MouseButton::Left).await;
-    assert!(result.is_ok(), "Failed to click: {:?}", result.err());
-}
-
-#[tokio::test]
-async fn test_double_click() {
-    let controller = create_controller().expect("Failed to create controller");
-    
-    // Double click
-    let result = controller.double_click(types::MouseButton::Left).await;
-    assert!(result.is_ok(), "Failed to double click: {:?}", result.err());
-}
-
-#[tokio::test]
-async fn test_press_key() {
-    let controller = create_controller().expect("Failed to create controller");
-    
-    // Press escape key
-    let result = controller.press_key("escape").await;
-    assert!(result.is_ok(), "Failed to press key: {:?}", result.err());
-}
diff --git a/crates/g3-core/src/lib.rs b/crates/g3-core/src/lib.rs
index 2786182..61bb974 100644
--- a/crates/g3-core/src/lib.rs
+++ b/crates/g3-core/src/lib.rs
@@ -466,6 +466,7 @@ Format this as a detailed but concise summary that can be used to resume the con
         let first_third_end = (total_messages / 3).max(1);
         
         let mut leaned_count = 0;
+        let mut tool_call_leaned_count = 0;
         let mut chars_saved = 0;
         
         // Create ~/tmp directory if it doesn't exist
@@ -478,7 +479,7 @@ Format this as a detailed but concise summary that can be used to resume the con
         // Scan the first third of messages
         for i in 0..first_third_end {
             if let Some(message) = self.conversation_history.get_mut(i) {
-                // Only process User messages that look like tool results
+                // Process User messages that look like tool results
                 if matches!(message.role, MessageRole::User) && message.content.starts_with("Tool result:") {
                     let content_len = message.content.len();
                     
@@ -508,6 +509,109 @@ Format this as a detailed but concise summary that can be used to resume the con
                         debug!("Thinned tool result {} ({} chars) to {}", i, original_len, file_path);
                     }
                 }
+                
+                // Process Assistant messages that contain tool calls with large arguments
+                if matches!(message.role, MessageRole::Assistant) {
+                    // Try to parse the message content as JSON to find tool calls
+                    let content = &message.content;
+                    
+                    // Look for JSON tool call patterns
+                    if let Some(tool_call_start) = content.find(r#"{"tool":"#)
+                        .or_else(|| content.find(r#"{ "tool":"#))
+                        .or_else(|| content.find(r#"{"tool" :"#))
+                        .or_else(|| content.find(r#"{ "tool" :"#))
+                    {
+                        // Try to extract and parse the JSON tool call
+                        let json_portion = &content[tool_call_start..];
+                        
+                        // Find the end of the JSON object
+                        if let Some(json_end) = Self::find_json_end(json_portion) {
+                            let json_str = &json_portion[..=json_end];
+                            
+                            // Try to parse as ToolCall
+                            if let Ok(mut tool_call) = serde_json::from_str::<ToolCall>(json_str) {
+                                let mut modified = false;
+                                
+                                // Handle write_file tool calls
+                                if tool_call.tool == "write_file" {
+                                    if let Some(args_obj) = tool_call.args.as_object_mut() {
+                                        // Extract content to avoid borrow issues
+                                        let content_info = args_obj.get("content")
+                                            .and_then(|v| v.as_str())
+                                            .map(|s| (s.to_string(), s.len()));
+                                        
+                                        if let Some((content_str, content_len)) = content_info {
+                                            // Only thin if content is greater than 1000 chars
+                                            if content_len > 1000 {
+                                                let timestamp = std::time::SystemTime::now()
+                                                    .duration_since(std::time::UNIX_EPOCH)
+                                                    .unwrap_or_default()
+                                                    .as_secs();
+                                                let filename = format!("leaned_write_file_content_{}_{}.txt", timestamp, i);
+                                                let file_path = format!("{}/{}", tmp_dir, filename);
+                                                
+                                                if std::fs::write(&file_path, &content_str).is_ok() {
+                                                    args_obj.insert(
+                                                        "content".to_string(),
+                                                        serde_json::Value::String(format!("<content saved to {}>", file_path))
+                                                    );
+                                                    modified = true;
+                                                    chars_saved += content_len;
+                                                    tool_call_leaned_count += 1;
+                                                    debug!("Thinned write_file content {} ({} chars) to {}", i, content_len, file_path);
+                                                }
+                                            }
+                                        }
+                                    }
+                                }
+                                
+                                // Handle str_replace tool calls
+                                if tool_call.tool == "str_replace" {
+                                    if let Some(args_obj) = tool_call.args.as_object_mut() {
+                                        // Extract diff to avoid borrow issues
+                                        let diff_info = args_obj.get("diff")
+                                            .and_then(|v| v.as_str())
+                                            .map(|s| (s.to_string(), s.len()));
+                                        
+                                        if let Some((diff_str, diff_len)) = diff_info {
+                                            // Only thin if diff is greater than 1000 chars
+                                            if diff_len > 1000 {
+                                                let timestamp = std::time::SystemTime::now()
+                                                    .duration_since(std::time::UNIX_EPOCH)
+                                                    .unwrap_or_default()
+                                                    .as_secs();
+                                                let filename = format!("leaned_str_replace_diff_{}_{}.txt", timestamp, i);
+                                                let file_path = format!("{}/{}", tmp_dir, filename);
+                                                
+                                                if std::fs::write(&file_path, &diff_str).is_ok() {
+                                                    args_obj.insert(
+                                                        "diff".to_string(),
+                                                        serde_json::Value::String(format!("<diff saved to {}>", file_path))
+                                                    );
+                                                    modified = true;
+                                                    chars_saved += diff_len;
+                                                    tool_call_leaned_count += 1;
+                                                    debug!("Thinned str_replace diff {} ({} chars) to {}", i, diff_len, file_path);
+                                                }
+                                            }
+                                        }
+                                    }
+                                }
+                                
+                                // If we modified the tool call, reconstruct the message
+                                if modified {
+                                    let prefix = &content[..tool_call_start];
+                                    let suffix = &content[tool_call_start + json_str.len()..];
+                                    
+                                    // Serialize the modified tool call
+                                    if let Ok(new_json) = serde_json::to_string(&tool_call) {
+                                        message.content = format!("{}{}{}", prefix, new_json, suffix);
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
             }
         }
         
@@ -515,10 +619,18 @@ Format this as a detailed but concise summary that can be used to resume the con
         self.recalculate_tokens();
         
         if leaned_count > 0 {
-            (format!("🥒 Context thinned at {}%: {} tool results, ~{} chars saved", 
-                    current_threshold, leaned_count, chars_saved), chars_saved)
+            if tool_call_leaned_count > 0 {
+                (format!("🥒 Context thinned at {}%: {} tool results + {} tool calls, ~{} chars saved", 
+                        current_threshold, leaned_count, tool_call_leaned_count, chars_saved), chars_saved)
+            } else {
+                (format!("🥒 Context thinned at {}%: {} tool results, ~{} chars saved", 
+                        current_threshold, leaned_count, chars_saved), chars_saved)
+            }
+        } else if tool_call_leaned_count > 0 {
+            (format!("🥒 Context thinned at {}%: {} tool calls, ~{} chars saved", 
+                    current_threshold, tool_call_leaned_count, chars_saved), chars_saved)
         } else {
-            (format!("ℹ Context thinning triggered at {}% but no large tool results found in first third", 
+            (format!("ℹ Context thinning triggered at {}% but no large tool results or tool calls found in first third", 
                     current_threshold), 0)
         }
     }
@@ -533,6 +645,35 @@ Format this as a detailed but concise summary that can be used to resume the con
         
         debug!("Recalculated tokens after thinning: {} tokens", total);
     }
+    
+    /// Helper function to find the end of a JSON object
+    fn find_json_end(json_str: &str) -> Option<usize> {
+        let mut brace_count = 0;
+        let mut in_string = false;
+        let mut escape_next = false;
+        
+        for (i, ch) in json_str.char_indices() {
+            if escape_next {
+                escape_next = false;
+                continue;
+            }
+            
+            match ch {
+                '\\' => escape_next = true,
+                '"' if !escape_next => in_string = !in_string,
+                '{' if !in_string => brace_count += 1,
+                '}' if !in_string => {
+                    brace_count -= 1;
+                    if brace_count == 0 {
+                        return Some(i);
+                    }
+                }
+                _ => {}
+            }
+        }
+        
+        None
+    }
 }
 
 pub struct Agent<W: UiWriter> {
diff --git a/crates/g3-core/tests/test_context_thinning.rs b/crates/g3-core/tests/test_context_thinning.rs
index 760524f..db6761f 100644
--- a/crates/g3-core/tests/test_context_thinning.rs
+++ b/crates/g3-core/tests/test_context_thinning.rs
@@ -72,7 +72,7 @@ fn test_thin_context_basic() {
     
     // Trigger thinning at 50%
     context.used_tokens = 5000;
-    let summary = context.thin_context();
+    let (summary, _chars_saved) = context.thin_context();
     
     println!("Thinning summary: {}", summary);
     
@@ -93,6 +93,119 @@ fn test_thin_context_basic() {
     }
 }
 
+#[test]
+fn test_thin_write_file_tool_calls() {
+    let mut context = ContextWindow::new(10000);
+    
+    // Add some messages including a write_file tool call with large content
+    context.add_message(Message {
+        role: MessageRole::User,
+        content: "Please create a large file".to_string(),
+    });
+    
+    // Add an assistant message with a write_file tool call containing large content
+    let large_content = "x".repeat(1500);
+    let tool_call_json = format!(
+        r#"{{"tool": "write_file", "args": {{"file_path": "test.txt", "content": "{}"}}}}"#,
+        large_content
+    );
+    context.add_message(Message {
+        role: MessageRole::Assistant,
+        content: format!("I'll create that file.\n\n{}", tool_call_json),
+    });
+    
+    context.add_message(Message {
+        role: MessageRole::User,
+        content: "Tool result: ✅ Successfully wrote 1500 lines".to_string(),
+    });
+    
+    // Add more messages to ensure we have enough for "first third" logic
+    for i in 0..6 {
+        context.add_message(Message {
+            role: MessageRole::Assistant,
+            content: format!("Response {}", i),
+        });
+    }
+    
+    // Trigger thinning at 50%
+    context.used_tokens = 5000;
+    let (summary, _chars_saved) = context.thin_context();
+    
+    println!("Thinning summary: {}", summary);
+    
+    // Should have thinned the write_file tool call
+    assert!(summary.contains("tool call") || summary.contains("chars saved"));
+    
+    // Check that the large content was replaced with a file reference
+    let first_third_end = context.conversation_history.len() / 3;
+    for i in 0..first_third_end {
+        if let Some(msg) = context.conversation_history.get(i) {
+            if matches!(msg.role, MessageRole::Assistant) && msg.content.contains("write_file") {
+                // The content should now reference an external file
+                assert!(msg.content.contains("<content saved to"));
+                assert!(!msg.content.contains(&large_content));
+            }
+        }
+    }
+}
+
+#[test]
+fn test_thin_str_replace_tool_calls() {
+    let mut context = ContextWindow::new(10000);
+    
+    // Add some messages including a str_replace tool call with large diff
+    context.add_message(Message {
+        role: MessageRole::User,
+        content: "Please update the file".to_string(),
+    });
+    
+    // Add an assistant message with a str_replace tool call containing large diff
+    let large_diff = format!("--- old\n{}\n+++ new\n{}", "-old line\n".repeat(100), "+new line\n".repeat(100));
+    let tool_call_json = format!(
+        r#"{{"tool": "str_replace", "args": {{"file_path": "test.txt", "diff": "{}"}}}}"#,
+        large_diff.replace('\n', "\\n")
+    );
+    context.add_message(Message {
+        role: MessageRole::Assistant,
+        content: format!("I'll update that file.\n\n{}", tool_call_json),
+    });
+    
+    context.add_message(Message {
+        role: MessageRole::User,
+        content: "Tool result: ✅ applied unified diff".to_string(),
+    });
+    
+    // Add more messages to ensure we have enough for "first third" logic
+    for i in 0..6 {
+        context.add_message(Message {
+            role: MessageRole::Assistant,
+            content: format!("Response {}", i),
+        });
+    }
+    
+    // Trigger thinning at 50%
+    context.used_tokens = 5000;
+    let (summary, _chars_saved) = context.thin_context();
+    
+    println!("Thinning summary: {}", summary);
+    
+    // Should have thinned the str_replace tool call
+    assert!(summary.contains("tool call") || summary.contains("chars saved"));
+    
+    // Check that the large diff was replaced with a file reference
+    let first_third_end = context.conversation_history.len() / 3;
+    for i in 0..first_third_end {
+        if let Some(msg) = context.conversation_history.get(i) {
+            if matches!(msg.role, MessageRole::Assistant) && msg.content.contains("str_replace") {
+                // The diff should now reference an external file
+                assert!(msg.content.contains("<diff saved to"));
+                // Should not contain the large diff content
+                assert!(!msg.content.contains("old line"));
+            }
+        }
+    }
+}
+
 #[test]
 fn test_thin_context_no_large_results() {
     let mut context = ContextWindow::new(10000);
@@ -106,10 +219,10 @@ fn test_thin_context_no_large_results() {
     }
     
     context.used_tokens = 5000;
-    let summary = context.thin_context();
+    let (summary, _chars_saved) = context.thin_context();
     
     // Should report no large results found
-    assert!(summary.contains("no large tool results found"));
+    assert!(summary.contains("no large tool results or tool calls found"));
 }
 
 #[test]
@@ -135,7 +248,7 @@ fn test_thin_context_only_affects_first_third() {
     }
     
     context.used_tokens = 5000;
-    let summary = context.thin_context();
+    let (summary, _chars_saved) = context.thin_context();
     
     // First third is 4 messages (indices 0-3), so only indices 1 and 3 should be thinned
     // That's 2 tool results