From d0ac222e2eb71c610f1564c63f409398fa6a9026 Mon Sep 17 00:00:00 2001 From: Dhanji Prasanna Date: Fri, 24 Oct 2025 10:45:24 +1100 Subject: [PATCH] more macax tooling --- crates/g3-cli/src/lib.rs | 6 +- crates/g3-computer-control/src/lib.rs | 6 + .../g3-computer-control/src/platform/macos.rs | 150 ++++++++++++++++- crates/g3-computer-control/src/types.rs | 10 ++ crates/g3-core/src/lib.rs | 157 +++++++++++++++++- crates/g3-providers/src/databricks.rs | 8 + 6 files changed, 328 insertions(+), 9 deletions(-) diff --git a/crates/g3-cli/src/lib.rs b/crates/g3-cli/src/lib.rs index 081c5da..3facf6a 100644 --- a/crates/g3-cli/src/lib.rs +++ b/crates/g3-cli/src/lib.rs @@ -1639,7 +1639,7 @@ Review the current state of the project and provide a concise critique focusing 2. Whether the project compiles successfully 3. What requirements are missing or incorrect 4. Specific improvements needed to satisfy requirements -5. Use UI tools such as webdriver to test functionality thoroughly +5. Use UI tools such as webdriver or macax to test functionality thoroughly CRITICAL INSTRUCTIONS: 1. You MUST use the final_output tool to provide your feedback @@ -1647,13 +1647,13 @@ CRITICAL INSTRUCTIONS: 3. Focus ONLY on what needs to be fixed or improved 4. Do NOT include your analysis process, file contents, or compilation output in the summary -If the implementation thoroughly meets all requirements, compiles and is fully tested (especially UI flows) *WITHOUT* gaps or errors: +If the implementation thoroughly meets all requirements, compiles and is fully tested (especially UI flows) *WITHOUT* minor gaps or errors: - Call final_output with summary: 'IMPLEMENTATION_APPROVED' If improvements are needed: - Call final_output with a brief summary listing ONLY the specific issues to fix -Remember: Be clear in your review and concise in your feedback. APPROVE if the implementation works and thoroughly fits the requirements (implementation > 95% complete). Be rigorous, especially by testing that all UI features work.", +Remember: Be clear in your review and concise in your feedback. APPROVE iff the implementation works and thoroughly fits the requirements (implementation > 95% complete). Be rigorous, especially by testing that all UI features work.", requirements ); diff --git a/crates/g3-computer-control/src/lib.rs b/crates/g3-computer-control/src/lib.rs index e4180c6..ad564b5 100644 --- a/crates/g3-computer-control/src/lib.rs +++ b/crates/g3-computer-control/src/lib.rs @@ -24,6 +24,12 @@ pub trait ComputerController: Send + Sync { // OCR operations async fn extract_text_from_screen(&self, region: Rect) -> Result; async fn extract_text_from_image(&self, path: &str) -> Result; + async fn extract_text_with_locations(&self, path: &str) -> Result>; + async fn find_text_on_screen(&self, search_text: &str) -> Result>; + + // Mouse operations + fn move_mouse(&self, x: i32, y: i32) -> Result<()>; + fn click_at(&self, x: i32, y: i32) -> Result<()>; } // Platform-specific constructor diff --git a/crates/g3-computer-control/src/platform/macos.rs b/crates/g3-computer-control/src/platform/macos.rs index 129b73c..d2e6a0a 100644 --- a/crates/g3-computer-control/src/platform/macos.rs +++ b/crates/g3-computer-control/src/platform/macos.rs @@ -1,5 +1,5 @@ -use crate::{ComputerController, types::Rect}; -use anyhow::Result; +use crate::{ComputerController, types::{Rect, TextLocation}}; +use anyhow::{Result, Context}; use async_trait::async_trait; use std::path::Path; use tesseract::Tesseract; @@ -122,4 +122,150 @@ impl ComputerController for MacOSController { Ok(text) } + + async fn extract_text_with_locations(&self, path: &str) -> Result> { + // For now, use tesseract CLI with TSV output to get bounding boxes + // This is a workaround since the Rust tesseract crate doesn't expose get_component_boxes + let output = std::process::Command::new("tesseract") + .arg(path) + .arg("stdout") + .arg("tsv") + .output() + .map_err(|e| anyhow::anyhow!("Failed to run tesseract: {}", e))?; + + if !output.status.success() { + anyhow::bail!("Tesseract failed: {}", String::from_utf8_lossy(&output.stderr)); + } + + let tsv_text = String::from_utf8_lossy(&output.stdout); + let mut locations = Vec::new(); + + // Parse TSV output (skip header line) + for (i, line) in tsv_text.lines().enumerate() { + if i == 0 { continue; } // Skip header + + let parts: Vec<&str> = line.split('\t').collect(); + if parts.len() >= 12 { + // TSV format: level, page_num, block_num, par_num, line_num, word_num, + // left, top, width, height, conf, text + if let (Ok(x), Ok(y), Ok(w), Ok(h), Ok(conf), text) = ( + parts[6].parse::(), + parts[7].parse::(), + parts[8].parse::(), + parts[9].parse::(), + parts[10].parse::(), + parts[11], + ) { + let trimmed = text.trim(); + if !trimmed.is_empty() && conf > 0.0 { + locations.push(TextLocation { + text: trimmed.to_string(), + x, + y, + width: w, + height: h, + confidence: conf / 100.0, // Convert from 0-100 to 0-1 + }); + } + } + } + } + + Ok(locations) + } + + async fn find_text_on_screen(&self, search_text: &str) -> Result> { + // Take full screenshot + let home = std::env::var("HOME").unwrap_or_else(|_| "/tmp".to_string()); + let temp_path = format!("{}/Desktop/g3_find_text_{}.png", home, uuid::Uuid::new_v4()); + self.take_screenshot(&temp_path, None, None).await?; + + // Extract all text with locations + let locations = self.extract_text_with_locations(&temp_path).await?; + + // Clean up temp file + let _ = std::fs::remove_file(&temp_path); + + // Find matching text (case-insensitive) + let search_lower = search_text.to_lowercase(); + for location in locations { + if location.text.to_lowercase().contains(&search_lower) { + return Ok(Some(location)); + } + } + + Ok(None) + } + + fn move_mouse(&self, x: i32, y: i32) -> Result<()> { + use core_graphics::event::{ + CGEvent, CGEventTapLocation, CGEventType, CGMouseButton, + }; + use core_graphics::event_source::{ + CGEventSource, CGEventSourceStateID, + }; + use core_graphics::geometry::CGPoint; + + let source = CGEventSource::new(CGEventSourceStateID::HIDSystemState) + .ok().context("Failed to create event source")?; + + let event = CGEvent::new_mouse_event( + source, + CGEventType::MouseMoved, + CGPoint::new(x as f64, y as f64), + CGMouseButton::Left, + ).ok().context("Failed to create mouse event")?; + + event.post(CGEventTapLocation::HID); + + Ok(()) + } + + fn click_at(&self, x: i32, y: i32) -> Result<()> { + use core_graphics::event::{ + CGEvent, CGEventTapLocation, CGEventType, CGMouseButton, + }; + use core_graphics::event_source::{ + CGEventSource, CGEventSourceStateID, + }; + use core_graphics::geometry::CGPoint; + + let source = CGEventSource::new(CGEventSourceStateID::HIDSystemState) + .ok().context("Failed to create event source")?; + + let point = CGPoint::new(x as f64, y as f64); + + // Move mouse to position first + let move_event = CGEvent::new_mouse_event( + source.clone(), + CGEventType::MouseMoved, + point, + CGMouseButton::Left, + ).ok().context("Failed to create mouse move event")?; + move_event.post(CGEventTapLocation::HID); + + std::thread::sleep(std::time::Duration::from_millis(100)); + + // Mouse down + let mouse_down = CGEvent::new_mouse_event( + source.clone(), + CGEventType::LeftMouseDown, + point, + CGMouseButton::Left, + ).ok().context("Failed to create mouse down event")?; + mouse_down.post(CGEventTapLocation::HID); + + std::thread::sleep(std::time::Duration::from_millis(50)); + + // Mouse up + let mouse_up = CGEvent::new_mouse_event( + source, + CGEventType::LeftMouseUp, + point, + CGMouseButton::Left, + ).ok().context("Failed to create mouse up event")?; + mouse_up.post(CGEventTapLocation::HID); + + Ok(()) + } } \ No newline at end of file diff --git a/crates/g3-computer-control/src/types.rs b/crates/g3-computer-control/src/types.rs index e7ea40e..7d09042 100644 --- a/crates/g3-computer-control/src/types.rs +++ b/crates/g3-computer-control/src/types.rs @@ -7,3 +7,13 @@ pub struct Rect { pub width: i32, pub height: i32, } + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct TextLocation { + pub text: String, + pub x: i32, + pub y: i32, + pub width: i32, + pub height: i32, + pub confidence: f32, +} diff --git a/crates/g3-core/src/lib.rs b/crates/g3-core/src/lib.rs index 61bb974..14f36c3 100644 --- a/crates/g3-core/src/lib.rs +++ b/crates/g3-core/src/lib.rs @@ -1239,7 +1239,7 @@ Template: // Check if provider supports native tool calling and add tools if so let provider = self.providers.get(None)?; let tools = if provider.has_native_tool_calling() { - Some(Self::create_tool_definitions(self.config.webdriver.enabled, self.config.macax.enabled)) + Some(Self::create_tool_definitions(self.config.webdriver.enabled, self.config.macax.enabled, self.config.computer_control.enabled)) } else { None }; @@ -1700,7 +1700,7 @@ Template: } /// Create tool definitions for native tool calling providers - fn create_tool_definitions(enable_webdriver: bool, enable_macax: bool) -> Vec { + fn create_tool_definitions(enable_webdriver: bool, enable_macax: bool, enable_computer_control: bool) -> Vec { let mut tools = vec![ Tool { name: "shell".to_string(), @@ -2279,7 +2279,65 @@ Template: }), }); } - + + // Add vision-guided tools (requires computer control) + if enable_computer_control { + // Add vision-guided tools + tools.push(Tool { + name: "vision_find_text".to_string(), + description: "Find text on screen and return its location (useful for locating UI elements)".to_string(), + input_schema: json!({ + "type": "object", + "properties": { + "text": { + "type": "string", + "description": "The text to search for on screen" + } + }, + "required": ["text"] + }), + }); + + tools.push(Tool { + name: "vision_click_text".to_string(), + description: "Find text on screen and click on it (useful for clicking buttons, links, menu items)".to_string(), + input_schema: json!({ + "type": "object", + "properties": { + "text": { + "type": "string", + "description": "The text to click on (e.g., 'Submit', 'OK', 'Cancel', '+')" + } + }, + "required": ["text"] + }), + }); + + tools.push(Tool { + name: "vision_click_near_text".to_string(), + description: "Find text on screen and click near it (useful for clicking text fields next to labels)".to_string(), + input_schema: json!({ + "type": "object", + "properties": { + "text": { + "type": "string", + "description": "The label text to find (e.g., 'Name:', 'Email:', 'Task:')" + }, + "direction": { + "type": "string", + "enum": ["right", "below", "left", "above"], + "description": "Direction to click relative to the text (default: right)" + }, + "distance": { + "type": "integer", + "description": "Distance in pixels from the text (default: 50)" + } + }, + "required": ["text"] + }), + }); + } + tools } @@ -2844,7 +2902,7 @@ Template: // Ensure tools are included for native providers in subsequent iterations if provider.has_native_tool_calling() { - request.tools = Some(Self::create_tool_definitions(self.config.webdriver.enabled, self.config.macax.enabled)); + request.tools = Some(Self::create_tool_definitions(self.config.webdriver.enabled, self.config.macax.enabled, self.config.computer_control.enabled)); } // Only add to full_response if we haven't already added it @@ -4529,6 +4587,97 @@ Template: Err(e) => Ok(format!("❌ Failed to focus element: {}", e)), } } + "vision_find_text" => { + debug!("Processing vision_find_text tool call"); + + if let Some(controller) = &self.computer_controller { + let text = tool_call.args.get("text") + .and_then(|v| v.as_str()) + .ok_or_else(|| anyhow::anyhow!("Missing text parameter"))?; + + match controller.find_text_on_screen(text).await { + Ok(Some(location)) => { + Ok(format!( + "✅ Found '{}' at position ({}, {}) with size {}x{} (confidence: {:.0}%)", + location.text, location.x, location.y, location.width, location.height, + location.confidence * 100.0 + )) + } + Ok(None) => Ok(format!("❌ Could not find '{}' on screen", text)), + Err(e) => Ok(format!("❌ Error finding text: {}", e)), + } + } else { + Ok("❌ Computer control not enabled. Set computer_control.enabled = true in config.".to_string()) + } + } + "vision_click_text" => { + debug!("Processing vision_click_text tool call"); + + if let Some(controller) = &self.computer_controller { + let text = tool_call.args.get("text") + .and_then(|v| v.as_str()) + .ok_or_else(|| anyhow::anyhow!("Missing text parameter"))?; + + match controller.find_text_on_screen(text).await { + Ok(Some(location)) => { + // Click on center of text + let center_x = location.x + location.width / 2; + let center_y = location.y + location.height / 2; + + match controller.click_at(center_x, center_y) { + Ok(_) => Ok(format!("✅ Clicked on '{}' at ({}, {})", text, center_x, center_y)), + Err(e) => Ok(format!("❌ Failed to click: {}", e)), + } + } + Ok(None) => Ok(format!("❌ Could not find '{}' on screen", text)), + Err(e) => Ok(format!("❌ Error finding text: {}", e)), + } + } else { + Ok("❌ Computer control not enabled. Set computer_control.enabled = true in config.".to_string()) + } + } + "vision_click_near_text" => { + debug!("Processing vision_click_near_text tool call"); + + if let Some(controller) = &self.computer_controller { + let text = tool_call.args.get("text") + .and_then(|v| v.as_str()) + .ok_or_else(|| anyhow::anyhow!("Missing text parameter"))?; + + let direction = tool_call.args.get("direction") + .and_then(|v| v.as_str()) + .unwrap_or("right"); + + let distance = tool_call.args.get("distance") + .and_then(|v| v.as_i64()) + .unwrap_or(50) as i32; + + match controller.find_text_on_screen(text).await { + Ok(Some(location)) => { + // Calculate click position based on direction + let (click_x, click_y) = match direction { + "right" => (location.x + location.width + distance, location.y + location.height / 2), + "below" => (location.x + location.width / 2, location.y + location.height + distance), + "left" => (location.x - distance, location.y + location.height / 2), + "above" => (location.x + location.width / 2, location.y - distance), + _ => (location.x + location.width + distance, location.y + location.height / 2), + }; + + match controller.click_at(click_x, click_y) { + Ok(_) => Ok(format!( + "✅ Clicked {} of '{}' at ({}, {})", + direction, text, click_x, click_y + )), + Err(e) => Ok(format!("❌ Failed to click: {}", e)), + } + } + Ok(None) => Ok(format!("❌ Could not find '{}' on screen", text)), + Err(e) => Ok(format!("❌ Error finding text: {}", e)), + } + } else { + Ok("❌ Computer control not enabled. Set computer_control.enabled = true in config.".to_string()) + } + } _ => { warn!("Unknown tool: {}", tool_call.tool); Ok(format!("❓ Unknown tool: {}", tool_call.tool)) diff --git a/crates/g3-providers/src/databricks.rs b/crates/g3-providers/src/databricks.rs index 02c669a..50373d6 100644 --- a/crates/g3-providers/src/databricks.rs +++ b/crates/g3-providers/src/databricks.rs @@ -881,6 +881,14 @@ impl LLMProvider for DatabricksProvider { "Processing Databricks streaming request with {} messages", request.messages.len() ); + + // Debug: Log tool count + if let Some(ref tools) = request.tools { + debug!("Request has {} tools", tools.len()); + for tool in tools.iter().take(5) { + debug!(" Tool: {}", tool.name); + } + } let max_tokens = request.max_tokens.unwrap_or(self.max_tokens); let temperature = request.temperature.unwrap_or(self.temperature);