more macax tooling

This commit is contained in:
Dhanji Prasanna
2025-10-24 10:45:24 +11:00
parent e1e732150a
commit d0ac222e2e
6 changed files with 328 additions and 9 deletions

View File

@@ -1639,7 +1639,7 @@ Review the current state of the project and provide a concise critique focusing
2. Whether the project compiles successfully 2. Whether the project compiles successfully
3. What requirements are missing or incorrect 3. What requirements are missing or incorrect
4. Specific improvements needed to satisfy requirements 4. Specific improvements needed to satisfy requirements
5. Use UI tools such as webdriver to test functionality thoroughly 5. Use UI tools such as webdriver or macax to test functionality thoroughly
CRITICAL INSTRUCTIONS: CRITICAL INSTRUCTIONS:
1. You MUST use the final_output tool to provide your feedback 1. You MUST use the final_output tool to provide your feedback
@@ -1647,13 +1647,13 @@ CRITICAL INSTRUCTIONS:
3. Focus ONLY on what needs to be fixed or improved 3. Focus ONLY on what needs to be fixed or improved
4. Do NOT include your analysis process, file contents, or compilation output in the summary 4. Do NOT include your analysis process, file contents, or compilation output in the summary
If the implementation thoroughly meets all requirements, compiles and is fully tested (especially UI flows) *WITHOUT* gaps or errors: If the implementation thoroughly meets all requirements, compiles and is fully tested (especially UI flows) *WITHOUT* minor gaps or errors:
- Call final_output with summary: 'IMPLEMENTATION_APPROVED' - Call final_output with summary: 'IMPLEMENTATION_APPROVED'
If improvements are needed: If improvements are needed:
- Call final_output with a brief summary listing ONLY the specific issues to fix - Call final_output with a brief summary listing ONLY the specific issues to fix
Remember: Be clear in your review and concise in your feedback. APPROVE if the implementation works and thoroughly fits the requirements (implementation > 95% complete). Be rigorous, especially by testing that all UI features work.", Remember: Be clear in your review and concise in your feedback. APPROVE iff the implementation works and thoroughly fits the requirements (implementation > 95% complete). Be rigorous, especially by testing that all UI features work.",
requirements requirements
); );

View File

@@ -24,6 +24,12 @@ pub trait ComputerController: Send + Sync {
// OCR operations // OCR operations
async fn extract_text_from_screen(&self, region: Rect) -> Result<String>; async fn extract_text_from_screen(&self, region: Rect) -> Result<String>;
async fn extract_text_from_image(&self, path: &str) -> Result<String>; async fn extract_text_from_image(&self, path: &str) -> Result<String>;
/// Runs OCR on the image file at `path` and returns every recognized text item
/// together with its bounding box and confidence.
async fn extract_text_with_locations(&self, path: &str) -> Result<Vec<TextLocation>>;
/// Searches the screen for `search_text` and returns the location of a match,
/// or `None` when nothing on screen matches.
async fn find_text_on_screen(&self, search_text: &str) -> Result<Option<TextLocation>>;
// Mouse operations
/// Moves the mouse pointer to `(x, y)`.
fn move_mouse(&self, x: i32, y: i32) -> Result<()>;
/// Moves the mouse pointer to `(x, y)` and performs a left click there.
fn click_at(&self, x: i32, y: i32) -> Result<()>;
} }
// Platform-specific constructor // Platform-specific constructor

View File

@@ -1,5 +1,5 @@
use crate::{ComputerController, types::Rect}; use crate::{ComputerController, types::{Rect, TextLocation}};
use anyhow::Result; use anyhow::{Result, Context};
use async_trait::async_trait; use async_trait::async_trait;
use std::path::Path; use std::path::Path;
use tesseract::Tesseract; use tesseract::Tesseract;
@@ -122,4 +122,150 @@ impl ComputerController for MacOSController {
Ok(text) Ok(text)
} }
/// Runs the `tesseract` CLI on the image at `path` and returns each recognized
/// text item with its bounding box and a confidence scaled to `0.0..=1.0`.
///
/// # Errors
/// Fails when the `tesseract` process cannot be started or exits unsuccessfully.
async fn extract_text_with_locations(&self, path: &str) -> Result<Vec<TextLocation>> {
    // The Rust tesseract crate doesn't expose get_component_boxes, so shell out to the
    // CLI and read its TSV report, which carries one bounding box per row.
    let run = std::process::Command::new("tesseract")
        .args(&[path, "stdout", "tsv"])
        .output()
        .map_err(|e| anyhow::anyhow!("Failed to run tesseract: {}", e))?;

    if !run.status.success() {
        anyhow::bail!("Tesseract failed: {}", String::from_utf8_lossy(&run.stderr));
    }

    let report = String::from_utf8_lossy(&run.stdout);

    // TSV columns: level, page_num, block_num, par_num, line_num, word_num,
    //              left, top, width, height, conf, text
    let found = report
        .lines()
        .skip(1) // first row is the column header
        .filter_map(|row| {
            let cols: Vec<&str> = row.split('\t').collect();
            if cols.len() < 12 {
                return None;
            }

            // Rows whose numeric columns do not parse are silently skipped.
            let left = cols[6].parse::<i32>().ok()?;
            let top = cols[7].parse::<i32>().ok()?;
            let box_width = cols[8].parse::<i32>().ok()?;
            let box_height = cols[9].parse::<i32>().ok()?;
            let raw_conf = cols[10].parse::<f32>().ok()?;
            let word = cols[11].trim();

            // Keep only rows that carry text and a strictly positive confidence.
            if raw_conf > 0.0 && !word.is_empty() {
                Some(TextLocation {
                    text: word.to_string(),
                    x: left,
                    y: top,
                    width: box_width,
                    height: box_height,
                    confidence: raw_conf / 100.0, // Convert from 0-100 to 0-1
                })
            } else {
                None
            }
        })
        .collect();

    Ok(found)
}
/// Captures the full screen, OCRs the capture, and returns the first recognized
/// text item that contains `search_text` (case-insensitive).
///
/// Returns `Ok(None)` when nothing matches or when `search_text` is empty.
///
/// # Errors
/// Propagates failures from `take_screenshot` and `extract_text_with_locations`.
/// The temporary screenshot is removed on both the success and the failure path.
async fn find_text_on_screen(&self, search_text: &str) -> Result<Option<TextLocation>> {
    // An empty needle is contained in every string; without this guard the first
    // OCR item on screen would be reported as a "match".
    if search_text.is_empty() {
        return Ok(None);
    }

    // Take full screenshot into a uniquely named temp file.
    let home = std::env::var("HOME").unwrap_or_else(|_| "/tmp".to_string());
    let temp_path = format!("{}/Desktop/g3_find_text_{}.png", home, uuid::Uuid::new_v4());

    // Run capture + OCR but hold on to the outcome instead of returning early, so
    // the temp file is cleaned up even when one of the two steps fails.
    let ocr: Result<Vec<TextLocation>> =
        match self.take_screenshot(&temp_path, None, None).await {
            Ok(_) => self.extract_text_with_locations(&temp_path).await,
            Err(e) => Err(e.into()),
        };

    // Best-effort cleanup: the file may not exist if the capture itself failed.
    let _ = std::fs::remove_file(&temp_path);

    let locations = ocr?;

    // NOTE(review): each location appears to be a single OCR item from the TSV rows,
    // so a multi-word `search_text` may never match — confirm whether phrase matching
    // is needed.
    // NOTE(review): coordinates are in screenshot pixels; confirm they match the
    // coordinate space `click_at` expects on HiDPI displays.
    let needle = search_text.to_lowercase();
    Ok(locations
        .into_iter()
        .find(|location| location.text.to_lowercase().contains(&needle)))
}
/// Posts a single mouse-moved event at `(x, y)` to the HID event tap.
///
/// # Errors
/// Fails when the event source or the mouse event cannot be created.
fn move_mouse(&self, x: i32, y: i32) -> Result<()> {
    use core_graphics::event::{CGEvent, CGEventTapLocation, CGEventType, CGMouseButton};
    use core_graphics::event_source::{CGEventSource, CGEventSourceStateID};
    use core_graphics::geometry::CGPoint;

    let target = CGPoint::new(f64::from(x), f64::from(y));

    let event_source = CGEventSource::new(CGEventSourceStateID::HIDSystemState)
        .ok()
        .context("Failed to create event source")?;

    // The button argument is required by the constructor even for a pure move.
    CGEvent::new_mouse_event(
        event_source,
        CGEventType::MouseMoved,
        target,
        CGMouseButton::Left,
    )
    .ok()
    .context("Failed to create mouse event")?
    .post(CGEventTapLocation::HID);

    Ok(())
}
/// Moves the pointer to `(x, y)` and performs a left click there by posting
/// move, button-down and button-up events to the HID event tap.
///
/// The call blocks for roughly 150 ms in total because of the pauses between events.
///
/// # Errors
/// Fails when the event source or any of the three events cannot be created.
fn click_at(&self, x: i32, y: i32) -> Result<()> {
    use core_graphics::event::{CGEvent, CGEventTapLocation, CGEventType, CGMouseButton};
    use core_graphics::event_source::{CGEventSource, CGEventSourceStateID};
    use core_graphics::geometry::CGPoint;
    use std::thread::sleep;
    use std::time::Duration;

    let source = CGEventSource::new(CGEventSourceStateID::HIDSystemState)
        .ok()
        .context("Failed to create event source")?;
    let point = CGPoint::new(x as f64, y as f64);

    // Builds one left-button mouse event of the given kind at `point` and posts it.
    let emit = |kind: CGEventType, failure: &'static str| -> Result<()> {
        CGEvent::new_mouse_event(source.clone(), kind, point, CGMouseButton::Left)
            .ok()
            .context(failure)?
            .post(CGEventTapLocation::HID);
        Ok(())
    };

    // Move mouse to position first
    emit(CGEventType::MouseMoved, "Failed to create mouse move event")?;
    sleep(Duration::from_millis(100));

    // Mouse down
    emit(CGEventType::LeftMouseDown, "Failed to create mouse down event")?;
    sleep(Duration::from_millis(50));

    // Mouse up
    emit(CGEventType::LeftMouseUp, "Failed to create mouse up event")?;

    Ok(())
}
} }

View File

@@ -7,3 +7,13 @@ pub struct Rect {
pub width: i32, pub width: i32,
pub height: i32, pub height: i32,
} }
/// A piece of text recognized by OCR together with its bounding box.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TextLocation {
    /// The recognized text.
    pub text: String,
    /// Left edge of the bounding box (presumably in pixels of the OCR'd image — TODO confirm).
    pub x: i32,
    /// Top edge of the bounding box.
    pub y: i32,
    /// Width of the bounding box.
    pub width: i32,
    /// Height of the bounding box.
    pub height: i32,
    /// Recognition confidence; callers format it as a percentage by multiplying by 100,
    /// so it is expected to be in the range 0.0 to 1.0.
    pub confidence: f32,
}

View File

@@ -1239,7 +1239,7 @@ Template:
// Check if provider supports native tool calling and add tools if so // Check if provider supports native tool calling and add tools if so
let provider = self.providers.get(None)?; let provider = self.providers.get(None)?;
let tools = if provider.has_native_tool_calling() { let tools = if provider.has_native_tool_calling() {
Some(Self::create_tool_definitions(self.config.webdriver.enabled, self.config.macax.enabled)) Some(Self::create_tool_definitions(self.config.webdriver.enabled, self.config.macax.enabled, self.config.computer_control.enabled))
} else { } else {
None None
}; };
@@ -1700,7 +1700,7 @@ Template:
} }
/// Create tool definitions for native tool calling providers /// Create tool definitions for native tool calling providers
fn create_tool_definitions(enable_webdriver: bool, enable_macax: bool) -> Vec<Tool> { fn create_tool_definitions(enable_webdriver: bool, enable_macax: bool, enable_computer_control: bool) -> Vec<Tool> {
let mut tools = vec![ let mut tools = vec![
Tool { Tool {
name: "shell".to_string(), name: "shell".to_string(),
@@ -2280,6 +2280,64 @@ Template:
}); });
} }
// Vision-guided tools locate on-screen text and click relative to it; they are only
// offered when computer control is enabled.
if enable_computer_control {
    tools.extend(vec![
        Tool {
            name: "vision_find_text".to_string(),
            description: "Find text on screen and return its location (useful for locating UI elements)".to_string(),
            input_schema: json!({
                "type": "object",
                "properties": {
                    "text": {
                        "type": "string",
                        "description": "The text to search for on screen"
                    }
                },
                "required": ["text"]
            }),
        },
        Tool {
            name: "vision_click_text".to_string(),
            description: "Find text on screen and click on it (useful for clicking buttons, links, menu items)".to_string(),
            input_schema: json!({
                "type": "object",
                "properties": {
                    "text": {
                        "type": "string",
                        "description": "The text to click on (e.g., 'Submit', 'OK', 'Cancel', '+')"
                    }
                },
                "required": ["text"]
            }),
        },
        Tool {
            name: "vision_click_near_text".to_string(),
            description: "Find text on screen and click near it (useful for clicking text fields next to labels)".to_string(),
            input_schema: json!({
                "type": "object",
                "properties": {
                    "text": {
                        "type": "string",
                        "description": "The label text to find (e.g., 'Name:', 'Email:', 'Task:')"
                    },
                    "direction": {
                        "type": "string",
                        "enum": ["right", "below", "left", "above"],
                        "description": "Direction to click relative to the text (default: right)"
                    },
                    "distance": {
                        "type": "integer",
                        "description": "Distance in pixels from the text (default: 50)"
                    }
                },
                "required": ["text"]
            }),
        },
    ]);
}
tools tools
} }
@@ -2844,7 +2902,7 @@ Template:
// Ensure tools are included for native providers in subsequent iterations // Ensure tools are included for native providers in subsequent iterations
if provider.has_native_tool_calling() { if provider.has_native_tool_calling() {
request.tools = Some(Self::create_tool_definitions(self.config.webdriver.enabled, self.config.macax.enabled)); request.tools = Some(Self::create_tool_definitions(self.config.webdriver.enabled, self.config.macax.enabled, self.config.computer_control.enabled));
} }
// Only add to full_response if we haven't already added it // Only add to full_response if we haven't already added it
@@ -4529,6 +4587,97 @@ Template:
Err(e) => Ok(format!("❌ Failed to focus element: {}", e)), Err(e) => Ok(format!("❌ Failed to focus element: {}", e)),
} }
} }
// Locate `text` on screen via OCR and report its bounding box and confidence.
"vision_find_text" => {
    debug!("Processing vision_find_text tool call");
    match &self.computer_controller {
        None => Ok("❌ Computer control not enabled. Set computer_control.enabled = true in config.".to_string()),
        Some(controller) => {
            let needle = tool_call
                .args
                .get("text")
                .and_then(|value| value.as_str())
                .ok_or_else(|| anyhow::anyhow!("Missing text parameter"))?;

            // Lookup failures are reported back as tool output rather than as an error.
            let report = match controller.find_text_on_screen(needle).await {
                Ok(Some(hit)) => format!(
                    "✅ Found '{}' at position ({}, {}) with size {}x{} (confidence: {:.0}%)",
                    hit.text,
                    hit.x,
                    hit.y,
                    hit.width,
                    hit.height,
                    hit.confidence * 100.0
                ),
                Ok(None) => format!("❌ Could not find '{}' on screen", needle),
                Err(err) => format!("❌ Error finding text: {}", err),
            };
            Ok(report)
        }
    }
}
// Locate `text` on screen via OCR and left-click the centre of its bounding box.
"vision_click_text" => {
    debug!("Processing vision_click_text tool call");
    match &self.computer_controller {
        None => Ok("❌ Computer control not enabled. Set computer_control.enabled = true in config.".to_string()),
        Some(controller) => {
            let needle = tool_call
                .args
                .get("text")
                .and_then(|value| value.as_str())
                .ok_or_else(|| anyhow::anyhow!("Missing text parameter"))?;

            // Lookup and click failures are reported back as tool output, not as errors.
            let report = match controller.find_text_on_screen(needle).await {
                Ok(Some(hit)) => {
                    // Click on center of text
                    let center_x = hit.x + hit.width / 2;
                    let center_y = hit.y + hit.height / 2;
                    match controller.click_at(center_x, center_y) {
                        Ok(_) => format!("✅ Clicked on '{}' at ({}, {})", needle, center_x, center_y),
                        Err(err) => format!("❌ Failed to click: {}", err),
                    }
                }
                Ok(None) => format!("❌ Could not find '{}' on screen", needle),
                Err(err) => format!("❌ Error finding text: {}", err),
            };
            Ok(report)
        }
    }
}
// Locate a label on screen via OCR and left-click a point `distance` pixels away from
// it in the requested direction (used e.g. to focus an input field beside its label).
"vision_click_near_text" => {
    debug!("Processing vision_click_near_text tool call");
    if let Some(controller) = &self.computer_controller {
        let text = tool_call.args.get("text")
            .and_then(|v| v.as_str())
            .ok_or_else(|| anyhow::anyhow!("Missing text parameter"))?;

        // Anything other than the four known directions falls back to "right".
        // Normalising up front keeps the success message truthful about where
        // the click actually landed.
        let direction = tool_call.args.get("direction")
            .and_then(|v| v.as_str())
            .filter(|d| matches!(*d, "right" | "below" | "left" | "above"))
            .unwrap_or("right");

        // Clamp into the i32 range before narrowing: a bare `as i32` would silently
        // wrap a huge value into an unrelated distance.
        let distance = tool_call.args.get("distance")
            .and_then(|v| v.as_i64())
            .map(|d| d.clamp(i64::from(i32::MIN), i64::from(i32::MAX)) as i32)
            .unwrap_or(50);

        match controller.find_text_on_screen(text).await {
            Ok(Some(location)) => {
                // Saturating arithmetic: an extreme `distance` pins the coordinate
                // at the i32 limit instead of overflowing.
                let mid_x = location.x.saturating_add(location.width / 2);
                let mid_y = location.y.saturating_add(location.height / 2);

                // Calculate click position based on direction
                let (click_x, click_y) = match direction {
                    "below" => (
                        mid_x,
                        location.y.saturating_add(location.height).saturating_add(distance),
                    ),
                    "left" => (location.x.saturating_sub(distance), mid_y),
                    "above" => (mid_x, location.y.saturating_sub(distance)),
                    // "right" (also the normalised default)
                    _ => (
                        location.x.saturating_add(location.width).saturating_add(distance),
                        mid_y,
                    ),
                };

                match controller.click_at(click_x, click_y) {
                    Ok(_) => Ok(format!(
                        "✅ Clicked {} of '{}' at ({}, {})",
                        direction, text, click_x, click_y
                    )),
                    Err(e) => Ok(format!("❌ Failed to click: {}", e)),
                }
            }
            Ok(None) => Ok(format!("❌ Could not find '{}' on screen", text)),
            Err(e) => Ok(format!("❌ Error finding text: {}", e)),
        }
    } else {
        Ok("❌ Computer control not enabled. Set computer_control.enabled = true in config.".to_string())
    }
}
_ => { _ => {
warn!("Unknown tool: {}", tool_call.tool); warn!("Unknown tool: {}", tool_call.tool);
Ok(format!("❓ Unknown tool: {}", tool_call.tool)) Ok(format!("❓ Unknown tool: {}", tool_call.tool))

View File

@@ -882,6 +882,14 @@ impl LLMProvider for DatabricksProvider {
request.messages.len() request.messages.len()
); );
// Debug aid: log how many tools accompany the request and the names of the first five.
if let Some(tools) = request.tools.as_ref() {
    debug!("Request has {} tools", tools.len());
    tools
        .iter()
        .take(5)
        .for_each(|tool| debug!(" Tool: {}", tool.name));
}
let max_tokens = request.max_tokens.unwrap_or(self.max_tokens); let max_tokens = request.max_tokens.unwrap_or(self.max_tokens);
let temperature = request.temperature.unwrap_or(self.temperature); let temperature = request.temperature.unwrap_or(self.temperature);