more macax tooling
This commit is contained in:
@@ -1639,7 +1639,7 @@ Review the current state of the project and provide a concise critique focusing
|
||||
2. Whether the project compiles successfully
|
||||
3. What requirements are missing or incorrect
|
||||
4. Specific improvements needed to satisfy requirements
|
||||
5. Use UI tools such as webdriver to test functionality thoroughly
|
||||
5. Use UI tools such as webdriver or macax to test functionality thoroughly
|
||||
|
||||
CRITICAL INSTRUCTIONS:
|
||||
1. You MUST use the final_output tool to provide your feedback
|
||||
@@ -1647,13 +1647,13 @@ CRITICAL INSTRUCTIONS:
|
||||
3. Focus ONLY on what needs to be fixed or improved
|
||||
4. Do NOT include your analysis process, file contents, or compilation output in the summary
|
||||
|
||||
If the implementation thoroughly meets all requirements, compiles and is fully tested (especially UI flows) *WITHOUT* gaps or errors:
|
||||
If the implementation thoroughly meets all requirements, compiles and is fully tested (especially UI flows) *WITHOUT* minor gaps or errors:
|
||||
- Call final_output with summary: 'IMPLEMENTATION_APPROVED'
|
||||
|
||||
If improvements are needed:
|
||||
- Call final_output with a brief summary listing ONLY the specific issues to fix
|
||||
|
||||
Remember: Be clear in your review and concise in your feedback. APPROVE if the implementation works and thoroughly fits the requirements (implementation > 95% complete). Be rigorous, especially by testing that all UI features work.",
|
||||
Remember: Be clear in your review and concise in your feedback. APPROVE iff the implementation works and thoroughly fits the requirements (implementation > 95% complete). Be rigorous, especially by testing that all UI features work.",
|
||||
requirements
|
||||
);
|
||||
|
||||
|
||||
@@ -24,6 +24,12 @@ pub trait ComputerController: Send + Sync {
|
||||
// OCR operations
|
||||
async fn extract_text_from_screen(&self, region: Rect) -> Result<String>;
|
||||
async fn extract_text_from_image(&self, path: &str) -> Result<String>;
|
||||
async fn extract_text_with_locations(&self, path: &str) -> Result<Vec<TextLocation>>;
|
||||
async fn find_text_on_screen(&self, search_text: &str) -> Result<Option<TextLocation>>;
|
||||
|
||||
// Mouse operations
|
||||
fn move_mouse(&self, x: i32, y: i32) -> Result<()>;
|
||||
fn click_at(&self, x: i32, y: i32) -> Result<()>;
|
||||
}
|
||||
|
||||
// Platform-specific constructor
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use crate::{ComputerController, types::Rect};
|
||||
use anyhow::Result;
|
||||
use crate::{ComputerController, types::{Rect, TextLocation}};
|
||||
use anyhow::{Result, Context};
|
||||
use async_trait::async_trait;
|
||||
use std::path::Path;
|
||||
use tesseract::Tesseract;
|
||||
@@ -122,4 +122,150 @@ impl ComputerController for MacOSController {
|
||||
|
||||
Ok(text)
|
||||
}
|
||||
|
||||
async fn extract_text_with_locations(&self, path: &str) -> Result<Vec<TextLocation>> {
|
||||
// For now, use tesseract CLI with TSV output to get bounding boxes
|
||||
// This is a workaround since the Rust tesseract crate doesn't expose get_component_boxes
|
||||
let output = std::process::Command::new("tesseract")
|
||||
.arg(path)
|
||||
.arg("stdout")
|
||||
.arg("tsv")
|
||||
.output()
|
||||
.map_err(|e| anyhow::anyhow!("Failed to run tesseract: {}", e))?;
|
||||
|
||||
if !output.status.success() {
|
||||
anyhow::bail!("Tesseract failed: {}", String::from_utf8_lossy(&output.stderr));
|
||||
}
|
||||
|
||||
let tsv_text = String::from_utf8_lossy(&output.stdout);
|
||||
let mut locations = Vec::new();
|
||||
|
||||
// Parse TSV output (skip header line)
|
||||
for (i, line) in tsv_text.lines().enumerate() {
|
||||
if i == 0 { continue; } // Skip header
|
||||
|
||||
let parts: Vec<&str> = line.split('\t').collect();
|
||||
if parts.len() >= 12 {
|
||||
// TSV format: level, page_num, block_num, par_num, line_num, word_num,
|
||||
// left, top, width, height, conf, text
|
||||
if let (Ok(x), Ok(y), Ok(w), Ok(h), Ok(conf), text) = (
|
||||
parts[6].parse::<i32>(),
|
||||
parts[7].parse::<i32>(),
|
||||
parts[8].parse::<i32>(),
|
||||
parts[9].parse::<i32>(),
|
||||
parts[10].parse::<f32>(),
|
||||
parts[11],
|
||||
) {
|
||||
let trimmed = text.trim();
|
||||
if !trimmed.is_empty() && conf > 0.0 {
|
||||
locations.push(TextLocation {
|
||||
text: trimmed.to_string(),
|
||||
x,
|
||||
y,
|
||||
width: w,
|
||||
height: h,
|
||||
confidence: conf / 100.0, // Convert from 0-100 to 0-1
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(locations)
|
||||
}
|
||||
|
||||
async fn find_text_on_screen(&self, search_text: &str) -> Result<Option<TextLocation>> {
|
||||
// Take full screenshot
|
||||
let home = std::env::var("HOME").unwrap_or_else(|_| "/tmp".to_string());
|
||||
let temp_path = format!("{}/Desktop/g3_find_text_{}.png", home, uuid::Uuid::new_v4());
|
||||
self.take_screenshot(&temp_path, None, None).await?;
|
||||
|
||||
// Extract all text with locations
|
||||
let locations = self.extract_text_with_locations(&temp_path).await?;
|
||||
|
||||
// Clean up temp file
|
||||
let _ = std::fs::remove_file(&temp_path);
|
||||
|
||||
// Find matching text (case-insensitive)
|
||||
let search_lower = search_text.to_lowercase();
|
||||
for location in locations {
|
||||
if location.text.to_lowercase().contains(&search_lower) {
|
||||
return Ok(Some(location));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
fn move_mouse(&self, x: i32, y: i32) -> Result<()> {
|
||||
use core_graphics::event::{
|
||||
CGEvent, CGEventTapLocation, CGEventType, CGMouseButton,
|
||||
};
|
||||
use core_graphics::event_source::{
|
||||
CGEventSource, CGEventSourceStateID,
|
||||
};
|
||||
use core_graphics::geometry::CGPoint;
|
||||
|
||||
let source = CGEventSource::new(CGEventSourceStateID::HIDSystemState)
|
||||
.ok().context("Failed to create event source")?;
|
||||
|
||||
let event = CGEvent::new_mouse_event(
|
||||
source,
|
||||
CGEventType::MouseMoved,
|
||||
CGPoint::new(x as f64, y as f64),
|
||||
CGMouseButton::Left,
|
||||
).ok().context("Failed to create mouse event")?;
|
||||
|
||||
event.post(CGEventTapLocation::HID);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn click_at(&self, x: i32, y: i32) -> Result<()> {
|
||||
use core_graphics::event::{
|
||||
CGEvent, CGEventTapLocation, CGEventType, CGMouseButton,
|
||||
};
|
||||
use core_graphics::event_source::{
|
||||
CGEventSource, CGEventSourceStateID,
|
||||
};
|
||||
use core_graphics::geometry::CGPoint;
|
||||
|
||||
let source = CGEventSource::new(CGEventSourceStateID::HIDSystemState)
|
||||
.ok().context("Failed to create event source")?;
|
||||
|
||||
let point = CGPoint::new(x as f64, y as f64);
|
||||
|
||||
// Move mouse to position first
|
||||
let move_event = CGEvent::new_mouse_event(
|
||||
source.clone(),
|
||||
CGEventType::MouseMoved,
|
||||
point,
|
||||
CGMouseButton::Left,
|
||||
).ok().context("Failed to create mouse move event")?;
|
||||
move_event.post(CGEventTapLocation::HID);
|
||||
|
||||
std::thread::sleep(std::time::Duration::from_millis(100));
|
||||
|
||||
// Mouse down
|
||||
let mouse_down = CGEvent::new_mouse_event(
|
||||
source.clone(),
|
||||
CGEventType::LeftMouseDown,
|
||||
point,
|
||||
CGMouseButton::Left,
|
||||
).ok().context("Failed to create mouse down event")?;
|
||||
mouse_down.post(CGEventTapLocation::HID);
|
||||
|
||||
std::thread::sleep(std::time::Duration::from_millis(50));
|
||||
|
||||
// Mouse up
|
||||
let mouse_up = CGEvent::new_mouse_event(
|
||||
source,
|
||||
CGEventType::LeftMouseUp,
|
||||
point,
|
||||
CGMouseButton::Left,
|
||||
).ok().context("Failed to create mouse up event")?;
|
||||
mouse_up.post(CGEventTapLocation::HID);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -7,3 +7,13 @@ pub struct Rect {
|
||||
pub width: i32,
|
||||
pub height: i32,
|
||||
}
|
||||
|
||||
/// A piece of text recognised by OCR, together with its bounding box and the
/// recognition confidence.
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct TextLocation {
|
||||
/// The recognised text (the macOS implementation stores it trimmed).
pub text: String,
|
||||
/// Horizontal position of the bounding box; filled from tesseract's `left`
/// column by the macOS implementation.
pub x: i32,
|
||||
/// Vertical position of the bounding box; filled from tesseract's `top`
/// column by the macOS implementation.
pub y: i32,
|
||||
/// Width of the bounding box.
pub width: i32,
|
||||
/// Height of the bounding box.
pub height: i32,
|
||||
/// Recognition confidence; the macOS implementation divides tesseract's
/// 0-100 value by 100 before storing it here.
pub confidence: f32,
|
||||
}
|
||||
|
||||
@@ -1239,7 +1239,7 @@ Template:
|
||||
// Check if provider supports native tool calling and add tools if so
|
||||
let provider = self.providers.get(None)?;
|
||||
let tools = if provider.has_native_tool_calling() {
|
||||
Some(Self::create_tool_definitions(self.config.webdriver.enabled, self.config.macax.enabled))
|
||||
Some(Self::create_tool_definitions(self.config.webdriver.enabled, self.config.macax.enabled, self.config.computer_control.enabled))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
@@ -1700,7 +1700,7 @@ Template:
|
||||
}
|
||||
|
||||
/// Create tool definitions for native tool calling providers
|
||||
fn create_tool_definitions(enable_webdriver: bool, enable_macax: bool) -> Vec<Tool> {
|
||||
fn create_tool_definitions(enable_webdriver: bool, enable_macax: bool, enable_computer_control: bool) -> Vec<Tool> {
|
||||
let mut tools = vec![
|
||||
Tool {
|
||||
name: "shell".to_string(),
|
||||
@@ -2280,6 +2280,64 @@ Template:
|
||||
});
|
||||
}
|
||||
|
||||
// Add vision-guided tools (requires computer control)
|
||||
if enable_computer_control {
|
||||
// Add vision-guided tools
|
||||
tools.push(Tool {
|
||||
name: "vision_find_text".to_string(),
|
||||
description: "Find text on screen and return its location (useful for locating UI elements)".to_string(),
|
||||
input_schema: json!({
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"text": {
|
||||
"type": "string",
|
||||
"description": "The text to search for on screen"
|
||||
}
|
||||
},
|
||||
"required": ["text"]
|
||||
}),
|
||||
});
|
||||
|
||||
tools.push(Tool {
|
||||
name: "vision_click_text".to_string(),
|
||||
description: "Find text on screen and click on it (useful for clicking buttons, links, menu items)".to_string(),
|
||||
input_schema: json!({
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"text": {
|
||||
"type": "string",
|
||||
"description": "The text to click on (e.g., 'Submit', 'OK', 'Cancel', '+')"
|
||||
}
|
||||
},
|
||||
"required": ["text"]
|
||||
}),
|
||||
});
|
||||
|
||||
tools.push(Tool {
|
||||
name: "vision_click_near_text".to_string(),
|
||||
description: "Find text on screen and click near it (useful for clicking text fields next to labels)".to_string(),
|
||||
input_schema: json!({
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"text": {
|
||||
"type": "string",
|
||||
"description": "The label text to find (e.g., 'Name:', 'Email:', 'Task:')"
|
||||
},
|
||||
"direction": {
|
||||
"type": "string",
|
||||
"enum": ["right", "below", "left", "above"],
|
||||
"description": "Direction to click relative to the text (default: right)"
|
||||
},
|
||||
"distance": {
|
||||
"type": "integer",
|
||||
"description": "Distance in pixels from the text (default: 50)"
|
||||
}
|
||||
},
|
||||
"required": ["text"]
|
||||
}),
|
||||
});
|
||||
}
|
||||
|
||||
tools
|
||||
}
|
||||
|
||||
@@ -2844,7 +2902,7 @@ Template:
|
||||
|
||||
// Ensure tools are included for native providers in subsequent iterations
|
||||
if provider.has_native_tool_calling() {
|
||||
request.tools = Some(Self::create_tool_definitions(self.config.webdriver.enabled, self.config.macax.enabled));
|
||||
request.tools = Some(Self::create_tool_definitions(self.config.webdriver.enabled, self.config.macax.enabled, self.config.computer_control.enabled));
|
||||
}
|
||||
|
||||
// Only add to full_response if we haven't already added it
|
||||
@@ -4529,6 +4587,97 @@ Template:
|
||||
Err(e) => Ok(format!("❌ Failed to focus element: {}", e)),
|
||||
}
|
||||
}
|
||||
"vision_find_text" => {
|
||||
debug!("Processing vision_find_text tool call");
|
||||
|
||||
if let Some(controller) = &self.computer_controller {
|
||||
let text = tool_call.args.get("text")
|
||||
.and_then(|v| v.as_str())
|
||||
.ok_or_else(|| anyhow::anyhow!("Missing text parameter"))?;
|
||||
|
||||
match controller.find_text_on_screen(text).await {
|
||||
Ok(Some(location)) => {
|
||||
Ok(format!(
|
||||
"✅ Found '{}' at position ({}, {}) with size {}x{} (confidence: {:.0}%)",
|
||||
location.text, location.x, location.y, location.width, location.height,
|
||||
location.confidence * 100.0
|
||||
))
|
||||
}
|
||||
Ok(None) => Ok(format!("❌ Could not find '{}' on screen", text)),
|
||||
Err(e) => Ok(format!("❌ Error finding text: {}", e)),
|
||||
}
|
||||
} else {
|
||||
Ok("❌ Computer control not enabled. Set computer_control.enabled = true in config.".to_string())
|
||||
}
|
||||
}
|
||||
"vision_click_text" => {
|
||||
debug!("Processing vision_click_text tool call");
|
||||
|
||||
if let Some(controller) = &self.computer_controller {
|
||||
let text = tool_call.args.get("text")
|
||||
.and_then(|v| v.as_str())
|
||||
.ok_or_else(|| anyhow::anyhow!("Missing text parameter"))?;
|
||||
|
||||
match controller.find_text_on_screen(text).await {
|
||||
Ok(Some(location)) => {
|
||||
// Click on center of text
|
||||
let center_x = location.x + location.width / 2;
|
||||
let center_y = location.y + location.height / 2;
|
||||
|
||||
match controller.click_at(center_x, center_y) {
|
||||
Ok(_) => Ok(format!("✅ Clicked on '{}' at ({}, {})", text, center_x, center_y)),
|
||||
Err(e) => Ok(format!("❌ Failed to click: {}", e)),
|
||||
}
|
||||
}
|
||||
Ok(None) => Ok(format!("❌ Could not find '{}' on screen", text)),
|
||||
Err(e) => Ok(format!("❌ Error finding text: {}", e)),
|
||||
}
|
||||
} else {
|
||||
Ok("❌ Computer control not enabled. Set computer_control.enabled = true in config.".to_string())
|
||||
}
|
||||
}
|
||||
"vision_click_near_text" => {
|
||||
debug!("Processing vision_click_near_text tool call");
|
||||
|
||||
if let Some(controller) = &self.computer_controller {
|
||||
let text = tool_call.args.get("text")
|
||||
.and_then(|v| v.as_str())
|
||||
.ok_or_else(|| anyhow::anyhow!("Missing text parameter"))?;
|
||||
|
||||
let direction = tool_call.args.get("direction")
|
||||
.and_then(|v| v.as_str())
|
||||
.unwrap_or("right");
|
||||
|
||||
let distance = tool_call.args.get("distance")
|
||||
.and_then(|v| v.as_i64())
|
||||
.unwrap_or(50) as i32;
|
||||
|
||||
match controller.find_text_on_screen(text).await {
|
||||
Ok(Some(location)) => {
|
||||
// Calculate click position based on direction
|
||||
let (click_x, click_y) = match direction {
|
||||
"right" => (location.x + location.width + distance, location.y + location.height / 2),
|
||||
"below" => (location.x + location.width / 2, location.y + location.height + distance),
|
||||
"left" => (location.x - distance, location.y + location.height / 2),
|
||||
"above" => (location.x + location.width / 2, location.y - distance),
|
||||
_ => (location.x + location.width + distance, location.y + location.height / 2),
|
||||
};
|
||||
|
||||
match controller.click_at(click_x, click_y) {
|
||||
Ok(_) => Ok(format!(
|
||||
"✅ Clicked {} of '{}' at ({}, {})",
|
||||
direction, text, click_x, click_y
|
||||
)),
|
||||
Err(e) => Ok(format!("❌ Failed to click: {}", e)),
|
||||
}
|
||||
}
|
||||
Ok(None) => Ok(format!("❌ Could not find '{}' on screen", text)),
|
||||
Err(e) => Ok(format!("❌ Error finding text: {}", e)),
|
||||
}
|
||||
} else {
|
||||
Ok("❌ Computer control not enabled. Set computer_control.enabled = true in config.".to_string())
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
warn!("Unknown tool: {}", tool_call.tool);
|
||||
Ok(format!("❓ Unknown tool: {}", tool_call.tool))
|
||||
|
||||
@@ -882,6 +882,14 @@ impl LLMProvider for DatabricksProvider {
|
||||
request.messages.len()
|
||||
);
|
||||
|
||||
// Debug: Log tool count
|
||||
if let Some(ref tools) = request.tools {
|
||||
debug!("Request has {} tools", tools.len());
|
||||
for tool in tools.iter().take(5) {
|
||||
debug!(" Tool: {}", tool.name);
|
||||
}
|
||||
}
|
||||
|
||||
let max_tokens = request.max_tokens.unwrap_or(self.max_tokens);
|
||||
let temperature = request.temperature.unwrap_or(self.temperature);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user