more macax tooling
This commit is contained in:
@@ -1639,7 +1639,7 @@ Review the current state of the project and provide a concise critique focusing
|
|||||||
2. Whether the project compiles successfully
|
2. Whether the project compiles successfully
|
||||||
3. What requirements are missing or incorrect
|
3. What requirements are missing or incorrect
|
||||||
4. Specific improvements needed to satisfy requirements
|
4. Specific improvements needed to satisfy requirements
|
||||||
5. Use UI tools such as webdriver to test functionality thoroughly
|
5. Use UI tools such as webdriver or macax to test functionality thoroughly
|
||||||
|
|
||||||
CRITICAL INSTRUCTIONS:
|
CRITICAL INSTRUCTIONS:
|
||||||
1. You MUST use the final_output tool to provide your feedback
|
1. You MUST use the final_output tool to provide your feedback
|
||||||
@@ -1647,13 +1647,13 @@ CRITICAL INSTRUCTIONS:
|
|||||||
3. Focus ONLY on what needs to be fixed or improved
|
3. Focus ONLY on what needs to be fixed or improved
|
||||||
4. Do NOT include your analysis process, file contents, or compilation output in the summary
|
4. Do NOT include your analysis process, file contents, or compilation output in the summary
|
||||||
|
|
||||||
If the implementation thoroughly meets all requirements, compiles and is fully tested (especially UI flows) *WITHOUT* gaps or errors:
|
If the implementation thoroughly meets all requirements, compiles and is fully tested (especially UI flows) *WITHOUT* minor gaps or errors:
|
||||||
- Call final_output with summary: 'IMPLEMENTATION_APPROVED'
|
- Call final_output with summary: 'IMPLEMENTATION_APPROVED'
|
||||||
|
|
||||||
If improvements are needed:
|
If improvements are needed:
|
||||||
- Call final_output with a brief summary listing ONLY the specific issues to fix
|
- Call final_output with a brief summary listing ONLY the specific issues to fix
|
||||||
|
|
||||||
Remember: Be clear in your review and concise in your feedback. APPROVE if the implementation works and thoroughly fits the requirements (implementation > 95% complete). Be rigorous, especially by testing that all UI features work.",
|
Remember: Be clear in your review and concise in your feedback. APPROVE iff the implementation works and thoroughly fits the requirements (implementation > 95% complete). Be rigorous, especially by testing that all UI features work.",
|
||||||
requirements
|
requirements
|
||||||
);
|
);
|
||||||
|
|
||||||
|
|||||||
@@ -24,6 +24,12 @@ pub trait ComputerController: Send + Sync {
|
|||||||
// OCR operations
|
// OCR operations
|
||||||
async fn extract_text_from_screen(&self, region: Rect) -> Result<String>;
|
async fn extract_text_from_screen(&self, region: Rect) -> Result<String>;
|
||||||
async fn extract_text_from_image(&self, path: &str) -> Result<String>;
|
async fn extract_text_from_image(&self, path: &str) -> Result<String>;
|
||||||
|
async fn extract_text_with_locations(&self, path: &str) -> Result<Vec<TextLocation>>;
|
||||||
|
async fn find_text_on_screen(&self, search_text: &str) -> Result<Option<TextLocation>>;
|
||||||
|
|
||||||
|
// Mouse operations
|
||||||
|
fn move_mouse(&self, x: i32, y: i32) -> Result<()>;
|
||||||
|
fn click_at(&self, x: i32, y: i32) -> Result<()>;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Platform-specific constructor
|
// Platform-specific constructor
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
use crate::{ComputerController, types::Rect};
|
use crate::{ComputerController, types::{Rect, TextLocation}};
|
||||||
use anyhow::Result;
|
use anyhow::{Result, Context};
|
||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
use tesseract::Tesseract;
|
use tesseract::Tesseract;
|
||||||
@@ -122,4 +122,150 @@ impl ComputerController for MacOSController {
|
|||||||
|
|
||||||
Ok(text)
|
Ok(text)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async fn extract_text_with_locations(&self, path: &str) -> Result<Vec<TextLocation>> {
|
||||||
|
// For now, use tesseract CLI with TSV output to get bounding boxes
|
||||||
|
// This is a workaround since the Rust tesseract crate doesn't expose get_component_boxes
|
||||||
|
let output = std::process::Command::new("tesseract")
|
||||||
|
.arg(path)
|
||||||
|
.arg("stdout")
|
||||||
|
.arg("tsv")
|
||||||
|
.output()
|
||||||
|
.map_err(|e| anyhow::anyhow!("Failed to run tesseract: {}", e))?;
|
||||||
|
|
||||||
|
if !output.status.success() {
|
||||||
|
anyhow::bail!("Tesseract failed: {}", String::from_utf8_lossy(&output.stderr));
|
||||||
|
}
|
||||||
|
|
||||||
|
let tsv_text = String::from_utf8_lossy(&output.stdout);
|
||||||
|
let mut locations = Vec::new();
|
||||||
|
|
||||||
|
// Parse TSV output (skip header line)
|
||||||
|
for (i, line) in tsv_text.lines().enumerate() {
|
||||||
|
if i == 0 { continue; } // Skip header
|
||||||
|
|
||||||
|
let parts: Vec<&str> = line.split('\t').collect();
|
||||||
|
if parts.len() >= 12 {
|
||||||
|
// TSV format: level, page_num, block_num, par_num, line_num, word_num,
|
||||||
|
// left, top, width, height, conf, text
|
||||||
|
if let (Ok(x), Ok(y), Ok(w), Ok(h), Ok(conf), text) = (
|
||||||
|
parts[6].parse::<i32>(),
|
||||||
|
parts[7].parse::<i32>(),
|
||||||
|
parts[8].parse::<i32>(),
|
||||||
|
parts[9].parse::<i32>(),
|
||||||
|
parts[10].parse::<f32>(),
|
||||||
|
parts[11],
|
||||||
|
) {
|
||||||
|
let trimmed = text.trim();
|
||||||
|
if !trimmed.is_empty() && conf > 0.0 {
|
||||||
|
locations.push(TextLocation {
|
||||||
|
text: trimmed.to_string(),
|
||||||
|
x,
|
||||||
|
y,
|
||||||
|
width: w,
|
||||||
|
height: h,
|
||||||
|
confidence: conf / 100.0, // Convert from 0-100 to 0-1
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(locations)
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn find_text_on_screen(&self, search_text: &str) -> Result<Option<TextLocation>> {
|
||||||
|
// Take full screenshot
|
||||||
|
let home = std::env::var("HOME").unwrap_or_else(|_| "/tmp".to_string());
|
||||||
|
let temp_path = format!("{}/Desktop/g3_find_text_{}.png", home, uuid::Uuid::new_v4());
|
||||||
|
self.take_screenshot(&temp_path, None, None).await?;
|
||||||
|
|
||||||
|
// Extract all text with locations
|
||||||
|
let locations = self.extract_text_with_locations(&temp_path).await?;
|
||||||
|
|
||||||
|
// Clean up temp file
|
||||||
|
let _ = std::fs::remove_file(&temp_path);
|
||||||
|
|
||||||
|
// Find matching text (case-insensitive)
|
||||||
|
let search_lower = search_text.to_lowercase();
|
||||||
|
for location in locations {
|
||||||
|
if location.text.to_lowercase().contains(&search_lower) {
|
||||||
|
return Ok(Some(location));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(None)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn move_mouse(&self, x: i32, y: i32) -> Result<()> {
|
||||||
|
use core_graphics::event::{
|
||||||
|
CGEvent, CGEventTapLocation, CGEventType, CGMouseButton,
|
||||||
|
};
|
||||||
|
use core_graphics::event_source::{
|
||||||
|
CGEventSource, CGEventSourceStateID,
|
||||||
|
};
|
||||||
|
use core_graphics::geometry::CGPoint;
|
||||||
|
|
||||||
|
let source = CGEventSource::new(CGEventSourceStateID::HIDSystemState)
|
||||||
|
.ok().context("Failed to create event source")?;
|
||||||
|
|
||||||
|
let event = CGEvent::new_mouse_event(
|
||||||
|
source,
|
||||||
|
CGEventType::MouseMoved,
|
||||||
|
CGPoint::new(x as f64, y as f64),
|
||||||
|
CGMouseButton::Left,
|
||||||
|
).ok().context("Failed to create mouse event")?;
|
||||||
|
|
||||||
|
event.post(CGEventTapLocation::HID);
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn click_at(&self, x: i32, y: i32) -> Result<()> {
|
||||||
|
use core_graphics::event::{
|
||||||
|
CGEvent, CGEventTapLocation, CGEventType, CGMouseButton,
|
||||||
|
};
|
||||||
|
use core_graphics::event_source::{
|
||||||
|
CGEventSource, CGEventSourceStateID,
|
||||||
|
};
|
||||||
|
use core_graphics::geometry::CGPoint;
|
||||||
|
|
||||||
|
let source = CGEventSource::new(CGEventSourceStateID::HIDSystemState)
|
||||||
|
.ok().context("Failed to create event source")?;
|
||||||
|
|
||||||
|
let point = CGPoint::new(x as f64, y as f64);
|
||||||
|
|
||||||
|
// Move mouse to position first
|
||||||
|
let move_event = CGEvent::new_mouse_event(
|
||||||
|
source.clone(),
|
||||||
|
CGEventType::MouseMoved,
|
||||||
|
point,
|
||||||
|
CGMouseButton::Left,
|
||||||
|
).ok().context("Failed to create mouse move event")?;
|
||||||
|
move_event.post(CGEventTapLocation::HID);
|
||||||
|
|
||||||
|
std::thread::sleep(std::time::Duration::from_millis(100));
|
||||||
|
|
||||||
|
// Mouse down
|
||||||
|
let mouse_down = CGEvent::new_mouse_event(
|
||||||
|
source.clone(),
|
||||||
|
CGEventType::LeftMouseDown,
|
||||||
|
point,
|
||||||
|
CGMouseButton::Left,
|
||||||
|
).ok().context("Failed to create mouse down event")?;
|
||||||
|
mouse_down.post(CGEventTapLocation::HID);
|
||||||
|
|
||||||
|
std::thread::sleep(std::time::Duration::from_millis(50));
|
||||||
|
|
||||||
|
// Mouse up
|
||||||
|
let mouse_up = CGEvent::new_mouse_event(
|
||||||
|
source,
|
||||||
|
CGEventType::LeftMouseUp,
|
||||||
|
point,
|
||||||
|
CGMouseButton::Left,
|
||||||
|
).ok().context("Failed to create mouse up event")?;
|
||||||
|
mouse_up.post(CGEventTapLocation::HID);
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
}
|
}
|
||||||
@@ -7,3 +7,13 @@ pub struct Rect {
|
|||||||
pub width: i32,
|
pub width: i32,
|
||||||
pub height: i32,
|
pub height: i32,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct TextLocation {
|
||||||
|
pub text: String,
|
||||||
|
pub x: i32,
|
||||||
|
pub y: i32,
|
||||||
|
pub width: i32,
|
||||||
|
pub height: i32,
|
||||||
|
pub confidence: f32,
|
||||||
|
}
|
||||||
|
|||||||
@@ -1239,7 +1239,7 @@ Template:
|
|||||||
// Check if provider supports native tool calling and add tools if so
|
// Check if provider supports native tool calling and add tools if so
|
||||||
let provider = self.providers.get(None)?;
|
let provider = self.providers.get(None)?;
|
||||||
let tools = if provider.has_native_tool_calling() {
|
let tools = if provider.has_native_tool_calling() {
|
||||||
Some(Self::create_tool_definitions(self.config.webdriver.enabled, self.config.macax.enabled))
|
Some(Self::create_tool_definitions(self.config.webdriver.enabled, self.config.macax.enabled, self.config.computer_control.enabled))
|
||||||
} else {
|
} else {
|
||||||
None
|
None
|
||||||
};
|
};
|
||||||
@@ -1700,7 +1700,7 @@ Template:
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Create tool definitions for native tool calling providers
|
/// Create tool definitions for native tool calling providers
|
||||||
fn create_tool_definitions(enable_webdriver: bool, enable_macax: bool) -> Vec<Tool> {
|
fn create_tool_definitions(enable_webdriver: bool, enable_macax: bool, enable_computer_control: bool) -> Vec<Tool> {
|
||||||
let mut tools = vec![
|
let mut tools = vec![
|
||||||
Tool {
|
Tool {
|
||||||
name: "shell".to_string(),
|
name: "shell".to_string(),
|
||||||
@@ -2280,6 +2280,64 @@ Template:
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Add vision-guided tools (requires computer control)
|
||||||
|
if enable_computer_control {
|
||||||
|
// Add vision-guided tools
|
||||||
|
tools.push(Tool {
|
||||||
|
name: "vision_find_text".to_string(),
|
||||||
|
description: "Find text on screen and return its location (useful for locating UI elements)".to_string(),
|
||||||
|
input_schema: json!({
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"text": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "The text to search for on screen"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["text"]
|
||||||
|
}),
|
||||||
|
});
|
||||||
|
|
||||||
|
tools.push(Tool {
|
||||||
|
name: "vision_click_text".to_string(),
|
||||||
|
description: "Find text on screen and click on it (useful for clicking buttons, links, menu items)".to_string(),
|
||||||
|
input_schema: json!({
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"text": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "The text to click on (e.g., 'Submit', 'OK', 'Cancel', '+')"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["text"]
|
||||||
|
}),
|
||||||
|
});
|
||||||
|
|
||||||
|
tools.push(Tool {
|
||||||
|
name: "vision_click_near_text".to_string(),
|
||||||
|
description: "Find text on screen and click near it (useful for clicking text fields next to labels)".to_string(),
|
||||||
|
input_schema: json!({
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"text": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "The label text to find (e.g., 'Name:', 'Email:', 'Task:')"
|
||||||
|
},
|
||||||
|
"direction": {
|
||||||
|
"type": "string",
|
||||||
|
"enum": ["right", "below", "left", "above"],
|
||||||
|
"description": "Direction to click relative to the text (default: right)"
|
||||||
|
},
|
||||||
|
"distance": {
|
||||||
|
"type": "integer",
|
||||||
|
"description": "Distance in pixels from the text (default: 50)"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["text"]
|
||||||
|
}),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
tools
|
tools
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -2844,7 +2902,7 @@ Template:
|
|||||||
|
|
||||||
// Ensure tools are included for native providers in subsequent iterations
|
// Ensure tools are included for native providers in subsequent iterations
|
||||||
if provider.has_native_tool_calling() {
|
if provider.has_native_tool_calling() {
|
||||||
request.tools = Some(Self::create_tool_definitions(self.config.webdriver.enabled, self.config.macax.enabled));
|
request.tools = Some(Self::create_tool_definitions(self.config.webdriver.enabled, self.config.macax.enabled, self.config.computer_control.enabled));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Only add to full_response if we haven't already added it
|
// Only add to full_response if we haven't already added it
|
||||||
@@ -4529,6 +4587,97 @@ Template:
|
|||||||
Err(e) => Ok(format!("❌ Failed to focus element: {}", e)),
|
Err(e) => Ok(format!("❌ Failed to focus element: {}", e)),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
"vision_find_text" => {
|
||||||
|
debug!("Processing vision_find_text tool call");
|
||||||
|
|
||||||
|
if let Some(controller) = &self.computer_controller {
|
||||||
|
let text = tool_call.args.get("text")
|
||||||
|
.and_then(|v| v.as_str())
|
||||||
|
.ok_or_else(|| anyhow::anyhow!("Missing text parameter"))?;
|
||||||
|
|
||||||
|
match controller.find_text_on_screen(text).await {
|
||||||
|
Ok(Some(location)) => {
|
||||||
|
Ok(format!(
|
||||||
|
"✅ Found '{}' at position ({}, {}) with size {}x{} (confidence: {:.0}%)",
|
||||||
|
location.text, location.x, location.y, location.width, location.height,
|
||||||
|
location.confidence * 100.0
|
||||||
|
))
|
||||||
|
}
|
||||||
|
Ok(None) => Ok(format!("❌ Could not find '{}' on screen", text)),
|
||||||
|
Err(e) => Ok(format!("❌ Error finding text: {}", e)),
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
Ok("❌ Computer control not enabled. Set computer_control.enabled = true in config.".to_string())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
"vision_click_text" => {
|
||||||
|
debug!("Processing vision_click_text tool call");
|
||||||
|
|
||||||
|
if let Some(controller) = &self.computer_controller {
|
||||||
|
let text = tool_call.args.get("text")
|
||||||
|
.and_then(|v| v.as_str())
|
||||||
|
.ok_or_else(|| anyhow::anyhow!("Missing text parameter"))?;
|
||||||
|
|
||||||
|
match controller.find_text_on_screen(text).await {
|
||||||
|
Ok(Some(location)) => {
|
||||||
|
// Click on center of text
|
||||||
|
let center_x = location.x + location.width / 2;
|
||||||
|
let center_y = location.y + location.height / 2;
|
||||||
|
|
||||||
|
match controller.click_at(center_x, center_y) {
|
||||||
|
Ok(_) => Ok(format!("✅ Clicked on '{}' at ({}, {})", text, center_x, center_y)),
|
||||||
|
Err(e) => Ok(format!("❌ Failed to click: {}", e)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(None) => Ok(format!("❌ Could not find '{}' on screen", text)),
|
||||||
|
Err(e) => Ok(format!("❌ Error finding text: {}", e)),
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
Ok("❌ Computer control not enabled. Set computer_control.enabled = true in config.".to_string())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
"vision_click_near_text" => {
|
||||||
|
debug!("Processing vision_click_near_text tool call");
|
||||||
|
|
||||||
|
if let Some(controller) = &self.computer_controller {
|
||||||
|
let text = tool_call.args.get("text")
|
||||||
|
.and_then(|v| v.as_str())
|
||||||
|
.ok_or_else(|| anyhow::anyhow!("Missing text parameter"))?;
|
||||||
|
|
||||||
|
let direction = tool_call.args.get("direction")
|
||||||
|
.and_then(|v| v.as_str())
|
||||||
|
.unwrap_or("right");
|
||||||
|
|
||||||
|
let distance = tool_call.args.get("distance")
|
||||||
|
.and_then(|v| v.as_i64())
|
||||||
|
.unwrap_or(50) as i32;
|
||||||
|
|
||||||
|
match controller.find_text_on_screen(text).await {
|
||||||
|
Ok(Some(location)) => {
|
||||||
|
// Calculate click position based on direction
|
||||||
|
let (click_x, click_y) = match direction {
|
||||||
|
"right" => (location.x + location.width + distance, location.y + location.height / 2),
|
||||||
|
"below" => (location.x + location.width / 2, location.y + location.height + distance),
|
||||||
|
"left" => (location.x - distance, location.y + location.height / 2),
|
||||||
|
"above" => (location.x + location.width / 2, location.y - distance),
|
||||||
|
_ => (location.x + location.width + distance, location.y + location.height / 2),
|
||||||
|
};
|
||||||
|
|
||||||
|
match controller.click_at(click_x, click_y) {
|
||||||
|
Ok(_) => Ok(format!(
|
||||||
|
"✅ Clicked {} of '{}' at ({}, {})",
|
||||||
|
direction, text, click_x, click_y
|
||||||
|
)),
|
||||||
|
Err(e) => Ok(format!("❌ Failed to click: {}", e)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(None) => Ok(format!("❌ Could not find '{}' on screen", text)),
|
||||||
|
Err(e) => Ok(format!("❌ Error finding text: {}", e)),
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
Ok("❌ Computer control not enabled. Set computer_control.enabled = true in config.".to_string())
|
||||||
|
}
|
||||||
|
}
|
||||||
_ => {
|
_ => {
|
||||||
warn!("Unknown tool: {}", tool_call.tool);
|
warn!("Unknown tool: {}", tool_call.tool);
|
||||||
Ok(format!("❓ Unknown tool: {}", tool_call.tool))
|
Ok(format!("❓ Unknown tool: {}", tool_call.tool))
|
||||||
|
|||||||
@@ -882,6 +882,14 @@ impl LLMProvider for DatabricksProvider {
|
|||||||
request.messages.len()
|
request.messages.len()
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// Debug: Log tool count
|
||||||
|
if let Some(ref tools) = request.tools {
|
||||||
|
debug!("Request has {} tools", tools.len());
|
||||||
|
for tool in tools.iter().take(5) {
|
||||||
|
debug!(" Tool: {}", tool.name);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
let max_tokens = request.max_tokens.unwrap_or(self.max_tokens);
|
let max_tokens = request.max_tokens.unwrap_or(self.max_tokens);
|
||||||
let temperature = request.temperature.unwrap_or(self.temperature);
|
let temperature = request.temperature.unwrap_or(self.temperature);
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user