Remove vision tools (except take_screenshot) and macax tools

Vision tools removed:
- extract_text (OCR from image files)
- extract_text_with_boxes (OCR with bounding boxes)
- vision_find_text (find text in app windows)
- vision_click_text (find and click on text)
- vision_click_near_text (click near text labels)

macax tools removed:
- macax_list_apps
- macax_get_frontmost_app
- macax_activate_app
- macax_press_key
- macax_type_text

The LLM can now read images directly via the read_image tool.
take_screenshot is retained for capturing application windows.

Files deleted:
- crates/g3-core/src/tools/vision.rs
- crates/g3-core/src/tools/macax.rs
- docs/macax-tools.md

Updated tool counts: 12 core + 15 webdriver = 27 total
This commit is contained in:
Dhanji R. Prasanna
2026-01-03 17:38:25 +11:00
parent 29e263ac49
commit 386176899e
19 changed files with 15 additions and 1408 deletions

View File

@@ -108,8 +108,6 @@ pub struct Agent<W: UiWriter> {
>,
>,
webdriver_process: std::sync::Arc<tokio::sync::RwLock<Option<tokio::process::Child>>>,
macax_controller:
std::sync::Arc<tokio::sync::RwLock<Option<g3_computer_control::MacAxController>>>,
tool_call_count: usize,
requirements_sha: Option<String>,
/// Working directory for tool execution (set by --codebase-fast-start)
@@ -389,9 +387,6 @@ impl<W: UiWriter> Agent<W> {
None
};
// Capture macax_enabled before moving config
let macax_enabled = config.macax.enabled;
Ok(Self {
providers,
context_window,
@@ -411,13 +406,6 @@ impl<W: UiWriter> Agent<W> {
computer_controller,
webdriver_session: std::sync::Arc::new(tokio::sync::RwLock::new(None)),
webdriver_process: std::sync::Arc::new(tokio::sync::RwLock::new(None)),
macax_controller: {
std::sync::Arc::new(tokio::sync::RwLock::new(if macax_enabled {
Some(g3_computer_control::MacAxController::new()?)
} else {
None
}))
},
tool_call_count: 0,
requirements_sha: None,
working_dir: None,
@@ -921,7 +909,6 @@ impl<W: UiWriter> Agent<W> {
Some(tool_definitions::create_tool_definitions(
tool_definitions::ToolConfig::new(
self.config.webdriver.enabled,
self.config.macax.enabled,
self.config.computer_control.enabled,
)))
} else {
@@ -2674,7 +2661,6 @@ impl<W: UiWriter> Agent<W> {
request.tools = Some(tool_definitions::create_tool_definitions(
tool_definitions::ToolConfig::new(
self.config.webdriver.enabled,
self.config.macax.enabled,
self.config.computer_control.enabled,
)));
}
@@ -3289,7 +3275,6 @@ impl<W: UiWriter> Agent<W> {
computer_controller: self.computer_controller.as_ref(),
webdriver_session: &self.webdriver_session,
webdriver_process: &self.webdriver_process,
macax_controller: &self.macax_controller,
background_process_manager: &self.background_process_manager,
todo_content: &self.todo_content,
pending_images: &mut self.pending_images,

View File

@@ -11,15 +11,13 @@ use serde_json::json;
#[derive(Debug, Clone, Copy, Default)]
pub struct ToolConfig {
pub webdriver: bool,
pub macax: bool,
pub computer_control: bool,
}
impl ToolConfig {
pub fn new(webdriver: bool, macax: bool, computer_control: bool) -> Self {
pub fn new(webdriver: bool, computer_control: bool) -> Self {
Self {
webdriver,
macax,
computer_control,
}
}
@@ -36,14 +34,6 @@ pub fn create_tool_definitions(config: ToolConfig) -> Vec<Tool> {
tools.extend(create_webdriver_tools());
}
if config.macax {
tools.extend(create_macax_tools());
}
if config.computer_control {
tools.extend(create_computer_control_tools());
}
tools
}
@@ -88,7 +78,7 @@ fn create_core_tools() -> Vec<Tool> {
},
Tool {
name: "read_file".to_string(),
description: "Read the contents of a file. For image files (png, jpg, jpeg, gif, bmp, tiff, webp), automatically extracts text using OCR. For text files, optionally read a specific character range.".to_string(),
description: "Read the contents of a file. Optionally read a specific character range.".to_string(),
input_schema: json!({
"type": "object",
"properties": {
@@ -208,19 +198,6 @@ fn create_core_tools() -> Vec<Tool> {
"required": ["path", "window_id"]
}),
},
Tool {
name: "extract_text".to_string(),
description: "Extract text from an image file using OCR. For extracting text from a specific window, use vision_find_text instead which automatically handles window capture.".to_string(),
input_schema: json!({
"type": "object",
"properties": {
"path": {
"type": "string",
"description": "Path to image file (optional if region is provided)"
},
}
}),
},
Tool {
name: "todo_read".to_string(),
description: "Read your current TODO list from todo.g3.md file in the session directory. Shows what tasks are planned and their status. Call this at the start of multi-step tasks to check for existing plans, and during execution to review progress before updating. TODO lists are scoped to the current session.".to_string(),
@@ -476,174 +453,6 @@ fn create_webdriver_tools() -> Vec<Tool> {
]
}
/// Create macOS Accessibility tools
fn create_macax_tools() -> Vec<Tool> {
vec![
Tool {
name: "macax_list_apps".to_string(),
description: "List all running applications that can be controlled via macOS Accessibility API".to_string(),
input_schema: json!({
"type": "object",
"properties": {},
"required": []
}),
},
Tool {
name: "macax_get_frontmost_app".to_string(),
description: "Get the name of the currently active (frontmost) application".to_string(),
input_schema: json!({
"type": "object",
"properties": {},
"required": []
}),
},
Tool {
name: "macax_activate_app".to_string(),
description: "Bring an application to the front (activate it)".to_string(),
input_schema: json!({
"type": "object",
"properties": {
"app_name": {
"type": "string",
"description": "Name of the application to activate (e.g., 'Safari', 'TextEdit')"
}
},
"required": ["app_name"]
}),
},
Tool {
name: "macax_press_key".to_string(),
description: "Press a keyboard key or shortcut in an application (e.g., Cmd+S to save)".to_string(),
input_schema: json!({
"type": "object",
"properties": {
"app_name": {
"type": "string",
"description": "Name of the application"
},
"key": {
"type": "string",
"description": "Key to press (e.g., 's', 'return', 'tab')"
},
"modifiers": {
"type": "array",
"items": {
"type": "string"
},
"description": "Modifier keys (e.g., ['command', 'shift'])"
}
},
"required": ["app_name", "key"]
}),
},
Tool {
name: "macax_type_text".to_string(),
description: "Type arbitrary text into the currently focused element in an application (supports unicode, emojis, etc.)".to_string(),
input_schema: json!({
"type": "object",
"properties": {
"app_name": {
"type": "string",
"description": "Name of the application"
},
"text": {
"type": "string",
"description": "Text to type (can include unicode, emojis, special characters)"
}
},
"required": ["app_name", "text"]
}),
},
Tool {
name: "extract_text_with_boxes".to_string(),
description: "Extract all text from an image file with bounding box coordinates for each text element. Returns JSON array with text, position (x, y), size (width, height), and confidence for each detected text. Uses Apple Vision Framework for precise sub-pixel accuracy.".to_string(),
input_schema: json!({
"type": "object",
"properties": {
"path": {
"type": "string",
"description": "Path to image file to extract text from"
},
"app_name": {
"type": "string",
"description": "Optional: Name of application to screenshot first (e.g., 'Safari', 'Things3'). If provided, takes screenshot of app before extracting text."
}
},
"required": ["path"]
}),
},
]
}
/// Create computer control / vision-guided tools
fn create_computer_control_tools() -> Vec<Tool> {
vec![
Tool {
name: "vision_find_text".to_string(),
description: "Find text in a specific application window and return its location with bounding box coordinates (x, y, width, height) and confidence score. Useful for locating UI elements. Uses Apple Vision Framework for precise sub-pixel accuracy.".to_string(),
input_schema: json!({
"type": "object",
"properties": {
"app_name": {
"type": "string",
"description": "Name of the application to search in (e.g., 'Things3', 'Safari', 'TextEdit')"
},
"text": {
"type": "string",
"description": "The text to search for on screen"
}
},
"required": ["app_name", "text"]
}),
},
Tool {
name: "vision_click_text".to_string(),
description: "Find text in a specific application window and click on it (useful for clicking buttons, links, menu items)".to_string(),
input_schema: json!({
"type": "object",
"properties": {
"app_name": {
"type": "string",
"description": "Name of the application (e.g., 'Things3', 'Safari', 'TextEdit')"
},
"text": {
"type": "string",
"description": "The text to click on (e.g., 'Submit', 'OK', 'Cancel', '+')"
}
},
"required": ["app_name", "text"]
}),
},
Tool {
name: "vision_click_near_text".to_string(),
description: "Find text in a specific application window and click near it (useful for clicking text fields next to labels)".to_string(),
input_schema: json!({
"type": "object",
"properties": {
"app_name": {
"type": "string",
"description": "Name of the application (e.g., 'Things3', 'Safari', 'TextEdit')"
},
"text": {
"type": "string",
"description": "The label text to find (e.g., 'Name:', 'Email:', 'Task:')"
},
"direction": {
"type": "string",
"enum": ["right", "below", "left", "above"],
"description": "Direction to click relative to the text (default: right)"
},
"distance": {
"type": "integer",
"description": "Distance in pixels from the text (default: 50)"
}
},
"required": ["app_name", "text"]
}),
},
]
}
#[cfg(test)]
mod tests {
use super::*;
@@ -652,9 +461,9 @@ mod tests {
fn test_core_tools_count() {
let tools = create_core_tools();
// Should have the core tools: shell, background_process, read_file, read_image,
// write_file, str_replace, final_output, take_screenshot, extract_text,
// todo_read, todo_write, code_coverage, code_search
assert_eq!(tools.len(), 13);
// write_file, str_replace, final_output, take_screenshot,
// todo_read, todo_write, code_coverage, code_search (12 total)
assert_eq!(tools.len(), 12);
}
#[test]
@@ -664,33 +473,19 @@ mod tests {
assert_eq!(tools.len(), 15);
}
#[test]
fn test_macax_tools_count() {
let tools = create_macax_tools();
// 6 macax tools
assert_eq!(tools.len(), 6);
}
#[test]
fn test_computer_control_tools_count() {
let tools = create_computer_control_tools();
// 3 vision tools
assert_eq!(tools.len(), 3);
}
#[test]
fn test_create_tool_definitions_core_only() {
let config = ToolConfig::default();
let tools = create_tool_definitions(config);
assert_eq!(tools.len(), 13);
assert_eq!(tools.len(), 12);
}
#[test]
fn test_create_tool_definitions_all_enabled() {
let config = ToolConfig::new(true, true, true);
let config = ToolConfig::new(true, true);
let tools = create_tool_definitions(config);
// 13 core + 15 webdriver + 6 macax + 3 computer_control = 37
assert_eq!(tools.len(), 37);
// 12 core + 15 webdriver = 27
assert_eq!(tools.len(), 27);
}
#[test]

View File

@@ -7,7 +7,7 @@ use anyhow::Result;
use tracing::{debug, warn};
use crate::tools::executor::ToolContext;
use crate::tools::{file_ops, macax, misc, shell, todo, vision, webdriver};
use crate::tools::{file_ops, misc, shell, todo, webdriver};
use crate::ui_writer::UiWriter;
use crate::ToolCall;
@@ -43,7 +43,6 @@ pub async fn dispatch_tool<W: UiWriter>(
Ok(result)
}
"take_screenshot" => misc::execute_take_screenshot(tool_call, ctx).await,
"extract_text" => misc::execute_extract_text(tool_call, ctx).await,
"code_coverage" => misc::execute_code_coverage(tool_call, ctx).await,
"code_search" => misc::execute_code_search(tool_call, ctx).await,
@@ -64,19 +63,6 @@ pub async fn dispatch_tool<W: UiWriter>(
"webdriver_refresh" => webdriver::execute_webdriver_refresh(tool_call, ctx).await,
"webdriver_quit" => webdriver::execute_webdriver_quit(tool_call, ctx).await,
// macOS Accessibility tools
"macax_list_apps" => macax::execute_macax_list_apps(tool_call, ctx).await,
"macax_get_frontmost_app" => macax::execute_macax_get_frontmost_app(tool_call, ctx).await,
"macax_activate_app" => macax::execute_macax_activate_app(tool_call, ctx).await,
"macax_press_key" => macax::execute_macax_press_key(tool_call, ctx).await,
"macax_type_text" => macax::execute_macax_type_text(tool_call, ctx).await,
// Vision tools
"vision_find_text" => vision::execute_vision_find_text(tool_call, ctx).await,
"vision_click_text" => vision::execute_vision_click_text(tool_call, ctx).await,
"vision_click_near_text" => vision::execute_vision_click_near_text(tool_call, ctx).await,
"extract_text_with_boxes" => vision::execute_extract_text_with_boxes(tool_call, ctx).await,
// Unknown tool
_ => {
warn!("Unknown tool: {}", tool_call.tool);

View File

@@ -20,7 +20,6 @@ pub struct ToolContext<'a, W: UiWriter> {
pub computer_controller: Option<&'a Box<dyn g3_computer_control::ComputerController>>,
pub webdriver_session: &'a Arc<RwLock<Option<Arc<tokio::sync::Mutex<WebDriverSession>>>>>,
pub webdriver_process: &'a Arc<RwLock<Option<tokio::process::Child>>>,
pub macax_controller: &'a Arc<RwLock<Option<g3_computer_control::MacAxController>>>,
pub background_process_manager: &'a Arc<BackgroundProcessManager>,
pub todo_content: &'a Arc<RwLock<String>>,
pub pending_images: &'a mut Vec<g3_providers::ImageContent>,

View File

@@ -13,7 +13,7 @@ use super::executor::ToolContext;
/// Execute the `read_file` tool.
pub async fn execute_read_file<W: UiWriter>(
tool_call: &ToolCall,
ctx: &ToolContext<'_, W>,
_ctx: &ToolContext<'_, W>,
) -> Result<String> {
debug!("Processing read_file tool call");
@@ -28,35 +28,6 @@ pub async fn execute_read_file<W: UiWriter>(
let resolved_path = resolve_path_with_unicode_fallback(expanded_path.as_ref());
let path_str = resolved_path.as_ref();
// Check if this is an image file
let is_image = path_str.to_lowercase().ends_with(".png")
|| path_str.to_lowercase().ends_with(".jpg")
|| path_str.to_lowercase().ends_with(".jpeg")
|| path_str.to_lowercase().ends_with(".gif")
|| path_str.to_lowercase().ends_with(".bmp")
|| path_str.to_lowercase().ends_with(".tiff")
|| path_str.to_lowercase().ends_with(".tif")
|| path_str.to_lowercase().ends_with(".webp");
// If it's an image file, use OCR via extract_text
if is_image {
if let Some(controller) = ctx.computer_controller {
match controller.extract_text_from_image(path_str).await {
Ok(text) => {
return Ok(format!("📄 Image file (OCR extracted):\n{}", text));
}
Err(e) => {
return Ok(format!(
"❌ Failed to extract text from image '{}': {}",
path_str, e
));
}
}
} else {
return Ok("❌ Computer control not enabled. Cannot perform OCR on image files. Set computer_control.enabled = true in config.".to_string());
}
}
// Extract optional start and end positions
let start_char = tool_call
.args

View File

@@ -1,178 +0,0 @@
//! macOS Accessibility API tools.
use anyhow::Result;
use tracing::debug;
use crate::ui_writer::UiWriter;
use crate::ToolCall;
use super::executor::ToolContext;
/// Handle a `macax_list_apps` call: report the names of the running applications.
///
/// Always returns `Ok`; a disabled feature, a missing controller, or a failed
/// lookup is reported as a "❌ ..." message in the returned string.
pub async fn execute_macax_list_apps<W: UiWriter>(
    tool_call: &ToolCall,
    ctx: &ToolContext<'_, W>,
) -> Result<String> {
    debug!("Processing macax_list_apps tool call");
    let _ = tool_call; // this tool takes no arguments

    if !ctx.config.macax.enabled {
        return Ok(
            "❌ macOS Accessibility is not enabled. Use --macax flag to enable.".to_string(),
        );
    }

    // The read guard stays alive until the reply has been built.
    let guard = ctx.macax_controller.read().await;
    let reply = match guard.as_ref() {
        None => "❌ macOS Accessibility controller not initialized.".to_string(),
        Some(ax) => match ax.list_applications() {
            Ok(running) => {
                let names = running
                    .iter()
                    .map(|app| app.name.clone())
                    .collect::<Vec<String>>()
                    .join("\n");
                format!("Running applications:\n{}", names)
            }
            Err(err) => format!("❌ Failed to list applications: {}", err),
        },
    };
    Ok(reply)
}
/// Handle a `macax_get_frontmost_app` call: report the name of the frontmost app.
///
/// Always returns `Ok`; a disabled feature, a missing controller, or a failed
/// lookup is reported as a "❌ ..." message in the returned string.
pub async fn execute_macax_get_frontmost_app<W: UiWriter>(
    tool_call: &ToolCall,
    ctx: &ToolContext<'_, W>,
) -> Result<String> {
    debug!("Processing macax_get_frontmost_app tool call");
    let _ = tool_call; // this tool takes no arguments

    if !ctx.config.macax.enabled {
        return Ok(
            "❌ macOS Accessibility is not enabled. Use --macax flag to enable.".to_string(),
        );
    }

    let guard = ctx.macax_controller.read().await;
    let reply = match guard.as_ref() {
        None => "❌ macOS Accessibility controller not initialized.".to_string(),
        Some(ax) => match ax.get_frontmost_app() {
            Ok(front) => format!("Frontmost application: {}", front.name),
            Err(err) => format!("❌ Failed to get frontmost app: {}", err),
        },
    };
    Ok(reply)
}
/// Handle a `macax_activate_app` call: bring the application named by the
/// `app_name` argument to the front.
///
/// Always returns `Ok`; a disabled feature, a missing argument, a missing
/// controller, or a failed activation is reported as a "❌ ..." message.
pub async fn execute_macax_activate_app<W: UiWriter>(
    tool_call: &ToolCall,
    ctx: &ToolContext<'_, W>,
) -> Result<String> {
    debug!("Processing macax_activate_app tool call");

    if !ctx.config.macax.enabled {
        return Ok(
            "❌ macOS Accessibility is not enabled. Use --macax flag to enable.".to_string(),
        );
    }

    // Argument validation happens before the controller lock is taken.
    let requested = tool_call.args.get("app_name").and_then(|v| v.as_str());
    let app_name = if let Some(name) = requested {
        name
    } else {
        return Ok("❌ Missing app_name argument".to_string());
    };

    let guard = ctx.macax_controller.read().await;
    let reply = match guard.as_ref() {
        None => "❌ macOS Accessibility controller not initialized.".to_string(),
        Some(ax) => match ax.activate_app(app_name) {
            Ok(_) => format!("✅ Activated application: {}", app_name),
            Err(err) => format!("❌ Failed to activate app: {}", err),
        },
    };
    Ok(reply)
}
/// Handle a `macax_press_key` call: send `key` (plus optional `modifiers`) to
/// the application named by `app_name`.
///
/// `modifiers` is read as an array of strings; non-string entries are ignored
/// and a missing or non-array value means "no modifiers".
///
/// Always returns `Ok`; a disabled feature, a missing argument, a missing
/// controller, or a failed key press is reported as a "❌ ..." message.
pub async fn execute_macax_press_key<W: UiWriter>(
    tool_call: &ToolCall,
    ctx: &ToolContext<'_, W>,
) -> Result<String> {
    debug!("Processing macax_press_key tool call");

    if !ctx.config.macax.enabled {
        return Ok(
            "❌ macOS Accessibility is not enabled. Use --macax flag to enable.".to_string(),
        );
    }

    let app_name = if let Some(name) = tool_call.args.get("app_name").and_then(|v| v.as_str()) {
        name
    } else {
        return Ok("❌ Missing app_name argument".to_string());
    };
    let key = if let Some(k) = tool_call.args.get("key").and_then(|v| v.as_str()) {
        k
    } else {
        return Ok("❌ Missing key argument".to_string());
    };

    let modifiers: Vec<&str> = match tool_call.args.get("modifiers").and_then(|v| v.as_array()) {
        Some(items) => items.iter().filter_map(|item| item.as_str()).collect(),
        None => Vec::new(),
    };

    // Build the success suffix up front so the vector can be handed over below.
    let modifier_suffix = if modifiers.is_empty() {
        String::new()
    } else {
        format!(" with modifiers: {}", modifiers.join("+"))
    };

    let guard = ctx.macax_controller.read().await;
    let reply = match guard.as_ref() {
        None => "❌ macOS Accessibility controller not initialized.".to_string(),
        Some(ax) => match ax.press_key(app_name, key, modifiers) {
            Ok(_) => format!("✅ Pressed key: {}{}", key, modifier_suffix),
            Err(err) => format!("❌ Failed to press key: {}", err),
        },
    };
    Ok(reply)
}
/// Handle a `macax_type_text` call: type the `text` argument into the
/// application named by `app_name`.
///
/// Always returns `Ok`; a disabled feature, a missing argument, a missing
/// controller, or a failed typing attempt is reported as a "❌ ..." message.
pub async fn execute_macax_type_text<W: UiWriter>(
    tool_call: &ToolCall,
    ctx: &ToolContext<'_, W>,
) -> Result<String> {
    debug!("Processing macax_type_text tool call");

    if !ctx.config.macax.enabled {
        return Ok(
            "❌ macOS Accessibility is not enabled. Use --macax flag to enable.".to_string(),
        );
    }

    let app_name = if let Some(name) = tool_call.args.get("app_name").and_then(|v| v.as_str()) {
        name
    } else {
        return Ok("❌ Missing app_name argument".to_string());
    };
    let text = if let Some(t) = tool_call.args.get("text").and_then(|v| v.as_str()) {
        t
    } else {
        return Ok("❌ Missing text argument".to_string());
    };

    let guard = ctx.macax_controller.read().await;
    let reply = match guard.as_ref() {
        None => "❌ macOS Accessibility controller not initialized.".to_string(),
        Some(ax) => match ax.type_text(app_name, text) {
            Ok(_) => format!("✅ Typed text into {}", app_name),
            Err(err) => format!("❌ Failed to type text: {}", err),
        },
    };
    Ok(reply)
}

View File

@@ -1,4 +1,4 @@
//! Miscellaneous tools: final_output, take_screenshot, extract_text, code_coverage, code_search.
//! Miscellaneous tools: final_output, take_screenshot, code_coverage, code_search.
use anyhow::Result;
use tracing::debug;
@@ -118,35 +118,6 @@ pub async fn execute_take_screenshot<W: UiWriter>(
}
}
/// Handle an `extract_text` call: run text extraction on the image at `path`.
///
/// # Errors
///
/// Returns an error when the `path` argument is missing or not a string.
/// A missing controller or a failed extraction is not an error; it is reported
/// as a "❌ ..." message in the `Ok` value.
pub async fn execute_extract_text<W: UiWriter>(
    tool_call: &ToolCall,
    ctx: &ToolContext<'_, W>,
) -> Result<String> {
    debug!("Processing extract_text tool call");

    if let Some(controller) = ctx.computer_controller {
        let image_path = tool_call
            .args
            .get("path")
            .and_then(|v| v.as_str())
            .ok_or_else(|| anyhow::anyhow!("Missing path argument"))?;

        let reply = match controller.extract_text_from_image(image_path).await {
            Ok(extracted) => format!("✅ Extracted text:\n{}", extracted),
            Err(err) => format!("❌ Failed to extract text: {}", err),
        };
        Ok(reply)
    } else {
        Ok(
            "❌ Computer control not enabled. Set computer_control.enabled = true in config."
                .to_string(),
        )
    }
}
/// Execute the `code_coverage` tool.
pub async fn execute_code_coverage<W: UiWriter>(
tool_call: &ToolCall,

View File

@@ -6,17 +6,13 @@
//! - `file_ops` - File reading, writing, and editing
//! - `todo` - TODO list management
//! - `webdriver` - Browser automation via WebDriver
//! - `macax` - macOS Accessibility API tools
//! - `vision` - Vision-based text finding and clicking
//! - `misc` - Other tools (screenshots, code search, etc.)
pub mod executor;
pub mod file_ops;
pub mod macax;
pub mod misc;
pub mod shell;
pub mod todo;
pub mod vision;
pub mod webdriver;
pub use executor::ToolExecutor;

View File

@@ -1,275 +0,0 @@
//! Vision-based tools: vision_find_text, vision_click_text, vision_click_near_text, extract_text_with_boxes.
use anyhow::Result;
use tracing::debug;
use crate::ui_writer::UiWriter;
use crate::ToolCall;
use super::executor::ToolContext;
/// Handle a `vision_find_text` call: look for `text` inside the window of
/// `app_name` and describe where it was found.
///
/// # Errors
///
/// Returns an error when `app_name` or `text` is missing or not a string.
/// A missing controller, a search miss, or a search failure is reported as a
/// "❌ ..." message in the `Ok` value.
pub async fn execute_vision_find_text<W: UiWriter>(
    tool_call: &ToolCall,
    ctx: &ToolContext<'_, W>,
) -> Result<String> {
    debug!("Processing vision_find_text tool call");

    let controller = if let Some(c) = ctx.computer_controller {
        c
    } else {
        return Ok(
            "❌ Computer control not enabled. Set computer_control.enabled = true in config."
                .to_string(),
        );
    };

    let app_name = tool_call
        .args
        .get("app_name")
        .and_then(|v| v.as_str())
        .ok_or_else(|| anyhow::anyhow!("Missing app_name parameter"))?;
    let text = tool_call
        .args
        .get("text")
        .and_then(|v| v.as_str())
        .ok_or_else(|| anyhow::anyhow!("Missing text parameter"))?;

    let reply = match controller.find_text_in_app(app_name, text).await {
        Ok(Some(hit)) => format!(
            "✅ Found '{}' in {} at position ({}, {}) with size {}x{} (confidence: {:.0}%)",
            hit.text,
            app_name,
            hit.x,
            hit.y,
            hit.width,
            hit.height,
            hit.confidence * 100.0
        ),
        Ok(None) => format!("❌ Could not find '{}' in {}", text, app_name),
        Err(err) => format!("❌ Error finding text: {}", err),
    };
    Ok(reply)
}
/// Handle a `vision_click_text` call: find `text` in the window of `app_name`
/// and click on it.
///
/// The click lands at the right edge of the reported bounding box
/// (`x + width`) and half a box height below its `y` value (`y - height / 2`).
///
/// # Errors
///
/// Returns an error when `app_name` or `text` is missing or not a string.
/// Every other failure (no controller, text not found, degenerate box, click
/// failure) is reported as a "❌ ..." message in the `Ok` value.
pub async fn execute_vision_click_text<W: UiWriter>(
    tool_call: &ToolCall,
    ctx: &ToolContext<'_, W>,
) -> Result<String> {
    debug!("Processing vision_click_text tool call");

    let controller = if let Some(c) = ctx.computer_controller {
        c
    } else {
        return Ok(
            "❌ Computer control not enabled. Set computer_control.enabled = true in config."
                .to_string(),
        );
    };

    let app_name = tool_call
        .args
        .get("app_name")
        .and_then(|v| v.as_str())
        .ok_or_else(|| anyhow::anyhow!("Missing app_name parameter"))?;
    let text = tool_call
        .args
        .get("text")
        .and_then(|v| v.as_str())
        .ok_or_else(|| anyhow::anyhow!("Missing text parameter"))?;

    // Resolve the search first; everything after this point has a location.
    let location = match controller.find_text_in_app(app_name, text).await {
        Ok(Some(found)) => found,
        Ok(None) => return Ok(format!("❌ Could not find '{}' in {}", text, app_name)),
        Err(err) => return Ok(format!("❌ Error finding text: {}", err)),
    };

    // A zero-sized box cannot yield a meaningful click target.
    if location.width == 0 || location.height == 0 {
        return Ok(format!(
            "❌ Invalid bounding box dimensions: width={}, height={}",
            location.width, location.height
        ));
    }

    debug!(
        "[vision_click_text] Location from find_text_in_app: x={}, y={}, width={}, height={}, text='{}'",
        location.x, location.y, location.width, location.height, location.text
    );

    // NOTE(review): the original notes say coordinates are in NSScreen space
    // (Y grows upward, `y` is the top edge), which is why the vertical midpoint
    // is reached by subtracting; confirm against `find_text_in_app`.
    // Horizontally the right edge is used on purpose instead of the centre
    // (the original notes say the OCR bounding box seems offset).
    let half_height = location.height / 2;
    let click_x = location.x + location.width;
    let click_y = location.y - half_height;

    debug!(
        "[vision_click_text] Click position calculation: x={} + {} = {} (right edge), y={} - {} = {}",
        location.x, location.width, click_x, location.y, half_height, click_y
    );

    let reply = match controller.click_at(click_x, click_y, Some(app_name)) {
        Ok(_) => format!(
            "✅ Clicked on '{}' in {} at ({}, {})",
            text, app_name, click_x, click_y
        ),
        Err(err) => format!("❌ Failed to click: {}", err),
    };
    Ok(reply)
}
/// Handle a `vision_click_near_text` call: find `text` in the window of
/// `app_name` and click `distance` pixels away from it in `direction`.
///
/// Arguments read from the tool call:
/// - `app_name`, `text`: required strings.
/// - `direction`: optional; `"right"`, `"below"`, `"left"` or `"above"`.
///   Defaults to `"right"`, and any unrecognised value is also treated as
///   `"right"`.
/// - `distance`: optional integer, default 50. It must fit in an `i32`;
///   out-of-range values are rejected instead of being silently wrapped.
///
/// # Errors
///
/// Returns an error when `app_name` or `text` is missing or not a string.
/// Every other failure (no controller, bad distance, text not found, click
/// failure) is reported as a "❌ ..." message in the `Ok` value.
pub async fn execute_vision_click_near_text<W: UiWriter>(
    tool_call: &ToolCall,
    ctx: &ToolContext<'_, W>,
) -> Result<String> {
    debug!("Processing vision_click_near_text tool call");

    let controller = match ctx.computer_controller {
        Some(c) => c,
        None => {
            return Ok(
                "❌ Computer control not enabled. Set computer_control.enabled = true in config."
                    .to_string(),
            )
        }
    };

    let app_name = tool_call
        .args
        .get("app_name")
        .and_then(|v| v.as_str())
        .ok_or_else(|| anyhow::anyhow!("Missing app_name parameter"))?;

    let text = tool_call
        .args
        .get("text")
        .and_then(|v| v.as_str())
        .ok_or_else(|| anyhow::anyhow!("Missing text parameter"))?;

    let direction = tool_call
        .args
        .get("direction")
        .and_then(|v| v.as_str())
        .unwrap_or("right");

    let requested_distance = tool_call
        .args
        .get("distance")
        .and_then(|v| v.as_i64())
        .unwrap_or(50);

    // A plain `as i32` would wrap huge values into an unrelated offset and
    // click somewhere arbitrary; refuse them instead.
    if requested_distance < i64::from(i32::MIN) || requested_distance > i64::from(i32::MAX) {
        return Ok(format!(
            "❌ Invalid distance: {} (must fit in a 32-bit integer)",
            requested_distance
        ));
    }
    let distance = requested_distance as i32; // lossless: range checked above

    match controller.find_text_in_app(app_name, text).await {
        Ok(Some(location)) => {
            // NOTE(review): per the original notes, `x` is the left edge and `y`
            // the top edge in NSScreen space (Y grows upward), so "below"
            // subtracts and "above" adds; confirm against `find_text_in_app`.
            let (click_x, click_y) = match direction {
                "below" => (
                    location.x + (location.width / 2),
                    location.y - location.height - distance,
                ),
                "left" => (location.x - distance, location.y - (location.height / 2)),
                "above" => (location.x + (location.width / 2), location.y + distance),
                // "right" and any unrecognised direction.
                _ => (
                    location.x + location.width + distance,
                    location.y - (location.height / 2),
                ),
            };

            debug!(
                "[vision_click_near_text] Clicking {} of text at ({}, {})",
                direction, click_x, click_y
            );

            match controller.click_at(click_x, click_y, Some(app_name)) {
                Ok(_) => Ok(format!(
                    "✅ Clicked {} of '{}' in {} at ({}, {})",
                    direction, text, app_name, click_x, click_y
                )),
                Err(e) => Ok(format!("❌ Failed to click: {}", e)),
            }
        }
        Ok(None) => Ok(format!("❌ Could not find '{}' in {}", text, app_name)),
        Err(e) => Ok(format!("❌ Error finding text: {}", e)),
    }
}
/// Handle an `extract_text_with_boxes` call: extract every text element with
/// its location and return the result as pretty-printed JSON.
///
/// Arguments read from the tool call:
/// - `path`: required string, the image to analyse.
/// - `app_name`: optional string. When present, a screenshot of that
///   application is written to a temporary file under `/tmp` and analysed
///   instead of `path`.
///
/// The temporary screenshot is removed on every path once it is no longer
/// needed, including when the screenshot or the extraction fails.
///
/// # Errors
///
/// Returns an error when `path` is missing or not a string. Every other
/// failure (feature disabled, no controller, screenshot/extraction/serialisation
/// failure) is reported as a "❌ ..." message in the `Ok` value.
pub async fn execute_extract_text_with_boxes<W: UiWriter>(
    tool_call: &ToolCall,
    ctx: &ToolContext<'_, W>,
) -> Result<String> {
    debug!("Processing extract_text_with_boxes tool call");

    if !ctx.config.macax.enabled {
        return Ok(
            "❌ extract_text_with_boxes requires --macax flag to be enabled".to_string(),
        );
    }

    let controller = match ctx.computer_controller {
        Some(c) => c,
        None => {
            return Ok(
                "❌ Computer control not enabled. Set computer_control.enabled = true in config."
                    .to_string(),
            )
        }
    };

    let path = tool_call
        .args
        .get("path")
        .and_then(|v| v.as_str())
        .ok_or_else(|| anyhow::anyhow!("Missing path parameter"))?;

    // Optional: take a screenshot of the app first. `is_temp` records whether
    // `final_path` is a file this function created and must clean up.
    let (final_path, is_temp) =
        if let Some(app_name) = tool_call.args.get("app_name").and_then(|v| v.as_str()) {
            let temp_path = format!("/tmp/g3_extract_boxes_{}.png", uuid::Uuid::new_v4());
            if let Err(e) = controller
                .take_screenshot(&temp_path, None, Some(app_name))
                .await
            {
                // Best effort: a failed capture may still have left a partial file.
                let _ = std::fs::remove_file(&temp_path);
                return Ok(format!("❌ Failed to take screenshot: {}", e));
            }
            (temp_path, true)
        } else {
            (path.to_string(), false)
        };

    let extraction = controller.extract_text_with_locations(&final_path).await;

    // Clean up before inspecting the result so a failed extraction does not
    // leave the screenshot behind. Removal errors are deliberately ignored.
    if is_temp {
        let _ = std::fs::remove_file(&final_path);
    }

    match extraction {
        Ok(locations) => match serde_json::to_string_pretty(&locations) {
            Ok(json) => Ok(format!(
                "✅ Extracted {} text elements:\n{}",
                locations.len(),
                json
            )),
            Err(e) => Ok(format!("❌ Failed to serialize results: {}", e)),
        },
        Err(e) => Ok(format!("❌ Failed to extract text: {}", e)),
    }
}