Remove vision tools (except take_screenshot) and macax tools

Vision tools removed:
- extract_text (OCR from image files)
- extract_text_with_boxes (OCR with bounding boxes)
- vision_find_text (find text in app windows)
- vision_click_text (find and click on text)
- vision_click_near_text (click near text labels)

macax tools removed:
- macax_list_apps
- macax_get_frontmost_app
- macax_activate_app
- macax_press_key
- macax_type_text

The LLM can now read images directly via the read_image tool.
take_screenshot is retained for capturing application windows.

Files deleted:
- crates/g3-core/src/tools/vision.rs
- crates/g3-core/src/tools/macax.rs
- docs/macax-tools.md

Updated tool counts: 12 core + 15 webdriver = 27 total
This commit is contained in:
Dhanji R. Prasanna
2026-01-03 17:38:25 +11:00
parent 29e263ac49
commit 386176899e
19 changed files with 15 additions and 1408 deletions

View File

@@ -20,7 +20,6 @@ pub struct ToolContext<'a, W: UiWriter> {
pub computer_controller: Option<&'a Box<dyn g3_computer_control::ComputerController>>,
pub webdriver_session: &'a Arc<RwLock<Option<Arc<tokio::sync::Mutex<WebDriverSession>>>>>,
pub webdriver_process: &'a Arc<RwLock<Option<tokio::process::Child>>>,
pub macax_controller: &'a Arc<RwLock<Option<g3_computer_control::MacAxController>>>,
pub background_process_manager: &'a Arc<BackgroundProcessManager>,
pub todo_content: &'a Arc<RwLock<String>>,
pub pending_images: &'a mut Vec<g3_providers::ImageContent>,

View File

@@ -13,7 +13,7 @@ use super::executor::ToolContext;
/// Execute the `read_file` tool.
pub async fn execute_read_file<W: UiWriter>(
tool_call: &ToolCall,
ctx: &ToolContext<'_, W>,
_ctx: &ToolContext<'_, W>,
) -> Result<String> {
debug!("Processing read_file tool call");
@@ -28,35 +28,6 @@ pub async fn execute_read_file<W: UiWriter>(
let resolved_path = resolve_path_with_unicode_fallback(expanded_path.as_ref());
let path_str = resolved_path.as_ref();
// Check if this is an image file
let is_image = path_str.to_lowercase().ends_with(".png")
|| path_str.to_lowercase().ends_with(".jpg")
|| path_str.to_lowercase().ends_with(".jpeg")
|| path_str.to_lowercase().ends_with(".gif")
|| path_str.to_lowercase().ends_with(".bmp")
|| path_str.to_lowercase().ends_with(".tiff")
|| path_str.to_lowercase().ends_with(".tif")
|| path_str.to_lowercase().ends_with(".webp");
// If it's an image file, use OCR via extract_text
if is_image {
if let Some(controller) = ctx.computer_controller {
match controller.extract_text_from_image(path_str).await {
Ok(text) => {
return Ok(format!("📄 Image file (OCR extracted):\n{}", text));
}
Err(e) => {
return Ok(format!(
"❌ Failed to extract text from image '{}': {}",
path_str, e
));
}
}
} else {
return Ok("❌ Computer control not enabled. Cannot perform OCR on image files. Set computer_control.enabled = true in config.".to_string());
}
}
// Extract optional start and end positions
let start_char = tool_call
.args

View File

@@ -1,178 +0,0 @@
//! macOS Accessibility API tools.
use anyhow::Result;
use tracing::debug;
use crate::ui_writer::UiWriter;
use crate::ToolCall;
use super::executor::ToolContext;
/// Execute the `macax_list_apps` tool.
///
/// Reports the names of the applications returned by the macOS Accessibility
/// controller, one per line. The tool takes no arguments.
///
/// Every failure mode (feature disabled, controller missing, controller error)
/// is reported as an `Ok` string prefixed with ❌ rather than as an `Err`.
pub async fn execute_macax_list_apps<W: UiWriter>(
    tool_call: &ToolCall,
    ctx: &ToolContext<'_, W>,
) -> Result<String> {
    debug!("Processing macax_list_apps tool call");
    let _ = tool_call; // no arguments are read for this tool

    if !ctx.config.macax.enabled {
        return Ok(
            "❌ macOS Accessibility is not enabled. Use --macax flag to enable.".to_string(),
        );
    }

    // The read guard is held for the duration of the controller call.
    let guard = ctx.macax_controller.read().await;
    let message = match guard.as_ref() {
        None => "❌ macOS Accessibility controller not initialized.".to_string(),
        Some(ax) => match ax.list_applications() {
            Err(err) => format!("❌ Failed to list applications: {}", err),
            Ok(apps) => {
                let names = apps
                    .iter()
                    .map(|app| app.name.clone())
                    .collect::<Vec<String>>();
                format!("Running applications:\n{}", names.join("\n"))
            }
        },
    };

    Ok(message)
}
/// Execute the `macax_get_frontmost_app` tool.
///
/// Asks the macOS Accessibility controller for the frontmost application and
/// reports its name. The tool takes no arguments.
///
/// Failures are returned as `Ok` strings prefixed with ❌, never as `Err`.
pub async fn execute_macax_get_frontmost_app<W: UiWriter>(
    tool_call: &ToolCall,
    ctx: &ToolContext<'_, W>,
) -> Result<String> {
    debug!("Processing macax_get_frontmost_app tool call");
    let _ = tool_call; // no arguments are read for this tool

    if !ctx.config.macax.enabled {
        return Ok(
            "❌ macOS Accessibility is not enabled. Use --macax flag to enable.".to_string(),
        );
    }

    let guard = ctx.macax_controller.read().await;
    let message = match guard.as_ref() {
        None => "❌ macOS Accessibility controller not initialized.".to_string(),
        Some(ax) => match ax.get_frontmost_app() {
            Err(err) => format!("❌ Failed to get frontmost app: {}", err),
            Ok(front) => format!("Frontmost application: {}", front.name),
        },
    };

    Ok(message)
}
/// Execute the `macax_activate_app` tool.
///
/// Reads the required string argument `app_name` and asks the macOS
/// Accessibility controller to activate that application.
///
/// Checks run in this order: feature flag, `app_name` argument, controller
/// availability. Each failure is returned as an `Ok` string prefixed with ❌.
pub async fn execute_macax_activate_app<W: UiWriter>(
    tool_call: &ToolCall,
    ctx: &ToolContext<'_, W>,
) -> Result<String> {
    debug!("Processing macax_activate_app tool call");

    if !ctx.config.macax.enabled {
        return Ok(
            "❌ macOS Accessibility is not enabled. Use --macax flag to enable.".to_string(),
        );
    }

    let app_name = if let Some(name) = tool_call.args.get("app_name").and_then(|v| v.as_str()) {
        name
    } else {
        return Ok("❌ Missing app_name argument".to_string());
    };

    let guard = ctx.macax_controller.read().await;
    let ax = if let Some(ax) = guard.as_ref() {
        ax
    } else {
        return Ok("❌ macOS Accessibility controller not initialized.".to_string());
    };

    if let Err(err) = ax.activate_app(app_name) {
        return Ok(format!("❌ Failed to activate app: {}", err));
    }
    Ok(format!("✅ Activated application: {}", app_name))
}
/// Execute the `macax_press_key` tool.
///
/// Arguments read from the tool call:
/// - `app_name` (string, required)
/// - `key` (string, required)
/// - `modifiers` (array, optional): non-string entries are ignored; a missing
///   or non-array value is treated as "no modifiers".
///
/// Checks run in this order: feature flag, `app_name`, `key`, controller
/// availability. Each failure is returned as an `Ok` string prefixed with ❌.
pub async fn execute_macax_press_key<W: UiWriter>(
    tool_call: &ToolCall,
    ctx: &ToolContext<'_, W>,
) -> Result<String> {
    debug!("Processing macax_press_key tool call");

    if !ctx.config.macax.enabled {
        return Ok(
            "❌ macOS Accessibility is not enabled. Use --macax flag to enable.".to_string(),
        );
    }

    let app_name = if let Some(name) = tool_call.args.get("app_name").and_then(|v| v.as_str()) {
        name
    } else {
        return Ok("❌ Missing app_name argument".to_string());
    };

    let key = if let Some(pressed) = tool_call.args.get("key").and_then(|v| v.as_str()) {
        pressed
    } else {
        return Ok("❌ Missing key argument".to_string());
    };

    let modifiers: Vec<&str> = match tool_call.args.get("modifiers").and_then(|v| v.as_array()) {
        Some(items) => items.iter().filter_map(|item| item.as_str()).collect(),
        None => Vec::new(),
    };

    let guard = ctx.macax_controller.read().await;
    let ax = if let Some(ax) = guard.as_ref() {
        ax
    } else {
        return Ok("❌ macOS Accessibility controller not initialized.".to_string());
    };

    // The list is cloned because it is still needed for the success message.
    if let Err(err) = ax.press_key(app_name, key, modifiers.clone()) {
        return Ok(format!("❌ Failed to press key: {}", err));
    }

    let suffix = if modifiers.is_empty() {
        String::new()
    } else {
        format!(" with modifiers: {}", modifiers.join("+"))
    };
    Ok(format!("✅ Pressed key: {}{}", key, suffix))
}
/// Execute the `macax_type_text` tool.
///
/// Reads the required string arguments `app_name` and `text` and asks the
/// macOS Accessibility controller to type `text` into that application.
///
/// Checks run in this order: feature flag, `app_name`, `text`, controller
/// availability. Each failure is returned as an `Ok` string prefixed with ❌.
pub async fn execute_macax_type_text<W: UiWriter>(
    tool_call: &ToolCall,
    ctx: &ToolContext<'_, W>,
) -> Result<String> {
    debug!("Processing macax_type_text tool call");

    if !ctx.config.macax.enabled {
        return Ok(
            "❌ macOS Accessibility is not enabled. Use --macax flag to enable.".to_string(),
        );
    }

    let app_name = if let Some(name) = tool_call.args.get("app_name").and_then(|v| v.as_str()) {
        name
    } else {
        return Ok("❌ Missing app_name argument".to_string());
    };

    let text = if let Some(content) = tool_call.args.get("text").and_then(|v| v.as_str()) {
        content
    } else {
        return Ok("❌ Missing text argument".to_string());
    };

    let guard = ctx.macax_controller.read().await;
    let ax = if let Some(ax) = guard.as_ref() {
        ax
    } else {
        return Ok("❌ macOS Accessibility controller not initialized.".to_string());
    };

    if let Err(err) = ax.type_text(app_name, text) {
        return Ok(format!("❌ Failed to type text: {}", err));
    }
    Ok(format!("✅ Typed text into {}", app_name))
}

View File

@@ -1,4 +1,4 @@
//! Miscellaneous tools: final_output, take_screenshot, extract_text, code_coverage, code_search.
//! Miscellaneous tools: final_output, take_screenshot, code_coverage, code_search.
use anyhow::Result;
use tracing::debug;
@@ -118,35 +118,6 @@ pub async fn execute_take_screenshot<W: UiWriter>(
}
}
/// Execute the `extract_text` tool.
///
/// Runs the computer controller's image text extraction on the file named by
/// the required string argument `path`.
///
/// # Errors
/// Returns `Err` only when `path` is missing or not a string. A missing
/// controller or a failed extraction is reported as an `Ok` string prefixed
/// with ❌.
pub async fn execute_extract_text<W: UiWriter>(
    tool_call: &ToolCall,
    ctx: &ToolContext<'_, W>,
) -> Result<String> {
    debug!("Processing extract_text tool call");

    // The controller check comes first, so a disabled feature is reported
    // before any argument validation.
    let controller = if let Some(active) = ctx.computer_controller {
        active
    } else {
        return Ok(
            "❌ Computer control not enabled. Set computer_control.enabled = true in config."
                .to_string(),
        );
    };

    let path = tool_call
        .args
        .get("path")
        .and_then(|v| v.as_str())
        .ok_or_else(|| anyhow::anyhow!("Missing path argument"))?;

    let message = match controller.extract_text_from_image(path).await {
        Err(err) => format!("❌ Failed to extract text: {}", err),
        Ok(extracted) => format!("✅ Extracted text:\n{}", extracted),
    };
    Ok(message)
}
/// Execute the `code_coverage` tool.
pub async fn execute_code_coverage<W: UiWriter>(
tool_call: &ToolCall,

View File

@@ -6,17 +6,13 @@
//! - `file_ops` - File reading, writing, and editing
//! - `todo` - TODO list management
//! - `webdriver` - Browser automation via WebDriver
//! - `macax` - macOS Accessibility API tools
//! - `vision` - Vision-based text finding and clicking
//! - `misc` - Other tools (screenshots, code search, etc.)
pub mod executor;
pub mod file_ops;
pub mod macax;
pub mod misc;
pub mod shell;
pub mod todo;
pub mod vision;
pub mod webdriver;
pub use executor::ToolExecutor;

View File

@@ -1,275 +0,0 @@
//! Vision-based tools: vision_find_text, vision_click_text, vision_click_near_text, extract_text_with_boxes.
use anyhow::Result;
use tracing::debug;
use crate::ui_writer::UiWriter;
use crate::ToolCall;
use super::executor::ToolContext;
/// Execute the `vision_find_text` tool.
///
/// Looks for `text` inside the application `app_name` via the computer
/// controller and, when found, reports the match's position, size and
/// confidence (printed as a whole-number percentage).
///
/// # Errors
/// Returns `Err` only when `app_name` or `text` is missing or not a string.
/// A missing controller, a failed search, or no match is reported as an `Ok`
/// string prefixed with ❌.
pub async fn execute_vision_find_text<W: UiWriter>(
    tool_call: &ToolCall,
    ctx: &ToolContext<'_, W>,
) -> Result<String> {
    debug!("Processing vision_find_text tool call");

    let controller = if let Some(active) = ctx.computer_controller {
        active
    } else {
        return Ok(
            "❌ Computer control not enabled. Set computer_control.enabled = true in config."
                .to_string(),
        );
    };

    let app_name = tool_call
        .args
        .get("app_name")
        .and_then(|v| v.as_str())
        .ok_or_else(|| anyhow::anyhow!("Missing app_name parameter"))?;
    let text = tool_call
        .args
        .get("text")
        .and_then(|v| v.as_str())
        .ok_or_else(|| anyhow::anyhow!("Missing text parameter"))?;

    let message = match controller.find_text_in_app(app_name, text).await {
        Err(err) => format!("❌ Error finding text: {}", err),
        Ok(None) => format!("❌ Could not find '{}' in {}", text, app_name),
        Ok(Some(hit)) => format!(
            "✅ Found '{}' in {} at position ({}, {}) with size {}x{} (confidence: {:.0}%)",
            hit.text,
            app_name,
            hit.x,
            hit.y,
            hit.width,
            hit.height,
            hit.confidence * 100.0
        ),
    };
    Ok(message)
}
/// Execute the `vision_click_text` tool.
///
/// Finds `text` inside the application `app_name` and clicks on it.
///
/// The click point is NOT the geometric centre of the match:
/// - X is the right edge of the bounding box (`x + width`). The original
///   author's note says the OCR bounding box appeared to be offset.
/// - Y is `y - height / 2`. The original author's notes describe `y` as the
///   top edge in a coordinate space where Y grows upward (NSScreen), so the
///   subtraction moves toward the vertical middle.
///
/// # Errors
/// Returns `Err` only when `app_name` or `text` is missing or not a string.
/// All other failures are reported as `Ok` strings prefixed with ❌.
pub async fn execute_vision_click_text<W: UiWriter>(
    tool_call: &ToolCall,
    ctx: &ToolContext<'_, W>,
) -> Result<String> {
    debug!("Processing vision_click_text tool call");

    let controller = if let Some(active) = ctx.computer_controller {
        active
    } else {
        return Ok(
            "❌ Computer control not enabled. Set computer_control.enabled = true in config."
                .to_string(),
        );
    };

    let app_name = tool_call
        .args
        .get("app_name")
        .and_then(|v| v.as_str())
        .ok_or_else(|| anyhow::anyhow!("Missing app_name parameter"))?;
    let text = tool_call
        .args
        .get("text")
        .and_then(|v| v.as_str())
        .ok_or_else(|| anyhow::anyhow!("Missing text parameter"))?;

    // Resolve the match first; every non-match outcome returns immediately.
    let location = match controller.find_text_in_app(app_name, text).await {
        Ok(Some(found)) => found,
        Ok(None) => return Ok(format!("❌ Could not find '{}' in {}", text, app_name)),
        Err(e) => return Ok(format!("❌ Error finding text: {}", e)),
    };

    // A degenerate box cannot yield a meaningful click target.
    if location.width == 0 || location.height == 0 {
        return Ok(format!(
            "❌ Invalid bounding box dimensions: width={}, height={}",
            location.width, location.height
        ));
    }

    debug!(
        "[vision_click_text] Location from find_text_in_app: x={}, y={}, width={}, height={}, text='{}'",
        location.x, location.y, location.width, location.height, location.text
    );

    // X: left edge + full width = right edge (deliberate, see the doc comment).
    // Y: top edge minus half the height.
    let half_height = location.height / 2;
    let click_x = location.x + location.width;
    let click_y = location.y - half_height;

    debug!(
        "[vision_click_text] Click position calculation: x={} + {} = {} (right edge), y={} - {} = {}",
        location.x, location.width, click_x, location.y, half_height, click_y
    );

    if let Err(e) = controller.click_at(click_x, click_y, Some(app_name)) {
        return Ok(format!("❌ Failed to click: {}", e));
    }
    Ok(format!(
        "✅ Clicked on '{}' in {} at ({}, {})",
        text, app_name, click_x, click_y
    ))
}
/// Execute the `vision_click_near_text` tool.
///
/// Finds `text` inside the application `app_name` and clicks at an offset
/// from its bounding box.
///
/// Arguments read from the tool call:
/// - `app_name` (string, required)
/// - `text` (string, required)
/// - `direction` (string, optional, default `"right"`): one of `"right"`,
///   `"below"`, `"left"`, `"above"`. Any other value is handled exactly like
///   `"right"`, although the result message still echoes the value supplied.
/// - `distance` (integer, optional, default `50`): narrowed to `i32` with an
///   `as` cast.
///
/// The offset arithmetic follows the original author's note that `x` is the
/// left edge and `y` the top edge in a space where Y grows upward (NSScreen).
///
/// # Errors
/// Returns `Err` only when `app_name` or `text` is missing or not a string.
/// All other failures are reported as `Ok` strings prefixed with ❌.
pub async fn execute_vision_click_near_text<W: UiWriter>(
    tool_call: &ToolCall,
    ctx: &ToolContext<'_, W>,
) -> Result<String> {
    debug!("Processing vision_click_near_text tool call");

    let controller = if let Some(active) = ctx.computer_controller {
        active
    } else {
        return Ok(
            "❌ Computer control not enabled. Set computer_control.enabled = true in config."
                .to_string(),
        );
    };

    let app_name = tool_call
        .args
        .get("app_name")
        .and_then(|v| v.as_str())
        .ok_or_else(|| anyhow::anyhow!("Missing app_name parameter"))?;
    let text = tool_call
        .args
        .get("text")
        .and_then(|v| v.as_str())
        .ok_or_else(|| anyhow::anyhow!("Missing text parameter"))?;

    let direction = tool_call
        .args
        .get("direction")
        .and_then(|v| v.as_str())
        .unwrap_or("right");
    let raw_distance: i64 = tool_call
        .args
        .get("distance")
        .and_then(|v| v.as_i64())
        .unwrap_or(50);
    let distance = raw_distance as i32;

    let location = match controller.find_text_in_app(app_name, text).await {
        Ok(Some(found)) => found,
        Ok(None) => return Ok(format!("❌ Could not find '{}' in {}", text, app_name)),
        Err(e) => return Ok(format!("❌ Error finding text: {}", e)),
    };

    let (click_x, click_y) = match direction {
        "below" => (
            location.x + (location.width / 2),
            location.y - location.height - distance,
        ),
        "left" => (location.x - distance, location.y - (location.height / 2)),
        "above" => (location.x + (location.width / 2), location.y + distance),
        // "right" and any unrecognised direction: past the right edge.
        _ => (
            location.x + location.width + distance,
            location.y - (location.height / 2),
        ),
    };

    debug!(
        "[vision_click_near_text] Clicking {} of text at ({}, {})",
        direction, click_x, click_y
    );

    if let Err(e) = controller.click_at(click_x, click_y, Some(app_name)) {
        return Ok(format!("❌ Failed to click: {}", e));
    }
    Ok(format!(
        "✅ Clicked {} of '{}' in {} at ({}, {})",
        direction, text, app_name, click_x, click_y
    ))
}
/// Execute the `extract_text_with_boxes` tool.
///
/// Extracts text elements together with their locations from an image and
/// returns them as pretty-printed JSON.
///
/// Arguments read from the tool call:
/// - `path` (string, required): image to analyse. It is required even when
///   `app_name` is supplied, although it is not read in that case.
/// - `app_name` (string, optional): when present, a screenshot of that
///   application is captured to a temporary file under `/tmp` and analysed
///   instead of `path`.
///
/// The temporary screenshot is removed (best effort) once extraction has
/// finished, whether it succeeded or failed.
///
/// # Errors
/// Returns `Err` only when `path` is missing or not a string. All other
/// failures are reported as `Ok` strings prefixed with ❌.
pub async fn execute_extract_text_with_boxes<W: UiWriter>(
    tool_call: &ToolCall,
    ctx: &ToolContext<'_, W>,
) -> Result<String> {
    debug!("Processing extract_text_with_boxes tool call");

    if !ctx.config.macax.enabled {
        return Ok(
            "❌ extract_text_with_boxes requires --macax flag to be enabled".to_string(),
        );
    }

    let controller = match ctx.computer_controller {
        Some(c) => c,
        None => {
            return Ok(
                "❌ Computer control not enabled. Set computer_control.enabled = true in config."
                    .to_string(),
            )
        }
    };

    let path = tool_call
        .args
        .get("path")
        .and_then(|v| v.as_str())
        .ok_or_else(|| anyhow::anyhow!("Missing path parameter"))?;

    // Optional: take a screenshot of the app first. Ownership of the temp file
    // is tracked explicitly instead of being inferred from a path comparison.
    let (final_path, is_temp_screenshot) =
        match tool_call.args.get("app_name").and_then(|v| v.as_str()) {
            Some(app_name) => {
                let temp_path = format!("/tmp/g3_extract_boxes_{}.png", uuid::Uuid::new_v4());
                match controller
                    .take_screenshot(&temp_path, None, Some(app_name))
                    .await
                {
                    Ok(_) => (temp_path, true),
                    Err(e) => return Ok(format!("❌ Failed to take screenshot: {}", e)),
                }
            }
            None => (path.to_string(), false),
        };

    // Extract text with locations.
    let extraction = controller.extract_text_with_locations(&final_path).await;

    // Clean up the temp screenshot before inspecting the result, so a failed
    // extraction no longer leaves the file behind. Removal is best effort: a
    // cleanup failure must not mask the extraction outcome.
    if is_temp_screenshot {
        let _ = std::fs::remove_file(&final_path);
    }

    match extraction {
        Ok(locations) => match serde_json::to_string_pretty(&locations) {
            Ok(json) => Ok(format!(
                "✅ Extracted {} text elements:\n{}",
                locations.len(),
                json
            )),
            Err(e) => Ok(format!("❌ Failed to serialize results: {}", e)),
        },
        Err(e) => Ok(format!("❌ Failed to extract text: {}", e)),
    }
}