fixed x,y detection in vision click
This commit is contained in:
@@ -64,7 +64,7 @@ impl ComputerController for MacOSController {
|
|||||||
let array = CFArray::<CFDictionary>::wrap_under_create_rule(window_list);
|
let array = CFArray::<CFDictionary>::wrap_under_create_rule(window_list);
|
||||||
let count = array.len();
|
let count = array.len();
|
||||||
|
|
||||||
let mut found_window_id: Option<(u32, String, bool)> = None; // (id, owner, is_exact_match)
|
let mut found_window_id: Option<(u32, String)> = None; // (id, owner)
|
||||||
let app_name_lower = app_name.to_lowercase();
|
let app_name_lower = app_name.to_lowercase();
|
||||||
|
|
||||||
for i in 0..count {
|
for i in 0..count {
|
||||||
@@ -82,31 +82,62 @@ impl ComputerController for MacOSController {
|
|||||||
tracing::debug!("Checking window: owner='{}', looking for '{}'", owner, app_name);
|
tracing::debug!("Checking window: owner='{}', looking for '{}'", owner, app_name);
|
||||||
let owner_lower = owner.to_lowercase();
|
let owner_lower = owner.to_lowercase();
|
||||||
|
|
||||||
// Check for exact match first (case-insensitive)
|
// Normalize by removing spaces for exact matching
|
||||||
let is_exact_match = owner_lower == app_name_lower;
|
let app_name_normalized = app_name_lower.replace(" ", "");
|
||||||
|
let owner_normalized = owner_lower.replace(" ", "");
|
||||||
|
|
||||||
// Check for fuzzy match (either direction contains)
|
// ONLY accept exact matches (case-insensitive, with or without spaces)
|
||||||
let is_fuzzy_match = owner_lower.contains(&app_name_lower) || app_name_lower.contains(&owner_lower);
|
// This prevents "Goose" from matching "GooseStudio"
|
||||||
|
let is_match = owner_lower == app_name_lower || owner_normalized == app_name_normalized;
|
||||||
|
|
||||||
if is_exact_match || is_fuzzy_match {
|
if is_match {
|
||||||
// Get window ID
|
// Get window ID
|
||||||
let window_id_key = CFString::from_static_string("kCGWindowNumber");
|
let window_id_key = CFString::from_static_string("kCGWindowNumber");
|
||||||
if let Some(value) = dict.find(window_id_key.to_void()) {
|
if let Some(value) = dict.find(window_id_key.to_void()) {
|
||||||
let num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*value as *const _);
|
let num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*value as *const _);
|
||||||
if let Some(id) = num.to_i64() {
|
if let Some(id) = num.to_i64() {
|
||||||
tracing::debug!("Found candidate: window ID {} for app '{}' (exact={}, fuzzy={})", id, owner, is_exact_match, is_fuzzy_match);
|
// Get window layer to filter out menu bar windows
|
||||||
|
let layer_key = CFString::from_static_string("kCGWindowLayer");
|
||||||
|
let layer: i32 = if let Some(value) = dict.find(layer_key.to_void()) {
|
||||||
|
let num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*value as *const _);
|
||||||
|
num.to_i32().unwrap_or(0)
|
||||||
|
} else {
|
||||||
|
0
|
||||||
|
};
|
||||||
|
|
||||||
// If we found an exact match, use it immediately
|
// Get window bounds to verify it's a real window
|
||||||
if is_exact_match {
|
let bounds_key = CFString::from_static_string("kCGWindowBounds");
|
||||||
tracing::info!("Found exact match: window ID {} for app '{}'", id, owner);
|
let has_real_bounds = if let Some(value) = dict.find(bounds_key.to_void()) {
|
||||||
found_window_id = Some((id as u32, owner.clone(), true));
|
let bounds_dict: CFDictionary = TCFType::wrap_under_get_rule(*value as *const _);
|
||||||
break;
|
let width_key = CFString::from_static_string("Width");
|
||||||
|
let height_key = CFString::from_static_string("Height");
|
||||||
|
|
||||||
|
if let (Some(w_val), Some(h_val)) = (
|
||||||
|
bounds_dict.find(width_key.to_void()),
|
||||||
|
bounds_dict.find(height_key.to_void()),
|
||||||
|
) {
|
||||||
|
let w_num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*w_val as *const _);
|
||||||
|
let h_num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*h_val as *const _);
|
||||||
|
let width = w_num.to_f64().unwrap_or(0.0);
|
||||||
|
let height = h_num.to_f64().unwrap_or(0.0);
|
||||||
|
// Real windows should be at least 100x100 pixels
|
||||||
|
width >= 100.0 && height >= 100.0
|
||||||
|
} else {
|
||||||
|
false
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
false
|
||||||
|
};
|
||||||
|
|
||||||
// Otherwise, keep the first fuzzy match but continue looking for exact match
|
// Only accept windows that are:
|
||||||
if found_window_id.is_none() {
|
// 1. At layer 0 (normal windows, not menu bar)
|
||||||
tracing::info!("Found fuzzy match: window ID {} for app '{}'", id, owner);
|
// 2. Have real bounds (width and height >= 100)
|
||||||
found_window_id = Some((id as u32, owner.clone(), false));
|
if layer == 0 && has_real_bounds {
|
||||||
|
tracing::info!("Found valid window: ID {} for app '{}' (layer={}, bounds valid)", id, owner, layer);
|
||||||
|
found_window_id = Some((id as u32, owner.clone()));
|
||||||
|
break;
|
||||||
|
} else {
|
||||||
|
tracing::debug!("Skipping window ID {} for '{}': layer={}, has_real_bounds={}", id, owner, layer, has_real_bounds);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -116,15 +147,10 @@ impl ComputerController for MacOSController {
|
|||||||
found_window_id
|
found_window_id
|
||||||
};
|
};
|
||||||
|
|
||||||
let (cg_window_id, matched_owner, is_exact) = cg_window_id.ok_or_else(|| {
|
let (cg_window_id, matched_owner) = cg_window_id.ok_or_else(|| {
|
||||||
anyhow::anyhow!("Could not find window for application '{}'. Use list_windows to see available windows.", app_name)
|
anyhow::anyhow!("Could not find window for application '{}'. Use list_windows to see available windows.", app_name)
|
||||||
})?;
|
})?;
|
||||||
|
|
||||||
if !is_exact {
|
|
||||||
tracing::warn!("Using fuzzy match: requested '{}' but found '{}' (window ID {})", app_name, matched_owner, cg_window_id);
|
|
||||||
} else {
|
|
||||||
tracing::info!("Taking screenshot of window ID {} for app '{}'", cg_window_id, matched_owner);
|
tracing::info!("Taking screenshot of window ID {} for app '{}'", cg_window_id, matched_owner);
|
||||||
}
|
|
||||||
|
|
||||||
// Use screencapture with the window ID for now
|
// Use screencapture with the window ID for now
|
||||||
// TODO: Implement direct CGWindowListCreateImage approach with proper image saving
|
// TODO: Implement direct CGWindowListCreateImage approach with proper image saving
|
||||||
@@ -178,12 +204,18 @@ impl ComputerController for MacOSController {
|
|||||||
async fn find_text_in_app(&self, app_name: &str, search_text: &str) -> Result<Option<TextLocation>> {
|
async fn find_text_in_app(&self, app_name: &str, search_text: &str) -> Result<Option<TextLocation>> {
|
||||||
// Take screenshot of specific app window
|
// Take screenshot of specific app window
|
||||||
let home = std::env::var("HOME").unwrap_or_else(|_| "/tmp".to_string());
|
let home = std::env::var("HOME").unwrap_or_else(|_| "/tmp".to_string());
|
||||||
let temp_path = format!("{}/Desktop/g3_find_text_{}_{}.png", home, app_name, uuid::Uuid::new_v4());
|
let temp_path = format!("{}/tmp/g3_find_text_{}_{}.png", home, app_name, uuid::Uuid::new_v4());
|
||||||
self.take_screenshot(&temp_path, None, Some(app_name)).await?;
|
self.take_screenshot(&temp_path, None, Some(app_name)).await?;
|
||||||
|
|
||||||
|
// Get screenshot dimensions before we delete it
|
||||||
|
let screenshot_dims = get_image_dimensions(&temp_path)?;
|
||||||
|
|
||||||
// Extract all text with locations
|
// Extract all text with locations
|
||||||
let locations = self.extract_text_with_locations(&temp_path).await?;
|
let locations = self.extract_text_with_locations(&temp_path).await?;
|
||||||
|
|
||||||
|
// Get window bounds to calculate coordinate transformation
|
||||||
|
let window_bounds = self.get_window_bounds(app_name)?;
|
||||||
|
|
||||||
// Clean up temp file
|
// Clean up temp file
|
||||||
let _ = std::fs::remove_file(&temp_path);
|
let _ = std::fs::remove_file(&temp_path);
|
||||||
|
|
||||||
@@ -191,7 +223,13 @@ impl ComputerController for MacOSController {
|
|||||||
let search_lower = search_text.to_lowercase();
|
let search_lower = search_text.to_lowercase();
|
||||||
for location in locations {
|
for location in locations {
|
||||||
if location.text.to_lowercase().contains(&search_lower) {
|
if location.text.to_lowercase().contains(&search_lower) {
|
||||||
return Ok(Some(location));
|
// Transform coordinates from screenshot space to screen space
|
||||||
|
let transformed = transform_screenshot_to_screen_coords(
|
||||||
|
location,
|
||||||
|
window_bounds,
|
||||||
|
screenshot_dims,
|
||||||
|
);
|
||||||
|
return Ok(Some(transformed));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -222,44 +260,7 @@ impl ComputerController for MacOSController {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn click_at(&self, x: i32, y: i32, app_name: Option<&str>) -> Result<()> {
|
fn click_at(&self, x: i32, y: i32, _app_name: Option<&str>) -> Result<()> {
|
||||||
// If app_name is provided, get window position and offset coordinates
|
|
||||||
let (global_x, global_y) = if let Some(app) = app_name {
|
|
||||||
// Get window position using AppleScript
|
|
||||||
let script = format!(
|
|
||||||
r#"tell application "{}" to get bounds of window 1"#,
|
|
||||||
app
|
|
||||||
);
|
|
||||||
|
|
||||||
let output = std::process::Command::new("osascript")
|
|
||||||
.arg("-e")
|
|
||||||
.arg(&script)
|
|
||||||
.output()?;
|
|
||||||
|
|
||||||
if output.status.success() {
|
|
||||||
let bounds_str = String::from_utf8_lossy(&output.stdout);
|
|
||||||
// Parse bounds: "x1, y1, x2, y2"
|
|
||||||
let parts: Vec<&str> = bounds_str.trim().split(", ").collect();
|
|
||||||
if parts.len() >= 2 {
|
|
||||||
if let (Ok(window_x), Ok(window_y)) = (
|
|
||||||
parts[0].trim().parse::<i32>(),
|
|
||||||
parts[1].trim().parse::<i32>(),
|
|
||||||
) {
|
|
||||||
// Offset relative coordinates by window position
|
|
||||||
(x + window_x, y + window_y)
|
|
||||||
} else {
|
|
||||||
(x, y) // Fallback to absolute coordinates
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
(x, y) // Fallback to absolute coordinates
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
(x, y) // Fallback to absolute coordinates
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
(x, y) // No app name, use absolute coordinates
|
|
||||||
};
|
|
||||||
|
|
||||||
use core_graphics::event::{
|
use core_graphics::event::{
|
||||||
CGEvent, CGEventTapLocation, CGEventType, CGMouseButton,
|
CGEvent, CGEventTapLocation, CGEventType, CGMouseButton,
|
||||||
};
|
};
|
||||||
@@ -267,12 +268,27 @@ impl ComputerController for MacOSController {
|
|||||||
CGEventSource, CGEventSourceStateID,
|
CGEventSource, CGEventSourceStateID,
|
||||||
};
|
};
|
||||||
use core_graphics::geometry::CGPoint;
|
use core_graphics::geometry::CGPoint;
|
||||||
|
use core_graphics::display::CGDisplay;
|
||||||
|
|
||||||
|
// IMPORTANT: Coordinates passed here are in NSScreen/CGWindowListCopyWindowInfo space
|
||||||
|
// (Y=0 at BOTTOM, increases UPWARD)
|
||||||
|
// But CGEvent uses a different coordinate system (Y=0 at TOP, increases DOWNWARD)
|
||||||
|
// We need to convert: CGEvent.y = screenHeight - NSScreen.y
|
||||||
|
|
||||||
|
let screen_height = CGDisplay::main().pixels_high() as i32;
|
||||||
|
let cgevent_x = x;
|
||||||
|
let cgevent_y = screen_height - y;
|
||||||
|
|
||||||
|
tracing::debug!("click_at: NSScreen coords ({}, {}) -> CGEvent coords ({}, {}) [screen_height={}]",
|
||||||
|
x, y, cgevent_x, cgevent_y, screen_height);
|
||||||
|
|
||||||
|
let (global_x, global_y) = (cgevent_x, cgevent_y);
|
||||||
|
|
||||||
|
let point = CGPoint::new(global_x as f64, global_y as f64);
|
||||||
|
|
||||||
let source = CGEventSource::new(CGEventSourceStateID::HIDSystemState)
|
let source = CGEventSource::new(CGEventSourceStateID::HIDSystemState)
|
||||||
.ok().context("Failed to create event source")?;
|
.ok().context("Failed to create event source")?;
|
||||||
|
|
||||||
let point = CGPoint::new(global_x as f64, global_y as f64);
|
|
||||||
|
|
||||||
// Move mouse to position first
|
// Move mouse to position first
|
||||||
let move_event = CGEvent::new_mouse_event(
|
let move_event = CGEvent::new_mouse_event(
|
||||||
source.clone(),
|
source.clone(),
|
||||||
@@ -307,3 +323,185 @@ impl ComputerController for MacOSController {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl MacOSController {
|
||||||
|
/// Get window bounds for an application (helper method)
|
||||||
|
fn get_window_bounds(&self, app_name: &str) -> Result<(i32, i32, i32, i32)> {
|
||||||
|
unsafe {
|
||||||
|
let window_list = CGWindowListCopyWindowInfo(
|
||||||
|
kCGWindowListOptionOnScreenOnly,
|
||||||
|
kCGNullWindowID
|
||||||
|
);
|
||||||
|
|
||||||
|
let array = CFArray::<CFDictionary>::wrap_under_create_rule(window_list);
|
||||||
|
let count = array.len();
|
||||||
|
|
||||||
|
let app_name_lower = app_name.to_lowercase();
|
||||||
|
|
||||||
|
for i in 0..count {
|
||||||
|
let dict = array.get(i).unwrap();
|
||||||
|
|
||||||
|
// Get owner name
|
||||||
|
let owner_key = CFString::from_static_string("kCGWindowOwnerName");
|
||||||
|
let owner: String = if let Some(value) = dict.find(owner_key.to_void()) {
|
||||||
|
let s: CFString = TCFType::wrap_under_get_rule(*value as *const _);
|
||||||
|
s.to_string()
|
||||||
|
} else {
|
||||||
|
continue;
|
||||||
|
};
|
||||||
|
|
||||||
|
let owner_lower = owner.to_lowercase();
|
||||||
|
|
||||||
|
// Normalize by removing spaces for exact matching
|
||||||
|
let app_name_normalized = app_name_lower.replace(" ", "");
|
||||||
|
let owner_normalized = owner_lower.replace(" ", "");
|
||||||
|
|
||||||
|
// ONLY accept exact matches (case-insensitive, with or without spaces)
|
||||||
|
// This prevents "Goose" from matching "GooseStudio"
|
||||||
|
let is_match = owner_lower == app_name_lower || owner_normalized == app_name_normalized;
|
||||||
|
|
||||||
|
if is_match {
|
||||||
|
// Get window layer to filter out menu bar windows
|
||||||
|
let layer_key = CFString::from_static_string("kCGWindowLayer");
|
||||||
|
let layer: i32 = if let Some(value) = dict.find(layer_key.to_void()) {
|
||||||
|
let num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*value as *const _);
|
||||||
|
num.to_i32().unwrap_or(0)
|
||||||
|
} else {
|
||||||
|
0
|
||||||
|
};
|
||||||
|
|
||||||
|
// Skip menu bar windows (layer >= 20)
|
||||||
|
if layer >= 20 {
|
||||||
|
tracing::debug!("Skipping window for '{}' at layer {} (menu bar)", owner, layer);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get window bounds to verify it's a real window
|
||||||
|
let bounds_key = CFString::from_static_string("kCGWindowBounds");
|
||||||
|
if let Some(value) = dict.find(bounds_key.to_void()) {
|
||||||
|
let bounds_dict: CFDictionary = TCFType::wrap_under_get_rule(*value as *const _);
|
||||||
|
|
||||||
|
let x_key = CFString::from_static_string("X");
|
||||||
|
let y_key = CFString::from_static_string("Y");
|
||||||
|
let width_key = CFString::from_static_string("Width");
|
||||||
|
let height_key = CFString::from_static_string("Height");
|
||||||
|
|
||||||
|
if let (Some(x_val), Some(y_val), Some(w_val), Some(h_val)) = (
|
||||||
|
bounds_dict.find(x_key.to_void()),
|
||||||
|
bounds_dict.find(y_key.to_void()),
|
||||||
|
bounds_dict.find(width_key.to_void()),
|
||||||
|
bounds_dict.find(height_key.to_void()),
|
||||||
|
) {
|
||||||
|
let x_num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*x_val as *const _);
|
||||||
|
let y_num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*y_val as *const _);
|
||||||
|
let w_num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*w_val as *const _);
|
||||||
|
let h_num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*h_val as *const _);
|
||||||
|
|
||||||
|
let x: i32 = x_num.to_i64().unwrap_or(0) as i32;
|
||||||
|
let y: i32 = y_num.to_i64().unwrap_or(0) as i32;
|
||||||
|
let w: i32 = w_num.to_i64().unwrap_or(0) as i32;
|
||||||
|
let h: i32 = h_num.to_i64().unwrap_or(0) as i32;
|
||||||
|
|
||||||
|
// Only accept windows with real bounds (>= 100x100 pixels)
|
||||||
|
if w >= 100 && h >= 100 {
|
||||||
|
tracing::info!("Found valid window bounds for '{}': x={}, y={}, w={}, h={} (layer={})", owner, x, y, w, h, layer);
|
||||||
|
return Ok((x, y, w, h));
|
||||||
|
} else {
|
||||||
|
tracing::debug!("Skipping window for '{}': too small ({}x{})", owner, w, h);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Err(anyhow::anyhow!("Could not find window bounds for '{}'", app_name))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get image dimensions from a PNG file
|
||||||
|
fn get_image_dimensions(path: &str) -> Result<(i32, i32)> {
|
||||||
|
use std::fs::File;
|
||||||
|
use std::io::Read;
|
||||||
|
|
||||||
|
let mut file = File::open(path)?;
|
||||||
|
let mut buffer = vec![0u8; 24];
|
||||||
|
file.read_exact(&mut buffer)?;
|
||||||
|
|
||||||
|
// PNG signature check
|
||||||
|
if &buffer[0..8] != b"\x89PNG\r\n\x1a\n" {
|
||||||
|
anyhow::bail!("Not a valid PNG file");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Read IHDR chunk (width and height are at bytes 16-23)
|
||||||
|
let width = u32::from_be_bytes([buffer[16], buffer[17], buffer[18], buffer[19]]) as i32;
|
||||||
|
let height = u32::from_be_bytes([buffer[20], buffer[21], buffer[22], buffer[23]]) as i32;
|
||||||
|
|
||||||
|
Ok((width, height))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Transform coordinates from screenshot space to screen space
|
||||||
|
///
|
||||||
|
/// The screenshot is taken of a window, and Vision OCR returns coordinates
|
||||||
|
/// relative to the screenshot image. We need to transform these to actual
|
||||||
|
/// screen coordinates for clicking.
|
||||||
|
///
|
||||||
|
/// On Retina displays, screenshots are taken at 2x resolution, so we need
|
||||||
|
/// to account for this scaling factor.
|
||||||
|
fn transform_screenshot_to_screen_coords(
|
||||||
|
location: TextLocation,
|
||||||
|
window_bounds: (i32, i32, i32, i32), // (x, y, width, height) in screen space
|
||||||
|
screenshot_dims: (i32, i32), // (width, height) in pixels
|
||||||
|
) -> TextLocation {
|
||||||
|
let (win_x, win_y, win_width, win_height) = window_bounds;
|
||||||
|
let (screenshot_width, screenshot_height) = screenshot_dims;
|
||||||
|
|
||||||
|
// Calculate scale factors
|
||||||
|
// On Retina displays, screenshot is typically 2x the window size
|
||||||
|
let scale_x = win_width as f64 / screenshot_width as f64;
|
||||||
|
let scale_y = win_height as f64 / screenshot_height as f64;
|
||||||
|
|
||||||
|
tracing::debug!("Transform: screenshot={}x{}, window={}x{} at ({},{}), scale=({:.2},{:.2})",
|
||||||
|
screenshot_width, screenshot_height, win_width, win_height, win_x, win_y, scale_x, scale_y);
|
||||||
|
|
||||||
|
// Transform coordinates from image space to screen space
|
||||||
|
// IMPORTANT: macOS screen coordinates have origin at BOTTOM-LEFT (Y increases upward)
|
||||||
|
// Image coordinates have origin at TOP-LEFT (Y increases downward)
|
||||||
|
// win_y is the BOTTOM of the window in screen coordinates
|
||||||
|
// So we need to: (win_y + win_height) to get window TOP, then subtract screenshot_y
|
||||||
|
let window_top_y = win_y + win_height;
|
||||||
|
|
||||||
|
tracing::debug!("[transform] Input location in image space: x={}, y={}, width={}, height={}",
|
||||||
|
location.x, location.y, location.width, location.height);
|
||||||
|
tracing::debug!("[transform] Scale factors: scale_x={:.4}, scale_y={:.4}", scale_x, scale_y);
|
||||||
|
|
||||||
|
let transformed_x = win_x + (location.x as f64 * scale_x) as i32;
|
||||||
|
let transformed_y = window_top_y - (location.y as f64 * scale_y) as i32;
|
||||||
|
let transformed_width = (location.width as f64 * scale_x) as i32;
|
||||||
|
let transformed_height = (location.height as f64 * scale_y) as i32;
|
||||||
|
|
||||||
|
tracing::debug!("[transform] Calculation details:");
|
||||||
|
tracing::debug!(" - transformed_x = {} + ({} * {:.4}) = {} + {:.2} = {}", win_x, location.x, scale_x, win_x, location.x as f64 * scale_x, transformed_x);
|
||||||
|
tracing::debug!(" - transformed_width = ({} * {:.4}) = {:.2} -> {}", location.width, scale_x, location.width as f64 * scale_x, transformed_width);
|
||||||
|
tracing::debug!(" - transformed_height = ({} * {:.4}) = {:.2} -> {}", location.height, scale_y, location.height as f64 * scale_y, transformed_height);
|
||||||
|
|
||||||
|
tracing::debug!("Transformed location: screenshot=({},{}) {}x{} -> screen=({},{}) {}x{}",
|
||||||
|
location.x, location.y, location.width, location.height,
|
||||||
|
transformed_x, transformed_y, transformed_width, transformed_height);
|
||||||
|
|
||||||
|
TextLocation {
|
||||||
|
text: location.text,
|
||||||
|
x: transformed_x,
|
||||||
|
y: transformed_y,
|
||||||
|
width: transformed_width,
|
||||||
|
height: transformed_height,
|
||||||
|
confidence: location.confidence,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[path = "macos_window_matching_test.rs"]
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests;
|
||||||
@@ -0,0 +1,45 @@
|
|||||||
|
#[cfg(test)]
mod window_matching_tests {
    /// Owner-name comparison equivalent to the rule used when looking up
    /// application windows: the names must be equal case-insensitively,
    /// either as written or after removing all spaces. Substring ("fuzzy")
    /// matches are not accepted.
    pub(crate) fn names_match(requested: &str, owner: &str) -> bool {
        let requested_lower = requested.to_lowercase();
        let owner_lower = owner.to_lowercase();

        owner_lower == requested_lower
            || owner_lower.replace(" ", "") == requested_lower.replace(" ", "")
    }

    /// Test that window name matching handles spaces correctly.
    ///
    /// Issue: When a user requests a screenshot of "Goose Studio" but the actual
    /// application name is "GooseStudio" (no space), the window should still be
    /// found, because both names are normalized by removing spaces before
    /// comparing.
    ///
    /// Partial names must NOT match: "Goose" must not select a "GooseStudio"
    /// window, so the negative cases below guard against substring matching.
    #[test]
    fn test_space_normalization() {
        let test_cases = [
            // (user_input, actual_app_name, should_match)
            ("Goose Studio", "GooseStudio", true),
            ("GooseStudio", "Goose Studio", true),
            ("Visual Studio Code", "VisualStudioCode", true),
            ("Google Chrome", "Google Chrome", true),
            ("Safari", "Safari", true),
            ("safari", "Safari", true), // case-insensitive
            ("Goose", "GooseStudio", false), // prefix only
            ("iTerm", "iTerm2", false), // prefix only
            ("Code", "Visual Studio Code", false), // substring only
        ];

        for &(user_input, app_name, should_match) in test_cases.iter() {
            let matches = names_match(user_input, app_name);

            assert_eq!(
                matches, should_match,
                "Expected '{}' vs '{}' to match={}, but got match={}",
                user_input, app_name, should_match, matches
            );
        }
    }
}
|
||||||
@@ -483,8 +483,8 @@ Format this as a detailed but concise summary that can be used to resume the con
|
|||||||
if matches!(message.role, MessageRole::User) && message.content.starts_with("Tool result:") {
|
if matches!(message.role, MessageRole::User) && message.content.starts_with("Tool result:") {
|
||||||
let content_len = message.content.len();
|
let content_len = message.content.len();
|
||||||
|
|
||||||
// Only thin if the content is greater than 1000 chars
|
// Only thin if the content is greater than 500 chars
|
||||||
if content_len > 1000 {
|
if content_len > 500 {
|
||||||
// Generate a unique filename based on timestamp and index
|
// Generate a unique filename based on timestamp and index
|
||||||
let timestamp = std::time::SystemTime::now()
|
let timestamp = std::time::SystemTime::now()
|
||||||
.duration_since(std::time::UNIX_EPOCH)
|
.duration_since(std::time::UNIX_EPOCH)
|
||||||
@@ -541,8 +541,8 @@ Format this as a detailed but concise summary that can be used to resume the con
|
|||||||
.map(|s| (s.to_string(), s.len()));
|
.map(|s| (s.to_string(), s.len()));
|
||||||
|
|
||||||
if let Some((content_str, content_len)) = content_info {
|
if let Some((content_str, content_len)) = content_info {
|
||||||
// Only thin if content is greater than 1000 chars
|
// Only thin if content is greater than 500 chars
|
||||||
if content_len > 1000 {
|
if content_len > 500 {
|
||||||
let timestamp = std::time::SystemTime::now()
|
let timestamp = std::time::SystemTime::now()
|
||||||
.duration_since(std::time::UNIX_EPOCH)
|
.duration_since(std::time::UNIX_EPOCH)
|
||||||
.unwrap_or_default()
|
.unwrap_or_default()
|
||||||
@@ -574,8 +574,8 @@ Format this as a detailed but concise summary that can be used to resume the con
|
|||||||
.map(|s| (s.to_string(), s.len()));
|
.map(|s| (s.to_string(), s.len()));
|
||||||
|
|
||||||
if let Some((diff_str, diff_len)) = diff_info {
|
if let Some((diff_str, diff_len)) = diff_info {
|
||||||
// Only thin if diff is greater than 1000 chars
|
// Only thin if diff is greater than 500 chars
|
||||||
if diff_len > 1000 {
|
if diff_len > 500 {
|
||||||
let timestamp = std::time::SystemTime::now()
|
let timestamp = std::time::SystemTime::now()
|
||||||
.duration_since(std::time::UNIX_EPOCH)
|
.duration_since(std::time::UNIX_EPOCH)
|
||||||
.unwrap_or_default()
|
.unwrap_or_default()
|
||||||
@@ -2080,132 +2080,6 @@ Template:
|
|||||||
"required": ["app_name"]
|
"required": ["app_name"]
|
||||||
}),
|
}),
|
||||||
},
|
},
|
||||||
Tool {
|
|
||||||
name: "macax_get_ui_tree".to_string(),
|
|
||||||
description: "Get the UI element hierarchy of an application as a tree structure".to_string(),
|
|
||||||
input_schema: json!({
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"app_name": {
|
|
||||||
"type": "string",
|
|
||||||
"description": "Name of the application"
|
|
||||||
},
|
|
||||||
"max_depth": {
|
|
||||||
"type": "integer",
|
|
||||||
"description": "Maximum depth to traverse (default: 3)"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": ["app_name"]
|
|
||||||
}),
|
|
||||||
},
|
|
||||||
Tool {
|
|
||||||
name: "macax_find_elements".to_string(),
|
|
||||||
description: "Find UI elements in an application by role, title, or identifier. Use this to locate buttons, text fields, etc.".to_string(),
|
|
||||||
input_schema: json!({
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"app_name": {
|
|
||||||
"type": "string",
|
|
||||||
"description": "Name of the application"
|
|
||||||
},
|
|
||||||
"role": {
|
|
||||||
"type": "string",
|
|
||||||
"description": "UI element role (e.g., 'button', 'text field', 'window')"
|
|
||||||
},
|
|
||||||
"title": {
|
|
||||||
"type": "string",
|
|
||||||
"description": "Element title or label to match"
|
|
||||||
},
|
|
||||||
"identifier": {
|
|
||||||
"type": "string",
|
|
||||||
"description": "Element identifier (accessibility identifier)"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": ["app_name"]
|
|
||||||
}),
|
|
||||||
},
|
|
||||||
Tool {
|
|
||||||
name: "macax_click".to_string(),
|
|
||||||
description: "Click a UI element in an application".to_string(),
|
|
||||||
input_schema: json!({
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"app_name": {
|
|
||||||
"type": "string",
|
|
||||||
"description": "Name of the application"
|
|
||||||
},
|
|
||||||
"role": {
|
|
||||||
"type": "string",
|
|
||||||
"description": "UI element role (e.g., 'button')"
|
|
||||||
},
|
|
||||||
"title": {
|
|
||||||
"type": "string",
|
|
||||||
"description": "Element title or label"
|
|
||||||
},
|
|
||||||
"identifier": {
|
|
||||||
"type": "string",
|
|
||||||
"description": "Element identifier"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": ["app_name", "role"]
|
|
||||||
}),
|
|
||||||
},
|
|
||||||
Tool {
|
|
||||||
name: "macax_set_value".to_string(),
|
|
||||||
description: "Set the value of a UI element (e.g., type into a text field)".to_string(),
|
|
||||||
input_schema: json!({
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"app_name": {
|
|
||||||
"type": "string",
|
|
||||||
"description": "Name of the application"
|
|
||||||
},
|
|
||||||
"role": {
|
|
||||||
"type": "string",
|
|
||||||
"description": "UI element role (e.g., 'text field')"
|
|
||||||
},
|
|
||||||
"value": {
|
|
||||||
"type": "string",
|
|
||||||
"description": "Value to set"
|
|
||||||
},
|
|
||||||
"title": {
|
|
||||||
"type": "string",
|
|
||||||
"description": "Element title or label"
|
|
||||||
},
|
|
||||||
"identifier": {
|
|
||||||
"type": "string",
|
|
||||||
"description": "Element identifier"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": ["app_name", "role", "value"]
|
|
||||||
}),
|
|
||||||
},
|
|
||||||
Tool {
|
|
||||||
name: "macax_get_value".to_string(),
|
|
||||||
description: "Get the value of a UI element (e.g., read text from a text field)".to_string(),
|
|
||||||
input_schema: json!({
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"app_name": {
|
|
||||||
"type": "string",
|
|
||||||
"description": "Name of the application"
|
|
||||||
},
|
|
||||||
"role": {
|
|
||||||
"type": "string",
|
|
||||||
"description": "UI element role (e.g., 'text field')"
|
|
||||||
},
|
|
||||||
"title": {
|
|
||||||
"type": "string",
|
|
||||||
"description": "Element title or label"
|
|
||||||
},
|
|
||||||
"identifier": {
|
|
||||||
"type": "string",
|
|
||||||
"description": "Element identifier"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": ["app_name", "role"]
|
|
||||||
}),
|
|
||||||
},
|
|
||||||
Tool {
|
Tool {
|
||||||
name: "macax_press_key".to_string(),
|
name: "macax_press_key".to_string(),
|
||||||
description: "Press a keyboard key or shortcut in an application (e.g., Cmd+S to save)".to_string(),
|
description: "Press a keyboard key or shortcut in an application (e.g., Cmd+S to save)".to_string(),
|
||||||
@@ -2253,21 +2127,6 @@ Template:
|
|||||||
}),
|
}),
|
||||||
});
|
});
|
||||||
|
|
||||||
// Add focus_element tool
|
|
||||||
tools.push(Tool {
|
|
||||||
name: "macax_focus_element".to_string(),
|
|
||||||
description: "Focus on a UI element (text field, text area, etc.) before typing".to_string(),
|
|
||||||
input_schema: json!({
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"app_name": {"type": "string", "description": "Name of the application"},
|
|
||||||
"role": {"type": "string", "description": "UI element role (e.g., 'text field', 'text area')"},
|
|
||||||
"title": {"type": "string", "description": "Element title or label (optional)"},
|
|
||||||
"identifier": {"type": "string", "description": "Element accessibility identifier (optional)"}
|
|
||||||
},
|
|
||||||
"required": ["app_name", "role"]
|
|
||||||
}),
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Add extract_text_with_boxes tool (requires macax flag)
|
// Add extract_text_with_boxes tool (requires macax flag)
|
||||||
@@ -4323,168 +4182,6 @@ Template:
|
|||||||
Err(e) => Ok(format!("❌ Failed to activate app: {}", e)),
|
Err(e) => Ok(format!("❌ Failed to activate app: {}", e)),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
"macax_get_ui_tree" => {
|
|
||||||
debug!("Processing macax_get_ui_tree tool call");
|
|
||||||
|
|
||||||
if !self.config.macax.enabled {
|
|
||||||
return Ok("❌ macOS Accessibility is not enabled. Use --macax flag to enable.".to_string());
|
|
||||||
}
|
|
||||||
|
|
||||||
let app_name = match tool_call.args.get("app_name").and_then(|v| v.as_str()) {
|
|
||||||
Some(n) => n,
|
|
||||||
None => return Ok("❌ Missing app_name argument".to_string()),
|
|
||||||
};
|
|
||||||
|
|
||||||
let max_depth = tool_call.args.get("max_depth")
|
|
||||||
.and_then(|v| v.as_u64())
|
|
||||||
.map(|n| n as usize)
|
|
||||||
.unwrap_or(3);
|
|
||||||
|
|
||||||
let controller_guard = self.macax_controller.read().await;
|
|
||||||
let controller = match controller_guard.as_ref() {
|
|
||||||
Some(c) => c,
|
|
||||||
None => return Ok("❌ macOS Accessibility controller not initialized.".to_string()),
|
|
||||||
};
|
|
||||||
|
|
||||||
match controller.get_ui_tree(app_name, max_depth) {
|
|
||||||
Ok(tree) => Ok(tree),
|
|
||||||
Err(e) => Ok(format!("❌ Failed to get UI tree: {}", e)),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
"macax_find_elements" => {
|
|
||||||
debug!("Processing macax_find_elements tool call");
|
|
||||||
|
|
||||||
if !self.config.macax.enabled {
|
|
||||||
return Ok("❌ macOS Accessibility is not enabled. Use --macax flag to enable.".to_string());
|
|
||||||
}
|
|
||||||
|
|
||||||
let app_name = match tool_call.args.get("app_name").and_then(|v| v.as_str()) {
|
|
||||||
Some(n) => n,
|
|
||||||
None => return Ok("❌ Missing app_name argument".to_string()),
|
|
||||||
};
|
|
||||||
|
|
||||||
let role = tool_call.args.get("role").and_then(|v| v.as_str());
|
|
||||||
let title = tool_call.args.get("title").and_then(|v| v.as_str());
|
|
||||||
let identifier = tool_call.args.get("identifier").and_then(|v| v.as_str());
|
|
||||||
|
|
||||||
let controller_guard = self.macax_controller.read().await;
|
|
||||||
let controller = match controller_guard.as_ref() {
|
|
||||||
Some(c) => c,
|
|
||||||
None => return Ok("❌ macOS Accessibility controller not initialized.".to_string()),
|
|
||||||
};
|
|
||||||
|
|
||||||
match controller.find_elements(app_name, role, title, identifier) {
|
|
||||||
Ok(elements) => {
|
|
||||||
if elements.is_empty() {
|
|
||||||
Ok("No elements found matching criteria".to_string())
|
|
||||||
} else {
|
|
||||||
let element_strs: Vec<String> = elements.iter()
|
|
||||||
.map(|e| e.to_string())
|
|
||||||
.collect();
|
|
||||||
Ok(format!("Found {} element(s):\n{}", elements.len(), element_strs.join("\n")))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Err(e) => Ok(format!("❌ Failed to find elements: {}", e)),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
"macax_click" => {
|
|
||||||
debug!("Processing macax_click tool call");
|
|
||||||
|
|
||||||
if !self.config.macax.enabled {
|
|
||||||
return Ok("❌ macOS Accessibility is not enabled. Use --macax flag to enable.".to_string());
|
|
||||||
}
|
|
||||||
|
|
||||||
let app_name = match tool_call.args.get("app_name").and_then(|v| v.as_str()) {
|
|
||||||
Some(n) => n,
|
|
||||||
None => return Ok("❌ Missing app_name argument".to_string()),
|
|
||||||
};
|
|
||||||
|
|
||||||
let role = match tool_call.args.get("role").and_then(|v| v.as_str()) {
|
|
||||||
Some(r) => r,
|
|
||||||
None => return Ok("❌ Missing role argument".to_string()),
|
|
||||||
};
|
|
||||||
|
|
||||||
let title = tool_call.args.get("title").and_then(|v| v.as_str());
|
|
||||||
let identifier = tool_call.args.get("identifier").and_then(|v| v.as_str());
|
|
||||||
|
|
||||||
let controller_guard = self.macax_controller.read().await;
|
|
||||||
let controller = match controller_guard.as_ref() {
|
|
||||||
Some(c) => c,
|
|
||||||
None => return Ok("❌ macOS Accessibility controller not initialized.".to_string()),
|
|
||||||
};
|
|
||||||
|
|
||||||
match controller.click_element(app_name, role, title, identifier) {
|
|
||||||
Ok(_) => Ok(format!("✅ Clicked {} element", role)),
|
|
||||||
Err(e) => Ok(format!("❌ Failed to click element: {}", e)),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
"macax_set_value" => {
|
|
||||||
debug!("Processing macax_set_value tool call");
|
|
||||||
|
|
||||||
if !self.config.macax.enabled {
|
|
||||||
return Ok("❌ macOS Accessibility is not enabled. Use --macax flag to enable.".to_string());
|
|
||||||
}
|
|
||||||
|
|
||||||
let app_name = match tool_call.args.get("app_name").and_then(|v| v.as_str()) {
|
|
||||||
Some(n) => n,
|
|
||||||
None => return Ok("❌ Missing app_name argument".to_string()),
|
|
||||||
};
|
|
||||||
|
|
||||||
let role = match tool_call.args.get("role").and_then(|v| v.as_str()) {
|
|
||||||
Some(r) => r,
|
|
||||||
None => return Ok("❌ Missing role argument".to_string()),
|
|
||||||
};
|
|
||||||
|
|
||||||
let value = match tool_call.args.get("value").and_then(|v| v.as_str()) {
|
|
||||||
Some(v) => v,
|
|
||||||
None => return Ok("❌ Missing value argument".to_string()),
|
|
||||||
};
|
|
||||||
|
|
||||||
let title = tool_call.args.get("title").and_then(|v| v.as_str());
|
|
||||||
let identifier = tool_call.args.get("identifier").and_then(|v| v.as_str());
|
|
||||||
|
|
||||||
let controller_guard = self.macax_controller.read().await;
|
|
||||||
let controller = match controller_guard.as_ref() {
|
|
||||||
Some(c) => c,
|
|
||||||
None => return Ok("❌ macOS Accessibility controller not initialized.".to_string()),
|
|
||||||
};
|
|
||||||
|
|
||||||
match controller.set_value(app_name, role, value, title, identifier) {
|
|
||||||
Ok(_) => Ok(format!("✅ Set value of {} element to: {}", role, value)),
|
|
||||||
Err(e) => Ok(format!("❌ Failed to set value: {}", e)),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
"macax_get_value" => {
|
|
||||||
debug!("Processing macax_get_value tool call");
|
|
||||||
|
|
||||||
if !self.config.macax.enabled {
|
|
||||||
return Ok("❌ macOS Accessibility is not enabled. Use --macax flag to enable.".to_string());
|
|
||||||
}
|
|
||||||
|
|
||||||
let app_name = match tool_call.args.get("app_name").and_then(|v| v.as_str()) {
|
|
||||||
Some(n) => n,
|
|
||||||
None => return Ok("❌ Missing app_name argument".to_string()),
|
|
||||||
};
|
|
||||||
|
|
||||||
let role = match tool_call.args.get("role").and_then(|v| v.as_str()) {
|
|
||||||
Some(r) => r,
|
|
||||||
None => return Ok("❌ Missing role argument".to_string()),
|
|
||||||
};
|
|
||||||
|
|
||||||
let title = tool_call.args.get("title").and_then(|v| v.as_str());
|
|
||||||
let identifier = tool_call.args.get("identifier").and_then(|v| v.as_str());
|
|
||||||
|
|
||||||
let controller_guard = self.macax_controller.read().await;
|
|
||||||
let controller = match controller_guard.as_ref() {
|
|
||||||
Some(c) => c,
|
|
||||||
None => return Ok("❌ macOS Accessibility controller not initialized.".to_string()),
|
|
||||||
};
|
|
||||||
|
|
||||||
match controller.get_value(app_name, role, title, identifier) {
|
|
||||||
Ok(value) => Ok(format!("Value: {}", value)),
|
|
||||||
Err(e) => Ok(format!("❌ Failed to get value: {}", e)),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
"macax_press_key" => {
|
"macax_press_key" => {
|
||||||
debug!("Processing macax_press_key tool call");
|
debug!("Processing macax_press_key tool call");
|
||||||
|
|
||||||
@@ -4555,37 +4252,6 @@ Template:
|
|||||||
Err(e) => Ok(format!("❌ Failed to type text: {}", e)),
|
Err(e) => Ok(format!("❌ Failed to type text: {}", e)),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
"macax_focus_element" => {
|
|
||||||
debug!("Processing macax_focus_element tool call");
|
|
||||||
|
|
||||||
if !self.config.macax.enabled {
|
|
||||||
return Ok("❌ macOS Accessibility is not enabled. Use --macax flag to enable.".to_string());
|
|
||||||
}
|
|
||||||
|
|
||||||
let app_name = match tool_call.args.get("app_name").and_then(|v| v.as_str()) {
|
|
||||||
Some(n) => n,
|
|
||||||
None => return Ok("❌ Missing app_name argument".to_string()),
|
|
||||||
};
|
|
||||||
|
|
||||||
let role = match tool_call.args.get("role").and_then(|v| v.as_str()) {
|
|
||||||
Some(r) => r,
|
|
||||||
None => return Ok("❌ Missing role argument".to_string()),
|
|
||||||
};
|
|
||||||
|
|
||||||
let title = tool_call.args.get("title").and_then(|v| v.as_str());
|
|
||||||
let identifier = tool_call.args.get("identifier").and_then(|v| v.as_str());
|
|
||||||
|
|
||||||
let controller_guard = self.macax_controller.read().await;
|
|
||||||
let controller = match controller_guard.as_ref() {
|
|
||||||
Some(c) => c,
|
|
||||||
None => return Ok("❌ macOS Accessibility controller not initialized.".to_string()),
|
|
||||||
};
|
|
||||||
|
|
||||||
match controller.focus_element(app_name, role, title, identifier) {
|
|
||||||
Ok(_) => Ok(format!("✅ Focused {} element in {}", role, app_name)),
|
|
||||||
Err(e) => Ok(format!("❌ Failed to focus element: {}", e)),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
"vision_find_text" => {
|
"vision_find_text" => {
|
||||||
debug!("Processing vision_find_text tool call");
|
debug!("Processing vision_find_text tool call");
|
||||||
|
|
||||||
@@ -4628,11 +4294,34 @@ Template:
|
|||||||
match controller.find_text_in_app(app_name, text).await {
|
match controller.find_text_in_app(app_name, text).await {
|
||||||
Ok(Some(location)) => {
|
Ok(Some(location)) => {
|
||||||
// Click on center of text
|
// Click on center of text
|
||||||
let center_x = location.x + location.width / 2;
|
// IMPORTANT: location coordinates are in NSScreen space (Y=0 at BOTTOM, increases UPWARD)
|
||||||
let center_y = location.y + location.height / 2;
|
// location.x is the LEFT edge of the bounding box
|
||||||
|
// location.y is the TOP edge of the bounding box (highest Y value in NSScreen space)
|
||||||
|
// location.width and location.height are already scaled to screen space
|
||||||
|
// To get center: we need to add half the SCALED width and subtract half the SCALED height
|
||||||
|
|
||||||
match controller.click_at(center_x, center_y, Some(app_name)) {
|
if location.width == 0 || location.height == 0 {
|
||||||
Ok(_) => Ok(format!("✅ Clicked on '{}' in {} at ({}, {})", text, app_name, center_x, center_y)),
|
return Ok(format!("❌ Invalid bounding box dimensions: width={}, height={}", location.width, location.height));
|
||||||
|
}
|
||||||
|
|
||||||
|
debug!("[vision_click_text] Location from find_text_in_app: x={}, y={}, width={}, height={}, text='{}'",
|
||||||
|
location.x, location.y, location.width, location.height, location.text);
|
||||||
|
|
||||||
|
// Calculate center using the SCALED dimensions
|
||||||
|
// X: Use right edge instead of center (Vision OCR bounding box seems offset)
|
||||||
|
// This gives us: left edge + full width = right edge
|
||||||
|
// Y: top edge - half of scaled height (subtract because Y increases upward)
|
||||||
|
let click_x = location.x + location.width; // Right edge
|
||||||
|
let half_height = location.height / 2;
|
||||||
|
let click_y = location.y - half_height;
|
||||||
|
|
||||||
|
debug!("[vision_click_text] Click position calculation: x={} + {} = {} (right edge), y={} - {} = {}",
|
||||||
|
location.x, location.width, click_x, location.y, half_height, click_y);
|
||||||
|
debug!("[vision_click_text] This means: left_edge={}, center={}, right_edge={}",
|
||||||
|
location.x, click_x, location.x + location.width);
|
||||||
|
|
||||||
|
match controller.click_at(click_x, click_y, Some(app_name)) {
|
||||||
|
Ok(_) => Ok(format!("✅ Clicked on '{}' in {} at ({}, {})", text, app_name, click_x, click_y)),
|
||||||
Err(e) => Ok(format!("❌ Failed to click: {}", e)),
|
Err(e) => Ok(format!("❌ Failed to click: {}", e)),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -4709,13 +4398,15 @@ Template:
|
|||||||
match controller.find_text_in_app(app_name, text).await {
|
match controller.find_text_in_app(app_name, text).await {
|
||||||
Ok(Some(location)) => {
|
Ok(Some(location)) => {
|
||||||
// Calculate click position based on direction
|
// Calculate click position based on direction
|
||||||
|
// location.x is LEFT edge, location.y is TOP edge (in NSScreen space)
|
||||||
let (click_x, click_y) = match direction {
|
let (click_x, click_y) = match direction {
|
||||||
"right" => (location.x + location.width + distance, location.y + location.height / 2),
|
"right" => (location.x + location.width + distance, location.y - (location.height / 2)),
|
||||||
"below" => (location.x + location.width / 2, location.y + location.height + distance),
|
"below" => (location.x + (location.width / 2), location.y - location.height - distance),
|
||||||
"left" => (location.x - distance, location.y + location.height / 2),
|
"left" => (location.x - distance, location.y - (location.height / 2)),
|
||||||
"above" => (location.x + location.width / 2, location.y - distance),
|
"above" => (location.x + (location.width / 2), location.y + distance),
|
||||||
_ => (location.x + location.width + distance, location.y + location.height / 2),
|
_ => (location.x + location.width + distance, location.y - (location.height / 2)),
|
||||||
};
|
};
|
||||||
|
debug!("[vision_click_near_text] Clicking {} of text at ({}, {})", direction, click_x, click_y);
|
||||||
|
|
||||||
match controller.click_at(click_x, click_y, Some(app_name)) {
|
match controller.click_at(click_x, click_y, Some(app_name)) {
|
||||||
Ok(_) => Ok(format!(
|
Ok(_) => Ok(format!(
|
||||||
|
|||||||
@@ -166,6 +166,31 @@ impl CodeExecutor {
|
|||||||
|
|
||||||
/// Execute Bash code
|
/// Execute Bash code
|
||||||
async fn execute_bash(&self, code: &str) -> Result<ExecutionResult> {
|
async fn execute_bash(&self, code: &str) -> Result<ExecutionResult> {
|
||||||
|
// Check if this is a detached/daemon command that should run independently
|
||||||
|
let is_detached = code.trim_start().starts_with("setsid ")
|
||||||
|
|| code.trim_start().starts_with("nohup ")
|
||||||
|
|| code.contains(" disown")
|
||||||
|
|| (code.contains(" &") && (code.contains("nohup") || code.contains("setsid")));
|
||||||
|
|
||||||
|
if is_detached {
|
||||||
|
// For detached commands, just spawn and return immediately
|
||||||
|
use std::process::Stdio;
|
||||||
|
Command::new("bash")
|
||||||
|
.arg("-c")
|
||||||
|
.arg(code)
|
||||||
|
.stdin(Stdio::null())
|
||||||
|
.stdout(Stdio::null())
|
||||||
|
.stderr(Stdio::null())
|
||||||
|
.spawn()?;
|
||||||
|
|
||||||
|
return Ok(ExecutionResult {
|
||||||
|
stdout: "✅ Command launched in background (detached process)".to_string(),
|
||||||
|
stderr: String::new(),
|
||||||
|
exit_code: 0,
|
||||||
|
success: true,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
let output = Command::new("bash")
|
let output = Command::new("bash")
|
||||||
.arg("-c")
|
.arg("-c")
|
||||||
.arg(code)
|
.arg(code)
|
||||||
@@ -221,6 +246,29 @@ impl CodeExecutor {
|
|||||||
use tokio::io::{AsyncBufReadExt, BufReader};
|
use tokio::io::{AsyncBufReadExt, BufReader};
|
||||||
use tokio::process::Command as TokioCommand;
|
use tokio::process::Command as TokioCommand;
|
||||||
|
|
||||||
|
// Check if this is a detached/daemon command that should run independently
|
||||||
|
// Look for patterns like: setsid, nohup with &, or explicit backgrounding with disown
|
||||||
|
let is_detached = code.trim_start().starts_with("setsid ")
|
||||||
|
|| code.trim_start().starts_with("nohup ")
|
||||||
|
|| code.contains(" disown")
|
||||||
|
|| (code.contains(" &") && (code.contains("nohup") || code.contains("setsid")));
|
||||||
|
|
||||||
|
if is_detached {
|
||||||
|
// For detached commands, just spawn and return immediately
|
||||||
|
TokioCommand::new("bash")
|
||||||
|
.arg("-c")
|
||||||
|
.arg(code)
|
||||||
|
.spawn()?;
|
||||||
|
|
||||||
|
// Don't wait for the process - it's meant to run independently
|
||||||
|
return Ok(ExecutionResult {
|
||||||
|
stdout: "✅ Command launched in background (detached process)".to_string(),
|
||||||
|
stderr: String::new(),
|
||||||
|
exit_code: 0,
|
||||||
|
success: true,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
let mut child = TokioCommand::new("bash")
|
let mut child = TokioCommand::new("bash")
|
||||||
.arg("-c")
|
.arg("-c")
|
||||||
.arg(code)
|
.arg(code)
|
||||||
|
|||||||
Reference in New Issue
Block a user