fixed x,y detection in vision click
This commit is contained in:
@@ -64,7 +64,7 @@ impl ComputerController for MacOSController {
|
||||
let array = CFArray::<CFDictionary>::wrap_under_create_rule(window_list);
|
||||
let count = array.len();
|
||||
|
||||
let mut found_window_id: Option<(u32, String, bool)> = None; // (id, owner, is_exact_match)
|
||||
let mut found_window_id: Option<(u32, String)> = None; // (id, owner)
|
||||
let app_name_lower = app_name.to_lowercase();
|
||||
|
||||
for i in 0..count {
|
||||
@@ -82,31 +82,62 @@ impl ComputerController for MacOSController {
|
||||
tracing::debug!("Checking window: owner='{}', looking for '{}'", owner, app_name);
|
||||
let owner_lower = owner.to_lowercase();
|
||||
|
||||
// Check for exact match first (case-insensitive)
|
||||
let is_exact_match = owner_lower == app_name_lower;
|
||||
// Normalize by removing spaces for exact matching
|
||||
let app_name_normalized = app_name_lower.replace(" ", "");
|
||||
let owner_normalized = owner_lower.replace(" ", "");
|
||||
|
||||
// Check for fuzzy match (either direction contains)
|
||||
let is_fuzzy_match = owner_lower.contains(&app_name_lower) || app_name_lower.contains(&owner_lower);
|
||||
// ONLY accept exact matches (case-insensitive, with or without spaces)
|
||||
// This prevents "Goose" from matching "GooseStudio"
|
||||
let is_match = owner_lower == app_name_lower || owner_normalized == app_name_normalized;
|
||||
|
||||
if is_exact_match || is_fuzzy_match {
|
||||
if is_match {
|
||||
// Get window ID
|
||||
let window_id_key = CFString::from_static_string("kCGWindowNumber");
|
||||
if let Some(value) = dict.find(window_id_key.to_void()) {
|
||||
let num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*value as *const _);
|
||||
if let Some(id) = num.to_i64() {
|
||||
tracing::debug!("Found candidate: window ID {} for app '{}' (exact={}, fuzzy={})", id, owner, is_exact_match, is_fuzzy_match);
|
||||
// Get window layer to filter out menu bar windows
|
||||
let layer_key = CFString::from_static_string("kCGWindowLayer");
|
||||
let layer: i32 = if let Some(value) = dict.find(layer_key.to_void()) {
|
||||
let num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*value as *const _);
|
||||
num.to_i32().unwrap_or(0)
|
||||
} else {
|
||||
0
|
||||
};
|
||||
|
||||
// If we found an exact match, use it immediately
|
||||
if is_exact_match {
|
||||
tracing::info!("Found exact match: window ID {} for app '{}'", id, owner);
|
||||
found_window_id = Some((id as u32, owner.clone(), true));
|
||||
// Get window bounds to verify it's a real window
|
||||
let bounds_key = CFString::from_static_string("kCGWindowBounds");
|
||||
let has_real_bounds = if let Some(value) = dict.find(bounds_key.to_void()) {
|
||||
let bounds_dict: CFDictionary = TCFType::wrap_under_get_rule(*value as *const _);
|
||||
let width_key = CFString::from_static_string("Width");
|
||||
let height_key = CFString::from_static_string("Height");
|
||||
|
||||
if let (Some(w_val), Some(h_val)) = (
|
||||
bounds_dict.find(width_key.to_void()),
|
||||
bounds_dict.find(height_key.to_void()),
|
||||
) {
|
||||
let w_num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*w_val as *const _);
|
||||
let h_num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*h_val as *const _);
|
||||
let width = w_num.to_f64().unwrap_or(0.0);
|
||||
let height = h_num.to_f64().unwrap_or(0.0);
|
||||
// Real windows should be at least 100x100 pixels
|
||||
width >= 100.0 && height >= 100.0
|
||||
} else {
|
||||
false
|
||||
}
|
||||
} else {
|
||||
false
|
||||
};
|
||||
|
||||
// Only accept windows that are:
|
||||
// 1. At layer 0 (normal windows, not menu bar)
|
||||
// 2. Have real bounds (width and height >= 100)
|
||||
if layer == 0 && has_real_bounds {
|
||||
tracing::info!("Found valid window: ID {} for app '{}' (layer={}, bounds valid)", id, owner, layer);
|
||||
found_window_id = Some((id as u32, owner.clone()));
|
||||
break;
|
||||
}
|
||||
|
||||
// Otherwise, keep the first fuzzy match but continue looking for exact match
|
||||
if found_window_id.is_none() {
|
||||
tracing::info!("Found fuzzy match: window ID {} for app '{}'", id, owner);
|
||||
found_window_id = Some((id as u32, owner.clone(), false));
|
||||
} else {
|
||||
tracing::debug!("Skipping window ID {} for '{}': layer={}, has_real_bounds={}", id, owner, layer, has_real_bounds);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -116,15 +147,10 @@ impl ComputerController for MacOSController {
|
||||
found_window_id
|
||||
};
|
||||
|
||||
let (cg_window_id, matched_owner, is_exact) = cg_window_id.ok_or_else(|| {
|
||||
let (cg_window_id, matched_owner) = cg_window_id.ok_or_else(|| {
|
||||
anyhow::anyhow!("Could not find window for application '{}'. Use list_windows to see available windows.", app_name)
|
||||
})?;
|
||||
|
||||
if !is_exact {
|
||||
tracing::warn!("Using fuzzy match: requested '{}' but found '{}' (window ID {})", app_name, matched_owner, cg_window_id);
|
||||
} else {
|
||||
tracing::info!("Taking screenshot of window ID {} for app '{}'", cg_window_id, matched_owner);
|
||||
}
|
||||
|
||||
// Use screencapture with the window ID for now
|
||||
// TODO: Implement direct CGWindowListCreateImage approach with proper image saving
|
||||
@@ -178,12 +204,18 @@ impl ComputerController for MacOSController {
|
||||
async fn find_text_in_app(&self, app_name: &str, search_text: &str) -> Result<Option<TextLocation>> {
|
||||
// Take screenshot of specific app window
|
||||
let home = std::env::var("HOME").unwrap_or_else(|_| "/tmp".to_string());
|
||||
let temp_path = format!("{}/Desktop/g3_find_text_{}_{}.png", home, app_name, uuid::Uuid::new_v4());
|
||||
let temp_path = format!("{}/tmp/g3_find_text_{}_{}.png", home, app_name, uuid::Uuid::new_v4());
|
||||
self.take_screenshot(&temp_path, None, Some(app_name)).await?;
|
||||
|
||||
// Get screenshot dimensions before we delete it
|
||||
let screenshot_dims = get_image_dimensions(&temp_path)?;
|
||||
|
||||
// Extract all text with locations
|
||||
let locations = self.extract_text_with_locations(&temp_path).await?;
|
||||
|
||||
// Get window bounds to calculate coordinate transformation
|
||||
let window_bounds = self.get_window_bounds(app_name)?;
|
||||
|
||||
// Clean up temp file
|
||||
let _ = std::fs::remove_file(&temp_path);
|
||||
|
||||
@@ -191,7 +223,13 @@ impl ComputerController for MacOSController {
|
||||
let search_lower = search_text.to_lowercase();
|
||||
for location in locations {
|
||||
if location.text.to_lowercase().contains(&search_lower) {
|
||||
return Ok(Some(location));
|
||||
// Transform coordinates from screenshot space to screen space
|
||||
let transformed = transform_screenshot_to_screen_coords(
|
||||
location,
|
||||
window_bounds,
|
||||
screenshot_dims,
|
||||
);
|
||||
return Ok(Some(transformed));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -222,44 +260,7 @@ impl ComputerController for MacOSController {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn click_at(&self, x: i32, y: i32, app_name: Option<&str>) -> Result<()> {
|
||||
// If app_name is provided, get window position and offset coordinates
|
||||
let (global_x, global_y) = if let Some(app) = app_name {
|
||||
// Get window position using AppleScript
|
||||
let script = format!(
|
||||
r#"tell application "{}" to get bounds of window 1"#,
|
||||
app
|
||||
);
|
||||
|
||||
let output = std::process::Command::new("osascript")
|
||||
.arg("-e")
|
||||
.arg(&script)
|
||||
.output()?;
|
||||
|
||||
if output.status.success() {
|
||||
let bounds_str = String::from_utf8_lossy(&output.stdout);
|
||||
// Parse bounds: "x1, y1, x2, y2"
|
||||
let parts: Vec<&str> = bounds_str.trim().split(", ").collect();
|
||||
if parts.len() >= 2 {
|
||||
if let (Ok(window_x), Ok(window_y)) = (
|
||||
parts[0].trim().parse::<i32>(),
|
||||
parts[1].trim().parse::<i32>(),
|
||||
) {
|
||||
// Offset relative coordinates by window position
|
||||
(x + window_x, y + window_y)
|
||||
} else {
|
||||
(x, y) // Fallback to absolute coordinates
|
||||
}
|
||||
} else {
|
||||
(x, y) // Fallback to absolute coordinates
|
||||
}
|
||||
} else {
|
||||
(x, y) // Fallback to absolute coordinates
|
||||
}
|
||||
} else {
|
||||
(x, y) // No app name, use absolute coordinates
|
||||
};
|
||||
|
||||
fn click_at(&self, x: i32, y: i32, _app_name: Option<&str>) -> Result<()> {
|
||||
use core_graphics::event::{
|
||||
CGEvent, CGEventTapLocation, CGEventType, CGMouseButton,
|
||||
};
|
||||
@@ -267,12 +268,27 @@ impl ComputerController for MacOSController {
|
||||
CGEventSource, CGEventSourceStateID,
|
||||
};
|
||||
use core_graphics::geometry::CGPoint;
|
||||
use core_graphics::display::CGDisplay;
|
||||
|
||||
// IMPORTANT: Coordinates passed here are in NSScreen/CGWindowListCopyWindowInfo space
|
||||
// (Y=0 at BOTTOM, increases UPWARD)
|
||||
// But CGEvent uses a different coordinate system (Y=0 at TOP, increases DOWNWARD)
|
||||
// We need to convert: CGEvent.y = screenHeight - NSScreen.y
|
||||
|
||||
let screen_height = CGDisplay::main().pixels_high() as i32;
|
||||
let cgevent_x = x;
|
||||
let cgevent_y = screen_height - y;
|
||||
|
||||
tracing::debug!("click_at: NSScreen coords ({}, {}) -> CGEvent coords ({}, {}) [screen_height={}]",
|
||||
x, y, cgevent_x, cgevent_y, screen_height);
|
||||
|
||||
let (global_x, global_y) = (cgevent_x, cgevent_y);
|
||||
|
||||
let point = CGPoint::new(global_x as f64, global_y as f64);
|
||||
|
||||
let source = CGEventSource::new(CGEventSourceStateID::HIDSystemState)
|
||||
.ok().context("Failed to create event source")?;
|
||||
|
||||
let point = CGPoint::new(global_x as f64, global_y as f64);
|
||||
|
||||
// Move mouse to position first
|
||||
let move_event = CGEvent::new_mouse_event(
|
||||
source.clone(),
|
||||
@@ -306,4 +322,186 @@ impl ComputerController for MacOSController {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl MacOSController {
|
||||
/// Get window bounds for an application (helper method)
|
||||
fn get_window_bounds(&self, app_name: &str) -> Result<(i32, i32, i32, i32)> {
|
||||
unsafe {
|
||||
let window_list = CGWindowListCopyWindowInfo(
|
||||
kCGWindowListOptionOnScreenOnly,
|
||||
kCGNullWindowID
|
||||
);
|
||||
|
||||
let array = CFArray::<CFDictionary>::wrap_under_create_rule(window_list);
|
||||
let count = array.len();
|
||||
|
||||
let app_name_lower = app_name.to_lowercase();
|
||||
|
||||
for i in 0..count {
|
||||
let dict = array.get(i).unwrap();
|
||||
|
||||
// Get owner name
|
||||
let owner_key = CFString::from_static_string("kCGWindowOwnerName");
|
||||
let owner: String = if let Some(value) = dict.find(owner_key.to_void()) {
|
||||
let s: CFString = TCFType::wrap_under_get_rule(*value as *const _);
|
||||
s.to_string()
|
||||
} else {
|
||||
continue;
|
||||
};
|
||||
|
||||
let owner_lower = owner.to_lowercase();
|
||||
|
||||
// Normalize by removing spaces for exact matching
|
||||
let app_name_normalized = app_name_lower.replace(" ", "");
|
||||
let owner_normalized = owner_lower.replace(" ", "");
|
||||
|
||||
// ONLY accept exact matches (case-insensitive, with or without spaces)
|
||||
// This prevents "Goose" from matching "GooseStudio"
|
||||
let is_match = owner_lower == app_name_lower || owner_normalized == app_name_normalized;
|
||||
|
||||
if is_match {
|
||||
// Get window layer to filter out menu bar windows
|
||||
let layer_key = CFString::from_static_string("kCGWindowLayer");
|
||||
let layer: i32 = if let Some(value) = dict.find(layer_key.to_void()) {
|
||||
let num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*value as *const _);
|
||||
num.to_i32().unwrap_or(0)
|
||||
} else {
|
||||
0
|
||||
};
|
||||
|
||||
// Skip menu bar windows (layer >= 20)
|
||||
if layer >= 20 {
|
||||
tracing::debug!("Skipping window for '{}' at layer {} (menu bar)", owner, layer);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Get window bounds to verify it's a real window
|
||||
let bounds_key = CFString::from_static_string("kCGWindowBounds");
|
||||
if let Some(value) = dict.find(bounds_key.to_void()) {
|
||||
let bounds_dict: CFDictionary = TCFType::wrap_under_get_rule(*value as *const _);
|
||||
|
||||
let x_key = CFString::from_static_string("X");
|
||||
let y_key = CFString::from_static_string("Y");
|
||||
let width_key = CFString::from_static_string("Width");
|
||||
let height_key = CFString::from_static_string("Height");
|
||||
|
||||
if let (Some(x_val), Some(y_val), Some(w_val), Some(h_val)) = (
|
||||
bounds_dict.find(x_key.to_void()),
|
||||
bounds_dict.find(y_key.to_void()),
|
||||
bounds_dict.find(width_key.to_void()),
|
||||
bounds_dict.find(height_key.to_void()),
|
||||
) {
|
||||
let x_num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*x_val as *const _);
|
||||
let y_num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*y_val as *const _);
|
||||
let w_num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*w_val as *const _);
|
||||
let h_num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*h_val as *const _);
|
||||
|
||||
let x: i32 = x_num.to_i64().unwrap_or(0) as i32;
|
||||
let y: i32 = y_num.to_i64().unwrap_or(0) as i32;
|
||||
let w: i32 = w_num.to_i64().unwrap_or(0) as i32;
|
||||
let h: i32 = h_num.to_i64().unwrap_or(0) as i32;
|
||||
|
||||
// Only accept windows with real bounds (>= 100x100 pixels)
|
||||
if w >= 100 && h >= 100 {
|
||||
tracing::info!("Found valid window bounds for '{}': x={}, y={}, w={}, h={} (layer={})", owner, x, y, w, h, layer);
|
||||
return Ok((x, y, w, h));
|
||||
} else {
|
||||
tracing::debug!("Skipping window for '{}': too small ({}x{})", owner, w, h);
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Err(anyhow::anyhow!("Could not find window bounds for '{}'", app_name))
|
||||
}
|
||||
}
|
||||
|
||||
/// Get image dimensions from a PNG file
|
||||
fn get_image_dimensions(path: &str) -> Result<(i32, i32)> {
|
||||
use std::fs::File;
|
||||
use std::io::Read;
|
||||
|
||||
let mut file = File::open(path)?;
|
||||
let mut buffer = vec![0u8; 24];
|
||||
file.read_exact(&mut buffer)?;
|
||||
|
||||
// PNG signature check
|
||||
if &buffer[0..8] != b"\x89PNG\r\n\x1a\n" {
|
||||
anyhow::bail!("Not a valid PNG file");
|
||||
}
|
||||
|
||||
// Read IHDR chunk (width and height are at bytes 16-23)
|
||||
let width = u32::from_be_bytes([buffer[16], buffer[17], buffer[18], buffer[19]]) as i32;
|
||||
let height = u32::from_be_bytes([buffer[20], buffer[21], buffer[22], buffer[23]]) as i32;
|
||||
|
||||
Ok((width, height))
|
||||
}
|
||||
|
||||
/// Transform coordinates from screenshot space to screen space
|
||||
///
|
||||
/// The screenshot is taken of a window, and Vision OCR returns coordinates
|
||||
/// relative to the screenshot image. We need to transform these to actual
|
||||
/// screen coordinates for clicking.
|
||||
///
|
||||
/// On Retina displays, screenshots are taken at 2x resolution, so we need
|
||||
/// to account for this scaling factor.
|
||||
fn transform_screenshot_to_screen_coords(
|
||||
location: TextLocation,
|
||||
window_bounds: (i32, i32, i32, i32), // (x, y, width, height) in screen space
|
||||
screenshot_dims: (i32, i32), // (width, height) in pixels
|
||||
) -> TextLocation {
|
||||
let (win_x, win_y, win_width, win_height) = window_bounds;
|
||||
let (screenshot_width, screenshot_height) = screenshot_dims;
|
||||
|
||||
// Calculate scale factors
|
||||
// On Retina displays, screenshot is typically 2x the window size
|
||||
let scale_x = win_width as f64 / screenshot_width as f64;
|
||||
let scale_y = win_height as f64 / screenshot_height as f64;
|
||||
|
||||
tracing::debug!("Transform: screenshot={}x{}, window={}x{} at ({},{}), scale=({:.2},{:.2})",
|
||||
screenshot_width, screenshot_height, win_width, win_height, win_x, win_y, scale_x, scale_y);
|
||||
|
||||
// Transform coordinates from image space to screen space
|
||||
// IMPORTANT: macOS screen coordinates have origin at BOTTOM-LEFT (Y increases upward)
|
||||
// Image coordinates have origin at TOP-LEFT (Y increases downward)
|
||||
// win_y is the BOTTOM of the window in screen coordinates
|
||||
// So we need to: (win_y + win_height) to get window TOP, then subtract screenshot_y
|
||||
let window_top_y = win_y + win_height;
|
||||
|
||||
tracing::debug!("[transform] Input location in image space: x={}, y={}, width={}, height={}",
|
||||
location.x, location.y, location.width, location.height);
|
||||
tracing::debug!("[transform] Scale factors: scale_x={:.4}, scale_y={:.4}", scale_x, scale_y);
|
||||
|
||||
let transformed_x = win_x + (location.x as f64 * scale_x) as i32;
|
||||
let transformed_y = window_top_y - (location.y as f64 * scale_y) as i32;
|
||||
let transformed_width = (location.width as f64 * scale_x) as i32;
|
||||
let transformed_height = (location.height as f64 * scale_y) as i32;
|
||||
|
||||
tracing::debug!("[transform] Calculation details:");
|
||||
tracing::debug!(" - transformed_x = {} + ({} * {:.4}) = {} + {:.2} = {}", win_x, location.x, scale_x, win_x, location.x as f64 * scale_x, transformed_x);
|
||||
tracing::debug!(" - transformed_width = ({} * {:.4}) = {:.2} -> {}", location.width, scale_x, location.width as f64 * scale_x, transformed_width);
|
||||
tracing::debug!(" - transformed_height = ({} * {:.4}) = {:.2} -> {}", location.height, scale_y, location.height as f64 * scale_y, transformed_height);
|
||||
|
||||
tracing::debug!("Transformed location: screenshot=({},{}) {}x{} -> screen=({},{}) {}x{}",
|
||||
location.x, location.y, location.width, location.height,
|
||||
transformed_x, transformed_y, transformed_width, transformed_height);
|
||||
|
||||
TextLocation {
|
||||
text: location.text,
|
||||
x: transformed_x,
|
||||
y: transformed_y,
|
||||
width: transformed_width,
|
||||
height: transformed_height,
|
||||
confidence: location.confidence,
|
||||
}
|
||||
}
|
||||
|
||||
#[path = "macos_window_matching_test.rs"]
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
Reference in New Issue
Block a user