diff --git a/crates/g3-computer-control/src/platform/macos.rs b/crates/g3-computer-control/src/platform/macos.rs index c3bff9e..da9c81b 100644 --- a/crates/g3-computer-control/src/platform/macos.rs +++ b/crates/g3-computer-control/src/platform/macos.rs @@ -64,7 +64,7 @@ impl ComputerController for MacOSController { let array = CFArray::::wrap_under_create_rule(window_list); let count = array.len(); - let mut found_window_id: Option<(u32, String, bool)> = None; // (id, owner, is_exact_match) + let mut found_window_id: Option<(u32, String)> = None; // (id, owner) let app_name_lower = app_name.to_lowercase(); for i in 0..count { @@ -82,31 +82,62 @@ impl ComputerController for MacOSController { tracing::debug!("Checking window: owner='{}', looking for '{}'", owner, app_name); let owner_lower = owner.to_lowercase(); - // Check for exact match first (case-insensitive) - let is_exact_match = owner_lower == app_name_lower; + // Normalize by removing spaces for exact matching + let app_name_normalized = app_name_lower.replace(" ", ""); + let owner_normalized = owner_lower.replace(" ", ""); - // Check for fuzzy match (either direction contains) - let is_fuzzy_match = owner_lower.contains(&app_name_lower) || app_name_lower.contains(&owner_lower); + // ONLY accept exact matches (case-insensitive, with or without spaces) + // This prevents "Goose" from matching "GooseStudio" + let is_match = owner_lower == app_name_lower || owner_normalized == app_name_normalized; - if is_exact_match || is_fuzzy_match { + if is_match { // Get window ID let window_id_key = CFString::from_static_string("kCGWindowNumber"); if let Some(value) = dict.find(window_id_key.to_void()) { let num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*value as *const _); if let Some(id) = num.to_i64() { - tracing::debug!("Found candidate: window ID {} for app '{}' (exact={}, fuzzy={})", id, owner, is_exact_match, is_fuzzy_match); + // Get window layer to filter out menu bar windows + let 
layer_key = CFString::from_static_string("kCGWindowLayer"); + let layer: i32 = if let Some(value) = dict.find(layer_key.to_void()) { + let num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*value as *const _); + num.to_i32().unwrap_or(0) + } else { + 0 + }; - // If we found an exact match, use it immediately - if is_exact_match { - tracing::info!("Found exact match: window ID {} for app '{}'", id, owner); - found_window_id = Some((id as u32, owner.clone(), true)); + // Get window bounds to verify it's a real window + let bounds_key = CFString::from_static_string("kCGWindowBounds"); + let has_real_bounds = if let Some(value) = dict.find(bounds_key.to_void()) { + let bounds_dict: CFDictionary = TCFType::wrap_under_get_rule(*value as *const _); + let width_key = CFString::from_static_string("Width"); + let height_key = CFString::from_static_string("Height"); + + if let (Some(w_val), Some(h_val)) = ( + bounds_dict.find(width_key.to_void()), + bounds_dict.find(height_key.to_void()), + ) { + let w_num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*w_val as *const _); + let h_num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*h_val as *const _); + let width = w_num.to_f64().unwrap_or(0.0); + let height = h_num.to_f64().unwrap_or(0.0); + // Real windows should be at least 100x100 pixels + width >= 100.0 && height >= 100.0 + } else { + false + } + } else { + false + }; + + // Only accept windows that are: + // 1. At layer 0 (normal windows, not menu bar) + // 2. 
Have real bounds (width and height >= 100) + if layer == 0 && has_real_bounds { + tracing::info!("Found valid window: ID {} for app '{}' (layer={}, bounds valid)", id, owner, layer); + found_window_id = Some((id as u32, owner.clone())); break; - } - - // Otherwise, keep the first fuzzy match but continue looking for exact match - if found_window_id.is_none() { - tracing::info!("Found fuzzy match: window ID {} for app '{}'", id, owner); - found_window_id = Some((id as u32, owner.clone(), false)); + } else { + tracing::debug!("Skipping window ID {} for '{}': layer={}, has_real_bounds={}", id, owner, layer, has_real_bounds); } } } @@ -116,15 +147,10 @@ impl ComputerController for MacOSController { found_window_id }; - let (cg_window_id, matched_owner, is_exact) = cg_window_id.ok_or_else(|| { + let (cg_window_id, matched_owner) = cg_window_id.ok_or_else(|| { anyhow::anyhow!("Could not find window for application '{}'. Use list_windows to see available windows.", app_name) })?; - - if !is_exact { - tracing::warn!("Using fuzzy match: requested '{}' but found '{}' (window ID {})", app_name, matched_owner, cg_window_id); - } else { tracing::info!("Taking screenshot of window ID {} for app '{}'", cg_window_id, matched_owner); - } // Use screencapture with the window ID for now // TODO: Implement direct CGWindowListCreateImage approach with proper image saving @@ -178,12 +204,18 @@ impl ComputerController for MacOSController { async fn find_text_in_app(&self, app_name: &str, search_text: &str) -> Result> { // Take screenshot of specific app window let home = std::env::var("HOME").unwrap_or_else(|_| "/tmp".to_string()); - let temp_path = format!("{}/Desktop/g3_find_text_{}_{}.png", home, app_name, uuid::Uuid::new_v4()); + let temp_path = format!("{}/tmp/g3_find_text_{}_{}.png", home, app_name, uuid::Uuid::new_v4()); self.take_screenshot(&temp_path, None, Some(app_name)).await?; + // Get screenshot dimensions before we delete it + let screenshot_dims = 
get_image_dimensions(&temp_path)?; + // Extract all text with locations let locations = self.extract_text_with_locations(&temp_path).await?; + // Get window bounds to calculate coordinate transformation + let window_bounds = self.get_window_bounds(app_name)?; + // Clean up temp file let _ = std::fs::remove_file(&temp_path); @@ -191,7 +223,13 @@ impl ComputerController for MacOSController { let search_lower = search_text.to_lowercase(); for location in locations { if location.text.to_lowercase().contains(&search_lower) { - return Ok(Some(location)); + // Transform coordinates from screenshot space to screen space + let transformed = transform_screenshot_to_screen_coords( + location, + window_bounds, + screenshot_dims, + ); + return Ok(Some(transformed)); } } @@ -222,44 +260,7 @@ impl ComputerController for MacOSController { Ok(()) } - fn click_at(&self, x: i32, y: i32, app_name: Option<&str>) -> Result<()> { - // If app_name is provided, get window position and offset coordinates - let (global_x, global_y) = if let Some(app) = app_name { - // Get window position using AppleScript - let script = format!( - r#"tell application "{}" to get bounds of window 1"#, - app - ); - - let output = std::process::Command::new("osascript") - .arg("-e") - .arg(&script) - .output()?; - - if output.status.success() { - let bounds_str = String::from_utf8_lossy(&output.stdout); - // Parse bounds: "x1, y1, x2, y2" - let parts: Vec<&str> = bounds_str.trim().split(", ").collect(); - if parts.len() >= 2 { - if let (Ok(window_x), Ok(window_y)) = ( - parts[0].trim().parse::(), - parts[1].trim().parse::(), - ) { - // Offset relative coordinates by window position - (x + window_x, y + window_y) - } else { - (x, y) // Fallback to absolute coordinates - } - } else { - (x, y) // Fallback to absolute coordinates - } - } else { - (x, y) // Fallback to absolute coordinates - } - } else { - (x, y) // No app name, use absolute coordinates - }; - + fn click_at(&self, x: i32, y: i32, _app_name: 
Option<&str>) -> Result<()> { use core_graphics::event::{ CGEvent, CGEventTapLocation, CGEventType, CGMouseButton, }; @@ -267,12 +268,27 @@ impl ComputerController for MacOSController { CGEventSource, CGEventSourceStateID, }; use core_graphics::geometry::CGPoint; + use core_graphics::display::CGDisplay; + + // IMPORTANT: Coordinates passed here are in NSScreen/CGWindowListCopyWindowInfo space + // (Y=0 at BOTTOM, increases UPWARD) + // But CGEvent uses a different coordinate system (Y=0 at TOP, increases DOWNWARD) + // We need to convert: CGEvent.y = screenHeight - NSScreen.y + + let screen_height = CGDisplay::main().pixels_high() as i32; + let cgevent_x = x; + let cgevent_y = screen_height - y; + + tracing::debug!("click_at: NSScreen coords ({}, {}) -> CGEvent coords ({}, {}) [screen_height={}]", + x, y, cgevent_x, cgevent_y, screen_height); + + let (global_x, global_y) = (cgevent_x, cgevent_y); + + let point = CGPoint::new(global_x as f64, global_y as f64); let source = CGEventSource::new(CGEventSourceStateID::HIDSystemState) .ok().context("Failed to create event source")?; - let point = CGPoint::new(global_x as f64, global_y as f64); - // Move mouse to position first let move_event = CGEvent::new_mouse_event( source.clone(), @@ -306,4 +322,186 @@ impl ComputerController for MacOSController { Ok(()) } -} \ No newline at end of file +} + +impl MacOSController { + /// Get window bounds for an application (helper method) + fn get_window_bounds(&self, app_name: &str) -> Result<(i32, i32, i32, i32)> { + unsafe { + let window_list = CGWindowListCopyWindowInfo( + kCGWindowListOptionOnScreenOnly, + kCGNullWindowID + ); + + let array = CFArray::::wrap_under_create_rule(window_list); + let count = array.len(); + + let app_name_lower = app_name.to_lowercase(); + + for i in 0..count { + let dict = array.get(i).unwrap(); + + // Get owner name + let owner_key = CFString::from_static_string("kCGWindowOwnerName"); + let owner: String = if let Some(value) = 
dict.find(owner_key.to_void()) { + let s: CFString = TCFType::wrap_under_get_rule(*value as *const _); + s.to_string() + } else { + continue; + }; + + let owner_lower = owner.to_lowercase(); + + // Normalize by removing spaces for exact matching + let app_name_normalized = app_name_lower.replace(" ", ""); + let owner_normalized = owner_lower.replace(" ", ""); + + // ONLY accept exact matches (case-insensitive, with or without spaces) + // This prevents "Goose" from matching "GooseStudio" + let is_match = owner_lower == app_name_lower || owner_normalized == app_name_normalized; + + if is_match { + // Get window layer to filter out menu bar windows + let layer_key = CFString::from_static_string("kCGWindowLayer"); + let layer: i32 = if let Some(value) = dict.find(layer_key.to_void()) { + let num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*value as *const _); + num.to_i32().unwrap_or(0) + } else { + 0 + }; + + // Skip menu bar windows (layer >= 20) + if layer >= 20 { + tracing::debug!("Skipping window for '{}' at layer {} (menu bar)", owner, layer); + continue; + } + + // Get window bounds to verify it's a real window + let bounds_key = CFString::from_static_string("kCGWindowBounds"); + if let Some(value) = dict.find(bounds_key.to_void()) { + let bounds_dict: CFDictionary = TCFType::wrap_under_get_rule(*value as *const _); + + let x_key = CFString::from_static_string("X"); + let y_key = CFString::from_static_string("Y"); + let width_key = CFString::from_static_string("Width"); + let height_key = CFString::from_static_string("Height"); + + if let (Some(x_val), Some(y_val), Some(w_val), Some(h_val)) = ( + bounds_dict.find(x_key.to_void()), + bounds_dict.find(y_key.to_void()), + bounds_dict.find(width_key.to_void()), + bounds_dict.find(height_key.to_void()), + ) { + let x_num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*x_val as *const _); + let y_num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*y_val as 
*const _);
+                            let w_num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*w_val as *const _);
+                            let h_num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*h_val as *const _);
+
+                            // Read the bounds as f64, the same way the screenshot window lookup
+                            // reads Width/Height. An integer read reports failure when the
+                            // conversion is lossy, so a fractional coordinate would silently
+                            // fall back to 0 and misplace the window.
+                            let x: i32 = x_num.to_f64().unwrap_or(0.0).round() as i32;
+                            let y: i32 = y_num.to_f64().unwrap_or(0.0).round() as i32;
+                            let w: i32 = w_num.to_f64().unwrap_or(0.0).round() as i32;
+                            let h: i32 = h_num.to_f64().unwrap_or(0.0).round() as i32;
+
+                            // Only accept windows with real bounds (>= 100x100 pixels)
+                            if w >= 100 && h >= 100 {
+                                tracing::info!("Found valid window bounds for '{}': x={}, y={}, w={}, h={} (layer={})", owner, x, y, w, h, layer);
+                                return Ok((x, y, w, h));
+                            } else {
+                                tracing::debug!("Skipping window for '{}': too small ({}x{})", owner, w, h);
+                                continue;
+                            }
+                        } else {
+                            continue;
+                        }
+                    }
+                }
+            }
+        }
+
+        Err(anyhow::anyhow!("Could not find window bounds for '{}'", app_name))
+    }
+}
+
+/// Read the pixel dimensions of a PNG file from its header.
+///
+/// Only the first 24 bytes are read: the 8-byte PNG signature, the first
+/// chunk's length and type, and the big-endian width and height stored at
+/// the start of the IHDR chunk.
+///
+/// # Errors
+///
+/// Returns an error if the file cannot be opened, is shorter than 24 bytes,
+/// does not start with the PNG signature, does not start with an IHDR
+/// chunk, or declares a width or height that is zero or does not fit in
+/// an `i32`.
+fn get_image_dimensions(path: &str) -> Result<(i32, i32)> {
+    use std::fs::File;
+    use std::io::Read;
+
+    const PNG_SIGNATURE: &[u8; 8] = b"\x89PNG\r\n\x1a\n";
+
+    // signature (8) + chunk length (4) + chunk type (4) + width (4) + height (4)
+    let mut header = [0u8; 24];
+    File::open(path)?.read_exact(&mut header)?;
+
+    if &header[0..8] != PNG_SIGNATURE {
+        anyhow::bail!("Not a valid PNG file");
+    }
+
+    // Bytes 16..24 are only the image size when the first chunk is IHDR.
+    if &header[12..16] != b"IHDR" {
+        anyhow::bail!("Not a valid PNG file: first chunk is not IHDR");
+    }
+
+    let width = u32::from_be_bytes([header[16], header[17], header[18], header[19]]);
+    let height = u32::from_be_bytes([header[20], header[21], header[22], header[23]]);
+
+    // Zero is not a usable size (callers divide by these values) and anything
+    // above i32::MAX would wrap negative under a plain `as i32` cast.
+    match (i32::try_from(width), i32::try_from(height)) {
+        (Ok(w), Ok(h)) if w > 0 && h > 0 => Ok((w, h)),
+        _ => Err(anyhow::anyhow!("Invalid PNG dimensions: {}x{}", width, height)),
+    }
+}
+
+/// Transform coordinates from screenshot space to screen space
+///
+/// The screenshot is taken of a window, and Vision OCR returns coordinates
+/// relative to the screenshot image. We need to transform these to actual
+/// screen coordinates for clicking.
+///
+/// On Retina displays, screenshots are taken at 2x resolution, so we need
+/// to account for this scaling factor.
+fn transform_screenshot_to_screen_coords(
+    location: TextLocation,
+    window_bounds: (i32, i32, i32, i32), // (x, y, width, height) in screen space
+    screenshot_dims: (i32, i32),         // (width, height) in pixels
+) -> TextLocation {
+    let (win_x, win_y, win_width, win_height) = window_bounds;
+    let (screenshot_width, screenshot_height) = screenshot_dims;
+
+    // Screen units per screenshot pixel on each axis (about 0.5 when the
+    // screenshot is captured at 2x the window size).
+    // A non-positive screenshot extent would make the division produce
+    // inf/NaN, which the `as i32` casts below turn into saturated or zero
+    // coordinates, so fall back to a 1:1 scale instead.
+    let axis_scale = |window_extent: i32, image_extent: i32| -> f64 {
+        if image_extent > 0 {
+            window_extent as f64 / image_extent as f64
+        } else {
+            1.0
+        }
+    };
+    if screenshot_width <= 0 || screenshot_height <= 0 {
+        tracing::warn!("Transform: invalid screenshot size {}x{}, falling back to 1:1 scale",
+            screenshot_width, screenshot_height);
+    }
+    let scale_x = axis_scale(win_width, screenshot_width);
+    let scale_y = axis_scale(win_height, screenshot_height);
+
+    tracing::debug!("Transform: screenshot={}x{}, window={}x{} at ({},{}), scale=({:.2},{:.2})",
+        screenshot_width, screenshot_height, win_width, win_height, win_x, win_y, scale_x, scale_y);
+
+    // Image coordinates have their origin at TOP-LEFT (Y increases downward).
+    // The result is expressed with Y increasing UPWARD: win_y is treated as the
+    // BOTTOM edge of the window, so (win_y + win_height) is its top edge and the
+    // scaled image Y is subtracted from that.
+    // NOTE(review): Apple documents kCGWindowBounds as screen space with the
+    // origin at the upper-left corner of the main display. Confirm this
+    // bottom-left assumption together with the Y flip in `click_at`.
+    let window_top_y = win_y + win_height;
+
+    tracing::debug!("[transform] Input location in image space: x={}, y={}, width={}, height={}",
+        location.x, location.y, location.width, location.height);
+    tracing::debug!("[transform] Scale factors: scale_x={:.4}, scale_y={:.4}", scale_x, scale_y);
+
+    // Scale once, then truncate toward zero when converting back to i32.
+    let scaled_x = location.x as f64 * scale_x;
+    let scaled_y = location.y as f64 * scale_y;
+    let scaled_width = location.width as f64 * scale_x;
+    let scaled_height = location.height as f64 * scale_y;
+
+    let transformed_x = win_x + scaled_x as i32;
+    let transformed_y = window_top_y - scaled_y as i32;
+    let transformed_width = scaled_width as i32;
+    let transformed_height = scaled_height as i32;
+
+    tracing::debug!("[transform] Calculation details:");
+    tracing::debug!(" - transformed_x = {} + ({} * {:.4}) = {} + {:.2} = {}", win_x, location.x, scale_x, win_x, scaled_x, transformed_x);
+    tracing::debug!(" - transformed_y = {} - ({} * {:.4}) = {} - {:.2} = {}", window_top_y, location.y, scale_y, window_top_y, scaled_y, transformed_y);
+    tracing::debug!(" - transformed_width = ({} * {:.4}) = {:.2} -> {}", location.width, scale_x, scaled_width, transformed_width);
+    tracing::debug!(" - transformed_height = ({} * {:.4}) = {:.2} -> {}", location.height, scale_y, scaled_height, transformed_height);
+
+    tracing::debug!("Transformed location: screenshot=({},{}) {}x{} -> screen=({},{}) {}x{}",
+        location.x, location.y, location.width, location.height,
+        transformed_x, transformed_y, transformed_width, transformed_height);
+
+    TextLocation {
+        text: location.text,
+        x: transformed_x,
+        y: transformed_y,
+        width: transformed_width,
+        height: transformed_height,
+        confidence: location.confidence,
+    }
+}
+
+#[path = "macos_window_matching_test.rs"]
+#[cfg(test)]
+mod tests;
\ No newline at end of file
diff --git a/crates/g3-computer-control/src/platform/macos_window_matching_test.rs b/crates/g3-computer-control/src/platform/macos_window_matching_test.rs
new file mode 100644
index 0000000..387988f
--- /dev/null
+++ b/crates/g3-computer-control/src/platform/macos_window_matching_test.rs
@@ -0,0 +1,45 @@
+#[cfg(test)]
+mod window_matching_tests {
+    /// Test that window name matching handles spaces correctly
+    ///
+    /// Issue: When a user requests a screenshot of "Goose Studio" but the actual
+    /// application name is "GooseStudio" (no space), the fuzzy matching should
+    /// still find the window.
+    ///
+    /// The fix normalizes both names by removing spaces before comparing.
+    #[test]
+    fn test_space_normalization() {
+        let test_cases = vec![
+            // (user_input, actual_app_name, should_match)
+            ("Goose Studio", "GooseStudio", true),
+            ("GooseStudio", "Goose Studio", true),
+            ("Visual Studio Code", "VisualStudioCode", true),
+            ("Google Chrome", "Google Chrome", true),
+            ("Safari", "Safari", true),
+            ("SAFARI", "safari", true),
+            // Substring overlaps must NOT match: the window lookup only accepts
+            // names that are equal ignoring case and spaces, so that "Goose"
+            // cannot select a "GooseStudio" window.
+            ("Goose", "GooseStudio", false),
+            ("iTerm", "iTerm2", false),
+            ("Code", "Visual Studio Code", false),
+        ];
+
+        for (user_input, app_name, should_match) in test_cases {
+            let user_lower = user_input.to_lowercase();
+            let app_lower = app_name.to_lowercase();
+
+            let user_normalized = user_lower.replace(" ", "");
+            let app_normalized = app_lower.replace(" ", "");
+
+            // Same predicate as the window lookup in macos.rs: equal ignoring
+            // case, with or without spaces. No substring matching.
+            let matches = app_lower == user_lower || app_normalized == user_normalized;
+
+            assert_eq!(
+                matches, should_match,
+                "Expected '{}' vs '{}' to match={}, but got match={}",
+                user_input, app_name, should_match, matches
+            );
+        }
+    }
+}
diff --git a/crates/g3-core/src/lib.rs b/crates/g3-core/src/lib.rs index 69a90ca..b32dce9 100644 --- a/crates/g3-core/src/lib.rs +++ b/crates/g3-core/src/lib.rs @@ -483,8 +483,8 @@ Format this as a detailed but concise summary that can be used to resume the con if matches!(message.role, MessageRole::User) && message.content.starts_with("Tool result:") { let content_len = message.content.len(); - // Only thin if the content is greater than 1000 chars - if content_len > 1000 { + // Only thin if the content is greater than 500 chars + if content_len > 500 { // Generate a unique filename based on timestamp and index let timestamp = std::time::SystemTime::now() .duration_since(std::time::UNIX_EPOCH) @@ -541,8 +541,8 @@ Format this as a detailed but concise summary that can be used to resume the con .map(|s| (s.to_string(), s.len())); if let 
Some((content_str, content_len)) = content_info { - // Only thin if content is greater than 1000 chars - if content_len > 1000 { + // Only thin if content is greater than 500 chars + if content_len > 500 { let timestamp = std::time::SystemTime::now() .duration_since(std::time::UNIX_EPOCH) .unwrap_or_default() @@ -574,8 +574,8 @@ Format this as a detailed but concise summary that can be used to resume the con .map(|s| (s.to_string(), s.len())); if let Some((diff_str, diff_len)) = diff_info { - // Only thin if diff is greater than 1000 chars - if diff_len > 1000 { + // Only thin if diff is greater than 500 chars + if diff_len > 500 { let timestamp = std::time::SystemTime::now() .duration_since(std::time::UNIX_EPOCH) .unwrap_or_default() @@ -2080,132 +2080,6 @@ Template: "required": ["app_name"] }), }, - Tool { - name: "macax_get_ui_tree".to_string(), - description: "Get the UI element hierarchy of an application as a tree structure".to_string(), - input_schema: json!({ - "type": "object", - "properties": { - "app_name": { - "type": "string", - "description": "Name of the application" - }, - "max_depth": { - "type": "integer", - "description": "Maximum depth to traverse (default: 3)" - } - }, - "required": ["app_name"] - }), - }, - Tool { - name: "macax_find_elements".to_string(), - description: "Find UI elements in an application by role, title, or identifier. 
Use this to locate buttons, text fields, etc.".to_string(), - input_schema: json!({ - "type": "object", - "properties": { - "app_name": { - "type": "string", - "description": "Name of the application" - }, - "role": { - "type": "string", - "description": "UI element role (e.g., 'button', 'text field', 'window')" - }, - "title": { - "type": "string", - "description": "Element title or label to match" - }, - "identifier": { - "type": "string", - "description": "Element identifier (accessibility identifier)" - } - }, - "required": ["app_name"] - }), - }, - Tool { - name: "macax_click".to_string(), - description: "Click a UI element in an application".to_string(), - input_schema: json!({ - "type": "object", - "properties": { - "app_name": { - "type": "string", - "description": "Name of the application" - }, - "role": { - "type": "string", - "description": "UI element role (e.g., 'button')" - }, - "title": { - "type": "string", - "description": "Element title or label" - }, - "identifier": { - "type": "string", - "description": "Element identifier" - } - }, - "required": ["app_name", "role"] - }), - }, - Tool { - name: "macax_set_value".to_string(), - description: "Set the value of a UI element (e.g., type into a text field)".to_string(), - input_schema: json!({ - "type": "object", - "properties": { - "app_name": { - "type": "string", - "description": "Name of the application" - }, - "role": { - "type": "string", - "description": "UI element role (e.g., 'text field')" - }, - "value": { - "type": "string", - "description": "Value to set" - }, - "title": { - "type": "string", - "description": "Element title or label" - }, - "identifier": { - "type": "string", - "description": "Element identifier" - } - }, - "required": ["app_name", "role", "value"] - }), - }, - Tool { - name: "macax_get_value".to_string(), - description: "Get the value of a UI element (e.g., read text from a text field)".to_string(), - input_schema: json!({ - "type": "object", - "properties": { - 
"app_name": { - "type": "string", - "description": "Name of the application" - }, - "role": { - "type": "string", - "description": "UI element role (e.g., 'text field')" - }, - "title": { - "type": "string", - "description": "Element title or label" - }, - "identifier": { - "type": "string", - "description": "Element identifier" - } - }, - "required": ["app_name", "role"] - }), - }, Tool { name: "macax_press_key".to_string(), description: "Press a keyboard key or shortcut in an application (e.g., Cmd+S to save)".to_string(), @@ -2253,21 +2127,6 @@ Template: }), }); - // Add focus_element tool - tools.push(Tool { - name: "macax_focus_element".to_string(), - description: "Focus on a UI element (text field, text area, etc.) before typing".to_string(), - input_schema: json!({ - "type": "object", - "properties": { - "app_name": {"type": "string", "description": "Name of the application"}, - "role": {"type": "string", "description": "UI element role (e.g., 'text field', 'text area')"}, - "title": {"type": "string", "description": "Element title or label (optional)"}, - "identifier": {"type": "string", "description": "Element accessibility identifier (optional)"} - }, - "required": ["app_name", "role"] - }), - }); } // Add extract_text_with_boxes tool (requires macax flag) @@ -4323,168 +4182,6 @@ Template: Err(e) => Ok(format!("❌ Failed to activate app: {}", e)), } } - "macax_get_ui_tree" => { - debug!("Processing macax_get_ui_tree tool call"); - - if !self.config.macax.enabled { - return Ok("❌ macOS Accessibility is not enabled. 
Use --macax flag to enable.".to_string()); - } - - let app_name = match tool_call.args.get("app_name").and_then(|v| v.as_str()) { - Some(n) => n, - None => return Ok("❌ Missing app_name argument".to_string()), - }; - - let max_depth = tool_call.args.get("max_depth") - .and_then(|v| v.as_u64()) - .map(|n| n as usize) - .unwrap_or(3); - - let controller_guard = self.macax_controller.read().await; - let controller = match controller_guard.as_ref() { - Some(c) => c, - None => return Ok("❌ macOS Accessibility controller not initialized.".to_string()), - }; - - match controller.get_ui_tree(app_name, max_depth) { - Ok(tree) => Ok(tree), - Err(e) => Ok(format!("❌ Failed to get UI tree: {}", e)), - } - } - "macax_find_elements" => { - debug!("Processing macax_find_elements tool call"); - - if !self.config.macax.enabled { - return Ok("❌ macOS Accessibility is not enabled. Use --macax flag to enable.".to_string()); - } - - let app_name = match tool_call.args.get("app_name").and_then(|v| v.as_str()) { - Some(n) => n, - None => return Ok("❌ Missing app_name argument".to_string()), - }; - - let role = tool_call.args.get("role").and_then(|v| v.as_str()); - let title = tool_call.args.get("title").and_then(|v| v.as_str()); - let identifier = tool_call.args.get("identifier").and_then(|v| v.as_str()); - - let controller_guard = self.macax_controller.read().await; - let controller = match controller_guard.as_ref() { - Some(c) => c, - None => return Ok("❌ macOS Accessibility controller not initialized.".to_string()), - }; - - match controller.find_elements(app_name, role, title, identifier) { - Ok(elements) => { - if elements.is_empty() { - Ok("No elements found matching criteria".to_string()) - } else { - let element_strs: Vec = elements.iter() - .map(|e| e.to_string()) - .collect(); - Ok(format!("Found {} element(s):\n{}", elements.len(), element_strs.join("\n"))) - } - } - Err(e) => Ok(format!("❌ Failed to find elements: {}", e)), - } - } - "macax_click" => { - debug!("Processing 
macax_click tool call"); - - if !self.config.macax.enabled { - return Ok("❌ macOS Accessibility is not enabled. Use --macax flag to enable.".to_string()); - } - - let app_name = match tool_call.args.get("app_name").and_then(|v| v.as_str()) { - Some(n) => n, - None => return Ok("❌ Missing app_name argument".to_string()), - }; - - let role = match tool_call.args.get("role").and_then(|v| v.as_str()) { - Some(r) => r, - None => return Ok("❌ Missing role argument".to_string()), - }; - - let title = tool_call.args.get("title").and_then(|v| v.as_str()); - let identifier = tool_call.args.get("identifier").and_then(|v| v.as_str()); - - let controller_guard = self.macax_controller.read().await; - let controller = match controller_guard.as_ref() { - Some(c) => c, - None => return Ok("❌ macOS Accessibility controller not initialized.".to_string()), - }; - - match controller.click_element(app_name, role, title, identifier) { - Ok(_) => Ok(format!("✅ Clicked {} element", role)), - Err(e) => Ok(format!("❌ Failed to click element: {}", e)), - } - } - "macax_set_value" => { - debug!("Processing macax_set_value tool call"); - - if !self.config.macax.enabled { - return Ok("❌ macOS Accessibility is not enabled. 
Use --macax flag to enable.".to_string()); - } - - let app_name = match tool_call.args.get("app_name").and_then(|v| v.as_str()) { - Some(n) => n, - None => return Ok("❌ Missing app_name argument".to_string()), - }; - - let role = match tool_call.args.get("role").and_then(|v| v.as_str()) { - Some(r) => r, - None => return Ok("❌ Missing role argument".to_string()), - }; - - let value = match tool_call.args.get("value").and_then(|v| v.as_str()) { - Some(v) => v, - None => return Ok("❌ Missing value argument".to_string()), - }; - - let title = tool_call.args.get("title").and_then(|v| v.as_str()); - let identifier = tool_call.args.get("identifier").and_then(|v| v.as_str()); - - let controller_guard = self.macax_controller.read().await; - let controller = match controller_guard.as_ref() { - Some(c) => c, - None => return Ok("❌ macOS Accessibility controller not initialized.".to_string()), - }; - - match controller.set_value(app_name, role, value, title, identifier) { - Ok(_) => Ok(format!("✅ Set value of {} element to: {}", role, value)), - Err(e) => Ok(format!("❌ Failed to set value: {}", e)), - } - } - "macax_get_value" => { - debug!("Processing macax_get_value tool call"); - - if !self.config.macax.enabled { - return Ok("❌ macOS Accessibility is not enabled. 
Use --macax flag to enable.".to_string()); - } - - let app_name = match tool_call.args.get("app_name").and_then(|v| v.as_str()) { - Some(n) => n, - None => return Ok("❌ Missing app_name argument".to_string()), - }; - - let role = match tool_call.args.get("role").and_then(|v| v.as_str()) { - Some(r) => r, - None => return Ok("❌ Missing role argument".to_string()), - }; - - let title = tool_call.args.get("title").and_then(|v| v.as_str()); - let identifier = tool_call.args.get("identifier").and_then(|v| v.as_str()); - - let controller_guard = self.macax_controller.read().await; - let controller = match controller_guard.as_ref() { - Some(c) => c, - None => return Ok("❌ macOS Accessibility controller not initialized.".to_string()), - }; - - match controller.get_value(app_name, role, title, identifier) { - Ok(value) => Ok(format!("Value: {}", value)), - Err(e) => Ok(format!("❌ Failed to get value: {}", e)), - } - } "macax_press_key" => { debug!("Processing macax_press_key tool call"); @@ -4555,37 +4252,6 @@ Template: Err(e) => Ok(format!("❌ Failed to type text: {}", e)), } } - "macax_focus_element" => { - debug!("Processing macax_focus_element tool call"); - - if !self.config.macax.enabled { - return Ok("❌ macOS Accessibility is not enabled. 
Use --macax flag to enable.".to_string()); - } - - let app_name = match tool_call.args.get("app_name").and_then(|v| v.as_str()) { - Some(n) => n, - None => return Ok("❌ Missing app_name argument".to_string()), - }; - - let role = match tool_call.args.get("role").and_then(|v| v.as_str()) { - Some(r) => r, - None => return Ok("❌ Missing role argument".to_string()), - }; - - let title = tool_call.args.get("title").and_then(|v| v.as_str()); - let identifier = tool_call.args.get("identifier").and_then(|v| v.as_str()); - - let controller_guard = self.macax_controller.read().await; - let controller = match controller_guard.as_ref() { - Some(c) => c, - None => return Ok("❌ macOS Accessibility controller not initialized.".to_string()), - }; - - match controller.focus_element(app_name, role, title, identifier) { - Ok(_) => Ok(format!("✅ Focused {} element in {}", role, app_name)), - Err(e) => Ok(format!("❌ Failed to focus element: {}", e)), - } - } "vision_find_text" => { debug!("Processing vision_find_text tool call"); @@ -4628,11 +4294,34 @@ Template: match controller.find_text_in_app(app_name, text).await { Ok(Some(location)) => { // Click on center of text - let center_x = location.x + location.width / 2; - let center_y = location.y + location.height / 2; + // IMPORTANT: location coordinates are in NSScreen space (Y=0 at BOTTOM, increases UPWARD) + // location.x is the LEFT edge of the bounding box + // location.y is the TOP edge of the bounding box (highest Y value in NSScreen space) + // location.width and location.height are already scaled to screen space + // To get center: we need to add half the SCALED width and subtract half the SCALED height - match controller.click_at(center_x, center_y, Some(app_name)) { - Ok(_) => Ok(format!("✅ Clicked on '{}' in {} at ({}, {})", text, app_name, center_x, center_y)), + if location.width == 0 || location.height == 0 { + return Ok(format!("❌ Invalid bounding box dimensions: width={}, height={}", location.width, 
location.height)); + } + + debug!("[vision_click_text] Location from find_text_in_app: x={}, y={}, width={}, height={}, text='{}'", + location.x, location.y, location.width, location.height, location.text); + + // Calculate center using the SCALED dimensions + // X: Use right edge instead of center (Vision OCR bounding box seems offset) + // This gives us: left edge + full width = right edge + // Y: top edge - half of scaled height (subtract because Y increases upward) + let click_x = location.x + location.width; // Right edge + let half_height = location.height / 2; + let click_y = location.y - half_height; + + debug!("[vision_click_text] Click position calculation: x={} + {} = {} (right edge), y={} - {} = {}", + location.x, location.width, click_x, location.y, half_height, click_y); + debug!("[vision_click_text] This means: left_edge={}, center={}, right_edge={}", + location.x, click_x, location.x + location.width); + + match controller.click_at(click_x, click_y, Some(app_name)) { + Ok(_) => Ok(format!("✅ Clicked on '{}' in {} at ({}, {})", text, app_name, click_x, click_y)), Err(e) => Ok(format!("❌ Failed to click: {}", e)), } } @@ -4709,13 +4398,15 @@ Template: match controller.find_text_in_app(app_name, text).await { Ok(Some(location)) => { // Calculate click position based on direction + // location.x is LEFT edge, location.y is TOP edge (in NSScreen space) let (click_x, click_y) = match direction { - "right" => (location.x + location.width + distance, location.y + location.height / 2), - "below" => (location.x + location.width / 2, location.y + location.height + distance), - "left" => (location.x - distance, location.y + location.height / 2), - "above" => (location.x + location.width / 2, location.y - distance), - _ => (location.x + location.width + distance, location.y + location.height / 2), + "right" => (location.x + location.width + distance, location.y - (location.height / 2)), + "below" => (location.x + (location.width / 2), location.y - 
location.height - distance), + "left" => (location.x - distance, location.y - (location.height / 2)), + "above" => (location.x + (location.width / 2), location.y + distance), + _ => (location.x + location.width + distance, location.y - (location.height / 2)), }; + debug!("[vision_click_near_text] Clicking {} of text at ({}, {})", direction, click_x, click_y); match controller.click_at(click_x, click_y, Some(app_name)) { Ok(_) => Ok(format!( diff --git a/crates/g3-execution/src/lib.rs b/crates/g3-execution/src/lib.rs index a42ba97..2a2e871 100644 --- a/crates/g3-execution/src/lib.rs +++ b/crates/g3-execution/src/lib.rs @@ -166,6 +166,31 @@ impl CodeExecutor { /// Execute Bash code async fn execute_bash(&self, code: &str) -> Result { + // Check if this is a detached/daemon command that should run independently + let is_detached = code.trim_start().starts_with("setsid ") + || code.trim_start().starts_with("nohup ") + || code.contains(" disown") + || (code.contains(" &") && (code.contains("nohup") || code.contains("setsid"))); + + if is_detached { + // For detached commands, just spawn and return immediately + use std::process::Stdio; + Command::new("bash") + .arg("-c") + .arg(code) + .stdin(Stdio::null()) + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .spawn()?; + + return Ok(ExecutionResult { + stdout: "✅ Command launched in background (detached process)".to_string(), + stderr: String::new(), + exit_code: 0, + success: true, + }); + } + let output = Command::new("bash") .arg("-c") .arg(code) @@ -221,6 +246,29 @@ impl CodeExecutor { use tokio::io::{AsyncBufReadExt, BufReader}; use tokio::process::Command as TokioCommand; + // Check if this is a detached/daemon command that should run independently + // Look for patterns like: setsid, nohup with &, or explicit backgrounding with disown + let is_detached = code.trim_start().starts_with("setsid ") + || code.trim_start().starts_with("nohup ") + || code.contains(" disown") + || (code.contains(" &") && 
(code.contains("nohup") || code.contains("setsid"))); + + if is_detached { + // For detached commands, just spawn and return immediately + TokioCommand::new("bash") + .arg("-c") + .arg(code) + .spawn()?; + + // Don't wait for the process - it's meant to run independently + return Ok(ExecutionResult { + stdout: "✅ Command launched in background (detached process)".to_string(), + stderr: String::new(), + exit_code: 0, + success: true, + }); + } + let mut child = TokioCommand::new("bash") .arg("-c") .arg(code)