use crate::{ComputerController, types::*}; use anyhow::Result; use async_trait::async_trait; use core_graphics::display::CGPoint; use core_graphics::event::{CGEvent, CGEventType, CGMouseButton, CGEventTapLocation}; use core_graphics::event_source::{CGEventSource, CGEventSourceStateID}; use std::path::Path; use tesseract::Tesseract; // MacOSController doesn't store CGEventSource to avoid Send/Sync issues // We create it fresh for each operation pub struct MacOSController { // Empty struct - event source created per operation } impl MacOSController { pub fn new() -> Result { // Test that we can create an event source let _event_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState) .map_err(|_| anyhow::anyhow!("Failed to create event source. Make sure Accessibility permissions are granted."))?; Ok(Self {}) } fn key_to_keycode(&self, key: &str) -> Result { // Map key names to macOS keycodes let keycode = match key.to_lowercase().as_str() { "return" | "enter" => 36, "tab" => 48, "space" => 49, "delete" | "backspace" => 51, "escape" | "esc" => 53, "command" | "cmd" => 55, "shift" => 56, "capslock" => 57, "option" | "alt" => 58, "control" | "ctrl" => 59, "left" => 123, "right" => 124, "down" => 125, "up" => 126, _ => anyhow::bail!("Unknown key: {}", key), }; Ok(keycode) } } #[async_trait] impl ComputerController for MacOSController { async fn move_mouse(&self, x: i32, y: i32) -> Result<()> { let event_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState) .map_err(|_| anyhow::anyhow!("Failed to create event source"))?; let point = CGPoint::new(x as f64, y as f64); let event = CGEvent::new_mouse_event( event_source, CGEventType::MouseMoved, point, CGMouseButton::Left, ).map_err(|_| anyhow::anyhow!("Failed to create mouse move event"))?; event.post(CGEventTapLocation::HID); Ok(()) } async fn click(&self, button: MouseButton) -> Result<()> { let (cg_button, down_type, up_type) = match button { MouseButton::Left => (CGMouseButton::Left, CGEventType::LeftMouseDown, CGEventType::LeftMouseUp), MouseButton::Right => (CGMouseButton::Right, CGEventType::RightMouseDown, CGEventType::RightMouseUp), MouseButton::Middle => (CGMouseButton::Center, CGEventType::OtherMouseDown, CGEventType::OtherMouseUp), }; let point = { // Get current mouse position let temp_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState) .map_err(|_| anyhow::anyhow!("Failed to create event source"))?; let event = CGEvent::new(temp_source) .map_err(|_| anyhow::anyhow!("Failed to get mouse position"))?; let p = event.location(); p }; { let event_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState) .map_err(|_| anyhow::anyhow!("Failed to create event source"))?; // Mouse down let down_event = CGEvent::new_mouse_event( event_source, down_type, point, cg_button, ).map_err(|_| anyhow::anyhow!("Failed to create mouse down event"))?; down_event.post(CGEventTapLocation::HID); } // event_source and down_event dropped here // Small delay tokio::time::sleep(tokio::time::Duration::from_millis(50)).await; { let event_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState) .map_err(|_| anyhow::anyhow!("Failed to create event source"))?; let up_event = CGEvent::new_mouse_event( event_source, up_type, point, cg_button, ).map_err(|_| anyhow::anyhow!("Failed to create mouse up event"))?; up_event.post(CGEventTapLocation::HID); } // event_source and up_event dropped here Ok(()) } async fn double_click(&self, button: MouseButton) -> Result<()> { self.click(button).await?; tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; self.click(button).await?; Ok(()) } async fn type_text(&self, text: &str) -> Result<()> { for ch in text.chars() { { let event_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState) .map_err(|_| anyhow::anyhow!("Failed to create event source"))?; // Create keyboard event for character let event = CGEvent::new_keyboard_event( event_source, 0, // keycode (0 for unicode) true, ).map_err(|_| anyhow::anyhow!("Failed to create keyboard event"))?; // Set unicode string let mut utf16_buf = [0u16; 2]; let utf16_slice = ch.encode_utf16(&mut utf16_buf); let utf16_chars: Vec = utf16_slice.iter().copied().collect(); event.set_string_from_utf16_unchecked(utf16_chars.as_slice()); event.post(CGEventTapLocation::HID); } // event_source and event dropped here tokio::time::sleep(tokio::time::Duration::from_millis(10)).await; } Ok(()) } async fn press_key(&self, key: &str) -> Result<()> { let keycode = self.key_to_keycode(key)?; { let event_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState) .map_err(|_| anyhow::anyhow!("Failed to create event source"))?; // Key down let down_event = CGEvent::new_keyboard_event( event_source, keycode, true, ).map_err(|_| anyhow::anyhow!("Failed to create key down event"))?; down_event.post(CGEventTapLocation::HID); } // event_source and down_event dropped here tokio::time::sleep(tokio::time::Duration::from_millis(50)).await; { let event_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState) .map_err(|_| anyhow::anyhow!("Failed to create event source"))?; // Key up let up_event = CGEvent::new_keyboard_event( event_source, keycode, false, ).map_err(|_| anyhow::anyhow!("Failed to create key up event"))?; up_event.post(CGEventTapLocation::HID); } // event_source and up_event dropped here Ok(()) } async fn list_windows(&self) -> Result> { // Note: Full implementation would use CGWindowListCopyWindowInfo // For now, return empty list as this requires more complex FFI tracing::warn!("list_windows not fully implemented on macOS"); Ok(vec![]) } async fn focus_window(&self, _window_id: &str) -> Result<()> { // Note: Full implementation would use NSWorkspace to activate application tracing::warn!("focus_window not fully implemented on macOS"); Ok(()) } async fn get_window_bounds(&self, _window_id: &str) -> Result { // Note: Full implementation would use Accessibility API tracing::warn!("get_window_bounds not fully implemented on macOS"); Ok(Rect { x: 0, y: 0, width: 800, height: 600 }) } async fn find_element(&self, _selector: &ElementSelector) -> Result> { // Note: Full implementation would use macOS Accessibility API tracing::warn!("find_element not fully implemented on macOS"); Ok(None) } async fn get_element_text(&self, _element_id: &str) -> Result { // Note: Full implementation would use Accessibility API tracing::warn!("get_element_text not fully implemented on macOS"); Ok(String::new()) } async fn get_element_bounds(&self, _element_id: &str) -> Result { // Note: Full implementation would use Accessibility API tracing::warn!("get_element_bounds not fully implemented on macOS"); Ok(Rect { x: 0, y: 0, width: 100, height: 30 }) } async fn take_screenshot(&self, path: &str, _region: Option, window_id: Option<&str>) -> Result<()> { // Use native macOS screencapture command which handles all the format complexities // Check if we have Screen Recording permission by attempting a test capture // If we only get wallpaper/menubar but no windows, we need permission let needs_permission_check = std::env::var("G3_SKIP_PERMISSION_CHECK").is_err(); if needs_permission_check { // Try to open Screen Recording settings if this is the first screenshot static PERMISSION_PROMPTED: std::sync::atomic::AtomicBool = std::sync::atomic::AtomicBool::new(false); if !PERMISSION_PROMPTED.swap(true, std::sync::atomic::Ordering::Relaxed) { tracing::warn!("\n=== Screen Recording Permission Required ===\n\ macOS requires explicit permission to capture window content.\n\ If screenshots only show wallpaper/menubar (no windows):\n\n\ 1. Open System Settings > Privacy & Security > Screen Recording\n\ 2. Enable permission for your terminal (iTerm/Terminal) or g3\n\ 3. Restart your terminal if needed\n\n\ Opening Screen Recording settings now...\n"); // Try to open the settings (non-blocking) let _ = std::process::Command::new("open") .arg("x-apple.systempreferences:com.apple.preference.security?Privacy_ScreenCapture") .spawn(); } } let path_obj = Path::new(path); if let Some(parent) = path_obj.parent() { std::fs::create_dir_all(parent)?; } let mut cmd = std::process::Command::new("screencapture"); // Add flags cmd.arg("-x"); // No sound if let Some(window_id) = window_id { // Capture specific window by getting its bounds and using region capture // window_id format: "AppName" or "AppName:WindowTitle" let app_name = window_id.split(':').next().unwrap_or(window_id); // Use AppleScript to get window bounds let script = format!( r#"tell application "{}" tell current window get bounds end tell end tell"#, app_name ); let output = std::process::Command::new("osascript") .arg("-e") .arg(&script) .output() .map_err(|e| anyhow::anyhow!("Failed to get window bounds: {}", e))?; if output.status.success() { let bounds_str = String::from_utf8_lossy(&output.stdout); let bounds: Vec = bounds_str .trim() .split(',') .filter_map(|s| s.trim().parse().ok()) .collect(); if bounds.len() == 4 { let (left, top, right, bottom) = (bounds[0], bounds[1], bounds[2], bounds[3]); let width = right - left; let height = bottom - top; cmd.arg("-R"); cmd.arg(format!("{},{},{},{}", left, top, width, height)); tracing::debug!("Capturing window '{}' at region: {},{} {}x{}", app_name, left, top, width, height); } else { tracing::warn!("Failed to parse window bounds, capturing full screen"); } } else { tracing::warn!("Failed to get window bounds for '{}', capturing full screen", app_name); } } else if let Some(region) = _region { // Capture specific region: -R x,y,width,height cmd.arg("-R"); cmd.arg(format!("{},{},{},{}", region.x, region.y, region.width, region.height)); } cmd.arg(path); let output = cmd.output() .map_err(|e| anyhow::anyhow!("Failed to execute screencapture: {}", e))?; if !output.status.success() { let stderr = String::from_utf8_lossy(&output.stderr); anyhow::bail!("screencapture failed: {}", stderr); } tracing::debug!("Screenshot saved using screencapture: {}", path); Ok(()) } } async fn extract_text_from_screen(&self, region: Rect) -> Result { // Take screenshot of region first let temp_path = format!("/tmp/g3_ocr_{}.png", uuid::Uuid::new_v4()); self.take_screenshot(&temp_path, Some(region), None).await?; // Extract text from the screenshot let result = self.extract_text_from_image(&temp_path).await?; // Clean up temp file let _ = std::fs::remove_file(&temp_path); Ok(result) } async fn extract_text_from_image(&self, _path: &str) -> Result { // Check if tesseract is available on the system let tesseract_check = std::process::Command::new("which") .arg("tesseract") .output(); if tesseract_check.is_err() || !tesseract_check.as_ref().unwrap().status.success() { anyhow::bail!("Tesseract OCR is not installed on your system.\n\n\ To install tesseract:\n macOS: brew install tesseract\n \ Linux: sudo apt-get install tesseract-ocr (Ubuntu/Debian)\n \ sudo yum install tesseract (RHEL/CentOS)\n \ Windows: Download from https://github.com/UB-Mannheim/tesseract/wiki\n\n\ After installation, restart your terminal and try again."); } // Initialize Tesseract let tess = Tesseract::new(None, Some("eng")) .map_err(|e| { anyhow::anyhow!("Failed to initialize Tesseract: {}\n\n\ This usually means:\n1. Tesseract is not properly installed\n\ 2. Language data files are missing\n\nTo fix:\n \ macOS: brew reinstall tesseract\n \ Linux: sudo apt-get install tesseract-ocr-eng\n \ Windows: Reinstall tesseract and ensure language files are included", e) })?; let text = tess.set_image(_path) .map_err(|e| anyhow::anyhow!("Failed to load image '{}': {}", _path, e))? .get_text() .map_err(|e| anyhow::anyhow!("Failed to extract text from image: {}", e))?; // Get confidence (simplified - would need more complex API calls for per-word confidence) let confidence = 0.85; // Placeholder Ok(OCRResult { text, confidence, bounds: Rect { x: 0, y: 0, width: 0, height: 0 }, // Would need image dimensions }) } async fn find_text_on_screen(&self, _text: &str) -> Result> { // Check if tesseract is available on the system let tesseract_check = std::process::Command::new("which") .arg("tesseract") .output(); if tesseract_check.is_err() || !tesseract_check.as_ref().unwrap().status.success() { anyhow::bail!("Tesseract OCR is not installed on your system.\n\n\ To install tesseract:\n macOS: brew install tesseract\n \ Linux: sudo apt-get install tesseract-ocr (Ubuntu/Debian)\n \ sudo yum install tesseract (RHEL/CentOS)\n \ Windows: Download from https://github.com/UB-Mannheim/tesseract/wiki\n\n\ After installation, restart your terminal and try again."); } // Take full screen screenshot let temp_path = format!("/tmp/g3_ocr_search_{}.png", uuid::Uuid::new_v4()); self.take_screenshot(&temp_path, None, None).await?; // Use Tesseract to find text with bounding boxes let tess = Tesseract::new(None, Some("eng")) .map_err(|e| { anyhow::anyhow!("Failed to initialize Tesseract: {}\n\n\ This usually means:\n1. Tesseract is not properly installed\n\ 2. Language data files are missing\n\nTo fix:\n \ macOS: brew reinstall tesseract\n \ Linux: sudo apt-get install tesseract-ocr-eng\n \ Windows: Reinstall tesseract and ensure language files are included", e) })?; let full_text = tess.set_image(temp_path.as_str()) .map_err(|e| anyhow::anyhow!("Failed to load screenshot: {}", e))? .get_text() .map_err(|e| anyhow::anyhow!("Failed to extract text from screen: {}", e))?; // Clean up temp file let _ = std::fs::remove_file(&temp_path); // Simple text search - full implementation would use get_component_images // to get bounding boxes for each word if full_text.contains(_text) { tracing::warn!("Text found but precise coordinates not available in simplified implementation"); Ok(Some(Point { x: 0, y: 0 })) } else { Ok(None) } } }