computer control tools

2025-10-18 14:16:42 +11:00
parent a566171203
commit da652bf287
22 changed files with 2720 additions and 36 deletions
--- a/crates/g3-computer-control/Cargo.toml
+++ b/crates/g3-computer-control/Cargo.toml
@@ -0,0 +1,42 @@
+[package]
+name = "g3-computer-control"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+# Workspace dependencies
+tokio = { workspace = true }
+anyhow = { workspace = true }
+thiserror = { workspace = true }
+serde = { workspace = true }
+serde_json = { workspace = true }
+tracing = { workspace = true }
+uuid = { workspace = true }
+
+# Async trait support
+async-trait = "0.1"
+
+# OCR dependencies
+tesseract = "0.14"
+
+# macOS dependencies
+[target.'cfg(target_os = "macos")'.dependencies]
+core-graphics = "0.23"
+core-foundation = "0.9"
+cocoa = "0.25"
+objc = "0.2"
+image = "0.24"
+
+# Linux dependencies
+[target.'cfg(target_os = "linux")'.dependencies]
+x11 = { version = "2.21", features = ["xlib", "xtest"] }
+image = "0.24"
+
+# Windows dependencies
+[target.'cfg(target_os = "windows")'.dependencies]
+windows = { version = "0.52", features = [
+    "Win32_Foundation",
+    "Win32_UI_WindowsAndMessaging",
+    "Win32_UI_Input_KeyboardAndMouse",
+    "Win32_Graphics_Gdi",
+] }
--- a/crates/g3-computer-control/examples/debug_screenshot.rs
+++ b/crates/g3-computer-control/examples/debug_screenshot.rs
@@ -0,0 +1,46 @@
+use core_graphics::display::CGDisplay;
+
+/// Captures the main display and prints the raw `CGImage` layout.
+///
+/// Reports the image dimensions, per-row padding, the first three pixels and the
+/// pixel at the centre of the image, as raw bytes in the order they are stored.
+///
+/// # Panics
+///
+/// Panics if the screen cannot be captured.
+fn main() {
+    let display = CGDisplay::main();
+    let image = display.image().expect("Failed to capture screen");
+
+    let width = image.width();
+    let height = image.height();
+    let bytes_per_row = image.bytes_per_row();
+
+    println!("CGImage properties:");
+    println!("  Width: {}", width);
+    println!("  Height: {}", height);
+    println!("  Bits per component: {}", image.bits_per_component());
+    println!("  Bits per pixel: {}", image.bits_per_pixel());
+    println!("  Bytes per row: {}", bytes_per_row);
+
+    let data = image.data();
+    let bytes: &[u8] = data.bytes();
+    let expected_size = width * height * 4;
+    println!("  Data length: {}", bytes.len());
+    println!("  Expected (w*h*4): {}", expected_size);
+
+    // Check if there's padding in rows
+    let expected_bytes_per_row = width * 4;
+    println!("\nRow alignment:");
+    println!("  Actual bytes per row: {}", bytes_per_row);
+    println!("  Expected (width * 4): {}", expected_bytes_per_row);
+    // A row narrower than width * 4 would make the plain subtraction underflow
+    // (a panic in debug builds), so report that case explicitly instead.
+    match bytes_per_row.checked_sub(expected_bytes_per_row) {
+        Some(padding) => println!("  Padding per row: {}", padding),
+        None => println!("  Padding per row: n/a (row is narrower than width * 4)"),
+    }
+
+    // Sample some pixels from the start of the buffer; `chunks_exact` simply
+    // yields fewer items if the buffer holds fewer than three pixels.
+    println!("\nFirst 3 pixels (raw bytes):");
+    for (i, px) in bytes.chunks_exact(4).take(3).enumerate() {
+        println!("  Pixel {}: [{:3}, {:3}, {:3}, {:3}]",
+                 i, px[0], px[1], px[2], px[3]);
+    }
+
+    // Check a pixel from the middle
+    let mid_row = height / 2;
+    let mid_col = width / 2;
+    let mid_offset = mid_row * bytes_per_row + mid_col * 4;
+    println!("\nMiddle pixel (row {}, col {}):", mid_row, mid_col);
+    println!("  Offset: {}", mid_offset);
+    if let Some(px) = bytes.get(mid_offset..mid_offset + 4) {
+        println!("  Bytes: [{:3}, {:3}, {:3}, {:3}]",
+                 px[0], px[1], px[2], px[3]);
+    }
+}
--- a/crates/g3-computer-control/examples/list_windows.rs
+++ b/crates/g3-computer-control/examples/list_windows.rs
@@ -0,0 +1,56 @@
+use core_graphics::window::{kCGWindowListOptionOnScreenOnly, kCGNullWindowID, CGWindowListCopyWindowInfo};
+use core_foundation::dictionary::CFDictionary;
+use core_foundation::string::CFString;
+use core_foundation::base::TCFType;
+
+/// Prints the window ID, owner and title of on-screen terminal windows.
+///
+/// Queries the on-screen window list and prints only the entries whose owner
+/// name contains "iTerm" or "Terminal".
+fn main() {
+    println!("Listing all on-screen windows...");
+    println!("{:<10} {:<25} {}", "Window ID", "Owner", "Title");
+    println!("{}", "-".repeat(80));
+
+    // SAFETY: the array returned by `CGWindowListCopyWindowInfo` is checked for
+    // NULL and wrapped under the create rule exactly once; every value read from
+    // a dictionary is wrapped under the get rule, so it is retained for as long
+    // as the wrapper lives.
+    unsafe {
+        let window_list = CGWindowListCopyWindowInfo(
+            kCGWindowListOptionOnScreenOnly,
+            kCGNullWindowID
+        );
+        if window_list.is_null() {
+            eprintln!("Failed to query the window list");
+            return;
+        }
+
+        // Take ownership ONCE. Wrapping the same reference under the create rule
+        // twice makes two owners release it, leaving the second wrapper dangling.
+        let array = core_foundation::array::CFArray::<CFDictionary>::wrap_under_create_rule(window_list);
+        let count = array.len();
+
+        for i in 0..count {
+            let dict = array.get(i).unwrap();
+
+            // Get window ID
+            let window_id_key = CFString::from_static_string("kCGWindowNumber");
+            let window_id: i64 = if let Some(value) = dict.find(window_id_key.as_concrete_TypeRef()) {
+                let num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*value as *const _);
+                num.to_i64().unwrap_or(0)
+            } else {
+                0
+            };
+
+            // Get owner name
+            let owner_key = CFString::from_static_string("kCGWindowOwnerName");
+            let owner: String = if let Some(value) = dict.find(owner_key.as_concrete_TypeRef()) {
+                let s: CFString = TCFType::wrap_under_get_rule(*value as *const _);
+                s.to_string()
+            } else {
+                "Unknown".to_string()
+            };
+
+            // Get window name/title
+            let name_key = CFString::from_static_string("kCGWindowName");
+            let title: String = if let Some(value) = dict.find(name_key.as_concrete_TypeRef()) {
+                let s: CFString = TCFType::wrap_under_get_rule(*value as *const _);
+                s.to_string()
+            } else {
+                "".to_string()
+            };
+
+            // Only terminal applications are printed
+            if owner.contains("iTerm") || owner.contains("Terminal") {
+                println!("{:<10} {:<25} {}", window_id, owner, title);
+            }
+        }
+    }
+}
--- a/crates/g3-computer-control/examples/test_permission_prompt.rs
+++ b/crates/g3-computer-control/examples/test_permission_prompt.rs
@@ -0,0 +1,21 @@
+use g3_computer_control::{create_controller, ComputerController};
+
+/// Takes a full-screen screenshot through the controller and opens the result.
+///
+/// The first capture is expected to trigger the OS permission prompt; on
+/// failure the error is printed and the program exits normally.
+#[tokio::main]
+async fn main() {
+    println!("Testing screenshot with permission prompt...");
+
+    let controller = create_controller().expect("Failed to create controller");
+    let outcome = controller
+        .take_screenshot("/tmp/test_with_prompt.png", None, None)
+        .await;
+
+    if let Err(e) = outcome {
+        println!("❌ Screenshot failed: {}", e);
+        return;
+    }
+
+    println!("\n✅ Screenshot saved to /tmp/test_with_prompt.png");
+    println!("Opening screenshot...");
+    // Best effort: a failure to launch the viewer is deliberately ignored.
+    let _ = std::process::Command::new("open")
+        .arg("/tmp/test_with_prompt.png")
+        .spawn();
+}
--- a/crates/g3-computer-control/examples/test_screencapture_direct.rs
+++ b/crates/g3-computer-control/examples/test_screencapture_direct.rs
@@ -0,0 +1,39 @@
+use std::process::Command;
+
+/// Runs the `screencapture` command-line tool directly and reports what happened.
+///
+/// Prints the command, its exit status and captured output; on success it also
+/// prints the size of the written file and opens it with `open`.
+fn main() {
+    let path = "/tmp/rust_screencapture_test.png";
+
+    println!("Testing screencapture command from Rust...");
+
+    let mut cmd = Command::new("screencapture");
+    cmd.arg("-x") // No sound
+        .arg(path);
+
+    println!("Command: {:?}", cmd);
+
+    let output = match cmd.output() {
+        Ok(output) => output,
+        Err(e) => {
+            println!("❌ Failed to execute screencapture: {}", e);
+            return;
+        }
+    };
+
+    println!("Exit status: {}", output.status);
+    println!("Stdout: {}", String::from_utf8_lossy(&output.stdout));
+    println!("Stderr: {}", String::from_utf8_lossy(&output.stderr));
+
+    if !output.status.success() {
+        println!("\n❌ Screenshot failed!");
+        return;
+    }
+
+    println!("\n✅ Screenshot saved to: {}", path);
+
+    // Check file exists and size
+    if let Ok(metadata) = std::fs::metadata(path) {
+        let size = metadata.len();
+        println!("File size: {} bytes ({:.1} MB)", size, size as f64 / 1_000_000.0);
+    }
+
+    // Open it; a failure to launch the viewer is deliberately ignored.
+    let _ = Command::new("open").arg(path).spawn();
+    println!("\nOpened screenshot - please verify it looks correct!");
+}
--- a/crates/g3-computer-control/examples/test_screenshot_fix.rs
+++ b/crates/g3-computer-control/examples/test_screenshot_fix.rs
@@ -0,0 +1,69 @@
+use core_graphics::display::CGDisplay;
+use image::{ImageBuffer, RgbaImage};
+use std::path::Path;
+
+/// Copies the top-left `crop` x `crop` pixel square out of an RGBA buffer whose
+/// rows are `stride_px` pixels apart.
+///
+/// Returns `None` when the square does not fit into the buffer.
+fn crop_top_left(rgba: &[u8], stride_px: usize, crop: usize) -> Option<Vec<u8>> {
+    if crop > stride_px {
+        return None;
+    }
+    let mut out = Vec::with_capacity(crop * crop * 4);
+    for row in 0..crop {
+        let start = row * stride_px * 4;
+        out.extend_from_slice(rgba.get(start..start + crop * 4)?);
+    }
+    Some(out)
+}
+
+/// Compares two ways of converting a captured screen from BGRA to RGBA.
+///
+/// The "old" method treats the pixel data as one continuous run of pixels and
+/// therefore also converts the padding bytes at the end of each row; the "new"
+/// method walks the buffer row by row using `bytes_per_row`. The top-left square
+/// of each result is saved to `/tmp` so the two can be compared visually.
+///
+/// # Panics
+///
+/// Panics if the screen cannot be captured or an output image cannot be saved.
+fn main() {
+    let display = CGDisplay::main();
+    let image = display.image().expect("Failed to capture screen");
+
+    let width = image.width() as u32;
+    let height = image.height() as u32;
+    let bytes_per_row = image.bytes_per_row() as usize;
+    let data = image.data();
+
+    println!("Testing screenshot fix...");
+    println!("Image: {}x{}, bytes_per_row: {}", width, height, bytes_per_row);
+    println!("Expected bytes per row: {}", width * 4);
+    // saturating_sub: a row narrower than width * 4 must not underflow and panic.
+    println!("Padding per row: {} bytes", bytes_per_row.saturating_sub(width as usize * 4));
+
+    // OLD METHOD (broken) - treating data as continuous
+    println!("\n=== OLD METHOD (BROKEN) ===");
+    let mut old_rgba = Vec::with_capacity(data.len() as usize);
+    for chunk in data.chunks_exact(4) {
+        // BGRA -> RGBA
+        old_rgba.extend_from_slice(&[chunk[2], chunk[1], chunk[0], chunk[3]]);
+    }
+    println!("Converted {} pixels", old_rgba.len() / 4);
+    println!("Expected {} pixels", width * height);
+
+    // NEW METHOD (fixed) - handling row padding
+    println!("\n=== NEW METHOD (FIXED) ===");
+    let mut new_rgba = Vec::with_capacity((width * height * 4) as usize);
+    for row in 0..height as usize {
+        let row_start = row * bytes_per_row;
+        let row_end = row_start + (width as usize * 4);
+
+        for chunk in data[row_start..row_end].chunks_exact(4) {
+            // BGRA -> RGBA
+            new_rgba.extend_from_slice(&[chunk[2], chunk[1], chunk[0], chunk[3]]);
+        }
+    }
+    println!("Converted {} pixels", new_rgba.len() / 4);
+    println!("Expected {} pixels", width * height);
+
+    // Save a small crop from both methods. Both buffers are read as if they were
+    // `width` pixels wide: that is exactly the assumption the old method makes,
+    // so any row padding shows up as a skew in its crop. Taking the first
+    // crop_size * crop_size pixels linearly would scramble both images alike.
+    let crop_size: u32 = 200u32.min(width).min(height);
+
+    // Old method crop
+    let old_crop = crop_top_left(&old_rgba, width as usize, crop_size as usize);
+    if let Some(old_img) = old_crop.and_then(|px| ImageBuffer::from_raw(crop_size, crop_size, px)) {
+        let old_img: RgbaImage = old_img;
+        old_img.save("/tmp/screenshot_old_method.png").unwrap();
+        println!("\nSaved OLD method crop to: /tmp/screenshot_old_method.png");
+    }
+
+    // New method crop
+    let new_crop = crop_top_left(&new_rgba, width as usize, crop_size as usize);
+    if let Some(new_img) = new_crop.and_then(|px| ImageBuffer::from_raw(crop_size, crop_size, px)) {
+        let new_img: RgbaImage = new_img;
+        new_img.save("/tmp/screenshot_new_method.png").unwrap();
+        println!("Saved NEW method crop to: /tmp/screenshot_new_method.png");
+    }
+
+    println!("\nOpen both images to compare:");
+    println!("  open /tmp/screenshot_old_method.png /tmp/screenshot_new_method.png");
+}
--- a/crates/g3-computer-control/examples/test_window_capture.rs
+++ b/crates/g3-computer-control/examples/test_window_capture.rs
@@ -0,0 +1,45 @@
+use g3_computer_control::create_controller;
+
+/// Captures an iTerm2 window and the full screen, then compares the results.
+///
+/// Each capture is opened with `open` on success; afterwards the two file
+/// sizes are printed if both files exist.
+#[tokio::main]
+async fn main() {
+    println!("Testing window-specific screenshot capture...");
+
+    let controller = create_controller().expect("Failed to create controller");
+
+    // Test 1: Capture iTerm2 window
+    println!("\n1. Capturing iTerm2 window...");
+    let window_shot = controller
+        .take_screenshot("/tmp/iterm_window.png", None, Some("iTerm2"))
+        .await;
+    if let Err(e) = window_shot {
+        println!("   ❌ Failed: {}", e);
+    } else {
+        println!("   ✅ iTerm2 window captured to /tmp/iterm_window.png");
+        let _ = std::process::Command::new("open").arg("/tmp/iterm_window.png").spawn();
+    }
+
+    // Wait a moment for the image to open
+    tokio::time::sleep(tokio::time::Duration::from_secs(2)).await;
+
+    // Test 2: Full screen capture for comparison
+    println!("\n2. Capturing full screen for comparison...");
+    let full_shot = controller
+        .take_screenshot("/tmp/fullscreen.png", None, None)
+        .await;
+    if let Err(e) = full_shot {
+        println!("   ❌ Failed: {}", e);
+    } else {
+        println!("   ✅ Full screen captured to /tmp/fullscreen.png");
+        let _ = std::process::Command::new("open").arg("/tmp/fullscreen.png").spawn();
+    }
+
+    println!("\n=== Comparison ===");
+    println!("iTerm window:  /tmp/iterm_window.png (should show ONLY iTerm window)");
+    println!("Full screen:   /tmp/fullscreen.png (should show entire desktop)");
+
+    // Show file sizes, but only when both captures produced a file
+    let sizes = (
+        std::fs::metadata("/tmp/iterm_window.png"),
+        std::fs::metadata("/tmp/fullscreen.png"),
+    );
+    if let (Ok(window_meta), Ok(screen_meta)) = sizes {
+        println!("\nFile sizes:");
+        println!("  iTerm window: {:.1} MB", window_meta.len() as f64 / 1_000_000.0);
+        println!("  Full screen:  {:.1} MB", screen_meta.len() as f64 / 1_000_000.0);
+        println!("\nWindow capture should be smaller than full screen.");
+    }
+}
--- a/crates/g3-computer-control/src/lib.rs
+++ b/crates/g3-computer-control/src/lib.rs
@@ -0,0 +1,51 @@
+pub mod types;
+pub mod platform;
+
+use anyhow::Result;
+use async_trait::async_trait;
+use types::*;
+
+/// Platform-independent interface for driving the mouse, keyboard, windows,
+/// screen capture and OCR.
+///
+/// Implementations must be `Send + Sync`; every operation is async and reports
+/// failure through `anyhow::Result`.
+#[async_trait]
+pub trait ComputerController: Send + Sync {
+    // Mouse operations
+    /// Moves the mouse pointer to `(x, y)`.
+    // NOTE(review): coordinate space/units are backend-defined — confirm.
+    async fn move_mouse(&self, x: i32, y: i32) -> Result<()>;
+    /// Presses and releases `button`.
+    async fn click(&self, button: MouseButton) -> Result<()>;
+    /// Performs a double click with `button`.
+    async fn double_click(&self, button: MouseButton) -> Result<()>;
+    
+    // Keyboard operations
+    /// Types `text`.
+    async fn type_text(&self, text: &str) -> Result<()>;
+    /// Presses the key identified by the name `key`.
+    async fn press_key(&self, key: &str) -> Result<()>;
+    
+    // Window management
+    /// Lists windows known to the backend.
+    async fn list_windows(&self) -> Result<Vec<Window>>;
+    /// Gives focus to the window identified by `window_id`.
+    async fn focus_window(&self, window_id: &str) -> Result<()>;
+    /// Returns the bounds of the window identified by `window_id`.
+    async fn get_window_bounds(&self, window_id: &str) -> Result<Rect>;
+    
+    // UI element inspection
+    /// Looks up a UI element matching `selector`; `Ok(None)` means no match.
+    async fn find_element(&self, selector: &ElementSelector) -> Result<Option<UIElement>>;
+    /// Returns the text of the element identified by `element_id`.
+    async fn get_element_text(&self, element_id: &str) -> Result<String>;
+    /// Returns the bounds of the element identified by `element_id`.
+    async fn get_element_bounds(&self, element_id: &str) -> Result<Rect>;
+    
+    // Screen capture
+    /// Saves a screenshot to `path`, optionally limited to `region` and/or the
+    /// window identified by `window_id`.
+    async fn take_screenshot(&self, path: &str, region: Option<Rect>, window_id: Option<&str>) -> Result<()>;
+    
+    // OCR operations
+    /// Runs OCR over `region` of the screen.
+    async fn extract_text_from_screen(&self, region: Rect) -> Result<OCRResult>;
+    /// Runs OCR over the image file at `path`.
+    async fn extract_text_from_image(&self, path: &str) -> Result<OCRResult>;
+    /// Searches the screen for `text`; `Ok(None)` means it was not found.
+    async fn find_text_on_screen(&self, text: &str) -> Result<Option<Point>>;
+}
+
+/// Builds the [`ComputerController`] implementation for the current target OS.
+///
+/// # Errors
+///
+/// Returns the platform constructor's error, or "Unsupported platform" when the
+/// target is not macOS, Linux or Windows.
+pub fn create_controller() -> Result<Box<dyn ComputerController>> {
+    #[cfg(target_os = "macos")]
+    {
+        let controller = platform::macos::MacOSController::new()?;
+        return Ok(Box::new(controller));
+    }
+
+    #[cfg(target_os = "linux")]
+    {
+        let controller = platform::linux::LinuxController::new()?;
+        return Ok(Box::new(controller));
+    }
+
+    #[cfg(target_os = "windows")]
+    {
+        let controller = platform::windows::WindowsController::new()?;
+        return Ok(Box::new(controller));
+    }
+
+    #[cfg(not(any(target_os = "macos", target_os = "linux", target_os = "windows")))]
+    {
+        anyhow::bail!("Unsupported platform");
+    }
+}
--- a/crates/g3-computer-control/src/platform/linux.rs
+++ b/crates/g3-computer-control/src/platform/linux.rs
@@ -0,0 +1,161 @@
+use crate::{ComputerController, types::*};
+use anyhow::Result;
+use async_trait::async_trait;
+use tesseract::Tesseract;
+use uuid::Uuid;
+
+/// Linux backend for [`ComputerController`].
+///
+/// Holds no state yet; it is a placeholder for an X11 connection or similar.
+pub struct LinuxController {}
+
+impl LinuxController {
+    /// Creates the controller and logs a warning that the Linux backend is
+    /// not fully implemented. No connection is opened at this point.
+    pub fn new() -> Result<Self> {
+        tracing::warn!("Linux computer control not fully implemented");
+        let controller = Self {};
+        Ok(controller)
+    }
+}
+
+/// Linux implementation of [`ComputerController`].
+///
+/// Input, window, element and screenshot operations are stubs that always fail
+/// with "Linux implementation not yet available". Only the OCR entry points
+/// contain real logic, and `find_text_on_screen` still depends on the
+/// unimplemented screenshot call.
+#[async_trait]
+impl ComputerController for LinuxController {
+    async fn move_mouse(&self, _x: i32, _y: i32) -> Result<()> {
+        anyhow::bail!("Linux implementation not yet available")
+    }
+
+    async fn click(&self, _button: MouseButton) -> Result<()> {
+        anyhow::bail!("Linux implementation not yet available")
+    }
+
+    async fn double_click(&self, _button: MouseButton) -> Result<()> {
+        anyhow::bail!("Linux implementation not yet available")
+    }
+
+    async fn type_text(&self, _text: &str) -> Result<()> {
+        anyhow::bail!("Linux implementation not yet available")
+    }
+
+    async fn press_key(&self, _key: &str) -> Result<()> {
+        anyhow::bail!("Linux implementation not yet available")
+    }
+
+    async fn list_windows(&self) -> Result<Vec<Window>> {
+        anyhow::bail!("Linux implementation not yet available")
+    }
+
+    async fn focus_window(&self, _window_id: &str) -> Result<()> {
+        anyhow::bail!("Linux implementation not yet available")
+    }
+
+    async fn get_window_bounds(&self, _window_id: &str) -> Result<Rect> {
+        anyhow::bail!("Linux implementation not yet available")
+    }
+
+    async fn find_element(&self, _selector: &ElementSelector) -> Result<Option<UIElement>> {
+        anyhow::bail!("Linux implementation not yet available")
+    }
+
+    async fn get_element_text(&self, _element_id: &str) -> Result<String> {
+        anyhow::bail!("Linux implementation not yet available")
+    }
+
+    async fn get_element_bounds(&self, _element_id: &str) -> Result<Rect> {
+        anyhow::bail!("Linux implementation not yet available")
+    }
+
+    async fn take_screenshot(&self, _path: &str, _region: Option<Rect>, _window_id: Option<&str>) -> Result<()> {
+        anyhow::bail!("Linux implementation not yet available")
+    }
+
+    async fn extract_text_from_screen(&self, _region: Rect) -> Result<OCRResult> {
+        anyhow::bail!("Linux implementation not yet available")
+    }
+
+    /// Runs Tesseract (English) over the image at `path`.
+    ///
+    /// The returned confidence (0.85) and the zeroed bounds are placeholders,
+    /// not values measured from the image.
+    ///
+    /// # Errors
+    ///
+    /// Fails when the `tesseract` binary is not found, Tesseract cannot be
+    /// initialised, the image cannot be loaded, or text extraction fails.
+    async fn extract_text_from_image(&self, path: &str) -> Result<OCRResult> {
+        ensure_tesseract_installed()?;
+
+        let text = new_tesseract()?
+            .set_image(path)
+            .map_err(|e| anyhow::anyhow!("Failed to load image '{}': {}", path, e))?
+            .get_text()
+            .map_err(|e| anyhow::anyhow!("Failed to extract text from image: {}", e))?;
+
+        // Get confidence (simplified - would need more complex API calls for per-word confidence)
+        let confidence = 0.85; // Placeholder
+
+        Ok(OCRResult {
+            text,
+            confidence,
+            bounds: Rect { x: 0, y: 0, width: 0, height: 0 }, // Would need image dimensions
+        })
+    }
+
+    /// Screenshots the whole screen, OCRs it and reports whether `text` occurs.
+    ///
+    /// Returns `Ok(None)` when the text is absent. When it is present the
+    /// returned point is a `(0, 0)` placeholder: this simplified implementation
+    /// has no coordinates for the match.
+    ///
+    /// # Errors
+    ///
+    /// Fails when the `tesseract` binary is not found, the screenshot fails, or
+    /// OCR fails. The temporary screenshot is removed on every path.
+    async fn find_text_on_screen(&self, text: &str) -> Result<Option<Point>> {
+        ensure_tesseract_installed()?;
+
+        // Take full screen screenshot
+        let temp_path = format!("/tmp/g3_ocr_search_{}.png", uuid::Uuid::new_v4());
+        if let Err(e) = self.take_screenshot(&temp_path, None, None).await {
+            // A failed capture may still have left a partial file behind.
+            let _ = std::fs::remove_file(&temp_path);
+            return Err(e);
+        }
+
+        // Run OCR inside a closure so the temp file is cleaned up even when one
+        // of the steps fails; the result is propagated only after the cleanup.
+        let ocr_result = (|| -> Result<String> {
+            let full_text = new_tesseract()?
+                .set_image(temp_path.as_str())
+                .map_err(|e| anyhow::anyhow!("Failed to load screenshot: {}", e))?
+                .get_text()
+                .map_err(|e| anyhow::anyhow!("Failed to extract text from screen: {}", e))?;
+            Ok(full_text)
+        })();
+
+        // Clean up temp file
+        let _ = std::fs::remove_file(&temp_path);
+        let full_text = ocr_result?;
+
+        // Simple text search - full implementation would use get_component_images
+        // to get bounding boxes for each word
+        if full_text.contains(text) {
+            tracing::warn!("Text found but precise coordinates not available in simplified implementation");
+            Ok(Some(Point { x: 0, y: 0 }))
+        } else {
+            Ok(None)
+        }
+    }
+}
+
+/// Checks via `which tesseract` that the Tesseract binary is on the PATH.
+fn ensure_tesseract_installed() -> Result<()> {
+    let installed = std::process::Command::new("which")
+        .arg("tesseract")
+        .output()
+        .map(|output| output.status.success())
+        .unwrap_or(false);
+
+    if !installed {
+        anyhow::bail!("Tesseract OCR is not installed on your system.\n\n\
+            To install tesseract:\n  \
+            Ubuntu/Debian: sudo apt-get install tesseract-ocr\n  \
+            RHEL/CentOS:   sudo yum install tesseract\n  \
+            Arch Linux:    sudo pacman -S tesseract\n\n\
+            After installation, restart your terminal and try again.");
+    }
+    Ok(())
+}
+
+/// Initialises a Tesseract instance for English, with installation hints on failure.
+fn new_tesseract() -> Result<Tesseract> {
+    Tesseract::new(None, Some("eng"))
+        .map_err(|e| {
+            anyhow::anyhow!("Failed to initialize Tesseract: {}\n\n\
+                This usually means:\n1. Tesseract is not properly installed\n\
+                2. Language data files are missing\n\nTo fix:\n  \
+                Ubuntu/Debian: sudo apt-get install tesseract-ocr-eng\n  \
+                RHEL/CentOS:   sudo yum install tesseract-langpack-eng\n  \
+                Arch Linux:    sudo pacman -S tesseract-data-eng", e)
+        })
+}
--- a/crates/g3-computer-control/src/platform/macos.rs
+++ b/crates/g3-computer-control/src/platform/macos.rs
@@ -0,0 +1,562 @@
+use crate::{ComputerController, types::*};
+use anyhow::Result;
+use async_trait::async_trait;
+use core_graphics::display::CGPoint;
+use core_graphics::event::{CGEvent, CGEventType, CGMouseButton, CGEventTapLocation};
+use core_graphics::event_source::{CGEventSource, CGEventSourceStateID};
+use std::path::Path;
+use tesseract::Tesseract;
+use core_graphics::window::{kCGWindowListOptionOnScreenOnly, kCGNullWindowID, CGWindowListCopyWindowInfo};
+use core_foundation::dictionary::CFDictionary;
+use core_foundation::string::CFString;
+use core_foundation::base::{TCFType, ToVoid};
+
+/// macOS controller that drives input through Core Graphics events.
+///
+/// MacOSController doesn't store CGEventSource to avoid Send/Sync issues.
+/// We create it fresh for each operation.
+pub struct MacOSController {
+    // Empty struct - event source created per operation
+}
+
+impl MacOSController {
+    /// Creates the controller after checking that an event source can be built.
+    ///
+    /// The probe event source is discarded immediately; it only serves to fail
+    /// early with a hint about Accessibility permissions.
+    pub fn new() -> Result<Self> {
+        CGEventSource::new(CGEventSourceStateID::CombinedSessionState)
+            .map(|_probe| Self {})
+            .map_err(|_| anyhow::anyhow!("Failed to create event source. Make sure Accessibility permissions are granted."))
+    }
+
+    /// Translates a key name (case-insensitive) into a macOS keycode.
+    ///
+    /// # Errors
+    ///
+    /// Returns "Unknown key: <key>" for names that are not in the table.
+    fn key_to_keycode(&self, key: &str) -> Result<u16> {
+        let name = key.to_lowercase();
+        Ok(match name.as_str() {
+            "return" | "enter" => 36,
+            "tab" => 48,
+            "space" => 49,
+            "delete" | "backspace" => 51,
+            "escape" | "esc" => 53,
+            "command" | "cmd" => 55,
+            "shift" => 56,
+            "capslock" => 57,
+            "option" | "alt" => 58,
+            "control" | "ctrl" => 59,
+            "left" => 123,
+            "right" => 124,
+            "down" => 125,
+            "up" => 126,
+            _ => return Err(anyhow::anyhow!("Unknown key: {}", key)),
+        })
+    }
+}
+
+#[async_trait]
+impl ComputerController for MacOSController {
+    /// Posts a mouse-moved event that places the pointer at `(x, y)`.
+    async fn move_mouse(&self, x: i32, y: i32) -> Result<()> {
+        let target = CGPoint::new(f64::from(x), f64::from(y));
+        let source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState)
+            .map_err(|_| anyhow::anyhow!("Failed to create event source"))?;
+
+        // The button argument is required by the API even for a plain move.
+        CGEvent::new_mouse_event(source, CGEventType::MouseMoved, target, CGMouseButton::Left)
+            .map_err(|_| anyhow::anyhow!("Failed to create mouse move event"))?
+            .post(CGEventTapLocation::HID);
+
+        Ok(())
+    }
+    
+    /// Clicks `button` at the current pointer position.
+    ///
+    /// Posts a mouse-down event, waits 50 ms, then posts the matching mouse-up
+    /// event at the same point.
+    async fn click(&self, button: MouseButton) -> Result<()> {
+        /// Builds one mouse event with a fresh event source and posts it.
+        /// Everything Core Graphics is dropped on return, so nothing non-Send
+        /// is alive across the `.await` in the caller.
+        fn post_mouse_event(
+            kind: CGEventType,
+            at: CGPoint,
+            button: CGMouseButton,
+            failure: &'static str,
+        ) -> Result<()> {
+            let source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState)
+                .map_err(|_| anyhow::anyhow!("Failed to create event source"))?;
+            CGEvent::new_mouse_event(source, kind, at, button)
+                .map_err(|_| anyhow::anyhow!(failure))?
+                .post(CGEventTapLocation::HID);
+            Ok(())
+        }
+
+        let (cg_button, down_type, up_type) = match button {
+            MouseButton::Left => (CGMouseButton::Left, CGEventType::LeftMouseDown, CGEventType::LeftMouseUp),
+            MouseButton::Right => (CGMouseButton::Right, CGEventType::RightMouseDown, CGEventType::RightMouseUp),
+            MouseButton::Middle => (CGMouseButton::Center, CGEventType::OtherMouseDown, CGEventType::OtherMouseUp),
+        };
+
+        // Get current mouse position from a throwaway event
+        let point = {
+            let probe_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState)
+                .map_err(|_| anyhow::anyhow!("Failed to create event source"))?;
+            CGEvent::new(probe_source)
+                .map_err(|_| anyhow::anyhow!("Failed to get mouse position"))?
+                .location()
+        };
+
+        // Mouse down
+        post_mouse_event(down_type, point, cg_button, "Failed to create mouse down event")?;
+
+        // Small delay
+        tokio::time::sleep(tokio::time::Duration::from_millis(50)).await;
+
+        // Mouse up
+        post_mouse_event(up_type, point, cg_button, "Failed to create mouse up event")?;
+
+        Ok(())
+    }
+    
+    /// Performs two clicks with `button`, separated by a 100 ms pause.
+    // NOTE(review): the events carry no click-count field; confirm that target
+    // applications treat two separate clicks as a double click.
+    async fn double_click(&self, button: MouseButton) -> Result<()> {
+        let pause = tokio::time::Duration::from_millis(100);
+
+        self.click(button).await?;
+        tokio::time::sleep(pause).await;
+        self.click(button).await
+    }
+    
+    /// Types `text` one character at a time using Unicode keyboard events.
+    ///
+    /// For every character a key-down and a matching key-up event are posted,
+    /// both carrying the character as their Unicode string, followed by a
+    /// 10 ms pause before the next character.
+    ///
+    /// # Errors
+    ///
+    /// Fails if an event source or a keyboard event cannot be created.
+    async fn type_text(&self, text: &str) -> Result<()> {
+        for ch in text.chars() {
+            {
+                // A char needs at most two UTF-16 code units.
+                let mut utf16_buf = [0u16; 2];
+                let utf16: &[u16] = ch.encode_utf16(&mut utf16_buf);
+
+                // Post the press AND the release: a key-down without its key-up
+                // leaves the synthetic key logically held.
+                for key_down in [true, false] {
+                    let event_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState)
+                        .map_err(|_| anyhow::anyhow!("Failed to create event source"))?;
+
+                    // Create keyboard event for character
+                    let event = CGEvent::new_keyboard_event(
+                        event_source,
+                        0, // keycode (0 for unicode)
+                        key_down,
+                    ).map_err(|_| anyhow::anyhow!("Failed to create keyboard event"))?;
+
+                    // Set unicode string
+                    event.set_string_from_utf16_unchecked(utf16);
+                    event.post(CGEventTapLocation::HID);
+                }
+            } // event sources and events dropped here, before the await
+
+            tokio::time::sleep(tokio::time::Duration::from_millis(10)).await;
+        }
+        Ok(())
+    }
+    
+    /// Presses and releases the named key.
+    ///
+    /// Resolves `key` through `key_to_keycode`, posts a key-down event, waits
+    /// 50 ms, then posts the key-up event.
+    async fn press_key(&self, key: &str) -> Result<()> {
+        /// Builds one keyboard event with a fresh event source and posts it.
+        /// Everything Core Graphics is dropped on return, so nothing non-Send
+        /// is alive across the `.await` in the caller.
+        fn post_key_event(keycode: u16, pressed: bool, failure: &'static str) -> Result<()> {
+            let source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState)
+                .map_err(|_| anyhow::anyhow!("Failed to create event source"))?;
+            CGEvent::new_keyboard_event(source, keycode, pressed)
+                .map_err(|_| anyhow::anyhow!(failure))?
+                .post(CGEventTapLocation::HID);
+            Ok(())
+        }
+
+        let keycode = self.key_to_keycode(key)?;
+
+        // Key down
+        post_key_event(keycode, true, "Failed to create key down event")?;
+
+        tokio::time::sleep(tokio::time::Duration::from_millis(50)).await;
+
+        // Key up
+        post_key_event(keycode, false, "Failed to create key up event")?;
+
+        Ok(())
+    }
+    
+    /// Lists the on-screen windows reported by `CGWindowListCopyWindowInfo`.
+    ///
+    /// Each returned `Window` has an `id` of the form `"<owner name>:<window number>"`.
+    /// Entries with an empty owner name, or with no title and a width below 100,
+    /// are skipped. `is_active` is always `false` because it is not queried here.
+    ///
+    /// # Errors
+    /// Returns an error when the window server hands back no window list.
+    async fn list_windows(&self) -> Result<Vec<Window>> {
+        use core_foundation::number::CFNumber;
+
+        let mut windows = Vec::new();
+
+        // The dictionary keys are loop-invariant, so create them once up front.
+        let window_id_key = CFString::from_static_string("kCGWindowNumber");
+        let owner_key = CFString::from_static_string("kCGWindowOwnerName");
+        let name_key = CFString::from_static_string("kCGWindowName");
+        let bounds_key = CFString::from_static_string("kCGWindowBounds");
+        let x_key = CFString::from_static_string("X");
+        let y_key = CFString::from_static_string("Y");
+        let width_key = CFString::from_static_string("Width");
+        let height_key = CFString::from_static_string("Height");
+
+        // SAFETY: the list pointer is null-checked before being wrapped under the
+        // create rule (it comes from a "Copy" function). Dictionary values are
+        // wrapped under the get rule, which retains them for the wrapper's lifetime.
+        // NOTE(review): the casts assume each key maps to the value type read below
+        // (number / string / dictionary) - confirm against the CGWindow documentation.
+        unsafe {
+            let window_list = CGWindowListCopyWindowInfo(
+                kCGWindowListOptionOnScreenOnly,
+                kCGNullWindowID,
+            );
+            // Wrapping a null pointer would panic, so report it as an error instead.
+            if window_list.is_null() {
+                anyhow::bail!("Failed to list windows: CGWindowListCopyWindowInfo returned null");
+            }
+
+            let array = core_foundation::array::CFArray::<CFDictionary>::wrap_under_create_rule(window_list);
+
+            // Small readers shared by every field lookup below.
+            let read_i64 = |dict: &CFDictionary, key: &CFString| -> Option<i64> {
+                dict.find(key.to_void()).and_then(|value| {
+                    let num: CFNumber = TCFType::wrap_under_get_rule(*value as *const _);
+                    num.to_i64()
+                })
+            };
+            let read_f64 = |dict: &CFDictionary, key: &CFString| -> Option<f64> {
+                dict.find(key.to_void()).and_then(|value| {
+                    let num: CFNumber = TCFType::wrap_under_get_rule(*value as *const _);
+                    num.to_f64()
+                })
+            };
+            let read_string = |dict: &CFDictionary, key: &CFString| -> Option<String> {
+                dict.find(key.to_void()).map(|value| {
+                    let s: CFString = TCFType::wrap_under_get_rule(*value as *const _);
+                    s.to_string()
+                })
+            };
+
+            for dict in array.iter() {
+                let window_id = read_i64(&*dict, &window_id_key).unwrap_or(0);
+                let app_name = read_string(&*dict, &owner_key).unwrap_or_else(|| "Unknown".to_string());
+                let title = read_string(&*dict, &name_key).unwrap_or_default();
+
+                let bounds = match dict.find(bounds_key.to_void()) {
+                    Some(bounds_value) => {
+                        let bounds_dict: CFDictionary = TCFType::wrap_under_get_rule(*bounds_value as *const _);
+                        // Read as f64 and round: an integer read reports failure for
+                        // fractional coordinates, which previously collapsed them to 0.
+                        let coord = |key: &CFString| -> i32 {
+                            read_f64(&bounds_dict, key).map_or(0, |v| v.round() as i32)
+                        };
+                        Rect {
+                            x: coord(&x_key),
+                            y: coord(&y_key),
+                            width: coord(&width_key),
+                            height: coord(&height_key),
+                        }
+                    }
+                    None => Rect { x: 0, y: 0, width: 0, height: 0 },
+                };
+
+                // Skip windows without meaningful content (system UI elements, etc.)
+                if app_name.is_empty() || (title.is_empty() && bounds.width < 100) {
+                    continue;
+                }
+
+                windows.push(Window {
+                    id: format!("{}:{}", app_name, window_id),
+                    title,
+                    app_name,
+                    bounds,
+                    is_active: false, // We'd need additional API calls to determine this
+                });
+            }
+        }
+
+        Ok(windows)
+    }
+    
+    /// Placeholder: no window is focused yet.
+    ///
+    /// Logs a warning and reports success; `_window_id` is ignored.
+    async fn focus_window(&self, _window_id: &str) -> Result<()> {
+        // A complete version would activate the owning application via NSWorkspace.
+        tracing::warn!("focus_window not fully implemented on macOS");
+
+        Ok(())
+    }
+    
+    /// Returns the bounds of the on-screen window whose id equals `window_id`.
+    ///
+    /// Ids use the `"<app name>:<window number>"` format produced by `list_windows`,
+    /// which is queried to resolve the bounds.
+    ///
+    /// # Errors
+    /// Fails if the window list cannot be obtained or no listed window has this id.
+    async fn get_window_bounds(&self, window_id: &str) -> Result<Rect> {
+        // Resolve against the real window list instead of returning made-up bounds.
+        self.list_windows()
+            .await?
+            .into_iter()
+            .find(|window| window.id == window_id)
+            .map(|window| window.bounds)
+            .ok_or_else(|| anyhow::anyhow!("Window not found: {}", window_id))
+    }
+    
+    /// Placeholder: element lookup is not available yet.
+    ///
+    /// Logs a warning and always reports that nothing was found; `_selector` is ignored.
+    async fn find_element(&self, _selector: &ElementSelector) -> Result<Option<UIElement>> {
+        // A complete version would query the macOS Accessibility API.
+        tracing::warn!("find_element not fully implemented on macOS");
+
+        let nothing_found: Option<UIElement> = None;
+        Ok(nothing_found)
+    }
+    
+    /// Placeholder: element text is not read yet.
+    ///
+    /// Logs a warning and always returns an empty string; `_element_id` is ignored.
+    async fn get_element_text(&self, _element_id: &str) -> Result<String> {
+        // A complete version would query the Accessibility API.
+        tracing::warn!("get_element_text not fully implemented on macOS");
+
+        let empty_text = String::new();
+        Ok(empty_text)
+    }
+    
+    /// Placeholder: element bounds are not measured yet.
+    ///
+    /// Logs a warning and always returns a fixed 100x30 rectangle at the origin;
+    /// `_element_id` is ignored.
+    async fn get_element_bounds(&self, _element_id: &str) -> Result<Rect> {
+        // A complete version would query the Accessibility API.
+        tracing::warn!("get_element_bounds not fully implemented on macOS");
+
+        let placeholder = Rect { x: 0, y: 0, width: 100, height: 30 };
+        Ok(placeholder)
+    }
+    
+    /// Saves a screenshot to disk using the macOS `screencapture` tool.
+    ///
+    /// * `path` - paths starting with `/` are used as-is; anything else is placed
+    ///   under `$TMPDIR`, falling back to `$HOME/tmp` and then `/tmp`. Missing parent
+    ///   directories are created.
+    /// * `region` - optional rectangle to capture; ignored when `window_id` is given.
+    /// * `window_id` - `"AppName:WindowNumber"`. The application is brought to the
+    ///   front, the window is captured with `screencapture -l`, and the previously
+    ///   frontmost application is re-activated afterwards. When no usable window
+    ///   number is present in the id, AppleScript is asked for the id of the app's
+    ///   first window; if that also fails, the whole screen is captured.
+    ///
+    /// # Errors
+    /// Fails if a directory cannot be created, `screencapture` cannot be started,
+    /// or it exits unsuccessfully.
+    ///
+    /// NOTE(review): the external commands are run synchronously inside this async fn.
+    async fn take_screenshot(&self, path: &str, region: Option<Rect>, window_id: Option<&str>) -> Result<()> {
+        // Escapes text for use inside a double-quoted AppleScript string literal,
+        // so a crafted window id cannot break out of the literal.
+        fn applescript_quote(raw: &str) -> String {
+            raw.replace('\\', "\\\\").replace('"', "\\\"")
+        }
+
+        // Runs a one-line AppleScript; yields trimmed stdout, or `None` on any failure.
+        fn run_applescript(script: &str) -> Option<String> {
+            let output = std::process::Command::new("osascript")
+                .arg("-e")
+                .arg(script)
+                .output()
+                .ok()?;
+            if output.status.success() {
+                Some(String::from_utf8_lossy(&output.stdout).trim().to_string())
+            } else {
+                None
+            }
+        }
+
+        // Relative paths land in the temporary directory.
+        let final_path = if path.starts_with('/') {
+            path.to_string()
+        } else {
+            let temp_dir = std::env::var("TMPDIR")
+                .or_else(|_| std::env::var("HOME").map(|h| format!("{}/tmp", h)))
+                .unwrap_or_else(|_| "/tmp".to_string());
+            format!("{}/{}", temp_dir.trim_end_matches('/'), path)
+        };
+
+        // Make sure the destination directory exists for every capture mode.
+        if let Some(parent) = Path::new(&final_path).parent() {
+            std::fs::create_dir_all(parent)?;
+        }
+
+        // Split "AppName:WindowNumber" into the app name and an optional window number.
+        // A number of 0 is treated as "unknown".
+        let target: Option<(String, Option<u32>)> = window_id.and_then(|id| {
+            let numbered = id
+                .rsplit_once(':')
+                .and_then(|(app, number)| number.trim().parse::<u32>().ok().map(|n| (app, n)));
+            let (app, number) = match numbered {
+                Some((app, n)) => (app, Some(n).filter(|n| *n != 0)),
+                None => (id.split(':').next().unwrap_or(id), None),
+            };
+            if app.is_empty() {
+                None
+            } else {
+                Some((app.to_string(), number))
+            }
+        });
+
+        // Remember the frontmost application so focus can be handed back later.
+        let original_app = if target.is_some() {
+            run_applescript("tell application \"System Events\" to get name of first application process whose frontmost is true")
+        } else {
+            None
+        };
+
+        let screenshot_result = if let Some((app, window_number)) = &target {
+            let quoted_app = applescript_quote(app);
+
+            tracing::debug!("Foregrounding application: {}", app);
+            let _ = run_applescript(&format!("tell application \"{}\" to activate", quoted_app));
+
+            // Give the window time to come to the front
+            tokio::time::sleep(tokio::time::Duration::from_millis(500)).await;
+
+            // Prefer the window number carried in the id; otherwise ask AppleScript
+            // and accept the answer only if it is numeric.
+            let window_number = (*window_number).or_else(|| {
+                run_applescript(&format!(r#"tell application "{}" to id of window 1"#, quoted_app))
+                    .and_then(|id| id.parse::<u32>().ok())
+            });
+
+            let mut cmd = std::process::Command::new("screencapture");
+            match window_number {
+                Some(number) => {
+                    cmd.arg(format!("-l{}", number));
+                    cmd.arg("-o");
+                }
+                None => {
+                    // Fallback to regular screenshot if we can't get window ID
+                    cmd.arg("-x");
+                }
+            }
+            cmd.arg(&final_path);
+            cmd.output()
+        } else {
+            // Regular screenshot (full screen or region).
+            // On the first capture, point the user at the Screen Recording permission
+            // unless G3_SKIP_PERMISSION_CHECK is set.
+            let needs_permission_check = std::env::var("G3_SKIP_PERMISSION_CHECK").is_err();
+
+            if needs_permission_check {
+                static PERMISSION_PROMPTED: std::sync::atomic::AtomicBool = std::sync::atomic::AtomicBool::new(false);
+
+                if !PERMISSION_PROMPTED.swap(true, std::sync::atomic::Ordering::Relaxed) {
+                    tracing::warn!("\n=== Screen Recording Permission Required ===\n\
+                        macOS requires explicit permission to capture window content.\n\
+                        If screenshots only show wallpaper/menubar (no windows):\n\n\
+                        1. Open System Settings > Privacy & Security > Screen Recording\n\
+                        2. Enable permission for your terminal (iTerm/Terminal) or g3\n\
+                        3. Restart your terminal if needed\n\n\
+                        Opening Screen Recording settings now...\n");
+
+                    // Try to open the settings (non-blocking)
+                    let _ = std::process::Command::new("open")
+                        .arg("x-apple.systempreferences:com.apple.preference.security?Privacy_ScreenCapture")
+                        .spawn();
+                }
+            }
+
+            let mut cmd = std::process::Command::new("screencapture");
+            cmd.arg("-x"); // No sound
+
+            if let Some(region) = region {
+                // Capture specific region: -R x,y,width,height
+                cmd.arg("-R");
+                cmd.arg(format!("{},{},{},{}", region.x, region.y, region.width, region.height));
+            }
+
+            cmd.arg(&final_path);
+            cmd.output()
+        };
+
+        // Hand focus back before inspecting the result, so a failed capture does not
+        // leave the target application in front.
+        if let (Some((target_app, _)), Some(previous_app)) = (&target, &original_app) {
+            // Only restore if we actually changed the foreground app
+            if target_app != previous_app {
+                tracing::debug!("Restoring focus to original application: {}", previous_app);
+
+                // Small delay to ensure screenshot is complete
+                tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
+
+                let _ = run_applescript(&format!(
+                    "tell application \"{}\" to activate",
+                    applescript_quote(previous_app)
+                ));
+            }
+        }
+
+        let screenshot_output = screenshot_result?;
+        if !screenshot_output.status.success() {
+            let stderr = String::from_utf8_lossy(&screenshot_output.stderr);
+            return Err(anyhow::anyhow!("screencapture failed: {}", stderr));
+        }
+
+        tracing::debug!("Screenshot saved using screencapture: {}", final_path);
+
+        Ok(())
+    }
+    
+    
+    /// Runs OCR on a region of the screen.
+    ///
+    /// Captures `region` to a uniquely named PNG under `/tmp`, passes that file to
+    /// `extract_text_from_image`, and removes the file again whether or not the
+    /// capture or the OCR step succeeded.
+    ///
+    /// # Errors
+    /// Propagates any error from the screenshot or the OCR step.
+    async fn extract_text_from_screen(&self, region: Rect) -> Result<OCRResult> {
+        let temp_path = format!("/tmp/g3_ocr_{}.png", uuid::Uuid::new_v4());
+
+        let result = match self.take_screenshot(&temp_path, Some(region), None).await {
+            Ok(()) => self.extract_text_from_image(&temp_path).await,
+            Err(err) => Err(err),
+        };
+
+        // Clean up before returning so an error does not leave the file behind.
+        let _ = std::fs::remove_file(&temp_path);
+
+        result
+    }
+    
+    /// Runs Tesseract (English) over the image file at `_path`.
+    ///
+    /// Returns the recognised text. `confidence` is a fixed placeholder of 0.85 and
+    /// `bounds` is an all-zero rectangle; neither is measured.
+    ///
+    /// # Errors
+    /// Fails when `which tesseract` does not succeed, when Tesseract cannot be
+    /// initialised, when the image cannot be loaded, or when text extraction fails.
+    async fn extract_text_from_image(&self, _path: &str) -> Result<OCRResult> {
+        let image_path = _path;
+
+        // Probe for the tesseract executable so a missing install gets a helpful message.
+        let tesseract_on_path = matches!(
+            std::process::Command::new("which").arg("tesseract").output(),
+            Ok(ref probe) if probe.status.success()
+        );
+        if !tesseract_on_path {
+            return Err(anyhow::anyhow!("Tesseract OCR is not installed on your system.\n\n\
+                To install tesseract:\n  macOS:   brew install tesseract\n  \
+                Linux:   sudo apt-get install tesseract-ocr (Ubuntu/Debian)\n           \
+                sudo yum install tesseract (RHEL/CentOS)\n  \
+                Windows: Download from https://github.com/UB-Mannheim/tesseract/wiki\n\n\
+                After installation, restart your terminal and try again."));
+        }
+
+        let engine = match Tesseract::new(None, Some("eng")) {
+            Ok(engine) => engine,
+            Err(e) => {
+                return Err(anyhow::anyhow!("Failed to initialize Tesseract: {}\n\n\
+                    This usually means:\n1. Tesseract is not properly installed\n\
+                    2. Language data files are missing\n\nTo fix:\n  \
+                    macOS:   brew reinstall tesseract\n  \
+                    Linux:   sudo apt-get install tesseract-ocr-eng\n  \
+                    Windows: Reinstall tesseract and ensure language files are included", e));
+            }
+        };
+
+        let mut loaded = engine
+            .set_image(image_path)
+            .map_err(|e| anyhow::anyhow!("Failed to load image '{}': {}", image_path, e))?;
+        let text = loaded
+            .get_text()
+            .map_err(|e| anyhow::anyhow!("Failed to extract text from image: {}", e))?;
+
+        Ok(OCRResult {
+            text,
+            // Placeholder: per-word confidence is not queried.
+            confidence: 0.85,
+            // Placeholder: image dimensions are not read.
+            bounds: Rect { x: 0, y: 0, width: 0, height: 0 },
+        })
+    }
+    
+    /// Checks whether `_text` appears anywhere on the screen.
+    ///
+    /// Takes a full-screen screenshot into a uniquely named file under `/tmp`, runs
+    /// Tesseract (English) over it and searches the recognised text for `_text`.
+    /// The temporary file is removed on success and on every error path.
+    ///
+    /// Returns `Ok(None)` when the text is absent. When it is present the result is
+    /// `Some(Point { x: 0, y: 0 })`: the real location is NOT computed, so the
+    /// point only signals "found".
+    ///
+    /// # Errors
+    /// Fails when `which tesseract` does not succeed, or when the screenshot,
+    /// Tesseract initialisation, image loading or text extraction fails.
+    async fn find_text_on_screen(&self, _text: &str) -> Result<Option<Point>> {
+        // Check if tesseract is available on the system
+        let tesseract_check = std::process::Command::new("which")
+            .arg("tesseract")
+            .output();
+
+        if !matches!(tesseract_check, Ok(ref probe) if probe.status.success()) {
+            anyhow::bail!("Tesseract OCR is not installed on your system.\n\n\
+                To install tesseract:\n  macOS:   brew install tesseract\n  \
+                Linux:   sudo apt-get install tesseract-ocr (Ubuntu/Debian)\n           \
+                sudo yum install tesseract (RHEL/CentOS)\n  \
+                Windows: Download from https://github.com/UB-Mannheim/tesseract/wiki\n\n\
+                After installation, restart your terminal and try again.");
+        }
+
+        // Take full screen screenshot
+        let temp_path = format!("/tmp/g3_ocr_search_{}.png", uuid::Uuid::new_v4());
+        let captured = self.take_screenshot(&temp_path, None, None).await;
+
+        // Run OCR in one fallible step so the cleanup below happens on every outcome.
+        let recognized = captured.and_then(|()| -> Result<String> {
+            let tess = Tesseract::new(None, Some("eng"))
+                .map_err(|e| {
+                    anyhow::anyhow!("Failed to initialize Tesseract: {}\n\n\
+                        This usually means:\n1. Tesseract is not properly installed\n\
+                        2. Language data files are missing\n\nTo fix:\n  \
+                        macOS:   brew reinstall tesseract\n  \
+                        Linux:   sudo apt-get install tesseract-ocr-eng\n  \
+                        Windows: Reinstall tesseract and ensure language files are included", e)
+                })?;
+
+            let text = tess.set_image(temp_path.as_str())
+                .map_err(|e| anyhow::anyhow!("Failed to load screenshot: {}", e))?
+                .get_text()
+                .map_err(|e| anyhow::anyhow!("Failed to extract text from screen: {}", e))?;
+            Ok(text)
+        });
+
+        // Clean up temp file (also when the screenshot or OCR failed)
+        let _ = std::fs::remove_file(&temp_path);
+
+        let full_text = recognized?;
+
+        // Simple text search - full implementation would use get_component_images
+        // to get bounding boxes for each word
+        if full_text.contains(_text) {
+            tracing::warn!("Text found but precise coordinates not available in simplified implementation");
+            Ok(Some(Point { x: 0, y: 0 }))
+        } else {
+            Ok(None)
+        }
+    }
+}
--- a/crates/g3-computer-control/src/platform/macos.rs.bak
+++ b/crates/g3-computer-control/src/platform/macos.rs.bak
@@ -0,0 +1,425 @@
+use crate::{ComputerController, types::*};
+use anyhow::Result;
+use async_trait::async_trait;
+use core_graphics::display::CGPoint;
+use core_graphics::event::{CGEvent, CGEventType, CGMouseButton, CGEventTapLocation};
+use core_graphics::event_source::{CGEventSource, CGEventSourceStateID};
+use std::path::Path;
+use tesseract::Tesseract;
+
+/// Stateless controller for macOS input and screen automation.
+///
+/// No `CGEventSource` is stored, to avoid Send/Sync issues; a fresh one is
+/// created for each operation instead.
+pub struct MacOSController {}
+
+impl MacOSController {
+    /// Creates a controller after checking that an event source can be created.
+    ///
+    /// # Errors
+    /// Fails when the event source cannot be created; the message points at
+    /// Accessibility permissions.
+    pub fn new() -> Result<Self> {
+        match CGEventSource::new(CGEventSourceStateID::CombinedSessionState) {
+            // The probe source is discarded; operations create their own.
+            Ok(_probe) => Ok(Self {}),
+            Err(_) => Err(anyhow::anyhow!("Failed to create event source. Make sure Accessibility permissions are granted.")),
+        }
+    }
+
+    /// Translates a key name (case-insensitive) into its macOS keycode.
+    ///
+    /// # Errors
+    /// Fails with `Unknown key: <key>` for names not in the table.
+    fn key_to_keycode(&self, key: &str) -> Result<u16> {
+        // Key name -> macOS keycode.
+        const KEYCODES: &[(&str, u16)] = &[
+            ("return", 36),
+            ("enter", 36),
+            ("tab", 48),
+            ("space", 49),
+            ("delete", 51),
+            ("backspace", 51),
+            ("escape", 53),
+            ("esc", 53),
+            ("command", 55),
+            ("cmd", 55),
+            ("shift", 56),
+            ("capslock", 57),
+            ("option", 58),
+            ("alt", 58),
+            ("control", 59),
+            ("ctrl", 59),
+            ("left", 123),
+            ("right", 124),
+            ("down", 125),
+            ("up", 126),
+        ];
+
+        let wanted = key.to_lowercase();
+        KEYCODES
+            .iter()
+            .find(|(name, _)| *name == wanted.as_str())
+            .map(|&(_, code)| code)
+            .ok_or_else(|| anyhow::anyhow!("Unknown key: {}", key))
+    }
+}
+
+#[async_trait]
+impl ComputerController for MacOSController {
+    /// Posts a mouse-moved event at screen position (`x`, `y`).
+    ///
+    /// # Errors
+    /// Fails when the event source or the event cannot be created.
+    async fn move_mouse(&self, x: i32, y: i32) -> Result<()> {
+        let source = match CGEventSource::new(CGEventSourceStateID::CombinedSessionState) {
+            Ok(source) => source,
+            Err(_) => return Err(anyhow::anyhow!("Failed to create event source")),
+        };
+        let destination = CGPoint::new(f64::from(x), f64::from(y));
+
+        CGEvent::new_mouse_event(source, CGEventType::MouseMoved, destination, CGMouseButton::Left)
+            .map_err(|_| anyhow::anyhow!("Failed to create mouse move event"))?
+            .post(CGEventTapLocation::HID);
+
+        Ok(())
+    }
+    
+    /// Clicks `button` at the current cursor position.
+    ///
+    /// Posts a mouse-down event, waits 50 ms, then posts the matching mouse-up event.
+    ///
+    /// # Errors
+    /// Fails when an event source or any of the events cannot be created.
+    async fn click(&self, button: MouseButton) -> Result<()> {
+        // Synchronous helper: the event source and event live only inside this call,
+        // so neither is alive across the `.await` below.
+        fn post_mouse(kind: CGEventType, at: CGPoint, which: CGMouseButton, failure: &str) -> Result<()> {
+            let source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState)
+                .map_err(|_| anyhow::anyhow!("Failed to create event source"))?;
+            let event = CGEvent::new_mouse_event(source, kind, at, which)
+                .map_err(|_| anyhow::anyhow!("{}", failure))?;
+            event.post(CGEventTapLocation::HID);
+            Ok(())
+        }
+
+        let (cg_button, press_kind, release_kind) = match button {
+            MouseButton::Left => (CGMouseButton::Left, CGEventType::LeftMouseDown, CGEventType::LeftMouseUp),
+            MouseButton::Right => (CGMouseButton::Right, CGEventType::RightMouseDown, CGEventType::RightMouseUp),
+            MouseButton::Middle => (CGMouseButton::Center, CGEventType::OtherMouseDown, CGEventType::OtherMouseUp),
+        };
+
+        // A blank event reports the current cursor location.
+        let cursor = {
+            let probe_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState)
+                .map_err(|_| anyhow::anyhow!("Failed to create event source"))?;
+            let probe = CGEvent::new(probe_source)
+                .map_err(|_| anyhow::anyhow!("Failed to get mouse position"))?;
+            probe.location()
+        };
+
+        post_mouse(press_kind, cursor, cg_button, "Failed to create mouse down event")?;
+
+        // Small delay between press and release
+        tokio::time::sleep(tokio::time::Duration::from_millis(50)).await;
+
+        post_mouse(release_kind, cursor, cg_button, "Failed to create mouse up event")?;
+
+        Ok(())
+    }
+    
+    /// Performs two clicks of `button` separated by a 100 ms pause.
+    ///
+    /// # Errors
+    /// Propagates the first error returned by `click`.
+    async fn double_click(&self, button: MouseButton) -> Result<()> {
+        let pause = tokio::time::Duration::from_millis(100);
+
+        self.click(button).await?;
+        tokio::time::sleep(pause).await;
+        self.click(button).await
+    }
+    
+    /// Types `text` one character at a time.
+    ///
+    /// For each character a key-down event carrying that character as its UTF-16
+    /// string is posted, followed by a 10 ms pause.
+    ///
+    /// # Errors
+    /// Fails when an event source or keyboard event cannot be created.
+    async fn type_text(&self, text: &str) -> Result<()> {
+        // Synchronous helper: the event source and event live only inside this call,
+        // so neither is alive across the `.await` below.
+        fn post_char(ch: char) -> Result<()> {
+            let source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState)
+                .map_err(|_| anyhow::anyhow!("Failed to create event source"))?;
+
+            // keycode 0: the character is supplied as a unicode string instead
+            let event = CGEvent::new_keyboard_event(source, 0, true)
+                .map_err(|_| anyhow::anyhow!("Failed to create keyboard event"))?;
+
+            let mut units = [0u16; 2];
+            let encoded: &[u16] = ch.encode_utf16(&mut units);
+            event.set_string_from_utf16_unchecked(encoded);
+            event.post(CGEventTapLocation::HID);
+            Ok(())
+        }
+
+        let pause = tokio::time::Duration::from_millis(10);
+        for ch in text.chars() {
+            post_char(ch)?;
+            tokio::time::sleep(pause).await;
+        }
+        Ok(())
+    }
+    
+    /// Presses and releases the named key.
+    ///
+    /// The name is resolved with `key_to_keycode`; a key-down event is posted,
+    /// followed 50 ms later by the key-up event.
+    ///
+    /// # Errors
+    /// Fails for unknown key names or when an event source or event cannot be created.
+    async fn press_key(&self, key: &str) -> Result<()> {
+        // Synchronous helper: the event source and event live only inside this call,
+        // so neither is alive across the `.await` below.
+        fn post_key(keycode: u16, pressed: bool, failure: &str) -> Result<()> {
+            let source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState)
+                .map_err(|_| anyhow::anyhow!("Failed to create event source"))?;
+            let event = CGEvent::new_keyboard_event(source, keycode, pressed)
+                .map_err(|_| anyhow::anyhow!("{}", failure))?;
+            event.post(CGEventTapLocation::HID);
+            Ok(())
+        }
+
+        let keycode = self.key_to_keycode(key)?;
+
+        post_key(keycode, true, "Failed to create key down event")?;
+        tokio::time::sleep(tokio::time::Duration::from_millis(50)).await;
+        post_key(keycode, false, "Failed to create key up event")?;
+
+        Ok(())
+    }
+    
+    /// Placeholder: window enumeration is not available yet.
+    ///
+    /// Logs a warning and always returns an empty list.
+    async fn list_windows(&self) -> Result<Vec<Window>> {
+        // A complete version would use CGWindowListCopyWindowInfo, which needs
+        // more involved FFI.
+        tracing::warn!("list_windows not fully implemented on macOS");
+
+        Ok(Vec::new())
+    }
+    
+    /// Placeholder: no window is focused yet.
+    ///
+    /// Logs a warning and reports success; `_window_id` is ignored.
+    async fn focus_window(&self, _window_id: &str) -> Result<()> {
+        // A complete version would activate the owning application via NSWorkspace.
+        tracing::warn!("focus_window not fully implemented on macOS");
+
+        Ok(())
+    }
+    
+    /// Placeholder: window bounds are not measured yet.
+    ///
+    /// Logs a warning and always returns a fixed 800x600 rectangle at the origin;
+    /// `_window_id` is ignored.
+    async fn get_window_bounds(&self, _window_id: &str) -> Result<Rect> {
+        // A complete version would query the Accessibility API.
+        tracing::warn!("get_window_bounds not fully implemented on macOS");
+
+        let placeholder = Rect { x: 0, y: 0, width: 800, height: 600 };
+        Ok(placeholder)
+    }
+    
+    /// Placeholder: element lookup is not available yet.
+    ///
+    /// Logs a warning and always reports that nothing was found; `_selector` is ignored.
+    async fn find_element(&self, _selector: &ElementSelector) -> Result<Option<UIElement>> {
+        // A complete version would query the macOS Accessibility API.
+        tracing::warn!("find_element not fully implemented on macOS");
+
+        let nothing_found: Option<UIElement> = None;
+        Ok(nothing_found)
+    }
+    
+    /// Placeholder: element text is not read yet.
+    ///
+    /// Logs a warning and always returns an empty string; `_element_id` is ignored.
+    async fn get_element_text(&self, _element_id: &str) -> Result<String> {
+        // A complete version would query the Accessibility API.
+        tracing::warn!("get_element_text not fully implemented on macOS");
+
+        let empty_text = String::new();
+        Ok(empty_text)
+    }
+    
+    /// Placeholder: element bounds are not measured yet.
+    ///
+    /// Logs a warning and always returns a fixed 100x30 rectangle at the origin;
+    /// `_element_id` is ignored.
+    async fn get_element_bounds(&self, _element_id: &str) -> Result<Rect> {
+        // A complete version would query the Accessibility API.
+        tracing::warn!("get_element_bounds not fully implemented on macOS");
+
+        let placeholder = Rect { x: 0, y: 0, width: 100, height: 30 };
+        Ok(placeholder)
+    }
+    
+    /// Saves a screenshot to `path` using the macOS `screencapture` tool.
+    ///
+    /// * `path` - output file; missing parent directories are created.
+    /// * `_region` - optional rectangle to capture; used only when `window_id` is `None`.
+    /// * `window_id` - `"AppName"` or `"AppName:WindowTitle"`. AppleScript is asked
+    ///   for the application's window bounds and that rectangle is captured; if the
+    ///   bounds cannot be obtained or parsed, the full screen is captured instead.
+    ///
+    /// On the first call (unless `G3_SKIP_PERMISSION_CHECK` is set) a warning about
+    /// the Screen Recording permission is logged and the settings pane is opened.
+    ///
+    /// # Errors
+    /// Fails if the directory cannot be created, a helper command cannot be
+    /// started, or `screencapture` exits unsuccessfully.
+    async fn take_screenshot(&self, path: &str, _region: Option<Rect>, window_id: Option<&str>) -> Result<()> {
+        // Use native macOS screencapture command which handles all the format complexities
+
+        let needs_permission_check = std::env::var("G3_SKIP_PERMISSION_CHECK").is_err();
+
+        if needs_permission_check {
+            // Try to open Screen Recording settings if this is the first screenshot
+            static PERMISSION_PROMPTED: std::sync::atomic::AtomicBool = std::sync::atomic::AtomicBool::new(false);
+
+            if !PERMISSION_PROMPTED.swap(true, std::sync::atomic::Ordering::Relaxed) {
+                tracing::warn!("\n=== Screen Recording Permission Required ===\n\
+                    macOS requires explicit permission to capture window content.\n\
+                    If screenshots only show wallpaper/menubar (no windows):\n\n\
+                    1. Open System Settings > Privacy & Security > Screen Recording\n\
+                    2. Enable permission for your terminal (iTerm/Terminal) or g3\n\
+                    3. Restart your terminal if needed\n\n\
+                    Opening Screen Recording settings now...\n");
+
+                // Try to open the settings (non-blocking)
+                let _ = std::process::Command::new("open")
+                    .arg("x-apple.systempreferences:com.apple.preference.security?Privacy_ScreenCapture")
+                    .spawn();
+            }
+        }
+
+        let path_obj = Path::new(path);
+        if let Some(parent) = path_obj.parent() {
+            std::fs::create_dir_all(parent)?;
+        }
+
+        let mut cmd = std::process::Command::new("screencapture");
+
+        // Add flags
+        cmd.arg("-x"); // No sound
+
+        if let Some(window_id) = window_id {
+            // Capture specific window by getting its bounds and using region capture
+            // window_id format: "AppName" or "AppName:WindowTitle"
+            let app_name = window_id.split(':').next().unwrap_or(window_id);
+
+            // Escape backslashes and quotes so the name cannot break out of the
+            // double-quoted AppleScript string literal.
+            let quoted_app = app_name.replace('\\', "\\\\").replace('"', "\\\"");
+
+            // Use AppleScript to get window bounds
+            let script = format!(
+                r#"tell application "{}"
+                    tell current window
+                        get bounds
+                    end tell
+                end tell"#,
+                quoted_app
+            );
+
+            let output = std::process::Command::new("osascript")
+                .arg("-e")
+                .arg(&script)
+                .output()
+                .map_err(|e| anyhow::anyhow!("Failed to get window bounds: {}", e))?;
+
+            if output.status.success() {
+                let bounds_str = String::from_utf8_lossy(&output.stdout);
+                let bounds: Vec<i32> = bounds_str
+                    .trim()
+                    .split(',')
+                    .filter_map(|s| s.trim().parse().ok())
+                    .collect();
+
+                // Expect exactly four numbers: left, top, right, bottom.
+                if let [left, top, right, bottom] = bounds[..] {
+                    let width = right - left;
+                    let height = bottom - top;
+
+                    cmd.arg("-R");
+                    cmd.arg(format!("{},{},{},{}", left, top, width, height));
+
+                    tracing::debug!("Capturing window '{}' at region: {},{} {}x{}", app_name, left, top, width, height);
+                } else {
+                    tracing::warn!("Failed to parse window bounds, capturing full screen");
+                }
+            } else {
+                tracing::warn!("Failed to get window bounds for '{}', capturing full screen", app_name);
+            }
+        } else if let Some(region) = _region {
+            // Capture specific region: -R x,y,width,height
+            cmd.arg("-R");
+            cmd.arg(format!("{},{},{},{}", region.x, region.y, region.width, region.height));
+        }
+
+        cmd.arg(path);
+
+        let output = cmd.output()
+            .map_err(|e| anyhow::anyhow!("Failed to execute screencapture: {}", e))?;
+
+        if !output.status.success() {
+            let stderr = String::from_utf8_lossy(&output.stderr);
+            anyhow::bail!("screencapture failed: {}", stderr);
+        }
+
+        tracing::debug!("Screenshot saved using screencapture: {}", path);
+
+        Ok(())
+    }
+    
+    /// Runs OCR on a region of the screen.
+    ///
+    /// Captures `region` to a uniquely named PNG under `/tmp`, passes that file to
+    /// `extract_text_from_image`, and removes the file again whether or not the
+    /// capture or the OCR step succeeded.
+    ///
+    /// # Errors
+    /// Propagates any error from the screenshot or the OCR step.
+    async fn extract_text_from_screen(&self, region: Rect) -> Result<OCRResult> {
+        let temp_path = format!("/tmp/g3_ocr_{}.png", uuid::Uuid::new_v4());
+
+        let result = match self.take_screenshot(&temp_path, Some(region), None).await {
+            Ok(()) => self.extract_text_from_image(&temp_path).await,
+            Err(err) => Err(err),
+        };
+
+        // Clean up before returning so an error does not leave the file behind.
+        let _ = std::fs::remove_file(&temp_path);
+
+        result
+    }
+    
+    /// Runs Tesseract (English) over the image file at `_path`.
+    ///
+    /// Returns the recognised text. `confidence` is a fixed placeholder of 0.85 and
+    /// `bounds` is an all-zero rectangle; neither is measured.
+    ///
+    /// # Errors
+    /// Fails when `which tesseract` does not succeed, when Tesseract cannot be
+    /// initialised, when the image cannot be loaded, or when text extraction fails.
+    async fn extract_text_from_image(&self, _path: &str) -> Result<OCRResult> {
+        let image_path = _path;
+
+        // Probe for the tesseract executable so a missing install gets a helpful message.
+        let tesseract_on_path = matches!(
+            std::process::Command::new("which").arg("tesseract").output(),
+            Ok(ref probe) if probe.status.success()
+        );
+        if !tesseract_on_path {
+            return Err(anyhow::anyhow!("Tesseract OCR is not installed on your system.\n\n\
+                To install tesseract:\n  macOS:   brew install tesseract\n  \
+                Linux:   sudo apt-get install tesseract-ocr (Ubuntu/Debian)\n           \
+                sudo yum install tesseract (RHEL/CentOS)\n  \
+                Windows: Download from https://github.com/UB-Mannheim/tesseract/wiki\n\n\
+                After installation, restart your terminal and try again."));
+        }
+
+        let engine = match Tesseract::new(None, Some("eng")) {
+            Ok(engine) => engine,
+            Err(e) => {
+                return Err(anyhow::anyhow!("Failed to initialize Tesseract: {}\n\n\
+                    This usually means:\n1. Tesseract is not properly installed\n\
+                    2. Language data files are missing\n\nTo fix:\n  \
+                    macOS:   brew reinstall tesseract\n  \
+                    Linux:   sudo apt-get install tesseract-ocr-eng\n  \
+                    Windows: Reinstall tesseract and ensure language files are included", e));
+            }
+        };
+
+        let mut loaded = engine
+            .set_image(image_path)
+            .map_err(|e| anyhow::anyhow!("Failed to load image '{}': {}", image_path, e))?;
+        let text = loaded
+            .get_text()
+            .map_err(|e| anyhow::anyhow!("Failed to extract text from image: {}", e))?;
+
+        Ok(OCRResult {
+            text,
+            // Placeholder: per-word confidence is not queried.
+            confidence: 0.85,
+            // Placeholder: image dimensions are not read.
+            bounds: Rect { x: 0, y: 0, width: 0, height: 0 },
+        })
+    }
+    
+    /// Checks whether `_text` appears anywhere on the screen.
+    ///
+    /// Takes a full-screen screenshot into a uniquely named file under `/tmp`, runs
+    /// Tesseract (English) over it and searches the recognised text for `_text`.
+    /// The temporary file is removed on success and on every error path.
+    ///
+    /// Returns `Ok(None)` when the text is absent. When it is present the result is
+    /// `Some(Point { x: 0, y: 0 })`: the real location is NOT computed, so the
+    /// point only signals "found".
+    ///
+    /// # Errors
+    /// Fails when `which tesseract` does not succeed, or when the screenshot,
+    /// Tesseract initialisation, image loading or text extraction fails.
+    async fn find_text_on_screen(&self, _text: &str) -> Result<Option<Point>> {
+        // Check if tesseract is available on the system
+        let tesseract_check = std::process::Command::new("which")
+            .arg("tesseract")
+            .output();
+
+        if !matches!(tesseract_check, Ok(ref probe) if probe.status.success()) {
+            anyhow::bail!("Tesseract OCR is not installed on your system.\n\n\
+                To install tesseract:\n  macOS:   brew install tesseract\n  \
+                Linux:   sudo apt-get install tesseract-ocr (Ubuntu/Debian)\n           \
+                sudo yum install tesseract (RHEL/CentOS)\n  \
+                Windows: Download from https://github.com/UB-Mannheim/tesseract/wiki\n\n\
+                After installation, restart your terminal and try again.");
+        }
+
+        // Take full screen screenshot
+        let temp_path = format!("/tmp/g3_ocr_search_{}.png", uuid::Uuid::new_v4());
+        let captured = self.take_screenshot(&temp_path, None, None).await;
+
+        // Run OCR in one fallible step so the cleanup below happens on every outcome.
+        let recognized = captured.and_then(|()| -> Result<String> {
+            let tess = Tesseract::new(None, Some("eng"))
+                .map_err(|e| {
+                    anyhow::anyhow!("Failed to initialize Tesseract: {}\n\n\
+                        This usually means:\n1. Tesseract is not properly installed\n\
+                        2. Language data files are missing\n\nTo fix:\n  \
+                        macOS:   brew reinstall tesseract\n  \
+                        Linux:   sudo apt-get install tesseract-ocr-eng\n  \
+                        Windows: Reinstall tesseract and ensure language files are included", e)
+                })?;
+
+            let text = tess.set_image(temp_path.as_str())
+                .map_err(|e| anyhow::anyhow!("Failed to load screenshot: {}", e))?
+                .get_text()
+                .map_err(|e| anyhow::anyhow!("Failed to extract text from screen: {}", e))?;
+            Ok(text)
+        });
+
+        // Clean up temp file (also when the screenshot or OCR failed)
+        let _ = std::fs::remove_file(&temp_path);
+
+        let full_text = recognized?;
+
+        // Simple text search - full implementation would use get_component_images
+        // to get bounding boxes for each word
+        if full_text.contains(_text) {
+            tracing::warn!("Text found but precise coordinates not available in simplified implementation");
+            Ok(Some(Point { x: 0, y: 0 }))
+        } else {
+            Ok(None)
+        }
+    }
+}
--- a/crates/g3-computer-control/src/platform/mod.rs
+++ b/crates/g3-computer-control/src/platform/mod.rs
@@ -0,0 +1,8 @@
+//! Platform-specific controller backends.
+//!
+//! Each submodule is gated on `target_os`, so it is only compiled when
+//! building for its matching operating system.
+
+// macOS backend.
+#[cfg(target_os = "macos")]
+pub mod macos;
+
+// Linux backend.
+#[cfg(target_os = "linux")]
+pub mod linux;
+
+// Windows backend.
+#[cfg(target_os = "windows")]
+pub mod windows;
--- a/crates/g3-computer-control/src/platform/windows.rs
+++ b/crates/g3-computer-control/src/platform/windows.rs
@@ -0,0 +1,162 @@
+use crate::{ComputerController, types::*};
+use anyhow::Result;
+use async_trait::async_trait;
+use tesseract::Tesseract;
+use uuid::Uuid;
+
+/// Windows backend for [`ComputerController`].
+///
+/// The type currently carries no state; fields for Windows-specific handles
+/// can be added here once the backend is implemented.
+pub struct WindowsController {}
+
+impl WindowsController {
+    /// Builds the (stateless) Windows controller.
+    ///
+    /// Construction always succeeds; a warning is logged because most
+    /// operations of this backend are still unimplemented.
+    pub fn new() -> Result<Self> {
+        tracing::warn!("Windows computer control not fully implemented");
+        let controller = Self {};
+        Ok(controller)
+    }
+}
+
+/// Returns the error shared by every operation that has no Windows backend yet.
+fn not_implemented<T>() -> Result<T> {
+    anyhow::bail!("Windows implementation not yet available")
+}
+
+/// Verifies that a `tesseract` executable can be located via `where`.
+///
+/// # Errors
+/// Returns installation instructions when `where tesseract` cannot be run or
+/// exits unsuccessfully.
+fn ensure_tesseract_installed() -> Result<()> {
+    // A failure to spawn `where` is treated exactly like "not found".
+    let found = std::process::Command::new("where")
+        .arg("tesseract")
+        .output()
+        .map(|output| output.status.success())
+        .unwrap_or(false);
+
+    if !found {
+        anyhow::bail!("Tesseract OCR is not installed on your system.\n\n\
+            To install tesseract on Windows:\n  \
+            1. Download the installer from: https://github.com/UB-Mannheim/tesseract/wiki\n  \
+            2. Run the installer and follow the instructions\n  \
+            3. Add tesseract to your PATH environment variable\n  \
+            4. Restart your terminal/command prompt\n\n\
+            After installation, restart your terminal and try again.");
+    }
+
+    Ok(())
+}
+
+/// Creates a Tesseract engine using the default data path and the `eng` language.
+///
+/// # Errors
+/// Returns troubleshooting hints when the engine cannot be initialized.
+fn init_tesseract() -> Result<Tesseract> {
+    Tesseract::new(None, Some("eng")).map_err(|e| {
+        anyhow::anyhow!("Failed to initialize Tesseract: {}\n\n\
+            This usually means:\n1. Tesseract is not properly installed\n\
+            2. Language data files are missing\n\nTo fix:\n  \
+            1. Reinstall tesseract from https://github.com/UB-Mannheim/tesseract/wiki\n  \
+            2. Make sure to select 'Additional language data' during installation\n  \
+            3. Ensure tesseract is in your PATH", e)
+    })
+}
+
+/// Windows implementation of [`ComputerController`].
+///
+/// Only the OCR entry points contain real logic; every other operation
+/// reports "Windows implementation not yet available".
+#[async_trait]
+impl ComputerController for WindowsController {
+    async fn move_mouse(&self, _x: i32, _y: i32) -> Result<()> {
+        not_implemented()
+    }
+
+    async fn click(&self, _button: MouseButton) -> Result<()> {
+        not_implemented()
+    }
+
+    async fn double_click(&self, _button: MouseButton) -> Result<()> {
+        not_implemented()
+    }
+
+    async fn type_text(&self, _text: &str) -> Result<()> {
+        not_implemented()
+    }
+
+    async fn press_key(&self, _key: &str) -> Result<()> {
+        not_implemented()
+    }
+
+    async fn list_windows(&self) -> Result<Vec<Window>> {
+        not_implemented()
+    }
+
+    async fn focus_window(&self, _window_id: &str) -> Result<()> {
+        not_implemented()
+    }
+
+    async fn get_window_bounds(&self, _window_id: &str) -> Result<Rect> {
+        not_implemented()
+    }
+
+    async fn find_element(&self, _selector: &ElementSelector) -> Result<Option<UIElement>> {
+        not_implemented()
+    }
+
+    async fn get_element_text(&self, _element_id: &str) -> Result<String> {
+        not_implemented()
+    }
+
+    async fn get_element_bounds(&self, _element_id: &str) -> Result<Rect> {
+        not_implemented()
+    }
+
+    async fn take_screenshot(&self, _path: &str, _region: Option<Rect>, _window_id: Option<&str>) -> Result<()> {
+        not_implemented()
+    }
+
+    async fn extract_text_from_screen(&self, _region: Rect) -> Result<OCRResult> {
+        not_implemented()
+    }
+
+    /// Runs OCR over the image file at `path` and returns the recognized text.
+    ///
+    /// The returned `confidence` (0.85) and the all-zero `bounds` are
+    /// placeholders, not values measured by Tesseract.
+    ///
+    /// # Errors
+    /// Fails when Tesseract is not installed, cannot be initialized, cannot
+    /// load the image, or cannot extract text from it.
+    async fn extract_text_from_image(&self, path: &str) -> Result<OCRResult> {
+        ensure_tesseract_installed()?;
+
+        let text = init_tesseract()?
+            .set_image(path)
+            .map_err(|e| anyhow::anyhow!("Failed to load image '{}': {}", path, e))?
+            .get_text()
+            .map_err(|e| anyhow::anyhow!("Failed to extract text from image: {}", e))?;
+
+        Ok(OCRResult {
+            text,
+            // Placeholder: per-word confidence would need more detailed Tesseract API calls.
+            confidence: 0.85,
+            // Placeholder: real bounds would need the image dimensions.
+            bounds: Rect { x: 0, y: 0, width: 0, height: 0 },
+        })
+    }
+
+    /// Screenshots the whole screen, OCRs it and reports whether `text` occurs.
+    ///
+    /// Returns `Ok(None)` when the text is absent. When it is present the
+    /// result is `Some(Point { x: 0, y: 0 })`: this simplified implementation
+    /// does not compute the real location, so callers must not treat the
+    /// point as a click target.
+    ///
+    /// # Errors
+    /// Fails when Tesseract is unavailable, the screenshot cannot be taken, or
+    /// OCR fails. The temporary screenshot is removed on every path.
+    async fn find_text_on_screen(&self, text: &str) -> Result<Option<Point>> {
+        ensure_tesseract_installed()?;
+
+        // Use the OS temp directory rather than assuming `C:\Temp` exists.
+        let temp_path = std::env::temp_dir()
+            .join(format!("g3_ocr_search_{}.png", Uuid::new_v4()))
+            .to_string_lossy()
+            .into_owned();
+
+        if let Err(e) = self.take_screenshot(&temp_path, None, None).await {
+            // Best effort: a partially written file may exist.
+            let _ = std::fs::remove_file(&temp_path);
+            return Err(e);
+        }
+
+        // Collect the OCR outcome first so the temp file is removed even when
+        // initialization or recognition fails.
+        let ocr_outcome = init_tesseract().and_then(|tess| {
+            tess.set_image(temp_path.as_str())
+                .map_err(|e| anyhow::anyhow!("Failed to load screenshot: {}", e))?
+                .get_text()
+                .map_err(|e| anyhow::anyhow!("Failed to extract text from screen: {}", e))
+        });
+
+        // Clean up temp file (best effort).
+        let _ = std::fs::remove_file(&temp_path);
+
+        let full_text = ocr_outcome?;
+
+        // Simple text search - a full implementation would use per-word
+        // bounding boxes to locate the match.
+        if full_text.contains(text) {
+            tracing::warn!("Text found but precise coordinates not available in simplified implementation");
+            Ok(Some(Point { x: 0, y: 0 }))
+        } else {
+            Ok(None)
+        }
+    }
+}
--- a/crates/g3-computer-control/src/types.rs
+++ b/crates/g3-computer-control/src/types.rs
@@ -0,0 +1,65 @@
+use serde::{Deserialize, Serialize};
+
+/// A pair of integer coordinates, e.g. the location reported by
+/// `find_text_on_screen`.
+///
+/// Derives `PartialEq`/`Eq` so points can be compared directly.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+pub struct Point {
+    /// Horizontal coordinate.
+    pub x: i32,
+    /// Vertical coordinate.
+    pub y: i32,
+}
+
+/// An axis-aligned rectangle described by its origin and size.
+///
+/// Derives `PartialEq`/`Eq` so rectangles can be compared directly.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+pub struct Rect {
+    /// Horizontal coordinate of the origin corner.
+    pub x: i32,
+    /// Vertical coordinate of the origin corner.
+    pub y: i32,
+    /// Horizontal extent.
+    pub width: i32,
+    /// Vertical extent.
+    pub height: i32,
+}
+
+impl Rect {
+    /// Returns the midpoint of the rectangle.
+    ///
+    /// Each half-extent is computed with integer division, so odd widths and
+    /// heights are truncated toward zero.
+    pub fn center(&self) -> Point {
+        let Rect { x, y, width, height } = *self;
+        let mid_x = x + width / 2;
+        let mid_y = y + height / 2;
+        Point { x: mid_x, y: mid_y }
+    }
+}
+
+/// Description of a window as reported by `list_windows`.
+///
+/// Derives `PartialEq`/`Eq` so window descriptions can be compared directly.
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub struct Window {
+    /// Identifier accepted by window-specific operations such as `focus_window`.
+    pub id: String,
+    /// Window title.
+    pub title: String,
+    /// Name of the application that owns the window.
+    pub app_name: String,
+    /// Position and size of the window.
+    pub bounds: Rect,
+    /// Whether this window is the active one.
+    pub is_active: bool,
+}
+
+/// A user-interface element, as returned by `find_element`.
+///
+/// Derives `PartialEq`/`Eq` so elements can be compared directly.
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub struct UIElement {
+    /// Identifier accepted by `get_element_text` / `get_element_bounds`.
+    pub id: String,
+    /// Text associated with the element.
+    pub text: String,
+    /// Element role (for example a button or text field).
+    pub role: String,
+    /// Position and size of the element.
+    pub bounds: Rect,
+    /// Whether the element is enabled.
+    pub enabled: bool,
+    /// Whether the element is visible.
+    pub visible: bool,
+    /// Optional value carried by the element, if any.
+    pub value: Option<String>,
+}
+
+/// Mouse button used by `click` and `double_click`.
+///
+/// Derives `PartialEq`/`Eq` so buttons can be compared directly.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+pub enum MouseButton {
+    /// Left mouse button.
+    Left,
+    /// Right mouse button.
+    Right,
+    /// Middle mouse button.
+    Middle,
+}
+
+/// Criteria passed to `find_element`; every field is optional.
+///
+/// Derives `PartialEq`/`Eq` so selectors can be compared directly.
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub struct ElementSelector {
+    /// Text to search for.
+    pub text: Option<String>,
+    /// Element role to match (button, textfield, etc.).
+    pub role: Option<String>,
+    /// Window to search in, if the search should be restricted to one.
+    pub window_id: Option<String>,
+}
+
+/// Outcome of an OCR pass over an image or screen region.
+///
+/// Derives `PartialEq` (not `Eq`, because `confidence` is a float) so results
+/// can be compared directly.
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+pub struct OCRResult {
+    /// Recognized text.
+    pub text: String,
+    /// Confidence reported for the recognition; some backends currently fill
+    /// this with a fixed placeholder rather than a measured value.
+    pub confidence: f32,
+    /// Area the text was extracted from; may be all zeros when the backend
+    /// does not compute it.
+    pub bounds: Rect,
+}
--- a/crates/g3-computer-control/tests/integration_test.rs
+++ b/crates/g3-computer-control/tests/integration_test.rs
@@ -0,0 +1,62 @@
+use g3_computer_control::*;
+
+/// Moving the pointer through the platform controller must succeed.
+#[tokio::test]
+async fn test_mouse_movement() {
+    let ctl = create_controller().expect("Failed to create controller");
+
+    // (960, 540) is the midpoint of a 1920x1080 display; the test assumes
+    // that resolution.
+    let outcome = ctl.move_mouse(960, 540).await;
+    assert!(outcome.is_ok(), "Failed to move mouse: {:?}", outcome.err());
+}
+
+/// Typing a short string through the platform controller must succeed.
+#[tokio::test]
+async fn test_typing() {
+    let ctl = create_controller().expect("Failed to create controller");
+
+    // The text is sent to whatever currently has keyboard focus.
+    let outcome = ctl.type_text("Hello, World!").await;
+    assert!(outcome.is_ok(), "Failed to type text: {:?}", outcome.err());
+}
+
+/// A full-screen screenshot must succeed and produce a file on disk.
+#[tokio::test]
+async fn test_screenshot() {
+    let controller = create_controller().expect("Failed to create controller");
+
+    // Write into the OS temp directory (portable, unlike a hard-coded `/tmp`)
+    // and include the process id so concurrent runs do not collide.
+    let path = std::env::temp_dir()
+        .join(format!("g3_test_screenshot_{}.png", std::process::id()));
+    let path_str = path.to_str().expect("Temp path is not valid UTF-8");
+
+    // Take screenshot
+    let result = controller.take_screenshot(path_str, None, None).await;
+    let file_created = path.exists();
+
+    // Clean up before asserting so a failed assertion does not leave the
+    // file behind.
+    let _ = std::fs::remove_file(&path);
+
+    assert!(result.is_ok(), "Failed to take screenshot: {:?}", result.err());
+    assert!(file_created, "Screenshot file was not created");
+}
+
+/// A left click through the platform controller must succeed.
+#[tokio::test]
+async fn test_click() {
+    let ctl = create_controller().expect("Failed to create controller");
+
+    // No coordinates are passed: the click lands wherever the pointer
+    // currently is.
+    let outcome = ctl.click(types::MouseButton::Left).await;
+    assert!(outcome.is_ok(), "Failed to click: {:?}", outcome.err());
+}
+
+/// A left double click through the platform controller must succeed.
+#[tokio::test]
+async fn test_double_click() {
+    let ctl = create_controller().expect("Failed to create controller");
+
+    // As with the single click, the current pointer position is used.
+    let outcome = ctl.double_click(types::MouseButton::Left).await;
+    assert!(outcome.is_ok(), "Failed to double click: {:?}", outcome.err());
+}
+
+/// Pressing a named key through the platform controller must succeed.
+#[tokio::test]
+async fn test_press_key() {
+    let ctl = create_controller().expect("Failed to create controller");
+
+    // The key is identified by name; "escape" is the one exercised here.
+    let outcome = ctl.press_key("escape").await;
+    assert!(outcome.is_ok(), "Failed to press key: {:?}", outcome.err());
+}
--- a/crates/g3-config/src/lib.rs
+++ b/crates/g3-config/src/lib.rs
@@ -6,6 +6,7 @@ use std::path::Path;
 pub struct Config {
    pub providers: ProvidersConfig,
    pub agent: AgentConfig,
+    // Fall back to `ComputerControlConfig::default()` when the section is
+    // absent, so config files written before this field existed still load.
+    // NOTE(review): assumes `Config` derives `Deserialize` like its sibling
+    // structs in this file - confirm.
+    #[serde(default)]
+    pub computer_control: ComputerControlConfig,
 }

 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -62,6 +63,23 @@ pub struct AgentConfig {
    pub timeout_seconds: u64,
 }

+/// Settings for the computer-control (mouse, keyboard, screenshot, OCR) tools.
+///
+/// `#[serde(default)]` lets a config section specify only some keys (for
+/// example just `enabled = true`); the remaining fields take their values
+/// from the `Default` implementation.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(default)]
+pub struct ComputerControlConfig {
+    /// Whether a computer controller is created at agent start-up.
+    pub enabled: bool,
+    /// Presumably whether actions need user confirmation before running;
+    /// TODO confirm against the code that consumes this flag.
+    pub require_confirmation: bool,
+    /// Presumably an upper bound on the action rate; TODO confirm against
+    /// the code that consumes this value.
+    pub max_actions_per_second: u32,
+}
+
+impl Default for ComputerControlConfig {
+    /// Conservative defaults: the feature is off, confirmation is required,
+    /// and the rate value is 5.
+    fn default() -> Self {
+        // Disabled by default for safety
+        let enabled = false;
+        let require_confirmation = true;
+        let max_actions_per_second = 5;
+
+        Self {
+            enabled,
+            require_confirmation,
+            max_actions_per_second,
+        }
+    }
+}
+
 impl Default for Config {
    fn default() -> Self {
        Self {
@@ -84,6 +102,7 @@ impl Default for Config {
                enable_streaming: true,
                timeout_seconds: 60,
            },
+            computer_control: ComputerControlConfig::default(),
        }
    }
 }
@@ -194,6 +213,7 @@ impl Config {
                enable_streaming: true,
                timeout_seconds: 60,
            },
+            computer_control: ComputerControlConfig::default(),
        }
    }
    
--- a/crates/g3-core/Cargo.toml
+++ b/crates/g3-core/Cargo.toml
@@ -8,6 +8,7 @@ description = "Core engine for G3 AI coding agent"
 g3-providers = { path = "../g3-providers" }
 g3-config = { path = "../g3-config" }
 g3-execution = { path = "../g3-execution" }
+g3-computer-control = { path = "../g3-computer-control" }
 tokio = { workspace = true }
 reqwest = { workspace = true }
 anyhow = { workspace = true }
--- a/crates/g3-core/src/lib.rs
+++ b/crates/g3-core/src/lib.rs
@@ -423,6 +423,7 @@ pub struct Agent<W: UiWriter> {
    ui_writer: W,
    is_autonomous: bool,
    quiet: bool,
+    computer_controller: Option<Box<dyn g3_computer_control::ComputerController>>,
 }

 impl<W: UiWriter> Agent<W> {
@@ -576,6 +577,22 @@ impl<W: UiWriter> Agent<W> {
            info!("Added project README to context window");
        }

+        // Initialize computer controller if enabled
+        let computer_controller = if config.computer_control.enabled {
+            match g3_computer_control::create_controller() {
+                Ok(controller) => {
+                    info!("Computer control enabled");
+                    Some(controller)
+                }
+                Err(e) => {
+                    warn!("Failed to initialize computer control: {}", e);
+                    None
+                }
+            }
+        } else {
+            None
+        };
+
        Ok(Self {
            providers,
            context_window,
@@ -585,6 +602,7 @@ impl<W: UiWriter> Agent<W> {
            ui_writer,
            is_autonomous,
            quiet,
+            computer_controller,
        })
    }

@@ -761,6 +779,8 @@ If you create test or data files temporarily, place these in a subdir named 'tmp

 IMPORTANT: If the user asks you to just respond with text (like \"just say hello\" or \"tell me about X\"), do NOT use tools. Simply respond with the requested text directly. Only use tools when you need to execute commands or complete tasks that require action.

+When taking screenshots of specific windows (like \"my Safari window\" or \"my terminal\"), ALWAYS use list_windows first to identify the correct window ID, then use take_screenshot with the window_id parameter.
+
 Do not explain what you're going to do - just do it by calling the tools.

 # Response Guidelines
@@ -1037,7 +1057,7 @@ The tool will execute immediately and you'll receive the result (success or erro
            },
            Tool {
                name: "read_file".to_string(),
-                description: "Read the contents of a file. Optionally read a specific character range.".to_string(),
+                description: "Read the contents of a file. For image files (png, jpg, jpeg, gif, bmp, tiff, webp), automatically extracts text using OCR. For text files, optionally read a specific character range.".to_string(),
                input_schema: json!({
                    "type": "object",
                    "properties": {
@@ -1115,6 +1135,137 @@ The tool will execute immediately and you'll receive the result (success or erro
                    "required": ["summary"]
                }),
            },
+            Tool {
+                name: "mouse_click".to_string(),
+                description: "Click the mouse at specific coordinates".to_string(),
+                input_schema: json!({
+                    "type": "object",
+                    "properties": {
+                        "x": {
+                            "type": "integer",
+                            "description": "X coordinate"
+                        },
+                        "y": {
+                            "type": "integer",
+                            "description": "Y coordinate"
+                        },
+                        "button": {
+                            "type": "string",
+                            "enum": ["left", "right", "middle"],
+                            "description": "Mouse button to click",
+                            "default": "left"
+                        }
+                    },
+                    "required": ["x", "y"]
+                }),
+            },
+            Tool {
+                name: "type_text".to_string(),
+                description: "Type text at the current cursor position".to_string(),
+                input_schema: json!({
+                    "type": "object",
+                    "properties": {
+                        "text": {
+                            "type": "string",
+                            "description": "Text to type"
+                        }
+                    },
+                    "required": ["text"]
+                }),
+            },
+            Tool {
+                name: "find_element".to_string(),
+                description: "Find a UI element by text, role, or other attributes".to_string(),
+                input_schema: json!({
+                    "type": "object",
+                    "properties": {
+                        "text": {
+                            "type": "string",
+                            "description": "Text to search for"
+                        },
+                        "role": {
+                            "type": "string",
+                            "description": "Element role (button, textfield, etc.)"
+                        },
+                        "window_id": {
+                            "type": "string",
+                            "description": "Optional window ID to search in"
+                        }
+                    }
+                }),
+            },
+            Tool {
+                name: "take_screenshot".to_string(),
+                description: "Capture a screenshot of the screen, region, or window. When capturing a specific application window (e.g., 'Safari', 'Terminal'), use the window_id parameter with just the application name. The tool will automatically use the native screencapture command with the application's window ID for a clean capture.".to_string(),
+                input_schema: json!({
+                    "type": "object",
+                    "properties": {
+                        "path": {
+                            "type": "string",
+                            "description": "Filename for the screenshot (e.g., 'safari.png'). If a relative path is provided, the screenshot will be saved to ~/tmp or $TMPDIR. Use an absolute path to save elsewhere."
+                        },
+                        "window_id": {
+                            "type": "string",
+                            "description": "Optional application name to capture (e.g., 'Safari', 'Terminal', 'Google Chrome'). The tool will capture the frontmost window of that application using its native window ID."
+                        },
+                        "region": {
+                            "type": "object",
+                            "properties": {
+                                "x": {"type": "integer"},
+                                "y": {"type": "integer"},
+                                "width": {"type": "integer"},
+                                "height": {"type": "integer"}
+                            }
+                        }
+                    },
+                    "required": ["path"]
+                }),
+            },
+            Tool {
+                name: "extract_text".to_string(),
+                description: "Extract text from a screen region or image file using OCR".to_string(),
+                input_schema: json!({
+                    "type": "object",
+                    "properties": {
+                        "path": {
+                            "type": "string",
+                            "description": "Path to image file (optional if region is provided)"
+                        },
+                        "region": {
+                            "type": "object",
+                            "description": "Screen region to capture and extract text from",
+                            "properties": {
+                                "x": {"type": "integer"},
+                                "y": {"type": "integer"},
+                                "width": {"type": "integer"},
+                                "height": {"type": "integer"}
+                            }
+                        }
+                    }
+                }),
+            },
+            Tool {
+                name: "find_text_on_screen".to_string(),
+                description: "Find text visually on screen and return its coordinates".to_string(),
+                input_schema: json!({
+                    "type": "object",
+                    "properties": {
+                        "text": {
+                            "type": "string",
+                            "description": "Text to search for on screen"
+                        }
+                    },
+                    "required": ["text"]
+                }),
+            },
+            Tool {
+                name: "list_windows".to_string(),
+                description: "List all currently open windows with their IDs, titles, and application names. Use this to identify which window to interact with before taking screenshots or performing other window-specific operations.".to_string(),
+                input_schema: json!({
+                    "type": "object",
+                    "properties": {}
+                }),
+            },
        ]
    }

@@ -2060,6 +2211,31 @@ The tool will execute immediately and you'll receive the result (success or erro
                debug!("Processing read_file tool call");
                if let Some(file_path) = tool_call.args.get("file_path") {
                    if let Some(path_str) = file_path.as_str() {
+                        // Check if this is an image file
+                        let is_image = path_str.to_lowercase().ends_with(".png")
+                            || path_str.to_lowercase().ends_with(".jpg")
+                            || path_str.to_lowercase().ends_with(".jpeg")
+                            || path_str.to_lowercase().ends_with(".gif")
+                            || path_str.to_lowercase().ends_with(".bmp")
+                            || path_str.to_lowercase().ends_with(".tiff")
+                            || path_str.to_lowercase().ends_with(".tif")
+                            || path_str.to_lowercase().ends_with(".webp");
+
+                        // If it's an image file, use OCR via extract_text
+                        if is_image {
+                            if let Some(controller) = &self.computer_controller {
+                                match controller.extract_text_from_image(path_str).await {
+                                    Ok(result) => {
+                                        return Ok(format!("📄 Image file (OCR extracted, confidence: {:.2}):\n{}", 
+                                            result.confidence, result.text));
+                                    }
+                                    Err(e) => return Ok(format!("❌ Failed to extract text from image '{}': {}", path_str, e)),
+                                }
+                            } else {
+                                return Ok("❌ Computer control not enabled. Cannot perform OCR on image files. Set computer_control.enabled = true in config.".to_string());
+                            }
+                        }
+
                        // Extract optional start and end positions
                        let start_char = tool_call
                            .args
@@ -2397,6 +2573,188 @@ The tool will execute immediately and you'll receive the result (success or erro
                    Ok("✅ Turn completed".to_string())
                }
            }
+            "mouse_click" => {
+                // Move to (x, y), pause, then click. x/y are required: defaulting to 0 would click the screen origin.
+                if let Some(controller) = &self.computer_controller {
+                    let x = tool_call.args.get("x").and_then(|v| v.as_i64()).and_then(|v| i32::try_from(v).ok())
+                        .ok_or_else(|| anyhow::anyhow!("Missing or invalid x argument"))?;
+                    let y = tool_call.args.get("y").and_then(|v| v.as_i64()).and_then(|v| i32::try_from(v).ok())
+                        .ok_or_else(|| anyhow::anyhow!("Missing or invalid y argument"))?;
+                    // Absent or unrecognised button names fall back to left; report the button actually used.
+                    let (button, button_name) = match tool_call.args.get("button").and_then(|v| v.as_str()) {
+                        Some("right") => (g3_computer_control::types::MouseButton::Right, "right"),
+                        Some("middle") => (g3_computer_control::types::MouseButton::Middle, "middle"),
+                        _ => (g3_computer_control::types::MouseButton::Left, "left"),
+                    };
+                    match controller.move_mouse(x, y).await {
+                        Ok(_) => {
+                            tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
+                            match controller.click(button).await {
+                                Ok(_) => Ok(format!("✅ Clicked {} button at ({}, {})", button_name, x, y)),
+                                Err(e) => Ok(format!("❌ Failed to click: {}", e)),
+                            }
+                        }
+                        Err(e) => Ok(format!("❌ Failed to move mouse: {}", e)),
+                    }
+                } else {
+                    Ok("❌ Computer control not enabled. Set computer_control.enabled = true in config.".to_string())
+                }
+            }
+            "type_text" => {
+                // Forward the required `text` argument to `controller.type_text`; a missing `text` propagates as Err via `?`.
+                if let Some(controller) = &self.computer_controller {
+                    let text = tool_call.args.get("text").and_then(|v| v.as_str())
+                        .ok_or_else(|| anyhow::anyhow!("Missing text argument"))?;
+                    Ok(match controller.type_text(text).await {
+                        Ok(_) => format!("✅ Typed text: {}", text),
+                        Err(e) => format!("❌ Failed to type text: {}", e),
+                    })
+                } else {
+                    Ok("❌ Computer control not enabled. Set computer_control.enabled = true in config.".to_string())
+                }
+            }
+            "find_element" => {
+                // Build a selector from the optional `text` / `role` / `window_id` arguments and report the match as JSON.
+                if let Some(controller) = &self.computer_controller {
+                    let selector = {
+                        // Any argument that is absent or not a JSON string becomes `None`.
+                        let arg = |key: &str| tool_call.args.get(key).and_then(|v| v.as_str()).map(String::from);
+                        g3_computer_control::types::ElementSelector {
+                            text: arg("text"), role: arg("role"), window_id: arg("window_id"),
+                        }
+                    };
+                    match controller.find_element(&selector).await {
+                        Err(e) => Ok(format!("❌ Failed to find element: {}", e)),
+                        Ok(None) => Ok("❌ Element not found".to_string()),
+                        Ok(Some(element)) => Ok(match serde_json::to_string_pretty(&element) {
+                            Ok(json) => format!("✅ Found element:\n{}", json),
+                            Err(e) => format!("✅ Found element but failed to serialize: {}", e),
+                        }),
+                    }
+                } else {
+                    Ok("❌ Computer control not enabled. Set computer_control.enabled = true in config.".to_string())
+                }
+            }
+            "take_screenshot" => {
+                // Capture the screen (optionally a single app window and/or a sub-rectangle) to `path`.
+                if let Some(controller) = &self.computer_controller {
+                    let path = tool_call.args.get("path").and_then(|v| v.as_str())
+                        .ok_or_else(|| anyhow::anyhow!("Missing path argument"))?;
+                    // Optional application name selecting which window to capture.
+                    let window_id = tool_call.args.get("window_id").and_then(|v| v.as_str());
+                    // Optional capture rectangle. Missing fields default to 0; out-of-range values
+                    // saturate at the i32 limits instead of wrapping as a bare `as` cast would.
+                    let region = tool_call.args.get("region").and_then(|v| v.as_object()).map(|region_obj| {
+                        let field = |key: &str| {
+                            let raw = region_obj.get(key).and_then(|v| v.as_i64()).unwrap_or(0);
+                            raw.clamp(i64::from(i32::MIN), i64::from(i32::MAX)) as i32
+                        };
+                        g3_computer_control::types::Rect {
+                            x: field("x"), y: field("y"), width: field("width"), height: field("height"),
+                        }
+                    });
+                    match controller.take_screenshot(path, region, window_id).await {
+                        Ok(_) => {
+                            // Rebuild the path to report: absolute paths verbatim, relative ones under
+                            // $TMPDIR, else $HOME/tmp, else /tmp.
+                            // NOTE(review): presumably mirrors where the controller writes — confirm.
+                            let actual_path = if path.starts_with('/') {
+                                path.to_string()
+                            } else {
+                                let temp_dir = std::env::var("TMPDIR")
+                                    .or_else(|_| std::env::var("HOME").map(|h| format!("{}/tmp", h)))
+                                    .unwrap_or_else(|_| "/tmp".to_string());
+                                format!("{}/{}", temp_dir.trim_end_matches('/'), path)
+                            };
+                            match window_id {
+                                Some(app) => Ok(format!("✅ Screenshot of {} saved to: {}", app, actual_path)),
+                                None => Ok(format!("✅ Screenshot saved to: {}", actual_path)),
+                            }
+                        }
+                        Err(e) => Ok(format!("❌ Failed to take screenshot: {}", e)),
+                    }
+                } else {
+                    Ok("❌ Computer control not enabled. Set computer_control.enabled = true in config.".to_string())
+                }
+            }
+            "extract_text" => {
+                // OCR either an image file (`path`) or a screen rectangle (`region`); `path` wins when both are given.
+                if let Some(controller) = &self.computer_controller {
+                    if let Some(path) = tool_call.args.get("path").and_then(|v| v.as_str()) {
+                        // Image file on disk.
+                        match controller.extract_text_from_image(path).await {
+                            Ok(result) => Ok(format!("✅ Extracted text (confidence: {:.2}):\n{}",
+                                result.confidence, result.text)),
+                            Err(e) => Ok(format!("❌ Failed to extract text: {}", e)),
+                        }
+                    } else if let Some(region_obj) = tool_call.args.get("region").and_then(|v| v.as_object()) {
+                        // Screen rectangle. Missing fields default to 0; out-of-range values saturate
+                        // at the i32 limits instead of wrapping as a bare `as` cast would.
+                        let region = {
+                            let field = |key: &str| {
+                                let raw = region_obj.get(key).and_then(|v| v.as_i64()).unwrap_or(0);
+                                raw.clamp(i64::from(i32::MIN), i64::from(i32::MAX)) as i32
+                            };
+                            g3_computer_control::types::Rect {
+                                x: field("x"), y: field("y"), width: field("width"), height: field("height"),
+                            }
+                        };
+                        match controller.extract_text_from_screen(region).await {
+                            Ok(result) => Ok(format!("✅ Extracted text (confidence: {:.2}):\n{}",
+                                result.confidence, result.text)),
+                            Err(e) => Ok(format!("❌ Failed to extract text: {}", e)),
+                        }
+                    } else {
+                        // Neither a string `path` nor an object `region` was supplied.
+                        Ok("❌ Missing path or region argument".to_string())
+                    }
+                } else {
+                    Ok("❌ Computer control not enabled. Set computer_control.enabled = true in config.".to_string())
+                }
+            }
+            "find_text_on_screen" => {
+                // Look up the required `text` argument via `controller.find_text_on_screen` and report the point returned.
+                if let Some(controller) = &self.computer_controller {
+                    let text = tool_call.args.get("text").and_then(|v| v.as_str())
+                        .ok_or_else(|| anyhow::anyhow!("Missing text argument"))?;
+                    // Found / not found / failure are all reported as text rather than as an Err.
+                    let message = match controller.find_text_on_screen(text).await {
+                        Ok(Some(point)) => format!("✅ Found text '{}' at coordinates ({}, {})", text, point.x, point.y),
+                        Ok(None) => format!("❌ Text '{}' not found on screen", text),
+                        Err(e) => format!("❌ Failed to search for text: {}", e),
+                    };
+                    Ok(message)
+                } else {
+                    Ok("❌ Computer control not enabled. Set computer_control.enabled = true in config.".to_string())
+                }
+            }
+            "list_windows" => {
+                // List every window the controller reports as a markdown-style bullet list.
+                if let Some(controller) = &self.computer_controller {
+                    match controller.list_windows().await {
+                        Err(e) => Ok(format!("❌ Failed to list windows: {}", e)),
+                        Ok(windows) if windows.is_empty() => Ok("📋 No windows found".to_string()),
+                        Ok(windows) => {
+                            let mut output = format!("📋 Found {} windows:\n\n", windows.len());
+                            for window in windows {
+                                // Untitled windows get a visible placeholder instead of an empty field.
+                                let title: &str = if window.title.is_empty() { "(no title)" } else { &window.title };
+                                let entry = format!(
+                                    "• **{}** ({}x{})\n  ID: `{}`\n  Title: {}\n\n",
+                                    window.app_name,
+                                    window.bounds.width, window.bounds.height,
+                                    window.id,
+                                    title
+                                );
+                                output.push_str(&entry);
+                            }
+                            Ok(output)
+                        }
+                    }
+                } else {
+                    Ok("❌ Computer control not enabled. Set computer_control.enabled = true in config.".to_string())
+                }
+            }
            _ => {
                warn!("Unknown tool: {}", tool_call.tool);
                Ok(format!("❓ Unknown tool: {}", tool_call.tool))