diff --git a/crates/g3-computer-control/build.rs b/crates/g3-computer-control/build.rs index 60d5598..49ce8b7 100644 --- a/crates/g3-computer-control/build.rs +++ b/crates/g3-computer-control/build.rs @@ -1,100 +1,4 @@ -use std::env; -use std::path::PathBuf; -use std::process::Command; - fn main() { - // Only build Vision bridge on macOS - if env::var("CARGO_CFG_TARGET_OS").unwrap() != "macos" { - return; - } - - println!("cargo:rerun-if-changed=vision-bridge/Sources/VisionBridge/VisionOCR.swift"); - println!("cargo:rerun-if-changed=vision-bridge/Sources/VisionBridge/VisionBridge.h"); - println!("cargo:rerun-if-changed=vision-bridge/Package.swift"); - - let manifest_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap()); - let vision_bridge_dir = manifest_dir.join("vision-bridge"); - - // Build Swift package - println!("cargo:warning=Building VisionBridge Swift package..."); - let build_status = Command::new("swift") - .args(&["build", "-c", "release"]) - .current_dir(&vision_bridge_dir) - .status() - .expect("Failed to build Swift package"); - - if !build_status.success() { - panic!("Swift build failed"); - } - - // Find the built library - let lib_path = vision_bridge_dir - .join(".build/release") - .canonicalize() - .expect("Failed to find .build/release directory"); - - // Copy the dylib to the output directory so it can be found at runtime - let target_dir = manifest_dir - .parent() - .unwrap() - .parent() - .unwrap() - .join("target"); - let profile = env::var("PROFILE").unwrap_or_else(|_| "debug".to_string()); - - // Determine the actual target directory (could be llvm-cov-target or regular target) - let target_dir_name = - env::var("CARGO_TARGET_DIR").unwrap_or_else(|_| target_dir.to_string_lossy().to_string()); - let actual_target_dir = PathBuf::from(&target_dir_name); - let output_dir = actual_target_dir.join(&profile); - - let dylib_src = lib_path.join("libVisionBridge.dylib"); - let dylib_dst = output_dir.join("libVisionBridge.dylib"); - - // Create output directory if it doesn't exist - std::fs::create_dir_all(&output_dir).expect(&format!( - "Failed to create output directory {}", - output_dir.display() - )); - - std::fs::copy(&dylib_src, &dylib_dst).expect(&format!( - "Failed to copy dylib from {} to {}", - dylib_src.display(), - dylib_dst.display() - )); - - println!( - "cargo:warning=Copied libVisionBridge.dylib to {}", - dylib_dst.display() - ); - - // Re-sign the dylib with ad-hoc signature to fix code signing issues on Apple Silicon - // This is necessary because incremental compilation can invalidate signatures - let codesign_status = Command::new("codesign") - .args(&["-f", "-s", "-", dylib_dst.to_str().unwrap()]) - .status(); - - if let Ok(status) = codesign_status { - if !status.success() { - println!("cargo:warning=Failed to codesign libVisionBridge.dylib (non-fatal)"); - } - } - - // Add rpath so the dylib can be found at runtime - println!("cargo:rustc-link-arg=-Wl,-rpath,@executable_path"); - println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path"); - println!("cargo:rustc-link-search=native={}", lib_path.display()); - println!("cargo:rustc-link-lib=dylib=VisionBridge"); - - // Link required frameworks - println!("cargo:rustc-link-lib=framework=Vision"); - println!("cargo:rustc-link-lib=framework=AppKit"); - println!("cargo:rustc-link-lib=framework=Foundation"); - println!("cargo:rustc-link-lib=framework=CoreGraphics"); - println!("cargo:rustc-link-lib=framework=CoreImage"); - - println!( - "cargo:warning=VisionBridge built successfully at {}", - lib_path.display() - ); + // No build-time dependencies required + // VisionBridge OCR has been removed } diff --git a/crates/g3-computer-control/examples/test_vision.rs b/crates/g3-computer-control/examples/test_vision.rs deleted file mode 100644 index 3b65f93..0000000 --- a/crates/g3-computer-control/examples/test_vision.rs +++ /dev/null @@ -1,92 +0,0 @@ -use anyhow::Result; -use g3_computer_control::ocr::{DefaultOCR, OCREngine}; - -#[tokio::main] -async fn main() -> Result<()> { - println!("๐Ÿงช Testing Apple Vision OCR"); - println!("===========================\n"); - - // Initialize OCR engine - println!("๐Ÿ“ฆ Initializing OCR engine..."); - let ocr = DefaultOCR::new()?; - println!("โœ… OCR engine: {}\n", ocr.name()); - - // Check if test image exists - let test_image = "/tmp/safari_test.png"; - if !std::path::Path::new(test_image).exists() { - println!("โš ๏ธ Test image not found: {}", test_image); - println!(" Creating a screenshot..."); - - let status = std::process::Command::new("screencapture") - .arg("-x") - .arg("-R") - .arg("0,0,1200,800") - .arg(test_image) - .status()?; - - if !status.success() { - anyhow::bail!("Failed to create screenshot"); - } - - println!("โœ… Screenshot created\n"); - } - - // Run OCR - println!("๐Ÿ” Running Apple Vision OCR on {}...", test_image); - let start = std::time::Instant::now(); - let locations = ocr.extract_text_with_locations(test_image).await?; - let duration = start.elapsed(); - - println!("โœ… OCR completed in {:.3}s\n", duration.as_secs_f64()); - - // Display results - println!("๐Ÿ“Š Results:"); - println!(" Found {} text elements\n", locations.len()); - - if locations.is_empty() { - println!("โš ๏ธ No text found in image"); - } else { - println!(" Top 20 results:"); - println!( - " {:<4} {:<40} {:<15} {:<12} {:<8}", - "#", "Text", "Position", "Size", "Conf" - ); - println!(" {}", "-".repeat(85)); - - for (i, loc) in locations.iter().take(20).enumerate() { - let text = if loc.text.len() > 37 { - format!("{}...", &loc.text[..37]) - } else { - loc.text.clone() - }; - - println!( - " {:<4} {:<40} ({:>4},{:>4}) {:>4}x{:<4} {:.2}", - i + 1, - text, - loc.x, - loc.y, - loc.width, - loc.height, - loc.confidence - ); - } - - if locations.len() > 20 { - println!("\n ... and {} more", locations.len() - 20); - } - - // Performance comparison - println!("\n๐Ÿ“ˆ Performance:"); - println!(" OCR Speed: {:.3}s", duration.as_secs_f64()); - println!(" Text elements: {}", locations.len()); - println!( - " Avg per element: {:.1}ms", - duration.as_millis() as f64 / locations.len() as f64 - ); - } - - println!("\nโœ… Test complete!"); - - Ok(()) -} diff --git a/crates/g3-computer-control/src/lib.rs b/crates/g3-computer-control/src/lib.rs index d37e295..23632ea 100644 --- a/crates/g3-computer-control/src/lib.rs +++ b/crates/g3-computer-control/src/lib.rs @@ -2,7 +2,6 @@ #![allow(unexpected_cfgs)] pub mod macax; -pub mod ocr; pub mod platform; pub mod types; pub mod webdriver; @@ -30,16 +29,6 @@ pub trait ComputerController: Send + Sync { window_id: Option<&str>, ) -> Result<()>; - // OCR operations - async fn extract_text_from_screen(&self, region: Rect, window_id: &str) -> Result; - async fn extract_text_from_image(&self, path: &str) -> Result; - async fn extract_text_with_locations(&self, path: &str) -> Result>; - async fn find_text_in_app( - &self, - app_name: &str, - search_text: &str, - ) -> Result>; - // Mouse operations fn move_mouse(&self, x: i32, y: i32) -> Result<()>; fn click_at(&self, x: i32, y: i32, app_name: Option<&str>) -> Result<()>; diff --git a/crates/g3-computer-control/src/ocr/mod.rs b/crates/g3-computer-control/src/ocr/mod.rs deleted file mode 100644 index a5c59d4..0000000 --- a/crates/g3-computer-control/src/ocr/mod.rs +++ /dev/null @@ -1,26 +0,0 @@ -use crate::types::TextLocation; -use anyhow::Result; -use async_trait::async_trait; - -/// OCR engine trait for text recognition with bounding boxes -#[async_trait] -pub trait OCREngine: Send + Sync { - /// Extract text with locations from an image file - async fn extract_text_with_locations(&self, path: &str) -> Result>; - - /// Get the name of the OCR engine - fn name(&self) -> &str; -} - -// Platform-specific modules -#[cfg(target_os = "macos")] -pub mod vision; - -pub mod tesseract; - -// Re-export the default OCR engine for the platform -#[cfg(target_os = "macos")] -pub use vision::AppleVisionOCR as DefaultOCR; - -#[cfg(not(target_os = "macos"))] -pub use tesseract::TesseractOCR as DefaultOCR; diff --git a/crates/g3-computer-control/src/ocr/tesseract.rs b/crates/g3-computer-control/src/ocr/tesseract.rs deleted file mode 100644 index 7c11129..0000000 --- a/crates/g3-computer-control/src/ocr/tesseract.rs +++ /dev/null @@ -1,91 +0,0 @@ -use super::OCREngine; -use crate::types::TextLocation; -use anyhow::Result; -use async_trait::async_trait; - -/// Tesseract OCR engine (fallback/cross-platform) -pub struct TesseractOCR; - -impl TesseractOCR { - pub fn new() -> Result { - // Check if tesseract is available - let tesseract_check = std::process::Command::new("which") - .arg("tesseract") - .output(); - - if tesseract_check.is_err() || !tesseract_check.as_ref().unwrap().status.success() { - anyhow::bail!( - "Tesseract OCR is not installed on your system.\n\n\ - To install tesseract:\n macOS: brew install tesseract\n \ - Linux: sudo apt-get install tesseract-ocr (Ubuntu/Debian)\n \ - sudo yum install tesseract (RHEL/CentOS)\n \ - Windows: Download from https://github.com/UB-Mannheim/tesseract/wiki\n\n\ - After installation, restart your terminal and try again." - ); - } - - Ok(Self) - } -} - -#[async_trait] -impl OCREngine for TesseractOCR { - async fn extract_text_with_locations(&self, path: &str) -> Result> { - // Use tesseract CLI with TSV output to get bounding boxes - let output = std::process::Command::new("tesseract") - .arg(path) - .arg("stdout") - .arg("tsv") - .output() - .map_err(|e| anyhow::anyhow!("Failed to run tesseract: {}", e))?; - - if !output.status.success() { - anyhow::bail!( - "Tesseract failed: {}", - String::from_utf8_lossy(&output.stderr) - ); - } - - let tsv_text = String::from_utf8_lossy(&output.stdout); - let mut locations = Vec::new(); - - // Parse TSV output (skip header line) - for (i, line) in tsv_text.lines().enumerate() { - if i == 0 { - continue; - } // Skip header - - let parts: Vec<&str> = line.split('\t').collect(); - if parts.len() >= 12 { - // TSV format: level, page_num, block_num, par_num, line_num, word_num, - // left, top, width, height, conf, text - if let (Ok(x), Ok(y), Ok(w), Ok(h), Ok(conf), text) = ( - parts[6].parse::(), - parts[7].parse::(), - parts[8].parse::(), - parts[9].parse::(), - parts[10].parse::(), - parts[11], - ) { - let trimmed = text.trim(); - if !trimmed.is_empty() && conf > 0.0 { - locations.push(TextLocation { - text: trimmed.to_string(), - x, - y, - width: w, - height: h, - confidence: conf / 100.0, // Convert from 0-100 to 0-1 - }); - } - } - } - } - - Ok(locations) - } - - fn name(&self) -> &str { - "Tesseract OCR" - } -} diff --git a/crates/g3-computer-control/src/ocr/vision.rs b/crates/g3-computer-control/src/ocr/vision.rs deleted file mode 100644 index acc93e9..0000000 --- a/crates/g3-computer-control/src/ocr/vision.rs +++ /dev/null @@ -1,100 +0,0 @@ -use super::OCREngine; -use crate::types::TextLocation; -use anyhow::{Context, Result}; -use async_trait::async_trait; -use std::ffi::{CStr, CString}; -use std::os::raw::{c_char, c_float, c_uint}; - -// FFI bindings to Swift VisionBridge -#[repr(C)] -struct VisionTextBox { - text: *const c_char, - text_len: c_uint, - x: i32, - y: i32, - width: i32, - height: i32, - confidence: c_float, -} - -extern "C" { - fn vision_recognize_text( - image_path: *const c_char, - image_path_len: c_uint, - out_boxes: *mut *mut std::ffi::c_void, - out_count: *mut c_uint, - ) -> bool; - - fn vision_free_boxes(boxes: *mut std::ffi::c_void, count: c_uint); -} - -/// Apple Vision Framework OCR engine -pub struct AppleVisionOCR; - -impl AppleVisionOCR { - pub fn new() -> Result { - Ok(Self) - } -} - -#[async_trait] -impl OCREngine for AppleVisionOCR { - async fn extract_text_with_locations(&self, path: &str) -> Result> { - // Convert path to C string - let c_path = CString::new(path).context("Failed to convert path to C string")?; - - let mut boxes_ptr: *mut std::ffi::c_void = std::ptr::null_mut(); - let mut count: c_uint = 0; - - // Call Swift Vision API - let success = unsafe { - vision_recognize_text( - c_path.as_ptr(), - path.len() as c_uint, - &mut boxes_ptr, - &mut count, - ) - }; - - if !success || boxes_ptr.is_null() { - anyhow::bail!("Apple Vision OCR failed"); - } - - // Convert C array to Rust Vec - let mut locations = Vec::new(); - - unsafe { - let typed_boxes = boxes_ptr as *const VisionTextBox; - let boxes_slice = std::slice::from_raw_parts(typed_boxes, count as usize); - - for box_data in boxes_slice { - // Convert C string to Rust String - let text = if !box_data.text.is_null() { - CStr::from_ptr(box_data.text).to_string_lossy().into_owned() - } else { - String::new() - }; - - if !text.is_empty() { - locations.push(TextLocation { - text, - x: box_data.x, - y: box_data.y, - width: box_data.width, - height: box_data.height, - confidence: box_data.confidence, - }); - } - } - - // Free the C array - vision_free_boxes(boxes_ptr, count); - } - - Ok(locations) - } - - fn name(&self) -> &str { - "Apple Vision Framework" - } -} diff --git a/crates/g3-computer-control/src/platform/linux.rs b/crates/g3-computer-control/src/platform/linux.rs index cdaf64e..01db21b 100644 --- a/crates/g3-computer-control/src/platform/linux.rs +++ b/crates/g3-computer-control/src/platform/linux.rs @@ -1,188 +1,32 @@ -use crate::{types::*, ComputerController}; +use crate::{types::Rect, ComputerController}; use anyhow::Result; use async_trait::async_trait; -use tesseract::Tesseract; -use uuid::Uuid; -pub struct LinuxController { - // Placeholder for X11 connection or other state -} +pub struct LinuxController; impl LinuxController { pub fn new() -> Result { - // Initialize X11 connection tracing::warn!("Linux computer control not fully implemented"); - Ok(Self {}) + Ok(Self) } } #[async_trait] impl ComputerController for LinuxController { - async fn move_mouse(&self, _x: i32, _y: i32) -> Result<()> { - anyhow::bail!("Linux implementation not yet available") - } - - async fn click(&self, _button: MouseButton) -> Result<()> { - anyhow::bail!("Linux implementation not yet available") - } - - async fn double_click(&self, _button: MouseButton) -> Result<()> { - anyhow::bail!("Linux implementation not yet available") - } - - async fn type_text(&self, _text: &str) -> Result<()> { - anyhow::bail!("Linux implementation not yet available") - } - - async fn press_key(&self, _key: &str) -> Result<()> { - anyhow::bail!("Linux implementation not yet available") - } - - async fn list_windows(&self) -> Result> { - anyhow::bail!("Linux implementation not yet available") - } - - async fn focus_window(&self, _window_id: &str) -> Result<()> { - anyhow::bail!("Linux implementation not yet available") - } - - async fn get_window_bounds(&self, _window_id: &str) -> Result { - anyhow::bail!("Linux implementation not yet available") - } - - async fn find_element(&self, _selector: &ElementSelector) -> Result> { - anyhow::bail!("Linux implementation not yet available") - } - - async fn get_element_text(&self, _element_id: &str) -> Result { - anyhow::bail!("Linux implementation not yet available") - } - - async fn get_element_bounds(&self, _element_id: &str) -> Result { - anyhow::bail!("Linux implementation not yet available") - } - async fn take_screenshot( &self, _path: &str, _region: Option, _window_id: Option<&str>, ) -> Result<()> { - // Enforce that window_id must be provided - if _window_id.is_none() { - anyhow::bail!("window_id is required. You must specify which window to capture (e.g., 'Firefox', 'Terminal', 'gedit'). Use list_windows to see available windows."); - } - - anyhow::bail!("Linux implementation not yet available") + anyhow::bail!("Linux screenshot implementation not yet available") } - async fn extract_text_from_screen(&self, _region: Rect, _window_id: &str) -> Result { - anyhow::bail!("Linux implementation not yet available") + fn move_mouse(&self, _x: i32, _y: i32) -> Result<()> { + anyhow::bail!("Linux mouse control not yet available") } - async fn extract_text_from_image(&self, _path: &str) -> Result { - // Check if tesseract is available on the system - let tesseract_check = std::process::Command::new("which") - .arg("tesseract") - .output(); - - if tesseract_check.is_err() || !tesseract_check.as_ref().unwrap().status.success() { - anyhow::bail!( - "Tesseract OCR is not installed on your system.\n\n\ - To install tesseract:\n \ - Ubuntu/Debian: sudo apt-get install tesseract-ocr\n \ - RHEL/CentOS: sudo yum install tesseract\n \ - Arch Linux: sudo pacman -S tesseract\n\n\ - After installation, restart your terminal and try again." - ); - } - - // Initialize Tesseract - let tess = Tesseract::new(None, Some("eng")).map_err(|e| { - anyhow::anyhow!( - "Failed to initialize Tesseract: {}\n\n\ - This usually means:\n1. Tesseract is not properly installed\n\ - 2. Language data files are missing\n\nTo fix:\n \ - Ubuntu/Debian: sudo apt-get install tesseract-ocr-eng\n \ - RHEL/CentOS: sudo yum install tesseract-langpack-eng\n \ - Arch Linux: sudo pacman -S tesseract-data-eng", - e - ) - })?; - - let text = tess - .set_image(_path) - .map_err(|e| anyhow::anyhow!("Failed to load image '{}': {}", _path, e))? - .get_text() - .map_err(|e| anyhow::anyhow!("Failed to extract text from image: {}", e))?; - - // Get confidence (simplified - would need more complex API calls for per-word confidence) - let confidence = 0.85; // Placeholder - - Ok(OCRResult { - text, - confidence, - bounds: Rect { - x: 0, - y: 0, - width: 0, - height: 0, - }, // Would need image dimensions - }) - } - - async fn find_text_on_screen(&self, _text: &str) -> Result> { - // Check if tesseract is available on the system - let tesseract_check = std::process::Command::new("which") - .arg("tesseract") - .output(); - - if tesseract_check.is_err() || !tesseract_check.as_ref().unwrap().status.success() { - anyhow::bail!( - "Tesseract OCR is not installed on your system.\n\n\ - To install tesseract:\n \ - Ubuntu/Debian: sudo apt-get install tesseract-ocr\n \ - RHEL/CentOS: sudo yum install tesseract\n \ - Arch Linux: sudo pacman -S tesseract\n\n\ - After installation, restart your terminal and try again." - ); - } - - // Take full screen screenshot - let temp_path = format!("/tmp/g3_ocr_search_{}.png", uuid::Uuid::new_v4()); - self.take_screenshot(&temp_path, None, None).await?; - - // Use Tesseract to find text with bounding boxes - let tess = Tesseract::new(None, Some("eng")).map_err(|e| { - anyhow::anyhow!( - "Failed to initialize Tesseract: {}\n\n\ - This usually means:\n1. Tesseract is not properly installed\n\ - 2. Language data files are missing\n\nTo fix:\n \ - Ubuntu/Debian: sudo apt-get install tesseract-ocr-eng\n \ - RHEL/CentOS: sudo yum install tesseract-langpack-eng\n \ - Arch Linux: sudo pacman -S tesseract-data-eng", - e - ) - })?; - - let full_text = tess - .set_image(temp_path.as_str()) - .map_err(|e| anyhow::anyhow!("Failed to load screenshot: {}", e))? - .get_text() - .map_err(|e| anyhow::anyhow!("Failed to extract text from screen: {}", e))?; - - // Clean up temp file - let _ = std::fs::remove_file(&temp_path); - - // Simple text search - full implementation would use get_component_images - // to get bounding boxes for each word - if full_text.contains(_text) { - tracing::warn!( - "Text found but precise coordinates not available in simplified implementation" - ); - Ok(Some(Point { x: 0, y: 0 })) - } else { - Ok(None) - } + fn click_at(&self, _x: i32, _y: i32, _app_name: Option<&str>) -> Result<()> { + anyhow::bail!("Linux click control not yet available") } } diff --git a/crates/g3-computer-control/src/platform/macos.rs b/crates/g3-computer-control/src/platform/macos.rs index db64480..2b533aa 100644 --- a/crates/g3-computer-control/src/platform/macos.rs +++ b/crates/g3-computer-control/src/platform/macos.rs @@ -1,7 +1,5 @@ -use crate::ocr::{DefaultOCR, OCREngine}; use crate::{ - types::{Rect, TextLocation}, - ComputerController, + types::Rect, ComputerController, }; use anyhow::{Context, Result}; use async_trait::async_trait; @@ -14,21 +12,12 @@ use core_graphics::window::{ }; use std::path::Path; -pub struct MacOSController { - ocr_engine: Box, - #[allow(dead_code)] - ocr_name: String, -} +pub struct MacOSController; impl MacOSController { pub fn new() -> Result { - let ocr = Box::new(DefaultOCR::new()?); - let ocr_name = ocr.name().to_string(); - tracing::debug!("Initialized macOS controller with OCR engine: {}", ocr_name); - Ok(Self { - ocr_engine: ocr, - ocr_name, - }) + tracing::debug!("Initialized macOS controller"); + Ok(Self) } } @@ -215,78 +204,6 @@ impl ComputerController for MacOSController { Ok(()) } - async fn extract_text_from_screen(&self, region: Rect, window_id: &str) -> Result { - // Take screenshot of region first - let temp_path = format!("/tmp/g3_ocr_{}.png", uuid::Uuid::new_v4()); - self.take_screenshot(&temp_path, Some(region), Some(window_id)) - .await?; - - // Extract text from the screenshot - let result = self.extract_text_from_image(&temp_path).await?; - - // Clean up temp file - let _ = std::fs::remove_file(&temp_path); - - Ok(result) - } - - async fn extract_text_from_image(&self, path: &str) -> Result { - // Extract all text and concatenate - let locations = self.ocr_engine.extract_text_with_locations(path).await?; - Ok(locations - .iter() - .map(|loc| loc.text.as_str()) - .collect::>() - .join(" ")) - } - - async fn extract_text_with_locations(&self, path: &str) -> Result> { - // Use the OCR engine - self.ocr_engine.extract_text_with_locations(path).await - } - - async fn find_text_in_app( - &self, - app_name: &str, - search_text: &str, - ) -> Result> { - // Take screenshot of specific app window - let home = std::env::var("HOME").unwrap_or_else(|_| "/tmp".to_string()); - let temp_path = format!( - "{}/tmp/g3_find_text_{}_{}.png", - home, - app_name, - uuid::Uuid::new_v4() - ); - self.take_screenshot(&temp_path, None, Some(app_name)) - .await?; - - // Get screenshot dimensions before we delete it - let screenshot_dims = get_image_dimensions(&temp_path)?; - - // Extract all text with locations - let locations = self.extract_text_with_locations(&temp_path).await?; - - // Get window bounds to calculate coordinate transformation - let window_bounds = self.get_window_bounds(app_name)?; - - // Clean up temp file - let _ = std::fs::remove_file(&temp_path); - - // Find matching text (case-insensitive) - let search_lower = search_text.to_lowercase(); - for location in locations { - if location.text.to_lowercase().contains(&search_lower) { - // Transform coordinates from screenshot space to screen space - let transformed = - transform_screenshot_to_screen_coords(location, window_bounds, screenshot_dims); - return Ok(Some(transformed)); - } - } - - Ok(None) - } - fn move_mouse(&self, x: i32, y: i32) -> Result<()> { use core_graphics::event::{CGEvent, CGEventTapLocation, CGEventType, CGMouseButton}; use core_graphics::event_source::{CGEventSource, CGEventSourceStateID}; @@ -379,246 +296,6 @@ impl ComputerController for MacOSController { } } -impl MacOSController { - /// Get window bounds for an application (helper method) - fn get_window_bounds(&self, app_name: &str) -> Result<(i32, i32, i32, i32)> { - unsafe { - let window_list = - CGWindowListCopyWindowInfo(kCGWindowListOptionOnScreenOnly, kCGNullWindowID); - - let array = CFArray::::wrap_under_create_rule(window_list); - let count = array.len(); - - let app_name_lower = app_name.to_lowercase(); - - for i in 0..count { - let dict = array.get(i).unwrap(); - - // Get owner name - let owner_key = CFString::from_static_string("kCGWindowOwnerName"); - let owner: String = if let Some(value) = dict.find(owner_key.to_void()) { - let s: CFString = TCFType::wrap_under_get_rule(*value as *const _); - s.to_string() - } else { - continue; - }; - - let owner_lower = owner.to_lowercase(); - - // Normalize by removing spaces for exact matching - let app_name_normalized = app_name_lower.replace(" ", ""); - let owner_normalized = owner_lower.replace(" ", ""); - - // ONLY accept exact matches (case-insensitive, with or without spaces) - // This prevents "Goose" from matching "GooseStudio" - let is_match = - owner_lower == app_name_lower || owner_normalized == app_name_normalized; - - if is_match { - // Get window layer to filter out menu bar windows - let layer_key = CFString::from_static_string("kCGWindowLayer"); - let layer: i32 = if let Some(value) = dict.find(layer_key.to_void()) { - let num: core_foundation::number::CFNumber = - TCFType::wrap_under_get_rule(*value as *const _); - num.to_i32().unwrap_or(0) - } else { - 0 - }; - - // Skip menu bar windows (layer >= 20) - if layer >= 20 { - tracing::debug!( - "Skipping window for '{}' at layer {} (menu bar)", - owner, - layer - ); - continue; - } - - // Get window bounds to verify it's a real window - let bounds_key = CFString::from_static_string("kCGWindowBounds"); - if let Some(value) = dict.find(bounds_key.to_void()) { - let bounds_dict: CFDictionary = - TCFType::wrap_under_get_rule(*value as *const _); - - let x_key = CFString::from_static_string("X"); - let y_key = CFString::from_static_string("Y"); - let width_key = CFString::from_static_string("Width"); - let height_key = CFString::from_static_string("Height"); - - if let (Some(x_val), Some(y_val), Some(w_val), Some(h_val)) = ( - bounds_dict.find(x_key.to_void()), - bounds_dict.find(y_key.to_void()), - bounds_dict.find(width_key.to_void()), - bounds_dict.find(height_key.to_void()), - ) { - let x_num: core_foundation::number::CFNumber = - TCFType::wrap_under_get_rule(*x_val as *const _); - let y_num: core_foundation::number::CFNumber = - TCFType::wrap_under_get_rule(*y_val as *const _); - let w_num: core_foundation::number::CFNumber = - TCFType::wrap_under_get_rule(*w_val as *const _); - let h_num: core_foundation::number::CFNumber = - TCFType::wrap_under_get_rule(*h_val as *const _); - - let x: i32 = x_num.to_i64().unwrap_or(0) as i32; - let y: i32 = y_num.to_i64().unwrap_or(0) as i32; - let w: i32 = w_num.to_i64().unwrap_or(0) as i32; - let h: i32 = h_num.to_i64().unwrap_or(0) as i32; - - // Only accept windows with real bounds (>= 100x100 pixels) - if w >= 100 && h >= 100 { - tracing::debug!("Found valid window bounds for '{}': x={}, y={}, w={}, h={} (layer={})", owner, x, y, w, h, layer); - return Ok((x, y, w, h)); - } else { - tracing::debug!( - "Skipping window for '{}': too small ({}x{})", - owner, - w, - h - ); - continue; - } - } else { - continue; - } - } - } - } - } - - Err(anyhow::anyhow!( - "Could not find window bounds for '{}'", - app_name - )) - } -} - -/// Get image dimensions from a PNG file -fn get_image_dimensions(path: &str) -> Result<(i32, i32)> { - use std::fs::File; - use std::io::Read; - - let mut file = File::open(path)?; - let mut buffer = vec![0u8; 24]; - file.read_exact(&mut buffer)?; - - // PNG signature check - if &buffer[0..8] != b"\x89PNG\r\n\x1a\n" { - anyhow::bail!("Not a valid PNG file"); - } - - // Read IHDR chunk (width and height are at bytes 16-23) - let width = u32::from_be_bytes([buffer[16], buffer[17], buffer[18], buffer[19]]) as i32; - let height = u32::from_be_bytes([buffer[20], buffer[21], buffer[22], buffer[23]]) as i32; - - Ok((width, height)) -} - -/// Transform coordinates from screenshot space to screen space -/// -/// The screenshot is taken of a window, and Vision OCR returns coordinates -/// relative to the screenshot image. We need to transform these to actual -/// screen coordinates for clicking. -/// -/// On Retina displays, screenshots are taken at 2x resolution, so we need -/// to account for this scaling factor. -fn transform_screenshot_to_screen_coords( - location: TextLocation, - window_bounds: (i32, i32, i32, i32), // (x, y, width, height) in screen space - screenshot_dims: (i32, i32), // (width, height) in pixels -) -> TextLocation { - let (win_x, win_y, win_width, win_height) = window_bounds; - let (screenshot_width, screenshot_height) = screenshot_dims; - - // Calculate scale factors - // On Retina displays, screenshot is typically 2x the window size - let scale_x = win_width as f64 / screenshot_width as f64; - let scale_y = win_height as f64 / screenshot_height as f64; - - tracing::debug!( - "Transform: screenshot={}x{}, window={}x{} at ({},{}), scale=({:.2},{:.2})", - screenshot_width, - screenshot_height, - win_width, - win_height, - win_x, - win_y, - scale_x, - scale_y - ); - - // Transform coordinates from image space to screen space - // IMPORTANT: macOS screen coordinates have origin at BOTTOM-LEFT (Y increases upward) - // Image coordinates have origin at TOP-LEFT (Y increases downward) - // win_y is the BOTTOM of the window in screen coordinates - // So we need to: (win_y + win_height) to get window TOP, then subtract screenshot_y - let window_top_y = win_y + win_height; - - tracing::debug!( - "[transform] Input location in image space: x={}, y={}, width={}, height={}", - location.x, - location.y, - location.width, - location.height - ); - tracing::debug!( - "[transform] Scale factors: scale_x={:.4}, scale_y={:.4}", - scale_x, - scale_y - ); - - let transformed_x = win_x + (location.x as f64 * scale_x) as i32; - let transformed_y = window_top_y - (location.y as f64 * scale_y) as i32; - let transformed_width = (location.width as f64 * scale_x) as i32; - let transformed_height = (location.height as f64 * scale_y) as i32; - - tracing::debug!("[transform] Calculation details:"); - tracing::debug!( - " - transformed_x = {} + ({} * {:.4}) = {} + {:.2} = {}", - win_x, - location.x, - scale_x, - win_x, - location.x as f64 * scale_x, - transformed_x - ); - tracing::debug!( - " - transformed_width = ({} * {:.4}) = {:.2} -> {}", - location.width, - scale_x, - location.width as f64 * scale_x, - transformed_width - ); - tracing::debug!( - " - transformed_height = ({} * {:.4}) = {:.2} -> {}", - location.height, - scale_y, - location.height as f64 * scale_y, - transformed_height - ); - - tracing::debug!( - "Transformed location: screenshot=({},{}) {}x{} -> screen=({},{}) {}x{}", - location.x, - location.y, - location.width, - location.height, - transformed_x, - transformed_y, - transformed_width, - transformed_height - ); - - TextLocation { - text: location.text, - x: transformed_x, - y: transformed_y, - width: transformed_width, - height: transformed_height, - confidence: location.confidence, - } -} #[path = "macos_window_matching_test.rs"] #[cfg(test)] diff --git a/crates/g3-computer-control/src/platform/windows.rs b/crates/g3-computer-control/src/platform/windows.rs index 1209084..a551fc0 100644 --- a/crates/g3-computer-control/src/platform/windows.rs +++ b/crates/g3-computer-control/src/platform/windows.rs @@ -1,189 +1,32 @@ -use crate::{types::*, ComputerController}; +use crate::{types::Rect, ComputerController}; use anyhow::Result; use async_trait::async_trait; -use tesseract::Tesseract; -use uuid::Uuid; -pub struct WindowsController { - // Placeholder for Windows-specific state -} +pub struct WindowsController; impl WindowsController { pub fn new() -> Result { tracing::warn!("Windows computer control not fully implemented"); - Ok(Self {}) + Ok(Self) } } #[async_trait] impl ComputerController for WindowsController { - async fn move_mouse(&self, _x: i32, _y: i32) -> Result<()> { - anyhow::bail!("Windows implementation not yet available") - } - - async fn click(&self, _button: MouseButton) -> Result<()> { - anyhow::bail!("Windows implementation not yet available") - } - - async fn double_click(&self, _button: MouseButton) -> Result<()> { - anyhow::bail!("Windows implementation not yet available") - } - - async fn type_text(&self, _text: &str) -> Result<()> { - anyhow::bail!("Windows implementation not yet available") - } - - async fn press_key(&self, _key: &str) -> Result<()> { - anyhow::bail!("Windows implementation not yet available") - } - - async fn list_windows(&self) -> Result> { - anyhow::bail!("Windows implementation not yet available") - } - - async fn focus_window(&self, _window_id: &str) -> Result<()> { - anyhow::bail!("Windows implementation not yet available") - } - - async fn get_window_bounds(&self, _window_id: &str) -> Result { - anyhow::bail!("Windows implementation not yet available") - } - - async fn find_element(&self, _selector: &ElementSelector) -> Result> { - anyhow::bail!("Windows implementation not yet available") - } - - async fn get_element_text(&self, _element_id: &str) -> Result { - anyhow::bail!("Windows implementation not yet available") - } - - async fn get_element_bounds(&self, _element_id: &str) -> Result { - anyhow::bail!("Windows implementation not yet available") - } - async fn take_screenshot( &self, _path: &str, _region: Option, _window_id: Option<&str>, ) -> Result<()> { - // Enforce that window_id must be provided - if _window_id.is_none() { - anyhow::bail!("window_id is required. You must specify which window to capture (e.g., 'Chrome', 'Terminal', 'Notepad'). Use list_windows to see available windows."); - } - - anyhow::bail!("Windows implementation not yet available") + anyhow::bail!("Windows screenshot implementation not yet available") } - async fn extract_text_from_screen(&self, _region: Rect, _window_id: &str) -> Result { - anyhow::bail!("Windows implementation not yet available") + fn move_mouse(&self, _x: i32, _y: i32) -> Result<()> { + anyhow::bail!("Windows mouse control not yet available") } - async fn extract_text_from_image(&self, _path: &str) -> Result { - // Check if tesseract is available on the system - let tesseract_check = std::process::Command::new("where") - .arg("tesseract") - .output(); - - if tesseract_check.is_err() || !tesseract_check.as_ref().unwrap().status.success() { - anyhow::bail!( - "Tesseract OCR is not installed on your system.\n\n\ - To install tesseract on Windows:\n \ - 1. Download the installer from: https://github.com/UB-Mannheim/tesseract/wiki\n \ - 2. Run the installer and follow the instructions\n \ - 3. Add tesseract to your PATH environment variable\n \ - 4. Restart your terminal/command prompt\n\n\ - After installation, restart your terminal and try again." - ); - } - - // Initialize Tesseract - let tess = Tesseract::new(None, Some("eng")).map_err(|e| { - anyhow::anyhow!( - "Failed to initialize Tesseract: {}\n\n\ - This usually means:\n1. Tesseract is not properly installed\n\ - 2. Language data files are missing\n\nTo fix:\n \ - 1. Reinstall tesseract from https://github.com/UB-Mannheim/tesseract/wiki\n \ - 2. Make sure to select 'Additional language data' during installation\n \ - 3. Ensure tesseract is in your PATH", - e - ) - })?; - - let text = tess - .set_image(_path) - .map_err(|e| anyhow::anyhow!("Failed to load image '{}': {}", _path, e))? - .get_text() - .map_err(|e| anyhow::anyhow!("Failed to extract text from image: {}", e))?; - - // Get confidence (simplified - would need more complex API calls for per-word confidence) - let confidence = 0.85; // Placeholder - - Ok(OCRResult { - text, - confidence, - bounds: Rect { - x: 0, - y: 0, - width: 0, - height: 0, - }, // Would need image dimensions - }) - } - - async fn find_text_on_screen(&self, _text: &str) -> Result> { - // Check if tesseract is available on the system - let tesseract_check = std::process::Command::new("where") - .arg("tesseract") - .output(); - - if tesseract_check.is_err() || !tesseract_check.as_ref().unwrap().status.success() { - anyhow::bail!( - "Tesseract OCR is not installed on your system.\n\n\ - To install tesseract on Windows:\n \ - 1. Download the installer from: https://github.com/UB-Mannheim/tesseract/wiki\n \ - 2. Run the installer and follow the instructions\n \ - 3. Add tesseract to your PATH environment variable\n \ - 4. Restart your terminal/command prompt\n\n\ - After installation, restart your terminal and try again." - ); - } - - // Take full screen screenshot - let temp_path = format!("C:\\\\Temp\\\\g3_ocr_search_{}.png", uuid::Uuid::new_v4()); - self.take_screenshot(&temp_path, None, None).await?; - - // Use Tesseract to find text with bounding boxes - let tess = Tesseract::new(None, Some("eng")).map_err(|e| { - anyhow::anyhow!( - "Failed to initialize Tesseract: {}\n\n\ - This usually means:\n1. Tesseract is not properly installed\n\ - 2. Language data files are missing\n\nTo fix:\n \ - 1. Reinstall tesseract from https://github.com/UB-Mannheim/tesseract/wiki\n \ - 2. Make sure to select 'Additional language data' during installation\n \ - 3. Ensure tesseract is in your PATH", - e - ) - })?; - - let full_text = tess - .set_image(temp_path.as_str()) - .map_err(|e| anyhow::anyhow!("Failed to load screenshot: {}", e))? - .get_text() - .map_err(|e| anyhow::anyhow!("Failed to extract text from screen: {}", e))?; - - // Clean up temp file - let _ = std::fs::remove_file(&temp_path); - - // Simple text search - full implementation would use get_component_images - // to get bounding boxes for each word - if full_text.contains(_text) { - tracing::warn!( - "Text found but precise coordinates not available in simplified implementation" - ); - Ok(Some(Point { x: 0, y: 0 })) - } else { - Ok(None) - } + fn click_at(&self, _x: i32, _y: i32, _app_name: Option<&str>) -> Result<()> { + anyhow::bail!("Windows click control not yet available") } } diff --git a/crates/g3-computer-control/src/types.rs b/crates/g3-computer-control/src/types.rs index 7d09042..e7ea40e 100644 --- a/crates/g3-computer-control/src/types.rs +++ b/crates/g3-computer-control/src/types.rs @@ -7,13 +7,3 @@ pub struct Rect { pub width: i32, pub height: i32, } - -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct TextLocation { - pub text: String, - pub x: i32, - pub y: i32, - pub width: i32, - pub height: i32, - pub confidence: f32, -} diff --git a/crates/g3-computer-control/vision-bridge/Package.swift b/crates/g3-computer-control/vision-bridge/Package.swift deleted file mode 100644 index 76d0503..0000000 --- a/crates/g3-computer-control/vision-bridge/Package.swift +++ /dev/null @@ -1,24 +0,0 @@ -// swift-tools-version:5.9 -import PackageDescription - -let package = Package( - name: "VisionBridge", - platforms: [ - .macOS(.v11) - ], - products: [ - .library( - name: "VisionBridge", - type: .dynamic, - targets: ["VisionBridge"] - ), - ], - targets: [ - .target( - name: "VisionBridge", - dependencies: [], - path: "Sources/VisionBridge", - publicHeadersPath: "." - ), - ] -) diff --git a/crates/g3-computer-control/vision-bridge/Sources/VisionBridge/VisionBridge.h b/crates/g3-computer-control/vision-bridge/Sources/VisionBridge/VisionBridge.h deleted file mode 100644 index a83d1dc..0000000 --- a/crates/g3-computer-control/vision-bridge/Sources/VisionBridge/VisionBridge.h +++ /dev/null @@ -1,39 +0,0 @@ -#ifndef VisionBridge_h -#define VisionBridge_h - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -// Text box structure for FFI -typedef struct { - const char* text; - uint32_t text_len; - int32_t x; - int32_t y; - int32_t width; - int32_t height; - float confidence; -} VisionTextBox; - -// Recognize text in an image and return bounding boxes -// Returns true on success, false on failure -// Caller must free the returned boxes using vision_free_boxes -bool vision_recognize_text( - const char* image_path, - uint32_t image_path_len, - VisionTextBox** out_boxes, - uint32_t* out_count -); - -// Free memory allocated by vision_recognize_text -void vision_free_boxes(VisionTextBox* boxes, uint32_t count); - -#ifdef __cplusplus -} -#endif - -#endif /* VisionBridge_h */ diff --git a/crates/g3-computer-control/vision-bridge/Sources/VisionBridge/VisionOCR.swift b/crates/g3-computer-control/vision-bridge/Sources/VisionBridge/VisionOCR.swift deleted file mode 100644 index 5ff12d0..0000000 --- a/crates/g3-computer-control/vision-bridge/Sources/VisionBridge/VisionOCR.swift +++ /dev/null @@ -1,145 +0,0 @@ -import Foundation -import Vision -import AppKit -import CoreGraphics - -// MARK: - C Bridge Functions - -@_cdecl("vision_recognize_text") -public func vision_recognize_text( - _ imagePath: UnsafePointer, - _ imagePathLen: UInt32, - _ outBoxes: UnsafeMutablePointer, - _ outCount: UnsafeMutablePointer -) -> Bool { - // Convert C string to Swift String - guard let pathData = Data(bytes: imagePath, count: Int(imagePathLen)).withUnsafeBytes({ - String(bytes: $0, encoding: .utf8) - }) else { - return false - } - - let path = pathData.trimmingCharacters(in: .whitespaces) - - // Load image - guard let image = NSImage(contentsOfFile: path), - let cgImage = image.cgImage(forProposedRect: nil, context: nil, hints: nil) else { - return false - } - - // Perform OCR - var textBoxes: [CTextBox] = [] - let semaphore = DispatchSemaphore(value: 0) - var success = false - - let request = VNRecognizeTextRequest { request, error in - defer { semaphore.signal() } - - if let error = error { - print("Vision OCR error: \(error.localizedDescription)") - return - } - - guard let observations = request.results as? [VNRecognizedTextObservation] else { - return - } - - let imageSize = CGSize(width: cgImage.width, height: cgImage.height) - - for observation in observations { - guard let candidate = observation.topCandidates(1).first else { continue } - - let text = candidate.string - let boundingBox = observation.boundingBox - - // Convert normalized coordinates (bottom-left origin) to pixel coordinates (top-left origin) - let x = Int32(boundingBox.origin.x * imageSize.width) - let y = Int32((1.0 - boundingBox.origin.y - boundingBox.height) * imageSize.height) - let width = Int32(boundingBox.width * imageSize.width) - let height = Int32(boundingBox.height * imageSize.height) - - // Allocate C string for text - let cString = strdup(text) - - textBoxes.append(CTextBox( - text: cString, - text_len: UInt32(text.utf8.count), - x: x, - y: y, - width: width, - height: height, - confidence: observation.confidence - )) - } - - success = true - } - - // Configure request for best accuracy - request.recognitionLevel = .accurate - request.usesLanguageCorrection = true - request.recognitionLanguages = ["en-US"] - - // Perform request - let handler = VNImageRequestHandler(cgImage: cgImage, options: [:]) - do { - try handler.perform([request]) - } catch { - print("Vision request failed: \(error.localizedDescription)") - return false - } - - // Wait for completion - semaphore.wait() - - if !success { - return false - } - - // Allocate array for results - let boxesPtr = UnsafeMutablePointer.allocate(capacity: textBoxes.count) - for (index, box) in textBoxes.enumerated() { - boxesPtr[index] = box - } - - outBoxes.pointee = UnsafeMutableRawPointer(boxesPtr) - outCount.pointee = UInt32(textBoxes.count) - - return true -} - -@_cdecl("vision_free_boxes") -public func vision_free_boxes( - _ boxes: UnsafeMutableRawPointer, - _ count: UInt32 -) { - let typedBoxes = boxes.assumingMemoryBound(to: CTextBox.self) - for i in 0..? - public let text_len: UInt32 - public let x: Int32 - public let y: Int32 - public let width: Int32 - public let height: Int32 - public let confidence: Float - - public init(text: UnsafePointer?, text_len: UInt32, x: Int32, y: Int32, width: Int32, height: Int32, confidence: Float) { - self.text = text - self.text_len = text_len - self.x = x - self.y = y - self.width = width - self.height = height - self.confidence = confidence - } -}