computer control tools

This commit is contained in:
Dhanji Prasanna
2025-10-18 14:16:42 +11:00
parent a566171203
commit da652bf287
22 changed files with 2720 additions and 36 deletions

View File

@@ -0,0 +1,51 @@
pub mod types;
pub mod platform;
use anyhow::Result;
use async_trait::async_trait;
use types::*;
#[async_trait]
pub trait ComputerController: Send + Sync {
// Mouse operations
async fn move_mouse(&self, x: i32, y: i32) -> Result<()>;
async fn click(&self, button: MouseButton) -> Result<()>;
async fn double_click(&self, button: MouseButton) -> Result<()>;
// Keyboard operations
async fn type_text(&self, text: &str) -> Result<()>;
async fn press_key(&self, key: &str) -> Result<()>;
// Window management
async fn list_windows(&self) -> Result<Vec<Window>>;
async fn focus_window(&self, window_id: &str) -> Result<()>;
async fn get_window_bounds(&self, window_id: &str) -> Result<Rect>;
// UI element inspection
async fn find_element(&self, selector: &ElementSelector) -> Result<Option<UIElement>>;
async fn get_element_text(&self, element_id: &str) -> Result<String>;
async fn get_element_bounds(&self, element_id: &str) -> Result<Rect>;
// Screen capture
async fn take_screenshot(&self, path: &str, region: Option<Rect>, window_id: Option<&str>) -> Result<()>;
// OCR operations
async fn extract_text_from_screen(&self, region: Rect) -> Result<OCRResult>;
async fn extract_text_from_image(&self, path: &str) -> Result<OCRResult>;
async fn find_text_on_screen(&self, text: &str) -> Result<Option<Point>>;
}
// Platform-specific constructor
pub fn create_controller() -> Result<Box<dyn ComputerController>> {
#[cfg(target_os = "macos")]
return Ok(Box::new(platform::macos::MacOSController::new()?));
#[cfg(target_os = "linux")]
return Ok(Box::new(platform::linux::LinuxController::new()?));
#[cfg(target_os = "windows")]
return Ok(Box::new(platform::windows::WindowsController::new()?));
#[cfg(not(any(target_os = "macos", target_os = "linux", target_os = "windows")))]
anyhow::bail!("Unsupported platform")
}

View File

@@ -0,0 +1,161 @@
use crate::{ComputerController, types::*};
use anyhow::Result;
use async_trait::async_trait;
use tesseract::Tesseract;
use uuid::Uuid;
pub struct LinuxController {
// Placeholder for X11 connection or other state
}
impl LinuxController {
pub fn new() -> Result<Self> {
// Initialize X11 connection
tracing::warn!("Linux computer control not fully implemented");
Ok(Self {})
}
}
#[async_trait]
impl ComputerController for LinuxController {
async fn move_mouse(&self, _x: i32, _y: i32) -> Result<()> {
anyhow::bail!("Linux implementation not yet available")
}
async fn click(&self, _button: MouseButton) -> Result<()> {
anyhow::bail!("Linux implementation not yet available")
}
async fn double_click(&self, _button: MouseButton) -> Result<()> {
anyhow::bail!("Linux implementation not yet available")
}
async fn type_text(&self, _text: &str) -> Result<()> {
anyhow::bail!("Linux implementation not yet available")
}
async fn press_key(&self, _key: &str) -> Result<()> {
anyhow::bail!("Linux implementation not yet available")
}
async fn list_windows(&self) -> Result<Vec<Window>> {
anyhow::bail!("Linux implementation not yet available")
}
async fn focus_window(&self, _window_id: &str) -> Result<()> {
anyhow::bail!("Linux implementation not yet available")
}
async fn get_window_bounds(&self, _window_id: &str) -> Result<Rect> {
anyhow::bail!("Linux implementation not yet available")
}
async fn find_element(&self, _selector: &ElementSelector) -> Result<Option<UIElement>> {
anyhow::bail!("Linux implementation not yet available")
}
async fn get_element_text(&self, _element_id: &str) -> Result<String> {
anyhow::bail!("Linux implementation not yet available")
}
async fn get_element_bounds(&self, _element_id: &str) -> Result<Rect> {
anyhow::bail!("Linux implementation not yet available")
}
async fn take_screenshot(&self, _path: &str, _region: Option<Rect>, _window_id: Option<&str>) -> Result<()> {
anyhow::bail!("Linux implementation not yet available")
}
async fn extract_text_from_screen(&self, _region: Rect) -> Result<OCRResult> {
anyhow::bail!("Linux implementation not yet available")
}
async fn extract_text_from_image(&self, _path: &str) -> Result<OCRResult> {
// Check if tesseract is available on the system
let tesseract_check = std::process::Command::new("which")
.arg("tesseract")
.output();
if tesseract_check.is_err() || !tesseract_check.as_ref().unwrap().status.success() {
anyhow::bail!("Tesseract OCR is not installed on your system.\n\n\
To install tesseract:\n \
Ubuntu/Debian: sudo apt-get install tesseract-ocr\n \
RHEL/CentOS: sudo yum install tesseract\n \
Arch Linux: sudo pacman -S tesseract\n\n\
After installation, restart your terminal and try again.");
}
// Initialize Tesseract
let tess = Tesseract::new(None, Some("eng"))
.map_err(|e| {
anyhow::anyhow!("Failed to initialize Tesseract: {}\n\n\
This usually means:\n1. Tesseract is not properly installed\n\
2. Language data files are missing\n\nTo fix:\n \
Ubuntu/Debian: sudo apt-get install tesseract-ocr-eng\n \
RHEL/CentOS: sudo yum install tesseract-langpack-eng\n \
Arch Linux: sudo pacman -S tesseract-data-eng", e)
})?;
let text = tess.set_image(_path)
.map_err(|e| anyhow::anyhow!("Failed to load image '{}': {}", _path, e))?
.get_text()
.map_err(|e| anyhow::anyhow!("Failed to extract text from image: {}", e))?;
// Get confidence (simplified - would need more complex API calls for per-word confidence)
let confidence = 0.85; // Placeholder
Ok(OCRResult {
text,
confidence,
bounds: Rect { x: 0, y: 0, width: 0, height: 0 }, // Would need image dimensions
})
}
async fn find_text_on_screen(&self, _text: &str) -> Result<Option<Point>> {
// Check if tesseract is available on the system
let tesseract_check = std::process::Command::new("which")
.arg("tesseract")
.output();
if tesseract_check.is_err() || !tesseract_check.as_ref().unwrap().status.success() {
anyhow::bail!("Tesseract OCR is not installed on your system.\n\n\
To install tesseract:\n \
Ubuntu/Debian: sudo apt-get install tesseract-ocr\n \
RHEL/CentOS: sudo yum install tesseract\n \
Arch Linux: sudo pacman -S tesseract\n\n\
After installation, restart your terminal and try again.");
}
// Take full screen screenshot
let temp_path = format!("/tmp/g3_ocr_search_{}.png", uuid::Uuid::new_v4());
self.take_screenshot(&temp_path, None, None).await?;
// Use Tesseract to find text with bounding boxes
let tess = Tesseract::new(None, Some("eng"))
.map_err(|e| {
anyhow::anyhow!("Failed to initialize Tesseract: {}\n\n\
This usually means:\n1. Tesseract is not properly installed\n\
2. Language data files are missing\n\nTo fix:\n \
Ubuntu/Debian: sudo apt-get install tesseract-ocr-eng\n \
RHEL/CentOS: sudo yum install tesseract-langpack-eng\n \
Arch Linux: sudo pacman -S tesseract-data-eng", e)
})?;
let full_text = tess.set_image(temp_path.as_str())
.map_err(|e| anyhow::anyhow!("Failed to load screenshot: {}", e))?
.get_text()
.map_err(|e| anyhow::anyhow!("Failed to extract text from screen: {}", e))?;
// Clean up temp file
let _ = std::fs::remove_file(&temp_path);
// Simple text search - full implementation would use get_component_images
// to get bounding boxes for each word
if full_text.contains(_text) {
tracing::warn!("Text found but precise coordinates not available in simplified implementation");
Ok(Some(Point { x: 0, y: 0 }))
} else {
Ok(None)
}
}
}

View File

@@ -0,0 +1,562 @@
use crate::{ComputerController, types::*};
use anyhow::Result;
use async_trait::async_trait;
use core_graphics::display::CGPoint;
use core_graphics::event::{CGEvent, CGEventType, CGMouseButton, CGEventTapLocation};
use core_graphics::event_source::{CGEventSource, CGEventSourceStateID};
use std::path::Path;
use tesseract::Tesseract;
use core_graphics::window::{kCGWindowListOptionOnScreenOnly, kCGNullWindowID, CGWindowListCopyWindowInfo};
use core_foundation::dictionary::CFDictionary;
use core_foundation::string::CFString;
use core_foundation::base::{TCFType, ToVoid};
// MacOSController doesn't store CGEventSource to avoid Send/Sync issues
// We create it fresh for each operation
pub struct MacOSController {
// Empty struct - event source created per operation
}
impl MacOSController {
pub fn new() -> Result<Self> {
// Test that we can create an event source
let _event_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState)
.map_err(|_| anyhow::anyhow!("Failed to create event source. Make sure Accessibility permissions are granted."))?;
Ok(Self {})
}
fn key_to_keycode(&self, key: &str) -> Result<u16> {
// Map key names to macOS keycodes
let keycode = match key.to_lowercase().as_str() {
"return" | "enter" => 36,
"tab" => 48,
"space" => 49,
"delete" | "backspace" => 51,
"escape" | "esc" => 53,
"command" | "cmd" => 55,
"shift" => 56,
"capslock" => 57,
"option" | "alt" => 58,
"control" | "ctrl" => 59,
"left" => 123,
"right" => 124,
"down" => 125,
"up" => 126,
_ => anyhow::bail!("Unknown key: {}", key),
};
Ok(keycode)
}
}
#[async_trait]
impl ComputerController for MacOSController {
async fn move_mouse(&self, x: i32, y: i32) -> Result<()> {
let event_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState)
.map_err(|_| anyhow::anyhow!("Failed to create event source"))?;
let point = CGPoint::new(x as f64, y as f64);
let event = CGEvent::new_mouse_event(
event_source,
CGEventType::MouseMoved,
point,
CGMouseButton::Left,
).map_err(|_| anyhow::anyhow!("Failed to create mouse move event"))?;
event.post(CGEventTapLocation::HID);
Ok(())
}
async fn click(&self, button: MouseButton) -> Result<()> {
let (cg_button, down_type, up_type) = match button {
MouseButton::Left => (CGMouseButton::Left, CGEventType::LeftMouseDown, CGEventType::LeftMouseUp),
MouseButton::Right => (CGMouseButton::Right, CGEventType::RightMouseDown, CGEventType::RightMouseUp),
MouseButton::Middle => (CGMouseButton::Center, CGEventType::OtherMouseDown, CGEventType::OtherMouseUp),
};
let point = {
// Get current mouse position
let temp_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState)
.map_err(|_| anyhow::anyhow!("Failed to create event source"))?;
let event = CGEvent::new(temp_source)
.map_err(|_| anyhow::anyhow!("Failed to get mouse position"))?;
let p = event.location();
p
};
{
let event_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState)
.map_err(|_| anyhow::anyhow!("Failed to create event source"))?;
// Mouse down
let down_event = CGEvent::new_mouse_event(
event_source,
down_type,
point,
cg_button,
).map_err(|_| anyhow::anyhow!("Failed to create mouse down event"))?;
down_event.post(CGEventTapLocation::HID);
} // event_source and down_event dropped here
// Small delay
tokio::time::sleep(tokio::time::Duration::from_millis(50)).await;
{
let event_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState)
.map_err(|_| anyhow::anyhow!("Failed to create event source"))?;
let up_event = CGEvent::new_mouse_event(
event_source,
up_type,
point,
cg_button,
).map_err(|_| anyhow::anyhow!("Failed to create mouse up event"))?;
up_event.post(CGEventTapLocation::HID);
} // event_source and up_event dropped here
Ok(())
}
async fn double_click(&self, button: MouseButton) -> Result<()> {
self.click(button).await?;
tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
self.click(button).await?;
Ok(())
}
async fn type_text(&self, text: &str) -> Result<()> {
for ch in text.chars() {
{
let event_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState)
.map_err(|_| anyhow::anyhow!("Failed to create event source"))?;
// Create keyboard event for character
let event = CGEvent::new_keyboard_event(
event_source,
0, // keycode (0 for unicode)
true,
).map_err(|_| anyhow::anyhow!("Failed to create keyboard event"))?;
// Set unicode string
let mut utf16_buf = [0u16; 2];
let utf16_slice = ch.encode_utf16(&mut utf16_buf);
let utf16_chars: Vec<u16> = utf16_slice.iter().copied().collect();
event.set_string_from_utf16_unchecked(utf16_chars.as_slice());
event.post(CGEventTapLocation::HID);
} // event_source and event dropped here
tokio::time::sleep(tokio::time::Duration::from_millis(10)).await;
}
Ok(())
}
async fn press_key(&self, key: &str) -> Result<()> {
let keycode = self.key_to_keycode(key)?;
{
let event_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState)
.map_err(|_| anyhow::anyhow!("Failed to create event source"))?;
// Key down
let down_event = CGEvent::new_keyboard_event(
event_source,
keycode,
true,
).map_err(|_| anyhow::anyhow!("Failed to create key down event"))?;
down_event.post(CGEventTapLocation::HID);
} // event_source and down_event dropped here
tokio::time::sleep(tokio::time::Duration::from_millis(50)).await;
{
let event_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState)
.map_err(|_| anyhow::anyhow!("Failed to create event source"))?;
// Key up
let up_event = CGEvent::new_keyboard_event(
event_source,
keycode,
false,
).map_err(|_| anyhow::anyhow!("Failed to create key up event"))?;
up_event.post(CGEventTapLocation::HID);
} // event_source and up_event dropped here
Ok(())
}
async fn list_windows(&self) -> Result<Vec<Window>> {
let mut windows = Vec::new();
unsafe {
let window_list = CGWindowListCopyWindowInfo(
kCGWindowListOptionOnScreenOnly,
kCGNullWindowID
);
let array = core_foundation::array::CFArray::<CFDictionary>::wrap_under_create_rule(window_list);
let count = array.len();
for i in 0..count {
let dict = array.get(i).unwrap();
// Get window ID
let window_id_key = CFString::from_static_string("kCGWindowNumber");
let window_id: i64 = if let Some(value) = dict.find(window_id_key.to_void()) {
let num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*value as *const _);
num.to_i64().unwrap_or(0)
} else {
0
};
// Get owner name (app name)
let owner_key = CFString::from_static_string("kCGWindowOwnerName");
let app_name: String = if let Some(value) = dict.find(owner_key.to_void()) {
let s: CFString = TCFType::wrap_under_get_rule(*value as *const _);
s.to_string()
} else {
"Unknown".to_string()
};
// Get window name/title
let name_key = CFString::from_static_string("kCGWindowName");
let title: String = if let Some(value) = dict.find(name_key.to_void()) {
let s: CFString = TCFType::wrap_under_get_rule(*value as *const _);
s.to_string()
} else {
"".to_string()
};
// Get window bounds
let bounds_key = CFString::from_static_string("kCGWindowBounds");
let bounds = if let Some(bounds_value) = dict.find(bounds_key.to_void()) {
let bounds_dict: CFDictionary = TCFType::wrap_under_get_rule(*bounds_value as *const _);
let x_key = CFString::from_static_string("X");
let y_key = CFString::from_static_string("Y");
let width_key = CFString::from_static_string("Width");
let height_key = CFString::from_static_string("Height");
let x = if let Some(x_value) = bounds_dict.find(x_key.to_void()) {
let num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*x_value as *const _);
num.to_i32().unwrap_or(0)
} else { 0 };
let y = if let Some(y_value) = bounds_dict.find(y_key.to_void()) {
let num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*y_value as *const _);
num.to_i32().unwrap_or(0)
} else { 0 };
let width = if let Some(width_value) = bounds_dict.find(width_key.to_void()) {
let num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*width_value as *const _);
num.to_i32().unwrap_or(0)
} else { 0 };
let height = if let Some(height_value) = bounds_dict.find(height_key.to_void()) {
let num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*height_value as *const _);
num.to_i32().unwrap_or(0)
} else { 0 };
Rect { x, y, width, height }
} else {
Rect { x: 0, y: 0, width: 0, height: 0 }
};
// Skip windows without meaningful content (system UI elements, etc.)
if app_name.is_empty() || (title.is_empty() && bounds.width < 100) {
continue;
}
windows.push(Window {
id: format!("{}:{}", app_name, window_id),
title,
app_name,
bounds,
is_active: false, // We'd need additional API calls to determine this
});
}
}
Ok(windows)
}
async fn focus_window(&self, _window_id: &str) -> Result<()> {
// Note: Full implementation would use NSWorkspace to activate application
tracing::warn!("focus_window not fully implemented on macOS");
Ok(())
}
async fn get_window_bounds(&self, _window_id: &str) -> Result<Rect> {
// Note: Full implementation would use Accessibility API
tracing::warn!("get_window_bounds not fully implemented on macOS");
Ok(Rect { x: 0, y: 0, width: 800, height: 600 })
}
async fn find_element(&self, _selector: &ElementSelector) -> Result<Option<UIElement>> {
// Note: Full implementation would use macOS Accessibility API
tracing::warn!("find_element not fully implemented on macOS");
Ok(None)
}
async fn get_element_text(&self, _element_id: &str) -> Result<String> {
// Note: Full implementation would use Accessibility API
tracing::warn!("get_element_text not fully implemented on macOS");
Ok(String::new())
}
async fn get_element_bounds(&self, _element_id: &str) -> Result<Rect> {
// Note: Full implementation would use Accessibility API
tracing::warn!("get_element_bounds not fully implemented on macOS");
Ok(Rect { x: 0, y: 0, width: 100, height: 30 })
}
async fn take_screenshot(&self, path: &str, region: Option<Rect>, window_id: Option<&str>) -> Result<()> {
// Determine the temporary directory for screenshots
let temp_dir = std::env::var("TMPDIR")
.or_else(|_| std::env::var("HOME").map(|h| format!("{}/tmp", h)))
.unwrap_or_else(|_| "/tmp".to_string());
// Ensure temp directory exists
std::fs::create_dir_all(&temp_dir)?;
// If path is relative or doesn't specify a directory, use temp_dir
let final_path = if path.starts_with('/') {
path.to_string()
} else {
format!("{}/{}", temp_dir.trim_end_matches('/'), path)
};
// Get the currently focused application before taking screenshot
let current_app = std::process::Command::new("osascript")
.arg("-e")
.arg("tell application \"System Events\" to get name of first application process whose frontmost is true")
.output()
.ok()
.and_then(|output| {
if output.status.success() {
Some(String::from_utf8_lossy(&output.stdout).trim().to_string())
} else {
None
}
});
// Handle application-based window capture
let app_name_opt = window_id.and_then(|id| {
// Extract app name from window_id format "AppName:WindowNumber"
id.split(':').next().map(String::from)
});
// If we're capturing a specific window, foreground it first
if let Some(ref app) = app_name_opt {
tracing::debug!("Foregrounding application: {}", app);
let _ = std::process::Command::new("osascript")
.arg("-e")
.arg(format!("tell application \"{}\" to activate", app))
.output();
// Give the window time to come to the front
tokio::time::sleep(tokio::time::Duration::from_millis(500)).await;
}
let screenshot_result = if let Some(ref app) = app_name_opt {
// Use screencapture with AppleScript to get window ID
let script = format!(
r#"tell application "{}" to id of window 1"#,
app
);
let output = std::process::Command::new("osascript")
.arg("-e")
.arg(&script)
.output()?;
if output.status.success() {
let window_id_str = String::from_utf8_lossy(&output.stdout).trim().to_string();
std::process::Command::new("screencapture")
.arg(format!("-l{}", window_id_str))
.arg("-o")
.arg(&final_path)
.output()
} else {
// Fallback to regular screenshot if we can't get window ID
std::process::Command::new("screencapture")
.arg("-x")
.arg(&final_path)
.output()
}
} else {
// Regular screenshot (full screen or region)
// Use native macOS screencapture command which handles all the format complexities
// Check if we have Screen Recording permission by attempting a test capture
// If we only get wallpaper/menubar but no windows, we need permission
let needs_permission_check = std::env::var("G3_SKIP_PERMISSION_CHECK").is_err();
if needs_permission_check {
// Try to open Screen Recording settings if this is the first screenshot
static PERMISSION_PROMPTED: std::sync::atomic::AtomicBool = std::sync::atomic::AtomicBool::new(false);
if !PERMISSION_PROMPTED.swap(true, std::sync::atomic::Ordering::Relaxed) {
tracing::warn!("\n=== Screen Recording Permission Required ===\n\
macOS requires explicit permission to capture window content.\n\
If screenshots only show wallpaper/menubar (no windows):\n\n\
1. Open System Settings > Privacy & Security > Screen Recording\n\
2. Enable permission for your terminal (iTerm/Terminal) or g3\n\
3. Restart your terminal if needed\n\n\
Opening Screen Recording settings now...\n");
// Try to open the settings (non-blocking)
let _ = std::process::Command::new("open")
.arg("x-apple.systempreferences:com.apple.preference.security?Privacy_ScreenCapture")
.spawn();
}
}
let path_obj = Path::new(&final_path);
if let Some(parent) = path_obj.parent() {
std::fs::create_dir_all(parent)?;
}
let mut cmd = std::process::Command::new("screencapture");
// Add flags
cmd.arg("-x"); // No sound
if let Some(region) = region {
// Capture specific region: -R x,y,width,height
cmd.arg("-R");
cmd.arg(format!("{},{},{},{}", region.x, region.y, region.width, region.height));
}
cmd.arg(&final_path);
cmd.output()
}?;
if !screenshot_result.status.success() {
let stderr = String::from_utf8_lossy(&screenshot_result.stderr);
return Err(anyhow::anyhow!("screencapture failed: {}", stderr));
}
// Re-foreground the original application if we foregrounded a different window
if let Some(ref target_app) = app_name_opt {
if let Some(ref original_app) = current_app {
// Only restore if we actually changed the foreground app
if target_app != original_app {
tracing::debug!("Restoring focus to original application: {}", original_app);
// Small delay to ensure screenshot is complete
tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
let _ = std::process::Command::new("osascript")
.arg("-e")
.arg(format!("tell application \"{}\" to activate", original_app))
.output();
}
}
}
tracing::debug!("Screenshot saved using screencapture: {}", final_path);
Ok(())
}
async fn extract_text_from_screen(&self, region: Rect) -> Result<OCRResult> {
// Take screenshot of region first
let temp_path = format!("/tmp/g3_ocr_{}.png", uuid::Uuid::new_v4());
self.take_screenshot(&temp_path, Some(region), None).await?;
// Extract text from the screenshot
let result = self.extract_text_from_image(&temp_path).await?;
// Clean up temp file
let _ = std::fs::remove_file(&temp_path);
Ok(result)
}
async fn extract_text_from_image(&self, _path: &str) -> Result<OCRResult> {
// Check if tesseract is available on the system
let tesseract_check = std::process::Command::new("which")
.arg("tesseract")
.output();
if tesseract_check.is_err() || !tesseract_check.as_ref().unwrap().status.success() {
anyhow::bail!("Tesseract OCR is not installed on your system.\n\n\
To install tesseract:\n macOS: brew install tesseract\n \
Linux: sudo apt-get install tesseract-ocr (Ubuntu/Debian)\n \
sudo yum install tesseract (RHEL/CentOS)\n \
Windows: Download from https://github.com/UB-Mannheim/tesseract/wiki\n\n\
After installation, restart your terminal and try again.");
}
// Initialize Tesseract
let tess = Tesseract::new(None, Some("eng"))
.map_err(|e| {
anyhow::anyhow!("Failed to initialize Tesseract: {}\n\n\
This usually means:\n1. Tesseract is not properly installed\n\
2. Language data files are missing\n\nTo fix:\n \
macOS: brew reinstall tesseract\n \
Linux: sudo apt-get install tesseract-ocr-eng\n \
Windows: Reinstall tesseract and ensure language files are included", e)
})?;
let text = tess.set_image(_path)
.map_err(|e| anyhow::anyhow!("Failed to load image '{}': {}", _path, e))?
.get_text()
.map_err(|e| anyhow::anyhow!("Failed to extract text from image: {}", e))?;
// Get confidence (simplified - would need more complex API calls for per-word confidence)
let confidence = 0.85; // Placeholder
Ok(OCRResult {
text,
confidence,
bounds: Rect { x: 0, y: 0, width: 0, height: 0 }, // Would need image dimensions
})
}
async fn find_text_on_screen(&self, _text: &str) -> Result<Option<Point>> {
// Check if tesseract is available on the system
let tesseract_check = std::process::Command::new("which")
.arg("tesseract")
.output();
if tesseract_check.is_err() || !tesseract_check.as_ref().unwrap().status.success() {
anyhow::bail!("Tesseract OCR is not installed on your system.\n\n\
To install tesseract:\n macOS: brew install tesseract\n \
Linux: sudo apt-get install tesseract-ocr (Ubuntu/Debian)\n \
sudo yum install tesseract (RHEL/CentOS)\n \
Windows: Download from https://github.com/UB-Mannheim/tesseract/wiki\n\n\
After installation, restart your terminal and try again.");
}
// Take full screen screenshot
let temp_path = format!("/tmp/g3_ocr_search_{}.png", uuid::Uuid::new_v4());
self.take_screenshot(&temp_path, None, None).await?;
// Use Tesseract to find text with bounding boxes
let tess = Tesseract::new(None, Some("eng"))
.map_err(|e| {
anyhow::anyhow!("Failed to initialize Tesseract: {}\n\n\
This usually means:\n1. Tesseract is not properly installed\n\
2. Language data files are missing\n\nTo fix:\n \
macOS: brew reinstall tesseract\n \
Linux: sudo apt-get install tesseract-ocr-eng\n \
Windows: Reinstall tesseract and ensure language files are included", e)
})?;
let full_text = tess.set_image(temp_path.as_str())
.map_err(|e| anyhow::anyhow!("Failed to load screenshot: {}", e))?
.get_text()
.map_err(|e| anyhow::anyhow!("Failed to extract text from screen: {}", e))?;
// Clean up temp file
let _ = std::fs::remove_file(&temp_path);
// Simple text search - full implementation would use get_component_images
// to get bounding boxes for each word
if full_text.contains(_text) {
tracing::warn!("Text found but precise coordinates not available in simplified implementation");
Ok(Some(Point { x: 0, y: 0 }))
} else {
Ok(None)
}
}
}

View File

@@ -0,0 +1,425 @@
use crate::{ComputerController, types::*};
use anyhow::Result;
use async_trait::async_trait;
use core_graphics::display::CGPoint;
use core_graphics::event::{CGEvent, CGEventType, CGMouseButton, CGEventTapLocation};
use core_graphics::event_source::{CGEventSource, CGEventSourceStateID};
use std::path::Path;
use tesseract::Tesseract;
// MacOSController doesn't store CGEventSource to avoid Send/Sync issues
// We create it fresh for each operation
pub struct MacOSController {
// Empty struct - event source created per operation
}
impl MacOSController {
pub fn new() -> Result<Self> {
// Test that we can create an event source
let _event_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState)
.map_err(|_| anyhow::anyhow!("Failed to create event source. Make sure Accessibility permissions are granted."))?;
Ok(Self {})
}
fn key_to_keycode(&self, key: &str) -> Result<u16> {
// Map key names to macOS keycodes
let keycode = match key.to_lowercase().as_str() {
"return" | "enter" => 36,
"tab" => 48,
"space" => 49,
"delete" | "backspace" => 51,
"escape" | "esc" => 53,
"command" | "cmd" => 55,
"shift" => 56,
"capslock" => 57,
"option" | "alt" => 58,
"control" | "ctrl" => 59,
"left" => 123,
"right" => 124,
"down" => 125,
"up" => 126,
_ => anyhow::bail!("Unknown key: {}", key),
};
Ok(keycode)
}
}
#[async_trait]
impl ComputerController for MacOSController {
async fn move_mouse(&self, x: i32, y: i32) -> Result<()> {
let event_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState)
.map_err(|_| anyhow::anyhow!("Failed to create event source"))?;
let point = CGPoint::new(x as f64, y as f64);
let event = CGEvent::new_mouse_event(
event_source,
CGEventType::MouseMoved,
point,
CGMouseButton::Left,
).map_err(|_| anyhow::anyhow!("Failed to create mouse move event"))?;
event.post(CGEventTapLocation::HID);
Ok(())
}
async fn click(&self, button: MouseButton) -> Result<()> {
let (cg_button, down_type, up_type) = match button {
MouseButton::Left => (CGMouseButton::Left, CGEventType::LeftMouseDown, CGEventType::LeftMouseUp),
MouseButton::Right => (CGMouseButton::Right, CGEventType::RightMouseDown, CGEventType::RightMouseUp),
MouseButton::Middle => (CGMouseButton::Center, CGEventType::OtherMouseDown, CGEventType::OtherMouseUp),
};
let point = {
// Get current mouse position
let temp_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState)
.map_err(|_| anyhow::anyhow!("Failed to create event source"))?;
let event = CGEvent::new(temp_source)
.map_err(|_| anyhow::anyhow!("Failed to get mouse position"))?;
let p = event.location();
p
};
{
let event_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState)
.map_err(|_| anyhow::anyhow!("Failed to create event source"))?;
// Mouse down
let down_event = CGEvent::new_mouse_event(
event_source,
down_type,
point,
cg_button,
).map_err(|_| anyhow::anyhow!("Failed to create mouse down event"))?;
down_event.post(CGEventTapLocation::HID);
} // event_source and down_event dropped here
// Small delay
tokio::time::sleep(tokio::time::Duration::from_millis(50)).await;
{
let event_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState)
.map_err(|_| anyhow::anyhow!("Failed to create event source"))?;
let up_event = CGEvent::new_mouse_event(
event_source,
up_type,
point,
cg_button,
).map_err(|_| anyhow::anyhow!("Failed to create mouse up event"))?;
up_event.post(CGEventTapLocation::HID);
} // event_source and up_event dropped here
Ok(())
}
async fn double_click(&self, button: MouseButton) -> Result<()> {
self.click(button).await?;
tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
self.click(button).await?;
Ok(())
}
async fn type_text(&self, text: &str) -> Result<()> {
for ch in text.chars() {
{
let event_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState)
.map_err(|_| anyhow::anyhow!("Failed to create event source"))?;
// Create keyboard event for character
let event = CGEvent::new_keyboard_event(
event_source,
0, // keycode (0 for unicode)
true,
).map_err(|_| anyhow::anyhow!("Failed to create keyboard event"))?;
// Set unicode string
let mut utf16_buf = [0u16; 2];
let utf16_slice = ch.encode_utf16(&mut utf16_buf);
let utf16_chars: Vec<u16> = utf16_slice.iter().copied().collect();
event.set_string_from_utf16_unchecked(utf16_chars.as_slice());
event.post(CGEventTapLocation::HID);
} // event_source and event dropped here
tokio::time::sleep(tokio::time::Duration::from_millis(10)).await;
}
Ok(())
}
async fn press_key(&self, key: &str) -> Result<()> {
let keycode = self.key_to_keycode(key)?;
{
let event_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState)
.map_err(|_| anyhow::anyhow!("Failed to create event source"))?;
// Key down
let down_event = CGEvent::new_keyboard_event(
event_source,
keycode,
true,
).map_err(|_| anyhow::anyhow!("Failed to create key down event"))?;
down_event.post(CGEventTapLocation::HID);
} // event_source and down_event dropped here
tokio::time::sleep(tokio::time::Duration::from_millis(50)).await;
{
let event_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState)
.map_err(|_| anyhow::anyhow!("Failed to create event source"))?;
// Key up
let up_event = CGEvent::new_keyboard_event(
event_source,
keycode,
false,
).map_err(|_| anyhow::anyhow!("Failed to create key up event"))?;
up_event.post(CGEventTapLocation::HID);
} // event_source and up_event dropped here
Ok(())
}
async fn list_windows(&self) -> Result<Vec<Window>> {
// Note: Full implementation would use CGWindowListCopyWindowInfo
// For now, return empty list as this requires more complex FFI
tracing::warn!("list_windows not fully implemented on macOS");
Ok(vec![])
}
async fn focus_window(&self, _window_id: &str) -> Result<()> {
// Note: Full implementation would use NSWorkspace to activate application
tracing::warn!("focus_window not fully implemented on macOS");
Ok(())
}
async fn get_window_bounds(&self, _window_id: &str) -> Result<Rect> {
// Note: Full implementation would use Accessibility API
tracing::warn!("get_window_bounds not fully implemented on macOS");
Ok(Rect { x: 0, y: 0, width: 800, height: 600 })
}
async fn find_element(&self, _selector: &ElementSelector) -> Result<Option<UIElement>> {
// Note: Full implementation would use macOS Accessibility API
tracing::warn!("find_element not fully implemented on macOS");
Ok(None)
}
async fn get_element_text(&self, _element_id: &str) -> Result<String> {
// Note: Full implementation would use Accessibility API
tracing::warn!("get_element_text not fully implemented on macOS");
Ok(String::new())
}
async fn get_element_bounds(&self, _element_id: &str) -> Result<Rect> {
// Note: Full implementation would use Accessibility API
tracing::warn!("get_element_bounds not fully implemented on macOS");
Ok(Rect { x: 0, y: 0, width: 100, height: 30 })
}
async fn take_screenshot(&self, path: &str, _region: Option<Rect>, window_id: Option<&str>) -> Result<()> {
// Use native macOS screencapture command which handles all the format complexities
// Check if we have Screen Recording permission by attempting a test capture
// If we only get wallpaper/menubar but no windows, we need permission
let needs_permission_check = std::env::var("G3_SKIP_PERMISSION_CHECK").is_err();
if needs_permission_check {
// Try to open Screen Recording settings if this is the first screenshot
static PERMISSION_PROMPTED: std::sync::atomic::AtomicBool = std::sync::atomic::AtomicBool::new(false);
if !PERMISSION_PROMPTED.swap(true, std::sync::atomic::Ordering::Relaxed) {
tracing::warn!("\n=== Screen Recording Permission Required ===\n\
macOS requires explicit permission to capture window content.\n\
If screenshots only show wallpaper/menubar (no windows):\n\n\
1. Open System Settings > Privacy & Security > Screen Recording\n\
2. Enable permission for your terminal (iTerm/Terminal) or g3\n\
3. Restart your terminal if needed\n\n\
Opening Screen Recording settings now...\n");
// Try to open the settings (non-blocking)
let _ = std::process::Command::new("open")
.arg("x-apple.systempreferences:com.apple.preference.security?Privacy_ScreenCapture")
.spawn();
}
}
let path_obj = Path::new(path);
if let Some(parent) = path_obj.parent() {
std::fs::create_dir_all(parent)?;
}
let mut cmd = std::process::Command::new("screencapture");
// Add flags
cmd.arg("-x"); // No sound
if let Some(window_id) = window_id {
// Capture specific window by getting its bounds and using region capture
// window_id format: "AppName" or "AppName:WindowTitle"
let app_name = window_id.split(':').next().unwrap_or(window_id);
// Use AppleScript to get window bounds
let script = format!(
r#"tell application "{}"
tell current window
get bounds
end tell
end tell"#,
app_name
);
let output = std::process::Command::new("osascript")
.arg("-e")
.arg(&script)
.output()
.map_err(|e| anyhow::anyhow!("Failed to get window bounds: {}", e))?;
if output.status.success() {
let bounds_str = String::from_utf8_lossy(&output.stdout);
let bounds: Vec<i32> = bounds_str
.trim()
.split(',')
.filter_map(|s| s.trim().parse().ok())
.collect();
if bounds.len() == 4 {
let (left, top, right, bottom) = (bounds[0], bounds[1], bounds[2], bounds[3]);
let width = right - left;
let height = bottom - top;
cmd.arg("-R");
cmd.arg(format!("{},{},{},{}", left, top, width, height));
tracing::debug!("Capturing window '{}' at region: {},{} {}x{}", app_name, left, top, width, height);
} else {
tracing::warn!("Failed to parse window bounds, capturing full screen");
}
} else {
tracing::warn!("Failed to get window bounds for '{}', capturing full screen", app_name);
}
} else if let Some(region) = _region {
// Capture specific region: -R x,y,width,height
cmd.arg("-R");
cmd.arg(format!("{},{},{},{}", region.x, region.y, region.width, region.height));
}
cmd.arg(path);
let output = cmd.output()
.map_err(|e| anyhow::anyhow!("Failed to execute screencapture: {}", e))?;
if !output.status.success() {
let stderr = String::from_utf8_lossy(&output.stderr);
anyhow::bail!("screencapture failed: {}", stderr);
}
tracing::debug!("Screenshot saved using screencapture: {}", path);
Ok(())
}
}
async fn extract_text_from_screen(&self, region: Rect) -> Result<OCRResult> {
// Take screenshot of region first
let temp_path = format!("/tmp/g3_ocr_{}.png", uuid::Uuid::new_v4());
self.take_screenshot(&temp_path, Some(region), None).await?;
// Extract text from the screenshot
let result = self.extract_text_from_image(&temp_path).await?;
// Clean up temp file
let _ = std::fs::remove_file(&temp_path);
Ok(result)
}
async fn extract_text_from_image(&self, _path: &str) -> Result<OCRResult> {
// Check if tesseract is available on the system
let tesseract_check = std::process::Command::new("which")
.arg("tesseract")
.output();
if tesseract_check.is_err() || !tesseract_check.as_ref().unwrap().status.success() {
anyhow::bail!("Tesseract OCR is not installed on your system.\n\n\
To install tesseract:\n macOS: brew install tesseract\n \
Linux: sudo apt-get install tesseract-ocr (Ubuntu/Debian)\n \
sudo yum install tesseract (RHEL/CentOS)\n \
Windows: Download from https://github.com/UB-Mannheim/tesseract/wiki\n\n\
After installation, restart your terminal and try again.");
}
// Initialize Tesseract
let tess = Tesseract::new(None, Some("eng"))
.map_err(|e| {
anyhow::anyhow!("Failed to initialize Tesseract: {}\n\n\
This usually means:\n1. Tesseract is not properly installed\n\
2. Language data files are missing\n\nTo fix:\n \
macOS: brew reinstall tesseract\n \
Linux: sudo apt-get install tesseract-ocr-eng\n \
Windows: Reinstall tesseract and ensure language files are included", e)
})?;
let text = tess.set_image(_path)
.map_err(|e| anyhow::anyhow!("Failed to load image '{}': {}", _path, e))?
.get_text()
.map_err(|e| anyhow::anyhow!("Failed to extract text from image: {}", e))?;
// Get confidence (simplified - would need more complex API calls for per-word confidence)
let confidence = 0.85; // Placeholder
Ok(OCRResult {
text,
confidence,
bounds: Rect { x: 0, y: 0, width: 0, height: 0 }, // Would need image dimensions
})
}
async fn find_text_on_screen(&self, _text: &str) -> Result<Option<Point>> {
// Check if tesseract is available on the system
let tesseract_check = std::process::Command::new("which")
.arg("tesseract")
.output();
if tesseract_check.is_err() || !tesseract_check.as_ref().unwrap().status.success() {
anyhow::bail!("Tesseract OCR is not installed on your system.\n\n\
To install tesseract:\n macOS: brew install tesseract\n \
Linux: sudo apt-get install tesseract-ocr (Ubuntu/Debian)\n \
sudo yum install tesseract (RHEL/CentOS)\n \
Windows: Download from https://github.com/UB-Mannheim/tesseract/wiki\n\n\
After installation, restart your terminal and try again.");
}
// Take full screen screenshot
let temp_path = format!("/tmp/g3_ocr_search_{}.png", uuid::Uuid::new_v4());
self.take_screenshot(&temp_path, None, None).await?;
// Use Tesseract to find text with bounding boxes
let tess = Tesseract::new(None, Some("eng"))
.map_err(|e| {
anyhow::anyhow!("Failed to initialize Tesseract: {}\n\n\
This usually means:\n1. Tesseract is not properly installed\n\
2. Language data files are missing\n\nTo fix:\n \
macOS: brew reinstall tesseract\n \
Linux: sudo apt-get install tesseract-ocr-eng\n \
Windows: Reinstall tesseract and ensure language files are included", e)
})?;
let full_text = tess.set_image(temp_path.as_str())
.map_err(|e| anyhow::anyhow!("Failed to load screenshot: {}", e))?
.get_text()
.map_err(|e| anyhow::anyhow!("Failed to extract text from screen: {}", e))?;
// Clean up temp file
let _ = std::fs::remove_file(&temp_path);
// Simple text search - full implementation would use get_component_images
// to get bounding boxes for each word
if full_text.contains(_text) {
tracing::warn!("Text found but precise coordinates not available in simplified implementation");
Ok(Some(Point { x: 0, y: 0 }))
} else {
Ok(None)
}
}
}

View File

@@ -0,0 +1,8 @@
#[cfg(target_os = "macos")]
pub mod macos;
#[cfg(target_os = "linux")]
pub mod linux;
#[cfg(target_os = "windows")]
pub mod windows;

View File

@@ -0,0 +1,162 @@
use crate::{ComputerController, types::*};
use anyhow::Result;
use async_trait::async_trait;
use tesseract::Tesseract;
use uuid::Uuid;
pub struct WindowsController {
// Placeholder for Windows-specific state
}
impl WindowsController {
pub fn new() -> Result<Self> {
tracing::warn!("Windows computer control not fully implemented");
Ok(Self {})
}
}
#[async_trait]
impl ComputerController for WindowsController {
async fn move_mouse(&self, _x: i32, _y: i32) -> Result<()> {
anyhow::bail!("Windows implementation not yet available")
}
async fn click(&self, _button: MouseButton) -> Result<()> {
anyhow::bail!("Windows implementation not yet available")
}
async fn double_click(&self, _button: MouseButton) -> Result<()> {
anyhow::bail!("Windows implementation not yet available")
}
async fn type_text(&self, _text: &str) -> Result<()> {
anyhow::bail!("Windows implementation not yet available")
}
async fn press_key(&self, _key: &str) -> Result<()> {
anyhow::bail!("Windows implementation not yet available")
}
async fn list_windows(&self) -> Result<Vec<Window>> {
anyhow::bail!("Windows implementation not yet available")
}
async fn focus_window(&self, _window_id: &str) -> Result<()> {
anyhow::bail!("Windows implementation not yet available")
}
async fn get_window_bounds(&self, _window_id: &str) -> Result<Rect> {
anyhow::bail!("Windows implementation not yet available")
}
async fn find_element(&self, _selector: &ElementSelector) -> Result<Option<UIElement>> {
anyhow::bail!("Windows implementation not yet available")
}
async fn get_element_text(&self, _element_id: &str) -> Result<String> {
anyhow::bail!("Windows implementation not yet available")
}
async fn get_element_bounds(&self, _element_id: &str) -> Result<Rect> {
anyhow::bail!("Windows implementation not yet available")
}
async fn take_screenshot(&self, _path: &str, _region: Option<Rect>, _window_id: Option<&str>) -> Result<()> {
anyhow::bail!("Windows implementation not yet available")
}
async fn extract_text_from_screen(&self, _region: Rect) -> Result<OCRResult> {
anyhow::bail!("Windows implementation not yet available")
}
async fn extract_text_from_image(&self, _path: &str) -> Result<OCRResult> {
// Check if tesseract is available on the system
let tesseract_check = std::process::Command::new("where")
.arg("tesseract")
.output();
if tesseract_check.is_err() || !tesseract_check.as_ref().unwrap().status.success() {
anyhow::bail!("Tesseract OCR is not installed on your system.\n\n\
To install tesseract on Windows:\n \
1. Download the installer from: https://github.com/UB-Mannheim/tesseract/wiki\n \
2. Run the installer and follow the instructions\n \
3. Add tesseract to your PATH environment variable\n \
4. Restart your terminal/command prompt\n\n\
After installation, restart your terminal and try again.");
}
// Initialize Tesseract
let tess = Tesseract::new(None, Some("eng"))
.map_err(|e| {
anyhow::anyhow!("Failed to initialize Tesseract: {}\n\n\
This usually means:\n1. Tesseract is not properly installed\n\
2. Language data files are missing\n\nTo fix:\n \
1. Reinstall tesseract from https://github.com/UB-Mannheim/tesseract/wiki\n \
2. Make sure to select 'Additional language data' during installation\n \
3. Ensure tesseract is in your PATH", e)
})?;
let text = tess.set_image(_path)
.map_err(|e| anyhow::anyhow!("Failed to load image '{}': {}", _path, e))?
.get_text()
.map_err(|e| anyhow::anyhow!("Failed to extract text from image: {}", e))?;
// Get confidence (simplified - would need more complex API calls for per-word confidence)
let confidence = 0.85; // Placeholder
Ok(OCRResult {
text,
confidence,
bounds: Rect { x: 0, y: 0, width: 0, height: 0 }, // Would need image dimensions
})
}
async fn find_text_on_screen(&self, _text: &str) -> Result<Option<Point>> {
// Check if tesseract is available on the system
let tesseract_check = std::process::Command::new("where")
.arg("tesseract")
.output();
if tesseract_check.is_err() || !tesseract_check.as_ref().unwrap().status.success() {
anyhow::bail!("Tesseract OCR is not installed on your system.\n\n\
To install tesseract on Windows:\n \
1. Download the installer from: https://github.com/UB-Mannheim/tesseract/wiki\n \
2. Run the installer and follow the instructions\n \
3. Add tesseract to your PATH environment variable\n \
4. Restart your terminal/command prompt\n\n\
After installation, restart your terminal and try again.");
}
// Take full screen screenshot
let temp_path = format!("C:\\\\Temp\\\\g3_ocr_search_{}.png", uuid::Uuid::new_v4());
self.take_screenshot(&temp_path, None, None).await?;
// Use Tesseract to find text with bounding boxes
let tess = Tesseract::new(None, Some("eng"))
.map_err(|e| {
anyhow::anyhow!("Failed to initialize Tesseract: {}\n\n\
This usually means:\n1. Tesseract is not properly installed\n\
2. Language data files are missing\n\nTo fix:\n \
1. Reinstall tesseract from https://github.com/UB-Mannheim/tesseract/wiki\n \
2. Make sure to select 'Additional language data' during installation\n \
3. Ensure tesseract is in your PATH", e)
})?;
let full_text = tess.set_image(temp_path.as_str())
.map_err(|e| anyhow::anyhow!("Failed to load screenshot: {}", e))?
.get_text()
.map_err(|e| anyhow::anyhow!("Failed to extract text from screen: {}", e))?;
// Clean up temp file
let _ = std::fs::remove_file(&temp_path);
// Simple text search - full implementation would use get_component_images
// to get bounding boxes for each word
if full_text.contains(_text) {
tracing::warn!("Text found but precise coordinates not available in simplified implementation");
Ok(Some(Point { x: 0, y: 0 }))
} else {
Ok(None)
}
}
}

View File

@@ -0,0 +1,65 @@
use serde::{Deserialize, Serialize};
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub struct Point {
pub x: i32,
pub y: i32,
}
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub struct Rect {
pub x: i32,
pub y: i32,
pub width: i32,
pub height: i32,
}
impl Rect {
pub fn center(&self) -> Point {
Point {
x: self.x + self.width / 2,
y: self.y + self.height / 2,
}
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Window {
pub id: String,
pub title: String,
pub app_name: String,
pub bounds: Rect,
pub is_active: bool,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct UIElement {
pub id: String,
pub text: String,
pub role: String,
pub bounds: Rect,
pub enabled: bool,
pub visible: bool,
pub value: Option<String>,
}
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub enum MouseButton {
Left,
Right,
Middle,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ElementSelector {
pub text: Option<String>,
pub role: Option<String>,
pub window_id: Option<String>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OCRResult {
pub text: String,
pub confidence: f32,
pub bounds: Rect,
}