webdriver tools
This commit is contained in:
@@ -1,5 +1,9 @@
|
||||
pub mod types;
|
||||
pub mod platform;
|
||||
pub mod webdriver;
|
||||
|
||||
// Re-export webdriver types for convenience
|
||||
pub use webdriver::{WebDriverController, WebElement, safari::SafariDriver};
|
||||
|
||||
use anyhow::Result;
|
||||
use async_trait::async_trait;
|
||||
@@ -7,32 +11,12 @@ use types::*;
|
||||
|
||||
#[async_trait]
|
||||
pub trait ComputerController: Send + Sync {
|
||||
// Mouse operations
|
||||
async fn move_mouse(&self, x: i32, y: i32) -> Result<()>;
|
||||
async fn click(&self, button: MouseButton) -> Result<()>;
|
||||
async fn double_click(&self, button: MouseButton) -> Result<()>;
|
||||
|
||||
// Keyboard operations
|
||||
async fn type_text(&self, text: &str) -> Result<()>;
|
||||
async fn press_key(&self, key: &str) -> Result<()>;
|
||||
|
||||
// Window management
|
||||
async fn list_windows(&self) -> Result<Vec<Window>>;
|
||||
async fn focus_window(&self, window_id: &str) -> Result<()>;
|
||||
async fn get_window_bounds(&self, window_id: &str) -> Result<Rect>;
|
||||
|
||||
// UI element inspection
|
||||
async fn find_element(&self, selector: &ElementSelector) -> Result<Option<UIElement>>;
|
||||
async fn get_element_text(&self, element_id: &str) -> Result<String>;
|
||||
async fn get_element_bounds(&self, element_id: &str) -> Result<Rect>;
|
||||
|
||||
// Screen capture
|
||||
async fn take_screenshot(&self, path: &str, region: Option<Rect>, window_id: Option<&str>) -> Result<()>;
|
||||
|
||||
// OCR operations
|
||||
async fn extract_text_from_screen(&self, region: Rect) -> Result<OCRResult>;
|
||||
async fn extract_text_from_image(&self, path: &str) -> Result<OCRResult>;
|
||||
async fn find_text_on_screen(&self, text: &str) -> Result<Option<Point>>;
|
||||
async fn extract_text_from_screen(&self, region: Rect) -> Result<String>;
|
||||
async fn extract_text_from_image(&self, path: &str) -> Result<String>;
|
||||
}
|
||||
|
||||
// Platform-specific constructor
|
||||
|
||||
@@ -1,310 +1,21 @@
|
||||
use crate::{ComputerController, types::*};
|
||||
use crate::{ComputerController, types::Rect};
|
||||
use anyhow::Result;
|
||||
use async_trait::async_trait;
|
||||
use core_graphics::display::CGPoint;
|
||||
use core_graphics::event::{CGEvent, CGEventType, CGMouseButton, CGEventTapLocation};
|
||||
use core_graphics::event_source::{CGEventSource, CGEventSourceStateID};
|
||||
use std::path::Path;
|
||||
use tesseract::Tesseract;
|
||||
use core_graphics::window::{kCGWindowListOptionOnScreenOnly, kCGNullWindowID, CGWindowListCopyWindowInfo};
|
||||
use core_foundation::dictionary::CFDictionary;
|
||||
use core_foundation::string::CFString;
|
||||
use core_foundation::base::{TCFType, ToVoid};
|
||||
|
||||
// MacOSController doesn't store CGEventSource to avoid Send/Sync issues
|
||||
// We create it fresh for each operation
|
||||
pub struct MacOSController {
|
||||
// Empty struct - event source created per operation
|
||||
// Empty struct for now
|
||||
}
|
||||
|
||||
impl MacOSController {
|
||||
pub fn new() -> Result<Self> {
|
||||
// Test that we can create an event source
|
||||
let _event_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState)
|
||||
.map_err(|_| anyhow::anyhow!("Failed to create event source. Make sure Accessibility permissions are granted."))?;
|
||||
Ok(Self {})
|
||||
}
|
||||
|
||||
fn key_to_keycode(&self, key: &str) -> Result<u16> {
|
||||
// Map key names to macOS keycodes
|
||||
let keycode = match key.to_lowercase().as_str() {
|
||||
"return" | "enter" => 36,
|
||||
"tab" => 48,
|
||||
"space" => 49,
|
||||
"delete" | "backspace" => 51,
|
||||
"escape" | "esc" => 53,
|
||||
"command" | "cmd" => 55,
|
||||
"shift" => 56,
|
||||
"capslock" => 57,
|
||||
"option" | "alt" => 58,
|
||||
"control" | "ctrl" => 59,
|
||||
"left" => 123,
|
||||
"right" => 124,
|
||||
"down" => 125,
|
||||
"up" => 126,
|
||||
_ => anyhow::bail!("Unknown key: {}", key),
|
||||
};
|
||||
Ok(keycode)
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl ComputerController for MacOSController {
|
||||
async fn move_mouse(&self, x: i32, y: i32) -> Result<()> {
|
||||
let event_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState)
|
||||
.map_err(|_| anyhow::anyhow!("Failed to create event source"))?;
|
||||
let point = CGPoint::new(x as f64, y as f64);
|
||||
let event = CGEvent::new_mouse_event(
|
||||
event_source,
|
||||
CGEventType::MouseMoved,
|
||||
point,
|
||||
CGMouseButton::Left,
|
||||
).map_err(|_| anyhow::anyhow!("Failed to create mouse move event"))?;
|
||||
|
||||
event.post(CGEventTapLocation::HID);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn click(&self, button: MouseButton) -> Result<()> {
|
||||
let (cg_button, down_type, up_type) = match button {
|
||||
MouseButton::Left => (CGMouseButton::Left, CGEventType::LeftMouseDown, CGEventType::LeftMouseUp),
|
||||
MouseButton::Right => (CGMouseButton::Right, CGEventType::RightMouseDown, CGEventType::RightMouseUp),
|
||||
MouseButton::Middle => (CGMouseButton::Center, CGEventType::OtherMouseDown, CGEventType::OtherMouseUp),
|
||||
};
|
||||
|
||||
let point = {
|
||||
// Get current mouse position
|
||||
let temp_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState)
|
||||
.map_err(|_| anyhow::anyhow!("Failed to create event source"))?;
|
||||
let event = CGEvent::new(temp_source)
|
||||
.map_err(|_| anyhow::anyhow!("Failed to get mouse position"))?;
|
||||
let p = event.location();
|
||||
p
|
||||
};
|
||||
|
||||
{
|
||||
let event_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState)
|
||||
.map_err(|_| anyhow::anyhow!("Failed to create event source"))?;
|
||||
|
||||
// Mouse down
|
||||
let down_event = CGEvent::new_mouse_event(
|
||||
event_source,
|
||||
down_type,
|
||||
point,
|
||||
cg_button,
|
||||
).map_err(|_| anyhow::anyhow!("Failed to create mouse down event"))?;
|
||||
down_event.post(CGEventTapLocation::HID);
|
||||
} // event_source and down_event dropped here
|
||||
|
||||
// Small delay
|
||||
tokio::time::sleep(tokio::time::Duration::from_millis(50)).await;
|
||||
|
||||
{
|
||||
let event_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState)
|
||||
.map_err(|_| anyhow::anyhow!("Failed to create event source"))?;
|
||||
|
||||
let up_event = CGEvent::new_mouse_event(
|
||||
event_source,
|
||||
up_type,
|
||||
point,
|
||||
cg_button,
|
||||
).map_err(|_| anyhow::anyhow!("Failed to create mouse up event"))?;
|
||||
up_event.post(CGEventTapLocation::HID);
|
||||
} // event_source and up_event dropped here
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn double_click(&self, button: MouseButton) -> Result<()> {
|
||||
self.click(button).await?;
|
||||
tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
|
||||
self.click(button).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn type_text(&self, text: &str) -> Result<()> {
|
||||
for ch in text.chars() {
|
||||
{
|
||||
let event_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState)
|
||||
.map_err(|_| anyhow::anyhow!("Failed to create event source"))?;
|
||||
|
||||
// Create keyboard event for character
|
||||
let event = CGEvent::new_keyboard_event(
|
||||
event_source,
|
||||
0, // keycode (0 for unicode)
|
||||
true,
|
||||
).map_err(|_| anyhow::anyhow!("Failed to create keyboard event"))?;
|
||||
|
||||
// Set unicode string
|
||||
let mut utf16_buf = [0u16; 2];
|
||||
let utf16_slice = ch.encode_utf16(&mut utf16_buf);
|
||||
let utf16_chars: Vec<u16> = utf16_slice.iter().copied().collect();
|
||||
|
||||
event.set_string_from_utf16_unchecked(utf16_chars.as_slice());
|
||||
event.post(CGEventTapLocation::HID);
|
||||
} // event_source and event dropped here
|
||||
|
||||
tokio::time::sleep(tokio::time::Duration::from_millis(10)).await;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn press_key(&self, key: &str) -> Result<()> {
|
||||
let keycode = self.key_to_keycode(key)?;
|
||||
|
||||
{
|
||||
let event_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState)
|
||||
.map_err(|_| anyhow::anyhow!("Failed to create event source"))?;
|
||||
|
||||
// Key down
|
||||
let down_event = CGEvent::new_keyboard_event(
|
||||
event_source,
|
||||
keycode,
|
||||
true,
|
||||
).map_err(|_| anyhow::anyhow!("Failed to create key down event"))?;
|
||||
down_event.post(CGEventTapLocation::HID);
|
||||
} // event_source and down_event dropped here
|
||||
|
||||
tokio::time::sleep(tokio::time::Duration::from_millis(50)).await;
|
||||
|
||||
{
|
||||
let event_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState)
|
||||
.map_err(|_| anyhow::anyhow!("Failed to create event source"))?;
|
||||
|
||||
// Key up
|
||||
let up_event = CGEvent::new_keyboard_event(
|
||||
event_source,
|
||||
keycode,
|
||||
false,
|
||||
).map_err(|_| anyhow::anyhow!("Failed to create key up event"))?;
|
||||
up_event.post(CGEventTapLocation::HID);
|
||||
} // event_source and up_event dropped here
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn list_windows(&self) -> Result<Vec<Window>> {
|
||||
let mut windows = Vec::new();
|
||||
|
||||
unsafe {
|
||||
let window_list = CGWindowListCopyWindowInfo(
|
||||
kCGWindowListOptionOnScreenOnly,
|
||||
kCGNullWindowID
|
||||
);
|
||||
|
||||
let array = core_foundation::array::CFArray::<CFDictionary>::wrap_under_create_rule(window_list);
|
||||
let count = array.len();
|
||||
|
||||
for i in 0..count {
|
||||
let dict = array.get(i).unwrap();
|
||||
|
||||
// Get window ID
|
||||
let window_id_key = CFString::from_static_string("kCGWindowNumber");
|
||||
let window_id: i64 = if let Some(value) = dict.find(window_id_key.to_void()) {
|
||||
let num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*value as *const _);
|
||||
num.to_i64().unwrap_or(0)
|
||||
} else {
|
||||
0
|
||||
};
|
||||
|
||||
// Get owner name (app name)
|
||||
let owner_key = CFString::from_static_string("kCGWindowOwnerName");
|
||||
let app_name: String = if let Some(value) = dict.find(owner_key.to_void()) {
|
||||
let s: CFString = TCFType::wrap_under_get_rule(*value as *const _);
|
||||
s.to_string()
|
||||
} else {
|
||||
"Unknown".to_string()
|
||||
};
|
||||
|
||||
// Get window name/title
|
||||
let name_key = CFString::from_static_string("kCGWindowName");
|
||||
let title: String = if let Some(value) = dict.find(name_key.to_void()) {
|
||||
let s: CFString = TCFType::wrap_under_get_rule(*value as *const _);
|
||||
s.to_string()
|
||||
} else {
|
||||
"".to_string()
|
||||
};
|
||||
|
||||
// Get window bounds
|
||||
let bounds_key = CFString::from_static_string("kCGWindowBounds");
|
||||
let bounds = if let Some(bounds_value) = dict.find(bounds_key.to_void()) {
|
||||
let bounds_dict: CFDictionary = TCFType::wrap_under_get_rule(*bounds_value as *const _);
|
||||
|
||||
let x_key = CFString::from_static_string("X");
|
||||
let y_key = CFString::from_static_string("Y");
|
||||
let width_key = CFString::from_static_string("Width");
|
||||
let height_key = CFString::from_static_string("Height");
|
||||
|
||||
let x = if let Some(x_value) = bounds_dict.find(x_key.to_void()) {
|
||||
let num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*x_value as *const _);
|
||||
num.to_i32().unwrap_or(0)
|
||||
} else { 0 };
|
||||
let y = if let Some(y_value) = bounds_dict.find(y_key.to_void()) {
|
||||
let num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*y_value as *const _);
|
||||
num.to_i32().unwrap_or(0)
|
||||
} else { 0 };
|
||||
let width = if let Some(width_value) = bounds_dict.find(width_key.to_void()) {
|
||||
let num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*width_value as *const _);
|
||||
num.to_i32().unwrap_or(0)
|
||||
} else { 0 };
|
||||
let height = if let Some(height_value) = bounds_dict.find(height_key.to_void()) {
|
||||
let num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*height_value as *const _);
|
||||
num.to_i32().unwrap_or(0)
|
||||
} else { 0 };
|
||||
|
||||
Rect { x, y, width, height }
|
||||
} else {
|
||||
Rect { x: 0, y: 0, width: 0, height: 0 }
|
||||
};
|
||||
|
||||
// Skip windows without meaningful content (system UI elements, etc.)
|
||||
if app_name.is_empty() || (title.is_empty() && bounds.width < 100) {
|
||||
continue;
|
||||
}
|
||||
|
||||
windows.push(Window {
|
||||
id: format!("{}:{}", app_name, window_id),
|
||||
title,
|
||||
app_name,
|
||||
bounds,
|
||||
is_active: false, // We'd need additional API calls to determine this
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
Ok(windows)
|
||||
}
|
||||
|
||||
async fn focus_window(&self, _window_id: &str) -> Result<()> {
|
||||
// Note: Full implementation would use NSWorkspace to activate application
|
||||
tracing::warn!("focus_window not fully implemented on macOS");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn get_window_bounds(&self, _window_id: &str) -> Result<Rect> {
|
||||
// Note: Full implementation would use Accessibility API
|
||||
tracing::warn!("get_window_bounds not fully implemented on macOS");
|
||||
Ok(Rect { x: 0, y: 0, width: 800, height: 600 })
|
||||
}
|
||||
|
||||
async fn find_element(&self, _selector: &ElementSelector) -> Result<Option<UIElement>> {
|
||||
// Note: Full implementation would use macOS Accessibility API
|
||||
tracing::warn!("find_element not fully implemented on macOS");
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
async fn get_element_text(&self, _element_id: &str) -> Result<String> {
|
||||
// Note: Full implementation would use Accessibility API
|
||||
tracing::warn!("get_element_text not fully implemented on macOS");
|
||||
Ok(String::new())
|
||||
}
|
||||
|
||||
async fn get_element_bounds(&self, _element_id: &str) -> Result<Rect> {
|
||||
// Note: Full implementation would use Accessibility API
|
||||
tracing::warn!("get_element_bounds not fully implemented on macOS");
|
||||
Ok(Rect { x: 0, y: 0, width: 100, height: 30 })
|
||||
}
|
||||
|
||||
async fn take_screenshot(&self, path: &str, region: Option<Rect>, window_id: Option<&str>) -> Result<()> {
|
||||
// Determine the temporary directory for screenshots
|
||||
let temp_dir = std::env::var("TMPDIR")
|
||||
@@ -321,92 +32,6 @@ impl ComputerController for MacOSController {
|
||||
format!("{}/{}", temp_dir.trim_end_matches('/'), path)
|
||||
};
|
||||
|
||||
// Get the currently focused application before taking screenshot
|
||||
let current_app = std::process::Command::new("osascript")
|
||||
.arg("-e")
|
||||
.arg("tell application \"System Events\" to get name of first application process whose frontmost is true")
|
||||
.output()
|
||||
.ok()
|
||||
.and_then(|output| {
|
||||
if output.status.success() {
|
||||
Some(String::from_utf8_lossy(&output.stdout).trim().to_string())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
});
|
||||
|
||||
// Handle application-based window capture
|
||||
let app_name_opt = window_id.and_then(|id| {
|
||||
// Extract app name from window_id format "AppName:WindowNumber"
|
||||
id.split(':').next().map(String::from)
|
||||
});
|
||||
|
||||
// If we're capturing a specific window, foreground it first
|
||||
if let Some(ref app) = app_name_opt {
|
||||
tracing::debug!("Foregrounding application: {}", app);
|
||||
let _ = std::process::Command::new("osascript")
|
||||
.arg("-e")
|
||||
.arg(format!("tell application \"{}\" to activate", app))
|
||||
.output();
|
||||
|
||||
// Give the window time to come to the front
|
||||
tokio::time::sleep(tokio::time::Duration::from_millis(500)).await;
|
||||
}
|
||||
|
||||
let screenshot_result = if let Some(ref app) = app_name_opt {
|
||||
// Use screencapture with AppleScript to get window ID
|
||||
let script = format!(
|
||||
r#"tell application "{}" to id of window 1"#,
|
||||
app
|
||||
);
|
||||
|
||||
let output = std::process::Command::new("osascript")
|
||||
.arg("-e")
|
||||
.arg(&script)
|
||||
.output()?;
|
||||
|
||||
if output.status.success() {
|
||||
let window_id_str = String::from_utf8_lossy(&output.stdout).trim().to_string();
|
||||
std::process::Command::new("screencapture")
|
||||
.arg(format!("-l{}", window_id_str))
|
||||
.arg("-o")
|
||||
.arg(&final_path)
|
||||
.output()
|
||||
} else {
|
||||
// Fallback to regular screenshot if we can't get window ID
|
||||
std::process::Command::new("screencapture")
|
||||
.arg("-x")
|
||||
.arg(&final_path)
|
||||
.output()
|
||||
}
|
||||
} else {
|
||||
// Regular screenshot (full screen or region)
|
||||
// Use native macOS screencapture command which handles all the format complexities
|
||||
|
||||
// Check if we have Screen Recording permission by attempting a test capture
|
||||
// If we only get wallpaper/menubar but no windows, we need permission
|
||||
let needs_permission_check = std::env::var("G3_SKIP_PERMISSION_CHECK").is_err();
|
||||
|
||||
if needs_permission_check {
|
||||
// Try to open Screen Recording settings if this is the first screenshot
|
||||
static PERMISSION_PROMPTED: std::sync::atomic::AtomicBool = std::sync::atomic::AtomicBool::new(false);
|
||||
|
||||
if !PERMISSION_PROMPTED.swap(true, std::sync::atomic::Ordering::Relaxed) {
|
||||
tracing::warn!("\n=== Screen Recording Permission Required ===\n\
|
||||
macOS requires explicit permission to capture window content.\n\
|
||||
If screenshots only show wallpaper/menubar (no windows):\n\n\
|
||||
1. Open System Settings > Privacy & Security > Screen Recording\n\
|
||||
2. Enable permission for your terminal (iTerm/Terminal) or g3\n\
|
||||
3. Restart your terminal if needed\n\n\
|
||||
Opening Screen Recording settings now...\n");
|
||||
|
||||
// Try to open the settings (non-blocking)
|
||||
let _ = std::process::Command::new("open")
|
||||
.arg("x-apple.systempreferences:com.apple.preference.security?Privacy_ScreenCapture")
|
||||
.spawn();
|
||||
}
|
||||
}
|
||||
|
||||
let path_obj = Path::new(&final_path);
|
||||
if let Some(parent) = path_obj.parent() {
|
||||
std::fs::create_dir_all(parent)?;
|
||||
@@ -423,41 +48,34 @@ impl ComputerController for MacOSController {
|
||||
cmd.arg(format!("{},{},{},{}", region.x, region.y, region.width, region.height));
|
||||
}
|
||||
|
||||
if let Some(app_name) = window_id {
|
||||
// Capture specific window by app name
|
||||
// Use AppleScript to get window ID
|
||||
let script = format!(r#"tell application "{}" to id of window 1"#, app_name);
|
||||
let output = std::process::Command::new("osascript")
|
||||
.arg("-e")
|
||||
.arg(&script)
|
||||
.output()?;
|
||||
|
||||
if output.status.success() {
|
||||
let window_id_str = String::from_utf8_lossy(&output.stdout).trim().to_string();
|
||||
cmd.arg(format!("-l{}", window_id_str));
|
||||
}
|
||||
}
|
||||
|
||||
cmd.arg(&final_path);
|
||||
|
||||
cmd.output()
|
||||
}?;
|
||||
let screenshot_result = cmd.output()?;
|
||||
|
||||
if !screenshot_result.status.success() {
|
||||
let stderr = String::from_utf8_lossy(&screenshot_result.stderr);
|
||||
return Err(anyhow::anyhow!("screencapture failed: {}", stderr));
|
||||
}
|
||||
|
||||
// Re-foreground the original application if we foregrounded a different window
|
||||
if let Some(ref target_app) = app_name_opt {
|
||||
if let Some(ref original_app) = current_app {
|
||||
// Only restore if we actually changed the foreground app
|
||||
if target_app != original_app {
|
||||
tracing::debug!("Restoring focus to original application: {}", original_app);
|
||||
|
||||
// Small delay to ensure screenshot is complete
|
||||
tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
|
||||
|
||||
let _ = std::process::Command::new("osascript")
|
||||
.arg("-e")
|
||||
.arg(format!("tell application \"{}\" to activate", original_app))
|
||||
.output();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tracing::debug!("Screenshot saved using screencapture: {}", final_path);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
async fn extract_text_from_screen(&self, region: Rect) -> Result<OCRResult> {
|
||||
async fn extract_text_from_screen(&self, region: Rect) -> Result<String> {
|
||||
// Take screenshot of region first
|
||||
let temp_path = format!("/tmp/g3_ocr_{}.png", uuid::Uuid::new_v4());
|
||||
self.take_screenshot(&temp_path, Some(region), None).await?;
|
||||
@@ -471,7 +89,7 @@ impl ComputerController for MacOSController {
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
async fn extract_text_from_image(&self, _path: &str) -> Result<OCRResult> {
|
||||
async fn extract_text_from_image(&self, path: &str) -> Result<String> {
|
||||
// Check if tesseract is available on the system
|
||||
let tesseract_check = std::process::Command::new("which")
|
||||
.arg("tesseract")
|
||||
@@ -497,66 +115,11 @@ impl ComputerController for MacOSController {
|
||||
Windows: Reinstall tesseract and ensure language files are included", e)
|
||||
})?;
|
||||
|
||||
let text = tess.set_image(_path)
|
||||
.map_err(|e| anyhow::anyhow!("Failed to load image '{}': {}", _path, e))?
|
||||
let text = tess.set_image(path)
|
||||
.map_err(|e| anyhow::anyhow!("Failed to load image '{}': {}", path, e))?
|
||||
.get_text()
|
||||
.map_err(|e| anyhow::anyhow!("Failed to extract text from image: {}", e))?;
|
||||
|
||||
// Get confidence (simplified - would need more complex API calls for per-word confidence)
|
||||
let confidence = 0.85; // Placeholder
|
||||
|
||||
Ok(OCRResult {
|
||||
text,
|
||||
confidence,
|
||||
bounds: Rect { x: 0, y: 0, width: 0, height: 0 }, // Would need image dimensions
|
||||
})
|
||||
Ok(text)
|
||||
}
|
||||
|
||||
async fn find_text_on_screen(&self, _text: &str) -> Result<Option<Point>> {
|
||||
// Check if tesseract is available on the system
|
||||
let tesseract_check = std::process::Command::new("which")
|
||||
.arg("tesseract")
|
||||
.output();
|
||||
|
||||
if tesseract_check.is_err() || !tesseract_check.as_ref().unwrap().status.success() {
|
||||
anyhow::bail!("Tesseract OCR is not installed on your system.\n\n\
|
||||
To install tesseract:\n macOS: brew install tesseract\n \
|
||||
Linux: sudo apt-get install tesseract-ocr (Ubuntu/Debian)\n \
|
||||
sudo yum install tesseract (RHEL/CentOS)\n \
|
||||
Windows: Download from https://github.com/UB-Mannheim/tesseract/wiki\n\n\
|
||||
After installation, restart your terminal and try again.");
|
||||
}
|
||||
|
||||
// Take full screen screenshot
|
||||
let temp_path = format!("/tmp/g3_ocr_search_{}.png", uuid::Uuid::new_v4());
|
||||
self.take_screenshot(&temp_path, None, None).await?;
|
||||
|
||||
// Use Tesseract to find text with bounding boxes
|
||||
let tess = Tesseract::new(None, Some("eng"))
|
||||
.map_err(|e| {
|
||||
anyhow::anyhow!("Failed to initialize Tesseract: {}\n\n\
|
||||
This usually means:\n1. Tesseract is not properly installed\n\
|
||||
2. Language data files are missing\n\nTo fix:\n \
|
||||
macOS: brew reinstall tesseract\n \
|
||||
Linux: sudo apt-get install tesseract-ocr-eng\n \
|
||||
Windows: Reinstall tesseract and ensure language files are included", e)
|
||||
})?;
|
||||
|
||||
let full_text = tess.set_image(temp_path.as_str())
|
||||
.map_err(|e| anyhow::anyhow!("Failed to load screenshot: {}", e))?
|
||||
.get_text()
|
||||
.map_err(|e| anyhow::anyhow!("Failed to extract text from screen: {}", e))?;
|
||||
|
||||
// Clean up temp file
|
||||
let _ = std::fs::remove_file(&temp_path);
|
||||
|
||||
// Simple text search - full implementation would use get_component_images
|
||||
// to get bounding boxes for each word
|
||||
if full_text.contains(_text) {
|
||||
tracing::warn!("Text found but precise coordinates not available in simplified implementation");
|
||||
Ok(Some(Point { x: 0, y: 0 }))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,11 +1,5 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
|
||||
pub struct Point {
|
||||
pub x: i32,
|
||||
pub y: i32,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
|
||||
pub struct Rect {
|
||||
pub x: i32,
|
||||
@@ -13,53 +7,3 @@ pub struct Rect {
|
||||
pub width: i32,
|
||||
pub height: i32,
|
||||
}
|
||||
|
||||
impl Rect {
|
||||
pub fn center(&self) -> Point {
|
||||
Point {
|
||||
x: self.x + self.width / 2,
|
||||
y: self.y + self.height / 2,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct Window {
|
||||
pub id: String,
|
||||
pub title: String,
|
||||
pub app_name: String,
|
||||
pub bounds: Rect,
|
||||
pub is_active: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct UIElement {
|
||||
pub id: String,
|
||||
pub text: String,
|
||||
pub role: String,
|
||||
pub bounds: Rect,
|
||||
pub enabled: bool,
|
||||
pub visible: bool,
|
||||
pub value: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
|
||||
pub enum MouseButton {
|
||||
Left,
|
||||
Right,
|
||||
Middle,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ElementSelector {
|
||||
pub text: Option<String>,
|
||||
pub role: Option<String>,
|
||||
pub window_id: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct OCRResult {
|
||||
pub text: String,
|
||||
pub confidence: f32,
|
||||
pub bounds: Rect,
|
||||
}
|
||||
|
||||
111
crates/g3-computer-control/src/webdriver/mod.rs
Normal file
111
crates/g3-computer-control/src/webdriver/mod.rs
Normal file
@@ -0,0 +1,111 @@
|
||||
pub mod safari;
|
||||
|
||||
use anyhow::Result;
|
||||
use async_trait::async_trait;
|
||||
use serde_json::Value;
|
||||
|
||||
/// WebDriver controller for browser automation
|
||||
#[async_trait]
|
||||
pub trait WebDriverController: Send + Sync {
|
||||
/// Navigate to a URL
|
||||
async fn navigate(&mut self, url: &str) -> Result<()>;
|
||||
|
||||
/// Get the current URL
|
||||
async fn current_url(&self) -> Result<String>;
|
||||
|
||||
/// Get the page title
|
||||
async fn title(&self) -> Result<String>;
|
||||
|
||||
/// Find an element by CSS selector
|
||||
async fn find_element(&mut self, selector: &str) -> Result<WebElement>;
|
||||
|
||||
/// Find multiple elements by CSS selector
|
||||
async fn find_elements(&mut self, selector: &str) -> Result<Vec<WebElement>>;
|
||||
|
||||
/// Execute JavaScript in the browser
|
||||
async fn execute_script(&mut self, script: &str, args: Vec<Value>) -> Result<Value>;
|
||||
|
||||
/// Get the page source (HTML)
|
||||
async fn page_source(&self) -> Result<String>;
|
||||
|
||||
/// Take a screenshot and save to path
|
||||
async fn screenshot(&mut self, path: &str) -> Result<()>;
|
||||
|
||||
/// Close the current window/tab
|
||||
async fn close(&mut self) -> Result<()>;
|
||||
|
||||
/// Quit the browser session
|
||||
async fn quit(self) -> Result<()>;
|
||||
}
|
||||
|
||||
/// Represents a web element in the DOM
|
||||
pub struct WebElement {
|
||||
pub(crate) inner: fantoccini::elements::Element,
|
||||
}
|
||||
|
||||
impl WebElement {
|
||||
/// Click the element
|
||||
pub async fn click(&mut self) -> Result<()> {
|
||||
self.inner.click().await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Send keys/text to the element
|
||||
pub async fn send_keys(&mut self, text: &str) -> Result<()> {
|
||||
self.inner.send_keys(text).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Clear the element's content (for input fields)
|
||||
pub async fn clear(&mut self) -> Result<()> {
|
||||
self.inner.clear().await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Get the element's text content
|
||||
pub async fn text(&self) -> Result<String> {
|
||||
Ok(self.inner.text().await?)
|
||||
}
|
||||
|
||||
/// Get an attribute value
|
||||
pub async fn attr(&self, name: &str) -> Result<Option<String>> {
|
||||
Ok(self.inner.attr(name).await?)
|
||||
}
|
||||
|
||||
/// Get a property value
|
||||
pub async fn prop(&self, name: &str) -> Result<Option<String>> {
|
||||
Ok(self.inner.prop(name).await?)
|
||||
}
|
||||
|
||||
/// Get the element's HTML
|
||||
pub async fn html(&self, inner: bool) -> Result<String> {
|
||||
Ok(self.inner.html(inner).await?)
|
||||
}
|
||||
|
||||
/// Check if element is displayed
|
||||
pub async fn is_displayed(&self) -> Result<bool> {
|
||||
Ok(self.inner.is_displayed().await?)
|
||||
}
|
||||
|
||||
/// Check if element is enabled
|
||||
pub async fn is_enabled(&self) -> Result<bool> {
|
||||
Ok(self.inner.is_enabled().await?)
|
||||
}
|
||||
|
||||
/// Check if element is selected (for checkboxes/radio buttons)
|
||||
pub async fn is_selected(&self) -> Result<bool> {
|
||||
Ok(self.inner.is_selected().await?)
|
||||
}
|
||||
|
||||
/// Find a child element by CSS selector
|
||||
pub async fn find_element(&mut self, selector: &str) -> Result<WebElement> {
|
||||
let elem = self.inner.find(fantoccini::Locator::Css(selector)).await?;
|
||||
Ok(WebElement { inner: elem })
|
||||
}
|
||||
|
||||
/// Find multiple child elements by CSS selector
|
||||
pub async fn find_elements(&mut self, selector: &str) -> Result<Vec<WebElement>> {
|
||||
let elems = self.inner.find_all(fantoccini::Locator::Css(selector)).await?;
|
||||
Ok(elems.into_iter().map(|inner| WebElement { inner }).collect())
|
||||
}
|
||||
}
|
||||
212
crates/g3-computer-control/src/webdriver/safari.rs
Normal file
212
crates/g3-computer-control/src/webdriver/safari.rs
Normal file
@@ -0,0 +1,212 @@
|
||||
use super::{WebDriverController, WebElement};
|
||||
use anyhow::{Context, Result};
|
||||
use async_trait::async_trait;
|
||||
use fantoccini::{Client, ClientBuilder};
|
||||
use serde_json::Value;
|
||||
use std::time::Duration;
|
||||
|
||||
/// SafariDriver WebDriver controller
|
||||
pub struct SafariDriver {
|
||||
client: Client,
|
||||
}
|
||||
|
||||
impl SafariDriver {
|
||||
/// Create a new SafariDriver instance
|
||||
///
|
||||
/// This will connect to SafariDriver running on the default port (4444).
|
||||
/// Make sure to enable "Allow Remote Automation" in Safari's Develop menu first.
|
||||
///
|
||||
/// You can start SafariDriver manually with:
|
||||
/// ```bash
|
||||
/// /usr/bin/safaridriver --enable
|
||||
/// ```
|
||||
pub async fn new() -> Result<Self> {
|
||||
Self::with_port(4444).await
|
||||
}
|
||||
|
||||
/// Create a new SafariDriver instance with a custom port
|
||||
pub async fn with_port(port: u16) -> Result<Self> {
|
||||
let url = format!("http://localhost:{}", port);
|
||||
|
||||
let mut caps = serde_json::Map::new();
|
||||
caps.insert("browserName".to_string(), Value::String("safari".to_string()));
|
||||
|
||||
let client = ClientBuilder::native()
|
||||
.capabilities(caps)
|
||||
.connect(&url)
|
||||
.await
|
||||
.context("Failed to connect to SafariDriver. Make sure SafariDriver is running and 'Allow Remote Automation' is enabled in Safari's Develop menu.")?;
|
||||
|
||||
Ok(Self { client })
|
||||
}
|
||||
|
||||
/// Go back in browser history
|
||||
pub async fn back(&mut self) -> Result<()> {
|
||||
self.client.back().await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Go forward in browser history
|
||||
pub async fn forward(&mut self) -> Result<()> {
|
||||
self.client.forward().await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Refresh the current page
|
||||
pub async fn refresh(&mut self) -> Result<()> {
|
||||
self.client.refresh().await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Get all window handles
|
||||
pub async fn window_handles(&mut self) -> Result<Vec<String>> {
|
||||
let handles = self.client.windows().await?;
|
||||
Ok(handles.into_iter()
|
||||
.map(|h| h.into())
|
||||
.collect())
|
||||
}
|
||||
|
||||
/// Switch to a window by handle
|
||||
pub async fn switch_to_window(&mut self, handle: &str) -> Result<()> {
|
||||
let window_handle: fantoccini::wd::WindowHandle = handle.to_string().try_into()?;
|
||||
self.client.switch_to_window(window_handle).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Get the current window handle
|
||||
pub async fn current_window_handle(&mut self) -> Result<String> {
|
||||
Ok(self.client.window().await?.into())
|
||||
}
|
||||
|
||||
/// Close the current window
|
||||
pub async fn close_window(&mut self) -> Result<()> {
|
||||
self.client.close_window().await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Create a new window/tab
|
||||
pub async fn new_window(&mut self, is_tab: bool) -> Result<String> {
|
||||
let window_type = if is_tab { "tab" } else { "window" };
|
||||
let response = self.client.new_window(window_type == "tab").await?;
|
||||
Ok(response.handle.into())
|
||||
}
|
||||
|
||||
/// Get cookies
|
||||
pub async fn get_cookies(&mut self) -> Result<Vec<fantoccini::cookies::Cookie<'static>>> {
|
||||
Ok(self.client.get_all_cookies().await?)
|
||||
}
|
||||
|
||||
/// Add a cookie
|
||||
pub async fn add_cookie(&mut self, cookie: fantoccini::cookies::Cookie<'static>) -> Result<()> {
|
||||
self.client.add_cookie(cookie).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Delete all cookies
|
||||
pub async fn delete_all_cookies(&mut self) -> Result<()> {
|
||||
self.client.delete_all_cookies().await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Wait for an element to appear (with timeout)
|
||||
pub async fn wait_for_element(&mut self, selector: &str, timeout: Duration) -> Result<WebElement> {
|
||||
let start = std::time::Instant::now();
|
||||
let poll_interval = Duration::from_millis(100);
|
||||
|
||||
loop {
|
||||
if let Ok(elem) = self.find_element(selector).await {
|
||||
return Ok(elem);
|
||||
}
|
||||
|
||||
if start.elapsed() >= timeout {
|
||||
anyhow::bail!("Timeout waiting for element: {}", selector);
|
||||
}
|
||||
|
||||
tokio::time::sleep(poll_interval).await;
|
||||
}
|
||||
}
|
||||
|
||||
/// Wait for an element to be visible (with timeout)
|
||||
pub async fn wait_for_visible(&mut self, selector: &str, timeout: Duration) -> Result<WebElement> {
|
||||
let start = std::time::Instant::now();
|
||||
let poll_interval = Duration::from_millis(100);
|
||||
|
||||
loop {
|
||||
if let Ok(elem) = self.find_element(selector).await {
|
||||
if elem.is_displayed().await.unwrap_or(false) {
|
||||
return Ok(elem);
|
||||
}
|
||||
}
|
||||
|
||||
if start.elapsed() >= timeout {
|
||||
anyhow::bail!("Timeout waiting for element to be visible: {}", selector);
|
||||
}
|
||||
|
||||
tokio::time::sleep(poll_interval).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl WebDriverController for SafariDriver {
|
||||
async fn navigate(&mut self, url: &str) -> Result<()> {
|
||||
self.client.goto(url).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn current_url(&self) -> Result<String> {
|
||||
Ok(self.client.current_url().await?.to_string())
|
||||
}
|
||||
|
||||
async fn title(&self) -> Result<String> {
|
||||
Ok(self.client.title().await?)
|
||||
}
|
||||
|
||||
async fn find_element(&mut self, selector: &str) -> Result<WebElement> {
|
||||
let elem = self.client.find(fantoccini::Locator::Css(selector)).await
|
||||
.context(format!("Failed to find element with selector: {}", selector))?;
|
||||
Ok(WebElement { inner: elem })
|
||||
}
|
||||
|
||||
async fn find_elements(&mut self, selector: &str) -> Result<Vec<WebElement>> {
|
||||
let elems = self.client.find_all(fantoccini::Locator::Css(selector)).await?;
|
||||
Ok(elems.into_iter().map(|inner| WebElement { inner }).collect())
|
||||
}
|
||||
|
||||
async fn execute_script(&mut self, script: &str, args: Vec<Value>) -> Result<Value> {
|
||||
Ok(self.client.execute(script, args).await?)
|
||||
}
|
||||
|
||||
async fn page_source(&self) -> Result<String> {
|
||||
Ok(self.client.source().await?)
|
||||
}
|
||||
|
||||
async fn screenshot(&mut self, path: &str) -> Result<()> {
|
||||
let screenshot_data = self.client.screenshot().await?;
|
||||
|
||||
// Expand tilde in path
|
||||
let expanded_path = shellexpand::tilde(path);
|
||||
let path_str = expanded_path.as_ref();
|
||||
|
||||
// Create parent directories if needed
|
||||
if let Some(parent) = std::path::Path::new(path_str).parent() {
|
||||
std::fs::create_dir_all(parent)
|
||||
.context("Failed to create parent directories for screenshot")?;
|
||||
}
|
||||
|
||||
std::fs::write(path_str, screenshot_data)
|
||||
.context("Failed to write screenshot to file")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn close(&mut self) -> Result<()> {
|
||||
self.client.close_window().await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn quit(mut self) -> Result<()> {
|
||||
self.client.close().await?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user