add context window monitor

Writes the current context window to logs/current_context_window (uses a symlink to a session ID).

This PR was unfortunately generated by a different LLM and did a ton of superficial reformating, it's actually a fairly small and benign change, but I don't want to roll back everything. Hope that's ok.
This commit is contained in:
Jochen
2025-11-27 21:00:02 +11:00
parent 93dc4acf86
commit 52f78653b4
89 changed files with 4040 additions and 2576 deletions

View File

@@ -34,27 +34,40 @@ fn main() {
.expect("Failed to find .build/release directory");
// Copy the dylib to the output directory so it can be found at runtime
let target_dir = manifest_dir.parent().unwrap().parent().unwrap().join("target");
let target_dir = manifest_dir
.parent()
.unwrap()
.parent()
.unwrap()
.join("target");
let profile = env::var("PROFILE").unwrap_or_else(|_| "debug".to_string());
// Determine the actual target directory (could be llvm-cov-target or regular target)
let target_dir_name = env::var("CARGO_TARGET_DIR")
.unwrap_or_else(|_| target_dir.to_string_lossy().to_string());
let target_dir_name =
env::var("CARGO_TARGET_DIR").unwrap_or_else(|_| target_dir.to_string_lossy().to_string());
let actual_target_dir = PathBuf::from(&target_dir_name);
let output_dir = actual_target_dir.join(&profile);
let dylib_src = lib_path.join("libVisionBridge.dylib");
let dylib_dst = output_dir.join("libVisionBridge.dylib");
// Create output directory if it doesn't exist
std::fs::create_dir_all(&output_dir)
.expect(&format!("Failed to create output directory {}", output_dir.display()));
std::fs::copy(&dylib_src, &dylib_dst)
.expect(&format!("Failed to copy dylib from {} to {}", dylib_src.display(), dylib_dst.display()));
println!("cargo:warning=Copied libVisionBridge.dylib to {}", dylib_dst.display());
std::fs::create_dir_all(&output_dir).expect(&format!(
"Failed to create output directory {}",
output_dir.display()
));
std::fs::copy(&dylib_src, &dylib_dst).expect(&format!(
"Failed to copy dylib from {} to {}",
dylib_src.display(),
dylib_dst.display()
));
println!(
"cargo:warning=Copied libVisionBridge.dylib to {}",
dylib_dst.display()
);
// Add rpath so the dylib can be found at runtime
println!("cargo:rustc-link-arg=-Wl,-rpath,@executable_path");
println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path");
@@ -68,5 +81,8 @@ fn main() {
println!("cargo:rustc-link-lib=framework=CoreGraphics");
println!("cargo:rustc-link-lib=framework=CoreImage");
println!("cargo:warning=VisionBridge built successfully at {}", lib_path.display());
println!(
"cargo:warning=VisionBridge built successfully at {}",
lib_path.display()
);
}

View File

@@ -3,19 +3,19 @@ use core_graphics::display::CGDisplay;
fn main() {
let display = CGDisplay::main();
let image = display.image().expect("Failed to capture screen");
println!("CGImage properties:");
println!(" Width: {}", image.width());
println!(" Height: {}", image.height());
println!(" Bits per component: {}", image.bits_per_component());
println!(" Bits per pixel: {}", image.bits_per_pixel());
println!(" Bytes per row: {}", image.bytes_per_row());
let data = image.data();
let expected_size = image.width() * image.height() * 4;
println!(" Data length: {}", data.len());
println!(" Expected (w*h*4): {}", expected_size);
// Check if there's padding in rows
let bytes_per_row = image.bytes_per_row();
let width = image.width();
@@ -23,16 +23,25 @@ fn main() {
println!("\nRow alignment:");
println!(" Actual bytes per row: {}", bytes_per_row);
println!(" Expected (width * 4): {}", expected_bytes_per_row);
println!(" Padding per row: {}", bytes_per_row - expected_bytes_per_row);
println!(
" Padding per row: {}",
bytes_per_row - expected_bytes_per_row
);
// Sample some pixels from different locations
println!("\nFirst 3 pixels (raw bytes):");
for i in 0..3 {
let offset = i * 4;
println!(" Pixel {}: [{:3}, {:3}, {:3}, {:3}]",
i, data[offset], data[offset+1], data[offset+2], data[offset+3]);
println!(
" Pixel {}: [{:3}, {:3}, {:3}, {:3}]",
i,
data[offset],
data[offset + 1],
data[offset + 2],
data[offset + 3]
);
}
// Check a pixel from the middle
let mid_row = image.height() / 2;
let mid_col = image.width() / 2;
@@ -40,7 +49,12 @@ fn main() {
println!("\nMiddle pixel (row {}, col {}):", mid_row, mid_col);
println!(" Offset: {}", mid_offset);
if mid_offset + 3 < data.len() as usize {
println!(" Bytes: [{:3}, {:3}, {:3}, {:3}]",
data[mid_offset], data[mid_offset+1], data[mid_offset+2], data[mid_offset+3]);
println!(
" Bytes: [{:3}, {:3}, {:3}, {:3}]",
data[mid_offset],
data[mid_offset + 1],
data[mid_offset + 2],
data[mid_offset + 3]
);
}
}

View File

@@ -1,34 +1,38 @@
use core_graphics::window::{kCGWindowListOptionOnScreenOnly, kCGNullWindowID, CGWindowListCopyWindowInfo};
use core_foundation::base::{TCFType, ToVoid};
use core_foundation::dictionary::CFDictionary;
use core_foundation::string::CFString;
use core_foundation::base::{TCFType, ToVoid};
use core_graphics::window::{
kCGNullWindowID, kCGWindowListOptionOnScreenOnly, CGWindowListCopyWindowInfo,
};
fn main() {
println!("Listing all on-screen windows...");
println!("{:<10} {:<25} {}", "Window ID", "Owner", "Title");
println!("{}", "-".repeat(80));
unsafe {
let window_list = CGWindowListCopyWindowInfo(
kCGWindowListOptionOnScreenOnly,
kCGNullWindowID
);
let count = core_foundation::array::CFArray::<CFDictionary>::wrap_under_create_rule(window_list).len();
let array = core_foundation::array::CFArray::<CFDictionary>::wrap_under_create_rule(window_list);
let window_list =
CGWindowListCopyWindowInfo(kCGWindowListOptionOnScreenOnly, kCGNullWindowID);
let count =
core_foundation::array::CFArray::<CFDictionary>::wrap_under_create_rule(window_list)
.len();
let array =
core_foundation::array::CFArray::<CFDictionary>::wrap_under_create_rule(window_list);
for i in 0..count {
let dict = array.get(i).unwrap();
// Get window ID
let window_id_key = CFString::from_static_string("kCGWindowNumber");
let window_id: i64 = if let Some(value) = dict.find(window_id_key.to_void()) {
let num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*value as *const _);
let num: core_foundation::number::CFNumber =
TCFType::wrap_under_get_rule(*value as *const _);
num.to_i64().unwrap_or(0)
} else {
0
};
// Get owner name
let owner_key = CFString::from_static_string("kCGWindowOwnerName");
let owner: String = if let Some(value) = dict.find(owner_key.to_void()) {
@@ -37,7 +41,7 @@ fn main() {
} else {
"Unknown".to_string()
};
// Get window name/title
let name_key = CFString::from_static_string("kCGWindowName");
let title: String = if let Some(value) = dict.find(name_key.to_void()) {
@@ -46,7 +50,7 @@ fn main() {
} else {
"".to_string()
};
// Show all windows
if !owner.is_empty() {
println!("{:<10} {:<25} {}", window_id, owner, title);

View File

@@ -11,11 +11,11 @@ use g3_computer_control::MacAxController;
async fn main() -> Result<()> {
println!("🍎 macOS Accessibility API Demo\n");
println!("This demo shows how to control macOS applications using the Accessibility API.\n");
// Create controller
let controller = MacAxController::new()?;
println!("✅ MacAxController initialized\n");
// List running applications
println!("📱 Listing running applications:");
match controller.list_applications() {
@@ -30,7 +30,7 @@ async fn main() -> Result<()> {
Err(e) => println!(" ❌ Error: {}", e),
}
println!();
// Get frontmost app
println!("🎯 Getting frontmost application:");
match controller.get_frontmost_app() {
@@ -38,16 +38,16 @@ async fn main() -> Result<()> {
Err(e) => println!(" ❌ Error: {}", e),
}
println!();
// Example: Activate Finder and get its UI tree
println!("📂 Activating Finder and inspecting UI:");
match controller.activate_app("Finder") {
Ok(_) => {
println!(" ✅ Finder activated");
// Wait a moment for activation
tokio::time::sleep(tokio::time::Duration::from_millis(500)).await;
// Get UI tree
match controller.get_ui_tree("Finder", 2) {
Ok(tree) => {
@@ -62,13 +62,13 @@ async fn main() -> Result<()> {
Err(e) => println!(" ❌ Error: {}", e),
}
println!();
println!("✨ Demo complete!\n");
println!("💡 Tips:");
println!(" - Use --macax flag with g3 to enable these tools");
println!(" - Grant accessibility permissions in System Preferences");
println!(" - Add accessibility identifiers to your apps for easier automation");
println!(" - See docs/macax-tools.md for full documentation\n");
Ok(())
}

View File

@@ -1,64 +1,66 @@
use g3_computer_control::SafariDriver;
use g3_computer_control::webdriver::WebDriverController;
use anyhow::Result;
use g3_computer_control::webdriver::WebDriverController;
use g3_computer_control::SafariDriver;
#[tokio::main]
async fn main() -> Result<()> {
println!("Safari WebDriver Demo");
println!("=====================\n");
println!("Make sure to:");
println!("1. Enable 'Allow Remote Automation' in Safari's Develop menu");
println!("2. Run: /usr/bin/safaridriver --enable");
println!("3. Start safaridriver in another terminal: safaridriver --port 4444\n");
println!("Connecting to SafariDriver...");
let mut driver = SafariDriver::new().await?;
println!("✅ Connected!\n");
// Navigate to a website
println!("Navigating to example.com...");
driver.navigate("https://example.com").await?;
println!("✅ Navigated\n");
// Get page title
let title = driver.title().await?;
println!("Page title: {}\n", title);
// Get current URL
let url = driver.current_url().await?;
println!("Current URL: {}\n", url);
// Find an element
println!("Finding h1 element...");
let h1 = driver.find_element("h1").await?;
let h1_text = h1.text().await?;
println!("H1 text: {}\n", h1_text);
// Find all paragraphs
println!("Finding all paragraphs...");
let paragraphs = driver.find_elements("p").await?;
println!("Found {} paragraphs\n", paragraphs.len());
// Get page source
println!("Getting page source...");
let source = driver.page_source().await?;
println!("Page source length: {} bytes\n", source.len());
// Execute JavaScript
println!("Executing JavaScript...");
let result = driver.execute_script("return document.title", vec![]).await?;
let result = driver
.execute_script("return document.title", vec![])
.await?;
println!("JS result: {:?}\n", result);
// Take a screenshot
println!("Taking screenshot...");
driver.screenshot("/tmp/safari_demo.png").await?;
println!("✅ Screenshot saved to /tmp/safari_demo.png\n");
// Close the browser
println!("Closing browser...");
driver.quit().await?;
println!("✅ Done!");
Ok(())
}

View File

@@ -3,10 +3,13 @@ use g3_computer_control::create_controller;
#[tokio::main]
async fn main() {
println!("Testing screenshot with permission prompt...");
let controller = create_controller().expect("Failed to create controller");
match controller.take_screenshot("/tmp/test_with_prompt.png", None, None).await {
match controller
.take_screenshot("/tmp/test_with_prompt.png", None, None)
.await
{
Ok(_) => {
println!("\n✅ Screenshot saved to /tmp/test_with_prompt.png");
println!("Opening screenshot...");

View File

@@ -2,29 +2,33 @@ use std::process::Command;
fn main() {
let path = "/tmp/rust_screencapture_test.png";
println!("Testing screencapture command from Rust...");
let mut cmd = Command::new("screencapture");
cmd.arg("-x"); // No sound
cmd.arg(path);
println!("Command: {:?}", cmd);
match cmd.output() {
Ok(output) => {
println!("Exit status: {}", output.status);
println!("Stdout: {}", String::from_utf8_lossy(&output.stdout));
println!("Stderr: {}", String::from_utf8_lossy(&output.stderr));
if output.status.success() {
println!("\n✅ Screenshot saved to: {}", path);
// Check file exists and size
if let Ok(metadata) = std::fs::metadata(path) {
println!("File size: {} bytes ({:.1} MB)", metadata.len(), metadata.len() as f64 / 1_000_000.0);
println!(
"File size: {} bytes ({:.1} MB)",
metadata.len(),
metadata.len() as f64 / 1_000_000.0
);
}
// Open it
let _ = Command::new("open").arg(path).spawn();
println!("\nOpened screenshot - please verify it looks correct!");

View File

@@ -4,17 +4,23 @@ use image::{ImageBuffer, RgbaImage};
fn main() {
let display = CGDisplay::main();
let image = display.image().expect("Failed to capture screen");
let width = image.width() as u32;
let height = image.height() as u32;
let bytes_per_row = image.bytes_per_row() as usize;
let data = image.data();
println!("Testing screenshot fix...");
println!("Image: {}x{}, bytes_per_row: {}", width, height, bytes_per_row);
println!(
"Image: {}x{}, bytes_per_row: {}",
width, height, bytes_per_row
);
println!("Expected bytes per row: {}", width * 4);
println!("Padding per row: {} bytes", bytes_per_row - (width as usize * 4));
println!(
"Padding per row: {} bytes",
bytes_per_row - (width as usize * 4)
);
// OLD METHOD (broken) - treating data as continuous
println!("\n=== OLD METHOD (BROKEN) ===");
let mut old_rgba = Vec::with_capacity(data.len() as usize);
@@ -26,14 +32,14 @@ fn main() {
}
println!("Converted {} pixels", old_rgba.len() / 4);
println!("Expected {} pixels", width * height);
// NEW METHOD (fixed) - handling row padding
println!("\n=== NEW METHOD (FIXED) ===");
let mut new_rgba = Vec::with_capacity((width * height * 4) as usize);
for row in 0..height as usize {
let row_start = row * bytes_per_row;
let row_end = row_start + (width as usize * 4);
for chunk in data[row_start..row_end].chunks_exact(4) {
new_rgba.push(chunk[2]); // R
new_rgba.push(chunk[1]); // G
@@ -43,26 +49,34 @@ fn main() {
}
println!("Converted {} pixels", new_rgba.len() / 4);
println!("Expected {} pixels", width * height);
// Save a small crop from both methods
let crop_size = 200;
// Old method crop
let old_crop: Vec<u8> = old_rgba.iter().take((crop_size * crop_size * 4) as usize).copied().collect();
let old_crop: Vec<u8> = old_rgba
.iter()
.take((crop_size * crop_size * 4) as usize)
.copied()
.collect();
if let Some(old_img) = ImageBuffer::from_raw(crop_size, crop_size, old_crop) {
let old_img: RgbaImage = old_img;
old_img.save("/tmp/screenshot_old_method.png").unwrap();
println!("\nSaved OLD method crop to: /tmp/screenshot_old_method.png");
}
// New method crop
let new_crop: Vec<u8> = new_rgba.iter().take((crop_size * crop_size * 4) as usize).copied().collect();
let new_crop: Vec<u8> = new_rgba
.iter()
.take((crop_size * crop_size * 4) as usize)
.copied()
.collect();
if let Some(new_img) = ImageBuffer::from_raw(crop_size, crop_size, new_crop) {
let new_img: RgbaImage = new_img;
new_img.save("/tmp/screenshot_new_method.png").unwrap();
println!("Saved NEW method crop to: /tmp/screenshot_new_method.png");
}
println!("\nOpen both images to compare:");
println!(" open /tmp/screenshot_old_method.png /tmp/screenshot_new_method.png");
}

View File

@@ -6,43 +6,43 @@ use g3_computer_control::MacAxController;
#[tokio::main]
async fn main() -> Result<()> {
println!("🧪 Testing macax type_text functionality\n");
let controller = MacAxController::new()?;
println!("✅ Controller initialized\n");
// Test 1: Type simple text
println!("Test 1: Typing simple text into TextEdit");
println!(" Please open TextEdit and create a new document...");
std::thread::sleep(std::time::Duration::from_secs(3));
match controller.type_text("TextEdit", "Hello, World!") {
Ok(_) => println!(" ✅ Successfully typed simple text\n"),
Err(e) => println!(" ❌ Failed: {}\n", e),
}
std::thread::sleep(std::time::Duration::from_secs(1));
// Test 2: Type unicode and emojis
println!("Test 2: Typing unicode and emojis");
match controller.type_text("TextEdit", "\n🌟 Unicode test: café, naïve, 日本語 🎉") {
Ok(_) => println!(" ✅ Successfully typed unicode text\n"),
Err(e) => println!(" ❌ Failed: {}\n", e),
}
std::thread::sleep(std::time::Duration::from_secs(1));
// Test 3: Type special characters
println!("Test 3: Typing special characters");
match controller.type_text("TextEdit", "\nSpecial: @#$%^&*()_+-=[]{}|;':,.<>?/") {
Ok(_) => println!(" ✅ Successfully typed special characters\n"),
Err(e) => println!(" ❌ Failed: {}\n", e),
}
println!("\n✨ Tests complete!");
println!("\n💡 Now try with Things3:");
println!(" 1. Open Things3");
println!(" 2. Press Cmd+N to create a new task");
println!(" 3. Run: g3 --macax 'type \"🌟 My awesome task\" into Things'");
Ok(())
}

View File

@@ -1,63 +1,67 @@
use g3_computer_control::ocr::{OCREngine, DefaultOCR};
use anyhow::Result;
use g3_computer_control::ocr::{DefaultOCR, OCREngine};
#[tokio::main]
async fn main() -> Result<()> {
println!("🧪 Testing Apple Vision OCR");
println!("===========================\n");
// Initialize OCR engine
println!("📦 Initializing OCR engine...");
let ocr = DefaultOCR::new()?;
println!("✅ OCR engine: {}\n", ocr.name());
// Check if test image exists
let test_image = "/tmp/safari_test.png";
if !std::path::Path::new(test_image).exists() {
println!("⚠️ Test image not found: {}", test_image);
println!(" Creating a screenshot...");
let status = std::process::Command::new("screencapture")
.arg("-x")
.arg("-R")
.arg("0,0,1200,800")
.arg(test_image)
.status()?;
if !status.success() {
anyhow::bail!("Failed to create screenshot");
}
println!("✅ Screenshot created\n");
}
// Run OCR
println!("🔍 Running Apple Vision OCR on {}...", test_image);
let start = std::time::Instant::now();
let locations = ocr.extract_text_with_locations(test_image).await?;
let duration = start.elapsed();
println!("✅ OCR completed in {:.3}s\n", duration.as_secs_f64());
// Display results
println!("📊 Results:");
println!(" Found {} text elements\n", locations.len());
if locations.is_empty() {
println!("⚠️ No text found in image");
} else {
println!(" Top 20 results:");
println!(" {:<4} {:<40} {:<15} {:<12} {:<8}", "#", "Text", "Position", "Size", "Conf");
println!(
" {:<4} {:<40} {:<15} {:<12} {:<8}",
"#", "Text", "Position", "Size", "Conf"
);
println!(" {}", "-".repeat(85));
for (i, loc) in locations.iter().take(20).enumerate() {
let text = if loc.text.len() > 37 {
format!("{}...", &loc.text[..37])
} else {
loc.text.clone()
};
println!(" {:<4} {:<40} ({:>4},{:>4}) {:>4}x{:<4} {:.2}",
println!(
" {:<4} {:<40} ({:>4},{:>4}) {:>4}x{:<4} {:.2}",
i + 1,
text,
loc.x,
@@ -67,19 +71,22 @@ async fn main() -> Result<()> {
loc.confidence
);
}
if locations.len() > 20 {
println!("\n ... and {} more", locations.len() - 20);
}
// Performance comparison
println!("\n📈 Performance:");
println!(" OCR Speed: {:.3}s", duration.as_secs_f64());
println!(" Text elements: {}", locations.len());
println!(" Avg per element: {:.1}ms", duration.as_millis() as f64 / locations.len() as f64);
println!(
" Avg per element: {:.1}ms",
duration.as_millis() as f64 / locations.len() as f64
);
}
println!("\n✅ Test complete!");
Ok(())
}

View File

@@ -3,36 +3,46 @@ use g3_computer_control::create_controller;
#[tokio::main]
async fn main() {
println!("Testing window-specific screenshot capture...");
let controller = create_controller().expect("Failed to create controller");
// Test 1: Capture iTerm2 window
println!("\n1. Capturing iTerm2 window...");
match controller.take_screenshot("/tmp/iterm_window.png", None, Some("iTerm2")).await {
match controller
.take_screenshot("/tmp/iterm_window.png", None, Some("iTerm2"))
.await
{
Ok(_) => {
println!(" ✅ iTerm2 window captured to /tmp/iterm_window.png");
let _ = std::process::Command::new("open").arg("/tmp/iterm_window.png").spawn();
let _ = std::process::Command::new("open")
.arg("/tmp/iterm_window.png")
.spawn();
}
Err(e) => println!(" ❌ Failed: {}", e),
}
// Wait a moment for the image to open
tokio::time::sleep(tokio::time::Duration::from_secs(2)).await;
// Test 2: Full screen capture for comparison
println!("\n2. Capturing full screen for comparison...");
match controller.take_screenshot("/tmp/fullscreen.png", None, None).await {
match controller
.take_screenshot("/tmp/fullscreen.png", None, None)
.await
{
Ok(_) => {
println!(" ✅ Full screen captured to /tmp/fullscreen.png");
let _ = std::process::Command::new("open").arg("/tmp/fullscreen.png").spawn();
let _ = std::process::Command::new("open")
.arg("/tmp/fullscreen.png")
.spawn();
}
Err(e) => println!(" ❌ Failed: {}", e),
}
println!("\n=== Comparison ===");
println!("iTerm window: /tmp/iterm_window.png (should show ONLY iTerm window)");
println!("Full screen: /tmp/fullscreen.png (should show entire desktop)");
// Show file sizes
if let Ok(meta1) = std::fs::metadata("/tmp/iterm_window.png") {
if let Ok(meta2) = std::fs::metadata("/tmp/fullscreen.png") {

View File

@@ -1,17 +1,17 @@
// Suppress warnings from objc crate macros
#![allow(unexpected_cfgs)]
pub mod types;
pub mod platform;
pub mod ocr;
pub mod webdriver;
pub mod macax;
pub mod ocr;
pub mod platform;
pub mod types;
pub mod webdriver;
// Re-export webdriver types for convenience
pub use webdriver::{WebDriverController, WebElement, safari::SafariDriver};
pub use webdriver::{safari::SafariDriver, WebDriverController, WebElement};
// Re-export macax types for convenience
pub use macax::{MacAxController, AXElement, AXApplication};
pub use macax::{AXApplication, AXElement, MacAxController};
use anyhow::Result;
use async_trait::async_trait;
@@ -20,14 +20,23 @@ use types::*;
#[async_trait]
pub trait ComputerController: Send + Sync {
// Screen capture
async fn take_screenshot(&self, path: &str, region: Option<Rect>, window_id: Option<&str>) -> Result<()>;
async fn take_screenshot(
&self,
path: &str,
region: Option<Rect>,
window_id: Option<&str>,
) -> Result<()>;
// OCR operations
async fn extract_text_from_screen(&self, region: Rect, window_id: &str) -> Result<String>;
async fn extract_text_from_image(&self, path: &str) -> Result<String>;
async fn extract_text_with_locations(&self, path: &str) -> Result<Vec<TextLocation>>;
async fn find_text_in_app(&self, app_name: &str, search_text: &str) -> Result<Option<TextLocation>>;
async fn find_text_in_app(
&self,
app_name: &str,
search_text: &str,
) -> Result<Option<TextLocation>>;
// Mouse operations
fn move_mouse(&self, x: i32, y: i32) -> Result<()>;
fn click_at(&self, x: i32, y: i32, app_name: Option<&str>) -> Result<()>;
@@ -37,13 +46,13 @@ pub trait ComputerController: Send + Sync {
pub fn create_controller() -> Result<Box<dyn ComputerController>> {
#[cfg(target_os = "macos")]
return Ok(Box::new(platform::macos::MacOSController::new()?));
#[cfg(target_os = "linux")]
return Ok(Box::new(platform::linux::LinuxController::new()?));
#[cfg(target_os = "windows")]
return Ok(Box::new(platform::windows::WindowsController::new()?));
#[cfg(not(any(target_os = "macos", target_os = "linux", target_os = "windows")))]
anyhow::bail!("Unsupported platform")
}

View File

@@ -3,7 +3,9 @@ use anyhow::{Context, Result};
use std::collections::HashMap;
#[cfg(target_os = "macos")]
use accessibility::{AXUIElement, AXUIElementAttributes, ElementFinder, TreeVisitor, TreeWalker, TreeWalkerFlow};
use accessibility::{
AXUIElement, AXUIElementAttributes, ElementFinder, TreeVisitor, TreeWalker, TreeWalkerFlow,
};
#[cfg(target_os = "macos")]
use core_foundation::base::TCFType;
@@ -23,46 +25,46 @@ impl MacAxController {
{
// Check if we have accessibility permissions by trying to get system-wide element
let _system = AXUIElement::system_wide();
Ok(Self {
app_cache: std::sync::Mutex::new(HashMap::new()),
})
}
#[cfg(not(target_os = "macos"))]
{
anyhow::bail!("macOS Accessibility API is only available on macOS")
}
}
/// List all running applications
#[cfg(target_os = "macos")]
pub fn list_applications(&self) -> Result<Vec<AXApplication>> {
let apps = Self::get_running_applications()?;
Ok(apps)
}
#[cfg(not(target_os = "macos"))]
pub fn list_applications(&self) -> Result<Vec<AXApplication>> {
anyhow::bail!("Not supported on this platform")
}
#[cfg(target_os = "macos")]
fn get_running_applications() -> Result<Vec<AXApplication>> {
use cocoa::appkit::NSApplicationActivationPolicy;
use cocoa::base::{id, nil};
use objc::{class, msg_send, sel, sel_impl};
unsafe {
let workspace: id = msg_send![class!(NSWorkspace), sharedWorkspace];
let running_apps: id = msg_send![workspace, runningApplications];
let count: usize = msg_send![running_apps, count];
let mut apps = Vec::new();
for i in 0..count {
let app: id = msg_send![running_apps, objectAtIndex: i];
// Get app name
let localized_name: id = msg_send![app, localizedName];
if localized_name == nil {
@@ -76,7 +78,7 @@ impl MacAxController {
} else {
continue;
};
// Get bundle ID
let bundle_id_obj: id = msg_send![app, bundleIdentifier];
let bundle_id = if bundle_id_obj != nil {
@@ -93,13 +95,15 @@ impl MacAxController {
} else {
None
};
// Get PID
let pid: i32 = msg_send![app, processIdentifier];
// Skip background-only apps
let activation_policy: i64 = msg_send![app, activationPolicy];
if activation_policy == NSApplicationActivationPolicy::NSApplicationActivationPolicyRegular as i64 {
if activation_policy
== NSApplicationActivationPolicy::NSApplicationActivationPolicyRegular as i64
{
apps.push(AXApplication {
name,
bundle_id,
@@ -107,32 +111,32 @@ impl MacAxController {
});
}
}
Ok(apps)
}
}
/// Get the frontmost (active) application
#[cfg(target_os = "macos")]
pub fn get_frontmost_app(&self) -> Result<AXApplication> {
use cocoa::base::{id, nil};
use objc::{class, msg_send, sel, sel_impl};
unsafe {
let workspace: id = msg_send![class!(NSWorkspace), sharedWorkspace];
let frontmost_app: id = msg_send![workspace, frontmostApplication];
if frontmost_app == nil {
anyhow::bail!("No frontmost application");
}
// Get app name
let localized_name: id = msg_send![frontmost_app, localizedName];
let name_ptr: *const i8 = msg_send![localized_name, UTF8String];
let name = std::ffi::CStr::from_ptr(name_ptr)
.to_string_lossy()
.to_string();
// Get bundle ID
let bundle_id_obj: id = msg_send![frontmost_app, bundleIdentifier];
let bundle_id = if bundle_id_obj != nil {
@@ -149,10 +153,10 @@ impl MacAxController {
} else {
None
};
// Get PID
let pid: i32 = msg_send![frontmost_app, processIdentifier];
Ok(AXApplication {
name,
bundle_id,
@@ -160,12 +164,12 @@ impl MacAxController {
})
}
}
#[cfg(not(target_os = "macos"))]
pub fn get_frontmost_app(&self) -> Result<AXApplication> {
anyhow::bail!("Not supported on this platform")
}
/// Get AXUIElement for an application by name or PID
#[cfg(target_os = "macos")]
fn get_app_element(&self, app_name: &str) -> Result<AXUIElement> {
@@ -176,79 +180,79 @@ impl MacAxController {
return Ok(element.clone());
}
}
// Find the app by name
let apps = Self::get_running_applications()?;
let app = apps
.iter()
.find(|a| a.name == app_name)
.ok_or_else(|| anyhow::anyhow!("Application '{}' not found", app_name))?;
// Create AXUIElement for the app
let element = AXUIElement::application(app.pid);
// Cache it
{
let mut cache = self.app_cache.lock().unwrap();
cache.insert(app_name.to_string(), element.clone());
}
Ok(element)
}
/// Activate (bring to front) an application
#[cfg(target_os = "macos")]
pub fn activate_app(&self, app_name: &str) -> Result<()> {
use cocoa::base::id;
use objc::{class, msg_send, sel, sel_impl};
// Find the app
let apps = Self::get_running_applications()?;
let app = apps
.iter()
.find(|a| a.name == app_name)
.ok_or_else(|| anyhow::anyhow!("Application '{}' not found", app_name))?;
unsafe {
let workspace: id = msg_send![class!(NSWorkspace), sharedWorkspace];
let running_apps: id = msg_send![workspace, runningApplications];
let count: usize = msg_send![running_apps, count];
for i in 0..count {
let running_app: id = msg_send![running_apps, objectAtIndex: i];
let pid: i32 = msg_send![running_app, processIdentifier];
if pid == app.pid {
let _: bool = msg_send![running_app, activateWithOptions: 0];
return Ok(());
}
}
}
anyhow::bail!("Failed to activate application")
}
#[cfg(not(target_os = "macos"))]
pub fn activate_app(&self, _app_name: &str) -> Result<()> {
anyhow::bail!("Not supported on this platform")
}
/// Get the UI hierarchy of an application
#[cfg(target_os = "macos")]
pub fn get_ui_tree(&self, app_name: &str, max_depth: usize) -> Result<String> {
let app_element = self.get_app_element(app_name)?;
let mut output = format!("Application: {}\n", app_name);
Self::build_ui_tree(&app_element, &mut output, 0, max_depth)?;
Ok(output)
}
#[cfg(not(target_os = "macos"))]
pub fn get_ui_tree(&self, _app_name: &str, _max_depth: usize) -> Result<String> {
anyhow::bail!("Not supported on this platform")
}
#[cfg(target_os = "macos")]
fn build_ui_tree(
element: &AXUIElement,
@@ -259,21 +263,22 @@ impl MacAxController {
if depth >= max_depth {
return Ok(());
}
let indent = " ".repeat(depth);
// Get role
let role = element.role().ok().map(|s| s.to_string())
let role = element
.role()
.ok()
.map(|s| s.to_string())
.unwrap_or_else(|| "Unknown".to_string());
// Get title
let title = element.title().ok()
.map(|s| s.to_string());
let title = element.title().ok().map(|s| s.to_string());
// Get identifier
let identifier = element.identifier().ok()
.map(|s| s.to_string());
let identifier = element.identifier().ok().map(|s| s.to_string());
// Format output
output.push_str(&format!("{}Role: {}", indent, role));
if let Some(t) = title {
@@ -283,7 +288,7 @@ impl MacAxController {
output.push_str(&format!(", ID: {}", id));
}
output.push('\n');
// Get children
if let Ok(children) = element.children() {
for i in 0..children.len() {
@@ -292,10 +297,10 @@ impl MacAxController {
}
}
}
Ok(())
}
/// Find UI elements in an application
#[cfg(target_os = "macos")]
pub fn find_elements(
@@ -307,7 +312,7 @@ impl MacAxController {
) -> Result<Vec<AXElement>> {
let app_element = self.get_app_element(app_name)?;
let mut found_elements = Vec::new();
let visitor = ElementCollector {
role_filter: role.map(|s| s.to_string()),
title_filter: title.map(|s| s.to_string()),
@@ -315,13 +320,13 @@ impl MacAxController {
results: std::cell::RefCell::new(&mut found_elements),
depth: std::cell::Cell::new(0),
};
let walker = TreeWalker::new();
walker.walk(&app_element, &visitor);
Ok(found_elements)
}
#[cfg(not(target_os = "macos"))]
pub fn find_elements(
&self,
@@ -332,7 +337,7 @@ impl MacAxController {
) -> Result<Vec<AXElement>> {
anyhow::bail!("Not supported on this platform")
}
/// Find a single element (helper for click, set_value, etc.)
#[cfg(target_os = "macos")]
fn find_element(
@@ -343,19 +348,17 @@ impl MacAxController {
identifier: Option<&str>,
) -> Result<AXUIElement> {
let app_element = self.get_app_element(app_name)?;
let role_str = role.to_string();
let title_str = title.map(|s| s.to_string());
let identifier_str = identifier.map(|s| s.to_string());
let finder = ElementFinder::new(
&app_element,
move |element| {
// Check role
let elem_role = element.role()
.ok()
.map(|s| s.to_string());
let elem_role = element.role().ok().map(|s| s.to_string());
if let Some(r) = elem_role {
if !r.contains(&role_str) {
return false;
@@ -363,13 +366,11 @@ impl MacAxController {
} else {
return false;
}
// Check title if specified
if let Some(ref title_filter) = title_str {
let elem_title = element.title()
.ok()
.map(|s| s.to_string());
let elem_title = element.title().ok().map(|s| s.to_string());
if let Some(t) = elem_title {
if !t.contains(title_filter) {
return false;
@@ -378,13 +379,11 @@ impl MacAxController {
return false;
}
}
// Check identifier if specified
if let Some(ref id_filter) = identifier_str {
let elem_id = element.identifier()
.ok()
.map(|s| s.to_string());
let elem_id = element.identifier().ok().map(|s| s.to_string());
if let Some(id) = elem_id {
if !id.contains(id_filter) {
return false;
@@ -393,15 +392,15 @@ impl MacAxController {
return false;
}
}
true
},
Some(std::time::Duration::from_secs(2)),
);
finder.find().context("Element not found")
}
/// Click on a UI element
#[cfg(target_os = "macos")]
pub fn click_element(
@@ -412,16 +411,16 @@ impl MacAxController {
identifier: Option<&str>,
) -> Result<()> {
let element = self.find_element(app_name, role, title, identifier)?;
// Perform the press action
let action_name = CFString::new("AXPress");
element
.perform_action(&action_name)
.map_err(|e| anyhow::anyhow!("Failed to perform press action: {:?}", e))?;
Ok(())
}
#[cfg(not(target_os = "macos"))]
pub fn click_element(
&self,
@@ -432,7 +431,7 @@ impl MacAxController {
) -> Result<()> {
anyhow::bail!("Not supported on this platform")
}
/// Set the value of a UI element
#[cfg(target_os = "macos")]
pub fn set_value(
@@ -444,16 +443,17 @@ impl MacAxController {
identifier: Option<&str>,
) -> Result<()> {
let element = self.find_element(app_name, role, title, identifier)?;
// Set the value - convert CFString to CFType
let cf_value = CFString::new(value);
element.set_value(cf_value.as_CFType())
element
.set_value(cf_value.as_CFType())
.map_err(|e| anyhow::anyhow!("Failed to set value: {:?}", e))?;
Ok(())
}
#[cfg(not(target_os = "macos"))]
pub fn set_value(
&self,
@@ -465,7 +465,7 @@ impl MacAxController {
) -> Result<()> {
anyhow::bail!("Not supported on this platform")
}
/// Get the value of a UI element
#[cfg(target_os = "macos")]
pub fn get_value(
@@ -476,11 +476,12 @@ impl MacAxController {
identifier: Option<&str>,
) -> Result<String> {
let element = self.find_element(app_name, role, title, identifier)?;
// Get the value
let value_type = element.value()
let value_type = element
.value()
.map_err(|e| anyhow::anyhow!("Failed to get value: {:?}", e))?;
// Try to downcast to CFString
if let Some(cf_string) = value_type.downcast::<CFString>() {
Ok(cf_string.to_string())
@@ -489,7 +490,7 @@ impl MacAxController {
Ok(format!("<non-string value>"))
}
}
#[cfg(not(target_os = "macos"))]
pub fn get_value(
&self,
@@ -500,52 +501,52 @@ impl MacAxController {
) -> Result<String> {
anyhow::bail!("Not supported on this platform")
}
/// Type text into the currently focused element (uses system text input)
#[cfg(target_os = "macos")]
pub fn type_text(&self, app_name: &str, text: &str) -> Result<()> {
use cocoa::base::{id, nil};
use cocoa::foundation::NSString;
use objc::{class, msg_send, sel, sel_impl};
// First, make sure the app is active
self.activate_app(app_name)?;
// Wait for app to fully activate
std::thread::sleep(std::time::Duration::from_millis(500));
// Send a Tab key to try to focus on a text field
// This helps ensure something is focused before we paste
let _ = self.press_key(app_name, "tab", vec![]);
std::thread::sleep(std::time::Duration::from_millis(800));
// Save old clipboard, set new content, paste, then restore
let old_content: id;
unsafe {
// Get the general pasteboard
let pasteboard: id = msg_send![class!(NSPasteboard), generalPasteboard];
// Save current clipboard content
let ns_string_type = NSString::alloc(nil).init_str("public.utf8-plain-text");
old_content = msg_send![pasteboard, stringForType: ns_string_type];
// Clear and set new content
let _: () = msg_send![pasteboard, clearContents];
let ns_string = NSString::alloc(nil).init_str(text);
let ns_type = NSString::alloc(nil).init_str("public.utf8-plain-text");
let _: bool = msg_send![pasteboard, setString:ns_string forType:ns_type];
}
// Wait a moment for clipboard to update
std::thread::sleep(std::time::Duration::from_millis(200));
// Paste using Cmd+V (outside unsafe block)
self.press_key(app_name, "v", vec!["command"])?;
// Wait for paste to complete
std::thread::sleep(std::time::Duration::from_millis(300));
// Restore old clipboard content if it existed
unsafe {
if old_content != nil {
@@ -555,15 +556,15 @@ impl MacAxController {
let _: bool = msg_send![pasteboard, setString:old_content forType:ns_type];
}
}
Ok(())
}
#[cfg(not(target_os = "macos"))]
pub fn type_text(&self, _app_name: &str, _text: &str) -> Result<()> {
anyhow::bail!("Not supported on this platform")
}
/// Focus on a text field or text area element
#[cfg(target_os = "macos")]
pub fn focus_element(
@@ -574,40 +575,34 @@ impl MacAxController {
identifier: Option<&str>,
) -> Result<()> {
let element = self.find_element(app_name, role, title, identifier)?;
// Set focused attribute to true
use core_foundation::boolean::CFBoolean;
let cf_true = CFBoolean::true_value();
element.set_attribute(&accessibility::AXAttribute::focused(), cf_true)
element
.set_attribute(&accessibility::AXAttribute::focused(), cf_true)
.map_err(|e| anyhow::anyhow!("Failed to focus element: {:?}", e))?;
Ok(())
}
/// Press a keyboard shortcut
#[cfg(target_os = "macos")]
pub fn press_key(
&self,
app_name: &str,
key: &str,
modifiers: Vec<&str>,
) -> Result<()> {
use core_graphics::event::{
CGEvent, CGEventFlags, CGEventTapLocation,
};
pub fn press_key(&self, app_name: &str, key: &str, modifiers: Vec<&str>) -> Result<()> {
use core_graphics::event::{CGEvent, CGEventFlags, CGEventTapLocation};
use core_graphics::event_source::{CGEventSource, CGEventSourceStateID};
// First, make sure the app is active
self.activate_app(app_name)?;
// Wait a bit for activation
std::thread::sleep(std::time::Duration::from_millis(100));
// Map key string to key code
let key_code = Self::key_to_keycode(key)
.ok_or_else(|| anyhow::anyhow!("Unknown key: {}", key))?;
let key_code =
Self::key_to_keycode(key).ok_or_else(|| anyhow::anyhow!("Unknown key: {}", key))?;
// Map modifiers to flags
let mut flags = CGEventFlags::CGEventFlagNull;
for modifier in modifiers {
@@ -619,39 +614,37 @@ impl MacAxController {
_ => {}
}
}
// Create event source
let source = CGEventSource::new(CGEventSourceStateID::HIDSystemState)
.ok().context("Failed to create event source")?;
.ok()
.context("Failed to create event source")?;
// Create key down event
let key_down = CGEvent::new_keyboard_event(source.clone(), key_code, true)
.ok().context("Failed to create key down event")?;
.ok()
.context("Failed to create key down event")?;
key_down.set_flags(flags);
// Create key up event
let key_up = CGEvent::new_keyboard_event(source, key_code, false)
.ok().context("Failed to create key up event")?;
.ok()
.context("Failed to create key up event")?;
key_up.set_flags(flags);
// Post events
key_down.post(CGEventTapLocation::HID);
std::thread::sleep(std::time::Duration::from_millis(50));
key_up.post(CGEventTapLocation::HID);
Ok(())
}
#[cfg(not(target_os = "macos"))]
pub fn press_key(
&self,
_app_name: &str,
_key: &str,
_modifiers: Vec<&str>,
) -> Result<()> {
pub fn press_key(&self, _app_name: &str, _key: &str, _modifiers: Vec<&str>) -> Result<()> {
anyhow::bail!("Not supported on this platform")
}
#[cfg(target_os = "macos")]
fn key_to_keycode(key: &str) -> Option<u16> {
// Map common keys to keycodes
@@ -743,62 +736,55 @@ struct ElementCollector<'a> {
impl<'a> TreeVisitor for ElementCollector<'a> {
fn enter_element(&self, element: &AXUIElement) -> TreeWalkerFlow {
self.depth.set(self.depth.get() + 1);
if self.depth.get() > 20 {
return TreeWalkerFlow::SkipSubtree;
}
// Get element properties
let role = element.role()
let role = element
.role()
.ok()
.map(|s| s.to_string())
.unwrap_or_else(|| "Unknown".to_string());
let title = element.title()
.ok()
.map(|s| s.to_string());
let identifier = element.identifier()
.ok()
.map(|s| s.to_string());
let title = element.title().ok().map(|s| s.to_string());
let identifier = element.identifier().ok().map(|s| s.to_string());
// Check if this element matches the filters
let role_matches = self.role_filter.as_ref().map_or(true, |r| role.contains(r));
let title_matches = self.title_filter.as_ref().map_or(true, |t| {
title.as_ref().map_or(false, |title_str| title_str.contains(t))
title
.as_ref()
.map_or(false, |title_str| title_str.contains(t))
});
let identifier_matches = self.identifier_filter.as_ref().map_or(true, |id| {
identifier.as_ref().map_or(false, |id_str| id_str.contains(id))
identifier
.as_ref()
.map_or(false, |id_str| id_str.contains(id))
});
if role_matches && title_matches && identifier_matches {
// Get additional properties
let value = element.value()
let value = element
.value()
.ok()
.and_then(|v| {
v.downcast::<CFString>().map(|s| s.to_string())
});
let label = element.description()
.ok()
.map(|s| s.to_string());
let enabled = element.enabled()
.ok()
.map(|b| b.into())
.unwrap_or(false);
let focused = element.focused()
.ok()
.map(|b| b.into())
.unwrap_or(false);
.and_then(|v| v.downcast::<CFString>().map(|s| s.to_string()));
let label = element.description().ok().map(|s| s.to_string());
let enabled = element.enabled().ok().map(|b| b.into()).unwrap_or(false);
let focused = element.focused().ok().map(|b| b.into()).unwrap_or(false);
// Count children
let children_count = element.children()
let children_count = element
.children()
.ok()
.map(|arr| arr.len() as usize)
.unwrap_or(0);
self.results.borrow_mut().push(AXElement {
role,
title,
@@ -812,10 +798,10 @@ impl<'a> TreeVisitor for ElementCollector<'a> {
children_count,
});
}
TreeWalkerFlow::Continue
}
fn exit_element(&self, _element: &AXUIElement) {
self.depth.set(self.depth.get() - 1);
}

View File

@@ -34,7 +34,7 @@ impl AXElement {
/// Convert to a human-readable string representation
pub fn to_string(&self) -> String {
let mut parts = vec![format!("Role: {}", self.role)];
if let Some(ref title) = self.title {
parts.push(format!("Title: {}", title));
}
@@ -47,19 +47,19 @@ impl AXElement {
if let Some(ref id) = self.identifier {
parts.push(format!("ID: {}", id));
}
parts.push(format!("Enabled: {}", self.enabled));
parts.push(format!("Focused: {}", self.focused));
if let Some((x, y)) = self.position {
parts.push(format!("Position: ({:.0}, {:.0})", x, y));
}
if let Some((w, h)) = self.size {
parts.push(format!("Size: ({:.0}, {:.0})", w, h));
}
parts.push(format!("Children: {}", self.children_count));
parts.join(", ")
}
}

View File

@@ -7,7 +7,7 @@ use async_trait::async_trait;
pub trait OCREngine: Send + Sync {
/// Extract text with locations from an image file
async fn extract_text_with_locations(&self, path: &str) -> Result<Vec<TextLocation>>;
/// Get the name of the OCR engine
fn name(&self) -> &str;
}

View File

@@ -12,16 +12,18 @@ impl TesseractOCR {
let tesseract_check = std::process::Command::new("which")
.arg("tesseract")
.output();
if tesseract_check.is_err() || !tesseract_check.as_ref().unwrap().status.success() {
anyhow::bail!("Tesseract OCR is not installed on your system.\n\n\
anyhow::bail!(
"Tesseract OCR is not installed on your system.\n\n\
To install tesseract:\n macOS: brew install tesseract\n \
Linux: sudo apt-get install tesseract-ocr (Ubuntu/Debian)\n \
sudo yum install tesseract (RHEL/CentOS)\n \
Windows: Download from https://github.com/UB-Mannheim/tesseract/wiki\n\n\
After installation, restart your terminal and try again.");
After installation, restart your terminal and try again."
);
}
Ok(Self)
}
}
@@ -36,18 +38,23 @@ impl OCREngine for TesseractOCR {
.arg("tsv")
.output()
.map_err(|e| anyhow::anyhow!("Failed to run tesseract: {}", e))?;
if !output.status.success() {
anyhow::bail!("Tesseract failed: {}", String::from_utf8_lossy(&output.stderr));
anyhow::bail!(
"Tesseract failed: {}",
String::from_utf8_lossy(&output.stderr)
);
}
let tsv_text = String::from_utf8_lossy(&output.stdout);
let mut locations = Vec::new();
// Parse TSV output (skip header line)
for (i, line) in tsv_text.lines().enumerate() {
if i == 0 { continue; } // Skip header
if i == 0 {
continue;
} // Skip header
let parts: Vec<&str> = line.split('\t').collect();
if parts.len() >= 12 {
// TSV format: level, page_num, block_num, par_num, line_num, word_num,
@@ -74,10 +81,10 @@ impl OCREngine for TesseractOCR {
}
}
}
Ok(locations)
}
fn name(&self) -> &str {
"Tesseract OCR"
}

View File

@@ -1,6 +1,6 @@
use super::OCREngine;
use crate::types::TextLocation;
use anyhow::{Result, Context};
use anyhow::{Context, Result};
use async_trait::async_trait;
use std::ffi::{CStr, CString};
use std::os::raw::{c_char, c_float, c_uint};
@@ -24,7 +24,7 @@ extern "C" {
out_boxes: *mut *mut std::ffi::c_void,
out_count: *mut c_uint,
) -> bool;
fn vision_free_boxes(boxes: *mut std::ffi::c_void, count: c_uint);
}
@@ -41,12 +41,11 @@ impl AppleVisionOCR {
impl OCREngine for AppleVisionOCR {
async fn extract_text_with_locations(&self, path: &str) -> Result<Vec<TextLocation>> {
// Convert path to C string
let c_path = CString::new(path)
.context("Failed to convert path to C string")?;
let c_path = CString::new(path).context("Failed to convert path to C string")?;
let mut boxes_ptr: *mut std::ffi::c_void = std::ptr::null_mut();
let mut count: c_uint = 0;
// Call Swift Vision API
let success = unsafe {
vision_recognize_text(
@@ -56,28 +55,26 @@ impl OCREngine for AppleVisionOCR {
&mut count,
)
};
if !success || boxes_ptr.is_null() {
anyhow::bail!("Apple Vision OCR failed");
}
// Convert C array to Rust Vec
let mut locations = Vec::new();
unsafe {
let typed_boxes = boxes_ptr as *const VisionTextBox;
let boxes_slice = std::slice::from_raw_parts(typed_boxes, count as usize);
for box_data in boxes_slice {
// Convert C string to Rust String
let text = if !box_data.text.is_null() {
CStr::from_ptr(box_data.text)
.to_string_lossy()
.into_owned()
CStr::from_ptr(box_data.text).to_string_lossy().into_owned()
} else {
String::new()
};
if !text.is_empty() {
locations.push(TextLocation {
text,
@@ -89,14 +86,14 @@ impl OCREngine for AppleVisionOCR {
});
}
}
// Free the C array
vision_free_boxes(boxes_ptr, count);
}
Ok(locations)
}
fn name(&self) -> &str {
"Apple Vision Framework"
}

View File

@@ -1,4 +1,4 @@
use crate::{ComputerController, types::*};
use crate::{types::*, ComputerController};
use anyhow::Result;
use async_trait::async_trait;
use tesseract::Tesseract;
@@ -21,48 +21,53 @@ impl ComputerController for LinuxController {
async fn move_mouse(&self, _x: i32, _y: i32) -> Result<()> {
anyhow::bail!("Linux implementation not yet available")
}
async fn click(&self, _button: MouseButton) -> Result<()> {
anyhow::bail!("Linux implementation not yet available")
}
async fn double_click(&self, _button: MouseButton) -> Result<()> {
anyhow::bail!("Linux implementation not yet available")
}
async fn type_text(&self, _text: &str) -> Result<()> {
anyhow::bail!("Linux implementation not yet available")
}
async fn press_key(&self, _key: &str) -> Result<()> {
anyhow::bail!("Linux implementation not yet available")
}
async fn list_windows(&self) -> Result<Vec<Window>> {
anyhow::bail!("Linux implementation not yet available")
}
async fn focus_window(&self, _window_id: &str) -> Result<()> {
anyhow::bail!("Linux implementation not yet available")
}
async fn get_window_bounds(&self, _window_id: &str) -> Result<Rect> {
anyhow::bail!("Linux implementation not yet available")
}
async fn find_element(&self, _selector: &ElementSelector) -> Result<Option<UIElement>> {
anyhow::bail!("Linux implementation not yet available")
}
async fn get_element_text(&self, _element_id: &str) -> Result<String> {
anyhow::bail!("Linux implementation not yet available")
}
async fn get_element_bounds(&self, _element_id: &str) -> Result<Rect> {
anyhow::bail!("Linux implementation not yet available")
}
async fn take_screenshot(&self, _path: &str, _region: Option<Rect>, _window_id: Option<&str>) -> Result<()> {
async fn take_screenshot(
&self,
_path: &str,
_region: Option<Rect>,
_window_id: Option<&str>,
) -> Result<()> {
// Enforce that window_id must be provided
if _window_id.is_none() {
anyhow::bail!("window_id is required. You must specify which window to capture (e.g., 'Firefox', 'Terminal', 'gedit'). Use list_windows to see available windows.");
@@ -70,94 +75,111 @@ impl ComputerController for LinuxController {
anyhow::bail!("Linux implementation not yet available")
}
async fn extract_text_from_screen(&self, _region: Rect, _window_id: &str) -> Result<String> {
anyhow::bail!("Linux implementation not yet available")
}
async fn extract_text_from_image(&self, _path: &str) -> Result<OCRResult> {
// Check if tesseract is available on the system
let tesseract_check = std::process::Command::new("which")
.arg("tesseract")
.output();
if tesseract_check.is_err() || !tesseract_check.as_ref().unwrap().status.success() {
anyhow::bail!("Tesseract OCR is not installed on your system.\n\n\
anyhow::bail!(
"Tesseract OCR is not installed on your system.\n\n\
To install tesseract:\n \
Ubuntu/Debian: sudo apt-get install tesseract-ocr\n \
RHEL/CentOS: sudo yum install tesseract\n \
Arch Linux: sudo pacman -S tesseract\n\n\
After installation, restart your terminal and try again.");
After installation, restart your terminal and try again."
);
}
// Initialize Tesseract
let tess = Tesseract::new(None, Some("eng"))
.map_err(|e| {
anyhow::anyhow!("Failed to initialize Tesseract: {}\n\n\
let tess = Tesseract::new(None, Some("eng")).map_err(|e| {
anyhow::anyhow!(
"Failed to initialize Tesseract: {}\n\n\
This usually means:\n1. Tesseract is not properly installed\n\
2. Language data files are missing\n\nTo fix:\n \
Ubuntu/Debian: sudo apt-get install tesseract-ocr-eng\n \
RHEL/CentOS: sudo yum install tesseract-langpack-eng\n \
Arch Linux: sudo pacman -S tesseract-data-eng", e)
})?;
let text = tess.set_image(_path)
Arch Linux: sudo pacman -S tesseract-data-eng",
e
)
})?;
let text = tess
.set_image(_path)
.map_err(|e| anyhow::anyhow!("Failed to load image '{}': {}", _path, e))?
.get_text()
.map_err(|e| anyhow::anyhow!("Failed to extract text from image: {}", e))?;
// Get confidence (simplified - would need more complex API calls for per-word confidence)
let confidence = 0.85; // Placeholder
Ok(OCRResult {
text,
confidence,
bounds: Rect { x: 0, y: 0, width: 0, height: 0 }, // Would need image dimensions
bounds: Rect {
x: 0,
y: 0,
width: 0,
height: 0,
}, // Would need image dimensions
})
}
async fn find_text_on_screen(&self, _text: &str) -> Result<Option<Point>> {
// Check if tesseract is available on the system
let tesseract_check = std::process::Command::new("which")
.arg("tesseract")
.output();
if tesseract_check.is_err() || !tesseract_check.as_ref().unwrap().status.success() {
anyhow::bail!("Tesseract OCR is not installed on your system.\n\n\
anyhow::bail!(
"Tesseract OCR is not installed on your system.\n\n\
To install tesseract:\n \
Ubuntu/Debian: sudo apt-get install tesseract-ocr\n \
RHEL/CentOS: sudo yum install tesseract\n \
Arch Linux: sudo pacman -S tesseract\n\n\
After installation, restart your terminal and try again.");
After installation, restart your terminal and try again."
);
}
// Take full screen screenshot
let temp_path = format!("/tmp/g3_ocr_search_{}.png", uuid::Uuid::new_v4());
self.take_screenshot(&temp_path, None, None).await?;
// Use Tesseract to find text with bounding boxes
let tess = Tesseract::new(None, Some("eng"))
.map_err(|e| {
anyhow::anyhow!("Failed to initialize Tesseract: {}\n\n\
let tess = Tesseract::new(None, Some("eng")).map_err(|e| {
anyhow::anyhow!(
"Failed to initialize Tesseract: {}\n\n\
This usually means:\n1. Tesseract is not properly installed\n\
2. Language data files are missing\n\nTo fix:\n \
Ubuntu/Debian: sudo apt-get install tesseract-ocr-eng\n \
RHEL/CentOS: sudo yum install tesseract-langpack-eng\n \
Arch Linux: sudo pacman -S tesseract-data-eng", e)
})?;
let full_text = tess.set_image(temp_path.as_str())
Arch Linux: sudo pacman -S tesseract-data-eng",
e
)
})?;
let full_text = tess
.set_image(temp_path.as_str())
.map_err(|e| anyhow::anyhow!("Failed to load screenshot: {}", e))?
.get_text()
.map_err(|e| anyhow::anyhow!("Failed to extract text from screen: {}", e))?;
// Clean up temp file
let _ = std::fs::remove_file(&temp_path);
// Simple text search - full implementation would use get_component_images
// to get bounding boxes for each word
if full_text.contains(_text) {
tracing::warn!("Text found but precise coordinates not available in simplified implementation");
tracing::warn!(
"Text found but precise coordinates not available in simplified implementation"
);
Ok(Some(Point { x: 0, y: 0 }))
} else {
Ok(None)

View File

@@ -1,13 +1,18 @@
use crate::{ComputerController, types::{Rect, TextLocation}};
use crate::ocr::{OCREngine, DefaultOCR};
use anyhow::{Result, Context};
use crate::ocr::{DefaultOCR, OCREngine};
use crate::{
types::{Rect, TextLocation},
ComputerController,
};
use anyhow::{Context, Result};
use async_trait::async_trait;
use std::path::Path;
use core_graphics::window::{kCGWindowListOptionOnScreenOnly, kCGNullWindowID, CGWindowListCopyWindowInfo};
use core_foundation::array::CFArray;
use core_foundation::base::{TCFType, ToVoid};
use core_foundation::dictionary::CFDictionary;
use core_foundation::string::CFString;
use core_foundation::base::{TCFType, ToVoid};
use core_foundation::array::CFArray;
use core_graphics::window::{
kCGNullWindowID, kCGWindowListOptionOnScreenOnly, CGWindowListCopyWindowInfo,
};
use std::path::Path;
pub struct MacOSController {
ocr_engine: Box<dyn OCREngine>,
@@ -20,13 +25,21 @@ impl MacOSController {
let ocr = Box::new(DefaultOCR::new()?);
let ocr_name = ocr.name().to_string();
tracing::info!("Initialized macOS controller with OCR engine: {}", ocr_name);
Ok(Self { ocr_engine: ocr, ocr_name })
Ok(Self {
ocr_engine: ocr,
ocr_name,
})
}
}
#[async_trait]
impl ComputerController for MacOSController {
async fn take_screenshot(&self, path: &str, region: Option<Rect>, window_id: Option<&str>) -> Result<()> {
async fn take_screenshot(
&self,
path: &str,
region: Option<Rect>,
window_id: Option<&str>,
) -> Result<()> {
// Enforce that window_id must be provided
if window_id.is_none() {
return Err(anyhow::anyhow!("window_id is required. You must specify which window to capture (e.g., 'Safari', 'Terminal', 'Google Chrome'). Use list_windows to see available windows."));
@@ -36,40 +49,38 @@ impl ComputerController for MacOSController {
let temp_dir = std::env::var("TMPDIR")
.or_else(|_| std::env::var("HOME").map(|h| format!("{}/tmp", h)))
.unwrap_or_else(|_| "/tmp".to_string());
// Ensure temp directory exists
std::fs::create_dir_all(&temp_dir)?;
// If path is relative or doesn't specify a directory, use temp_dir
let final_path = if path.starts_with('/') {
path.to_string()
} else {
format!("{}/{}", temp_dir.trim_end_matches('/'), path)
};
let path_obj = Path::new(&final_path);
if let Some(parent) = path_obj.parent() {
std::fs::create_dir_all(parent)?;
}
let app_name = window_id.unwrap(); // Safe because we checked is_none() above
// Get the window ID for the specified application
let cg_window_id = unsafe {
let window_list = CGWindowListCopyWindowInfo(
kCGWindowListOptionOnScreenOnly,
kCGNullWindowID
);
let window_list =
CGWindowListCopyWindowInfo(kCGWindowListOptionOnScreenOnly, kCGNullWindowID);
let array = CFArray::<CFDictionary>::wrap_under_create_rule(window_list);
let count = array.len();
let mut found_window_id: Option<(u32, String)> = None; // (id, owner)
let app_name_lower = app_name.to_lowercase();
for i in 0..count {
let dict = array.get(i).unwrap();
// Get owner name
let owner_key = CFString::from_static_string("kCGWindowOwnerName");
let owner: String = if let Some(value) = dict.find(owner_key.to_void()) {
@@ -78,57 +89,68 @@ impl ComputerController for MacOSController {
} else {
continue;
};
tracing::debug!("Checking window: owner='{}', looking for '{}'", owner, app_name);
tracing::debug!(
"Checking window: owner='{}', looking for '{}'",
owner,
app_name
);
let owner_lower = owner.to_lowercase();
// Normalize by removing spaces for exact matching
let app_name_normalized = app_name_lower.replace(" ", "");
let owner_normalized = owner_lower.replace(" ", "");
// ONLY accept exact matches (case-insensitive, with or without spaces)
// This prevents "Goose" from matching "GooseStudio"
let is_match = owner_lower == app_name_lower || owner_normalized == app_name_normalized;
let is_match =
owner_lower == app_name_lower || owner_normalized == app_name_normalized;
if is_match {
// Get window ID
let window_id_key = CFString::from_static_string("kCGWindowNumber");
if let Some(value) = dict.find(window_id_key.to_void()) {
let num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*value as *const _);
let num: core_foundation::number::CFNumber =
TCFType::wrap_under_get_rule(*value as *const _);
if let Some(id) = num.to_i64() {
// Get window layer to filter out menu bar windows
let layer_key = CFString::from_static_string("kCGWindowLayer");
let layer: i32 = if let Some(value) = dict.find(layer_key.to_void()) {
let num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*value as *const _);
let num: core_foundation::number::CFNumber =
TCFType::wrap_under_get_rule(*value as *const _);
num.to_i32().unwrap_or(0)
} else {
0
};
// Get window bounds to verify it's a real window
let bounds_key = CFString::from_static_string("kCGWindowBounds");
let has_real_bounds = if let Some(value) = dict.find(bounds_key.to_void()) {
let bounds_dict: CFDictionary = TCFType::wrap_under_get_rule(*value as *const _);
let width_key = CFString::from_static_string("Width");
let height_key = CFString::from_static_string("Height");
if let (Some(w_val), Some(h_val)) = (
bounds_dict.find(width_key.to_void()),
bounds_dict.find(height_key.to_void()),
) {
let w_num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*w_val as *const _);
let h_num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*h_val as *const _);
let width = w_num.to_f64().unwrap_or(0.0);
let height = h_num.to_f64().unwrap_or(0.0);
// Real windows should be at least 100x100 pixels
width >= 100.0 && height >= 100.0
let has_real_bounds =
if let Some(value) = dict.find(bounds_key.to_void()) {
let bounds_dict: CFDictionary =
TCFType::wrap_under_get_rule(*value as *const _);
let width_key = CFString::from_static_string("Width");
let height_key = CFString::from_static_string("Height");
if let (Some(w_val), Some(h_val)) = (
bounds_dict.find(width_key.to_void()),
bounds_dict.find(height_key.to_void()),
) {
let w_num: core_foundation::number::CFNumber =
TCFType::wrap_under_get_rule(*w_val as *const _);
let h_num: core_foundation::number::CFNumber =
TCFType::wrap_under_get_rule(*h_val as *const _);
let width = w_num.to_f64().unwrap_or(0.0);
let height = h_num.to_f64().unwrap_or(0.0);
// Real windows should be at least 100x100 pixels
width >= 100.0 && height >= 100.0
} else {
false
}
} else {
false
}
} else {
false
};
};
// Only accept windows that are:
// 1. At layer 0 (normal windows, not menu bar)
// 2. Have real bounds (width and height >= 100)
@@ -137,189 +159,222 @@ impl ComputerController for MacOSController {
found_window_id = Some((id as u32, owner.clone()));
break;
} else {
tracing::debug!("Skipping window ID {} for '{}': layer={}, has_real_bounds={}", id, owner, layer, has_real_bounds);
tracing::debug!(
"Skipping window ID {} for '{}': layer={}, has_real_bounds={}",
id,
owner,
layer,
has_real_bounds
);
}
}
}
}
}
found_window_id
};
let (cg_window_id, matched_owner) = cg_window_id.ok_or_else(|| {
anyhow::anyhow!("Could not find window for application '{}'. Use list_windows to see available windows.", app_name)
})?;
tracing::info!("Taking screenshot of window ID {} for app '{}'", cg_window_id, matched_owner);
tracing::info!(
"Taking screenshot of window ID {} for app '{}'",
cg_window_id,
matched_owner
);
// Use screencapture with the window ID for now
// TODO: Implement direct CGWindowListCreateImage approach with proper image saving
let mut cmd = std::process::Command::new("screencapture");
cmd.arg("-x"); // No sound
cmd.arg("-l");
cmd.arg(cg_window_id.to_string());
if let Some(region) = region {
cmd.arg("-R");
cmd.arg(format!("{},{},{},{}", region.x, region.y, region.width, region.height));
cmd.arg(format!(
"{},{},{},{}",
region.x, region.y, region.width, region.height
));
}
cmd.arg(&final_path);
let screenshot_result = cmd.output()?;
if !screenshot_result.status.success() {
let stderr = String::from_utf8_lossy(&screenshot_result.stderr);
return Err(anyhow::anyhow!("screencapture failed for window {}: {}", cg_window_id, stderr));
return Err(anyhow::anyhow!(
"screencapture failed for window {}: {}",
cg_window_id,
stderr
));
}
Ok(())
}
async fn extract_text_from_screen(&self, region: Rect, window_id: &str) -> Result<String> {
// Take screenshot of region first
let temp_path = format!("/tmp/g3_ocr_{}.png", uuid::Uuid::new_v4());
self.take_screenshot(&temp_path, Some(region), Some(window_id)).await?;
self.take_screenshot(&temp_path, Some(region), Some(window_id))
.await?;
// Extract text from the screenshot
let result = self.extract_text_from_image(&temp_path).await?;
// Clean up temp file
let _ = std::fs::remove_file(&temp_path);
Ok(result)
}
async fn extract_text_from_image(&self, path: &str) -> Result<String> {
// Extract all text and concatenate
let locations = self.ocr_engine.extract_text_with_locations(path).await?;
Ok(locations.iter().map(|loc| loc.text.as_str()).collect::<Vec<_>>().join(" "))
Ok(locations
.iter()
.map(|loc| loc.text.as_str())
.collect::<Vec<_>>()
.join(" "))
}
async fn extract_text_with_locations(&self, path: &str) -> Result<Vec<TextLocation>> {
// Use the OCR engine
self.ocr_engine.extract_text_with_locations(path).await
}
async fn find_text_in_app(&self, app_name: &str, search_text: &str) -> Result<Option<TextLocation>> {
async fn find_text_in_app(
&self,
app_name: &str,
search_text: &str,
) -> Result<Option<TextLocation>> {
// Take screenshot of specific app window
let home = std::env::var("HOME").unwrap_or_else(|_| "/tmp".to_string());
let temp_path = format!("{}/tmp/g3_find_text_{}_{}.png", home, app_name, uuid::Uuid::new_v4());
self.take_screenshot(&temp_path, None, Some(app_name)).await?;
let temp_path = format!(
"{}/tmp/g3_find_text_{}_{}.png",
home,
app_name,
uuid::Uuid::new_v4()
);
self.take_screenshot(&temp_path, None, Some(app_name))
.await?;
// Get screenshot dimensions before we delete it
let screenshot_dims = get_image_dimensions(&temp_path)?;
// Extract all text with locations
let locations = self.extract_text_with_locations(&temp_path).await?;
// Get window bounds to calculate coordinate transformation
let window_bounds = self.get_window_bounds(app_name)?;
// Clean up temp file
let _ = std::fs::remove_file(&temp_path);
// Find matching text (case-insensitive)
let search_lower = search_text.to_lowercase();
for location in locations {
if location.text.to_lowercase().contains(&search_lower) {
// Transform coordinates from screenshot space to screen space
let transformed = transform_screenshot_to_screen_coords(
location,
window_bounds,
screenshot_dims,
);
let transformed =
transform_screenshot_to_screen_coords(location, window_bounds, screenshot_dims);
return Ok(Some(transformed));
}
}
Ok(None)
}
fn move_mouse(&self, x: i32, y: i32) -> Result<()> {
use core_graphics::event::{
CGEvent, CGEventTapLocation, CGEventType, CGMouseButton,
};
use core_graphics::event_source::{
CGEventSource, CGEventSourceStateID,
};
use core_graphics::event::{CGEvent, CGEventTapLocation, CGEventType, CGMouseButton};
use core_graphics::event_source::{CGEventSource, CGEventSourceStateID};
use core_graphics::geometry::CGPoint;
let source = CGEventSource::new(CGEventSourceStateID::HIDSystemState)
.ok().context("Failed to create event source")?;
.ok()
.context("Failed to create event source")?;
let event = CGEvent::new_mouse_event(
source,
CGEventType::MouseMoved,
CGPoint::new(x as f64, y as f64),
CGMouseButton::Left,
).ok().context("Failed to create mouse event")?;
)
.ok()
.context("Failed to create mouse event")?;
event.post(CGEventTapLocation::HID);
Ok(())
}
fn click_at(&self, x: i32, y: i32, _app_name: Option<&str>) -> Result<()> {
use core_graphics::event::{
CGEvent, CGEventTapLocation, CGEventType, CGMouseButton,
};
use core_graphics::event_source::{
CGEventSource, CGEventSourceStateID,
};
use core_graphics::geometry::CGPoint;
use core_graphics::display::CGDisplay;
use core_graphics::event::{CGEvent, CGEventTapLocation, CGEventType, CGMouseButton};
use core_graphics::event_source::{CGEventSource, CGEventSourceStateID};
use core_graphics::geometry::CGPoint;
// IMPORTANT: Coordinates passed here are in NSScreen/CGWindowListCopyWindowInfo space
// (Y=0 at BOTTOM, increases UPWARD)
// But CGEvent uses a different coordinate system (Y=0 at TOP, increases DOWNWARD)
// We need to convert: CGEvent.y = screenHeight - NSScreen.y
let screen_height = CGDisplay::main().pixels_high() as i32;
let cgevent_x = x;
let cgevent_y = screen_height - y;
tracing::debug!("click_at: NSScreen coords ({}, {}) -> CGEvent coords ({}, {}) [screen_height={}]",
x, y, cgevent_x, cgevent_y, screen_height);
tracing::debug!(
"click_at: NSScreen coords ({}, {}) -> CGEvent coords ({}, {}) [screen_height={}]",
x,
y,
cgevent_x,
cgevent_y,
screen_height
);
let (global_x, global_y) = (cgevent_x, cgevent_y);
let point = CGPoint::new(global_x as f64, global_y as f64);
let source = CGEventSource::new(CGEventSourceStateID::HIDSystemState)
.ok().context("Failed to create event source")?;
.ok()
.context("Failed to create event source")?;
// Move mouse to position first
let move_event = CGEvent::new_mouse_event(
source.clone(),
CGEventType::MouseMoved,
point,
CGMouseButton::Left,
).ok().context("Failed to create mouse move event")?;
)
.ok()
.context("Failed to create mouse move event")?;
move_event.post(CGEventTapLocation::HID);
std::thread::sleep(std::time::Duration::from_millis(100));
// Mouse down
let mouse_down = CGEvent::new_mouse_event(
source.clone(),
CGEventType::LeftMouseDown,
point,
CGMouseButton::Left,
).ok().context("Failed to create mouse down event")?;
)
.ok()
.context("Failed to create mouse down event")?;
mouse_down.post(CGEventTapLocation::HID);
std::thread::sleep(std::time::Duration::from_millis(50));
// Mouse up
let mouse_up = CGEvent::new_mouse_event(
source,
CGEventType::LeftMouseUp,
point,
CGMouseButton::Left,
).ok().context("Failed to create mouse up event")?;
let mouse_up =
CGEvent::new_mouse_event(source, CGEventType::LeftMouseUp, point, CGMouseButton::Left)
.ok()
.context("Failed to create mouse up event")?;
mouse_up.post(CGEventTapLocation::HID);
Ok(())
}
}
@@ -328,19 +383,17 @@ impl MacOSController {
/// Get window bounds for an application (helper method)
fn get_window_bounds(&self, app_name: &str) -> Result<(i32, i32, i32, i32)> {
unsafe {
let window_list = CGWindowListCopyWindowInfo(
kCGWindowListOptionOnScreenOnly,
kCGNullWindowID
);
let window_list =
CGWindowListCopyWindowInfo(kCGWindowListOptionOnScreenOnly, kCGNullWindowID);
let array = CFArray::<CFDictionary>::wrap_under_create_rule(window_list);
let count = array.len();
let app_name_lower = app_name.to_lowercase();
for i in 0..count {
let dict = array.get(i).unwrap();
// Get owner name
let owner_key = CFString::from_static_string("kCGWindowOwnerName");
let owner: String = if let Some(value) = dict.find(owner_key.to_void()) {
@@ -349,65 +402,81 @@ impl MacOSController {
} else {
continue;
};
let owner_lower = owner.to_lowercase();
// Normalize by removing spaces for exact matching
let app_name_normalized = app_name_lower.replace(" ", "");
let owner_normalized = owner_lower.replace(" ", "");
// ONLY accept exact matches (case-insensitive, with or without spaces)
// This prevents "Goose" from matching "GooseStudio"
let is_match = owner_lower == app_name_lower || owner_normalized == app_name_normalized;
let is_match =
owner_lower == app_name_lower || owner_normalized == app_name_normalized;
if is_match {
// Get window layer to filter out menu bar windows
let layer_key = CFString::from_static_string("kCGWindowLayer");
let layer: i32 = if let Some(value) = dict.find(layer_key.to_void()) {
let num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*value as *const _);
let num: core_foundation::number::CFNumber =
TCFType::wrap_under_get_rule(*value as *const _);
num.to_i32().unwrap_or(0)
} else {
0
};
// Skip menu bar windows (layer >= 20)
if layer >= 20 {
tracing::debug!("Skipping window for '{}' at layer {} (menu bar)", owner, layer);
tracing::debug!(
"Skipping window for '{}' at layer {} (menu bar)",
owner,
layer
);
continue;
}
// Get window bounds to verify it's a real window
let bounds_key = CFString::from_static_string("kCGWindowBounds");
if let Some(value) = dict.find(bounds_key.to_void()) {
let bounds_dict: CFDictionary = TCFType::wrap_under_get_rule(*value as *const _);
let bounds_dict: CFDictionary =
TCFType::wrap_under_get_rule(*value as *const _);
let x_key = CFString::from_static_string("X");
let y_key = CFString::from_static_string("Y");
let width_key = CFString::from_static_string("Width");
let height_key = CFString::from_static_string("Height");
if let (Some(x_val), Some(y_val), Some(w_val), Some(h_val)) = (
bounds_dict.find(x_key.to_void()),
bounds_dict.find(y_key.to_void()),
bounds_dict.find(width_key.to_void()),
bounds_dict.find(height_key.to_void()),
) {
let x_num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*x_val as *const _);
let y_num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*y_val as *const _);
let w_num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*w_val as *const _);
let h_num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*h_val as *const _);
let x_num: core_foundation::number::CFNumber =
TCFType::wrap_under_get_rule(*x_val as *const _);
let y_num: core_foundation::number::CFNumber =
TCFType::wrap_under_get_rule(*y_val as *const _);
let w_num: core_foundation::number::CFNumber =
TCFType::wrap_under_get_rule(*w_val as *const _);
let h_num: core_foundation::number::CFNumber =
TCFType::wrap_under_get_rule(*h_val as *const _);
let x: i32 = x_num.to_i64().unwrap_or(0) as i32;
let y: i32 = y_num.to_i64().unwrap_or(0) as i32;
let w: i32 = w_num.to_i64().unwrap_or(0) as i32;
let h: i32 = h_num.to_i64().unwrap_or(0) as i32;
// Only accept windows with real bounds (>= 100x100 pixels)
if w >= 100 && h >= 100 {
tracing::info!("Found valid window bounds for '{}': x={}, y={}, w={}, h={} (layer={})", owner, x, y, w, h, layer);
return Ok((x, y, w, h));
} else {
tracing::debug!("Skipping window for '{}': too small ({}x{})", owner, w, h);
tracing::debug!(
"Skipping window for '{}': too small ({}x{})",
owner,
w,
h
);
continue;
}
} else {
@@ -417,8 +486,11 @@ impl MacOSController {
}
}
}
Err(anyhow::anyhow!("Could not find window bounds for '{}'", app_name))
Err(anyhow::anyhow!(
"Could not find window bounds for '{}'",
app_name
))
}
}
@@ -426,72 +498,118 @@ impl MacOSController {
fn get_image_dimensions(path: &str) -> Result<(i32, i32)> {
use std::fs::File;
use std::io::Read;
let mut file = File::open(path)?;
let mut buffer = vec![0u8; 24];
file.read_exact(&mut buffer)?;
// PNG signature check
if &buffer[0..8] != b"\x89PNG\r\n\x1a\n" {
anyhow::bail!("Not a valid PNG file");
}
// Read IHDR chunk (width and height are at bytes 16-23)
let width = u32::from_be_bytes([buffer[16], buffer[17], buffer[18], buffer[19]]) as i32;
let height = u32::from_be_bytes([buffer[20], buffer[21], buffer[22], buffer[23]]) as i32;
Ok((width, height))
}
/// Transform coordinates from screenshot space to screen space
///
///
/// The screenshot is taken of a window, and Vision OCR returns coordinates
/// relative to the screenshot image. We need to transform these to actual
/// screen coordinates for clicking.
///
///
/// On Retina displays, screenshots are taken at 2x resolution, so we need
/// to account for this scaling factor.
fn transform_screenshot_to_screen_coords(
location: TextLocation,
window_bounds: (i32, i32, i32, i32), // (x, y, width, height) in screen space
screenshot_dims: (i32, i32), // (width, height) in pixels
screenshot_dims: (i32, i32), // (width, height) in pixels
) -> TextLocation {
let (win_x, win_y, win_width, win_height) = window_bounds;
let (screenshot_width, screenshot_height) = screenshot_dims;
// Calculate scale factors
// On Retina displays, screenshot is typically 2x the window size
let scale_x = win_width as f64 / screenshot_width as f64;
let scale_y = win_height as f64 / screenshot_height as f64;
tracing::debug!("Transform: screenshot={}x{}, window={}x{} at ({},{}), scale=({:.2},{:.2})",
screenshot_width, screenshot_height, win_width, win_height, win_x, win_y, scale_x, scale_y);
tracing::debug!(
"Transform: screenshot={}x{}, window={}x{} at ({},{}), scale=({:.2},{:.2})",
screenshot_width,
screenshot_height,
win_width,
win_height,
win_x,
win_y,
scale_x,
scale_y
);
// Transform coordinates from image space to screen space
// IMPORTANT: macOS screen coordinates have origin at BOTTOM-LEFT (Y increases upward)
// Image coordinates have origin at TOP-LEFT (Y increases downward)
// win_y is the BOTTOM of the window in screen coordinates
// So we need to: (win_y + win_height) to get window TOP, then subtract screenshot_y
let window_top_y = win_y + win_height;
tracing::debug!("[transform] Input location in image space: x={}, y={}, width={}, height={}",
location.x, location.y, location.width, location.height);
tracing::debug!("[transform] Scale factors: scale_x={:.4}, scale_y={:.4}", scale_x, scale_y);
tracing::debug!(
"[transform] Input location in image space: x={}, y={}, width={}, height={}",
location.x,
location.y,
location.width,
location.height
);
tracing::debug!(
"[transform] Scale factors: scale_x={:.4}, scale_y={:.4}",
scale_x,
scale_y
);
let transformed_x = win_x + (location.x as f64 * scale_x) as i32;
let transformed_y = window_top_y - (location.y as f64 * scale_y) as i32;
let transformed_width = (location.width as f64 * scale_x) as i32;
let transformed_height = (location.height as f64 * scale_y) as i32;
tracing::debug!("[transform] Calculation details:");
tracing::debug!(" - transformed_x = {} + ({} * {:.4}) = {} + {:.2} = {}", win_x, location.x, scale_x, win_x, location.x as f64 * scale_x, transformed_x);
tracing::debug!(" - transformed_width = ({} * {:.4}) = {:.2} -> {}", location.width, scale_x, location.width as f64 * scale_x, transformed_width);
tracing::debug!(" - transformed_height = ({} * {:.4}) = {:.2} -> {}", location.height, scale_y, location.height as f64 * scale_y, transformed_height);
tracing::debug!("Transformed location: screenshot=({},{}) {}x{} -> screen=({},{}) {}x{}",
location.x, location.y, location.width, location.height,
transformed_x, transformed_y, transformed_width, transformed_height);
tracing::debug!(
" - transformed_x = {} + ({} * {:.4}) = {} + {:.2} = {}",
win_x,
location.x,
scale_x,
win_x,
location.x as f64 * scale_x,
transformed_x
);
tracing::debug!(
" - transformed_width = ({} * {:.4}) = {:.2} -> {}",
location.width,
scale_x,
location.width as f64 * scale_x,
transformed_width
);
tracing::debug!(
" - transformed_height = ({} * {:.4}) = {:.2} -> {}",
location.height,
scale_y,
location.height as f64 * scale_y,
transformed_height
);
tracing::debug!(
"Transformed location: screenshot=({},{}) {}x{} -> screen=({},{}) {}x{}",
location.x,
location.y,
location.width,
location.height,
transformed_x,
transformed_y,
transformed_width,
transformed_height
);
TextLocation {
text: location.text,
x: transformed_x,
@@ -504,4 +622,4 @@ fn transform_screenshot_to_screen_coords(
#[path = "macos_window_matching_test.rs"]
#[cfg(test)]
mod tests;
mod tests;

View File

@@ -1,11 +1,11 @@
#[cfg(test)]
mod window_matching_tests {
/// Test that window name matching handles spaces correctly
///
///
/// Issue: When a user requests a screenshot of "Goose Studio" but the actual
/// application name is "GooseStudio" (no space), the fuzzy matching should
/// still find the window.
///
///
/// The fix normalizes both names by removing spaces before comparing.
#[test]
fn test_space_normalization() {
@@ -16,25 +16,25 @@ mod window_matching_tests {
("Visual Studio Code", "VisualStudioCode", true),
("Google Chrome", "Google Chrome", true),
("Safari", "Safari", true),
("iTerm", "iTerm2", true), // fuzzy match
("iTerm", "iTerm2", true), // fuzzy match
("Code", "Visual Studio Code", true), // fuzzy match
];
for (user_input, app_name, should_match) in test_cases {
let user_lower = user_input.to_lowercase();
let app_lower = app_name.to_lowercase();
let user_normalized = user_lower.replace(" ", "");
let app_normalized = app_lower.replace(" ", "");
let is_exact = app_lower == user_lower || app_normalized == user_normalized;
let is_fuzzy = app_lower.contains(&user_lower)
let is_fuzzy = app_lower.contains(&user_lower)
|| user_lower.contains(&app_lower)
|| app_normalized.contains(&user_normalized)
|| user_normalized.contains(&app_normalized);
let matches = is_exact || is_fuzzy;
assert_eq!(
matches, should_match,
"Expected '{}' vs '{}' to match={}, but got match={}",

View File

@@ -1,4 +1,4 @@
use crate::{ComputerController, types::*};
use crate::{types::*, ComputerController};
use anyhow::Result;
use async_trait::async_trait;
use tesseract::Tesseract;
@@ -20,48 +20,53 @@ impl ComputerController for WindowsController {
async fn move_mouse(&self, _x: i32, _y: i32) -> Result<()> {
anyhow::bail!("Windows implementation not yet available")
}
async fn click(&self, _button: MouseButton) -> Result<()> {
anyhow::bail!("Windows implementation not yet available")
}
async fn double_click(&self, _button: MouseButton) -> Result<()> {
anyhow::bail!("Windows implementation not yet available")
}
async fn type_text(&self, _text: &str) -> Result<()> {
anyhow::bail!("Windows implementation not yet available")
}
async fn press_key(&self, _key: &str) -> Result<()> {
anyhow::bail!("Windows implementation not yet available")
}
async fn list_windows(&self) -> Result<Vec<Window>> {
anyhow::bail!("Windows implementation not yet available")
}
async fn focus_window(&self, _window_id: &str) -> Result<()> {
anyhow::bail!("Windows implementation not yet available")
}
async fn get_window_bounds(&self, _window_id: &str) -> Result<Rect> {
anyhow::bail!("Windows implementation not yet available")
}
async fn find_element(&self, _selector: &ElementSelector) -> Result<Option<UIElement>> {
anyhow::bail!("Windows implementation not yet available")
}
async fn get_element_text(&self, _element_id: &str) -> Result<String> {
anyhow::bail!("Windows implementation not yet available")
}
async fn get_element_bounds(&self, _element_id: &str) -> Result<Rect> {
anyhow::bail!("Windows implementation not yet available")
}
async fn take_screenshot(&self, _path: &str, _region: Option<Rect>, _window_id: Option<&str>) -> Result<()> {
async fn take_screenshot(
&self,
_path: &str,
_region: Option<Rect>,
_window_id: Option<&str>,
) -> Result<()> {
// Enforce that window_id must be provided
if _window_id.is_none() {
anyhow::bail!("window_id is required. You must specify which window to capture (e.g., 'Chrome', 'Terminal', 'Notepad'). Use list_windows to see available windows.");
@@ -69,96 +74,113 @@ impl ComputerController for WindowsController {
anyhow::bail!("Windows implementation not yet available")
}
async fn extract_text_from_screen(&self, _region: Rect, _window_id: &str) -> Result<String> {
anyhow::bail!("Windows implementation not yet available")
}
async fn extract_text_from_image(&self, _path: &str) -> Result<OCRResult> {
// Check if tesseract is available on the system
let tesseract_check = std::process::Command::new("where")
.arg("tesseract")
.output();
if tesseract_check.is_err() || !tesseract_check.as_ref().unwrap().status.success() {
anyhow::bail!("Tesseract OCR is not installed on your system.\n\n\
anyhow::bail!(
"Tesseract OCR is not installed on your system.\n\n\
To install tesseract on Windows:\n \
1. Download the installer from: https://github.com/UB-Mannheim/tesseract/wiki\n \
2. Run the installer and follow the instructions\n \
3. Add tesseract to your PATH environment variable\n \
4. Restart your terminal/command prompt\n\n\
After installation, restart your terminal and try again.");
After installation, restart your terminal and try again."
);
}
// Initialize Tesseract
let tess = Tesseract::new(None, Some("eng"))
.map_err(|e| {
anyhow::anyhow!("Failed to initialize Tesseract: {}\n\n\
let tess = Tesseract::new(None, Some("eng")).map_err(|e| {
anyhow::anyhow!(
"Failed to initialize Tesseract: {}\n\n\
This usually means:\n1. Tesseract is not properly installed\n\
2. Language data files are missing\n\nTo fix:\n \
1. Reinstall tesseract from https://github.com/UB-Mannheim/tesseract/wiki\n \
2. Make sure to select 'Additional language data' during installation\n \
3. Ensure tesseract is in your PATH", e)
})?;
let text = tess.set_image(_path)
3. Ensure tesseract is in your PATH",
e
)
})?;
let text = tess
.set_image(_path)
.map_err(|e| anyhow::anyhow!("Failed to load image '{}': {}", _path, e))?
.get_text()
.map_err(|e| anyhow::anyhow!("Failed to extract text from image: {}", e))?;
// Get confidence (simplified - would need more complex API calls for per-word confidence)
let confidence = 0.85; // Placeholder
Ok(OCRResult {
text,
confidence,
bounds: Rect { x: 0, y: 0, width: 0, height: 0 }, // Would need image dimensions
bounds: Rect {
x: 0,
y: 0,
width: 0,
height: 0,
}, // Would need image dimensions
})
}
async fn find_text_on_screen(&self, _text: &str) -> Result<Option<Point>> {
// Check if tesseract is available on the system
let tesseract_check = std::process::Command::new("where")
.arg("tesseract")
.output();
if tesseract_check.is_err() || !tesseract_check.as_ref().unwrap().status.success() {
anyhow::bail!("Tesseract OCR is not installed on your system.\n\n\
anyhow::bail!(
"Tesseract OCR is not installed on your system.\n\n\
To install tesseract on Windows:\n \
1. Download the installer from: https://github.com/UB-Mannheim/tesseract/wiki\n \
2. Run the installer and follow the instructions\n \
3. Add tesseract to your PATH environment variable\n \
4. Restart your terminal/command prompt\n\n\
After installation, restart your terminal and try again.");
After installation, restart your terminal and try again."
);
}
// Take full screen screenshot
let temp_path = format!("C:\\\\Temp\\\\g3_ocr_search_{}.png", uuid::Uuid::new_v4());
self.take_screenshot(&temp_path, None, None).await?;
// Use Tesseract to find text with bounding boxes
let tess = Tesseract::new(None, Some("eng"))
.map_err(|e| {
anyhow::anyhow!("Failed to initialize Tesseract: {}\n\n\
let tess = Tesseract::new(None, Some("eng")).map_err(|e| {
anyhow::anyhow!(
"Failed to initialize Tesseract: {}\n\n\
This usually means:\n1. Tesseract is not properly installed\n\
2. Language data files are missing\n\nTo fix:\n \
1. Reinstall tesseract from https://github.com/UB-Mannheim/tesseract/wiki\n \
2. Make sure to select 'Additional language data' during installation\n \
3. Ensure tesseract is in your PATH", e)
})?;
let full_text = tess.set_image(temp_path.as_str())
3. Ensure tesseract is in your PATH",
e
)
})?;
let full_text = tess
.set_image(temp_path.as_str())
.map_err(|e| anyhow::anyhow!("Failed to load screenshot: {}", e))?
.get_text()
.map_err(|e| anyhow::anyhow!("Failed to extract text from screen: {}", e))?;
// Clean up temp file
let _ = std::fs::remove_file(&temp_path);
// Simple text search - full implementation would use get_component_images
// to get bounding boxes for each word
if full_text.contains(_text) {
tracing::warn!("Text found but precise coordinates not available in simplified implementation");
tracing::warn!(
"Text found but precise coordinates not available in simplified implementation"
);
Ok(Some(Point { x: 0, y: 0 }))
} else {
Ok(None)

View File

@@ -9,31 +9,31 @@ use serde_json::Value;
pub trait WebDriverController: Send + Sync {
/// Navigate to a URL
async fn navigate(&mut self, url: &str) -> Result<()>;
/// Get the current URL
async fn current_url(&self) -> Result<String>;
/// Get the page title
async fn title(&self) -> Result<String>;
/// Find an element by CSS selector
async fn find_element(&mut self, selector: &str) -> Result<WebElement>;
/// Find multiple elements by CSS selector
async fn find_elements(&mut self, selector: &str) -> Result<Vec<WebElement>>;
/// Execute JavaScript in the browser
async fn execute_script(&mut self, script: &str, args: Vec<Value>) -> Result<Value>;
/// Get the page source (HTML)
async fn page_source(&self) -> Result<String>;
/// Take a screenshot and save to path
async fn screenshot(&mut self, path: &str) -> Result<()>;
/// Close the current window/tab
async fn close(&mut self) -> Result<()>;
/// Quit the browser session
async fn quit(self) -> Result<()>;
}
@@ -49,63 +49,69 @@ impl WebElement {
self.inner.click().await?;
Ok(())
}
/// Send keys/text to the element
pub async fn send_keys(&mut self, text: &str) -> Result<()> {
self.inner.send_keys(text).await?;
Ok(())
}
/// Clear the element's content (for input fields)
pub async fn clear(&mut self) -> Result<()> {
self.inner.clear().await?;
Ok(())
}
/// Get the element's text content
pub async fn text(&self) -> Result<String> {
Ok(self.inner.text().await?)
}
/// Get an attribute value
pub async fn attr(&self, name: &str) -> Result<Option<String>> {
Ok(self.inner.attr(name).await?)
}
/// Get a property value
pub async fn prop(&self, name: &str) -> Result<Option<String>> {
Ok(self.inner.prop(name).await?)
}
/// Get the element's HTML
pub async fn html(&self, inner: bool) -> Result<String> {
Ok(self.inner.html(inner).await?)
}
/// Check if element is displayed
pub async fn is_displayed(&self) -> Result<bool> {
Ok(self.inner.is_displayed().await?)
}
/// Check if element is enabled
pub async fn is_enabled(&self) -> Result<bool> {
Ok(self.inner.is_enabled().await?)
}
/// Check if element is selected (for checkboxes/radio buttons)
pub async fn is_selected(&self) -> Result<bool> {
Ok(self.inner.is_selected().await?)
}
/// Find a child element by CSS selector
pub async fn find_element(&mut self, selector: &str) -> Result<WebElement> {
let elem = self.inner.find(fantoccini::Locator::Css(selector)).await?;
Ok(WebElement { inner: elem })
}
/// Find multiple child elements by CSS selector
pub async fn find_elements(&mut self, selector: &str) -> Result<Vec<WebElement>> {
let elems = self.inner.find_all(fantoccini::Locator::Css(selector)).await?;
Ok(elems.into_iter().map(|inner| WebElement { inner }).collect())
let elems = self
.inner
.find_all(fantoccini::Locator::Css(selector))
.await?;
Ok(elems
.into_iter()
.map(|inner| WebElement { inner })
.collect())
}
}

View File

@@ -12,10 +12,10 @@ pub struct SafariDriver {
impl SafariDriver {
/// Create a new SafariDriver instance
///
///
/// This will connect to SafariDriver running on the default port (4444).
/// Make sure to enable "Allow Remote Automation" in Safari's Develop menu first.
///
///
/// You can start SafariDriver manually with:
/// ```bash
/// /usr/bin/safaridriver --enable
@@ -23,125 +23,134 @@ impl SafariDriver {
pub async fn new() -> Result<Self> {
Self::with_port(4444).await
}
/// Create a new SafariDriver instance with a custom port
pub async fn with_port(port: u16) -> Result<Self> {
let url = format!("http://localhost:{}", port);
let mut caps = serde_json::Map::new();
caps.insert("browserName".to_string(), Value::String("safari".to_string()));
caps.insert(
"browserName".to_string(),
Value::String("safari".to_string()),
);
let client = ClientBuilder::native()
.capabilities(caps)
.connect(&url)
.await
.context("Failed to connect to SafariDriver. Make sure SafariDriver is running and 'Allow Remote Automation' is enabled in Safari's Develop menu.")?;
Ok(Self { client })
}
/// Go back in browser history
pub async fn back(&mut self) -> Result<()> {
self.client.back().await?;
Ok(())
}
/// Go forward in browser history
pub async fn forward(&mut self) -> Result<()> {
self.client.forward().await?;
Ok(())
}
/// Refresh the current page
pub async fn refresh(&mut self) -> Result<()> {
self.client.refresh().await?;
Ok(())
}
/// Get all window handles
pub async fn window_handles(&mut self) -> Result<Vec<String>> {
let handles = self.client.windows().await?;
Ok(handles.into_iter()
.map(|h| h.into())
.collect())
Ok(handles.into_iter().map(|h| h.into()).collect())
}
/// Switch to a window by handle
pub async fn switch_to_window(&mut self, handle: &str) -> Result<()> {
let window_handle: fantoccini::wd::WindowHandle = handle.to_string().try_into()?;
self.client.switch_to_window(window_handle).await?;
Ok(())
}
/// Get the current window handle
pub async fn current_window_handle(&mut self) -> Result<String> {
Ok(self.client.window().await?.into())
}
/// Close the current window
pub async fn close_window(&mut self) -> Result<()> {
self.client.close_window().await?;
Ok(())
}
/// Create a new window/tab
pub async fn new_window(&mut self, is_tab: bool) -> Result<String> {
let window_type = if is_tab { "tab" } else { "window" };
let response = self.client.new_window(window_type == "tab").await?;
Ok(response.handle.into())
}
/// Get cookies
pub async fn get_cookies(&mut self) -> Result<Vec<fantoccini::cookies::Cookie<'static>>> {
Ok(self.client.get_all_cookies().await?)
}
/// Add a cookie
pub async fn add_cookie(&mut self, cookie: fantoccini::cookies::Cookie<'static>) -> Result<()> {
self.client.add_cookie(cookie).await?;
Ok(())
}
/// Delete all cookies
pub async fn delete_all_cookies(&mut self) -> Result<()> {
self.client.delete_all_cookies().await?;
Ok(())
}
/// Wait for an element to appear (with timeout)
pub async fn wait_for_element(&mut self, selector: &str, timeout: Duration) -> Result<WebElement> {
pub async fn wait_for_element(
&mut self,
selector: &str,
timeout: Duration,
) -> Result<WebElement> {
let start = std::time::Instant::now();
let poll_interval = Duration::from_millis(100);
loop {
if let Ok(elem) = self.find_element(selector).await {
return Ok(elem);
}
if start.elapsed() >= timeout {
anyhow::bail!("Timeout waiting for element: {}", selector);
}
tokio::time::sleep(poll_interval).await;
}
}
/// Wait for an element to be visible (with timeout)
pub async fn wait_for_visible(&mut self, selector: &str, timeout: Duration) -> Result<WebElement> {
pub async fn wait_for_visible(
&mut self,
selector: &str,
timeout: Duration,
) -> Result<WebElement> {
let start = std::time::Instant::now();
let poll_interval = Duration::from_millis(100);
loop {
if let Ok(elem) = self.find_element(selector).await {
if elem.is_displayed().await.unwrap_or(false) {
return Ok(elem);
}
}
if start.elapsed() >= timeout {
anyhow::bail!("Timeout waiting for element to be visible: {}", selector);
}
tokio::time::sleep(poll_interval).await;
}
}
@@ -153,58 +162,69 @@ impl WebDriverController for SafariDriver {
self.client.goto(url).await?;
Ok(())
}
async fn current_url(&self) -> Result<String> {
Ok(self.client.current_url().await?.to_string())
}
async fn title(&self) -> Result<String> {
Ok(self.client.title().await?)
}
async fn find_element(&mut self, selector: &str) -> Result<WebElement> {
let elem = self.client.find(fantoccini::Locator::Css(selector)).await
.context(format!("Failed to find element with selector: {}", selector))?;
let elem = self
.client
.find(fantoccini::Locator::Css(selector))
.await
.context(format!(
"Failed to find element with selector: {}",
selector
))?;
Ok(WebElement { inner: elem })
}
async fn find_elements(&mut self, selector: &str) -> Result<Vec<WebElement>> {
let elems = self.client.find_all(fantoccini::Locator::Css(selector)).await?;
Ok(elems.into_iter().map(|inner| WebElement { inner }).collect())
let elems = self
.client
.find_all(fantoccini::Locator::Css(selector))
.await?;
Ok(elems
.into_iter()
.map(|inner| WebElement { inner })
.collect())
}
async fn execute_script(&mut self, script: &str, args: Vec<Value>) -> Result<Value> {
Ok(self.client.execute(script, args).await?)
}
async fn page_source(&self) -> Result<String> {
Ok(self.client.source().await?)
}
async fn screenshot(&mut self, path: &str) -> Result<()> {
let screenshot_data = self.client.screenshot().await?;
// Expand tilde in path
let expanded_path = shellexpand::tilde(path);
let path_str = expanded_path.as_ref();
// Create parent directories if needed
if let Some(parent) = std::path::Path::new(path_str).parent() {
std::fs::create_dir_all(parent)
.context("Failed to create parent directories for screenshot")?;
}
std::fs::write(path_str, screenshot_data)
.context("Failed to write screenshot to file")?;
std::fs::write(path_str, screenshot_data).context("Failed to write screenshot to file")?;
Ok(())
}
async fn close(&mut self) -> Result<()> {
self.client.close_window().await?;
Ok(())
}
async fn quit(mut self) -> Result<()> {
self.client.close().await?;
Ok(())

View File

@@ -3,29 +3,35 @@ use g3_computer_control::*;
#[tokio::test]
async fn test_screenshot() {
let controller = create_controller().expect("Failed to create controller");
// Test that screenshot without window_id fails with appropriate error
let path = "/tmp/test_screenshot.png";
let result = controller.take_screenshot(path, None, None).await;
assert!(result.is_err(), "Expected error when window_id is not provided");
assert!(
result.is_err(),
"Expected error when window_id is not provided"
);
let error_msg = result.unwrap_err().to_string();
assert!(error_msg.contains("window_id is required"),
"Expected error message about window_id being required, got: {}", error_msg);
assert!(
error_msg.contains("window_id is required"),
"Expected error message about window_id being required, got: {}",
error_msg
);
}
#[tokio::test]
async fn test_screenshot_with_window() {
let controller = create_controller().expect("Failed to create controller");
// Take screenshot of Finder (should always be available on macOS)
let path = "/tmp/test_screenshot_finder.png";
let result = controller.take_screenshot(path, None, Some("Finder")).await;
// This test may fail if Finder is not running, so we just check it doesn't panic
// and returns a proper Result
let _ = result; // Don't assert success since Finder might not be visible
// Clean up
let _ = std::fs::remove_file(path);
}