add context window monitor
Writes the current context window to logs/current_context_window (uses a symlink to a session ID). This PR was unfortunately generated by a different LLM, which did a ton of superficial reformatting. It's actually a fairly small and benign change, but I don't want to roll back everything. Hope that's OK.
This commit is contained in:
@@ -34,27 +34,40 @@ fn main() {
|
||||
.expect("Failed to find .build/release directory");
|
||||
|
||||
// Copy the dylib to the output directory so it can be found at runtime
|
||||
let target_dir = manifest_dir.parent().unwrap().parent().unwrap().join("target");
|
||||
let target_dir = manifest_dir
|
||||
.parent()
|
||||
.unwrap()
|
||||
.parent()
|
||||
.unwrap()
|
||||
.join("target");
|
||||
let profile = env::var("PROFILE").unwrap_or_else(|_| "debug".to_string());
|
||||
|
||||
|
||||
// Determine the actual target directory (could be llvm-cov-target or regular target)
|
||||
let target_dir_name = env::var("CARGO_TARGET_DIR")
|
||||
.unwrap_or_else(|_| target_dir.to_string_lossy().to_string());
|
||||
let target_dir_name =
|
||||
env::var("CARGO_TARGET_DIR").unwrap_or_else(|_| target_dir.to_string_lossy().to_string());
|
||||
let actual_target_dir = PathBuf::from(&target_dir_name);
|
||||
let output_dir = actual_target_dir.join(&profile);
|
||||
|
||||
|
||||
let dylib_src = lib_path.join("libVisionBridge.dylib");
|
||||
let dylib_dst = output_dir.join("libVisionBridge.dylib");
|
||||
|
||||
|
||||
// Create output directory if it doesn't exist
|
||||
std::fs::create_dir_all(&output_dir)
|
||||
.expect(&format!("Failed to create output directory {}", output_dir.display()));
|
||||
|
||||
std::fs::copy(&dylib_src, &dylib_dst)
|
||||
.expect(&format!("Failed to copy dylib from {} to {}", dylib_src.display(), dylib_dst.display()));
|
||||
|
||||
println!("cargo:warning=Copied libVisionBridge.dylib to {}", dylib_dst.display());
|
||||
|
||||
std::fs::create_dir_all(&output_dir).expect(&format!(
|
||||
"Failed to create output directory {}",
|
||||
output_dir.display()
|
||||
));
|
||||
|
||||
std::fs::copy(&dylib_src, &dylib_dst).expect(&format!(
|
||||
"Failed to copy dylib from {} to {}",
|
||||
dylib_src.display(),
|
||||
dylib_dst.display()
|
||||
));
|
||||
|
||||
println!(
|
||||
"cargo:warning=Copied libVisionBridge.dylib to {}",
|
||||
dylib_dst.display()
|
||||
);
|
||||
|
||||
// Add rpath so the dylib can be found at runtime
|
||||
println!("cargo:rustc-link-arg=-Wl,-rpath,@executable_path");
|
||||
println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path");
|
||||
@@ -68,5 +81,8 @@ fn main() {
|
||||
println!("cargo:rustc-link-lib=framework=CoreGraphics");
|
||||
println!("cargo:rustc-link-lib=framework=CoreImage");
|
||||
|
||||
println!("cargo:warning=VisionBridge built successfully at {}", lib_path.display());
|
||||
println!(
|
||||
"cargo:warning=VisionBridge built successfully at {}",
|
||||
lib_path.display()
|
||||
);
|
||||
}
|
||||
|
||||
@@ -3,19 +3,19 @@ use core_graphics::display::CGDisplay;
|
||||
fn main() {
|
||||
let display = CGDisplay::main();
|
||||
let image = display.image().expect("Failed to capture screen");
|
||||
|
||||
|
||||
println!("CGImage properties:");
|
||||
println!(" Width: {}", image.width());
|
||||
println!(" Height: {}", image.height());
|
||||
println!(" Bits per component: {}", image.bits_per_component());
|
||||
println!(" Bits per pixel: {}", image.bits_per_pixel());
|
||||
println!(" Bytes per row: {}", image.bytes_per_row());
|
||||
|
||||
|
||||
let data = image.data();
|
||||
let expected_size = image.width() * image.height() * 4;
|
||||
println!(" Data length: {}", data.len());
|
||||
println!(" Expected (w*h*4): {}", expected_size);
|
||||
|
||||
|
||||
// Check if there's padding in rows
|
||||
let bytes_per_row = image.bytes_per_row();
|
||||
let width = image.width();
|
||||
@@ -23,16 +23,25 @@ fn main() {
|
||||
println!("\nRow alignment:");
|
||||
println!(" Actual bytes per row: {}", bytes_per_row);
|
||||
println!(" Expected (width * 4): {}", expected_bytes_per_row);
|
||||
println!(" Padding per row: {}", bytes_per_row - expected_bytes_per_row);
|
||||
|
||||
println!(
|
||||
" Padding per row: {}",
|
||||
bytes_per_row - expected_bytes_per_row
|
||||
);
|
||||
|
||||
// Sample some pixels from different locations
|
||||
println!("\nFirst 3 pixels (raw bytes):");
|
||||
for i in 0..3 {
|
||||
let offset = i * 4;
|
||||
println!(" Pixel {}: [{:3}, {:3}, {:3}, {:3}]",
|
||||
i, data[offset], data[offset+1], data[offset+2], data[offset+3]);
|
||||
println!(
|
||||
" Pixel {}: [{:3}, {:3}, {:3}, {:3}]",
|
||||
i,
|
||||
data[offset],
|
||||
data[offset + 1],
|
||||
data[offset + 2],
|
||||
data[offset + 3]
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
// Check a pixel from the middle
|
||||
let mid_row = image.height() / 2;
|
||||
let mid_col = image.width() / 2;
|
||||
@@ -40,7 +49,12 @@ fn main() {
|
||||
println!("\nMiddle pixel (row {}, col {}):", mid_row, mid_col);
|
||||
println!(" Offset: {}", mid_offset);
|
||||
if mid_offset + 3 < data.len() as usize {
|
||||
println!(" Bytes: [{:3}, {:3}, {:3}, {:3}]",
|
||||
data[mid_offset], data[mid_offset+1], data[mid_offset+2], data[mid_offset+3]);
|
||||
println!(
|
||||
" Bytes: [{:3}, {:3}, {:3}, {:3}]",
|
||||
data[mid_offset],
|
||||
data[mid_offset + 1],
|
||||
data[mid_offset + 2],
|
||||
data[mid_offset + 3]
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,34 +1,38 @@
|
||||
use core_graphics::window::{kCGWindowListOptionOnScreenOnly, kCGNullWindowID, CGWindowListCopyWindowInfo};
|
||||
use core_foundation::base::{TCFType, ToVoid};
|
||||
use core_foundation::dictionary::CFDictionary;
|
||||
use core_foundation::string::CFString;
|
||||
use core_foundation::base::{TCFType, ToVoid};
|
||||
use core_graphics::window::{
|
||||
kCGNullWindowID, kCGWindowListOptionOnScreenOnly, CGWindowListCopyWindowInfo,
|
||||
};
|
||||
|
||||
fn main() {
|
||||
println!("Listing all on-screen windows...");
|
||||
println!("{:<10} {:<25} {}", "Window ID", "Owner", "Title");
|
||||
println!("{}", "-".repeat(80));
|
||||
|
||||
|
||||
unsafe {
|
||||
let window_list = CGWindowListCopyWindowInfo(
|
||||
kCGWindowListOptionOnScreenOnly,
|
||||
kCGNullWindowID
|
||||
);
|
||||
|
||||
let count = core_foundation::array::CFArray::<CFDictionary>::wrap_under_create_rule(window_list).len();
|
||||
let array = core_foundation::array::CFArray::<CFDictionary>::wrap_under_create_rule(window_list);
|
||||
|
||||
let window_list =
|
||||
CGWindowListCopyWindowInfo(kCGWindowListOptionOnScreenOnly, kCGNullWindowID);
|
||||
|
||||
let count =
|
||||
core_foundation::array::CFArray::<CFDictionary>::wrap_under_create_rule(window_list)
|
||||
.len();
|
||||
let array =
|
||||
core_foundation::array::CFArray::<CFDictionary>::wrap_under_create_rule(window_list);
|
||||
|
||||
for i in 0..count {
|
||||
let dict = array.get(i).unwrap();
|
||||
|
||||
|
||||
// Get window ID
|
||||
let window_id_key = CFString::from_static_string("kCGWindowNumber");
|
||||
let window_id: i64 = if let Some(value) = dict.find(window_id_key.to_void()) {
|
||||
let num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*value as *const _);
|
||||
let num: core_foundation::number::CFNumber =
|
||||
TCFType::wrap_under_get_rule(*value as *const _);
|
||||
num.to_i64().unwrap_or(0)
|
||||
} else {
|
||||
0
|
||||
};
|
||||
|
||||
|
||||
// Get owner name
|
||||
let owner_key = CFString::from_static_string("kCGWindowOwnerName");
|
||||
let owner: String = if let Some(value) = dict.find(owner_key.to_void()) {
|
||||
@@ -37,7 +41,7 @@ fn main() {
|
||||
} else {
|
||||
"Unknown".to_string()
|
||||
};
|
||||
|
||||
|
||||
// Get window name/title
|
||||
let name_key = CFString::from_static_string("kCGWindowName");
|
||||
let title: String = if let Some(value) = dict.find(name_key.to_void()) {
|
||||
@@ -46,7 +50,7 @@ fn main() {
|
||||
} else {
|
||||
"".to_string()
|
||||
};
|
||||
|
||||
|
||||
// Show all windows
|
||||
if !owner.is_empty() {
|
||||
println!("{:<10} {:<25} {}", window_id, owner, title);
|
||||
|
||||
@@ -11,11 +11,11 @@ use g3_computer_control::MacAxController;
|
||||
async fn main() -> Result<()> {
|
||||
println!("🍎 macOS Accessibility API Demo\n");
|
||||
println!("This demo shows how to control macOS applications using the Accessibility API.\n");
|
||||
|
||||
|
||||
// Create controller
|
||||
let controller = MacAxController::new()?;
|
||||
println!("✅ MacAxController initialized\n");
|
||||
|
||||
|
||||
// List running applications
|
||||
println!("📱 Listing running applications:");
|
||||
match controller.list_applications() {
|
||||
@@ -30,7 +30,7 @@ async fn main() -> Result<()> {
|
||||
Err(e) => println!(" ❌ Error: {}", e),
|
||||
}
|
||||
println!();
|
||||
|
||||
|
||||
// Get frontmost app
|
||||
println!("🎯 Getting frontmost application:");
|
||||
match controller.get_frontmost_app() {
|
||||
@@ -38,16 +38,16 @@ async fn main() -> Result<()> {
|
||||
Err(e) => println!(" ❌ Error: {}", e),
|
||||
}
|
||||
println!();
|
||||
|
||||
|
||||
// Example: Activate Finder and get its UI tree
|
||||
println!("📂 Activating Finder and inspecting UI:");
|
||||
match controller.activate_app("Finder") {
|
||||
Ok(_) => {
|
||||
println!(" ✅ Finder activated");
|
||||
|
||||
|
||||
// Wait a moment for activation
|
||||
tokio::time::sleep(tokio::time::Duration::from_millis(500)).await;
|
||||
|
||||
|
||||
// Get UI tree
|
||||
match controller.get_ui_tree("Finder", 2) {
|
||||
Ok(tree) => {
|
||||
@@ -62,13 +62,13 @@ async fn main() -> Result<()> {
|
||||
Err(e) => println!(" ❌ Error: {}", e),
|
||||
}
|
||||
println!();
|
||||
|
||||
|
||||
println!("✨ Demo complete!\n");
|
||||
println!("💡 Tips:");
|
||||
println!(" - Use --macax flag with g3 to enable these tools");
|
||||
println!(" - Grant accessibility permissions in System Preferences");
|
||||
println!(" - Add accessibility identifiers to your apps for easier automation");
|
||||
println!(" - See docs/macax-tools.md for full documentation\n");
|
||||
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1,64 +1,66 @@
|
||||
use g3_computer_control::SafariDriver;
|
||||
use g3_computer_control::webdriver::WebDriverController;
|
||||
use anyhow::Result;
|
||||
use g3_computer_control::webdriver::WebDriverController;
|
||||
use g3_computer_control::SafariDriver;
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<()> {
|
||||
println!("Safari WebDriver Demo");
|
||||
println!("=====================\n");
|
||||
|
||||
|
||||
println!("Make sure to:");
|
||||
println!("1. Enable 'Allow Remote Automation' in Safari's Develop menu");
|
||||
println!("2. Run: /usr/bin/safaridriver --enable");
|
||||
println!("3. Start safaridriver in another terminal: safaridriver --port 4444\n");
|
||||
|
||||
|
||||
println!("Connecting to SafariDriver...");
|
||||
let mut driver = SafariDriver::new().await?;
|
||||
println!("✅ Connected!\n");
|
||||
|
||||
|
||||
// Navigate to a website
|
||||
println!("Navigating to example.com...");
|
||||
driver.navigate("https://example.com").await?;
|
||||
println!("✅ Navigated\n");
|
||||
|
||||
|
||||
// Get page title
|
||||
let title = driver.title().await?;
|
||||
println!("Page title: {}\n", title);
|
||||
|
||||
|
||||
// Get current URL
|
||||
let url = driver.current_url().await?;
|
||||
println!("Current URL: {}\n", url);
|
||||
|
||||
|
||||
// Find an element
|
||||
println!("Finding h1 element...");
|
||||
let h1 = driver.find_element("h1").await?;
|
||||
let h1_text = h1.text().await?;
|
||||
println!("H1 text: {}\n", h1_text);
|
||||
|
||||
|
||||
// Find all paragraphs
|
||||
println!("Finding all paragraphs...");
|
||||
let paragraphs = driver.find_elements("p").await?;
|
||||
println!("Found {} paragraphs\n", paragraphs.len());
|
||||
|
||||
|
||||
// Get page source
|
||||
println!("Getting page source...");
|
||||
let source = driver.page_source().await?;
|
||||
println!("Page source length: {} bytes\n", source.len());
|
||||
|
||||
|
||||
// Execute JavaScript
|
||||
println!("Executing JavaScript...");
|
||||
let result = driver.execute_script("return document.title", vec![]).await?;
|
||||
let result = driver
|
||||
.execute_script("return document.title", vec![])
|
||||
.await?;
|
||||
println!("JS result: {:?}\n", result);
|
||||
|
||||
|
||||
// Take a screenshot
|
||||
println!("Taking screenshot...");
|
||||
driver.screenshot("/tmp/safari_demo.png").await?;
|
||||
println!("✅ Screenshot saved to /tmp/safari_demo.png\n");
|
||||
|
||||
|
||||
// Close the browser
|
||||
println!("Closing browser...");
|
||||
driver.quit().await?;
|
||||
println!("✅ Done!");
|
||||
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -3,10 +3,13 @@ use g3_computer_control::create_controller;
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
println!("Testing screenshot with permission prompt...");
|
||||
|
||||
|
||||
let controller = create_controller().expect("Failed to create controller");
|
||||
|
||||
match controller.take_screenshot("/tmp/test_with_prompt.png", None, None).await {
|
||||
|
||||
match controller
|
||||
.take_screenshot("/tmp/test_with_prompt.png", None, None)
|
||||
.await
|
||||
{
|
||||
Ok(_) => {
|
||||
println!("\n✅ Screenshot saved to /tmp/test_with_prompt.png");
|
||||
println!("Opening screenshot...");
|
||||
|
||||
@@ -2,29 +2,33 @@ use std::process::Command;
|
||||
|
||||
fn main() {
|
||||
let path = "/tmp/rust_screencapture_test.png";
|
||||
|
||||
|
||||
println!("Testing screencapture command from Rust...");
|
||||
|
||||
|
||||
let mut cmd = Command::new("screencapture");
|
||||
cmd.arg("-x"); // No sound
|
||||
cmd.arg(path);
|
||||
|
||||
|
||||
println!("Command: {:?}", cmd);
|
||||
|
||||
|
||||
match cmd.output() {
|
||||
Ok(output) => {
|
||||
println!("Exit status: {}", output.status);
|
||||
println!("Stdout: {}", String::from_utf8_lossy(&output.stdout));
|
||||
println!("Stderr: {}", String::from_utf8_lossy(&output.stderr));
|
||||
|
||||
|
||||
if output.status.success() {
|
||||
println!("\n✅ Screenshot saved to: {}", path);
|
||||
|
||||
|
||||
// Check file exists and size
|
||||
if let Ok(metadata) = std::fs::metadata(path) {
|
||||
println!("File size: {} bytes ({:.1} MB)", metadata.len(), metadata.len() as f64 / 1_000_000.0);
|
||||
println!(
|
||||
"File size: {} bytes ({:.1} MB)",
|
||||
metadata.len(),
|
||||
metadata.len() as f64 / 1_000_000.0
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
// Open it
|
||||
let _ = Command::new("open").arg(path).spawn();
|
||||
println!("\nOpened screenshot - please verify it looks correct!");
|
||||
|
||||
@@ -4,17 +4,23 @@ use image::{ImageBuffer, RgbaImage};
|
||||
fn main() {
|
||||
let display = CGDisplay::main();
|
||||
let image = display.image().expect("Failed to capture screen");
|
||||
|
||||
|
||||
let width = image.width() as u32;
|
||||
let height = image.height() as u32;
|
||||
let bytes_per_row = image.bytes_per_row() as usize;
|
||||
let data = image.data();
|
||||
|
||||
|
||||
println!("Testing screenshot fix...");
|
||||
println!("Image: {}x{}, bytes_per_row: {}", width, height, bytes_per_row);
|
||||
println!(
|
||||
"Image: {}x{}, bytes_per_row: {}",
|
||||
width, height, bytes_per_row
|
||||
);
|
||||
println!("Expected bytes per row: {}", width * 4);
|
||||
println!("Padding per row: {} bytes", bytes_per_row - (width as usize * 4));
|
||||
|
||||
println!(
|
||||
"Padding per row: {} bytes",
|
||||
bytes_per_row - (width as usize * 4)
|
||||
);
|
||||
|
||||
// OLD METHOD (broken) - treating data as continuous
|
||||
println!("\n=== OLD METHOD (BROKEN) ===");
|
||||
let mut old_rgba = Vec::with_capacity(data.len() as usize);
|
||||
@@ -26,14 +32,14 @@ fn main() {
|
||||
}
|
||||
println!("Converted {} pixels", old_rgba.len() / 4);
|
||||
println!("Expected {} pixels", width * height);
|
||||
|
||||
|
||||
// NEW METHOD (fixed) - handling row padding
|
||||
println!("\n=== NEW METHOD (FIXED) ===");
|
||||
let mut new_rgba = Vec::with_capacity((width * height * 4) as usize);
|
||||
for row in 0..height as usize {
|
||||
let row_start = row * bytes_per_row;
|
||||
let row_end = row_start + (width as usize * 4);
|
||||
|
||||
|
||||
for chunk in data[row_start..row_end].chunks_exact(4) {
|
||||
new_rgba.push(chunk[2]); // R
|
||||
new_rgba.push(chunk[1]); // G
|
||||
@@ -43,26 +49,34 @@ fn main() {
|
||||
}
|
||||
println!("Converted {} pixels", new_rgba.len() / 4);
|
||||
println!("Expected {} pixels", width * height);
|
||||
|
||||
|
||||
// Save a small crop from both methods
|
||||
let crop_size = 200;
|
||||
|
||||
|
||||
// Old method crop
|
||||
let old_crop: Vec<u8> = old_rgba.iter().take((crop_size * crop_size * 4) as usize).copied().collect();
|
||||
let old_crop: Vec<u8> = old_rgba
|
||||
.iter()
|
||||
.take((crop_size * crop_size * 4) as usize)
|
||||
.copied()
|
||||
.collect();
|
||||
if let Some(old_img) = ImageBuffer::from_raw(crop_size, crop_size, old_crop) {
|
||||
let old_img: RgbaImage = old_img;
|
||||
old_img.save("/tmp/screenshot_old_method.png").unwrap();
|
||||
println!("\nSaved OLD method crop to: /tmp/screenshot_old_method.png");
|
||||
}
|
||||
|
||||
|
||||
// New method crop
|
||||
let new_crop: Vec<u8> = new_rgba.iter().take((crop_size * crop_size * 4) as usize).copied().collect();
|
||||
let new_crop: Vec<u8> = new_rgba
|
||||
.iter()
|
||||
.take((crop_size * crop_size * 4) as usize)
|
||||
.copied()
|
||||
.collect();
|
||||
if let Some(new_img) = ImageBuffer::from_raw(crop_size, crop_size, new_crop) {
|
||||
let new_img: RgbaImage = new_img;
|
||||
new_img.save("/tmp/screenshot_new_method.png").unwrap();
|
||||
println!("Saved NEW method crop to: /tmp/screenshot_new_method.png");
|
||||
}
|
||||
|
||||
|
||||
println!("\nOpen both images to compare:");
|
||||
println!(" open /tmp/screenshot_old_method.png /tmp/screenshot_new_method.png");
|
||||
}
|
||||
|
||||
@@ -6,43 +6,43 @@ use g3_computer_control::MacAxController;
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<()> {
|
||||
println!("🧪 Testing macax type_text functionality\n");
|
||||
|
||||
|
||||
let controller = MacAxController::new()?;
|
||||
println!("✅ Controller initialized\n");
|
||||
|
||||
|
||||
// Test 1: Type simple text
|
||||
println!("Test 1: Typing simple text into TextEdit");
|
||||
println!(" Please open TextEdit and create a new document...");
|
||||
std::thread::sleep(std::time::Duration::from_secs(3));
|
||||
|
||||
|
||||
match controller.type_text("TextEdit", "Hello, World!") {
|
||||
Ok(_) => println!(" ✅ Successfully typed simple text\n"),
|
||||
Err(e) => println!(" ❌ Failed: {}\n", e),
|
||||
}
|
||||
|
||||
|
||||
std::thread::sleep(std::time::Duration::from_secs(1));
|
||||
|
||||
|
||||
// Test 2: Type unicode and emojis
|
||||
println!("Test 2: Typing unicode and emojis");
|
||||
match controller.type_text("TextEdit", "\n🌟 Unicode test: café, naïve, 日本語 🎉") {
|
||||
Ok(_) => println!(" ✅ Successfully typed unicode text\n"),
|
||||
Err(e) => println!(" ❌ Failed: {}\n", e),
|
||||
}
|
||||
|
||||
|
||||
std::thread::sleep(std::time::Duration::from_secs(1));
|
||||
|
||||
|
||||
// Test 3: Type special characters
|
||||
println!("Test 3: Typing special characters");
|
||||
match controller.type_text("TextEdit", "\nSpecial: @#$%^&*()_+-=[]{}|;':,.<>?/") {
|
||||
Ok(_) => println!(" ✅ Successfully typed special characters\n"),
|
||||
Err(e) => println!(" ❌ Failed: {}\n", e),
|
||||
}
|
||||
|
||||
|
||||
println!("\n✨ Tests complete!");
|
||||
println!("\n💡 Now try with Things3:");
|
||||
println!(" 1. Open Things3");
|
||||
println!(" 2. Press Cmd+N to create a new task");
|
||||
println!(" 3. Run: g3 --macax 'type \"🌟 My awesome task\" into Things'");
|
||||
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1,63 +1,67 @@
|
||||
use g3_computer_control::ocr::{OCREngine, DefaultOCR};
|
||||
use anyhow::Result;
|
||||
use g3_computer_control::ocr::{DefaultOCR, OCREngine};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<()> {
|
||||
println!("🧪 Testing Apple Vision OCR");
|
||||
println!("===========================\n");
|
||||
|
||||
|
||||
// Initialize OCR engine
|
||||
println!("📦 Initializing OCR engine...");
|
||||
let ocr = DefaultOCR::new()?;
|
||||
println!("✅ OCR engine: {}\n", ocr.name());
|
||||
|
||||
|
||||
// Check if test image exists
|
||||
let test_image = "/tmp/safari_test.png";
|
||||
if !std::path::Path::new(test_image).exists() {
|
||||
println!("⚠️ Test image not found: {}", test_image);
|
||||
println!(" Creating a screenshot...");
|
||||
|
||||
|
||||
let status = std::process::Command::new("screencapture")
|
||||
.arg("-x")
|
||||
.arg("-R")
|
||||
.arg("0,0,1200,800")
|
||||
.arg(test_image)
|
||||
.status()?;
|
||||
|
||||
|
||||
if !status.success() {
|
||||
anyhow::bail!("Failed to create screenshot");
|
||||
}
|
||||
|
||||
|
||||
println!("✅ Screenshot created\n");
|
||||
}
|
||||
|
||||
|
||||
// Run OCR
|
||||
println!("🔍 Running Apple Vision OCR on {}...", test_image);
|
||||
let start = std::time::Instant::now();
|
||||
let locations = ocr.extract_text_with_locations(test_image).await?;
|
||||
let duration = start.elapsed();
|
||||
|
||||
|
||||
println!("✅ OCR completed in {:.3}s\n", duration.as_secs_f64());
|
||||
|
||||
|
||||
// Display results
|
||||
println!("📊 Results:");
|
||||
println!(" Found {} text elements\n", locations.len());
|
||||
|
||||
|
||||
if locations.is_empty() {
|
||||
println!("⚠️ No text found in image");
|
||||
} else {
|
||||
println!(" Top 20 results:");
|
||||
println!(" {:<4} {:<40} {:<15} {:<12} {:<8}", "#", "Text", "Position", "Size", "Conf");
|
||||
println!(
|
||||
" {:<4} {:<40} {:<15} {:<12} {:<8}",
|
||||
"#", "Text", "Position", "Size", "Conf"
|
||||
);
|
||||
println!(" {}", "-".repeat(85));
|
||||
|
||||
|
||||
for (i, loc) in locations.iter().take(20).enumerate() {
|
||||
let text = if loc.text.len() > 37 {
|
||||
format!("{}...", &loc.text[..37])
|
||||
} else {
|
||||
loc.text.clone()
|
||||
};
|
||||
|
||||
println!(" {:<4} {:<40} ({:>4},{:>4}) {:>4}x{:<4} {:.2}",
|
||||
|
||||
println!(
|
||||
" {:<4} {:<40} ({:>4},{:>4}) {:>4}x{:<4} {:.2}",
|
||||
i + 1,
|
||||
text,
|
||||
loc.x,
|
||||
@@ -67,19 +71,22 @@ async fn main() -> Result<()> {
|
||||
loc.confidence
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
if locations.len() > 20 {
|
||||
println!("\n ... and {} more", locations.len() - 20);
|
||||
}
|
||||
|
||||
|
||||
// Performance comparison
|
||||
println!("\n📈 Performance:");
|
||||
println!(" OCR Speed: {:.3}s", duration.as_secs_f64());
|
||||
println!(" Text elements: {}", locations.len());
|
||||
println!(" Avg per element: {:.1}ms", duration.as_millis() as f64 / locations.len() as f64);
|
||||
println!(
|
||||
" Avg per element: {:.1}ms",
|
||||
duration.as_millis() as f64 / locations.len() as f64
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
println!("\n✅ Test complete!");
|
||||
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -3,36 +3,46 @@ use g3_computer_control::create_controller;
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
println!("Testing window-specific screenshot capture...");
|
||||
|
||||
|
||||
let controller = create_controller().expect("Failed to create controller");
|
||||
|
||||
|
||||
// Test 1: Capture iTerm2 window
|
||||
println!("\n1. Capturing iTerm2 window...");
|
||||
match controller.take_screenshot("/tmp/iterm_window.png", None, Some("iTerm2")).await {
|
||||
match controller
|
||||
.take_screenshot("/tmp/iterm_window.png", None, Some("iTerm2"))
|
||||
.await
|
||||
{
|
||||
Ok(_) => {
|
||||
println!(" ✅ iTerm2 window captured to /tmp/iterm_window.png");
|
||||
let _ = std::process::Command::new("open").arg("/tmp/iterm_window.png").spawn();
|
||||
let _ = std::process::Command::new("open")
|
||||
.arg("/tmp/iterm_window.png")
|
||||
.spawn();
|
||||
}
|
||||
Err(e) => println!(" ❌ Failed: {}", e),
|
||||
}
|
||||
|
||||
|
||||
// Wait a moment for the image to open
|
||||
tokio::time::sleep(tokio::time::Duration::from_secs(2)).await;
|
||||
|
||||
|
||||
// Test 2: Full screen capture for comparison
|
||||
println!("\n2. Capturing full screen for comparison...");
|
||||
match controller.take_screenshot("/tmp/fullscreen.png", None, None).await {
|
||||
match controller
|
||||
.take_screenshot("/tmp/fullscreen.png", None, None)
|
||||
.await
|
||||
{
|
||||
Ok(_) => {
|
||||
println!(" ✅ Full screen captured to /tmp/fullscreen.png");
|
||||
let _ = std::process::Command::new("open").arg("/tmp/fullscreen.png").spawn();
|
||||
let _ = std::process::Command::new("open")
|
||||
.arg("/tmp/fullscreen.png")
|
||||
.spawn();
|
||||
}
|
||||
Err(e) => println!(" ❌ Failed: {}", e),
|
||||
}
|
||||
|
||||
|
||||
println!("\n=== Comparison ===");
|
||||
println!("iTerm window: /tmp/iterm_window.png (should show ONLY iTerm window)");
|
||||
println!("Full screen: /tmp/fullscreen.png (should show entire desktop)");
|
||||
|
||||
|
||||
// Show file sizes
|
||||
if let Ok(meta1) = std::fs::metadata("/tmp/iterm_window.png") {
|
||||
if let Ok(meta2) = std::fs::metadata("/tmp/fullscreen.png") {
|
||||
|
||||
@@ -1,17 +1,17 @@
|
||||
// Suppress warnings from objc crate macros
|
||||
#![allow(unexpected_cfgs)]
|
||||
|
||||
pub mod types;
|
||||
pub mod platform;
|
||||
pub mod ocr;
|
||||
pub mod webdriver;
|
||||
pub mod macax;
|
||||
pub mod ocr;
|
||||
pub mod platform;
|
||||
pub mod types;
|
||||
pub mod webdriver;
|
||||
|
||||
// Re-export webdriver types for convenience
|
||||
pub use webdriver::{WebDriverController, WebElement, safari::SafariDriver};
|
||||
pub use webdriver::{safari::SafariDriver, WebDriverController, WebElement};
|
||||
|
||||
// Re-export macax types for convenience
|
||||
pub use macax::{MacAxController, AXElement, AXApplication};
|
||||
pub use macax::{AXApplication, AXElement, MacAxController};
|
||||
|
||||
use anyhow::Result;
|
||||
use async_trait::async_trait;
|
||||
@@ -20,14 +20,23 @@ use types::*;
|
||||
#[async_trait]
|
||||
pub trait ComputerController: Send + Sync {
|
||||
// Screen capture
|
||||
async fn take_screenshot(&self, path: &str, region: Option<Rect>, window_id: Option<&str>) -> Result<()>;
|
||||
|
||||
async fn take_screenshot(
|
||||
&self,
|
||||
path: &str,
|
||||
region: Option<Rect>,
|
||||
window_id: Option<&str>,
|
||||
) -> Result<()>;
|
||||
|
||||
// OCR operations
|
||||
async fn extract_text_from_screen(&self, region: Rect, window_id: &str) -> Result<String>;
|
||||
async fn extract_text_from_image(&self, path: &str) -> Result<String>;
|
||||
async fn extract_text_with_locations(&self, path: &str) -> Result<Vec<TextLocation>>;
|
||||
async fn find_text_in_app(&self, app_name: &str, search_text: &str) -> Result<Option<TextLocation>>;
|
||||
|
||||
async fn find_text_in_app(
|
||||
&self,
|
||||
app_name: &str,
|
||||
search_text: &str,
|
||||
) -> Result<Option<TextLocation>>;
|
||||
|
||||
// Mouse operations
|
||||
fn move_mouse(&self, x: i32, y: i32) -> Result<()>;
|
||||
fn click_at(&self, x: i32, y: i32, app_name: Option<&str>) -> Result<()>;
|
||||
@@ -37,13 +46,13 @@ pub trait ComputerController: Send + Sync {
|
||||
pub fn create_controller() -> Result<Box<dyn ComputerController>> {
|
||||
#[cfg(target_os = "macos")]
|
||||
return Ok(Box::new(platform::macos::MacOSController::new()?));
|
||||
|
||||
|
||||
#[cfg(target_os = "linux")]
|
||||
return Ok(Box::new(platform::linux::LinuxController::new()?));
|
||||
|
||||
|
||||
#[cfg(target_os = "windows")]
|
||||
return Ok(Box::new(platform::windows::WindowsController::new()?));
|
||||
|
||||
|
||||
#[cfg(not(any(target_os = "macos", target_os = "linux", target_os = "windows")))]
|
||||
anyhow::bail!("Unsupported platform")
|
||||
}
|
||||
|
||||
@@ -3,7 +3,9 @@ use anyhow::{Context, Result};
|
||||
use std::collections::HashMap;
|
||||
|
||||
#[cfg(target_os = "macos")]
|
||||
use accessibility::{AXUIElement, AXUIElementAttributes, ElementFinder, TreeVisitor, TreeWalker, TreeWalkerFlow};
|
||||
use accessibility::{
|
||||
AXUIElement, AXUIElementAttributes, ElementFinder, TreeVisitor, TreeWalker, TreeWalkerFlow,
|
||||
};
|
||||
|
||||
#[cfg(target_os = "macos")]
|
||||
use core_foundation::base::TCFType;
|
||||
@@ -23,46 +25,46 @@ impl MacAxController {
|
||||
{
|
||||
// Check if we have accessibility permissions by trying to get system-wide element
|
||||
let _system = AXUIElement::system_wide();
|
||||
|
||||
|
||||
Ok(Self {
|
||||
app_cache: std::sync::Mutex::new(HashMap::new()),
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
#[cfg(not(target_os = "macos"))]
|
||||
{
|
||||
anyhow::bail!("macOS Accessibility API is only available on macOS")
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// List all running applications
|
||||
#[cfg(target_os = "macos")]
|
||||
pub fn list_applications(&self) -> Result<Vec<AXApplication>> {
|
||||
let apps = Self::get_running_applications()?;
|
||||
Ok(apps)
|
||||
}
|
||||
|
||||
|
||||
#[cfg(not(target_os = "macos"))]
|
||||
pub fn list_applications(&self) -> Result<Vec<AXApplication>> {
|
||||
anyhow::bail!("Not supported on this platform")
|
||||
}
|
||||
|
||||
|
||||
#[cfg(target_os = "macos")]
|
||||
fn get_running_applications() -> Result<Vec<AXApplication>> {
|
||||
use cocoa::appkit::NSApplicationActivationPolicy;
|
||||
use cocoa::base::{id, nil};
|
||||
use objc::{class, msg_send, sel, sel_impl};
|
||||
|
||||
|
||||
unsafe {
|
||||
let workspace: id = msg_send![class!(NSWorkspace), sharedWorkspace];
|
||||
let running_apps: id = msg_send![workspace, runningApplications];
|
||||
let count: usize = msg_send![running_apps, count];
|
||||
|
||||
|
||||
let mut apps = Vec::new();
|
||||
|
||||
|
||||
for i in 0..count {
|
||||
let app: id = msg_send![running_apps, objectAtIndex: i];
|
||||
|
||||
|
||||
// Get app name
|
||||
let localized_name: id = msg_send![app, localizedName];
|
||||
if localized_name == nil {
|
||||
@@ -76,7 +78,7 @@ impl MacAxController {
|
||||
} else {
|
||||
continue;
|
||||
};
|
||||
|
||||
|
||||
// Get bundle ID
|
||||
let bundle_id_obj: id = msg_send![app, bundleIdentifier];
|
||||
let bundle_id = if bundle_id_obj != nil {
|
||||
@@ -93,13 +95,15 @@ impl MacAxController {
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
|
||||
// Get PID
|
||||
let pid: i32 = msg_send![app, processIdentifier];
|
||||
|
||||
|
||||
// Skip background-only apps
|
||||
let activation_policy: i64 = msg_send![app, activationPolicy];
|
||||
if activation_policy == NSApplicationActivationPolicy::NSApplicationActivationPolicyRegular as i64 {
|
||||
if activation_policy
|
||||
== NSApplicationActivationPolicy::NSApplicationActivationPolicyRegular as i64
|
||||
{
|
||||
apps.push(AXApplication {
|
||||
name,
|
||||
bundle_id,
|
||||
@@ -107,32 +111,32 @@ impl MacAxController {
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Ok(apps)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Get the frontmost (active) application
|
||||
#[cfg(target_os = "macos")]
|
||||
pub fn get_frontmost_app(&self) -> Result<AXApplication> {
|
||||
use cocoa::base::{id, nil};
|
||||
use objc::{class, msg_send, sel, sel_impl};
|
||||
|
||||
|
||||
unsafe {
|
||||
let workspace: id = msg_send![class!(NSWorkspace), sharedWorkspace];
|
||||
let frontmost_app: id = msg_send![workspace, frontmostApplication];
|
||||
|
||||
|
||||
if frontmost_app == nil {
|
||||
anyhow::bail!("No frontmost application");
|
||||
}
|
||||
|
||||
|
||||
// Get app name
|
||||
let localized_name: id = msg_send![frontmost_app, localizedName];
|
||||
let name_ptr: *const i8 = msg_send![localized_name, UTF8String];
|
||||
let name = std::ffi::CStr::from_ptr(name_ptr)
|
||||
.to_string_lossy()
|
||||
.to_string();
|
||||
|
||||
|
||||
// Get bundle ID
|
||||
let bundle_id_obj: id = msg_send![frontmost_app, bundleIdentifier];
|
||||
let bundle_id = if bundle_id_obj != nil {
|
||||
@@ -149,10 +153,10 @@ impl MacAxController {
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
|
||||
// Get PID
|
||||
let pid: i32 = msg_send![frontmost_app, processIdentifier];
|
||||
|
||||
|
||||
Ok(AXApplication {
|
||||
name,
|
||||
bundle_id,
|
||||
@@ -160,12 +164,12 @@ impl MacAxController {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[cfg(not(target_os = "macos"))]
|
||||
pub fn get_frontmost_app(&self) -> Result<AXApplication> {
|
||||
anyhow::bail!("Not supported on this platform")
|
||||
}
|
||||
|
||||
|
||||
/// Get AXUIElement for an application by name or PID
|
||||
#[cfg(target_os = "macos")]
|
||||
fn get_app_element(&self, app_name: &str) -> Result<AXUIElement> {
|
||||
@@ -176,79 +180,79 @@ impl MacAxController {
|
||||
return Ok(element.clone());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Find the app by name
|
||||
let apps = Self::get_running_applications()?;
|
||||
let app = apps
|
||||
.iter()
|
||||
.find(|a| a.name == app_name)
|
||||
.ok_or_else(|| anyhow::anyhow!("Application '{}' not found", app_name))?;
|
||||
|
||||
|
||||
// Create AXUIElement for the app
|
||||
let element = AXUIElement::application(app.pid);
|
||||
|
||||
|
||||
// Cache it
|
||||
{
|
||||
let mut cache = self.app_cache.lock().unwrap();
|
||||
cache.insert(app_name.to_string(), element.clone());
|
||||
}
|
||||
|
||||
|
||||
Ok(element)
|
||||
}
|
||||
|
||||
|
||||
/// Activate (bring to front) an application.
///
/// Resolves `app_name` to a PID via `get_running_applications`, then walks
/// `NSWorkspace`'s `runningApplications` and sends `activateWithOptions:` to
/// the entry whose `processIdentifier` matches.
///
/// # Errors
///
/// Returns an error if no application with exactly this name is found, or if
/// no running application with the resolved PID is seen during the scan.
#[cfg(target_os = "macos")]
pub fn activate_app(&self, app_name: &str) -> Result<()> {
    use cocoa::base::id;
    use objc::{class, msg_send, sel, sel_impl};

    // Find the app
    let apps = Self::get_running_applications()?;
    let app = apps
        .iter()
        .find(|a| a.name == app_name)
        .ok_or_else(|| anyhow::anyhow!("Application '{}' not found", app_name))?;

    // `activateWithOptions:` takes an NSUInteger bitmask. `msg_send!` does no
    // implicit conversion, so the argument must be pointer-width; a bare `0`
    // literal would be inferred as `i32`.
    let activation_options: usize = 0;

    unsafe {
        let workspace: id = msg_send![class!(NSWorkspace), sharedWorkspace];
        let running_apps: id = msg_send![workspace, runningApplications];
        let count: usize = msg_send![running_apps, count];

        for i in 0..count {
            let running_app: id = msg_send![running_apps, objectAtIndex: i];
            let pid: i32 = msg_send![running_app, processIdentifier];

            if pid == app.pid {
                // The BOOL result is intentionally ignored (best effort).
                let _: bool = msg_send![running_app, activateWithOptions: activation_options];
                return Ok(());
            }
        }
    }

    anyhow::bail!("Failed to activate application")
}
|
||||
|
||||
|
||||
#[cfg(not(target_os = "macos"))]
|
||||
pub fn activate_app(&self, _app_name: &str) -> Result<()> {
|
||||
anyhow::bail!("Not supported on this platform")
|
||||
}
|
||||
|
||||
|
||||
/// Get the UI hierarchy of an application as text.
///
/// The result starts with an `Application: <name>` line, followed by
/// whatever `build_ui_tree` appends for the application's root element,
/// limited by `max_depth`.
#[cfg(target_os = "macos")]
pub fn get_ui_tree(&self, app_name: &str, max_depth: usize) -> Result<String> {
    let root = self.get_app_element(app_name)?;
    let mut rendered = format!("Application: {}\n", app_name);

    // Start the walk at depth 0; on success hand back the filled buffer.
    Self::build_ui_tree(&root, &mut rendered, 0, max_depth)?;
    Ok(rendered)
}
|
||||
|
||||
|
||||
#[cfg(not(target_os = "macos"))]
|
||||
pub fn get_ui_tree(&self, _app_name: &str, _max_depth: usize) -> Result<String> {
|
||||
anyhow::bail!("Not supported on this platform")
|
||||
}
|
||||
|
||||
|
||||
#[cfg(target_os = "macos")]
|
||||
fn build_ui_tree(
|
||||
element: &AXUIElement,
|
||||
@@ -259,21 +263,22 @@ impl MacAxController {
|
||||
if depth >= max_depth {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
|
||||
let indent = " ".repeat(depth);
|
||||
|
||||
|
||||
// Get role
|
||||
let role = element.role().ok().map(|s| s.to_string())
|
||||
let role = element
|
||||
.role()
|
||||
.ok()
|
||||
.map(|s| s.to_string())
|
||||
.unwrap_or_else(|| "Unknown".to_string());
|
||||
|
||||
|
||||
// Get title
|
||||
let title = element.title().ok()
|
||||
.map(|s| s.to_string());
|
||||
|
||||
let title = element.title().ok().map(|s| s.to_string());
|
||||
|
||||
// Get identifier
|
||||
let identifier = element.identifier().ok()
|
||||
.map(|s| s.to_string());
|
||||
|
||||
let identifier = element.identifier().ok().map(|s| s.to_string());
|
||||
|
||||
// Format output
|
||||
output.push_str(&format!("{}Role: {}", indent, role));
|
||||
if let Some(t) = title {
|
||||
@@ -283,7 +288,7 @@ impl MacAxController {
|
||||
output.push_str(&format!(", ID: {}", id));
|
||||
}
|
||||
output.push('\n');
|
||||
|
||||
|
||||
// Get children
|
||||
if let Ok(children) = element.children() {
|
||||
for i in 0..children.len() {
|
||||
@@ -292,10 +297,10 @@ impl MacAxController {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
/// Find UI elements in an application
|
||||
#[cfg(target_os = "macos")]
|
||||
pub fn find_elements(
|
||||
@@ -307,7 +312,7 @@ impl MacAxController {
|
||||
) -> Result<Vec<AXElement>> {
|
||||
let app_element = self.get_app_element(app_name)?;
|
||||
let mut found_elements = Vec::new();
|
||||
|
||||
|
||||
let visitor = ElementCollector {
|
||||
role_filter: role.map(|s| s.to_string()),
|
||||
title_filter: title.map(|s| s.to_string()),
|
||||
@@ -315,13 +320,13 @@ impl MacAxController {
|
||||
results: std::cell::RefCell::new(&mut found_elements),
|
||||
depth: std::cell::Cell::new(0),
|
||||
};
|
||||
|
||||
|
||||
let walker = TreeWalker::new();
|
||||
walker.walk(&app_element, &visitor);
|
||||
|
||||
|
||||
Ok(found_elements)
|
||||
}
|
||||
|
||||
|
||||
#[cfg(not(target_os = "macos"))]
|
||||
pub fn find_elements(
|
||||
&self,
|
||||
@@ -332,7 +337,7 @@ impl MacAxController {
|
||||
) -> Result<Vec<AXElement>> {
|
||||
anyhow::bail!("Not supported on this platform")
|
||||
}
|
||||
|
||||
|
||||
/// Find a single element (helper for click, set_value, etc.)
|
||||
#[cfg(target_os = "macos")]
|
||||
fn find_element(
|
||||
@@ -343,19 +348,17 @@ impl MacAxController {
|
||||
identifier: Option<&str>,
|
||||
) -> Result<AXUIElement> {
|
||||
let app_element = self.get_app_element(app_name)?;
|
||||
|
||||
|
||||
let role_str = role.to_string();
|
||||
let title_str = title.map(|s| s.to_string());
|
||||
let identifier_str = identifier.map(|s| s.to_string());
|
||||
|
||||
|
||||
let finder = ElementFinder::new(
|
||||
&app_element,
|
||||
move |element| {
|
||||
// Check role
|
||||
let elem_role = element.role()
|
||||
.ok()
|
||||
.map(|s| s.to_string());
|
||||
|
||||
let elem_role = element.role().ok().map(|s| s.to_string());
|
||||
|
||||
if let Some(r) = elem_role {
|
||||
if !r.contains(&role_str) {
|
||||
return false;
|
||||
@@ -363,13 +366,11 @@ impl MacAxController {
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
// Check title if specified
|
||||
if let Some(ref title_filter) = title_str {
|
||||
let elem_title = element.title()
|
||||
.ok()
|
||||
.map(|s| s.to_string());
|
||||
|
||||
let elem_title = element.title().ok().map(|s| s.to_string());
|
||||
|
||||
if let Some(t) = elem_title {
|
||||
if !t.contains(title_filter) {
|
||||
return false;
|
||||
@@ -378,13 +379,11 @@ impl MacAxController {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Check identifier if specified
|
||||
if let Some(ref id_filter) = identifier_str {
|
||||
let elem_id = element.identifier()
|
||||
.ok()
|
||||
.map(|s| s.to_string());
|
||||
|
||||
let elem_id = element.identifier().ok().map(|s| s.to_string());
|
||||
|
||||
if let Some(id) = elem_id {
|
||||
if !id.contains(id_filter) {
|
||||
return false;
|
||||
@@ -393,15 +392,15 @@ impl MacAxController {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
true
|
||||
},
|
||||
Some(std::time::Duration::from_secs(2)),
|
||||
);
|
||||
|
||||
|
||||
finder.find().context("Element not found")
|
||||
}
|
||||
|
||||
|
||||
/// Click on a UI element
|
||||
#[cfg(target_os = "macos")]
|
||||
pub fn click_element(
|
||||
@@ -412,16 +411,16 @@ impl MacAxController {
|
||||
identifier: Option<&str>,
|
||||
) -> Result<()> {
|
||||
let element = self.find_element(app_name, role, title, identifier)?;
|
||||
|
||||
|
||||
// Perform the press action
|
||||
let action_name = CFString::new("AXPress");
|
||||
element
|
||||
.perform_action(&action_name)
|
||||
.map_err(|e| anyhow::anyhow!("Failed to perform press action: {:?}", e))?;
|
||||
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
#[cfg(not(target_os = "macos"))]
|
||||
pub fn click_element(
|
||||
&self,
|
||||
@@ -432,7 +431,7 @@ impl MacAxController {
|
||||
) -> Result<()> {
|
||||
anyhow::bail!("Not supported on this platform")
|
||||
}
|
||||
|
||||
|
||||
/// Set the value of a UI element
|
||||
#[cfg(target_os = "macos")]
|
||||
pub fn set_value(
|
||||
@@ -444,16 +443,17 @@ impl MacAxController {
|
||||
identifier: Option<&str>,
|
||||
) -> Result<()> {
|
||||
let element = self.find_element(app_name, role, title, identifier)?;
|
||||
|
||||
|
||||
// Set the value - convert CFString to CFType
|
||||
let cf_value = CFString::new(value);
|
||||
|
||||
element.set_value(cf_value.as_CFType())
|
||||
|
||||
element
|
||||
.set_value(cf_value.as_CFType())
|
||||
.map_err(|e| anyhow::anyhow!("Failed to set value: {:?}", e))?;
|
||||
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
#[cfg(not(target_os = "macos"))]
|
||||
pub fn set_value(
|
||||
&self,
|
||||
@@ -465,7 +465,7 @@ impl MacAxController {
|
||||
) -> Result<()> {
|
||||
anyhow::bail!("Not supported on this platform")
|
||||
}
|
||||
|
||||
|
||||
/// Get the value of a UI element
|
||||
#[cfg(target_os = "macos")]
|
||||
pub fn get_value(
|
||||
@@ -476,11 +476,12 @@ impl MacAxController {
|
||||
identifier: Option<&str>,
|
||||
) -> Result<String> {
|
||||
let element = self.find_element(app_name, role, title, identifier)?;
|
||||
|
||||
|
||||
// Get the value
|
||||
let value_type = element.value()
|
||||
let value_type = element
|
||||
.value()
|
||||
.map_err(|e| anyhow::anyhow!("Failed to get value: {:?}", e))?;
|
||||
|
||||
|
||||
// Try to downcast to CFString
|
||||
if let Some(cf_string) = value_type.downcast::<CFString>() {
|
||||
Ok(cf_string.to_string())
|
||||
@@ -489,7 +490,7 @@ impl MacAxController {
|
||||
Ok(format!("<non-string value>"))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[cfg(not(target_os = "macos"))]
|
||||
pub fn get_value(
|
||||
&self,
|
||||
@@ -500,52 +501,52 @@ impl MacAxController {
|
||||
) -> Result<String> {
|
||||
anyhow::bail!("Not supported on this platform")
|
||||
}
|
||||
|
||||
|
||||
/// Type text into the currently focused element (uses system text input)
|
||||
#[cfg(target_os = "macos")]
|
||||
pub fn type_text(&self, app_name: &str, text: &str) -> Result<()> {
|
||||
use cocoa::base::{id, nil};
|
||||
use cocoa::foundation::NSString;
|
||||
use objc::{class, msg_send, sel, sel_impl};
|
||||
|
||||
|
||||
// First, make sure the app is active
|
||||
self.activate_app(app_name)?;
|
||||
|
||||
|
||||
// Wait for app to fully activate
|
||||
std::thread::sleep(std::time::Duration::from_millis(500));
|
||||
|
||||
|
||||
// Send a Tab key to try to focus on a text field
|
||||
// This helps ensure something is focused before we paste
|
||||
let _ = self.press_key(app_name, "tab", vec![]);
|
||||
std::thread::sleep(std::time::Duration::from_millis(800));
|
||||
|
||||
|
||||
// Save old clipboard, set new content, paste, then restore
|
||||
let old_content: id;
|
||||
unsafe {
|
||||
// Get the general pasteboard
|
||||
let pasteboard: id = msg_send![class!(NSPasteboard), generalPasteboard];
|
||||
|
||||
|
||||
// Save current clipboard content
|
||||
let ns_string_type = NSString::alloc(nil).init_str("public.utf8-plain-text");
|
||||
old_content = msg_send![pasteboard, stringForType: ns_string_type];
|
||||
|
||||
|
||||
// Clear and set new content
|
||||
let _: () = msg_send![pasteboard, clearContents];
|
||||
|
||||
|
||||
let ns_string = NSString::alloc(nil).init_str(text);
|
||||
let ns_type = NSString::alloc(nil).init_str("public.utf8-plain-text");
|
||||
let _: bool = msg_send![pasteboard, setString:ns_string forType:ns_type];
|
||||
}
|
||||
|
||||
|
||||
// Wait a moment for clipboard to update
|
||||
std::thread::sleep(std::time::Duration::from_millis(200));
|
||||
|
||||
|
||||
// Paste using Cmd+V (outside unsafe block)
|
||||
self.press_key(app_name, "v", vec!["command"])?;
|
||||
|
||||
|
||||
// Wait for paste to complete
|
||||
std::thread::sleep(std::time::Duration::from_millis(300));
|
||||
|
||||
|
||||
// Restore old clipboard content if it existed
|
||||
unsafe {
|
||||
if old_content != nil {
|
||||
@@ -555,15 +556,15 @@ impl MacAxController {
|
||||
let _: bool = msg_send![pasteboard, setString:old_content forType:ns_type];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
#[cfg(not(target_os = "macos"))]
|
||||
pub fn type_text(&self, _app_name: &str, _text: &str) -> Result<()> {
|
||||
anyhow::bail!("Not supported on this platform")
|
||||
}
|
||||
|
||||
|
||||
/// Focus on a text field or text area element
|
||||
#[cfg(target_os = "macos")]
|
||||
pub fn focus_element(
|
||||
@@ -574,40 +575,34 @@ impl MacAxController {
|
||||
identifier: Option<&str>,
|
||||
) -> Result<()> {
|
||||
let element = self.find_element(app_name, role, title, identifier)?;
|
||||
|
||||
|
||||
// Set focused attribute to true
|
||||
use core_foundation::boolean::CFBoolean;
|
||||
let cf_true = CFBoolean::true_value();
|
||||
|
||||
element.set_attribute(&accessibility::AXAttribute::focused(), cf_true)
|
||||
|
||||
element
|
||||
.set_attribute(&accessibility::AXAttribute::focused(), cf_true)
|
||||
.map_err(|e| anyhow::anyhow!("Failed to focus element: {:?}", e))?;
|
||||
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
/// Press a keyboard shortcut
|
||||
#[cfg(target_os = "macos")]
|
||||
pub fn press_key(
|
||||
&self,
|
||||
app_name: &str,
|
||||
key: &str,
|
||||
modifiers: Vec<&str>,
|
||||
) -> Result<()> {
|
||||
use core_graphics::event::{
|
||||
CGEvent, CGEventFlags, CGEventTapLocation,
|
||||
};
|
||||
pub fn press_key(&self, app_name: &str, key: &str, modifiers: Vec<&str>) -> Result<()> {
|
||||
use core_graphics::event::{CGEvent, CGEventFlags, CGEventTapLocation};
|
||||
use core_graphics::event_source::{CGEventSource, CGEventSourceStateID};
|
||||
|
||||
|
||||
// First, make sure the app is active
|
||||
self.activate_app(app_name)?;
|
||||
|
||||
|
||||
// Wait a bit for activation
|
||||
std::thread::sleep(std::time::Duration::from_millis(100));
|
||||
|
||||
|
||||
// Map key string to key code
|
||||
let key_code = Self::key_to_keycode(key)
|
||||
.ok_or_else(|| anyhow::anyhow!("Unknown key: {}", key))?;
|
||||
|
||||
let key_code =
|
||||
Self::key_to_keycode(key).ok_or_else(|| anyhow::anyhow!("Unknown key: {}", key))?;
|
||||
|
||||
// Map modifiers to flags
|
||||
let mut flags = CGEventFlags::CGEventFlagNull;
|
||||
for modifier in modifiers {
|
||||
@@ -619,39 +614,37 @@ impl MacAxController {
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Create event source
|
||||
let source = CGEventSource::new(CGEventSourceStateID::HIDSystemState)
|
||||
.ok().context("Failed to create event source")?;
|
||||
|
||||
.ok()
|
||||
.context("Failed to create event source")?;
|
||||
|
||||
// Create key down event
|
||||
let key_down = CGEvent::new_keyboard_event(source.clone(), key_code, true)
|
||||
.ok().context("Failed to create key down event")?;
|
||||
.ok()
|
||||
.context("Failed to create key down event")?;
|
||||
key_down.set_flags(flags);
|
||||
|
||||
|
||||
// Create key up event
|
||||
let key_up = CGEvent::new_keyboard_event(source, key_code, false)
|
||||
.ok().context("Failed to create key up event")?;
|
||||
.ok()
|
||||
.context("Failed to create key up event")?;
|
||||
key_up.set_flags(flags);
|
||||
|
||||
|
||||
// Post events
|
||||
key_down.post(CGEventTapLocation::HID);
|
||||
std::thread::sleep(std::time::Duration::from_millis(50));
|
||||
key_up.post(CGEventTapLocation::HID);
|
||||
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
#[cfg(not(target_os = "macos"))]
|
||||
pub fn press_key(
|
||||
&self,
|
||||
_app_name: &str,
|
||||
_key: &str,
|
||||
_modifiers: Vec<&str>,
|
||||
) -> Result<()> {
|
||||
pub fn press_key(&self, _app_name: &str, _key: &str, _modifiers: Vec<&str>) -> Result<()> {
|
||||
anyhow::bail!("Not supported on this platform")
|
||||
}
|
||||
|
||||
|
||||
#[cfg(target_os = "macos")]
|
||||
fn key_to_keycode(key: &str) -> Option<u16> {
|
||||
// Map common keys to keycodes
|
||||
@@ -743,62 +736,55 @@ struct ElementCollector<'a> {
|
||||
impl<'a> TreeVisitor for ElementCollector<'a> {
|
||||
fn enter_element(&self, element: &AXUIElement) -> TreeWalkerFlow {
|
||||
self.depth.set(self.depth.get() + 1);
|
||||
|
||||
|
||||
if self.depth.get() > 20 {
|
||||
return TreeWalkerFlow::SkipSubtree;
|
||||
}
|
||||
|
||||
|
||||
// Get element properties
|
||||
let role = element.role()
|
||||
let role = element
|
||||
.role()
|
||||
.ok()
|
||||
.map(|s| s.to_string())
|
||||
.unwrap_or_else(|| "Unknown".to_string());
|
||||
|
||||
let title = element.title()
|
||||
.ok()
|
||||
.map(|s| s.to_string());
|
||||
|
||||
let identifier = element.identifier()
|
||||
.ok()
|
||||
.map(|s| s.to_string());
|
||||
|
||||
|
||||
let title = element.title().ok().map(|s| s.to_string());
|
||||
|
||||
let identifier = element.identifier().ok().map(|s| s.to_string());
|
||||
|
||||
// Check if this element matches the filters
|
||||
let role_matches = self.role_filter.as_ref().map_or(true, |r| role.contains(r));
|
||||
let title_matches = self.title_filter.as_ref().map_or(true, |t| {
|
||||
title.as_ref().map_or(false, |title_str| title_str.contains(t))
|
||||
title
|
||||
.as_ref()
|
||||
.map_or(false, |title_str| title_str.contains(t))
|
||||
});
|
||||
let identifier_matches = self.identifier_filter.as_ref().map_or(true, |id| {
|
||||
identifier.as_ref().map_or(false, |id_str| id_str.contains(id))
|
||||
identifier
|
||||
.as_ref()
|
||||
.map_or(false, |id_str| id_str.contains(id))
|
||||
});
|
||||
|
||||
|
||||
if role_matches && title_matches && identifier_matches {
|
||||
// Get additional properties
|
||||
let value = element.value()
|
||||
let value = element
|
||||
.value()
|
||||
.ok()
|
||||
.and_then(|v| {
|
||||
v.downcast::<CFString>().map(|s| s.to_string())
|
||||
});
|
||||
|
||||
let label = element.description()
|
||||
.ok()
|
||||
.map(|s| s.to_string());
|
||||
|
||||
let enabled = element.enabled()
|
||||
.ok()
|
||||
.map(|b| b.into())
|
||||
.unwrap_or(false);
|
||||
|
||||
let focused = element.focused()
|
||||
.ok()
|
||||
.map(|b| b.into())
|
||||
.unwrap_or(false);
|
||||
|
||||
.and_then(|v| v.downcast::<CFString>().map(|s| s.to_string()));
|
||||
|
||||
let label = element.description().ok().map(|s| s.to_string());
|
||||
|
||||
let enabled = element.enabled().ok().map(|b| b.into()).unwrap_or(false);
|
||||
|
||||
let focused = element.focused().ok().map(|b| b.into()).unwrap_or(false);
|
||||
|
||||
// Count children
|
||||
let children_count = element.children()
|
||||
let children_count = element
|
||||
.children()
|
||||
.ok()
|
||||
.map(|arr| arr.len() as usize)
|
||||
.unwrap_or(0);
|
||||
|
||||
|
||||
self.results.borrow_mut().push(AXElement {
|
||||
role,
|
||||
title,
|
||||
@@ -812,10 +798,10 @@ impl<'a> TreeVisitor for ElementCollector<'a> {
|
||||
children_count,
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
TreeWalkerFlow::Continue
|
||||
}
|
||||
|
||||
|
||||
/// Decrements the depth counter that `enter_element` increments.
fn exit_element(&self, _element: &AXUIElement) {
    let current_depth = self.depth.get();
    self.depth.set(current_depth - 1);
}
|
||||
|
||||
@@ -34,7 +34,7 @@ impl AXElement {
|
||||
/// Convert to a human-readable string representation
|
||||
pub fn to_string(&self) -> String {
|
||||
let mut parts = vec![format!("Role: {}", self.role)];
|
||||
|
||||
|
||||
if let Some(ref title) = self.title {
|
||||
parts.push(format!("Title: {}", title));
|
||||
}
|
||||
@@ -47,19 +47,19 @@ impl AXElement {
|
||||
if let Some(ref id) = self.identifier {
|
||||
parts.push(format!("ID: {}", id));
|
||||
}
|
||||
|
||||
|
||||
parts.push(format!("Enabled: {}", self.enabled));
|
||||
parts.push(format!("Focused: {}", self.focused));
|
||||
|
||||
|
||||
if let Some((x, y)) = self.position {
|
||||
parts.push(format!("Position: ({:.0}, {:.0})", x, y));
|
||||
}
|
||||
if let Some((w, h)) = self.size {
|
||||
parts.push(format!("Size: ({:.0}, {:.0})", w, h));
|
||||
}
|
||||
|
||||
|
||||
parts.push(format!("Children: {}", self.children_count));
|
||||
|
||||
|
||||
parts.join(", ")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,7 +7,7 @@ use async_trait::async_trait;
|
||||
pub trait OCREngine: Send + Sync {
|
||||
/// Extract text with locations from an image file
|
||||
async fn extract_text_with_locations(&self, path: &str) -> Result<Vec<TextLocation>>;
|
||||
|
||||
|
||||
/// Get the name of the OCR engine
|
||||
fn name(&self) -> &str;
|
||||
}
|
||||
|
||||
@@ -12,16 +12,18 @@ impl TesseractOCR {
|
||||
let tesseract_check = std::process::Command::new("which")
|
||||
.arg("tesseract")
|
||||
.output();
|
||||
|
||||
|
||||
if tesseract_check.is_err() || !tesseract_check.as_ref().unwrap().status.success() {
|
||||
anyhow::bail!("Tesseract OCR is not installed on your system.\n\n\
|
||||
anyhow::bail!(
|
||||
"Tesseract OCR is not installed on your system.\n\n\
|
||||
To install tesseract:\n macOS: brew install tesseract\n \
|
||||
Linux: sudo apt-get install tesseract-ocr (Ubuntu/Debian)\n \
|
||||
sudo yum install tesseract (RHEL/CentOS)\n \
|
||||
Windows: Download from https://github.com/UB-Mannheim/tesseract/wiki\n\n\
|
||||
After installation, restart your terminal and try again.");
|
||||
After installation, restart your terminal and try again."
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
Ok(Self)
|
||||
}
|
||||
}
|
||||
@@ -36,18 +38,23 @@ impl OCREngine for TesseractOCR {
|
||||
.arg("tsv")
|
||||
.output()
|
||||
.map_err(|e| anyhow::anyhow!("Failed to run tesseract: {}", e))?;
|
||||
|
||||
|
||||
if !output.status.success() {
|
||||
anyhow::bail!("Tesseract failed: {}", String::from_utf8_lossy(&output.stderr));
|
||||
anyhow::bail!(
|
||||
"Tesseract failed: {}",
|
||||
String::from_utf8_lossy(&output.stderr)
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
let tsv_text = String::from_utf8_lossy(&output.stdout);
|
||||
let mut locations = Vec::new();
|
||||
|
||||
|
||||
// Parse TSV output (skip header line)
|
||||
for (i, line) in tsv_text.lines().enumerate() {
|
||||
if i == 0 { continue; } // Skip header
|
||||
|
||||
if i == 0 {
|
||||
continue;
|
||||
} // Skip header
|
||||
|
||||
let parts: Vec<&str> = line.split('\t').collect();
|
||||
if parts.len() >= 12 {
|
||||
// TSV format: level, page_num, block_num, par_num, line_num, word_num,
|
||||
@@ -74,10 +81,10 @@ impl OCREngine for TesseractOCR {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Ok(locations)
|
||||
}
|
||||
|
||||
|
||||
fn name(&self) -> &str {
|
||||
"Tesseract OCR"
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use super::OCREngine;
|
||||
use crate::types::TextLocation;
|
||||
use anyhow::{Result, Context};
|
||||
use anyhow::{Context, Result};
|
||||
use async_trait::async_trait;
|
||||
use std::ffi::{CStr, CString};
|
||||
use std::os::raw::{c_char, c_float, c_uint};
|
||||
@@ -24,7 +24,7 @@ extern "C" {
|
||||
out_boxes: *mut *mut std::ffi::c_void,
|
||||
out_count: *mut c_uint,
|
||||
) -> bool;
|
||||
|
||||
|
||||
fn vision_free_boxes(boxes: *mut std::ffi::c_void, count: c_uint);
|
||||
}
|
||||
|
||||
@@ -41,12 +41,11 @@ impl AppleVisionOCR {
|
||||
impl OCREngine for AppleVisionOCR {
|
||||
async fn extract_text_with_locations(&self, path: &str) -> Result<Vec<TextLocation>> {
|
||||
// Convert path to C string
|
||||
let c_path = CString::new(path)
|
||||
.context("Failed to convert path to C string")?;
|
||||
|
||||
let c_path = CString::new(path).context("Failed to convert path to C string")?;
|
||||
|
||||
let mut boxes_ptr: *mut std::ffi::c_void = std::ptr::null_mut();
|
||||
let mut count: c_uint = 0;
|
||||
|
||||
|
||||
// Call Swift Vision API
|
||||
let success = unsafe {
|
||||
vision_recognize_text(
|
||||
@@ -56,28 +55,26 @@ impl OCREngine for AppleVisionOCR {
|
||||
&mut count,
|
||||
)
|
||||
};
|
||||
|
||||
|
||||
if !success || boxes_ptr.is_null() {
|
||||
anyhow::bail!("Apple Vision OCR failed");
|
||||
}
|
||||
|
||||
|
||||
// Convert C array to Rust Vec
|
||||
let mut locations = Vec::new();
|
||||
|
||||
|
||||
unsafe {
|
||||
let typed_boxes = boxes_ptr as *const VisionTextBox;
|
||||
let boxes_slice = std::slice::from_raw_parts(typed_boxes, count as usize);
|
||||
|
||||
|
||||
for box_data in boxes_slice {
|
||||
// Convert C string to Rust String
|
||||
let text = if !box_data.text.is_null() {
|
||||
CStr::from_ptr(box_data.text)
|
||||
.to_string_lossy()
|
||||
.into_owned()
|
||||
CStr::from_ptr(box_data.text).to_string_lossy().into_owned()
|
||||
} else {
|
||||
String::new()
|
||||
};
|
||||
|
||||
|
||||
if !text.is_empty() {
|
||||
locations.push(TextLocation {
|
||||
text,
|
||||
@@ -89,14 +86,14 @@ impl OCREngine for AppleVisionOCR {
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Free the C array
|
||||
vision_free_boxes(boxes_ptr, count);
|
||||
}
|
||||
|
||||
|
||||
Ok(locations)
|
||||
}
|
||||
|
||||
|
||||
fn name(&self) -> &str {
|
||||
"Apple Vision Framework"
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use crate::{ComputerController, types::*};
|
||||
use crate::{types::*, ComputerController};
|
||||
use anyhow::Result;
|
||||
use async_trait::async_trait;
|
||||
use tesseract::Tesseract;
|
||||
@@ -21,48 +21,53 @@ impl ComputerController for LinuxController {
|
||||
async fn move_mouse(&self, _x: i32, _y: i32) -> Result<()> {
|
||||
anyhow::bail!("Linux implementation not yet available")
|
||||
}
|
||||
|
||||
|
||||
async fn click(&self, _button: MouseButton) -> Result<()> {
|
||||
anyhow::bail!("Linux implementation not yet available")
|
||||
}
|
||||
|
||||
|
||||
async fn double_click(&self, _button: MouseButton) -> Result<()> {
|
||||
anyhow::bail!("Linux implementation not yet available")
|
||||
}
|
||||
|
||||
|
||||
async fn type_text(&self, _text: &str) -> Result<()> {
|
||||
anyhow::bail!("Linux implementation not yet available")
|
||||
}
|
||||
|
||||
|
||||
async fn press_key(&self, _key: &str) -> Result<()> {
|
||||
anyhow::bail!("Linux implementation not yet available")
|
||||
}
|
||||
|
||||
|
||||
async fn list_windows(&self) -> Result<Vec<Window>> {
|
||||
anyhow::bail!("Linux implementation not yet available")
|
||||
}
|
||||
|
||||
|
||||
async fn focus_window(&self, _window_id: &str) -> Result<()> {
|
||||
anyhow::bail!("Linux implementation not yet available")
|
||||
}
|
||||
|
||||
|
||||
async fn get_window_bounds(&self, _window_id: &str) -> Result<Rect> {
|
||||
anyhow::bail!("Linux implementation not yet available")
|
||||
}
|
||||
|
||||
|
||||
async fn find_element(&self, _selector: &ElementSelector) -> Result<Option<UIElement>> {
|
||||
anyhow::bail!("Linux implementation not yet available")
|
||||
}
|
||||
|
||||
|
||||
async fn get_element_text(&self, _element_id: &str) -> Result<String> {
|
||||
anyhow::bail!("Linux implementation not yet available")
|
||||
}
|
||||
|
||||
|
||||
async fn get_element_bounds(&self, _element_id: &str) -> Result<Rect> {
|
||||
anyhow::bail!("Linux implementation not yet available")
|
||||
}
|
||||
|
||||
async fn take_screenshot(&self, _path: &str, _region: Option<Rect>, _window_id: Option<&str>) -> Result<()> {
|
||||
|
||||
async fn take_screenshot(
|
||||
&self,
|
||||
_path: &str,
|
||||
_region: Option<Rect>,
|
||||
_window_id: Option<&str>,
|
||||
) -> Result<()> {
|
||||
// Enforce that window_id must be provided
|
||||
if _window_id.is_none() {
|
||||
anyhow::bail!("window_id is required. You must specify which window to capture (e.g., 'Firefox', 'Terminal', 'gedit'). Use list_windows to see available windows.");
|
||||
@@ -70,94 +75,111 @@ impl ComputerController for LinuxController {
|
||||
|
||||
anyhow::bail!("Linux implementation not yet available")
|
||||
}
|
||||
|
||||
|
||||
async fn extract_text_from_screen(&self, _region: Rect, _window_id: &str) -> Result<String> {
|
||||
anyhow::bail!("Linux implementation not yet available")
|
||||
}
|
||||
|
||||
|
||||
async fn extract_text_from_image(&self, _path: &str) -> Result<OCRResult> {
|
||||
// Check if tesseract is available on the system
|
||||
let tesseract_check = std::process::Command::new("which")
|
||||
.arg("tesseract")
|
||||
.output();
|
||||
|
||||
|
||||
if tesseract_check.is_err() || !tesseract_check.as_ref().unwrap().status.success() {
|
||||
anyhow::bail!("Tesseract OCR is not installed on your system.\n\n\
|
||||
anyhow::bail!(
|
||||
"Tesseract OCR is not installed on your system.\n\n\
|
||||
To install tesseract:\n \
|
||||
Ubuntu/Debian: sudo apt-get install tesseract-ocr\n \
|
||||
RHEL/CentOS: sudo yum install tesseract\n \
|
||||
Arch Linux: sudo pacman -S tesseract\n\n\
|
||||
After installation, restart your terminal and try again.");
|
||||
After installation, restart your terminal and try again."
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
// Initialize Tesseract
|
||||
let tess = Tesseract::new(None, Some("eng"))
|
||||
.map_err(|e| {
|
||||
anyhow::anyhow!("Failed to initialize Tesseract: {}\n\n\
|
||||
let tess = Tesseract::new(None, Some("eng")).map_err(|e| {
|
||||
anyhow::anyhow!(
|
||||
"Failed to initialize Tesseract: {}\n\n\
|
||||
This usually means:\n1. Tesseract is not properly installed\n\
|
||||
2. Language data files are missing\n\nTo fix:\n \
|
||||
Ubuntu/Debian: sudo apt-get install tesseract-ocr-eng\n \
|
||||
RHEL/CentOS: sudo yum install tesseract-langpack-eng\n \
|
||||
Arch Linux: sudo pacman -S tesseract-data-eng", e)
|
||||
})?;
|
||||
|
||||
let text = tess.set_image(_path)
|
||||
Arch Linux: sudo pacman -S tesseract-data-eng",
|
||||
e
|
||||
)
|
||||
})?;
|
||||
|
||||
let text = tess
|
||||
.set_image(_path)
|
||||
.map_err(|e| anyhow::anyhow!("Failed to load image '{}': {}", _path, e))?
|
||||
.get_text()
|
||||
.map_err(|e| anyhow::anyhow!("Failed to extract text from image: {}", e))?;
|
||||
|
||||
|
||||
// Get confidence (simplified - would need more complex API calls for per-word confidence)
|
||||
let confidence = 0.85; // Placeholder
|
||||
|
||||
|
||||
Ok(OCRResult {
|
||||
text,
|
||||
confidence,
|
||||
bounds: Rect { x: 0, y: 0, width: 0, height: 0 }, // Would need image dimensions
|
||||
bounds: Rect {
|
||||
x: 0,
|
||||
y: 0,
|
||||
width: 0,
|
||||
height: 0,
|
||||
}, // Would need image dimensions
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
async fn find_text_on_screen(&self, _text: &str) -> Result<Option<Point>> {
|
||||
// Check if tesseract is available on the system
|
||||
let tesseract_check = std::process::Command::new("which")
|
||||
.arg("tesseract")
|
||||
.output();
|
||||
|
||||
|
||||
if tesseract_check.is_err() || !tesseract_check.as_ref().unwrap().status.success() {
|
||||
anyhow::bail!("Tesseract OCR is not installed on your system.\n\n\
|
||||
anyhow::bail!(
|
||||
"Tesseract OCR is not installed on your system.\n\n\
|
||||
To install tesseract:\n \
|
||||
Ubuntu/Debian: sudo apt-get install tesseract-ocr\n \
|
||||
RHEL/CentOS: sudo yum install tesseract\n \
|
||||
Arch Linux: sudo pacman -S tesseract\n\n\
|
||||
After installation, restart your terminal and try again.");
|
||||
After installation, restart your terminal and try again."
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
// Take full screen screenshot
|
||||
let temp_path = format!("/tmp/g3_ocr_search_{}.png", uuid::Uuid::new_v4());
|
||||
self.take_screenshot(&temp_path, None, None).await?;
|
||||
|
||||
|
||||
// Use Tesseract to find text with bounding boxes
|
||||
let tess = Tesseract::new(None, Some("eng"))
|
||||
.map_err(|e| {
|
||||
anyhow::anyhow!("Failed to initialize Tesseract: {}\n\n\
|
||||
let tess = Tesseract::new(None, Some("eng")).map_err(|e| {
|
||||
anyhow::anyhow!(
|
||||
"Failed to initialize Tesseract: {}\n\n\
|
||||
This usually means:\n1. Tesseract is not properly installed\n\
|
||||
2. Language data files are missing\n\nTo fix:\n \
|
||||
Ubuntu/Debian: sudo apt-get install tesseract-ocr-eng\n \
|
||||
RHEL/CentOS: sudo yum install tesseract-langpack-eng\n \
|
||||
Arch Linux: sudo pacman -S tesseract-data-eng", e)
|
||||
})?;
|
||||
|
||||
let full_text = tess.set_image(temp_path.as_str())
|
||||
Arch Linux: sudo pacman -S tesseract-data-eng",
|
||||
e
|
||||
)
|
||||
})?;
|
||||
|
||||
let full_text = tess
|
||||
.set_image(temp_path.as_str())
|
||||
.map_err(|e| anyhow::anyhow!("Failed to load screenshot: {}", e))?
|
||||
.get_text()
|
||||
.map_err(|e| anyhow::anyhow!("Failed to extract text from screen: {}", e))?;
|
||||
|
||||
|
||||
// Clean up temp file
|
||||
let _ = std::fs::remove_file(&temp_path);
|
||||
|
||||
|
||||
// Simple text search - full implementation would use get_component_images
|
||||
// to get bounding boxes for each word
|
||||
if full_text.contains(_text) {
|
||||
tracing::warn!("Text found but precise coordinates not available in simplified implementation");
|
||||
tracing::warn!(
|
||||
"Text found but precise coordinates not available in simplified implementation"
|
||||
);
|
||||
Ok(Some(Point { x: 0, y: 0 }))
|
||||
} else {
|
||||
Ok(None)
|
||||
|
||||
@@ -1,13 +1,18 @@
|
||||
use crate::{ComputerController, types::{Rect, TextLocation}};
|
||||
use crate::ocr::{OCREngine, DefaultOCR};
|
||||
use anyhow::{Result, Context};
|
||||
use crate::ocr::{DefaultOCR, OCREngine};
|
||||
use crate::{
|
||||
types::{Rect, TextLocation},
|
||||
ComputerController,
|
||||
};
|
||||
use anyhow::{Context, Result};
|
||||
use async_trait::async_trait;
|
||||
use std::path::Path;
|
||||
use core_graphics::window::{kCGWindowListOptionOnScreenOnly, kCGNullWindowID, CGWindowListCopyWindowInfo};
|
||||
use core_foundation::array::CFArray;
|
||||
use core_foundation::base::{TCFType, ToVoid};
|
||||
use core_foundation::dictionary::CFDictionary;
|
||||
use core_foundation::string::CFString;
|
||||
use core_foundation::base::{TCFType, ToVoid};
|
||||
use core_foundation::array::CFArray;
|
||||
use core_graphics::window::{
|
||||
kCGNullWindowID, kCGWindowListOptionOnScreenOnly, CGWindowListCopyWindowInfo,
|
||||
};
|
||||
use std::path::Path;
|
||||
|
||||
pub struct MacOSController {
|
||||
ocr_engine: Box<dyn OCREngine>,
|
||||
@@ -20,13 +25,21 @@ impl MacOSController {
|
||||
let ocr = Box::new(DefaultOCR::new()?);
|
||||
let ocr_name = ocr.name().to_string();
|
||||
tracing::info!("Initialized macOS controller with OCR engine: {}", ocr_name);
|
||||
Ok(Self { ocr_engine: ocr, ocr_name })
|
||||
Ok(Self {
|
||||
ocr_engine: ocr,
|
||||
ocr_name,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl ComputerController for MacOSController {
|
||||
async fn take_screenshot(&self, path: &str, region: Option<Rect>, window_id: Option<&str>) -> Result<()> {
|
||||
async fn take_screenshot(
|
||||
&self,
|
||||
path: &str,
|
||||
region: Option<Rect>,
|
||||
window_id: Option<&str>,
|
||||
) -> Result<()> {
|
||||
// Enforce that window_id must be provided
|
||||
if window_id.is_none() {
|
||||
return Err(anyhow::anyhow!("window_id is required. You must specify which window to capture (e.g., 'Safari', 'Terminal', 'Google Chrome'). Use list_windows to see available windows."));
|
||||
@@ -36,40 +49,38 @@ impl ComputerController for MacOSController {
|
||||
let temp_dir = std::env::var("TMPDIR")
|
||||
.or_else(|_| std::env::var("HOME").map(|h| format!("{}/tmp", h)))
|
||||
.unwrap_or_else(|_| "/tmp".to_string());
|
||||
|
||||
|
||||
// Ensure temp directory exists
|
||||
std::fs::create_dir_all(&temp_dir)?;
|
||||
|
||||
|
||||
// If path is relative or doesn't specify a directory, use temp_dir
|
||||
let final_path = if path.starts_with('/') {
|
||||
path.to_string()
|
||||
} else {
|
||||
format!("{}/{}", temp_dir.trim_end_matches('/'), path)
|
||||
};
|
||||
|
||||
|
||||
let path_obj = Path::new(&final_path);
|
||||
if let Some(parent) = path_obj.parent() {
|
||||
std::fs::create_dir_all(parent)?;
|
||||
}
|
||||
|
||||
|
||||
let app_name = window_id.unwrap(); // Safe because we checked is_none() above
|
||||
|
||||
|
||||
// Get the window ID for the specified application
|
||||
let cg_window_id = unsafe {
|
||||
let window_list = CGWindowListCopyWindowInfo(
|
||||
kCGWindowListOptionOnScreenOnly,
|
||||
kCGNullWindowID
|
||||
);
|
||||
|
||||
let window_list =
|
||||
CGWindowListCopyWindowInfo(kCGWindowListOptionOnScreenOnly, kCGNullWindowID);
|
||||
|
||||
let array = CFArray::<CFDictionary>::wrap_under_create_rule(window_list);
|
||||
let count = array.len();
|
||||
|
||||
|
||||
let mut found_window_id: Option<(u32, String)> = None; // (id, owner)
|
||||
let app_name_lower = app_name.to_lowercase();
|
||||
|
||||
|
||||
for i in 0..count {
|
||||
let dict = array.get(i).unwrap();
|
||||
|
||||
|
||||
// Get owner name
|
||||
let owner_key = CFString::from_static_string("kCGWindowOwnerName");
|
||||
let owner: String = if let Some(value) = dict.find(owner_key.to_void()) {
|
||||
@@ -78,57 +89,68 @@ impl ComputerController for MacOSController {
|
||||
} else {
|
||||
continue;
|
||||
};
|
||||
|
||||
tracing::debug!("Checking window: owner='{}', looking for '{}'", owner, app_name);
|
||||
|
||||
tracing::debug!(
|
||||
"Checking window: owner='{}', looking for '{}'",
|
||||
owner,
|
||||
app_name
|
||||
);
|
||||
let owner_lower = owner.to_lowercase();
|
||||
|
||||
|
||||
// Normalize by removing spaces for exact matching
|
||||
let app_name_normalized = app_name_lower.replace(" ", "");
|
||||
let owner_normalized = owner_lower.replace(" ", "");
|
||||
|
||||
|
||||
// ONLY accept exact matches (case-insensitive, with or without spaces)
|
||||
// This prevents "Goose" from matching "GooseStudio"
|
||||
let is_match = owner_lower == app_name_lower || owner_normalized == app_name_normalized;
|
||||
|
||||
let is_match =
|
||||
owner_lower == app_name_lower || owner_normalized == app_name_normalized;
|
||||
|
||||
if is_match {
|
||||
// Get window ID
|
||||
let window_id_key = CFString::from_static_string("kCGWindowNumber");
|
||||
if let Some(value) = dict.find(window_id_key.to_void()) {
|
||||
let num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*value as *const _);
|
||||
let num: core_foundation::number::CFNumber =
|
||||
TCFType::wrap_under_get_rule(*value as *const _);
|
||||
if let Some(id) = num.to_i64() {
|
||||
// Get window layer to filter out menu bar windows
|
||||
let layer_key = CFString::from_static_string("kCGWindowLayer");
|
||||
let layer: i32 = if let Some(value) = dict.find(layer_key.to_void()) {
|
||||
let num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*value as *const _);
|
||||
let num: core_foundation::number::CFNumber =
|
||||
TCFType::wrap_under_get_rule(*value as *const _);
|
||||
num.to_i32().unwrap_or(0)
|
||||
} else {
|
||||
0
|
||||
};
|
||||
|
||||
|
||||
// Get window bounds to verify it's a real window
|
||||
let bounds_key = CFString::from_static_string("kCGWindowBounds");
|
||||
let has_real_bounds = if let Some(value) = dict.find(bounds_key.to_void()) {
|
||||
let bounds_dict: CFDictionary = TCFType::wrap_under_get_rule(*value as *const _);
|
||||
let width_key = CFString::from_static_string("Width");
|
||||
let height_key = CFString::from_static_string("Height");
|
||||
|
||||
if let (Some(w_val), Some(h_val)) = (
|
||||
bounds_dict.find(width_key.to_void()),
|
||||
bounds_dict.find(height_key.to_void()),
|
||||
) {
|
||||
let w_num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*w_val as *const _);
|
||||
let h_num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*h_val as *const _);
|
||||
let width = w_num.to_f64().unwrap_or(0.0);
|
||||
let height = h_num.to_f64().unwrap_or(0.0);
|
||||
// Real windows should be at least 100x100 pixels
|
||||
width >= 100.0 && height >= 100.0
|
||||
let has_real_bounds =
|
||||
if let Some(value) = dict.find(bounds_key.to_void()) {
|
||||
let bounds_dict: CFDictionary =
|
||||
TCFType::wrap_under_get_rule(*value as *const _);
|
||||
let width_key = CFString::from_static_string("Width");
|
||||
let height_key = CFString::from_static_string("Height");
|
||||
|
||||
if let (Some(w_val), Some(h_val)) = (
|
||||
bounds_dict.find(width_key.to_void()),
|
||||
bounds_dict.find(height_key.to_void()),
|
||||
) {
|
||||
let w_num: core_foundation::number::CFNumber =
|
||||
TCFType::wrap_under_get_rule(*w_val as *const _);
|
||||
let h_num: core_foundation::number::CFNumber =
|
||||
TCFType::wrap_under_get_rule(*h_val as *const _);
|
||||
let width = w_num.to_f64().unwrap_or(0.0);
|
||||
let height = h_num.to_f64().unwrap_or(0.0);
|
||||
// Real windows should be at least 100x100 pixels
|
||||
width >= 100.0 && height >= 100.0
|
||||
} else {
|
||||
false
|
||||
}
|
||||
} else {
|
||||
false
|
||||
}
|
||||
} else {
|
||||
false
|
||||
};
|
||||
|
||||
};
|
||||
|
||||
// Only accept windows that are:
|
||||
// 1. At layer 0 (normal windows, not menu bar)
|
||||
// 2. Have real bounds (width and height >= 100)
|
||||
@@ -137,189 +159,222 @@ impl ComputerController for MacOSController {
|
||||
found_window_id = Some((id as u32, owner.clone()));
|
||||
break;
|
||||
} else {
|
||||
tracing::debug!("Skipping window ID {} for '{}': layer={}, has_real_bounds={}", id, owner, layer, has_real_bounds);
|
||||
tracing::debug!(
|
||||
"Skipping window ID {} for '{}': layer={}, has_real_bounds={}",
|
||||
id,
|
||||
owner,
|
||||
layer,
|
||||
has_real_bounds
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
found_window_id
|
||||
};
|
||||
|
||||
|
||||
let (cg_window_id, matched_owner) = cg_window_id.ok_or_else(|| {
|
||||
anyhow::anyhow!("Could not find window for application '{}'. Use list_windows to see available windows.", app_name)
|
||||
})?;
|
||||
tracing::info!("Taking screenshot of window ID {} for app '{}'", cg_window_id, matched_owner);
|
||||
|
||||
tracing::info!(
|
||||
"Taking screenshot of window ID {} for app '{}'",
|
||||
cg_window_id,
|
||||
matched_owner
|
||||
);
|
||||
|
||||
// Use screencapture with the window ID for now
|
||||
// TODO: Implement direct CGWindowListCreateImage approach with proper image saving
|
||||
let mut cmd = std::process::Command::new("screencapture");
|
||||
cmd.arg("-x"); // No sound
|
||||
cmd.arg("-l");
|
||||
cmd.arg(cg_window_id.to_string());
|
||||
|
||||
|
||||
if let Some(region) = region {
|
||||
cmd.arg("-R");
|
||||
cmd.arg(format!("{},{},{},{}", region.x, region.y, region.width, region.height));
|
||||
cmd.arg(format!(
|
||||
"{},{},{},{}",
|
||||
region.x, region.y, region.width, region.height
|
||||
));
|
||||
}
|
||||
|
||||
|
||||
cmd.arg(&final_path);
|
||||
|
||||
|
||||
let screenshot_result = cmd.output()?;
|
||||
|
||||
|
||||
if !screenshot_result.status.success() {
|
||||
let stderr = String::from_utf8_lossy(&screenshot_result.stderr);
|
||||
return Err(anyhow::anyhow!("screencapture failed for window {}: {}", cg_window_id, stderr));
|
||||
return Err(anyhow::anyhow!(
|
||||
"screencapture failed for window {}: {}",
|
||||
cg_window_id,
|
||||
stderr
|
||||
));
|
||||
}
|
||||
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
/// Runs OCR over `region` of the window identified by `window_id` and returns
/// the recognised text.
///
/// The region is captured into a uniquely named temporary PNG, passed to
/// `extract_text_from_image`, and the PNG is removed before returning. The
/// removal also happens when the capture or the OCR step fails, so failed
/// calls do not leave stray screenshots behind.
///
/// # Errors
/// Propagates any error from `take_screenshot` or `extract_text_from_image`.
async fn extract_text_from_screen(&self, region: Rect, window_id: &str) -> Result<String> {
    let temp_path = format!("/tmp/g3_ocr_{}.png", uuid::Uuid::new_v4());

    // Hold the outcome in a variable instead of using `?` so that the cleanup
    // below runs on every path. OCR is only attempted after a successful capture.
    let outcome = match self
        .take_screenshot(&temp_path, Some(region), Some(window_id))
        .await
    {
        Ok(()) => self.extract_text_from_image(&temp_path).await,
        Err(err) => Err(err),
    };

    // Best-effort cleanup: the file may not exist if the capture failed, and a
    // failed removal must not mask the real result.
    let _ = std::fs::remove_file(&temp_path);

    outcome
}
|
||||
|
||||
|
||||
async fn extract_text_from_image(&self, path: &str) -> Result<String> {
|
||||
// Extract all text and concatenate
|
||||
let locations = self.ocr_engine.extract_text_with_locations(path).await?;
|
||||
Ok(locations.iter().map(|loc| loc.text.as_str()).collect::<Vec<_>>().join(" "))
|
||||
Ok(locations
|
||||
.iter()
|
||||
.map(|loc| loc.text.as_str())
|
||||
.collect::<Vec<_>>()
|
||||
.join(" "))
|
||||
}
|
||||
|
||||
|
||||
/// Delegates to the configured OCR engine and returns every recognised text
/// fragment for the image at `path`, together with its location.
///
/// # Errors
/// Propagates any error from the OCR engine.
async fn extract_text_with_locations(&self, path: &str) -> Result<Vec<TextLocation>> {
    let located = self.ocr_engine.extract_text_with_locations(path).await?;
    Ok(located)
}
|
||||
|
||||
async fn find_text_in_app(&self, app_name: &str, search_text: &str) -> Result<Option<TextLocation>> {
|
||||
|
||||
async fn find_text_in_app(
|
||||
&self,
|
||||
app_name: &str,
|
||||
search_text: &str,
|
||||
) -> Result<Option<TextLocation>> {
|
||||
// Take screenshot of specific app window
|
||||
let home = std::env::var("HOME").unwrap_or_else(|_| "/tmp".to_string());
|
||||
let temp_path = format!("{}/tmp/g3_find_text_{}_{}.png", home, app_name, uuid::Uuid::new_v4());
|
||||
self.take_screenshot(&temp_path, None, Some(app_name)).await?;
|
||||
|
||||
let temp_path = format!(
|
||||
"{}/tmp/g3_find_text_{}_{}.png",
|
||||
home,
|
||||
app_name,
|
||||
uuid::Uuid::new_v4()
|
||||
);
|
||||
self.take_screenshot(&temp_path, None, Some(app_name))
|
||||
.await?;
|
||||
|
||||
// Get screenshot dimensions before we delete it
|
||||
let screenshot_dims = get_image_dimensions(&temp_path)?;
|
||||
|
||||
|
||||
// Extract all text with locations
|
||||
let locations = self.extract_text_with_locations(&temp_path).await?;
|
||||
|
||||
|
||||
// Get window bounds to calculate coordinate transformation
|
||||
let window_bounds = self.get_window_bounds(app_name)?;
|
||||
|
||||
|
||||
// Clean up temp file
|
||||
let _ = std::fs::remove_file(&temp_path);
|
||||
|
||||
|
||||
// Find matching text (case-insensitive)
|
||||
let search_lower = search_text.to_lowercase();
|
||||
for location in locations {
|
||||
if location.text.to_lowercase().contains(&search_lower) {
|
||||
// Transform coordinates from screenshot space to screen space
|
||||
let transformed = transform_screenshot_to_screen_coords(
|
||||
location,
|
||||
window_bounds,
|
||||
screenshot_dims,
|
||||
);
|
||||
let transformed =
|
||||
transform_screenshot_to_screen_coords(location, window_bounds, screenshot_dims);
|
||||
return Ok(Some(transformed));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
|
||||
fn move_mouse(&self, x: i32, y: i32) -> Result<()> {
|
||||
use core_graphics::event::{
|
||||
CGEvent, CGEventTapLocation, CGEventType, CGMouseButton,
|
||||
};
|
||||
use core_graphics::event_source::{
|
||||
CGEventSource, CGEventSourceStateID,
|
||||
};
|
||||
use core_graphics::event::{CGEvent, CGEventTapLocation, CGEventType, CGMouseButton};
|
||||
use core_graphics::event_source::{CGEventSource, CGEventSourceStateID};
|
||||
use core_graphics::geometry::CGPoint;
|
||||
|
||||
|
||||
let source = CGEventSource::new(CGEventSourceStateID::HIDSystemState)
|
||||
.ok().context("Failed to create event source")?;
|
||||
|
||||
.ok()
|
||||
.context("Failed to create event source")?;
|
||||
|
||||
let event = CGEvent::new_mouse_event(
|
||||
source,
|
||||
CGEventType::MouseMoved,
|
||||
CGPoint::new(x as f64, y as f64),
|
||||
CGMouseButton::Left,
|
||||
).ok().context("Failed to create mouse event")?;
|
||||
|
||||
)
|
||||
.ok()
|
||||
.context("Failed to create mouse event")?;
|
||||
|
||||
event.post(CGEventTapLocation::HID);
|
||||
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
fn click_at(&self, x: i32, y: i32, _app_name: Option<&str>) -> Result<()> {
|
||||
use core_graphics::event::{
|
||||
CGEvent, CGEventTapLocation, CGEventType, CGMouseButton,
|
||||
};
|
||||
use core_graphics::event_source::{
|
||||
CGEventSource, CGEventSourceStateID,
|
||||
};
|
||||
use core_graphics::geometry::CGPoint;
|
||||
use core_graphics::display::CGDisplay;
|
||||
|
||||
use core_graphics::event::{CGEvent, CGEventTapLocation, CGEventType, CGMouseButton};
|
||||
use core_graphics::event_source::{CGEventSource, CGEventSourceStateID};
|
||||
use core_graphics::geometry::CGPoint;
|
||||
|
||||
// IMPORTANT: Coordinates passed here are in NSScreen/CGWindowListCopyWindowInfo space
|
||||
// (Y=0 at BOTTOM, increases UPWARD)
|
||||
// But CGEvent uses a different coordinate system (Y=0 at TOP, increases DOWNWARD)
|
||||
// We need to convert: CGEvent.y = screenHeight - NSScreen.y
|
||||
|
||||
|
||||
let screen_height = CGDisplay::main().pixels_high() as i32;
|
||||
let cgevent_x = x;
|
||||
let cgevent_y = screen_height - y;
|
||||
|
||||
tracing::debug!("click_at: NSScreen coords ({}, {}) -> CGEvent coords ({}, {}) [screen_height={}]",
|
||||
x, y, cgevent_x, cgevent_y, screen_height);
|
||||
|
||||
|
||||
tracing::debug!(
|
||||
"click_at: NSScreen coords ({}, {}) -> CGEvent coords ({}, {}) [screen_height={}]",
|
||||
x,
|
||||
y,
|
||||
cgevent_x,
|
||||
cgevent_y,
|
||||
screen_height
|
||||
);
|
||||
|
||||
let (global_x, global_y) = (cgevent_x, cgevent_y);
|
||||
|
||||
|
||||
let point = CGPoint::new(global_x as f64, global_y as f64);
|
||||
|
||||
|
||||
let source = CGEventSource::new(CGEventSourceStateID::HIDSystemState)
|
||||
.ok().context("Failed to create event source")?;
|
||||
|
||||
.ok()
|
||||
.context("Failed to create event source")?;
|
||||
|
||||
// Move mouse to position first
|
||||
let move_event = CGEvent::new_mouse_event(
|
||||
source.clone(),
|
||||
CGEventType::MouseMoved,
|
||||
point,
|
||||
CGMouseButton::Left,
|
||||
).ok().context("Failed to create mouse move event")?;
|
||||
)
|
||||
.ok()
|
||||
.context("Failed to create mouse move event")?;
|
||||
move_event.post(CGEventTapLocation::HID);
|
||||
|
||||
|
||||
std::thread::sleep(std::time::Duration::from_millis(100));
|
||||
|
||||
|
||||
// Mouse down
|
||||
let mouse_down = CGEvent::new_mouse_event(
|
||||
source.clone(),
|
||||
CGEventType::LeftMouseDown,
|
||||
point,
|
||||
CGMouseButton::Left,
|
||||
).ok().context("Failed to create mouse down event")?;
|
||||
)
|
||||
.ok()
|
||||
.context("Failed to create mouse down event")?;
|
||||
mouse_down.post(CGEventTapLocation::HID);
|
||||
|
||||
|
||||
std::thread::sleep(std::time::Duration::from_millis(50));
|
||||
|
||||
|
||||
// Mouse up
|
||||
let mouse_up = CGEvent::new_mouse_event(
|
||||
source,
|
||||
CGEventType::LeftMouseUp,
|
||||
point,
|
||||
CGMouseButton::Left,
|
||||
).ok().context("Failed to create mouse up event")?;
|
||||
let mouse_up =
|
||||
CGEvent::new_mouse_event(source, CGEventType::LeftMouseUp, point, CGMouseButton::Left)
|
||||
.ok()
|
||||
.context("Failed to create mouse up event")?;
|
||||
mouse_up.post(CGEventTapLocation::HID);
|
||||
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -328,19 +383,17 @@ impl MacOSController {
|
||||
/// Get window bounds for an application (helper method)
|
||||
fn get_window_bounds(&self, app_name: &str) -> Result<(i32, i32, i32, i32)> {
|
||||
unsafe {
|
||||
let window_list = CGWindowListCopyWindowInfo(
|
||||
kCGWindowListOptionOnScreenOnly,
|
||||
kCGNullWindowID
|
||||
);
|
||||
|
||||
let window_list =
|
||||
CGWindowListCopyWindowInfo(kCGWindowListOptionOnScreenOnly, kCGNullWindowID);
|
||||
|
||||
let array = CFArray::<CFDictionary>::wrap_under_create_rule(window_list);
|
||||
let count = array.len();
|
||||
|
||||
|
||||
let app_name_lower = app_name.to_lowercase();
|
||||
|
||||
|
||||
for i in 0..count {
|
||||
let dict = array.get(i).unwrap();
|
||||
|
||||
|
||||
// Get owner name
|
||||
let owner_key = CFString::from_static_string("kCGWindowOwnerName");
|
||||
let owner: String = if let Some(value) = dict.find(owner_key.to_void()) {
|
||||
@@ -349,65 +402,81 @@ impl MacOSController {
|
||||
} else {
|
||||
continue;
|
||||
};
|
||||
|
||||
|
||||
let owner_lower = owner.to_lowercase();
|
||||
|
||||
|
||||
// Normalize by removing spaces for exact matching
|
||||
let app_name_normalized = app_name_lower.replace(" ", "");
|
||||
let owner_normalized = owner_lower.replace(" ", "");
|
||||
|
||||
|
||||
// ONLY accept exact matches (case-insensitive, with or without spaces)
|
||||
// This prevents "Goose" from matching "GooseStudio"
|
||||
let is_match = owner_lower == app_name_lower || owner_normalized == app_name_normalized;
|
||||
|
||||
let is_match =
|
||||
owner_lower == app_name_lower || owner_normalized == app_name_normalized;
|
||||
|
||||
if is_match {
|
||||
// Get window layer to filter out menu bar windows
|
||||
let layer_key = CFString::from_static_string("kCGWindowLayer");
|
||||
let layer: i32 = if let Some(value) = dict.find(layer_key.to_void()) {
|
||||
let num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*value as *const _);
|
||||
let num: core_foundation::number::CFNumber =
|
||||
TCFType::wrap_under_get_rule(*value as *const _);
|
||||
num.to_i32().unwrap_or(0)
|
||||
} else {
|
||||
0
|
||||
};
|
||||
|
||||
|
||||
// Skip menu bar windows (layer >= 20)
|
||||
if layer >= 20 {
|
||||
tracing::debug!("Skipping window for '{}' at layer {} (menu bar)", owner, layer);
|
||||
tracing::debug!(
|
||||
"Skipping window for '{}' at layer {} (menu bar)",
|
||||
owner,
|
||||
layer
|
||||
);
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
// Get window bounds to verify it's a real window
|
||||
let bounds_key = CFString::from_static_string("kCGWindowBounds");
|
||||
if let Some(value) = dict.find(bounds_key.to_void()) {
|
||||
let bounds_dict: CFDictionary = TCFType::wrap_under_get_rule(*value as *const _);
|
||||
|
||||
let bounds_dict: CFDictionary =
|
||||
TCFType::wrap_under_get_rule(*value as *const _);
|
||||
|
||||
let x_key = CFString::from_static_string("X");
|
||||
let y_key = CFString::from_static_string("Y");
|
||||
let width_key = CFString::from_static_string("Width");
|
||||
let height_key = CFString::from_static_string("Height");
|
||||
|
||||
|
||||
if let (Some(x_val), Some(y_val), Some(w_val), Some(h_val)) = (
|
||||
bounds_dict.find(x_key.to_void()),
|
||||
bounds_dict.find(y_key.to_void()),
|
||||
bounds_dict.find(width_key.to_void()),
|
||||
bounds_dict.find(height_key.to_void()),
|
||||
) {
|
||||
let x_num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*x_val as *const _);
|
||||
let y_num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*y_val as *const _);
|
||||
let w_num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*w_val as *const _);
|
||||
let h_num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*h_val as *const _);
|
||||
|
||||
let x_num: core_foundation::number::CFNumber =
|
||||
TCFType::wrap_under_get_rule(*x_val as *const _);
|
||||
let y_num: core_foundation::number::CFNumber =
|
||||
TCFType::wrap_under_get_rule(*y_val as *const _);
|
||||
let w_num: core_foundation::number::CFNumber =
|
||||
TCFType::wrap_under_get_rule(*w_val as *const _);
|
||||
let h_num: core_foundation::number::CFNumber =
|
||||
TCFType::wrap_under_get_rule(*h_val as *const _);
|
||||
|
||||
let x: i32 = x_num.to_i64().unwrap_or(0) as i32;
|
||||
let y: i32 = y_num.to_i64().unwrap_or(0) as i32;
|
||||
let w: i32 = w_num.to_i64().unwrap_or(0) as i32;
|
||||
let h: i32 = h_num.to_i64().unwrap_or(0) as i32;
|
||||
|
||||
|
||||
// Only accept windows with real bounds (>= 100x100 pixels)
|
||||
if w >= 100 && h >= 100 {
|
||||
tracing::info!("Found valid window bounds for '{}': x={}, y={}, w={}, h={} (layer={})", owner, x, y, w, h, layer);
|
||||
return Ok((x, y, w, h));
|
||||
} else {
|
||||
tracing::debug!("Skipping window for '{}': too small ({}x{})", owner, w, h);
|
||||
tracing::debug!(
|
||||
"Skipping window for '{}': too small ({}x{})",
|
||||
owner,
|
||||
w,
|
||||
h
|
||||
);
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
@@ -417,8 +486,11 @@ impl MacOSController {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Err(anyhow::anyhow!("Could not find window bounds for '{}'", app_name))
|
||||
|
||||
Err(anyhow::anyhow!(
|
||||
"Could not find window bounds for '{}'",
|
||||
app_name
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -426,72 +498,118 @@ impl MacOSController {
|
||||
fn get_image_dimensions(path: &str) -> Result<(i32, i32)> {
|
||||
use std::fs::File;
|
||||
use std::io::Read;
|
||||
|
||||
|
||||
let mut file = File::open(path)?;
|
||||
let mut buffer = vec![0u8; 24];
|
||||
file.read_exact(&mut buffer)?;
|
||||
|
||||
|
||||
// PNG signature check
|
||||
if &buffer[0..8] != b"\x89PNG\r\n\x1a\n" {
|
||||
anyhow::bail!("Not a valid PNG file");
|
||||
}
|
||||
|
||||
|
||||
// Read IHDR chunk (width and height are at bytes 16-23)
|
||||
let width = u32::from_be_bytes([buffer[16], buffer[17], buffer[18], buffer[19]]) as i32;
|
||||
let height = u32::from_be_bytes([buffer[20], buffer[21], buffer[22], buffer[23]]) as i32;
|
||||
|
||||
|
||||
Ok((width, height))
|
||||
}
|
||||
|
||||
/// Transform coordinates from screenshot space to screen space
|
||||
///
|
||||
///
|
||||
/// The screenshot is taken of a window, and Vision OCR returns coordinates
|
||||
/// relative to the screenshot image. We need to transform these to actual
|
||||
/// screen coordinates for clicking.
|
||||
///
|
||||
///
|
||||
/// On Retina displays, screenshots are taken at 2x resolution, so we need
|
||||
/// to account for this scaling factor.
|
||||
fn transform_screenshot_to_screen_coords(
|
||||
location: TextLocation,
|
||||
window_bounds: (i32, i32, i32, i32), // (x, y, width, height) in screen space
|
||||
screenshot_dims: (i32, i32), // (width, height) in pixels
|
||||
screenshot_dims: (i32, i32), // (width, height) in pixels
|
||||
) -> TextLocation {
|
||||
let (win_x, win_y, win_width, win_height) = window_bounds;
|
||||
let (screenshot_width, screenshot_height) = screenshot_dims;
|
||||
|
||||
|
||||
// Calculate scale factors
|
||||
// On Retina displays, screenshot is typically 2x the window size
|
||||
let scale_x = win_width as f64 / screenshot_width as f64;
|
||||
let scale_y = win_height as f64 / screenshot_height as f64;
|
||||
|
||||
tracing::debug!("Transform: screenshot={}x{}, window={}x{} at ({},{}), scale=({:.2},{:.2})",
|
||||
screenshot_width, screenshot_height, win_width, win_height, win_x, win_y, scale_x, scale_y);
|
||||
|
||||
|
||||
tracing::debug!(
|
||||
"Transform: screenshot={}x{}, window={}x{} at ({},{}), scale=({:.2},{:.2})",
|
||||
screenshot_width,
|
||||
screenshot_height,
|
||||
win_width,
|
||||
win_height,
|
||||
win_x,
|
||||
win_y,
|
||||
scale_x,
|
||||
scale_y
|
||||
);
|
||||
|
||||
// Transform coordinates from image space to screen space
|
||||
// IMPORTANT: macOS screen coordinates have origin at BOTTOM-LEFT (Y increases upward)
|
||||
// Image coordinates have origin at TOP-LEFT (Y increases downward)
|
||||
// win_y is the BOTTOM of the window in screen coordinates
|
||||
// So we need to: (win_y + win_height) to get window TOP, then subtract screenshot_y
|
||||
let window_top_y = win_y + win_height;
|
||||
|
||||
tracing::debug!("[transform] Input location in image space: x={}, y={}, width={}, height={}",
|
||||
location.x, location.y, location.width, location.height);
|
||||
tracing::debug!("[transform] Scale factors: scale_x={:.4}, scale_y={:.4}", scale_x, scale_y);
|
||||
|
||||
|
||||
tracing::debug!(
|
||||
"[transform] Input location in image space: x={}, y={}, width={}, height={}",
|
||||
location.x,
|
||||
location.y,
|
||||
location.width,
|
||||
location.height
|
||||
);
|
||||
tracing::debug!(
|
||||
"[transform] Scale factors: scale_x={:.4}, scale_y={:.4}",
|
||||
scale_x,
|
||||
scale_y
|
||||
);
|
||||
|
||||
let transformed_x = win_x + (location.x as f64 * scale_x) as i32;
|
||||
let transformed_y = window_top_y - (location.y as f64 * scale_y) as i32;
|
||||
let transformed_width = (location.width as f64 * scale_x) as i32;
|
||||
let transformed_height = (location.height as f64 * scale_y) as i32;
|
||||
|
||||
|
||||
tracing::debug!("[transform] Calculation details:");
|
||||
tracing::debug!(" - transformed_x = {} + ({} * {:.4}) = {} + {:.2} = {}", win_x, location.x, scale_x, win_x, location.x as f64 * scale_x, transformed_x);
|
||||
tracing::debug!(" - transformed_width = ({} * {:.4}) = {:.2} -> {}", location.width, scale_x, location.width as f64 * scale_x, transformed_width);
|
||||
tracing::debug!(" - transformed_height = ({} * {:.4}) = {:.2} -> {}", location.height, scale_y, location.height as f64 * scale_y, transformed_height);
|
||||
|
||||
tracing::debug!("Transformed location: screenshot=({},{}) {}x{} -> screen=({},{}) {}x{}",
|
||||
location.x, location.y, location.width, location.height,
|
||||
transformed_x, transformed_y, transformed_width, transformed_height);
|
||||
|
||||
tracing::debug!(
|
||||
" - transformed_x = {} + ({} * {:.4}) = {} + {:.2} = {}",
|
||||
win_x,
|
||||
location.x,
|
||||
scale_x,
|
||||
win_x,
|
||||
location.x as f64 * scale_x,
|
||||
transformed_x
|
||||
);
|
||||
tracing::debug!(
|
||||
" - transformed_width = ({} * {:.4}) = {:.2} -> {}",
|
||||
location.width,
|
||||
scale_x,
|
||||
location.width as f64 * scale_x,
|
||||
transformed_width
|
||||
);
|
||||
tracing::debug!(
|
||||
" - transformed_height = ({} * {:.4}) = {:.2} -> {}",
|
||||
location.height,
|
||||
scale_y,
|
||||
location.height as f64 * scale_y,
|
||||
transformed_height
|
||||
);
|
||||
|
||||
tracing::debug!(
|
||||
"Transformed location: screenshot=({},{}) {}x{} -> screen=({},{}) {}x{}",
|
||||
location.x,
|
||||
location.y,
|
||||
location.width,
|
||||
location.height,
|
||||
transformed_x,
|
||||
transformed_y,
|
||||
transformed_width,
|
||||
transformed_height
|
||||
);
|
||||
|
||||
TextLocation {
|
||||
text: location.text,
|
||||
x: transformed_x,
|
||||
@@ -504,4 +622,4 @@ fn transform_screenshot_to_screen_coords(
|
||||
|
||||
#[path = "macos_window_matching_test.rs"]
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
mod tests;
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
#[cfg(test)]
|
||||
mod window_matching_tests {
|
||||
/// Test that window name matching handles spaces correctly
|
||||
///
|
||||
///
|
||||
/// Issue: When a user requests a screenshot of "Goose Studio" but the actual
|
||||
/// application name is "GooseStudio" (no space), the fuzzy matching should
|
||||
/// still find the window.
|
||||
///
|
||||
///
|
||||
/// The fix normalizes both names by removing spaces before comparing.
|
||||
#[test]
|
||||
fn test_space_normalization() {
|
||||
@@ -16,25 +16,25 @@ mod window_matching_tests {
|
||||
("Visual Studio Code", "VisualStudioCode", true),
|
||||
("Google Chrome", "Google Chrome", true),
|
||||
("Safari", "Safari", true),
|
||||
("iTerm", "iTerm2", true), // fuzzy match
|
||||
("iTerm", "iTerm2", true), // fuzzy match
|
||||
("Code", "Visual Studio Code", true), // fuzzy match
|
||||
];
|
||||
|
||||
for (user_input, app_name, should_match) in test_cases {
|
||||
let user_lower = user_input.to_lowercase();
|
||||
let app_lower = app_name.to_lowercase();
|
||||
|
||||
|
||||
let user_normalized = user_lower.replace(" ", "");
|
||||
let app_normalized = app_lower.replace(" ", "");
|
||||
|
||||
|
||||
let is_exact = app_lower == user_lower || app_normalized == user_normalized;
|
||||
let is_fuzzy = app_lower.contains(&user_lower)
|
||||
let is_fuzzy = app_lower.contains(&user_lower)
|
||||
|| user_lower.contains(&app_lower)
|
||||
|| app_normalized.contains(&user_normalized)
|
||||
|| user_normalized.contains(&app_normalized);
|
||||
|
||||
|
||||
let matches = is_exact || is_fuzzy;
|
||||
|
||||
|
||||
assert_eq!(
|
||||
matches, should_match,
|
||||
"Expected '{}' vs '{}' to match={}, but got match={}",
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use crate::{ComputerController, types::*};
|
||||
use crate::{types::*, ComputerController};
|
||||
use anyhow::Result;
|
||||
use async_trait::async_trait;
|
||||
use tesseract::Tesseract;
|
||||
@@ -20,48 +20,53 @@ impl ComputerController for WindowsController {
|
||||
async fn move_mouse(&self, _x: i32, _y: i32) -> Result<()> {
|
||||
anyhow::bail!("Windows implementation not yet available")
|
||||
}
|
||||
|
||||
|
||||
async fn click(&self, _button: MouseButton) -> Result<()> {
|
||||
anyhow::bail!("Windows implementation not yet available")
|
||||
}
|
||||
|
||||
|
||||
async fn double_click(&self, _button: MouseButton) -> Result<()> {
|
||||
anyhow::bail!("Windows implementation not yet available")
|
||||
}
|
||||
|
||||
|
||||
async fn type_text(&self, _text: &str) -> Result<()> {
|
||||
anyhow::bail!("Windows implementation not yet available")
|
||||
}
|
||||
|
||||
|
||||
async fn press_key(&self, _key: &str) -> Result<()> {
|
||||
anyhow::bail!("Windows implementation not yet available")
|
||||
}
|
||||
|
||||
|
||||
async fn list_windows(&self) -> Result<Vec<Window>> {
|
||||
anyhow::bail!("Windows implementation not yet available")
|
||||
}
|
||||
|
||||
|
||||
async fn focus_window(&self, _window_id: &str) -> Result<()> {
|
||||
anyhow::bail!("Windows implementation not yet available")
|
||||
}
|
||||
|
||||
|
||||
async fn get_window_bounds(&self, _window_id: &str) -> Result<Rect> {
|
||||
anyhow::bail!("Windows implementation not yet available")
|
||||
}
|
||||
|
||||
|
||||
async fn find_element(&self, _selector: &ElementSelector) -> Result<Option<UIElement>> {
|
||||
anyhow::bail!("Windows implementation not yet available")
|
||||
}
|
||||
|
||||
|
||||
async fn get_element_text(&self, _element_id: &str) -> Result<String> {
|
||||
anyhow::bail!("Windows implementation not yet available")
|
||||
}
|
||||
|
||||
|
||||
async fn get_element_bounds(&self, _element_id: &str) -> Result<Rect> {
|
||||
anyhow::bail!("Windows implementation not yet available")
|
||||
}
|
||||
|
||||
async fn take_screenshot(&self, _path: &str, _region: Option<Rect>, _window_id: Option<&str>) -> Result<()> {
|
||||
|
||||
async fn take_screenshot(
|
||||
&self,
|
||||
_path: &str,
|
||||
_region: Option<Rect>,
|
||||
_window_id: Option<&str>,
|
||||
) -> Result<()> {
|
||||
// Enforce that window_id must be provided
|
||||
if _window_id.is_none() {
|
||||
anyhow::bail!("window_id is required. You must specify which window to capture (e.g., 'Chrome', 'Terminal', 'Notepad'). Use list_windows to see available windows.");
|
||||
@@ -69,96 +74,113 @@ impl ComputerController for WindowsController {
|
||||
|
||||
anyhow::bail!("Windows implementation not yet available")
|
||||
}
|
||||
|
||||
|
||||
async fn extract_text_from_screen(&self, _region: Rect, _window_id: &str) -> Result<String> {
|
||||
anyhow::bail!("Windows implementation not yet available")
|
||||
}
|
||||
|
||||
|
||||
async fn extract_text_from_image(&self, _path: &str) -> Result<OCRResult> {
|
||||
// Check if tesseract is available on the system
|
||||
let tesseract_check = std::process::Command::new("where")
|
||||
.arg("tesseract")
|
||||
.output();
|
||||
|
||||
|
||||
if tesseract_check.is_err() || !tesseract_check.as_ref().unwrap().status.success() {
|
||||
anyhow::bail!("Tesseract OCR is not installed on your system.\n\n\
|
||||
anyhow::bail!(
|
||||
"Tesseract OCR is not installed on your system.\n\n\
|
||||
To install tesseract on Windows:\n \
|
||||
1. Download the installer from: https://github.com/UB-Mannheim/tesseract/wiki\n \
|
||||
2. Run the installer and follow the instructions\n \
|
||||
3. Add tesseract to your PATH environment variable\n \
|
||||
4. Restart your terminal/command prompt\n\n\
|
||||
After installation, restart your terminal and try again.");
|
||||
After installation, restart your terminal and try again."
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
// Initialize Tesseract
|
||||
let tess = Tesseract::new(None, Some("eng"))
|
||||
.map_err(|e| {
|
||||
anyhow::anyhow!("Failed to initialize Tesseract: {}\n\n\
|
||||
let tess = Tesseract::new(None, Some("eng")).map_err(|e| {
|
||||
anyhow::anyhow!(
|
||||
"Failed to initialize Tesseract: {}\n\n\
|
||||
This usually means:\n1. Tesseract is not properly installed\n\
|
||||
2. Language data files are missing\n\nTo fix:\n \
|
||||
1. Reinstall tesseract from https://github.com/UB-Mannheim/tesseract/wiki\n \
|
||||
2. Make sure to select 'Additional language data' during installation\n \
|
||||
3. Ensure tesseract is in your PATH", e)
|
||||
})?;
|
||||
|
||||
let text = tess.set_image(_path)
|
||||
3. Ensure tesseract is in your PATH",
|
||||
e
|
||||
)
|
||||
})?;
|
||||
|
||||
let text = tess
|
||||
.set_image(_path)
|
||||
.map_err(|e| anyhow::anyhow!("Failed to load image '{}': {}", _path, e))?
|
||||
.get_text()
|
||||
.map_err(|e| anyhow::anyhow!("Failed to extract text from image: {}", e))?;
|
||||
|
||||
|
||||
// Get confidence (simplified - would need more complex API calls for per-word confidence)
|
||||
let confidence = 0.85; // Placeholder
|
||||
|
||||
|
||||
Ok(OCRResult {
|
||||
text,
|
||||
confidence,
|
||||
bounds: Rect { x: 0, y: 0, width: 0, height: 0 }, // Would need image dimensions
|
||||
bounds: Rect {
|
||||
x: 0,
|
||||
y: 0,
|
||||
width: 0,
|
||||
height: 0,
|
||||
}, // Would need image dimensions
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
async fn find_text_on_screen(&self, _text: &str) -> Result<Option<Point>> {
|
||||
// Check if tesseract is available on the system
|
||||
let tesseract_check = std::process::Command::new("where")
|
||||
.arg("tesseract")
|
||||
.output();
|
||||
|
||||
|
||||
if tesseract_check.is_err() || !tesseract_check.as_ref().unwrap().status.success() {
|
||||
anyhow::bail!("Tesseract OCR is not installed on your system.\n\n\
|
||||
anyhow::bail!(
|
||||
"Tesseract OCR is not installed on your system.\n\n\
|
||||
To install tesseract on Windows:\n \
|
||||
1. Download the installer from: https://github.com/UB-Mannheim/tesseract/wiki\n \
|
||||
2. Run the installer and follow the instructions\n \
|
||||
3. Add tesseract to your PATH environment variable\n \
|
||||
4. Restart your terminal/command prompt\n\n\
|
||||
After installation, restart your terminal and try again.");
|
||||
After installation, restart your terminal and try again."
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
// Take full screen screenshot
|
||||
let temp_path = format!("C:\\\\Temp\\\\g3_ocr_search_{}.png", uuid::Uuid::new_v4());
|
||||
self.take_screenshot(&temp_path, None, None).await?;
|
||||
|
||||
|
||||
// Use Tesseract to find text with bounding boxes
|
||||
let tess = Tesseract::new(None, Some("eng"))
|
||||
.map_err(|e| {
|
||||
anyhow::anyhow!("Failed to initialize Tesseract: {}\n\n\
|
||||
let tess = Tesseract::new(None, Some("eng")).map_err(|e| {
|
||||
anyhow::anyhow!(
|
||||
"Failed to initialize Tesseract: {}\n\n\
|
||||
This usually means:\n1. Tesseract is not properly installed\n\
|
||||
2. Language data files are missing\n\nTo fix:\n \
|
||||
1. Reinstall tesseract from https://github.com/UB-Mannheim/tesseract/wiki\n \
|
||||
2. Make sure to select 'Additional language data' during installation\n \
|
||||
3. Ensure tesseract is in your PATH", e)
|
||||
})?;
|
||||
|
||||
let full_text = tess.set_image(temp_path.as_str())
|
||||
3. Ensure tesseract is in your PATH",
|
||||
e
|
||||
)
|
||||
})?;
|
||||
|
||||
let full_text = tess
|
||||
.set_image(temp_path.as_str())
|
||||
.map_err(|e| anyhow::anyhow!("Failed to load screenshot: {}", e))?
|
||||
.get_text()
|
||||
.map_err(|e| anyhow::anyhow!("Failed to extract text from screen: {}", e))?;
|
||||
|
||||
|
||||
// Clean up temp file
|
||||
let _ = std::fs::remove_file(&temp_path);
|
||||
|
||||
|
||||
// Simple text search - full implementation would use get_component_images
|
||||
// to get bounding boxes for each word
|
||||
if full_text.contains(_text) {
|
||||
tracing::warn!("Text found but precise coordinates not available in simplified implementation");
|
||||
tracing::warn!(
|
||||
"Text found but precise coordinates not available in simplified implementation"
|
||||
);
|
||||
Ok(Some(Point { x: 0, y: 0 }))
|
||||
} else {
|
||||
Ok(None)
|
||||
|
||||
@@ -9,31 +9,31 @@ use serde_json::Value;
|
||||
pub trait WebDriverController: Send + Sync {
|
||||
/// Navigate to a URL
|
||||
async fn navigate(&mut self, url: &str) -> Result<()>;
|
||||
|
||||
|
||||
/// Get the current URL
|
||||
async fn current_url(&self) -> Result<String>;
|
||||
|
||||
|
||||
/// Get the page title
|
||||
async fn title(&self) -> Result<String>;
|
||||
|
||||
|
||||
/// Find an element by CSS selector
|
||||
async fn find_element(&mut self, selector: &str) -> Result<WebElement>;
|
||||
|
||||
|
||||
/// Find multiple elements by CSS selector
|
||||
async fn find_elements(&mut self, selector: &str) -> Result<Vec<WebElement>>;
|
||||
|
||||
|
||||
/// Execute JavaScript in the browser
|
||||
async fn execute_script(&mut self, script: &str, args: Vec<Value>) -> Result<Value>;
|
||||
|
||||
|
||||
/// Get the page source (HTML)
|
||||
async fn page_source(&self) -> Result<String>;
|
||||
|
||||
|
||||
/// Take a screenshot and save to path
|
||||
async fn screenshot(&mut self, path: &str) -> Result<()>;
|
||||
|
||||
|
||||
/// Close the current window/tab
|
||||
async fn close(&mut self) -> Result<()>;
|
||||
|
||||
|
||||
/// Quit the browser session
|
||||
async fn quit(self) -> Result<()>;
|
||||
}
|
||||
@@ -49,63 +49,69 @@ impl WebElement {
|
||||
self.inner.click().await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
/// Send keys/text to the element
|
||||
pub async fn send_keys(&mut self, text: &str) -> Result<()> {
|
||||
self.inner.send_keys(text).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
/// Clear the element's content (for input fields)
|
||||
pub async fn clear(&mut self) -> Result<()> {
|
||||
self.inner.clear().await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
/// Get the element's text content
|
||||
pub async fn text(&self) -> Result<String> {
|
||||
Ok(self.inner.text().await?)
|
||||
}
|
||||
|
||||
|
||||
/// Get an attribute value
|
||||
pub async fn attr(&self, name: &str) -> Result<Option<String>> {
|
||||
Ok(self.inner.attr(name).await?)
|
||||
}
|
||||
|
||||
|
||||
/// Get a property value
|
||||
pub async fn prop(&self, name: &str) -> Result<Option<String>> {
|
||||
Ok(self.inner.prop(name).await?)
|
||||
}
|
||||
|
||||
|
||||
/// Get the element's HTML
|
||||
pub async fn html(&self, inner: bool) -> Result<String> {
|
||||
Ok(self.inner.html(inner).await?)
|
||||
}
|
||||
|
||||
|
||||
/// Check if element is displayed
|
||||
pub async fn is_displayed(&self) -> Result<bool> {
|
||||
Ok(self.inner.is_displayed().await?)
|
||||
}
|
||||
|
||||
|
||||
/// Check if element is enabled
|
||||
pub async fn is_enabled(&self) -> Result<bool> {
|
||||
Ok(self.inner.is_enabled().await?)
|
||||
}
|
||||
|
||||
|
||||
/// Check if element is selected (for checkboxes/radio buttons)
|
||||
pub async fn is_selected(&self) -> Result<bool> {
|
||||
Ok(self.inner.is_selected().await?)
|
||||
}
|
||||
|
||||
|
||||
/// Find a child element by CSS selector
|
||||
pub async fn find_element(&mut self, selector: &str) -> Result<WebElement> {
|
||||
let elem = self.inner.find(fantoccini::Locator::Css(selector)).await?;
|
||||
Ok(WebElement { inner: elem })
|
||||
}
|
||||
|
||||
|
||||
/// Find multiple child elements by CSS selector
|
||||
pub async fn find_elements(&mut self, selector: &str) -> Result<Vec<WebElement>> {
|
||||
let elems = self.inner.find_all(fantoccini::Locator::Css(selector)).await?;
|
||||
Ok(elems.into_iter().map(|inner| WebElement { inner }).collect())
|
||||
let elems = self
|
||||
.inner
|
||||
.find_all(fantoccini::Locator::Css(selector))
|
||||
.await?;
|
||||
Ok(elems
|
||||
.into_iter()
|
||||
.map(|inner| WebElement { inner })
|
||||
.collect())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,10 +12,10 @@ pub struct SafariDriver {
|
||||
|
||||
impl SafariDriver {
|
||||
/// Create a new SafariDriver instance
|
||||
///
|
||||
///
|
||||
/// This will connect to SafariDriver running on the default port (4444).
|
||||
/// Make sure to enable "Allow Remote Automation" in Safari's Develop menu first.
|
||||
///
|
||||
///
|
||||
/// You can start SafariDriver manually with:
|
||||
/// ```bash
|
||||
/// /usr/bin/safaridriver --enable
|
||||
@@ -23,125 +23,134 @@ impl SafariDriver {
|
||||
pub async fn new() -> Result<Self> {
|
||||
Self::with_port(4444).await
|
||||
}
|
||||
|
||||
|
||||
/// Create a new SafariDriver instance with a custom port
|
||||
pub async fn with_port(port: u16) -> Result<Self> {
|
||||
let url = format!("http://localhost:{}", port);
|
||||
|
||||
|
||||
let mut caps = serde_json::Map::new();
|
||||
caps.insert("browserName".to_string(), Value::String("safari".to_string()));
|
||||
|
||||
caps.insert(
|
||||
"browserName".to_string(),
|
||||
Value::String("safari".to_string()),
|
||||
);
|
||||
|
||||
let client = ClientBuilder::native()
|
||||
.capabilities(caps)
|
||||
.connect(&url)
|
||||
.await
|
||||
.context("Failed to connect to SafariDriver. Make sure SafariDriver is running and 'Allow Remote Automation' is enabled in Safari's Develop menu.")?;
|
||||
|
||||
|
||||
Ok(Self { client })
|
||||
}
|
||||
|
||||
|
||||
/// Go back in browser history
|
||||
pub async fn back(&mut self) -> Result<()> {
|
||||
self.client.back().await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
/// Go forward in browser history
|
||||
pub async fn forward(&mut self) -> Result<()> {
|
||||
self.client.forward().await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
/// Refresh the current page
|
||||
pub async fn refresh(&mut self) -> Result<()> {
|
||||
self.client.refresh().await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
/// Get all window handles
|
||||
pub async fn window_handles(&mut self) -> Result<Vec<String>> {
|
||||
let handles = self.client.windows().await?;
|
||||
Ok(handles.into_iter()
|
||||
.map(|h| h.into())
|
||||
.collect())
|
||||
Ok(handles.into_iter().map(|h| h.into()).collect())
|
||||
}
|
||||
|
||||
|
||||
/// Switch to a window by handle
|
||||
pub async fn switch_to_window(&mut self, handle: &str) -> Result<()> {
|
||||
let window_handle: fantoccini::wd::WindowHandle = handle.to_string().try_into()?;
|
||||
self.client.switch_to_window(window_handle).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
/// Get the current window handle
|
||||
pub async fn current_window_handle(&mut self) -> Result<String> {
|
||||
Ok(self.client.window().await?.into())
|
||||
}
|
||||
|
||||
|
||||
/// Close the current window
|
||||
pub async fn close_window(&mut self) -> Result<()> {
|
||||
self.client.close_window().await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
/// Create a new window/tab
|
||||
pub async fn new_window(&mut self, is_tab: bool) -> Result<String> {
|
||||
let window_type = if is_tab { "tab" } else { "window" };
|
||||
let response = self.client.new_window(window_type == "tab").await?;
|
||||
Ok(response.handle.into())
|
||||
}
|
||||
|
||||
|
||||
/// Get cookies
|
||||
pub async fn get_cookies(&mut self) -> Result<Vec<fantoccini::cookies::Cookie<'static>>> {
|
||||
Ok(self.client.get_all_cookies().await?)
|
||||
}
|
||||
|
||||
|
||||
/// Add a cookie
|
||||
pub async fn add_cookie(&mut self, cookie: fantoccini::cookies::Cookie<'static>) -> Result<()> {
|
||||
self.client.add_cookie(cookie).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
/// Delete all cookies
|
||||
pub async fn delete_all_cookies(&mut self) -> Result<()> {
|
||||
self.client.delete_all_cookies().await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
/// Wait for an element to appear (with timeout)
|
||||
pub async fn wait_for_element(&mut self, selector: &str, timeout: Duration) -> Result<WebElement> {
|
||||
pub async fn wait_for_element(
|
||||
&mut self,
|
||||
selector: &str,
|
||||
timeout: Duration,
|
||||
) -> Result<WebElement> {
|
||||
let start = std::time::Instant::now();
|
||||
let poll_interval = Duration::from_millis(100);
|
||||
|
||||
|
||||
loop {
|
||||
if let Ok(elem) = self.find_element(selector).await {
|
||||
return Ok(elem);
|
||||
}
|
||||
|
||||
|
||||
if start.elapsed() >= timeout {
|
||||
anyhow::bail!("Timeout waiting for element: {}", selector);
|
||||
}
|
||||
|
||||
|
||||
tokio::time::sleep(poll_interval).await;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Wait for an element to be visible (with timeout)
|
||||
pub async fn wait_for_visible(&mut self, selector: &str, timeout: Duration) -> Result<WebElement> {
|
||||
pub async fn wait_for_visible(
|
||||
&mut self,
|
||||
selector: &str,
|
||||
timeout: Duration,
|
||||
) -> Result<WebElement> {
|
||||
let start = std::time::Instant::now();
|
||||
let poll_interval = Duration::from_millis(100);
|
||||
|
||||
|
||||
loop {
|
||||
if let Ok(elem) = self.find_element(selector).await {
|
||||
if elem.is_displayed().await.unwrap_or(false) {
|
||||
return Ok(elem);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if start.elapsed() >= timeout {
|
||||
anyhow::bail!("Timeout waiting for element to be visible: {}", selector);
|
||||
}
|
||||
|
||||
|
||||
tokio::time::sleep(poll_interval).await;
|
||||
}
|
||||
}
|
||||
@@ -153,58 +162,69 @@ impl WebDriverController for SafariDriver {
|
||||
self.client.goto(url).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
async fn current_url(&self) -> Result<String> {
|
||||
Ok(self.client.current_url().await?.to_string())
|
||||
}
|
||||
|
||||
|
||||
async fn title(&self) -> Result<String> {
|
||||
Ok(self.client.title().await?)
|
||||
}
|
||||
|
||||
|
||||
async fn find_element(&mut self, selector: &str) -> Result<WebElement> {
|
||||
let elem = self.client.find(fantoccini::Locator::Css(selector)).await
|
||||
.context(format!("Failed to find element with selector: {}", selector))?;
|
||||
let elem = self
|
||||
.client
|
||||
.find(fantoccini::Locator::Css(selector))
|
||||
.await
|
||||
.context(format!(
|
||||
"Failed to find element with selector: {}",
|
||||
selector
|
||||
))?;
|
||||
Ok(WebElement { inner: elem })
|
||||
}
|
||||
|
||||
|
||||
async fn find_elements(&mut self, selector: &str) -> Result<Vec<WebElement>> {
|
||||
let elems = self.client.find_all(fantoccini::Locator::Css(selector)).await?;
|
||||
Ok(elems.into_iter().map(|inner| WebElement { inner }).collect())
|
||||
let elems = self
|
||||
.client
|
||||
.find_all(fantoccini::Locator::Css(selector))
|
||||
.await?;
|
||||
Ok(elems
|
||||
.into_iter()
|
||||
.map(|inner| WebElement { inner })
|
||||
.collect())
|
||||
}
|
||||
|
||||
|
||||
async fn execute_script(&mut self, script: &str, args: Vec<Value>) -> Result<Value> {
|
||||
Ok(self.client.execute(script, args).await?)
|
||||
}
|
||||
|
||||
|
||||
async fn page_source(&self) -> Result<String> {
|
||||
Ok(self.client.source().await?)
|
||||
}
|
||||
|
||||
|
||||
async fn screenshot(&mut self, path: &str) -> Result<()> {
|
||||
let screenshot_data = self.client.screenshot().await?;
|
||||
|
||||
|
||||
// Expand tilde in path
|
||||
let expanded_path = shellexpand::tilde(path);
|
||||
let path_str = expanded_path.as_ref();
|
||||
|
||||
|
||||
// Create parent directories if needed
|
||||
if let Some(parent) = std::path::Path::new(path_str).parent() {
|
||||
std::fs::create_dir_all(parent)
|
||||
.context("Failed to create parent directories for screenshot")?;
|
||||
}
|
||||
|
||||
std::fs::write(path_str, screenshot_data)
|
||||
.context("Failed to write screenshot to file")?;
|
||||
|
||||
|
||||
std::fs::write(path_str, screenshot_data).context("Failed to write screenshot to file")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
async fn close(&mut self) -> Result<()> {
|
||||
self.client.close_window().await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
async fn quit(mut self) -> Result<()> {
|
||||
self.client.close().await?;
|
||||
Ok(())
|
||||
|
||||
@@ -3,29 +3,35 @@ use g3_computer_control::*;
|
||||
#[tokio::test]
|
||||
async fn test_screenshot() {
|
||||
let controller = create_controller().expect("Failed to create controller");
|
||||
|
||||
|
||||
// Test that screenshot without window_id fails with appropriate error
|
||||
let path = "/tmp/test_screenshot.png";
|
||||
let result = controller.take_screenshot(path, None, None).await;
|
||||
assert!(result.is_err(), "Expected error when window_id is not provided");
|
||||
|
||||
assert!(
|
||||
result.is_err(),
|
||||
"Expected error when window_id is not provided"
|
||||
);
|
||||
|
||||
let error_msg = result.unwrap_err().to_string();
|
||||
assert!(error_msg.contains("window_id is required"),
|
||||
"Expected error message about window_id being required, got: {}", error_msg);
|
||||
assert!(
|
||||
error_msg.contains("window_id is required"),
|
||||
"Expected error message about window_id being required, got: {}",
|
||||
error_msg
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_screenshot_with_window() {
|
||||
let controller = create_controller().expect("Failed to create controller");
|
||||
|
||||
|
||||
// Take screenshot of Finder (should always be available on macOS)
|
||||
let path = "/tmp/test_screenshot_finder.png";
|
||||
let result = controller.take_screenshot(path, None, Some("Finder")).await;
|
||||
|
||||
|
||||
// This test may fail if Finder is not running, so we just check it doesn't panic
|
||||
// and returns a proper Result
|
||||
let _ = result; // Don't assert success since Finder might not be visible
|
||||
|
||||
|
||||
// Clean up
|
||||
let _ = std::fs::remove_file(path);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user