Remove VisionBridge OCR (unused)
VisionBridge was a Swift library for Apple Vision OCR that was built on every compile but was never actually used by any g3 tool.

Removed:
- vision-bridge/ Swift package directory
- src/ocr/ module (vision.rs, tesseract.rs, mod.rs)
- OCR methods from the ComputerController trait
- OCR-related code from the platform implementations
- TextLocation type (no longer needed)
- test_vision.rs example

Simplified:
- build.rs (now empty, no Swift compilation)
- MacOSController (no longer holds an OCR engine)
- LinuxController and WindowsController (stub implementations)

Build time improvement: no more 'Building VisionBridge Swift package...' messages on every compile.
This commit is contained in:
@@ -1,100 +1,4 @@
|
||||
use std::env;
|
||||
use std::path::PathBuf;
|
||||
use std::process::Command;
|
||||
|
||||
fn main() {
|
||||
// Only build Vision bridge on macOS
|
||||
if env::var("CARGO_CFG_TARGET_OS").unwrap() != "macos" {
|
||||
return;
|
||||
}
|
||||
|
||||
println!("cargo:rerun-if-changed=vision-bridge/Sources/VisionBridge/VisionOCR.swift");
|
||||
println!("cargo:rerun-if-changed=vision-bridge/Sources/VisionBridge/VisionBridge.h");
|
||||
println!("cargo:rerun-if-changed=vision-bridge/Package.swift");
|
||||
|
||||
let manifest_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap());
|
||||
let vision_bridge_dir = manifest_dir.join("vision-bridge");
|
||||
|
||||
// Build Swift package
|
||||
println!("cargo:warning=Building VisionBridge Swift package...");
|
||||
let build_status = Command::new("swift")
|
||||
.args(&["build", "-c", "release"])
|
||||
.current_dir(&vision_bridge_dir)
|
||||
.status()
|
||||
.expect("Failed to build Swift package");
|
||||
|
||||
if !build_status.success() {
|
||||
panic!("Swift build failed");
|
||||
}
|
||||
|
||||
// Find the built library
|
||||
let lib_path = vision_bridge_dir
|
||||
.join(".build/release")
|
||||
.canonicalize()
|
||||
.expect("Failed to find .build/release directory");
|
||||
|
||||
// Copy the dylib to the output directory so it can be found at runtime
|
||||
let target_dir = manifest_dir
|
||||
.parent()
|
||||
.unwrap()
|
||||
.parent()
|
||||
.unwrap()
|
||||
.join("target");
|
||||
let profile = env::var("PROFILE").unwrap_or_else(|_| "debug".to_string());
|
||||
|
||||
// Determine the actual target directory (could be llvm-cov-target or regular target)
|
||||
let target_dir_name =
|
||||
env::var("CARGO_TARGET_DIR").unwrap_or_else(|_| target_dir.to_string_lossy().to_string());
|
||||
let actual_target_dir = PathBuf::from(&target_dir_name);
|
||||
let output_dir = actual_target_dir.join(&profile);
|
||||
|
||||
let dylib_src = lib_path.join("libVisionBridge.dylib");
|
||||
let dylib_dst = output_dir.join("libVisionBridge.dylib");
|
||||
|
||||
// Create output directory if it doesn't exist
|
||||
std::fs::create_dir_all(&output_dir).expect(&format!(
|
||||
"Failed to create output directory {}",
|
||||
output_dir.display()
|
||||
));
|
||||
|
||||
std::fs::copy(&dylib_src, &dylib_dst).expect(&format!(
|
||||
"Failed to copy dylib from {} to {}",
|
||||
dylib_src.display(),
|
||||
dylib_dst.display()
|
||||
));
|
||||
|
||||
println!(
|
||||
"cargo:warning=Copied libVisionBridge.dylib to {}",
|
||||
dylib_dst.display()
|
||||
);
|
||||
|
||||
// Re-sign the dylib with ad-hoc signature to fix code signing issues on Apple Silicon
|
||||
// This is necessary because incremental compilation can invalidate signatures
|
||||
let codesign_status = Command::new("codesign")
|
||||
.args(&["-f", "-s", "-", dylib_dst.to_str().unwrap()])
|
||||
.status();
|
||||
|
||||
if let Ok(status) = codesign_status {
|
||||
if !status.success() {
|
||||
println!("cargo:warning=Failed to codesign libVisionBridge.dylib (non-fatal)");
|
||||
}
|
||||
}
|
||||
|
||||
// Add rpath so the dylib can be found at runtime
|
||||
println!("cargo:rustc-link-arg=-Wl,-rpath,@executable_path");
|
||||
println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path");
|
||||
println!("cargo:rustc-link-search=native={}", lib_path.display());
|
||||
println!("cargo:rustc-link-lib=dylib=VisionBridge");
|
||||
|
||||
// Link required frameworks
|
||||
println!("cargo:rustc-link-lib=framework=Vision");
|
||||
println!("cargo:rustc-link-lib=framework=AppKit");
|
||||
println!("cargo:rustc-link-lib=framework=Foundation");
|
||||
println!("cargo:rustc-link-lib=framework=CoreGraphics");
|
||||
println!("cargo:rustc-link-lib=framework=CoreImage");
|
||||
|
||||
println!(
|
||||
"cargo:warning=VisionBridge built successfully at {}",
|
||||
lib_path.display()
|
||||
);
|
||||
// No build-time dependencies required
|
||||
// VisionBridge OCR has been removed
|
||||
}
|
||||
|
||||
@@ -1,92 +0,0 @@
|
||||
use anyhow::Result;
|
||||
use g3_computer_control::ocr::{DefaultOCR, OCREngine};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<()> {
|
||||
println!("🧪 Testing Apple Vision OCR");
|
||||
println!("===========================\n");
|
||||
|
||||
// Initialize OCR engine
|
||||
println!("📦 Initializing OCR engine...");
|
||||
let ocr = DefaultOCR::new()?;
|
||||
println!("✅ OCR engine: {}\n", ocr.name());
|
||||
|
||||
// Check if test image exists
|
||||
let test_image = "/tmp/safari_test.png";
|
||||
if !std::path::Path::new(test_image).exists() {
|
||||
println!("⚠️ Test image not found: {}", test_image);
|
||||
println!(" Creating a screenshot...");
|
||||
|
||||
let status = std::process::Command::new("screencapture")
|
||||
.arg("-x")
|
||||
.arg("-R")
|
||||
.arg("0,0,1200,800")
|
||||
.arg(test_image)
|
||||
.status()?;
|
||||
|
||||
if !status.success() {
|
||||
anyhow::bail!("Failed to create screenshot");
|
||||
}
|
||||
|
||||
println!("✅ Screenshot created\n");
|
||||
}
|
||||
|
||||
// Run OCR
|
||||
println!("🔍 Running Apple Vision OCR on {}...", test_image);
|
||||
let start = std::time::Instant::now();
|
||||
let locations = ocr.extract_text_with_locations(test_image).await?;
|
||||
let duration = start.elapsed();
|
||||
|
||||
println!("✅ OCR completed in {:.3}s\n", duration.as_secs_f64());
|
||||
|
||||
// Display results
|
||||
println!("📊 Results:");
|
||||
println!(" Found {} text elements\n", locations.len());
|
||||
|
||||
if locations.is_empty() {
|
||||
println!("⚠️ No text found in image");
|
||||
} else {
|
||||
println!(" Top 20 results:");
|
||||
println!(
|
||||
" {:<4} {:<40} {:<15} {:<12} {:<8}",
|
||||
"#", "Text", "Position", "Size", "Conf"
|
||||
);
|
||||
println!(" {}", "-".repeat(85));
|
||||
|
||||
for (i, loc) in locations.iter().take(20).enumerate() {
|
||||
let text = if loc.text.len() > 37 {
|
||||
format!("{}...", &loc.text[..37])
|
||||
} else {
|
||||
loc.text.clone()
|
||||
};
|
||||
|
||||
println!(
|
||||
" {:<4} {:<40} ({:>4},{:>4}) {:>4}x{:<4} {:.2}",
|
||||
i + 1,
|
||||
text,
|
||||
loc.x,
|
||||
loc.y,
|
||||
loc.width,
|
||||
loc.height,
|
||||
loc.confidence
|
||||
);
|
||||
}
|
||||
|
||||
if locations.len() > 20 {
|
||||
println!("\n ... and {} more", locations.len() - 20);
|
||||
}
|
||||
|
||||
// Performance comparison
|
||||
println!("\n📈 Performance:");
|
||||
println!(" OCR Speed: {:.3}s", duration.as_secs_f64());
|
||||
println!(" Text elements: {}", locations.len());
|
||||
println!(
|
||||
" Avg per element: {:.1}ms",
|
||||
duration.as_millis() as f64 / locations.len() as f64
|
||||
);
|
||||
}
|
||||
|
||||
println!("\n✅ Test complete!");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -2,7 +2,6 @@
|
||||
#![allow(unexpected_cfgs)]
|
||||
|
||||
pub mod macax;
|
||||
pub mod ocr;
|
||||
pub mod platform;
|
||||
pub mod types;
|
||||
pub mod webdriver;
|
||||
@@ -30,16 +29,6 @@ pub trait ComputerController: Send + Sync {
|
||||
window_id: Option<&str>,
|
||||
) -> Result<()>;
|
||||
|
||||
// OCR operations
|
||||
async fn extract_text_from_screen(&self, region: Rect, window_id: &str) -> Result<String>;
|
||||
async fn extract_text_from_image(&self, path: &str) -> Result<String>;
|
||||
async fn extract_text_with_locations(&self, path: &str) -> Result<Vec<TextLocation>>;
|
||||
async fn find_text_in_app(
|
||||
&self,
|
||||
app_name: &str,
|
||||
search_text: &str,
|
||||
) -> Result<Option<TextLocation>>;
|
||||
|
||||
// Mouse operations
|
||||
fn move_mouse(&self, x: i32, y: i32) -> Result<()>;
|
||||
fn click_at(&self, x: i32, y: i32, app_name: Option<&str>) -> Result<()>;
|
||||
|
||||
@@ -1,26 +0,0 @@
|
||||
use crate::types::TextLocation;
|
||||
use anyhow::Result;
|
||||
use async_trait::async_trait;
|
||||
|
||||
/// OCR engine trait for text recognition with bounding boxes
|
||||
#[async_trait]
|
||||
pub trait OCREngine: Send + Sync {
|
||||
/// Extract text with locations from an image file
|
||||
async fn extract_text_with_locations(&self, path: &str) -> Result<Vec<TextLocation>>;
|
||||
|
||||
/// Get the name of the OCR engine
|
||||
fn name(&self) -> &str;
|
||||
}
|
||||
|
||||
// Platform-specific modules
|
||||
#[cfg(target_os = "macos")]
|
||||
pub mod vision;
|
||||
|
||||
pub mod tesseract;
|
||||
|
||||
// Re-export the default OCR engine for the platform
|
||||
#[cfg(target_os = "macos")]
|
||||
pub use vision::AppleVisionOCR as DefaultOCR;
|
||||
|
||||
#[cfg(not(target_os = "macos"))]
|
||||
pub use tesseract::TesseractOCR as DefaultOCR;
|
||||
@@ -1,91 +0,0 @@
|
||||
use super::OCREngine;
|
||||
use crate::types::TextLocation;
|
||||
use anyhow::Result;
|
||||
use async_trait::async_trait;
|
||||
|
||||
/// Tesseract OCR engine (fallback/cross-platform)
|
||||
pub struct TesseractOCR;
|
||||
|
||||
impl TesseractOCR {
|
||||
pub fn new() -> Result<Self> {
|
||||
// Check if tesseract is available
|
||||
let tesseract_check = std::process::Command::new("which")
|
||||
.arg("tesseract")
|
||||
.output();
|
||||
|
||||
if tesseract_check.is_err() || !tesseract_check.as_ref().unwrap().status.success() {
|
||||
anyhow::bail!(
|
||||
"Tesseract OCR is not installed on your system.\n\n\
|
||||
To install tesseract:\n macOS: brew install tesseract\n \
|
||||
Linux: sudo apt-get install tesseract-ocr (Ubuntu/Debian)\n \
|
||||
sudo yum install tesseract (RHEL/CentOS)\n \
|
||||
Windows: Download from https://github.com/UB-Mannheim/tesseract/wiki\n\n\
|
||||
After installation, restart your terminal and try again."
|
||||
);
|
||||
}
|
||||
|
||||
Ok(Self)
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl OCREngine for TesseractOCR {
|
||||
async fn extract_text_with_locations(&self, path: &str) -> Result<Vec<TextLocation>> {
|
||||
// Use tesseract CLI with TSV output to get bounding boxes
|
||||
let output = std::process::Command::new("tesseract")
|
||||
.arg(path)
|
||||
.arg("stdout")
|
||||
.arg("tsv")
|
||||
.output()
|
||||
.map_err(|e| anyhow::anyhow!("Failed to run tesseract: {}", e))?;
|
||||
|
||||
if !output.status.success() {
|
||||
anyhow::bail!(
|
||||
"Tesseract failed: {}",
|
||||
String::from_utf8_lossy(&output.stderr)
|
||||
);
|
||||
}
|
||||
|
||||
let tsv_text = String::from_utf8_lossy(&output.stdout);
|
||||
let mut locations = Vec::new();
|
||||
|
||||
// Parse TSV output (skip header line)
|
||||
for (i, line) in tsv_text.lines().enumerate() {
|
||||
if i == 0 {
|
||||
continue;
|
||||
} // Skip header
|
||||
|
||||
let parts: Vec<&str> = line.split('\t').collect();
|
||||
if parts.len() >= 12 {
|
||||
// TSV format: level, page_num, block_num, par_num, line_num, word_num,
|
||||
// left, top, width, height, conf, text
|
||||
if let (Ok(x), Ok(y), Ok(w), Ok(h), Ok(conf), text) = (
|
||||
parts[6].parse::<i32>(),
|
||||
parts[7].parse::<i32>(),
|
||||
parts[8].parse::<i32>(),
|
||||
parts[9].parse::<i32>(),
|
||||
parts[10].parse::<f32>(),
|
||||
parts[11],
|
||||
) {
|
||||
let trimmed = text.trim();
|
||||
if !trimmed.is_empty() && conf > 0.0 {
|
||||
locations.push(TextLocation {
|
||||
text: trimmed.to_string(),
|
||||
x,
|
||||
y,
|
||||
width: w,
|
||||
height: h,
|
||||
confidence: conf / 100.0, // Convert from 0-100 to 0-1
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(locations)
|
||||
}
|
||||
|
||||
fn name(&self) -> &str {
|
||||
"Tesseract OCR"
|
||||
}
|
||||
}
|
||||
@@ -1,100 +0,0 @@
|
||||
use super::OCREngine;
|
||||
use crate::types::TextLocation;
|
||||
use anyhow::{Context, Result};
|
||||
use async_trait::async_trait;
|
||||
use std::ffi::{CStr, CString};
|
||||
use std::os::raw::{c_char, c_float, c_uint};
|
||||
|
||||
// FFI bindings to Swift VisionBridge
|
||||
#[repr(C)]
|
||||
struct VisionTextBox {
|
||||
text: *const c_char,
|
||||
text_len: c_uint,
|
||||
x: i32,
|
||||
y: i32,
|
||||
width: i32,
|
||||
height: i32,
|
||||
confidence: c_float,
|
||||
}
|
||||
|
||||
extern "C" {
|
||||
fn vision_recognize_text(
|
||||
image_path: *const c_char,
|
||||
image_path_len: c_uint,
|
||||
out_boxes: *mut *mut std::ffi::c_void,
|
||||
out_count: *mut c_uint,
|
||||
) -> bool;
|
||||
|
||||
fn vision_free_boxes(boxes: *mut std::ffi::c_void, count: c_uint);
|
||||
}
|
||||
|
||||
/// Apple Vision Framework OCR engine
|
||||
pub struct AppleVisionOCR;
|
||||
|
||||
impl AppleVisionOCR {
|
||||
pub fn new() -> Result<Self> {
|
||||
Ok(Self)
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl OCREngine for AppleVisionOCR {
|
||||
async fn extract_text_with_locations(&self, path: &str) -> Result<Vec<TextLocation>> {
|
||||
// Convert path to C string
|
||||
let c_path = CString::new(path).context("Failed to convert path to C string")?;
|
||||
|
||||
let mut boxes_ptr: *mut std::ffi::c_void = std::ptr::null_mut();
|
||||
let mut count: c_uint = 0;
|
||||
|
||||
// Call Swift Vision API
|
||||
let success = unsafe {
|
||||
vision_recognize_text(
|
||||
c_path.as_ptr(),
|
||||
path.len() as c_uint,
|
||||
&mut boxes_ptr,
|
||||
&mut count,
|
||||
)
|
||||
};
|
||||
|
||||
if !success || boxes_ptr.is_null() {
|
||||
anyhow::bail!("Apple Vision OCR failed");
|
||||
}
|
||||
|
||||
// Convert C array to Rust Vec
|
||||
let mut locations = Vec::new();
|
||||
|
||||
unsafe {
|
||||
let typed_boxes = boxes_ptr as *const VisionTextBox;
|
||||
let boxes_slice = std::slice::from_raw_parts(typed_boxes, count as usize);
|
||||
|
||||
for box_data in boxes_slice {
|
||||
// Convert C string to Rust String
|
||||
let text = if !box_data.text.is_null() {
|
||||
CStr::from_ptr(box_data.text).to_string_lossy().into_owned()
|
||||
} else {
|
||||
String::new()
|
||||
};
|
||||
|
||||
if !text.is_empty() {
|
||||
locations.push(TextLocation {
|
||||
text,
|
||||
x: box_data.x,
|
||||
y: box_data.y,
|
||||
width: box_data.width,
|
||||
height: box_data.height,
|
||||
confidence: box_data.confidence,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Free the C array
|
||||
vision_free_boxes(boxes_ptr, count);
|
||||
}
|
||||
|
||||
Ok(locations)
|
||||
}
|
||||
|
||||
fn name(&self) -> &str {
|
||||
"Apple Vision Framework"
|
||||
}
|
||||
}
|
||||
@@ -1,188 +1,32 @@
|
||||
use crate::{types::*, ComputerController};
|
||||
use crate::{types::Rect, ComputerController};
|
||||
use anyhow::Result;
|
||||
use async_trait::async_trait;
|
||||
use tesseract::Tesseract;
|
||||
use uuid::Uuid;
|
||||
|
||||
pub struct LinuxController {
|
||||
// Placeholder for X11 connection or other state
|
||||
}
|
||||
pub struct LinuxController;
|
||||
|
||||
impl LinuxController {
|
||||
pub fn new() -> Result<Self> {
|
||||
// Initialize X11 connection
|
||||
tracing::warn!("Linux computer control not fully implemented");
|
||||
Ok(Self {})
|
||||
Ok(Self)
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl ComputerController for LinuxController {
|
||||
async fn move_mouse(&self, _x: i32, _y: i32) -> Result<()> {
|
||||
anyhow::bail!("Linux implementation not yet available")
|
||||
}
|
||||
|
||||
async fn click(&self, _button: MouseButton) -> Result<()> {
|
||||
anyhow::bail!("Linux implementation not yet available")
|
||||
}
|
||||
|
||||
async fn double_click(&self, _button: MouseButton) -> Result<()> {
|
||||
anyhow::bail!("Linux implementation not yet available")
|
||||
}
|
||||
|
||||
async fn type_text(&self, _text: &str) -> Result<()> {
|
||||
anyhow::bail!("Linux implementation not yet available")
|
||||
}
|
||||
|
||||
async fn press_key(&self, _key: &str) -> Result<()> {
|
||||
anyhow::bail!("Linux implementation not yet available")
|
||||
}
|
||||
|
||||
async fn list_windows(&self) -> Result<Vec<Window>> {
|
||||
anyhow::bail!("Linux implementation not yet available")
|
||||
}
|
||||
|
||||
async fn focus_window(&self, _window_id: &str) -> Result<()> {
|
||||
anyhow::bail!("Linux implementation not yet available")
|
||||
}
|
||||
|
||||
async fn get_window_bounds(&self, _window_id: &str) -> Result<Rect> {
|
||||
anyhow::bail!("Linux implementation not yet available")
|
||||
}
|
||||
|
||||
async fn find_element(&self, _selector: &ElementSelector) -> Result<Option<UIElement>> {
|
||||
anyhow::bail!("Linux implementation not yet available")
|
||||
}
|
||||
|
||||
async fn get_element_text(&self, _element_id: &str) -> Result<String> {
|
||||
anyhow::bail!("Linux implementation not yet available")
|
||||
}
|
||||
|
||||
async fn get_element_bounds(&self, _element_id: &str) -> Result<Rect> {
|
||||
anyhow::bail!("Linux implementation not yet available")
|
||||
}
|
||||
|
||||
async fn take_screenshot(
|
||||
&self,
|
||||
_path: &str,
|
||||
_region: Option<Rect>,
|
||||
_window_id: Option<&str>,
|
||||
) -> Result<()> {
|
||||
// Enforce that window_id must be provided
|
||||
if _window_id.is_none() {
|
||||
anyhow::bail!("window_id is required. You must specify which window to capture (e.g., 'Firefox', 'Terminal', 'gedit'). Use list_windows to see available windows.");
|
||||
}
|
||||
|
||||
anyhow::bail!("Linux implementation not yet available")
|
||||
anyhow::bail!("Linux screenshot implementation not yet available")
|
||||
}
|
||||
|
||||
async fn extract_text_from_screen(&self, _region: Rect, _window_id: &str) -> Result<String> {
|
||||
anyhow::bail!("Linux implementation not yet available")
|
||||
fn move_mouse(&self, _x: i32, _y: i32) -> Result<()> {
|
||||
anyhow::bail!("Linux mouse control not yet available")
|
||||
}
|
||||
|
||||
async fn extract_text_from_image(&self, _path: &str) -> Result<OCRResult> {
|
||||
// Check if tesseract is available on the system
|
||||
let tesseract_check = std::process::Command::new("which")
|
||||
.arg("tesseract")
|
||||
.output();
|
||||
|
||||
if tesseract_check.is_err() || !tesseract_check.as_ref().unwrap().status.success() {
|
||||
anyhow::bail!(
|
||||
"Tesseract OCR is not installed on your system.\n\n\
|
||||
To install tesseract:\n \
|
||||
Ubuntu/Debian: sudo apt-get install tesseract-ocr\n \
|
||||
RHEL/CentOS: sudo yum install tesseract\n \
|
||||
Arch Linux: sudo pacman -S tesseract\n\n\
|
||||
After installation, restart your terminal and try again."
|
||||
);
|
||||
}
|
||||
|
||||
// Initialize Tesseract
|
||||
let tess = Tesseract::new(None, Some("eng")).map_err(|e| {
|
||||
anyhow::anyhow!(
|
||||
"Failed to initialize Tesseract: {}\n\n\
|
||||
This usually means:\n1. Tesseract is not properly installed\n\
|
||||
2. Language data files are missing\n\nTo fix:\n \
|
||||
Ubuntu/Debian: sudo apt-get install tesseract-ocr-eng\n \
|
||||
RHEL/CentOS: sudo yum install tesseract-langpack-eng\n \
|
||||
Arch Linux: sudo pacman -S tesseract-data-eng",
|
||||
e
|
||||
)
|
||||
})?;
|
||||
|
||||
let text = tess
|
||||
.set_image(_path)
|
||||
.map_err(|e| anyhow::anyhow!("Failed to load image '{}': {}", _path, e))?
|
||||
.get_text()
|
||||
.map_err(|e| anyhow::anyhow!("Failed to extract text from image: {}", e))?;
|
||||
|
||||
// Get confidence (simplified - would need more complex API calls for per-word confidence)
|
||||
let confidence = 0.85; // Placeholder
|
||||
|
||||
Ok(OCRResult {
|
||||
text,
|
||||
confidence,
|
||||
bounds: Rect {
|
||||
x: 0,
|
||||
y: 0,
|
||||
width: 0,
|
||||
height: 0,
|
||||
}, // Would need image dimensions
|
||||
})
|
||||
}
|
||||
|
||||
async fn find_text_on_screen(&self, _text: &str) -> Result<Option<Point>> {
|
||||
// Check if tesseract is available on the system
|
||||
let tesseract_check = std::process::Command::new("which")
|
||||
.arg("tesseract")
|
||||
.output();
|
||||
|
||||
if tesseract_check.is_err() || !tesseract_check.as_ref().unwrap().status.success() {
|
||||
anyhow::bail!(
|
||||
"Tesseract OCR is not installed on your system.\n\n\
|
||||
To install tesseract:\n \
|
||||
Ubuntu/Debian: sudo apt-get install tesseract-ocr\n \
|
||||
RHEL/CentOS: sudo yum install tesseract\n \
|
||||
Arch Linux: sudo pacman -S tesseract\n\n\
|
||||
After installation, restart your terminal and try again."
|
||||
);
|
||||
}
|
||||
|
||||
// Take full screen screenshot
|
||||
let temp_path = format!("/tmp/g3_ocr_search_{}.png", uuid::Uuid::new_v4());
|
||||
self.take_screenshot(&temp_path, None, None).await?;
|
||||
|
||||
// Use Tesseract to find text with bounding boxes
|
||||
let tess = Tesseract::new(None, Some("eng")).map_err(|e| {
|
||||
anyhow::anyhow!(
|
||||
"Failed to initialize Tesseract: {}\n\n\
|
||||
This usually means:\n1. Tesseract is not properly installed\n\
|
||||
2. Language data files are missing\n\nTo fix:\n \
|
||||
Ubuntu/Debian: sudo apt-get install tesseract-ocr-eng\n \
|
||||
RHEL/CentOS: sudo yum install tesseract-langpack-eng\n \
|
||||
Arch Linux: sudo pacman -S tesseract-data-eng",
|
||||
e
|
||||
)
|
||||
})?;
|
||||
|
||||
let full_text = tess
|
||||
.set_image(temp_path.as_str())
|
||||
.map_err(|e| anyhow::anyhow!("Failed to load screenshot: {}", e))?
|
||||
.get_text()
|
||||
.map_err(|e| anyhow::anyhow!("Failed to extract text from screen: {}", e))?;
|
||||
|
||||
// Clean up temp file
|
||||
let _ = std::fs::remove_file(&temp_path);
|
||||
|
||||
// Simple text search - full implementation would use get_component_images
|
||||
// to get bounding boxes for each word
|
||||
if full_text.contains(_text) {
|
||||
tracing::warn!(
|
||||
"Text found but precise coordinates not available in simplified implementation"
|
||||
);
|
||||
Ok(Some(Point { x: 0, y: 0 }))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
fn click_at(&self, _x: i32, _y: i32, _app_name: Option<&str>) -> Result<()> {
|
||||
anyhow::bail!("Linux click control not yet available")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,7 +1,5 @@
|
||||
use crate::ocr::{DefaultOCR, OCREngine};
|
||||
use crate::{
|
||||
types::{Rect, TextLocation},
|
||||
ComputerController,
|
||||
types::Rect, ComputerController,
|
||||
};
|
||||
use anyhow::{Context, Result};
|
||||
use async_trait::async_trait;
|
||||
@@ -14,21 +12,12 @@ use core_graphics::window::{
|
||||
};
|
||||
use std::path::Path;
|
||||
|
||||
pub struct MacOSController {
|
||||
ocr_engine: Box<dyn OCREngine>,
|
||||
#[allow(dead_code)]
|
||||
ocr_name: String,
|
||||
}
|
||||
pub struct MacOSController;
|
||||
|
||||
impl MacOSController {
|
||||
pub fn new() -> Result<Self> {
|
||||
let ocr = Box::new(DefaultOCR::new()?);
|
||||
let ocr_name = ocr.name().to_string();
|
||||
tracing::debug!("Initialized macOS controller with OCR engine: {}", ocr_name);
|
||||
Ok(Self {
|
||||
ocr_engine: ocr,
|
||||
ocr_name,
|
||||
})
|
||||
tracing::debug!("Initialized macOS controller");
|
||||
Ok(Self)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -215,78 +204,6 @@ impl ComputerController for MacOSController {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn extract_text_from_screen(&self, region: Rect, window_id: &str) -> Result<String> {
|
||||
// Take screenshot of region first
|
||||
let temp_path = format!("/tmp/g3_ocr_{}.png", uuid::Uuid::new_v4());
|
||||
self.take_screenshot(&temp_path, Some(region), Some(window_id))
|
||||
.await?;
|
||||
|
||||
// Extract text from the screenshot
|
||||
let result = self.extract_text_from_image(&temp_path).await?;
|
||||
|
||||
// Clean up temp file
|
||||
let _ = std::fs::remove_file(&temp_path);
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
async fn extract_text_from_image(&self, path: &str) -> Result<String> {
|
||||
// Extract all text and concatenate
|
||||
let locations = self.ocr_engine.extract_text_with_locations(path).await?;
|
||||
Ok(locations
|
||||
.iter()
|
||||
.map(|loc| loc.text.as_str())
|
||||
.collect::<Vec<_>>()
|
||||
.join(" "))
|
||||
}
|
||||
|
||||
async fn extract_text_with_locations(&self, path: &str) -> Result<Vec<TextLocation>> {
|
||||
// Use the OCR engine
|
||||
self.ocr_engine.extract_text_with_locations(path).await
|
||||
}
|
||||
|
||||
async fn find_text_in_app(
|
||||
&self,
|
||||
app_name: &str,
|
||||
search_text: &str,
|
||||
) -> Result<Option<TextLocation>> {
|
||||
// Take screenshot of specific app window
|
||||
let home = std::env::var("HOME").unwrap_or_else(|_| "/tmp".to_string());
|
||||
let temp_path = format!(
|
||||
"{}/tmp/g3_find_text_{}_{}.png",
|
||||
home,
|
||||
app_name,
|
||||
uuid::Uuid::new_v4()
|
||||
);
|
||||
self.take_screenshot(&temp_path, None, Some(app_name))
|
||||
.await?;
|
||||
|
||||
// Get screenshot dimensions before we delete it
|
||||
let screenshot_dims = get_image_dimensions(&temp_path)?;
|
||||
|
||||
// Extract all text with locations
|
||||
let locations = self.extract_text_with_locations(&temp_path).await?;
|
||||
|
||||
// Get window bounds to calculate coordinate transformation
|
||||
let window_bounds = self.get_window_bounds(app_name)?;
|
||||
|
||||
// Clean up temp file
|
||||
let _ = std::fs::remove_file(&temp_path);
|
||||
|
||||
// Find matching text (case-insensitive)
|
||||
let search_lower = search_text.to_lowercase();
|
||||
for location in locations {
|
||||
if location.text.to_lowercase().contains(&search_lower) {
|
||||
// Transform coordinates from screenshot space to screen space
|
||||
let transformed =
|
||||
transform_screenshot_to_screen_coords(location, window_bounds, screenshot_dims);
|
||||
return Ok(Some(transformed));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
fn move_mouse(&self, x: i32, y: i32) -> Result<()> {
|
||||
use core_graphics::event::{CGEvent, CGEventTapLocation, CGEventType, CGMouseButton};
|
||||
use core_graphics::event_source::{CGEventSource, CGEventSourceStateID};
|
||||
@@ -379,246 +296,6 @@ impl ComputerController for MacOSController {
|
||||
}
|
||||
}
|
||||
|
||||
impl MacOSController {
|
||||
/// Get window bounds for an application (helper method)
|
||||
fn get_window_bounds(&self, app_name: &str) -> Result<(i32, i32, i32, i32)> {
|
||||
unsafe {
|
||||
let window_list =
|
||||
CGWindowListCopyWindowInfo(kCGWindowListOptionOnScreenOnly, kCGNullWindowID);
|
||||
|
||||
let array = CFArray::<CFDictionary>::wrap_under_create_rule(window_list);
|
||||
let count = array.len();
|
||||
|
||||
let app_name_lower = app_name.to_lowercase();
|
||||
|
||||
for i in 0..count {
|
||||
let dict = array.get(i).unwrap();
|
||||
|
||||
// Get owner name
|
||||
let owner_key = CFString::from_static_string("kCGWindowOwnerName");
|
||||
let owner: String = if let Some(value) = dict.find(owner_key.to_void()) {
|
||||
let s: CFString = TCFType::wrap_under_get_rule(*value as *const _);
|
||||
s.to_string()
|
||||
} else {
|
||||
continue;
|
||||
};
|
||||
|
||||
let owner_lower = owner.to_lowercase();
|
||||
|
||||
// Normalize by removing spaces for exact matching
|
||||
let app_name_normalized = app_name_lower.replace(" ", "");
|
||||
let owner_normalized = owner_lower.replace(" ", "");
|
||||
|
||||
// ONLY accept exact matches (case-insensitive, with or without spaces)
|
||||
// This prevents "Goose" from matching "GooseStudio"
|
||||
let is_match =
|
||||
owner_lower == app_name_lower || owner_normalized == app_name_normalized;
|
||||
|
||||
if is_match {
|
||||
// Get window layer to filter out menu bar windows
|
||||
let layer_key = CFString::from_static_string("kCGWindowLayer");
|
||||
let layer: i32 = if let Some(value) = dict.find(layer_key.to_void()) {
|
||||
let num: core_foundation::number::CFNumber =
|
||||
TCFType::wrap_under_get_rule(*value as *const _);
|
||||
num.to_i32().unwrap_or(0)
|
||||
} else {
|
||||
0
|
||||
};
|
||||
|
||||
// Skip menu bar windows (layer >= 20)
|
||||
if layer >= 20 {
|
||||
tracing::debug!(
|
||||
"Skipping window for '{}' at layer {} (menu bar)",
|
||||
owner,
|
||||
layer
|
||||
);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Get window bounds to verify it's a real window
|
||||
let bounds_key = CFString::from_static_string("kCGWindowBounds");
|
||||
if let Some(value) = dict.find(bounds_key.to_void()) {
|
||||
let bounds_dict: CFDictionary =
|
||||
TCFType::wrap_under_get_rule(*value as *const _);
|
||||
|
||||
let x_key = CFString::from_static_string("X");
|
||||
let y_key = CFString::from_static_string("Y");
|
||||
let width_key = CFString::from_static_string("Width");
|
||||
let height_key = CFString::from_static_string("Height");
|
||||
|
||||
if let (Some(x_val), Some(y_val), Some(w_val), Some(h_val)) = (
|
||||
bounds_dict.find(x_key.to_void()),
|
||||
bounds_dict.find(y_key.to_void()),
|
||||
bounds_dict.find(width_key.to_void()),
|
||||
bounds_dict.find(height_key.to_void()),
|
||||
) {
|
||||
let x_num: core_foundation::number::CFNumber =
|
||||
TCFType::wrap_under_get_rule(*x_val as *const _);
|
||||
let y_num: core_foundation::number::CFNumber =
|
||||
TCFType::wrap_under_get_rule(*y_val as *const _);
|
||||
let w_num: core_foundation::number::CFNumber =
|
||||
TCFType::wrap_under_get_rule(*w_val as *const _);
|
||||
let h_num: core_foundation::number::CFNumber =
|
||||
TCFType::wrap_under_get_rule(*h_val as *const _);
|
||||
|
||||
let x: i32 = x_num.to_i64().unwrap_or(0) as i32;
|
||||
let y: i32 = y_num.to_i64().unwrap_or(0) as i32;
|
||||
let w: i32 = w_num.to_i64().unwrap_or(0) as i32;
|
||||
let h: i32 = h_num.to_i64().unwrap_or(0) as i32;
|
||||
|
||||
// Only accept windows with real bounds (>= 100x100 pixels)
|
||||
if w >= 100 && h >= 100 {
|
||||
tracing::debug!("Found valid window bounds for '{}': x={}, y={}, w={}, h={} (layer={})", owner, x, y, w, h, layer);
|
||||
return Ok((x, y, w, h));
|
||||
} else {
|
||||
tracing::debug!(
|
||||
"Skipping window for '{}': too small ({}x{})",
|
||||
owner,
|
||||
w,
|
||||
h
|
||||
);
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Err(anyhow::anyhow!(
|
||||
"Could not find window bounds for '{}'",
|
||||
app_name
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
/// Get image dimensions from a PNG file
|
||||
fn get_image_dimensions(path: &str) -> Result<(i32, i32)> {
|
||||
use std::fs::File;
|
||||
use std::io::Read;
|
||||
|
||||
let mut file = File::open(path)?;
|
||||
let mut buffer = vec![0u8; 24];
|
||||
file.read_exact(&mut buffer)?;
|
||||
|
||||
// PNG signature check
|
||||
if &buffer[0..8] != b"\x89PNG\r\n\x1a\n" {
|
||||
anyhow::bail!("Not a valid PNG file");
|
||||
}
|
||||
|
||||
// Read IHDR chunk (width and height are at bytes 16-23)
|
||||
let width = u32::from_be_bytes([buffer[16], buffer[17], buffer[18], buffer[19]]) as i32;
|
||||
let height = u32::from_be_bytes([buffer[20], buffer[21], buffer[22], buffer[23]]) as i32;
|
||||
|
||||
Ok((width, height))
|
||||
}
|
||||
|
||||
/// Transform coordinates from screenshot space to screen space
|
||||
///
|
||||
/// The screenshot is taken of a window, and Vision OCR returns coordinates
|
||||
/// relative to the screenshot image. We need to transform these to actual
|
||||
/// screen coordinates for clicking.
|
||||
///
|
||||
/// On Retina displays, screenshots are taken at 2x resolution, so we need
|
||||
/// to account for this scaling factor.
|
||||
fn transform_screenshot_to_screen_coords(
|
||||
location: TextLocation,
|
||||
window_bounds: (i32, i32, i32, i32), // (x, y, width, height) in screen space
|
||||
screenshot_dims: (i32, i32), // (width, height) in pixels
|
||||
) -> TextLocation {
|
||||
let (win_x, win_y, win_width, win_height) = window_bounds;
|
||||
let (screenshot_width, screenshot_height) = screenshot_dims;
|
||||
|
||||
// Calculate scale factors
|
||||
// On Retina displays, screenshot is typically 2x the window size
|
||||
let scale_x = win_width as f64 / screenshot_width as f64;
|
||||
let scale_y = win_height as f64 / screenshot_height as f64;
|
||||
|
||||
tracing::debug!(
|
||||
"Transform: screenshot={}x{}, window={}x{} at ({},{}), scale=({:.2},{:.2})",
|
||||
screenshot_width,
|
||||
screenshot_height,
|
||||
win_width,
|
||||
win_height,
|
||||
win_x,
|
||||
win_y,
|
||||
scale_x,
|
||||
scale_y
|
||||
);
|
||||
|
||||
// Transform coordinates from image space to screen space
|
||||
// IMPORTANT: macOS screen coordinates have origin at BOTTOM-LEFT (Y increases upward)
|
||||
// Image coordinates have origin at TOP-LEFT (Y increases downward)
|
||||
// win_y is the BOTTOM of the window in screen coordinates
|
||||
// So we need to: (win_y + win_height) to get window TOP, then subtract screenshot_y
|
||||
let window_top_y = win_y + win_height;
|
||||
|
||||
tracing::debug!(
|
||||
"[transform] Input location in image space: x={}, y={}, width={}, height={}",
|
||||
location.x,
|
||||
location.y,
|
||||
location.width,
|
||||
location.height
|
||||
);
|
||||
tracing::debug!(
|
||||
"[transform] Scale factors: scale_x={:.4}, scale_y={:.4}",
|
||||
scale_x,
|
||||
scale_y
|
||||
);
|
||||
|
||||
let transformed_x = win_x + (location.x as f64 * scale_x) as i32;
|
||||
let transformed_y = window_top_y - (location.y as f64 * scale_y) as i32;
|
||||
let transformed_width = (location.width as f64 * scale_x) as i32;
|
||||
let transformed_height = (location.height as f64 * scale_y) as i32;
|
||||
|
||||
tracing::debug!("[transform] Calculation details:");
|
||||
tracing::debug!(
|
||||
" - transformed_x = {} + ({} * {:.4}) = {} + {:.2} = {}",
|
||||
win_x,
|
||||
location.x,
|
||||
scale_x,
|
||||
win_x,
|
||||
location.x as f64 * scale_x,
|
||||
transformed_x
|
||||
);
|
||||
tracing::debug!(
|
||||
" - transformed_width = ({} * {:.4}) = {:.2} -> {}",
|
||||
location.width,
|
||||
scale_x,
|
||||
location.width as f64 * scale_x,
|
||||
transformed_width
|
||||
);
|
||||
tracing::debug!(
|
||||
" - transformed_height = ({} * {:.4}) = {:.2} -> {}",
|
||||
location.height,
|
||||
scale_y,
|
||||
location.height as f64 * scale_y,
|
||||
transformed_height
|
||||
);
|
||||
|
||||
tracing::debug!(
|
||||
"Transformed location: screenshot=({},{}) {}x{} -> screen=({},{}) {}x{}",
|
||||
location.x,
|
||||
location.y,
|
||||
location.width,
|
||||
location.height,
|
||||
transformed_x,
|
||||
transformed_y,
|
||||
transformed_width,
|
||||
transformed_height
|
||||
);
|
||||
|
||||
TextLocation {
|
||||
text: location.text,
|
||||
x: transformed_x,
|
||||
y: transformed_y,
|
||||
width: transformed_width,
|
||||
height: transformed_height,
|
||||
confidence: location.confidence,
|
||||
}
|
||||
}
|
||||
|
||||
#[path = "macos_window_matching_test.rs"]
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -1,189 +1,32 @@
|
||||
use crate::{types::*, ComputerController};
|
||||
use crate::{types::Rect, ComputerController};
|
||||
use anyhow::Result;
|
||||
use async_trait::async_trait;
|
||||
use tesseract::Tesseract;
|
||||
use uuid::Uuid;
|
||||
|
||||
pub struct WindowsController {
|
||||
// Placeholder for Windows-specific state
|
||||
}
|
||||
pub struct WindowsController;
|
||||
|
||||
impl WindowsController {
|
||||
pub fn new() -> Result<Self> {
|
||||
tracing::warn!("Windows computer control not fully implemented");
|
||||
Ok(Self {})
|
||||
Ok(Self)
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl ComputerController for WindowsController {
|
||||
async fn move_mouse(&self, _x: i32, _y: i32) -> Result<()> {
|
||||
anyhow::bail!("Windows implementation not yet available")
|
||||
}
|
||||
|
||||
async fn click(&self, _button: MouseButton) -> Result<()> {
|
||||
anyhow::bail!("Windows implementation not yet available")
|
||||
}
|
||||
|
||||
async fn double_click(&self, _button: MouseButton) -> Result<()> {
|
||||
anyhow::bail!("Windows implementation not yet available")
|
||||
}
|
||||
|
||||
async fn type_text(&self, _text: &str) -> Result<()> {
|
||||
anyhow::bail!("Windows implementation not yet available")
|
||||
}
|
||||
|
||||
async fn press_key(&self, _key: &str) -> Result<()> {
|
||||
anyhow::bail!("Windows implementation not yet available")
|
||||
}
|
||||
|
||||
async fn list_windows(&self) -> Result<Vec<Window>> {
|
||||
anyhow::bail!("Windows implementation not yet available")
|
||||
}
|
||||
|
||||
async fn focus_window(&self, _window_id: &str) -> Result<()> {
|
||||
anyhow::bail!("Windows implementation not yet available")
|
||||
}
|
||||
|
||||
async fn get_window_bounds(&self, _window_id: &str) -> Result<Rect> {
|
||||
anyhow::bail!("Windows implementation not yet available")
|
||||
}
|
||||
|
||||
async fn find_element(&self, _selector: &ElementSelector) -> Result<Option<UIElement>> {
|
||||
anyhow::bail!("Windows implementation not yet available")
|
||||
}
|
||||
|
||||
async fn get_element_text(&self, _element_id: &str) -> Result<String> {
|
||||
anyhow::bail!("Windows implementation not yet available")
|
||||
}
|
||||
|
||||
async fn get_element_bounds(&self, _element_id: &str) -> Result<Rect> {
|
||||
anyhow::bail!("Windows implementation not yet available")
|
||||
}
|
||||
|
||||
async fn take_screenshot(
|
||||
&self,
|
||||
_path: &str,
|
||||
_region: Option<Rect>,
|
||||
_window_id: Option<&str>,
|
||||
) -> Result<()> {
|
||||
// Enforce that window_id must be provided
|
||||
if _window_id.is_none() {
|
||||
anyhow::bail!("window_id is required. You must specify which window to capture (e.g., 'Chrome', 'Terminal', 'Notepad'). Use list_windows to see available windows.");
|
||||
}
|
||||
|
||||
anyhow::bail!("Windows implementation not yet available")
|
||||
anyhow::bail!("Windows screenshot implementation not yet available")
|
||||
}
|
||||
|
||||
async fn extract_text_from_screen(&self, _region: Rect, _window_id: &str) -> Result<String> {
|
||||
anyhow::bail!("Windows implementation not yet available")
|
||||
fn move_mouse(&self, _x: i32, _y: i32) -> Result<()> {
|
||||
anyhow::bail!("Windows mouse control not yet available")
|
||||
}
|
||||
|
||||
async fn extract_text_from_image(&self, _path: &str) -> Result<OCRResult> {
|
||||
// Check if tesseract is available on the system
|
||||
let tesseract_check = std::process::Command::new("where")
|
||||
.arg("tesseract")
|
||||
.output();
|
||||
|
||||
if tesseract_check.is_err() || !tesseract_check.as_ref().unwrap().status.success() {
|
||||
anyhow::bail!(
|
||||
"Tesseract OCR is not installed on your system.\n\n\
|
||||
To install tesseract on Windows:\n \
|
||||
1. Download the installer from: https://github.com/UB-Mannheim/tesseract/wiki\n \
|
||||
2. Run the installer and follow the instructions\n \
|
||||
3. Add tesseract to your PATH environment variable\n \
|
||||
4. Restart your terminal/command prompt\n\n\
|
||||
After installation, restart your terminal and try again."
|
||||
);
|
||||
}
|
||||
|
||||
// Initialize Tesseract
|
||||
let tess = Tesseract::new(None, Some("eng")).map_err(|e| {
|
||||
anyhow::anyhow!(
|
||||
"Failed to initialize Tesseract: {}\n\n\
|
||||
This usually means:\n1. Tesseract is not properly installed\n\
|
||||
2. Language data files are missing\n\nTo fix:\n \
|
||||
1. Reinstall tesseract from https://github.com/UB-Mannheim/tesseract/wiki\n \
|
||||
2. Make sure to select 'Additional language data' during installation\n \
|
||||
3. Ensure tesseract is in your PATH",
|
||||
e
|
||||
)
|
||||
})?;
|
||||
|
||||
let text = tess
|
||||
.set_image(_path)
|
||||
.map_err(|e| anyhow::anyhow!("Failed to load image '{}': {}", _path, e))?
|
||||
.get_text()
|
||||
.map_err(|e| anyhow::anyhow!("Failed to extract text from image: {}", e))?;
|
||||
|
||||
// Get confidence (simplified - would need more complex API calls for per-word confidence)
|
||||
let confidence = 0.85; // Placeholder
|
||||
|
||||
Ok(OCRResult {
|
||||
text,
|
||||
confidence,
|
||||
bounds: Rect {
|
||||
x: 0,
|
||||
y: 0,
|
||||
width: 0,
|
||||
height: 0,
|
||||
}, // Would need image dimensions
|
||||
})
|
||||
}
|
||||
|
||||
async fn find_text_on_screen(&self, _text: &str) -> Result<Option<Point>> {
|
||||
// Check if tesseract is available on the system
|
||||
let tesseract_check = std::process::Command::new("where")
|
||||
.arg("tesseract")
|
||||
.output();
|
||||
|
||||
if tesseract_check.is_err() || !tesseract_check.as_ref().unwrap().status.success() {
|
||||
anyhow::bail!(
|
||||
"Tesseract OCR is not installed on your system.\n\n\
|
||||
To install tesseract on Windows:\n \
|
||||
1. Download the installer from: https://github.com/UB-Mannheim/tesseract/wiki\n \
|
||||
2. Run the installer and follow the instructions\n \
|
||||
3. Add tesseract to your PATH environment variable\n \
|
||||
4. Restart your terminal/command prompt\n\n\
|
||||
After installation, restart your terminal and try again."
|
||||
);
|
||||
}
|
||||
|
||||
// Take full screen screenshot
|
||||
let temp_path = format!("C:\\\\Temp\\\\g3_ocr_search_{}.png", uuid::Uuid::new_v4());
|
||||
self.take_screenshot(&temp_path, None, None).await?;
|
||||
|
||||
// Use Tesseract to find text with bounding boxes
|
||||
let tess = Tesseract::new(None, Some("eng")).map_err(|e| {
|
||||
anyhow::anyhow!(
|
||||
"Failed to initialize Tesseract: {}\n\n\
|
||||
This usually means:\n1. Tesseract is not properly installed\n\
|
||||
2. Language data files are missing\n\nTo fix:\n \
|
||||
1. Reinstall tesseract from https://github.com/UB-Mannheim/tesseract/wiki\n \
|
||||
2. Make sure to select 'Additional language data' during installation\n \
|
||||
3. Ensure tesseract is in your PATH",
|
||||
e
|
||||
)
|
||||
})?;
|
||||
|
||||
let full_text = tess
|
||||
.set_image(temp_path.as_str())
|
||||
.map_err(|e| anyhow::anyhow!("Failed to load screenshot: {}", e))?
|
||||
.get_text()
|
||||
.map_err(|e| anyhow::anyhow!("Failed to extract text from screen: {}", e))?;
|
||||
|
||||
// Clean up temp file
|
||||
let _ = std::fs::remove_file(&temp_path);
|
||||
|
||||
// Simple text search - full implementation would use get_component_images
|
||||
// to get bounding boxes for each word
|
||||
if full_text.contains(_text) {
|
||||
tracing::warn!(
|
||||
"Text found but precise coordinates not available in simplified implementation"
|
||||
);
|
||||
Ok(Some(Point { x: 0, y: 0 }))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
fn click_at(&self, _x: i32, _y: i32, _app_name: Option<&str>) -> Result<()> {
|
||||
anyhow::bail!("Windows click control not yet available")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,13 +7,3 @@ pub struct Rect {
|
||||
pub width: i32,
|
||||
pub height: i32,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct TextLocation {
|
||||
pub text: String,
|
||||
pub x: i32,
|
||||
pub y: i32,
|
||||
pub width: i32,
|
||||
pub height: i32,
|
||||
pub confidence: f32,
|
||||
}
|
||||
|
||||
@@ -1,24 +0,0 @@
|
||||
// swift-tools-version:5.9
import PackageDescription

// Manifest for the VisionBridge package: a single dynamic library built from
// Sources/VisionBridge, with its public headers in the same directory.
let package = Package(
    name: "VisionBridge",
    platforms: [.macOS(.v11)],
    products: [
        .library(name: "VisionBridge", type: .dynamic, targets: ["VisionBridge"]),
    ],
    targets: [
        .target(
            name: "VisionBridge",
            dependencies: [],
            path: "Sources/VisionBridge",
            publicHeadersPath: "."
        ),
    ]
)
|
||||
@@ -1,39 +0,0 @@
|
||||
#ifndef VisionBridge_h
#define VisionBridge_h

#include <stdint.h>
#include <stdbool.h>

#ifdef __cplusplus
extern "C" {
#endif

/*
 * One recognized text region, laid out for FFI.
 *
 * `text` points to the recognized string and `text_len` is its length in
 * bytes. `x`, `y`, `width` and `height` describe the bounding box and
 * `confidence` is the recognition confidence.
 */
typedef struct {
    const char* text;
    uint32_t text_len;
    int32_t x;
    int32_t y;
    int32_t width;
    int32_t height;
    float confidence;
} VisionTextBox;

/*
 * Recognize text in the image at `image_path` (`image_path_len` bytes long).
 *
 * On success, returns true and stores a newly allocated array in `*out_boxes`
 * and its element count in `*out_count`. Returns false on failure.
 * The caller must release the array with vision_free_boxes.
 */
bool vision_recognize_text(const char* image_path,
                           uint32_t image_path_len,
                           VisionTextBox** out_boxes,
                           uint32_t* out_count);

/*
 * Release an array returned by vision_recognize_text.
 * `count` is the element count that call reported.
 */
void vision_free_boxes(VisionTextBox* boxes, uint32_t count);

#ifdef __cplusplus
}
#endif

#endif /* VisionBridge_h */
|
||||
@@ -1,145 +0,0 @@
|
||||
import Foundation
|
||||
import Vision
|
||||
import AppKit
|
||||
import CoreGraphics
|
||||
|
||||
// MARK: - C Bridge Functions
|
||||
|
||||
/// Recognizes text in the image at `imagePath` and returns one box per
/// recognized string.
///
/// - Parameters:
///   - imagePath: UTF-8 bytes of the image path (not required to be
///     NUL-terminated; `imagePathLen` gives the length).
///   - imagePathLen: Number of bytes in `imagePath`.
///   - outBoxes: Receives a newly allocated array of `CTextBox` on success.
///   - outCount: Receives the number of elements in that array on success.
/// - Returns: `true` on success (possibly with zero boxes), `false` if the
///   path is not valid UTF-8, the image cannot be loaded, or recognition fails.
///
/// Box coordinates are pixels with a top-left origin. The caller owns the
/// returned array and must release it with `vision_free_boxes`.
@_cdecl("vision_recognize_text")
public func vision_recognize_text(
    _ imagePath: UnsafePointer<CChar>,
    _ imagePathLen: UInt32,
    _ outBoxes: UnsafeMutablePointer<UnsafeMutableRawPointer?>,
    _ outCount: UnsafeMutablePointer<UInt32>
) -> Bool {
    // Convert the length-delimited byte buffer to a Swift String
    guard let pathData = Data(bytes: imagePath, count: Int(imagePathLen)).withUnsafeBytes({
        String(bytes: $0, encoding: .utf8)
    }) else {
        return false
    }

    let path = pathData.trimmingCharacters(in: .whitespaces)

    // Load image
    guard let image = NSImage(contentsOfFile: path),
          let cgImage = image.cgImage(forProposedRect: nil, context: nil, hints: nil) else {
        return false
    }

    var textBoxes: [CTextBox] = []
    var success = false

    // Frees the strdup'd strings held by the boxes collected so far; used on
    // failure paths so they do not leak.
    func releaseCollectedText() {
        for box in textBoxes {
            if let text = box.text {
                free(UnsafeMutableRawPointer(mutating: text))
            }
        }
        textBoxes.removeAll()
    }

    let request = VNRecognizeTextRequest { request, error in
        if let error = error {
            print("Vision OCR error: \(error.localizedDescription)")
            return
        }

        guard let observations = request.results as? [VNRecognizedTextObservation] else {
            return
        }

        let imageSize = CGSize(width: cgImage.width, height: cgImage.height)

        for observation in observations {
            guard let candidate = observation.topCandidates(1).first else { continue }

            let text = candidate.string
            let boundingBox = observation.boundingBox

            // Convert normalized coordinates (bottom-left origin) to pixel coordinates (top-left origin)
            let x = Int32(boundingBox.origin.x * imageSize.width)
            let y = Int32((1.0 - boundingBox.origin.y - boundingBox.height) * imageSize.height)
            let width = Int32(boundingBox.width * imageSize.width)
            let height = Int32(boundingBox.height * imageSize.height)

            // Allocate a C string for the text. Skip the box if allocation
            // fails rather than emit a NULL pointer with a non-zero length.
            guard let cString = strdup(text) else { continue }

            textBoxes.append(CTextBox(
                text: UnsafePointer(cString),
                text_len: UInt32(text.utf8.count),
                x: x,
                y: y,
                width: width,
                height: height,
                // Use the confidence of the recognized candidate itself,
                // not that of the enclosing observation.
                confidence: candidate.confidence
            ))
        }

        success = true
    }

    // Configure request for best accuracy
    request.recognitionLevel = .accurate
    request.usesLanguageCorrection = true
    request.recognitionLanguages = ["en-US"]

    // perform(_:) returns only after the request has completed or failed, so
    // the completion handler above has already run when it returns and no
    // extra synchronization is needed.
    let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
    do {
        try handler.perform([request])
    } catch {
        print("Vision request failed: \(error.localizedDescription)")
        releaseCollectedText()
        return false
    }

    if !success {
        releaseCollectedText()
        return false
    }

    // Copy the results into a heap array owned by the caller
    let boxesPtr = UnsafeMutablePointer<CTextBox>.allocate(capacity: textBoxes.count)
    boxesPtr.initialize(from: textBoxes, count: textBoxes.count)

    outBoxes.pointee = UnsafeMutableRawPointer(boxesPtr)
    outCount.pointee = UInt32(textBoxes.count)

    return true
}
|
||||
|
||||
/// Releases an array produced by `vision_recognize_text`.
///
/// Frees the C string owned by each of the `count` boxes (skipping boxes whose
/// `text` is nil) and then deallocates the array itself.
@_cdecl("vision_free_boxes")
public func vision_free_boxes(
    _ boxes: UnsafeMutableRawPointer,
    _ count: UInt32
) {
    let entries = boxes.assumingMemoryBound(to: CTextBox.self)
    let view = UnsafeMutableBufferPointer(start: entries, count: Int(count))

    for entry in view {
        guard let text = entry.text else { continue }
        free(UnsafeMutableRawPointer(mutating: text))
    }

    entries.deallocate()
}
|
||||
|
||||
// MARK: - C-Compatible Structure
|
||||
|
||||
/// C-compatible record describing one recognized text region.
public struct CTextBox {
    /// Pointer to the recognized text, or nil.
    public let text: UnsafePointer<CChar>?
    /// Length of `text` in bytes.
    public let text_len: UInt32
    /// Horizontal position of the bounding box.
    public let x: Int32
    /// Vertical position of the bounding box.
    public let y: Int32
    /// Width of the bounding box.
    public let width: Int32
    /// Height of the bounding box.
    public let height: Int32
    /// Recognition confidence.
    public let confidence: Float

    /// Memberwise initializer, declared explicitly so that it is `public`.
    public init(
        text: UnsafePointer<CChar>?,
        text_len: UInt32,
        x: Int32,
        y: Int32,
        width: Int32,
        height: Int32,
        confidence: Float
    ) {
        self.text = text
        self.text_len = text_len
        self.x = x
        self.y = y
        self.width = width
        self.height = height
        self.confidence = confidence
    }
}
|
||||
Reference in New Issue
Block a user