replace tesseract with apple vision

This commit is contained in:
Dhanji Prasanna
2025-10-24 15:35:47 +11:00
parent d0ac222e2e
commit 61d748034d
16 changed files with 785 additions and 318 deletions

1
.gitignore vendored
View File

@@ -2,6 +2,7 @@
# will have compiled files and executables # will have compiled files and executables
debug debug
target target
.build
# These are backup files generated by rustfmt # These are backup files generated by rustfmt
**/*.rs.bk **/*.rs.bk

160
Cargo.lock generated
View File

@@ -136,7 +136,7 @@ checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.107", "syn",
] ]
[[package]] [[package]]
@@ -218,28 +218,6 @@ version = "0.22.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
[[package]]
name = "bindgen"
version = "0.64.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c4243e6031260db77ede97ad86c27e501d646a27ab57b59a574f725d98ab1fb4"
dependencies = [
"bitflags 1.3.2",
"cexpr",
"clang-sys",
"lazy_static",
"lazycell",
"log",
"peeking_take_while",
"proc-macro2",
"quote",
"regex",
"rustc-hash",
"shlex",
"syn 1.0.109",
"which",
]
[[package]] [[package]]
name = "bindgen" name = "bindgen"
version = "0.69.5" version = "0.69.5"
@@ -259,7 +237,7 @@ dependencies = [
"regex", "regex",
"rustc-hash", "rustc-hash",
"shlex", "shlex",
"syn 2.0.107", "syn",
"which", "which",
] ]
@@ -433,7 +411,7 @@ dependencies = [
"heck", "heck",
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.107", "syn",
] ]
[[package]] [[package]]
@@ -767,7 +745,7 @@ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"strict", "strict",
"syn 2.0.107", "syn",
] ]
[[package]] [[package]]
@@ -906,7 +884,7 @@ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"strsim", "strsim",
"syn 2.0.107", "syn",
] ]
[[package]] [[package]]
@@ -917,7 +895,7 @@ checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead"
dependencies = [ dependencies = [
"darling_core", "darling_core",
"quote", "quote",
"syn 2.0.107", "syn",
] ]
[[package]] [[package]]
@@ -939,7 +917,7 @@ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"rustc_version", "rustc_version",
"syn 2.0.107", "syn",
] ]
[[package]] [[package]]
@@ -960,7 +938,7 @@ dependencies = [
"convert_case 0.7.1", "convert_case 0.7.1",
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.107", "syn",
] ]
[[package]] [[package]]
@@ -1023,7 +1001,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.107", "syn",
] ]
[[package]] [[package]]
@@ -1213,7 +1191,7 @@ checksum = "1a5c6c585bc94aaf2c7b51dd4c2ba22680844aba4c687be581871a6f518c5742"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.107", "syn",
] ]
[[package]] [[package]]
@@ -1293,7 +1271,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.107", "syn",
] ]
[[package]] [[package]]
@@ -1374,7 +1352,6 @@ dependencies = [
"serde", "serde",
"serde_json", "serde_json",
"shellexpand", "shellexpand",
"tesseract",
"thiserror 1.0.69", "thiserror 1.0.69",
"tokio", "tokio",
"tracing", "tracing",
@@ -1959,7 +1936,7 @@ dependencies = [
"indoc", "indoc",
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.107", "syn",
] ]
[[package]] [[package]]
@@ -2080,7 +2057,7 @@ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"regex", "regex",
"syn 2.0.107", "syn",
] ]
[[package]] [[package]]
@@ -2101,28 +2078,6 @@ version = "0.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a79a3332a6609480d7d0c9eab957bca6b455b91bb84e66d19f5ff66294b85b8" checksum = "7a79a3332a6609480d7d0c9eab957bca6b455b91bb84e66d19f5ff66294b85b8"
[[package]]
name = "leptonica-plumbing"
version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cc7a74c43d6f090d39158d233f326f47cd8bba545217595c93662b4e31156f42"
dependencies = [
"leptonica-sys",
"libc",
"thiserror 1.0.69",
]
[[package]]
name = "leptonica-sys"
version = "0.4.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da627c72b2499a8106f4dd33143843015e4a631f445d561f3481f7fba35b6151"
dependencies = [
"bindgen 0.64.0",
"pkg-config",
"vcpkg",
]
[[package]] [[package]]
name = "libc" name = "libc"
version = "0.2.177" version = "0.2.177"
@@ -2203,7 +2158,7 @@ version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "037a1881ada3592c6a922224d5177b4b4f452e6b2979eb97393b71989e48357f" checksum = "037a1881ada3592c6a922224d5177b4b4f452e6b2979eb97393b71989e48357f"
dependencies = [ dependencies = [
"bindgen 0.69.5", "bindgen",
"cc", "cc",
"link-cplusplus", "link-cplusplus",
"once_cell", "once_cell",
@@ -2478,7 +2433,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.107", "syn",
] ]
[[package]] [[package]]
@@ -2550,12 +2505,6 @@ version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df94ce210e5bc13cb6651479fa48d14f601d9858cfe0467f43ae157023b938d3" checksum = "df94ce210e5bc13cb6651479fa48d14f601d9858cfe0467f43ae157023b938d3"
[[package]]
name = "peeking_take_while"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099"
[[package]] [[package]]
name = "percent-encoding" name = "percent-encoding"
version = "2.3.2" version = "2.3.2"
@@ -2592,7 +2541,7 @@ dependencies = [
"pest_meta", "pest_meta",
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.107", "syn",
] ]
[[package]] [[package]]
@@ -2673,7 +2622,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"syn 2.0.107", "syn",
] ]
[[package]] [[package]]
@@ -3078,7 +3027,7 @@ checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.107", "syn",
] ]
[[package]] [[package]]
@@ -3272,18 +3221,7 @@ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"rustversion", "rustversion",
"syn 2.0.107", "syn",
]
[[package]]
name = "syn"
version = "1.0.109"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
] ]
[[package]] [[package]]
@@ -3317,7 +3255,7 @@ checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.107", "syn",
] ]
[[package]] [[package]]
@@ -3370,40 +3308,6 @@ dependencies = [
"unicode-width 0.1.14", "unicode-width 0.1.14",
] ]
[[package]]
name = "tesseract"
version = "0.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2ee0c2c608b63817b095f7fded5c50add36a29e2be2b2fc4901357163329290a"
dependencies = [
"tesseract-plumbing",
"tesseract-sys",
"thiserror 1.0.69",
]
[[package]]
name = "tesseract-plumbing"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3e496d3e29eba540a276975394b85dccb5fd344b3eefb743d9286c8150f766d5"
dependencies = [
"leptonica-plumbing",
"tesseract-sys",
"thiserror 1.0.69",
]
[[package]]
name = "tesseract-sys"
version = "0.5.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bd33f6f216124cfaf0fa86c2c0cdf04da39b6257bd78c5e44fa4fa98c3a5857b"
dependencies = [
"bindgen 0.64.0",
"leptonica-sys",
"pkg-config",
"vcpkg",
]
[[package]] [[package]]
name = "thiserror" name = "thiserror"
version = "1.0.69" version = "1.0.69"
@@ -3430,7 +3334,7 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.107", "syn",
] ]
[[package]] [[package]]
@@ -3441,7 +3345,7 @@ checksum = "3ff15c8ecd7de3849db632e14d18d2571fa09dfc5ed93479bc4485c7a517c913"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.107", "syn",
] ]
[[package]] [[package]]
@@ -3539,7 +3443,7 @@ checksum = "af407857209536a95c8e56f8231ef2c2e2aff839b22e07a1ffcbc617e9db9fa5"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.107", "syn",
] ]
[[package]] [[package]]
@@ -3665,7 +3569,7 @@ checksum = "81383ab64e72a7a8b8e13130c49e3dab29def6d0c7d76a03087b3cf71c5c6903"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.107", "syn",
] ]
[[package]] [[package]]
@@ -3870,7 +3774,7 @@ dependencies = [
"log", "log",
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.107", "syn",
"wasm-bindgen-shared", "wasm-bindgen-shared",
] ]
@@ -3905,7 +3809,7 @@ checksum = "9f07d2f20d4da7b26400c9f4a0511e6e0345b040694e8a75bd41d578fa4421d7"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.107", "syn",
"wasm-bindgen-backend", "wasm-bindgen-backend",
"wasm-bindgen-shared", "wasm-bindgen-shared",
] ]
@@ -4077,7 +3981,7 @@ checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.107", "syn",
] ]
[[package]] [[package]]
@@ -4088,7 +3992,7 @@ checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.107", "syn",
] ]
[[package]] [[package]]
@@ -4484,7 +4388,7 @@ checksum = "38da3c9736e16c5d3c8c597a9aaa5d1fa565d0532ae05e27c24aa62fb32c0ab6"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.107", "syn",
"synstructure", "synstructure",
] ]
@@ -4505,7 +4409,7 @@ checksum = "88d2b8d9c68ad2b9e4340d7832716a4d21a22a1154777ad56ea55c51a9cf3831"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.107", "syn",
] ]
[[package]] [[package]]
@@ -4525,7 +4429,7 @@ checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.107", "syn",
"synstructure", "synstructure",
] ]
@@ -4559,7 +4463,7 @@ checksum = "5b96237efa0c878c64bd89c436f661be4e46b2f3eff1ebb976f7ef2321d2f58f"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.107", "syn",
] ]
[[package]] [[package]]

View File

@@ -193,7 +193,12 @@ impl UiWriter for ConsoleUiWriter {
// Truncate long values for display // Truncate long values for display
let display_value = if first_line.len() > 80 { let display_value = if first_line.len() > 80 {
format!("{}...", &first_line[..77]) // Use char_indices to safely truncate at character boundary
let truncate_at = first_line.char_indices()
.nth(77)
.map(|(i, _)| i)
.unwrap_or(first_line.len());
format!("{}...", &first_line[..truncate_at])
} else { } else {
first_line.to_string() first_line.to_string()
}; };
@@ -440,7 +445,12 @@ impl UiWriter for RetroTuiWriter {
if caption.is_empty() && (key == "file_path" || key == "command" || key == "path") { if caption.is_empty() && (key == "file_path" || key == "command" || key == "path") {
// Truncate long values for the caption // Truncate long values for the caption
let truncated = if value.len() > 50 { let truncated = if value.len() > 50 {
format!("{}...", &value[..47]) // Use char_indices to safely truncate at character boundary
let truncate_at = value.char_indices()
.nth(47)
.map(|(i, _)| i)
.unwrap_or(value.len());
format!("{}...", &value[..truncate_at])
} else { } else {
value.to_string() value.to_string()
}; };

View File

@@ -3,6 +3,9 @@ name = "g3-computer-control"
version = "0.1.0" version = "0.1.0"
edition = "2021" edition = "2021"
[build-dependencies]
# Only needed for building Swift bridge on macOS
[dependencies] [dependencies]
# Workspace dependencies # Workspace dependencies
tokio = { workspace = true } tokio = { workspace = true }
@@ -20,9 +23,6 @@ async-trait = "0.1"
# WebDriver support # WebDriver support
fantoccini = "0.21" fantoccini = "0.21"
# OCR dependencies
tesseract = "0.14"
# macOS dependencies # macOS dependencies
[target.'cfg(target_os = "macos")'.dependencies] [target.'cfg(target_os = "macos")'.dependencies]
core-graphics = "0.23" core-graphics = "0.23"

View File

@@ -0,0 +1,63 @@
use std::env;
use std::path::PathBuf;
use std::process::Command;
/// Build-script entry point: compiles the `vision-bridge` Swift package and
/// wires the resulting `libVisionBridge.dylib` into the Rust link step.
///
/// Does nothing unless the compilation target is macOS.
///
/// # Panics
///
/// Panics (which fails the build) when the Swift build cannot be started or
/// exits unsuccessfully, when `.build/release` cannot be resolved, or when the
/// dylib cannot be copied next to the final binaries.
fn main() {
    // Only build Vision bridge on macOS. A missing variable is treated the same
    // as a non-macOS target instead of panicking.
    if env::var("CARGO_CFG_TARGET_OS").as_deref() != Ok("macos") {
        return;
    }

    println!("cargo:rerun-if-changed=vision-bridge/Sources/VisionBridge/VisionOCR.swift");
    println!("cargo:rerun-if-changed=vision-bridge/Sources/VisionBridge/VisionBridge.h");
    println!("cargo:rerun-if-changed=vision-bridge/Package.swift");

    let manifest_dir = PathBuf::from(
        env::var("CARGO_MANIFEST_DIR").expect("Cargo sets CARGO_MANIFEST_DIR for build scripts"),
    );
    let vision_bridge_dir = manifest_dir.join("vision-bridge");

    // Build Swift package
    println!("cargo:warning=Building VisionBridge Swift package...");
    let build_status = Command::new("swift")
        .args(["build", "-c", "release"])
        .current_dir(&vision_bridge_dir)
        .status()
        .expect("Failed to build Swift package");
    if !build_status.success() {
        panic!("Swift build failed");
    }

    // Find the built library
    let lib_path = vision_bridge_dir
        .join(".build/release")
        .canonicalize()
        .expect("Failed to find .build/release directory");

    // Copy the dylib to the output directory so it can be found at runtime.
    // The target directory is taken to be `<manifest>/../../target`.
    // NOTE(review): this ignores a custom CARGO_TARGET_DIR / `--target` layout — confirm.
    let target_dir = manifest_dir
        .ancestors()
        .nth(2)
        .expect("crate manifest directory should sit two levels below the workspace root")
        .join("target");
    let profile = env::var("PROFILE").unwrap_or_else(|_| "debug".to_string());
    let output_dir = target_dir.join(&profile);

    // The destination may not exist yet; create it so the copy cannot fail
    // just because of a missing directory.
    if let Err(err) = std::fs::create_dir_all(&output_dir) {
        panic!("Failed to create output directory {}: {}", output_dir.display(), err);
    }

    let dylib_src = lib_path.join("libVisionBridge.dylib");
    let dylib_dst = output_dir.join("libVisionBridge.dylib");
    // Build the panic message only on failure, and include the I/O error.
    if let Err(err) = std::fs::copy(&dylib_src, &dylib_dst) {
        panic!(
            "Failed to copy dylib from {} to {}: {}",
            dylib_src.display(),
            dylib_dst.display(),
            err
        );
    }
    println!("cargo:warning=Copied libVisionBridge.dylib to {}", dylib_dst.display());

    // Add rpath so the dylib can be found at runtime
    println!("cargo:rustc-link-arg=-Wl,-rpath,@executable_path");
    println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path");
    println!("cargo:rustc-link-search=native={}", lib_path.display());
    println!("cargo:rustc-link-lib=dylib=VisionBridge");

    // Link required frameworks
    println!("cargo:rustc-link-lib=framework=Vision");
    println!("cargo:rustc-link-lib=framework=AppKit");
    println!("cargo:rustc-link-lib=framework=Foundation");
    println!("cargo:rustc-link-lib=framework=CoreGraphics");
    println!("cargo:rustc-link-lib=framework=CoreImage");

    println!("cargo:warning=VisionBridge built successfully at {}", lib_path.display());
}

View File

@@ -0,0 +1,85 @@
use g3_computer_control::ocr::{OCREngine, DefaultOCR};
use anyhow::Result;
/// Manual smoke test for the platform's default OCR engine.
///
/// Uses `/tmp/safari_test.png` as input, capturing a 1200x800 screenshot with
/// `screencapture` when that file does not exist, then runs OCR on it and
/// prints the first 20 results together with timing figures.
///
/// # Errors
///
/// Returns an error if the OCR engine cannot be created, the screenshot
/// command cannot be run or exits unsuccessfully, or OCR itself fails.
#[tokio::main]
async fn main() -> Result<()> {
    println!("🧪 Testing Apple Vision OCR");
    println!("===========================\n");

    // Initialize OCR engine
    println!("📦 Initializing OCR engine...");
    let ocr = DefaultOCR::new()?;
    println!("✅ OCR engine: {}\n", ocr.name());

    // Check if test image exists
    let test_image = "/tmp/safari_test.png";
    if !std::path::Path::new(test_image).exists() {
        println!("⚠️ Test image not found: {}", test_image);
        println!(" Creating a screenshot...");
        let status = std::process::Command::new("screencapture")
            .arg("-x")
            .arg("-R")
            .arg("0,0,1200,800")
            .arg(test_image)
            .status()?;
        if !status.success() {
            anyhow::bail!("Failed to create screenshot");
        }
        println!("✅ Screenshot created\n");
    }

    // Run OCR
    println!("🔍 Running Apple Vision OCR on {}...", test_image);
    let start = std::time::Instant::now();
    let locations = ocr.extract_text_with_locations(test_image).await?;
    let duration = start.elapsed();
    println!("✅ OCR completed in {:.3}s\n", duration.as_secs_f64());

    // Display results
    println!("📊 Results:");
    println!(" Found {} text elements\n", locations.len());

    if locations.is_empty() {
        println!("⚠️ No text found in image");
    } else {
        println!(" Top 20 results:");
        println!(" {:<4} {:<40} {:<15} {:<12} {:<8}", "#", "Text", "Position", "Size", "Conf");
        println!(" {}", "-".repeat(85));

        for (i, loc) in locations.iter().take(20).enumerate() {
            // Truncate on a character boundary: OCR output is frequently
            // non-ASCII, and slicing at a fixed byte offset would panic in the
            // middle of a multi-byte character. The text is shortened only
            // when it has more than 37 characters.
            let text = match loc.text.char_indices().nth(37) {
                Some((cut, _)) => format!("{}...", &loc.text[..cut]),
                None => loc.text.clone(),
            };
            println!(" {:<4} {:<40} ({:>4},{:>4}) {:>4}x{:<4} {:.2}",
                i + 1,
                text,
                loc.x,
                loc.y,
                loc.width,
                loc.height,
                loc.confidence
            );
        }

        if locations.len() > 20 {
            println!("\n ... and {} more", locations.len() - 20);
        }

        // Performance comparison
        // `locations` is non-empty in this branch, so the average is well defined.
        println!("\n📈 Performance:");
        println!(" OCR Speed: {:.3}s", duration.as_secs_f64());
        println!(" Text elements: {}", locations.len());
        println!(" Avg per element: {:.1}ms", duration.as_millis() as f64 / locations.len() as f64);
    }

    println!("\n✅ Test complete!");
    Ok(())
}

View File

@@ -3,6 +3,7 @@
pub mod types; pub mod types;
pub mod platform; pub mod platform;
pub mod ocr;
pub mod webdriver; pub mod webdriver;
pub mod macax; pub mod macax;
@@ -25,11 +26,11 @@ pub trait ComputerController: Send + Sync {
async fn extract_text_from_screen(&self, region: Rect) -> Result<String>; async fn extract_text_from_screen(&self, region: Rect) -> Result<String>;
async fn extract_text_from_image(&self, path: &str) -> Result<String>; async fn extract_text_from_image(&self, path: &str) -> Result<String>;
async fn extract_text_with_locations(&self, path: &str) -> Result<Vec<TextLocation>>; async fn extract_text_with_locations(&self, path: &str) -> Result<Vec<TextLocation>>;
async fn find_text_on_screen(&self, search_text: &str) -> Result<Option<TextLocation>>; async fn find_text_in_app(&self, app_name: &str, search_text: &str) -> Result<Option<TextLocation>>;
// Mouse operations // Mouse operations
fn move_mouse(&self, x: i32, y: i32) -> Result<()>; fn move_mouse(&self, x: i32, y: i32) -> Result<()>;
fn click_at(&self, x: i32, y: i32) -> Result<()>; fn click_at(&self, x: i32, y: i32, app_name: Option<&str>) -> Result<()>;
} }
// Platform-specific constructor // Platform-specific constructor

View File

@@ -0,0 +1,26 @@
use crate::types::TextLocation;
use anyhow::Result;
use async_trait::async_trait;
/// OCR engine trait for text recognition with bounding boxes
///
/// Implementors must be `Send + Sync`. `#[async_trait]` is applied so the
/// trait can declare an `async fn`.
#[async_trait]
pub trait OCREngine: Send + Sync {
/// Extract text with locations from an image file
///
/// `path` is the image file to analyze. On success the returned vector
/// holds one [`TextLocation`] per recognized piece of text; failures are
/// reported through the `Result`.
async fn extract_text_with_locations(&self, path: &str) -> Result<Vec<TextLocation>>;
/// Get the name of the OCR engine
///
/// A human-readable label for the backend, suitable for logs and output.
fn name(&self) -> &str;
}
// Platform-specific modules
#[cfg(target_os = "macos")]
pub mod vision;
pub mod tesseract;
// Re-export the default OCR engine for the platform
#[cfg(target_os = "macos")]
pub use vision::AppleVisionOCR as DefaultOCR;
#[cfg(not(target_os = "macos"))]
pub use tesseract::TesseractOCR as DefaultOCR;

View File

@@ -0,0 +1,84 @@
use super::OCREngine;
use crate::types::TextLocation;
use anyhow::Result;
use async_trait::async_trait;
/// Tesseract OCR engine (fallback/cross-platform)
///
/// Stateless: every operation shells out to the `tesseract` command-line tool.
pub struct TesseractOCR;

impl TesseractOCR {
    /// Creates the engine after checking that a `tesseract` executable can be
    /// located with `which`.
    ///
    /// # Errors
    ///
    /// Returns an error containing installation instructions when `which`
    /// cannot be run or reports that `tesseract` was not found.
    pub fn new() -> Result<Self> {
        // Check if tesseract is available. A failure to spawn `which` is
        // treated the same as "not found".
        // NOTE(review): `which` is a Unix tool; on Windows this probe
        // presumably fails even when tesseract is installed — confirm.
        let tesseract_found = std::process::Command::new("which")
            .arg("tesseract")
            .output()
            .map(|probe| probe.status.success())
            .unwrap_or(false);

        if !tesseract_found {
            anyhow::bail!("Tesseract OCR is not installed on your system.\n\n\
                To install tesseract:\n macOS: brew install tesseract\n \
                Linux: sudo apt-get install tesseract-ocr (Ubuntu/Debian)\n \
                sudo yum install tesseract (RHEL/CentOS)\n \
                Windows: Download from https://github.com/UB-Mannheim/tesseract/wiki\n\n\
                After installation, restart your terminal and try again.");
        }

        Ok(Self)
    }
}
#[async_trait]
impl OCREngine for TesseractOCR {
    /// Runs `tesseract <path> stdout tsv` and turns each usable TSV row into a
    /// [`TextLocation`].
    ///
    /// Rows are kept only when they have at least 12 columns, their geometry
    /// and confidence columns parse, the trimmed text is non-empty and the
    /// confidence is greater than zero. Confidence is rescaled from 0-100 to
    /// 0-1.
    ///
    /// The child process is awaited synchronously, so this `async fn` does not
    /// yield while tesseract runs.
    ///
    /// # Errors
    ///
    /// Fails when the process cannot be started or exits unsuccessfully (its
    /// stderr is included in the message).
    async fn extract_text_with_locations(&self, path: &str) -> Result<Vec<TextLocation>> {
        // The CLI's TSV mode is what provides per-word bounding boxes.
        let run = std::process::Command::new("tesseract")
            .args([path, "stdout", "tsv"])
            .output()
            .map_err(|e| anyhow::anyhow!("Failed to run tesseract: {}", e))?;

        if !run.status.success() {
            anyhow::bail!("Tesseract failed: {}", String::from_utf8_lossy(&run.stderr));
        }

        let tsv = String::from_utf8_lossy(&run.stdout);

        // Column layout: level, page_num, block_num, par_num, line_num, word_num,
        //                left, top, width, height, conf, text
        let found = tsv
            .lines()
            .skip(1) // header row
            .filter_map(|row| {
                let cols: Vec<&str> = row.split('\t').collect();
                if cols.len() < 12 {
                    return None;
                }
                let x = cols[6].parse::<i32>().ok()?;
                let y = cols[7].parse::<i32>().ok()?;
                let width = cols[8].parse::<i32>().ok()?;
                let height = cols[9].parse::<i32>().ok()?;
                let conf = cols[10].parse::<f32>().ok()?;
                let word = cols[11].trim();

                (!word.is_empty() && conf > 0.0).then(|| TextLocation {
                    text: word.to_string(),
                    x,
                    y,
                    width,
                    height,
                    confidence: conf / 100.0, // 0-100 scale down to 0-1
                })
            })
            .collect();

        Ok(found)
    }

    /// Human-readable name of this backend.
    fn name(&self) -> &str {
        "Tesseract OCR"
    }
}

View File

@@ -0,0 +1,103 @@
use super::OCREngine;
use crate::types::TextLocation;
use anyhow::{Result, Context};
use async_trait::async_trait;
use std::ffi::{CStr, CString};
use std::os::raw::{c_char, c_float, c_uint};
// FFI bindings to Swift VisionBridge
/// One recognized text region as passed across the C boundary.
///
/// `#[repr(C)]`; the field order mirrors the `VisionTextBox` typedef in
/// `VisionBridge.h`.
#[repr(C)]
struct VisionTextBox {
/// C string with the recognized text. May be null; readers check before use.
text: *const c_char,
/// Length value supplied by the bridge. The Rust side does not read it and
/// relies on NUL termination instead.
text_len: c_uint,
/// Bounding-box origin. The Swift side converts Vision's normalized,
/// bottom-left-origin box into pixel coordinates with a top-left origin.
x: i32,
y: i32,
/// Bounding-box size, in the same units as `x`/`y`.
width: i32,
height: i32,
/// Recognition confidence reported by the bridge.
confidence: c_float,
}
// Functions implemented by the VisionBridge library; the prototypes live in
// `VisionBridge.h`.
extern "C" {
/// Recognizes text in the image at `image_path` (`image_path_len` bytes).
///
/// Per the header: returns `true` on success, and the caller must release
/// the boxes written to `out_boxes` with `vision_free_boxes`. `out_count`
/// receives the number of boxes.
///
/// `out_boxes` is declared here as an untyped pointer (the header uses
/// `VisionTextBox**`); callers cast it to `*const VisionTextBox`.
fn vision_recognize_text(
image_path: *const c_char,
image_path_len: c_uint,
out_boxes: *mut *mut std::ffi::c_void,
out_count: *mut c_uint,
) -> bool;
/// Frees memory allocated by `vision_recognize_text`; pass back the pointer
/// and count that call produced.
fn vision_free_boxes(boxes: *mut std::ffi::c_void, count: c_uint);
}
/// Apple Vision Framework OCR engine
///
/// Stateless unit struct; recognition goes through the FFI functions declared
/// in this file.
pub struct AppleVisionOCR;
impl AppleVisionOCR {
/// Creates the engine. This performs no work and always returns `Ok`.
pub fn new() -> Result<Self> {
Ok(Self)
}
}
#[async_trait]
impl OCREngine for AppleVisionOCR {
    /// Recognizes text in the image at `path` through the Swift VisionBridge
    /// and copies every non-empty result into an owned [`TextLocation`].
    ///
    /// The bridge call is a plain synchronous FFI call, so this `async fn`
    /// does not yield while recognition runs.
    ///
    /// # Errors
    ///
    /// Fails when `path` contains an interior NUL byte, when the bridge
    /// reports failure, or when it hands back a null box array.
    async fn extract_text_with_locations(&self, path: &str) -> Result<Vec<TextLocation>> {
        let c_path = CString::new(path).context("Failed to convert path to C string")?;

        let mut raw_boxes: *mut std::ffi::c_void = std::ptr::null_mut();
        let mut box_count: c_uint = 0;

        // SAFETY: `c_path` is a live, NUL-terminated buffer for the duration of
        // the call, and both out-pointers refer to initialized locals. The
        // bridge is trusted to follow the contract declared in VisionBridge.h.
        let recognized = unsafe {
            vision_recognize_text(
                c_path.as_ptr(),
                path.len() as c_uint,
                &mut raw_boxes,
                &mut box_count,
            )
        };

        if raw_boxes.is_null() || !recognized {
            anyhow::bail!("Apple Vision OCR failed");
        }

        // SAFETY: relies on the bridge contract: after a successful call
        // `raw_boxes` points to `box_count` `VisionTextBox` values that remain
        // valid until `vision_free_boxes` is called, and each non-null `text`
        // is NUL-terminated. Everything borrowed from the array is copied into
        // owned values before the array is released.
        let locations = unsafe {
            let boxes =
                std::slice::from_raw_parts(raw_boxes as *const VisionTextBox, box_count as usize);

            let converted: Vec<TextLocation> = boxes
                .iter()
                .filter_map(|entry| {
                    // A null text pointer is treated like empty text: skipped.
                    if entry.text.is_null() {
                        return None;
                    }
                    let text = CStr::from_ptr(entry.text).to_string_lossy().into_owned();
                    (!text.is_empty()).then(|| TextLocation {
                        text,
                        x: entry.x,
                        y: entry.y,
                        width: entry.width,
                        height: entry.height,
                        confidence: entry.confidence,
                    })
                })
                .collect();

            // Hand the array back to the bridge now that nothing borrows it.
            vision_free_boxes(raw_boxes, box_count);
            converted
        };

        Ok(locations)
    }

    /// Human-readable name of this backend.
    fn name(&self) -> &str {
        "Apple Vision Framework"
    }
}

View File

@@ -1,16 +1,21 @@
use crate::{ComputerController, types::{Rect, TextLocation}}; use crate::{ComputerController, types::{Rect, TextLocation}};
use crate::ocr::{OCREngine, DefaultOCR};
use anyhow::{Result, Context}; use anyhow::{Result, Context};
use async_trait::async_trait; use async_trait::async_trait;
use std::path::Path; use std::path::Path;
use tesseract::Tesseract;
pub struct MacOSController { pub struct MacOSController {
// Empty struct for now ocr_engine: Box<dyn OCREngine>,
#[allow(dead_code)]
ocr_name: String,
} }
impl MacOSController { impl MacOSController {
pub fn new() -> Result<Self> { pub fn new() -> Result<Self> {
Ok(Self {}) let ocr = Box::new(DefaultOCR::new()?);
let ocr_name = ocr.name().to_string();
tracing::info!("Initialized macOS controller with OCR engine: {}", ocr_name);
Ok(Self { ocr_engine: ocr, ocr_name })
} }
} }
@@ -90,95 +95,21 @@ impl ComputerController for MacOSController {
} }
async fn extract_text_from_image(&self, path: &str) -> Result<String> { async fn extract_text_from_image(&self, path: &str) -> Result<String> {
// Check if tesseract is available on the system // Extract all text and concatenate
let tesseract_check = std::process::Command::new("which") let locations = self.ocr_engine.extract_text_with_locations(path).await?;
.arg("tesseract") Ok(locations.iter().map(|loc| loc.text.as_str()).collect::<Vec<_>>().join(" "))
.output();
if tesseract_check.is_err() || !tesseract_check.as_ref().unwrap().status.success() {
anyhow::bail!("Tesseract OCR is not installed on your system.\n\n\
To install tesseract:\n macOS: brew install tesseract\n \
Linux: sudo apt-get install tesseract-ocr (Ubuntu/Debian)\n \
sudo yum install tesseract (RHEL/CentOS)\n \
Windows: Download from https://github.com/UB-Mannheim/tesseract/wiki\n\n\
After installation, restart your terminal and try again.");
}
// Initialize Tesseract
let tess = Tesseract::new(None, Some("eng"))
.map_err(|e| {
anyhow::anyhow!("Failed to initialize Tesseract: {}\n\n\
This usually means:\n1. Tesseract is not properly installed\n\
2. Language data files are missing\n\nTo fix:\n \
macOS: brew reinstall tesseract\n \
Linux: sudo apt-get install tesseract-ocr-eng\n \
Windows: Reinstall tesseract and ensure language files are included", e)
})?;
let text = tess.set_image(path)
.map_err(|e| anyhow::anyhow!("Failed to load image '{}': {}", path, e))?
.get_text()
.map_err(|e| anyhow::anyhow!("Failed to extract text from image: {}", e))?;
Ok(text)
} }
async fn extract_text_with_locations(&self, path: &str) -> Result<Vec<TextLocation>> { async fn extract_text_with_locations(&self, path: &str) -> Result<Vec<TextLocation>> {
// For now, use tesseract CLI with TSV output to get bounding boxes // Use the OCR engine
// This is a workaround since the Rust tesseract crate doesn't expose get_component_boxes self.ocr_engine.extract_text_with_locations(path).await
let output = std::process::Command::new("tesseract")
.arg(path)
.arg("stdout")
.arg("tsv")
.output()
.map_err(|e| anyhow::anyhow!("Failed to run tesseract: {}", e))?;
if !output.status.success() {
anyhow::bail!("Tesseract failed: {}", String::from_utf8_lossy(&output.stderr));
} }
let tsv_text = String::from_utf8_lossy(&output.stdout); async fn find_text_in_app(&self, app_name: &str, search_text: &str) -> Result<Option<TextLocation>> {
let mut locations = Vec::new(); // Take screenshot of specific app window
// Parse TSV output (skip header line)
for (i, line) in tsv_text.lines().enumerate() {
if i == 0 { continue; } // Skip header
let parts: Vec<&str> = line.split('\t').collect();
if parts.len() >= 12 {
// TSV format: level, page_num, block_num, par_num, line_num, word_num,
// left, top, width, height, conf, text
if let (Ok(x), Ok(y), Ok(w), Ok(h), Ok(conf), text) = (
parts[6].parse::<i32>(),
parts[7].parse::<i32>(),
parts[8].parse::<i32>(),
parts[9].parse::<i32>(),
parts[10].parse::<f32>(),
parts[11],
) {
let trimmed = text.trim();
if !trimmed.is_empty() && conf > 0.0 {
locations.push(TextLocation {
text: trimmed.to_string(),
x,
y,
width: w,
height: h,
confidence: conf / 100.0, // Convert from 0-100 to 0-1
});
}
}
}
}
Ok(locations)
}
async fn find_text_on_screen(&self, search_text: &str) -> Result<Option<TextLocation>> {
// Take full screenshot
let home = std::env::var("HOME").unwrap_or_else(|_| "/tmp".to_string()); let home = std::env::var("HOME").unwrap_or_else(|_| "/tmp".to_string());
let temp_path = format!("{}/Desktop/g3_find_text_{}.png", home, uuid::Uuid::new_v4()); let temp_path = format!("{}/Desktop/g3_find_text_{}_{}.png", home, app_name, uuid::Uuid::new_v4());
self.take_screenshot(&temp_path, None, None).await?; self.take_screenshot(&temp_path, None, Some(app_name)).await?;
// Extract all text with locations // Extract all text with locations
let locations = self.extract_text_with_locations(&temp_path).await?; let locations = self.extract_text_with_locations(&temp_path).await?;
@@ -221,7 +152,44 @@ impl ComputerController for MacOSController {
Ok(()) Ok(())
} }
fn click_at(&self, x: i32, y: i32) -> Result<()> { fn click_at(&self, x: i32, y: i32, app_name: Option<&str>) -> Result<()> {
// If app_name is provided, get window position and offset coordinates
let (global_x, global_y) = if let Some(app) = app_name {
// Get window position using AppleScript
let script = format!(
r#"tell application "{}" to get bounds of window 1"#,
app
);
let output = std::process::Command::new("osascript")
.arg("-e")
.arg(&script)
.output()?;
if output.status.success() {
let bounds_str = String::from_utf8_lossy(&output.stdout);
// Parse bounds: "x1, y1, x2, y2"
let parts: Vec<&str> = bounds_str.trim().split(", ").collect();
if parts.len() >= 2 {
if let (Ok(window_x), Ok(window_y)) = (
parts[0].trim().parse::<i32>(),
parts[1].trim().parse::<i32>(),
) {
// Offset relative coordinates by window position
(x + window_x, y + window_y)
} else {
(x, y) // Fallback to absolute coordinates
}
} else {
(x, y) // Fallback to absolute coordinates
}
} else {
(x, y) // Fallback to absolute coordinates
}
} else {
(x, y) // No app name, use absolute coordinates
};
use core_graphics::event::{ use core_graphics::event::{
CGEvent, CGEventTapLocation, CGEventType, CGMouseButton, CGEvent, CGEventTapLocation, CGEventType, CGMouseButton,
}; };
@@ -233,7 +201,7 @@ impl ComputerController for MacOSController {
let source = CGEventSource::new(CGEventSourceStateID::HIDSystemState) let source = CGEventSource::new(CGEventSourceStateID::HIDSystemState)
.ok().context("Failed to create event source")?; .ok().context("Failed to create event source")?;
let point = CGPoint::new(x as f64, y as f64); let point = CGPoint::new(global_x as f64, global_y as f64);
// Move mouse to position first // Move mouse to position first
let move_event = CGEvent::new_mouse_event( let move_event = CGEvent::new_mouse_event(

View File

@@ -0,0 +1,24 @@
// swift-tools-version:5.9
import PackageDescription
// Package manifest for the Swift side of the OCR bridge.
let package = Package(
name: "VisionBridge",
platforms: [
// Minimum supported platform: macOS 11.
.macOS(.v11)
],
products: [
// Built as a dynamic library; the Rust build script links against
// `VisionBridge` and copies `libVisionBridge.dylib` next to the binaries.
.library(
name: "VisionBridge",
type: .dynamic,
targets: ["VisionBridge"]
),
],
targets: [
// Single target with no package dependencies. Public headers are taken
// from the same directory as the sources.
.target(
name: "VisionBridge",
dependencies: [],
path: "Sources/VisionBridge",
publicHeadersPath: "."
),
]
)

View File

@@ -0,0 +1,39 @@
#ifndef VisionBridge_h
#define VisionBridge_h
#include <stdint.h>
#include <stdbool.h>
#ifdef __cplusplus
extern "C" {
#endif
// Text box structure for FFI
// Describes one recognized text region. The Rust side declares a struct with
// the same field order, so any change here must be mirrored there.
typedef struct {
// Recognized text as a C string (the Swift implementation allocates it with strdup).
const char* text;
// UTF-8 byte count of the text, as filled in by the Swift implementation.
uint32_t text_len;
// Bounding-box origin in pixels, top-left origin (per the Swift conversion).
int32_t x;
int32_t y;
// Bounding-box size in pixels.
int32_t width;
int32_t height;
// Confidence value of the recognition result.
float confidence;
} VisionTextBox;
// Recognize text in an image and return bounding boxes
// Returns true on success, false on failure
// Caller must free the returned boxes using vision_free_boxes
//
// image_path:     path of the image file, read as UTF-8
// image_path_len: number of bytes to read from image_path
// out_boxes:      receives the array of recognized boxes
// out_count:      receives the number of entries in that array
//
// The implementation returns false when the path bytes are not valid UTF-8 or
// the image cannot be loaded.
bool vision_recognize_text(
const char* image_path,
uint32_t image_path_len,
VisionTextBox** out_boxes,
uint32_t* out_count
);
// Free memory allocated by vision_recognize_text
//
// boxes: the array received through out_boxes
// count: the entry count received through out_count
// NOTE(review): presumably this also releases each entry's text string —
// confirm against the Swift implementation.
void vision_free_boxes(VisionTextBox* boxes, uint32_t count);
#ifdef __cplusplus
}
#endif
#endif /* VisionBridge_h */

View File

@@ -0,0 +1,145 @@
import Foundation
import Vision
import AppKit
import CoreGraphics
// MARK: - C Bridge Functions
/// Runs Vision text recognition on an image file and returns the recognized
/// strings with their pixel bounding boxes through C-compatible out-parameters.
///
/// - Parameters:
///   - imagePath: UTF-8 bytes of the image path. Only `imagePathLen` bytes are
///     read, so the buffer does not have to be NUL-terminated. Leading and
///     trailing whitespace is trimmed before the file is opened.
///   - imagePathLen: Number of bytes in `imagePath`.
///   - outBoxes: Receives a pointer to a heap-allocated array of `CTextBox`
///     on success. Set to `nil` on failure.
///   - outCount: Receives the number of elements in the array on success.
///     Set to `0` on failure.
/// - Returns: `true` if recognition ran successfully (possibly with zero
///   results), `false` if an argument was NULL, the path was not valid UTF-8,
///   the image could not be loaded, or the Vision request failed.
///
/// The returned array and the `text` strings inside it must be released with
/// `vision_free_boxes`.
@_cdecl("vision_recognize_text")
public func vision_recognize_text(
    _ imagePath: UnsafePointer<CChar>?,
    _ imagePathLen: UInt32,
    _ outBoxes: UnsafeMutablePointer<UnsafeMutableRawPointer?>?,
    _ outCount: UnsafeMutablePointer<UInt32>?
) -> Bool {
    // C callers can legally pass NULL; reject it instead of crashing at the boundary.
    guard let imagePath = imagePath, let outBoxes = outBoxes, let outCount = outCount else {
        return false
    }

    // Give the outputs a defined value on every failure path so the caller
    // never observes (or frees) an uninitialised pointer.
    outBoxes.pointee = nil
    outCount.pointee = 0

    // Decode the length-delimited path bytes as UTF-8 without an intermediate copy.
    let pathBytes = UnsafeRawBufferPointer(start: imagePath, count: Int(imagePathLen))
    guard let rawPath = String(bytes: pathBytes, encoding: .utf8) else {
        return false
    }
    let path = rawPath.trimmingCharacters(in: .whitespaces)

    // Load image
    guard let image = NSImage(contentsOfFile: path),
          let cgImage = image.cgImage(forProposedRect: nil, context: nil, hints: nil) else {
        return false
    }

    var textBoxes: [CTextBox] = []
    var success = false

    let request = VNRecognizeTextRequest { request, error in
        if let error = error {
            // Diagnostics go to stderr so they cannot mix with the host's stdout output.
            fputs("Vision OCR error: \(error.localizedDescription)\n", stderr)
            return
        }

        guard let observations = request.results as? [VNRecognizedTextObservation] else {
            return
        }

        let imageSize = CGSize(width: cgImage.width, height: cgImage.height)

        for observation in observations {
            guard let candidate = observation.topCandidates(1).first else { continue }

            // Duplicate the string onto the C heap; skip the observation if the
            // allocation fails rather than emitting a NULL text with a non-zero length.
            guard let cString = strdup(candidate.string) else { continue }

            let boundingBox = observation.boundingBox

            // Convert normalized coordinates (bottom-left origin) to pixel coordinates (top-left origin)
            let x = Int32(boundingBox.origin.x * imageSize.width)
            let y = Int32((1.0 - boundingBox.origin.y - boundingBox.height) * imageSize.height)
            let width = Int32(boundingBox.width * imageSize.width)
            let height = Int32(boundingBox.height * imageSize.height)

            textBoxes.append(CTextBox(
                text: UnsafePointer(cString),
                // Measure the C copy itself so text_len always matches the bytes
                // that are actually readable through `text`.
                text_len: UInt32(strlen(cString)),
                x: x,
                y: y,
                width: width,
                height: height,
                confidence: observation.confidence
            ))
        }

        success = true
    }

    // Configure request for best accuracy
    request.recognitionLevel = .accurate
    request.usesLanguageCorrection = true
    request.recognitionLanguages = ["en-US"]

    // `perform` is synchronous: it returns only after the request has completed
    // or failed, so the completion handler above has already run by the time we
    // continue. No semaphore is needed.
    let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
    do {
        try handler.perform([request])
    } catch {
        fputs("Vision request failed: \(error.localizedDescription)\n", stderr)
        // Release any strings that were duplicated before the failure surfaced.
        for box in textBoxes {
            if let text = box.text {
                free(UnsafeMutableRawPointer(mutating: text))
            }
        }
        return false
    }

    if !success {
        return false
    }

    // Copy the results into a raw array the caller can walk from C.
    let boxesPtr = UnsafeMutablePointer<CTextBox>.allocate(capacity: textBoxes.count)
    for (index, box) in textBoxes.enumerated() {
        // The memory is uninitialised, so initialise each slot instead of assigning to it.
        (boxesPtr + index).initialize(to: box)
    }

    outBoxes.pointee = UnsafeMutableRawPointer(boxesPtr)
    outCount.pointee = UInt32(textBoxes.count)
    return true
}
/// Releases an array previously returned by `vision_recognize_text`.
///
/// - Parameters:
///   - boxes: The array pointer that was written to `out_boxes`. Passing NULL
///     is allowed and does nothing.
///   - count: The element count that was written to `out_count`. It must match
///     the original value so that every `text` string is released.
///
/// Each element's `text` was allocated with `strdup`, so it is released with
/// `free`; the array itself is then deallocated.
@_cdecl("vision_free_boxes")
public func vision_free_boxes(
    _ boxes: UnsafeMutableRawPointer?,
    _ count: UInt32
) {
    // Mirror free(NULL): a NULL array is a no-op instead of undefined behaviour.
    guard let boxes = boxes else { return }

    let typedBoxes = boxes.assumingMemoryBound(to: CTextBox.self)
    for i in 0..<Int(count) {
        if let text = typedBoxes[i].text {
            free(UnsafeMutableRawPointer(mutating: text))
        }
    }
    typedBoxes.deallocate()
}
// MARK: - C-Compatible Structure
/// One recognized text element, laid out for consumption across the C boundary.
///
/// The stored properties appear in the same order and with the same widths as
/// the fields of `VisionTextBox` in VisionBridge.h; keep the two in sync.
/// NOTE(review): Swift does not formally promise C layout for native structs —
/// confirm the layout matches the header on the supported toolchains.
public struct CTextBox {
/// C string holding the recognized text, or nil when no string is attached.
public let text: UnsafePointer<CChar>?
/// Length of `text` in bytes.
public let text_len: UInt32
/// Left edge of the bounding box in pixels.
public let x: Int32
/// Top edge of the bounding box in pixels.
public let y: Int32
/// Bounding box width in pixels.
public let width: Int32
/// Bounding box height in pixels.
public let height: Int32
/// Recognizer confidence for this element.
public let confidence: Float
/// Memberwise initializer; declared explicitly because the struct is public.
public init(text: UnsafePointer<CChar>?, text_len: UInt32, x: Int32, y: Int32, width: Int32, height: Int32, confidence: Float) {
self.text = text
self.text_len = text_len
self.x = x
self.y = y
self.width = width
self.height = height
self.confidence = confidence
}
}

View File

@@ -1825,7 +1825,7 @@ Template:
}, },
Tool { Tool {
name: "extract_text".to_string(), name: "extract_text".to_string(),
description: "Extract text from a screen region or image file using OCR".to_string(), description: "Extract text from a screen region or image file using OCR. Returns plain text only (no bounding boxes). For text with location/coordinates, use vision_find_text instead.".to_string(),
input_schema: json!({ input_schema: json!({
"type": "object", "type": "object",
"properties": { "properties": {
@@ -2280,45 +2280,79 @@ Template:
}); });
} }
// Add extract_text_with_boxes tool (requires macax flag)
if enable_macax {
tools.push(Tool {
name: "extract_text_with_boxes".to_string(),
description: "Extract all text from an image file with bounding box coordinates for each text element. Returns JSON array with text, position (x, y), size (width, height), and confidence for each detected text. Uses Apple Vision Framework for precise sub-pixel accuracy.".to_string(),
input_schema: json!({
"type": "object",
"properties": {
"path": {
"type": "string",
"description": "Path to image file to extract text from"
},
"app_name": {
"type": "string",
"description": "Optional: Name of application to screenshot first (e.g., 'Safari', 'Things3'). If provided, takes screenshot of app before extracting text."
}
},
"required": ["path"]
}),
});
}
// Add vision-guided tools (requires computer control) // Add vision-guided tools (requires computer control)
if enable_computer_control { if enable_computer_control {
// Add vision-guided tools // Add vision-guided tools
tools.push(Tool { tools.push(Tool {
name: "vision_find_text".to_string(), name: "vision_find_text".to_string(),
description: "Find text on screen and return its location (useful for locating UI elements)".to_string(), description: "Find text in a specific application window and return its location with bounding box coordinates (x, y, width, height) and confidence score. Useful for locating UI elements. Uses Apple Vision Framework for precise sub-pixel accuracy.".to_string(),
input_schema: json!({ input_schema: json!({
"type": "object", "type": "object",
"properties": { "properties": {
"app_name": {
"type": "string",
"description": "Name of the application to search in (e.g., 'Things3', 'Safari', 'TextEdit')"
},
"text": { "text": {
"type": "string", "type": "string",
"description": "The text to search for on screen" "description": "The text to search for on screen"
} }
}, },
"required": ["text"] "required": ["app_name", "text"]
}), }),
}); });
tools.push(Tool { tools.push(Tool {
name: "vision_click_text".to_string(), name: "vision_click_text".to_string(),
description: "Find text on screen and click on it (useful for clicking buttons, links, menu items)".to_string(), description: "Find text in a specific application window and click on it (useful for clicking buttons, links, menu items)".to_string(),
input_schema: json!({ input_schema: json!({
"type": "object", "type": "object",
"properties": { "properties": {
"app_name": {
"type": "string",
"description": "Name of the application (e.g., 'Things3', 'Safari', 'TextEdit')"
},
"text": { "text": {
"type": "string", "type": "string",
"description": "The text to click on (e.g., 'Submit', 'OK', 'Cancel', '+')" "description": "The text to click on (e.g., 'Submit', 'OK', 'Cancel', '+')"
} }
}, },
"required": ["text"] "required": ["app_name", "text"]
}), }),
}); });
tools.push(Tool { tools.push(Tool {
name: "vision_click_near_text".to_string(), name: "vision_click_near_text".to_string(),
description: "Find text on screen and click near it (useful for clicking text fields next to labels)".to_string(), description: "Find text in a specific application window and click near it (useful for clicking text fields next to labels)".to_string(),
input_schema: json!({ input_schema: json!({
"type": "object", "type": "object",
"properties": { "properties": {
"app_name": {
"type": "string",
"description": "Name of the application (e.g., 'Things3', 'Safari', 'TextEdit')"
},
"text": { "text": {
"type": "string", "type": "string",
"description": "The label text to find (e.g., 'Name:', 'Email:', 'Task:')" "description": "The label text to find (e.g., 'Name:', 'Email:', 'Task:')"
@@ -2333,7 +2367,7 @@ Template:
"description": "Distance in pixels from the text (default: 50)" "description": "Distance in pixels from the text (default: 50)"
} }
}, },
"required": ["text"] "required": ["app_name", "text"]
}), }),
}); });
} }
@@ -4591,19 +4625,23 @@ Template:
debug!("Processing vision_find_text tool call"); debug!("Processing vision_find_text tool call");
if let Some(controller) = &self.computer_controller { if let Some(controller) = &self.computer_controller {
let app_name = tool_call.args.get("app_name")
.and_then(|v| v.as_str())
.ok_or_else(|| anyhow::anyhow!("Missing app_name parameter"))?;
let text = tool_call.args.get("text") let text = tool_call.args.get("text")
.and_then(|v| v.as_str()) .and_then(|v| v.as_str())
.ok_or_else(|| anyhow::anyhow!("Missing text parameter"))?; .ok_or_else(|| anyhow::anyhow!("Missing text parameter"))?;
match controller.find_text_on_screen(text).await { match controller.find_text_in_app(app_name, text).await {
Ok(Some(location)) => { Ok(Some(location)) => {
Ok(format!( Ok(format!(
"✅ Found '{}' at position ({}, {}) with size {}x{} (confidence: {:.0}%)", "✅ Found '{}' in {} at position ({}, {}) with size {}x{} (confidence: {:.0}%)",
location.text, location.x, location.y, location.width, location.height, location.text, app_name, location.x, location.y, location.width, location.height,
location.confidence * 100.0 location.confidence * 100.0
)) ))
} }
Ok(None) => Ok(format!("❌ Could not find '{}' on screen", text)), Ok(None) => Ok(format!("❌ Could not find '{}' in {}", text, app_name)),
Err(e) => Ok(format!("❌ Error finding text: {}", e)), Err(e) => Ok(format!("❌ Error finding text: {}", e)),
} }
} else { } else {
@@ -4614,32 +4652,83 @@ Template:
debug!("Processing vision_click_text tool call"); debug!("Processing vision_click_text tool call");
if let Some(controller) = &self.computer_controller { if let Some(controller) = &self.computer_controller {
let app_name = tool_call.args.get("app_name")
.and_then(|v| v.as_str())
.ok_or_else(|| anyhow::anyhow!("Missing app_name parameter"))?;
let text = tool_call.args.get("text") let text = tool_call.args.get("text")
.and_then(|v| v.as_str()) .and_then(|v| v.as_str())
.ok_or_else(|| anyhow::anyhow!("Missing text parameter"))?; .ok_or_else(|| anyhow::anyhow!("Missing text parameter"))?;
match controller.find_text_on_screen(text).await { match controller.find_text_in_app(app_name, text).await {
Ok(Some(location)) => { Ok(Some(location)) => {
// Click on center of text // Click on center of text
let center_x = location.x + location.width / 2; let center_x = location.x + location.width / 2;
let center_y = location.y + location.height / 2; let center_y = location.y + location.height / 2;
match controller.click_at(center_x, center_y) { match controller.click_at(center_x, center_y, Some(app_name)) {
Ok(_) => Ok(format!("✅ Clicked on '{}' at ({}, {})", text, center_x, center_y)), Ok(_) => Ok(format!("✅ Clicked on '{}' in {} at ({}, {})", text, app_name, center_x, center_y)),
Err(e) => Ok(format!("❌ Failed to click: {}", e)), Err(e) => Ok(format!("❌ Failed to click: {}", e)),
} }
} }
Ok(None) => Ok(format!("❌ Could not find '{}' on screen", text)), Ok(None) => Ok(format!("❌ Could not find '{}' in {}", text, app_name)),
Err(e) => Ok(format!("❌ Error finding text: {}", e)), Err(e) => Ok(format!("❌ Error finding text: {}", e)),
} }
} else { } else {
Ok("❌ Computer control not enabled. Set computer_control.enabled = true in config.".to_string()) Ok("❌ Computer control not enabled. Set computer_control.enabled = true in config.".to_string())
} }
} }
"extract_text_with_boxes" => {
debug!("Processing extract_text_with_boxes tool call");
if !self.config.macax.enabled {
return Ok("❌ extract_text_with_boxes requires --macax flag to be enabled".to_string());
}
if let Some(controller) = &self.computer_controller {
let path = tool_call.args.get("path")
.and_then(|v| v.as_str())
.ok_or_else(|| anyhow::anyhow!("Missing path parameter"))?;
// Optional: take screenshot of app first
let final_path = if let Some(app_name) = tool_call.args.get("app_name").and_then(|v| v.as_str()) {
let temp_path = format!("/tmp/g3_extract_boxes_{}.png", uuid::Uuid::new_v4());
match controller.take_screenshot(&temp_path, None, Some(app_name)).await {
Ok(_) => temp_path,
Err(e) => return Ok(format!("❌ Failed to take screenshot: {}", e)),
}
} else {
path.to_string()
};
// Extract text with locations
match controller.extract_text_with_locations(&final_path).await {
Ok(locations) => {
// Clean up temp file if we created one
if final_path != path {
let _ = std::fs::remove_file(&final_path);
}
// Return as JSON
match serde_json::to_string_pretty(&locations) {
Ok(json) => Ok(format!("✅ Extracted {} text elements:\n{}", locations.len(), json)),
Err(e) => Ok(format!("❌ Failed to serialize results: {}", e)),
}
}
Err(e) => Ok(format!("❌ Failed to extract text: {}", e)),
}
} else {
Ok("❌ Computer control not enabled. Set computer_control.enabled = true in config.".to_string())
}
}
"vision_click_near_text" => { "vision_click_near_text" => {
debug!("Processing vision_click_near_text tool call"); debug!("Processing vision_click_near_text tool call");
if let Some(controller) = &self.computer_controller { if let Some(controller) = &self.computer_controller {
let app_name = tool_call.args.get("app_name")
.and_then(|v| v.as_str())
.ok_or_else(|| anyhow::anyhow!("Missing app_name parameter"))?;
let text = tool_call.args.get("text") let text = tool_call.args.get("text")
.and_then(|v| v.as_str()) .and_then(|v| v.as_str())
.ok_or_else(|| anyhow::anyhow!("Missing text parameter"))?; .ok_or_else(|| anyhow::anyhow!("Missing text parameter"))?;
@@ -4652,7 +4741,7 @@ Template:
.and_then(|v| v.as_i64()) .and_then(|v| v.as_i64())
.unwrap_or(50) as i32; .unwrap_or(50) as i32;
match controller.find_text_on_screen(text).await { match controller.find_text_in_app(app_name, text).await {
Ok(Some(location)) => { Ok(Some(location)) => {
// Calculate click position based on direction // Calculate click position based on direction
let (click_x, click_y) = match direction { let (click_x, click_y) = match direction {
@@ -4663,15 +4752,15 @@ Template:
_ => (location.x + location.width + distance, location.y + location.height / 2), _ => (location.x + location.width + distance, location.y + location.height / 2),
}; };
match controller.click_at(click_x, click_y) { match controller.click_at(click_x, click_y, Some(app_name)) {
Ok(_) => Ok(format!( Ok(_) => Ok(format!(
"✅ Clicked {} of '{}' at ({}, {})", "✅ Clicked {} of '{}' in {} at ({}, {})",
direction, text, click_x, click_y direction, text, app_name, click_x, click_y
)), )),
Err(e) => Ok(format!("❌ Failed to click: {}", e)), Err(e) => Ok(format!("❌ Failed to click: {}", e)),
} }
} }
Ok(None) => Ok(format!("❌ Could not find '{}' on screen", text)), Ok(None) => Ok(format!("❌ Could not find '{}' in {}", text, app_name)),
Err(e) => Ok(format!("❌ Error finding text: {}", e)), Err(e) => Ok(format!("❌ Error finding text: {}", e)),
} }
} else { } else {

View File

@@ -1,75 +0,0 @@
# Coach-Player Provider Configuration
G3 now supports specifying different LLM providers for the coach and player agents when running in autonomous mode. This allows you to optimize for different requirements:
- **Player**: The agent that implements code - might benefit from a faster, more cost-effective model
- **Coach**: The agent that reviews code - might benefit from a more powerful, analytical model
## Configuration
In your `config.toml` file, under the `[providers]` section, you can specify:
```toml
[providers]
default_provider = "databricks" # Used for normal operations
coach = "databricks" # Provider for coach (code reviewer)
player = "anthropic" # Provider for player (code implementer)
```
If `coach` or `player` is not specified, it defaults to the `default_provider`.
## Example Use Cases
### Cost Optimization
Use a cheaper, faster model for initial implementations (player) and a more powerful model for review (coach):
```toml
coach = "anthropic" # Claude Sonnet for thorough review
player = "anthropic" # Claude Haiku for quick implementation
```
### Speed vs Quality Trade-off
Use a local embedded model for fast iterations (player) and a cloud model for quality review (coach):
```toml
coach = "databricks" # Cloud model for quality review
player = "embedded" # Local model for fast implementation
```
### Specialized Models
Use different models optimized for different tasks:
```toml
coach = "databricks" # Model fine-tuned for code review
player = "openai" # Model optimized for code generation
```
## Requirements
- Both providers must be properly configured in your config file
- Each provider must have valid credentials
- The models specified for each provider must be accessible
## How It Works
When running in autonomous mode (`g3 --autonomous`), the system will:
1. Use the `player` provider (or default) for the initial implementation
2. Switch to the `coach` provider (or default) for code review
3. Return to the `player` provider for implementing feedback
4. Continue this cycle for the specified number of turns
The providers are logged at startup so you can verify which models are being used:
```
🎮 Player provider: anthropic
👨‍🏫 Coach provider: databricks
Using different providers for player and coach
```
## Benefits
- **Cost Efficiency**: Use expensive models only where they add the most value
- **Speed Optimization**: Use faster models for iterative development
- **Specialization**: Leverage models that excel at specific tasks
- **Flexibility**: Easy to experiment with different provider combinations