From 61d748034d0ceb7dbc18b36c8de8e354b0415b58 Mon Sep 17 00:00:00 2001 From: Dhanji Prasanna Date: Fri, 24 Oct 2025 15:35:47 +1100 Subject: [PATCH] replace tesseract with apple vision --- .gitignore | 1 + Cargo.lock | 160 ++++-------------- crates/g3-cli/src/ui_writer_impl.rs | 14 +- crates/g3-computer-control/Cargo.toml | 6 +- crates/g3-computer-control/build.rs | 63 +++++++ .../examples/test_vision.rs | 85 ++++++++++ crates/g3-computer-control/src/lib.rs | 5 +- crates/g3-computer-control/src/ocr/mod.rs | 26 +++ .../g3-computer-control/src/ocr/tesseract.rs | 84 +++++++++ crates/g3-computer-control/src/ocr/vision.rs | 103 +++++++++++ .../g3-computer-control/src/platform/macos.rs | 144 ++++++---------- .../vision-bridge/Package.swift | 24 +++ .../Sources/VisionBridge/VisionBridge.h | 39 +++++ .../Sources/VisionBridge/VisionOCR.swift | 145 ++++++++++++++++ crates/g3-core/src/lib.rs | 129 +++++++++++--- docs/coach-player-providers.md | 75 -------- 16 files changed, 785 insertions(+), 318 deletions(-) create mode 100644 crates/g3-computer-control/build.rs create mode 100644 crates/g3-computer-control/examples/test_vision.rs create mode 100644 crates/g3-computer-control/src/ocr/mod.rs create mode 100644 crates/g3-computer-control/src/ocr/tesseract.rs create mode 100644 crates/g3-computer-control/src/ocr/vision.rs create mode 100644 crates/g3-computer-control/vision-bridge/Package.swift create mode 100644 crates/g3-computer-control/vision-bridge/Sources/VisionBridge/VisionBridge.h create mode 100644 crates/g3-computer-control/vision-bridge/Sources/VisionBridge/VisionOCR.swift delete mode 100644 docs/coach-player-providers.md diff --git a/.gitignore b/.gitignore index fe29988..f9f70c3 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ # will have compiled files and executables debug target +.build # These are backup files generated by rustfmt **/*.rs.bk diff --git a/Cargo.lock b/Cargo.lock index 38b9eb8..a09efd0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -136,7 
+136,7 @@ checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -218,28 +218,6 @@ version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" -[[package]] -name = "bindgen" -version = "0.64.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4243e6031260db77ede97ad86c27e501d646a27ab57b59a574f725d98ab1fb4" -dependencies = [ - "bitflags 1.3.2", - "cexpr", - "clang-sys", - "lazy_static", - "lazycell", - "log", - "peeking_take_while", - "proc-macro2", - "quote", - "regex", - "rustc-hash", - "shlex", - "syn 1.0.109", - "which", -] - [[package]] name = "bindgen" version = "0.69.5" @@ -259,7 +237,7 @@ dependencies = [ "regex", "rustc-hash", "shlex", - "syn 2.0.107", + "syn", "which", ] @@ -433,7 +411,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -767,7 +745,7 @@ dependencies = [ "proc-macro2", "quote", "strict", - "syn 2.0.107", + "syn", ] [[package]] @@ -906,7 +884,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.107", + "syn", ] [[package]] @@ -917,7 +895,7 @@ checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" dependencies = [ "darling_core", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -939,7 +917,7 @@ dependencies = [ "proc-macro2", "quote", "rustc_version", - "syn 2.0.107", + "syn", ] [[package]] @@ -960,7 +938,7 @@ dependencies = [ "convert_case 0.7.1", "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -1023,7 +1001,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -1213,7 +1191,7 @@ checksum = "1a5c6c585bc94aaf2c7b51dd4c2ba22680844aba4c687be581871a6f518c5742" dependencies = [ "proc-macro2", 
"quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -1293,7 +1271,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -1374,7 +1352,6 @@ dependencies = [ "serde", "serde_json", "shellexpand", - "tesseract", "thiserror 1.0.69", "tokio", "tracing", @@ -1959,7 +1936,7 @@ dependencies = [ "indoc", "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -2080,7 +2057,7 @@ dependencies = [ "proc-macro2", "quote", "regex", - "syn 2.0.107", + "syn", ] [[package]] @@ -2101,28 +2078,6 @@ version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a79a3332a6609480d7d0c9eab957bca6b455b91bb84e66d19f5ff66294b85b8" -[[package]] -name = "leptonica-plumbing" -version = "1.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc7a74c43d6f090d39158d233f326f47cd8bba545217595c93662b4e31156f42" -dependencies = [ - "leptonica-sys", - "libc", - "thiserror 1.0.69", -] - -[[package]] -name = "leptonica-sys" -version = "0.4.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da627c72b2499a8106f4dd33143843015e4a631f445d561f3481f7fba35b6151" -dependencies = [ - "bindgen 0.64.0", - "pkg-config", - "vcpkg", -] - [[package]] name = "libc" version = "0.2.177" @@ -2203,7 +2158,7 @@ version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "037a1881ada3592c6a922224d5177b4b4f452e6b2979eb97393b71989e48357f" dependencies = [ - "bindgen 0.69.5", + "bindgen", "cc", "link-cplusplus", "once_cell", @@ -2478,7 +2433,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -2550,12 +2505,6 @@ version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"df94ce210e5bc13cb6651479fa48d14f601d9858cfe0467f43ae157023b938d3" -[[package]] -name = "peeking_take_while" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" - [[package]] name = "percent-encoding" version = "2.3.2" @@ -2592,7 +2541,7 @@ dependencies = [ "pest_meta", "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -2673,7 +2622,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" dependencies = [ "proc-macro2", - "syn 2.0.107", + "syn", ] [[package]] @@ -3078,7 +3027,7 @@ checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -3272,18 +3221,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.107", -] - -[[package]] -name = "syn" -version = "1.0.109" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" -dependencies = [ - "proc-macro2", - "quote", - "unicode-ident", + "syn", ] [[package]] @@ -3317,7 +3255,7 @@ checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -3370,40 +3308,6 @@ dependencies = [ "unicode-width 0.1.14", ] -[[package]] -name = "tesseract" -version = "0.14.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ee0c2c608b63817b095f7fded5c50add36a29e2be2b2fc4901357163329290a" -dependencies = [ - "tesseract-plumbing", - "tesseract-sys", - "thiserror 1.0.69", -] - -[[package]] -name = "tesseract-plumbing" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e496d3e29eba540a276975394b85dccb5fd344b3eefb743d9286c8150f766d5" 
-dependencies = [ - "leptonica-plumbing", - "tesseract-sys", - "thiserror 1.0.69", -] - -[[package]] -name = "tesseract-sys" -version = "0.5.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd33f6f216124cfaf0fa86c2c0cdf04da39b6257bd78c5e44fa4fa98c3a5857b" -dependencies = [ - "bindgen 0.64.0", - "leptonica-sys", - "pkg-config", - "vcpkg", -] - [[package]] name = "thiserror" version = "1.0.69" @@ -3430,7 +3334,7 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -3441,7 +3345,7 @@ checksum = "3ff15c8ecd7de3849db632e14d18d2571fa09dfc5ed93479bc4485c7a517c913" dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -3539,7 +3443,7 @@ checksum = "af407857209536a95c8e56f8231ef2c2e2aff839b22e07a1ffcbc617e9db9fa5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -3665,7 +3569,7 @@ checksum = "81383ab64e72a7a8b8e13130c49e3dab29def6d0c7d76a03087b3cf71c5c6903" dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -3870,7 +3774,7 @@ dependencies = [ "log", "proc-macro2", "quote", - "syn 2.0.107", + "syn", "wasm-bindgen-shared", ] @@ -3905,7 +3809,7 @@ checksum = "9f07d2f20d4da7b26400c9f4a0511e6e0345b040694e8a75bd41d578fa4421d7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -4077,7 +3981,7 @@ checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -4088,7 +3992,7 @@ checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -4484,7 +4388,7 @@ checksum = "38da3c9736e16c5d3c8c597a9aaa5d1fa565d0532ae05e27c24aa62fb32c0ab6" dependencies = [ "proc-macro2", "quote", - "syn 
2.0.107", + "syn", "synstructure", ] @@ -4505,7 +4409,7 @@ checksum = "88d2b8d9c68ad2b9e4340d7832716a4d21a22a1154777ad56ea55c51a9cf3831" dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -4525,7 +4429,7 @@ checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", "synstructure", ] @@ -4559,7 +4463,7 @@ checksum = "5b96237efa0c878c64bd89c436f661be4e46b2f3eff1ebb976f7ef2321d2f58f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] diff --git a/crates/g3-cli/src/ui_writer_impl.rs b/crates/g3-cli/src/ui_writer_impl.rs index 407e0d1..ec1a203 100644 --- a/crates/g3-cli/src/ui_writer_impl.rs +++ b/crates/g3-cli/src/ui_writer_impl.rs @@ -193,7 +193,12 @@ impl UiWriter for ConsoleUiWriter { // Truncate long values for display let display_value = if first_line.len() > 80 { - format!("{}...", &first_line[..77]) + // Use char_indices to safely truncate at character boundary + let truncate_at = first_line.char_indices() + .nth(77) + .map(|(i, _)| i) + .unwrap_or(first_line.len()); + format!("{}...", &first_line[..truncate_at]) } else { first_line.to_string() }; @@ -440,7 +445,12 @@ impl UiWriter for RetroTuiWriter { if caption.is_empty() && (key == "file_path" || key == "command" || key == "path") { // Truncate long values for the caption let truncated = if value.len() > 50 { - format!("{}...", &value[..47]) + // Use char_indices to safely truncate at character boundary + let truncate_at = value.char_indices() + .nth(47) + .map(|(i, _)| i) + .unwrap_or(value.len()); + format!("{}...", &value[..truncate_at]) } else { value.to_string() }; diff --git a/crates/g3-computer-control/Cargo.toml b/crates/g3-computer-control/Cargo.toml index 4300dc1..b9ed189 100644 --- a/crates/g3-computer-control/Cargo.toml +++ b/crates/g3-computer-control/Cargo.toml @@ -3,6 +3,9 @@ name = "g3-computer-control" version = "0.1.0" edition = "2021" 
+[build-dependencies] +# Only needed for building Swift bridge on macOS + [dependencies] # Workspace dependencies tokio = { workspace = true } @@ -20,9 +23,6 @@ async-trait = "0.1" # WebDriver support fantoccini = "0.21" -# OCR dependencies -tesseract = "0.14" - # macOS dependencies [target.'cfg(target_os = "macos")'.dependencies] core-graphics = "0.23" diff --git a/crates/g3-computer-control/build.rs b/crates/g3-computer-control/build.rs new file mode 100644 index 0000000..fed302c --- /dev/null +++ b/crates/g3-computer-control/build.rs @@ -0,0 +1,63 @@ +use std::env; +use std::path::PathBuf; +use std::process::Command; + +fn main() { + // Only build Vision bridge on macOS + if env::var("CARGO_CFG_TARGET_OS").unwrap() != "macos" { + return; + } + + println!("cargo:rerun-if-changed=vision-bridge/Sources/VisionBridge/VisionOCR.swift"); + println!("cargo:rerun-if-changed=vision-bridge/Sources/VisionBridge/VisionBridge.h"); + println!("cargo:rerun-if-changed=vision-bridge/Package.swift"); + + let manifest_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap()); + let vision_bridge_dir = manifest_dir.join("vision-bridge"); + + // Build Swift package + println!("cargo:warning=Building VisionBridge Swift package..."); + let build_status = Command::new("swift") + .args(&["build", "-c", "release"]) + .current_dir(&vision_bridge_dir) + .status() + .expect("Failed to build Swift package"); + + if !build_status.success() { + panic!("Swift build failed"); + } + + // Find the built library + let lib_path = vision_bridge_dir + .join(".build/release") + .canonicalize() + .expect("Failed to find .build/release directory"); + + // Copy the dylib to the output directory so it can be found at runtime + let target_dir = manifest_dir.parent().unwrap().parent().unwrap().join("target"); + let profile = env::var("PROFILE").unwrap_or_else(|_| "debug".to_string()); + let output_dir = target_dir.join(&profile); + + let dylib_src = lib_path.join("libVisionBridge.dylib"); + let dylib_dst = 
output_dir.join("libVisionBridge.dylib"); + + std::fs::copy(&dylib_src, &dylib_dst) + .expect(&format!("Failed to copy dylib from {} to {}", dylib_src.display(), dylib_dst.display())); + + println!("cargo:warning=Copied libVisionBridge.dylib to {}", dylib_dst.display()); + + // Add rpath so the dylib can be found at runtime + println!("cargo:rustc-link-arg=-Wl,-rpath,@executable_path"); + println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path"); + println!("cargo:rustc-link-search=native={}", lib_path.display()); + println!("cargo:rustc-link-lib=dylib=VisionBridge"); + + // Link required frameworks + println!("cargo:rustc-link-lib=framework=Vision"); + println!("cargo:rustc-link-lib=framework=AppKit"); + println!("cargo:rustc-link-lib=framework=Foundation"); + println!("cargo:rustc-link-lib=framework=CoreGraphics"); + println!("cargo:rustc-link-lib=framework=CoreImage"); + + println!("cargo:warning=VisionBridge built successfully at {}", lib_path.display()); +} diff --git a/crates/g3-computer-control/examples/test_vision.rs b/crates/g3-computer-control/examples/test_vision.rs new file mode 100644 index 0000000..5ff09a5 --- /dev/null +++ b/crates/g3-computer-control/examples/test_vision.rs @@ -0,0 +1,85 @@ +use g3_computer_control::ocr::{OCREngine, DefaultOCR}; +use anyhow::Result; + +#[tokio::main] +async fn main() -> Result<()> { + println!("๐Ÿงช Testing Apple Vision OCR"); + println!("===========================\n"); + + // Initialize OCR engine + println!("๐Ÿ“ฆ Initializing OCR engine..."); + let ocr = DefaultOCR::new()?; + println!("โœ… OCR engine: {}\n", ocr.name()); + + // Check if test image exists + let test_image = "/tmp/safari_test.png"; + if !std::path::Path::new(test_image).exists() { + println!("โš ๏ธ Test image not found: {}", test_image); + println!(" Creating a screenshot..."); + + let status = std::process::Command::new("screencapture") + .arg("-x") + .arg("-R") + .arg("0,0,1200,800") + .arg(test_image) + .status()?; + + if !status.success() { + 
anyhow::bail!("Failed to create screenshot"); + } + + println!("โœ… Screenshot created\n"); + } + + // Run OCR + println!("๐Ÿ” Running Apple Vision OCR on {}...", test_image); + let start = std::time::Instant::now(); + let locations = ocr.extract_text_with_locations(test_image).await?; + let duration = start.elapsed(); + + println!("โœ… OCR completed in {:.3}s\n", duration.as_secs_f64()); + + // Display results + println!("๐Ÿ“Š Results:"); + println!(" Found {} text elements\n", locations.len()); + + if locations.is_empty() { + println!("โš ๏ธ No text found in image"); + } else { + println!(" Top 20 results:"); + println!(" {:<4} {:<40} {:<15} {:<12} {:<8}", "#", "Text", "Position", "Size", "Conf"); + println!(" {}", "-".repeat(85)); + + for (i, loc) in locations.iter().take(20).enumerate() { + let text = if loc.text.len() > 37 { + format!("{}...", &loc.text[..37]) + } else { + loc.text.clone() + }; + + println!(" {:<4} {:<40} ({:>4},{:>4}) {:>4}x{:<4} {:.2}", + i + 1, + text, + loc.x, + loc.y, + loc.width, + loc.height, + loc.confidence + ); + } + + if locations.len() > 20 { + println!("\n ... 
and {} more", locations.len() - 20); + } + + // Performance comparison + println!("\n๐Ÿ“ˆ Performance:"); + println!(" OCR Speed: {:.3}s", duration.as_secs_f64()); + println!(" Text elements: {}", locations.len()); + println!(" Avg per element: {:.1}ms", duration.as_millis() as f64 / locations.len() as f64); + } + + println!("\nโœ… Test complete!"); + + Ok(()) +} diff --git a/crates/g3-computer-control/src/lib.rs b/crates/g3-computer-control/src/lib.rs index ad564b5..355a591 100644 --- a/crates/g3-computer-control/src/lib.rs +++ b/crates/g3-computer-control/src/lib.rs @@ -3,6 +3,7 @@ pub mod types; pub mod platform; +pub mod ocr; pub mod webdriver; pub mod macax; @@ -25,11 +26,11 @@ pub trait ComputerController: Send + Sync { async fn extract_text_from_screen(&self, region: Rect) -> Result; async fn extract_text_from_image(&self, path: &str) -> Result; async fn extract_text_with_locations(&self, path: &str) -> Result>; - async fn find_text_on_screen(&self, search_text: &str) -> Result>; + async fn find_text_in_app(&self, app_name: &str, search_text: &str) -> Result>; // Mouse operations fn move_mouse(&self, x: i32, y: i32) -> Result<()>; - fn click_at(&self, x: i32, y: i32) -> Result<()>; + fn click_at(&self, x: i32, y: i32, app_name: Option<&str>) -> Result<()>; } // Platform-specific constructor diff --git a/crates/g3-computer-control/src/ocr/mod.rs b/crates/g3-computer-control/src/ocr/mod.rs new file mode 100644 index 0000000..b651da3 --- /dev/null +++ b/crates/g3-computer-control/src/ocr/mod.rs @@ -0,0 +1,26 @@ +use crate::types::TextLocation; +use anyhow::Result; +use async_trait::async_trait; + +/// OCR engine trait for text recognition with bounding boxes +#[async_trait] +pub trait OCREngine: Send + Sync { + /// Extract text with locations from an image file + async fn extract_text_with_locations(&self, path: &str) -> Result>; + + /// Get the name of the OCR engine + fn name(&self) -> &str; +} + +// Platform-specific modules +#[cfg(target_os = "macos")] +pub 
mod vision; + +pub mod tesseract; + +// Re-export the default OCR engine for the platform +#[cfg(target_os = "macos")] +pub use vision::AppleVisionOCR as DefaultOCR; + +#[cfg(not(target_os = "macos"))] +pub use tesseract::TesseractOCR as DefaultOCR; diff --git a/crates/g3-computer-control/src/ocr/tesseract.rs b/crates/g3-computer-control/src/ocr/tesseract.rs new file mode 100644 index 0000000..d55fc3f --- /dev/null +++ b/crates/g3-computer-control/src/ocr/tesseract.rs @@ -0,0 +1,84 @@ +use super::OCREngine; +use crate::types::TextLocation; +use anyhow::Result; +use async_trait::async_trait; + +/// Tesseract OCR engine (fallback/cross-platform) +pub struct TesseractOCR; + +impl TesseractOCR { + pub fn new() -> Result { + // Check if tesseract is available + let tesseract_check = std::process::Command::new("which") + .arg("tesseract") + .output(); + + if tesseract_check.is_err() || !tesseract_check.as_ref().unwrap().status.success() { + anyhow::bail!("Tesseract OCR is not installed on your system.\n\n\ + To install tesseract:\n macOS: brew install tesseract\n \ + Linux: sudo apt-get install tesseract-ocr (Ubuntu/Debian)\n \ + sudo yum install tesseract (RHEL/CentOS)\n \ + Windows: Download from https://github.com/UB-Mannheim/tesseract/wiki\n\n\ + After installation, restart your terminal and try again."); + } + + Ok(Self) + } +} + +#[async_trait] +impl OCREngine for TesseractOCR { + async fn extract_text_with_locations(&self, path: &str) -> Result> { + // Use tesseract CLI with TSV output to get bounding boxes + let output = std::process::Command::new("tesseract") + .arg(path) + .arg("stdout") + .arg("tsv") + .output() + .map_err(|e| anyhow::anyhow!("Failed to run tesseract: {}", e))?; + + if !output.status.success() { + anyhow::bail!("Tesseract failed: {}", String::from_utf8_lossy(&output.stderr)); + } + + let tsv_text = String::from_utf8_lossy(&output.stdout); + let mut locations = Vec::new(); + + // Parse TSV output (skip header line) + for (i, line) in 
tsv_text.lines().enumerate() { + if i == 0 { continue; } // Skip header + + let parts: Vec<&str> = line.split('\t').collect(); + if parts.len() >= 12 { + // TSV format: level, page_num, block_num, par_num, line_num, word_num, + // left, top, width, height, conf, text + if let (Ok(x), Ok(y), Ok(w), Ok(h), Ok(conf), text) = ( + parts[6].parse::(), + parts[7].parse::(), + parts[8].parse::(), + parts[9].parse::(), + parts[10].parse::(), + parts[11], + ) { + let trimmed = text.trim(); + if !trimmed.is_empty() && conf > 0.0 { + locations.push(TextLocation { + text: trimmed.to_string(), + x, + y, + width: w, + height: h, + confidence: conf / 100.0, // Convert from 0-100 to 0-1 + }); + } + } + } + } + + Ok(locations) + } + + fn name(&self) -> &str { + "Tesseract OCR" + } +} diff --git a/crates/g3-computer-control/src/ocr/vision.rs b/crates/g3-computer-control/src/ocr/vision.rs new file mode 100644 index 0000000..d35491d --- /dev/null +++ b/crates/g3-computer-control/src/ocr/vision.rs @@ -0,0 +1,103 @@ +use super::OCREngine; +use crate::types::TextLocation; +use anyhow::{Result, Context}; +use async_trait::async_trait; +use std::ffi::{CStr, CString}; +use std::os::raw::{c_char, c_float, c_uint}; + +// FFI bindings to Swift VisionBridge +#[repr(C)] +struct VisionTextBox { + text: *const c_char, + text_len: c_uint, + x: i32, + y: i32, + width: i32, + height: i32, + confidence: c_float, +} + +extern "C" { + fn vision_recognize_text( + image_path: *const c_char, + image_path_len: c_uint, + out_boxes: *mut *mut std::ffi::c_void, + out_count: *mut c_uint, + ) -> bool; + + fn vision_free_boxes(boxes: *mut std::ffi::c_void, count: c_uint); +} + +/// Apple Vision Framework OCR engine +pub struct AppleVisionOCR; + +impl AppleVisionOCR { + pub fn new() -> Result { + Ok(Self) + } +} + +#[async_trait] +impl OCREngine for AppleVisionOCR { + async fn extract_text_with_locations(&self, path: &str) -> Result> { + // Convert path to C string + let c_path = CString::new(path) + 
.context("Failed to convert path to C string")?; + + let mut boxes_ptr: *mut std::ffi::c_void = std::ptr::null_mut(); + let mut count: c_uint = 0; + + // Call Swift Vision API + let success = unsafe { + vision_recognize_text( + c_path.as_ptr(), + path.len() as c_uint, + &mut boxes_ptr, + &mut count, + ) + }; + + if !success || boxes_ptr.is_null() { + anyhow::bail!("Apple Vision OCR failed"); + } + + // Convert C array to Rust Vec + let mut locations = Vec::new(); + + unsafe { + let typed_boxes = boxes_ptr as *const VisionTextBox; + let boxes_slice = std::slice::from_raw_parts(typed_boxes, count as usize); + + for box_data in boxes_slice { + // Convert C string to Rust String + let text = if !box_data.text.is_null() { + CStr::from_ptr(box_data.text) + .to_string_lossy() + .into_owned() + } else { + String::new() + }; + + if !text.is_empty() { + locations.push(TextLocation { + text, + x: box_data.x, + y: box_data.y, + width: box_data.width, + height: box_data.height, + confidence: box_data.confidence, + }); + } + } + + // Free the C array + vision_free_boxes(boxes_ptr, count); + } + + Ok(locations) + } + + fn name(&self) -> &str { + "Apple Vision Framework" + } +} diff --git a/crates/g3-computer-control/src/platform/macos.rs b/crates/g3-computer-control/src/platform/macos.rs index d2e6a0a..da1aa95 100644 --- a/crates/g3-computer-control/src/platform/macos.rs +++ b/crates/g3-computer-control/src/platform/macos.rs @@ -1,16 +1,21 @@ use crate::{ComputerController, types::{Rect, TextLocation}}; +use crate::ocr::{OCREngine, DefaultOCR}; use anyhow::{Result, Context}; use async_trait::async_trait; use std::path::Path; -use tesseract::Tesseract; pub struct MacOSController { - // Empty struct for now + ocr_engine: Box, + #[allow(dead_code)] + ocr_name: String, } impl MacOSController { pub fn new() -> Result { - Ok(Self {}) + let ocr = Box::new(DefaultOCR::new()?); + let ocr_name = ocr.name().to_string(); + tracing::info!("Initialized macOS controller with OCR engine: {}", 
ocr_name); + Ok(Self { ocr_engine: ocr, ocr_name }) } } @@ -90,95 +95,21 @@ impl ComputerController for MacOSController { } async fn extract_text_from_image(&self, path: &str) -> Result { - // Check if tesseract is available on the system - let tesseract_check = std::process::Command::new("which") - .arg("tesseract") - .output(); - - if tesseract_check.is_err() || !tesseract_check.as_ref().unwrap().status.success() { - anyhow::bail!("Tesseract OCR is not installed on your system.\n\n\ - To install tesseract:\n macOS: brew install tesseract\n \ - Linux: sudo apt-get install tesseract-ocr (Ubuntu/Debian)\n \ - sudo yum install tesseract (RHEL/CentOS)\n \ - Windows: Download from https://github.com/UB-Mannheim/tesseract/wiki\n\n\ - After installation, restart your terminal and try again."); - } - - // Initialize Tesseract - let tess = Tesseract::new(None, Some("eng")) - .map_err(|e| { - anyhow::anyhow!("Failed to initialize Tesseract: {}\n\n\ - This usually means:\n1. Tesseract is not properly installed\n\ - 2. Language data files are missing\n\nTo fix:\n \ - macOS: brew reinstall tesseract\n \ - Linux: sudo apt-get install tesseract-ocr-eng\n \ - Windows: Reinstall tesseract and ensure language files are included", e) - })?; - - let text = tess.set_image(path) - .map_err(|e| anyhow::anyhow!("Failed to load image '{}': {}", path, e))? 
- .get_text() - .map_err(|e| anyhow::anyhow!("Failed to extract text from image: {}", e))?; - - Ok(text) + // Extract all text and concatenate + let locations = self.ocr_engine.extract_text_with_locations(path).await?; + Ok(locations.iter().map(|loc| loc.text.as_str()).collect::>().join(" ")) } async fn extract_text_with_locations(&self, path: &str) -> Result> { - // For now, use tesseract CLI with TSV output to get bounding boxes - // This is a workaround since the Rust tesseract crate doesn't expose get_component_boxes - let output = std::process::Command::new("tesseract") - .arg(path) - .arg("stdout") - .arg("tsv") - .output() - .map_err(|e| anyhow::anyhow!("Failed to run tesseract: {}", e))?; - - if !output.status.success() { - anyhow::bail!("Tesseract failed: {}", String::from_utf8_lossy(&output.stderr)); - } - - let tsv_text = String::from_utf8_lossy(&output.stdout); - let mut locations = Vec::new(); - - // Parse TSV output (skip header line) - for (i, line) in tsv_text.lines().enumerate() { - if i == 0 { continue; } // Skip header - - let parts: Vec<&str> = line.split('\t').collect(); - if parts.len() >= 12 { - // TSV format: level, page_num, block_num, par_num, line_num, word_num, - // left, top, width, height, conf, text - if let (Ok(x), Ok(y), Ok(w), Ok(h), Ok(conf), text) = ( - parts[6].parse::(), - parts[7].parse::(), - parts[8].parse::(), - parts[9].parse::(), - parts[10].parse::(), - parts[11], - ) { - let trimmed = text.trim(); - if !trimmed.is_empty() && conf > 0.0 { - locations.push(TextLocation { - text: trimmed.to_string(), - x, - y, - width: w, - height: h, - confidence: conf / 100.0, // Convert from 0-100 to 0-1 - }); - } - } - } - } - - Ok(locations) + // Use the OCR engine + self.ocr_engine.extract_text_with_locations(path).await } - async fn find_text_on_screen(&self, search_text: &str) -> Result> { - // Take full screenshot + async fn find_text_in_app(&self, app_name: &str, search_text: &str) -> Result> { + // Take screenshot of specific 
app window let home = std::env::var("HOME").unwrap_or_else(|_| "/tmp".to_string()); - let temp_path = format!("{}/Desktop/g3_find_text_{}.png", home, uuid::Uuid::new_v4()); - self.take_screenshot(&temp_path, None, None).await?; + let temp_path = format!("{}/Desktop/g3_find_text_{}_{}.png", home, app_name, uuid::Uuid::new_v4()); + self.take_screenshot(&temp_path, None, Some(app_name)).await?; // Extract all text with locations let locations = self.extract_text_with_locations(&temp_path).await?; @@ -221,7 +152,44 @@ impl ComputerController for MacOSController { Ok(()) } - fn click_at(&self, x: i32, y: i32) -> Result<()> { + fn click_at(&self, x: i32, y: i32, app_name: Option<&str>) -> Result<()> { + // If app_name is provided, get window position and offset coordinates + let (global_x, global_y) = if let Some(app) = app_name { + // Get window position using AppleScript + let script = format!( + r#"tell application "{}" to get bounds of window 1"#, + app + ); + + let output = std::process::Command::new("osascript") + .arg("-e") + .arg(&script) + .output()?; + + if output.status.success() { + let bounds_str = String::from_utf8_lossy(&output.stdout); + // Parse bounds: "x1, y1, x2, y2" + let parts: Vec<&str> = bounds_str.trim().split(", ").collect(); + if parts.len() >= 2 { + if let (Ok(window_x), Ok(window_y)) = ( + parts[0].trim().parse::(), + parts[1].trim().parse::(), + ) { + // Offset relative coordinates by window position + (x + window_x, y + window_y) + } else { + (x, y) // Fallback to absolute coordinates + } + } else { + (x, y) // Fallback to absolute coordinates + } + } else { + (x, y) // Fallback to absolute coordinates + } + } else { + (x, y) // No app name, use absolute coordinates + }; + use core_graphics::event::{ CGEvent, CGEventTapLocation, CGEventType, CGMouseButton, }; @@ -233,7 +201,7 @@ impl ComputerController for MacOSController { let source = CGEventSource::new(CGEventSourceStateID::HIDSystemState) .ok().context("Failed to create event source")?; 
- let point = CGPoint::new(x as f64, y as f64); + let point = CGPoint::new(global_x as f64, global_y as f64); // Move mouse to position first let move_event = CGEvent::new_mouse_event( diff --git a/crates/g3-computer-control/vision-bridge/Package.swift b/crates/g3-computer-control/vision-bridge/Package.swift new file mode 100644 index 0000000..76d0503 --- /dev/null +++ b/crates/g3-computer-control/vision-bridge/Package.swift @@ -0,0 +1,24 @@ +// swift-tools-version:5.9 +import PackageDescription + +let package = Package( + name: "VisionBridge", + platforms: [ + .macOS(.v11) + ], + products: [ + .library( + name: "VisionBridge", + type: .dynamic, + targets: ["VisionBridge"] + ), + ], + targets: [ + .target( + name: "VisionBridge", + dependencies: [], + path: "Sources/VisionBridge", + publicHeadersPath: "." + ), + ] +) diff --git a/crates/g3-computer-control/vision-bridge/Sources/VisionBridge/VisionBridge.h b/crates/g3-computer-control/vision-bridge/Sources/VisionBridge/VisionBridge.h new file mode 100644 index 0000000..a83d1dc --- /dev/null +++ b/crates/g3-computer-control/vision-bridge/Sources/VisionBridge/VisionBridge.h @@ -0,0 +1,39 @@ +#ifndef VisionBridge_h +#define VisionBridge_h + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// Text box structure for FFI +typedef struct { + const char* text; + uint32_t text_len; + int32_t x; + int32_t y; + int32_t width; + int32_t height; + float confidence; +} VisionTextBox; + +// Recognize text in an image and return bounding boxes +// Returns true on success, false on failure +// Caller must free the returned boxes using vision_free_boxes +bool vision_recognize_text( + const char* image_path, + uint32_t image_path_len, + VisionTextBox** out_boxes, + uint32_t* out_count +); + +// Free memory allocated by vision_recognize_text +void vision_free_boxes(VisionTextBox* boxes, uint32_t count); + +#ifdef __cplusplus +} +#endif + +#endif /* VisionBridge_h */ diff --git 
a/crates/g3-computer-control/vision-bridge/Sources/VisionBridge/VisionOCR.swift b/crates/g3-computer-control/vision-bridge/Sources/VisionBridge/VisionOCR.swift new file mode 100644 index 0000000..5ff12d0 --- /dev/null +++ b/crates/g3-computer-control/vision-bridge/Sources/VisionBridge/VisionOCR.swift @@ -0,0 +1,145 @@ +import Foundation +import Vision +import AppKit +import CoreGraphics + +// MARK: - C Bridge Functions + +@_cdecl("vision_recognize_text") +public func vision_recognize_text( + _ imagePath: UnsafePointer, + _ imagePathLen: UInt32, + _ outBoxes: UnsafeMutablePointer, + _ outCount: UnsafeMutablePointer +) -> Bool { + // Convert C string to Swift String + guard let pathData = Data(bytes: imagePath, count: Int(imagePathLen)).withUnsafeBytes({ + String(bytes: $0, encoding: .utf8) + }) else { + return false + } + + let path = pathData.trimmingCharacters(in: .whitespaces) + + // Load image + guard let image = NSImage(contentsOfFile: path), + let cgImage = image.cgImage(forProposedRect: nil, context: nil, hints: nil) else { + return false + } + + // Perform OCR + var textBoxes: [CTextBox] = [] + let semaphore = DispatchSemaphore(value: 0) + var success = false + + let request = VNRecognizeTextRequest { request, error in + defer { semaphore.signal() } + + if let error = error { + print("Vision OCR error: \(error.localizedDescription)") + return + } + + guard let observations = request.results as? 
[VNRecognizedTextObservation] else { + return + } + + let imageSize = CGSize(width: cgImage.width, height: cgImage.height) + + for observation in observations { + guard let candidate = observation.topCandidates(1).first else { continue } + + let text = candidate.string + let boundingBox = observation.boundingBox + + // Convert normalized coordinates (bottom-left origin) to pixel coordinates (top-left origin) + let x = Int32(boundingBox.origin.x * imageSize.width) + let y = Int32((1.0 - boundingBox.origin.y - boundingBox.height) * imageSize.height) + let width = Int32(boundingBox.width * imageSize.width) + let height = Int32(boundingBox.height * imageSize.height) + + // Allocate C string for text + let cString = strdup(text) + + textBoxes.append(CTextBox( + text: cString, + text_len: UInt32(text.utf8.count), + x: x, + y: y, + width: width, + height: height, + confidence: observation.confidence + )) + } + + success = true + } + + // Configure request for best accuracy + request.recognitionLevel = .accurate + request.usesLanguageCorrection = true + request.recognitionLanguages = ["en-US"] + + // Perform request + let handler = VNImageRequestHandler(cgImage: cgImage, options: [:]) + do { + try handler.perform([request]) + } catch { + print("Vision request failed: \(error.localizedDescription)") + return false + } + + // Wait for completion + semaphore.wait() + + if !success { + return false + } + + // Allocate array for results + let boxesPtr = UnsafeMutablePointer.allocate(capacity: textBoxes.count) + for (index, box) in textBoxes.enumerated() { + boxesPtr[index] = box + } + + outBoxes.pointee = UnsafeMutableRawPointer(boxesPtr) + outCount.pointee = UInt32(textBoxes.count) + + return true +} + +@_cdecl("vision_free_boxes") +public func vision_free_boxes( + _ boxes: UnsafeMutableRawPointer, + _ count: UInt32 +) { + let typedBoxes = boxes.assumingMemoryBound(to: CTextBox.self) + for i in 0..? 
+ public let text_len: UInt32 + public let x: Int32 + public let y: Int32 + public let width: Int32 + public let height: Int32 + public let confidence: Float + + public init(text: UnsafePointer?, text_len: UInt32, x: Int32, y: Int32, width: Int32, height: Int32, confidence: Float) { + self.text = text + self.text_len = text_len + self.x = x + self.y = y + self.width = width + self.height = height + self.confidence = confidence + } +} diff --git a/crates/g3-core/src/lib.rs b/crates/g3-core/src/lib.rs index 14f36c3..dd3e52c 100644 --- a/crates/g3-core/src/lib.rs +++ b/crates/g3-core/src/lib.rs @@ -1825,7 +1825,7 @@ Template: }, Tool { name: "extract_text".to_string(), - description: "Extract text from a screen region or image file using OCR".to_string(), + description: "Extract text from a screen region or image file using OCR. Returns plain text only (no bounding boxes). For text with location/coordinates, use vision_find_text instead.".to_string(), input_schema: json!({ "type": "object", "properties": { @@ -2280,45 +2280,79 @@ Template: }); } + // Add extract_text_with_boxes tool (requires macax flag) + if enable_macax { + tools.push(Tool { + name: "extract_text_with_boxes".to_string(), + description: "Extract all text from an image file with bounding box coordinates for each text element. Returns JSON array with text, position (x, y), size (width, height), and confidence for each detected text. Uses Apple Vision Framework for precise sub-pixel accuracy.".to_string(), + input_schema: json!({ + "type": "object", + "properties": { + "path": { + "type": "string", + "description": "Path to image file to extract text from" + }, + "app_name": { + "type": "string", + "description": "Optional: Name of application to screenshot first (e.g., 'Safari', 'Things3'). If provided, takes screenshot of app before extracting text." 
+ } + }, + "required": ["path"] + }), + }); + } + // Add vision-guided tools (requires computer control) if enable_computer_control { // Add vision-guided tools tools.push(Tool { name: "vision_find_text".to_string(), - description: "Find text on screen and return its location (useful for locating UI elements)".to_string(), + description: "Find text in a specific application window and return its location with bounding box coordinates (x, y, width, height) and confidence score. Useful for locating UI elements. Uses Apple Vision Framework for precise sub-pixel accuracy.".to_string(), input_schema: json!({ "type": "object", "properties": { + "app_name": { + "type": "string", + "description": "Name of the application to search in (e.g., 'Things3', 'Safari', 'TextEdit')" + }, "text": { "type": "string", "description": "The text to search for on screen" } }, - "required": ["text"] + "required": ["app_name", "text"] }), }); tools.push(Tool { name: "vision_click_text".to_string(), - description: "Find text on screen and click on it (useful for clicking buttons, links, menu items)".to_string(), + description: "Find text in a specific application window and click on it (useful for clicking buttons, links, menu items)".to_string(), input_schema: json!({ "type": "object", "properties": { + "app_name": { + "type": "string", + "description": "Name of the application (e.g., 'Things3', 'Safari', 'TextEdit')" + }, "text": { "type": "string", "description": "The text to click on (e.g., 'Submit', 'OK', 'Cancel', '+')" } }, - "required": ["text"] + "required": ["app_name", "text"] }), }); tools.push(Tool { name: "vision_click_near_text".to_string(), - description: "Find text on screen and click near it (useful for clicking text fields next to labels)".to_string(), + description: "Find text in a specific application window and click near it (useful for clicking text fields next to labels)".to_string(), input_schema: json!({ "type": "object", "properties": { + "app_name": { + "type": 
"string", + "description": "Name of the application (e.g., 'Things3', 'Safari', 'TextEdit')" + }, "text": { "type": "string", "description": "The label text to find (e.g., 'Name:', 'Email:', 'Task:')" @@ -2333,7 +2367,7 @@ Template: "description": "Distance in pixels from the text (default: 50)" } }, - "required": ["text"] + "required": ["app_name", "text"] }), }); } @@ -4591,19 +4625,23 @@ Template: debug!("Processing vision_find_text tool call"); if let Some(controller) = &self.computer_controller { + let app_name = tool_call.args.get("app_name") + .and_then(|v| v.as_str()) + .ok_or_else(|| anyhow::anyhow!("Missing app_name parameter"))?; + let text = tool_call.args.get("text") .and_then(|v| v.as_str()) .ok_or_else(|| anyhow::anyhow!("Missing text parameter"))?; - match controller.find_text_on_screen(text).await { + match controller.find_text_in_app(app_name, text).await { Ok(Some(location)) => { Ok(format!( - "โœ… Found '{}' at position ({}, {}) with size {}x{} (confidence: {:.0}%)", - location.text, location.x, location.y, location.width, location.height, + "โœ… Found '{}' in {} at position ({}, {}) with size {}x{} (confidence: {:.0}%)", + location.text, app_name, location.x, location.y, location.width, location.height, location.confidence * 100.0 )) } - Ok(None) => Ok(format!("โŒ Could not find '{}' on screen", text)), + Ok(None) => Ok(format!("โŒ Could not find '{}' in {}", text, app_name)), Err(e) => Ok(format!("โŒ Error finding text: {}", e)), } } else { @@ -4614,32 +4652,83 @@ Template: debug!("Processing vision_click_text tool call"); if let Some(controller) = &self.computer_controller { + let app_name = tool_call.args.get("app_name") + .and_then(|v| v.as_str()) + .ok_or_else(|| anyhow::anyhow!("Missing app_name parameter"))?; + let text = tool_call.args.get("text") .and_then(|v| v.as_str()) .ok_or_else(|| anyhow::anyhow!("Missing text parameter"))?; - match controller.find_text_on_screen(text).await { + match controller.find_text_in_app(app_name, 
text).await { Ok(Some(location)) => { // Click on center of text let center_x = location.x + location.width / 2; let center_y = location.y + location.height / 2; - match controller.click_at(center_x, center_y) { - Ok(_) => Ok(format!("โœ… Clicked on '{}' at ({}, {})", text, center_x, center_y)), + match controller.click_at(center_x, center_y, Some(app_name)) { + Ok(_) => Ok(format!("โœ… Clicked on '{}' in {} at ({}, {})", text, app_name, center_x, center_y)), Err(e) => Ok(format!("โŒ Failed to click: {}", e)), } } - Ok(None) => Ok(format!("โŒ Could not find '{}' on screen", text)), + Ok(None) => Ok(format!("โŒ Could not find '{}' in {}", text, app_name)), Err(e) => Ok(format!("โŒ Error finding text: {}", e)), } } else { Ok("โŒ Computer control not enabled. Set computer_control.enabled = true in config.".to_string()) } } + "extract_text_with_boxes" => { + debug!("Processing extract_text_with_boxes tool call"); + + if !self.config.macax.enabled { + return Ok("โŒ extract_text_with_boxes requires --macax flag to be enabled".to_string()); + } + + if let Some(controller) = &self.computer_controller { + let path = tool_call.args.get("path") + .and_then(|v| v.as_str()) + .ok_or_else(|| anyhow::anyhow!("Missing path parameter"))?; + + // Optional: take screenshot of app first + let final_path = if let Some(app_name) = tool_call.args.get("app_name").and_then(|v| v.as_str()) { + let temp_path = format!("/tmp/g3_extract_boxes_{}.png", uuid::Uuid::new_v4()); + match controller.take_screenshot(&temp_path, None, Some(app_name)).await { + Ok(_) => temp_path, + Err(e) => return Ok(format!("โŒ Failed to take screenshot: {}", e)), + } + } else { + path.to_string() + }; + + // Extract text with locations + match controller.extract_text_with_locations(&final_path).await { + Ok(locations) => { + // Clean up temp file if we created one + if final_path != path { + let _ = std::fs::remove_file(&final_path); + } + + // Return as JSON + match serde_json::to_string_pretty(&locations) { 
+ Ok(json) => Ok(format!("โœ… Extracted {} text elements:\n{}", locations.len(), json)), + Err(e) => Ok(format!("โŒ Failed to serialize results: {}", e)), + } + } + Err(e) => Ok(format!("โŒ Failed to extract text: {}", e)), + } + } else { + Ok("โŒ Computer control not enabled. Set computer_control.enabled = true in config.".to_string()) + } + } "vision_click_near_text" => { debug!("Processing vision_click_near_text tool call"); if let Some(controller) = &self.computer_controller { + let app_name = tool_call.args.get("app_name") + .and_then(|v| v.as_str()) + .ok_or_else(|| anyhow::anyhow!("Missing app_name parameter"))?; + let text = tool_call.args.get("text") .and_then(|v| v.as_str()) .ok_or_else(|| anyhow::anyhow!("Missing text parameter"))?; @@ -4652,7 +4741,7 @@ Template: .and_then(|v| v.as_i64()) .unwrap_or(50) as i32; - match controller.find_text_on_screen(text).await { + match controller.find_text_in_app(app_name, text).await { Ok(Some(location)) => { // Calculate click position based on direction let (click_x, click_y) = match direction { @@ -4663,15 +4752,15 @@ Template: _ => (location.x + location.width + distance, location.y + location.height / 2), }; - match controller.click_at(click_x, click_y) { + match controller.click_at(click_x, click_y, Some(app_name)) { Ok(_) => Ok(format!( - "โœ… Clicked {} of '{}' at ({}, {})", - direction, text, click_x, click_y + "โœ… Clicked {} of '{}' in {} at ({}, {})", + direction, text, app_name, click_x, click_y )), Err(e) => Ok(format!("โŒ Failed to click: {}", e)), } } - Ok(None) => Ok(format!("โŒ Could not find '{}' on screen", text)), + Ok(None) => Ok(format!("โŒ Could not find '{}' in {}", text, app_name)), Err(e) => Ok(format!("โŒ Error finding text: {}", e)), } } else { diff --git a/docs/coach-player-providers.md b/docs/coach-player-providers.md deleted file mode 100644 index d1e05e4..0000000 --- a/docs/coach-player-providers.md +++ /dev/null @@ -1,75 +0,0 @@ -# Coach-Player Provider Configuration - -G3 now 
supports specifying different LLM providers for the coach and player agents when running in autonomous mode. This allows you to optimize for different requirements: - -- **Player**: The agent that implements code - might benefit from a faster, more cost-effective model -- **Coach**: The agent that reviews code - might benefit from a more powerful, analytical model - -## Configuration - -In your `config.toml` file, under the `[providers]` section, you can specify: - -```toml -[providers] -default_provider = "databricks" # Used for normal operations -coach = "databricks" # Provider for coach (code reviewer) -player = "anthropic" # Provider for player (code implementer) -``` - -If `coach` or `player` are not specified, they will default to using the `default_provider`. - -## Example Use Cases - -### Cost Optimization -Use a cheaper, faster model for initial implementations (player) and a more powerful model for review (coach): - -```toml -coach = "anthropic" # Claude Sonnet for thorough review -player = "anthropic" # Claude Haiku for quick implementation -``` - -### Speed vs Quality Trade-off -Use a local embedded model for fast iterations (player) and a cloud model for quality review (coach): - -```toml -coach = "databricks" # Cloud model for quality review -player = "embedded" # Local model for fast implementation -``` - -### Specialized Models -Use different models optimized for different tasks: - -```toml -coach = "databricks" # Model fine-tuned for code review -player = "openai" # Model optimized for code generation -``` - -## Requirements - -- Both providers must be properly configured in your config file -- Each provider must have valid credentials -- The models specified for each provider must be accessible - -## How It Works - -When running in autonomous mode (`g3 --autonomous`), the system will: - -1. Use the `player` provider (or default) for the initial implementation -2. Switch to the `coach` provider (or default) for code review -3. 
Return to the `player` provider for implementing feedback -4. Continue this cycle for the specified number of turns - -The providers are logged at startup so you can verify which models are being used: - -``` -🎮 Player provider: anthropic -👨‍🏫 Coach provider: databricks -ℹ️ Using different providers for player and coach -``` - -## Benefits - -- **Cost Efficiency**: Use expensive models only where they add the most value -- **Speed Optimization**: Use faster models for iterative development -- **Specialization**: Leverage models that excel at specific tasks -- **Flexibility**: Easy to experiment with different provider combinations