diff --git a/.gitignore b/.gitignore index fe29988..f9f70c3 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ # will have compiled files and executables debug target +.build # These are backup files generated by rustfmt **/*.rs.bk diff --git a/Cargo.lock b/Cargo.lock index b3cf969..7ec765c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,28 @@ # It is not intended for manual editing. version = 4 +[[package]] +name = "accessibility" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ac9f33ffc1ef16eddb2451c03c983e56a5182ac760c3f2733da55ba8f48eac4" +dependencies = [ + "accessibility-sys", + "cocoa 0.26.1", + "core-foundation 0.10.1", + "objc", + "thiserror 1.0.69", +] + +[[package]] +name = "accessibility-sys" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46a6a8e90a1d8b96a48249e7c8f5b4058447bea8847280db7bfccb6dcab6b8e1" +dependencies = [ + "core-foundation-sys", +] + [[package]] name = "adler2" version = "2.0.1" @@ -114,7 +136,7 @@ checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -196,28 +218,6 @@ version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" -[[package]] -name = "bindgen" -version = "0.64.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4243e6031260db77ede97ad86c27e501d646a27ab57b59a574f725d98ab1fb4" -dependencies = [ - "bitflags 1.3.2", - "cexpr", - "clang-sys", - "lazy_static", - "lazycell", - "log", - "peeking_take_while", - "proc-macro2", - "quote", - "regex", - "rustc-hash", - "shlex", - "syn 1.0.109", - "which", -] - [[package]] name = "bindgen" version = "0.69.5" @@ -237,7 +237,7 @@ dependencies = [ "regex", "rustc-hash", "shlex", - "syn 2.0.107", + "syn", "which", ] @@ -318,9 +318,9 @@ 
dependencies = [ [[package]] name = "cc" -version = "1.2.41" +version = "1.2.43" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac9fe6cdbb24b6ade63616c0a0688e45bb56732262c158df3c0c4bea4ca47cb7" +checksum = "739eb0f94557554b3ca9a86d2d37bebd49c5e6d0c1d2bda35ba5bdac830befc2" dependencies = [ "find-msvc-tools", "jobserver", @@ -411,7 +411,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -437,9 +437,25 @@ checksum = "f6140449f97a6e97f9511815c5632d84c8aacf8ac271ad77c559218161a1373c" dependencies = [ "bitflags 1.3.2", "block", - "cocoa-foundation", + "cocoa-foundation 0.1.2", "core-foundation 0.9.4", - "core-graphics", + "core-graphics 0.23.2", + "foreign-types 0.5.0", + "libc", + "objc", +] + +[[package]] +name = "cocoa" +version = "0.26.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad36507aeb7e16159dfe68db81ccc27571c3ccd4b76fb2fb72fc59e7a4b1b64c" +dependencies = [ + "bitflags 2.10.0", + "block", + "cocoa-foundation 0.2.1", + "core-foundation 0.10.1", + "core-graphics 0.24.0", "foreign-types 0.5.0", "libc", "objc", @@ -454,11 +470,24 @@ dependencies = [ "bitflags 1.3.2", "block", "core-foundation 0.9.4", - "core-graphics-types", + "core-graphics-types 0.1.3", "libc", "objc", ] +[[package]] +name = "cocoa-foundation" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81411967c50ee9a1fc11365f8c585f863a22a9697c89239c452292c40ba79b0d" +dependencies = [ + "bitflags 2.10.0", + "block", + "core-foundation 0.10.1", + "core-graphics-types 0.2.0", + "objc", +] + [[package]] name = "color_quant" version = "1.1.0" @@ -635,7 +664,20 @@ checksum = "c07782be35f9e1140080c6b96f0d44b739e2278479f64e02fdab4e32dfd8b081" dependencies = [ "bitflags 1.3.2", "core-foundation 0.9.4", - "core-graphics-types", + "core-graphics-types 0.1.3", + "foreign-types 0.5.0", + "libc", +] + +[[package]] +name = "core-graphics" +version = "0.24.0" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa95a34622365fa5bbf40b20b75dba8dfa8c94c734aea8ac9a5ca38af14316f1" +dependencies = [ + "bitflags 2.10.0", + "core-foundation 0.10.1", + "core-graphics-types 0.2.0", "foreign-types 0.5.0", "libc", ] @@ -651,6 +693,17 @@ dependencies = [ "libc", ] +[[package]] +name = "core-graphics-types" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d44a101f213f6c4cdc1853d4b78aef6db6bdfa3468798cc1d9912f4735013eb" +dependencies = [ + "bitflags 2.10.0", + "core-foundation 0.10.1", + "libc", +] + [[package]] name = "cpufeatures" version = "0.2.17" @@ -692,7 +745,7 @@ dependencies = [ "proc-macro2", "quote", "strict", - "syn 2.0.107", + "syn", ] [[package]] @@ -831,7 +884,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.107", + "syn", ] [[package]] @@ -842,14 +895,14 @@ checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" dependencies = [ "darling_core", "quote", - "syn 2.0.107", + "syn", ] [[package]] name = "deranged" -version = "0.5.4" +version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a41953f86f8a05768a6cda24def994fd2f424b04ec5c719cf89989779f199071" +checksum = "ececcb659e7ba858fb4f10388c250a7252eb0a27373f1a72b8748afdd248e587" dependencies = [ "powerfmt", ] @@ -864,7 +917,7 @@ dependencies = [ "proc-macro2", "quote", "rustc_version", - "syn 2.0.107", + "syn", ] [[package]] @@ -885,7 +938,7 @@ dependencies = [ "convert_case 0.7.1", "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -937,7 +990,7 @@ dependencies = [ "libc", "option-ext", "redox_users 0.5.2", - "windows-sys 0.61.2", + "windows-sys 0.59.0", ] [[package]] @@ -948,7 +1001,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -962,9 +1015,9 @@ dependencies = [ [[package]] name = 
"document-features" -version = "0.2.11" +version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95249b50c6c185bee49034bcb378a49dc2b5dff0be90ff6616d31d64febab05d" +checksum = "d4b8a88685455ed29a21542a33abd9cb6510b6b129abadabdcef0f4c55bc8f61" dependencies = [ "litrs", ] @@ -1009,7 +1062,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.61.2", + "windows-sys 0.52.0", ] [[package]] @@ -1091,9 +1144,9 @@ checksum = "52051878f80a721bb68ebfbc930e07b65ba72f2da88968ea5c06fd6ca3d3a127" [[package]] name = "flate2" -version = "1.1.4" +version = "1.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc5a4e564e38c699f2880d3fda590bedc2e69f3f84cd48b457bd892ce61d0aa9" +checksum = "bfe33edd8e85a12a67454e37f8c75e730830d83e313556ab9ebf9ee7fbeb3bfb" dependencies = [ "crc32fast", "miniz_oxide", @@ -1138,7 +1191,7 @@ checksum = "1a5c6c585bc94aaf2c7b51dd4c2ba22680844aba4c687be581871a6f518c5742" dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -1218,7 +1271,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -1287,18 +1340,18 @@ dependencies = [ name = "g3-computer-control" version = "0.1.0" dependencies = [ + "accessibility", "anyhow", "async-trait", - "cocoa", - "core-foundation 0.9.4", - "core-graphics", + "cocoa 0.25.0", + "core-foundation 0.10.1", + "core-graphics 0.23.2", "fantoccini", "image", "objc", "serde", "serde_json", "shellexpand", - "tesseract", "thiserror 1.0.69", "tokio", "tracing", @@ -1518,11 +1571,11 @@ checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" [[package]] name = "home" -version = "0.5.11" +version = "0.5.9" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "589533453244b0995c858700322199b2becb13b627df2851f64a2775d024abcf" +checksum = "e3d1354bf6b7235cb4a0576c2619fd4ed18183f689b12b006a0ee7329eeff9a5" dependencies = [ - "windows-sys 0.59.0", + "windows-sys 0.52.0", ] [[package]] @@ -1869,9 +1922,12 @@ dependencies = [ [[package]] name = "indoc" -version = "2.0.6" +version = "2.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4c7245a08504955605670dbf141fceab975f15ca21570696aebe9d2e71576bd" +checksum = "79cf5c93f93228cf8efb3ba362535fb11199ac548a09ce117c9b1adc3030d706" +dependencies = [ + "rustversion", +] [[package]] name = "instability" @@ -1883,7 +1939,7 @@ dependencies = [ "indoc", "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -1894,9 +1950,9 @@ checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130" [[package]] name = "is_terminal_polyfill" -version = "1.70.1" +version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" [[package]] name = "itertools" @@ -2004,7 +2060,7 @@ dependencies = [ "proc-macro2", "quote", "regex", - "syn 2.0.107", + "syn", ] [[package]] @@ -2025,28 +2081,6 @@ version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a79a3332a6609480d7d0c9eab957bca6b455b91bb84e66d19f5ff66294b85b8" -[[package]] -name = "leptonica-plumbing" -version = "1.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc7a74c43d6f090d39158d233f326f47cd8bba545217595c93662b4e31156f42" -dependencies = [ - "leptonica-sys", - "libc", - "thiserror 1.0.69", -] - -[[package]] -name = "leptonica-sys" -version = "0.4.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da627c72b2499a8106f4dd33143843015e4a631f445d561f3481f7fba35b6151" -dependencies = [ - "bindgen 0.64.0", 
- "pkg-config", - "vcpkg", -] - [[package]] name = "libc" version = "0.2.177" @@ -2102,9 +2136,9 @@ checksum = "241eaef5fd12c88705a01fc1066c48c4b36e0dd4377dcdc7ec3942cea7a69956" [[package]] name = "litrs" -version = "0.4.2" +version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f5e54036fe321fd421e10d732f155734c4e4afd610dd556d9a82833ab3ee0bed" +checksum = "11d3d7f243d5c5a8b9bb5d6dd2b1602c0cb0b9db1621bafc7ed66e35ff9fe092" [[package]] name = "llama_cpp" @@ -2127,7 +2161,7 @@ version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "037a1881ada3592c6a922224d5177b4b4f452e6b2979eb97393b71989e48357f" dependencies = [ - "bindgen 0.69.5", + "bindgen", "cc", "link-cplusplus", "once_cell", @@ -2220,14 +2254,14 @@ dependencies = [ [[package]] name = "mio" -version = "1.0.4" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78bed444cc8a2160f01cbcf811ef18cac863ad68ae8ca62092e8db51d51c761c" +checksum = "69d83b0086dc8ecf3ce9ae2874b2d1290252e2a30720bea58a5c6639b0092873" dependencies = [ "libc", "log", "wasi", - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] @@ -2299,7 +2333,7 @@ version = "0.50.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" dependencies = [ - "windows-sys 0.61.2", + "windows-sys 0.59.0", ] [[package]] @@ -2375,9 +2409,9 @@ checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" [[package]] name = "once_cell_polyfill" -version = "1.70.1" +version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" [[package]] name = "openssl" @@ -2402,7 +2436,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" 
dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -2474,12 +2508,6 @@ version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "df94ce210e5bc13cb6651479fa48d14f601d9858cfe0467f43ae157023b938d3" -[[package]] -name = "peeking_take_while" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" - [[package]] name = "percent-encoding" version = "2.3.2" @@ -2516,7 +2544,7 @@ dependencies = [ "pest_meta", "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -2597,14 +2625,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" dependencies = [ "proc-macro2", - "syn 2.0.107", + "syn", ] [[package]] name = "proc-macro2" -version = "1.0.101" +version = "1.0.103" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89ae43fd86e4158d6db51ad8e2b80f313af9cc74f5c0e03ccb87de09998732de" +checksum = "5ee95bc4ef87b8d5ba32e8b7714ccc834865276eab0aed5c9958d00ec45f49e8" dependencies = [ "unicode-ident", ] @@ -2876,7 +2904,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys 0.11.0", - "windows-sys 0.61.2", + "windows-sys 0.52.0", ] [[package]] @@ -3002,7 +3030,7 @@ checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -3097,9 +3125,9 @@ dependencies = [ [[package]] name = "signal-hook-mio" -version = "0.2.4" +version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34db1a06d485c9142248b7a054f034b349b212551f3dfd19c94d45a754a217cd" +checksum = "b75a19a7a740b25bc7944bdee6172368f988763b744e3d4dfe753f6b4ece40cc" dependencies = [ "libc", "mio", @@ -3196,25 +3224,14 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.107", + 
"syn", ] [[package]] name = "syn" -version = "1.0.109" +version = "2.0.108" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" -dependencies = [ - "proc-macro2", - "quote", - "unicode-ident", -] - -[[package]] -name = "syn" -version = "2.0.107" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a26dbd934e5451d21ef060c018dae56fc073894c5a7896f882928a76e6d081b" +checksum = "da58917d35242480a05c2897064da0a80589a2a0476c9a3f2fdc83b53502e917" dependencies = [ "proc-macro2", "quote", @@ -3241,7 +3258,7 @@ checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -3275,7 +3292,7 @@ dependencies = [ "getrandom 0.3.4", "once_cell", "rustix 1.1.2", - "windows-sys 0.61.2", + "windows-sys 0.52.0", ] [[package]] @@ -3294,40 +3311,6 @@ dependencies = [ "unicode-width 0.1.14", ] -[[package]] -name = "tesseract" -version = "0.14.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ee0c2c608b63817b095f7fded5c50add36a29e2be2b2fc4901357163329290a" -dependencies = [ - "tesseract-plumbing", - "tesseract-sys", - "thiserror 1.0.69", -] - -[[package]] -name = "tesseract-plumbing" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e496d3e29eba540a276975394b85dccb5fd344b3eefb743d9286c8150f766d5" -dependencies = [ - "leptonica-plumbing", - "tesseract-sys", - "thiserror 1.0.69", -] - -[[package]] -name = "tesseract-sys" -version = "0.5.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd33f6f216124cfaf0fa86c2c0cdf04da39b6257bd78c5e44fa4fa98c3a5857b" -dependencies = [ - "bindgen 0.64.0", - "leptonica-sys", - "pkg-config", - "vcpkg", -] - [[package]] name = "thiserror" version = "1.0.69" @@ -3354,7 +3337,7 @@ checksum = 
"4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -3365,7 +3348,7 @@ checksum = "3ff15c8ecd7de3849db632e14d18d2571fa09dfc5ed93479bc4485c7a517c913" dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -3463,7 +3446,7 @@ checksum = "af407857209536a95c8e56f8231ef2c2e2aff839b22e07a1ffcbc617e9db9fa5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -3589,7 +3572,7 @@ checksum = "81383ab64e72a7a8b8e13130c49e3dab29def6d0c7d76a03087b3cf71c5c6903" dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -3651,9 +3634,9 @@ checksum = "2896d95c02a80c6d6a5d6e953d479f5ddf2dfdb6a244441010e373ac0fb88971" [[package]] name = "unicode-ident" -version = "1.0.19" +version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f63a545481291138910575129486daeaf8ac54aee4387fe7906919f7830c7d9d" +checksum = "462eeb75aeb73aea900253ce739c8e18a67423fadf006037cd3ff27e82748a06" [[package]] name = "unicode-segmentation" @@ -3794,7 +3777,7 @@ dependencies = [ "log", "proc-macro2", "quote", - "syn 2.0.107", + "syn", "wasm-bindgen-shared", ] @@ -3829,7 +3812,7 @@ checksum = "9f07d2f20d4da7b26400c9f4a0511e6e0345b040694e8a75bd41d578fa4421d7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -3952,7 +3935,7 @@ version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" dependencies = [ - "windows-sys 0.61.2", + "windows-sys 0.48.0", ] [[package]] @@ -4001,7 +3984,7 @@ checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -4012,7 +3995,7 @@ checksum = 
"3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -4408,7 +4391,7 @@ checksum = "38da3c9736e16c5d3c8c597a9aaa5d1fa565d0532ae05e27c24aa62fb32c0ab6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", "synstructure", ] @@ -4429,7 +4412,7 @@ checksum = "88d2b8d9c68ad2b9e4340d7832716a4d21a22a1154777ad56ea55c51a9cf3831" dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -4449,7 +4432,7 @@ checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", "synstructure", ] @@ -4483,7 +4466,7 @@ checksum = "5b96237efa0c878c64bd89c436f661be4e46b2f3eff1ebb976f7ef2321d2f58f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] diff --git a/README.md b/README.md index a205213..8bb807c 100644 --- a/README.md +++ b/README.md @@ -72,6 +72,16 @@ G3 includes robust error handling with automatic retry logic: - Conversation history preservation through summaries - Dynamic token allocation for different providers (4k to 200k+ tokens) +### Interactive Control Commands +G3's interactive CLI includes control commands for manual context management: +- **`/compact`**: Manually trigger summarization to compact conversation history +- **`/thinnify`**: Manually trigger context thinning to replace large tool results with file references +- **`/readme`**: Reload README.md and AGENTS.md from disk without restarting +- **`/stats`**: Show detailed context and performance statistics +- **`/help`**: Display all available control commands + +These commands give you fine-grained control over context management, allowing you to proactively optimize token usage and refresh project documentation. See [Control Commands Documentation](docs/CONTROL_COMMANDS.md) for detailed usage. 
+ ### Tool Ecosystem - **File Operations**: Read, write, and edit files with line-range precision - **Shell Integration**: Execute system commands with output capture @@ -79,6 +89,7 @@ G3 includes robust error handling with automatic retry logic: - **TODO Management**: Read and write TODO lists with markdown checkbox format - **Computer Control** (Experimental): Automate desktop applications - Mouse and keyboard control + - macOS Accessibility API for native app automation (via `--macax` flag) - UI element inspection - Screenshot capture and window management - OCR text extraction from images and screen regions @@ -156,6 +167,19 @@ safaridriver --enable # Requires password **Usage**: Run G3 with the `--webdriver` flag to enable browser automation tools. +## macOS Accessibility API Tools + +G3 includes support for controlling macOS applications via the Accessibility API, allowing you to automate native macOS apps. + +**Available Tools**: `macax_list_apps`, `macax_get_frontmost_app`, `macax_activate_app`, `macax_get_ui_tree`, `macax_find_elements`, `macax_click`, `macax_set_value`, `macax_get_value`, `macax_press_key` + +**Setup**: Enable with the `--macax` flag or in config with `macax.enabled = true`. Grant accessibility permissions: +- **macOS**: System Preferences → Security & Privacy → Privacy → Accessibility → Add your terminal app + +**For detailed documentation**, see [macOS Accessibility Tools Guide](docs/macax-tools.md). + +**Note**: This is particularly useful for testing and automating apps you're building with G3, as you can add accessibility identifiers to your UI elements. 
+ ## Computer Control (Experimental) G3 can interact with your computer's GUI for automation tasks: diff --git a/crates/g3-cli/src/lib.rs b/crates/g3-cli/src/lib.rs index e901f22..355cffc 100644 --- a/crates/g3-cli/src/lib.rs +++ b/crates/g3-cli/src/lib.rs @@ -167,14 +167,12 @@ use tokio_util::sync::CancellationToken; use tracing::{error, info}; use g3_core::error_handling::{classify_error, ErrorType, RecoverableError}; -mod retro_tui; -mod theme; -mod tui; mod ui_writer_impl; -use retro_tui::RetroTui; -use theme::ColorTheme; -use tui::SimpleOutput; -use ui_writer_impl::{ConsoleUiWriter, RetroTuiWriter}; +mod simple_output; +use simple_output::SimpleOutput; +mod machine_ui_writer; +use machine_ui_writer::MachineUiWriter; +use ui_writer_impl::ConsoleUiWriter; #[derive(Parser)] #[command(name = "g3")] @@ -216,13 +214,13 @@ pub struct Cli { #[arg(long, value_name = "TEXT")] pub requirements: Option, - /// Use retro terminal UI (inspired by 80s sci-fi) + /// Interactive mode: prompt for requirements and save to requirements.md before starting autonomous mode #[arg(long)] - pub retro: bool, + pub interactive_requirements: bool, - /// Color theme for retro mode (default, dracula, or path to theme file) - #[arg(long, value_name = "THEME")] - pub theme: Option, + /// Enable machine-friendly output mode with JSON markers and stats + #[arg(long)] + pub machine: bool, /// Override the configured provider (anthropic, databricks, embedded, openai) #[arg(long, value_name = "PROVIDER")] @@ -235,13 +233,21 @@ pub struct Cli { /// Disable log file creation (no logs/ directory or session logs) #[arg(long)] pub quiet: bool, + + /// Enable macOS Accessibility API tools for native app automation + #[arg(long)] + pub macax: bool, + + /// Enable WebDriver browser automation tools + #[arg(long)] + pub webdriver: bool, } pub async fn run() -> Result<()> { let cli = Cli::parse(); // Only initialize logging if not in retro mode - if !cli.retro { + if !cli.machine { // Initialize logging with 
filtering use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt, EnvFilter}; @@ -279,16 +285,16 @@ pub async fn run() -> Result<()> { tracing_subscriber::registry().with(filter).init(); } - if !cli.retro { + if !cli.machine { info!("Starting G3 AI Coding Agent"); } // Set up workspace directory - let workspace_dir = if let Some(ws) = cli.workspace { - ws + let workspace_dir = if let Some(ws) = &cli.workspace { + ws.clone() } else if cli.autonomous { // For autonomous mode, use G3_WORKSPACE env var or default - setup_workspace_directory()? + setup_workspace_directory(cli.machine)? } else { // Default to current directory for interactive/single-shot mode std::env::current_dir()? @@ -303,9 +309,115 @@ pub async fn run() -> Result<()> { // Create project model let project = if cli.autonomous { - if let Some(requirements_text) = cli.requirements { + // Handle interactive requirements mode with AI enhancement + if cli.interactive_requirements { + println!("\nšŸ“ Interactive Requirements Mode"); + println!("================================\n"); + println!("Describe what you want to build (can be brief):"); + println!("Press Ctrl+D (Unix) or Ctrl+Z (Windows) when done.\n"); + + use std::io::{self, Read, Write}; + let mut requirements_input = String::new(); + io::stdin().read_to_string(&mut requirements_input)?; + + if requirements_input.trim().is_empty() { + anyhow::bail!("No requirements provided. Exiting."); + } + + println!("\nšŸ¤– Enhancing your requirements with AI...\n"); + + // Create a temporary agent to enhance the requirements + let temp_config = Config::load_with_overrides( + cli.config.as_deref(), + cli.provider.clone(), + cli.model.clone(), + )?; + + let ui_writer = ConsoleUiWriter::new(); + let mut temp_agent = Agent::new_with_readme_and_quiet( + temp_config, + ui_writer, + None, + true, // quiet mode + ).await?; + + // Craft the enhancement prompt + let enhancement_prompt = format!( + r#"You are a requirements analyst. 
Take this brief user input and expand it into a structured requirements document. + +USER INPUT: +{} + +Create a professional requirements document with: +1. A clear project title (# heading) +2. An overview section explaining what will be built +3. Organized requirements (functional, technical, quality) +4. Acceptance criteria +5. Any technical constraints or preferences mentioned + +Format as proper markdown. Be specific and actionable. If the user's input is vague, make reasonable assumptions but keep it focused on what they described. + +Output ONLY the markdown content, no explanations or meta-commentary."#, + requirements_input.trim() + ); + + // Execute enhancement task + let result = temp_agent + .execute_task_with_timing(&enhancement_prompt, None, false, false, false, false) + .await?; + + let enhanced_requirements = result.response.trim().to_string(); + + // Show the enhanced requirements + println!("\nšŸ“‹ Enhanced Requirements Document:"); + println!("{}\n", "=".repeat(60)); + println!("{}", enhanced_requirements); + println!("{}\n", "=".repeat(60)); + + // Ask for confirmation + println!("\nā“ Is this requirements document acceptable?"); + println!(" [y] Yes, proceed with autonomous mode"); + println!(" [e] Edit and save manually"); + println!(" [n] No, cancel\n"); + + print!("Your choice (y/e/n): "); + io::stdout().flush()?; + + let mut choice = String::new(); + io::stdin().read_line(&mut choice)?; + let choice = choice.trim().to_lowercase(); + + let requirements_path = workspace_dir.join("requirements.md"); + + match choice.as_str() { + "y" | "yes" => { + // Save enhanced requirements + std::fs::write(&requirements_path, &enhanced_requirements)?; + println!("\nāœ… Requirements saved to: {}", requirements_path.display()); + println!("šŸš€ Starting autonomous mode...\n"); + } + "e" | "edit" => { + // Save enhanced requirements for manual editing + std::fs::write(&requirements_path, &enhanced_requirements)?; + println!("\nāœ… Requirements saved to: 
{}", requirements_path.display()); + println!("šŸ“ Please edit the file and run: g3 --autonomous"); + println!(" Exiting for now.\n"); + return Ok(()); + } + "n" | "no" => { + println!("\nāŒ Cancelled. No files were saved.\n"); + return Ok(()); + } + _ => { + println!("\nāŒ Invalid choice. Cancelled.\n"); + return Ok(()); + } + } + } + + if let Some(requirements_text) = &cli.requirements { // Use requirements text override - Project::new_autonomous_with_requirements(workspace_dir.clone(), requirements_text)? + Project::new_autonomous_with_requirements(workspace_dir.clone(), requirements_text.clone())? } else { // Use traditional requirements.md file Project::new_autonomous(workspace_dir.clone())? @@ -318,17 +430,30 @@ pub async fn run() -> Result<()> { project.ensure_workspace_exists()?; project.enter_workspace()?; - if !cli.retro { + if !cli.machine { info!("Using workspace: {}", project.workspace().display()); } // Load configuration with CLI overrides - let config = Config::load_with_overrides( + let mut config = Config::load_with_overrides( cli.config.as_deref(), cli.provider.clone(), cli.model.clone(), )?; + // Apply macax flag override + if cli.macax { + config.macax.enabled = true; + if !cli.machine { + info!("macOS Accessibility API tools enabled"); + } + } + + // Apply webdriver flag override + if cli.webdriver { + config.webdriver.enabled = true; + } + // Validate provider if specified if let Some(ref provider) = cli.provider { let valid_providers = ["anthropic", "databricks", "embedded", "openai"]; @@ -342,7 +467,7 @@ pub async fn run() -> Result<()> { } // Initialize agent - let ui_writer = ConsoleUiWriter::new(); + // ui_writer will be created conditionally based on machine mode // Combine AGENTS.md and README content if both exist let combined_content = match (agents_content.clone(), readme_content.clone()) { @@ -354,28 +479,117 @@ pub async fn run() -> Result<()> { (None, None) => None, }; - let mut agent = if cli.autonomous { - 
Agent::new_autonomous_with_readme_and_quiet( - config.clone(), - ui_writer, - combined_content.clone(), - cli.quiet, - ) - .await? + // Execute task, autonomous mode, or start interactive mode based on machine mode + if cli.machine { + // Machine mode - use MachineUiWriter + let ui_writer = MachineUiWriter::new(); + + let agent = if cli.autonomous { + Agent::new_autonomous_with_readme_and_quiet( + config.clone(), + ui_writer, + combined_content.clone(), + cli.quiet, + ) + .await? + } else { + Agent::new_with_readme_and_quiet( + config.clone(), + ui_writer, + combined_content.clone(), + cli.quiet, + ) + .await? + }; + + run_with_machine_mode(agent, cli, project).await?; } else { - Agent::new_with_readme_and_quiet( - config.clone(), - ui_writer, - combined_content.clone(), - cli.quiet, - ) - .await? + // Normal mode - use ConsoleUiWriter + let ui_writer = ConsoleUiWriter::new(); + + let agent = if cli.autonomous { + Agent::new_autonomous_with_readme_and_quiet( + config.clone(), + ui_writer, + combined_content.clone(), + cli.quiet, + ) + .await? + } else { + Agent::new_with_readme_and_quiet( + config.clone(), + ui_writer, + combined_content.clone(), + cli.quiet, + ) + .await? + }; + + run_with_console_mode(agent, cli, project, combined_content).await?; + } + + Ok(()) +} + +// Simplified machine mode version of autonomous mode +async fn run_autonomous_machine( + mut agent: Agent, + project: Project, + show_prompt: bool, + show_code: bool, + max_turns: usize, + _quiet: bool, +) -> Result<()> { + println!("AUTONOMOUS_MODE_STARTED"); + println!("WORKSPACE: {}", project.workspace().display()); + println!("MAX_TURNS: {}", max_turns); + + // Check if requirements exist + if !project.has_requirements() { + println!("ERROR: requirements.md not found in workspace directory"); + return Ok(()); + } + + // Read requirements + let requirements = match project.read_requirements()? 
{ + Some(content) => content, + None => { + println!("ERROR: Could not read requirements"); + return Ok(()); + } }; + println!("REQUIREMENTS_LOADED"); + + // For now, just execute a simple autonomous loop + // This is a simplified version - full implementation would need coach-player loop + let task = format!( + "You are G3 in implementation mode. Read and implement the following requirements:\n\n{}\n\nImplement this step by step, creating all necessary files and code.", + requirements + ); + + println!("TASK_START"); + let result = agent.execute_task_with_timing(&task, None, false, show_prompt, show_code, true).await?; + println!("AGENT_RESPONSE:"); + println!("{}", result.response); + println!("END_AGENT_RESPONSE"); + println!("TASK_END"); + + println!("AUTONOMOUS_MODE_ENDED"); + Ok(()) +} + +async fn run_with_console_mode( + mut agent: Agent, + cli: Cli, + project: Project, + combined_content: Option, +) -> Result<()> { + // Execute task, autonomous mode, or start interactive mode if cli.autonomous { // Autonomous mode with coach-player feedback loop - if !cli.retro { + if !cli.machine { info!("Starting autonomous mode"); } run_autonomous( @@ -389,7 +603,7 @@ pub async fn run() -> Result<()> { .await?; } else if let Some(task) = cli.task { // Single-shot mode - if !cli.retro { + if !cli.machine { info!("Executing task: {}", task); } let output = SimpleOutput::new(); @@ -399,26 +613,43 @@ pub async fn run() -> Result<()> { output.print_smart(&result.response); } else { // Interactive mode (default) - if !cli.retro { + if !cli.machine { info!("Starting interactive mode"); } + println!("šŸ“ Workspace: {}", project.workspace().display()); + run_interactive(agent, cli.show_prompt, cli.show_code, combined_content).await?; + } - if cli.retro { - // Use retro terminal UI - run_interactive_retro( - config, // Already has overrides applied - cli.show_prompt, - cli.show_code, - cli.theme, - combined_content, - ) + Ok(()) +} + +async fn run_with_machine_mode( + mut agent: 
Agent, + cli: Cli, + project: Project, +) -> Result<()> { + if cli.autonomous { + // Autonomous mode with coach-player feedback loop + run_autonomous_machine( + agent, + project, + cli.show_prompt, + cli.show_code, + cli.max_turns, + cli.quiet, + ) + .await?; + } else if let Some(task) = cli.task { + // Single-shot mode + let result = agent + .execute_task_with_timing(&task, None, false, cli.show_prompt, cli.show_code, true) .await?; - } else { - // Use standard terminal UI - let output = SimpleOutput::new(); - output.print(&format!("šŸ“ Workspace: {}", project.workspace().display())); - run_interactive(agent, cli.show_prompt, cli.show_code, combined_content).await?; - } + println!("AGENT_RESPONSE:"); + println!("{}", result.response); + println!("END_AGENT_RESPONSE"); + } else { + // Interactive mode + run_interactive_machine(agent, cli.show_prompt, cli.show_code).await?; } Ok(()) @@ -527,8 +758,8 @@ fn extract_readme_heading(readme_content: &str) -> Option { let trimmed = line.trim(); // Check for H1 heading (# Title) - if trimmed.starts_with("# ") { - let title = trimmed[2..].trim(); + if let Some(stripped) = trimmed.strip_prefix("# ") { + let title = stripped.trim(); if !title.is_empty() { // Return the full title (including any description after dash) return Some(title.to_string()); @@ -560,275 +791,6 @@ fn extract_readme_heading(readme_content: &str) -> Option { None } -async fn run_interactive_retro( - config: Config, - show_prompt: bool, - show_code: bool, - theme_name: Option, - combined_content: Option, -) -> Result<()> { - use crossterm::event::{self, Event, KeyCode, KeyModifiers}; - use std::time::Duration; - - // Set environment variable to suppress println in other crates - std::env::set_var("G3_RETRO_MODE", "1"); - - // Load the color theme - let theme = match ColorTheme::load(theme_name.as_deref()) { - Ok(t) => t, - Err(e) => { - eprintln!("Failed to load theme: {}. 
Using default.", e); - ColorTheme::default() - } - }; - - // Initialize the retro terminal UI - let tui = RetroTui::start(theme).await?; - - // Create agent with RetroTuiWriter - let ui_writer = RetroTuiWriter::new(tui.clone()); - let mut agent = Agent::new_with_readme_and_quiet(config, ui_writer, combined_content.clone(), false).await?; - - // Display initial system messages - tui.output("SYSTEM: AGENT ONLINE\n\n"); - - // Display message if AGENTS.md or README was loaded - if let Some(ref content) = combined_content { - // Check what was loaded - let has_agents = content.contains("Agent Configuration"); - let has_readme = content.contains("Project README"); - - if has_agents { - tui.output("SYSTEM: AGENT CONFIGURATION LOADED\n\n"); - } - - if has_readme { - // Extract the first heading or title from the README - let readme_snippet = extract_readme_heading(content) - .unwrap_or_else(|| "PROJECT DOCUMENTATION LOADED".to_string()); - - tui.output(&format!( - "SYSTEM: PROJECT README LOADED - {}\n\n", - readme_snippet - )); - } - } - tui.output("SYSTEM: READY FOR INPUT\n\n"); - tui.output("\n\n"); - - // Display provider and model information - match agent.get_provider_info() { - Ok((provider, model)) => { - tui.update_provider_info(&provider, &model); - } - Err(e) => { - tui.update_provider_info("ERROR", &e.to_string()); - } - } - - // Track multiline input - let mut multiline_buffer = String::new(); - let mut in_multiline = false; - - // Main event loop - loop { - // Update context window display - let context = agent.get_context_window(); - tui.update_context( - context.used_tokens, - context.total_tokens, - context.percentage_used(), - ); - - // Poll for keyboard events - if event::poll(Duration::from_millis(50))? { - if let Event::Key(key) = event::read()? 
{ - match key.code { - KeyCode::Char('c') if key.modifiers.contains(KeyModifiers::CONTROL) => { - tui.exit(); - break; - } - KeyCode::Char('d') if key.modifiers.contains(KeyModifiers::CONTROL) => { - tui.exit(); - break; - } - // Emacs/bash-like shortcuts - KeyCode::Char('a') if key.modifiers.contains(KeyModifiers::CONTROL) => { - tui.cursor_home(); - } - KeyCode::Char('e') if key.modifiers.contains(KeyModifiers::CONTROL) => { - tui.cursor_end(); - } - KeyCode::Char('w') if key.modifiers.contains(KeyModifiers::CONTROL) => { - tui.delete_word(); - } - KeyCode::Char('k') if key.modifiers.contains(KeyModifiers::CONTROL) => { - tui.delete_to_end(); - } - KeyCode::Char('u') if key.modifiers.contains(KeyModifiers::CONTROL) => { - // Delete from beginning to cursor (similar to Ctrl-K but opposite direction) - let (input_buffer, cursor_pos) = tui.get_input_state(); - if cursor_pos > 0 { - let after = input_buffer.chars().skip(cursor_pos).collect::(); - tui.update_input(&after); - tui.cursor_home(); - } - } - KeyCode::Left => { - tui.cursor_left(); - } - KeyCode::Right => { - tui.cursor_right(); - } - KeyCode::Home if !key.modifiers.contains(KeyModifiers::CONTROL) => { - tui.cursor_home(); - } - KeyCode::End if !key.modifiers.contains(KeyModifiers::CONTROL) => { - tui.cursor_end(); - } - KeyCode::Delete => { - tui.delete_char(); - } - KeyCode::Enter => { - let (input_buffer, _) = tui.get_input_state(); - if !input_buffer.is_empty() { - // Clear the input for next command - tui.update_input(""); - let trimmed = input_buffer.trim_end(); - - // Check if line ends with backslash for continuation - if trimmed.ends_with('\\') { - // Remove the backslash and add to buffer - let without_backslash = &trimmed[..trimmed.len() - 1]; - multiline_buffer.push_str(without_backslash); - multiline_buffer.push('\n'); - in_multiline = true; - tui.status("MULTILINE INPUT"); - continue; - } - - // If we're in multiline mode and no backslash, this is the final line - let final_input = if 
in_multiline { - multiline_buffer.push_str(&input_buffer); - in_multiline = false; - let result = multiline_buffer.clone(); - multiline_buffer.clear(); - tui.status("READY"); - result - } else { - input_buffer.clone() - }; - - let input = final_input.trim().to_string(); - if input.is_empty() { - continue; - } - - if input == "exit" || input == "quit" { - tui.exit(); - break; - } - - // Execute the task - tui.output(&format!("> {}", input)); - tui.status("PROCESSING"); - - const MAX_TIMEOUT_RETRIES: u32 = 3; - let mut attempt = 0; - - loop { - attempt += 1; - - match agent - .execute_task_with_timing( - &input, - None, - false, - show_prompt, - show_code, - true, - ) - .await - { - Ok(result) => { - if attempt > 1 { - tui.output(&format!( - "SYSTEM: REQUEST SUCCEEDED AFTER {} ATTEMPTS", - attempt - )); - } - tui.output(&result.response); - tui.status("READY"); - break; - } - Err(e) => { - // Check if this is a timeout error that we should retry - let error_type = classify_error(&e); - - if matches!( - error_type, - ErrorType::Recoverable(RecoverableError::Timeout) - ) && attempt < MAX_TIMEOUT_RETRIES - { - // Calculate retry delay with exponential backoff - let delay_ms = 1000 * (2_u64.pow(attempt - 1)); - let delay = std::time::Duration::from_millis(delay_ms); - - tui.output(&format!("SYSTEM: TIMEOUT ERROR (ATTEMPT {}/{}). 
RETRYING IN {:?}...", - attempt, MAX_TIMEOUT_RETRIES, delay)); - tui.status("RETRYING"); - - // Wait before retrying - tokio::time::sleep(delay).await; - continue; - } - - // For non-timeout errors or after max retries - tui.error(&format!("Task execution failed: {}", e)); - tui.status("ERROR"); - break; - } - } - } - } - } - KeyCode::Char(c) => { - tui.insert_char(c); - } - KeyCode::Backspace => { - tui.backspace(); - } - KeyCode::Up => { - tui.scroll_up(); - } - KeyCode::Down => { - tui.scroll_down(); - } - KeyCode::PageUp => { - tui.scroll_page_up(); - } - KeyCode::PageDown => { - tui.scroll_page_down(); - } - KeyCode::Home if key.modifiers.contains(KeyModifiers::CONTROL) => { - tui.scroll_home(); // Ctrl+Home for scrolling to top - } - KeyCode::End if key.modifiers.contains(KeyModifiers::CONTROL) => { - tui.scroll_end(); // Ctrl+End for scrolling to bottom - } - _ => {} - } - } - } - - // Small delay to prevent CPU spinning - tokio::time::sleep(Duration::from_millis(10)).await; - } - - tui.output("SYSTEM: SHUTDOWN INITIATED"); - Ok(()) -} - async fn run_interactive( mut agent: Agent, show_prompt: bool, @@ -903,9 +865,8 @@ async fn run_interactive( let trimmed = line.trim_end(); // Check if line ends with backslash for continuation - if trimmed.ends_with('\\') { + if let Some(without_backslash) = trimmed.strip_suffix('\\') { // Remove the backslash and add to buffer - let without_backslash = &trimmed[..trimmed.len() - 1]; multiline_buffer.push_str(without_backslash); multiline_buffer.push('\n'); in_multiline = true; @@ -948,6 +909,62 @@ async fn run_interactive( // Add to history rl.add_history_entry(&input)?; + // Check for control commands + if input.starts_with('/') { + match input.as_str() { + "/help" => { + output.print(""); + output.print("šŸ“– Control Commands:"); + output.print(" /compact - Trigger auto-summarization (compacts conversation history)"); + output.print(" /thinnify - Trigger context thinning (replaces large tool results with file 
references)"); + output.print(" /readme - Reload README.md and AGENTS.md from disk"); + output.print(" /stats - Show detailed context and performance statistics"); + output.print(" /help - Show this help message"); + output.print(" exit/quit - Exit the interactive session"); + output.print(""); + continue; + } + "/compact" => { + output.print("šŸ—œļø Triggering manual summarization..."); + match agent.force_summarize().await { + Ok(true) => { + output.print("āœ… Summarization completed successfully"); + } + Ok(false) => { + output.print("āš ļø Summarization failed"); + } + Err(e) => { + output.print(&format!("āŒ Error during summarization: {}", e)); + } + } + continue; + } + "/thinnify" => { + let summary = agent.force_thin(); + println!("{}", summary); + continue; + } + "/readme" => { + output.print("šŸ“š Reloading README.md and AGENTS.md..."); + match agent.reload_readme() { + Ok(true) => output.print("āœ… README content reloaded successfully"), + Ok(false) => output.print("āš ļø No README was loaded at startup, cannot reload"), + Err(e) => output.print(&format!("āŒ Error reloading README: {}", e)), + } + continue; + } + "/stats" => { + let stats = agent.get_stats(); + output.print(&stats); + continue; + } + _ => { + output.print(&format!("āŒ Unknown command: {}. 
Type /help for available commands.", input)); + continue; + } + } + } + // Process the single line input execute_task(&mut agent, &input, show_prompt, show_code, &output).await; } @@ -1062,6 +1079,199 @@ async fn execute_task( } } +async fn run_interactive_machine( + mut agent: Agent, + show_prompt: bool, + show_code: bool, +) -> Result<()> { + println!("INTERACTIVE_MODE_STARTED"); + + // Display provider and model information + match agent.get_provider_info() { + Ok((provider, model)) => { + println!("PROVIDER: {}", provider); + println!("MODEL: {}", model); + } + Err(e) => { + println!("ERROR: Failed to get provider info: {}", e); + } + } + + // Initialize rustyline editor with history + let mut rl = DefaultEditor::new()?; + + // Try to load history from a file in the user's home directory + let history_file = dirs::home_dir().map(|mut path| { + path.push(".g3_history"); + path + }); + + if let Some(ref history_path) = history_file { + let _ = rl.load_history(history_path); + } + + loop { + let readline = rl.readline(""); + match readline { + Ok(line) => { + let input = line.trim().to_string(); + + if input.is_empty() { + continue; + } + + if input == "exit" || input == "quit" { + break; + } + + // Add to history + rl.add_history_entry(&input)?; + + // Check for control commands + if input.starts_with('/') { + match input.as_str() { + "/compact" => { + println!("COMMAND: compact"); + match agent.force_summarize().await { + Ok(true) => println!("RESULT: Summarization completed"), + Ok(false) => println!("RESULT: Summarization failed"), + Err(e) => println!("ERROR: {}", e), + } + continue; + } + "/thinnify" => { + println!("COMMAND: thinnify"); + let summary = agent.force_thin(); + println!("{}", summary); + continue; + } + "/readme" => { + println!("COMMAND: readme"); + match agent.reload_readme() { + Ok(true) => println!("RESULT: README content reloaded successfully"), + Ok(false) => println!("RESULT: No README was loaded at startup, cannot reload"), + Err(e) => 
println!("ERROR: {}", e), + } + continue; + } + "/stats" => { + println!("COMMAND: stats"); + let stats = agent.get_stats(); + // Emit stats as structured data (name: value pairs) + println!("{}", stats); + continue; + } + "/help" => { + println!("COMMAND: help"); + println!("AVAILABLE_COMMANDS: /compact /thinnify /readme /stats /help"); + continue; + } + _ => { + println!("ERROR: Unknown command: {}", input); + continue; + } + } + } + + // Execute task + println!("TASK_START"); + execute_task_machine(&mut agent, &input, show_prompt, show_code).await; + println!("TASK_END"); + } + Err(ReadlineError::Interrupted) => continue, + Err(ReadlineError::Eof) => break, + Err(err) => { + println!("ERROR: {:?}", err); + break; + } + } + } + + // Save history before exiting + if let Some(ref history_path) = history_file { + let _ = rl.save_history(history_path); + } + + println!("INTERACTIVE_MODE_ENDED"); + Ok(()) +} + +async fn execute_task_machine( + agent: &mut Agent, + input: &str, + show_prompt: bool, + show_code: bool, +) { + const MAX_TIMEOUT_RETRIES: u32 = 3; + let mut attempt = 0; + + // Create cancellation token for this request + let cancellation_token = CancellationToken::new(); + let cancel_token_clone = cancellation_token.clone(); + + loop { + attempt += 1; + + // Execute task with cancellation support + let execution_result = tokio::select! 
{ + result = agent.execute_task_with_timing_cancellable( + input, None, false, show_prompt, show_code, true, cancellation_token.clone() + ) => { + result + } + _ = tokio::signal::ctrl_c() => { + cancel_token_clone.cancel(); + println!("CANCELLED"); + return; + } + }; + + match execution_result { + Ok(result) => { + if attempt > 1 { + println!("RETRY_SUCCESS: attempt {}", attempt); + } + println!("AGENT_RESPONSE:"); + println!("{}", result.response); + println!("END_AGENT_RESPONSE"); + return; + } + Err(e) => { + if e.to_string().contains("cancelled") { + println!("CANCELLED"); + return; + } + + // Check if this is a timeout error that we should retry + let error_type = classify_error(&e); + + if matches!( + error_type, + ErrorType::Recoverable(RecoverableError::Timeout) + ) && attempt < MAX_TIMEOUT_RETRIES + { + // Calculate retry delay with exponential backoff + let delay_ms = 1000 * (2_u64.pow(attempt - 1)); + let delay = std::time::Duration::from_millis(delay_ms); + + println!("TIMEOUT: attempt {} of {}, retrying in {:?}", attempt, MAX_TIMEOUT_RETRIES, delay); + + // Wait before retrying + tokio::time::sleep(delay).await; + continue; + } + + // For non-timeout errors or after max retries + println!("ERROR: {}", e); + if attempt > 1 { + println!("FAILED_AFTER_RETRIES: {}", attempt); + } + return; + } + } + } +} + fn handle_execution_error(e: &anyhow::Error, input: &str, output: &SimpleOutput, attempt: u32) { // Enhanced error logging with detailed information error!("=== TASK EXECUTION ERROR ==="); @@ -1095,16 +1305,13 @@ fn handle_execution_error(e: &anyhow::Error, input: &str, output: &SimpleOutput, fn display_context_progress(agent: &Agent, output: &SimpleOutput) { let context = agent.get_context_window(); - output.print_context( - context.used_tokens, - context.total_tokens, - context.percentage_used(), - ); + output.print(&format!("Context: {}/{} tokens ({:.1}%)", + context.used_tokens, context.total_tokens, context.percentage_used())); } /// Set up the 
workspace directory for autonomous mode /// Uses G3_WORKSPACE environment variable or defaults to ~/tmp/workspace -fn setup_workspace_directory() -> Result { +fn setup_workspace_directory(machine_mode: bool) -> Result { let workspace_dir = if let Ok(env_workspace) = std::env::var("G3_WORKSPACE") { PathBuf::from(env_workspace) } else { @@ -1117,7 +1324,7 @@ fn setup_workspace_directory() -> Result { // Create the directory if it doesn't exist if !workspace_dir.exists() { std::fs::create_dir_all(&workspace_dir)?; - let output = SimpleOutput::new(); + let output = SimpleOutput::new_with_mode(machine_mode); output.print(&format!( "šŸ“ Created workspace directory: {}", workspace_dir.display() @@ -1172,7 +1379,7 @@ async fn run_autonomous( elapsed.as_secs_f64() )); output.print(&format!("šŸ”„ Turns Taken: 0/{}", max_turns)); - output.print(&format!("šŸ“ Final Status: āš ļø NO REQUIREMENTS FILE")); + output.print("šŸ“ Final Status: āš ļø NO REQUIREMENTS FILE"); output.print("\nšŸ“ˆ Token Usage Statistics:"); output.print(&format!(" • Used Tokens: {}", context_window.used_tokens)); @@ -1214,7 +1421,7 @@ async fn run_autonomous( elapsed.as_secs_f64() )); output.print(&format!("šŸ”„ Turns Taken: 0/{}", max_turns)); - output.print(&format!("šŸ“ Final Status: āš ļø CANNOT READ REQUIREMENTS")); + output.print("šŸ“ Final Status: āš ļø CANNOT READ REQUIREMENTS"); output.print("\nšŸ“ˆ Token Usage Statistics:"); output.print(&format!(" • Used Tokens: {}", context_window.used_tokens)); @@ -1300,7 +1507,7 @@ async fn run_autonomous( "šŸ“‹ Player received coach feedback ({} chars):", coach_feedback.len() )); - output.print(&format!("{}", coach_feedback)); + output.print(&coach_feedback.to_string()); } output.print(""); // Empty line for readability @@ -1345,7 +1552,7 @@ async fn run_autonomous( elapsed.as_secs_f64() )); output.print(&format!("šŸ”„ Turns Taken: {}/{}", turn, max_turns)); - output.print(&format!("šŸ“ Final Status: šŸ’„ PLAYER PANIC")); + output.print("šŸ“ 
Final Status: šŸ’„ PLAYER PANIC"); output.print("\nšŸ“ˆ Token Usage Statistics:"); output.print(&format!( @@ -1454,6 +1661,7 @@ Review the current state of the project and provide a concise critique focusing 2. Whether the project compiles successfully 3. What requirements are missing or incorrect 4. Specific improvements needed to satisfy requirements +5. Use UI tools such as webdriver or macax to test functionality thoroughly CRITICAL INSTRUCTIONS: 1. You MUST use the final_output tool to provide your feedback @@ -1461,13 +1669,13 @@ CRITICAL INSTRUCTIONS: 3. Focus ONLY on what needs to be fixed or improved 4. Do NOT include your analysis process, file contents, or compilation output in the summary -If the implementation generally meets all requirements and compiles without errors: +If the implementation thoroughly meets all requirements, compiles and is fully tested (especially UI flows) *WITHOUT* minor gaps or errors: - Call final_output with summary: 'IMPLEMENTATION_APPROVED' If improvements are needed: - Call final_output with a brief summary listing ONLY the specific issues to fix -Remember: Be clear in your review and concise in your feedback. APPROVE if the implementation works and generally fits the requirements. Don't be picky.", +Remember: Be clear in your review and concise in your feedback. APPROVE iff the implementation works and thoroughly fits the requirements (implementation > 95% complete). Be rigorous, especially by testing that all UI features work.", requirements ); @@ -1506,7 +1714,7 @@ Remember: Be clear in your review and concise in your feedback. 
APPROVE if the i elapsed.as_secs_f64() )); output.print(&format!("šŸ”„ Turns Taken: {}/{}", turn, max_turns)); - output.print(&format!("šŸ“ Final Status: šŸ’„ COACH PANIC")); + output.print("šŸ“ Final Status: šŸ’„ COACH PANIC"); output.print("\nšŸ“ˆ Token Usage Statistics:"); output.print(&format!(" • Used Tokens: {}", context_window.used_tokens)); diff --git a/crates/g3-cli/src/machine_ui_writer.rs b/crates/g3-cli/src/machine_ui_writer.rs new file mode 100644 index 0000000..0d97292 --- /dev/null +++ b/crates/g3-cli/src/machine_ui_writer.rs @@ -0,0 +1,94 @@ +use g3_core::ui_writer::UiWriter; +use std::io::{self, Write}; + +/// Machine-mode implementation of UiWriter that prints plain, unformatted output +/// This is designed for programmatic consumption and outputs everything verbatim +pub struct MachineUiWriter; + +impl MachineUiWriter { + pub fn new() -> Self { + Self + } +} + +impl UiWriter for MachineUiWriter { + fn print(&self, message: &str) { + print!("{}", message); + } + + fn println(&self, message: &str) { + println!("{}", message); + } + + fn print_inline(&self, message: &str) { + print!("{}", message); + let _ = io::stdout().flush(); + } + + fn print_system_prompt(&self, prompt: &str) { + println!("SYSTEM_PROMPT:"); + println!("{}", prompt); + println!("END_SYSTEM_PROMPT"); + println!(); + } + + fn print_context_status(&self, message: &str) { + println!("CONTEXT_STATUS: {}", message); + } + + fn print_context_thinning(&self, message: &str) { + println!("CONTEXT_THINNING: {}", message); + } + + fn print_tool_header(&self, tool_name: &str) { + println!("TOOL_CALL: {}", tool_name); + } + + fn print_tool_arg(&self, key: &str, value: &str) { + println!("TOOL_ARG: {} = {}", key, value); + } + + fn print_tool_output_header(&self) { + println!("TOOL_OUTPUT:"); + } + + fn update_tool_output_line(&self, line: &str) { + println!("{}", line); + } + + fn print_tool_output_line(&self, line: &str) { + println!("{}", line); + } + + fn print_tool_output_summary(&self, 
count: usize) { + println!("TOOL_OUTPUT_LINES: {}", count); + } + + fn print_tool_timing(&self, duration_str: &str) { + println!("TOOL_DURATION: {}", duration_str); + println!("END_TOOL_OUTPUT"); + println!(); + } + + fn print_agent_prompt(&self) { + println!("AGENT_RESPONSE:"); + let _ = io::stdout().flush(); + } + + fn print_agent_response(&self, content: &str) { + print!("{}", content); + let _ = io::stdout().flush(); + } + + fn notify_sse_received(&self) { + // No-op for machine mode + } + + fn flush(&self) { + let _ = io::stdout().flush(); + } + + fn wants_full_output(&self) -> bool { + true // Machine mode wants complete, untruncated output + } +} diff --git a/crates/g3-cli/src/retro_tui.rs b/crates/g3-cli/src/retro_tui.rs index 2aa9e97..9c84172 100644 --- a/crates/g3-cli/src/retro_tui.rs +++ b/crates/g3-cli/src/retro_tui.rs @@ -267,23 +267,23 @@ impl TerminalState { let mut current_text = String::new(); // Check for headers first - if line.starts_with("### ") { + if let Some(stripped) = line.strip_prefix("### ") { return Line::from(Span::styled( - format!(" {}", &line[4..]), + format!(" {}", stripped), Style::default() .fg(self.theme.terminal_cyan.to_color()) .add_modifier(Modifier::BOLD | Modifier::UNDERLINED), )); - } else if line.starts_with("## ") { + } else if let Some(stripped) = line.strip_prefix("## ") { return Line::from(Span::styled( - format!(" {}", &line[3..]), + format!(" {}", stripped), Style::default() .fg(self.theme.terminal_amber.to_color()) .add_modifier(Modifier::BOLD), )); - } else if line.starts_with("# ") { + } else if let Some(stripped) = line.strip_prefix("# ") { return Line::from(Span::styled( - format!(" {}", &line[2..]), + format!(" {}", stripped), Style::default() .fg(self.theme.terminal_green.to_color()) .add_modifier(Modifier::BOLD), @@ -343,7 +343,7 @@ impl TerminalState { } // Find closing * let mut italic_text = String::new(); - while let Some(ch) = chars.next() { + for ch in chars.by_ref() { if ch == '*' { break; } @@ -367,7 
+367,7 @@ impl TerminalState { } // Find closing ` let mut code_text = String::new(); - while let Some(ch) = chars.next() { + for ch in chars.by_ref() { if ch == '`' { break; } @@ -612,11 +612,9 @@ impl RetroTui { } // Update status blink only if status is "PROCESSING" - if state.status_line == "PROCESSING" { - if state.last_status_blink.elapsed() > Duration::from_millis(500) { - state.status_blink = !state.status_blink; - state.last_status_blink = Instant::now(); - } + if state.status_line == "PROCESSING" && state.last_status_blink.elapsed() > Duration::from_millis(500) { + state.status_blink = !state.status_blink; + state.last_status_blink = Instant::now(); } // Update activity area animation @@ -771,12 +769,7 @@ impl RetroTui { let total_cursor_pos = cursor_position; // Determine the window into the buffer we should show - let window_start = if total_cursor_pos > available_width - 1 { - // Cursor is beyond the visible area, scroll the view - total_cursor_pos - (available_width - 1) - } else { - 0 - }; + let window_start = total_cursor_pos.saturating_sub(available_width - 1); // Get the visible portion of the buffer let visible_buffer: String = input_buffer @@ -1013,9 +1006,9 @@ impl RetroTui { let fade_color = |color: Color| -> Color { match color { Color::Rgb(r, g, b) => { - let faded_r = ((r as f32 * opacity) as u8).max(0); - let faded_g = ((g as f32 * opacity) as u8).max(0); - let faded_b = ((b as f32 * opacity) as u8).max(0); + let faded_r = (r as f32 * opacity) as u8; + let faded_g = (g as f32 * opacity) as u8; + let faded_b = (b as f32 * opacity) as u8; Color::Rgb(faded_r, faded_g, faded_b) } _ => color, @@ -1098,9 +1091,9 @@ impl RetroTui { let fade_color = |color: Color| -> Color { match color { Color::Rgb(r, g, b) => { - let faded_r = ((r as f32 * opacity) as u8).max(0); - let faded_g = ((g as f32 * opacity) as u8).max(0); - let faded_b = ((b as f32 * opacity) as u8).max(0); + let faded_r = (r as f32 * opacity) as u8; + let faded_g = (g as f32 * 
opacity) as u8; + let faded_b = (b as f32 * opacity) as u8; Color::Rgb(faded_r, faded_g, faded_b) } _ => color, @@ -1176,7 +1169,7 @@ impl RetroTui { } // Wave characters for smooth animation - let wave_chars = vec!['▁', 'ā–‚', 'ā–ƒ', 'ā–„', 'ā–…', 'ā–†', 'ā–‡', 'ā–ˆ']; + let wave_chars = ['▁', 'ā–‚', 'ā–ƒ', 'ā–„', 'ā–…', 'ā–†', 'ā–‡', 'ā–ˆ']; // Build the wave line let mut wave_line = String::new(); @@ -1190,7 +1183,7 @@ impl RetroTui { let idx = wave_data.len().saturating_sub(display_width) + i; if idx < wave_data.len() { - let value = wave_data[idx].min(1.0).max(0.0); + let value = wave_data[idx].clamp(0.0, 1.0); let char_idx = ((value * 7.0) as usize).min(7); wave_line.push(wave_chars[char_idx]); } else { @@ -1206,8 +1199,6 @@ impl RetroTui { f.render_widget(wave_paragraph, area); } - /// Draw the status bar - /// Draw the status bar fn draw_status_bar( f: &mut Frame, diff --git a/crates/g3-cli/src/simple_output.rs b/crates/g3-cli/src/simple_output.rs new file mode 100644 index 0000000..456da9e --- /dev/null +++ b/crates/g3-cli/src/simple_output.rs @@ -0,0 +1,32 @@ +/// Simple output helper for printing messages +pub struct SimpleOutput { + machine_mode: bool, +} + +impl SimpleOutput { + pub fn new() -> Self { + SimpleOutput { machine_mode: false } + } + + pub fn new_with_mode(machine_mode: bool) -> Self { + SimpleOutput { machine_mode } + } + + pub fn print(&self, message: &str) { + if !self.machine_mode { + println!("{}", message); + } + } + + pub fn print_smart(&self, message: &str) { + if !self.machine_mode { + println!("{}", message); + } + } +} + +impl Default for SimpleOutput { + fn default() -> Self { + Self::new() + } +} diff --git a/crates/g3-cli/src/tui.rs b/crates/g3-cli/src/tui.rs index 261408e..452db5c 100644 --- a/crates/g3-cli/src/tui.rs +++ b/crates/g3-cli/src/tui.rs @@ -1,5 +1,6 @@ use crossterm::style::Color; use crossterm::style::{SetForegroundColor, ResetColor}; +use std::io::{self, Write}; use termimad::MadSkin; /// Simple output handler 
with markdown support @@ -40,7 +41,7 @@ impl SimpleOutput { trimmed.starts_with("* ") || trimmed.starts_with("+ ") || (trimmed.len() > 2 && - trimmed.chars().next().map_or(false, |c| c.is_ascii_digit()) && + trimmed.chars().next().is_some_and(|c| c.is_ascii_digit()) && trimmed.chars().nth(1) == Some('.') && trimmed.chars().nth(2) == Some(' ')) || (trimmed.contains('[') && trimmed.contains("](")) @@ -93,6 +94,37 @@ impl SimpleOutput { print!("{}", ResetColor); println!(" {:.1}% | {}/{} tokens", percentage, used, total); } + + pub fn print_context_thinning(&self, message: &str) { + // Animated highlight for context thinning + // Use bright cyan/green with a quick flash animation + + // Flash animation: print with bright background, then normal + let frames = vec![ + "\x1b[1;97;46m", // Frame 1: Bold white on cyan background + "\x1b[1;97;42m", // Frame 2: Bold white on green background + "\x1b[1;96;40m", // Frame 3: Bold cyan on black background + ]; + + println!(); + + // Quick flash animation + for frame in &frames { + print!("\r{} ✨ {} ✨\x1b[0m", frame, message); + let _ = io::stdout().flush(); + std::thread::sleep(std::time::Duration::from_millis(80)); + } + + // Final display with bright cyan and sparkle emojis + print!("\r\x1b[1;96m✨ {} ✨\x1b[0m", message); + println!(); + + // Add a subtle "success" indicator line + println!("\x1b[2;36m └─ Context optimized successfully\x1b[0m"); + println!(); + + let _ = io::stdout().flush(); + } } #[cfg(test)] diff --git a/crates/g3-cli/src/ui_writer_impl.rs b/crates/g3-cli/src/ui_writer_impl.rs index c69034c..2f336fd 100644 --- a/crates/g3-cli/src/ui_writer_impl.rs +++ b/crates/g3-cli/src/ui_writer_impl.rs @@ -1,8 +1,6 @@ -use crate::retro_tui::RetroTui; use g3_core::ui_writer::UiWriter; use std::io::{self, Write}; use std::sync::Mutex; -use std::time::Instant; /// Console implementation of UiWriter that prints to stdout pub struct ConsoleUiWriter { @@ -104,6 +102,37 @@ impl UiWriter for ConsoleUiWriter { println!("{}", 
message); } + fn print_context_thinning(&self, message: &str) { + // Animated highlight for context thinning + // Use bright cyan/green with a quick flash animation + + // Flash animation: print with bright background, then normal + let frames = vec![ + "\x1b[1;97;46m", // Frame 1: Bold white on cyan background + "\x1b[1;97;42m", // Frame 2: Bold white on green background + "\x1b[1;96;40m", // Frame 3: Bold cyan on black background + ]; + + println!(); + + // Quick flash animation + for frame in &frames { + print!("\r{} ✨ {} ✨\x1b[0m", frame, message); + let _ = io::stdout().flush(); + std::thread::sleep(std::time::Duration::from_millis(80)); + } + + // Final display with bright cyan and sparkle emojis + print!("\r\x1b[1;96m✨ {} ✨\x1b[0m", message); + println!(); + + // Add a subtle "success" indicator line + println!("\x1b[2;36m └─ Context optimized successfully\x1b[0m"); + println!(); + + let _ = io::stdout().flush(); + } + fn print_tool_header(&self, tool_name: &str) { // Store the tool name and clear args for collection *self.current_tool_name.lock().unwrap() = Some(tool_name.to_string()); @@ -115,7 +144,6 @@ impl UiWriter for ConsoleUiWriter { // For todo tools, we'll skip the normal header and print a custom one later if is_todo { - return; } } @@ -163,7 +191,12 @@ impl UiWriter for ConsoleUiWriter { // Truncate long values for display let display_value = if first_line.len() > 80 { - format!("{}...", &first_line[..77]) + // Use char_indices to safely truncate at character boundary + let truncate_at = first_line.char_indices() + .nth(77) + .map(|(i, _)| i) + .unwrap_or(first_line.len()); + format!("{}...", &first_line[..truncate_at]) } else { first_line.to_string() }; @@ -312,223 +345,3 @@ impl UiWriter for ConsoleUiWriter { } } -/// RetroTui implementation of UiWriter that sends output to the TUI -pub struct RetroTuiWriter { - tui: RetroTui, - current_tool_name: Mutex>, - current_tool_output: Mutex>, - current_tool_start: Mutex>, - current_tool_caption: 
Mutex, -} - -impl RetroTuiWriter { - pub fn new(tui: RetroTui) -> Self { - Self { - tui, - current_tool_name: Mutex::new(None), - current_tool_output: Mutex::new(Vec::new()), - current_tool_start: Mutex::new(None), - current_tool_caption: Mutex::new(String::new()), - } - } -} - -impl UiWriter for RetroTuiWriter { - fn print(&self, message: &str) { - self.tui.output(message); - } - - fn println(&self, message: &str) { - self.tui.output(message); - } - - fn print_inline(&self, message: &str) { - // For inline printing, we'll just append to the output - self.tui.output(message); - } - - fn print_system_prompt(&self, prompt: &str) { - self.tui.output("šŸ” System Prompt:"); - self.tui.output("================"); - for line in prompt.lines() { - self.tui.output(line); - } - self.tui.output("================"); - self.tui.output(""); - } - - fn print_context_status(&self, message: &str) { - self.tui.output(message); - } - - fn print_tool_header(&self, tool_name: &str) { - // Start collecting tool output - *self.current_tool_start.lock().unwrap() = Some(Instant::now()); - *self.current_tool_name.lock().unwrap() = Some(tool_name.to_string()); - self.current_tool_output.lock().unwrap().clear(); - self.current_tool_output - .lock() - .unwrap() - .push(format!("Tool: {}", tool_name)); - - // Initialize caption - *self.current_tool_caption.lock().unwrap() = String::new(); - } - - fn print_tool_arg(&self, key: &str, value: &str) { - // Filter out any keys that look like they might be agent message content - // (e.g., keys that are suspiciously long or contain message-like content) - let is_valid_arg_key = key.len() < 50 - && !key.contains('\n') - && !key.contains("I'll") - && !key.contains("Let me") - && !key.contains("Here's") - && !key.contains("I can"); - - if is_valid_arg_key { - self.current_tool_output - .lock() - .unwrap() - .push(format!("{}: {}", key, value)); - } - - // Build caption from first argument (usually the most important one) - let mut caption = 
self.current_tool_caption.lock().unwrap(); - if caption.is_empty() && (key == "file_path" || key == "command" || key == "path") { - // Truncate long values for the caption - let truncated = if value.len() > 50 { - format!("{}...", &value[..47]) - } else { - value.to_string() - }; - - // Add range information for read_file tool calls - let tool_name = self.current_tool_name.lock().unwrap(); - let range_suffix = if tool_name.as_ref().map_or(false, |name| name == "read_file") { - // We need to check if start/end args will be provided - for now just check if this is a partial read - // This is a simplified approach since we're building the caption incrementally - String::new() // We'll handle this in print_tool_output_header instead - } else { - String::new() - }; - - *caption = format!("{}{}", truncated, range_suffix); - } - } - - fn print_tool_output_header(&self) { - // This is called right before tool execution starts - // Send the initial tool header to the TUI now - if let Some(tool_name) = self.current_tool_name.lock().unwrap().as_ref() { - let mut caption = self.current_tool_caption.lock().unwrap().clone(); - - // Add range information for read_file tool calls - if tool_name == "read_file" { - // Check the tool output for start/end parameters - let output = self.current_tool_output.lock().unwrap(); - let has_start = output.iter().any(|line| line.starts_with("start:")); - let has_end = output.iter().any(|line| line.starts_with("end:")); - - if has_start || has_end { - let start_val = output.iter().find(|line| line.starts_with("start:")).map(|line| line.split(':').nth(1).unwrap_or("0").trim()).unwrap_or("0"); - let end_val = output.iter().find(|line| line.starts_with("end:")).map(|line| line.split(':').nth(1).unwrap_or("end").trim()).unwrap_or("end"); - caption = format!("{} [{}..{}]", caption, start_val, end_val); - } - } - - // Send the tool output with initial header - self.tui.tool_output(tool_name, &caption, ""); - } - - 
self.current_tool_output.lock().unwrap().push(String::new()); - self.current_tool_output - .lock() - .unwrap() - .push("Output:".to_string()); - } - - fn update_tool_output_line(&self, line: &str) { - // For retro mode, we'll just add to the output buffer - self.current_tool_output - .lock() - .unwrap() - .push(line.to_string()); - } - - fn print_tool_output_line(&self, line: &str) { - self.current_tool_output - .lock() - .unwrap() - .push(line.to_string()); - } - - fn print_tool_output_summary(&self, hidden_count: usize) { - self.current_tool_output.lock().unwrap().push(format!( - "... ({} more line{})", - hidden_count, - if hidden_count == 1 { "" } else { "s" } - )); - } - - fn print_tool_timing(&self, duration_str: &str) { - self.current_tool_output - .lock() - .unwrap() - .push(format!("āš”ļø {}", duration_str)); - - // Calculate the actual duration - let duration_ms = if let Some(start) = *self.current_tool_start.lock().unwrap() { - start.elapsed().as_millis() - } else { - 0 - }; - - // Get the tool name and caption - if let Some(tool_name) = self.current_tool_name.lock().unwrap().as_ref() { - let content = self.current_tool_output.lock().unwrap().join("\n"); - let caption = self.current_tool_caption.lock().unwrap().clone(); - let caption = if caption.is_empty() { - "Completed".to_string() - } else { - caption - }; - - // Update the tool detail panel with the complete output without adding a new header - // This keeps the original header in place to be updated by tool_complete - self.tui.update_tool_detail(tool_name, &content); - - // Determine success based on whether there's an error in the output - // This is a simple heuristic - you might want to make this more sophisticated - let success = !content.contains("error") - && !content.contains("Error") - && !content.contains("ERROR"); - - // Send the completion status to update the header - self.tui - .tool_complete(tool_name, success, duration_ms, &caption); - } - - // Clear the buffers - 
*self.current_tool_name.lock().unwrap() = None; - self.current_tool_output.lock().unwrap().clear(); - *self.current_tool_start.lock().unwrap() = None; - *self.current_tool_caption.lock().unwrap() = String::new(); - } - - fn print_agent_prompt(&self) { - self.tui.output("\nšŸ’¬ "); - } - - fn print_agent_response(&self, content: &str) { - self.tui.output(content); - } - - fn notify_sse_received(&self) { - // Notify the TUI that an SSE was received - self.tui.sse_received(); - } - - fn flush(&self) { - // No-op for TUI since it handles its own rendering - } -} diff --git a/crates/g3-computer-control/Cargo.toml b/crates/g3-computer-control/Cargo.toml index 9aa522c..b9ed189 100644 --- a/crates/g3-computer-control/Cargo.toml +++ b/crates/g3-computer-control/Cargo.toml @@ -3,6 +3,9 @@ name = "g3-computer-control" version = "0.1.0" edition = "2021" +[build-dependencies] +# Only needed for building Swift bridge on macOS + [dependencies] # Workspace dependencies tokio = { workspace = true } @@ -20,15 +23,13 @@ async-trait = "0.1" # WebDriver support fantoccini = "0.21" -# OCR dependencies -tesseract = "0.14" - # macOS dependencies [target.'cfg(target_os = "macos")'.dependencies] core-graphics = "0.23" -core-foundation = "0.9" +core-foundation = "0.10" cocoa = "0.25" objc = "0.2" +accessibility = "0.2" image = "0.24" # Linux dependencies diff --git a/crates/g3-computer-control/build.rs b/crates/g3-computer-control/build.rs new file mode 100644 index 0000000..fed302c --- /dev/null +++ b/crates/g3-computer-control/build.rs @@ -0,0 +1,63 @@ +use std::env; +use std::path::PathBuf; +use std::process::Command; + +fn main() { + // Only build Vision bridge on macOS + if env::var("CARGO_CFG_TARGET_OS").unwrap() != "macos" { + return; + } + + println!("cargo:rerun-if-changed=vision-bridge/Sources/VisionBridge/VisionOCR.swift"); + println!("cargo:rerun-if-changed=vision-bridge/Sources/VisionBridge/VisionBridge.h"); + println!("cargo:rerun-if-changed=vision-bridge/Package.swift"); + + 
let manifest_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap()); + let vision_bridge_dir = manifest_dir.join("vision-bridge"); + + // Build Swift package + println!("cargo:warning=Building VisionBridge Swift package..."); + let build_status = Command::new("swift") + .args(&["build", "-c", "release"]) + .current_dir(&vision_bridge_dir) + .status() + .expect("Failed to build Swift package"); + + if !build_status.success() { + panic!("Swift build failed"); + } + + // Find the built library + let lib_path = vision_bridge_dir + .join(".build/release") + .canonicalize() + .expect("Failed to find .build/release directory"); + + // Copy the dylib to the output directory so it can be found at runtime + let target_dir = manifest_dir.parent().unwrap().parent().unwrap().join("target"); + let profile = env::var("PROFILE").unwrap_or_else(|_| "debug".to_string()); + let output_dir = target_dir.join(&profile); + + let dylib_src = lib_path.join("libVisionBridge.dylib"); + let dylib_dst = output_dir.join("libVisionBridge.dylib"); + + std::fs::copy(&dylib_src, &dylib_dst) + .expect(&format!("Failed to copy dylib from {} to {}", dylib_src.display(), dylib_dst.display())); + + println!("cargo:warning=Copied libVisionBridge.dylib to {}", dylib_dst.display()); + + // Add rpath so the dylib can be found at runtime + println!("cargo:rustc-link-arg=-Wl,-rpath,@executable_path"); + println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path"); + println!("cargo:rustc-link-search=native={}", lib_path.display()); + println!("cargo:rustc-link-lib=dylib=VisionBridge"); + + // Link required frameworks + println!("cargo:rustc-link-lib=framework=Vision"); + println!("cargo:rustc-link-lib=framework=AppKit"); + println!("cargo:rustc-link-lib=framework=Foundation"); + println!("cargo:rustc-link-lib=framework=CoreGraphics"); + println!("cargo:rustc-link-lib=framework=CoreImage"); + + println!("cargo:warning=VisionBridge built successfully at {}", lib_path.display()); +} diff --git 
a/crates/g3-computer-control/examples/list_windows.rs b/crates/g3-computer-control/examples/list_windows.rs index 5b571d9..f1681ff 100644 --- a/crates/g3-computer-control/examples/list_windows.rs +++ b/crates/g3-computer-control/examples/list_windows.rs @@ -1,7 +1,7 @@ use core_graphics::window::{kCGWindowListOptionOnScreenOnly, kCGNullWindowID, CGWindowListCopyWindowInfo}; use core_foundation::dictionary::CFDictionary; use core_foundation::string::CFString; -use core_foundation::base::TCFType; +use core_foundation::base::{TCFType, ToVoid}; fn main() { println!("Listing all on-screen windows..."); @@ -22,7 +22,7 @@ fn main() { // Get window ID let window_id_key = CFString::from_static_string("kCGWindowNumber"); - let window_id: i64 = if let Some(value) = dict.find(window_id_key.as_concrete_TypeRef()) { + let window_id: i64 = if let Some(value) = dict.find(window_id_key.to_void()) { let num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*value as *const _); num.to_i64().unwrap_or(0) } else { @@ -31,7 +31,7 @@ fn main() { // Get owner name let owner_key = CFString::from_static_string("kCGWindowOwnerName"); - let owner: String = if let Some(value) = dict.find(owner_key.as_concrete_TypeRef()) { + let owner: String = if let Some(value) = dict.find(owner_key.to_void()) { let s: CFString = TCFType::wrap_under_get_rule(*value as *const _); s.to_string() } else { @@ -40,15 +40,15 @@ fn main() { // Get window name/title let name_key = CFString::from_static_string("kCGWindowName"); - let title: String = if let Some(value) = dict.find(name_key.as_concrete_TypeRef()) { + let title: String = if let Some(value) = dict.find(name_key.to_void()) { let s: CFString = TCFType::wrap_under_get_rule(*value as *const _); s.to_string() } else { "".to_string() }; - // Filter for iTerm or show all - if owner.contains("iTerm") || owner.contains("Terminal") { + // Show all windows + if !owner.is_empty() { println!("{:<10} {:<25} {}", window_id, owner, title); } } diff --git 
a/crates/g3-computer-control/examples/macax_demo.rs b/crates/g3-computer-control/examples/macax_demo.rs new file mode 100644 index 0000000..ff1398d --- /dev/null +++ b/crates/g3-computer-control/examples/macax_demo.rs @@ -0,0 +1,74 @@ +//! Example demonstrating macOS Accessibility API tools +//! +//! This example shows how to use the macax tools to control macOS applications. +//! +//! Run with: cargo run --example macax_demo + +use anyhow::Result; +use g3_computer_control::MacAxController; + +#[tokio::main] +async fn main() -> Result<()> { + println!("šŸŽ macOS Accessibility API Demo\n"); + println!("This demo shows how to control macOS applications using the Accessibility API.\n"); + + // Create controller + let controller = MacAxController::new()?; + println!("āœ… MacAxController initialized\n"); + + // List running applications + println!("šŸ“± Listing running applications:"); + match controller.list_applications() { + Ok(apps) => { + for app in apps.iter().take(10) { + println!(" - {}", app.name); + } + if apps.len() > 10 { + println!(" ... 
and {} more", apps.len() - 10); + } + } + Err(e) => println!(" āŒ Error: {}", e), + } + println!(); + + // Get frontmost app + println!("šŸŽÆ Getting frontmost application:"); + match controller.get_frontmost_app() { + Ok(app) => println!(" Current: {}", app.name), + Err(e) => println!(" āŒ Error: {}", e), + } + println!(); + + // Example: Activate Finder and get its UI tree + println!("šŸ“‚ Activating Finder and inspecting UI:"); + match controller.activate_app("Finder") { + Ok(_) => { + println!(" āœ… Finder activated"); + + // Wait a moment for activation + tokio::time::sleep(tokio::time::Duration::from_millis(500)).await; + + // Get UI tree + match controller.get_ui_tree("Finder", 2) { + Ok(tree) => { + println!("\n UI Tree:"); + for line in tree.lines().take(10) { + println!(" {}", line); + } + } + Err(e) => println!(" āŒ Error getting UI tree: {}", e), + } + } + Err(e) => println!(" āŒ Error: {}", e), + } + println!(); + + println!("✨ Demo complete!\n"); + println!("šŸ’” Tips:"); + println!(" - Use --macax flag with g3 to enable these tools"); + println!(" - Grant accessibility permissions in System Preferences"); + println!(" - Add accessibility identifiers to your apps for easier automation"); + println!(" - See docs/macax-tools.md for full documentation\n"); + + Ok(()) +} diff --git a/crates/g3-computer-control/examples/safari_demo.rs b/crates/g3-computer-control/examples/safari_demo.rs index aed4c1e..b28ebd6 100644 --- a/crates/g3-computer-control/examples/safari_demo.rs +++ b/crates/g3-computer-control/examples/safari_demo.rs @@ -31,7 +31,7 @@ async fn main() -> Result<()> { // Find an element println!("Finding h1 element..."); - let mut h1 = driver.find_element("h1").await?; + let h1 = driver.find_element("h1").await?; let h1_text = h1.text().await?; println!("H1 text: {}\n", h1_text); diff --git a/crates/g3-computer-control/examples/test_permission_prompt.rs b/crates/g3-computer-control/examples/test_permission_prompt.rs index bf1d640..fdd5a4b 
100644 --- a/crates/g3-computer-control/examples/test_permission_prompt.rs +++ b/crates/g3-computer-control/examples/test_permission_prompt.rs @@ -1,4 +1,4 @@ -use g3_computer_control::{create_controller, ComputerController}; +use g3_computer_control::create_controller; #[tokio::main] async fn main() { diff --git a/crates/g3-computer-control/examples/test_screenshot_fix.rs b/crates/g3-computer-control/examples/test_screenshot_fix.rs index bcfb60b..467da49 100644 --- a/crates/g3-computer-control/examples/test_screenshot_fix.rs +++ b/crates/g3-computer-control/examples/test_screenshot_fix.rs @@ -1,6 +1,5 @@ use core_graphics::display::CGDisplay; use image::{ImageBuffer, RgbaImage}; -use std::path::Path; fn main() { let display = CGDisplay::main(); diff --git a/crates/g3-computer-control/examples/test_type_text.rs b/crates/g3-computer-control/examples/test_type_text.rs new file mode 100644 index 0000000..2d1aea0 --- /dev/null +++ b/crates/g3-computer-control/examples/test_type_text.rs @@ -0,0 +1,48 @@ +//! 
Test the new type_text functionality + +use anyhow::Result; +use g3_computer_control::MacAxController; + +#[tokio::main] +async fn main() -> Result<()> { + println!("🧪 Testing macax type_text functionality\n"); + + let controller = MacAxController::new()?; + println!("āœ… Controller initialized\n"); + + // Test 1: Type simple text + println!("Test 1: Typing simple text into TextEdit"); + println!(" Please open TextEdit and create a new document..."); + std::thread::sleep(std::time::Duration::from_secs(3)); + + match controller.type_text("TextEdit", "Hello, World!") { + Ok(_) => println!(" āœ… Successfully typed simple text\n"), + Err(e) => println!(" āŒ Failed: {}\n", e), + } + + std::thread::sleep(std::time::Duration::from_secs(1)); + + // Test 2: Type unicode and emojis + println!("Test 2: Typing unicode and emojis"); + match controller.type_text("TextEdit", "\n🌟 Unicode test: cafĆ©, naĆÆve, ę—„ęœ¬čŖž šŸŽ‰") { + Ok(_) => println!(" āœ… Successfully typed unicode text\n"), + Err(e) => println!(" āŒ Failed: {}\n", e), + } + + std::thread::sleep(std::time::Duration::from_secs(1)); + + // Test 3: Type special characters + println!("Test 3: Typing special characters"); + match controller.type_text("TextEdit", "\nSpecial: @#$%^&*()_+-=[]{}|;':,.<>?/") { + Ok(_) => println!(" āœ… Successfully typed special characters\n"), + Err(e) => println!(" āŒ Failed: {}\n", e), + } + + println!("\n✨ Tests complete!"); + println!("\nšŸ’” Now try with Things3:"); + println!(" 1. Open Things3"); + println!(" 2. Press Cmd+N to create a new task"); + println!(" 3. 
Run: g3 --macax 'type \"🌟 My awesome task\" into Things'"); + + Ok(()) +} diff --git a/crates/g3-computer-control/examples/test_vision.rs b/crates/g3-computer-control/examples/test_vision.rs new file mode 100644 index 0000000..5ff09a5 --- /dev/null +++ b/crates/g3-computer-control/examples/test_vision.rs @@ -0,0 +1,85 @@ +use g3_computer_control::ocr::{OCREngine, DefaultOCR}; +use anyhow::Result; + +#[tokio::main] +async fn main() -> Result<()> { + println!("🧪 Testing Apple Vision OCR"); + println!("===========================\n"); + + // Initialize OCR engine + println!("šŸ“¦ Initializing OCR engine..."); + let ocr = DefaultOCR::new()?; + println!("āœ… OCR engine: {}\n", ocr.name()); + + // Check if test image exists + let test_image = "/tmp/safari_test.png"; + if !std::path::Path::new(test_image).exists() { + println!("āš ļø Test image not found: {}", test_image); + println!(" Creating a screenshot..."); + + let status = std::process::Command::new("screencapture") + .arg("-x") + .arg("-R") + .arg("0,0,1200,800") + .arg(test_image) + .status()?; + + if !status.success() { + anyhow::bail!("Failed to create screenshot"); + } + + println!("āœ… Screenshot created\n"); + } + + // Run OCR + println!("šŸ” Running Apple Vision OCR on {}...", test_image); + let start = std::time::Instant::now(); + let locations = ocr.extract_text_with_locations(test_image).await?; + let duration = start.elapsed(); + + println!("āœ… OCR completed in {:.3}s\n", duration.as_secs_f64()); + + // Display results + println!("šŸ“Š Results:"); + println!(" Found {} text elements\n", locations.len()); + + if locations.is_empty() { + println!("āš ļø No text found in image"); + } else { + println!(" Top 20 results:"); + println!(" {:<4} {:<40} {:<15} {:<12} {:<8}", "#", "Text", "Position", "Size", "Conf"); + println!(" {}", "-".repeat(85)); + + for (i, loc) in locations.iter().take(20).enumerate() { + let text = if loc.text.len() > 37 { + format!("{}...", &loc.text[..37]) + } else { + 
loc.text.clone() + }; + + println!(" {:<4} {:<40} ({:>4},{:>4}) {:>4}x{:<4} {:.2}", + i + 1, + text, + loc.x, + loc.y, + loc.width, + loc.height, + loc.confidence + ); + } + + if locations.len() > 20 { + println!("\n ... and {} more", locations.len() - 20); + } + + // Performance comparison + println!("\nšŸ“ˆ Performance:"); + println!(" OCR Speed: {:.3}s", duration.as_secs_f64()); + println!(" Text elements: {}", locations.len()); + println!(" Avg per element: {:.1}ms", duration.as_millis() as f64 / locations.len() as f64); + } + + println!("\nāœ… Test complete!"); + + Ok(()) +} diff --git a/crates/g3-computer-control/src/lib.rs b/crates/g3-computer-control/src/lib.rs index 5c72d65..b1cbc36 100644 --- a/crates/g3-computer-control/src/lib.rs +++ b/crates/g3-computer-control/src/lib.rs @@ -1,10 +1,18 @@ +// Suppress warnings from objc crate macros +#![allow(unexpected_cfgs)] + pub mod types; pub mod platform; +pub mod ocr; pub mod webdriver; +pub mod macax; // Re-export webdriver types for convenience pub use webdriver::{WebDriverController, WebElement, safari::SafariDriver}; +// Re-export macax types for convenience +pub use macax::{MacAxController, AXElement, AXApplication}; + use anyhow::Result; use async_trait::async_trait; use types::*; @@ -15,8 +23,14 @@ pub trait ComputerController: Send + Sync { async fn take_screenshot(&self, path: &str, region: Option, window_id: Option<&str>) -> Result<()>; // OCR operations - async fn extract_text_from_screen(&self, region: Rect) -> Result; + async fn extract_text_from_screen(&self, region: Rect, window_id: &str) -> Result; async fn extract_text_from_image(&self, path: &str) -> Result; + async fn extract_text_with_locations(&self, path: &str) -> Result>; + async fn find_text_in_app(&self, app_name: &str, search_text: &str) -> Result>; + + // Mouse operations + fn move_mouse(&self, x: i32, y: i32) -> Result<()>; + fn click_at(&self, x: i32, y: i32, app_name: Option<&str>) -> Result<()>; } // Platform-specific constructor 
diff --git a/crates/g3-computer-control/src/macax/controller.rs b/crates/g3-computer-control/src/macax/controller.rs new file mode 100644 index 0000000..ac91ac1 --- /dev/null +++ b/crates/g3-computer-control/src/macax/controller.rs @@ -0,0 +1,822 @@ +use super::{AXApplication, AXElement}; +use anyhow::{Context, Result}; +use std::collections::HashMap; + +#[cfg(target_os = "macos")] +use accessibility::{AXUIElement, AXUIElementAttributes, ElementFinder, TreeVisitor, TreeWalker, TreeWalkerFlow}; + +#[cfg(target_os = "macos")] +use core_foundation::base::TCFType; + +#[cfg(target_os = "macos")] +use core_foundation::string::CFString; + +/// macOS Accessibility API controller using native APIs +pub struct MacAxController { + // Cache for application elements + app_cache: std::sync::Mutex>, +} + +impl MacAxController { + pub fn new() -> Result { + #[cfg(target_os = "macos")] + { + // Check if we have accessibility permissions by trying to get system-wide element + let _system = AXUIElement::system_wide(); + + Ok(Self { + app_cache: std::sync::Mutex::new(HashMap::new()), + }) + } + + #[cfg(not(target_os = "macos"))] + { + anyhow::bail!("macOS Accessibility API is only available on macOS") + } + } + + /// List all running applications + #[cfg(target_os = "macos")] + pub fn list_applications(&self) -> Result> { + let apps = Self::get_running_applications()?; + Ok(apps) + } + + #[cfg(not(target_os = "macos"))] + pub fn list_applications(&self) -> Result> { + anyhow::bail!("Not supported on this platform") + } + + #[cfg(target_os = "macos")] + fn get_running_applications() -> Result> { + use cocoa::appkit::NSApplicationActivationPolicy; + use cocoa::base::{id, nil}; + use objc::{class, msg_send, sel, sel_impl}; + + unsafe { + let workspace: id = msg_send![class!(NSWorkspace), sharedWorkspace]; + let running_apps: id = msg_send![workspace, runningApplications]; + let count: usize = msg_send![running_apps, count]; + + let mut apps = Vec::new(); + + for i in 0..count { + let 
app: id = msg_send![running_apps, objectAtIndex: i]; + + // Get app name + let localized_name: id = msg_send![app, localizedName]; + if localized_name == nil { + continue; + } + let name_ptr: *const i8 = msg_send![localized_name, UTF8String]; + let name = if !name_ptr.is_null() { + std::ffi::CStr::from_ptr(name_ptr) + .to_string_lossy() + .to_string() + } else { + continue; + }; + + // Get bundle ID + let bundle_id_obj: id = msg_send![app, bundleIdentifier]; + let bundle_id = if bundle_id_obj != nil { + let bundle_id_ptr: *const i8 = msg_send![bundle_id_obj, UTF8String]; + if !bundle_id_ptr.is_null() { + Some( + std::ffi::CStr::from_ptr(bundle_id_ptr) + .to_string_lossy() + .to_string(), + ) + } else { + None + } + } else { + None + }; + + // Get PID + let pid: i32 = msg_send![app, processIdentifier]; + + // Skip background-only apps + let activation_policy: i64 = msg_send![app, activationPolicy]; + if activation_policy == NSApplicationActivationPolicy::NSApplicationActivationPolicyRegular as i64 { + apps.push(AXApplication { + name, + bundle_id, + pid, + }); + } + } + + Ok(apps) + } + } + + /// Get the frontmost (active) application + #[cfg(target_os = "macos")] + pub fn get_frontmost_app(&self) -> Result { + use cocoa::base::{id, nil}; + use objc::{class, msg_send, sel, sel_impl}; + + unsafe { + let workspace: id = msg_send![class!(NSWorkspace), sharedWorkspace]; + let frontmost_app: id = msg_send![workspace, frontmostApplication]; + + if frontmost_app == nil { + anyhow::bail!("No frontmost application"); + } + + // Get app name + let localized_name: id = msg_send![frontmost_app, localizedName]; + let name_ptr: *const i8 = msg_send![localized_name, UTF8String]; + let name = std::ffi::CStr::from_ptr(name_ptr) + .to_string_lossy() + .to_string(); + + // Get bundle ID + let bundle_id_obj: id = msg_send![frontmost_app, bundleIdentifier]; + let bundle_id = if bundle_id_obj != nil { + let bundle_id_ptr: *const i8 = msg_send![bundle_id_obj, UTF8String]; + if 
!bundle_id_ptr.is_null() { + Some( + std::ffi::CStr::from_ptr(bundle_id_ptr) + .to_string_lossy() + .to_string(), + ) + } else { + None + } + } else { + None + }; + + // Get PID + let pid: i32 = msg_send![frontmost_app, processIdentifier]; + + Ok(AXApplication { + name, + bundle_id, + pid, + }) + } + } + + #[cfg(not(target_os = "macos"))] + pub fn get_frontmost_app(&self) -> Result { + anyhow::bail!("Not supported on this platform") + } + + /// Get AXUIElement for an application by name or PID + #[cfg(target_os = "macos")] + fn get_app_element(&self, app_name: &str) -> Result { + // Check cache first + { + let cache = self.app_cache.lock().unwrap(); + if let Some(element) = cache.get(app_name) { + return Ok(element.clone()); + } + } + + // Find the app by name + let apps = Self::get_running_applications()?; + let app = apps + .iter() + .find(|a| a.name == app_name) + .ok_or_else(|| anyhow::anyhow!("Application '{}' not found", app_name))?; + + // Create AXUIElement for the app + let element = AXUIElement::application(app.pid); + + // Cache it + { + let mut cache = self.app_cache.lock().unwrap(); + cache.insert(app_name.to_string(), element.clone()); + } + + Ok(element) + } + + /// Activate (bring to front) an application + #[cfg(target_os = "macos")] + pub fn activate_app(&self, app_name: &str) -> Result<()> { + use cocoa::base::id; + use objc::{class, msg_send, sel, sel_impl}; + + // Find the app + let apps = Self::get_running_applications()?; + let app = apps + .iter() + .find(|a| a.name == app_name) + .ok_or_else(|| anyhow::anyhow!("Application '{}' not found", app_name))?; + + unsafe { + let workspace: id = msg_send![class!(NSWorkspace), sharedWorkspace]; + let running_apps: id = msg_send![workspace, runningApplications]; + let count: usize = msg_send![running_apps, count]; + + for i in 0..count { + let running_app: id = msg_send![running_apps, objectAtIndex: i]; + let pid: i32 = msg_send![running_app, processIdentifier]; + + if pid == app.pid { + let _: bool = 
msg_send![running_app, activateWithOptions: 0]; + return Ok(()); + } + } + } + + anyhow::bail!("Failed to activate application") + } + + #[cfg(not(target_os = "macos"))] + pub fn activate_app(&self, _app_name: &str) -> Result<()> { + anyhow::bail!("Not supported on this platform") + } + + /// Get the UI hierarchy of an application + #[cfg(target_os = "macos")] + pub fn get_ui_tree(&self, app_name: &str, max_depth: usize) -> Result { + let app_element = self.get_app_element(app_name)?; + let mut output = format!("Application: {}\n", app_name); + + Self::build_ui_tree(&app_element, &mut output, 0, max_depth)?; + + Ok(output) + } + + #[cfg(not(target_os = "macos"))] + pub fn get_ui_tree(&self, _app_name: &str, _max_depth: usize) -> Result { + anyhow::bail!("Not supported on this platform") + } + + #[cfg(target_os = "macos")] + fn build_ui_tree( + element: &AXUIElement, + output: &mut String, + depth: usize, + max_depth: usize, + ) -> Result<()> { + if depth >= max_depth { + return Ok(()); + } + + let indent = " ".repeat(depth); + + // Get role + let role = element.role().ok().map(|s| s.to_string()) + .unwrap_or_else(|| "Unknown".to_string()); + + // Get title + let title = element.title().ok() + .map(|s| s.to_string()); + + // Get identifier + let identifier = element.identifier().ok() + .map(|s| s.to_string()); + + // Format output + output.push_str(&format!("{}Role: {}", indent, role)); + if let Some(t) = title { + output.push_str(&format!(", Title: {}", t)); + } + if let Some(id) = identifier { + output.push_str(&format!(", ID: {}", id)); + } + output.push('\n'); + + // Get children + if let Ok(children) = element.children() { + for i in 0..children.len() { + if let Some(child) = children.get(i) { + let _ = Self::build_ui_tree(&child, output, depth + 1, max_depth); + } + } + } + + Ok(()) + } + + /// Find UI elements in an application + #[cfg(target_os = "macos")] + pub fn find_elements( + &self, + app_name: &str, + role: Option<&str>, + title: Option<&str>, + 
identifier: Option<&str>, + ) -> Result> { + let app_element = self.get_app_element(app_name)?; + let mut found_elements = Vec::new(); + + let visitor = ElementCollector { + role_filter: role.map(|s| s.to_string()), + title_filter: title.map(|s| s.to_string()), + identifier_filter: identifier.map(|s| s.to_string()), + results: std::cell::RefCell::new(&mut found_elements), + depth: std::cell::Cell::new(0), + }; + + let walker = TreeWalker::new(); + walker.walk(&app_element, &visitor); + + Ok(found_elements) + } + + #[cfg(not(target_os = "macos"))] + pub fn find_elements( + &self, + _app_name: &str, + _role: Option<&str>, + _title: Option<&str>, + _identifier: Option<&str>, + ) -> Result> { + anyhow::bail!("Not supported on this platform") + } + + /// Find a single element (helper for click, set_value, etc.) + #[cfg(target_os = "macos")] + fn find_element( + &self, + app_name: &str, + role: &str, + title: Option<&str>, + identifier: Option<&str>, + ) -> Result { + let app_element = self.get_app_element(app_name)?; + + let role_str = role.to_string(); + let title_str = title.map(|s| s.to_string()); + let identifier_str = identifier.map(|s| s.to_string()); + + let finder = ElementFinder::new( + &app_element, + move |element| { + // Check role + let elem_role = element.role() + .ok() + .map(|s| s.to_string()); + + if let Some(r) = elem_role { + if !r.contains(&role_str) { + return false; + } + } else { + return false; + } + + // Check title if specified + if let Some(ref title_filter) = title_str { + let elem_title = element.title() + .ok() + .map(|s| s.to_string()); + + if let Some(t) = elem_title { + if !t.contains(title_filter) { + return false; + } + } else { + return false; + } + } + + // Check identifier if specified + if let Some(ref id_filter) = identifier_str { + let elem_id = element.identifier() + .ok() + .map(|s| s.to_string()); + + if let Some(id) = elem_id { + if !id.contains(id_filter) { + return false; + } + } else { + return false; + } + } + + true + }, 
+ Some(std::time::Duration::from_secs(2)), + ); + + finder.find().context("Element not found") + } + + /// Click on a UI element + #[cfg(target_os = "macos")] + pub fn click_element( + &self, + app_name: &str, + role: &str, + title: Option<&str>, + identifier: Option<&str>, + ) -> Result<()> { + let element = self.find_element(app_name, role, title, identifier)?; + + // Perform the press action + let action_name = CFString::new("AXPress"); + element + .perform_action(&action_name) + .map_err(|e| anyhow::anyhow!("Failed to perform press action: {:?}", e))?; + + Ok(()) + } + + #[cfg(not(target_os = "macos"))] + pub fn click_element( + &self, + _app_name: &str, + _role: &str, + _title: Option<&str>, + _identifier: Option<&str>, + ) -> Result<()> { + anyhow::bail!("Not supported on this platform") + } + + /// Set the value of a UI element + #[cfg(target_os = "macos")] + pub fn set_value( + &self, + app_name: &str, + role: &str, + value: &str, + title: Option<&str>, + identifier: Option<&str>, + ) -> Result<()> { + let element = self.find_element(app_name, role, title, identifier)?; + + // Set the value - convert CFString to CFType + let cf_value = CFString::new(value); + + element.set_value(cf_value.as_CFType()) + .map_err(|e| anyhow::anyhow!("Failed to set value: {:?}", e))?; + + Ok(()) + } + + #[cfg(not(target_os = "macos"))] + pub fn set_value( + &self, + _app_name: &str, + _role: &str, + _value: &str, + _title: Option<&str>, + _identifier: Option<&str>, + ) -> Result<()> { + anyhow::bail!("Not supported on this platform") + } + + /// Get the value of a UI element + #[cfg(target_os = "macos")] + pub fn get_value( + &self, + app_name: &str, + role: &str, + title: Option<&str>, + identifier: Option<&str>, + ) -> Result { + let element = self.find_element(app_name, role, title, identifier)?; + + // Get the value + let value_type = element.value() + .map_err(|e| anyhow::anyhow!("Failed to get value: {:?}", e))?; + + // Try to downcast to CFString + if let 
Some(cf_string) = value_type.downcast::() { + Ok(cf_string.to_string()) + } else { + // For non-string values, try to get a description + Ok(format!("")) + } + } + + #[cfg(not(target_os = "macos"))] + pub fn get_value( + &self, + _app_name: &str, + _role: &str, + _title: Option<&str>, + _identifier: Option<&str>, + ) -> Result { + anyhow::bail!("Not supported on this platform") + } + + /// Type text into the currently focused element (uses system text input) + #[cfg(target_os = "macos")] + pub fn type_text(&self, app_name: &str, text: &str) -> Result<()> { + use cocoa::base::{id, nil}; + use cocoa::foundation::NSString; + use objc::{class, msg_send, sel, sel_impl}; + + // First, make sure the app is active + self.activate_app(app_name)?; + + // Wait for app to fully activate + std::thread::sleep(std::time::Duration::from_millis(500)); + + // Send a Tab key to try to focus on a text field + // This helps ensure something is focused before we paste + let _ = self.press_key(app_name, "tab", vec![]); + std::thread::sleep(std::time::Duration::from_millis(800)); + + // Save old clipboard, set new content, paste, then restore + let old_content: id; + unsafe { + // Get the general pasteboard + let pasteboard: id = msg_send![class!(NSPasteboard), generalPasteboard]; + + // Save current clipboard content + let ns_string_type = NSString::alloc(nil).init_str("public.utf8-plain-text"); + old_content = msg_send![pasteboard, stringForType: ns_string_type]; + + // Clear and set new content + let _: () = msg_send![pasteboard, clearContents]; + + let ns_string = NSString::alloc(nil).init_str(text); + let ns_type = NSString::alloc(nil).init_str("public.utf8-plain-text"); + let _: bool = msg_send![pasteboard, setString:ns_string forType:ns_type]; + } + + // Wait a moment for clipboard to update + std::thread::sleep(std::time::Duration::from_millis(200)); + + // Paste using Cmd+V (outside unsafe block) + self.press_key(app_name, "v", vec!["command"])?; + + // Wait for paste to complete 
+ std::thread::sleep(std::time::Duration::from_millis(300)); + + // Restore old clipboard content if it existed + unsafe { + if old_content != nil { + let pasteboard: id = msg_send![class!(NSPasteboard), generalPasteboard]; + let _: () = msg_send![pasteboard, clearContents]; + let ns_type = NSString::alloc(nil).init_str("public.utf8-plain-text"); + let _: bool = msg_send![pasteboard, setString:old_content forType:ns_type]; + } + } + + Ok(()) + } + + #[cfg(not(target_os = "macos"))] + pub fn type_text(&self, _app_name: &str, _text: &str) -> Result<()> { + anyhow::bail!("Not supported on this platform") + } + + /// Focus on a text field or text area element + #[cfg(target_os = "macos")] + pub fn focus_element( + &self, + app_name: &str, + role: &str, + title: Option<&str>, + identifier: Option<&str>, + ) -> Result<()> { + let element = self.find_element(app_name, role, title, identifier)?; + + // Set focused attribute to true + use core_foundation::boolean::CFBoolean; + let cf_true = CFBoolean::true_value(); + + element.set_attribute(&accessibility::AXAttribute::focused(), cf_true) + .map_err(|e| anyhow::anyhow!("Failed to focus element: {:?}", e))?; + + Ok(()) + } + + /// Press a keyboard shortcut + #[cfg(target_os = "macos")] + pub fn press_key( + &self, + app_name: &str, + key: &str, + modifiers: Vec<&str>, + ) -> Result<()> { + use core_graphics::event::{ + CGEvent, CGEventFlags, CGEventTapLocation, + }; + use core_graphics::event_source::{CGEventSource, CGEventSourceStateID}; + + // First, make sure the app is active + self.activate_app(app_name)?; + + // Wait a bit for activation + std::thread::sleep(std::time::Duration::from_millis(100)); + + // Map key string to key code + let key_code = Self::key_to_keycode(key) + .ok_or_else(|| anyhow::anyhow!("Unknown key: {}", key))?; + + // Map modifiers to flags + let mut flags = CGEventFlags::CGEventFlagNull; + for modifier in modifiers { + match modifier.to_lowercase().as_str() { + "command" | "cmd" => flags |= 
CGEventFlags::CGEventFlagCommand, + "option" | "alt" => flags |= CGEventFlags::CGEventFlagAlternate, + "control" | "ctrl" => flags |= CGEventFlags::CGEventFlagControl, + "shift" => flags |= CGEventFlags::CGEventFlagShift, + _ => {} + } + } + + // Create event source + let source = CGEventSource::new(CGEventSourceStateID::HIDSystemState) + .ok().context("Failed to create event source")?; + + // Create key down event + let key_down = CGEvent::new_keyboard_event(source.clone(), key_code, true) + .ok().context("Failed to create key down event")?; + key_down.set_flags(flags); + + // Create key up event + let key_up = CGEvent::new_keyboard_event(source, key_code, false) + .ok().context("Failed to create key up event")?; + key_up.set_flags(flags); + + // Post events + key_down.post(CGEventTapLocation::HID); + std::thread::sleep(std::time::Duration::from_millis(50)); + key_up.post(CGEventTapLocation::HID); + + Ok(()) + } + + #[cfg(not(target_os = "macos"))] + pub fn press_key( + &self, + _app_name: &str, + _key: &str, + _modifiers: Vec<&str>, + ) -> Result<()> { + anyhow::bail!("Not supported on this platform") + } + + #[cfg(target_os = "macos")] + fn key_to_keycode(key: &str) -> Option { + // Map common keys to keycodes + // See: https://eastmanreference.com/complete-list-of-applescript-key-codes + match key.to_lowercase().as_str() { + "a" => Some(0x00), + "s" => Some(0x01), + "d" => Some(0x02), + "f" => Some(0x03), + "h" => Some(0x04), + "g" => Some(0x05), + "z" => Some(0x06), + "x" => Some(0x07), + "c" => Some(0x08), + "v" => Some(0x09), + "b" => Some(0x0B), + "q" => Some(0x0C), + "w" => Some(0x0D), + "e" => Some(0x0E), + "r" => Some(0x0F), + "y" => Some(0x10), + "t" => Some(0x11), + "1" => Some(0x12), + "2" => Some(0x13), + "3" => Some(0x14), + "4" => Some(0x15), + "6" => Some(0x16), + "5" => Some(0x17), + "=" => Some(0x18), + "9" => Some(0x19), + "7" => Some(0x1A), + "-" => Some(0x1B), + "8" => Some(0x1C), + "0" => Some(0x1D), + "]" => Some(0x1E), + "o" => Some(0x1F), 
+ "u" => Some(0x20), + "[" => Some(0x21), + "i" => Some(0x22), + "p" => Some(0x23), + "return" | "enter" => Some(0x24), + "l" => Some(0x25), + "j" => Some(0x26), + "'" => Some(0x27), + "k" => Some(0x28), + ";" => Some(0x29), + "\\" => Some(0x2A), + "," => Some(0x2B), + "/" => Some(0x2C), + "n" => Some(0x2D), + "m" => Some(0x2E), + "." => Some(0x2F), + "tab" => Some(0x30), + "space" => Some(0x31), + "`" => Some(0x32), + "delete" | "backspace" => Some(0x33), + "escape" | "esc" => Some(0x35), + "f1" => Some(0x7A), + "f2" => Some(0x78), + "f3" => Some(0x63), + "f4" => Some(0x76), + "f5" => Some(0x60), + "f6" => Some(0x61), + "f7" => Some(0x62), + "f8" => Some(0x64), + "f9" => Some(0x65), + "f10" => Some(0x6D), + "f11" => Some(0x67), + "f12" => Some(0x6F), + "left" => Some(0x7B), + "right" => Some(0x7C), + "down" => Some(0x7D), + "up" => Some(0x7E), + _ => None, + } + } +} + +#[cfg(target_os = "macos")] +struct ElementCollector<'a> { + role_filter: Option, + title_filter: Option, + identifier_filter: Option, + results: std::cell::RefCell<&'a mut Vec>, + depth: std::cell::Cell, +} + +#[cfg(target_os = "macos")] +impl<'a> TreeVisitor for ElementCollector<'a> { + fn enter_element(&self, element: &AXUIElement) -> TreeWalkerFlow { + self.depth.set(self.depth.get() + 1); + + if self.depth.get() > 20 { + return TreeWalkerFlow::SkipSubtree; + } + + // Get element properties + let role = element.role() + .ok() + .map(|s| s.to_string()) + .unwrap_or_else(|| "Unknown".to_string()); + + let title = element.title() + .ok() + .map(|s| s.to_string()); + + let identifier = element.identifier() + .ok() + .map(|s| s.to_string()); + + // Check if this element matches the filters + let role_matches = self.role_filter.as_ref().map_or(true, |r| role.contains(r)); + let title_matches = self.title_filter.as_ref().map_or(true, |t| { + title.as_ref().map_or(false, |title_str| title_str.contains(t)) + }); + let identifier_matches = self.identifier_filter.as_ref().map_or(true, |id| { + 
identifier.as_ref().map_or(false, |id_str| id_str.contains(id)) + }); + + if role_matches && title_matches && identifier_matches { + // Get additional properties + let value = element.value() + .ok() + .and_then(|v| { + v.downcast::().map(|s| s.to_string()) + }); + + let label = element.description() + .ok() + .map(|s| s.to_string()); + + let enabled = element.enabled() + .ok() + .map(|b| b.into()) + .unwrap_or(false); + + let focused = element.focused() + .ok() + .map(|b| b.into()) + .unwrap_or(false); + + // Count children + let children_count = element.children() + .ok() + .map(|arr| arr.len() as usize) + .unwrap_or(0); + + self.results.borrow_mut().push(AXElement { + role, + title, + value, + label, + identifier, + enabled, + focused, + position: None, + size: None, + children_count, + }); + } + + TreeWalkerFlow::Continue + } + + fn exit_element(&self, _element: &AXUIElement) { + self.depth.set(self.depth.get() - 1); + } +} diff --git a/crates/g3-computer-control/src/macax/mod.rs b/crates/g3-computer-control/src/macax/mod.rs new file mode 100644 index 0000000..b62e87d --- /dev/null +++ b/crates/g3-computer-control/src/macax/mod.rs @@ -0,0 +1,65 @@ +pub mod controller; + +pub use controller::MacAxController; + +use serde::{Deserialize, Serialize}; + +#[cfg(test)] +mod tests; + +/// Represents an accessibility element in the UI hierarchy +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct AXElement { + pub role: String, + pub title: Option, + pub value: Option, + pub label: Option, + pub identifier: Option, + pub enabled: bool, + pub focused: bool, + pub position: Option<(f64, f64)>, + pub size: Option<(f64, f64)>, + pub children_count: usize, +} + +/// Represents a macOS application +#[derive(Debug, Clone)] +pub struct AXApplication { + pub name: String, + pub bundle_id: Option, + pub pid: i32, +} + +impl AXElement { + /// Convert to a human-readable string representation + pub fn to_string(&self) -> String { + let mut parts = vec![format!("Role: {}", 
self.role)]; + + if let Some(ref title) = self.title { + parts.push(format!("Title: {}", title)); + } + if let Some(ref value) = self.value { + parts.push(format!("Value: {}", value)); + } + if let Some(ref label) = self.label { + parts.push(format!("Label: {}", label)); + } + if let Some(ref id) = self.identifier { + parts.push(format!("ID: {}", id)); + } + + parts.push(format!("Enabled: {}", self.enabled)); + parts.push(format!("Focused: {}", self.focused)); + + if let Some((x, y)) = self.position { + parts.push(format!("Position: ({:.0}, {:.0})", x, y)); + } + if let Some((w, h)) = self.size { + parts.push(format!("Size: ({:.0}, {:.0})", w, h)); + } + + parts.push(format!("Children: {}", self.children_count)); + + parts.join(", ") + } +} diff --git a/crates/g3-computer-control/src/macax/tests.rs b/crates/g3-computer-control/src/macax/tests.rs new file mode 100644 index 0000000..01f44e3 --- /dev/null +++ b/crates/g3-computer-control/src/macax/tests.rs @@ -0,0 +1,37 @@ +#[cfg(test)] +mod tests { + use crate::{AXElement, MacAxController}; + + #[test] + fn test_ax_element_to_string() { + let element = AXElement { + role: "button".to_string(), + title: Some("Click Me".to_string()), + value: None, + label: Some("Submit Button".to_string()), + identifier: Some("submitBtn".to_string()), + enabled: true, + focused: false, + position: Some((100.0, 200.0)), + size: Some((80.0, 30.0)), + children_count: 0, + }; + + let string_repr = element.to_string(); + assert!(string_repr.contains("Role: button")); + assert!(string_repr.contains("Title: Click Me")); + assert!(string_repr.contains("Label: Submit Button")); + assert!(string_repr.contains("ID: submitBtn")); + assert!(string_repr.contains("Enabled: true")); + assert!(string_repr.contains("Position: (100, 200)")); + assert!(string_repr.contains("Size: (80, 30)")); + } + + #[test] + fn test_controller_creation() { + // Just test that we can create a controller + // Actual functionality requires macOS and permissions + let 
result = MacAxController::new(); + assert!(result.is_ok()); + } +} diff --git a/crates/g3-computer-control/src/ocr/mod.rs b/crates/g3-computer-control/src/ocr/mod.rs new file mode 100644 index 0000000..b651da3 --- /dev/null +++ b/crates/g3-computer-control/src/ocr/mod.rs @@ -0,0 +1,26 @@ +use crate::types::TextLocation; +use anyhow::Result; +use async_trait::async_trait; + +/// OCR engine trait for text recognition with bounding boxes +#[async_trait] +pub trait OCREngine: Send + Sync { + /// Extract text with locations from an image file + async fn extract_text_with_locations(&self, path: &str) -> Result>; + + /// Get the name of the OCR engine + fn name(&self) -> &str; +} + +// Platform-specific modules +#[cfg(target_os = "macos")] +pub mod vision; + +pub mod tesseract; + +// Re-export the default OCR engine for the platform +#[cfg(target_os = "macos")] +pub use vision::AppleVisionOCR as DefaultOCR; + +#[cfg(not(target_os = "macos"))] +pub use tesseract::TesseractOCR as DefaultOCR; diff --git a/crates/g3-computer-control/src/ocr/tesseract.rs b/crates/g3-computer-control/src/ocr/tesseract.rs new file mode 100644 index 0000000..d55fc3f --- /dev/null +++ b/crates/g3-computer-control/src/ocr/tesseract.rs @@ -0,0 +1,84 @@ +use super::OCREngine; +use crate::types::TextLocation; +use anyhow::Result; +use async_trait::async_trait; + +/// Tesseract OCR engine (fallback/cross-platform) +pub struct TesseractOCR; + +impl TesseractOCR { + pub fn new() -> Result { + // Check if tesseract is available + let tesseract_check = std::process::Command::new("which") + .arg("tesseract") + .output(); + + if tesseract_check.is_err() || !tesseract_check.as_ref().unwrap().status.success() { + anyhow::bail!("Tesseract OCR is not installed on your system.\n\n\ + To install tesseract:\n macOS: brew install tesseract\n \ + Linux: sudo apt-get install tesseract-ocr (Ubuntu/Debian)\n \ + sudo yum install tesseract (RHEL/CentOS)\n \ + Windows: Download from 
https://github.com/UB-Mannheim/tesseract/wiki\n\n\ + After installation, restart your terminal and try again."); + } + + Ok(Self) + } +} + +#[async_trait] +impl OCREngine for TesseractOCR { + async fn extract_text_with_locations(&self, path: &str) -> Result> { + // Use tesseract CLI with TSV output to get bounding boxes + let output = std::process::Command::new("tesseract") + .arg(path) + .arg("stdout") + .arg("tsv") + .output() + .map_err(|e| anyhow::anyhow!("Failed to run tesseract: {}", e))?; + + if !output.status.success() { + anyhow::bail!("Tesseract failed: {}", String::from_utf8_lossy(&output.stderr)); + } + + let tsv_text = String::from_utf8_lossy(&output.stdout); + let mut locations = Vec::new(); + + // Parse TSV output (skip header line) + for (i, line) in tsv_text.lines().enumerate() { + if i == 0 { continue; } // Skip header + + let parts: Vec<&str> = line.split('\t').collect(); + if parts.len() >= 12 { + // TSV format: level, page_num, block_num, par_num, line_num, word_num, + // left, top, width, height, conf, text + if let (Ok(x), Ok(y), Ok(w), Ok(h), Ok(conf), text) = ( + parts[6].parse::(), + parts[7].parse::(), + parts[8].parse::(), + parts[9].parse::(), + parts[10].parse::(), + parts[11], + ) { + let trimmed = text.trim(); + if !trimmed.is_empty() && conf > 0.0 { + locations.push(TextLocation { + text: trimmed.to_string(), + x, + y, + width: w, + height: h, + confidence: conf / 100.0, // Convert from 0-100 to 0-1 + }); + } + } + } + } + + Ok(locations) + } + + fn name(&self) -> &str { + "Tesseract OCR" + } +} diff --git a/crates/g3-computer-control/src/ocr/vision.rs b/crates/g3-computer-control/src/ocr/vision.rs new file mode 100644 index 0000000..d35491d --- /dev/null +++ b/crates/g3-computer-control/src/ocr/vision.rs @@ -0,0 +1,103 @@ +use super::OCREngine; +use crate::types::TextLocation; +use anyhow::{Result, Context}; +use async_trait::async_trait; +use std::ffi::{CStr, CString}; +use std::os::raw::{c_char, c_float, c_uint}; + +// FFI 
bindings to Swift VisionBridge +#[repr(C)] +struct VisionTextBox { + text: *const c_char, + text_len: c_uint, + x: i32, + y: i32, + width: i32, + height: i32, + confidence: c_float, +} + +extern "C" { + fn vision_recognize_text( + image_path: *const c_char, + image_path_len: c_uint, + out_boxes: *mut *mut std::ffi::c_void, + out_count: *mut c_uint, + ) -> bool; + + fn vision_free_boxes(boxes: *mut std::ffi::c_void, count: c_uint); +} + +/// Apple Vision Framework OCR engine +pub struct AppleVisionOCR; + +impl AppleVisionOCR { + pub fn new() -> Result { + Ok(Self) + } +} + +#[async_trait] +impl OCREngine for AppleVisionOCR { + async fn extract_text_with_locations(&self, path: &str) -> Result> { + // Convert path to C string + let c_path = CString::new(path) + .context("Failed to convert path to C string")?; + + let mut boxes_ptr: *mut std::ffi::c_void = std::ptr::null_mut(); + let mut count: c_uint = 0; + + // Call Swift Vision API + let success = unsafe { + vision_recognize_text( + c_path.as_ptr(), + path.len() as c_uint, + &mut boxes_ptr, + &mut count, + ) + }; + + if !success || boxes_ptr.is_null() { + anyhow::bail!("Apple Vision OCR failed"); + } + + // Convert C array to Rust Vec + let mut locations = Vec::new(); + + unsafe { + let typed_boxes = boxes_ptr as *const VisionTextBox; + let boxes_slice = std::slice::from_raw_parts(typed_boxes, count as usize); + + for box_data in boxes_slice { + // Convert C string to Rust String + let text = if !box_data.text.is_null() { + CStr::from_ptr(box_data.text) + .to_string_lossy() + .into_owned() + } else { + String::new() + }; + + if !text.is_empty() { + locations.push(TextLocation { + text, + x: box_data.x, + y: box_data.y, + width: box_data.width, + height: box_data.height, + confidence: box_data.confidence, + }); + } + } + + // Free the C array + vision_free_boxes(boxes_ptr, count); + } + + Ok(locations) + } + + fn name(&self) -> &str { + "Apple Vision Framework" + } +} diff --git 
a/crates/g3-computer-control/src/platform/linux.rs b/crates/g3-computer-control/src/platform/linux.rs index 2a9d89c..cf485ed 100644 --- a/crates/g3-computer-control/src/platform/linux.rs +++ b/crates/g3-computer-control/src/platform/linux.rs @@ -63,10 +63,15 @@ impl ComputerController for LinuxController { } async fn take_screenshot(&self, _path: &str, _region: Option, _window_id: Option<&str>) -> Result<()> { + // Enforce that window_id must be provided + if _window_id.is_none() { + anyhow::bail!("window_id is required. You must specify which window to capture (e.g., 'Firefox', 'Terminal', 'gedit'). Use list_windows to see available windows."); + } + anyhow::bail!("Linux implementation not yet available") } - async fn extract_text_from_screen(&self, _region: Rect) -> Result { + async fn extract_text_from_screen(&self, _region: Rect, _window_id: &str) -> Result { anyhow::bail!("Linux implementation not yet available") } diff --git a/crates/g3-computer-control/src/platform/macos.rs b/crates/g3-computer-control/src/platform/macos.rs index 129b73c..da9c81b 100644 --- a/crates/g3-computer-control/src/platform/macos.rs +++ b/crates/g3-computer-control/src/platform/macos.rs @@ -1,22 +1,37 @@ -use crate::{ComputerController, types::Rect}; -use anyhow::Result; +use crate::{ComputerController, types::{Rect, TextLocation}}; +use crate::ocr::{OCREngine, DefaultOCR}; +use anyhow::{Result, Context}; use async_trait::async_trait; use std::path::Path; -use tesseract::Tesseract; +use core_graphics::window::{kCGWindowListOptionOnScreenOnly, kCGNullWindowID, CGWindowListCopyWindowInfo}; +use core_foundation::dictionary::CFDictionary; +use core_foundation::string::CFString; +use core_foundation::base::{TCFType, ToVoid}; +use core_foundation::array::CFArray; pub struct MacOSController { - // Empty struct for now + ocr_engine: Box, + #[allow(dead_code)] + ocr_name: String, } impl MacOSController { pub fn new() -> Result { - Ok(Self {}) + let ocr = Box::new(DefaultOCR::new()?); + let 
ocr_name = ocr.name().to_string(); + tracing::info!("Initialized macOS controller with OCR engine: {}", ocr_name); + Ok(Self { ocr_engine: ocr, ocr_name }) } } #[async_trait] impl ComputerController for MacOSController { async fn take_screenshot(&self, path: &str, region: Option, window_id: Option<&str>) -> Result<()> { + // Enforce that window_id must be provided + if window_id.is_none() { + return Err(anyhow::anyhow!("window_id is required. You must specify which window to capture (e.g., 'Safari', 'Terminal', 'Google Chrome'). Use list_windows to see available windows.")); + } + // Determine the temporary directory for screenshots let temp_dir = std::env::var("TMPDIR") .or_else(|_| std::env::var("HOME").map(|h| format!("{}/tmp", h))) @@ -37,48 +52,134 @@ impl ComputerController for MacOSController { std::fs::create_dir_all(parent)?; } - let mut cmd = std::process::Command::new("screencapture"); + let app_name = window_id.unwrap(); // Safe because we checked is_none() above - // Add flags + // Get the window ID for the specified application + let cg_window_id = unsafe { + let window_list = CGWindowListCopyWindowInfo( + kCGWindowListOptionOnScreenOnly, + kCGNullWindowID + ); + + let array = CFArray::::wrap_under_create_rule(window_list); + let count = array.len(); + + let mut found_window_id: Option<(u32, String)> = None; // (id, owner) + let app_name_lower = app_name.to_lowercase(); + + for i in 0..count { + let dict = array.get(i).unwrap(); + + // Get owner name + let owner_key = CFString::from_static_string("kCGWindowOwnerName"); + let owner: String = if let Some(value) = dict.find(owner_key.to_void()) { + let s: CFString = TCFType::wrap_under_get_rule(*value as *const _); + s.to_string() + } else { + continue; + }; + + tracing::debug!("Checking window: owner='{}', looking for '{}'", owner, app_name); + let owner_lower = owner.to_lowercase(); + + // Normalize by removing spaces for exact matching + let app_name_normalized = app_name_lower.replace(" ", ""); + let 
owner_normalized = owner_lower.replace(" ", ""); + + // ONLY accept exact matches (case-insensitive, with or without spaces) + // This prevents "Goose" from matching "GooseStudio" + let is_match = owner_lower == app_name_lower || owner_normalized == app_name_normalized; + + if is_match { + // Get window ID + let window_id_key = CFString::from_static_string("kCGWindowNumber"); + if let Some(value) = dict.find(window_id_key.to_void()) { + let num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*value as *const _); + if let Some(id) = num.to_i64() { + // Get window layer to filter out menu bar windows + let layer_key = CFString::from_static_string("kCGWindowLayer"); + let layer: i32 = if let Some(value) = dict.find(layer_key.to_void()) { + let num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*value as *const _); + num.to_i32().unwrap_or(0) + } else { + 0 + }; + + // Get window bounds to verify it's a real window + let bounds_key = CFString::from_static_string("kCGWindowBounds"); + let has_real_bounds = if let Some(value) = dict.find(bounds_key.to_void()) { + let bounds_dict: CFDictionary = TCFType::wrap_under_get_rule(*value as *const _); + let width_key = CFString::from_static_string("Width"); + let height_key = CFString::from_static_string("Height"); + + if let (Some(w_val), Some(h_val)) = ( + bounds_dict.find(width_key.to_void()), + bounds_dict.find(height_key.to_void()), + ) { + let w_num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*w_val as *const _); + let h_num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*h_val as *const _); + let width = w_num.to_f64().unwrap_or(0.0); + let height = h_num.to_f64().unwrap_or(0.0); + // Real windows should be at least 100x100 pixels + width >= 100.0 && height >= 100.0 + } else { + false + } + } else { + false + }; + + // Only accept windows that are: + // 1. At layer 0 (normal windows, not menu bar) + // 2. 
Have real bounds (width and height >= 100) + if layer == 0 && has_real_bounds { + tracing::info!("Found valid window: ID {} for app '{}' (layer={}, bounds valid)", id, owner, layer); + found_window_id = Some((id as u32, owner.clone())); + break; + } else { + tracing::debug!("Skipping window ID {} for '{}': layer={}, has_real_bounds={}", id, owner, layer, has_real_bounds); + } + } + } + } + } + + found_window_id + }; + + let (cg_window_id, matched_owner) = cg_window_id.ok_or_else(|| { + anyhow::anyhow!("Could not find window for application '{}'. Use list_windows to see available windows.", app_name) + })?; + tracing::info!("Taking screenshot of window ID {} for app '{}'", cg_window_id, matched_owner); + + // Use screencapture with the window ID for now + // TODO: Implement direct CGWindowListCreateImage approach with proper image saving + let mut cmd = std::process::Command::new("screencapture"); cmd.arg("-x"); // No sound + cmd.arg("-l"); + cmd.arg(cg_window_id.to_string()); if let Some(region) = region { - // Capture specific region: -R x,y,width,height cmd.arg("-R"); cmd.arg(format!("{},{},{},{}", region.x, region.y, region.width, region.height)); } - if let Some(app_name) = window_id { - // Capture specific window by app name - // Use AppleScript to get window ID - let script = format!(r#"tell application "{}" to id of window 1"#, app_name); - let output = std::process::Command::new("osascript") - .arg("-e") - .arg(&script) - .output()?; - - if output.status.success() { - let window_id_str = String::from_utf8_lossy(&output.stdout).trim().to_string(); - cmd.arg(format!("-l{}", window_id_str)); - } - } - cmd.arg(&final_path); let screenshot_result = cmd.output()?; if !screenshot_result.status.success() { let stderr = String::from_utf8_lossy(&screenshot_result.stderr); - return Err(anyhow::anyhow!("screencapture failed: {}", stderr)); + return Err(anyhow::anyhow!("screencapture failed for window {}: {}", cg_window_id, stderr)); } Ok(()) } - async fn 
extract_text_from_screen(&self, region: Rect) -> Result { + async fn extract_text_from_screen(&self, region: Rect, window_id: &str) -> Result { // Take screenshot of region first let temp_path = format!("/tmp/g3_ocr_{}.png", uuid::Uuid::new_v4()); - self.take_screenshot(&temp_path, Some(region), None).await?; + self.take_screenshot(&temp_path, Some(region), Some(window_id)).await?; // Extract text from the screenshot let result = self.extract_text_from_image(&temp_path).await?; @@ -90,36 +191,317 @@ impl ComputerController for MacOSController { } async fn extract_text_from_image(&self, path: &str) -> Result { - // Check if tesseract is available on the system - let tesseract_check = std::process::Command::new("which") - .arg("tesseract") - .output(); + // Extract all text and concatenate + let locations = self.ocr_engine.extract_text_with_locations(path).await?; + Ok(locations.iter().map(|loc| loc.text.as_str()).collect::>().join(" ")) + } + + async fn extract_text_with_locations(&self, path: &str) -> Result> { + // Use the OCR engine + self.ocr_engine.extract_text_with_locations(path).await + } + + async fn find_text_in_app(&self, app_name: &str, search_text: &str) -> Result> { + // Take screenshot of specific app window + let home = std::env::var("HOME").unwrap_or_else(|_| "/tmp".to_string()); + let temp_path = format!("{}/tmp/g3_find_text_{}_{}.png", home, app_name, uuid::Uuid::new_v4()); + self.take_screenshot(&temp_path, None, Some(app_name)).await?; - if tesseract_check.is_err() || !tesseract_check.as_ref().unwrap().status.success() { - anyhow::bail!("Tesseract OCR is not installed on your system.\n\n\ - To install tesseract:\n macOS: brew install tesseract\n \ - Linux: sudo apt-get install tesseract-ocr (Ubuntu/Debian)\n \ - sudo yum install tesseract (RHEL/CentOS)\n \ - Windows: Download from https://github.com/UB-Mannheim/tesseract/wiki\n\n\ - After installation, restart your terminal and try again."); + // Get screenshot dimensions before we delete it + 
let screenshot_dims = get_image_dimensions(&temp_path)?; + + // Extract all text with locations + let locations = self.extract_text_with_locations(&temp_path).await?; + + // Get window bounds to calculate coordinate transformation + let window_bounds = self.get_window_bounds(app_name)?; + + // Clean up temp file + let _ = std::fs::remove_file(&temp_path); + + // Find matching text (case-insensitive) + let search_lower = search_text.to_lowercase(); + for location in locations { + if location.text.to_lowercase().contains(&search_lower) { + // Transform coordinates from screenshot space to screen space + let transformed = transform_screenshot_to_screen_coords( + location, + window_bounds, + screenshot_dims, + ); + return Ok(Some(transformed)); + } } - // Initialize Tesseract - let tess = Tesseract::new(None, Some("eng")) - .map_err(|e| { - anyhow::anyhow!("Failed to initialize Tesseract: {}\n\n\ - This usually means:\n1. Tesseract is not properly installed\n\ - 2. Language data files are missing\n\nTo fix:\n \ - macOS: brew reinstall tesseract\n \ - Linux: sudo apt-get install tesseract-ocr-eng\n \ - Windows: Reinstall tesseract and ensure language files are included", e) - })?; - - let text = tess.set_image(path) - .map_err(|e| anyhow::anyhow!("Failed to load image '{}': {}", path, e))? 
- .get_text() - .map_err(|e| anyhow::anyhow!("Failed to extract text from image: {}", e))?; - - Ok(text) + Ok(None) } -} \ No newline at end of file + + fn move_mouse(&self, x: i32, y: i32) -> Result<()> { + use core_graphics::event::{ + CGEvent, CGEventTapLocation, CGEventType, CGMouseButton, + }; + use core_graphics::event_source::{ + CGEventSource, CGEventSourceStateID, + }; + use core_graphics::geometry::CGPoint; + + let source = CGEventSource::new(CGEventSourceStateID::HIDSystemState) + .ok().context("Failed to create event source")?; + + let event = CGEvent::new_mouse_event( + source, + CGEventType::MouseMoved, + CGPoint::new(x as f64, y as f64), + CGMouseButton::Left, + ).ok().context("Failed to create mouse event")?; + + event.post(CGEventTapLocation::HID); + + Ok(()) + } + + fn click_at(&self, x: i32, y: i32, _app_name: Option<&str>) -> Result<()> { + use core_graphics::event::{ + CGEvent, CGEventTapLocation, CGEventType, CGMouseButton, + }; + use core_graphics::event_source::{ + CGEventSource, CGEventSourceStateID, + }; + use core_graphics::geometry::CGPoint; + use core_graphics::display::CGDisplay; + + // IMPORTANT: Coordinates passed here are in NSScreen/CGWindowListCopyWindowInfo space + // (Y=0 at BOTTOM, increases UPWARD) + // But CGEvent uses a different coordinate system (Y=0 at TOP, increases DOWNWARD) + // We need to convert: CGEvent.y = screenHeight - NSScreen.y + + let screen_height = CGDisplay::main().pixels_high() as i32; + let cgevent_x = x; + let cgevent_y = screen_height - y; + + tracing::debug!("click_at: NSScreen coords ({}, {}) -> CGEvent coords ({}, {}) [screen_height={}]", + x, y, cgevent_x, cgevent_y, screen_height); + + let (global_x, global_y) = (cgevent_x, cgevent_y); + + let point = CGPoint::new(global_x as f64, global_y as f64); + + let source = CGEventSource::new(CGEventSourceStateID::HIDSystemState) + .ok().context("Failed to create event source")?; + + // Move mouse to position first + let move_event = 
CGEvent::new_mouse_event( + source.clone(), + CGEventType::MouseMoved, + point, + CGMouseButton::Left, + ).ok().context("Failed to create mouse move event")?; + move_event.post(CGEventTapLocation::HID); + + std::thread::sleep(std::time::Duration::from_millis(100)); + + // Mouse down + let mouse_down = CGEvent::new_mouse_event( + source.clone(), + CGEventType::LeftMouseDown, + point, + CGMouseButton::Left, + ).ok().context("Failed to create mouse down event")?; + mouse_down.post(CGEventTapLocation::HID); + + std::thread::sleep(std::time::Duration::from_millis(50)); + + // Mouse up + let mouse_up = CGEvent::new_mouse_event( + source, + CGEventType::LeftMouseUp, + point, + CGMouseButton::Left, + ).ok().context("Failed to create mouse up event")?; + mouse_up.post(CGEventTapLocation::HID); + + Ok(()) + } +} + +impl MacOSController { + /// Get window bounds for an application (helper method) + fn get_window_bounds(&self, app_name: &str) -> Result<(i32, i32, i32, i32)> { + unsafe { + let window_list = CGWindowListCopyWindowInfo( + kCGWindowListOptionOnScreenOnly, + kCGNullWindowID + ); + + let array = CFArray::::wrap_under_create_rule(window_list); + let count = array.len(); + + let app_name_lower = app_name.to_lowercase(); + + for i in 0..count { + let dict = array.get(i).unwrap(); + + // Get owner name + let owner_key = CFString::from_static_string("kCGWindowOwnerName"); + let owner: String = if let Some(value) = dict.find(owner_key.to_void()) { + let s: CFString = TCFType::wrap_under_get_rule(*value as *const _); + s.to_string() + } else { + continue; + }; + + let owner_lower = owner.to_lowercase(); + + // Normalize by removing spaces for exact matching + let app_name_normalized = app_name_lower.replace(" ", ""); + let owner_normalized = owner_lower.replace(" ", ""); + + // ONLY accept exact matches (case-insensitive, with or without spaces) + // This prevents "Goose" from matching "GooseStudio" + let is_match = owner_lower == app_name_lower || owner_normalized == 
app_name_normalized; + + if is_match { + // Get window layer to filter out menu bar windows + let layer_key = CFString::from_static_string("kCGWindowLayer"); + let layer: i32 = if let Some(value) = dict.find(layer_key.to_void()) { + let num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*value as *const _); + num.to_i32().unwrap_or(0) + } else { + 0 + }; + + // Skip menu bar windows (layer >= 20) + if layer >= 20 { + tracing::debug!("Skipping window for '{}' at layer {} (menu bar)", owner, layer); + continue; + } + + // Get window bounds to verify it's a real window + let bounds_key = CFString::from_static_string("kCGWindowBounds"); + if let Some(value) = dict.find(bounds_key.to_void()) { + let bounds_dict: CFDictionary = TCFType::wrap_under_get_rule(*value as *const _); + + let x_key = CFString::from_static_string("X"); + let y_key = CFString::from_static_string("Y"); + let width_key = CFString::from_static_string("Width"); + let height_key = CFString::from_static_string("Height"); + + if let (Some(x_val), Some(y_val), Some(w_val), Some(h_val)) = ( + bounds_dict.find(x_key.to_void()), + bounds_dict.find(y_key.to_void()), + bounds_dict.find(width_key.to_void()), + bounds_dict.find(height_key.to_void()), + ) { + let x_num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*x_val as *const _); + let y_num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*y_val as *const _); + let w_num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*w_val as *const _); + let h_num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*h_val as *const _); + + let x: i32 = x_num.to_i64().unwrap_or(0) as i32; + let y: i32 = y_num.to_i64().unwrap_or(0) as i32; + let w: i32 = w_num.to_i64().unwrap_or(0) as i32; + let h: i32 = h_num.to_i64().unwrap_or(0) as i32; + + // Only accept windows with real bounds (>= 100x100 pixels) + if w >= 100 && h >= 100 { + tracing::info!("Found valid window bounds for '{}': 
x={}, y={}, w={}, h={} (layer={})", owner, x, y, w, h, layer); + return Ok((x, y, w, h)); + } else { + tracing::debug!("Skipping window for '{}': too small ({}x{})", owner, w, h); + continue; + } + } else { + continue; + } + } + } + } + } + + Err(anyhow::anyhow!("Could not find window bounds for '{}'", app_name)) + } +} + +/// Get image dimensions from a PNG file +fn get_image_dimensions(path: &str) -> Result<(i32, i32)> { + use std::fs::File; + use std::io::Read; + + let mut file = File::open(path)?; + let mut buffer = vec![0u8; 24]; + file.read_exact(&mut buffer)?; + + // PNG signature check + if &buffer[0..8] != b"\x89PNG\r\n\x1a\n" { + anyhow::bail!("Not a valid PNG file"); + } + + // Read IHDR chunk (width and height are at bytes 16-23) + let width = u32::from_be_bytes([buffer[16], buffer[17], buffer[18], buffer[19]]) as i32; + let height = u32::from_be_bytes([buffer[20], buffer[21], buffer[22], buffer[23]]) as i32; + + Ok((width, height)) +} + +/// Transform coordinates from screenshot space to screen space +/// +/// The screenshot is taken of a window, and Vision OCR returns coordinates +/// relative to the screenshot image. We need to transform these to actual +/// screen coordinates for clicking. +/// +/// On Retina displays, screenshots are taken at 2x resolution, so we need +/// to account for this scaling factor. 
+fn transform_screenshot_to_screen_coords( + location: TextLocation, + window_bounds: (i32, i32, i32, i32), // (x, y, width, height) in screen space + screenshot_dims: (i32, i32), // (width, height) in pixels +) -> TextLocation { + let (win_x, win_y, win_width, win_height) = window_bounds; + let (screenshot_width, screenshot_height) = screenshot_dims; + + // Calculate scale factors + // On Retina displays, screenshot is typically 2x the window size + let scale_x = win_width as f64 / screenshot_width as f64; + let scale_y = win_height as f64 / screenshot_height as f64; + + tracing::debug!("Transform: screenshot={}x{}, window={}x{} at ({},{}), scale=({:.2},{:.2})", + screenshot_width, screenshot_height, win_width, win_height, win_x, win_y, scale_x, scale_y); + + // Transform coordinates from image space to screen space + // IMPORTANT: macOS screen coordinates have origin at BOTTOM-LEFT (Y increases upward) + // Image coordinates have origin at TOP-LEFT (Y increases downward) + // win_y is the BOTTOM of the window in screen coordinates + // So we need to: (win_y + win_height) to get window TOP, then subtract screenshot_y + let window_top_y = win_y + win_height; + + tracing::debug!("[transform] Input location in image space: x={}, y={}, width={}, height={}", + location.x, location.y, location.width, location.height); + tracing::debug!("[transform] Scale factors: scale_x={:.4}, scale_y={:.4}", scale_x, scale_y); + + let transformed_x = win_x + (location.x as f64 * scale_x) as i32; + let transformed_y = window_top_y - (location.y as f64 * scale_y) as i32; + let transformed_width = (location.width as f64 * scale_x) as i32; + let transformed_height = (location.height as f64 * scale_y) as i32; + + tracing::debug!("[transform] Calculation details:"); + tracing::debug!(" - transformed_x = {} + ({} * {:.4}) = {} + {:.2} = {}", win_x, location.x, scale_x, win_x, location.x as f64 * scale_x, transformed_x); + tracing::debug!(" - transformed_width = ({} * {:.4}) = {:.2} -> {}", 
location.width, scale_x, location.width as f64 * scale_x, transformed_width); + tracing::debug!(" - transformed_height = ({} * {:.4}) = {:.2} -> {}", location.height, scale_y, location.height as f64 * scale_y, transformed_height); + + tracing::debug!("Transformed location: screenshot=({},{}) {}x{} -> screen=({},{}) {}x{}", + location.x, location.y, location.width, location.height, + transformed_x, transformed_y, transformed_width, transformed_height); + + TextLocation { + text: location.text, + x: transformed_x, + y: transformed_y, + width: transformed_width, + height: transformed_height, + confidence: location.confidence, + } +} + +#[path = "macos_window_matching_test.rs"] +#[cfg(test)] +mod tests; \ No newline at end of file diff --git a/crates/g3-computer-control/src/platform/macos.rs.bak b/crates/g3-computer-control/src/platform/macos.rs.bak deleted file mode 100644 index 03d5050..0000000 --- a/crates/g3-computer-control/src/platform/macos.rs.bak +++ /dev/null @@ -1,425 +0,0 @@ -use crate::{ComputerController, types::*}; -use anyhow::Result; -use async_trait::async_trait; -use core_graphics::display::CGPoint; -use core_graphics::event::{CGEvent, CGEventType, CGMouseButton, CGEventTapLocation}; -use core_graphics::event_source::{CGEventSource, CGEventSourceStateID}; -use std::path::Path; -use tesseract::Tesseract; - -// MacOSController doesn't store CGEventSource to avoid Send/Sync issues -// We create it fresh for each operation -pub struct MacOSController { - // Empty struct - event source created per operation -} - -impl MacOSController { - pub fn new() -> Result { - // Test that we can create an event source - let _event_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState) - .map_err(|_| anyhow::anyhow!("Failed to create event source. 
Make sure Accessibility permissions are granted."))?; - Ok(Self {}) - } - - fn key_to_keycode(&self, key: &str) -> Result { - // Map key names to macOS keycodes - let keycode = match key.to_lowercase().as_str() { - "return" | "enter" => 36, - "tab" => 48, - "space" => 49, - "delete" | "backspace" => 51, - "escape" | "esc" => 53, - "command" | "cmd" => 55, - "shift" => 56, - "capslock" => 57, - "option" | "alt" => 58, - "control" | "ctrl" => 59, - "left" => 123, - "right" => 124, - "down" => 125, - "up" => 126, - _ => anyhow::bail!("Unknown key: {}", key), - }; - Ok(keycode) - } -} - -#[async_trait] -impl ComputerController for MacOSController { - async fn move_mouse(&self, x: i32, y: i32) -> Result<()> { - let event_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState) - .map_err(|_| anyhow::anyhow!("Failed to create event source"))?; - let point = CGPoint::new(x as f64, y as f64); - let event = CGEvent::new_mouse_event( - event_source, - CGEventType::MouseMoved, - point, - CGMouseButton::Left, - ).map_err(|_| anyhow::anyhow!("Failed to create mouse move event"))?; - - event.post(CGEventTapLocation::HID); - Ok(()) - } - - async fn click(&self, button: MouseButton) -> Result<()> { - let (cg_button, down_type, up_type) = match button { - MouseButton::Left => (CGMouseButton::Left, CGEventType::LeftMouseDown, CGEventType::LeftMouseUp), - MouseButton::Right => (CGMouseButton::Right, CGEventType::RightMouseDown, CGEventType::RightMouseUp), - MouseButton::Middle => (CGMouseButton::Center, CGEventType::OtherMouseDown, CGEventType::OtherMouseUp), - }; - - let point = { - // Get current mouse position - let temp_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState) - .map_err(|_| anyhow::anyhow!("Failed to create event source"))?; - let event = CGEvent::new(temp_source) - .map_err(|_| anyhow::anyhow!("Failed to get mouse position"))?; - let p = event.location(); - p - }; - - { - let event_source = 
CGEventSource::new(CGEventSourceStateID::CombinedSessionState) - .map_err(|_| anyhow::anyhow!("Failed to create event source"))?; - - // Mouse down - let down_event = CGEvent::new_mouse_event( - event_source, - down_type, - point, - cg_button, - ).map_err(|_| anyhow::anyhow!("Failed to create mouse down event"))?; - down_event.post(CGEventTapLocation::HID); - } // event_source and down_event dropped here - - // Small delay - tokio::time::sleep(tokio::time::Duration::from_millis(50)).await; - - { - let event_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState) - .map_err(|_| anyhow::anyhow!("Failed to create event source"))?; - - let up_event = CGEvent::new_mouse_event( - event_source, - up_type, - point, - cg_button, - ).map_err(|_| anyhow::anyhow!("Failed to create mouse up event"))?; - up_event.post(CGEventTapLocation::HID); - } // event_source and up_event dropped here - - Ok(()) - } - - async fn double_click(&self, button: MouseButton) -> Result<()> { - self.click(button).await?; - tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; - self.click(button).await?; - Ok(()) - } - - async fn type_text(&self, text: &str) -> Result<()> { - for ch in text.chars() { - { - let event_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState) - .map_err(|_| anyhow::anyhow!("Failed to create event source"))?; - - // Create keyboard event for character - let event = CGEvent::new_keyboard_event( - event_source, - 0, // keycode (0 for unicode) - true, - ).map_err(|_| anyhow::anyhow!("Failed to create keyboard event"))?; - - // Set unicode string - let mut utf16_buf = [0u16; 2]; - let utf16_slice = ch.encode_utf16(&mut utf16_buf); - let utf16_chars: Vec = utf16_slice.iter().copied().collect(); - - event.set_string_from_utf16_unchecked(utf16_chars.as_slice()); - event.post(CGEventTapLocation::HID); - } // event_source and event dropped here - - tokio::time::sleep(tokio::time::Duration::from_millis(10)).await; - } - Ok(()) - } - - 
async fn press_key(&self, key: &str) -> Result<()> { - let keycode = self.key_to_keycode(key)?; - - { - let event_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState) - .map_err(|_| anyhow::anyhow!("Failed to create event source"))?; - - // Key down - let down_event = CGEvent::new_keyboard_event( - event_source, - keycode, - true, - ).map_err(|_| anyhow::anyhow!("Failed to create key down event"))?; - down_event.post(CGEventTapLocation::HID); - } // event_source and down_event dropped here - - tokio::time::sleep(tokio::time::Duration::from_millis(50)).await; - - { - let event_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState) - .map_err(|_| anyhow::anyhow!("Failed to create event source"))?; - - // Key up - let up_event = CGEvent::new_keyboard_event( - event_source, - keycode, - false, - ).map_err(|_| anyhow::anyhow!("Failed to create key up event"))?; - up_event.post(CGEventTapLocation::HID); - } // event_source and up_event dropped here - - Ok(()) - } - - async fn list_windows(&self) -> Result> { - // Note: Full implementation would use CGWindowListCopyWindowInfo - // For now, return empty list as this requires more complex FFI - tracing::warn!("list_windows not fully implemented on macOS"); - Ok(vec![]) - } - - async fn focus_window(&self, _window_id: &str) -> Result<()> { - // Note: Full implementation would use NSWorkspace to activate application - tracing::warn!("focus_window not fully implemented on macOS"); - Ok(()) - } - - async fn get_window_bounds(&self, _window_id: &str) -> Result { - // Note: Full implementation would use Accessibility API - tracing::warn!("get_window_bounds not fully implemented on macOS"); - Ok(Rect { x: 0, y: 0, width: 800, height: 600 }) - } - - async fn find_element(&self, _selector: &ElementSelector) -> Result> { - // Note: Full implementation would use macOS Accessibility API - tracing::warn!("find_element not fully implemented on macOS"); - Ok(None) - } - - async fn 
get_element_text(&self, _element_id: &str) -> Result { - // Note: Full implementation would use Accessibility API - tracing::warn!("get_element_text not fully implemented on macOS"); - Ok(String::new()) - } - - async fn get_element_bounds(&self, _element_id: &str) -> Result { - // Note: Full implementation would use Accessibility API - tracing::warn!("get_element_bounds not fully implemented on macOS"); - Ok(Rect { x: 0, y: 0, width: 100, height: 30 }) - } - - async fn take_screenshot(&self, path: &str, _region: Option, window_id: Option<&str>) -> Result<()> { - // Use native macOS screencapture command which handles all the format complexities - - // Check if we have Screen Recording permission by attempting a test capture - // If we only get wallpaper/menubar but no windows, we need permission - let needs_permission_check = std::env::var("G3_SKIP_PERMISSION_CHECK").is_err(); - - if needs_permission_check { - // Try to open Screen Recording settings if this is the first screenshot - static PERMISSION_PROMPTED: std::sync::atomic::AtomicBool = std::sync::atomic::AtomicBool::new(false); - - if !PERMISSION_PROMPTED.swap(true, std::sync::atomic::Ordering::Relaxed) { - tracing::warn!("\n=== Screen Recording Permission Required ===\n\ - macOS requires explicit permission to capture window content.\n\ - If screenshots only show wallpaper/menubar (no windows):\n\n\ - 1. Open System Settings > Privacy & Security > Screen Recording\n\ - 2. Enable permission for your terminal (iTerm/Terminal) or g3\n\ - 3. 
Restart your terminal if needed\n\n\ - Opening Screen Recording settings now...\n"); - - // Try to open the settings (non-blocking) - let _ = std::process::Command::new("open") - .arg("x-apple.systempreferences:com.apple.preference.security?Privacy_ScreenCapture") - .spawn(); - } - } - - let path_obj = Path::new(path); - if let Some(parent) = path_obj.parent() { - std::fs::create_dir_all(parent)?; - } - - let mut cmd = std::process::Command::new("screencapture"); - - // Add flags - cmd.arg("-x"); // No sound - - if let Some(window_id) = window_id { - // Capture specific window by getting its bounds and using region capture - // window_id format: "AppName" or "AppName:WindowTitle" - let app_name = window_id.split(':').next().unwrap_or(window_id); - - // Use AppleScript to get window bounds - let script = format!( - r#"tell application "{}" - tell current window - get bounds - end tell - end tell"#, - app_name - ); - - let output = std::process::Command::new("osascript") - .arg("-e") - .arg(&script) - .output() - .map_err(|e| anyhow::anyhow!("Failed to get window bounds: {}", e))?; - - if output.status.success() { - let bounds_str = String::from_utf8_lossy(&output.stdout); - let bounds: Vec = bounds_str - .trim() - .split(',') - .filter_map(|s| s.trim().parse().ok()) - .collect(); - - if bounds.len() == 4 { - let (left, top, right, bottom) = (bounds[0], bounds[1], bounds[2], bounds[3]); - let width = right - left; - let height = bottom - top; - - cmd.arg("-R"); - cmd.arg(format!("{},{},{},{}", left, top, width, height)); - - tracing::debug!("Capturing window '{}' at region: {},{} {}x{}", app_name, left, top, width, height); - } else { - tracing::warn!("Failed to parse window bounds, capturing full screen"); - } - } else { - tracing::warn!("Failed to get window bounds for '{}', capturing full screen", app_name); - } - } else if let Some(region) = _region { - // Capture specific region: -R x,y,width,height - cmd.arg("-R"); - cmd.arg(format!("{},{},{},{}", region.x, 
region.y, region.width, region.height)); - } - - cmd.arg(path); - - let output = cmd.output() - .map_err(|e| anyhow::anyhow!("Failed to execute screencapture: {}", e))?; - - if !output.status.success() { - let stderr = String::from_utf8_lossy(&output.stderr); - anyhow::bail!("screencapture failed: {}", stderr); - } - - tracing::debug!("Screenshot saved using screencapture: {}", path); - - Ok(()) - } - - } - - async fn extract_text_from_screen(&self, region: Rect) -> Result { - // Take screenshot of region first - let temp_path = format!("/tmp/g3_ocr_{}.png", uuid::Uuid::new_v4()); - self.take_screenshot(&temp_path, Some(region), None).await?; - - // Extract text from the screenshot - let result = self.extract_text_from_image(&temp_path).await?; - - // Clean up temp file - let _ = std::fs::remove_file(&temp_path); - - Ok(result) - } - - async fn extract_text_from_image(&self, _path: &str) -> Result { - // Check if tesseract is available on the system - let tesseract_check = std::process::Command::new("which") - .arg("tesseract") - .output(); - - if tesseract_check.is_err() || !tesseract_check.as_ref().unwrap().status.success() { - anyhow::bail!("Tesseract OCR is not installed on your system.\n\n\ - To install tesseract:\n macOS: brew install tesseract\n \ - Linux: sudo apt-get install tesseract-ocr (Ubuntu/Debian)\n \ - sudo yum install tesseract (RHEL/CentOS)\n \ - Windows: Download from https://github.com/UB-Mannheim/tesseract/wiki\n\n\ - After installation, restart your terminal and try again."); - } - - // Initialize Tesseract - let tess = Tesseract::new(None, Some("eng")) - .map_err(|e| { - anyhow::anyhow!("Failed to initialize Tesseract: {}\n\n\ - This usually means:\n1. Tesseract is not properly installed\n\ - 2. 
Language data files are missing\n\nTo fix:\n \ - macOS: brew reinstall tesseract\n \ - Linux: sudo apt-get install tesseract-ocr-eng\n \ - Windows: Reinstall tesseract and ensure language files are included", e) - })?; - - let text = tess.set_image(_path) - .map_err(|e| anyhow::anyhow!("Failed to load image '{}': {}", _path, e))? - .get_text() - .map_err(|e| anyhow::anyhow!("Failed to extract text from image: {}", e))?; - - // Get confidence (simplified - would need more complex API calls for per-word confidence) - let confidence = 0.85; // Placeholder - - Ok(OCRResult { - text, - confidence, - bounds: Rect { x: 0, y: 0, width: 0, height: 0 }, // Would need image dimensions - }) - } - - async fn find_text_on_screen(&self, _text: &str) -> Result> { - // Check if tesseract is available on the system - let tesseract_check = std::process::Command::new("which") - .arg("tesseract") - .output(); - - if tesseract_check.is_err() || !tesseract_check.as_ref().unwrap().status.success() { - anyhow::bail!("Tesseract OCR is not installed on your system.\n\n\ - To install tesseract:\n macOS: brew install tesseract\n \ - Linux: sudo apt-get install tesseract-ocr (Ubuntu/Debian)\n \ - sudo yum install tesseract (RHEL/CentOS)\n \ - Windows: Download from https://github.com/UB-Mannheim/tesseract/wiki\n\n\ - After installation, restart your terminal and try again."); - } - - // Take full screen screenshot - let temp_path = format!("/tmp/g3_ocr_search_{}.png", uuid::Uuid::new_v4()); - self.take_screenshot(&temp_path, None, None).await?; - - // Use Tesseract to find text with bounding boxes - let tess = Tesseract::new(None, Some("eng")) - .map_err(|e| { - anyhow::anyhow!("Failed to initialize Tesseract: {}\n\n\ - This usually means:\n1. Tesseract is not properly installed\n\ - 2. 
Language data files are missing\n\nTo fix:\n \ - macOS: brew reinstall tesseract\n \ - Linux: sudo apt-get install tesseract-ocr-eng\n \ - Windows: Reinstall tesseract and ensure language files are included", e) - })?; - - let full_text = tess.set_image(temp_path.as_str()) - .map_err(|e| anyhow::anyhow!("Failed to load screenshot: {}", e))? - .get_text() - .map_err(|e| anyhow::anyhow!("Failed to extract text from screen: {}", e))?; - - // Clean up temp file - let _ = std::fs::remove_file(&temp_path); - - // Simple text search - full implementation would use get_component_images - // to get bounding boxes for each word - if full_text.contains(_text) { - tracing::warn!("Text found but precise coordinates not available in simplified implementation"); - Ok(Some(Point { x: 0, y: 0 })) - } else { - Ok(None) - } - } -} diff --git a/crates/g3-computer-control/src/platform/macos_window_matching_test.rs b/crates/g3-computer-control/src/platform/macos_window_matching_test.rs new file mode 100644 index 0000000..387988f --- /dev/null +++ b/crates/g3-computer-control/src/platform/macos_window_matching_test.rs @@ -0,0 +1,45 @@ +#[cfg(test)] +mod window_matching_tests { + /// Test that window name matching handles spaces correctly + /// + /// Issue: When a user requests a screenshot of "Goose Studio" but the actual + /// application name is "GooseStudio" (no space), the fuzzy matching should + /// still find the window. + /// + /// The fix normalizes both names by removing spaces before comparing. 
+ #[test] + fn test_space_normalization() { + let test_cases = vec![ + // (user_input, actual_app_name, should_match) + ("Goose Studio", "GooseStudio", true), + ("GooseStudio", "Goose Studio", true), + ("Visual Studio Code", "VisualStudioCode", true), + ("Google Chrome", "Google Chrome", true), + ("Safari", "Safari", true), + ("iTerm", "iTerm2", true), // fuzzy match + ("Code", "Visual Studio Code", true), // fuzzy match + ]; + + for (user_input, app_name, should_match) in test_cases { + let user_lower = user_input.to_lowercase(); + let app_lower = app_name.to_lowercase(); + + let user_normalized = user_lower.replace(" ", ""); + let app_normalized = app_lower.replace(" ", ""); + + let is_exact = app_lower == user_lower || app_normalized == user_normalized; + let is_fuzzy = app_lower.contains(&user_lower) + || user_lower.contains(&app_lower) + || app_normalized.contains(&user_normalized) + || user_normalized.contains(&app_normalized); + + let matches = is_exact || is_fuzzy; + + assert_eq!( + matches, should_match, + "Expected '{}' vs '{}' to match={}, but got match={}", + user_input, app_name, should_match, matches + ); + } + } +} diff --git a/crates/g3-computer-control/src/platform/windows.rs b/crates/g3-computer-control/src/platform/windows.rs index 6213d56..f3250f7 100644 --- a/crates/g3-computer-control/src/platform/windows.rs +++ b/crates/g3-computer-control/src/platform/windows.rs @@ -62,10 +62,15 @@ impl ComputerController for WindowsController { } async fn take_screenshot(&self, _path: &str, _region: Option, _window_id: Option<&str>) -> Result<()> { + // Enforce that window_id must be provided + if _window_id.is_none() { + anyhow::bail!("window_id is required. You must specify which window to capture (e.g., 'Chrome', 'Terminal', 'Notepad'). 
Use list_windows to see available windows."); + } + anyhow::bail!("Windows implementation not yet available") } - async fn extract_text_from_screen(&self, _region: Rect) -> Result { + async fn extract_text_from_screen(&self, _region: Rect, _window_id: &str) -> Result { anyhow::bail!("Windows implementation not yet available") } diff --git a/crates/g3-computer-control/src/types.rs b/crates/g3-computer-control/src/types.rs index e7ea40e..7d09042 100644 --- a/crates/g3-computer-control/src/types.rs +++ b/crates/g3-computer-control/src/types.rs @@ -7,3 +7,13 @@ pub struct Rect { pub width: i32, pub height: i32, } + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct TextLocation { + pub text: String, + pub x: i32, + pub y: i32, + pub width: i32, + pub height: i32, + pub confidence: f32, +} diff --git a/crates/g3-computer-control/tests/integration_test.rs b/crates/g3-computer-control/tests/integration_test.rs index 75c884f..87227e5 100644 --- a/crates/g3-computer-control/tests/integration_test.rs +++ b/crates/g3-computer-control/tests/integration_test.rs @@ -1,23 +1,5 @@ use g3_computer_control::*; -#[tokio::test] -async fn test_mouse_movement() { - let controller = create_controller().expect("Failed to create controller"); - - // Move mouse to center of screen (assuming 1920x1080) - let result = controller.move_mouse(960, 540).await; - assert!(result.is_ok(), "Failed to move mouse: {:?}", result.err()); -} - -#[tokio::test] -async fn test_typing() { - let controller = create_controller().expect("Failed to create controller"); - - // Type some text - let result = controller.type_text("Hello, World!").await; - assert!(result.is_ok(), "Failed to type text: {:?}", result.err()); -} - #[tokio::test] async fn test_screenshot() { let controller = create_controller().expect("Failed to create controller"); @@ -33,30 +15,3 @@ async fn test_screenshot() { // Clean up let _ = std::fs::remove_file(path); } - -#[tokio::test] -async fn test_click() { - let controller = 
create_controller().expect("Failed to create controller"); - - // Click at a safe location - let result = controller.click(types::MouseButton::Left).await; - assert!(result.is_ok(), "Failed to click: {:?}", result.err()); -} - -#[tokio::test] -async fn test_double_click() { - let controller = create_controller().expect("Failed to create controller"); - - // Double click - let result = controller.double_click(types::MouseButton::Left).await; - assert!(result.is_ok(), "Failed to double click: {:?}", result.err()); -} - -#[tokio::test] -async fn test_press_key() { - let controller = create_controller().expect("Failed to create controller"); - - // Press escape key - let result = controller.press_key("escape").await; - assert!(result.is_ok(), "Failed to press key: {:?}", result.err()); -} diff --git a/crates/g3-computer-control/vision-bridge/Package.swift b/crates/g3-computer-control/vision-bridge/Package.swift new file mode 100644 index 0000000..76d0503 --- /dev/null +++ b/crates/g3-computer-control/vision-bridge/Package.swift @@ -0,0 +1,24 @@ +// swift-tools-version:5.9 +import PackageDescription + +let package = Package( + name: "VisionBridge", + platforms: [ + .macOS(.v11) + ], + products: [ + .library( + name: "VisionBridge", + type: .dynamic, + targets: ["VisionBridge"] + ), + ], + targets: [ + .target( + name: "VisionBridge", + dependencies: [], + path: "Sources/VisionBridge", + publicHeadersPath: "." 
+ ), + ] +) diff --git a/crates/g3-computer-control/vision-bridge/Sources/VisionBridge/VisionBridge.h b/crates/g3-computer-control/vision-bridge/Sources/VisionBridge/VisionBridge.h new file mode 100644 index 0000000..a83d1dc --- /dev/null +++ b/crates/g3-computer-control/vision-bridge/Sources/VisionBridge/VisionBridge.h @@ -0,0 +1,39 @@ +#ifndef VisionBridge_h +#define VisionBridge_h + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// Text box structure for FFI +typedef struct { + const char* text; + uint32_t text_len; + int32_t x; + int32_t y; + int32_t width; + int32_t height; + float confidence; +} VisionTextBox; + +// Recognize text in an image and return bounding boxes +// Returns true on success, false on failure +// Caller must free the returned boxes using vision_free_boxes +bool vision_recognize_text( + const char* image_path, + uint32_t image_path_len, + VisionTextBox** out_boxes, + uint32_t* out_count +); + +// Free memory allocated by vision_recognize_text +void vision_free_boxes(VisionTextBox* boxes, uint32_t count); + +#ifdef __cplusplus +} +#endif + +#endif /* VisionBridge_h */ diff --git a/crates/g3-computer-control/vision-bridge/Sources/VisionBridge/VisionOCR.swift b/crates/g3-computer-control/vision-bridge/Sources/VisionBridge/VisionOCR.swift new file mode 100644 index 0000000..5ff12d0 --- /dev/null +++ b/crates/g3-computer-control/vision-bridge/Sources/VisionBridge/VisionOCR.swift @@ -0,0 +1,145 @@ +import Foundation +import Vision +import AppKit +import CoreGraphics + +// MARK: - C Bridge Functions + +@_cdecl("vision_recognize_text") +public func vision_recognize_text( + _ imagePath: UnsafePointer, + _ imagePathLen: UInt32, + _ outBoxes: UnsafeMutablePointer, + _ outCount: UnsafeMutablePointer +) -> Bool { + // Convert C string to Swift String + guard let pathData = Data(bytes: imagePath, count: Int(imagePathLen)).withUnsafeBytes({ + String(bytes: $0, encoding: .utf8) + }) else { + return false + } + + let path = 
pathData.trimmingCharacters(in: .whitespaces) + + // Load image + guard let image = NSImage(contentsOfFile: path), + let cgImage = image.cgImage(forProposedRect: nil, context: nil, hints: nil) else { + return false + } + + // Perform OCR + var textBoxes: [CTextBox] = [] + let semaphore = DispatchSemaphore(value: 0) + var success = false + + let request = VNRecognizeTextRequest { request, error in + defer { semaphore.signal() } + + if let error = error { + print("Vision OCR error: \(error.localizedDescription)") + return + } + + guard let observations = request.results as? [VNRecognizedTextObservation] else { + return + } + + let imageSize = CGSize(width: cgImage.width, height: cgImage.height) + + for observation in observations { + guard let candidate = observation.topCandidates(1).first else { continue } + + let text = candidate.string + let boundingBox = observation.boundingBox + + // Convert normalized coordinates (bottom-left origin) to pixel coordinates (top-left origin) + let x = Int32(boundingBox.origin.x * imageSize.width) + let y = Int32((1.0 - boundingBox.origin.y - boundingBox.height) * imageSize.height) + let width = Int32(boundingBox.width * imageSize.width) + let height = Int32(boundingBox.height * imageSize.height) + + // Allocate C string for text + let cString = strdup(text) + + textBoxes.append(CTextBox( + text: cString, + text_len: UInt32(text.utf8.count), + x: x, + y: y, + width: width, + height: height, + confidence: observation.confidence + )) + } + + success = true + } + + // Configure request for best accuracy + request.recognitionLevel = .accurate + request.usesLanguageCorrection = true + request.recognitionLanguages = ["en-US"] + + // Perform request + let handler = VNImageRequestHandler(cgImage: cgImage, options: [:]) + do { + try handler.perform([request]) + } catch { + print("Vision request failed: \(error.localizedDescription)") + return false + } + + // Wait for completion + semaphore.wait() + + if !success { + return false + } + + // 
Allocate array for results + let boxesPtr = UnsafeMutablePointer.allocate(capacity: textBoxes.count) + for (index, box) in textBoxes.enumerated() { + boxesPtr[index] = box + } + + outBoxes.pointee = UnsafeMutableRawPointer(boxesPtr) + outCount.pointee = UInt32(textBoxes.count) + + return true +} + +@_cdecl("vision_free_boxes") +public func vision_free_boxes( + _ boxes: UnsafeMutableRawPointer, + _ count: UInt32 +) { + let typedBoxes = boxes.assumingMemoryBound(to: CTextBox.self) + for i in 0..? + public let text_len: UInt32 + public let x: Int32 + public let y: Int32 + public let width: Int32 + public let height: Int32 + public let confidence: Float + + public init(text: UnsafePointer?, text_len: UInt32, x: Int32, y: Int32, width: Int32, height: Int32, confidence: Float) { + self.text = text + self.text_len = text_len + self.x = x + self.y = y + self.width = width + self.height = height + self.confidence = confidence + } +} diff --git a/crates/g3-config/src/lib.rs b/crates/g3-config/src/lib.rs index 4b6dc9d..272367d 100644 --- a/crates/g3-config/src/lib.rs +++ b/crates/g3-config/src/lib.rs @@ -8,6 +8,7 @@ pub struct Config { pub agent: AgentConfig, pub computer_control: ComputerControlConfig, pub webdriver: WebDriverConfig, + pub macax: MacAxConfig, } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -79,6 +80,19 @@ pub struct WebDriverConfig { pub safari_port: u16, } +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct MacAxConfig { + pub enabled: bool, +} + +impl Default for MacAxConfig { + fn default() -> Self { + Self { + enabled: false, + } + } +} + impl Default for WebDriverConfig { fn default() -> Self { Self { @@ -124,6 +138,7 @@ impl Default for Config { }, computer_control: ComputerControlConfig::default(), webdriver: WebDriverConfig::default(), + macax: MacAxConfig::default(), } } } @@ -238,6 +253,7 @@ impl Config { }, computer_control: ComputerControlConfig::default(), webdriver: WebDriverConfig::default(), + macax: MacAxConfig::default(), } } 
diff --git a/crates/g3-core/src/fixed_filter_json.rs b/crates/g3-core/src/fixed_filter_json.rs index 67dfa59..5ed6a89 100644 --- a/crates/g3-core/src/fixed_filter_json.rs +++ b/crates/g3-core/src/fixed_filter_json.rs @@ -156,15 +156,15 @@ pub fn fixed_filter_json_tool_calls(content: &str) -> String { } // No JSON tool call detected, return only the new content we haven't returned yet - let new_content = if state.buffer.len() > state.content_returned_up_to { + + + if state.buffer.len() > state.content_returned_up_to { let result = state.buffer[state.content_returned_up_to..].to_string(); state.content_returned_up_to = state.buffer.len(); result } else { String::new() - }; - - new_content + } }) } diff --git a/crates/g3-core/src/lib.rs b/crates/g3-core/src/lib.rs index 13308b8..686be36 100644 --- a/crates/g3-core/src/lib.rs +++ b/crates/g3-core/src/lib.rs @@ -60,6 +60,12 @@ pub struct StreamingToolParser { json_tool_start: Option, } +impl Default for StreamingToolParser { + fn default() -> Self { + Self::new() + } +} + impl StreamingToolParser { pub fn new() -> Self { Self { @@ -408,7 +414,12 @@ Format this as a detailed but concise summary that can be used to resume the con } /// Reset the context window with a summary - pub fn reset_with_summary(&mut self, summary: String, latest_user_message: Option) { + pub fn reset_with_summary(&mut self, summary: String, latest_user_message: Option) -> usize { + // Calculate chars saved (old history minus new summary) + let old_chars: usize = self.conversation_history.iter() + .map(|m| m.content.len()) + .sum(); + // Clear the conversation history self.conversation_history.clear(); self.used_tokens = 0; @@ -427,6 +438,11 @@ Format this as a detailed but concise summary that can be used to resume the con content: user_msg, }); } + + let new_chars: usize = self.conversation_history.iter() + .map(|m| m.content.len()) + .sum(); + old_chars.saturating_sub(new_chars) } /// Check if we should trigger context thinning @@ -457,7 +473,7 
@@ Format this as a detailed but concise summary that can be used to resume the con /// Perform context thinning: scan first third of conversation and replace large tool results /// Returns a summary message about what was thinned - pub fn thin_context(&mut self) -> String { + pub fn thin_context(&mut self) -> (String, usize) { let current_percentage = self.percentage_used() as u32; let current_threshold = (current_percentage / 10) * 10; @@ -469,24 +485,25 @@ Format this as a detailed but concise summary that can be used to resume the con let first_third_end = (total_messages / 3).max(1); let mut leaned_count = 0; + let mut tool_call_leaned_count = 0; let mut chars_saved = 0; // Create ~/tmp directory if it doesn't exist let tmp_dir = shellexpand::tilde("~/tmp").to_string(); if let Err(e) = std::fs::create_dir_all(&tmp_dir) { warn!("Failed to create ~/tmp directory: {}", e); - return format!("āš ļø Context thinning failed: could not create ~/tmp directory"); + return ("āš ļø Context thinning failed: could not create ~/tmp directory".to_string(), 0); } // Scan the first third of messages for i in 0..first_third_end { if let Some(message) = self.conversation_history.get_mut(i) { - // Only process User messages that look like tool results + // Process User messages that look like tool results if matches!(message.role, MessageRole::User) && message.content.starts_with("Tool result:") { let content_len = message.content.len(); - // Only thin if the content is greater than 1000 chars - if content_len > 1000 { + // Only thin if the content is greater than 500 chars + if content_len > 500 { // Generate a unique filename based on timestamp and index let timestamp = std::time::SystemTime::now() .duration_since(std::time::UNIX_EPOCH) @@ -511,6 +528,109 @@ Format this as a detailed but concise summary that can be used to resume the con debug!("Thinned tool result {} ({} chars) to {}", i, original_len, file_path); } } + + // Process Assistant messages that contain tool calls 
with large arguments + if matches!(message.role, MessageRole::Assistant) { + // Try to parse the message content as JSON to find tool calls + let content = &message.content; + + // Look for JSON tool call patterns + if let Some(tool_call_start) = content.find(r#"{"tool":"#) + .or_else(|| content.find(r#"{ "tool":"#)) + .or_else(|| content.find(r#"{"tool" :"#)) + .or_else(|| content.find(r#"{ "tool" :"#)) + { + // Try to extract and parse the JSON tool call + let json_portion = &content[tool_call_start..]; + + // Find the end of the JSON object + if let Some(json_end) = Self::find_json_end(json_portion) { + let json_str = &json_portion[..=json_end]; + + // Try to parse as ToolCall + if let Ok(mut tool_call) = serde_json::from_str::(json_str) { + let mut modified = false; + + // Handle write_file tool calls + if tool_call.tool == "write_file" { + if let Some(args_obj) = tool_call.args.as_object_mut() { + // Extract content to avoid borrow issues + let content_info = args_obj.get("content") + .and_then(|v| v.as_str()) + .map(|s| (s.to_string(), s.len())); + + if let Some((content_str, content_len)) = content_info { + // Only thin if content is greater than 500 chars + if content_len > 500 { + let timestamp = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_secs(); + let filename = format!("leaned_write_file_content_{}_{}.txt", timestamp, i); + let file_path = format!("{}/{}", tmp_dir, filename); + + if std::fs::write(&file_path, &content_str).is_ok() { + args_obj.insert( + "content".to_string(), + serde_json::Value::String(format!("", file_path)) + ); + modified = true; + chars_saved += content_len; + tool_call_leaned_count += 1; + debug!("Thinned write_file content {} ({} chars) to {}", i, content_len, file_path); + } + } + } + } + } + + // Handle str_replace tool calls + if tool_call.tool == "str_replace" { + if let Some(args_obj) = tool_call.args.as_object_mut() { + // Extract diff to avoid borrow issues + let 
diff_info = args_obj.get("diff") + .and_then(|v| v.as_str()) + .map(|s| (s.to_string(), s.len())); + + if let Some((diff_str, diff_len)) = diff_info { + // Only thin if diff is greater than 500 chars + if diff_len > 500 { + let timestamp = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_secs(); + let filename = format!("leaned_str_replace_diff_{}_{}.txt", timestamp, i); + let file_path = format!("{}/{}", tmp_dir, filename); + + if std::fs::write(&file_path, &diff_str).is_ok() { + args_obj.insert( + "diff".to_string(), + serde_json::Value::String(format!("", file_path)) + ); + modified = true; + chars_saved += diff_len; + tool_call_leaned_count += 1; + debug!("Thinned str_replace diff {} ({} chars) to {}", i, diff_len, file_path); + } + } + } + } + } + + // If we modified the tool call, reconstruct the message + if modified { + let prefix = &content[..tool_call_start]; + let suffix = &content[tool_call_start + json_str.len()..]; + + // Serialize the modified tool call + if let Ok(new_json) = serde_json::to_string(&tool_call) { + message.content = format!("{}{}{}", prefix, new_json, suffix); + } + } + } + } + } + } } } @@ -518,11 +638,19 @@ Format this as a detailed but concise summary that can be used to resume the con self.recalculate_tokens(); if leaned_count > 0 { - format!("šŸ„’ Context thinned at {}%: {} tool results, ~{} chars saved", - current_threshold, leaned_count, chars_saved) + if tool_call_leaned_count > 0 { + (format!("šŸ„’ Context thinned at {}%: {} tool results + {} tool calls, ~{} chars saved", + current_threshold, leaned_count, tool_call_leaned_count, chars_saved), chars_saved) + } else { + (format!("šŸ„’ Context thinned at {}%: {} tool results, ~{} chars saved", + current_threshold, leaned_count, chars_saved), chars_saved) + } + } else if tool_call_leaned_count > 0 { + (format!("šŸ„’ Context thinned at {}%: {} tool calls, ~{} chars saved", + current_threshold, tool_call_leaned_count, chars_saved), 
chars_saved) } else { - format!("ℹ Context thinning triggered at {}% but no large tool results found in first third", - current_threshold) + (format!("ℹ Context thinning triggered at {}% but no large tool results or tool calls found in first third", + current_threshold), 0) } } @@ -536,11 +664,43 @@ Format this as a detailed but concise summary that can be used to resume the con debug!("Recalculated tokens after thinning: {} tokens", total); } + + /// Helper function to find the end of a JSON object + fn find_json_end(json_str: &str) -> Option { + let mut brace_count = 0; + let mut in_string = false; + let mut escape_next = false; + + for (i, ch) in json_str.char_indices() { + if escape_next { + escape_next = false; + continue; + } + + match ch { + '\\' => escape_next = true, + '"' if !escape_next => in_string = !in_string, + '{' if !in_string => brace_count += 1, + '}' if !in_string => { + brace_count -= 1; + if brace_count == 0 { + return Some(i); + } + } + _ => {} + } + } + + None + } } pub struct Agent { providers: ProviderRegistry, context_window: ContextWindow, + thinning_events: Vec, // chars saved per thinning event + summarization_events: Vec, // chars saved per summarization event + first_token_times: Vec, // time to first token for each completion config: Config, session_id: Option, tool_call_metrics: Vec<(String, Duration, bool)>, // (tool_name, duration, success) @@ -551,6 +711,7 @@ pub struct Agent { todo_content: std::sync::Arc>, webdriver_session: std::sync::Arc>>>>, safaridriver_process: std::sync::Arc>>, + macax_controller: std::sync::Arc>>, } impl Agent { @@ -761,9 +922,15 @@ impl Agent { None }; + // Capture macax_enabled before moving config + let macax_enabled = config.macax.enabled; + Ok(Self { providers, context_window, + thinning_events: Vec::new(), + summarization_events: Vec::new(), + first_token_times: Vec::new(), config, session_id: None, tool_call_metrics: Vec::new(), @@ -774,6 +941,12 @@ impl Agent { computer_controller, 
webdriver_session: std::sync::Arc::new(tokio::sync::RwLock::new(None)), safaridriver_process: std::sync::Arc::new(tokio::sync::RwLock::new(None)), + macax_controller: { + std::sync::Arc::new(tokio::sync::RwLock::new( + if macax_enabled { Some(g3_computer_control::MacAxController::new()?) } + else { None } + )) + }, }) } @@ -813,9 +986,7 @@ impl Agent { // Databricks models have varying context windows depending on the model if model_name.contains("claude") { 200000 // Claude models on Databricks have large context windows - } else if model_name.contains("llama") { - 32768 // Llama models typically support 32k context - } else if model_name.contains("dbrx") { + } else if model_name.contains("llama") || model_name.contains("dbrx") { 32768 // DBRX supports 32k context } else { 16384 // Conservative default for other Databricks models @@ -894,6 +1065,7 @@ impl Agent { .await } + #[allow(clippy::too_many_arguments)] pub async fn execute_task_with_timing_cancellable( &mut self, description: &str, @@ -1086,7 +1258,7 @@ Template: // Check if provider supports native tool calling and add tools if so let provider = self.providers.get(None)?; let tools = if provider.has_native_tool_calling() { - Some(Self::create_tool_definitions(self.config.webdriver.enabled)) + Some(Self::create_tool_definitions(self.config.webdriver.enabled, self.config.macax.enabled, self.config.computer_control.enabled)) } else { None }; @@ -1244,6 +1416,291 @@ Template: &self.context_window } + /// Manually trigger context summarization regardless of context window size + /// Returns Ok(true) if summarization was successful, Ok(false) if it failed + pub async fn force_summarize(&mut self) -> Result { + info!("Manual summarization triggered"); + + self.ui_writer.print_context_status(&format!( + "\nšŸ—œļø Manual summarization requested (current usage: {}%)...", + self.context_window.percentage_used() as u32 + )); + + // Create summary request with FULL history + let summary_prompt = 
self.context_window.create_summary_prompt(); + + // Get the full conversation history + let conversation_text = self + .context_window + .conversation_history + .iter() + .map(|m| format!("{:?}: {}", m.role, m.content)) + .collect::>() + .join("\n\n"); + + let summary_messages = vec![ + Message { + role: MessageRole::System, + content: "You are a helpful assistant that creates concise summaries." + .to_string(), + }, + Message { + role: MessageRole::User, + content: format!( + "Based on this conversation history, {}\n\nConversation:\n{}", + summary_prompt, conversation_text + ), + }, + ]; + + let provider = self.providers.get(None)?; + + // Dynamically calculate max_tokens for summary based on what's left + let summary_max_tokens = match provider.name() { + "databricks" | "anthropic" => { + let model_limit = 200_000u32; + let current_usage = self.context_window.used_tokens; + let available = model_limit + .saturating_sub(current_usage) + .saturating_sub(5000); + Some(available.min(10_000)) + } + "embedded" => { + let model_limit = self.context_window.total_tokens; + let current_usage = self.context_window.used_tokens; + let available = model_limit + .saturating_sub(current_usage) + .saturating_sub(1000); + Some(available.min(3000)) + } + _ => { + let available = self.context_window.remaining_tokens().saturating_sub(2000); + Some(available.min(5000)) + } + }; + + debug!( + "Requesting summary with max_tokens: {:?} (current usage: {} tokens)", + summary_max_tokens, self.context_window.used_tokens + ); + + let summary_request = CompletionRequest { + messages: summary_messages, + max_tokens: summary_max_tokens, + temperature: Some(0.3), + stream: false, + tools: None, + }; + + // Get the summary + match provider.complete(summary_request).await { + Ok(summary_response) => { + self.ui_writer.print_context_status( + "āœ… Context compacted successfully.\n", + ); + + // Get the latest user message to preserve it + let latest_user_msg = self + .context_window + 
.conversation_history + .iter() + .rev() + .find(|m| matches!(m.role, MessageRole::User)) + .map(|m| m.content.clone()); + + // Reset context with summary + let chars_saved = self.context_window + .reset_with_summary(summary_response.content, latest_user_msg); + self.summarization_events.push(chars_saved); + + Ok(true) + } + Err(e) => { + error!("Failed to create summary: {}", e); + self.ui_writer.print_context_status( + "āš ļø Unable to create summary. Please try again or start a new session.\n", + ); + Ok(false) + } + } + } + + /// Manually trigger context thinning regardless of thresholds + pub fn force_thin(&mut self) -> String { + info!("Manual context thinning triggered"); + let (message, chars_saved) = self.context_window.thin_context(); + self.thinning_events.push(chars_saved); + message + } + + /// Reload README.md and AGENTS.md and replace the first system message + /// Returns Ok(true) if README was found and reloaded, Ok(false) if no README was present initially + pub fn reload_readme(&mut self) -> Result { + info!("Manual README reload triggered"); + + // Check if the first message in conversation history is a system message with README content + let has_readme = self + .context_window + .conversation_history + .first() + .map(|m| matches!(m.role, MessageRole::System) && + (m.content.contains("Project README") || m.content.contains("Agent Configuration"))) + .unwrap_or(false); + + if !has_readme { + return Ok(false); + } + + // Try to load README.md and AGENTS.md + let mut combined_content = String::new(); + let mut found_any = false; + + if let Ok(agents_content) = std::fs::read_to_string("AGENTS.md") { + combined_content.push_str("# Agent Configuration\n\n"); + combined_content.push_str(&agents_content); + combined_content.push_str("\n\n"); + found_any = true; + } + + if let Ok(readme_content) = std::fs::read_to_string("README.md") { + combined_content.push_str("# Project README\n\n"); + combined_content.push_str(&readme_content); + found_any = 
true; + } + + if found_any { + // Replace the first message with the new content + if let Some(first_msg) = self.context_window.conversation_history.first_mut() { + first_msg.content = combined_content; + info!("README content reloaded successfully"); + Ok(true) + } else { + Ok(false) + } + } else { + Ok(false) + } + } + + /// Get detailed context statistics + pub fn get_stats(&self) -> String { + let mut stats = String::new(); + use std::time::Duration; + + stats.push_str("\nšŸ“Š Context Window Statistics\n"); + stats.push_str(&"=".repeat(60)); + stats.push_str("\n\n"); + + // Context window usage + stats.push_str("šŸ—‚ļø Context Window:\n"); + stats.push_str(&format!(" • Used Tokens: {:>10} / {}\n", + self.context_window.used_tokens, + self.context_window.total_tokens)); + stats.push_str(&format!(" • Usage Percentage: {:>10.1}%\n", + self.context_window.percentage_used())); + stats.push_str(&format!(" • Remaining Tokens: {:>10}\n", + self.context_window.remaining_tokens())); + stats.push_str(&format!(" • Cumulative Tokens: {:>10}\n", + self.context_window.cumulative_tokens)); + stats.push_str(&format!(" • Last Thinning: {:>10}%\n", + self.context_window.last_thinning_percentage)); + stats.push('\n'); + + // Context optimization metrics + stats.push_str("šŸ—œļø Context Optimization:\n"); + stats.push_str(&format!(" • Thinning Events: {:>10}\n", + self.thinning_events.len())); + if !self.thinning_events.is_empty() { + let total_thinned: usize = self.thinning_events.iter().sum(); + let avg_thinned = total_thinned / self.thinning_events.len(); + stats.push_str(&format!(" • Total Chars Saved: {:>10}\n", total_thinned)); + stats.push_str(&format!(" • Avg Chars/Event: {:>10}\n", avg_thinned)); + } + + stats.push_str(&format!(" • Summarizations: {:>10}\n", + self.summarization_events.len())); + if !self.summarization_events.is_empty() { + let total_summarized: usize = self.summarization_events.iter().sum(); + let avg_summarized = total_summarized / 
self.summarization_events.len(); + stats.push_str(&format!(" • Total Chars Saved: {:>10}\n", total_summarized)); + stats.push_str(&format!(" • Avg Chars/Event: {:>10}\n", avg_summarized)); + } + stats.push('\n'); + + // Performance metrics + stats.push_str("⚔ Performance:\n"); + if !self.first_token_times.is_empty() { + let avg_ttft = self.first_token_times.iter().sum::() / self.first_token_times.len() as u32; + let mut sorted_times = self.first_token_times.clone(); + sorted_times.sort(); + let median_ttft = sorted_times[sorted_times.len() / 2]; + stats.push_str(&format!(" • Avg Time to First Token: {:>6.3}s\n", avg_ttft.as_secs_f64())); + stats.push_str(&format!(" • Median Time to First Token: {:>6.3}s\n", median_ttft.as_secs_f64())); + } + stats.push('\n'); + + // Conversation history + stats.push_str("šŸ’¬ Conversation History:\n"); + stats.push_str(&format!(" • Total Messages: {:>10}\n", + self.context_window.conversation_history.len())); + + // Count messages by role + let mut system_count = 0; + let mut user_count = 0; + let mut assistant_count = 0; + + for msg in &self.context_window.conversation_history { + match msg.role { + MessageRole::System => system_count += 1, + MessageRole::User => user_count += 1, + MessageRole::Assistant => assistant_count += 1, + } + } + + stats.push_str(&format!(" • System Messages: {:>10}\n", system_count)); + stats.push_str(&format!(" • User Messages: {:>10}\n", user_count)); + stats.push_str(&format!(" • Assistant Messages:{:>10}\n", assistant_count)); + stats.push('\n'); + + // Tool call metrics + stats.push_str("šŸ”§ Tool Call Metrics:\n"); + stats.push_str(&format!(" • Total Tool Calls: {:>10}\n", + self.tool_call_metrics.len())); + + let successful_calls = self.tool_call_metrics.iter() + .filter(|(_, _, success)| *success) + .count(); + let failed_calls = self.tool_call_metrics.len() - successful_calls; + + stats.push_str(&format!(" • Successful: {:>10}\n", successful_calls)); + stats.push_str(&format!(" • Failed: 
{:>10}\n", failed_calls)); + + if !self.tool_call_metrics.is_empty() { + let total_duration: Duration = self.tool_call_metrics.iter() + .map(|(_, duration, _)| *duration) + .sum(); + let avg_duration = total_duration / self.tool_call_metrics.len() as u32; + + stats.push_str(&format!(" • Total Duration: {:>10.2}s\n", + total_duration.as_secs_f64())); + stats.push_str(&format!(" • Average Duration: {:>10.2}s\n", + avg_duration.as_secs_f64())); + } + stats.push('\n'); + + // Provider info + stats.push_str("šŸ”Œ Provider:\n"); + if let Ok((provider, model)) = self.get_provider_info() { + stats.push_str(&format!(" • Provider: {}\n", provider)); + stats.push_str(&format!(" • Model: {}\n", model)); + } + + stats.push_str(&"=".repeat(60)); + stats.push('\n'); + + stats + } + pub fn get_tool_call_metrics(&self) -> &Vec<(String, Duration, bool)> { &self.tool_call_metrics } @@ -1262,7 +1719,7 @@ Template: } /// Create tool definitions for native tool calling providers - fn create_tool_definitions(enable_webdriver: bool) -> Vec { + fn create_tool_definitions(enable_webdriver: bool, enable_macax: bool, enable_computer_control: bool) -> Vec { let mut tools = vec![ Tool { name: "shell".to_string(), @@ -1360,7 +1817,7 @@ Template: }, Tool { name: "take_screenshot".to_string(), - description: "Capture a screenshot of the screen, region, or window. When capturing a specific application window (e.g., 'Safari', 'Terminal'), use the window_id parameter with just the application name. The tool will automatically use the native screencapture command with the application's window ID for a clean capture.".to_string(), + description: "Capture a screenshot of a specific application window. You MUST specify the window_id parameter with the application name (e.g., 'Safari', 'Terminal', 'Google Chrome'). The tool will automatically use the native screencapture command with the application's window ID for a clean capture. 
Use list_windows first to identify available windows.".to_string(), input_schema: json!({ "type": "object", "properties": { @@ -1370,7 +1827,7 @@ Template: }, "window_id": { "type": "string", - "description": "Optional application name to capture (e.g., 'Safari', 'Terminal', 'Google Chrome'). The tool will capture the frontmost window of that application using its native window ID." + "description": "REQUIRED: Application name to capture (e.g., 'Safari', 'Terminal', 'Google Chrome'). The tool will capture the frontmost window of that application using its native window ID." }, "region": { "type": "object", @@ -1382,12 +1839,12 @@ Template: } } }, - "required": ["path"] + "required": ["path", "window_id"] }), }, Tool { name: "extract_text".to_string(), - description: "Extract text from a screen region or image file using OCR".to_string(), + description: "Extract text from an image file using OCR. For extracting text from a specific window, use vision_find_text instead which automatically handles window capture.".to_string(), input_schema: json!({ "type": "object", "properties": { @@ -1395,16 +1852,6 @@ Template: "type": "string", "description": "Path to image file (optional if region is provided)" }, - "region": { - "type": "object", - "description": "Screen region to capture and extract text from", - "properties": { - "x": {"type": "integer"}, - "y": {"type": "integer"}, - "width": {"type": "integer"}, - "height": {"type": "integer"} - } - } } }), }, @@ -1617,6 +2064,182 @@ Template: ]); } + // Add macOS Accessibility tools if enabled + if enable_macax { + tools.extend(vec![ + Tool { + name: "macax_list_apps".to_string(), + description: "List all running applications that can be controlled via macOS Accessibility API".to_string(), + input_schema: json!({ + "type": "object", + "properties": {}, + "required": [] + }), + }, + Tool { + name: "macax_get_frontmost_app".to_string(), + description: "Get the name of the currently active (frontmost) application".to_string(), 
+ input_schema: json!({ + "type": "object", + "properties": {}, + "required": [] + }), + }, + Tool { + name: "macax_activate_app".to_string(), + description: "Bring an application to the front (activate it)".to_string(), + input_schema: json!({ + "type": "object", + "properties": { + "app_name": { + "type": "string", + "description": "Name of the application to activate (e.g., 'Safari', 'TextEdit')" + } + }, + "required": ["app_name"] + }), + }, + Tool { + name: "macax_press_key".to_string(), + description: "Press a keyboard key or shortcut in an application (e.g., Cmd+S to save)".to_string(), + input_schema: json!({ + "type": "object", + "properties": { + "app_name": { + "type": "string", + "description": "Name of the application" + }, + "key": { + "type": "string", + "description": "Key to press (e.g., 's', 'return', 'tab')" + }, + "modifiers": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Modifier keys (e.g., ['command', 'shift'])" + } + }, + "required": ["app_name", "key"] + }), + }, + ]); + + // Add type_text tool for typing arbitrary text + tools.push(Tool { + name: "macax_type_text".to_string(), + description: "Type arbitrary text into the currently focused element in an application (supports unicode, emojis, etc.)".to_string(), + input_schema: json!({ + "type": "object", + "properties": { + "app_name": { + "type": "string", + "description": "Name of the application" + }, + "text": { + "type": "string", + "description": "Text to type (can include unicode, emojis, special characters)" + } + }, + "required": ["app_name", "text"] + }), + }); + + } + + // Add extract_text_with_boxes tool (requires macax flag) + if enable_macax { + tools.push(Tool { + name: "extract_text_with_boxes".to_string(), + description: "Extract all text from an image file with bounding box coordinates for each text element. Returns JSON array with text, position (x, y), size (width, height), and confidence for each detected text. 
Uses Apple Vision Framework for precise sub-pixel accuracy.".to_string(), + input_schema: json!({ + "type": "object", + "properties": { + "path": { + "type": "string", + "description": "Path to image file to extract text from" + }, + "app_name": { + "type": "string", + "description": "Optional: Name of application to screenshot first (e.g., 'Safari', 'Things3'). If provided, takes screenshot of app before extracting text." + } + }, + "required": ["path"] + }), + }); + } + + // Add vision-guided tools (requires computer control) + if enable_computer_control { + // Add vision-guided tools + tools.push(Tool { + name: "vision_find_text".to_string(), + description: "Find text in a specific application window and return its location with bounding box coordinates (x, y, width, height) and confidence score. Useful for locating UI elements. Uses Apple Vision Framework for precise sub-pixel accuracy.".to_string(), + input_schema: json!({ + "type": "object", + "properties": { + "app_name": { + "type": "string", + "description": "Name of the application to search in (e.g., 'Things3', 'Safari', 'TextEdit')" + }, + "text": { + "type": "string", + "description": "The text to search for on screen" + } + }, + "required": ["app_name", "text"] + }), + }); + + tools.push(Tool { + name: "vision_click_text".to_string(), + description: "Find text in a specific application window and click on it (useful for clicking buttons, links, menu items)".to_string(), + input_schema: json!({ + "type": "object", + "properties": { + "app_name": { + "type": "string", + "description": "Name of the application (e.g., 'Things3', 'Safari', 'TextEdit')" + }, + "text": { + "type": "string", + "description": "The text to click on (e.g., 'Submit', 'OK', 'Cancel', '+')" + } + }, + "required": ["app_name", "text"] + }), + }); + + tools.push(Tool { + name: "vision_click_near_text".to_string(), + description: "Find text in a specific application window and click near it (useful for clicking text fields next to 
labels)".to_string(), + input_schema: json!({ + "type": "object", + "properties": { + "app_name": { + "type": "string", + "description": "Name of the application (e.g., 'Things3', 'Safari', 'TextEdit')" + }, + "text": { + "type": "string", + "description": "The label text to find (e.g., 'Name:', 'Email:', 'Task:')" + }, + "direction": { + "type": "string", + "enum": ["right", "below", "left", "above"], + "description": "Direction to click relative to the text (default: right)" + }, + "distance": { + "type": "integer", + "description": "Distance in pixels from the text (default: 50)" + } + }, + "required": ["app_name", "text"] + }), + }); + } + tools } @@ -1793,8 +2416,9 @@ Template: .map(|m| m.content.clone()); // Reset context with summary - self.context_window + let chars_saved = self.context_window .reset_with_summary(summary_response.content, latest_user_msg); + self.summarization_events.push(chars_saved); // Update the request with new context request.messages = self.context_window.conversation_history.clone(); @@ -1923,6 +2547,10 @@ Template: // Record time to first token if first_token_time.is_none() && !chunk.content.is_empty() { first_token_time = Some(stream_start.elapsed()); + // Record in agent metrics + if let Some(ttft) = first_token_time { + self.first_token_times.push(ttft); + } } chunks_received += 1; @@ -1938,15 +2566,15 @@ Template: let completed_tools = parser.process_chunk(&chunk); // Handle completed tool calls - for tool_call in completed_tools { + if let Some(tool_call) = completed_tools.into_iter().next() { debug!("Processing completed tool call: {:?}", tool_call); // Check if we should thin the context BEFORE executing the tool if self.context_window.should_thin() { - let thin_summary = self.context_window.thin_context(); + let (thin_summary, chars_saved) = self.context_window.thin_context(); + self.thinning_events.push(chars_saved); // Print the thinning summary to the user - self.ui_writer.println(""); - 
self.ui_writer.print_context_status(&format!("{}\n", thin_summary)); + self.ui_writer.print_context_thinning(&thin_summary); } // Track what we've already displayed before getting new text @@ -2020,18 +2648,16 @@ Template: } else { s.clone() } + } else if s.len() > 100 { + // Use char_indices to respect UTF-8 boundaries + let truncated = s + .char_indices() + .take(100) + .map(|(_, c)| c) + .collect::(); + format!("{}...", truncated) } else { - if s.len() > 100 { - // Use char_indices to respect UTF-8 boundaries - let truncated = s - .char_indices() - .take(100) - .map(|(_, c)| c) - .collect::(); - format!("{}...", truncated) - } else { - s.clone() - } + s.clone() } } _ => value.to_string(), @@ -2053,7 +2679,7 @@ Template: Ok(result) => result?, Err(_) => { warn!("Tool call {} timed out after 8 minutes", tool_call.tool); - format!("āŒ Tool execution timed out after 8 minutes") + "āŒ Tool execution timed out after 8 minutes".to_string() } }; let exec_duration = exec_start.elapsed(); @@ -2068,14 +2694,26 @@ Template: // Display tool execution result with proper indentation if tool_call.tool != "final_output" { - let output_lines: Vec<&str> = tool_result.lines().collect(); + // Skip displaying output for shell tool since it was already streamed + let should_display_output = tool_call.tool != "shell"; + + let output_lines: Vec<&str> = if should_display_output { + tool_result.lines().collect() + } else { vec![] }; + // Check if UI wants full output (machine mode) or truncated (human mode) + let wants_full = self.ui_writer.wants_full_output(); + // Helper function to safely truncate strings at character boundaries - let truncate_line = |line: &str, max_width: usize| -> String { - let char_count = line.chars().count(); - if char_count <= max_width { + let truncate_line = |line: &str, max_width: usize, truncate: bool| -> String { + if !truncate { + // Machine mode - return full line + line.to_string() + } else if line.chars().count() <= max_width { + // Human mode - line 
fits within limit line.to_string() } else { + // Human mode - truncate long line let truncated: String = line .chars() .take(max_width.saturating_sub(3)) @@ -2090,25 +2728,26 @@ Template: // For todo tools, show all lines without truncation let is_todo_tool = tool_call.tool == "todo_read" || tool_call.tool == "todo_write"; - let max_lines_to_show = if is_todo_tool { output_len } else { MAX_LINES }; + let max_lines_to_show = if is_todo_tool || wants_full { output_len } else { MAX_LINES }; for (idx, line) in output_lines.iter().enumerate() { - if !is_todo_tool && idx >= max_lines_to_show { + if !is_todo_tool && !wants_full && idx >= max_lines_to_show { break; } // Clip line to max width - let clipped_line = truncate_line(line, MAX_LINE_WIDTH); + let clipped_line = truncate_line(line, MAX_LINE_WIDTH, !wants_full); self.ui_writer.update_tool_output_line(&clipped_line); } - if !is_todo_tool && output_len > MAX_LINES { + if !is_todo_tool && !wants_full && output_len > MAX_LINES { self.ui_writer.print_tool_output_summary(output_len); } } // Check if this was a final_output tool call if tool_call.tool == "final_output" { - full_response.push_str(final_display_content); + // Don't add final_display_content here - it was already added before tool execution + // Adding it again would duplicate the output if let Some(summary) = tool_call.args.get("summary") { if let Some(summary_str) = summary.as_str() { full_response.push_str(&format!("\n\n{}", summary_str)); @@ -2178,7 +2817,7 @@ Template: // Ensure tools are included for native providers in subsequent iterations if provider.has_native_tool_calling() { - request.tools = Some(Self::create_tool_definitions(self.config.webdriver.enabled)); + request.tools = Some(Self::create_tool_definitions(self.config.webdriver.enabled, self.config.macax.enabled, self.config.computer_control.enabled)); } // Only add to full_response if we haven't already added it @@ -2572,13 +3211,16 @@ Template: { Ok(result) => { if result.success { - Ok(if 
result.stdout.is_empty() { - "āœ… Command executed successfully".to_string() - } else { - result.stdout.trim().to_string() - }) + // Don't return stdout - it was already streamed to the UI + // Returning it would cause duplicate output + Ok("āœ… Command executed successfully".to_string()) } else { - Ok(format!("āŒ Command failed: {}", result.stderr.trim())) + // For errors, return stderr since it wasn't streamed + Ok(if result.stderr.is_empty() { + "āŒ Command failed".to_string() + } else { + format!("āŒ Command failed: {}", result.stderr.trim()) + }) } } Err(e) => Ok(format!("āŒ Execution error: {}", e)), @@ -2969,14 +3611,14 @@ Template: // Write the result back to the file match std::fs::write(&file_path, &result) { - Ok(()) => Ok(format!("āœ… applied unified diff")), + Ok(()) => Ok("āœ… applied unified diff".to_string()), Err(e) => Ok(format!("āŒ Failed to write to file '{}': {}", file_path, e)), } } "final_output" => { if let Some(summary) = tool_call.args.get("summary") { if let Some(summary_str) = summary.as_str() { - Ok(format!("{}", summary_str)) + Ok(summary_str.to_string()) } else { Ok("āœ… Turn completed".to_string()) } @@ -2992,8 +3634,9 @@ Template: .and_then(|v| v.as_str()) .ok_or_else(|| anyhow::anyhow!("Missing path argument"))?; - // Extract window_id (app name) if provided - let window_id = tool_call.args.get("window_id").and_then(|v| v.as_str()); + // Extract window_id (app name) - REQUIRED + let window_id = tool_call.args.get("window_id").and_then(|v| v.as_str()) + .ok_or_else(|| anyhow::anyhow!("Missing window_id argument. 
You must specify which window to capture (e.g., 'Safari', 'Terminal', 'Google Chrome')."))?; // Extract region if provided let region = tool_call @@ -3013,7 +3656,7 @@ Template: .unwrap_or(0) as i32, }); - match controller.take_screenshot(path, region, window_id).await { + match controller.take_screenshot(path, region, Some(window_id)).await { Ok(_) => { // Get the actual path where the screenshot was saved let actual_path = if path.starts_with('/') { @@ -3027,14 +3670,10 @@ Template: format!("{}/{}", temp_dir.trim_end_matches('/'), path) }; - if let Some(app) = window_id { - Ok(format!( - "āœ… Screenshot of {} saved to: {}", - app, actual_path - )) - } else { - Ok(format!("āœ… Screenshot saved to: {}", actual_path)) - } + Ok(format!( + "āœ… Screenshot of {} saved to: {}", + window_id, actual_path + )) } Err(e) => Ok(format!("āŒ Failed to take screenshot: {}", e)), } @@ -3044,36 +3683,14 @@ Template: } "extract_text" => { if let Some(controller) = &self.computer_controller { - // Check if we have a path or a region - if let Some(path) = tool_call.args.get("path").and_then(|v| v.as_str()) { - // Extract text from image file - match controller.extract_text_from_image(path).await { - Ok(text) => Ok(format!("āœ… Extracted text:\n{}", text)), - Err(e) => Ok(format!("āŒ Failed to extract text: {}", e)), - } - } else if let Some(region_obj) = - tool_call.args.get("region").and_then(|v| v.as_object()) - { - // Extract text from screen region - let region = g3_computer_control::types::Rect { - x: region_obj.get("x").and_then(|v| v.as_i64()).unwrap_or(0) as i32, - y: region_obj.get("y").and_then(|v| v.as_i64()).unwrap_or(0) as i32, - width: region_obj - .get("width") - .and_then(|v| v.as_i64()) - .unwrap_or(0) as i32, - height: region_obj - .get("height") - .and_then(|v| v.as_i64()) - .unwrap_or(0) as i32, - }; - - match controller.extract_text_from_screen(region).await { - Ok(text) => Ok(format!("āœ… Extracted text:\n{}", text)), - Err(e) => Ok(format!("āŒ Failed to 
extract text: {}", e)), - } - } else { - Ok("āŒ Missing path or region argument".to_string()) + let path = tool_call.args.get("path") + .and_then(|v| v.as_str()) + .ok_or_else(|| anyhow::anyhow!("Missing path argument"))?; + + // Extract text from image file only + match controller.extract_text_from_image(path).await { + Ok(text) => Ok(format!("āœ… Extracted text:\n{}", text)), + Err(e) => Ok(format!("āŒ Failed to extract text: {}", e)), } } else { Ok("āŒ Computer control not enabled. Set computer_control.enabled = true in config.".to_string()) @@ -3538,6 +4155,309 @@ Template: Err(_) => Ok("āŒ Cannot quit: WebDriver session is still in use".to_string()), } } + "macax_list_apps" => { + debug!("Processing macax_list_apps tool call"); + + if !self.config.macax.enabled { + return Ok("āŒ macOS Accessibility is not enabled. Use --macax flag to enable.".to_string()); + } + + let controller_guard = self.macax_controller.read().await; + let controller = match controller_guard.as_ref() { + Some(c) => c, + None => return Ok("āŒ macOS Accessibility controller not initialized.".to_string()), + }; + + match controller.list_applications() { + Ok(apps) => { + let app_list: Vec = apps.iter().map(|a| a.name.clone()).collect(); + Ok(format!("Running applications:\n{}", app_list.join("\n"))) + } + Err(e) => Ok(format!("āŒ Failed to list applications: {}", e)), + } + } + "macax_get_frontmost_app" => { + debug!("Processing macax_get_frontmost_app tool call"); + + if !self.config.macax.enabled { + return Ok("āŒ macOS Accessibility is not enabled. 
Use --macax flag to enable.".to_string()); + } + + let controller_guard = self.macax_controller.read().await; + let controller = match controller_guard.as_ref() { + Some(c) => c, + None => return Ok("āŒ macOS Accessibility controller not initialized.".to_string()), + }; + + match controller.get_frontmost_app() { + Ok(app) => Ok(format!("Frontmost application: {}", app.name)), + Err(e) => Ok(format!("āŒ Failed to get frontmost app: {}", e)), + } + } + "macax_activate_app" => { + debug!("Processing macax_activate_app tool call"); + + if !self.config.macax.enabled { + return Ok("āŒ macOS Accessibility is not enabled. Use --macax flag to enable.".to_string()); + } + + let app_name = match tool_call.args.get("app_name").and_then(|v| v.as_str()) { + Some(n) => n, + None => return Ok("āŒ Missing app_name argument".to_string()), + }; + + let controller_guard = self.macax_controller.read().await; + let controller = match controller_guard.as_ref() { + Some(c) => c, + None => return Ok("āŒ macOS Accessibility controller not initialized.".to_string()), + }; + + match controller.activate_app(app_name) { + Ok(_) => Ok(format!("āœ… Activated application: {}", app_name)), + Err(e) => Ok(format!("āŒ Failed to activate app: {}", e)), + } + } + "macax_press_key" => { + debug!("Processing macax_press_key tool call"); + + if !self.config.macax.enabled { + return Ok("āŒ macOS Accessibility is not enabled. 
Use --macax flag to enable.".to_string()); + } + + let app_name = match tool_call.args.get("app_name").and_then(|v| v.as_str()) { + Some(n) => n, + None => return Ok("āŒ Missing app_name argument".to_string()), + }; + + let key = match tool_call.args.get("key").and_then(|v| v.as_str()) { + Some(k) => k, + None => return Ok("āŒ Missing key argument".to_string()), + }; + + let modifiers_vec: Vec<&str> = tool_call.args.get("modifiers") + .and_then(|v| v.as_array()) + .map(|arr| arr.iter() + .filter_map(|v| v.as_str()) + .collect()) + .unwrap_or_default(); + + let controller_guard = self.macax_controller.read().await; + let controller = match controller_guard.as_ref() { + Some(c) => c, + None => return Ok("āŒ macOS Accessibility controller not initialized.".to_string()), + }; + + match controller.press_key(app_name, key, modifiers_vec.clone()) { + Ok(_) => { + let modifier_str = if modifiers_vec.is_empty() { + String::new() + } else { + format!(" with modifiers: {}", modifiers_vec.join("+")) + }; + Ok(format!("āœ… Pressed key: {}{}", key, modifier_str)) + } + Err(e) => Ok(format!("āŒ Failed to press key: {}", e)), + } + } + "macax_type_text" => { + debug!("Processing macax_type_text tool call"); + + if !self.config.macax.enabled { + return Ok("āŒ macOS Accessibility is not enabled. 
Use --macax flag to enable.".to_string()); + } + + let app_name = match tool_call.args.get("app_name").and_then(|v| v.as_str()) { + Some(n) => n, + None => return Ok("āŒ Missing app_name argument".to_string()), + }; + + let text = match tool_call.args.get("text").and_then(|v| v.as_str()) { + Some(t) => t, + None => return Ok("āŒ Missing text argument".to_string()), + }; + + let controller_guard = self.macax_controller.read().await; + let controller = match controller_guard.as_ref() { + Some(c) => c, + None => return Ok("āŒ macOS Accessibility controller not initialized.".to_string()), + }; + + match controller.type_text(app_name, text) { + Ok(_) => Ok(format!("āœ… Typed text into {}", app_name)), + Err(e) => Ok(format!("āŒ Failed to type text: {}", e)), + } + } + "vision_find_text" => { + debug!("Processing vision_find_text tool call"); + + if let Some(controller) = &self.computer_controller { + let app_name = tool_call.args.get("app_name") + .and_then(|v| v.as_str()) + .ok_or_else(|| anyhow::anyhow!("Missing app_name parameter"))?; + + let text = tool_call.args.get("text") + .and_then(|v| v.as_str()) + .ok_or_else(|| anyhow::anyhow!("Missing text parameter"))?; + + match controller.find_text_in_app(app_name, text).await { + Ok(Some(location)) => { + Ok(format!( + "āœ… Found '{}' in {} at position ({}, {}) with size {}x{} (confidence: {:.0}%)", + location.text, app_name, location.x, location.y, location.width, location.height, + location.confidence * 100.0 + )) + } + Ok(None) => Ok(format!("āŒ Could not find '{}' in {}", text, app_name)), + Err(e) => Ok(format!("āŒ Error finding text: {}", e)), + } + } else { + Ok("āŒ Computer control not enabled. 
Set computer_control.enabled = true in config.".to_string()) + } + } + "vision_click_text" => { + debug!("Processing vision_click_text tool call"); + + if let Some(controller) = &self.computer_controller { + let app_name = tool_call.args.get("app_name") + .and_then(|v| v.as_str()) + .ok_or_else(|| anyhow::anyhow!("Missing app_name parameter"))?; + + let text = tool_call.args.get("text") + .and_then(|v| v.as_str()) + .ok_or_else(|| anyhow::anyhow!("Missing text parameter"))?; + + match controller.find_text_in_app(app_name, text).await { + Ok(Some(location)) => { + // Click on center of text + // IMPORTANT: location coordinates are in NSScreen space (Y=0 at BOTTOM, increases UPWARD) + // location.x is the LEFT edge of the bounding box + // location.y is the TOP edge of the bounding box (highest Y value in NSScreen space) + // location.width and location.height are already scaled to screen space + // To get center: we need to add half the SCALED width and subtract half the SCALED height + + if location.width == 0 || location.height == 0 { + return Ok(format!("āŒ Invalid bounding box dimensions: width={}, height={}", location.width, location.height)); + } + + debug!("[vision_click_text] Location from find_text_in_app: x={}, y={}, width={}, height={}, text='{}'", + location.x, location.y, location.width, location.height, location.text); + + // Calculate center using the SCALED dimensions + // X: Use right edge instead of center (Vision OCR bounding box seems offset) + // This gives us: left edge + full width = right edge + // Y: top edge - half of scaled height (subtract because Y increases upward) + let click_x = location.x + location.width; // Right edge + let half_height = location.height / 2; + let click_y = location.y - half_height; + + debug!("[vision_click_text] Click position calculation: x={} + {} = {} (right edge), y={} - {} = {}", + location.x, location.width, click_x, location.y, half_height, click_y); + debug!("[vision_click_text] This means: left_edge={}, 
center={}, right_edge={}", + location.x, click_x, location.x + location.width); + + match controller.click_at(click_x, click_y, Some(app_name)) { + Ok(_) => Ok(format!("āœ… Clicked on '{}' in {} at ({}, {})", text, app_name, click_x, click_y)), + Err(e) => Ok(format!("āŒ Failed to click: {}", e)), + } + } + Ok(None) => Ok(format!("āŒ Could not find '{}' in {}", text, app_name)), + Err(e) => Ok(format!("āŒ Error finding text: {}", e)), + } + } else { + Ok("āŒ Computer control not enabled. Set computer_control.enabled = true in config.".to_string()) + } + } + "extract_text_with_boxes" => { + debug!("Processing extract_text_with_boxes tool call"); + + if !self.config.macax.enabled { + return Ok("āŒ extract_text_with_boxes requires --macax flag to be enabled".to_string()); + } + + if let Some(controller) = &self.computer_controller { + let path = tool_call.args.get("path") + .and_then(|v| v.as_str()) + .ok_or_else(|| anyhow::anyhow!("Missing path parameter"))?; + + // Optional: take screenshot of app first + let final_path = if let Some(app_name) = tool_call.args.get("app_name").and_then(|v| v.as_str()) { + let temp_path = format!("/tmp/g3_extract_boxes_{}.png", uuid::Uuid::new_v4()); + match controller.take_screenshot(&temp_path, None, Some(app_name)).await { + Ok(_) => temp_path, + Err(e) => return Ok(format!("āŒ Failed to take screenshot: {}", e)), + } + } else { + path.to_string() + }; + + // Extract text with locations + match controller.extract_text_with_locations(&final_path).await { + Ok(locations) => { + // Clean up temp file if we created one + if final_path != path { + let _ = std::fs::remove_file(&final_path); + } + + // Return as JSON + match serde_json::to_string_pretty(&locations) { + Ok(json) => Ok(format!("āœ… Extracted {} text elements:\n{}", locations.len(), json)), + Err(e) => Ok(format!("āŒ Failed to serialize results: {}", e)), + } + } + Err(e) => Ok(format!("āŒ Failed to extract text: {}", e)), + } + } else { + Ok("āŒ Computer control 
not enabled. Set computer_control.enabled = true in config.".to_string()) + } + } + "vision_click_near_text" => { + debug!("Processing vision_click_near_text tool call"); + + if let Some(controller) = &self.computer_controller { + let app_name = tool_call.args.get("app_name") + .and_then(|v| v.as_str()) + .ok_or_else(|| anyhow::anyhow!("Missing app_name parameter"))?; + + let text = tool_call.args.get("text") + .and_then(|v| v.as_str()) + .ok_or_else(|| anyhow::anyhow!("Missing text parameter"))?; + + let direction = tool_call.args.get("direction") + .and_then(|v| v.as_str()) + .unwrap_or("right"); + + let distance = tool_call.args.get("distance") + .and_then(|v| v.as_i64()) + .unwrap_or(50) as i32; + + match controller.find_text_in_app(app_name, text).await { + Ok(Some(location)) => { + // Calculate click position based on direction + // location.x is LEFT edge, location.y is TOP edge (in NSScreen space) + let (click_x, click_y) = match direction { + "right" => (location.x + location.width + distance, location.y - (location.height / 2)), + "below" => (location.x + (location.width / 2), location.y - location.height - distance), + "left" => (location.x - distance, location.y - (location.height / 2)), + "above" => (location.x + (location.width / 2), location.y + distance), + _ => (location.x + location.width + distance, location.y - (location.height / 2)), + }; + debug!("[vision_click_near_text] Clicking {} of text at ({}, {})", direction, click_x, click_y); + + match controller.click_at(click_x, click_y, Some(app_name)) { + Ok(_) => Ok(format!( + "āœ… Clicked {} of '{}' in {} at ({}, {})", + direction, text, app_name, click_x, click_y + )), + Err(e) => Ok(format!("āŒ Failed to click: {}", e)), + } + } + Ok(None) => Ok(format!("āŒ Could not find '{}' in {}", text, app_name)), + Err(e) => Ok(format!("āŒ Error finding text: {}", e)), + } + } else { + Ok("āŒ Computer control not enabled. 
Set computer_control.enabled = true in config.".to_string()) + } + } _ => { warn!("Unknown tool: {}", tool_call.tool); Ok(format!("ā“ Unknown tool: {}", tool_call.tool)) @@ -3721,8 +4641,7 @@ fn parse_unified_diff_hunks(diff: &str) -> Vec<(String, String)> { } } - if line.starts_with(' ') { - let content = &line[1..]; + if let Some(content) = line.strip_prefix(' ') { old_lines.push(content.to_string()); new_lines.push(content.to_string()); } else if line.starts_with('+') && !line.starts_with("+++") { diff --git a/crates/g3-core/src/project.rs b/crates/g3-core/src/project.rs index 5028455..edaa954 100644 --- a/crates/g3-core/src/project.rs +++ b/crates/g3-core/src/project.rs @@ -104,6 +104,7 @@ impl Project { } /// Recursively check a directory for implementation files + #[allow(clippy::only_used_in_recursion)] fn check_dir_for_implementation_files(&self, dir: &Path) -> bool { // Common source file extensions let extensions = vec![ diff --git a/crates/g3-core/src/take_screenshot_test.rs b/crates/g3-core/src/take_screenshot_test.rs new file mode 100644 index 0000000..a90d81e --- /dev/null +++ b/crates/g3-core/src/take_screenshot_test.rs @@ -0,0 +1,37 @@ +// Test to verify take_screenshot requires window_id + +#[cfg(test)] +mod take_screenshot_tests { + use super::*; + use serde_json::json; + + #[test] + fn test_take_screenshot_requires_window_id() { + // Create a tool call without window_id + let tool_call = ToolCall { + tool: "take_screenshot".to_string(), + args: json!({ + "path": "test.png" + }), + }; + + // Verify that window_id is missing + assert!(tool_call.args.get("window_id").is_none()); + } + + #[test] + fn test_take_screenshot_with_window_id() { + // Create a tool call with window_id + let tool_call = ToolCall { + tool: "take_screenshot".to_string(), + args: json!({ + "path": "test.png", + "window_id": "Safari" + }), + }; + + // Verify that window_id is present + assert!(tool_call.args.get("window_id").is_some()); + 
assert_eq!(tool_call.args.get("window_id").unwrap().as_str().unwrap(), "Safari"); + } +} diff --git a/crates/g3-core/src/ui_writer.rs b/crates/g3-core/src/ui_writer.rs index 1b532e7..49e29b9 100644 --- a/crates/g3-core/src/ui_writer.rs +++ b/crates/g3-core/src/ui_writer.rs @@ -17,6 +17,9 @@ pub trait UiWriter: Send + Sync { /// Print a context window status message fn print_context_status(&self, message: &str); + /// Print a context thinning success message with highlight and animation + fn print_context_thinning(&self, message: &str); + /// Print a tool execution header fn print_tool_header(&self, tool_name: &str); @@ -49,6 +52,10 @@ pub trait UiWriter: Send + Sync { /// Flush any buffered output fn flush(&self); + + /// Returns true if this UI writer wants full, untruncated output + /// Default is false (truncate for human readability) + fn wants_full_output(&self) -> bool { false } } /// A no-op implementation for when UI output is not needed @@ -60,6 +67,7 @@ impl UiWriter for NullUiWriter { fn print_inline(&self, _message: &str) {} fn print_system_prompt(&self, _prompt: &str) {} fn print_context_status(&self, _message: &str) {} + fn print_context_thinning(&self, _message: &str) {} fn print_tool_header(&self, _tool_name: &str) {} fn print_tool_arg(&self, _key: &str, _value: &str) {} fn print_tool_output_header(&self) {} @@ -71,4 +79,5 @@ impl UiWriter for NullUiWriter { fn print_agent_response(&self, _content: &str) {} fn notify_sse_received(&self) {} fn flush(&self) {} + fn wants_full_output(&self) -> bool { false } } \ No newline at end of file diff --git a/crates/g3-core/tests/test_context_thinning.rs b/crates/g3-core/tests/test_context_thinning.rs index 760524f..db6761f 100644 --- a/crates/g3-core/tests/test_context_thinning.rs +++ b/crates/g3-core/tests/test_context_thinning.rs @@ -72,7 +72,7 @@ fn test_thin_context_basic() { // Trigger thinning at 50% context.used_tokens = 5000; - let summary = context.thin_context(); + let (summary, _chars_saved) = 
context.thin_context(); println!("Thinning summary: {}", summary); @@ -93,6 +93,119 @@ fn test_thin_context_basic() { } } +#[test] +fn test_thin_write_file_tool_calls() { + let mut context = ContextWindow::new(10000); + + // Add some messages including a write_file tool call with large content + context.add_message(Message { + role: MessageRole::User, + content: "Please create a large file".to_string(), + }); + + // Add an assistant message with a write_file tool call containing large content + let large_content = "x".repeat(1500); + let tool_call_json = format!( + r#"{{"tool": "write_file", "args": {{"file_path": "test.txt", "content": "{}"}}}}"#, + large_content + ); + context.add_message(Message { + role: MessageRole::Assistant, + content: format!("I'll create that file.\n\n{}", tool_call_json), + }); + + context.add_message(Message { + role: MessageRole::User, + content: "Tool result: āœ… Successfully wrote 1500 lines".to_string(), + }); + + // Add more messages to ensure we have enough for "first third" logic + for i in 0..6 { + context.add_message(Message { + role: MessageRole::Assistant, + content: format!("Response {}", i), + }); + } + + // Trigger thinning at 50% + context.used_tokens = 5000; + let (summary, _chars_saved) = context.thin_context(); + + println!("Thinning summary: {}", summary); + + // Should have thinned the write_file tool call + assert!(summary.contains("tool call") || summary.contains("chars saved")); + + // Check that the large content was replaced with a file reference + let first_third_end = context.conversation_history.len() / 3; + for i in 0..first_third_end { + if let Some(msg) = context.conversation_history.get(i) { + if matches!(msg.role, MessageRole::Assistant) && msg.content.contains("write_file") { + // The content should now reference an external file + assert!(msg.content.contains(" Result { + // Check if this is a detached/daemon command that should run independently + let is_detached = code.trim_start().starts_with("setsid 
") + || code.trim_start().starts_with("nohup ") + || code.contains(" disown") + || (code.contains(" &") && (code.contains("nohup") || code.contains("setsid"))); + + if is_detached { + // For detached commands, just spawn and return immediately + use std::process::Stdio; + Command::new("bash") + .arg("-c") + .arg(code) + .stdin(Stdio::null()) + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .spawn()?; + + return Ok(ExecutionResult { + stdout: "āœ… Command launched in background (detached process)".to_string(), + stderr: String::new(), + exit_code: 0, + success: true, + }); + } + let output = Command::new("bash") .arg("-c") .arg(code) @@ -221,6 +246,29 @@ impl CodeExecutor { use tokio::io::{AsyncBufReadExt, BufReader}; use tokio::process::Command as TokioCommand; + // Check if this is a detached/daemon command that should run independently + // Look for patterns like: setsid, nohup with &, or explicit backgrounding with disown + let is_detached = code.trim_start().starts_with("setsid ") + || code.trim_start().starts_with("nohup ") + || code.contains(" disown") + || (code.contains(" &") && (code.contains("nohup") || code.contains("setsid"))); + + if is_detached { + // For detached commands, just spawn and return immediately + TokioCommand::new("bash") + .arg("-c") + .arg(code) + .spawn()?; + + // Don't wait for the process - it's meant to run independently + return Ok(ExecutionResult { + stdout: "āœ… Command launched in background (detached process)".to_string(), + stderr: String::new(), + exit_code: 0, + success: true, + }); + } + let mut child = TokioCommand::new("bash") .arg("-c") .arg(code) @@ -259,7 +307,7 @@ impl CodeExecutor { line = stderr_lines.next_line() => { match line { Ok(Some(line)) => { - receiver.on_output_line(&format!("{}", line)); + receiver.on_output_line(&line.to_string()); stderr_output.push(line); } Ok(None) => {}, // stderr EOF, continue diff --git a/crates/g3-providers/src/databricks.rs b/crates/g3-providers/src/databricks.rs index 
68fa413..50373d6 100644 --- a/crates/g3-providers/src/databricks.rs +++ b/crates/g3-providers/src/databricks.rs @@ -213,7 +213,7 @@ impl DatabricksProvider { let mut builder = self .client - .post(&format!( + .post(format!( "{}/serving-endpoints/{}/invocations", self.host, self.model )) @@ -881,6 +881,14 @@ impl LLMProvider for DatabricksProvider { "Processing Databricks streaming request with {} messages", request.messages.len() ); + + // Debug: Log tool count + if let Some(ref tools) = request.tools { + debug!("Request has {} tools", tools.len()); + for tool in tools.iter().take(5) { + debug!(" Tool: {}", tool.name); + } + } let max_tokens = request.max_tokens.unwrap_or(self.max_tokens); let temperature = request.temperature.unwrap_or(self.temperature); diff --git a/crates/g3-providers/src/oauth.rs b/crates/g3-providers/src/oauth.rs index 406d893..75c9d50 100644 --- a/crates/g3-providers/src/oauth.rs +++ b/crates/g3-providers/src/oauth.rs @@ -102,7 +102,7 @@ async fn get_workspace_endpoints(host: &str) -> Result { if !resp.status().is_success() { return Err(anyhow::anyhow!( "Failed to get OIDC configuration from {}", - oidc_url.to_string() + oidc_url )); } diff --git a/crates/g3-providers/src/openai.rs b/crates/g3-providers/src/openai.rs index e8b4dab..52ad6b0 100644 --- a/crates/g3-providers/src/openai.rs +++ b/crates/g3-providers/src/openai.rs @@ -259,7 +259,7 @@ impl LLMProvider for OpenAIProvider { let response = self .client - .post(&format!("{}/chat/completions", self.base_url)) + .post(format!("{}/chat/completions", self.base_url)) .header("Authorization", format!("Bearer {}", self.api_key)) .json(&body) .send() @@ -318,7 +318,7 @@ impl LLMProvider for OpenAIProvider { let response = self .client - .post(&format!("{}/chat/completions", self.base_url)) + .post(format!("{}/chat/completions", self.base_url)) .header("Authorization", format!("Bearer {}", self.api_key)) .json(&body) .send() diff --git a/docs/coach-player-providers.md 
b/docs/coach-player-providers.md deleted file mode 100644 index d1e05e4..0000000 --- a/docs/coach-player-providers.md +++ /dev/null @@ -1,75 +0,0 @@ -# Coach-Player Provider Configuration - -G3 now supports specifying different LLM providers for the coach and player agents when running in autonomous mode. This allows you to optimize for different requirements: - -- **Player**: The agent that implements code - might benefit from a faster, more cost-effective model -- **Coach**: The agent that reviews code - might benefit from a more powerful, analytical model - -## Configuration - -In your `config.toml` file, under the `[providers]` section, you can specify: - -```toml -[providers] -default_provider = "databricks" # Used for normal operations -coach = "databricks" # Provider for coach (code reviewer) -player = "anthropic" # Provider for player (code implementer) -``` - -If `coach` or `player` are not specified, they will default to using the `default_provider`. - -## Example Use Cases - -### Cost Optimization -Use a cheaper, faster model for initial implementations (player) and a more powerful model for review (coach): - -```toml -coach = "anthropic" # Claude Sonnet for thorough review -player = "anthropic" # Claude Haiku for quick implementation -``` - -### Speed vs Quality Trade-off -Use a local embedded model for fast iterations (player) and a cloud model for quality review (coach): - -```toml -coach = "databricks" # Cloud model for quality review -player = "embedded" # Local model for fast implementation -``` - -### Specialized Models -Use different models optimized for different tasks: - -```toml -coach = "databricks" # Model fine-tuned for code review -player = "openai" # Model optimized for code generation -``` - -## Requirements - -- Both providers must be properly configured in your config file -- Each provider must have valid credentials -- The models specified for each provider must be accessible - -## How It Works - -When running in autonomous mode (`g3 
--autonomous`), the system will: - -1. Use the `player` provider (or default) for the initial implementation -2. Switch to the `coach` provider (or default) for code review -3. Return to the `player` provider for implementing feedback -4. Continue this cycle for the specified number of turns - -The providers are logged at startup so you can verify which models are being used: - -``` -šŸŽ® Player provider: anthropic -šŸ‘Øā€šŸ« Coach provider: databricks -ā„¹ļø Using different providers for player and coach -``` - -## Benefits - -- **Cost Efficiency**: Use expensive models only where they add the most value -- **Speed Optimization**: Use faster models for iterative development -- **Specialization**: Leverage models that excel at specific tasks -- **Flexibility**: Easy to experiment with different provider combinations diff --git a/test-ai-requirements.sh b/test-ai-requirements.sh new file mode 100755 index 0000000..06c97fc --- /dev/null +++ b/test-ai-requirements.sh @@ -0,0 +1,39 @@ +#!/bin/bash +# Test script for AI-enhanced interactive requirements mode + +echo "Testing AI-enhanced interactive requirements mode..." +echo "" + +# Create a test workspace +TEST_WORKSPACE="/tmp/g3-test-interactive-$(date +%s)" +mkdir -p "$TEST_WORKSPACE" + +echo "Test workspace: $TEST_WORKSPACE" +echo "" + +# Create sample brief input +BRIEF_INPUT="build a calculator cli in rust with basic operations" + +echo "Brief input:" +echo "---" +echo "$BRIEF_INPUT" +echo "---" +echo "" + +echo "This will:" +echo "1. Send brief input to AI" +echo "2. AI generates structured requirements.md" +echo "3. Show enhanced requirements" +echo "4. 
Prompt for confirmation (y/e/n)" +echo "" + +echo "To test manually, run:" +echo "cargo run -- --autonomous --interactive-requirements --workspace $TEST_WORKSPACE" +echo "" +echo "Then type: $BRIEF_INPUT" +echo "Press Ctrl+D" +echo "Review the AI-generated requirements" +echo "Choose 'y' to proceed, 'e' to edit, or 'n' to cancel" +echo "" + +echo "Test workspace will be at: $TEST_WORKSPACE"