webdriver tools
This commit is contained in:
739
Cargo.lock
generated
739
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -321,6 +321,10 @@ pub struct Cli {
|
|||||||
/// Disable log file creation (no logs/ directory or session logs)
|
/// Disable log file creation (no logs/ directory or session logs)
|
||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
pub quiet: bool,
|
pub quiet: bool,
|
||||||
|
|
||||||
|
/// Enable WebDriver tools for browser automation (Safari)
|
||||||
|
#[arg(long)]
|
||||||
|
pub webdriver: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn run() -> Result<()> {
|
pub async fn run() -> Result<()> {
|
||||||
@@ -409,12 +413,17 @@ pub async fn run() -> Result<()> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Load configuration with CLI overrides
|
// Load configuration with CLI overrides
|
||||||
let config = Config::load_with_overrides(
|
let mut config = Config::load_with_overrides(
|
||||||
cli.config.as_deref(),
|
cli.config.as_deref(),
|
||||||
cli.provider.clone(),
|
cli.provider.clone(),
|
||||||
cli.model.clone(),
|
cli.model.clone(),
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
|
// Override webdriver setting from CLI flag
|
||||||
|
if cli.webdriver {
|
||||||
|
config.webdriver.enabled = true;
|
||||||
|
}
|
||||||
|
|
||||||
// Validate provider if specified
|
// Validate provider if specified
|
||||||
if let Some(ref provider) = cli.provider {
|
if let Some(ref provider) = cli.provider {
|
||||||
let valid_providers = ["anthropic", "databricks", "embedded", "openai"];
|
let valid_providers = ["anthropic", "databricks", "embedded", "openai"];
|
||||||
|
|||||||
@@ -13,9 +13,13 @@ serde_json = { workspace = true }
|
|||||||
tracing = { workspace = true }
|
tracing = { workspace = true }
|
||||||
uuid = { workspace = true }
|
uuid = { workspace = true }
|
||||||
|
|
||||||
|
shellexpand = "3.1"
|
||||||
# Async trait support
|
# Async trait support
|
||||||
async-trait = "0.1"
|
async-trait = "0.1"
|
||||||
|
|
||||||
|
# WebDriver support
|
||||||
|
fantoccini = "0.21"
|
||||||
|
|
||||||
# OCR dependencies
|
# OCR dependencies
|
||||||
tesseract = "0.14"
|
tesseract = "0.14"
|
||||||
|
|
||||||
|
|||||||
64
crates/g3-computer-control/examples/safari_demo.rs
Normal file
64
crates/g3-computer-control/examples/safari_demo.rs
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
use g3_computer_control::SafariDriver;
|
||||||
|
use g3_computer_control::webdriver::WebDriverController;
|
||||||
|
use anyhow::Result;
|
||||||
|
|
||||||
|
#[tokio::main]
|
||||||
|
async fn main() -> Result<()> {
|
||||||
|
println!("Safari WebDriver Demo");
|
||||||
|
println!("=====================\n");
|
||||||
|
|
||||||
|
println!("Make sure to:");
|
||||||
|
println!("1. Enable 'Allow Remote Automation' in Safari's Develop menu");
|
||||||
|
println!("2. Run: /usr/bin/safaridriver --enable");
|
||||||
|
println!("3. Start safaridriver in another terminal: safaridriver --port 4444\n");
|
||||||
|
|
||||||
|
println!("Connecting to SafariDriver...");
|
||||||
|
let mut driver = SafariDriver::new().await?;
|
||||||
|
println!("✅ Connected!\n");
|
||||||
|
|
||||||
|
// Navigate to a website
|
||||||
|
println!("Navigating to example.com...");
|
||||||
|
driver.navigate("https://example.com").await?;
|
||||||
|
println!("✅ Navigated\n");
|
||||||
|
|
||||||
|
// Get page title
|
||||||
|
let title = driver.title().await?;
|
||||||
|
println!("Page title: {}\n", title);
|
||||||
|
|
||||||
|
// Get current URL
|
||||||
|
let url = driver.current_url().await?;
|
||||||
|
println!("Current URL: {}\n", url);
|
||||||
|
|
||||||
|
// Find an element
|
||||||
|
println!("Finding h1 element...");
|
||||||
|
let mut h1 = driver.find_element("h1").await?;
|
||||||
|
let h1_text = h1.text().await?;
|
||||||
|
println!("H1 text: {}\n", h1_text);
|
||||||
|
|
||||||
|
// Find all paragraphs
|
||||||
|
println!("Finding all paragraphs...");
|
||||||
|
let paragraphs = driver.find_elements("p").await?;
|
||||||
|
println!("Found {} paragraphs\n", paragraphs.len());
|
||||||
|
|
||||||
|
// Get page source
|
||||||
|
println!("Getting page source...");
|
||||||
|
let source = driver.page_source().await?;
|
||||||
|
println!("Page source length: {} bytes\n", source.len());
|
||||||
|
|
||||||
|
// Execute JavaScript
|
||||||
|
println!("Executing JavaScript...");
|
||||||
|
let result = driver.execute_script("return document.title", vec![]).await?;
|
||||||
|
println!("JS result: {:?}\n", result);
|
||||||
|
|
||||||
|
// Take a screenshot
|
||||||
|
println!("Taking screenshot...");
|
||||||
|
driver.screenshot("/tmp/safari_demo.png").await?;
|
||||||
|
println!("✅ Screenshot saved to /tmp/safari_demo.png\n");
|
||||||
|
|
||||||
|
// Close the browser
|
||||||
|
println!("Closing browser...");
|
||||||
|
driver.quit().await?;
|
||||||
|
println!("✅ Done!");
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
@@ -1,5 +1,9 @@
|
|||||||
pub mod types;
|
pub mod types;
|
||||||
pub mod platform;
|
pub mod platform;
|
||||||
|
pub mod webdriver;
|
||||||
|
|
||||||
|
// Re-export webdriver types for convenience
|
||||||
|
pub use webdriver::{WebDriverController, WebElement, safari::SafariDriver};
|
||||||
|
|
||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
@@ -7,32 +11,12 @@ use types::*;
|
|||||||
|
|
||||||
#[async_trait]
|
#[async_trait]
|
||||||
pub trait ComputerController: Send + Sync {
|
pub trait ComputerController: Send + Sync {
|
||||||
// Mouse operations
|
|
||||||
async fn move_mouse(&self, x: i32, y: i32) -> Result<()>;
|
|
||||||
async fn click(&self, button: MouseButton) -> Result<()>;
|
|
||||||
async fn double_click(&self, button: MouseButton) -> Result<()>;
|
|
||||||
|
|
||||||
// Keyboard operations
|
|
||||||
async fn type_text(&self, text: &str) -> Result<()>;
|
|
||||||
async fn press_key(&self, key: &str) -> Result<()>;
|
|
||||||
|
|
||||||
// Window management
|
|
||||||
async fn list_windows(&self) -> Result<Vec<Window>>;
|
|
||||||
async fn focus_window(&self, window_id: &str) -> Result<()>;
|
|
||||||
async fn get_window_bounds(&self, window_id: &str) -> Result<Rect>;
|
|
||||||
|
|
||||||
// UI element inspection
|
|
||||||
async fn find_element(&self, selector: &ElementSelector) -> Result<Option<UIElement>>;
|
|
||||||
async fn get_element_text(&self, element_id: &str) -> Result<String>;
|
|
||||||
async fn get_element_bounds(&self, element_id: &str) -> Result<Rect>;
|
|
||||||
|
|
||||||
// Screen capture
|
// Screen capture
|
||||||
async fn take_screenshot(&self, path: &str, region: Option<Rect>, window_id: Option<&str>) -> Result<()>;
|
async fn take_screenshot(&self, path: &str, region: Option<Rect>, window_id: Option<&str>) -> Result<()>;
|
||||||
|
|
||||||
// OCR operations
|
// OCR operations
|
||||||
async fn extract_text_from_screen(&self, region: Rect) -> Result<OCRResult>;
|
async fn extract_text_from_screen(&self, region: Rect) -> Result<String>;
|
||||||
async fn extract_text_from_image(&self, path: &str) -> Result<OCRResult>;
|
async fn extract_text_from_image(&self, path: &str) -> Result<String>;
|
||||||
async fn find_text_on_screen(&self, text: &str) -> Result<Option<Point>>;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Platform-specific constructor
|
// Platform-specific constructor
|
||||||
|
|||||||
@@ -1,310 +1,21 @@
|
|||||||
use crate::{ComputerController, types::*};
|
use crate::{ComputerController, types::Rect};
|
||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
use core_graphics::display::CGPoint;
|
|
||||||
use core_graphics::event::{CGEvent, CGEventType, CGMouseButton, CGEventTapLocation};
|
|
||||||
use core_graphics::event_source::{CGEventSource, CGEventSourceStateID};
|
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
use tesseract::Tesseract;
|
use tesseract::Tesseract;
|
||||||
use core_graphics::window::{kCGWindowListOptionOnScreenOnly, kCGNullWindowID, CGWindowListCopyWindowInfo};
|
|
||||||
use core_foundation::dictionary::CFDictionary;
|
|
||||||
use core_foundation::string::CFString;
|
|
||||||
use core_foundation::base::{TCFType, ToVoid};
|
|
||||||
|
|
||||||
// MacOSController doesn't store CGEventSource to avoid Send/Sync issues
|
|
||||||
// We create it fresh for each operation
|
|
||||||
pub struct MacOSController {
|
pub struct MacOSController {
|
||||||
// Empty struct - event source created per operation
|
// Empty struct for now
|
||||||
}
|
}
|
||||||
|
|
||||||
impl MacOSController {
|
impl MacOSController {
|
||||||
pub fn new() -> Result<Self> {
|
pub fn new() -> Result<Self> {
|
||||||
// Test that we can create an event source
|
|
||||||
let _event_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState)
|
|
||||||
.map_err(|_| anyhow::anyhow!("Failed to create event source. Make sure Accessibility permissions are granted."))?;
|
|
||||||
Ok(Self {})
|
Ok(Self {})
|
||||||
}
|
}
|
||||||
|
|
||||||
fn key_to_keycode(&self, key: &str) -> Result<u16> {
|
|
||||||
// Map key names to macOS keycodes
|
|
||||||
let keycode = match key.to_lowercase().as_str() {
|
|
||||||
"return" | "enter" => 36,
|
|
||||||
"tab" => 48,
|
|
||||||
"space" => 49,
|
|
||||||
"delete" | "backspace" => 51,
|
|
||||||
"escape" | "esc" => 53,
|
|
||||||
"command" | "cmd" => 55,
|
|
||||||
"shift" => 56,
|
|
||||||
"capslock" => 57,
|
|
||||||
"option" | "alt" => 58,
|
|
||||||
"control" | "ctrl" => 59,
|
|
||||||
"left" => 123,
|
|
||||||
"right" => 124,
|
|
||||||
"down" => 125,
|
|
||||||
"up" => 126,
|
|
||||||
_ => anyhow::bail!("Unknown key: {}", key),
|
|
||||||
};
|
|
||||||
Ok(keycode)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[async_trait]
|
#[async_trait]
|
||||||
impl ComputerController for MacOSController {
|
impl ComputerController for MacOSController {
|
||||||
async fn move_mouse(&self, x: i32, y: i32) -> Result<()> {
|
|
||||||
let event_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState)
|
|
||||||
.map_err(|_| anyhow::anyhow!("Failed to create event source"))?;
|
|
||||||
let point = CGPoint::new(x as f64, y as f64);
|
|
||||||
let event = CGEvent::new_mouse_event(
|
|
||||||
event_source,
|
|
||||||
CGEventType::MouseMoved,
|
|
||||||
point,
|
|
||||||
CGMouseButton::Left,
|
|
||||||
).map_err(|_| anyhow::anyhow!("Failed to create mouse move event"))?;
|
|
||||||
|
|
||||||
event.post(CGEventTapLocation::HID);
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn click(&self, button: MouseButton) -> Result<()> {
|
|
||||||
let (cg_button, down_type, up_type) = match button {
|
|
||||||
MouseButton::Left => (CGMouseButton::Left, CGEventType::LeftMouseDown, CGEventType::LeftMouseUp),
|
|
||||||
MouseButton::Right => (CGMouseButton::Right, CGEventType::RightMouseDown, CGEventType::RightMouseUp),
|
|
||||||
MouseButton::Middle => (CGMouseButton::Center, CGEventType::OtherMouseDown, CGEventType::OtherMouseUp),
|
|
||||||
};
|
|
||||||
|
|
||||||
let point = {
|
|
||||||
// Get current mouse position
|
|
||||||
let temp_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState)
|
|
||||||
.map_err(|_| anyhow::anyhow!("Failed to create event source"))?;
|
|
||||||
let event = CGEvent::new(temp_source)
|
|
||||||
.map_err(|_| anyhow::anyhow!("Failed to get mouse position"))?;
|
|
||||||
let p = event.location();
|
|
||||||
p
|
|
||||||
};
|
|
||||||
|
|
||||||
{
|
|
||||||
let event_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState)
|
|
||||||
.map_err(|_| anyhow::anyhow!("Failed to create event source"))?;
|
|
||||||
|
|
||||||
// Mouse down
|
|
||||||
let down_event = CGEvent::new_mouse_event(
|
|
||||||
event_source,
|
|
||||||
down_type,
|
|
||||||
point,
|
|
||||||
cg_button,
|
|
||||||
).map_err(|_| anyhow::anyhow!("Failed to create mouse down event"))?;
|
|
||||||
down_event.post(CGEventTapLocation::HID);
|
|
||||||
} // event_source and down_event dropped here
|
|
||||||
|
|
||||||
// Small delay
|
|
||||||
tokio::time::sleep(tokio::time::Duration::from_millis(50)).await;
|
|
||||||
|
|
||||||
{
|
|
||||||
let event_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState)
|
|
||||||
.map_err(|_| anyhow::anyhow!("Failed to create event source"))?;
|
|
||||||
|
|
||||||
let up_event = CGEvent::new_mouse_event(
|
|
||||||
event_source,
|
|
||||||
up_type,
|
|
||||||
point,
|
|
||||||
cg_button,
|
|
||||||
).map_err(|_| anyhow::anyhow!("Failed to create mouse up event"))?;
|
|
||||||
up_event.post(CGEventTapLocation::HID);
|
|
||||||
} // event_source and up_event dropped here
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn double_click(&self, button: MouseButton) -> Result<()> {
|
|
||||||
self.click(button).await?;
|
|
||||||
tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
|
|
||||||
self.click(button).await?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn type_text(&self, text: &str) -> Result<()> {
|
|
||||||
for ch in text.chars() {
|
|
||||||
{
|
|
||||||
let event_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState)
|
|
||||||
.map_err(|_| anyhow::anyhow!("Failed to create event source"))?;
|
|
||||||
|
|
||||||
// Create keyboard event for character
|
|
||||||
let event = CGEvent::new_keyboard_event(
|
|
||||||
event_source,
|
|
||||||
0, // keycode (0 for unicode)
|
|
||||||
true,
|
|
||||||
).map_err(|_| anyhow::anyhow!("Failed to create keyboard event"))?;
|
|
||||||
|
|
||||||
// Set unicode string
|
|
||||||
let mut utf16_buf = [0u16; 2];
|
|
||||||
let utf16_slice = ch.encode_utf16(&mut utf16_buf);
|
|
||||||
let utf16_chars: Vec<u16> = utf16_slice.iter().copied().collect();
|
|
||||||
|
|
||||||
event.set_string_from_utf16_unchecked(utf16_chars.as_slice());
|
|
||||||
event.post(CGEventTapLocation::HID);
|
|
||||||
} // event_source and event dropped here
|
|
||||||
|
|
||||||
tokio::time::sleep(tokio::time::Duration::from_millis(10)).await;
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn press_key(&self, key: &str) -> Result<()> {
|
|
||||||
let keycode = self.key_to_keycode(key)?;
|
|
||||||
|
|
||||||
{
|
|
||||||
let event_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState)
|
|
||||||
.map_err(|_| anyhow::anyhow!("Failed to create event source"))?;
|
|
||||||
|
|
||||||
// Key down
|
|
||||||
let down_event = CGEvent::new_keyboard_event(
|
|
||||||
event_source,
|
|
||||||
keycode,
|
|
||||||
true,
|
|
||||||
).map_err(|_| anyhow::anyhow!("Failed to create key down event"))?;
|
|
||||||
down_event.post(CGEventTapLocation::HID);
|
|
||||||
} // event_source and down_event dropped here
|
|
||||||
|
|
||||||
tokio::time::sleep(tokio::time::Duration::from_millis(50)).await;
|
|
||||||
|
|
||||||
{
|
|
||||||
let event_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState)
|
|
||||||
.map_err(|_| anyhow::anyhow!("Failed to create event source"))?;
|
|
||||||
|
|
||||||
// Key up
|
|
||||||
let up_event = CGEvent::new_keyboard_event(
|
|
||||||
event_source,
|
|
||||||
keycode,
|
|
||||||
false,
|
|
||||||
).map_err(|_| anyhow::anyhow!("Failed to create key up event"))?;
|
|
||||||
up_event.post(CGEventTapLocation::HID);
|
|
||||||
} // event_source and up_event dropped here
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn list_windows(&self) -> Result<Vec<Window>> {
|
|
||||||
let mut windows = Vec::new();
|
|
||||||
|
|
||||||
unsafe {
|
|
||||||
let window_list = CGWindowListCopyWindowInfo(
|
|
||||||
kCGWindowListOptionOnScreenOnly,
|
|
||||||
kCGNullWindowID
|
|
||||||
);
|
|
||||||
|
|
||||||
let array = core_foundation::array::CFArray::<CFDictionary>::wrap_under_create_rule(window_list);
|
|
||||||
let count = array.len();
|
|
||||||
|
|
||||||
for i in 0..count {
|
|
||||||
let dict = array.get(i).unwrap();
|
|
||||||
|
|
||||||
// Get window ID
|
|
||||||
let window_id_key = CFString::from_static_string("kCGWindowNumber");
|
|
||||||
let window_id: i64 = if let Some(value) = dict.find(window_id_key.to_void()) {
|
|
||||||
let num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*value as *const _);
|
|
||||||
num.to_i64().unwrap_or(0)
|
|
||||||
} else {
|
|
||||||
0
|
|
||||||
};
|
|
||||||
|
|
||||||
// Get owner name (app name)
|
|
||||||
let owner_key = CFString::from_static_string("kCGWindowOwnerName");
|
|
||||||
let app_name: String = if let Some(value) = dict.find(owner_key.to_void()) {
|
|
||||||
let s: CFString = TCFType::wrap_under_get_rule(*value as *const _);
|
|
||||||
s.to_string()
|
|
||||||
} else {
|
|
||||||
"Unknown".to_string()
|
|
||||||
};
|
|
||||||
|
|
||||||
// Get window name/title
|
|
||||||
let name_key = CFString::from_static_string("kCGWindowName");
|
|
||||||
let title: String = if let Some(value) = dict.find(name_key.to_void()) {
|
|
||||||
let s: CFString = TCFType::wrap_under_get_rule(*value as *const _);
|
|
||||||
s.to_string()
|
|
||||||
} else {
|
|
||||||
"".to_string()
|
|
||||||
};
|
|
||||||
|
|
||||||
// Get window bounds
|
|
||||||
let bounds_key = CFString::from_static_string("kCGWindowBounds");
|
|
||||||
let bounds = if let Some(bounds_value) = dict.find(bounds_key.to_void()) {
|
|
||||||
let bounds_dict: CFDictionary = TCFType::wrap_under_get_rule(*bounds_value as *const _);
|
|
||||||
|
|
||||||
let x_key = CFString::from_static_string("X");
|
|
||||||
let y_key = CFString::from_static_string("Y");
|
|
||||||
let width_key = CFString::from_static_string("Width");
|
|
||||||
let height_key = CFString::from_static_string("Height");
|
|
||||||
|
|
||||||
let x = if let Some(x_value) = bounds_dict.find(x_key.to_void()) {
|
|
||||||
let num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*x_value as *const _);
|
|
||||||
num.to_i32().unwrap_or(0)
|
|
||||||
} else { 0 };
|
|
||||||
let y = if let Some(y_value) = bounds_dict.find(y_key.to_void()) {
|
|
||||||
let num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*y_value as *const _);
|
|
||||||
num.to_i32().unwrap_or(0)
|
|
||||||
} else { 0 };
|
|
||||||
let width = if let Some(width_value) = bounds_dict.find(width_key.to_void()) {
|
|
||||||
let num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*width_value as *const _);
|
|
||||||
num.to_i32().unwrap_or(0)
|
|
||||||
} else { 0 };
|
|
||||||
let height = if let Some(height_value) = bounds_dict.find(height_key.to_void()) {
|
|
||||||
let num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*height_value as *const _);
|
|
||||||
num.to_i32().unwrap_or(0)
|
|
||||||
} else { 0 };
|
|
||||||
|
|
||||||
Rect { x, y, width, height }
|
|
||||||
} else {
|
|
||||||
Rect { x: 0, y: 0, width: 0, height: 0 }
|
|
||||||
};
|
|
||||||
|
|
||||||
// Skip windows without meaningful content (system UI elements, etc.)
|
|
||||||
if app_name.is_empty() || (title.is_empty() && bounds.width < 100) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
windows.push(Window {
|
|
||||||
id: format!("{}:{}", app_name, window_id),
|
|
||||||
title,
|
|
||||||
app_name,
|
|
||||||
bounds,
|
|
||||||
is_active: false, // We'd need additional API calls to determine this
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(windows)
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn focus_window(&self, _window_id: &str) -> Result<()> {
|
|
||||||
// Note: Full implementation would use NSWorkspace to activate application
|
|
||||||
tracing::warn!("focus_window not fully implemented on macOS");
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn get_window_bounds(&self, _window_id: &str) -> Result<Rect> {
|
|
||||||
// Note: Full implementation would use Accessibility API
|
|
||||||
tracing::warn!("get_window_bounds not fully implemented on macOS");
|
|
||||||
Ok(Rect { x: 0, y: 0, width: 800, height: 600 })
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn find_element(&self, _selector: &ElementSelector) -> Result<Option<UIElement>> {
|
|
||||||
// Note: Full implementation would use macOS Accessibility API
|
|
||||||
tracing::warn!("find_element not fully implemented on macOS");
|
|
||||||
Ok(None)
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn get_element_text(&self, _element_id: &str) -> Result<String> {
|
|
||||||
// Note: Full implementation would use Accessibility API
|
|
||||||
tracing::warn!("get_element_text not fully implemented on macOS");
|
|
||||||
Ok(String::new())
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn get_element_bounds(&self, _element_id: &str) -> Result<Rect> {
|
|
||||||
// Note: Full implementation would use Accessibility API
|
|
||||||
tracing::warn!("get_element_bounds not fully implemented on macOS");
|
|
||||||
Ok(Rect { x: 0, y: 0, width: 100, height: 30 })
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn take_screenshot(&self, path: &str, region: Option<Rect>, window_id: Option<&str>) -> Result<()> {
|
async fn take_screenshot(&self, path: &str, region: Option<Rect>, window_id: Option<&str>) -> Result<()> {
|
||||||
// Determine the temporary directory for screenshots
|
// Determine the temporary directory for screenshots
|
||||||
let temp_dir = std::env::var("TMPDIR")
|
let temp_dir = std::env::var("TMPDIR")
|
||||||
@@ -321,92 +32,6 @@ impl ComputerController for MacOSController {
|
|||||||
format!("{}/{}", temp_dir.trim_end_matches('/'), path)
|
format!("{}/{}", temp_dir.trim_end_matches('/'), path)
|
||||||
};
|
};
|
||||||
|
|
||||||
// Get the currently focused application before taking screenshot
|
|
||||||
let current_app = std::process::Command::new("osascript")
|
|
||||||
.arg("-e")
|
|
||||||
.arg("tell application \"System Events\" to get name of first application process whose frontmost is true")
|
|
||||||
.output()
|
|
||||||
.ok()
|
|
||||||
.and_then(|output| {
|
|
||||||
if output.status.success() {
|
|
||||||
Some(String::from_utf8_lossy(&output.stdout).trim().to_string())
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
// Handle application-based window capture
|
|
||||||
let app_name_opt = window_id.and_then(|id| {
|
|
||||||
// Extract app name from window_id format "AppName:WindowNumber"
|
|
||||||
id.split(':').next().map(String::from)
|
|
||||||
});
|
|
||||||
|
|
||||||
// If we're capturing a specific window, foreground it first
|
|
||||||
if let Some(ref app) = app_name_opt {
|
|
||||||
tracing::debug!("Foregrounding application: {}", app);
|
|
||||||
let _ = std::process::Command::new("osascript")
|
|
||||||
.arg("-e")
|
|
||||||
.arg(format!("tell application \"{}\" to activate", app))
|
|
||||||
.output();
|
|
||||||
|
|
||||||
// Give the window time to come to the front
|
|
||||||
tokio::time::sleep(tokio::time::Duration::from_millis(500)).await;
|
|
||||||
}
|
|
||||||
|
|
||||||
let screenshot_result = if let Some(ref app) = app_name_opt {
|
|
||||||
// Use screencapture with AppleScript to get window ID
|
|
||||||
let script = format!(
|
|
||||||
r#"tell application "{}" to id of window 1"#,
|
|
||||||
app
|
|
||||||
);
|
|
||||||
|
|
||||||
let output = std::process::Command::new("osascript")
|
|
||||||
.arg("-e")
|
|
||||||
.arg(&script)
|
|
||||||
.output()?;
|
|
||||||
|
|
||||||
if output.status.success() {
|
|
||||||
let window_id_str = String::from_utf8_lossy(&output.stdout).trim().to_string();
|
|
||||||
std::process::Command::new("screencapture")
|
|
||||||
.arg(format!("-l{}", window_id_str))
|
|
||||||
.arg("-o")
|
|
||||||
.arg(&final_path)
|
|
||||||
.output()
|
|
||||||
} else {
|
|
||||||
// Fallback to regular screenshot if we can't get window ID
|
|
||||||
std::process::Command::new("screencapture")
|
|
||||||
.arg("-x")
|
|
||||||
.arg(&final_path)
|
|
||||||
.output()
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// Regular screenshot (full screen or region)
|
|
||||||
// Use native macOS screencapture command which handles all the format complexities
|
|
||||||
|
|
||||||
// Check if we have Screen Recording permission by attempting a test capture
|
|
||||||
// If we only get wallpaper/menubar but no windows, we need permission
|
|
||||||
let needs_permission_check = std::env::var("G3_SKIP_PERMISSION_CHECK").is_err();
|
|
||||||
|
|
||||||
if needs_permission_check {
|
|
||||||
// Try to open Screen Recording settings if this is the first screenshot
|
|
||||||
static PERMISSION_PROMPTED: std::sync::atomic::AtomicBool = std::sync::atomic::AtomicBool::new(false);
|
|
||||||
|
|
||||||
if !PERMISSION_PROMPTED.swap(true, std::sync::atomic::Ordering::Relaxed) {
|
|
||||||
tracing::warn!("\n=== Screen Recording Permission Required ===\n\
|
|
||||||
macOS requires explicit permission to capture window content.\n\
|
|
||||||
If screenshots only show wallpaper/menubar (no windows):\n\n\
|
|
||||||
1. Open System Settings > Privacy & Security > Screen Recording\n\
|
|
||||||
2. Enable permission for your terminal (iTerm/Terminal) or g3\n\
|
|
||||||
3. Restart your terminal if needed\n\n\
|
|
||||||
Opening Screen Recording settings now...\n");
|
|
||||||
|
|
||||||
// Try to open the settings (non-blocking)
|
|
||||||
let _ = std::process::Command::new("open")
|
|
||||||
.arg("x-apple.systempreferences:com.apple.preference.security?Privacy_ScreenCapture")
|
|
||||||
.spawn();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let path_obj = Path::new(&final_path);
|
let path_obj = Path::new(&final_path);
|
||||||
if let Some(parent) = path_obj.parent() {
|
if let Some(parent) = path_obj.parent() {
|
||||||
std::fs::create_dir_all(parent)?;
|
std::fs::create_dir_all(parent)?;
|
||||||
@@ -423,41 +48,34 @@ impl ComputerController for MacOSController {
|
|||||||
cmd.arg(format!("{},{},{},{}", region.x, region.y, region.width, region.height));
|
cmd.arg(format!("{},{},{},{}", region.x, region.y, region.width, region.height));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if let Some(app_name) = window_id {
|
||||||
|
// Capture specific window by app name
|
||||||
|
// Use AppleScript to get window ID
|
||||||
|
let script = format!(r#"tell application "{}" to id of window 1"#, app_name);
|
||||||
|
let output = std::process::Command::new("osascript")
|
||||||
|
.arg("-e")
|
||||||
|
.arg(&script)
|
||||||
|
.output()?;
|
||||||
|
|
||||||
|
if output.status.success() {
|
||||||
|
let window_id_str = String::from_utf8_lossy(&output.stdout).trim().to_string();
|
||||||
|
cmd.arg(format!("-l{}", window_id_str));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
cmd.arg(&final_path);
|
cmd.arg(&final_path);
|
||||||
|
|
||||||
cmd.output()
|
let screenshot_result = cmd.output()?;
|
||||||
}?;
|
|
||||||
|
|
||||||
if !screenshot_result.status.success() {
|
if !screenshot_result.status.success() {
|
||||||
let stderr = String::from_utf8_lossy(&screenshot_result.stderr);
|
let stderr = String::from_utf8_lossy(&screenshot_result.stderr);
|
||||||
return Err(anyhow::anyhow!("screencapture failed: {}", stderr));
|
return Err(anyhow::anyhow!("screencapture failed: {}", stderr));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Re-foreground the original application if we foregrounded a different window
|
|
||||||
if let Some(ref target_app) = app_name_opt {
|
|
||||||
if let Some(ref original_app) = current_app {
|
|
||||||
// Only restore if we actually changed the foreground app
|
|
||||||
if target_app != original_app {
|
|
||||||
tracing::debug!("Restoring focus to original application: {}", original_app);
|
|
||||||
|
|
||||||
// Small delay to ensure screenshot is complete
|
|
||||||
tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
|
|
||||||
|
|
||||||
let _ = std::process::Command::new("osascript")
|
|
||||||
.arg("-e")
|
|
||||||
.arg(format!("tell application \"{}\" to activate", original_app))
|
|
||||||
.output();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
tracing::debug!("Screenshot saved using screencapture: {}", final_path);
|
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async fn extract_text_from_screen(&self, region: Rect) -> Result<String> {
|
||||||
async fn extract_text_from_screen(&self, region: Rect) -> Result<OCRResult> {
|
|
||||||
// Take screenshot of region first
|
// Take screenshot of region first
|
||||||
let temp_path = format!("/tmp/g3_ocr_{}.png", uuid::Uuid::new_v4());
|
let temp_path = format!("/tmp/g3_ocr_{}.png", uuid::Uuid::new_v4());
|
||||||
self.take_screenshot(&temp_path, Some(region), None).await?;
|
self.take_screenshot(&temp_path, Some(region), None).await?;
|
||||||
@@ -471,7 +89,7 @@ impl ComputerController for MacOSController {
|
|||||||
Ok(result)
|
Ok(result)
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn extract_text_from_image(&self, _path: &str) -> Result<OCRResult> {
|
async fn extract_text_from_image(&self, path: &str) -> Result<String> {
|
||||||
// Check if tesseract is available on the system
|
// Check if tesseract is available on the system
|
||||||
let tesseract_check = std::process::Command::new("which")
|
let tesseract_check = std::process::Command::new("which")
|
||||||
.arg("tesseract")
|
.arg("tesseract")
|
||||||
@@ -497,66 +115,11 @@ impl ComputerController for MacOSController {
|
|||||||
Windows: Reinstall tesseract and ensure language files are included", e)
|
Windows: Reinstall tesseract and ensure language files are included", e)
|
||||||
})?;
|
})?;
|
||||||
|
|
||||||
let text = tess.set_image(_path)
|
let text = tess.set_image(path)
|
||||||
.map_err(|e| anyhow::anyhow!("Failed to load image '{}': {}", _path, e))?
|
.map_err(|e| anyhow::anyhow!("Failed to load image '{}': {}", path, e))?
|
||||||
.get_text()
|
.get_text()
|
||||||
.map_err(|e| anyhow::anyhow!("Failed to extract text from image: {}", e))?;
|
.map_err(|e| anyhow::anyhow!("Failed to extract text from image: {}", e))?;
|
||||||
|
|
||||||
// Get confidence (simplified - would need more complex API calls for per-word confidence)
|
Ok(text)
|
||||||
let confidence = 0.85; // Placeholder
|
|
||||||
|
|
||||||
Ok(OCRResult {
|
|
||||||
text,
|
|
||||||
confidence,
|
|
||||||
bounds: Rect { x: 0, y: 0, width: 0, height: 0 }, // Would need image dimensions
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn find_text_on_screen(&self, _text: &str) -> Result<Option<Point>> {
|
|
||||||
// Check if tesseract is available on the system
|
|
||||||
let tesseract_check = std::process::Command::new("which")
|
|
||||||
.arg("tesseract")
|
|
||||||
.output();
|
|
||||||
|
|
||||||
if tesseract_check.is_err() || !tesseract_check.as_ref().unwrap().status.success() {
|
|
||||||
anyhow::bail!("Tesseract OCR is not installed on your system.\n\n\
|
|
||||||
To install tesseract:\n macOS: brew install tesseract\n \
|
|
||||||
Linux: sudo apt-get install tesseract-ocr (Ubuntu/Debian)\n \
|
|
||||||
sudo yum install tesseract (RHEL/CentOS)\n \
|
|
||||||
Windows: Download from https://github.com/UB-Mannheim/tesseract/wiki\n\n\
|
|
||||||
After installation, restart your terminal and try again.");
|
|
||||||
}
|
|
||||||
|
|
||||||
// Take full screen screenshot
|
|
||||||
let temp_path = format!("/tmp/g3_ocr_search_{}.png", uuid::Uuid::new_v4());
|
|
||||||
self.take_screenshot(&temp_path, None, None).await?;
|
|
||||||
|
|
||||||
// Use Tesseract to find text with bounding boxes
|
|
||||||
let tess = Tesseract::new(None, Some("eng"))
|
|
||||||
.map_err(|e| {
|
|
||||||
anyhow::anyhow!("Failed to initialize Tesseract: {}\n\n\
|
|
||||||
This usually means:\n1. Tesseract is not properly installed\n\
|
|
||||||
2. Language data files are missing\n\nTo fix:\n \
|
|
||||||
macOS: brew reinstall tesseract\n \
|
|
||||||
Linux: sudo apt-get install tesseract-ocr-eng\n \
|
|
||||||
Windows: Reinstall tesseract and ensure language files are included", e)
|
|
||||||
})?;
|
|
||||||
|
|
||||||
let full_text = tess.set_image(temp_path.as_str())
|
|
||||||
.map_err(|e| anyhow::anyhow!("Failed to load screenshot: {}", e))?
|
|
||||||
.get_text()
|
|
||||||
.map_err(|e| anyhow::anyhow!("Failed to extract text from screen: {}", e))?;
|
|
||||||
|
|
||||||
// Clean up temp file
|
|
||||||
let _ = std::fs::remove_file(&temp_path);
|
|
||||||
|
|
||||||
// Simple text search - full implementation would use get_component_images
|
|
||||||
// to get bounding boxes for each word
|
|
||||||
if full_text.contains(_text) {
|
|
||||||
tracing::warn!("Text found but precise coordinates not available in simplified implementation");
|
|
||||||
Ok(Some(Point { x: 0, y: 0 }))
|
|
||||||
} else {
|
|
||||||
Ok(None)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1,11 +1,5 @@
|
|||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
|
|
||||||
pub struct Point {
|
|
||||||
pub x: i32,
|
|
||||||
pub y: i32,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
|
||||||
pub struct Rect {
|
pub struct Rect {
|
||||||
pub x: i32,
|
pub x: i32,
|
||||||
@@ -13,53 +7,3 @@ pub struct Rect {
|
|||||||
pub width: i32,
|
pub width: i32,
|
||||||
pub height: i32,
|
pub height: i32,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Rect {
|
|
||||||
pub fn center(&self) -> Point {
|
|
||||||
Point {
|
|
||||||
x: self.x + self.width / 2,
|
|
||||||
y: self.y + self.height / 2,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
||||||
pub struct Window {
|
|
||||||
pub id: String,
|
|
||||||
pub title: String,
|
|
||||||
pub app_name: String,
|
|
||||||
pub bounds: Rect,
|
|
||||||
pub is_active: bool,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
||||||
pub struct UIElement {
|
|
||||||
pub id: String,
|
|
||||||
pub text: String,
|
|
||||||
pub role: String,
|
|
||||||
pub bounds: Rect,
|
|
||||||
pub enabled: bool,
|
|
||||||
pub visible: bool,
|
|
||||||
pub value: Option<String>,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
|
|
||||||
pub enum MouseButton {
|
|
||||||
Left,
|
|
||||||
Right,
|
|
||||||
Middle,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
||||||
pub struct ElementSelector {
|
|
||||||
pub text: Option<String>,
|
|
||||||
pub role: Option<String>,
|
|
||||||
pub window_id: Option<String>,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
||||||
pub struct OCRResult {
|
|
||||||
pub text: String,
|
|
||||||
pub confidence: f32,
|
|
||||||
pub bounds: Rect,
|
|
||||||
}
|
|
||||||
|
|||||||
111
crates/g3-computer-control/src/webdriver/mod.rs
Normal file
111
crates/g3-computer-control/src/webdriver/mod.rs
Normal file
@@ -0,0 +1,111 @@
|
|||||||
|
pub mod safari;
|
||||||
|
|
||||||
|
use anyhow::Result;
|
||||||
|
use async_trait::async_trait;
|
||||||
|
use serde_json::Value;
|
||||||
|
|
||||||
|
/// WebDriver controller for browser automation
|
||||||
|
#[async_trait]
|
||||||
|
pub trait WebDriverController: Send + Sync {
|
||||||
|
/// Navigate to a URL
|
||||||
|
async fn navigate(&mut self, url: &str) -> Result<()>;
|
||||||
|
|
||||||
|
/// Get the current URL
|
||||||
|
async fn current_url(&self) -> Result<String>;
|
||||||
|
|
||||||
|
/// Get the page title
|
||||||
|
async fn title(&self) -> Result<String>;
|
||||||
|
|
||||||
|
/// Find an element by CSS selector
|
||||||
|
async fn find_element(&mut self, selector: &str) -> Result<WebElement>;
|
||||||
|
|
||||||
|
/// Find multiple elements by CSS selector
|
||||||
|
async fn find_elements(&mut self, selector: &str) -> Result<Vec<WebElement>>;
|
||||||
|
|
||||||
|
/// Execute JavaScript in the browser
|
||||||
|
async fn execute_script(&mut self, script: &str, args: Vec<Value>) -> Result<Value>;
|
||||||
|
|
||||||
|
/// Get the page source (HTML)
|
||||||
|
async fn page_source(&self) -> Result<String>;
|
||||||
|
|
||||||
|
/// Take a screenshot and save to path
|
||||||
|
async fn screenshot(&mut self, path: &str) -> Result<()>;
|
||||||
|
|
||||||
|
/// Close the current window/tab
|
||||||
|
async fn close(&mut self) -> Result<()>;
|
||||||
|
|
||||||
|
/// Quit the browser session
|
||||||
|
async fn quit(self) -> Result<()>;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Represents a web element in the DOM
|
||||||
|
pub struct WebElement {
|
||||||
|
pub(crate) inner: fantoccini::elements::Element,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl WebElement {
|
||||||
|
/// Click the element
|
||||||
|
pub async fn click(&mut self) -> Result<()> {
|
||||||
|
self.inner.click().await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Send keys/text to the element
|
||||||
|
pub async fn send_keys(&mut self, text: &str) -> Result<()> {
|
||||||
|
self.inner.send_keys(text).await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Clear the element's content (for input fields)
|
||||||
|
pub async fn clear(&mut self) -> Result<()> {
|
||||||
|
self.inner.clear().await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get the element's text content
|
||||||
|
pub async fn text(&self) -> Result<String> {
|
||||||
|
Ok(self.inner.text().await?)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get an attribute value
|
||||||
|
pub async fn attr(&self, name: &str) -> Result<Option<String>> {
|
||||||
|
Ok(self.inner.attr(name).await?)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get a property value
|
||||||
|
pub async fn prop(&self, name: &str) -> Result<Option<String>> {
|
||||||
|
Ok(self.inner.prop(name).await?)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get the element's HTML
|
||||||
|
pub async fn html(&self, inner: bool) -> Result<String> {
|
||||||
|
Ok(self.inner.html(inner).await?)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check if element is displayed
|
||||||
|
pub async fn is_displayed(&self) -> Result<bool> {
|
||||||
|
Ok(self.inner.is_displayed().await?)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check if element is enabled
|
||||||
|
pub async fn is_enabled(&self) -> Result<bool> {
|
||||||
|
Ok(self.inner.is_enabled().await?)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check if element is selected (for checkboxes/radio buttons)
|
||||||
|
pub async fn is_selected(&self) -> Result<bool> {
|
||||||
|
Ok(self.inner.is_selected().await?)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Find a child element by CSS selector
|
||||||
|
pub async fn find_element(&mut self, selector: &str) -> Result<WebElement> {
|
||||||
|
let elem = self.inner.find(fantoccini::Locator::Css(selector)).await?;
|
||||||
|
Ok(WebElement { inner: elem })
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Find multiple child elements by CSS selector
|
||||||
|
pub async fn find_elements(&mut self, selector: &str) -> Result<Vec<WebElement>> {
|
||||||
|
let elems = self.inner.find_all(fantoccini::Locator::Css(selector)).await?;
|
||||||
|
Ok(elems.into_iter().map(|inner| WebElement { inner }).collect())
|
||||||
|
}
|
||||||
|
}
|
||||||
212
crates/g3-computer-control/src/webdriver/safari.rs
Normal file
212
crates/g3-computer-control/src/webdriver/safari.rs
Normal file
@@ -0,0 +1,212 @@
|
|||||||
|
use super::{WebDriverController, WebElement};
|
||||||
|
use anyhow::{Context, Result};
|
||||||
|
use async_trait::async_trait;
|
||||||
|
use fantoccini::{Client, ClientBuilder};
|
||||||
|
use serde_json::Value;
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
|
/// SafariDriver WebDriver controller
|
||||||
|
pub struct SafariDriver {
|
||||||
|
client: Client,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl SafariDriver {
|
||||||
|
/// Create a new SafariDriver instance
|
||||||
|
///
|
||||||
|
/// This will connect to SafariDriver running on the default port (4444).
|
||||||
|
/// Make sure to enable "Allow Remote Automation" in Safari's Develop menu first.
|
||||||
|
///
|
||||||
|
/// You can start SafariDriver manually with:
|
||||||
|
/// ```bash
|
||||||
|
/// /usr/bin/safaridriver --enable
|
||||||
|
/// ```
|
||||||
|
pub async fn new() -> Result<Self> {
|
||||||
|
Self::with_port(4444).await
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create a new SafariDriver instance with a custom port
|
||||||
|
pub async fn with_port(port: u16) -> Result<Self> {
|
||||||
|
let url = format!("http://localhost:{}", port);
|
||||||
|
|
||||||
|
let mut caps = serde_json::Map::new();
|
||||||
|
caps.insert("browserName".to_string(), Value::String("safari".to_string()));
|
||||||
|
|
||||||
|
let client = ClientBuilder::native()
|
||||||
|
.capabilities(caps)
|
||||||
|
.connect(&url)
|
||||||
|
.await
|
||||||
|
.context("Failed to connect to SafariDriver. Make sure SafariDriver is running and 'Allow Remote Automation' is enabled in Safari's Develop menu.")?;
|
||||||
|
|
||||||
|
Ok(Self { client })
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Go back in browser history
|
||||||
|
pub async fn back(&mut self) -> Result<()> {
|
||||||
|
self.client.back().await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Go forward in browser history
|
||||||
|
pub async fn forward(&mut self) -> Result<()> {
|
||||||
|
self.client.forward().await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Refresh the current page
|
||||||
|
pub async fn refresh(&mut self) -> Result<()> {
|
||||||
|
self.client.refresh().await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get all window handles
|
||||||
|
pub async fn window_handles(&mut self) -> Result<Vec<String>> {
|
||||||
|
let handles = self.client.windows().await?;
|
||||||
|
Ok(handles.into_iter()
|
||||||
|
.map(|h| h.into())
|
||||||
|
.collect())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Switch to a window by handle
|
||||||
|
pub async fn switch_to_window(&mut self, handle: &str) -> Result<()> {
|
||||||
|
let window_handle: fantoccini::wd::WindowHandle = handle.to_string().try_into()?;
|
||||||
|
self.client.switch_to_window(window_handle).await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get the current window handle
|
||||||
|
pub async fn current_window_handle(&mut self) -> Result<String> {
|
||||||
|
Ok(self.client.window().await?.into())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Close the current window
|
||||||
|
pub async fn close_window(&mut self) -> Result<()> {
|
||||||
|
self.client.close_window().await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create a new window/tab
|
||||||
|
pub async fn new_window(&mut self, is_tab: bool) -> Result<String> {
|
||||||
|
let window_type = if is_tab { "tab" } else { "window" };
|
||||||
|
let response = self.client.new_window(window_type == "tab").await?;
|
||||||
|
Ok(response.handle.into())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get cookies
|
||||||
|
pub async fn get_cookies(&mut self) -> Result<Vec<fantoccini::cookies::Cookie<'static>>> {
|
||||||
|
Ok(self.client.get_all_cookies().await?)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Add a cookie
|
||||||
|
pub async fn add_cookie(&mut self, cookie: fantoccini::cookies::Cookie<'static>) -> Result<()> {
|
||||||
|
self.client.add_cookie(cookie).await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Delete all cookies
|
||||||
|
pub async fn delete_all_cookies(&mut self) -> Result<()> {
|
||||||
|
self.client.delete_all_cookies().await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Wait for an element to appear (with timeout)
|
||||||
|
pub async fn wait_for_element(&mut self, selector: &str, timeout: Duration) -> Result<WebElement> {
|
||||||
|
let start = std::time::Instant::now();
|
||||||
|
let poll_interval = Duration::from_millis(100);
|
||||||
|
|
||||||
|
loop {
|
||||||
|
if let Ok(elem) = self.find_element(selector).await {
|
||||||
|
return Ok(elem);
|
||||||
|
}
|
||||||
|
|
||||||
|
if start.elapsed() >= timeout {
|
||||||
|
anyhow::bail!("Timeout waiting for element: {}", selector);
|
||||||
|
}
|
||||||
|
|
||||||
|
tokio::time::sleep(poll_interval).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Wait for an element to be visible (with timeout)
|
||||||
|
pub async fn wait_for_visible(&mut self, selector: &str, timeout: Duration) -> Result<WebElement> {
|
||||||
|
let start = std::time::Instant::now();
|
||||||
|
let poll_interval = Duration::from_millis(100);
|
||||||
|
|
||||||
|
loop {
|
||||||
|
if let Ok(elem) = self.find_element(selector).await {
|
||||||
|
if elem.is_displayed().await.unwrap_or(false) {
|
||||||
|
return Ok(elem);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if start.elapsed() >= timeout {
|
||||||
|
anyhow::bail!("Timeout waiting for element to be visible: {}", selector);
|
||||||
|
}
|
||||||
|
|
||||||
|
tokio::time::sleep(poll_interval).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait]
|
||||||
|
impl WebDriverController for SafariDriver {
|
||||||
|
async fn navigate(&mut self, url: &str) -> Result<()> {
|
||||||
|
self.client.goto(url).await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn current_url(&self) -> Result<String> {
|
||||||
|
Ok(self.client.current_url().await?.to_string())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn title(&self) -> Result<String> {
|
||||||
|
Ok(self.client.title().await?)
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn find_element(&mut self, selector: &str) -> Result<WebElement> {
|
||||||
|
let elem = self.client.find(fantoccini::Locator::Css(selector)).await
|
||||||
|
.context(format!("Failed to find element with selector: {}", selector))?;
|
||||||
|
Ok(WebElement { inner: elem })
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn find_elements(&mut self, selector: &str) -> Result<Vec<WebElement>> {
|
||||||
|
let elems = self.client.find_all(fantoccini::Locator::Css(selector)).await?;
|
||||||
|
Ok(elems.into_iter().map(|inner| WebElement { inner }).collect())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn execute_script(&mut self, script: &str, args: Vec<Value>) -> Result<Value> {
|
||||||
|
Ok(self.client.execute(script, args).await?)
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn page_source(&self) -> Result<String> {
|
||||||
|
Ok(self.client.source().await?)
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn screenshot(&mut self, path: &str) -> Result<()> {
|
||||||
|
let screenshot_data = self.client.screenshot().await?;
|
||||||
|
|
||||||
|
// Expand tilde in path
|
||||||
|
let expanded_path = shellexpand::tilde(path);
|
||||||
|
let path_str = expanded_path.as_ref();
|
||||||
|
|
||||||
|
// Create parent directories if needed
|
||||||
|
if let Some(parent) = std::path::Path::new(path_str).parent() {
|
||||||
|
std::fs::create_dir_all(parent)
|
||||||
|
.context("Failed to create parent directories for screenshot")?;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::fs::write(path_str, screenshot_data)
|
||||||
|
.context("Failed to write screenshot to file")?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn close(&mut self) -> Result<()> {
|
||||||
|
self.client.close_window().await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn quit(mut self) -> Result<()> {
|
||||||
|
self.client.close().await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -7,6 +7,7 @@ pub struct Config {
|
|||||||
pub providers: ProvidersConfig,
|
pub providers: ProvidersConfig,
|
||||||
pub agent: AgentConfig,
|
pub agent: AgentConfig,
|
||||||
pub computer_control: ComputerControlConfig,
|
pub computer_control: ComputerControlConfig,
|
||||||
|
pub webdriver: WebDriverConfig,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
@@ -70,6 +71,21 @@ pub struct ComputerControlConfig {
|
|||||||
pub max_actions_per_second: u32,
|
pub max_actions_per_second: u32,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct WebDriverConfig {
|
||||||
|
pub enabled: bool,
|
||||||
|
pub safari_port: u16,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for WebDriverConfig {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self {
|
||||||
|
enabled: false,
|
||||||
|
safari_port: 4444,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl Default for ComputerControlConfig {
|
impl Default for ComputerControlConfig {
|
||||||
fn default() -> Self {
|
fn default() -> Self {
|
||||||
Self {
|
Self {
|
||||||
@@ -103,6 +119,7 @@ impl Default for Config {
|
|||||||
timeout_seconds: 60,
|
timeout_seconds: 60,
|
||||||
},
|
},
|
||||||
computer_control: ComputerControlConfig::default(),
|
computer_control: ComputerControlConfig::default(),
|
||||||
|
webdriver: WebDriverConfig::default(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -214,6 +231,7 @@ impl Config {
|
|||||||
timeout_seconds: 60,
|
timeout_seconds: 60,
|
||||||
},
|
},
|
||||||
computer_control: ComputerControlConfig::default(),
|
computer_control: ComputerControlConfig::default(),
|
||||||
|
webdriver: WebDriverConfig::default(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -21,6 +21,7 @@ mod error_handling_test;
|
|||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
use g3_config::Config;
|
use g3_config::Config;
|
||||||
use g3_execution::CodeExecutor;
|
use g3_execution::CodeExecutor;
|
||||||
|
use g3_computer_control::WebDriverController;
|
||||||
use g3_providers::{CompletionRequest, Message, MessageRole, ProviderRegistry, Tool};
|
use g3_providers::{CompletionRequest, Message, MessageRole, ProviderRegistry, Tool};
|
||||||
#[allow(unused_imports)]
|
#[allow(unused_imports)]
|
||||||
use regex::Regex;
|
use regex::Regex;
|
||||||
@@ -529,6 +530,8 @@ pub struct Agent<W: UiWriter> {
|
|||||||
quiet: bool,
|
quiet: bool,
|
||||||
computer_controller: Option<Box<dyn g3_computer_control::ComputerController>>,
|
computer_controller: Option<Box<dyn g3_computer_control::ComputerController>>,
|
||||||
todo_content: std::sync::Arc<tokio::sync::RwLock<String>>,
|
todo_content: std::sync::Arc<tokio::sync::RwLock<String>>,
|
||||||
|
webdriver_session: std::sync::Arc<tokio::sync::RwLock<Option<std::sync::Arc<tokio::sync::Mutex<g3_computer_control::SafariDriver>>>>>,
|
||||||
|
safaridriver_process: std::sync::Arc<tokio::sync::RwLock<Option<tokio::process::Child>>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<W: UiWriter> Agent<W> {
|
impl<W: UiWriter> Agent<W> {
|
||||||
@@ -714,6 +717,8 @@ impl<W: UiWriter> Agent<W> {
|
|||||||
is_autonomous,
|
is_autonomous,
|
||||||
quiet,
|
quiet,
|
||||||
computer_controller,
|
computer_controller,
|
||||||
|
webdriver_session: std::sync::Arc::new(tokio::sync::RwLock::new(None)),
|
||||||
|
safaridriver_process: std::sync::Arc::new(tokio::sync::RwLock::new(None)),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1023,7 +1028,7 @@ Template:
|
|||||||
// Check if provider supports native tool calling and add tools if so
|
// Check if provider supports native tool calling and add tools if so
|
||||||
let provider = self.providers.get(None)?;
|
let provider = self.providers.get(None)?;
|
||||||
let tools = if provider.has_native_tool_calling() {
|
let tools = if provider.has_native_tool_calling() {
|
||||||
Some(Self::create_tool_definitions())
|
Some(Self::create_tool_definitions(self.config.webdriver.enabled))
|
||||||
} else {
|
} else {
|
||||||
None
|
None
|
||||||
};
|
};
|
||||||
@@ -1200,8 +1205,8 @@ Template:
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Create tool definitions for native tool calling providers
|
/// Create tool definitions for native tool calling providers
|
||||||
fn create_tool_definitions() -> Vec<Tool> {
|
fn create_tool_definitions(enable_webdriver: bool) -> Vec<Tool> {
|
||||||
vec![
|
let mut tools = vec![
|
||||||
Tool {
|
Tool {
|
||||||
name: "shell".to_string(),
|
name: "shell".to_string(),
|
||||||
description: "Execute shell commands".to_string(),
|
description: "Execute shell commands".to_string(),
|
||||||
@@ -1296,65 +1301,6 @@ Template:
|
|||||||
"required": ["summary"]
|
"required": ["summary"]
|
||||||
}),
|
}),
|
||||||
},
|
},
|
||||||
Tool {
|
|
||||||
name: "mouse_click".to_string(),
|
|
||||||
description: "Click the mouse at specific coordinates".to_string(),
|
|
||||||
input_schema: json!({
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"x": {
|
|
||||||
"type": "integer",
|
|
||||||
"description": "X coordinate"
|
|
||||||
},
|
|
||||||
"y": {
|
|
||||||
"type": "integer",
|
|
||||||
"description": "Y coordinate"
|
|
||||||
},
|
|
||||||
"button": {
|
|
||||||
"type": "string",
|
|
||||||
"enum": ["left", "right", "middle"],
|
|
||||||
"description": "Mouse button to click",
|
|
||||||
"default": "left"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": ["x", "y"]
|
|
||||||
}),
|
|
||||||
},
|
|
||||||
Tool {
|
|
||||||
name: "type_text".to_string(),
|
|
||||||
description: "Type text at the current cursor position".to_string(),
|
|
||||||
input_schema: json!({
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"text": {
|
|
||||||
"type": "string",
|
|
||||||
"description": "Text to type"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": ["text"]
|
|
||||||
}),
|
|
||||||
},
|
|
||||||
Tool {
|
|
||||||
name: "find_element".to_string(),
|
|
||||||
description: "Find a UI element by text, role, or other attributes".to_string(),
|
|
||||||
input_schema: json!({
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"text": {
|
|
||||||
"type": "string",
|
|
||||||
"description": "Text to search for"
|
|
||||||
},
|
|
||||||
"role": {
|
|
||||||
"type": "string",
|
|
||||||
"description": "Element role (button, textfield, etc.)"
|
|
||||||
},
|
|
||||||
"window_id": {
|
|
||||||
"type": "string",
|
|
||||||
"description": "Optional window ID to search in"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}),
|
|
||||||
},
|
|
||||||
Tool {
|
Tool {
|
||||||
name: "take_screenshot".to_string(),
|
name: "take_screenshot".to_string(),
|
||||||
description: "Capture a screenshot of the screen, region, or window. When capturing a specific application window (e.g., 'Safari', 'Terminal'), use the window_id parameter with just the application name. The tool will automatically use the native screencapture command with the application's window ID for a clean capture.".to_string(),
|
description: "Capture a screenshot of the screen, region, or window. When capturing a specific application window (e.g., 'Safari', 'Terminal'), use the window_id parameter with just the application name. The tool will automatically use the native screencapture command with the application's window ID for a clean capture.".to_string(),
|
||||||
@@ -1405,28 +1351,6 @@ Template:
|
|||||||
}
|
}
|
||||||
}),
|
}),
|
||||||
},
|
},
|
||||||
Tool {
|
|
||||||
name: "find_text_on_screen".to_string(),
|
|
||||||
description: "Find text visually on screen and return its coordinates".to_string(),
|
|
||||||
input_schema: json!({
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"text": {
|
|
||||||
"type": "string",
|
|
||||||
"description": "Text to search for on screen"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": ["text"]
|
|
||||||
}),
|
|
||||||
},
|
|
||||||
Tool {
|
|
||||||
name: "list_windows".to_string(),
|
|
||||||
description: "List all currently open windows with their IDs, titles, and application names. Use this to identify which window to interact with before taking screenshots or performing other window-specific operations.".to_string(),
|
|
||||||
input_schema: json!({
|
|
||||||
"type": "object",
|
|
||||||
"properties": {}
|
|
||||||
}),
|
|
||||||
},
|
|
||||||
Tool {
|
Tool {
|
||||||
name: "todo_read".to_string(),
|
name: "todo_read".to_string(),
|
||||||
description: "Read the entire TODO list content. Use this to view current tasks, notes, and any other information stored in the TODO list.".to_string(),
|
description: "Read the entire TODO list content. Use this to view current tasks, notes, and any other information stored in the TODO list.".to_string(),
|
||||||
@@ -1450,7 +1374,193 @@ Template:
|
|||||||
"required": ["content"]
|
"required": ["content"]
|
||||||
}),
|
}),
|
||||||
},
|
},
|
||||||
]
|
];
|
||||||
|
|
||||||
|
// Add WebDriver tools if enabled
|
||||||
|
if enable_webdriver {
|
||||||
|
tools.extend(vec![
|
||||||
|
Tool {
|
||||||
|
name: "webdriver_start".to_string(),
|
||||||
|
description: "Start a Safari WebDriver session for browser automation. Must be called before any other webdriver tools. Requires Safari's 'Allow Remote Automation' to be enabled in Develop menu.".to_string(),
|
||||||
|
input_schema: json!({
|
||||||
|
"type": "object",
|
||||||
|
"properties": {},
|
||||||
|
"required": []
|
||||||
|
}),
|
||||||
|
},
|
||||||
|
Tool {
|
||||||
|
name: "webdriver_navigate".to_string(),
|
||||||
|
description: "Navigate to a URL in the browser".to_string(),
|
||||||
|
input_schema: json!({
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"url": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "The URL to navigate to (must include protocol, e.g., https://)"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["url"]
|
||||||
|
}),
|
||||||
|
},
|
||||||
|
Tool {
|
||||||
|
name: "webdriver_get_url".to_string(),
|
||||||
|
description: "Get the current URL of the browser".to_string(),
|
||||||
|
input_schema: json!({
|
||||||
|
"type": "object",
|
||||||
|
"properties": {},
|
||||||
|
"required": []
|
||||||
|
}),
|
||||||
|
},
|
||||||
|
Tool {
|
||||||
|
name: "webdriver_get_title".to_string(),
|
||||||
|
description: "Get the title of the current page".to_string(),
|
||||||
|
input_schema: json!({
|
||||||
|
"type": "object",
|
||||||
|
"properties": {},
|
||||||
|
"required": []
|
||||||
|
}),
|
||||||
|
},
|
||||||
|
Tool {
|
||||||
|
name: "webdriver_find_element".to_string(),
|
||||||
|
description: "Find an element on the page by CSS selector and return its text content".to_string(),
|
||||||
|
input_schema: json!({
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"selector": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "CSS selector to find the element (e.g., 'h1', '.class-name', '#id')"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["selector"]
|
||||||
|
}),
|
||||||
|
},
|
||||||
|
Tool {
|
||||||
|
name: "webdriver_find_elements".to_string(),
|
||||||
|
description: "Find all elements matching a CSS selector and return their text content".to_string(),
|
||||||
|
input_schema: json!({
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"selector": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "CSS selector to find elements"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["selector"]
|
||||||
|
}),
|
||||||
|
},
|
||||||
|
Tool {
|
||||||
|
name: "webdriver_click".to_string(),
|
||||||
|
description: "Click an element on the page".to_string(),
|
||||||
|
input_schema: json!({
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"selector": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "CSS selector for the element to click"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["selector"]
|
||||||
|
}),
|
||||||
|
},
|
||||||
|
Tool {
|
||||||
|
name: "webdriver_send_keys".to_string(),
|
||||||
|
description: "Type text into an input element".to_string(),
|
||||||
|
input_schema: json!({
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"selector": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "CSS selector for the input element"
|
||||||
|
},
|
||||||
|
"text": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "Text to type into the element"
|
||||||
|
},
|
||||||
|
"clear_first": {
|
||||||
|
"type": "boolean",
|
||||||
|
"description": "Whether to clear the element before typing (default: true)"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["selector", "text"]
|
||||||
|
}),
|
||||||
|
},
|
||||||
|
Tool {
|
||||||
|
name: "webdriver_execute_script".to_string(),
|
||||||
|
description: "Execute JavaScript code in the browser and return the result".to_string(),
|
||||||
|
input_schema: json!({
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"script": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "JavaScript code to execute (use 'return' to return a value)"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["script"]
|
||||||
|
}),
|
||||||
|
},
|
||||||
|
Tool {
|
||||||
|
name: "webdriver_get_page_source".to_string(),
|
||||||
|
description: "Get the HTML source of the current page".to_string(),
|
||||||
|
input_schema: json!({
|
||||||
|
"type": "object",
|
||||||
|
"properties": {},
|
||||||
|
"required": []
|
||||||
|
}),
|
||||||
|
},
|
||||||
|
Tool {
|
||||||
|
name: "webdriver_screenshot".to_string(),
|
||||||
|
description: "Take a screenshot of the browser window".to_string(),
|
||||||
|
input_schema: json!({
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"path": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "Path where to save the screenshot (e.g., '/tmp/screenshot.png')"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["path"]
|
||||||
|
}),
|
||||||
|
},
|
||||||
|
Tool {
|
||||||
|
name: "webdriver_back".to_string(),
|
||||||
|
description: "Navigate back in browser history".to_string(),
|
||||||
|
input_schema: json!({
|
||||||
|
"type": "object",
|
||||||
|
"properties": {},
|
||||||
|
"required": []
|
||||||
|
}),
|
||||||
|
},
|
||||||
|
Tool {
|
||||||
|
name: "webdriver_forward".to_string(),
|
||||||
|
description: "Navigate forward in browser history".to_string(),
|
||||||
|
input_schema: json!({
|
||||||
|
"type": "object",
|
||||||
|
"properties": {},
|
||||||
|
"required": []
|
||||||
|
}),
|
||||||
|
},
|
||||||
|
Tool {
|
||||||
|
name: "webdriver_refresh".to_string(),
|
||||||
|
description: "Refresh the current page".to_string(),
|
||||||
|
input_schema: json!({
|
||||||
|
"type": "object",
|
||||||
|
"properties": {},
|
||||||
|
"required": []
|
||||||
|
}),
|
||||||
|
},
|
||||||
|
Tool {
|
||||||
|
name: "webdriver_quit".to_string(),
|
||||||
|
description: "Close the browser and end the WebDriver session".to_string(),
|
||||||
|
input_schema: json!({
|
||||||
|
"type": "object",
|
||||||
|
"properties": {},
|
||||||
|
"required": []
|
||||||
|
}),
|
||||||
|
},
|
||||||
|
]);
|
||||||
|
}
|
||||||
|
|
||||||
|
tools
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Helper method to stream with retry logic
|
/// Helper method to stream with retry logic
|
||||||
@@ -2011,7 +2121,7 @@ Template:
|
|||||||
|
|
||||||
// Ensure tools are included for native providers in subsequent iterations
|
// Ensure tools are included for native providers in subsequent iterations
|
||||||
if provider.has_native_tool_calling() {
|
if provider.has_native_tool_calling() {
|
||||||
request.tools = Some(Self::create_tool_definitions());
|
request.tools = Some(Self::create_tool_definitions(self.config.webdriver.enabled));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Only add to full_response if we haven't already added it
|
// Only add to full_response if we haven't already added it
|
||||||
@@ -2454,10 +2564,10 @@ Template:
|
|||||||
if is_image {
|
if is_image {
|
||||||
if let Some(controller) = &self.computer_controller {
|
if let Some(controller) = &self.computer_controller {
|
||||||
match controller.extract_text_from_image(path_str).await {
|
match controller.extract_text_from_image(path_str).await {
|
||||||
Ok(result) => {
|
Ok(text) => {
|
||||||
return Ok(format!(
|
return Ok(format!(
|
||||||
"📄 Image file (OCR extracted, confidence: {:.2}):\n{}",
|
"📄 Image file (OCR extracted):\n{}",
|
||||||
result.confidence, result.text
|
text
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
@@ -2817,98 +2927,6 @@ Template:
|
|||||||
Ok("✅ Turn completed".to_string())
|
Ok("✅ Turn completed".to_string())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
"mouse_click" => {
|
|
||||||
if let Some(controller) = &self.computer_controller {
|
|
||||||
let x = tool_call
|
|
||||||
.args
|
|
||||||
.get("x")
|
|
||||||
.and_then(|v| v.as_i64())
|
|
||||||
.unwrap_or(0) as i32;
|
|
||||||
let y = tool_call
|
|
||||||
.args
|
|
||||||
.get("y")
|
|
||||||
.and_then(|v| v.as_i64())
|
|
||||||
.unwrap_or(0) as i32;
|
|
||||||
let button_str = tool_call
|
|
||||||
.args
|
|
||||||
.get("button")
|
|
||||||
.and_then(|v| v.as_str())
|
|
||||||
.unwrap_or("left");
|
|
||||||
|
|
||||||
let button = match button_str {
|
|
||||||
"left" => g3_computer_control::types::MouseButton::Left,
|
|
||||||
"right" => g3_computer_control::types::MouseButton::Right,
|
|
||||||
"middle" => g3_computer_control::types::MouseButton::Middle,
|
|
||||||
_ => g3_computer_control::types::MouseButton::Left,
|
|
||||||
};
|
|
||||||
|
|
||||||
match controller.move_mouse(x, y).await {
|
|
||||||
Ok(_) => {
|
|
||||||
tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
|
|
||||||
match controller.click(button).await {
|
|
||||||
Ok(_) => Ok(format!(
|
|
||||||
"✅ Clicked {} button at ({}, {})",
|
|
||||||
button_str, x, y
|
|
||||||
)),
|
|
||||||
Err(e) => Ok(format!("❌ Failed to click: {}", e)),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Err(e) => Ok(format!("❌ Failed to move mouse: {}", e)),
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
Ok("❌ Computer control not enabled. Set computer_control.enabled = true in config.".to_string())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
"type_text" => {
|
|
||||||
if let Some(controller) = &self.computer_controller {
|
|
||||||
let text = tool_call
|
|
||||||
.args
|
|
||||||
.get("text")
|
|
||||||
.and_then(|v| v.as_str())
|
|
||||||
.ok_or_else(|| anyhow::anyhow!("Missing text argument"))?;
|
|
||||||
|
|
||||||
match controller.type_text(text).await {
|
|
||||||
Ok(_) => Ok(format!("✅ Typed text: {}", text)),
|
|
||||||
Err(e) => Ok(format!("❌ Failed to type text: {}", e)),
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
Ok("❌ Computer control not enabled. Set computer_control.enabled = true in config.".to_string())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
"find_element" => {
|
|
||||||
if let Some(controller) = &self.computer_controller {
|
|
||||||
let selector = g3_computer_control::types::ElementSelector {
|
|
||||||
text: tool_call
|
|
||||||
.args
|
|
||||||
.get("text")
|
|
||||||
.and_then(|v| v.as_str())
|
|
||||||
.map(String::from),
|
|
||||||
role: tool_call
|
|
||||||
.args
|
|
||||||
.get("role")
|
|
||||||
.and_then(|v| v.as_str())
|
|
||||||
.map(String::from),
|
|
||||||
window_id: tool_call
|
|
||||||
.args
|
|
||||||
.get("window_id")
|
|
||||||
.and_then(|v| v.as_str())
|
|
||||||
.map(String::from),
|
|
||||||
};
|
|
||||||
|
|
||||||
match controller.find_element(&selector).await {
|
|
||||||
Ok(Some(element)) => match serde_json::to_string_pretty(&element) {
|
|
||||||
Ok(json) => Ok(format!("✅ Found element:\n{}", json)),
|
|
||||||
Err(e) => {
|
|
||||||
Ok(format!("✅ Found element but failed to serialize: {}", e))
|
|
||||||
}
|
|
||||||
},
|
|
||||||
Ok(None) => Ok("❌ Element not found".to_string()),
|
|
||||||
Err(e) => Ok(format!("❌ Failed to find element: {}", e)),
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
Ok("❌ Computer control not enabled. Set computer_control.enabled = true in config.".to_string())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
"take_screenshot" => {
|
"take_screenshot" => {
|
||||||
if let Some(controller) = &self.computer_controller {
|
if let Some(controller) = &self.computer_controller {
|
||||||
let path = tool_call
|
let path = tool_call
|
||||||
@@ -2973,10 +2991,7 @@ Template:
|
|||||||
if let Some(path) = tool_call.args.get("path").and_then(|v| v.as_str()) {
|
if let Some(path) = tool_call.args.get("path").and_then(|v| v.as_str()) {
|
||||||
// Extract text from image file
|
// Extract text from image file
|
||||||
match controller.extract_text_from_image(path).await {
|
match controller.extract_text_from_image(path).await {
|
||||||
Ok(result) => Ok(format!(
|
Ok(text) => Ok(format!("✅ Extracted text:\n{}", text)),
|
||||||
"✅ Extracted text (confidence: {:.2}):\n{}",
|
|
||||||
result.confidence, result.text
|
|
||||||
)),
|
|
||||||
Err(e) => Ok(format!("❌ Failed to extract text: {}", e)),
|
Err(e) => Ok(format!("❌ Failed to extract text: {}", e)),
|
||||||
}
|
}
|
||||||
} else if let Some(region_obj) =
|
} else if let Some(region_obj) =
|
||||||
@@ -2997,10 +3012,7 @@ Template:
|
|||||||
};
|
};
|
||||||
|
|
||||||
match controller.extract_text_from_screen(region).await {
|
match controller.extract_text_from_screen(region).await {
|
||||||
Ok(result) => Ok(format!(
|
Ok(text) => Ok(format!("✅ Extracted text:\n{}", text)),
|
||||||
"✅ Extracted text (confidence: {:.2}):\n{}",
|
|
||||||
result.confidence, result.text
|
|
||||||
)),
|
|
||||||
Err(e) => Ok(format!("❌ Failed to extract text: {}", e)),
|
Err(e) => Ok(format!("❌ Failed to extract text: {}", e)),
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@@ -3010,57 +3022,6 @@ Template:
|
|||||||
Ok("❌ Computer control not enabled. Set computer_control.enabled = true in config.".to_string())
|
Ok("❌ Computer control not enabled. Set computer_control.enabled = true in config.".to_string())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
"find_text_on_screen" => {
|
|
||||||
if let Some(controller) = &self.computer_controller {
|
|
||||||
let text = tool_call
|
|
||||||
.args
|
|
||||||
.get("text")
|
|
||||||
.and_then(|v| v.as_str())
|
|
||||||
.ok_or_else(|| anyhow::anyhow!("Missing text argument"))?;
|
|
||||||
|
|
||||||
match controller.find_text_on_screen(text).await {
|
|
||||||
Ok(Some(point)) => Ok(format!(
|
|
||||||
"✅ Found text '{}' at coordinates ({}, {})",
|
|
||||||
text, point.x, point.y
|
|
||||||
)),
|
|
||||||
Ok(None) => Ok(format!("❌ Text '{}' not found on screen", text)),
|
|
||||||
Err(e) => Ok(format!("❌ Failed to search for text: {}", e)),
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
Ok("❌ Computer control not enabled. Set computer_control.enabled = true in config.".to_string())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
"list_windows" => {
|
|
||||||
if let Some(controller) = &self.computer_controller {
|
|
||||||
match controller.list_windows().await {
|
|
||||||
Ok(windows) => {
|
|
||||||
if windows.is_empty() {
|
|
||||||
Ok("📋 No windows found".to_string())
|
|
||||||
} else {
|
|
||||||
let mut output = format!("📋 Found {} windows:\n\n", windows.len());
|
|
||||||
for window in windows {
|
|
||||||
output.push_str(&format!(
|
|
||||||
"• **{}** ({}x{})\n ID: `{}`\n Title: {}\n\n",
|
|
||||||
window.app_name,
|
|
||||||
window.bounds.width,
|
|
||||||
window.bounds.height,
|
|
||||||
window.id,
|
|
||||||
if window.title.is_empty() {
|
|
||||||
"(no title)"
|
|
||||||
} else {
|
|
||||||
&window.title
|
|
||||||
}
|
|
||||||
));
|
|
||||||
}
|
|
||||||
Ok(output)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Err(e) => Ok(format!("❌ Failed to list windows: {}", e)),
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
Ok("❌ Computer control not enabled. Set computer_control.enabled = true in config.".to_string())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
"todo_read" => {
|
"todo_read" => {
|
||||||
debug!("Processing todo_read tool call");
|
debug!("Processing todo_read tool call");
|
||||||
let content = self.todo_content.read().await;
|
let content = self.todo_content.read().await;
|
||||||
@@ -3094,6 +3055,446 @@ Template:
|
|||||||
Ok("❌ Missing content argument".to_string())
|
Ok("❌ Missing content argument".to_string())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
"webdriver_start" => {
|
||||||
|
debug!("Processing webdriver_start tool call");
|
||||||
|
|
||||||
|
if !self.config.webdriver.enabled {
|
||||||
|
return Ok("❌ WebDriver is not enabled. Use --webdriver flag to enable.".to_string());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if session already exists
|
||||||
|
let session_guard = self.webdriver_session.read().await;
|
||||||
|
if session_guard.is_some() {
|
||||||
|
drop(session_guard);
|
||||||
|
return Ok("✅ WebDriver session already active".to_string());
|
||||||
|
}
|
||||||
|
drop(session_guard);
|
||||||
|
|
||||||
|
// Check if Safari Remote Automation is enabled
|
||||||
|
let check_enabled = tokio::process::Command::new("safaridriver")
|
||||||
|
.arg("--enable")
|
||||||
|
.output()
|
||||||
|
.await;
|
||||||
|
|
||||||
|
match check_enabled {
|
||||||
|
Ok(output) if !output.status.success() => {
|
||||||
|
return Ok("❌ Safari Remote Automation is not enabled.\n\nTo enable it (one-time setup):\n 1. Run: safaridriver --enable\n 2. Enter your password when prompted\n 3. Try again\n\nAlternatively, enable it manually:\n Safari → Develop → Allow Remote Automation".to_string());
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
return Ok(format!("❌ Failed to check Safari automation status: {}\n\nMake sure safaridriver is installed (it comes with macOS).", e));
|
||||||
|
}
|
||||||
|
_ => {
|
||||||
|
debug!("Safari Remote Automation is enabled");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Start safaridriver process
|
||||||
|
let port = self.config.webdriver.safari_port;
|
||||||
|
info!("Starting safaridriver on port {}", port);
|
||||||
|
|
||||||
|
let safaridriver_result = tokio::process::Command::new("safaridriver")
|
||||||
|
.arg("--port")
|
||||||
|
.arg(port.to_string())
|
||||||
|
.stdout(std::process::Stdio::null())
|
||||||
|
.stderr(std::process::Stdio::null())
|
||||||
|
.spawn();
|
||||||
|
|
||||||
|
let mut safaridriver_process = match safaridriver_result {
|
||||||
|
Ok(process) => process,
|
||||||
|
Err(e) => {
|
||||||
|
return Ok(format!("❌ Failed to start safaridriver: {}\n\nMake sure safaridriver is installed.", e));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Wait for safaridriver to start up
|
||||||
|
info!("Waiting for safaridriver to start...");
|
||||||
|
tokio::time::sleep(tokio::time::Duration::from_millis(1000)).await;
|
||||||
|
|
||||||
|
// Connect to SafariDriver
|
||||||
|
match g3_computer_control::SafariDriver::with_port(port).await {
|
||||||
|
Ok(driver) => {
|
||||||
|
let session = std::sync::Arc::new(tokio::sync::Mutex::new(driver));
|
||||||
|
*self.webdriver_session.write().await = Some(session);
|
||||||
|
|
||||||
|
// Store the process handle
|
||||||
|
*self.safaridriver_process.write().await = Some(safaridriver_process);
|
||||||
|
|
||||||
|
info!("WebDriver session started successfully");
|
||||||
|
Ok("✅ WebDriver session started successfully! Safari should open automatically.".to_string())
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
// Kill the safaridriver process if connection failed
|
||||||
|
let _ = safaridriver_process.kill().await;
|
||||||
|
|
||||||
|
Ok(format!("❌ Failed to connect to SafariDriver: {}\n\nThis might be because:\n - Port {} is already in use\n - Safari failed to start\n - Network connectivity issue\n\nTry a different port or check if another safaridriver is running.", e, port))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
"webdriver_navigate" => {
|
||||||
|
debug!("Processing webdriver_navigate tool call");
|
||||||
|
|
||||||
|
if !self.config.webdriver.enabled {
|
||||||
|
return Ok("❌ WebDriver is not enabled. Use --webdriver flag to enable.".to_string());
|
||||||
|
}
|
||||||
|
|
||||||
|
let session_guard = self.webdriver_session.read().await;
|
||||||
|
let session = match session_guard.as_ref() {
|
||||||
|
Some(s) => s.clone(),
|
||||||
|
None => return Ok("❌ No active WebDriver session. Call webdriver_start first.".to_string()),
|
||||||
|
};
|
||||||
|
drop(session_guard);
|
||||||
|
let url = match tool_call.args.get("url").and_then(|v| v.as_str()) {
|
||||||
|
Some(u) => u,
|
||||||
|
None => return Ok("❌ Missing url argument".to_string()),
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut driver = session.lock().await;
|
||||||
|
match driver.navigate(url).await {
|
||||||
|
Ok(_) => Ok(format!("✅ Navigated to {}", url)),
|
||||||
|
Err(e) => Ok(format!("❌ Failed to navigate: {}", e)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
"webdriver_get_url" => {
|
||||||
|
debug!("Processing webdriver_get_url tool call");
|
||||||
|
|
||||||
|
if !self.config.webdriver.enabled {
|
||||||
|
return Ok("❌ WebDriver is not enabled. Use --webdriver flag to enable.".to_string());
|
||||||
|
}
|
||||||
|
|
||||||
|
let session_guard = self.webdriver_session.read().await;
|
||||||
|
let session = match session_guard.as_ref() {
|
||||||
|
Some(s) => s.clone(),
|
||||||
|
None => return Ok("❌ No active WebDriver session. Call webdriver_start first.".to_string()),
|
||||||
|
};
|
||||||
|
|
||||||
|
let driver = session.lock().await;
|
||||||
|
match driver.current_url().await {
|
||||||
|
Ok(url) => Ok(format!("Current URL: {}", url)),
|
||||||
|
Err(e) => Ok(format!("❌ Failed to get URL: {}", e)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
"webdriver_get_title" => {
|
||||||
|
debug!("Processing webdriver_get_title tool call");
|
||||||
|
|
||||||
|
if !self.config.webdriver.enabled {
|
||||||
|
return Ok("❌ WebDriver is not enabled. Use --webdriver flag to enable.".to_string());
|
||||||
|
}
|
||||||
|
|
||||||
|
let session_guard = self.webdriver_session.read().await;
|
||||||
|
let session = match session_guard.as_ref() {
|
||||||
|
Some(s) => s.clone(),
|
||||||
|
None => return Ok("❌ No active WebDriver session. Call webdriver_start first.".to_string()),
|
||||||
|
};
|
||||||
|
|
||||||
|
let driver = session.lock().await;
|
||||||
|
match driver.title().await {
|
||||||
|
Ok(title) => Ok(format!("Page title: {}", title)),
|
||||||
|
Err(e) => Ok(format!("❌ Failed to get title: {}", e)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
"webdriver_find_element" => {
|
||||||
|
debug!("Processing webdriver_find_element tool call");
|
||||||
|
|
||||||
|
if !self.config.webdriver.enabled {
|
||||||
|
return Ok("❌ WebDriver is not enabled. Use --webdriver flag to enable.".to_string());
|
||||||
|
}
|
||||||
|
|
||||||
|
let session_guard = self.webdriver_session.read().await;
|
||||||
|
let session = match session_guard.as_ref() {
|
||||||
|
Some(s) => s.clone(),
|
||||||
|
None => return Ok("❌ No active WebDriver session. Call webdriver_start first.".to_string()),
|
||||||
|
};
|
||||||
|
|
||||||
|
let selector = match tool_call.args.get("selector").and_then(|v| v.as_str()) {
|
||||||
|
Some(s) => s,
|
||||||
|
None => return Ok("❌ Missing selector argument".to_string()),
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut driver = session.lock().await;
|
||||||
|
match driver.find_element(selector).await {
|
||||||
|
Ok(elem) => {
|
||||||
|
match elem.text().await {
|
||||||
|
Ok(text) => Ok(format!("Element text: {}", text)),
|
||||||
|
Err(e) => Ok(format!("❌ Failed to get element text: {}", e)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => Ok(format!("❌ Failed to find element '{}': {}", selector, e)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
"webdriver_find_elements" => {
|
||||||
|
debug!("Processing webdriver_find_elements tool call");
|
||||||
|
|
||||||
|
if !self.config.webdriver.enabled {
|
||||||
|
return Ok("❌ WebDriver is not enabled. Use --webdriver flag to enable.".to_string());
|
||||||
|
}
|
||||||
|
|
||||||
|
let session_guard = self.webdriver_session.read().await;
|
||||||
|
let session = match session_guard.as_ref() {
|
||||||
|
Some(s) => s.clone(),
|
||||||
|
None => return Ok("❌ No active WebDriver session. Call webdriver_start first.".to_string()),
|
||||||
|
};
|
||||||
|
|
||||||
|
let selector = match tool_call.args.get("selector").and_then(|v| v.as_str()) {
|
||||||
|
Some(s) => s,
|
||||||
|
None => return Ok("❌ Missing selector argument".to_string()),
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut driver = session.lock().await;
|
||||||
|
match driver.find_elements(selector).await {
|
||||||
|
Ok(elements) => {
|
||||||
|
let mut results = Vec::new();
|
||||||
|
for (i, elem) in elements.iter().enumerate() {
|
||||||
|
match elem.text().await {
|
||||||
|
Ok(text) => results.push(format!("[{}]: {}", i, text)),
|
||||||
|
Err(_) => results.push(format!("[{}]: <error getting text>", i)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(format!("Found {} elements:\n{}", results.len(), results.join("\n")))
|
||||||
|
}
|
||||||
|
Err(e) => Ok(format!("❌ Failed to find elements '{}': {}", selector, e)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
"webdriver_click" => {
|
||||||
|
debug!("Processing webdriver_click tool call");
|
||||||
|
|
||||||
|
if !self.config.webdriver.enabled {
|
||||||
|
return Ok("❌ WebDriver is not enabled. Use --webdriver flag to enable.".to_string());
|
||||||
|
}
|
||||||
|
|
||||||
|
let session_guard = self.webdriver_session.read().await;
|
||||||
|
let session = match session_guard.as_ref() {
|
||||||
|
Some(s) => s.clone(),
|
||||||
|
None => return Ok("❌ No active WebDriver session. Call webdriver_start first.".to_string()),
|
||||||
|
};
|
||||||
|
|
||||||
|
let selector = match tool_call.args.get("selector").and_then(|v| v.as_str()) {
|
||||||
|
Some(s) => s,
|
||||||
|
None => return Ok("❌ Missing selector argument".to_string()),
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut driver = session.lock().await;
|
||||||
|
match driver.find_element(selector).await {
|
||||||
|
Ok(mut elem) => {
|
||||||
|
match elem.click().await {
|
||||||
|
Ok(_) => Ok(format!("✅ Clicked element '{}'", selector)),
|
||||||
|
Err(e) => Ok(format!("❌ Failed to click element: {}", e)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => Ok(format!("❌ Failed to find element '{}': {}", selector, e)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
"webdriver_send_keys" => {
|
||||||
|
debug!("Processing webdriver_send_keys tool call");
|
||||||
|
|
||||||
|
if !self.config.webdriver.enabled {
|
||||||
|
return Ok("❌ WebDriver is not enabled. Use --webdriver flag to enable.".to_string());
|
||||||
|
}
|
||||||
|
|
||||||
|
let session_guard = self.webdriver_session.read().await;
|
||||||
|
let session = match session_guard.as_ref() {
|
||||||
|
Some(s) => s.clone(),
|
||||||
|
None => return Ok("❌ No active WebDriver session. Call webdriver_start first.".to_string()),
|
||||||
|
};
|
||||||
|
|
||||||
|
let selector = match tool_call.args.get("selector").and_then(|v| v.as_str()) {
|
||||||
|
Some(s) => s,
|
||||||
|
None => return Ok("❌ Missing selector argument".to_string()),
|
||||||
|
};
|
||||||
|
|
||||||
|
let text = match tool_call.args.get("text").and_then(|v| v.as_str()) {
|
||||||
|
Some(t) => t,
|
||||||
|
None => return Ok("❌ Missing text argument".to_string()),
|
||||||
|
};
|
||||||
|
|
||||||
|
let clear_first = tool_call.args.get("clear_first")
|
||||||
|
.and_then(|v| v.as_bool())
|
||||||
|
.unwrap_or(true);
|
||||||
|
|
||||||
|
let mut driver = session.lock().await;
|
||||||
|
match driver.find_element(selector).await {
|
||||||
|
Ok(mut elem) => {
|
||||||
|
if clear_first {
|
||||||
|
if let Err(e) = elem.clear().await {
|
||||||
|
return Ok(format!("❌ Failed to clear element: {}", e));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
match elem.send_keys(text).await {
|
||||||
|
Ok(_) => Ok(format!("✅ Sent keys to element '{}'", selector)),
|
||||||
|
Err(e) => Ok(format!("❌ Failed to send keys: {}", e)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => Ok(format!("❌ Failed to find element '{}': {}", selector, e)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
"webdriver_execute_script" => {
|
||||||
|
debug!("Processing webdriver_execute_script tool call");
|
||||||
|
|
||||||
|
if !self.config.webdriver.enabled {
|
||||||
|
return Ok("❌ WebDriver is not enabled. Use --webdriver flag to enable.".to_string());
|
||||||
|
}
|
||||||
|
|
||||||
|
let session_guard = self.webdriver_session.read().await;
|
||||||
|
let session = match session_guard.as_ref() {
|
||||||
|
Some(s) => s.clone(),
|
||||||
|
None => return Ok("❌ No active WebDriver session. Call webdriver_start first.".to_string()),
|
||||||
|
};
|
||||||
|
|
||||||
|
let script = match tool_call.args.get("script").and_then(|v| v.as_str()) {
|
||||||
|
Some(s) => s,
|
||||||
|
None => return Ok("❌ Missing script argument".to_string()),
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut driver = session.lock().await;
|
||||||
|
match driver.execute_script(script, vec![]).await {
|
||||||
|
Ok(result) => Ok(format!("Script result: {:?}", result)),
|
||||||
|
Err(e) => Ok(format!("❌ Failed to execute script: {}", e)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
"webdriver_get_page_source" => {
|
||||||
|
debug!("Processing webdriver_get_page_source tool call");
|
||||||
|
|
||||||
|
if !self.config.webdriver.enabled {
|
||||||
|
return Ok("❌ WebDriver is not enabled. Use --webdriver flag to enable.".to_string());
|
||||||
|
}
|
||||||
|
|
||||||
|
let session_guard = self.webdriver_session.read().await;
|
||||||
|
let session = match session_guard.as_ref() {
|
||||||
|
Some(s) => s.clone(),
|
||||||
|
None => return Ok("❌ No active WebDriver session. Call webdriver_start first.".to_string()),
|
||||||
|
};
|
||||||
|
|
||||||
|
let driver = session.lock().await;
|
||||||
|
match driver.page_source().await {
|
||||||
|
Ok(source) => {
|
||||||
|
// Truncate if too long
|
||||||
|
if source.len() > 10000 {
|
||||||
|
Ok(format!("Page source ({} chars, truncated to 10000):\n{}...", source.len(), &source[..10000]))
|
||||||
|
} else {
|
||||||
|
Ok(format!("Page source ({} chars):\n{}", source.len(), source))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => Ok(format!("❌ Failed to get page source: {}", e)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
"webdriver_screenshot" => {
|
||||||
|
debug!("Processing webdriver_screenshot tool call");
|
||||||
|
|
||||||
|
if !self.config.webdriver.enabled {
|
||||||
|
return Ok("❌ WebDriver is not enabled. Use --webdriver flag to enable.".to_string());
|
||||||
|
}
|
||||||
|
|
||||||
|
let session_guard = self.webdriver_session.read().await;
|
||||||
|
let session = match session_guard.as_ref() {
|
||||||
|
Some(s) => s.clone(),
|
||||||
|
None => return Ok("❌ No active WebDriver session. Call webdriver_start first.".to_string()),
|
||||||
|
};
|
||||||
|
|
||||||
|
let path = match tool_call.args.get("path").and_then(|v| v.as_str()) {
|
||||||
|
Some(p) => p,
|
||||||
|
None => return Ok("❌ Missing path argument".to_string()),
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut driver = session.lock().await;
|
||||||
|
match driver.screenshot(path).await {
|
||||||
|
Ok(_) => Ok(format!("✅ Screenshot saved to {}", path)),
|
||||||
|
Err(e) => Ok(format!("❌ Failed to take screenshot: {}", e)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
"webdriver_back" => {
|
||||||
|
debug!("Processing webdriver_back tool call");
|
||||||
|
|
||||||
|
if !self.config.webdriver.enabled {
|
||||||
|
return Ok("❌ WebDriver is not enabled. Use --webdriver flag to enable.".to_string());
|
||||||
|
}
|
||||||
|
|
||||||
|
let session_guard = self.webdriver_session.read().await;
|
||||||
|
let session = match session_guard.as_ref() {
|
||||||
|
Some(s) => s.clone(),
|
||||||
|
None => return Ok("❌ No active WebDriver session. Call webdriver_start first.".to_string()),
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut driver = session.lock().await;
|
||||||
|
match driver.back().await {
|
||||||
|
Ok(_) => Ok("✅ Navigated back".to_string()),
|
||||||
|
Err(e) => Ok(format!("❌ Failed to navigate back: {}", e)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
"webdriver_forward" => {
|
||||||
|
debug!("Processing webdriver_forward tool call");
|
||||||
|
|
||||||
|
if !self.config.webdriver.enabled {
|
||||||
|
return Ok("❌ WebDriver is not enabled. Use --webdriver flag to enable.".to_string());
|
||||||
|
}
|
||||||
|
|
||||||
|
let session_guard = self.webdriver_session.read().await;
|
||||||
|
let session = match session_guard.as_ref() {
|
||||||
|
Some(s) => s.clone(),
|
||||||
|
None => return Ok("❌ No active WebDriver session. Call webdriver_start first.".to_string()),
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut driver = session.lock().await;
|
||||||
|
match driver.forward().await {
|
||||||
|
Ok(_) => Ok("✅ Navigated forward".to_string()),
|
||||||
|
Err(e) => Ok(format!("❌ Failed to navigate forward: {}", e)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
"webdriver_refresh" => {
|
||||||
|
debug!("Processing webdriver_refresh tool call");
|
||||||
|
|
||||||
|
if !self.config.webdriver.enabled {
|
||||||
|
return Ok("❌ WebDriver is not enabled. Use --webdriver flag to enable.".to_string());
|
||||||
|
}
|
||||||
|
|
||||||
|
let session_guard = self.webdriver_session.read().await;
|
||||||
|
let session = match session_guard.as_ref() {
|
||||||
|
Some(s) => s.clone(),
|
||||||
|
None => return Ok("❌ No active WebDriver session. Call webdriver_start first.".to_string()),
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut driver = session.lock().await;
|
||||||
|
match driver.refresh().await {
|
||||||
|
Ok(_) => Ok("✅ Page refreshed".to_string()),
|
||||||
|
Err(e) => Ok(format!("❌ Failed to refresh page: {}", e)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
"webdriver_quit" => {
|
||||||
|
debug!("Processing webdriver_quit tool call");
|
||||||
|
|
||||||
|
if !self.config.webdriver.enabled {
|
||||||
|
return Ok("❌ WebDriver is not enabled. Use --webdriver flag to enable.".to_string());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Take the session
|
||||||
|
let session = match self.webdriver_session.write().await.take() {
|
||||||
|
Some(s) => s.clone(),
|
||||||
|
None => return Ok("❌ No active WebDriver session.".to_string()),
|
||||||
|
};
|
||||||
|
|
||||||
|
// Quit the WebDriver session
|
||||||
|
match std::sync::Arc::try_unwrap(session) {
|
||||||
|
Ok(mutex) => {
|
||||||
|
let driver = mutex.into_inner();
|
||||||
|
match driver.quit().await {
|
||||||
|
Ok(_) => {
|
||||||
|
info!("WebDriver session closed successfully");
|
||||||
|
|
||||||
|
// Kill the safaridriver process
|
||||||
|
if let Some(mut process) = self.safaridriver_process.write().await.take() {
|
||||||
|
if let Err(e) = process.kill().await {
|
||||||
|
warn!("Failed to kill safaridriver process: {}", e);
|
||||||
|
} else {
|
||||||
|
info!("Safaridriver process terminated");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok("✅ WebDriver session closed and safaridriver stopped".to_string())
|
||||||
|
}
|
||||||
|
Err(e) => Ok(format!("❌ Failed to quit WebDriver: {}", e)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(_) => Ok("❌ Cannot quit: WebDriver session is still in use".to_string()),
|
||||||
|
}
|
||||||
|
}
|
||||||
_ => {
|
_ => {
|
||||||
warn!("Unknown tool: {}", tool_call.tool);
|
warn!("Unknown tool: {}", tool_call.tool);
|
||||||
Ok(format!("❓ Unknown tool: {}", tool_call.tool))
|
Ok(format!("❓ Unknown tool: {}", tool_call.tool))
|
||||||
@@ -3545,3 +3946,23 @@ mod integration_tests {
|
|||||||
assert_eq!(result, expected);
|
assert_eq!(result, expected);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Implement Drop to clean up safaridriver process
|
||||||
|
impl<W: UiWriter> Drop for Agent<W> {
|
||||||
|
fn drop(&mut self) {
|
||||||
|
// Try to kill safaridriver process if it's still running
|
||||||
|
// We need to use try_lock since we can't await in Drop
|
||||||
|
if let Ok(mut process_guard) = self.safaridriver_process.try_write() {
|
||||||
|
if let Some(process) = process_guard.take() {
|
||||||
|
// Use blocking kill since we can't await in Drop
|
||||||
|
// This is a best-effort cleanup
|
||||||
|
let _ = std::process::Command::new("kill")
|
||||||
|
.arg("-9")
|
||||||
|
.arg(process.id().unwrap_or(0).to_string())
|
||||||
|
.output();
|
||||||
|
|
||||||
|
debug!("Attempted to clean up safaridriver process on Agent drop");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user