diff --git a/crates/g3-computer-control/src/lib.rs b/crates/g3-computer-control/src/lib.rs index 355a591..b1cbc36 100644 --- a/crates/g3-computer-control/src/lib.rs +++ b/crates/g3-computer-control/src/lib.rs @@ -23,7 +23,7 @@ pub trait ComputerController: Send + Sync { async fn take_screenshot(&self, path: &str, region: Option, window_id: Option<&str>) -> Result<()>; // OCR operations - async fn extract_text_from_screen(&self, region: Rect) -> Result; + async fn extract_text_from_screen(&self, region: Rect, window_id: &str) -> Result; async fn extract_text_from_image(&self, path: &str) -> Result; async fn extract_text_with_locations(&self, path: &str) -> Result>; async fn find_text_in_app(&self, app_name: &str, search_text: &str) -> Result>; diff --git a/crates/g3-computer-control/src/platform/linux.rs b/crates/g3-computer-control/src/platform/linux.rs index 2a9d89c..cf485ed 100644 --- a/crates/g3-computer-control/src/platform/linux.rs +++ b/crates/g3-computer-control/src/platform/linux.rs @@ -63,10 +63,15 @@ impl ComputerController for LinuxController { } async fn take_screenshot(&self, _path: &str, _region: Option, _window_id: Option<&str>) -> Result<()> { + // Enforce that window_id must be provided + if _window_id.is_none() { + anyhow::bail!("window_id is required. You must specify which window to capture (e.g., 'Firefox', 'Terminal', 'gedit'). Use list_windows to see available windows."); + } + anyhow::bail!("Linux implementation not yet available") } - async fn extract_text_from_screen(&self, _region: Rect) -> Result { + async fn extract_text_from_screen(&self, _region: Rect, _window_id: &str) -> Result { anyhow::bail!("Linux implementation not yet available") } diff --git a/crates/g3-computer-control/src/platform/macos.rs b/crates/g3-computer-control/src/platform/macos.rs index da1aa95..b8b6bea 100644 --- a/crates/g3-computer-control/src/platform/macos.rs +++ b/crates/g3-computer-control/src/platform/macos.rs @@ -3,6 +3,11 @@ use crate::ocr::{OCREngine, DefaultOCR}; use anyhow::{Result, Context}; use async_trait::async_trait; use std::path::Path; +use core_graphics::window::{kCGWindowListOptionOnScreenOnly, kCGNullWindowID, CGWindowListCopyWindowInfo}; +use core_foundation::dictionary::CFDictionary; +use core_foundation::string::CFString; +use core_foundation::base::{TCFType, ToVoid}; +use core_foundation::array::CFArray; pub struct MacOSController { ocr_engine: Box, @@ -22,6 +27,11 @@ impl MacOSController { #[async_trait] impl ComputerController for MacOSController { async fn take_screenshot(&self, path: &str, region: Option, window_id: Option<&str>) -> Result<()> { + // Enforce that window_id must be provided + if window_id.is_none() { + return Err(anyhow::anyhow!("window_id is required. You must specify which window to capture (e.g., 'Safari', 'Terminal', 'Google Chrome'). Use list_windows to see available windows.")); + } + // Determine the temporary directory for screenshots let temp_dir = std::env::var("TMPDIR") .or_else(|_| std::env::var("HOME").map(|h| format!("{}/tmp", h))) @@ -42,48 +52,81 @@ impl ComputerController for MacOSController { std::fs::create_dir_all(parent)?; } - let mut cmd = std::process::Command::new("screencapture"); + let app_name = window_id.unwrap(); // Safe because we checked is_none() above - // Add flags + // Get the window ID for the specified application + let cg_window_id = unsafe { + let window_list = CGWindowListCopyWindowInfo( + kCGWindowListOptionOnScreenOnly, + kCGNullWindowID + ); + + let array = CFArray::::wrap_under_create_rule(window_list); + let count = array.len(); + + let mut found_window_id: Option = None; + + for i in 0..count { + let dict = array.get(i).unwrap(); + + // Get owner name + let owner_key = CFString::from_static_string("kCGWindowOwnerName"); + let owner: String = if let Some(value) = dict.find(owner_key.to_void()) { + let s: CFString = TCFType::wrap_under_get_rule(*value as *const _); + s.to_string() + } else { + continue; + }; + + // Check if this is the app we're looking for + if owner.to_lowercase().contains(&app_name.to_lowercase()) || app_name.to_lowercase().contains(&owner.to_lowercase()) { + // Get window ID + let window_id_key = CFString::from_static_string("kCGWindowNumber"); + if let Some(value) = dict.find(window_id_key.to_void()) { + let num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*value as *const _); + if let Some(id) = num.to_i64() { + found_window_id = Some(id as u32); + break; + } + } + } + } + + found_window_id + }; + + let cg_window_id = cg_window_id.ok_or_else(|| { + anyhow::anyhow!("Could not find window for application '{}'. Use list_windows to see available windows.", app_name) + })?; + + // Use screencapture with the window ID for now + // TODO: Implement direct CGWindowListCreateImage approach with proper image saving + let mut cmd = std::process::Command::new("screencapture"); cmd.arg("-x"); // No sound + cmd.arg("-l"); + cmd.arg(cg_window_id.to_string()); if let Some(region) = region { - // Capture specific region: -R x,y,width,height cmd.arg("-R"); cmd.arg(format!("{},{},{},{}", region.x, region.y, region.width, region.height)); } - if let Some(app_name) = window_id { - // Capture specific window by app name - // Use AppleScript to get window ID - let script = format!(r#"tell application "{}" to id of window 1"#, app_name); - let output = std::process::Command::new("osascript") - .arg("-e") - .arg(&script) - .output()?; - - if output.status.success() { - let window_id_str = String::from_utf8_lossy(&output.stdout).trim().to_string(); - cmd.arg(format!("-l{}", window_id_str)); - } - } - cmd.arg(&final_path); let screenshot_result = cmd.output()?; if !screenshot_result.status.success() { let stderr = String::from_utf8_lossy(&screenshot_result.stderr); - return Err(anyhow::anyhow!("screencapture failed: {}", stderr)); + return Err(anyhow::anyhow!("screencapture failed for window {}: {}", cg_window_id, stderr)); } Ok(()) } - async fn extract_text_from_screen(&self, region: Rect) -> Result { + async fn extract_text_from_screen(&self, region: Rect, window_id: &str) -> Result { // Take screenshot of region first let temp_path = format!("/tmp/g3_ocr_{}.png", uuid::Uuid::new_v4()); - self.take_screenshot(&temp_path, Some(region), None).await?; + self.take_screenshot(&temp_path, Some(region), Some(window_id)).await?; // Extract text from the screenshot let result = self.extract_text_from_image(&temp_path).await?; diff --git a/crates/g3-computer-control/src/platform/windows.rs b/crates/g3-computer-control/src/platform/windows.rs index 6213d56..f3250f7 100644 --- a/crates/g3-computer-control/src/platform/windows.rs +++ b/crates/g3-computer-control/src/platform/windows.rs @@ -62,10 +62,15 @@ impl ComputerController for WindowsController { } async fn take_screenshot(&self, _path: &str, _region: Option, _window_id: Option<&str>) -> Result<()> { + // Enforce that window_id must be provided + if _window_id.is_none() { + anyhow::bail!("window_id is required. You must specify which window to capture (e.g., 'Chrome', 'Terminal', 'Notepad'). Use list_windows to see available windows."); + } + anyhow::bail!("Windows implementation not yet available") } - async fn extract_text_from_screen(&self, _region: Rect) -> Result { + async fn extract_text_from_screen(&self, _region: Rect, _window_id: &str) -> Result { anyhow::bail!("Windows implementation not yet available") } diff --git a/crates/g3-core/src/lib.rs b/crates/g3-core/src/lib.rs index dd3e52c..69a90ca 100644 --- a/crates/g3-core/src/lib.rs +++ b/crates/g3-core/src/lib.rs @@ -1798,7 +1798,7 @@ Template: }, Tool { name: "take_screenshot".to_string(), - description: "Capture a screenshot of the screen, region, or window. When capturing a specific application window (e.g., 'Safari', 'Terminal'), use the window_id parameter with just the application name. The tool will automatically use the native screencapture command with the application's window ID for a clean capture.".to_string(), + description: "Capture a screenshot of a specific application window. You MUST specify the window_id parameter with the application name (e.g., 'Safari', 'Terminal', 'Google Chrome'). The tool will automatically use the native screencapture command with the application's window ID for a clean capture. Use list_windows first to identify available windows.".to_string(), input_schema: json!({ "type": "object", "properties": { @@ -1808,7 +1808,7 @@ Template: }, "window_id": { "type": "string", - "description": "Optional application name to capture (e.g., 'Safari', 'Terminal', 'Google Chrome'). The tool will capture the frontmost window of that application using its native window ID." + "description": "REQUIRED: Application name to capture (e.g., 'Safari', 'Terminal', 'Google Chrome'). The tool will capture the frontmost window of that application using its native window ID." }, "region": { "type": "object", @@ -1820,12 +1820,12 @@ Template: } } }, - "required": ["path"] + "required": ["path", "window_id"] }), }, Tool { name: "extract_text".to_string(), - description: "Extract text from a screen region or image file using OCR. Returns plain text only (no bounding boxes). For text with location/coordinates, use vision_find_text instead.".to_string(), + description: "Extract text from an image file using OCR. For extracting text from a specific window, use vision_find_text instead which automatically handles window capture.".to_string(), input_schema: json!({ "type": "object", "properties": { @@ -1833,16 +1833,6 @@ Template: "type": "string", "description": "Path to image file (optional if region is provided)" }, - "region": { - "type": "object", - "description": "Screen region to capture and extract text from", - "properties": { - "x": {"type": "integer"}, - "y": {"type": "integer"}, - "width": {"type": "integer"}, - "height": {"type": "integer"} - } - } } }), }, @@ -3750,8 +3740,9 @@ Template: .and_then(|v| v.as_str()) .ok_or_else(|| anyhow::anyhow!("Missing path argument"))?; - // Extract window_id (app name) if provided - let window_id = tool_call.args.get("window_id").and_then(|v| v.as_str()); + // Extract window_id (app name) - REQUIRED + let window_id = tool_call.args.get("window_id").and_then(|v| v.as_str()) + .ok_or_else(|| anyhow::anyhow!("Missing window_id argument. You must specify which window to capture (e.g., 'Safari', 'Terminal', 'Google Chrome')."))?; // Extract region if provided let region = tool_call @@ -3771,7 +3762,7 @@ Template: .unwrap_or(0) as i32, }); - match controller.take_screenshot(path, region, window_id).await { + match controller.take_screenshot(path, region, Some(window_id)).await { Ok(_) => { // Get the actual path where the screenshot was saved let actual_path = if path.starts_with('/') { @@ -3785,14 +3776,10 @@ Template: format!("{}/{}", temp_dir.trim_end_matches('/'), path) }; - if let Some(app) = window_id { - Ok(format!( - "✅ Screenshot of {} saved to: {}", - app, actual_path - )) - } else { - Ok(format!("✅ Screenshot saved to: {}", actual_path)) - } + Ok(format!( + "✅ Screenshot of {} saved to: {}", + window_id, actual_path + )) } Err(e) => Ok(format!("❌ Failed to take screenshot: {}", e)), } @@ -3802,36 +3789,14 @@ Template: } "extract_text" => { if let Some(controller) = &self.computer_controller { - // Check if we have a path or a region - if let Some(path) = tool_call.args.get("path").and_then(|v| v.as_str()) { - // Extract text from image file - match controller.extract_text_from_image(path).await { - Ok(text) => Ok(format!("✅ Extracted text:\n{}", text)), - Err(e) => Ok(format!("❌ Failed to extract text: {}", e)), - } - } else if let Some(region_obj) = - tool_call.args.get("region").and_then(|v| v.as_object()) - { - // Extract text from screen region - let region = g3_computer_control::types::Rect { - x: region_obj.get("x").and_then(|v| v.as_i64()).unwrap_or(0) as i32, - y: region_obj.get("y").and_then(|v| v.as_i64()).unwrap_or(0) as i32, - width: region_obj - .get("width") - .and_then(|v| v.as_i64()) - .unwrap_or(0) as i32, - height: region_obj - .get("height") - .and_then(|v| v.as_i64()) - .unwrap_or(0) as i32, - }; - - match controller.extract_text_from_screen(region).await { - Ok(text) => Ok(format!("✅ Extracted text:\n{}", text)), - Err(e) => Ok(format!("❌ Failed to extract text: {}", e)), - } - } else { - Ok("❌ Missing path or region argument".to_string()) + let path = tool_call.args.get("path") + .and_then(|v| v.as_str()) + .ok_or_else(|| anyhow::anyhow!("Missing path argument"))?; + + // Extract text from image file only + match controller.extract_text_from_image(path).await { + Ok(text) => Ok(format!("✅ Extracted text:\n{}", text)), + Err(e) => Ok(format!("❌ Failed to extract text: {}", e)), } } else { Ok("❌ Computer control not enabled. Set computer_control.enabled = true in config.".to_string())