Native api for screen capture

This commit is contained in:
Dhanji Prasanna
2025-10-24 16:11:12 +11:00
parent 61d748034d
commit a8af5d7cc1
5 changed files with 97 additions and 79 deletions

View File

@@ -23,7 +23,7 @@ pub trait ComputerController: Send + Sync {
async fn take_screenshot(&self, path: &str, region: Option<Rect>, window_id: Option<&str>) -> Result<()>; async fn take_screenshot(&self, path: &str, region: Option<Rect>, window_id: Option<&str>) -> Result<()>;
// OCR operations // OCR operations
async fn extract_text_from_screen(&self, region: Rect) -> Result<String>; async fn extract_text_from_screen(&self, region: Rect, window_id: &str) -> Result<String>;
async fn extract_text_from_image(&self, path: &str) -> Result<String>; async fn extract_text_from_image(&self, path: &str) -> Result<String>;
async fn extract_text_with_locations(&self, path: &str) -> Result<Vec<TextLocation>>; async fn extract_text_with_locations(&self, path: &str) -> Result<Vec<TextLocation>>;
async fn find_text_in_app(&self, app_name: &str, search_text: &str) -> Result<Option<TextLocation>>; async fn find_text_in_app(&self, app_name: &str, search_text: &str) -> Result<Option<TextLocation>>;

View File

@@ -63,10 +63,15 @@ impl ComputerController for LinuxController {
} }
async fn take_screenshot(&self, _path: &str, _region: Option<Rect>, _window_id: Option<&str>) -> Result<()> { async fn take_screenshot(&self, _path: &str, _region: Option<Rect>, _window_id: Option<&str>) -> Result<()> {
// Enforce that window_id must be provided
if _window_id.is_none() {
anyhow::bail!("window_id is required. You must specify which window to capture (e.g., 'Firefox', 'Terminal', 'gedit'). Use list_windows to see available windows.");
}
anyhow::bail!("Linux implementation not yet available") anyhow::bail!("Linux implementation not yet available")
} }
async fn extract_text_from_screen(&self, _region: Rect) -> Result<OCRResult> { async fn extract_text_from_screen(&self, _region: Rect, _window_id: &str) -> Result<String> {
anyhow::bail!("Linux implementation not yet available") anyhow::bail!("Linux implementation not yet available")
} }

View File

@@ -3,6 +3,11 @@ use crate::ocr::{OCREngine, DefaultOCR};
use anyhow::{Result, Context}; use anyhow::{Result, Context};
use async_trait::async_trait; use async_trait::async_trait;
use std::path::Path; use std::path::Path;
use core_graphics::window::{kCGWindowListOptionOnScreenOnly, kCGNullWindowID, CGWindowListCopyWindowInfo};
use core_foundation::dictionary::CFDictionary;
use core_foundation::string::CFString;
use core_foundation::base::{TCFType, ToVoid};
use core_foundation::array::CFArray;
pub struct MacOSController { pub struct MacOSController {
ocr_engine: Box<dyn OCREngine>, ocr_engine: Box<dyn OCREngine>,
@@ -22,6 +27,11 @@ impl MacOSController {
#[async_trait] #[async_trait]
impl ComputerController for MacOSController { impl ComputerController for MacOSController {
async fn take_screenshot(&self, path: &str, region: Option<Rect>, window_id: Option<&str>) -> Result<()> { async fn take_screenshot(&self, path: &str, region: Option<Rect>, window_id: Option<&str>) -> Result<()> {
// Enforce that window_id must be provided
if window_id.is_none() {
return Err(anyhow::anyhow!("window_id is required. You must specify which window to capture (e.g., 'Safari', 'Terminal', 'Google Chrome'). Use list_windows to see available windows."));
}
// Determine the temporary directory for screenshots // Determine the temporary directory for screenshots
let temp_dir = std::env::var("TMPDIR") let temp_dir = std::env::var("TMPDIR")
.or_else(|_| std::env::var("HOME").map(|h| format!("{}/tmp", h))) .or_else(|_| std::env::var("HOME").map(|h| format!("{}/tmp", h)))
@@ -42,48 +52,81 @@ impl ComputerController for MacOSController {
std::fs::create_dir_all(parent)?; std::fs::create_dir_all(parent)?;
} }
let mut cmd = std::process::Command::new("screencapture"); let app_name = window_id.unwrap(); // Safe because we checked is_none() above
// Add flags // Get the window ID for the specified application
let cg_window_id = unsafe {
let window_list = CGWindowListCopyWindowInfo(
kCGWindowListOptionOnScreenOnly,
kCGNullWindowID
);
let array = CFArray::<CFDictionary>::wrap_under_create_rule(window_list);
let count = array.len();
let mut found_window_id: Option<u32> = None;
for i in 0..count {
let dict = array.get(i).unwrap();
// Get owner name
let owner_key = CFString::from_static_string("kCGWindowOwnerName");
let owner: String = if let Some(value) = dict.find(owner_key.to_void()) {
let s: CFString = TCFType::wrap_under_get_rule(*value as *const _);
s.to_string()
} else {
continue;
};
// Check if this is the app we're looking for
if owner.to_lowercase().contains(&app_name.to_lowercase()) || app_name.to_lowercase().contains(&owner.to_lowercase()) {
// Get window ID
let window_id_key = CFString::from_static_string("kCGWindowNumber");
if let Some(value) = dict.find(window_id_key.to_void()) {
let num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*value as *const _);
if let Some(id) = num.to_i64() {
found_window_id = Some(id as u32);
break;
}
}
}
}
found_window_id
};
let cg_window_id = cg_window_id.ok_or_else(|| {
anyhow::anyhow!("Could not find window for application '{}'. Use list_windows to see available windows.", app_name)
})?;
// Use screencapture with the window ID for now
// TODO: Implement direct CGWindowListCreateImage approach with proper image saving
let mut cmd = std::process::Command::new("screencapture");
cmd.arg("-x"); // No sound cmd.arg("-x"); // No sound
cmd.arg("-l");
cmd.arg(cg_window_id.to_string());
if let Some(region) = region { if let Some(region) = region {
// Capture specific region: -R x,y,width,height
cmd.arg("-R"); cmd.arg("-R");
cmd.arg(format!("{},{},{},{}", region.x, region.y, region.width, region.height)); cmd.arg(format!("{},{},{},{}", region.x, region.y, region.width, region.height));
} }
if let Some(app_name) = window_id {
// Capture specific window by app name
// Use AppleScript to get window ID
let script = format!(r#"tell application "{}" to id of window 1"#, app_name);
let output = std::process::Command::new("osascript")
.arg("-e")
.arg(&script)
.output()?;
if output.status.success() {
let window_id_str = String::from_utf8_lossy(&output.stdout).trim().to_string();
cmd.arg(format!("-l{}", window_id_str));
}
}
cmd.arg(&final_path); cmd.arg(&final_path);
let screenshot_result = cmd.output()?; let screenshot_result = cmd.output()?;
if !screenshot_result.status.success() { if !screenshot_result.status.success() {
let stderr = String::from_utf8_lossy(&screenshot_result.stderr); let stderr = String::from_utf8_lossy(&screenshot_result.stderr);
return Err(anyhow::anyhow!("screencapture failed: {}", stderr)); return Err(anyhow::anyhow!("screencapture failed for window {}: {}", cg_window_id, stderr));
} }
Ok(()) Ok(())
} }
async fn extract_text_from_screen(&self, region: Rect) -> Result<String> { async fn extract_text_from_screen(&self, region: Rect, window_id: &str) -> Result<String> {
// Take screenshot of region first // Take screenshot of region first
let temp_path = format!("/tmp/g3_ocr_{}.png", uuid::Uuid::new_v4()); let temp_path = format!("/tmp/g3_ocr_{}.png", uuid::Uuid::new_v4());
self.take_screenshot(&temp_path, Some(region), None).await?; self.take_screenshot(&temp_path, Some(region), Some(window_id)).await?;
// Extract text from the screenshot // Extract text from the screenshot
let result = self.extract_text_from_image(&temp_path).await?; let result = self.extract_text_from_image(&temp_path).await?;

View File

@@ -62,10 +62,15 @@ impl ComputerController for WindowsController {
} }
async fn take_screenshot(&self, _path: &str, _region: Option<Rect>, _window_id: Option<&str>) -> Result<()> { async fn take_screenshot(&self, _path: &str, _region: Option<Rect>, _window_id: Option<&str>) -> Result<()> {
// Enforce that window_id must be provided
if _window_id.is_none() {
anyhow::bail!("window_id is required. You must specify which window to capture (e.g., 'Chrome', 'Terminal', 'Notepad'). Use list_windows to see available windows.");
}
anyhow::bail!("Windows implementation not yet available") anyhow::bail!("Windows implementation not yet available")
} }
async fn extract_text_from_screen(&self, _region: Rect) -> Result<OCRResult> { async fn extract_text_from_screen(&self, _region: Rect, _window_id: &str) -> Result<String> {
anyhow::bail!("Windows implementation not yet available") anyhow::bail!("Windows implementation not yet available")
} }

View File

@@ -1798,7 +1798,7 @@ Template:
}, },
Tool { Tool {
name: "take_screenshot".to_string(), name: "take_screenshot".to_string(),
description: "Capture a screenshot of the screen, region, or window. When capturing a specific application window (e.g., 'Safari', 'Terminal'), use the window_id parameter with just the application name. The tool will automatically use the native screencapture command with the application's window ID for a clean capture.".to_string(), description: "Capture a screenshot of a specific application window. You MUST specify the window_id parameter with the application name (e.g., 'Safari', 'Terminal', 'Google Chrome'). The tool will automatically use the native screencapture command with the application's window ID for a clean capture. Use list_windows first to identify available windows.".to_string(),
input_schema: json!({ input_schema: json!({
"type": "object", "type": "object",
"properties": { "properties": {
@@ -1808,7 +1808,7 @@ Template:
}, },
"window_id": { "window_id": {
"type": "string", "type": "string",
"description": "Optional application name to capture (e.g., 'Safari', 'Terminal', 'Google Chrome'). The tool will capture the frontmost window of that application using its native window ID." "description": "REQUIRED: Application name to capture (e.g., 'Safari', 'Terminal', 'Google Chrome'). The tool will capture the frontmost window of that application using its native window ID."
}, },
"region": { "region": {
"type": "object", "type": "object",
@@ -1820,12 +1820,12 @@ Template:
} }
} }
}, },
"required": ["path"] "required": ["path", "window_id"]
}), }),
}, },
Tool { Tool {
name: "extract_text".to_string(), name: "extract_text".to_string(),
description: "Extract text from a screen region or image file using OCR. Returns plain text only (no bounding boxes). For text with location/coordinates, use vision_find_text instead.".to_string(), description: "Extract text from an image file using OCR. For extracting text from a specific window, use vision_find_text instead which automatically handles window capture.".to_string(),
input_schema: json!({ input_schema: json!({
"type": "object", "type": "object",
"properties": { "properties": {
@@ -1833,16 +1833,6 @@ Template:
"type": "string", "type": "string",
"description": "Path to image file (optional if region is provided)" "description": "Path to image file (optional if region is provided)"
}, },
"region": {
"type": "object",
"description": "Screen region to capture and extract text from",
"properties": {
"x": {"type": "integer"},
"y": {"type": "integer"},
"width": {"type": "integer"},
"height": {"type": "integer"}
}
}
} }
}), }),
}, },
@@ -3750,8 +3740,9 @@ Template:
.and_then(|v| v.as_str()) .and_then(|v| v.as_str())
.ok_or_else(|| anyhow::anyhow!("Missing path argument"))?; .ok_or_else(|| anyhow::anyhow!("Missing path argument"))?;
// Extract window_id (app name) if provided // Extract window_id (app name) - REQUIRED
let window_id = tool_call.args.get("window_id").and_then(|v| v.as_str()); let window_id = tool_call.args.get("window_id").and_then(|v| v.as_str())
.ok_or_else(|| anyhow::anyhow!("Missing window_id argument. You must specify which window to capture (e.g., 'Safari', 'Terminal', 'Google Chrome')."))?;
// Extract region if provided // Extract region if provided
let region = tool_call let region = tool_call
@@ -3771,7 +3762,7 @@ Template:
.unwrap_or(0) as i32, .unwrap_or(0) as i32,
}); });
match controller.take_screenshot(path, region, window_id).await { match controller.take_screenshot(path, region, Some(window_id)).await {
Ok(_) => { Ok(_) => {
// Get the actual path where the screenshot was saved // Get the actual path where the screenshot was saved
let actual_path = if path.starts_with('/') { let actual_path = if path.starts_with('/') {
@@ -3785,14 +3776,10 @@ Template:
format!("{}/{}", temp_dir.trim_end_matches('/'), path) format!("{}/{}", temp_dir.trim_end_matches('/'), path)
}; };
if let Some(app) = window_id { Ok(format!(
Ok(format!( "✅ Screenshot of {} saved to: {}",
"✅ Screenshot of {} saved to: {}", window_id, actual_path
app, actual_path ))
))
} else {
Ok(format!("✅ Screenshot saved to: {}", actual_path))
}
} }
Err(e) => Ok(format!("❌ Failed to take screenshot: {}", e)), Err(e) => Ok(format!("❌ Failed to take screenshot: {}", e)),
} }
@@ -3802,36 +3789,14 @@ Template:
} }
"extract_text" => { "extract_text" => {
if let Some(controller) = &self.computer_controller { if let Some(controller) = &self.computer_controller {
// Check if we have a path or a region let path = tool_call.args.get("path")
if let Some(path) = tool_call.args.get("path").and_then(|v| v.as_str()) { .and_then(|v| v.as_str())
// Extract text from image file .ok_or_else(|| anyhow::anyhow!("Missing path argument"))?;
match controller.extract_text_from_image(path).await {
Ok(text) => Ok(format!("✅ Extracted text:\n{}", text)),
Err(e) => Ok(format!("❌ Failed to extract text: {}", e)),
}
} else if let Some(region_obj) =
tool_call.args.get("region").and_then(|v| v.as_object())
{
// Extract text from screen region
let region = g3_computer_control::types::Rect {
x: region_obj.get("x").and_then(|v| v.as_i64()).unwrap_or(0) as i32,
y: region_obj.get("y").and_then(|v| v.as_i64()).unwrap_or(0) as i32,
width: region_obj
.get("width")
.and_then(|v| v.as_i64())
.unwrap_or(0) as i32,
height: region_obj
.get("height")
.and_then(|v| v.as_i64())
.unwrap_or(0) as i32,
};
match controller.extract_text_from_screen(region).await { // Extract text from image file only
Ok(text) => Ok(format!("✅ Extracted text:\n{}", text)), match controller.extract_text_from_image(path).await {
Err(e) => Ok(format!("❌ Failed to extract text: {}", e)), Ok(text) => Ok(format!("✅ Extracted text:\n{}", text)),
} Err(e) => Ok(format!("❌ Failed to extract text: {}", e)),
} else {
Ok("❌ Missing path or region argument".to_string())
} }
} else { } else {
Ok("❌ Computer control not enabled. Set computer_control.enabled = true in config.".to_string()) Ok("❌ Computer control not enabled. Set computer_control.enabled = true in config.".to_string())