From af6d37a8e27f0573b7d4cc17b1e7dd189feff517 Mon Sep 17 00:00:00 2001 From: Michael Neale Date: Wed, 22 Oct 2025 14:58:35 +1100 Subject: [PATCH 01/16] Add --interactive-requirements flag for AI-enhanced requirements mode - Adds new --interactive-requirements CLI flag for autonomous mode - Prompts user for brief requirements input - Uses AI to enhance and structure requirements into proper markdown - Shows enhanced requirements and allows user to approve/edit/cancel - Saves to requirements.md and proceeds with autonomous mode if approved - Includes test script for manual verification --- crates/g3-cli/src/lib.rs | 110 +++++++++++++++++++++++++++++++++++++++ test-ai-requirements.sh | 39 ++++++++++++++ 2 files changed, 149 insertions(+) create mode 100755 test-ai-requirements.sh diff --git a/crates/g3-cli/src/lib.rs b/crates/g3-cli/src/lib.rs index e901f22..be0dc19 100644 --- a/crates/g3-cli/src/lib.rs +++ b/crates/g3-cli/src/lib.rs @@ -216,6 +216,10 @@ pub struct Cli { #[arg(long, value_name = "TEXT")] pub requirements: Option, + /// Interactive mode: prompt for requirements and save to requirements.md before starting autonomous mode + #[arg(long)] + pub interactive_requirements: bool, + /// Use retro terminal UI (inspired by 80s sci-fi) #[arg(long)] pub retro: bool, @@ -303,6 +307,112 @@ pub async fn run() -> Result<()> { // Create project model let project = if cli.autonomous { + // Handle interactive requirements mode with AI enhancement + if cli.interactive_requirements { + println!("\n📝 Interactive Requirements Mode"); + println!("================================\n"); + println!("Describe what you want to build (can be brief):"); + println!("Press Ctrl+D (Unix) or Ctrl+Z (Windows) when done.\n"); + + use std::io::{self, Read, Write}; + let mut requirements_input = String::new(); + io::stdin().read_to_string(&mut requirements_input)?; + + if requirements_input.trim().is_empty() { + anyhow::bail!("No requirements provided. Exiting."); + } + + println!("\n🤖 Enhancing your requirements with AI...\n"); + + // Create a temporary agent to enhance the requirements + let temp_config = Config::load_with_overrides( + cli.config.as_deref(), + cli.provider.clone(), + cli.model.clone(), + )?; + + let ui_writer = ConsoleUiWriter::new(); + let mut temp_agent = Agent::new_with_readme_and_quiet( + temp_config, + ui_writer, + None, + true, // quiet mode + ).await?; + + // Craft the enhancement prompt + let enhancement_prompt = format!( + r#"You are a requirements analyst. Take this brief user input and expand it into a structured requirements document. + +USER INPUT: +{} + +Create a professional requirements document with: +1. A clear project title (# heading) +2. An overview section explaining what will be built +3. Organized requirements (functional, technical, quality) +4. Acceptance criteria +5. Any technical constraints or preferences mentioned + +Format as proper markdown. Be specific and actionable. If the user's input is vague, make reasonable assumptions but keep it focused on what they described. + +Output ONLY the markdown content, no explanations or meta-commentary."#, + requirements_input.trim() + ); + + // Execute enhancement task + let result = temp_agent + .execute_task_with_timing(&enhancement_prompt, None, false, false, false, false) + .await?; + + let enhanced_requirements = result.response.trim().to_string(); + + // Show the enhanced requirements + println!("\n📋 Enhanced Requirements Document:"); + println!("{}\n", "=".repeat(60)); + println!("{}", enhanced_requirements); + println!("{}\n", "=".repeat(60)); + + // Ask for confirmation + println!("\n❓ Is this requirements document acceptable?"); + println!(" [y] Yes, proceed with autonomous mode"); + println!(" [e] Edit and save manually"); + println!(" [n] No, cancel\n"); + + print!("Your choice (y/e/n): "); + io::stdout().flush()?; + + let mut choice = String::new(); + io::stdin().read_line(&mut choice)?; + let choice = choice.trim().to_lowercase(); + + let requirements_path = workspace_dir.join("requirements.md"); + + match choice.as_str() { + "y" | "yes" => { + // Save enhanced requirements + std::fs::write(&requirements_path, &enhanced_requirements)?; + println!("\n✅ Requirements saved to: {}", requirements_path.display()); + println!("🚀 Starting autonomous mode...\n"); + } + "e" | "edit" => { + // Save enhanced requirements for manual editing + std::fs::write(&requirements_path, &enhanced_requirements)?; + println!("\n✅ Requirements saved to: {}", requirements_path.display()); + println!("📝 Please edit the file and run: g3 --autonomous"); + println!(" Exiting for now.\n"); + return Ok(()); + } + "n" | "no" => { + println!("\n❌ Cancelled. No files were saved.\n"); + return Ok(()); + } + _ => { + println!("\n❌ Invalid choice. Cancelled.\n"); + return Ok(()); + } + } + } + if let Some(requirements_text) = cli.requirements { // Use requirements text override Project::new_autonomous_with_requirements(workspace_dir.clone(), requirements_text)? diff --git a/test-ai-requirements.sh b/test-ai-requirements.sh new file mode 100755 index 0000000..06c97fc --- /dev/null +++ b/test-ai-requirements.sh @@ -0,0 +1,39 @@ +#!/bin/bash +# Test script for AI-enhanced interactive requirements mode + +echo "Testing AI-enhanced interactive requirements mode..." +echo "" + +# Create a test workspace +TEST_WORKSPACE="/tmp/g3-test-interactive-$(date +%s)" +mkdir -p "$TEST_WORKSPACE" + +echo "Test workspace: $TEST_WORKSPACE" +echo "" + +# Create sample brief input +BRIEF_INPUT="build a calculator cli in rust with basic operations" + +echo "Brief input:" +echo "---" +echo "$BRIEF_INPUT" +echo "---" +echo "" + +echo "This will:" +echo "1. Send brief input to AI" +echo "2. AI generates structured requirements.md" +echo "3. Show enhanced requirements" +echo "4. Prompt for confirmation (y/e/n)" +echo "" + +echo "To test manually, run:" +echo "cargo run -- --autonomous --interactive-requirements --workspace $TEST_WORKSPACE" +echo "" +echo "Then type: $BRIEF_INPUT" +echo "Press Ctrl+D" +echo "Review the AI-generated requirements" +echo "Choose 'y' to proceed, 'e' to edit, or 'n' to cancel" +echo "" + +echo "Test workspace will be at: $TEST_WORKSPACE" From c5d6fbef0851e7f76586780c2ad5ce523570854c Mon Sep 17 00:00:00 2001 From: Dhanji Prasanna Date: Wed, 22 Oct 2025 22:14:12 +1100 Subject: [PATCH 02/16] control commands --- README.md | 10 + crates/g3-cli/src/lib.rs | 77 ++++- crates/g3-cli/src/retro_tui.rs | 49 ++-- crates/g3-cli/src/tui.rs | 2 +- crates/g3-cli/src/ui_writer_impl.rs | 3 +- crates/g3-core/src/fixed_filter_json.rs | 8 +- crates/g3-core/src/lib.rs | 367 ++++++++++++++++++++++-- crates/g3-core/src/project.rs | 1 + crates/g3-execution/src/lib.rs | 2 +- crates/g3-providers/src/databricks.rs | 2 +- crates/g3-providers/src/oauth.rs | 2 +- crates/g3-providers/src/openai.rs | 4 +- 12 files changed, 446 insertions(+), 81 deletions(-) diff --git a/README.md b/README.md index a205213..e3aefc1 100644 --- a/README.md +++ b/README.md @@ -72,6 +72,16 @@ G3 includes robust error handling with automatic retry logic: - Conversation history preservation through summaries - Dynamic token allocation for different providers (4k to 200k+ tokens) +### Interactive Control Commands +G3's interactive CLI includes control commands for manual context management: +- **`/compact`**: Manually trigger summarization to compact conversation history +- **`/thinnify`**: Manually trigger context thinning to replace large tool results with file references +- **`/readme`**: Reload README.md and AGENTS.md from disk without restarting +- **`/stats`**: Show detailed context and performance statistics +- **`/help`**: Display all available control commands + +These commands give you fine-grained control over context management, allowing you to proactively optimize token usage and refresh project documentation. See [Control Commands Documentation](docs/CONTROL_COMMANDS.md) for detailed usage. + ### Tool Ecosystem - **File Operations**: Read, write, and edit files with line-range precision - **Shell Integration**: Execute system commands with output capture diff --git a/crates/g3-cli/src/lib.rs b/crates/g3-cli/src/lib.rs index be0dc19..1382bd5 100644 --- a/crates/g3-cli/src/lib.rs +++ b/crates/g3-cli/src/lib.rs @@ -637,8 +637,8 @@ fn extract_readme_heading(readme_content: &str) -> Option { let trimmed = line.trim(); // Check for H1 heading (# Title) - if trimmed.starts_with("# ") { - let title = trimmed[2..].trim(); + if let Some(stripped) = trimmed.strip_prefix("# ") { + let title = stripped.trim(); if !title.is_empty() { // Return the full title (including any description after dash) return Some(title.to_string()); @@ -807,9 +807,8 @@ async fn run_interactive_retro( let trimmed = input_buffer.trim_end(); // Check if line ends with backslash for continuation - if trimmed.ends_with('\\') { + if let Some(without_backslash) = trimmed.strip_suffix('\\') { // Remove the backslash and add to buffer - let without_backslash = &trimmed[..trimmed.len() - 1]; multiline_buffer.push_str(without_backslash); multiline_buffer.push('\n'); in_multiline = true; @@ -1013,9 +1012,8 @@ async fn run_interactive( let trimmed = line.trim_end(); // Check if line ends with backslash for continuation - if trimmed.ends_with('\\') { + if let Some(without_backslash) = trimmed.strip_suffix('\\') { // Remove the backslash and add to buffer - let without_backslash = &trimmed[..trimmed.len() - 1]; multiline_buffer.push_str(without_backslash); multiline_buffer.push('\n'); in_multiline = true; @@ -1058,6 +1056,63 @@ async fn run_interactive( // Add to history rl.add_history_entry(&input)?; + // Check for control commands + if input.starts_with('/') { + match input.as_str() { + "/help" => { + output.print(""); + output.print("📖 Control Commands:"); + output.print(" /compact - Trigger auto-summarization (compacts conversation history)"); + output.print(" /thinnify - Trigger context thinning (replaces large tool results with file references)"); + output.print(" /readme - Reload README.md and AGENTS.md from disk"); + output.print(" /stats - Show detailed context and performance statistics"); + output.print(" /help - Show this help message"); + output.print(" exit/quit - Exit the interactive session"); + output.print(""); + continue; + } + "/compact" => { + output.print("🗜️ Triggering manual summarization..."); + match agent.force_summarize().await { + Ok(true) => { + output.print("✅ Summarization completed successfully"); + } + Ok(false) => { + output.print("⚠️ Summarization failed"); + } + Err(e) => { + output.print(&format!("❌ Error during summarization: {}", e)); + } + } + continue; + } + "/thinnify" => { + output.print("🔧 Triggering manual context thinning..."); + let summary = agent.force_thin(); + output.print(&summary); + continue; + } + "/readme" => { + output.print("📚 Reloading README.md and AGENTS.md..."); + match agent.reload_readme() { + Ok(true) => output.print("✅ README content reloaded successfully"), + Ok(false) => output.print("⚠️ No README was loaded at startup, cannot reload"), + Err(e) => output.print(&format!("❌ Error reloading README: {}", e)), + } + continue; + } + "/stats" => { + let stats = agent.get_stats(); + output.print(&stats); + continue; + } + _ => { + output.print(&format!("❌ Unknown command: {}. Type /help for available commands.", input)); + continue; + } + } + } + // Process the single line input execute_task(&mut agent, &input, show_prompt, show_code, &output).await; } @@ -1282,7 +1337,7 @@ async fn run_autonomous( elapsed.as_secs_f64() )); output.print(&format!("🔄 Turns Taken: 0/{}", max_turns)); - output.print(&format!("📝 Final Status: ⚠️ NO REQUIREMENTS FILE")); + output.print("📝 Final Status: ⚠️ NO REQUIREMENTS FILE"); output.print("\n📈 Token Usage Statistics:"); output.print(&format!(" • Used Tokens: {}", context_window.used_tokens)); @@ -1324,7 +1379,7 @@ async fn run_autonomous( elapsed.as_secs_f64() )); output.print(&format!("🔄 Turns Taken: 0/{}", max_turns)); - output.print(&format!("📝 Final Status: ⚠️ CANNOT READ REQUIREMENTS")); + output.print("📝 Final Status: ⚠️ CANNOT READ REQUIREMENTS"); output.print("\n📈 Token Usage Statistics:"); output.print(&format!(" • Used Tokens: {}", context_window.used_tokens)); @@ -1410,7 +1465,7 @@ async fn run_autonomous( "📋 Player received coach feedback ({} chars):", coach_feedback.len() )); - output.print(&format!("{}", coach_feedback)); + output.print(&coach_feedback.to_string()); } output.print(""); // Empty line for readability @@ -1455,7 +1510,7 @@ async fn run_autonomous( elapsed.as_secs_f64() )); output.print(&format!("🔄 Turns Taken: {}/{}", turn, max_turns)); - output.print(&format!("📝 Final Status: 💥 PLAYER PANIC")); + output.print("📝 Final Status: 💥 PLAYER PANIC"); output.print("\n📈 Token Usage Statistics:"); output.print(&format!( @@ -1616,7 +1671,7 @@ Remember: Be clear in your review and concise in your feedback. APPROVE if the i elapsed.as_secs_f64() )); output.print(&format!("🔄 Turns Taken: {}/{}", turn, max_turns)); - output.print(&format!("📝 Final Status: 💥 COACH PANIC")); + output.print("📝 Final Status: 💥 COACH PANIC"); output.print("\n📈 Token Usage Statistics:"); output.print(&format!(" • Used Tokens: {}", context_window.used_tokens)); diff --git a/crates/g3-cli/src/retro_tui.rs b/crates/g3-cli/src/retro_tui.rs index 2aa9e97..9c84172 100644 --- a/crates/g3-cli/src/retro_tui.rs +++ b/crates/g3-cli/src/retro_tui.rs @@ -267,23 +267,23 @@ impl TerminalState { let mut current_text = String::new(); // Check for headers first - if line.starts_with("### ") { + if let Some(stripped) = line.strip_prefix("### ") { return Line::from(Span::styled( - format!(" {}", &line[4..]), + format!(" {}", stripped), Style::default() .fg(self.theme.terminal_cyan.to_color()) .add_modifier(Modifier::BOLD | Modifier::UNDERLINED), )); - } else if line.starts_with("## ") { + } else if let Some(stripped) = line.strip_prefix("## ") { return Line::from(Span::styled( - format!(" {}", &line[3..]), + format!(" {}", stripped), Style::default() .fg(self.theme.terminal_amber.to_color()) .add_modifier(Modifier::BOLD), )); - } else if line.starts_with("# ") { + } else if let Some(stripped) = line.strip_prefix("# ") { return Line::from(Span::styled( - format!(" {}", &line[2..]), + format!(" {}", stripped), Style::default() .fg(self.theme.terminal_green.to_color()) .add_modifier(Modifier::BOLD), @@ -343,7 +343,7 @@ impl TerminalState { } // Find closing * let mut italic_text = String::new(); - while let Some(ch) = chars.next() { + for ch in chars.by_ref() { if ch == '*' { break; } @@ -367,7 +367,7 @@ impl TerminalState { } // Find closing ` let mut code_text = String::new(); - while let Some(ch) = chars.next() { + for ch in chars.by_ref() { if ch == '`' { break; } @@ -612,11 +612,9 @@ impl RetroTui { } // Update status blink only if status is "PROCESSING" - if state.status_line == "PROCESSING" { - if state.last_status_blink.elapsed() > Duration::from_millis(500) { - state.status_blink = !state.status_blink; - state.last_status_blink = Instant::now(); - } + if state.status_line == "PROCESSING" && state.last_status_blink.elapsed() > Duration::from_millis(500) { + state.status_blink = !state.status_blink; + state.last_status_blink = Instant::now(); } // Update activity area animation @@ -771,12 +769,7 @@ impl RetroTui { let total_cursor_pos = cursor_position; // Determine the window into the buffer we should show - let window_start = if total_cursor_pos > available_width - 1 { - // Cursor is beyond the visible area, scroll the view - total_cursor_pos - (available_width - 1) - } else { - 0 - }; + let window_start = total_cursor_pos.saturating_sub(available_width - 1); // Get the visible portion of the buffer let visible_buffer: String = input_buffer @@ -1013,9 +1006,9 @@ impl RetroTui { let fade_color = |color: Color| -> Color { match color { Color::Rgb(r, g, b) => { - let faded_r = ((r as f32 * opacity) as u8).max(0); - let faded_g = ((g as f32 * opacity) as u8).max(0); - let faded_b = ((b as f32 * opacity) as u8).max(0); + let faded_r = (r as f32 * opacity) as u8; + let faded_g = (g as f32 * opacity) as u8; + let faded_b = (b as f32 * opacity) as u8; Color::Rgb(faded_r, faded_g, faded_b) } _ => color, @@ -1098,9 +1091,9 @@ impl RetroTui { let fade_color = |color: Color| -> Color { match color { Color::Rgb(r, g, b) => { - let faded_r = ((r as f32 * opacity) as u8).max(0); - let faded_g = ((g as f32 * opacity) as u8).max(0); - let faded_b = ((b as f32 * opacity) as u8).max(0); + let faded_r = (r as f32 * opacity) as u8; + let faded_g = (g as f32 * opacity) as u8; + let faded_b = (b as f32 * opacity) as u8; Color::Rgb(faded_r, faded_g, faded_b) } _ => color, @@ -1176,7 +1169,7 @@ impl RetroTui { } // Wave characters for smooth animation - let wave_chars = vec!['▁', '▂', '▃', '▄', '▅', '▆', '▇', '█']; + let wave_chars = ['▁', '▂', '▃', '▄', '▅', '▆', '▇', '█']; // Build the wave line let mut wave_line = String::new(); @@ -1190,7 +1183,7 @@ impl RetroTui { let idx = wave_data.len().saturating_sub(display_width) + i; if idx < wave_data.len() { - let value = wave_data[idx].min(1.0).max(0.0); + let value = wave_data[idx].clamp(0.0, 1.0); let char_idx = ((value * 7.0) as usize).min(7); wave_line.push(wave_chars[char_idx]); } else { @@ -1206,8 +1199,6 @@ impl RetroTui { f.render_widget(wave_paragraph, area); } - /// Draw the status bar - /// Draw the status bar fn draw_status_bar( f: &mut Frame, diff --git a/crates/g3-cli/src/tui.rs b/crates/g3-cli/src/tui.rs index 261408e..aedd522 100644 --- a/crates/g3-cli/src/tui.rs +++ b/crates/g3-cli/src/tui.rs @@ -40,7 +40,7 @@ impl SimpleOutput { trimmed.starts_with("* ") || trimmed.starts_with("+ ") || (trimmed.len() > 2 && - trimmed.chars().next().map_or(false, |c| c.is_ascii_digit()) && + trimmed.chars().next().is_some_and(|c| c.is_ascii_digit()) && trimmed.chars().nth(1) == Some('.') && trimmed.chars().nth(2) == Some(' ')) || (trimmed.contains('[') && trimmed.contains("](")) diff --git a/crates/g3-cli/src/ui_writer_impl.rs b/crates/g3-cli/src/ui_writer_impl.rs index c69034c..ae0b7a3 100644 --- a/crates/g3-cli/src/ui_writer_impl.rs +++ b/crates/g3-cli/src/ui_writer_impl.rs @@ -115,7 +115,6 @@ impl UiWriter for ConsoleUiWriter { // For todo tools, we'll skip the normal header and print a custom one later if is_todo { - return; } } @@ -404,7 +403,7 @@ impl UiWriter for RetroTuiWriter { // Add range information for read_file tool calls let tool_name = self.current_tool_name.lock().unwrap(); - let range_suffix = if tool_name.as_ref().map_or(false, |name| name == "read_file") { + let range_suffix = if tool_name.as_ref().is_some_and(|name| name == "read_file") { // We need to check if start/end args will be provided - for now just check if this is a partial read // This is a simplified approach since we're building the caption incrementally String::new() // We'll handle this in print_tool_output_header instead diff --git a/crates/g3-core/src/fixed_filter_json.rs b/crates/g3-core/src/fixed_filter_json.rs index 67dfa59..5ed6a89 100644 --- a/crates/g3-core/src/fixed_filter_json.rs +++ b/crates/g3-core/src/fixed_filter_json.rs @@ -156,15 +156,15 @@ pub fn fixed_filter_json_tool_calls(content: &str) -> String { } // No JSON tool call detected, return only the new content we haven't returned yet - let new_content = if state.buffer.len() > state.content_returned_up_to { + + + if state.buffer.len() > state.content_returned_up_to { let result = state.buffer[state.content_returned_up_to..].to_string(); state.content_returned_up_to = state.buffer.len(); result } else { String::new() - }; - - new_content + } }) } diff --git a/crates/g3-core/src/lib.rs b/crates/g3-core/src/lib.rs index 76efe31..2106d3d 100644 --- a/crates/g3-core/src/lib.rs +++ b/crates/g3-core/src/lib.rs @@ -60,6 +60,12 @@ pub struct StreamingToolParser { json_tool_start: Option, } +impl Default for StreamingToolParser { + fn default() -> Self { + Self::new() + } +} + impl StreamingToolParser { pub fn new() -> Self { Self { @@ -399,7 +405,12 @@ Format this as a detailed but concise summary that can be used to resume the con } /// Reset the context window with a summary - pub fn reset_with_summary(&mut self, summary: String, latest_user_message: Option) { + pub fn reset_with_summary(&mut self, summary: String, latest_user_message: Option) -> usize { + // Calculate chars saved (old history minus new summary) + let old_chars: usize = self.conversation_history.iter() + .map(|m| m.content.len()) + .sum(); + // Clear the conversation history self.conversation_history.clear(); self.used_tokens = 0; @@ -418,6 +429,11 @@ Format this as a detailed but concise summary that can be used to resume the con content: user_msg, }); } + + let new_chars: usize = self.conversation_history.iter() + .map(|m| m.content.len()) + .sum(); + old_chars.saturating_sub(new_chars) } /// Check if we should trigger context thinning @@ -438,7 +454,7 @@ Format this as a detailed but concise summary that can be used to resume the con /// Perform context thinning: scan first third of conversation and replace large tool results /// Returns a summary message about what was thinned - pub fn thin_context(&mut self) -> String { + pub fn thin_context(&mut self) -> (String, usize) { let current_percentage = self.percentage_used() as u32; let current_threshold = (current_percentage / 10) * 10; @@ -456,7 +472,7 @@ Format this as a detailed but concise summary that can be used to resume the con let tmp_dir = shellexpand::tilde("~/tmp").to_string(); if let Err(e) = std::fs::create_dir_all(&tmp_dir) { warn!("Failed to create ~/tmp directory: {}", e); - return format!("⚠️ Context thinning failed: could not create ~/tmp directory"); + return ("⚠️ Context thinning failed: could not create ~/tmp directory".to_string(), 0); } // Scan the first third of messages @@ -499,11 +515,11 @@ Format this as a detailed but concise summary that can be used to resume the con self.recalculate_tokens(); if leaned_count > 0 { - format!("🥒 Context thinned at {}%: {} tool results, ~{} chars saved", - current_threshold, leaned_count, chars_saved) + (format!("🥒 Context thinned at {}%: {} tool results, ~{} chars saved", + current_threshold, leaned_count, chars_saved), chars_saved) } else { - format!("ℹ Context thinning triggered at {}% but no large tool results found in first third", - current_threshold) + (format!("ℹ Context thinning triggered at {}% but no large tool results found in first third", + current_threshold), 0) } } @@ -522,6 +538,9 @@ Format this as a detailed but concise summary that can be used to resume the con pub struct Agent { providers: ProviderRegistry, context_window: ContextWindow, + thinning_events: Vec, // chars saved per thinning event + summarization_events: Vec, // chars saved per summarization event + first_token_times: Vec, // time to first token for each completion config: Config, session_id: Option, tool_call_metrics: Vec<(String, Duration, bool)>, // (tool_name, duration, success) @@ -745,6 +764,9 @@ impl Agent { Ok(Self { providers, context_window, + thinning_events: Vec::new(), + summarization_events: Vec::new(), + first_token_times: Vec::new(), config, session_id: None, tool_call_metrics: Vec::new(), @@ -794,9 +816,7 @@ impl Agent { // Databricks models have varying context windows depending on the model if model_name.contains("claude") { 200000 // Claude models on Databricks have large context windows - } else if model_name.contains("llama") { - 32768 // Llama models typically support 32k context - } else if model_name.contains("dbrx") { + } else if model_name.contains("llama") || model_name.contains("dbrx") { 32768 // DBRX supports 32k context } else { 16384 // Conservative default for other Databricks models @@ -875,6 +895,7 @@ impl Agent { .await } + #[allow(clippy::too_many_arguments)] pub async fn execute_task_with_timing_cancellable( &mut self, description: &str, @@ -1225,6 +1246,291 @@ Template: &self.context_window } + /// Manually trigger context summarization regardless of context window size + /// Returns Ok(true) if summarization was successful, Ok(false) if it failed + pub async fn force_summarize(&mut self) -> Result { + info!("Manual summarization triggered"); + + self.ui_writer.print_context_status(&format!( + "\n🗜️ Manual summarization requested (current usage: {}%)...", + self.context_window.percentage_used() as u32 + )); + + // Create summary request with FULL history + let summary_prompt = self.context_window.create_summary_prompt(); + + // Get the full conversation history + let conversation_text = self + .context_window + .conversation_history + .iter() + .map(|m| format!("{:?}: {}", m.role, m.content)) + .collect::>() + .join("\n\n"); + + let summary_messages = vec![ + Message { + role: MessageRole::System, + content: "You are a helpful assistant that creates concise summaries." + .to_string(), + }, + Message { + role: MessageRole::User, + content: format!( + "Based on this conversation history, {}\n\nConversation:\n{}", + summary_prompt, conversation_text + ), + }, + ]; + + let provider = self.providers.get(None)?; + + // Dynamically calculate max_tokens for summary based on what's left + let summary_max_tokens = match provider.name() { + "databricks" | "anthropic" => { + let model_limit = 200_000u32; + let current_usage = self.context_window.used_tokens; + let available = model_limit + .saturating_sub(current_usage) + .saturating_sub(5000); + Some(available.min(10_000)) + } + "embedded" => { + let model_limit = self.context_window.total_tokens; + let current_usage = self.context_window.used_tokens; + let available = model_limit + .saturating_sub(current_usage) + .saturating_sub(1000); + Some(available.min(3000)) + } + _ => { + let available = self.context_window.remaining_tokens().saturating_sub(2000); + Some(available.min(5000)) + } + }; + + debug!( + "Requesting summary with max_tokens: {:?} (current usage: {} tokens)", + summary_max_tokens, self.context_window.used_tokens + ); + + let summary_request = CompletionRequest { + messages: summary_messages, + max_tokens: summary_max_tokens, + temperature: Some(0.3), + stream: false, + tools: None, + }; + + // Get the summary + match provider.complete(summary_request).await { + Ok(summary_response) => { + self.ui_writer.print_context_status( + "✅ Context compacted successfully.\n", + ); + + // Get the latest user message to preserve it + let latest_user_msg = self + .context_window + .conversation_history + .iter() + .rev() + .find(|m| matches!(m.role, MessageRole::User)) + .map(|m| m.content.clone()); + + // Reset context with summary + let chars_saved = self.context_window + .reset_with_summary(summary_response.content, latest_user_msg); + self.summarization_events.push(chars_saved); + + Ok(true) + } + Err(e) => { + error!("Failed to create summary: {}", e); + self.ui_writer.print_context_status( + "⚠️ Unable to create summary. Please try again or start a new session.\n", + ); + Ok(false) + } + } + } + + /// Manually trigger context thinning regardless of thresholds + pub fn force_thin(&mut self) -> String { + info!("Manual context thinning triggered"); + let (message, chars_saved) = self.context_window.thin_context(); + self.thinning_events.push(chars_saved); + message + } + + /// Reload README.md and AGENTS.md and replace the first system message + /// Returns Ok(true) if README was found and reloaded, Ok(false) if no README was present initially + pub fn reload_readme(&mut self) -> Result { + info!("Manual README reload triggered"); + + // Check if the first message in conversation history is a system message with README content + let has_readme = self + .context_window + .conversation_history + .first() + .map(|m| matches!(m.role, MessageRole::System) && + (m.content.contains("Project README") || m.content.contains("Agent Configuration"))) + .unwrap_or(false); + + if !has_readme { + return Ok(false); + } + + // Try to load README.md and AGENTS.md + let mut combined_content = String::new(); + let mut found_any = false; + + if let Ok(agents_content) = std::fs::read_to_string("AGENTS.md") { + combined_content.push_str("# Agent Configuration\n\n"); + combined_content.push_str(&agents_content); + combined_content.push_str("\n\n"); + found_any = true; + } + + if let Ok(readme_content) = std::fs::read_to_string("README.md") { + combined_content.push_str("# Project README\n\n"); + combined_content.push_str(&readme_content); + found_any = true; + } + + if found_any { + // Replace the first message with the new content + if let Some(first_msg) = self.context_window.conversation_history.first_mut() { + first_msg.content = combined_content; + info!("README content reloaded successfully"); + Ok(true) + } else { + Ok(false) + } + } else { + Ok(false) + } + } + + /// Get detailed context statistics + pub fn get_stats(&self) -> String { + let mut stats = String::new(); + use std::time::Duration; + + stats.push_str("\n📊 Context Window Statistics\n"); + stats.push_str(&"=".repeat(60)); + stats.push_str("\n\n"); + + // Context window usage + stats.push_str("🗂️ Context Window:\n"); + stats.push_str(&format!(" • Used Tokens: {:>10} / {}\n", + self.context_window.used_tokens, + self.context_window.total_tokens)); + stats.push_str(&format!(" • Usage Percentage: {:>10.1}%\n", + self.context_window.percentage_used())); + stats.push_str(&format!(" • Remaining Tokens: {:>10}\n", + self.context_window.remaining_tokens())); + stats.push_str(&format!(" • Cumulative Tokens: {:>10}\n", + self.context_window.cumulative_tokens)); + stats.push_str(&format!(" • Last Thinning: {:>10}%\n", + self.context_window.last_thinning_percentage)); + stats.push('\n'); + + // Context optimization metrics + stats.push_str("🗜️ Context Optimization:\n"); + stats.push_str(&format!(" • Thinning Events: {:>10}\n", + self.thinning_events.len())); + if !self.thinning_events.is_empty() { + let total_thinned: usize = self.thinning_events.iter().sum(); + let avg_thinned = total_thinned / self.thinning_events.len(); + stats.push_str(&format!(" • Total Chars Saved: {:>10}\n", total_thinned)); + stats.push_str(&format!(" • Avg Chars/Event: {:>10}\n", avg_thinned)); + } + + stats.push_str(&format!(" • Summarizations: {:>10}\n", + self.summarization_events.len())); + if !self.summarization_events.is_empty() { + let total_summarized: usize = self.summarization_events.iter().sum(); + let avg_summarized = total_summarized / self.summarization_events.len(); + stats.push_str(&format!(" • Total Chars Saved: {:>10}\n", total_summarized)); + stats.push_str(&format!(" • Avg Chars/Event: {:>10}\n", avg_summarized)); + } + stats.push('\n'); + + // Performance metrics + stats.push_str("⚡ Performance:\n"); + if !self.first_token_times.is_empty() { + let avg_ttft = self.first_token_times.iter().sum::() / self.first_token_times.len() as u32; + let mut sorted_times = self.first_token_times.clone(); + sorted_times.sort(); + let median_ttft = sorted_times[sorted_times.len() / 2]; + stats.push_str(&format!(" • Avg Time to First Token: {:>6.3}s\n", avg_ttft.as_secs_f64())); + stats.push_str(&format!(" • Median Time to First Token: {:>6.3}s\n", median_ttft.as_secs_f64())); + } + stats.push('\n'); + + // Conversation history + stats.push_str("💬 Conversation History:\n"); + stats.push_str(&format!(" • Total Messages: {:>10}\n", + self.context_window.conversation_history.len())); + + // Count messages by role + let mut system_count = 0; + let mut user_count = 0; + let mut assistant_count = 0; + + for msg in &self.context_window.conversation_history { + match msg.role { + MessageRole::System => system_count += 1, + MessageRole::User => user_count += 1, + MessageRole::Assistant => assistant_count += 1, + } + } + + stats.push_str(&format!(" • System Messages: {:>10}\n", system_count)); + stats.push_str(&format!(" • User Messages: {:>10}\n", user_count)); + stats.push_str(&format!(" • Assistant Messages:{:>10}\n", assistant_count)); + stats.push('\n'); + + // Tool call metrics + stats.push_str("🔧 Tool Call Metrics:\n"); + stats.push_str(&format!(" • Total Tool Calls: {:>10}\n", + self.tool_call_metrics.len())); + + let successful_calls = self.tool_call_metrics.iter() + .filter(|(_, _, success)| *success) + .count(); + let failed_calls = self.tool_call_metrics.len() - successful_calls; + + stats.push_str(&format!(" • Successful: {:>10}\n", successful_calls)); + stats.push_str(&format!(" • Failed: {:>10}\n", failed_calls)); + + if !self.tool_call_metrics.is_empty() { + let total_duration: Duration = self.tool_call_metrics.iter() + .map(|(_, duration, _)| *duration) + .sum(); + let avg_duration = total_duration / self.tool_call_metrics.len() as u32; + + stats.push_str(&format!(" • Total Duration: {:>10.2}s\n", + total_duration.as_secs_f64())); + stats.push_str(&format!(" • Average Duration: {:>10.2}s\n", + avg_duration.as_secs_f64())); + } + stats.push('\n'); + + // Provider info + stats.push_str("🔌 Provider:\n"); + if let Ok((provider, model)) = self.get_provider_info() { + stats.push_str(&format!(" • Provider: {}\n", provider)); + stats.push_str(&format!(" • Model: {}\n", model)); + } + + stats.push_str(&"=".repeat(60)); + stats.push('\n'); + + stats + } + pub fn get_tool_call_metrics(&self) -> &Vec<(String, Duration, bool)> { &self.tool_call_metrics } @@ -1774,8 +2080,9 @@ Template: .map(|m| m.content.clone()); // Reset context with summary - self.context_window + let chars_saved = self.context_window .reset_with_summary(summary_response.content, latest_user_msg); + self.summarization_events.push(chars_saved); // Update the request with new context request.messages = self.context_window.conversation_history.clone(); @@ -1904,6 +2211,10 @@ Template: // Record time to first token if first_token_time.is_none() && !chunk.content.is_empty() { first_token_time = Some(stream_start.elapsed()); + // Record in agent metrics + if let Some(ttft) = first_token_time { + self.first_token_times.push(ttft); + } } chunks_received += 1; @@ -1919,12 +2230,13 @@ Template: let completed_tools = parser.process_chunk(&chunk); // Handle completed tool calls - for tool_call in completed_tools { + if let Some(tool_call) = completed_tools.into_iter().next() { debug!("Processing completed tool call: {:?}", tool_call); // Check if we should thin the context BEFORE executing the tool if self.context_window.should_thin() { - let thin_summary = self.context_window.thin_context(); + let (thin_summary, chars_saved) = self.context_window.thin_context(); + self.thinning_events.push(chars_saved); // Print the thinning summary to the user self.ui_writer.println(""); self.ui_writer.print_context_status(&format!("{}\n", thin_summary)); @@ -2001,18 +2313,16 @@ Template: } else { s.clone() } + } else if s.len() > 100 { + // Use char_indices to respect UTF-8 boundaries + let truncated = s + .char_indices() + .take(100) + .map(|(_, c)| c) + .collect::(); + format!("{}...", truncated) } else { - if s.len() > 100 { - // Use char_indices to respect UTF-8 boundaries - let truncated = s - .char_indices() - .take(100) - .map(|(_, c)| c) - .collect::(); - format!("{}...", truncated) - } else { - s.clone() - } + s.clone() } } _ => value.to_string(), @@ -2034,7 +2344,7 @@ Template: Ok(result) => result?, Err(_) => { warn!("Tool call {} timed out after 8 minutes", tool_call.tool); - format!("❌ Tool execution timed out after 8 minutes") + "❌ Tool execution timed out after 8 minutes".to_string() } }; let exec_duration = exec_start.elapsed(); @@ -2950,14 +3260,14 @@ Template: // Write the result back to the file match std::fs::write(&file_path, &result) { - Ok(()) => Ok(format!("✅ applied unified diff")), + Ok(()) => Ok("✅ applied unified diff".to_string()), Err(e) => Ok(format!("❌ Failed to write to file '{}': {}", file_path, e)), } } "final_output" => { if let Some(summary) = tool_call.args.get("summary") { if let Some(summary_str) = summary.as_str() { - Ok(format!("{}", summary_str)) + Ok(summary_str.to_string()) } else { Ok("✅ Turn completed".to_string()) } @@ -3702,8 +4012,7 @@ fn parse_unified_diff_hunks(diff: &str) -> Vec<(String, String)> { } } - if line.starts_with(' ') { - let content = &line[1..]; + if let Some(content) = line.strip_prefix(' ') { old_lines.push(content.to_string()); new_lines.push(content.to_string()); } else if line.starts_with('+') && !line.starts_with("+++") { diff --git a/crates/g3-core/src/project.rs b/crates/g3-core/src/project.rs index 5028455..edaa954 100644 --- a/crates/g3-core/src/project.rs +++ b/crates/g3-core/src/project.rs @@ -104,6 +104,7 @@ impl Project { } /// Recursively check a directory for implementation files + #[allow(clippy::only_used_in_recursion)] fn check_dir_for_implementation_files(&self, dir: &Path) -> bool { // Common source file extensions let extensions = vec![ diff --git a/crates/g3-execution/src/lib.rs b/crates/g3-execution/src/lib.rs index 2f2769b..a42ba97 100644 --- a/crates/g3-execution/src/lib.rs +++ b/crates/g3-execution/src/lib.rs @@ -259,7 +259,7 @@ impl CodeExecutor { line = stderr_lines.next_line() => { match line { Ok(Some(line)) => { - receiver.on_output_line(&format!("{}", line)); + receiver.on_output_line(&line.to_string()); stderr_output.push(line); } Ok(None) => {}, // stderr EOF, continue diff --git a/crates/g3-providers/src/databricks.rs b/crates/g3-providers/src/databricks.rs index 68fa413..02c669a 100644 --- a/crates/g3-providers/src/databricks.rs +++ b/crates/g3-providers/src/databricks.rs @@ -213,7 +213,7 @@ impl DatabricksProvider { let mut builder = self .client - .post(&format!( + .post(format!( "{}/serving-endpoints/{}/invocations", self.host, self.model )) diff --git a/crates/g3-providers/src/oauth.rs b/crates/g3-providers/src/oauth.rs index 406d893..75c9d50 100644 --- a/crates/g3-providers/src/oauth.rs +++ b/crates/g3-providers/src/oauth.rs @@ -102,7 +102,7 @@ async fn get_workspace_endpoints(host: &str) -> Result { if !resp.status().is_success() { return Err(anyhow::anyhow!( "Failed to get OIDC configuration from {}", - oidc_url.to_string() + oidc_url )); } diff --git a/crates/g3-providers/src/openai.rs b/crates/g3-providers/src/openai.rs index e8b4dab..52ad6b0 100644 --- a/crates/g3-providers/src/openai.rs +++ b/crates/g3-providers/src/openai.rs @@ -259,7 +259,7 @@ impl LLMProvider for OpenAIProvider { let response = self .client - .post(&format!("{}/chat/completions", self.base_url)) + .post(format!("{}/chat/completions", self.base_url)) .header("Authorization", format!("Bearer {}", self.api_key)) .json(&body) .send() @@ -318,7 +318,7 @@ impl LLMProvider for OpenAIProvider { let response = self .client - .post(&format!("{}/chat/completions", self.base_url)) + .post(format!("{}/chat/completions", self.base_url)) .header("Authorization", format!("Bearer {}", self.api_key)) .json(&body) .send() From 3ec65e38ee479e7665648149d6db69814ac54b53 Mon Sep 17 00:00:00 2001 From: Dhanji Prasanna Date: Thu, 23 Oct 2025 06:53:42 +1100 Subject: [PATCH 03/16] macax tools --- Cargo.lock | 90 +- README.md | 14 + crates/g3-cli/src/lib.rs | 14 +- crates/g3-computer-control/Cargo.toml | 3 +- .../examples/macax_demo.rs | 74 ++ .../examples/test_type_text.rs | 48 + crates/g3-computer-control/src/lib.rs | 4 + .../src/macax/controller.rs | 826 ++++++++++++++++++ crates/g3-computer-control/src/macax/mod.rs | 65 ++ crates/g3-computer-control/src/macax/tests.rs | 37 + crates/g3-config/src/lib.rs | 16 + crates/g3-core/src/lib.rs | 566 +++++++++++- 12 files changed, 1745 insertions(+), 12 deletions(-) create mode 100644 crates/g3-computer-control/examples/macax_demo.rs create mode 100644 crates/g3-computer-control/examples/test_type_text.rs create mode 100644 crates/g3-computer-control/src/macax/controller.rs create mode 100644 crates/g3-computer-control/src/macax/mod.rs create mode 100644 crates/g3-computer-control/src/macax/tests.rs diff --git a/Cargo.lock b/Cargo.lock index b3cf969..38b9eb8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,28 @@ # It is not intended for manual editing. version = 4 +[[package]] +name = "accessibility" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ac9f33ffc1ef16eddb2451c03c983e56a5182ac760c3f2733da55ba8f48eac4" +dependencies = [ + "accessibility-sys", + "cocoa 0.26.1", + "core-foundation 0.10.1", + "objc", + "thiserror 1.0.69", +] + +[[package]] +name = "accessibility-sys" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46a6a8e90a1d8b96a48249e7c8f5b4058447bea8847280db7bfccb6dcab6b8e1" +dependencies = [ + "core-foundation-sys", +] + [[package]] name = "adler2" version = "2.0.1" @@ -437,9 +459,25 @@ checksum = "f6140449f97a6e97f9511815c5632d84c8aacf8ac271ad77c559218161a1373c" dependencies = [ "bitflags 1.3.2", "block", - "cocoa-foundation", + "cocoa-foundation 0.1.2", "core-foundation 0.9.4", - "core-graphics", + "core-graphics 0.23.2", + "foreign-types 0.5.0", + "libc", + "objc", +] + +[[package]] +name = "cocoa" +version = "0.26.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad36507aeb7e16159dfe68db81ccc27571c3ccd4b76fb2fb72fc59e7a4b1b64c" +dependencies = [ + "bitflags 2.10.0", + "block", + "cocoa-foundation 0.2.1", + "core-foundation 0.10.1", + "core-graphics 0.24.0", "foreign-types 0.5.0", "libc", "objc", @@ -454,11 +492,24 @@ dependencies = [ "bitflags 1.3.2", "block", "core-foundation 0.9.4", - "core-graphics-types", + "core-graphics-types 0.1.3", "libc", "objc", ] +[[package]] +name = "cocoa-foundation" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81411967c50ee9a1fc11365f8c585f863a22a9697c89239c452292c40ba79b0d" +dependencies = [ + "bitflags 2.10.0", + "block", + "core-foundation 0.10.1", + "core-graphics-types 0.2.0", + "objc", +] + [[package]] name = "color_quant" version = "1.1.0" @@ -635,7 +686,20 @@ checksum = "c07782be35f9e1140080c6b96f0d44b739e2278479f64e02fdab4e32dfd8b081" dependencies = [ "bitflags 1.3.2", "core-foundation 0.9.4", - "core-graphics-types", + "core-graphics-types 0.1.3", + "foreign-types 0.5.0", + "libc", +] + +[[package]] +name = "core-graphics" +version = "0.24.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa95a34622365fa5bbf40b20b75dba8dfa8c94c734aea8ac9a5ca38af14316f1" +dependencies = [ + "bitflags 2.10.0", + "core-foundation 0.10.1", + "core-graphics-types 0.2.0", "foreign-types 0.5.0", "libc", ] @@ -651,6 +715,17 @@ dependencies = [ "libc", ] +[[package]] +name = "core-graphics-types" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d44a101f213f6c4cdc1853d4b78aef6db6bdfa3468798cc1d9912f4735013eb" +dependencies = [ + "bitflags 2.10.0", + "core-foundation 0.10.1", + "libc", +] + [[package]] name = "cpufeatures" version = "0.2.17" @@ -1287,11 +1362,12 @@ dependencies = [ name = "g3-computer-control" version = "0.1.0" dependencies = [ + "accessibility", "anyhow", "async-trait", - "cocoa", - "core-foundation 0.9.4", - "core-graphics", + "cocoa 0.25.0", + "core-foundation 0.10.1", + "core-graphics 0.23.2", "fantoccini", "image", "objc", diff --git a/README.md b/README.md index e3aefc1..8bb807c 100644 --- a/README.md +++ b/README.md @@ -89,6 +89,7 @@ These commands give you fine-grained control over context management, allowing y - **TODO Management**: Read and write TODO lists with markdown checkbox format - **Computer Control** (Experimental): Automate desktop applications - Mouse and keyboard control + - macOS Accessibility API for native app automation (via `--macax` flag) - UI element inspection - Screenshot capture and window management - OCR text extraction from images and screen regions @@ -166,6 +167,19 @@ safaridriver --enable # Requires password **Usage**: Run G3 with the `--webdriver` flag to enable browser automation tools. +## macOS Accessibility API Tools + +G3 includes support for controlling macOS applications via the Accessibility API, allowing you to automate native macOS apps. + +**Available Tools**: `macax_list_apps`, `macax_get_frontmost_app`, `macax_activate_app`, `macax_get_ui_tree`, `macax_find_elements`, `macax_click`, `macax_set_value`, `macax_get_value`, `macax_press_key` + +**Setup**: Enable with the `--macax` flag or in config with `macax.enabled = true`. Grant accessibility permissions: +- **macOS**: System Preferences → Security & Privacy → Privacy → Accessibility → Add your terminal app + +**For detailed documentation**, see [macOS Accessibility Tools Guide](docs/macax-tools.md). + +**Note**: This is particularly useful for testing and automating apps you're building with G3, as you can add accessibility identifiers to your UI elements. + ## Computer Control (Experimental) G3 can interact with your computer's GUI for automation tasks: diff --git a/crates/g3-cli/src/lib.rs b/crates/g3-cli/src/lib.rs index 1382bd5..32000b3 100644 --- a/crates/g3-cli/src/lib.rs +++ b/crates/g3-cli/src/lib.rs @@ -239,6 +239,10 @@ pub struct Cli { /// Disable log file creation (no logs/ directory or session logs) #[arg(long)] pub quiet: bool, + + /// Enable macOS Accessibility API tools for native app automation + #[arg(long)] + pub macax: bool, } pub async fn run() -> Result<()> { @@ -433,12 +437,20 @@ Output ONLY the markdown content, no explanations or meta-commentary."#, } // Load configuration with CLI overrides - let config = Config::load_with_overrides( + let mut config = Config::load_with_overrides( cli.config.as_deref(), cli.provider.clone(), cli.model.clone(), )?; + // Apply macax flag override + if cli.macax { + config.macax.enabled = true; + if !cli.retro { + info!("macOS Accessibility API tools enabled"); + } + } + // Validate provider if specified if let Some(ref provider) = cli.provider { let valid_providers = ["anthropic", "databricks", "embedded", "openai"]; diff --git a/crates/g3-computer-control/Cargo.toml b/crates/g3-computer-control/Cargo.toml index 9aa522c..4300dc1 100644 --- a/crates/g3-computer-control/Cargo.toml +++ b/crates/g3-computer-control/Cargo.toml @@ -26,9 +26,10 @@ tesseract = "0.14" # macOS dependencies [target.'cfg(target_os = "macos")'.dependencies] core-graphics = "0.23" -core-foundation = "0.9" +core-foundation = "0.10" cocoa = "0.25" objc = "0.2" +accessibility = "0.2" image = "0.24" # Linux dependencies diff --git a/crates/g3-computer-control/examples/macax_demo.rs b/crates/g3-computer-control/examples/macax_demo.rs new file mode 100644 index 0000000..ff1398d --- /dev/null +++ b/crates/g3-computer-control/examples/macax_demo.rs @@ -0,0 +1,74 @@ +//! Example demonstrating macOS Accessibility API tools +//! +//! This example shows how to use the macax tools to control macOS applications. +//! +//! Run with: cargo run --example macax_demo + +use anyhow::Result; +use g3_computer_control::MacAxController; + +#[tokio::main] +async fn main() -> Result<()> { + println!("🍎 macOS Accessibility API Demo\n"); + println!("This demo shows how to control macOS applications using the Accessibility API.\n"); + + // Create controller + let controller = MacAxController::new()?; + println!("✅ MacAxController initialized\n"); + + // List running applications + println!("📱 Listing running applications:"); + match controller.list_applications() { + Ok(apps) => { + for app in apps.iter().take(10) { + println!(" - {}", app.name); + } + if apps.len() > 10 { + println!(" ... and {} more", apps.len() - 10); + } + } + Err(e) => println!(" ❌ Error: {}", e), + } + println!(); + + // Get frontmost app + println!("🎯 Getting frontmost application:"); + match controller.get_frontmost_app() { + Ok(app) => println!(" Current: {}", app.name), + Err(e) => println!(" ❌ Error: {}", e), + } + println!(); + + // Example: Activate Finder and get its UI tree + println!("📂 Activating Finder and inspecting UI:"); + match controller.activate_app("Finder") { + Ok(_) => { + println!(" ✅ Finder activated"); + + // Wait a moment for activation + tokio::time::sleep(tokio::time::Duration::from_millis(500)).await; + + // Get UI tree + match controller.get_ui_tree("Finder", 2) { + Ok(tree) => { + println!("\n UI Tree:"); + for line in tree.lines().take(10) { + println!(" {}", line); + } + } + Err(e) => println!(" ❌ Error getting UI tree: {}", e), + } + } + Err(e) => println!(" ❌ Error: {}", e), + } + println!(); + + println!("✨ Demo complete!\n"); + println!("💡 Tips:"); + println!(" - Use --macax flag with g3 to enable these tools"); + println!(" - Grant accessibility permissions in System Preferences"); + println!(" - Add accessibility identifiers to your apps for easier automation"); + println!(" - See docs/macax-tools.md for full documentation\n"); + + Ok(()) +} diff --git a/crates/g3-computer-control/examples/test_type_text.rs b/crates/g3-computer-control/examples/test_type_text.rs new file mode 100644 index 0000000..2d1aea0 --- /dev/null +++ b/crates/g3-computer-control/examples/test_type_text.rs @@ -0,0 +1,48 @@ +//! Test the new type_text functionality + +use anyhow::Result; +use g3_computer_control::MacAxController; + +#[tokio::main] +async fn main() -> Result<()> { + println!("🧪 Testing macax type_text functionality\n"); + + let controller = MacAxController::new()?; + println!("✅ Controller initialized\n"); + + // Test 1: Type simple text + println!("Test 1: Typing simple text into TextEdit"); + println!(" Please open TextEdit and create a new document..."); + std::thread::sleep(std::time::Duration::from_secs(3)); + + match controller.type_text("TextEdit", "Hello, World!") { + Ok(_) => println!(" ✅ Successfully typed simple text\n"), + Err(e) => println!(" ❌ Failed: {}\n", e), + } + + std::thread::sleep(std::time::Duration::from_secs(1)); + + // Test 2: Type unicode and emojis + println!("Test 2: Typing unicode and emojis"); + match controller.type_text("TextEdit", "\n🌟 Unicode test: café, naïve, 日本語 🎉") { + Ok(_) => println!(" ✅ Successfully typed unicode text\n"), + Err(e) => println!(" ❌ Failed: {}\n", e), + } + + std::thread::sleep(std::time::Duration::from_secs(1)); + + // Test 3: Type special characters + println!("Test 3: Typing special characters"); + match controller.type_text("TextEdit", "\nSpecial: @#$%^&*()_+-=[]{}|;':,.<>?/") { + Ok(_) => println!(" ✅ Successfully typed special characters\n"), + Err(e) => println!(" ❌ Failed: {}\n", e), + } + + println!("\n✨ Tests complete!"); + println!("\n💡 Now try with Things3:"); + println!(" 1. Open Things3"); + println!(" 2. Press Cmd+N to create a new task"); + println!(" 3. Run: g3 --macax 'type \"🌟 My awesome task\" into Things'"); + + Ok(()) +} diff --git a/crates/g3-computer-control/src/lib.rs b/crates/g3-computer-control/src/lib.rs index 5c72d65..2eb686c 100644 --- a/crates/g3-computer-control/src/lib.rs +++ b/crates/g3-computer-control/src/lib.rs @@ -1,10 +1,14 @@ pub mod types; pub mod platform; pub mod webdriver; +pub mod macax; // Re-export webdriver types for convenience pub use webdriver::{WebDriverController, WebElement, safari::SafariDriver}; +// Re-export macax types for convenience +pub use macax::{MacAxController, AXElement, AXApplication}; + use anyhow::Result; use async_trait::async_trait; use types::*; diff --git a/crates/g3-computer-control/src/macax/controller.rs b/crates/g3-computer-control/src/macax/controller.rs new file mode 100644 index 0000000..a887714 --- /dev/null +++ b/crates/g3-computer-control/src/macax/controller.rs @@ -0,0 +1,826 @@ +use super::{AXApplication, AXElement}; +use anyhow::{Context, Result}; +use std::collections::HashMap; + +#[cfg(target_os = "macos")] +use accessibility::{AXUIElement, AXUIElementAttributes, ElementFinder, TreeVisitor, TreeWalker, TreeWalkerFlow}; + +#[cfg(target_os = "macos")] +use core_foundation::base::TCFType; + +#[cfg(target_os = "macos")] +use core_foundation::string::CFString; + +#[cfg(target_os = "macos")] +use core_foundation::boolean::CFBoolean; + +/// macOS Accessibility API controller using native APIs +pub struct MacAxController { + // Cache for application elements + app_cache: std::sync::Mutex>, +} + +impl MacAxController { + pub fn new() -> Result { + #[cfg(target_os = "macos")] + { + // Check if we have accessibility permissions by trying to get system-wide element + let _system = AXUIElement::system_wide(); + + Ok(Self { + app_cache: std::sync::Mutex::new(HashMap::new()), + }) + } + + #[cfg(not(target_os = "macos"))] + { + anyhow::bail!("macOS Accessibility API is only available on macOS") + } + } + + /// List all running applications + #[cfg(target_os = "macos")] + pub fn list_applications(&self) -> Result> { + let apps = Self::get_running_applications()?; + Ok(apps) + } + + #[cfg(not(target_os = "macos"))] + pub fn list_applications(&self) -> Result> { + anyhow::bail!("Not supported on this platform") + } + + #[cfg(target_os = "macos")] + fn get_running_applications() -> Result> { + use cocoa::appkit::NSApplicationActivationPolicy; + use cocoa::base::{id, nil}; + use objc::{class, msg_send, sel, sel_impl}; + + unsafe { + let workspace: id = msg_send![class!(NSWorkspace), sharedWorkspace]; + let running_apps: id = msg_send![workspace, runningApplications]; + let count: usize = msg_send![running_apps, count]; + + let mut apps = Vec::new(); + + for i in 0..count { + let app: id = msg_send![running_apps, objectAtIndex: i]; + + // Get app name + let localized_name: id = msg_send![app, localizedName]; + if localized_name == nil { + continue; + } + let name_ptr: *const i8 = msg_send![localized_name, UTF8String]; + let name = if !name_ptr.is_null() { + std::ffi::CStr::from_ptr(name_ptr) + .to_string_lossy() + .to_string() + } else { + continue; + }; + + // Get bundle ID + let bundle_id_obj: id = msg_send![app, bundleIdentifier]; + let bundle_id = if bundle_id_obj != nil { + let bundle_id_ptr: *const i8 = msg_send![bundle_id_obj, UTF8String]; + if !bundle_id_ptr.is_null() { + Some( + std::ffi::CStr::from_ptr(bundle_id_ptr) + .to_string_lossy() + .to_string(), + ) + } else { + None + } + } else { + None + }; + + // Get PID + let pid: i32 = msg_send![app, processIdentifier]; + + // Skip background-only apps + let activation_policy: i64 = msg_send![app, activationPolicy]; + if activation_policy == NSApplicationActivationPolicy::NSApplicationActivationPolicyRegular as i64 { + apps.push(AXApplication { + name, + bundle_id, + pid, + }); + } + } + + Ok(apps) + } + } + + /// Get the frontmost (active) application + #[cfg(target_os = "macos")] + pub fn get_frontmost_app(&self) -> Result { + use cocoa::base::{id, nil}; + use objc::{class, msg_send, sel, sel_impl}; + + unsafe { + let workspace: id = msg_send![class!(NSWorkspace), sharedWorkspace]; + let frontmost_app: id = msg_send![workspace, frontmostApplication]; + + if frontmost_app == nil { + anyhow::bail!("No frontmost application"); + } + + // Get app name + let localized_name: id = msg_send![frontmost_app, localizedName]; + let name_ptr: *const i8 = msg_send![localized_name, UTF8String]; + let name = std::ffi::CStr::from_ptr(name_ptr) + .to_string_lossy() + .to_string(); + + // Get bundle ID + let bundle_id_obj: id = msg_send![frontmost_app, bundleIdentifier]; + let bundle_id = if bundle_id_obj != nil { + let bundle_id_ptr: *const i8 = msg_send![bundle_id_obj, UTF8String]; + if !bundle_id_ptr.is_null() { + Some( + std::ffi::CStr::from_ptr(bundle_id_ptr) + .to_string_lossy() + .to_string(), + ) + } else { + None + } + } else { + None + }; + + // Get PID + let pid: i32 = msg_send![frontmost_app, processIdentifier]; + + Ok(AXApplication { + name, + bundle_id, + pid, + }) + } + } + + #[cfg(not(target_os = "macos"))] + pub fn get_frontmost_app(&self) -> Result { + anyhow::bail!("Not supported on this platform") + } + + /// Get AXUIElement for an application by name or PID + #[cfg(target_os = "macos")] + fn get_app_element(&self, app_name: &str) -> Result { + // Check cache first + { + let cache = self.app_cache.lock().unwrap(); + if let Some(element) = cache.get(app_name) { + return Ok(element.clone()); + } + } + + // Find the app by name + let apps = Self::get_running_applications()?; + let app = apps + .iter() + .find(|a| a.name == app_name) + .ok_or_else(|| anyhow::anyhow!("Application '{}' not found", app_name))?; + + // Create AXUIElement for the app + let element = AXUIElement::application(app.pid); + + // Cache it + { + let mut cache = self.app_cache.lock().unwrap(); + cache.insert(app_name.to_string(), element.clone()); + } + + Ok(element) + } + + /// Activate (bring to front) an application + #[cfg(target_os = "macos")] + pub fn activate_app(&self, app_name: &str) -> Result<()> { + use cocoa::base::{id, nil}; + use objc::{class, msg_send, sel, sel_impl}; + + // Find the app + let apps = Self::get_running_applications()?; + let app = apps + .iter() + .find(|a| a.name == app_name) + .ok_or_else(|| anyhow::anyhow!("Application '{}' not found", app_name))?; + + unsafe { + let workspace: id = msg_send![class!(NSWorkspace), sharedWorkspace]; + let running_apps: id = msg_send![workspace, runningApplications]; + let count: usize = msg_send![running_apps, count]; + + for i in 0..count { + let running_app: id = msg_send![running_apps, objectAtIndex: i]; + let pid: i32 = msg_send![running_app, processIdentifier]; + + if pid == app.pid { + let _: bool = msg_send![running_app, activateWithOptions: 0]; + return Ok(()); + } + } + } + + anyhow::bail!("Failed to activate application") + } + + #[cfg(not(target_os = "macos"))] + pub fn activate_app(&self, _app_name: &str) -> Result<()> { + anyhow::bail!("Not supported on this platform") + } + + /// Get the UI hierarchy of an application + #[cfg(target_os = "macos")] + pub fn get_ui_tree(&self, app_name: &str, max_depth: usize) -> Result { + let app_element = self.get_app_element(app_name)?; + let mut output = format!("Application: {}\n", app_name); + + Self::build_ui_tree(&app_element, &mut output, 0, max_depth)?; + + Ok(output) + } + + #[cfg(not(target_os = "macos"))] + pub fn get_ui_tree(&self, _app_name: &str, _max_depth: usize) -> Result { + anyhow::bail!("Not supported on this platform") + } + + #[cfg(target_os = "macos")] + fn build_ui_tree( + element: &AXUIElement, + output: &mut String, + depth: usize, + max_depth: usize, + ) -> Result<()> { + if depth >= max_depth { + return Ok(()); + } + + let indent = " ".repeat(depth); + + // Get role + let role = element.role().ok().map(|s| s.to_string()) + .unwrap_or_else(|| "Unknown".to_string()); + + // Get title + let title = element.title().ok() + .map(|s| s.to_string()); + + // Get identifier + let identifier = element.identifier().ok() + .map(|s| s.to_string()); + + // Format output + output.push_str(&format!("{}Role: {}", indent, role)); + if let Some(t) = title { + output.push_str(&format!(", Title: {}", t)); + } + if let Some(id) = identifier { + output.push_str(&format!(", ID: {}", id)); + } + output.push('\n'); + + // Get children + if let Ok(children) = element.children() { + for i in 0..children.len() { + if let Some(child) = children.get(i) { + let _ = Self::build_ui_tree(&child, output, depth + 1, max_depth); + } + } + } + + Ok(()) + } + + /// Find UI elements in an application + #[cfg(target_os = "macos")] + pub fn find_elements( + &self, + app_name: &str, + role: Option<&str>, + title: Option<&str>, + identifier: Option<&str>, + ) -> Result> { + let app_element = self.get_app_element(app_name)?; + let mut found_elements = Vec::new(); + + let visitor = ElementCollector { + role_filter: role.map(|s| s.to_string()), + title_filter: title.map(|s| s.to_string()), + identifier_filter: identifier.map(|s| s.to_string()), + results: std::cell::RefCell::new(&mut found_elements), + depth: std::cell::Cell::new(0), + }; + + let walker = TreeWalker::new(); + walker.walk(&app_element, &visitor); + + Ok(found_elements) + } + + #[cfg(not(target_os = "macos"))] + pub fn find_elements( + &self, + _app_name: &str, + _role: Option<&str>, + _title: Option<&str>, + _identifier: Option<&str>, + ) -> Result> { + anyhow::bail!("Not supported on this platform") + } + + /// Find a single element (helper for click, set_value, etc.) + #[cfg(target_os = "macos")] + fn find_element( + &self, + app_name: &str, + role: &str, + title: Option<&str>, + identifier: Option<&str>, + ) -> Result { + let app_element = self.get_app_element(app_name)?; + + let role_str = role.to_string(); + let title_str = title.map(|s| s.to_string()); + let identifier_str = identifier.map(|s| s.to_string()); + + let finder = ElementFinder::new( + &app_element, + move |element| { + // Check role + let elem_role = element.role() + .ok() + .map(|s| s.to_string()); + + if let Some(r) = elem_role { + if !r.contains(&role_str) { + return false; + } + } else { + return false; + } + + // Check title if specified + if let Some(ref title_filter) = title_str { + let elem_title = element.title() + .ok() + .map(|s| s.to_string()); + + if let Some(t) = elem_title { + if !t.contains(title_filter) { + return false; + } + } else { + return false; + } + } + + // Check identifier if specified + if let Some(ref id_filter) = identifier_str { + let elem_id = element.identifier() + .ok() + .map(|s| s.to_string()); + + if let Some(id) = elem_id { + if !id.contains(id_filter) { + return false; + } + } else { + return false; + } + } + + true + }, + Some(std::time::Duration::from_secs(2)), + ); + + finder.find().context("Element not found") + } + + /// Click on a UI element + #[cfg(target_os = "macos")] + pub fn click_element( + &self, + app_name: &str, + role: &str, + title: Option<&str>, + identifier: Option<&str>, + ) -> Result<()> { + let element = self.find_element(app_name, role, title, identifier)?; + + // Perform the press action + let action_name = CFString::new("AXPress"); + element + .perform_action(&action_name) + .map_err(|e| anyhow::anyhow!("Failed to perform press action: {:?}", e))?; + + Ok(()) + } + + #[cfg(not(target_os = "macos"))] + pub fn click_element( + &self, + _app_name: &str, + _role: &str, + _title: Option<&str>, + _identifier: Option<&str>, + ) -> Result<()> { + anyhow::bail!("Not supported on this platform") + } + + /// Set the value of a UI element + #[cfg(target_os = "macos")] + pub fn set_value( + &self, + app_name: &str, + role: &str, + value: &str, + title: Option<&str>, + identifier: Option<&str>, + ) -> Result<()> { + let element = self.find_element(app_name, role, title, identifier)?; + + // Set the value - convert CFString to CFType + let cf_value = CFString::new(value); + + element.set_value(cf_value.as_CFType()) + .map_err(|e| anyhow::anyhow!("Failed to set value: {:?}", e))?; + + Ok(()) + } + + #[cfg(not(target_os = "macos"))] + pub fn set_value( + &self, + _app_name: &str, + _role: &str, + _value: &str, + _title: Option<&str>, + _identifier: Option<&str>, + ) -> Result<()> { + anyhow::bail!("Not supported on this platform") + } + + /// Get the value of a UI element + #[cfg(target_os = "macos")] + pub fn get_value( + &self, + app_name: &str, + role: &str, + title: Option<&str>, + identifier: Option<&str>, + ) -> Result { + let element = self.find_element(app_name, role, title, identifier)?; + + // Get the value + let value_type = element.value() + .map_err(|e| anyhow::anyhow!("Failed to get value: {:?}", e))?; + + // Try to downcast to CFString + if let Some(cf_string) = value_type.downcast::() { + Ok(cf_string.to_string()) + } else { + // For non-string values, try to get a description + Ok(format!("")) + } + } + + #[cfg(not(target_os = "macos"))] + pub fn get_value( + &self, + _app_name: &str, + _role: &str, + _title: Option<&str>, + _identifier: Option<&str>, + ) -> Result { + anyhow::bail!("Not supported on this platform") + } + + /// Type text into the currently focused element (uses system text input) + #[cfg(target_os = "macos")] + pub fn type_text(&self, app_name: &str, text: &str) -> Result<()> { + use cocoa::appkit::NSPasteboard; + use cocoa::base::{id, nil}; + use cocoa::foundation::NSString; + use objc::{class, msg_send, sel, sel_impl}; + + // First, make sure the app is active + self.activate_app(app_name)?; + + // Wait for app to fully activate + std::thread::sleep(std::time::Duration::from_millis(500)); + + // Send a Tab key to try to focus on a text field + // This helps ensure something is focused before we paste + let _ = self.press_key(app_name, "tab", vec![]); + std::thread::sleep(std::time::Duration::from_millis(800)); + + // Save old clipboard, set new content, paste, then restore + let old_content: id; + unsafe { + // Get the general pasteboard + let pasteboard: id = msg_send![class!(NSPasteboard), generalPasteboard]; + + // Save current clipboard content + let ns_string_type = NSString::alloc(nil).init_str("public.utf8-plain-text"); + old_content = msg_send![pasteboard, stringForType: ns_string_type]; + + // Clear and set new content + let _: () = msg_send![pasteboard, clearContents]; + + let ns_string = NSString::alloc(nil).init_str(text); + let ns_type = NSString::alloc(nil).init_str("public.utf8-plain-text"); + let _: bool = msg_send![pasteboard, setString:ns_string forType:ns_type]; + } + + // Wait a moment for clipboard to update + std::thread::sleep(std::time::Duration::from_millis(200)); + + // Paste using Cmd+V (outside unsafe block) + self.press_key(app_name, "v", vec!["command"])?; + + // Wait for paste to complete + std::thread::sleep(std::time::Duration::from_millis(300)); + + // Restore old clipboard content if it existed + unsafe { + if old_content != nil { + let pasteboard: id = msg_send![class!(NSPasteboard), generalPasteboard]; + let _: () = msg_send![pasteboard, clearContents]; + let ns_type = NSString::alloc(nil).init_str("public.utf8-plain-text"); + let _: bool = msg_send![pasteboard, setString:old_content forType:ns_type]; + } + } + + Ok(()) + } + + #[cfg(not(target_os = "macos"))] + pub fn type_text(&self, _app_name: &str, _text: &str) -> Result<()> { + anyhow::bail!("Not supported on this platform") + } + + /// Focus on a text field or text area element + #[cfg(target_os = "macos")] + pub fn focus_element( + &self, + app_name: &str, + role: &str, + title: Option<&str>, + identifier: Option<&str>, + ) -> Result<()> { + let element = self.find_element(app_name, role, title, identifier)?; + + // Set focused attribute to true + use core_foundation::boolean::CFBoolean; + let cf_true = CFBoolean::true_value(); + + element.set_attribute(&accessibility::AXAttribute::focused(), cf_true) + .map_err(|e| anyhow::anyhow!("Failed to focus element: {:?}", e))?; + + Ok(()) + } + + /// Press a keyboard shortcut + #[cfg(target_os = "macos")] + pub fn press_key( + &self, + app_name: &str, + key: &str, + modifiers: Vec<&str>, + ) -> Result<()> { + use core_graphics::event::{ + CGEvent, CGEventFlags, CGEventTapLocation, + }; + use core_graphics::event_source::{CGEventSource, CGEventSourceStateID}; + + // First, make sure the app is active + self.activate_app(app_name)?; + + // Wait a bit for activation + std::thread::sleep(std::time::Duration::from_millis(100)); + + // Map key string to key code + let key_code = Self::key_to_keycode(key) + .ok_or_else(|| anyhow::anyhow!("Unknown key: {}", key))?; + + // Map modifiers to flags + let mut flags = CGEventFlags::CGEventFlagNull; + for modifier in modifiers { + match modifier.to_lowercase().as_str() { + "command" | "cmd" => flags |= CGEventFlags::CGEventFlagCommand, + "option" | "alt" => flags |= CGEventFlags::CGEventFlagAlternate, + "control" | "ctrl" => flags |= CGEventFlags::CGEventFlagControl, + "shift" => flags |= CGEventFlags::CGEventFlagShift, + _ => {} + } + } + + // Create event source + let source = CGEventSource::new(CGEventSourceStateID::HIDSystemState) + .ok().context("Failed to create event source")?; + + // Create key down event + let key_down = CGEvent::new_keyboard_event(source.clone(), key_code, true) + .ok().context("Failed to create key down event")?; + key_down.set_flags(flags); + + // Create key up event + let key_up = CGEvent::new_keyboard_event(source, key_code, false) + .ok().context("Failed to create key up event")?; + key_up.set_flags(flags); + + // Post events + key_down.post(CGEventTapLocation::HID); + std::thread::sleep(std::time::Duration::from_millis(50)); + key_up.post(CGEventTapLocation::HID); + + Ok(()) + } + + #[cfg(not(target_os = "macos"))] + pub fn press_key( + &self, + _app_name: &str, + _key: &str, + _modifiers: Vec<&str>, + ) -> Result<()> { + anyhow::bail!("Not supported on this platform") + } + + #[cfg(target_os = "macos")] + fn key_to_keycode(key: &str) -> Option { + // Map common keys to keycodes + // See: https://eastmanreference.com/complete-list-of-applescript-key-codes + match key.to_lowercase().as_str() { + "a" => Some(0x00), + "s" => Some(0x01), + "d" => Some(0x02), + "f" => Some(0x03), + "h" => Some(0x04), + "g" => Some(0x05), + "z" => Some(0x06), + "x" => Some(0x07), + "c" => Some(0x08), + "v" => Some(0x09), + "b" => Some(0x0B), + "q" => Some(0x0C), + "w" => Some(0x0D), + "e" => Some(0x0E), + "r" => Some(0x0F), + "y" => Some(0x10), + "t" => Some(0x11), + "1" => Some(0x12), + "2" => Some(0x13), + "3" => Some(0x14), + "4" => Some(0x15), + "6" => Some(0x16), + "5" => Some(0x17), + "=" => Some(0x18), + "9" => Some(0x19), + "7" => Some(0x1A), + "-" => Some(0x1B), + "8" => Some(0x1C), + "0" => Some(0x1D), + "]" => Some(0x1E), + "o" => Some(0x1F), + "u" => Some(0x20), + "[" => Some(0x21), + "i" => Some(0x22), + "p" => Some(0x23), + "return" | "enter" => Some(0x24), + "l" => Some(0x25), + "j" => Some(0x26), + "'" => Some(0x27), + "k" => Some(0x28), + ";" => Some(0x29), + "\\" => Some(0x2A), + "," => Some(0x2B), + "/" => Some(0x2C), + "n" => Some(0x2D), + "m" => Some(0x2E), + "." => Some(0x2F), + "tab" => Some(0x30), + "space" => Some(0x31), + "`" => Some(0x32), + "delete" | "backspace" => Some(0x33), + "escape" | "esc" => Some(0x35), + "f1" => Some(0x7A), + "f2" => Some(0x78), + "f3" => Some(0x63), + "f4" => Some(0x76), + "f5" => Some(0x60), + "f6" => Some(0x61), + "f7" => Some(0x62), + "f8" => Some(0x64), + "f9" => Some(0x65), + "f10" => Some(0x6D), + "f11" => Some(0x67), + "f12" => Some(0x6F), + "left" => Some(0x7B), + "right" => Some(0x7C), + "down" => Some(0x7D), + "up" => Some(0x7E), + _ => None, + } + } +} + +#[cfg(target_os = "macos")] +struct ElementCollector<'a> { + role_filter: Option, + title_filter: Option, + identifier_filter: Option, + results: std::cell::RefCell<&'a mut Vec>, + depth: std::cell::Cell, +} + +#[cfg(target_os = "macos")] +impl<'a> TreeVisitor for ElementCollector<'a> { + fn enter_element(&self, element: &AXUIElement) -> TreeWalkerFlow { + self.depth.set(self.depth.get() + 1); + + if self.depth.get() > 20 { + return TreeWalkerFlow::SkipSubtree; + } + + // Get element properties + let role = element.role() + .ok() + .map(|s| s.to_string()) + .unwrap_or_else(|| "Unknown".to_string()); + + let title = element.title() + .ok() + .map(|s| s.to_string()); + + let identifier = element.identifier() + .ok() + .map(|s| s.to_string()); + + // Check if this element matches the filters + let role_matches = self.role_filter.as_ref().map_or(true, |r| role.contains(r)); + let title_matches = self.title_filter.as_ref().map_or(true, |t| { + title.as_ref().map_or(false, |title_str| title_str.contains(t)) + }); + let identifier_matches = self.identifier_filter.as_ref().map_or(true, |id| { + identifier.as_ref().map_or(false, |id_str| id_str.contains(id)) + }); + + if role_matches && title_matches && identifier_matches { + // Get additional properties + let value = element.value() + .ok() + .and_then(|v| { + v.downcast::().map(|s| s.to_string()) + }); + + let label = element.description() + .ok() + .map(|s| s.to_string()); + + let enabled = element.enabled() + .ok() + .map(|b| b.into()) + .unwrap_or(false); + + let focused = element.focused() + .ok() + .map(|b| b.into()) + .unwrap_or(false); + + // Count children + let children_count = element.children() + .ok() + .map(|arr| arr.len() as usize) + .unwrap_or(0); + + self.results.borrow_mut().push(AXElement { + role, + title, + value, + label, + identifier, + enabled, + focused, + position: None, + size: None, + children_count, + }); + } + + TreeWalkerFlow::Continue + } + + fn exit_element(&self, _element: &AXUIElement) { + self.depth.set(self.depth.get() - 1); + } +} diff --git a/crates/g3-computer-control/src/macax/mod.rs b/crates/g3-computer-control/src/macax/mod.rs new file mode 100644 index 0000000..b62e87d --- /dev/null +++ b/crates/g3-computer-control/src/macax/mod.rs @@ -0,0 +1,65 @@ +pub mod controller; + +pub use controller::MacAxController; + +use serde::{Deserialize, Serialize}; + +#[cfg(test)] +mod tests; + +/// Represents an accessibility element in the UI hierarchy +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct AXElement { + pub role: String, + pub title: Option, + pub value: Option, + pub label: Option, + pub identifier: Option, + pub enabled: bool, + pub focused: bool, + pub position: Option<(f64, f64)>, + pub size: Option<(f64, f64)>, + pub children_count: usize, +} + +/// Represents a macOS application +#[derive(Debug, Clone)] +pub struct AXApplication { + pub name: String, + pub bundle_id: Option, + pub pid: i32, +} + +impl AXElement { + /// Convert to a human-readable string representation + pub fn to_string(&self) -> String { + let mut parts = vec![format!("Role: {}", self.role)]; + + if let Some(ref title) = self.title { + parts.push(format!("Title: {}", title)); + } + if let Some(ref value) = self.value { + parts.push(format!("Value: {}", value)); + } + if let Some(ref label) = self.label { + parts.push(format!("Label: {}", label)); + } + if let Some(ref id) = self.identifier { + parts.push(format!("ID: {}", id)); + } + + parts.push(format!("Enabled: {}", self.enabled)); + parts.push(format!("Focused: {}", self.focused)); + + if let Some((x, y)) = self.position { + parts.push(format!("Position: ({:.0}, {:.0})", x, y)); + } + if let Some((w, h)) = self.size { + parts.push(format!("Size: ({:.0}, {:.0})", w, h)); + } + + parts.push(format!("Children: {}", self.children_count)); + + parts.join(", ") + } +} diff --git a/crates/g3-computer-control/src/macax/tests.rs b/crates/g3-computer-control/src/macax/tests.rs new file mode 100644 index 0000000..01f44e3 --- /dev/null +++ b/crates/g3-computer-control/src/macax/tests.rs @@ -0,0 +1,37 @@ +#[cfg(test)] +mod tests { + use crate::{AXElement, MacAxController}; + + #[test] + fn test_ax_element_to_string() { + let element = AXElement { + role: "button".to_string(), + title: Some("Click Me".to_string()), + value: None, + label: Some("Submit Button".to_string()), + identifier: Some("submitBtn".to_string()), + enabled: true, + focused: false, + position: Some((100.0, 200.0)), + size: Some((80.0, 30.0)), + children_count: 0, + }; + + let string_repr = element.to_string(); + assert!(string_repr.contains("Role: button")); + assert!(string_repr.contains("Title: Click Me")); + assert!(string_repr.contains("Label: Submit Button")); + assert!(string_repr.contains("ID: submitBtn")); + assert!(string_repr.contains("Enabled: true")); + assert!(string_repr.contains("Position: (100, 200)")); + assert!(string_repr.contains("Size: (80, 30)")); + } + + #[test] + fn test_controller_creation() { + // Just test that we can create a controller + // Actual functionality requires macOS and permissions + let result = MacAxController::new(); + assert!(result.is_ok()); + } +} diff --git a/crates/g3-config/src/lib.rs b/crates/g3-config/src/lib.rs index 4b6dc9d..272367d 100644 --- a/crates/g3-config/src/lib.rs +++ b/crates/g3-config/src/lib.rs @@ -8,6 +8,7 @@ pub struct Config { pub agent: AgentConfig, pub computer_control: ComputerControlConfig, pub webdriver: WebDriverConfig, + pub macax: MacAxConfig, } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -79,6 +80,19 @@ pub struct WebDriverConfig { pub safari_port: u16, } +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct MacAxConfig { + pub enabled: bool, +} + +impl Default for MacAxConfig { + fn default() -> Self { + Self { + enabled: false, + } + } +} + impl Default for WebDriverConfig { fn default() -> Self { Self { @@ -124,6 +138,7 @@ impl Default for Config { }, computer_control: ComputerControlConfig::default(), webdriver: WebDriverConfig::default(), + macax: MacAxConfig::default(), } } } @@ -238,6 +253,7 @@ impl Config { }, computer_control: ComputerControlConfig::default(), webdriver: WebDriverConfig::default(), + macax: MacAxConfig::default(), } } diff --git a/crates/g3-core/src/lib.rs b/crates/g3-core/src/lib.rs index 2106d3d..21e7bd0 100644 --- a/crates/g3-core/src/lib.rs +++ b/crates/g3-core/src/lib.rs @@ -551,6 +551,7 @@ pub struct Agent { todo_content: std::sync::Arc>, webdriver_session: std::sync::Arc>>>>, safaridriver_process: std::sync::Arc>>, + macax_controller: std::sync::Arc>>, } impl Agent { @@ -761,6 +762,9 @@ impl Agent { None }; + // Capture macax_enabled before moving config + let macax_enabled = config.macax.enabled; + Ok(Self { providers, context_window, @@ -777,6 +781,12 @@ impl Agent { computer_controller, webdriver_session: std::sync::Arc::new(tokio::sync::RwLock::new(None)), safaridriver_process: std::sync::Arc::new(tokio::sync::RwLock::new(None)), + macax_controller: { + std::sync::Arc::new(tokio::sync::RwLock::new( + if macax_enabled { Some(g3_computer_control::MacAxController::new()?) } + else { None } + )) + }, }) } @@ -1088,7 +1098,7 @@ Template: // Check if provider supports native tool calling and add tools if so let provider = self.providers.get(None)?; let tools = if provider.has_native_tool_calling() { - Some(Self::create_tool_definitions(self.config.webdriver.enabled)) + Some(Self::create_tool_definitions(self.config.webdriver.enabled, self.config.macax.enabled)) } else { None }; @@ -1549,7 +1559,7 @@ Template: } /// Create tool definitions for native tool calling providers - fn create_tool_definitions(enable_webdriver: bool) -> Vec { + fn create_tool_definitions(enable_webdriver: bool, enable_macax: bool) -> Vec { let mut tools = vec![ Tool { name: "shell".to_string(), @@ -1904,6 +1914,231 @@ Template: ]); } + // Add macOS Accessibility tools if enabled + if enable_macax { + tools.extend(vec![ + Tool { + name: "macax_list_apps".to_string(), + description: "List all running applications that can be controlled via macOS Accessibility API".to_string(), + input_schema: json!({ + "type": "object", + "properties": {}, + "required": [] + }), + }, + Tool { + name: "macax_get_frontmost_app".to_string(), + description: "Get the name of the currently active (frontmost) application".to_string(), + input_schema: json!({ + "type": "object", + "properties": {}, + "required": [] + }), + }, + Tool { + name: "macax_activate_app".to_string(), + description: "Bring an application to the front (activate it)".to_string(), + input_schema: json!({ + "type": "object", + "properties": { + "app_name": { + "type": "string", + "description": "Name of the application to activate (e.g., 'Safari', 'TextEdit')" + } + }, + "required": ["app_name"] + }), + }, + Tool { + name: "macax_get_ui_tree".to_string(), + description: "Get the UI element hierarchy of an application as a tree structure".to_string(), + input_schema: json!({ + "type": "object", + "properties": { + "app_name": { + "type": "string", + "description": "Name of the application" + }, + "max_depth": { + "type": "integer", + "description": "Maximum depth to traverse (default: 3)" + } + }, + "required": ["app_name"] + }), + }, + Tool { + name: "macax_find_elements".to_string(), + description: "Find UI elements in an application by role, title, or identifier. Use this to locate buttons, text fields, etc.".to_string(), + input_schema: json!({ + "type": "object", + "properties": { + "app_name": { + "type": "string", + "description": "Name of the application" + }, + "role": { + "type": "string", + "description": "UI element role (e.g., 'button', 'text field', 'window')" + }, + "title": { + "type": "string", + "description": "Element title or label to match" + }, + "identifier": { + "type": "string", + "description": "Element identifier (accessibility identifier)" + } + }, + "required": ["app_name"] + }), + }, + Tool { + name: "macax_click".to_string(), + description: "Click a UI element in an application".to_string(), + input_schema: json!({ + "type": "object", + "properties": { + "app_name": { + "type": "string", + "description": "Name of the application" + }, + "role": { + "type": "string", + "description": "UI element role (e.g., 'button')" + }, + "title": { + "type": "string", + "description": "Element title or label" + }, + "identifier": { + "type": "string", + "description": "Element identifier" + } + }, + "required": ["app_name", "role"] + }), + }, + Tool { + name: "macax_set_value".to_string(), + description: "Set the value of a UI element (e.g., type into a text field)".to_string(), + input_schema: json!({ + "type": "object", + "properties": { + "app_name": { + "type": "string", + "description": "Name of the application" + }, + "role": { + "type": "string", + "description": "UI element role (e.g., 'text field')" + }, + "value": { + "type": "string", + "description": "Value to set" + }, + "title": { + "type": "string", + "description": "Element title or label" + }, + "identifier": { + "type": "string", + "description": "Element identifier" + } + }, + "required": ["app_name", "role", "value"] + }), + }, + Tool { + name: "macax_get_value".to_string(), + description: "Get the value of a UI element (e.g., read text from a text field)".to_string(), + input_schema: json!({ + "type": "object", + "properties": { + "app_name": { + "type": "string", + "description": "Name of the application" + }, + "role": { + "type": "string", + "description": "UI element role (e.g., 'text field')" + }, + "title": { + "type": "string", + "description": "Element title or label" + }, + "identifier": { + "type": "string", + "description": "Element identifier" + } + }, + "required": ["app_name", "role"] + }), + }, + Tool { + name: "macax_press_key".to_string(), + description: "Press a keyboard key or shortcut in an application (e.g., Cmd+S to save)".to_string(), + input_schema: json!({ + "type": "object", + "properties": { + "app_name": { + "type": "string", + "description": "Name of the application" + }, + "key": { + "type": "string", + "description": "Key to press (e.g., 's', 'return', 'tab')" + }, + "modifiers": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Modifier keys (e.g., ['command', 'shift'])" + } + }, + "required": ["app_name", "key"] + }), + }, + ]); + + // Add type_text tool for typing arbitrary text + tools.push(Tool { + name: "macax_type_text".to_string(), + description: "Type arbitrary text into the currently focused element in an application (supports unicode, emojis, etc.)".to_string(), + input_schema: json!({ + "type": "object", + "properties": { + "app_name": { + "type": "string", + "description": "Name of the application" + }, + "text": { + "type": "string", + "description": "Text to type (can include unicode, emojis, special characters)" + } + }, + "required": ["app_name", "text"] + }), + }); + + // Add focus_element tool + tools.push(Tool { + name: "macax_focus_element".to_string(), + description: "Focus on a UI element (text field, text area, etc.) before typing".to_string(), + input_schema: json!({ + "type": "object", + "properties": { + "app_name": {"type": "string", "description": "Name of the application"}, + "role": {"type": "string", "description": "UI element role (e.g., 'text field', 'text area')"}, + "title": {"type": "string", "description": "Element title or label (optional)"}, + "identifier": {"type": "string", "description": "Element accessibility identifier (optional)"} + }, + "required": ["app_name", "role"] + }), + }); + } + tools } @@ -2469,7 +2704,7 @@ Template: // Ensure tools are included for native providers in subsequent iterations if provider.has_native_tool_calling() { - request.tools = Some(Self::create_tool_definitions(self.config.webdriver.enabled)); + request.tools = Some(Self::create_tool_definitions(self.config.webdriver.enabled, self.config.macax.enabled)); } // Only add to full_response if we haven't already added it @@ -3829,6 +4064,331 @@ Template: Err(_) => Ok("❌ Cannot quit: WebDriver session is still in use".to_string()), } } + "macax_list_apps" => { + debug!("Processing macax_list_apps tool call"); + + if !self.config.macax.enabled { + return Ok("❌ macOS Accessibility is not enabled. Use --macax flag to enable.".to_string()); + } + + let controller_guard = self.macax_controller.read().await; + let controller = match controller_guard.as_ref() { + Some(c) => c, + None => return Ok("❌ macOS Accessibility controller not initialized.".to_string()), + }; + + match controller.list_applications() { + Ok(apps) => { + let app_list: Vec = apps.iter().map(|a| a.name.clone()).collect(); + Ok(format!("Running applications:\n{}", app_list.join("\n"))) + } + Err(e) => Ok(format!("❌ Failed to list applications: {}", e)), + } + } + "macax_get_frontmost_app" => { + debug!("Processing macax_get_frontmost_app tool call"); + + if !self.config.macax.enabled { + return Ok("❌ macOS Accessibility is not enabled. Use --macax flag to enable.".to_string()); + } + + let controller_guard = self.macax_controller.read().await; + let controller = match controller_guard.as_ref() { + Some(c) => c, + None => return Ok("❌ macOS Accessibility controller not initialized.".to_string()), + }; + + match controller.get_frontmost_app() { + Ok(app) => Ok(format!("Frontmost application: {}", app.name)), + Err(e) => Ok(format!("❌ Failed to get frontmost app: {}", e)), + } + } + "macax_activate_app" => { + debug!("Processing macax_activate_app tool call"); + + if !self.config.macax.enabled { + return Ok("❌ macOS Accessibility is not enabled. Use --macax flag to enable.".to_string()); + } + + let app_name = match tool_call.args.get("app_name").and_then(|v| v.as_str()) { + Some(n) => n, + None => return Ok("❌ Missing app_name argument".to_string()), + }; + + let controller_guard = self.macax_controller.read().await; + let controller = match controller_guard.as_ref() { + Some(c) => c, + None => return Ok("❌ macOS Accessibility controller not initialized.".to_string()), + }; + + match controller.activate_app(app_name) { + Ok(_) => Ok(format!("✅ Activated application: {}", app_name)), + Err(e) => Ok(format!("❌ Failed to activate app: {}", e)), + } + } + "macax_get_ui_tree" => { + debug!("Processing macax_get_ui_tree tool call"); + + if !self.config.macax.enabled { + return Ok("❌ macOS Accessibility is not enabled. Use --macax flag to enable.".to_string()); + } + + let app_name = match tool_call.args.get("app_name").and_then(|v| v.as_str()) { + Some(n) => n, + None => return Ok("❌ Missing app_name argument".to_string()), + }; + + let max_depth = tool_call.args.get("max_depth") + .and_then(|v| v.as_u64()) + .map(|n| n as usize) + .unwrap_or(3); + + let controller_guard = self.macax_controller.read().await; + let controller = match controller_guard.as_ref() { + Some(c) => c, + None => return Ok("❌ macOS Accessibility controller not initialized.".to_string()), + }; + + match controller.get_ui_tree(app_name, max_depth) { + Ok(tree) => Ok(tree), + Err(e) => Ok(format!("❌ Failed to get UI tree: {}", e)), + } + } + "macax_find_elements" => { + debug!("Processing macax_find_elements tool call"); + + if !self.config.macax.enabled { + return Ok("❌ macOS Accessibility is not enabled. Use --macax flag to enable.".to_string()); + } + + let app_name = match tool_call.args.get("app_name").and_then(|v| v.as_str()) { + Some(n) => n, + None => return Ok("❌ Missing app_name argument".to_string()), + }; + + let role = tool_call.args.get("role").and_then(|v| v.as_str()); + let title = tool_call.args.get("title").and_then(|v| v.as_str()); + let identifier = tool_call.args.get("identifier").and_then(|v| v.as_str()); + + let controller_guard = self.macax_controller.read().await; + let controller = match controller_guard.as_ref() { + Some(c) => c, + None => return Ok("❌ macOS Accessibility controller not initialized.".to_string()), + }; + + match controller.find_elements(app_name, role, title, identifier) { + Ok(elements) => { + if elements.is_empty() { + Ok("No elements found matching criteria".to_string()) + } else { + let element_strs: Vec = elements.iter() + .map(|e| e.to_string()) + .collect(); + Ok(format!("Found {} element(s):\n{}", elements.len(), element_strs.join("\n"))) + } + } + Err(e) => Ok(format!("❌ Failed to find elements: {}", e)), + } + } + "macax_click" => { + debug!("Processing macax_click tool call"); + + if !self.config.macax.enabled { + return Ok("❌ macOS Accessibility is not enabled. Use --macax flag to enable.".to_string()); + } + + let app_name = match tool_call.args.get("app_name").and_then(|v| v.as_str()) { + Some(n) => n, + None => return Ok("❌ Missing app_name argument".to_string()), + }; + + let role = match tool_call.args.get("role").and_then(|v| v.as_str()) { + Some(r) => r, + None => return Ok("❌ Missing role argument".to_string()), + }; + + let title = tool_call.args.get("title").and_then(|v| v.as_str()); + let identifier = tool_call.args.get("identifier").and_then(|v| v.as_str()); + + let controller_guard = self.macax_controller.read().await; + let controller = match controller_guard.as_ref() { + Some(c) => c, + None => return Ok("❌ macOS Accessibility controller not initialized.".to_string()), + }; + + match controller.click_element(app_name, role, title, identifier) { + Ok(_) => Ok(format!("✅ Clicked {} element", role)), + Err(e) => Ok(format!("❌ Failed to click element: {}", e)), + } + } + "macax_set_value" => { + debug!("Processing macax_set_value tool call"); + + if !self.config.macax.enabled { + return Ok("❌ macOS Accessibility is not enabled. Use --macax flag to enable.".to_string()); + } + + let app_name = match tool_call.args.get("app_name").and_then(|v| v.as_str()) { + Some(n) => n, + None => return Ok("❌ Missing app_name argument".to_string()), + }; + + let role = match tool_call.args.get("role").and_then(|v| v.as_str()) { + Some(r) => r, + None => return Ok("❌ Missing role argument".to_string()), + }; + + let value = match tool_call.args.get("value").and_then(|v| v.as_str()) { + Some(v) => v, + None => return Ok("❌ Missing value argument".to_string()), + }; + + let title = tool_call.args.get("title").and_then(|v| v.as_str()); + let identifier = tool_call.args.get("identifier").and_then(|v| v.as_str()); + + let controller_guard = self.macax_controller.read().await; + let controller = match controller_guard.as_ref() { + Some(c) => c, + None => return Ok("❌ macOS Accessibility controller not initialized.".to_string()), + }; + + match controller.set_value(app_name, role, value, title, identifier) { + Ok(_) => Ok(format!("✅ Set value of {} element to: {}", role, value)), + Err(e) => Ok(format!("❌ Failed to set value: {}", e)), + } + } + "macax_get_value" => { + debug!("Processing macax_get_value tool call"); + + if !self.config.macax.enabled { + return Ok("❌ macOS Accessibility is not enabled. Use --macax flag to enable.".to_string()); + } + + let app_name = match tool_call.args.get("app_name").and_then(|v| v.as_str()) { + Some(n) => n, + None => return Ok("❌ Missing app_name argument".to_string()), + }; + + let role = match tool_call.args.get("role").and_then(|v| v.as_str()) { + Some(r) => r, + None => return Ok("❌ Missing role argument".to_string()), + }; + + let title = tool_call.args.get("title").and_then(|v| v.as_str()); + let identifier = tool_call.args.get("identifier").and_then(|v| v.as_str()); + + let controller_guard = self.macax_controller.read().await; + let controller = match controller_guard.as_ref() { + Some(c) => c, + None => return Ok("❌ macOS Accessibility controller not initialized.".to_string()), + }; + + match controller.get_value(app_name, role, title, identifier) { + Ok(value) => Ok(format!("Value: {}", value)), + Err(e) => Ok(format!("❌ Failed to get value: {}", e)), + } + } + "macax_press_key" => { + debug!("Processing macax_press_key tool call"); + + if !self.config.macax.enabled { + return Ok("❌ macOS Accessibility is not enabled. Use --macax flag to enable.".to_string()); + } + + let app_name = match tool_call.args.get("app_name").and_then(|v| v.as_str()) { + Some(n) => n, + None => return Ok("❌ Missing app_name argument".to_string()), + }; + + let key = match tool_call.args.get("key").and_then(|v| v.as_str()) { + Some(k) => k, + None => return Ok("❌ Missing key argument".to_string()), + }; + + let modifiers_vec: Vec<&str> = tool_call.args.get("modifiers") + .and_then(|v| v.as_array()) + .map(|arr| arr.iter() + .filter_map(|v| v.as_str()) + .collect()) + .unwrap_or_default(); + + let controller_guard = self.macax_controller.read().await; + let controller = match controller_guard.as_ref() { + Some(c) => c, + None => return Ok("❌ macOS Accessibility controller not initialized.".to_string()), + }; + + match controller.press_key(app_name, key, modifiers_vec.clone()) { + Ok(_) => { + let modifier_str = if modifiers_vec.is_empty() { + String::new() + } else { + format!(" with modifiers: {}", modifiers_vec.join("+")) + }; + Ok(format!("✅ Pressed key: {}{}", key, modifier_str)) + } + Err(e) => Ok(format!("❌ Failed to press key: {}", e)), + } + } + "macax_type_text" => { + debug!("Processing macax_type_text tool call"); + + if !self.config.macax.enabled { + return Ok("❌ macOS Accessibility is not enabled. Use --macax flag to enable.".to_string()); + } + + let app_name = match tool_call.args.get("app_name").and_then(|v| v.as_str()) { + Some(n) => n, + None => return Ok("❌ Missing app_name argument".to_string()), + }; + + let text = match tool_call.args.get("text").and_then(|v| v.as_str()) { + Some(t) => t, + None => return Ok("❌ Missing text argument".to_string()), + }; + + let controller_guard = self.macax_controller.read().await; + let controller = match controller_guard.as_ref() { + Some(c) => c, + None => return Ok("❌ macOS Accessibility controller not initialized.".to_string()), + }; + + match controller.type_text(app_name, text) { + Ok(_) => Ok(format!("✅ Typed text into {}", app_name)), + Err(e) => Ok(format!("❌ Failed to type text: {}", e)), + } + } + "macax_focus_element" => { + debug!("Processing macax_focus_element tool call"); + + if !self.config.macax.enabled { + return Ok("❌ macOS Accessibility is not enabled. Use --macax flag to enable.".to_string()); + } + + let app_name = match tool_call.args.get("app_name").and_then(|v| v.as_str()) { + Some(n) => n, + None => return Ok("❌ Missing app_name argument".to_string()), + }; + + let role = match tool_call.args.get("role").and_then(|v| v.as_str()) { + Some(r) => r, + None => return Ok("❌ Missing role argument".to_string()), + }; + + let title = tool_call.args.get("title").and_then(|v| v.as_str()); + let identifier = tool_call.args.get("identifier").and_then(|v| v.as_str()); + + let controller_guard = self.macax_controller.read().await; + let controller = match controller_guard.as_ref() { + Some(c) => c, + None => return Ok("❌ macOS Accessibility controller not initialized.".to_string()), + }; + + match controller.focus_element(app_name, role, title, identifier) { + Ok(_) => Ok(format!("✅ Focused {} element in {}", role, app_name)), + Err(e) => Ok(format!("❌ Failed to focus element: {}", e)), + } + } _ => { warn!("Unknown tool: {}", tool_call.tool); Ok(format!("❓ Unknown tool: {}", tool_call.tool)) From efd4eca75553e9907dc92dd33d044bb5ea083b53 Mon Sep 17 00:00:00 2001 From: Dhanji Prasanna Date: Thu, 23 Oct 2025 07:17:55 +1100 Subject: [PATCH 04/16] warnings fix --- crates/g3-computer-control/examples/safari_demo.rs | 2 +- .../g3-computer-control/examples/test_permission_prompt.rs | 2 +- crates/g3-computer-control/examples/test_screenshot_fix.rs | 1 - crates/g3-computer-control/src/lib.rs | 3 +++ crates/g3-computer-control/src/macax/controller.rs | 6 +----- 5 files changed, 6 insertions(+), 8 deletions(-) diff --git a/crates/g3-computer-control/examples/safari_demo.rs b/crates/g3-computer-control/examples/safari_demo.rs index aed4c1e..b28ebd6 100644 --- a/crates/g3-computer-control/examples/safari_demo.rs +++ b/crates/g3-computer-control/examples/safari_demo.rs @@ -31,7 +31,7 @@ async fn main() -> Result<()> { // Find an element println!("Finding h1 element..."); - let mut h1 = driver.find_element("h1").await?; + let h1 = driver.find_element("h1").await?; let h1_text = h1.text().await?; println!("H1 text: {}\n", h1_text); diff --git a/crates/g3-computer-control/examples/test_permission_prompt.rs b/crates/g3-computer-control/examples/test_permission_prompt.rs index bf1d640..fdd5a4b 100644 --- a/crates/g3-computer-control/examples/test_permission_prompt.rs +++ b/crates/g3-computer-control/examples/test_permission_prompt.rs @@ -1,4 +1,4 @@ -use g3_computer_control::{create_controller, ComputerController}; +use g3_computer_control::create_controller; #[tokio::main] async fn main() { diff --git a/crates/g3-computer-control/examples/test_screenshot_fix.rs b/crates/g3-computer-control/examples/test_screenshot_fix.rs index bcfb60b..467da49 100644 --- a/crates/g3-computer-control/examples/test_screenshot_fix.rs +++ b/crates/g3-computer-control/examples/test_screenshot_fix.rs @@ -1,6 +1,5 @@ use core_graphics::display::CGDisplay; use image::{ImageBuffer, RgbaImage}; -use std::path::Path; fn main() { let display = CGDisplay::main(); diff --git a/crates/g3-computer-control/src/lib.rs b/crates/g3-computer-control/src/lib.rs index 2eb686c..e4180c6 100644 --- a/crates/g3-computer-control/src/lib.rs +++ b/crates/g3-computer-control/src/lib.rs @@ -1,3 +1,6 @@ +// Suppress warnings from objc crate macros +#![allow(unexpected_cfgs)] + pub mod types; pub mod platform; pub mod webdriver; diff --git a/crates/g3-computer-control/src/macax/controller.rs b/crates/g3-computer-control/src/macax/controller.rs index a887714..ac91ac1 100644 --- a/crates/g3-computer-control/src/macax/controller.rs +++ b/crates/g3-computer-control/src/macax/controller.rs @@ -11,9 +11,6 @@ use core_foundation::base::TCFType; #[cfg(target_os = "macos")] use core_foundation::string::CFString; -#[cfg(target_os = "macos")] -use core_foundation::boolean::CFBoolean; - /// macOS Accessibility API controller using native APIs pub struct MacAxController { // Cache for application elements @@ -202,7 +199,7 @@ impl MacAxController { /// Activate (bring to front) an application #[cfg(target_os = "macos")] pub fn activate_app(&self, app_name: &str) -> Result<()> { - use cocoa::base::{id, nil}; + use cocoa::base::id; use objc::{class, msg_send, sel, sel_impl}; // Find the app @@ -507,7 +504,6 @@ impl MacAxController { /// Type text into the currently focused element (uses system text input) #[cfg(target_os = "macos")] pub fn type_text(&self, app_name: &str, text: &str) -> Result<()> { - use cocoa::appkit::NSPasteboard; use cocoa::base::{id, nil}; use cocoa::foundation::NSString; use objc::{class, msg_send, sel, sel_impl}; From 0be4829ca907f1d610c70e88644ef7aad520ca6a Mon Sep 17 00:00:00 2001 From: Dhanji Prasanna Date: Thu, 23 Oct 2025 13:16:13 +1100 Subject: [PATCH 05/16] thinning message highlighted --- crates/g3-cli/src/lib.rs | 5 +- crates/g3-cli/src/tui.rs | 32 ++ crates/g3-cli/src/ui_writer_impl.rs | 44 ++ .../src/platform/macos.rs.bak | 425 ------------------ crates/g3-core/src/lib.rs | 3 +- crates/g3-core/src/ui_writer.rs | 4 + 6 files changed, 83 insertions(+), 430 deletions(-) delete mode 100644 crates/g3-computer-control/src/platform/macos.rs.bak diff --git a/crates/g3-cli/src/lib.rs b/crates/g3-cli/src/lib.rs index 32000b3..db3b3f3 100644 --- a/crates/g3-cli/src/lib.rs +++ b/crates/g3-cli/src/lib.rs @@ -169,7 +169,7 @@ use tracing::{error, info}; use g3_core::error_handling::{classify_error, ErrorType, RecoverableError}; mod retro_tui; mod theme; -mod tui; +pub mod tui; mod ui_writer_impl; use retro_tui::RetroTui; use theme::ColorTheme; @@ -1099,9 +1099,8 @@ async fn run_interactive( continue; } "/thinnify" => { - output.print("🔧 Triggering manual context thinning..."); let summary = agent.force_thin(); - output.print(&summary); + output.print_context_thinning(&summary); continue; } "/readme" => { diff --git a/crates/g3-cli/src/tui.rs b/crates/g3-cli/src/tui.rs index aedd522..452db5c 100644 --- a/crates/g3-cli/src/tui.rs +++ b/crates/g3-cli/src/tui.rs @@ -1,5 +1,6 @@ use crossterm::style::Color; use crossterm::style::{SetForegroundColor, ResetColor}; +use std::io::{self, Write}; use termimad::MadSkin; /// Simple output handler with markdown support @@ -93,6 +94,37 @@ impl SimpleOutput { print!("{}", ResetColor); println!(" {:.1}% | {}/{} tokens", percentage, used, total); } + + pub fn print_context_thinning(&self, message: &str) { + // Animated highlight for context thinning + // Use bright cyan/green with a quick flash animation + + // Flash animation: print with bright background, then normal + let frames = vec![ + "\x1b[1;97;46m", // Frame 1: Bold white on cyan background + "\x1b[1;97;42m", // Frame 2: Bold white on green background + "\x1b[1;96;40m", // Frame 3: Bold cyan on black background + ]; + + println!(); + + // Quick flash animation + for frame in &frames { + print!("\r{} ✨ {} ✨\x1b[0m", frame, message); + let _ = io::stdout().flush(); + std::thread::sleep(std::time::Duration::from_millis(80)); + } + + // Final display with bright cyan and sparkle emojis + print!("\r\x1b[1;96m✨ {} ✨\x1b[0m", message); + println!(); + + // Add a subtle "success" indicator line + println!("\x1b[2;36m └─ Context optimized successfully\x1b[0m"); + println!(); + + let _ = io::stdout().flush(); + } } #[cfg(test)] diff --git a/crates/g3-cli/src/ui_writer_impl.rs b/crates/g3-cli/src/ui_writer_impl.rs index ae0b7a3..407e0d1 100644 --- a/crates/g3-cli/src/ui_writer_impl.rs +++ b/crates/g3-cli/src/ui_writer_impl.rs @@ -104,6 +104,37 @@ impl UiWriter for ConsoleUiWriter { println!("{}", message); } + fn print_context_thinning(&self, message: &str) { + // Animated highlight for context thinning + // Use bright cyan/green with a quick flash animation + + // Flash animation: print with bright background, then normal + let frames = vec![ + "\x1b[1;97;46m", // Frame 1: Bold white on cyan background + "\x1b[1;97;42m", // Frame 2: Bold white on green background + "\x1b[1;96;40m", // Frame 3: Bold cyan on black background + ]; + + println!(); + + // Quick flash animation + for frame in &frames { + print!("\r{} ✨ {} ✨\x1b[0m", frame, message); + let _ = io::stdout().flush(); + std::thread::sleep(std::time::Duration::from_millis(80)); + } + + // Final display with bright cyan and sparkle emojis + print!("\r\x1b[1;96m✨ {} ✨\x1b[0m", message); + println!(); + + // Add a subtle "success" indicator line + println!("\x1b[2;36m └─ Context optimized successfully\x1b[0m"); + println!(); + + let _ = io::stdout().flush(); + } + fn print_tool_header(&self, tool_name: &str) { // Store the tool name and clear args for collection *self.current_tool_name.lock().unwrap() = Some(tool_name.to_string()); @@ -360,6 +391,19 @@ impl UiWriter for RetroTuiWriter { self.tui.output(message); } + fn print_context_thinning(&self, message: &str) { + // For TUI, we'll use a highlighted output with special formatting + // The TUI will handle the visual presentation + + // Add visual separators and emphasis + self.tui.output(""); + self.tui.output("═══════════════════════════════════════════════════════════"); + self.tui.output(&format!("✨ {} ✨", message)); + self.tui.output(" └─ Context optimized successfully"); + self.tui.output("═══════════════════════════════════════════════════════════"); + self.tui.output(""); + } + fn print_tool_header(&self, tool_name: &str) { // Start collecting tool output *self.current_tool_start.lock().unwrap() = Some(Instant::now()); diff --git a/crates/g3-computer-control/src/platform/macos.rs.bak b/crates/g3-computer-control/src/platform/macos.rs.bak deleted file mode 100644 index 03d5050..0000000 --- a/crates/g3-computer-control/src/platform/macos.rs.bak +++ /dev/null @@ -1,425 +0,0 @@ -use crate::{ComputerController, types::*}; -use anyhow::Result; -use async_trait::async_trait; -use core_graphics::display::CGPoint; -use core_graphics::event::{CGEvent, CGEventType, CGMouseButton, CGEventTapLocation}; -use core_graphics::event_source::{CGEventSource, CGEventSourceStateID}; -use std::path::Path; -use tesseract::Tesseract; - -// MacOSController doesn't store CGEventSource to avoid Send/Sync issues -// We create it fresh for each operation -pub struct MacOSController { - // Empty struct - event source created per operation -} - -impl MacOSController { - pub fn new() -> Result { - // Test that we can create an event source - let _event_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState) - .map_err(|_| anyhow::anyhow!("Failed to create event source. Make sure Accessibility permissions are granted."))?; - Ok(Self {}) - } - - fn key_to_keycode(&self, key: &str) -> Result { - // Map key names to macOS keycodes - let keycode = match key.to_lowercase().as_str() { - "return" | "enter" => 36, - "tab" => 48, - "space" => 49, - "delete" | "backspace" => 51, - "escape" | "esc" => 53, - "command" | "cmd" => 55, - "shift" => 56, - "capslock" => 57, - "option" | "alt" => 58, - "control" | "ctrl" => 59, - "left" => 123, - "right" => 124, - "down" => 125, - "up" => 126, - _ => anyhow::bail!("Unknown key: {}", key), - }; - Ok(keycode) - } -} - -#[async_trait] -impl ComputerController for MacOSController { - async fn move_mouse(&self, x: i32, y: i32) -> Result<()> { - let event_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState) - .map_err(|_| anyhow::anyhow!("Failed to create event source"))?; - let point = CGPoint::new(x as f64, y as f64); - let event = CGEvent::new_mouse_event( - event_source, - CGEventType::MouseMoved, - point, - CGMouseButton::Left, - ).map_err(|_| anyhow::anyhow!("Failed to create mouse move event"))?; - - event.post(CGEventTapLocation::HID); - Ok(()) - } - - async fn click(&self, button: MouseButton) -> Result<()> { - let (cg_button, down_type, up_type) = match button { - MouseButton::Left => (CGMouseButton::Left, CGEventType::LeftMouseDown, CGEventType::LeftMouseUp), - MouseButton::Right => (CGMouseButton::Right, CGEventType::RightMouseDown, CGEventType::RightMouseUp), - MouseButton::Middle => (CGMouseButton::Center, CGEventType::OtherMouseDown, CGEventType::OtherMouseUp), - }; - - let point = { - // Get current mouse position - let temp_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState) - .map_err(|_| anyhow::anyhow!("Failed to create event source"))?; - let event = CGEvent::new(temp_source) - .map_err(|_| anyhow::anyhow!("Failed to get mouse position"))?; - let p = event.location(); - p - }; - - { - let event_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState) - .map_err(|_| anyhow::anyhow!("Failed to create event source"))?; - - // Mouse down - let down_event = CGEvent::new_mouse_event( - event_source, - down_type, - point, - cg_button, - ).map_err(|_| anyhow::anyhow!("Failed to create mouse down event"))?; - down_event.post(CGEventTapLocation::HID); - } // event_source and down_event dropped here - - // Small delay - tokio::time::sleep(tokio::time::Duration::from_millis(50)).await; - - { - let event_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState) - .map_err(|_| anyhow::anyhow!("Failed to create event source"))?; - - let up_event = CGEvent::new_mouse_event( - event_source, - up_type, - point, - cg_button, - ).map_err(|_| anyhow::anyhow!("Failed to create mouse up event"))?; - up_event.post(CGEventTapLocation::HID); - } // event_source and up_event dropped here - - Ok(()) - } - - async fn double_click(&self, button: MouseButton) -> Result<()> { - self.click(button).await?; - tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; - self.click(button).await?; - Ok(()) - } - - async fn type_text(&self, text: &str) -> Result<()> { - for ch in text.chars() { - { - let event_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState) - .map_err(|_| anyhow::anyhow!("Failed to create event source"))?; - - // Create keyboard event for character - let event = CGEvent::new_keyboard_event( - event_source, - 0, // keycode (0 for unicode) - true, - ).map_err(|_| anyhow::anyhow!("Failed to create keyboard event"))?; - - // Set unicode string - let mut utf16_buf = [0u16; 2]; - let utf16_slice = ch.encode_utf16(&mut utf16_buf); - let utf16_chars: Vec = utf16_slice.iter().copied().collect(); - - event.set_string_from_utf16_unchecked(utf16_chars.as_slice()); - event.post(CGEventTapLocation::HID); - } // event_source and event dropped here - - tokio::time::sleep(tokio::time::Duration::from_millis(10)).await; - } - Ok(()) - } - - async fn press_key(&self, key: &str) -> Result<()> { - let keycode = self.key_to_keycode(key)?; - - { - let event_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState) - .map_err(|_| anyhow::anyhow!("Failed to create event source"))?; - - // Key down - let down_event = CGEvent::new_keyboard_event( - event_source, - keycode, - true, - ).map_err(|_| anyhow::anyhow!("Failed to create key down event"))?; - down_event.post(CGEventTapLocation::HID); - } // event_source and down_event dropped here - - tokio::time::sleep(tokio::time::Duration::from_millis(50)).await; - - { - let event_source = CGEventSource::new(CGEventSourceStateID::CombinedSessionState) - .map_err(|_| anyhow::anyhow!("Failed to create event source"))?; - - // Key up - let up_event = CGEvent::new_keyboard_event( - event_source, - keycode, - false, - ).map_err(|_| anyhow::anyhow!("Failed to create key up event"))?; - up_event.post(CGEventTapLocation::HID); - } // event_source and up_event dropped here - - Ok(()) - } - - async fn list_windows(&self) -> Result> { - // Note: Full implementation would use CGWindowListCopyWindowInfo - // For now, return empty list as this requires more complex FFI - tracing::warn!("list_windows not fully implemented on macOS"); - Ok(vec![]) - } - - async fn focus_window(&self, _window_id: &str) -> Result<()> { - // Note: Full implementation would use NSWorkspace to activate application - tracing::warn!("focus_window not fully implemented on macOS"); - Ok(()) - } - - async fn get_window_bounds(&self, _window_id: &str) -> Result { - // Note: Full implementation would use Accessibility API - tracing::warn!("get_window_bounds not fully implemented on macOS"); - Ok(Rect { x: 0, y: 0, width: 800, height: 600 }) - } - - async fn find_element(&self, _selector: &ElementSelector) -> Result> { - // Note: Full implementation would use macOS Accessibility API - tracing::warn!("find_element not fully implemented on macOS"); - Ok(None) - } - - async fn get_element_text(&self, _element_id: &str) -> Result { - // Note: Full implementation would use Accessibility API - tracing::warn!("get_element_text not fully implemented on macOS"); - Ok(String::new()) - } - - async fn get_element_bounds(&self, _element_id: &str) -> Result { - // Note: Full implementation would use Accessibility API - tracing::warn!("get_element_bounds not fully implemented on macOS"); - Ok(Rect { x: 0, y: 0, width: 100, height: 30 }) - } - - async fn take_screenshot(&self, path: &str, _region: Option, window_id: Option<&str>) -> Result<()> { - // Use native macOS screencapture command which handles all the format complexities - - // Check if we have Screen Recording permission by attempting a test capture - // If we only get wallpaper/menubar but no windows, we need permission - let needs_permission_check = std::env::var("G3_SKIP_PERMISSION_CHECK").is_err(); - - if needs_permission_check { - // Try to open Screen Recording settings if this is the first screenshot - static PERMISSION_PROMPTED: std::sync::atomic::AtomicBool = std::sync::atomic::AtomicBool::new(false); - - if !PERMISSION_PROMPTED.swap(true, std::sync::atomic::Ordering::Relaxed) { - tracing::warn!("\n=== Screen Recording Permission Required ===\n\ - macOS requires explicit permission to capture window content.\n\ - If screenshots only show wallpaper/menubar (no windows):\n\n\ - 1. Open System Settings > Privacy & Security > Screen Recording\n\ - 2. Enable permission for your terminal (iTerm/Terminal) or g3\n\ - 3. Restart your terminal if needed\n\n\ - Opening Screen Recording settings now...\n"); - - // Try to open the settings (non-blocking) - let _ = std::process::Command::new("open") - .arg("x-apple.systempreferences:com.apple.preference.security?Privacy_ScreenCapture") - .spawn(); - } - } - - let path_obj = Path::new(path); - if let Some(parent) = path_obj.parent() { - std::fs::create_dir_all(parent)?; - } - - let mut cmd = std::process::Command::new("screencapture"); - - // Add flags - cmd.arg("-x"); // No sound - - if let Some(window_id) = window_id { - // Capture specific window by getting its bounds and using region capture - // window_id format: "AppName" or "AppName:WindowTitle" - let app_name = window_id.split(':').next().unwrap_or(window_id); - - // Use AppleScript to get window bounds - let script = format!( - r#"tell application "{}" - tell current window - get bounds - end tell - end tell"#, - app_name - ); - - let output = std::process::Command::new("osascript") - .arg("-e") - .arg(&script) - .output() - .map_err(|e| anyhow::anyhow!("Failed to get window bounds: {}", e))?; - - if output.status.success() { - let bounds_str = String::from_utf8_lossy(&output.stdout); - let bounds: Vec = bounds_str - .trim() - .split(',') - .filter_map(|s| s.trim().parse().ok()) - .collect(); - - if bounds.len() == 4 { - let (left, top, right, bottom) = (bounds[0], bounds[1], bounds[2], bounds[3]); - let width = right - left; - let height = bottom - top; - - cmd.arg("-R"); - cmd.arg(format!("{},{},{},{}", left, top, width, height)); - - tracing::debug!("Capturing window '{}' at region: {},{} {}x{}", app_name, left, top, width, height); - } else { - tracing::warn!("Failed to parse window bounds, capturing full screen"); - } - } else { - tracing::warn!("Failed to get window bounds for '{}', capturing full screen", app_name); - } - } else if let Some(region) = _region { - // Capture specific region: -R x,y,width,height - cmd.arg("-R"); - cmd.arg(format!("{},{},{},{}", region.x, region.y, region.width, region.height)); - } - - cmd.arg(path); - - let output = cmd.output() - .map_err(|e| anyhow::anyhow!("Failed to execute screencapture: {}", e))?; - - if !output.status.success() { - let stderr = String::from_utf8_lossy(&output.stderr); - anyhow::bail!("screencapture failed: {}", stderr); - } - - tracing::debug!("Screenshot saved using screencapture: {}", path); - - Ok(()) - } - - } - - async fn extract_text_from_screen(&self, region: Rect) -> Result { - // Take screenshot of region first - let temp_path = format!("/tmp/g3_ocr_{}.png", uuid::Uuid::new_v4()); - self.take_screenshot(&temp_path, Some(region), None).await?; - - // Extract text from the screenshot - let result = self.extract_text_from_image(&temp_path).await?; - - // Clean up temp file - let _ = std::fs::remove_file(&temp_path); - - Ok(result) - } - - async fn extract_text_from_image(&self, _path: &str) -> Result { - // Check if tesseract is available on the system - let tesseract_check = std::process::Command::new("which") - .arg("tesseract") - .output(); - - if tesseract_check.is_err() || !tesseract_check.as_ref().unwrap().status.success() { - anyhow::bail!("Tesseract OCR is not installed on your system.\n\n\ - To install tesseract:\n macOS: brew install tesseract\n \ - Linux: sudo apt-get install tesseract-ocr (Ubuntu/Debian)\n \ - sudo yum install tesseract (RHEL/CentOS)\n \ - Windows: Download from https://github.com/UB-Mannheim/tesseract/wiki\n\n\ - After installation, restart your terminal and try again."); - } - - // Initialize Tesseract - let tess = Tesseract::new(None, Some("eng")) - .map_err(|e| { - anyhow::anyhow!("Failed to initialize Tesseract: {}\n\n\ - This usually means:\n1. Tesseract is not properly installed\n\ - 2. Language data files are missing\n\nTo fix:\n \ - macOS: brew reinstall tesseract\n \ - Linux: sudo apt-get install tesseract-ocr-eng\n \ - Windows: Reinstall tesseract and ensure language files are included", e) - })?; - - let text = tess.set_image(_path) - .map_err(|e| anyhow::anyhow!("Failed to load image '{}': {}", _path, e))? - .get_text() - .map_err(|e| anyhow::anyhow!("Failed to extract text from image: {}", e))?; - - // Get confidence (simplified - would need more complex API calls for per-word confidence) - let confidence = 0.85; // Placeholder - - Ok(OCRResult { - text, - confidence, - bounds: Rect { x: 0, y: 0, width: 0, height: 0 }, // Would need image dimensions - }) - } - - async fn find_text_on_screen(&self, _text: &str) -> Result> { - // Check if tesseract is available on the system - let tesseract_check = std::process::Command::new("which") - .arg("tesseract") - .output(); - - if tesseract_check.is_err() || !tesseract_check.as_ref().unwrap().status.success() { - anyhow::bail!("Tesseract OCR is not installed on your system.\n\n\ - To install tesseract:\n macOS: brew install tesseract\n \ - Linux: sudo apt-get install tesseract-ocr (Ubuntu/Debian)\n \ - sudo yum install tesseract (RHEL/CentOS)\n \ - Windows: Download from https://github.com/UB-Mannheim/tesseract/wiki\n\n\ - After installation, restart your terminal and try again."); - } - - // Take full screen screenshot - let temp_path = format!("/tmp/g3_ocr_search_{}.png", uuid::Uuid::new_v4()); - self.take_screenshot(&temp_path, None, None).await?; - - // Use Tesseract to find text with bounding boxes - let tess = Tesseract::new(None, Some("eng")) - .map_err(|e| { - anyhow::anyhow!("Failed to initialize Tesseract: {}\n\n\ - This usually means:\n1. Tesseract is not properly installed\n\ - 2. Language data files are missing\n\nTo fix:\n \ - macOS: brew reinstall tesseract\n \ - Linux: sudo apt-get install tesseract-ocr-eng\n \ - Windows: Reinstall tesseract and ensure language files are included", e) - })?; - - let full_text = tess.set_image(temp_path.as_str()) - .map_err(|e| anyhow::anyhow!("Failed to load screenshot: {}", e))? - .get_text() - .map_err(|e| anyhow::anyhow!("Failed to extract text from screen: {}", e))?; - - // Clean up temp file - let _ = std::fs::remove_file(&temp_path); - - // Simple text search - full implementation would use get_component_images - // to get bounding boxes for each word - if full_text.contains(_text) { - tracing::warn!("Text found but precise coordinates not available in simplified implementation"); - Ok(Some(Point { x: 0, y: 0 })) - } else { - Ok(None) - } - } -} diff --git a/crates/g3-core/src/lib.rs b/crates/g3-core/src/lib.rs index 21e7bd0..2786182 100644 --- a/crates/g3-core/src/lib.rs +++ b/crates/g3-core/src/lib.rs @@ -2473,8 +2473,7 @@ Template: let (thin_summary, chars_saved) = self.context_window.thin_context(); self.thinning_events.push(chars_saved); // Print the thinning summary to the user - self.ui_writer.println(""); - self.ui_writer.print_context_status(&format!("{}\n", thin_summary)); + self.ui_writer.print_context_thinning(&thin_summary); } // Track what we've already displayed before getting new text diff --git a/crates/g3-core/src/ui_writer.rs b/crates/g3-core/src/ui_writer.rs index 1b532e7..b907ea6 100644 --- a/crates/g3-core/src/ui_writer.rs +++ b/crates/g3-core/src/ui_writer.rs @@ -17,6 +17,9 @@ pub trait UiWriter: Send + Sync { /// Print a context window status message fn print_context_status(&self, message: &str); + /// Print a context thinning success message with highlight and animation + fn print_context_thinning(&self, message: &str); + /// Print a tool execution header fn print_tool_header(&self, tool_name: &str); @@ -60,6 +63,7 @@ impl UiWriter for NullUiWriter { fn print_inline(&self, _message: &str) {} fn print_system_prompt(&self, _prompt: &str) {} fn print_context_status(&self, _message: &str) {} + fn print_context_thinning(&self, _message: &str) {} fn print_tool_header(&self, _tool_name: &str) {} fn print_tool_arg(&self, _key: &str, _value: &str) {} fn print_tool_output_header(&self) {} From e1e732150a1a29e7cd1a4850e512dee38cfbf1bf Mon Sep 17 00:00:00 2001 From: Dhanji Prasanna Date: Fri, 24 Oct 2025 10:11:43 +1100 Subject: [PATCH 06/16] coach rigor +++ --- crates/g3-cli/src/lib.rs | 14 +- .../examples/list_windows.rs | 8 +- .../tests/integration_test.rs | 45 ------ crates/g3-core/src/lib.rs | 149 +++++++++++++++++- crates/g3-core/tests/test_context_thinning.rs | 121 +++++++++++++- 5 files changed, 278 insertions(+), 59 deletions(-) diff --git a/crates/g3-cli/src/lib.rs b/crates/g3-cli/src/lib.rs index db3b3f3..081c5da 100644 --- a/crates/g3-cli/src/lib.rs +++ b/crates/g3-cli/src/lib.rs @@ -243,6 +243,10 @@ pub struct Cli { /// Enable macOS Accessibility API tools for native app automation #[arg(long)] pub macax: bool, + + /// Enable WebDriver browser automation tools + #[arg(long)] + pub webdriver: bool, } pub async fn run() -> Result<()> { @@ -451,6 +455,11 @@ Output ONLY the markdown content, no explanations or meta-commentary."#, } } + // Apply webdriver flag override + if cli.webdriver { + config.webdriver.enabled = true; + } + // Validate provider if specified if let Some(ref provider) = cli.provider { let valid_providers = ["anthropic", "databricks", "embedded", "openai"]; @@ -1630,6 +1639,7 @@ Review the current state of the project and provide a concise critique focusing 2. Whether the project compiles successfully 3. What requirements are missing or incorrect 4. Specific improvements needed to satisfy requirements +5. Use UI tools such as webdriver to test functionality thoroughly CRITICAL INSTRUCTIONS: 1. You MUST use the final_output tool to provide your feedback @@ -1637,13 +1647,13 @@ CRITICAL INSTRUCTIONS: 3. Focus ONLY on what needs to be fixed or improved 4. Do NOT include your analysis process, file contents, or compilation output in the summary -If the implementation generally meets all requirements and compiles without errors: +If the implementation thoroughly meets all requirements, compiles and is fully tested (especially UI flows) *WITHOUT* gaps or errors: - Call final_output with summary: 'IMPLEMENTATION_APPROVED' If improvements are needed: - Call final_output with a brief summary listing ONLY the specific issues to fix -Remember: Be clear in your review and concise in your feedback. APPROVE if the implementation works and generally fits the requirements. Don't be picky.", +Remember: Be clear in your review and concise in your feedback. APPROVE if the implementation works and thoroughly fits the requirements (implementation > 95% complete). Be rigorous, especially by testing that all UI features work.", requirements ); diff --git a/crates/g3-computer-control/examples/list_windows.rs b/crates/g3-computer-control/examples/list_windows.rs index 5b571d9..e638a19 100644 --- a/crates/g3-computer-control/examples/list_windows.rs +++ b/crates/g3-computer-control/examples/list_windows.rs @@ -1,7 +1,7 @@ use core_graphics::window::{kCGWindowListOptionOnScreenOnly, kCGNullWindowID, CGWindowListCopyWindowInfo}; use core_foundation::dictionary::CFDictionary; use core_foundation::string::CFString; -use core_foundation::base::TCFType; +use core_foundation::base::{TCFType, ToVoid}; fn main() { println!("Listing all on-screen windows..."); @@ -22,7 +22,7 @@ fn main() { // Get window ID let window_id_key = CFString::from_static_string("kCGWindowNumber"); - let window_id: i64 = if let Some(value) = dict.find(window_id_key.as_concrete_TypeRef()) { + let window_id: i64 = if let Some(value) = dict.find(window_id_key.to_void()) { let num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*value as *const _); num.to_i64().unwrap_or(0) } else { @@ -31,7 +31,7 @@ fn main() { // Get owner name let owner_key = CFString::from_static_string("kCGWindowOwnerName"); - let owner: String = if let Some(value) = dict.find(owner_key.as_concrete_TypeRef()) { + let owner: String = if let Some(value) = dict.find(owner_key.to_void()) { let s: CFString = TCFType::wrap_under_get_rule(*value as *const _); s.to_string() } else { @@ -40,7 +40,7 @@ fn main() { // Get window name/title let name_key = CFString::from_static_string("kCGWindowName"); - let title: String = if let Some(value) = dict.find(name_key.as_concrete_TypeRef()) { + let title: String = if let Some(value) = dict.find(name_key.to_void()) { let s: CFString = TCFType::wrap_under_get_rule(*value as *const _); s.to_string() } else { diff --git a/crates/g3-computer-control/tests/integration_test.rs b/crates/g3-computer-control/tests/integration_test.rs index 75c884f..87227e5 100644 --- a/crates/g3-computer-control/tests/integration_test.rs +++ b/crates/g3-computer-control/tests/integration_test.rs @@ -1,23 +1,5 @@ use g3_computer_control::*; -#[tokio::test] -async fn test_mouse_movement() { - let controller = create_controller().expect("Failed to create controller"); - - // Move mouse to center of screen (assuming 1920x1080) - let result = controller.move_mouse(960, 540).await; - assert!(result.is_ok(), "Failed to move mouse: {:?}", result.err()); -} - -#[tokio::test] -async fn test_typing() { - let controller = create_controller().expect("Failed to create controller"); - - // Type some text - let result = controller.type_text("Hello, World!").await; - assert!(result.is_ok(), "Failed to type text: {:?}", result.err()); -} - #[tokio::test] async fn test_screenshot() { let controller = create_controller().expect("Failed to create controller"); @@ -33,30 +15,3 @@ async fn test_screenshot() { // Clean up let _ = std::fs::remove_file(path); } - -#[tokio::test] -async fn test_click() { - let controller = create_controller().expect("Failed to create controller"); - - // Click at a safe location - let result = controller.click(types::MouseButton::Left).await; - assert!(result.is_ok(), "Failed to click: {:?}", result.err()); -} - -#[tokio::test] -async fn test_double_click() { - let controller = create_controller().expect("Failed to create controller"); - - // Double click - let result = controller.double_click(types::MouseButton::Left).await; - assert!(result.is_ok(), "Failed to double click: {:?}", result.err()); -} - -#[tokio::test] -async fn test_press_key() { - let controller = create_controller().expect("Failed to create controller"); - - // Press escape key - let result = controller.press_key("escape").await; - assert!(result.is_ok(), "Failed to press key: {:?}", result.err()); -} diff --git a/crates/g3-core/src/lib.rs b/crates/g3-core/src/lib.rs index 2786182..61bb974 100644 --- a/crates/g3-core/src/lib.rs +++ b/crates/g3-core/src/lib.rs @@ -466,6 +466,7 @@ Format this as a detailed but concise summary that can be used to resume the con let first_third_end = (total_messages / 3).max(1); let mut leaned_count = 0; + let mut tool_call_leaned_count = 0; let mut chars_saved = 0; // Create ~/tmp directory if it doesn't exist @@ -478,7 +479,7 @@ Format this as a detailed but concise summary that can be used to resume the con // Scan the first third of messages for i in 0..first_third_end { if let Some(message) = self.conversation_history.get_mut(i) { - // Only process User messages that look like tool results + // Process User messages that look like tool results if matches!(message.role, MessageRole::User) && message.content.starts_with("Tool result:") { let content_len = message.content.len(); @@ -508,6 +509,109 @@ Format this as a detailed but concise summary that can be used to resume the con debug!("Thinned tool result {} ({} chars) to {}", i, original_len, file_path); } } + + // Process Assistant messages that contain tool calls with large arguments + if matches!(message.role, MessageRole::Assistant) { + // Try to parse the message content as JSON to find tool calls + let content = &message.content; + + // Look for JSON tool call patterns + if let Some(tool_call_start) = content.find(r#"{"tool":"#) + .or_else(|| content.find(r#"{ "tool":"#)) + .or_else(|| content.find(r#"{"tool" :"#)) + .or_else(|| content.find(r#"{ "tool" :"#)) + { + // Try to extract and parse the JSON tool call + let json_portion = &content[tool_call_start..]; + + // Find the end of the JSON object + if let Some(json_end) = Self::find_json_end(json_portion) { + let json_str = &json_portion[..=json_end]; + + // Try to parse as ToolCall + if let Ok(mut tool_call) = serde_json::from_str::(json_str) { + let mut modified = false; + + // Handle write_file tool calls + if tool_call.tool == "write_file" { + if let Some(args_obj) = tool_call.args.as_object_mut() { + // Extract content to avoid borrow issues + let content_info = args_obj.get("content") + .and_then(|v| v.as_str()) + .map(|s| (s.to_string(), s.len())); + + if let Some((content_str, content_len)) = content_info { + // Only thin if content is greater than 1000 chars + if content_len > 1000 { + let timestamp = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_secs(); + let filename = format!("leaned_write_file_content_{}_{}.txt", timestamp, i); + let file_path = format!("{}/{}", tmp_dir, filename); + + if std::fs::write(&file_path, &content_str).is_ok() { + args_obj.insert( + "content".to_string(), + serde_json::Value::String(format!("", file_path)) + ); + modified = true; + chars_saved += content_len; + tool_call_leaned_count += 1; + debug!("Thinned write_file content {} ({} chars) to {}", i, content_len, file_path); + } + } + } + } + } + + // Handle str_replace tool calls + if tool_call.tool == "str_replace" { + if let Some(args_obj) = tool_call.args.as_object_mut() { + // Extract diff to avoid borrow issues + let diff_info = args_obj.get("diff") + .and_then(|v| v.as_str()) + .map(|s| (s.to_string(), s.len())); + + if let Some((diff_str, diff_len)) = diff_info { + // Only thin if diff is greater than 1000 chars + if diff_len > 1000 { + let timestamp = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_secs(); + let filename = format!("leaned_str_replace_diff_{}_{}.txt", timestamp, i); + let file_path = format!("{}/{}", tmp_dir, filename); + + if std::fs::write(&file_path, &diff_str).is_ok() { + args_obj.insert( + "diff".to_string(), + serde_json::Value::String(format!("", file_path)) + ); + modified = true; + chars_saved += diff_len; + tool_call_leaned_count += 1; + debug!("Thinned str_replace diff {} ({} chars) to {}", i, diff_len, file_path); + } + } + } + } + } + + // If we modified the tool call, reconstruct the message + if modified { + let prefix = &content[..tool_call_start]; + let suffix = &content[tool_call_start + json_str.len()..]; + + // Serialize the modified tool call + if let Ok(new_json) = serde_json::to_string(&tool_call) { + message.content = format!("{}{}{}", prefix, new_json, suffix); + } + } + } + } + } + } } } @@ -515,10 +619,18 @@ Format this as a detailed but concise summary that can be used to resume the con self.recalculate_tokens(); if leaned_count > 0 { - (format!("🥒 Context thinned at {}%: {} tool results, ~{} chars saved", - current_threshold, leaned_count, chars_saved), chars_saved) + if tool_call_leaned_count > 0 { + (format!("🥒 Context thinned at {}%: {} tool results + {} tool calls, ~{} chars saved", + current_threshold, leaned_count, tool_call_leaned_count, chars_saved), chars_saved) + } else { + (format!("🥒 Context thinned at {}%: {} tool results, ~{} chars saved", + current_threshold, leaned_count, chars_saved), chars_saved) + } + } else if tool_call_leaned_count > 0 { + (format!("🥒 Context thinned at {}%: {} tool calls, ~{} chars saved", + current_threshold, tool_call_leaned_count, chars_saved), chars_saved) } else { - (format!("ℹ Context thinning triggered at {}% but no large tool results found in first third", + (format!("ℹ Context thinning triggered at {}% but no large tool results or tool calls found in first third", current_threshold), 0) } } @@ -533,6 +645,35 @@ Format this as a detailed but concise summary that can be used to resume the con debug!("Recalculated tokens after thinning: {} tokens", total); } + + /// Helper function to find the end of a JSON object + fn find_json_end(json_str: &str) -> Option { + let mut brace_count = 0; + let mut in_string = false; + let mut escape_next = false; + + for (i, ch) in json_str.char_indices() { + if escape_next { + escape_next = false; + continue; + } + + match ch { + '\\' => escape_next = true, + '"' if !escape_next => in_string = !in_string, + '{' if !in_string => brace_count += 1, + '}' if !in_string => { + brace_count -= 1; + if brace_count == 0 { + return Some(i); + } + } + _ => {} + } + } + + None + } } pub struct Agent { diff --git a/crates/g3-core/tests/test_context_thinning.rs b/crates/g3-core/tests/test_context_thinning.rs index 760524f..db6761f 100644 --- a/crates/g3-core/tests/test_context_thinning.rs +++ b/crates/g3-core/tests/test_context_thinning.rs @@ -72,7 +72,7 @@ fn test_thin_context_basic() { // Trigger thinning at 50% context.used_tokens = 5000; - let summary = context.thin_context(); + let (summary, _chars_saved) = context.thin_context(); println!("Thinning summary: {}", summary); @@ -93,6 +93,119 @@ fn test_thin_context_basic() { } } +#[test] +fn test_thin_write_file_tool_calls() { + let mut context = ContextWindow::new(10000); + + // Add some messages including a write_file tool call with large content + context.add_message(Message { + role: MessageRole::User, + content: "Please create a large file".to_string(), + }); + + // Add an assistant message with a write_file tool call containing large content + let large_content = "x".repeat(1500); + let tool_call_json = format!( + r#"{{"tool": "write_file", "args": {{"file_path": "test.txt", "content": "{}"}}}}"#, + large_content + ); + context.add_message(Message { + role: MessageRole::Assistant, + content: format!("I'll create that file.\n\n{}", tool_call_json), + }); + + context.add_message(Message { + role: MessageRole::User, + content: "Tool result: ✅ Successfully wrote 1500 lines".to_string(), + }); + + // Add more messages to ensure we have enough for "first third" logic + for i in 0..6 { + context.add_message(Message { + role: MessageRole::Assistant, + content: format!("Response {}", i), + }); + } + + // Trigger thinning at 50% + context.used_tokens = 5000; + let (summary, _chars_saved) = context.thin_context(); + + println!("Thinning summary: {}", summary); + + // Should have thinned the write_file tool call + assert!(summary.contains("tool call") || summary.contains("chars saved")); + + // Check that the large content was replaced with a file reference + let first_third_end = context.conversation_history.len() / 3; + for i in 0..first_third_end { + if let Some(msg) = context.conversation_history.get(i) { + if matches!(msg.role, MessageRole::Assistant) && msg.content.contains("write_file") { + // The content should now reference an external file + assert!(msg.content.contains(" Date: Fri, 24 Oct 2025 10:45:24 +1100 Subject: [PATCH 07/16] more macax tooling --- crates/g3-cli/src/lib.rs | 6 +- crates/g3-computer-control/src/lib.rs | 6 + .../g3-computer-control/src/platform/macos.rs | 150 ++++++++++++++++- crates/g3-computer-control/src/types.rs | 10 ++ crates/g3-core/src/lib.rs | 157 +++++++++++++++++- crates/g3-providers/src/databricks.rs | 8 + 6 files changed, 328 insertions(+), 9 deletions(-) diff --git a/crates/g3-cli/src/lib.rs b/crates/g3-cli/src/lib.rs index 081c5da..3facf6a 100644 --- a/crates/g3-cli/src/lib.rs +++ b/crates/g3-cli/src/lib.rs @@ -1639,7 +1639,7 @@ Review the current state of the project and provide a concise critique focusing 2. Whether the project compiles successfully 3. What requirements are missing or incorrect 4. Specific improvements needed to satisfy requirements -5. Use UI tools such as webdriver to test functionality thoroughly +5. Use UI tools such as webdriver or macax to test functionality thoroughly CRITICAL INSTRUCTIONS: 1. You MUST use the final_output tool to provide your feedback @@ -1647,13 +1647,13 @@ CRITICAL INSTRUCTIONS: 3. Focus ONLY on what needs to be fixed or improved 4. Do NOT include your analysis process, file contents, or compilation output in the summary -If the implementation thoroughly meets all requirements, compiles and is fully tested (especially UI flows) *WITHOUT* gaps or errors: +If the implementation thoroughly meets all requirements, compiles and is fully tested (especially UI flows) *WITHOUT* minor gaps or errors: - Call final_output with summary: 'IMPLEMENTATION_APPROVED' If improvements are needed: - Call final_output with a brief summary listing ONLY the specific issues to fix -Remember: Be clear in your review and concise in your feedback. APPROVE if the implementation works and thoroughly fits the requirements (implementation > 95% complete). Be rigorous, especially by testing that all UI features work.", +Remember: Be clear in your review and concise in your feedback. APPROVE iff the implementation works and thoroughly fits the requirements (implementation > 95% complete). Be rigorous, especially by testing that all UI features work.", requirements ); diff --git a/crates/g3-computer-control/src/lib.rs b/crates/g3-computer-control/src/lib.rs index e4180c6..ad564b5 100644 --- a/crates/g3-computer-control/src/lib.rs +++ b/crates/g3-computer-control/src/lib.rs @@ -24,6 +24,12 @@ pub trait ComputerController: Send + Sync { // OCR operations async fn extract_text_from_screen(&self, region: Rect) -> Result; async fn extract_text_from_image(&self, path: &str) -> Result; + async fn extract_text_with_locations(&self, path: &str) -> Result>; + async fn find_text_on_screen(&self, search_text: &str) -> Result>; + + // Mouse operations + fn move_mouse(&self, x: i32, y: i32) -> Result<()>; + fn click_at(&self, x: i32, y: i32) -> Result<()>; } // Platform-specific constructor diff --git a/crates/g3-computer-control/src/platform/macos.rs b/crates/g3-computer-control/src/platform/macos.rs index 129b73c..d2e6a0a 100644 --- a/crates/g3-computer-control/src/platform/macos.rs +++ b/crates/g3-computer-control/src/platform/macos.rs @@ -1,5 +1,5 @@ -use crate::{ComputerController, types::Rect}; -use anyhow::Result; +use crate::{ComputerController, types::{Rect, TextLocation}}; +use anyhow::{Result, Context}; use async_trait::async_trait; use std::path::Path; use tesseract::Tesseract; @@ -122,4 +122,150 @@ impl ComputerController for MacOSController { Ok(text) } + + async fn extract_text_with_locations(&self, path: &str) -> Result> { + // For now, use tesseract CLI with TSV output to get bounding boxes + // This is a workaround since the Rust tesseract crate doesn't expose get_component_boxes + let output = std::process::Command::new("tesseract") + .arg(path) + .arg("stdout") + .arg("tsv") + .output() + .map_err(|e| anyhow::anyhow!("Failed to run tesseract: {}", e))?; + + if !output.status.success() { + anyhow::bail!("Tesseract failed: {}", String::from_utf8_lossy(&output.stderr)); + } + + let tsv_text = String::from_utf8_lossy(&output.stdout); + let mut locations = Vec::new(); + + // Parse TSV output (skip header line) + for (i, line) in tsv_text.lines().enumerate() { + if i == 0 { continue; } // Skip header + + let parts: Vec<&str> = line.split('\t').collect(); + if parts.len() >= 12 { + // TSV format: level, page_num, block_num, par_num, line_num, word_num, + // left, top, width, height, conf, text + if let (Ok(x), Ok(y), Ok(w), Ok(h), Ok(conf), text) = ( + parts[6].parse::(), + parts[7].parse::(), + parts[8].parse::(), + parts[9].parse::(), + parts[10].parse::(), + parts[11], + ) { + let trimmed = text.trim(); + if !trimmed.is_empty() && conf > 0.0 { + locations.push(TextLocation { + text: trimmed.to_string(), + x, + y, + width: w, + height: h, + confidence: conf / 100.0, // Convert from 0-100 to 0-1 + }); + } + } + } + } + + Ok(locations) + } + + async fn find_text_on_screen(&self, search_text: &str) -> Result> { + // Take full screenshot + let home = std::env::var("HOME").unwrap_or_else(|_| "/tmp".to_string()); + let temp_path = format!("{}/Desktop/g3_find_text_{}.png", home, uuid::Uuid::new_v4()); + self.take_screenshot(&temp_path, None, None).await?; + + // Extract all text with locations + let locations = self.extract_text_with_locations(&temp_path).await?; + + // Clean up temp file + let _ = std::fs::remove_file(&temp_path); + + // Find matching text (case-insensitive) + let search_lower = search_text.to_lowercase(); + for location in locations { + if location.text.to_lowercase().contains(&search_lower) { + return Ok(Some(location)); + } + } + + Ok(None) + } + + fn move_mouse(&self, x: i32, y: i32) -> Result<()> { + use core_graphics::event::{ + CGEvent, CGEventTapLocation, CGEventType, CGMouseButton, + }; + use core_graphics::event_source::{ + CGEventSource, CGEventSourceStateID, + }; + use core_graphics::geometry::CGPoint; + + let source = CGEventSource::new(CGEventSourceStateID::HIDSystemState) + .ok().context("Failed to create event source")?; + + let event = CGEvent::new_mouse_event( + source, + CGEventType::MouseMoved, + CGPoint::new(x as f64, y as f64), + CGMouseButton::Left, + ).ok().context("Failed to create mouse event")?; + + event.post(CGEventTapLocation::HID); + + Ok(()) + } + + fn click_at(&self, x: i32, y: i32) -> Result<()> { + use core_graphics::event::{ + CGEvent, CGEventTapLocation, CGEventType, CGMouseButton, + }; + use core_graphics::event_source::{ + CGEventSource, CGEventSourceStateID, + }; + use core_graphics::geometry::CGPoint; + + let source = CGEventSource::new(CGEventSourceStateID::HIDSystemState) + .ok().context("Failed to create event source")?; + + let point = CGPoint::new(x as f64, y as f64); + + // Move mouse to position first + let move_event = CGEvent::new_mouse_event( + source.clone(), + CGEventType::MouseMoved, + point, + CGMouseButton::Left, + ).ok().context("Failed to create mouse move event")?; + move_event.post(CGEventTapLocation::HID); + + std::thread::sleep(std::time::Duration::from_millis(100)); + + // Mouse down + let mouse_down = CGEvent::new_mouse_event( + source.clone(), + CGEventType::LeftMouseDown, + point, + CGMouseButton::Left, + ).ok().context("Failed to create mouse down event")?; + mouse_down.post(CGEventTapLocation::HID); + + std::thread::sleep(std::time::Duration::from_millis(50)); + + // Mouse up + let mouse_up = CGEvent::new_mouse_event( + source, + CGEventType::LeftMouseUp, + point, + CGMouseButton::Left, + ).ok().context("Failed to create mouse up event")?; + mouse_up.post(CGEventTapLocation::HID); + + Ok(()) + } } \ No newline at end of file diff --git a/crates/g3-computer-control/src/types.rs b/crates/g3-computer-control/src/types.rs index e7ea40e..7d09042 100644 --- a/crates/g3-computer-control/src/types.rs +++ b/crates/g3-computer-control/src/types.rs @@ -7,3 +7,13 @@ pub struct Rect { pub width: i32, pub height: i32, } + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct TextLocation { + pub text: String, + pub x: i32, + pub y: i32, + pub width: i32, + pub height: i32, + pub confidence: f32, +} diff --git a/crates/g3-core/src/lib.rs b/crates/g3-core/src/lib.rs index 61bb974..14f36c3 100644 --- a/crates/g3-core/src/lib.rs +++ b/crates/g3-core/src/lib.rs @@ -1239,7 +1239,7 @@ Template: // Check if provider supports native tool calling and add tools if so let provider = self.providers.get(None)?; let tools = if provider.has_native_tool_calling() { - Some(Self::create_tool_definitions(self.config.webdriver.enabled, self.config.macax.enabled)) + Some(Self::create_tool_definitions(self.config.webdriver.enabled, self.config.macax.enabled, self.config.computer_control.enabled)) } else { None }; @@ -1700,7 +1700,7 @@ Template: } /// Create tool definitions for native tool calling providers - fn create_tool_definitions(enable_webdriver: bool, enable_macax: bool) -> Vec { + fn create_tool_definitions(enable_webdriver: bool, enable_macax: bool, enable_computer_control: bool) -> Vec { let mut tools = vec![ Tool { name: "shell".to_string(), @@ -2279,7 +2279,65 @@ Template: }), }); } - + + // Add vision-guided tools (requires computer control) + if enable_computer_control { + // Add vision-guided tools + tools.push(Tool { + name: "vision_find_text".to_string(), + description: "Find text on screen and return its location (useful for locating UI elements)".to_string(), + input_schema: json!({ + "type": "object", + "properties": { + "text": { + "type": "string", + "description": "The text to search for on screen" + } + }, + "required": ["text"] + }), + }); + + tools.push(Tool { + name: "vision_click_text".to_string(), + description: "Find text on screen and click on it (useful for clicking buttons, links, menu items)".to_string(), + input_schema: json!({ + "type": "object", + "properties": { + "text": { + "type": "string", + "description": "The text to click on (e.g., 'Submit', 'OK', 'Cancel', '+')" + } + }, + "required": ["text"] + }), + }); + + tools.push(Tool { + name: "vision_click_near_text".to_string(), + description: "Find text on screen and click near it (useful for clicking text fields next to labels)".to_string(), + input_schema: json!({ + "type": "object", + "properties": { + "text": { + "type": "string", + "description": "The label text to find (e.g., 'Name:', 'Email:', 'Task:')" + }, + "direction": { + "type": "string", + "enum": ["right", "below", "left", "above"], + "description": "Direction to click relative to the text (default: right)" + }, + "distance": { + "type": "integer", + "description": "Distance in pixels from the text (default: 50)" + } + }, + "required": ["text"] + }), + }); + } + tools } @@ -2844,7 +2902,7 @@ Template: // Ensure tools are included for native providers in subsequent iterations if provider.has_native_tool_calling() { - request.tools = Some(Self::create_tool_definitions(self.config.webdriver.enabled, self.config.macax.enabled)); + request.tools = Some(Self::create_tool_definitions(self.config.webdriver.enabled, self.config.macax.enabled, self.config.computer_control.enabled)); } // Only add to full_response if we haven't already added it @@ -4529,6 +4587,97 @@ Template: Err(e) => Ok(format!("❌ Failed to focus element: {}", e)), } } + "vision_find_text" => { + debug!("Processing vision_find_text tool call"); + + if let Some(controller) = &self.computer_controller { + let text = tool_call.args.get("text") + .and_then(|v| v.as_str()) + .ok_or_else(|| anyhow::anyhow!("Missing text parameter"))?; + + match controller.find_text_on_screen(text).await { + Ok(Some(location)) => { + Ok(format!( + "✅ Found '{}' at position ({}, {}) with size {}x{} (confidence: {:.0}%)", + location.text, location.x, location.y, location.width, location.height, + location.confidence * 100.0 + )) + } + Ok(None) => Ok(format!("❌ Could not find '{}' on screen", text)), + Err(e) => Ok(format!("❌ Error finding text: {}", e)), + } + } else { + Ok("❌ Computer control not enabled. Set computer_control.enabled = true in config.".to_string()) + } + } + "vision_click_text" => { + debug!("Processing vision_click_text tool call"); + + if let Some(controller) = &self.computer_controller { + let text = tool_call.args.get("text") + .and_then(|v| v.as_str()) + .ok_or_else(|| anyhow::anyhow!("Missing text parameter"))?; + + match controller.find_text_on_screen(text).await { + Ok(Some(location)) => { + // Click on center of text + let center_x = location.x + location.width / 2; + let center_y = location.y + location.height / 2; + + match controller.click_at(center_x, center_y) { + Ok(_) => Ok(format!("✅ Clicked on '{}' at ({}, {})", text, center_x, center_y)), + Err(e) => Ok(format!("❌ Failed to click: {}", e)), + } + } + Ok(None) => Ok(format!("❌ Could not find '{}' on screen", text)), + Err(e) => Ok(format!("❌ Error finding text: {}", e)), + } + } else { + Ok("❌ Computer control not enabled. Set computer_control.enabled = true in config.".to_string()) + } + } + "vision_click_near_text" => { + debug!("Processing vision_click_near_text tool call"); + + if let Some(controller) = &self.computer_controller { + let text = tool_call.args.get("text") + .and_then(|v| v.as_str()) + .ok_or_else(|| anyhow::anyhow!("Missing text parameter"))?; + + let direction = tool_call.args.get("direction") + .and_then(|v| v.as_str()) + .unwrap_or("right"); + + let distance = tool_call.args.get("distance") + .and_then(|v| v.as_i64()) + .unwrap_or(50) as i32; + + match controller.find_text_on_screen(text).await { + Ok(Some(location)) => { + // Calculate click position based on direction + let (click_x, click_y) = match direction { + "right" => (location.x + location.width + distance, location.y + location.height / 2), + "below" => (location.x + location.width / 2, location.y + location.height + distance), + "left" => (location.x - distance, location.y + location.height / 2), + "above" => (location.x + location.width / 2, location.y - distance), + _ => (location.x + location.width + distance, location.y + location.height / 2), + }; + + match controller.click_at(click_x, click_y) { + Ok(_) => Ok(format!( + "✅ Clicked {} of '{}' at ({}, {})", + direction, text, click_x, click_y + )), + Err(e) => Ok(format!("❌ Failed to click: {}", e)), + } + } + Ok(None) => Ok(format!("❌ Could not find '{}' on screen", text)), + Err(e) => Ok(format!("❌ Error finding text: {}", e)), + } + } else { + Ok("❌ Computer control not enabled. Set computer_control.enabled = true in config.".to_string()) + } + } _ => { warn!("Unknown tool: {}", tool_call.tool); Ok(format!("❓ Unknown tool: {}", tool_call.tool)) diff --git a/crates/g3-providers/src/databricks.rs b/crates/g3-providers/src/databricks.rs index 02c669a..50373d6 100644 --- a/crates/g3-providers/src/databricks.rs +++ b/crates/g3-providers/src/databricks.rs @@ -881,6 +881,14 @@ impl LLMProvider for DatabricksProvider { "Processing Databricks streaming request with {} messages", request.messages.len() ); + + // Debug: Log tool count + if let Some(ref tools) = request.tools { + debug!("Request has {} tools", tools.len()); + for tool in tools.iter().take(5) { + debug!(" Tool: {}", tool.name); + } + } let max_tokens = request.max_tokens.unwrap_or(self.max_tokens); let temperature = request.temperature.unwrap_or(self.temperature); From 61d748034d0ceb7dbc18b36c8de8e354b0415b58 Mon Sep 17 00:00:00 2001 From: Dhanji Prasanna Date: Fri, 24 Oct 2025 15:35:47 +1100 Subject: [PATCH 08/16] replace tesseract with apple vision --- .gitignore | 1 + Cargo.lock | 160 ++++-------------- crates/g3-cli/src/ui_writer_impl.rs | 14 +- crates/g3-computer-control/Cargo.toml | 6 +- crates/g3-computer-control/build.rs | 63 +++++++ .../examples/test_vision.rs | 85 ++++++++++ crates/g3-computer-control/src/lib.rs | 5 +- crates/g3-computer-control/src/ocr/mod.rs | 26 +++ .../g3-computer-control/src/ocr/tesseract.rs | 84 +++++++++ crates/g3-computer-control/src/ocr/vision.rs | 103 +++++++++++ .../g3-computer-control/src/platform/macos.rs | 144 ++++++---------- .../vision-bridge/Package.swift | 24 +++ .../Sources/VisionBridge/VisionBridge.h | 39 +++++ .../Sources/VisionBridge/VisionOCR.swift | 145 ++++++++++++++++ crates/g3-core/src/lib.rs | 129 +++++++++++--- docs/coach-player-providers.md | 75 -------- 16 files changed, 785 insertions(+), 318 deletions(-) create mode 100644 crates/g3-computer-control/build.rs create mode 100644 crates/g3-computer-control/examples/test_vision.rs create mode 100644 crates/g3-computer-control/src/ocr/mod.rs create mode 100644 crates/g3-computer-control/src/ocr/tesseract.rs create mode 100644 crates/g3-computer-control/src/ocr/vision.rs create mode 100644 crates/g3-computer-control/vision-bridge/Package.swift create mode 100644 crates/g3-computer-control/vision-bridge/Sources/VisionBridge/VisionBridge.h create mode 100644 crates/g3-computer-control/vision-bridge/Sources/VisionBridge/VisionOCR.swift delete mode 100644 docs/coach-player-providers.md diff --git a/.gitignore b/.gitignore index fe29988..f9f70c3 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ # will have compiled files and executables debug target +.build # These are backup files generated by rustfmt **/*.rs.bk diff --git a/Cargo.lock b/Cargo.lock index 38b9eb8..a09efd0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -136,7 +136,7 @@ checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -218,28 +218,6 @@ version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" -[[package]] -name = "bindgen" -version = "0.64.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4243e6031260db77ede97ad86c27e501d646a27ab57b59a574f725d98ab1fb4" -dependencies = [ - "bitflags 1.3.2", - "cexpr", - "clang-sys", - "lazy_static", - "lazycell", - "log", - "peeking_take_while", - "proc-macro2", - "quote", - "regex", - "rustc-hash", - "shlex", - "syn 1.0.109", - "which", -] - [[package]] name = "bindgen" version = "0.69.5" @@ -259,7 +237,7 @@ dependencies = [ "regex", "rustc-hash", "shlex", - "syn 2.0.107", + "syn", "which", ] @@ -433,7 +411,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -767,7 +745,7 @@ dependencies = [ "proc-macro2", "quote", "strict", - "syn 2.0.107", + "syn", ] [[package]] @@ -906,7 +884,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.107", + "syn", ] [[package]] @@ -917,7 +895,7 @@ checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" dependencies = [ "darling_core", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -939,7 +917,7 @@ dependencies = [ "proc-macro2", "quote", "rustc_version", - "syn 2.0.107", + "syn", ] [[package]] @@ -960,7 +938,7 @@ dependencies = [ "convert_case 0.7.1", "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -1023,7 +1001,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -1213,7 +1191,7 @@ checksum = "1a5c6c585bc94aaf2c7b51dd4c2ba22680844aba4c687be581871a6f518c5742" dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -1293,7 +1271,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -1374,7 +1352,6 @@ dependencies = [ "serde", "serde_json", "shellexpand", - "tesseract", "thiserror 1.0.69", "tokio", "tracing", @@ -1959,7 +1936,7 @@ dependencies = [ "indoc", "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -2080,7 +2057,7 @@ dependencies = [ "proc-macro2", "quote", "regex", - "syn 2.0.107", + "syn", ] [[package]] @@ -2101,28 +2078,6 @@ version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a79a3332a6609480d7d0c9eab957bca6b455b91bb84e66d19f5ff66294b85b8" -[[package]] -name = "leptonica-plumbing" -version = "1.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc7a74c43d6f090d39158d233f326f47cd8bba545217595c93662b4e31156f42" -dependencies = [ - "leptonica-sys", - "libc", - "thiserror 1.0.69", -] - -[[package]] -name = "leptonica-sys" -version = "0.4.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da627c72b2499a8106f4dd33143843015e4a631f445d561f3481f7fba35b6151" -dependencies = [ - "bindgen 0.64.0", - "pkg-config", - "vcpkg", -] - [[package]] name = "libc" version = "0.2.177" @@ -2203,7 +2158,7 @@ version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "037a1881ada3592c6a922224d5177b4b4f452e6b2979eb97393b71989e48357f" dependencies = [ - "bindgen 0.69.5", + "bindgen", "cc", "link-cplusplus", "once_cell", @@ -2478,7 +2433,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -2550,12 +2505,6 @@ version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "df94ce210e5bc13cb6651479fa48d14f601d9858cfe0467f43ae157023b938d3" -[[package]] -name = "peeking_take_while" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" - [[package]] name = "percent-encoding" version = "2.3.2" @@ -2592,7 +2541,7 @@ dependencies = [ "pest_meta", "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -2673,7 +2622,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" dependencies = [ "proc-macro2", - "syn 2.0.107", + "syn", ] [[package]] @@ -3078,7 +3027,7 @@ checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -3272,18 +3221,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.107", -] - -[[package]] -name = "syn" -version = "1.0.109" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" -dependencies = [ - "proc-macro2", - "quote", - "unicode-ident", + "syn", ] [[package]] @@ -3317,7 +3255,7 @@ checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -3370,40 +3308,6 @@ dependencies = [ "unicode-width 0.1.14", ] -[[package]] -name = "tesseract" -version = "0.14.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ee0c2c608b63817b095f7fded5c50add36a29e2be2b2fc4901357163329290a" -dependencies = [ - "tesseract-plumbing", - "tesseract-sys", - "thiserror 1.0.69", -] - -[[package]] -name = "tesseract-plumbing" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e496d3e29eba540a276975394b85dccb5fd344b3eefb743d9286c8150f766d5" -dependencies = [ - "leptonica-plumbing", - "tesseract-sys", - "thiserror 1.0.69", -] - -[[package]] -name = "tesseract-sys" -version = "0.5.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd33f6f216124cfaf0fa86c2c0cdf04da39b6257bd78c5e44fa4fa98c3a5857b" -dependencies = [ - "bindgen 0.64.0", - "leptonica-sys", - "pkg-config", - "vcpkg", -] - [[package]] name = "thiserror" version = "1.0.69" @@ -3430,7 +3334,7 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -3441,7 +3345,7 @@ checksum = "3ff15c8ecd7de3849db632e14d18d2571fa09dfc5ed93479bc4485c7a517c913" dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -3539,7 +3443,7 @@ checksum = "af407857209536a95c8e56f8231ef2c2e2aff839b22e07a1ffcbc617e9db9fa5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -3665,7 +3569,7 @@ checksum = "81383ab64e72a7a8b8e13130c49e3dab29def6d0c7d76a03087b3cf71c5c6903" dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -3870,7 +3774,7 @@ dependencies = [ "log", "proc-macro2", "quote", - "syn 2.0.107", + "syn", "wasm-bindgen-shared", ] @@ -3905,7 +3809,7 @@ checksum = "9f07d2f20d4da7b26400c9f4a0511e6e0345b040694e8a75bd41d578fa4421d7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -4077,7 +3981,7 @@ checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -4088,7 +3992,7 @@ checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -4484,7 +4388,7 @@ checksum = "38da3c9736e16c5d3c8c597a9aaa5d1fa565d0532ae05e27c24aa62fb32c0ab6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", "synstructure", ] @@ -4505,7 +4409,7 @@ checksum = "88d2b8d9c68ad2b9e4340d7832716a4d21a22a1154777ad56ea55c51a9cf3831" dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] @@ -4525,7 +4429,7 @@ checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", "synstructure", ] @@ -4559,7 +4463,7 @@ checksum = "5b96237efa0c878c64bd89c436f661be4e46b2f3eff1ebb976f7ef2321d2f58f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.107", + "syn", ] [[package]] diff --git a/crates/g3-cli/src/ui_writer_impl.rs b/crates/g3-cli/src/ui_writer_impl.rs index 407e0d1..ec1a203 100644 --- a/crates/g3-cli/src/ui_writer_impl.rs +++ b/crates/g3-cli/src/ui_writer_impl.rs @@ -193,7 +193,12 @@ impl UiWriter for ConsoleUiWriter { // Truncate long values for display let display_value = if first_line.len() > 80 { - format!("{}...", &first_line[..77]) + // Use char_indices to safely truncate at character boundary + let truncate_at = first_line.char_indices() + .nth(77) + .map(|(i, _)| i) + .unwrap_or(first_line.len()); + format!("{}...", &first_line[..truncate_at]) } else { first_line.to_string() }; @@ -440,7 +445,12 @@ impl UiWriter for RetroTuiWriter { if caption.is_empty() && (key == "file_path" || key == "command" || key == "path") { // Truncate long values for the caption let truncated = if value.len() > 50 { - format!("{}...", &value[..47]) + // Use char_indices to safely truncate at character boundary + let truncate_at = value.char_indices() + .nth(47) + .map(|(i, _)| i) + .unwrap_or(value.len()); + format!("{}...", &value[..truncate_at]) } else { value.to_string() }; diff --git a/crates/g3-computer-control/Cargo.toml b/crates/g3-computer-control/Cargo.toml index 4300dc1..b9ed189 100644 --- a/crates/g3-computer-control/Cargo.toml +++ b/crates/g3-computer-control/Cargo.toml @@ -3,6 +3,9 @@ name = "g3-computer-control" version = "0.1.0" edition = "2021" +[build-dependencies] +# Only needed for building Swift bridge on macOS + [dependencies] # Workspace dependencies tokio = { workspace = true } @@ -20,9 +23,6 @@ async-trait = "0.1" # WebDriver support fantoccini = "0.21" -# OCR dependencies -tesseract = "0.14" - # macOS dependencies [target.'cfg(target_os = "macos")'.dependencies] core-graphics = "0.23" diff --git a/crates/g3-computer-control/build.rs b/crates/g3-computer-control/build.rs new file mode 100644 index 0000000..fed302c --- /dev/null +++ b/crates/g3-computer-control/build.rs @@ -0,0 +1,63 @@ +use std::env; +use std::path::PathBuf; +use std::process::Command; + +fn main() { + // Only build Vision bridge on macOS + if env::var("CARGO_CFG_TARGET_OS").unwrap() != "macos" { + return; + } + + println!("cargo:rerun-if-changed=vision-bridge/Sources/VisionBridge/VisionOCR.swift"); + println!("cargo:rerun-if-changed=vision-bridge/Sources/VisionBridge/VisionBridge.h"); + println!("cargo:rerun-if-changed=vision-bridge/Package.swift"); + + let manifest_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap()); + let vision_bridge_dir = manifest_dir.join("vision-bridge"); + + // Build Swift package + println!("cargo:warning=Building VisionBridge Swift package..."); + let build_status = Command::new("swift") + .args(&["build", "-c", "release"]) + .current_dir(&vision_bridge_dir) + .status() + .expect("Failed to build Swift package"); + + if !build_status.success() { + panic!("Swift build failed"); + } + + // Find the built library + let lib_path = vision_bridge_dir + .join(".build/release") + .canonicalize() + .expect("Failed to find .build/release directory"); + + // Copy the dylib to the output directory so it can be found at runtime + let target_dir = manifest_dir.parent().unwrap().parent().unwrap().join("target"); + let profile = env::var("PROFILE").unwrap_or_else(|_| "debug".to_string()); + let output_dir = target_dir.join(&profile); + + let dylib_src = lib_path.join("libVisionBridge.dylib"); + let dylib_dst = output_dir.join("libVisionBridge.dylib"); + + std::fs::copy(&dylib_src, &dylib_dst) + .expect(&format!("Failed to copy dylib from {} to {}", dylib_src.display(), dylib_dst.display())); + + println!("cargo:warning=Copied libVisionBridge.dylib to {}", dylib_dst.display()); + + // Add rpath so the dylib can be found at runtime + println!("cargo:rustc-link-arg=-Wl,-rpath,@executable_path"); + println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path"); + println!("cargo:rustc-link-search=native={}", lib_path.display()); + println!("cargo:rustc-link-lib=dylib=VisionBridge"); + + // Link required frameworks + println!("cargo:rustc-link-lib=framework=Vision"); + println!("cargo:rustc-link-lib=framework=AppKit"); + println!("cargo:rustc-link-lib=framework=Foundation"); + println!("cargo:rustc-link-lib=framework=CoreGraphics"); + println!("cargo:rustc-link-lib=framework=CoreImage"); + + println!("cargo:warning=VisionBridge built successfully at {}", lib_path.display()); +} diff --git a/crates/g3-computer-control/examples/test_vision.rs b/crates/g3-computer-control/examples/test_vision.rs new file mode 100644 index 0000000..5ff09a5 --- /dev/null +++ b/crates/g3-computer-control/examples/test_vision.rs @@ -0,0 +1,85 @@ +use g3_computer_control::ocr::{OCREngine, DefaultOCR}; +use anyhow::Result; + +#[tokio::main] +async fn main() -> Result<()> { + println!("🧪 Testing Apple Vision OCR"); + println!("===========================\n"); + + // Initialize OCR engine + println!("📦 Initializing OCR engine..."); + let ocr = DefaultOCR::new()?; + println!("✅ OCR engine: {}\n", ocr.name()); + + // Check if test image exists + let test_image = "/tmp/safari_test.png"; + if !std::path::Path::new(test_image).exists() { + println!("⚠️ Test image not found: {}", test_image); + println!(" Creating a screenshot..."); + + let status = std::process::Command::new("screencapture") + .arg("-x") + .arg("-R") + .arg("0,0,1200,800") + .arg(test_image) + .status()?; + + if !status.success() { + anyhow::bail!("Failed to create screenshot"); + } + + println!("✅ Screenshot created\n"); + } + + // Run OCR + println!("🔍 Running Apple Vision OCR on {}...", test_image); + let start = std::time::Instant::now(); + let locations = ocr.extract_text_with_locations(test_image).await?; + let duration = start.elapsed(); + + println!("✅ OCR completed in {:.3}s\n", duration.as_secs_f64()); + + // Display results + println!("📊 Results:"); + println!(" Found {} text elements\n", locations.len()); + + if locations.is_empty() { + println!("⚠️ No text found in image"); + } else { + println!(" Top 20 results:"); + println!(" {:<4} {:<40} {:<15} {:<12} {:<8}", "#", "Text", "Position", "Size", "Conf"); + println!(" {}", "-".repeat(85)); + + for (i, loc) in locations.iter().take(20).enumerate() { + let text = if loc.text.len() > 37 { + format!("{}...", &loc.text[..37]) + } else { + loc.text.clone() + }; + + println!(" {:<4} {:<40} ({:>4},{:>4}) {:>4}x{:<4} {:.2}", + i + 1, + text, + loc.x, + loc.y, + loc.width, + loc.height, + loc.confidence + ); + } + + if locations.len() > 20 { + println!("\n ... and {} more", locations.len() - 20); + } + + // Performance comparison + println!("\n📈 Performance:"); + println!(" OCR Speed: {:.3}s", duration.as_secs_f64()); + println!(" Text elements: {}", locations.len()); + println!(" Avg per element: {:.1}ms", duration.as_millis() as f64 / locations.len() as f64); + } + + println!("\n✅ Test complete!"); + + Ok(()) +} diff --git a/crates/g3-computer-control/src/lib.rs b/crates/g3-computer-control/src/lib.rs index ad564b5..355a591 100644 --- a/crates/g3-computer-control/src/lib.rs +++ b/crates/g3-computer-control/src/lib.rs @@ -3,6 +3,7 @@ pub mod types; pub mod platform; +pub mod ocr; pub mod webdriver; pub mod macax; @@ -25,11 +26,11 @@ pub trait ComputerController: Send + Sync { async fn extract_text_from_screen(&self, region: Rect) -> Result; async fn extract_text_from_image(&self, path: &str) -> Result; async fn extract_text_with_locations(&self, path: &str) -> Result>; - async fn find_text_on_screen(&self, search_text: &str) -> Result>; + async fn find_text_in_app(&self, app_name: &str, search_text: &str) -> Result>; // Mouse operations fn move_mouse(&self, x: i32, y: i32) -> Result<()>; - fn click_at(&self, x: i32, y: i32) -> Result<()>; + fn click_at(&self, x: i32, y: i32, app_name: Option<&str>) -> Result<()>; } // Platform-specific constructor diff --git a/crates/g3-computer-control/src/ocr/mod.rs b/crates/g3-computer-control/src/ocr/mod.rs new file mode 100644 index 0000000..b651da3 --- /dev/null +++ b/crates/g3-computer-control/src/ocr/mod.rs @@ -0,0 +1,26 @@ +use crate::types::TextLocation; +use anyhow::Result; +use async_trait::async_trait; + +/// OCR engine trait for text recognition with bounding boxes +#[async_trait] +pub trait OCREngine: Send + Sync { + /// Extract text with locations from an image file + async fn extract_text_with_locations(&self, path: &str) -> Result>; + + /// Get the name of the OCR engine + fn name(&self) -> &str; +} + +// Platform-specific modules +#[cfg(target_os = "macos")] +pub mod vision; + +pub mod tesseract; + +// Re-export the default OCR engine for the platform +#[cfg(target_os = "macos")] +pub use vision::AppleVisionOCR as DefaultOCR; + +#[cfg(not(target_os = "macos"))] +pub use tesseract::TesseractOCR as DefaultOCR; diff --git a/crates/g3-computer-control/src/ocr/tesseract.rs b/crates/g3-computer-control/src/ocr/tesseract.rs new file mode 100644 index 0000000..d55fc3f --- /dev/null +++ b/crates/g3-computer-control/src/ocr/tesseract.rs @@ -0,0 +1,84 @@ +use super::OCREngine; +use crate::types::TextLocation; +use anyhow::Result; +use async_trait::async_trait; + +/// Tesseract OCR engine (fallback/cross-platform) +pub struct TesseractOCR; + +impl TesseractOCR { + pub fn new() -> Result { + // Check if tesseract is available + let tesseract_check = std::process::Command::new("which") + .arg("tesseract") + .output(); + + if tesseract_check.is_err() || !tesseract_check.as_ref().unwrap().status.success() { + anyhow::bail!("Tesseract OCR is not installed on your system.\n\n\ + To install tesseract:\n macOS: brew install tesseract\n \ + Linux: sudo apt-get install tesseract-ocr (Ubuntu/Debian)\n \ + sudo yum install tesseract (RHEL/CentOS)\n \ + Windows: Download from https://github.com/UB-Mannheim/tesseract/wiki\n\n\ + After installation, restart your terminal and try again."); + } + + Ok(Self) + } +} + +#[async_trait] +impl OCREngine for TesseractOCR { + async fn extract_text_with_locations(&self, path: &str) -> Result> { + // Use tesseract CLI with TSV output to get bounding boxes + let output = std::process::Command::new("tesseract") + .arg(path) + .arg("stdout") + .arg("tsv") + .output() + .map_err(|e| anyhow::anyhow!("Failed to run tesseract: {}", e))?; + + if !output.status.success() { + anyhow::bail!("Tesseract failed: {}", String::from_utf8_lossy(&output.stderr)); + } + + let tsv_text = String::from_utf8_lossy(&output.stdout); + let mut locations = Vec::new(); + + // Parse TSV output (skip header line) + for (i, line) in tsv_text.lines().enumerate() { + if i == 0 { continue; } // Skip header + + let parts: Vec<&str> = line.split('\t').collect(); + if parts.len() >= 12 { + // TSV format: level, page_num, block_num, par_num, line_num, word_num, + // left, top, width, height, conf, text + if let (Ok(x), Ok(y), Ok(w), Ok(h), Ok(conf), text) = ( + parts[6].parse::(), + parts[7].parse::(), + parts[8].parse::(), + parts[9].parse::(), + parts[10].parse::(), + parts[11], + ) { + let trimmed = text.trim(); + if !trimmed.is_empty() && conf > 0.0 { + locations.push(TextLocation { + text: trimmed.to_string(), + x, + y, + width: w, + height: h, + confidence: conf / 100.0, // Convert from 0-100 to 0-1 + }); + } + } + } + } + + Ok(locations) + } + + fn name(&self) -> &str { + "Tesseract OCR" + } +} diff --git a/crates/g3-computer-control/src/ocr/vision.rs b/crates/g3-computer-control/src/ocr/vision.rs new file mode 100644 index 0000000..d35491d --- /dev/null +++ b/crates/g3-computer-control/src/ocr/vision.rs @@ -0,0 +1,103 @@ +use super::OCREngine; +use crate::types::TextLocation; +use anyhow::{Result, Context}; +use async_trait::async_trait; +use std::ffi::{CStr, CString}; +use std::os::raw::{c_char, c_float, c_uint}; + +// FFI bindings to Swift VisionBridge +#[repr(C)] +struct VisionTextBox { + text: *const c_char, + text_len: c_uint, + x: i32, + y: i32, + width: i32, + height: i32, + confidence: c_float, +} + +extern "C" { + fn vision_recognize_text( + image_path: *const c_char, + image_path_len: c_uint, + out_boxes: *mut *mut std::ffi::c_void, + out_count: *mut c_uint, + ) -> bool; + + fn vision_free_boxes(boxes: *mut std::ffi::c_void, count: c_uint); +} + +/// Apple Vision Framework OCR engine +pub struct AppleVisionOCR; + +impl AppleVisionOCR { + pub fn new() -> Result { + Ok(Self) + } +} + +#[async_trait] +impl OCREngine for AppleVisionOCR { + async fn extract_text_with_locations(&self, path: &str) -> Result> { + // Convert path to C string + let c_path = CString::new(path) + .context("Failed to convert path to C string")?; + + let mut boxes_ptr: *mut std::ffi::c_void = std::ptr::null_mut(); + let mut count: c_uint = 0; + + // Call Swift Vision API + let success = unsafe { + vision_recognize_text( + c_path.as_ptr(), + path.len() as c_uint, + &mut boxes_ptr, + &mut count, + ) + }; + + if !success || boxes_ptr.is_null() { + anyhow::bail!("Apple Vision OCR failed"); + } + + // Convert C array to Rust Vec + let mut locations = Vec::new(); + + unsafe { + let typed_boxes = boxes_ptr as *const VisionTextBox; + let boxes_slice = std::slice::from_raw_parts(typed_boxes, count as usize); + + for box_data in boxes_slice { + // Convert C string to Rust String + let text = if !box_data.text.is_null() { + CStr::from_ptr(box_data.text) + .to_string_lossy() + .into_owned() + } else { + String::new() + }; + + if !text.is_empty() { + locations.push(TextLocation { + text, + x: box_data.x, + y: box_data.y, + width: box_data.width, + height: box_data.height, + confidence: box_data.confidence, + }); + } + } + + // Free the C array + vision_free_boxes(boxes_ptr, count); + } + + Ok(locations) + } + + fn name(&self) -> &str { + "Apple Vision Framework" + } +} diff --git a/crates/g3-computer-control/src/platform/macos.rs b/crates/g3-computer-control/src/platform/macos.rs index d2e6a0a..da1aa95 100644 --- a/crates/g3-computer-control/src/platform/macos.rs +++ b/crates/g3-computer-control/src/platform/macos.rs @@ -1,16 +1,21 @@ use crate::{ComputerController, types::{Rect, TextLocation}}; +use crate::ocr::{OCREngine, DefaultOCR}; use anyhow::{Result, Context}; use async_trait::async_trait; use std::path::Path; -use tesseract::Tesseract; pub struct MacOSController { - // Empty struct for now + ocr_engine: Box, + #[allow(dead_code)] + ocr_name: String, } impl MacOSController { pub fn new() -> Result { - Ok(Self {}) + let ocr = Box::new(DefaultOCR::new()?); + let ocr_name = ocr.name().to_string(); + tracing::info!("Initialized macOS controller with OCR engine: {}", ocr_name); + Ok(Self { ocr_engine: ocr, ocr_name }) } } @@ -90,95 +95,21 @@ impl ComputerController for MacOSController { } async fn extract_text_from_image(&self, path: &str) -> Result { - // Check if tesseract is available on the system - let tesseract_check = std::process::Command::new("which") - .arg("tesseract") - .output(); - - if tesseract_check.is_err() || !tesseract_check.as_ref().unwrap().status.success() { - anyhow::bail!("Tesseract OCR is not installed on your system.\n\n\ - To install tesseract:\n macOS: brew install tesseract\n \ - Linux: sudo apt-get install tesseract-ocr (Ubuntu/Debian)\n \ - sudo yum install tesseract (RHEL/CentOS)\n \ - Windows: Download from https://github.com/UB-Mannheim/tesseract/wiki\n\n\ - After installation, restart your terminal and try again."); - } - - // Initialize Tesseract - let tess = Tesseract::new(None, Some("eng")) - .map_err(|e| { - anyhow::anyhow!("Failed to initialize Tesseract: {}\n\n\ - This usually means:\n1. Tesseract is not properly installed\n\ - 2. Language data files are missing\n\nTo fix:\n \ - macOS: brew reinstall tesseract\n \ - Linux: sudo apt-get install tesseract-ocr-eng\n \ - Windows: Reinstall tesseract and ensure language files are included", e) - })?; - - let text = tess.set_image(path) - .map_err(|e| anyhow::anyhow!("Failed to load image '{}': {}", path, e))? - .get_text() - .map_err(|e| anyhow::anyhow!("Failed to extract text from image: {}", e))?; - - Ok(text) + // Extract all text and concatenate + let locations = self.ocr_engine.extract_text_with_locations(path).await?; + Ok(locations.iter().map(|loc| loc.text.as_str()).collect::>().join(" ")) } async fn extract_text_with_locations(&self, path: &str) -> Result> { - // For now, use tesseract CLI with TSV output to get bounding boxes - // This is a workaround since the Rust tesseract crate doesn't expose get_component_boxes - let output = std::process::Command::new("tesseract") - .arg(path) - .arg("stdout") - .arg("tsv") - .output() - .map_err(|e| anyhow::anyhow!("Failed to run tesseract: {}", e))?; - - if !output.status.success() { - anyhow::bail!("Tesseract failed: {}", String::from_utf8_lossy(&output.stderr)); - } - - let tsv_text = String::from_utf8_lossy(&output.stdout); - let mut locations = Vec::new(); - - // Parse TSV output (skip header line) - for (i, line) in tsv_text.lines().enumerate() { - if i == 0 { continue; } // Skip header - - let parts: Vec<&str> = line.split('\t').collect(); - if parts.len() >= 12 { - // TSV format: level, page_num, block_num, par_num, line_num, word_num, - // left, top, width, height, conf, text - if let (Ok(x), Ok(y), Ok(w), Ok(h), Ok(conf), text) = ( - parts[6].parse::(), - parts[7].parse::(), - parts[8].parse::(), - parts[9].parse::(), - parts[10].parse::(), - parts[11], - ) { - let trimmed = text.trim(); - if !trimmed.is_empty() && conf > 0.0 { - locations.push(TextLocation { - text: trimmed.to_string(), - x, - y, - width: w, - height: h, - confidence: conf / 100.0, // Convert from 0-100 to 0-1 - }); - } - } - } - } - - Ok(locations) + // Use the OCR engine + self.ocr_engine.extract_text_with_locations(path).await } - async fn find_text_on_screen(&self, search_text: &str) -> Result> { - // Take full screenshot + async fn find_text_in_app(&self, app_name: &str, search_text: &str) -> Result> { + // Take screenshot of specific app window let home = std::env::var("HOME").unwrap_or_else(|_| "/tmp".to_string()); - let temp_path = format!("{}/Desktop/g3_find_text_{}.png", home, uuid::Uuid::new_v4()); - self.take_screenshot(&temp_path, None, None).await?; + let temp_path = format!("{}/Desktop/g3_find_text_{}_{}.png", home, app_name, uuid::Uuid::new_v4()); + self.take_screenshot(&temp_path, None, Some(app_name)).await?; // Extract all text with locations let locations = self.extract_text_with_locations(&temp_path).await?; @@ -221,7 +152,44 @@ impl ComputerController for MacOSController { Ok(()) } - fn click_at(&self, x: i32, y: i32) -> Result<()> { + fn click_at(&self, x: i32, y: i32, app_name: Option<&str>) -> Result<()> { + // If app_name is provided, get window position and offset coordinates + let (global_x, global_y) = if let Some(app) = app_name { + // Get window position using AppleScript + let script = format!( + r#"tell application "{}" to get bounds of window 1"#, + app + ); + + let output = std::process::Command::new("osascript") + .arg("-e") + .arg(&script) + .output()?; + + if output.status.success() { + let bounds_str = String::from_utf8_lossy(&output.stdout); + // Parse bounds: "x1, y1, x2, y2" + let parts: Vec<&str> = bounds_str.trim().split(", ").collect(); + if parts.len() >= 2 { + if let (Ok(window_x), Ok(window_y)) = ( + parts[0].trim().parse::(), + parts[1].trim().parse::(), + ) { + // Offset relative coordinates by window position + (x + window_x, y + window_y) + } else { + (x, y) // Fallback to absolute coordinates + } + } else { + (x, y) // Fallback to absolute coordinates + } + } else { + (x, y) // Fallback to absolute coordinates + } + } else { + (x, y) // No app name, use absolute coordinates + }; + use core_graphics::event::{ CGEvent, CGEventTapLocation, CGEventType, CGMouseButton, }; @@ -233,7 +201,7 @@ impl ComputerController for MacOSController { let source = CGEventSource::new(CGEventSourceStateID::HIDSystemState) .ok().context("Failed to create event source")?; - let point = CGPoint::new(x as f64, y as f64); + let point = CGPoint::new(global_x as f64, global_y as f64); // Move mouse to position first let move_event = CGEvent::new_mouse_event( diff --git a/crates/g3-computer-control/vision-bridge/Package.swift b/crates/g3-computer-control/vision-bridge/Package.swift new file mode 100644 index 0000000..76d0503 --- /dev/null +++ b/crates/g3-computer-control/vision-bridge/Package.swift @@ -0,0 +1,24 @@ +// swift-tools-version:5.9 +import PackageDescription + +let package = Package( + name: "VisionBridge", + platforms: [ + .macOS(.v11) + ], + products: [ + .library( + name: "VisionBridge", + type: .dynamic, + targets: ["VisionBridge"] + ), + ], + targets: [ + .target( + name: "VisionBridge", + dependencies: [], + path: "Sources/VisionBridge", + publicHeadersPath: "." + ), + ] +) diff --git a/crates/g3-computer-control/vision-bridge/Sources/VisionBridge/VisionBridge.h b/crates/g3-computer-control/vision-bridge/Sources/VisionBridge/VisionBridge.h new file mode 100644 index 0000000..a83d1dc --- /dev/null +++ b/crates/g3-computer-control/vision-bridge/Sources/VisionBridge/VisionBridge.h @@ -0,0 +1,39 @@ +#ifndef VisionBridge_h +#define VisionBridge_h + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// Text box structure for FFI +typedef struct { + const char* text; + uint32_t text_len; + int32_t x; + int32_t y; + int32_t width; + int32_t height; + float confidence; +} VisionTextBox; + +// Recognize text in an image and return bounding boxes +// Returns true on success, false on failure +// Caller must free the returned boxes using vision_free_boxes +bool vision_recognize_text( + const char* image_path, + uint32_t image_path_len, + VisionTextBox** out_boxes, + uint32_t* out_count +); + +// Free memory allocated by vision_recognize_text +void vision_free_boxes(VisionTextBox* boxes, uint32_t count); + +#ifdef __cplusplus +} +#endif + +#endif /* VisionBridge_h */ diff --git a/crates/g3-computer-control/vision-bridge/Sources/VisionBridge/VisionOCR.swift b/crates/g3-computer-control/vision-bridge/Sources/VisionBridge/VisionOCR.swift new file mode 100644 index 0000000..5ff12d0 --- /dev/null +++ b/crates/g3-computer-control/vision-bridge/Sources/VisionBridge/VisionOCR.swift @@ -0,0 +1,145 @@ +import Foundation +import Vision +import AppKit +import CoreGraphics + +// MARK: - C Bridge Functions + +@_cdecl("vision_recognize_text") +public func vision_recognize_text( + _ imagePath: UnsafePointer, + _ imagePathLen: UInt32, + _ outBoxes: UnsafeMutablePointer, + _ outCount: UnsafeMutablePointer +) -> Bool { + // Convert C string to Swift String + guard let pathData = Data(bytes: imagePath, count: Int(imagePathLen)).withUnsafeBytes({ + String(bytes: $0, encoding: .utf8) + }) else { + return false + } + + let path = pathData.trimmingCharacters(in: .whitespaces) + + // Load image + guard let image = NSImage(contentsOfFile: path), + let cgImage = image.cgImage(forProposedRect: nil, context: nil, hints: nil) else { + return false + } + + // Perform OCR + var textBoxes: [CTextBox] = [] + let semaphore = DispatchSemaphore(value: 0) + var success = false + + let request = VNRecognizeTextRequest { request, error in + defer { semaphore.signal() } + + if let error = error { + print("Vision OCR error: \(error.localizedDescription)") + return + } + + guard let observations = request.results as? [VNRecognizedTextObservation] else { + return + } + + let imageSize = CGSize(width: cgImage.width, height: cgImage.height) + + for observation in observations { + guard let candidate = observation.topCandidates(1).first else { continue } + + let text = candidate.string + let boundingBox = observation.boundingBox + + // Convert normalized coordinates (bottom-left origin) to pixel coordinates (top-left origin) + let x = Int32(boundingBox.origin.x * imageSize.width) + let y = Int32((1.0 - boundingBox.origin.y - boundingBox.height) * imageSize.height) + let width = Int32(boundingBox.width * imageSize.width) + let height = Int32(boundingBox.height * imageSize.height) + + // Allocate C string for text + let cString = strdup(text) + + textBoxes.append(CTextBox( + text: cString, + text_len: UInt32(text.utf8.count), + x: x, + y: y, + width: width, + height: height, + confidence: observation.confidence + )) + } + + success = true + } + + // Configure request for best accuracy + request.recognitionLevel = .accurate + request.usesLanguageCorrection = true + request.recognitionLanguages = ["en-US"] + + // Perform request + let handler = VNImageRequestHandler(cgImage: cgImage, options: [:]) + do { + try handler.perform([request]) + } catch { + print("Vision request failed: \(error.localizedDescription)") + return false + } + + // Wait for completion + semaphore.wait() + + if !success { + return false + } + + // Allocate array for results + let boxesPtr = UnsafeMutablePointer.allocate(capacity: textBoxes.count) + for (index, box) in textBoxes.enumerated() { + boxesPtr[index] = box + } + + outBoxes.pointee = UnsafeMutableRawPointer(boxesPtr) + outCount.pointee = UInt32(textBoxes.count) + + return true +} + +@_cdecl("vision_free_boxes") +public func vision_free_boxes( + _ boxes: UnsafeMutableRawPointer, + _ count: UInt32 +) { + let typedBoxes = boxes.assumingMemoryBound(to: CTextBox.self) + for i in 0..? + public let text_len: UInt32 + public let x: Int32 + public let y: Int32 + public let width: Int32 + public let height: Int32 + public let confidence: Float + + public init(text: UnsafePointer?, text_len: UInt32, x: Int32, y: Int32, width: Int32, height: Int32, confidence: Float) { + self.text = text + self.text_len = text_len + self.x = x + self.y = y + self.width = width + self.height = height + self.confidence = confidence + } +} diff --git a/crates/g3-core/src/lib.rs b/crates/g3-core/src/lib.rs index 14f36c3..dd3e52c 100644 --- a/crates/g3-core/src/lib.rs +++ b/crates/g3-core/src/lib.rs @@ -1825,7 +1825,7 @@ Template: }, Tool { name: "extract_text".to_string(), - description: "Extract text from a screen region or image file using OCR".to_string(), + description: "Extract text from a screen region or image file using OCR. Returns plain text only (no bounding boxes). For text with location/coordinates, use vision_find_text instead.".to_string(), input_schema: json!({ "type": "object", "properties": { @@ -2280,45 +2280,79 @@ Template: }); } + // Add extract_text_with_boxes tool (requires macax flag) + if enable_macax { + tools.push(Tool { + name: "extract_text_with_boxes".to_string(), + description: "Extract all text from an image file with bounding box coordinates for each text element. Returns JSON array with text, position (x, y), size (width, height), and confidence for each detected text. Uses Apple Vision Framework for precise sub-pixel accuracy.".to_string(), + input_schema: json!({ + "type": "object", + "properties": { + "path": { + "type": "string", + "description": "Path to image file to extract text from" + }, + "app_name": { + "type": "string", + "description": "Optional: Name of application to screenshot first (e.g., 'Safari', 'Things3'). If provided, takes screenshot of app before extracting text." + } + }, + "required": ["path"] + }), + }); + } + // Add vision-guided tools (requires computer control) if enable_computer_control { // Add vision-guided tools tools.push(Tool { name: "vision_find_text".to_string(), - description: "Find text on screen and return its location (useful for locating UI elements)".to_string(), + description: "Find text in a specific application window and return its location with bounding box coordinates (x, y, width, height) and confidence score. Useful for locating UI elements. Uses Apple Vision Framework for precise sub-pixel accuracy.".to_string(), input_schema: json!({ "type": "object", "properties": { + "app_name": { + "type": "string", + "description": "Name of the application to search in (e.g., 'Things3', 'Safari', 'TextEdit')" + }, "text": { "type": "string", "description": "The text to search for on screen" } }, - "required": ["text"] + "required": ["app_name", "text"] }), }); tools.push(Tool { name: "vision_click_text".to_string(), - description: "Find text on screen and click on it (useful for clicking buttons, links, menu items)".to_string(), + description: "Find text in a specific application window and click on it (useful for clicking buttons, links, menu items)".to_string(), input_schema: json!({ "type": "object", "properties": { + "app_name": { + "type": "string", + "description": "Name of the application (e.g., 'Things3', 'Safari', 'TextEdit')" + }, "text": { "type": "string", "description": "The text to click on (e.g., 'Submit', 'OK', 'Cancel', '+')" } }, - "required": ["text"] + "required": ["app_name", "text"] }), }); tools.push(Tool { name: "vision_click_near_text".to_string(), - description: "Find text on screen and click near it (useful for clicking text fields next to labels)".to_string(), + description: "Find text in a specific application window and click near it (useful for clicking text fields next to labels)".to_string(), input_schema: json!({ "type": "object", "properties": { + "app_name": { + "type": "string", + "description": "Name of the application (e.g., 'Things3', 'Safari', 'TextEdit')" + }, "text": { "type": "string", "description": "The label text to find (e.g., 'Name:', 'Email:', 'Task:')" @@ -2333,7 +2367,7 @@ Template: "description": "Distance in pixels from the text (default: 50)" } }, - "required": ["text"] + "required": ["app_name", "text"] }), }); } @@ -4591,19 +4625,23 @@ Template: debug!("Processing vision_find_text tool call"); if let Some(controller) = &self.computer_controller { + let app_name = tool_call.args.get("app_name") + .and_then(|v| v.as_str()) + .ok_or_else(|| anyhow::anyhow!("Missing app_name parameter"))?; + let text = tool_call.args.get("text") .and_then(|v| v.as_str()) .ok_or_else(|| anyhow::anyhow!("Missing text parameter"))?; - match controller.find_text_on_screen(text).await { + match controller.find_text_in_app(app_name, text).await { Ok(Some(location)) => { Ok(format!( - "✅ Found '{}' at position ({}, {}) with size {}x{} (confidence: {:.0}%)", - location.text, location.x, location.y, location.width, location.height, + "✅ Found '{}' in {} at position ({}, {}) with size {}x{} (confidence: {:.0}%)", + location.text, app_name, location.x, location.y, location.width, location.height, location.confidence * 100.0 )) } - Ok(None) => Ok(format!("❌ Could not find '{}' on screen", text)), + Ok(None) => Ok(format!("❌ Could not find '{}' in {}", text, app_name)), Err(e) => Ok(format!("❌ Error finding text: {}", e)), } } else { @@ -4614,32 +4652,83 @@ Template: debug!("Processing vision_click_text tool call"); if let Some(controller) = &self.computer_controller { + let app_name = tool_call.args.get("app_name") + .and_then(|v| v.as_str()) + .ok_or_else(|| anyhow::anyhow!("Missing app_name parameter"))?; + let text = tool_call.args.get("text") .and_then(|v| v.as_str()) .ok_or_else(|| anyhow::anyhow!("Missing text parameter"))?; - match controller.find_text_on_screen(text).await { + match controller.find_text_in_app(app_name, text).await { Ok(Some(location)) => { // Click on center of text let center_x = location.x + location.width / 2; let center_y = location.y + location.height / 2; - match controller.click_at(center_x, center_y) { - Ok(_) => Ok(format!("✅ Clicked on '{}' at ({}, {})", text, center_x, center_y)), + match controller.click_at(center_x, center_y, Some(app_name)) { + Ok(_) => Ok(format!("✅ Clicked on '{}' in {} at ({}, {})", text, app_name, center_x, center_y)), Err(e) => Ok(format!("❌ Failed to click: {}", e)), } } - Ok(None) => Ok(format!("❌ Could not find '{}' on screen", text)), + Ok(None) => Ok(format!("❌ Could not find '{}' in {}", text, app_name)), Err(e) => Ok(format!("❌ Error finding text: {}", e)), } } else { Ok("❌ Computer control not enabled. Set computer_control.enabled = true in config.".to_string()) } } + "extract_text_with_boxes" => { + debug!("Processing extract_text_with_boxes tool call"); + + if !self.config.macax.enabled { + return Ok("❌ extract_text_with_boxes requires --macax flag to be enabled".to_string()); + } + + if let Some(controller) = &self.computer_controller { + let path = tool_call.args.get("path") + .and_then(|v| v.as_str()) + .ok_or_else(|| anyhow::anyhow!("Missing path parameter"))?; + + // Optional: take screenshot of app first + let final_path = if let Some(app_name) = tool_call.args.get("app_name").and_then(|v| v.as_str()) { + let temp_path = format!("/tmp/g3_extract_boxes_{}.png", uuid::Uuid::new_v4()); + match controller.take_screenshot(&temp_path, None, Some(app_name)).await { + Ok(_) => temp_path, + Err(e) => return Ok(format!("❌ Failed to take screenshot: {}", e)), + } + } else { + path.to_string() + }; + + // Extract text with locations + match controller.extract_text_with_locations(&final_path).await { + Ok(locations) => { + // Clean up temp file if we created one + if final_path != path { + let _ = std::fs::remove_file(&final_path); + } + + // Return as JSON + match serde_json::to_string_pretty(&locations) { + Ok(json) => Ok(format!("✅ Extracted {} text elements:\n{}", locations.len(), json)), + Err(e) => Ok(format!("❌ Failed to serialize results: {}", e)), + } + } + Err(e) => Ok(format!("❌ Failed to extract text: {}", e)), + } + } else { + Ok("❌ Computer control not enabled. Set computer_control.enabled = true in config.".to_string()) + } + } "vision_click_near_text" => { debug!("Processing vision_click_near_text tool call"); if let Some(controller) = &self.computer_controller { + let app_name = tool_call.args.get("app_name") + .and_then(|v| v.as_str()) + .ok_or_else(|| anyhow::anyhow!("Missing app_name parameter"))?; + let text = tool_call.args.get("text") .and_then(|v| v.as_str()) .ok_or_else(|| anyhow::anyhow!("Missing text parameter"))?; @@ -4652,7 +4741,7 @@ Template: .and_then(|v| v.as_i64()) .unwrap_or(50) as i32; - match controller.find_text_on_screen(text).await { + match controller.find_text_in_app(app_name, text).await { Ok(Some(location)) => { // Calculate click position based on direction let (click_x, click_y) = match direction { @@ -4663,15 +4752,15 @@ Template: _ => (location.x + location.width + distance, location.y + location.height / 2), }; - match controller.click_at(click_x, click_y) { + match controller.click_at(click_x, click_y, Some(app_name)) { Ok(_) => Ok(format!( - "✅ Clicked {} of '{}' at ({}, {})", - direction, text, click_x, click_y + "✅ Clicked {} of '{}' in {} at ({}, {})", + direction, text, app_name, click_x, click_y )), Err(e) => Ok(format!("❌ Failed to click: {}", e)), } } - Ok(None) => Ok(format!("❌ Could not find '{}' on screen", text)), + Ok(None) => Ok(format!("❌ Could not find '{}' in {}", text, app_name)), Err(e) => Ok(format!("❌ Error finding text: {}", e)), } } else { diff --git a/docs/coach-player-providers.md b/docs/coach-player-providers.md deleted file mode 100644 index d1e05e4..0000000 --- a/docs/coach-player-providers.md +++ /dev/null @@ -1,75 +0,0 @@ -# Coach-Player Provider Configuration - -G3 now supports specifying different LLM providers for the coach and player agents when running in autonomous mode. This allows you to optimize for different requirements: - -- **Player**: The agent that implements code - might benefit from a faster, more cost-effective model -- **Coach**: The agent that reviews code - might benefit from a more powerful, analytical model - -## Configuration - -In your `config.toml` file, under the `[providers]` section, you can specify: - -```toml -[providers] -default_provider = "databricks" # Used for normal operations -coach = "databricks" # Provider for coach (code reviewer) -player = "anthropic" # Provider for player (code implementer) -``` - -If `coach` or `player` are not specified, they will default to using the `default_provider`. - -## Example Use Cases - -### Cost Optimization -Use a cheaper, faster model for initial implementations (player) and a more powerful model for review (coach): - -```toml -coach = "anthropic" # Claude Sonnet for thorough review -player = "anthropic" # Claude Haiku for quick implementation -``` - -### Speed vs Quality Trade-off -Use a local embedded model for fast iterations (player) and a cloud model for quality review (coach): - -```toml -coach = "databricks" # Cloud model for quality review -player = "embedded" # Local model for fast implementation -``` - -### Specialized Models -Use different models optimized for different tasks: - -```toml -coach = "databricks" # Model fine-tuned for code review -player = "openai" # Model optimized for code generation -``` - -## Requirements - -- Both providers must be properly configured in your config file -- Each provider must have valid credentials -- The models specified for each provider must be accessible - -## How It Works - -When running in autonomous mode (`g3 --autonomous`), the system will: - -1. Use the `player` provider (or default) for the initial implementation -2. Switch to the `coach` provider (or default) for code review -3. Return to the `player` provider for implementing feedback -4. Continue this cycle for the specified number of turns - -The providers are logged at startup so you can verify which models are being used: - -``` -🎮 Player provider: anthropic -👨‍🏫 Coach provider: databricks -ℹ️ Using different providers for player and coach -``` - -## Benefits - -- **Cost Efficiency**: Use expensive models only where they add the most value -- **Speed Optimization**: Use faster models for iterative development -- **Specialization**: Leverage models that excel at specific tasks -- **Flexibility**: Easy to experiment with different provider combinations From a8af5d7cc1d7778ec58c865cb084e49e4618ecc0 Mon Sep 17 00:00:00 2001 From: Dhanji Prasanna Date: Fri, 24 Oct 2025 16:11:12 +1100 Subject: [PATCH 09/16] Native api for screen capture --- crates/g3-computer-control/src/lib.rs | 2 +- .../g3-computer-control/src/platform/linux.rs | 7 +- .../g3-computer-control/src/platform/macos.rs | 85 ++++++++++++++----- .../src/platform/windows.rs | 7 +- crates/g3-core/src/lib.rs | 75 +++++----------- 5 files changed, 97 insertions(+), 79 deletions(-) diff --git a/crates/g3-computer-control/src/lib.rs b/crates/g3-computer-control/src/lib.rs index 355a591..b1cbc36 100644 --- a/crates/g3-computer-control/src/lib.rs +++ b/crates/g3-computer-control/src/lib.rs @@ -23,7 +23,7 @@ pub trait ComputerController: Send + Sync { async fn take_screenshot(&self, path: &str, region: Option, window_id: Option<&str>) -> Result<()>; // OCR operations - async fn extract_text_from_screen(&self, region: Rect) -> Result; + async fn extract_text_from_screen(&self, region: Rect, window_id: &str) -> Result; async fn extract_text_from_image(&self, path: &str) -> Result; async fn extract_text_with_locations(&self, path: &str) -> Result>; async fn find_text_in_app(&self, app_name: &str, search_text: &str) -> Result>; diff --git a/crates/g3-computer-control/src/platform/linux.rs b/crates/g3-computer-control/src/platform/linux.rs index 2a9d89c..cf485ed 100644 --- a/crates/g3-computer-control/src/platform/linux.rs +++ b/crates/g3-computer-control/src/platform/linux.rs @@ -63,10 +63,15 @@ impl ComputerController for LinuxController { } async fn take_screenshot(&self, _path: &str, _region: Option, _window_id: Option<&str>) -> Result<()> { + // Enforce that window_id must be provided + if _window_id.is_none() { + anyhow::bail!("window_id is required. You must specify which window to capture (e.g., 'Firefox', 'Terminal', 'gedit'). Use list_windows to see available windows."); + } + anyhow::bail!("Linux implementation not yet available") } - async fn extract_text_from_screen(&self, _region: Rect) -> Result { + async fn extract_text_from_screen(&self, _region: Rect, _window_id: &str) -> Result { anyhow::bail!("Linux implementation not yet available") } diff --git a/crates/g3-computer-control/src/platform/macos.rs b/crates/g3-computer-control/src/platform/macos.rs index da1aa95..b8b6bea 100644 --- a/crates/g3-computer-control/src/platform/macos.rs +++ b/crates/g3-computer-control/src/platform/macos.rs @@ -3,6 +3,11 @@ use crate::ocr::{OCREngine, DefaultOCR}; use anyhow::{Result, Context}; use async_trait::async_trait; use std::path::Path; +use core_graphics::window::{kCGWindowListOptionOnScreenOnly, kCGNullWindowID, CGWindowListCopyWindowInfo}; +use core_foundation::dictionary::CFDictionary; +use core_foundation::string::CFString; +use core_foundation::base::{TCFType, ToVoid}; +use core_foundation::array::CFArray; pub struct MacOSController { ocr_engine: Box, @@ -22,6 +27,11 @@ impl MacOSController { #[async_trait] impl ComputerController for MacOSController { async fn take_screenshot(&self, path: &str, region: Option, window_id: Option<&str>) -> Result<()> { + // Enforce that window_id must be provided + if window_id.is_none() { + return Err(anyhow::anyhow!("window_id is required. You must specify which window to capture (e.g., 'Safari', 'Terminal', 'Google Chrome'). Use list_windows to see available windows.")); + } + // Determine the temporary directory for screenshots let temp_dir = std::env::var("TMPDIR") .or_else(|_| std::env::var("HOME").map(|h| format!("{}/tmp", h))) @@ -42,48 +52,81 @@ impl ComputerController for MacOSController { std::fs::create_dir_all(parent)?; } - let mut cmd = std::process::Command::new("screencapture"); + let app_name = window_id.unwrap(); // Safe because we checked is_none() above - // Add flags + // Get the window ID for the specified application + let cg_window_id = unsafe { + let window_list = CGWindowListCopyWindowInfo( + kCGWindowListOptionOnScreenOnly, + kCGNullWindowID + ); + + let array = CFArray::::wrap_under_create_rule(window_list); + let count = array.len(); + + let mut found_window_id: Option = None; + + for i in 0..count { + let dict = array.get(i).unwrap(); + + // Get owner name + let owner_key = CFString::from_static_string("kCGWindowOwnerName"); + let owner: String = if let Some(value) = dict.find(owner_key.to_void()) { + let s: CFString = TCFType::wrap_under_get_rule(*value as *const _); + s.to_string() + } else { + continue; + }; + + // Check if this is the app we're looking for + if owner.to_lowercase().contains(&app_name.to_lowercase()) || app_name.to_lowercase().contains(&owner.to_lowercase()) { + // Get window ID + let window_id_key = CFString::from_static_string("kCGWindowNumber"); + if let Some(value) = dict.find(window_id_key.to_void()) { + let num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*value as *const _); + if let Some(id) = num.to_i64() { + found_window_id = Some(id as u32); + break; + } + } + } + } + + found_window_id + }; + + let cg_window_id = cg_window_id.ok_or_else(|| { + anyhow::anyhow!("Could not find window for application '{}'. Use list_windows to see available windows.", app_name) + })?; + + // Use screencapture with the window ID for now + // TODO: Implement direct CGWindowListCreateImage approach with proper image saving + let mut cmd = std::process::Command::new("screencapture"); cmd.arg("-x"); // No sound + cmd.arg("-l"); + cmd.arg(cg_window_id.to_string()); if let Some(region) = region { - // Capture specific region: -R x,y,width,height cmd.arg("-R"); cmd.arg(format!("{},{},{},{}", region.x, region.y, region.width, region.height)); } - if let Some(app_name) = window_id { - // Capture specific window by app name - // Use AppleScript to get window ID - let script = format!(r#"tell application "{}" to id of window 1"#, app_name); - let output = std::process::Command::new("osascript") - .arg("-e") - .arg(&script) - .output()?; - - if output.status.success() { - let window_id_str = String::from_utf8_lossy(&output.stdout).trim().to_string(); - cmd.arg(format!("-l{}", window_id_str)); - } - } - cmd.arg(&final_path); let screenshot_result = cmd.output()?; if !screenshot_result.status.success() { let stderr = String::from_utf8_lossy(&screenshot_result.stderr); - return Err(anyhow::anyhow!("screencapture failed: {}", stderr)); + return Err(anyhow::anyhow!("screencapture failed for window {}: {}", cg_window_id, stderr)); } Ok(()) } - async fn extract_text_from_screen(&self, region: Rect) -> Result { + async fn extract_text_from_screen(&self, region: Rect, window_id: &str) -> Result { // Take screenshot of region first let temp_path = format!("/tmp/g3_ocr_{}.png", uuid::Uuid::new_v4()); - self.take_screenshot(&temp_path, Some(region), None).await?; + self.take_screenshot(&temp_path, Some(region), Some(window_id)).await?; // Extract text from the screenshot let result = self.extract_text_from_image(&temp_path).await?; diff --git a/crates/g3-computer-control/src/platform/windows.rs b/crates/g3-computer-control/src/platform/windows.rs index 6213d56..f3250f7 100644 --- a/crates/g3-computer-control/src/platform/windows.rs +++ b/crates/g3-computer-control/src/platform/windows.rs @@ -62,10 +62,15 @@ impl ComputerController for WindowsController { } async fn take_screenshot(&self, _path: &str, _region: Option, _window_id: Option<&str>) -> Result<()> { + // Enforce that window_id must be provided + if _window_id.is_none() { + anyhow::bail!("window_id is required. You must specify which window to capture (e.g., 'Chrome', 'Terminal', 'Notepad'). Use list_windows to see available windows."); + } + anyhow::bail!("Windows implementation not yet available") } - async fn extract_text_from_screen(&self, _region: Rect) -> Result { + async fn extract_text_from_screen(&self, _region: Rect, _window_id: &str) -> Result { anyhow::bail!("Windows implementation not yet available") } diff --git a/crates/g3-core/src/lib.rs b/crates/g3-core/src/lib.rs index dd3e52c..69a90ca 100644 --- a/crates/g3-core/src/lib.rs +++ b/crates/g3-core/src/lib.rs @@ -1798,7 +1798,7 @@ Template: }, Tool { name: "take_screenshot".to_string(), - description: "Capture a screenshot of the screen, region, or window. When capturing a specific application window (e.g., 'Safari', 'Terminal'), use the window_id parameter with just the application name. The tool will automatically use the native screencapture command with the application's window ID for a clean capture.".to_string(), + description: "Capture a screenshot of a specific application window. You MUST specify the window_id parameter with the application name (e.g., 'Safari', 'Terminal', 'Google Chrome'). The tool will automatically use the native screencapture command with the application's window ID for a clean capture. Use list_windows first to identify available windows.".to_string(), input_schema: json!({ "type": "object", "properties": { @@ -1808,7 +1808,7 @@ Template: }, "window_id": { "type": "string", - "description": "Optional application name to capture (e.g., 'Safari', 'Terminal', 'Google Chrome'). The tool will capture the frontmost window of that application using its native window ID." + "description": "REQUIRED: Application name to capture (e.g., 'Safari', 'Terminal', 'Google Chrome'). The tool will capture the frontmost window of that application using its native window ID." }, "region": { "type": "object", @@ -1820,12 +1820,12 @@ Template: } } }, - "required": ["path"] + "required": ["path", "window_id"] }), }, Tool { name: "extract_text".to_string(), - description: "Extract text from a screen region or image file using OCR. Returns plain text only (no bounding boxes). For text with location/coordinates, use vision_find_text instead.".to_string(), + description: "Extract text from an image file using OCR. For extracting text from a specific window, use vision_find_text instead which automatically handles window capture.".to_string(), input_schema: json!({ "type": "object", "properties": { @@ -1833,16 +1833,6 @@ Template: "type": "string", "description": "Path to image file (optional if region is provided)" }, - "region": { - "type": "object", - "description": "Screen region to capture and extract text from", - "properties": { - "x": {"type": "integer"}, - "y": {"type": "integer"}, - "width": {"type": "integer"}, - "height": {"type": "integer"} - } - } } }), }, @@ -3750,8 +3740,9 @@ Template: .and_then(|v| v.as_str()) .ok_or_else(|| anyhow::anyhow!("Missing path argument"))?; - // Extract window_id (app name) if provided - let window_id = tool_call.args.get("window_id").and_then(|v| v.as_str()); + // Extract window_id (app name) - REQUIRED + let window_id = tool_call.args.get("window_id").and_then(|v| v.as_str()) + .ok_or_else(|| anyhow::anyhow!("Missing window_id argument. You must specify which window to capture (e.g., 'Safari', 'Terminal', 'Google Chrome')."))?; // Extract region if provided let region = tool_call @@ -3771,7 +3762,7 @@ Template: .unwrap_or(0) as i32, }); - match controller.take_screenshot(path, region, window_id).await { + match controller.take_screenshot(path, region, Some(window_id)).await { Ok(_) => { // Get the actual path where the screenshot was saved let actual_path = if path.starts_with('/') { @@ -3785,14 +3776,10 @@ Template: format!("{}/{}", temp_dir.trim_end_matches('/'), path) }; - if let Some(app) = window_id { - Ok(format!( - "✅ Screenshot of {} saved to: {}", - app, actual_path - )) - } else { - Ok(format!("✅ Screenshot saved to: {}", actual_path)) - } + Ok(format!( + "✅ Screenshot of {} saved to: {}", + window_id, actual_path + )) } Err(e) => Ok(format!("❌ Failed to take screenshot: {}", e)), } @@ -3802,36 +3789,14 @@ Template: } "extract_text" => { if let Some(controller) = &self.computer_controller { - // Check if we have a path or a region - if let Some(path) = tool_call.args.get("path").and_then(|v| v.as_str()) { - // Extract text from image file - match controller.extract_text_from_image(path).await { - Ok(text) => Ok(format!("✅ Extracted text:\n{}", text)), - Err(e) => Ok(format!("❌ Failed to extract text: {}", e)), - } - } else if let Some(region_obj) = - tool_call.args.get("region").and_then(|v| v.as_object()) - { - // Extract text from screen region - let region = g3_computer_control::types::Rect { - x: region_obj.get("x").and_then(|v| v.as_i64()).unwrap_or(0) as i32, - y: region_obj.get("y").and_then(|v| v.as_i64()).unwrap_or(0) as i32, - width: region_obj - .get("width") - .and_then(|v| v.as_i64()) - .unwrap_or(0) as i32, - height: region_obj - .get("height") - .and_then(|v| v.as_i64()) - .unwrap_or(0) as i32, - }; - - match controller.extract_text_from_screen(region).await { - Ok(text) => Ok(format!("✅ Extracted text:\n{}", text)), - Err(e) => Ok(format!("❌ Failed to extract text: {}", e)), - } - } else { - Ok("❌ Missing path or region argument".to_string()) + let path = tool_call.args.get("path") + .and_then(|v| v.as_str()) + .ok_or_else(|| anyhow::anyhow!("Missing path argument"))?; + + // Extract text from image file only + match controller.extract_text_from_image(path).await { + Ok(text) => Ok(format!("✅ Extracted text:\n{}", text)), + Err(e) => Ok(format!("❌ Failed to extract text: {}", e)), } } else { Ok("❌ Computer control not enabled. Set computer_control.enabled = true in config.".to_string()) From 65f25f840eae66da45ebfb65354d1fe3e0299e26 Mon Sep 17 00:00:00 2001 From: Dhanji Prasanna Date: Fri, 24 Oct 2025 16:11:24 +1100 Subject: [PATCH 10/16] test --- crates/g3-core/src/take_screenshot_test.rs | 37 ++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 crates/g3-core/src/take_screenshot_test.rs diff --git a/crates/g3-core/src/take_screenshot_test.rs b/crates/g3-core/src/take_screenshot_test.rs new file mode 100644 index 0000000..a90d81e --- /dev/null +++ b/crates/g3-core/src/take_screenshot_test.rs @@ -0,0 +1,37 @@ +// Test to verify take_screenshot requires window_id + +#[cfg(test)] +mod take_screenshot_tests { + use super::*; + use serde_json::json; + + #[test] + fn test_take_screenshot_requires_window_id() { + // Create a tool call without window_id + let tool_call = ToolCall { + tool: "take_screenshot".to_string(), + args: json!({ + "path": "test.png" + }), + }; + + // Verify that window_id is missing + assert!(tool_call.args.get("window_id").is_none()); + } + + #[test] + fn test_take_screenshot_with_window_id() { + // Create a tool call with window_id + let tool_call = ToolCall { + tool: "take_screenshot".to_string(), + args: json!({ + "path": "test.png", + "window_id": "Safari" + }), + }; + + // Verify that window_id is present + assert!(tool_call.args.get("window_id").is_some()); + assert_eq!(tool_call.args.get("window_id").unwrap().as_str().unwrap(), "Safari"); + } +} From 834153ea6905be2f248d489571c9c2cb72d84d28 Mon Sep 17 00:00:00 2001 From: Dhanji Prasanna Date: Fri, 24 Oct 2025 20:40:43 +1100 Subject: [PATCH 11/16] screenshotting bug fix --- .../examples/list_windows.rs | 4 +- .../g3-computer-control/src/platform/macos.rs | 39 ++++++++++++++++--- 2 files changed, 35 insertions(+), 8 deletions(-) diff --git a/crates/g3-computer-control/examples/list_windows.rs b/crates/g3-computer-control/examples/list_windows.rs index e638a19..f1681ff 100644 --- a/crates/g3-computer-control/examples/list_windows.rs +++ b/crates/g3-computer-control/examples/list_windows.rs @@ -47,8 +47,8 @@ fn main() { "".to_string() }; - // Filter for iTerm or show all - if owner.contains("iTerm") || owner.contains("Terminal") { + // Show all windows + if !owner.is_empty() { println!("{:<10} {:<25} {}", window_id, owner, title); } } diff --git a/crates/g3-computer-control/src/platform/macos.rs b/crates/g3-computer-control/src/platform/macos.rs index b8b6bea..c3bff9e 100644 --- a/crates/g3-computer-control/src/platform/macos.rs +++ b/crates/g3-computer-control/src/platform/macos.rs @@ -64,7 +64,8 @@ impl ComputerController for MacOSController { let array = CFArray::::wrap_under_create_rule(window_list); let count = array.len(); - let mut found_window_id: Option = None; + let mut found_window_id: Option<(u32, String, bool)> = None; // (id, owner, is_exact_match) + let app_name_lower = app_name.to_lowercase(); for i in 0..count { let dict = array.get(i).unwrap(); @@ -78,15 +79,35 @@ impl ComputerController for MacOSController { continue; }; - // Check if this is the app we're looking for - if owner.to_lowercase().contains(&app_name.to_lowercase()) || app_name.to_lowercase().contains(&owner.to_lowercase()) { + tracing::debug!("Checking window: owner='{}', looking for '{}'", owner, app_name); + let owner_lower = owner.to_lowercase(); + + // Check for exact match first (case-insensitive) + let is_exact_match = owner_lower == app_name_lower; + + // Check for fuzzy match (either direction contains) + let is_fuzzy_match = owner_lower.contains(&app_name_lower) || app_name_lower.contains(&owner_lower); + + if is_exact_match || is_fuzzy_match { // Get window ID let window_id_key = CFString::from_static_string("kCGWindowNumber"); if let Some(value) = dict.find(window_id_key.to_void()) { let num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*value as *const _); if let Some(id) = num.to_i64() { - found_window_id = Some(id as u32); - break; + tracing::debug!("Found candidate: window ID {} for app '{}' (exact={}, fuzzy={})", id, owner, is_exact_match, is_fuzzy_match); + + // If we found an exact match, use it immediately + if is_exact_match { + tracing::info!("Found exact match: window ID {} for app '{}'", id, owner); + found_window_id = Some((id as u32, owner.clone(), true)); + break; + } + + // Otherwise, keep the first fuzzy match but continue looking for exact match + if found_window_id.is_none() { + tracing::info!("Found fuzzy match: window ID {} for app '{}'", id, owner); + found_window_id = Some((id as u32, owner.clone(), false)); + } } } } @@ -95,10 +116,16 @@ impl ComputerController for MacOSController { found_window_id }; - let cg_window_id = cg_window_id.ok_or_else(|| { + let (cg_window_id, matched_owner, is_exact) = cg_window_id.ok_or_else(|| { anyhow::anyhow!("Could not find window for application '{}'. Use list_windows to see available windows.", app_name) })?; + if !is_exact { + tracing::warn!("Using fuzzy match: requested '{}' but found '{}' (window ID {})", app_name, matched_owner, cg_window_id); + } else { + tracing::info!("Taking screenshot of window ID {} for app '{}'", cg_window_id, matched_owner); + } + // Use screencapture with the window ID for now // TODO: Implement direct CGWindowListCreateImage approach with proper image saving let mut cmd = std::process::Command::new("screencapture"); From c3f3f79dc50d098e666ff2042c406f25ed436fbe Mon Sep 17 00:00:00 2001 From: Dhanji Prasanna Date: Sat, 25 Oct 2025 16:51:27 +1100 Subject: [PATCH 12/16] fixed x,y detection in vision click --- .../g3-computer-control/src/platform/macos.rs | 330 ++++++++++++--- .../platform/macos_window_matching_test.rs | 45 ++ crates/g3-core/src/lib.rs | 389 ++---------------- crates/g3-execution/src/lib.rs | 48 +++ 4 files changed, 397 insertions(+), 415 deletions(-) create mode 100644 crates/g3-computer-control/src/platform/macos_window_matching_test.rs diff --git a/crates/g3-computer-control/src/platform/macos.rs b/crates/g3-computer-control/src/platform/macos.rs index c3bff9e..da9c81b 100644 --- a/crates/g3-computer-control/src/platform/macos.rs +++ b/crates/g3-computer-control/src/platform/macos.rs @@ -64,7 +64,7 @@ impl ComputerController for MacOSController { let array = CFArray::::wrap_under_create_rule(window_list); let count = array.len(); - let mut found_window_id: Option<(u32, String, bool)> = None; // (id, owner, is_exact_match) + let mut found_window_id: Option<(u32, String)> = None; // (id, owner) let app_name_lower = app_name.to_lowercase(); for i in 0..count { @@ -82,31 +82,62 @@ impl ComputerController for MacOSController { tracing::debug!("Checking window: owner='{}', looking for '{}'", owner, app_name); let owner_lower = owner.to_lowercase(); - // Check for exact match first (case-insensitive) - let is_exact_match = owner_lower == app_name_lower; + // Normalize by removing spaces for exact matching + let app_name_normalized = app_name_lower.replace(" ", ""); + let owner_normalized = owner_lower.replace(" ", ""); - // Check for fuzzy match (either direction contains) - let is_fuzzy_match = owner_lower.contains(&app_name_lower) || app_name_lower.contains(&owner_lower); + // ONLY accept exact matches (case-insensitive, with or without spaces) + // This prevents "Goose" from matching "GooseStudio" + let is_match = owner_lower == app_name_lower || owner_normalized == app_name_normalized; - if is_exact_match || is_fuzzy_match { + if is_match { // Get window ID let window_id_key = CFString::from_static_string("kCGWindowNumber"); if let Some(value) = dict.find(window_id_key.to_void()) { let num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*value as *const _); if let Some(id) = num.to_i64() { - tracing::debug!("Found candidate: window ID {} for app '{}' (exact={}, fuzzy={})", id, owner, is_exact_match, is_fuzzy_match); + // Get window layer to filter out menu bar windows + let layer_key = CFString::from_static_string("kCGWindowLayer"); + let layer: i32 = if let Some(value) = dict.find(layer_key.to_void()) { + let num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*value as *const _); + num.to_i32().unwrap_or(0) + } else { + 0 + }; - // If we found an exact match, use it immediately - if is_exact_match { - tracing::info!("Found exact match: window ID {} for app '{}'", id, owner); - found_window_id = Some((id as u32, owner.clone(), true)); + // Get window bounds to verify it's a real window + let bounds_key = CFString::from_static_string("kCGWindowBounds"); + let has_real_bounds = if let Some(value) = dict.find(bounds_key.to_void()) { + let bounds_dict: CFDictionary = TCFType::wrap_under_get_rule(*value as *const _); + let width_key = CFString::from_static_string("Width"); + let height_key = CFString::from_static_string("Height"); + + if let (Some(w_val), Some(h_val)) = ( + bounds_dict.find(width_key.to_void()), + bounds_dict.find(height_key.to_void()), + ) { + let w_num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*w_val as *const _); + let h_num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*h_val as *const _); + let width = w_num.to_f64().unwrap_or(0.0); + let height = h_num.to_f64().unwrap_or(0.0); + // Real windows should be at least 100x100 pixels + width >= 100.0 && height >= 100.0 + } else { + false + } + } else { + false + }; + + // Only accept windows that are: + // 1. At layer 0 (normal windows, not menu bar) + // 2. Have real bounds (width and height >= 100) + if layer == 0 && has_real_bounds { + tracing::info!("Found valid window: ID {} for app '{}' (layer={}, bounds valid)", id, owner, layer); + found_window_id = Some((id as u32, owner.clone())); break; - } - - // Otherwise, keep the first fuzzy match but continue looking for exact match - if found_window_id.is_none() { - tracing::info!("Found fuzzy match: window ID {} for app '{}'", id, owner); - found_window_id = Some((id as u32, owner.clone(), false)); + } else { + tracing::debug!("Skipping window ID {} for '{}': layer={}, has_real_bounds={}", id, owner, layer, has_real_bounds); } } } @@ -116,15 +147,10 @@ impl ComputerController for MacOSController { found_window_id }; - let (cg_window_id, matched_owner, is_exact) = cg_window_id.ok_or_else(|| { + let (cg_window_id, matched_owner) = cg_window_id.ok_or_else(|| { anyhow::anyhow!("Could not find window for application '{}'. Use list_windows to see available windows.", app_name) })?; - - if !is_exact { - tracing::warn!("Using fuzzy match: requested '{}' but found '{}' (window ID {})", app_name, matched_owner, cg_window_id); - } else { tracing::info!("Taking screenshot of window ID {} for app '{}'", cg_window_id, matched_owner); - } // Use screencapture with the window ID for now // TODO: Implement direct CGWindowListCreateImage approach with proper image saving @@ -178,12 +204,18 @@ impl ComputerController for MacOSController { async fn find_text_in_app(&self, app_name: &str, search_text: &str) -> Result> { // Take screenshot of specific app window let home = std::env::var("HOME").unwrap_or_else(|_| "/tmp".to_string()); - let temp_path = format!("{}/Desktop/g3_find_text_{}_{}.png", home, app_name, uuid::Uuid::new_v4()); + let temp_path = format!("{}/tmp/g3_find_text_{}_{}.png", home, app_name, uuid::Uuid::new_v4()); self.take_screenshot(&temp_path, None, Some(app_name)).await?; + // Get screenshot dimensions before we delete it + let screenshot_dims = get_image_dimensions(&temp_path)?; + // Extract all text with locations let locations = self.extract_text_with_locations(&temp_path).await?; + // Get window bounds to calculate coordinate transformation + let window_bounds = self.get_window_bounds(app_name)?; + // Clean up temp file let _ = std::fs::remove_file(&temp_path); @@ -191,7 +223,13 @@ impl ComputerController for MacOSController { let search_lower = search_text.to_lowercase(); for location in locations { if location.text.to_lowercase().contains(&search_lower) { - return Ok(Some(location)); + // Transform coordinates from screenshot space to screen space + let transformed = transform_screenshot_to_screen_coords( + location, + window_bounds, + screenshot_dims, + ); + return Ok(Some(transformed)); } } @@ -222,44 +260,7 @@ impl ComputerController for MacOSController { Ok(()) } - fn click_at(&self, x: i32, y: i32, app_name: Option<&str>) -> Result<()> { - // If app_name is provided, get window position and offset coordinates - let (global_x, global_y) = if let Some(app) = app_name { - // Get window position using AppleScript - let script = format!( - r#"tell application "{}" to get bounds of window 1"#, - app - ); - - let output = std::process::Command::new("osascript") - .arg("-e") - .arg(&script) - .output()?; - - if output.status.success() { - let bounds_str = String::from_utf8_lossy(&output.stdout); - // Parse bounds: "x1, y1, x2, y2" - let parts: Vec<&str> = bounds_str.trim().split(", ").collect(); - if parts.len() >= 2 { - if let (Ok(window_x), Ok(window_y)) = ( - parts[0].trim().parse::(), - parts[1].trim().parse::(), - ) { - // Offset relative coordinates by window position - (x + window_x, y + window_y) - } else { - (x, y) // Fallback to absolute coordinates - } - } else { - (x, y) // Fallback to absolute coordinates - } - } else { - (x, y) // Fallback to absolute coordinates - } - } else { - (x, y) // No app name, use absolute coordinates - }; - + fn click_at(&self, x: i32, y: i32, _app_name: Option<&str>) -> Result<()> { use core_graphics::event::{ CGEvent, CGEventTapLocation, CGEventType, CGMouseButton, }; @@ -267,12 +268,27 @@ impl ComputerController for MacOSController { CGEventSource, CGEventSourceStateID, }; use core_graphics::geometry::CGPoint; + use core_graphics::display::CGDisplay; + + // IMPORTANT: Coordinates passed here are in NSScreen/CGWindowListCopyWindowInfo space + // (Y=0 at BOTTOM, increases UPWARD) + // But CGEvent uses a different coordinate system (Y=0 at TOP, increases DOWNWARD) + // We need to convert: CGEvent.y = screenHeight - NSScreen.y + + let screen_height = CGDisplay::main().pixels_high() as i32; + let cgevent_x = x; + let cgevent_y = screen_height - y; + + tracing::debug!("click_at: NSScreen coords ({}, {}) -> CGEvent coords ({}, {}) [screen_height={}]", + x, y, cgevent_x, cgevent_y, screen_height); + + let (global_x, global_y) = (cgevent_x, cgevent_y); + + let point = CGPoint::new(global_x as f64, global_y as f64); let source = CGEventSource::new(CGEventSourceStateID::HIDSystemState) .ok().context("Failed to create event source")?; - let point = CGPoint::new(global_x as f64, global_y as f64); - // Move mouse to position first let move_event = CGEvent::new_mouse_event( source.clone(), @@ -306,4 +322,186 @@ impl ComputerController for MacOSController { Ok(()) } -} \ No newline at end of file +} + +impl MacOSController { + /// Get window bounds for an application (helper method) + fn get_window_bounds(&self, app_name: &str) -> Result<(i32, i32, i32, i32)> { + unsafe { + let window_list = CGWindowListCopyWindowInfo( + kCGWindowListOptionOnScreenOnly, + kCGNullWindowID + ); + + let array = CFArray::::wrap_under_create_rule(window_list); + let count = array.len(); + + let app_name_lower = app_name.to_lowercase(); + + for i in 0..count { + let dict = array.get(i).unwrap(); + + // Get owner name + let owner_key = CFString::from_static_string("kCGWindowOwnerName"); + let owner: String = if let Some(value) = dict.find(owner_key.to_void()) { + let s: CFString = TCFType::wrap_under_get_rule(*value as *const _); + s.to_string() + } else { + continue; + }; + + let owner_lower = owner.to_lowercase(); + + // Normalize by removing spaces for exact matching + let app_name_normalized = app_name_lower.replace(" ", ""); + let owner_normalized = owner_lower.replace(" ", ""); + + // ONLY accept exact matches (case-insensitive, with or without spaces) + // This prevents "Goose" from matching "GooseStudio" + let is_match = owner_lower == app_name_lower || owner_normalized == app_name_normalized; + + if is_match { + // Get window layer to filter out menu bar windows + let layer_key = CFString::from_static_string("kCGWindowLayer"); + let layer: i32 = if let Some(value) = dict.find(layer_key.to_void()) { + let num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*value as *const _); + num.to_i32().unwrap_or(0) + } else { + 0 + }; + + // Skip menu bar windows (layer >= 20) + if layer >= 20 { + tracing::debug!("Skipping window for '{}' at layer {} (menu bar)", owner, layer); + continue; + } + + // Get window bounds to verify it's a real window + let bounds_key = CFString::from_static_string("kCGWindowBounds"); + if let Some(value) = dict.find(bounds_key.to_void()) { + let bounds_dict: CFDictionary = TCFType::wrap_under_get_rule(*value as *const _); + + let x_key = CFString::from_static_string("X"); + let y_key = CFString::from_static_string("Y"); + let width_key = CFString::from_static_string("Width"); + let height_key = CFString::from_static_string("Height"); + + if let (Some(x_val), Some(y_val), Some(w_val), Some(h_val)) = ( + bounds_dict.find(x_key.to_void()), + bounds_dict.find(y_key.to_void()), + bounds_dict.find(width_key.to_void()), + bounds_dict.find(height_key.to_void()), + ) { + let x_num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*x_val as *const _); + let y_num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*y_val as *const _); + let w_num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*w_val as *const _); + let h_num: core_foundation::number::CFNumber = TCFType::wrap_under_get_rule(*h_val as *const _); + + let x: i32 = x_num.to_i64().unwrap_or(0) as i32; + let y: i32 = y_num.to_i64().unwrap_or(0) as i32; + let w: i32 = w_num.to_i64().unwrap_or(0) as i32; + let h: i32 = h_num.to_i64().unwrap_or(0) as i32; + + // Only accept windows with real bounds (>= 100x100 pixels) + if w >= 100 && h >= 100 { + tracing::info!("Found valid window bounds for '{}': x={}, y={}, w={}, h={} (layer={})", owner, x, y, w, h, layer); + return Ok((x, y, w, h)); + } else { + tracing::debug!("Skipping window for '{}': too small ({}x{})", owner, w, h); + continue; + } + } else { + continue; + } + } + } + } + } + + Err(anyhow::anyhow!("Could not find window bounds for '{}'", app_name)) + } +} + +/// Get image dimensions from a PNG file +fn get_image_dimensions(path: &str) -> Result<(i32, i32)> { + use std::fs::File; + use std::io::Read; + + let mut file = File::open(path)?; + let mut buffer = vec![0u8; 24]; + file.read_exact(&mut buffer)?; + + // PNG signature check + if &buffer[0..8] != b"\x89PNG\r\n\x1a\n" { + anyhow::bail!("Not a valid PNG file"); + } + + // Read IHDR chunk (width and height are at bytes 16-23) + let width = u32::from_be_bytes([buffer[16], buffer[17], buffer[18], buffer[19]]) as i32; + let height = u32::from_be_bytes([buffer[20], buffer[21], buffer[22], buffer[23]]) as i32; + + Ok((width, height)) +} + +/// Transform coordinates from screenshot space to screen space +/// +/// The screenshot is taken of a window, and Vision OCR returns coordinates +/// relative to the screenshot image. We need to transform these to actual +/// screen coordinates for clicking. +/// +/// On Retina displays, screenshots are taken at 2x resolution, so we need +/// to account for this scaling factor. +fn transform_screenshot_to_screen_coords( + location: TextLocation, + window_bounds: (i32, i32, i32, i32), // (x, y, width, height) in screen space + screenshot_dims: (i32, i32), // (width, height) in pixels +) -> TextLocation { + let (win_x, win_y, win_width, win_height) = window_bounds; + let (screenshot_width, screenshot_height) = screenshot_dims; + + // Calculate scale factors + // On Retina displays, screenshot is typically 2x the window size + let scale_x = win_width as f64 / screenshot_width as f64; + let scale_y = win_height as f64 / screenshot_height as f64; + + tracing::debug!("Transform: screenshot={}x{}, window={}x{} at ({},{}), scale=({:.2},{:.2})", + screenshot_width, screenshot_height, win_width, win_height, win_x, win_y, scale_x, scale_y); + + // Transform coordinates from image space to screen space + // IMPORTANT: macOS screen coordinates have origin at BOTTOM-LEFT (Y increases upward) + // Image coordinates have origin at TOP-LEFT (Y increases downward) + // win_y is the BOTTOM of the window in screen coordinates + // So we need to: (win_y + win_height) to get window TOP, then subtract screenshot_y + let window_top_y = win_y + win_height; + + tracing::debug!("[transform] Input location in image space: x={}, y={}, width={}, height={}", + location.x, location.y, location.width, location.height); + tracing::debug!("[transform] Scale factors: scale_x={:.4}, scale_y={:.4}", scale_x, scale_y); + + let transformed_x = win_x + (location.x as f64 * scale_x) as i32; + let transformed_y = window_top_y - (location.y as f64 * scale_y) as i32; + let transformed_width = (location.width as f64 * scale_x) as i32; + let transformed_height = (location.height as f64 * scale_y) as i32; + + tracing::debug!("[transform] Calculation details:"); + tracing::debug!(" - transformed_x = {} + ({} * {:.4}) = {} + {:.2} = {}", win_x, location.x, scale_x, win_x, location.x as f64 * scale_x, transformed_x); + tracing::debug!(" - transformed_width = ({} * {:.4}) = {:.2} -> {}", location.width, scale_x, location.width as f64 * scale_x, transformed_width); + tracing::debug!(" - transformed_height = ({} * {:.4}) = {:.2} -> {}", location.height, scale_y, location.height as f64 * scale_y, transformed_height); + + tracing::debug!("Transformed location: screenshot=({},{}) {}x{} -> screen=({},{}) {}x{}", + location.x, location.y, location.width, location.height, + transformed_x, transformed_y, transformed_width, transformed_height); + + TextLocation { + text: location.text, + x: transformed_x, + y: transformed_y, + width: transformed_width, + height: transformed_height, + confidence: location.confidence, + } +} + +#[path = "macos_window_matching_test.rs"] +#[cfg(test)] +mod tests; \ No newline at end of file diff --git a/crates/g3-computer-control/src/platform/macos_window_matching_test.rs b/crates/g3-computer-control/src/platform/macos_window_matching_test.rs new file mode 100644 index 0000000..387988f --- /dev/null +++ b/crates/g3-computer-control/src/platform/macos_window_matching_test.rs @@ -0,0 +1,45 @@ +#[cfg(test)] +mod window_matching_tests { + /// Test that window name matching handles spaces correctly + /// + /// Issue: When a user requests a screenshot of "Goose Studio" but the actual + /// application name is "GooseStudio" (no space), the fuzzy matching should + /// still find the window. + /// + /// The fix normalizes both names by removing spaces before comparing. + #[test] + fn test_space_normalization() { + let test_cases = vec![ + // (user_input, actual_app_name, should_match) + ("Goose Studio", "GooseStudio", true), + ("GooseStudio", "Goose Studio", true), + ("Visual Studio Code", "VisualStudioCode", true), + ("Google Chrome", "Google Chrome", true), + ("Safari", "Safari", true), + ("iTerm", "iTerm2", true), // fuzzy match + ("Code", "Visual Studio Code", true), // fuzzy match + ]; + + for (user_input, app_name, should_match) in test_cases { + let user_lower = user_input.to_lowercase(); + let app_lower = app_name.to_lowercase(); + + let user_normalized = user_lower.replace(" ", ""); + let app_normalized = app_lower.replace(" ", ""); + + let is_exact = app_lower == user_lower || app_normalized == user_normalized; + let is_fuzzy = app_lower.contains(&user_lower) + || user_lower.contains(&app_lower) + || app_normalized.contains(&user_normalized) + || user_normalized.contains(&app_normalized); + + let matches = is_exact || is_fuzzy; + + assert_eq!( + matches, should_match, + "Expected '{}' vs '{}' to match={}, but got match={}", + user_input, app_name, should_match, matches + ); + } + } +} diff --git a/crates/g3-core/src/lib.rs b/crates/g3-core/src/lib.rs index 69a90ca..b32dce9 100644 --- a/crates/g3-core/src/lib.rs +++ b/crates/g3-core/src/lib.rs @@ -483,8 +483,8 @@ Format this as a detailed but concise summary that can be used to resume the con if matches!(message.role, MessageRole::User) && message.content.starts_with("Tool result:") { let content_len = message.content.len(); - // Only thin if the content is greater than 1000 chars - if content_len > 1000 { + // Only thin if the content is greater than 500 chars + if content_len > 500 { // Generate a unique filename based on timestamp and index let timestamp = std::time::SystemTime::now() .duration_since(std::time::UNIX_EPOCH) @@ -541,8 +541,8 @@ Format this as a detailed but concise summary that can be used to resume the con .map(|s| (s.to_string(), s.len())); if let Some((content_str, content_len)) = content_info { - // Only thin if content is greater than 1000 chars - if content_len > 1000 { + // Only thin if content is greater than 500 chars + if content_len > 500 { let timestamp = std::time::SystemTime::now() .duration_since(std::time::UNIX_EPOCH) .unwrap_or_default() @@ -574,8 +574,8 @@ Format this as a detailed but concise summary that can be used to resume the con .map(|s| (s.to_string(), s.len())); if let Some((diff_str, diff_len)) = diff_info { - // Only thin if diff is greater than 1000 chars - if diff_len > 1000 { + // Only thin if diff is greater than 500 chars + if diff_len > 500 { let timestamp = std::time::SystemTime::now() .duration_since(std::time::UNIX_EPOCH) .unwrap_or_default() @@ -2080,132 +2080,6 @@ Template: "required": ["app_name"] }), }, - Tool { - name: "macax_get_ui_tree".to_string(), - description: "Get the UI element hierarchy of an application as a tree structure".to_string(), - input_schema: json!({ - "type": "object", - "properties": { - "app_name": { - "type": "string", - "description": "Name of the application" - }, - "max_depth": { - "type": "integer", - "description": "Maximum depth to traverse (default: 3)" - } - }, - "required": ["app_name"] - }), - }, - Tool { - name: "macax_find_elements".to_string(), - description: "Find UI elements in an application by role, title, or identifier. Use this to locate buttons, text fields, etc.".to_string(), - input_schema: json!({ - "type": "object", - "properties": { - "app_name": { - "type": "string", - "description": "Name of the application" - }, - "role": { - "type": "string", - "description": "UI element role (e.g., 'button', 'text field', 'window')" - }, - "title": { - "type": "string", - "description": "Element title or label to match" - }, - "identifier": { - "type": "string", - "description": "Element identifier (accessibility identifier)" - } - }, - "required": ["app_name"] - }), - }, - Tool { - name: "macax_click".to_string(), - description: "Click a UI element in an application".to_string(), - input_schema: json!({ - "type": "object", - "properties": { - "app_name": { - "type": "string", - "description": "Name of the application" - }, - "role": { - "type": "string", - "description": "UI element role (e.g., 'button')" - }, - "title": { - "type": "string", - "description": "Element title or label" - }, - "identifier": { - "type": "string", - "description": "Element identifier" - } - }, - "required": ["app_name", "role"] - }), - }, - Tool { - name: "macax_set_value".to_string(), - description: "Set the value of a UI element (e.g., type into a text field)".to_string(), - input_schema: json!({ - "type": "object", - "properties": { - "app_name": { - "type": "string", - "description": "Name of the application" - }, - "role": { - "type": "string", - "description": "UI element role (e.g., 'text field')" - }, - "value": { - "type": "string", - "description": "Value to set" - }, - "title": { - "type": "string", - "description": "Element title or label" - }, - "identifier": { - "type": "string", - "description": "Element identifier" - } - }, - "required": ["app_name", "role", "value"] - }), - }, - Tool { - name: "macax_get_value".to_string(), - description: "Get the value of a UI element (e.g., read text from a text field)".to_string(), - input_schema: json!({ - "type": "object", - "properties": { - "app_name": { - "type": "string", - "description": "Name of the application" - }, - "role": { - "type": "string", - "description": "UI element role (e.g., 'text field')" - }, - "title": { - "type": "string", - "description": "Element title or label" - }, - "identifier": { - "type": "string", - "description": "Element identifier" - } - }, - "required": ["app_name", "role"] - }), - }, Tool { name: "macax_press_key".to_string(), description: "Press a keyboard key or shortcut in an application (e.g., Cmd+S to save)".to_string(), @@ -2253,21 +2127,6 @@ Template: }), }); - // Add focus_element tool - tools.push(Tool { - name: "macax_focus_element".to_string(), - description: "Focus on a UI element (text field, text area, etc.) before typing".to_string(), - input_schema: json!({ - "type": "object", - "properties": { - "app_name": {"type": "string", "description": "Name of the application"}, - "role": {"type": "string", "description": "UI element role (e.g., 'text field', 'text area')"}, - "title": {"type": "string", "description": "Element title or label (optional)"}, - "identifier": {"type": "string", "description": "Element accessibility identifier (optional)"} - }, - "required": ["app_name", "role"] - }), - }); } // Add extract_text_with_boxes tool (requires macax flag) @@ -4323,168 +4182,6 @@ Template: Err(e) => Ok(format!("❌ Failed to activate app: {}", e)), } } - "macax_get_ui_tree" => { - debug!("Processing macax_get_ui_tree tool call"); - - if !self.config.macax.enabled { - return Ok("❌ macOS Accessibility is not enabled. Use --macax flag to enable.".to_string()); - } - - let app_name = match tool_call.args.get("app_name").and_then(|v| v.as_str()) { - Some(n) => n, - None => return Ok("❌ Missing app_name argument".to_string()), - }; - - let max_depth = tool_call.args.get("max_depth") - .and_then(|v| v.as_u64()) - .map(|n| n as usize) - .unwrap_or(3); - - let controller_guard = self.macax_controller.read().await; - let controller = match controller_guard.as_ref() { - Some(c) => c, - None => return Ok("❌ macOS Accessibility controller not initialized.".to_string()), - }; - - match controller.get_ui_tree(app_name, max_depth) { - Ok(tree) => Ok(tree), - Err(e) => Ok(format!("❌ Failed to get UI tree: {}", e)), - } - } - "macax_find_elements" => { - debug!("Processing macax_find_elements tool call"); - - if !self.config.macax.enabled { - return Ok("❌ macOS Accessibility is not enabled. Use --macax flag to enable.".to_string()); - } - - let app_name = match tool_call.args.get("app_name").and_then(|v| v.as_str()) { - Some(n) => n, - None => return Ok("❌ Missing app_name argument".to_string()), - }; - - let role = tool_call.args.get("role").and_then(|v| v.as_str()); - let title = tool_call.args.get("title").and_then(|v| v.as_str()); - let identifier = tool_call.args.get("identifier").and_then(|v| v.as_str()); - - let controller_guard = self.macax_controller.read().await; - let controller = match controller_guard.as_ref() { - Some(c) => c, - None => return Ok("❌ macOS Accessibility controller not initialized.".to_string()), - }; - - match controller.find_elements(app_name, role, title, identifier) { - Ok(elements) => { - if elements.is_empty() { - Ok("No elements found matching criteria".to_string()) - } else { - let element_strs: Vec = elements.iter() - .map(|e| e.to_string()) - .collect(); - Ok(format!("Found {} element(s):\n{}", elements.len(), element_strs.join("\n"))) - } - } - Err(e) => Ok(format!("❌ Failed to find elements: {}", e)), - } - } - "macax_click" => { - debug!("Processing macax_click tool call"); - - if !self.config.macax.enabled { - return Ok("❌ macOS Accessibility is not enabled. Use --macax flag to enable.".to_string()); - } - - let app_name = match tool_call.args.get("app_name").and_then(|v| v.as_str()) { - Some(n) => n, - None => return Ok("❌ Missing app_name argument".to_string()), - }; - - let role = match tool_call.args.get("role").and_then(|v| v.as_str()) { - Some(r) => r, - None => return Ok("❌ Missing role argument".to_string()), - }; - - let title = tool_call.args.get("title").and_then(|v| v.as_str()); - let identifier = tool_call.args.get("identifier").and_then(|v| v.as_str()); - - let controller_guard = self.macax_controller.read().await; - let controller = match controller_guard.as_ref() { - Some(c) => c, - None => return Ok("❌ macOS Accessibility controller not initialized.".to_string()), - }; - - match controller.click_element(app_name, role, title, identifier) { - Ok(_) => Ok(format!("✅ Clicked {} element", role)), - Err(e) => Ok(format!("❌ Failed to click element: {}", e)), - } - } - "macax_set_value" => { - debug!("Processing macax_set_value tool call"); - - if !self.config.macax.enabled { - return Ok("❌ macOS Accessibility is not enabled. Use --macax flag to enable.".to_string()); - } - - let app_name = match tool_call.args.get("app_name").and_then(|v| v.as_str()) { - Some(n) => n, - None => return Ok("❌ Missing app_name argument".to_string()), - }; - - let role = match tool_call.args.get("role").and_then(|v| v.as_str()) { - Some(r) => r, - None => return Ok("❌ Missing role argument".to_string()), - }; - - let value = match tool_call.args.get("value").and_then(|v| v.as_str()) { - Some(v) => v, - None => return Ok("❌ Missing value argument".to_string()), - }; - - let title = tool_call.args.get("title").and_then(|v| v.as_str()); - let identifier = tool_call.args.get("identifier").and_then(|v| v.as_str()); - - let controller_guard = self.macax_controller.read().await; - let controller = match controller_guard.as_ref() { - Some(c) => c, - None => return Ok("❌ macOS Accessibility controller not initialized.".to_string()), - }; - - match controller.set_value(app_name, role, value, title, identifier) { - Ok(_) => Ok(format!("✅ Set value of {} element to: {}", role, value)), - Err(e) => Ok(format!("❌ Failed to set value: {}", e)), - } - } - "macax_get_value" => { - debug!("Processing macax_get_value tool call"); - - if !self.config.macax.enabled { - return Ok("❌ macOS Accessibility is not enabled. Use --macax flag to enable.".to_string()); - } - - let app_name = match tool_call.args.get("app_name").and_then(|v| v.as_str()) { - Some(n) => n, - None => return Ok("❌ Missing app_name argument".to_string()), - }; - - let role = match tool_call.args.get("role").and_then(|v| v.as_str()) { - Some(r) => r, - None => return Ok("❌ Missing role argument".to_string()), - }; - - let title = tool_call.args.get("title").and_then(|v| v.as_str()); - let identifier = tool_call.args.get("identifier").and_then(|v| v.as_str()); - - let controller_guard = self.macax_controller.read().await; - let controller = match controller_guard.as_ref() { - Some(c) => c, - None => return Ok("❌ macOS Accessibility controller not initialized.".to_string()), - }; - - match controller.get_value(app_name, role, title, identifier) { - Ok(value) => Ok(format!("Value: {}", value)), - Err(e) => Ok(format!("❌ Failed to get value: {}", e)), - } - } "macax_press_key" => { debug!("Processing macax_press_key tool call"); @@ -4555,37 +4252,6 @@ Template: Err(e) => Ok(format!("❌ Failed to type text: {}", e)), } } - "macax_focus_element" => { - debug!("Processing macax_focus_element tool call"); - - if !self.config.macax.enabled { - return Ok("❌ macOS Accessibility is not enabled. Use --macax flag to enable.".to_string()); - } - - let app_name = match tool_call.args.get("app_name").and_then(|v| v.as_str()) { - Some(n) => n, - None => return Ok("❌ Missing app_name argument".to_string()), - }; - - let role = match tool_call.args.get("role").and_then(|v| v.as_str()) { - Some(r) => r, - None => return Ok("❌ Missing role argument".to_string()), - }; - - let title = tool_call.args.get("title").and_then(|v| v.as_str()); - let identifier = tool_call.args.get("identifier").and_then(|v| v.as_str()); - - let controller_guard = self.macax_controller.read().await; - let controller = match controller_guard.as_ref() { - Some(c) => c, - None => return Ok("❌ macOS Accessibility controller not initialized.".to_string()), - }; - - match controller.focus_element(app_name, role, title, identifier) { - Ok(_) => Ok(format!("✅ Focused {} element in {}", role, app_name)), - Err(e) => Ok(format!("❌ Failed to focus element: {}", e)), - } - } "vision_find_text" => { debug!("Processing vision_find_text tool call"); @@ -4628,11 +4294,34 @@ Template: match controller.find_text_in_app(app_name, text).await { Ok(Some(location)) => { // Click on center of text - let center_x = location.x + location.width / 2; - let center_y = location.y + location.height / 2; + // IMPORTANT: location coordinates are in NSScreen space (Y=0 at BOTTOM, increases UPWARD) + // location.x is the LEFT edge of the bounding box + // location.y is the TOP edge of the bounding box (highest Y value in NSScreen space) + // location.width and location.height are already scaled to screen space + // To get center: we need to add half the SCALED width and subtract half the SCALED height - match controller.click_at(center_x, center_y, Some(app_name)) { - Ok(_) => Ok(format!("✅ Clicked on '{}' in {} at ({}, {})", text, app_name, center_x, center_y)), + if location.width == 0 || location.height == 0 { + return Ok(format!("❌ Invalid bounding box dimensions: width={}, height={}", location.width, location.height)); + } + + debug!("[vision_click_text] Location from find_text_in_app: x={}, y={}, width={}, height={}, text='{}'", + location.x, location.y, location.width, location.height, location.text); + + // Calculate center using the SCALED dimensions + // X: Use right edge instead of center (Vision OCR bounding box seems offset) + // This gives us: left edge + full width = right edge + // Y: top edge - half of scaled height (subtract because Y increases upward) + let click_x = location.x + location.width; // Right edge + let half_height = location.height / 2; + let click_y = location.y - half_height; + + debug!("[vision_click_text] Click position calculation: x={} + {} = {} (right edge), y={} - {} = {}", + location.x, location.width, click_x, location.y, half_height, click_y); + debug!("[vision_click_text] This means: left_edge={}, center={}, right_edge={}", + location.x, click_x, location.x + location.width); + + match controller.click_at(click_x, click_y, Some(app_name)) { + Ok(_) => Ok(format!("✅ Clicked on '{}' in {} at ({}, {})", text, app_name, click_x, click_y)), Err(e) => Ok(format!("❌ Failed to click: {}", e)), } } @@ -4709,13 +4398,15 @@ Template: match controller.find_text_in_app(app_name, text).await { Ok(Some(location)) => { // Calculate click position based on direction + // location.x is LEFT edge, location.y is TOP edge (in NSScreen space) let (click_x, click_y) = match direction { - "right" => (location.x + location.width + distance, location.y + location.height / 2), - "below" => (location.x + location.width / 2, location.y + location.height + distance), - "left" => (location.x - distance, location.y + location.height / 2), - "above" => (location.x + location.width / 2, location.y - distance), - _ => (location.x + location.width + distance, location.y + location.height / 2), + "right" => (location.x + location.width + distance, location.y - (location.height / 2)), + "below" => (location.x + (location.width / 2), location.y - location.height - distance), + "left" => (location.x - distance, location.y - (location.height / 2)), + "above" => (location.x + (location.width / 2), location.y + distance), + _ => (location.x + location.width + distance, location.y - (location.height / 2)), }; + debug!("[vision_click_near_text] Clicking {} of text at ({}, {})", direction, click_x, click_y); match controller.click_at(click_x, click_y, Some(app_name)) { Ok(_) => Ok(format!( diff --git a/crates/g3-execution/src/lib.rs b/crates/g3-execution/src/lib.rs index a42ba97..2a2e871 100644 --- a/crates/g3-execution/src/lib.rs +++ b/crates/g3-execution/src/lib.rs @@ -166,6 +166,31 @@ impl CodeExecutor { /// Execute Bash code async fn execute_bash(&self, code: &str) -> Result { + // Check if this is a detached/daemon command that should run independently + let is_detached = code.trim_start().starts_with("setsid ") + || code.trim_start().starts_with("nohup ") + || code.contains(" disown") + || (code.contains(" &") && (code.contains("nohup") || code.contains("setsid"))); + + if is_detached { + // For detached commands, just spawn and return immediately + use std::process::Stdio; + Command::new("bash") + .arg("-c") + .arg(code) + .stdin(Stdio::null()) + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .spawn()?; + + return Ok(ExecutionResult { + stdout: "✅ Command launched in background (detached process)".to_string(), + stderr: String::new(), + exit_code: 0, + success: true, + }); + } + let output = Command::new("bash") .arg("-c") .arg(code) @@ -221,6 +246,29 @@ impl CodeExecutor { use tokio::io::{AsyncBufReadExt, BufReader}; use tokio::process::Command as TokioCommand; + // Check if this is a detached/daemon command that should run independently + // Look for patterns like: setsid, nohup with &, or explicit backgrounding with disown + let is_detached = code.trim_start().starts_with("setsid ") + || code.trim_start().starts_with("nohup ") + || code.contains(" disown") + || (code.contains(" &") && (code.contains("nohup") || code.contains("setsid"))); + + if is_detached { + // For detached commands, just spawn and return immediately + TokioCommand::new("bash") + .arg("-c") + .arg(code) + .spawn()?; + + // Don't wait for the process - it's meant to run independently + return Ok(ExecutionResult { + stdout: "✅ Command launched in background (detached process)".to_string(), + stderr: String::new(), + exit_code: 0, + success: true, + }); + } + let mut child = TokioCommand::new("bash") .arg("-c") .arg(code) From 5e08d6bbba5e1eb9f5d514bb269e73fe676eea3f Mon Sep 17 00:00:00 2001 From: Dhanji Prasanna Date: Mon, 27 Oct 2025 10:37:05 +1100 Subject: [PATCH 13/16] --machine mode flag for verbose CLI output --- crates/g3-cli/src/lib.rs | 665 +++++++++++++------------ crates/g3-cli/src/machine_ui_writer.rs | 93 ++++ crates/g3-cli/src/simple_output.rs | 32 ++ crates/g3-cli/src/ui_writer_impl.rs | 240 --------- crates/g3-core/src/lib.rs | 21 +- crates/g3-core/src/ui_writer.rs | 5 + 6 files changed, 477 insertions(+), 579 deletions(-) create mode 100644 crates/g3-cli/src/machine_ui_writer.rs create mode 100644 crates/g3-cli/src/simple_output.rs diff --git a/crates/g3-cli/src/lib.rs b/crates/g3-cli/src/lib.rs index 3facf6a..5b64b65 100644 --- a/crates/g3-cli/src/lib.rs +++ b/crates/g3-cli/src/lib.rs @@ -167,14 +167,12 @@ use tokio_util::sync::CancellationToken; use tracing::{error, info}; use g3_core::error_handling::{classify_error, ErrorType, RecoverableError}; -mod retro_tui; -mod theme; -pub mod tui; mod ui_writer_impl; -use retro_tui::RetroTui; -use theme::ColorTheme; -use tui::SimpleOutput; -use ui_writer_impl::{ConsoleUiWriter, RetroTuiWriter}; +mod simple_output; +use simple_output::SimpleOutput; +mod machine_ui_writer; +use machine_ui_writer::MachineUiWriter; +use ui_writer_impl::ConsoleUiWriter; #[derive(Parser)] #[command(name = "g3")] @@ -220,13 +218,9 @@ pub struct Cli { #[arg(long)] pub interactive_requirements: bool, - /// Use retro terminal UI (inspired by 80s sci-fi) + /// Enable machine-friendly output mode with JSON markers and stats #[arg(long)] - pub retro: bool, - - /// Color theme for retro mode (default, dracula, or path to theme file) - #[arg(long, value_name = "THEME")] - pub theme: Option, + pub machine: bool, /// Override the configured provider (anthropic, databricks, embedded, openai) #[arg(long, value_name = "PROVIDER")] @@ -253,7 +247,7 @@ pub async fn run() -> Result<()> { let cli = Cli::parse(); // Only initialize logging if not in retro mode - if !cli.retro { + if !cli.machine { // Initialize logging with filtering use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt, EnvFilter}; @@ -291,16 +285,16 @@ pub async fn run() -> Result<()> { tracing_subscriber::registry().with(filter).init(); } - if !cli.retro { + if !cli.machine { info!("Starting G3 AI Coding Agent"); } // Set up workspace directory - let workspace_dir = if let Some(ws) = cli.workspace { - ws + let workspace_dir = if let Some(ws) = &cli.workspace { + ws.clone() } else if cli.autonomous { // For autonomous mode, use G3_WORKSPACE env var or default - setup_workspace_directory()? + setup_workspace_directory(cli.machine)? } else { // Default to current directory for interactive/single-shot mode std::env::current_dir()? @@ -421,9 +415,9 @@ Output ONLY the markdown content, no explanations or meta-commentary."#, } } - if let Some(requirements_text) = cli.requirements { + if let Some(requirements_text) = &cli.requirements { // Use requirements text override - Project::new_autonomous_with_requirements(workspace_dir.clone(), requirements_text)? + Project::new_autonomous_with_requirements(workspace_dir.clone(), requirements_text.clone())? } else { // Use traditional requirements.md file Project::new_autonomous(workspace_dir.clone())? @@ -436,7 +430,7 @@ Output ONLY the markdown content, no explanations or meta-commentary."#, project.ensure_workspace_exists()?; project.enter_workspace()?; - if !cli.retro { + if !cli.machine { info!("Using workspace: {}", project.workspace().display()); } @@ -450,7 +444,7 @@ Output ONLY the markdown content, no explanations or meta-commentary."#, // Apply macax flag override if cli.macax { config.macax.enabled = true; - if !cli.retro { + if !cli.machine { info!("macOS Accessibility API tools enabled"); } } @@ -473,7 +467,7 @@ Output ONLY the markdown content, no explanations or meta-commentary."#, } // Initialize agent - let ui_writer = ConsoleUiWriter::new(); + // ui_writer will be created conditionally based on machine mode // Combine AGENTS.md and README content if both exist let combined_content = match (agents_content.clone(), readme_content.clone()) { @@ -485,28 +479,117 @@ Output ONLY the markdown content, no explanations or meta-commentary."#, (None, None) => None, }; - let mut agent = if cli.autonomous { - Agent::new_autonomous_with_readme_and_quiet( - config.clone(), - ui_writer, - combined_content.clone(), - cli.quiet, - ) - .await? + // Execute task, autonomous mode, or start interactive mode based on machine mode + if cli.machine { + // Machine mode - use MachineUiWriter + let ui_writer = MachineUiWriter::new(); + + let agent = if cli.autonomous { + Agent::new_autonomous_with_readme_and_quiet( + config.clone(), + ui_writer, + combined_content.clone(), + cli.quiet, + ) + .await? + } else { + Agent::new_with_readme_and_quiet( + config.clone(), + ui_writer, + combined_content.clone(), + cli.quiet, + ) + .await? + }; + + run_with_machine_mode(agent, cli, project).await?; } else { - Agent::new_with_readme_and_quiet( - config.clone(), - ui_writer, - combined_content.clone(), - cli.quiet, - ) - .await? + // Normal mode - use ConsoleUiWriter + let ui_writer = ConsoleUiWriter::new(); + + let agent = if cli.autonomous { + Agent::new_autonomous_with_readme_and_quiet( + config.clone(), + ui_writer, + combined_content.clone(), + cli.quiet, + ) + .await? + } else { + Agent::new_with_readme_and_quiet( + config.clone(), + ui_writer, + combined_content.clone(), + cli.quiet, + ) + .await? + }; + + run_with_console_mode(agent, cli, project, combined_content).await?; + } + + Ok(()) +} + +// Simplified machine mode version of autonomous mode +async fn run_autonomous_machine( + mut agent: Agent, + project: Project, + show_prompt: bool, + show_code: bool, + max_turns: usize, + _quiet: bool, +) -> Result<()> { + println!("AUTONOMOUS_MODE_STARTED"); + println!("WORKSPACE: {}", project.workspace().display()); + println!("MAX_TURNS: {}", max_turns); + + // Check if requirements exist + if !project.has_requirements() { + println!("ERROR: requirements.md not found in workspace directory"); + return Ok(()); + } + + // Read requirements + let requirements = match project.read_requirements()? { + Some(content) => content, + None => { + println!("ERROR: Could not read requirements"); + return Ok(()); + } }; + println!("REQUIREMENTS_LOADED"); + + // For now, just execute a simple autonomous loop + // This is a simplified version - full implementation would need coach-player loop + let task = format!( + "You are G3 in implementation mode. Read and implement the following requirements:\n\n{}\n\nImplement this step by step, creating all necessary files and code.", + requirements + ); + + println!("TASK_START"); + let result = agent.execute_task_with_timing(&task, None, false, show_prompt, show_code, true).await?; + println!("AGENT_RESPONSE:"); + println!("{}", result.response); + println!("END_AGENT_RESPONSE"); + println!("TASK_END"); + + println!("AUTONOMOUS_MODE_ENDED"); + Ok(()) +} + +async fn run_with_console_mode( + mut agent: Agent, + cli: Cli, + project: Project, + combined_content: Option, +) -> Result<()> { + // Execute task, autonomous mode, or start interactive mode if cli.autonomous { // Autonomous mode with coach-player feedback loop - if !cli.retro { + if !cli.machine { info!("Starting autonomous mode"); } run_autonomous( @@ -520,7 +603,7 @@ Output ONLY the markdown content, no explanations or meta-commentary."#, .await?; } else if let Some(task) = cli.task { // Single-shot mode - if !cli.retro { + if !cli.machine { info!("Executing task: {}", task); } let output = SimpleOutput::new(); @@ -530,26 +613,43 @@ Output ONLY the markdown content, no explanations or meta-commentary."#, output.print_smart(&result.response); } else { // Interactive mode (default) - if !cli.retro { + if !cli.machine { info!("Starting interactive mode"); } + println!("📁 Workspace: {}", project.workspace().display()); + run_interactive(agent, cli.show_prompt, cli.show_code, combined_content).await?; + } - if cli.retro { - // Use retro terminal UI - run_interactive_retro( - config, // Already has overrides applied - cli.show_prompt, - cli.show_code, - cli.theme, - combined_content, - ) + Ok(()) +} + +async fn run_with_machine_mode( + mut agent: Agent, + cli: Cli, + project: Project, +) -> Result<()> { + if cli.autonomous { + // Autonomous mode with coach-player feedback loop + run_autonomous_machine( + agent, + project, + cli.show_prompt, + cli.show_code, + cli.max_turns, + cli.quiet, + ) + .await?; + } else if let Some(task) = cli.task { + // Single-shot mode + let result = agent + .execute_task_with_timing(&task, None, false, cli.show_prompt, cli.show_code, true) .await?; - } else { - // Use standard terminal UI - let output = SimpleOutput::new(); - output.print(&format!("📁 Workspace: {}", project.workspace().display())); - run_interactive(agent, cli.show_prompt, cli.show_code, combined_content).await?; - } + println!("AGENT_RESPONSE:"); + println!("{}", result.response); + println!("END_AGENT_RESPONSE"); + } else { + // Interactive mode + run_interactive_machine(agent, cli.show_prompt, cli.show_code).await?; } Ok(()) @@ -691,274 +791,6 @@ fn extract_readme_heading(readme_content: &str) -> Option { None } -async fn run_interactive_retro( - config: Config, - show_prompt: bool, - show_code: bool, - theme_name: Option, - combined_content: Option, -) -> Result<()> { - use crossterm::event::{self, Event, KeyCode, KeyModifiers}; - use std::time::Duration; - - // Set environment variable to suppress println in other crates - std::env::set_var("G3_RETRO_MODE", "1"); - - // Load the color theme - let theme = match ColorTheme::load(theme_name.as_deref()) { - Ok(t) => t, - Err(e) => { - eprintln!("Failed to load theme: {}. Using default.", e); - ColorTheme::default() - } - }; - - // Initialize the retro terminal UI - let tui = RetroTui::start(theme).await?; - - // Create agent with RetroTuiWriter - let ui_writer = RetroTuiWriter::new(tui.clone()); - let mut agent = Agent::new_with_readme_and_quiet(config, ui_writer, combined_content.clone(), false).await?; - - // Display initial system messages - tui.output("SYSTEM: AGENT ONLINE\n\n"); - - // Display message if AGENTS.md or README was loaded - if let Some(ref content) = combined_content { - // Check what was loaded - let has_agents = content.contains("Agent Configuration"); - let has_readme = content.contains("Project README"); - - if has_agents { - tui.output("SYSTEM: AGENT CONFIGURATION LOADED\n\n"); - } - - if has_readme { - // Extract the first heading or title from the README - let readme_snippet = extract_readme_heading(content) - .unwrap_or_else(|| "PROJECT DOCUMENTATION LOADED".to_string()); - - tui.output(&format!( - "SYSTEM: PROJECT README LOADED - {}\n\n", - readme_snippet - )); - } - } - tui.output("SYSTEM: READY FOR INPUT\n\n"); - tui.output("\n\n"); - - // Display provider and model information - match agent.get_provider_info() { - Ok((provider, model)) => { - tui.update_provider_info(&provider, &model); - } - Err(e) => { - tui.update_provider_info("ERROR", &e.to_string()); - } - } - - // Track multiline input - let mut multiline_buffer = String::new(); - let mut in_multiline = false; - - // Main event loop - loop { - // Update context window display - let context = agent.get_context_window(); - tui.update_context( - context.used_tokens, - context.total_tokens, - context.percentage_used(), - ); - - // Poll for keyboard events - if event::poll(Duration::from_millis(50))? { - if let Event::Key(key) = event::read()? { - match key.code { - KeyCode::Char('c') if key.modifiers.contains(KeyModifiers::CONTROL) => { - tui.exit(); - break; - } - KeyCode::Char('d') if key.modifiers.contains(KeyModifiers::CONTROL) => { - tui.exit(); - break; - } - // Emacs/bash-like shortcuts - KeyCode::Char('a') if key.modifiers.contains(KeyModifiers::CONTROL) => { - tui.cursor_home(); - } - KeyCode::Char('e') if key.modifiers.contains(KeyModifiers::CONTROL) => { - tui.cursor_end(); - } - KeyCode::Char('w') if key.modifiers.contains(KeyModifiers::CONTROL) => { - tui.delete_word(); - } - KeyCode::Char('k') if key.modifiers.contains(KeyModifiers::CONTROL) => { - tui.delete_to_end(); - } - KeyCode::Char('u') if key.modifiers.contains(KeyModifiers::CONTROL) => { - // Delete from beginning to cursor (similar to Ctrl-K but opposite direction) - let (input_buffer, cursor_pos) = tui.get_input_state(); - if cursor_pos > 0 { - let after = input_buffer.chars().skip(cursor_pos).collect::(); - tui.update_input(&after); - tui.cursor_home(); - } - } - KeyCode::Left => { - tui.cursor_left(); - } - KeyCode::Right => { - tui.cursor_right(); - } - KeyCode::Home if !key.modifiers.contains(KeyModifiers::CONTROL) => { - tui.cursor_home(); - } - KeyCode::End if !key.modifiers.contains(KeyModifiers::CONTROL) => { - tui.cursor_end(); - } - KeyCode::Delete => { - tui.delete_char(); - } - KeyCode::Enter => { - let (input_buffer, _) = tui.get_input_state(); - if !input_buffer.is_empty() { - // Clear the input for next command - tui.update_input(""); - let trimmed = input_buffer.trim_end(); - - // Check if line ends with backslash for continuation - if let Some(without_backslash) = trimmed.strip_suffix('\\') { - // Remove the backslash and add to buffer - multiline_buffer.push_str(without_backslash); - multiline_buffer.push('\n'); - in_multiline = true; - tui.status("MULTILINE INPUT"); - continue; - } - - // If we're in multiline mode and no backslash, this is the final line - let final_input = if in_multiline { - multiline_buffer.push_str(&input_buffer); - in_multiline = false; - let result = multiline_buffer.clone(); - multiline_buffer.clear(); - tui.status("READY"); - result - } else { - input_buffer.clone() - }; - - let input = final_input.trim().to_string(); - if input.is_empty() { - continue; - } - - if input == "exit" || input == "quit" { - tui.exit(); - break; - } - - // Execute the task - tui.output(&format!("> {}", input)); - tui.status("PROCESSING"); - - const MAX_TIMEOUT_RETRIES: u32 = 3; - let mut attempt = 0; - - loop { - attempt += 1; - - match agent - .execute_task_with_timing( - &input, - None, - false, - show_prompt, - show_code, - true, - ) - .await - { - Ok(result) => { - if attempt > 1 { - tui.output(&format!( - "SYSTEM: REQUEST SUCCEEDED AFTER {} ATTEMPTS", - attempt - )); - } - tui.output(&result.response); - tui.status("READY"); - break; - } - Err(e) => { - // Check if this is a timeout error that we should retry - let error_type = classify_error(&e); - - if matches!( - error_type, - ErrorType::Recoverable(RecoverableError::Timeout) - ) && attempt < MAX_TIMEOUT_RETRIES - { - // Calculate retry delay with exponential backoff - let delay_ms = 1000 * (2_u64.pow(attempt - 1)); - let delay = std::time::Duration::from_millis(delay_ms); - - tui.output(&format!("SYSTEM: TIMEOUT ERROR (ATTEMPT {}/{}). RETRYING IN {:?}...", - attempt, MAX_TIMEOUT_RETRIES, delay)); - tui.status("RETRYING"); - - // Wait before retrying - tokio::time::sleep(delay).await; - continue; - } - - // For non-timeout errors or after max retries - tui.error(&format!("Task execution failed: {}", e)); - tui.status("ERROR"); - break; - } - } - } - } - } - KeyCode::Char(c) => { - tui.insert_char(c); - } - KeyCode::Backspace => { - tui.backspace(); - } - KeyCode::Up => { - tui.scroll_up(); - } - KeyCode::Down => { - tui.scroll_down(); - } - KeyCode::PageUp => { - tui.scroll_page_up(); - } - KeyCode::PageDown => { - tui.scroll_page_down(); - } - KeyCode::Home if key.modifiers.contains(KeyModifiers::CONTROL) => { - tui.scroll_home(); // Ctrl+Home for scrolling to top - } - KeyCode::End if key.modifiers.contains(KeyModifiers::CONTROL) => { - tui.scroll_end(); // Ctrl+End for scrolling to bottom - } - _ => {} - } - } - } - - // Small delay to prevent CPU spinning - tokio::time::sleep(Duration::from_millis(10)).await; - } - - tui.output("SYSTEM: SHUTDOWN INITIATED"); - Ok(()) -} - async fn run_interactive( mut agent: Agent, show_prompt: bool, @@ -1109,7 +941,7 @@ async fn run_interactive( } "/thinnify" => { let summary = agent.force_thin(); - output.print_context_thinning(&summary); + println!("{}", summary); continue; } "/readme" => { @@ -1247,6 +1079,178 @@ async fn execute_task( } } +async fn run_interactive_machine( + mut agent: Agent, + show_prompt: bool, + show_code: bool, +) -> Result<()> { + println!("INTERACTIVE_MODE_STARTED"); + + // Display provider and model information + match agent.get_provider_info() { + Ok((provider, model)) => { + println!("PROVIDER: {}", provider); + println!("MODEL: {}", model); + } + Err(e) => { + println!("ERROR: Failed to get provider info: {}", e); + } + } + + // Initialize rustyline editor with history + let mut rl = DefaultEditor::new()?; + + // Try to load history from a file in the user's home directory + let history_file = dirs::home_dir().map(|mut path| { + path.push(".g3_history"); + path + }); + + if let Some(ref history_path) = history_file { + let _ = rl.load_history(history_path); + } + + loop { + let readline = rl.readline(""); + match readline { + Ok(line) => { + let input = line.trim().to_string(); + + if input.is_empty() { + continue; + } + + if input == "exit" || input == "quit" { + break; + } + + // Add to history + rl.add_history_entry(&input)?; + + // Check for control commands + if input.starts_with('/') { + match input.as_str() { + "/compact" => { + println!("COMMAND: compact"); + match agent.force_summarize().await { + Ok(true) => println!("RESULT: Summarization completed"), + Ok(false) => println!("RESULT: Summarization failed"), + Err(e) => println!("ERROR: {}", e), + } + continue; + } + "/thinnify" => { + println!("COMMAND: thinnify"); + let summary = agent.force_thin(); + println!("{}", summary); + continue; + } + _ => { + println!("ERROR: Unknown command: {}", input); + continue; + } + } + } + + // Execute task + println!("TASK_START"); + execute_task_machine(&mut agent, &input, show_prompt, show_code).await; + println!("TASK_END"); + } + Err(ReadlineError::Interrupted) => continue, + Err(ReadlineError::Eof) => break, + Err(err) => { + println!("ERROR: {:?}", err); + break; + } + } + } + + // Save history before exiting + if let Some(ref history_path) = history_file { + let _ = rl.save_history(history_path); + } + + println!("INTERACTIVE_MODE_ENDED"); + Ok(()) +} + +async fn execute_task_machine( + agent: &mut Agent, + input: &str, + show_prompt: bool, + show_code: bool, +) { + const MAX_TIMEOUT_RETRIES: u32 = 3; + let mut attempt = 0; + + // Create cancellation token for this request + let cancellation_token = CancellationToken::new(); + let cancel_token_clone = cancellation_token.clone(); + + loop { + attempt += 1; + + // Execute task with cancellation support + let execution_result = tokio::select! { + result = agent.execute_task_with_timing_cancellable( + input, None, false, show_prompt, show_code, true, cancellation_token.clone() + ) => { + result + } + _ = tokio::signal::ctrl_c() => { + cancel_token_clone.cancel(); + println!("CANCELLED"); + return; + } + }; + + match execution_result { + Ok(result) => { + if attempt > 1 { + println!("RETRY_SUCCESS: attempt {}", attempt); + } + println!("AGENT_RESPONSE:"); + println!("{}", result.response); + println!("END_AGENT_RESPONSE"); + return; + } + Err(e) => { + if e.to_string().contains("cancelled") { + println!("CANCELLED"); + return; + } + + // Check if this is a timeout error that we should retry + let error_type = classify_error(&e); + + if matches!( + error_type, + ErrorType::Recoverable(RecoverableError::Timeout) + ) && attempt < MAX_TIMEOUT_RETRIES + { + // Calculate retry delay with exponential backoff + let delay_ms = 1000 * (2_u64.pow(attempt - 1)); + let delay = std::time::Duration::from_millis(delay_ms); + + println!("TIMEOUT: attempt {} of {}, retrying in {:?}", attempt, MAX_TIMEOUT_RETRIES, delay); + + // Wait before retrying + tokio::time::sleep(delay).await; + continue; + } + + // For non-timeout errors or after max retries + println!("ERROR: {}", e); + if attempt > 1 { + println!("FAILED_AFTER_RETRIES: {}", attempt); + } + return; + } + } + } +} + fn handle_execution_error(e: &anyhow::Error, input: &str, output: &SimpleOutput, attempt: u32) { // Enhanced error logging with detailed information error!("=== TASK EXECUTION ERROR ==="); @@ -1280,16 +1284,13 @@ fn handle_execution_error(e: &anyhow::Error, input: &str, output: &SimpleOutput, fn display_context_progress(agent: &Agent, output: &SimpleOutput) { let context = agent.get_context_window(); - output.print_context( - context.used_tokens, - context.total_tokens, - context.percentage_used(), - ); + output.print(&format!("Context: {}/{} tokens ({:.1}%)", + context.used_tokens, context.total_tokens, context.percentage_used())); } /// Set up the workspace directory for autonomous mode /// Uses G3_WORKSPACE environment variable or defaults to ~/tmp/workspace -fn setup_workspace_directory() -> Result { +fn setup_workspace_directory(machine_mode: bool) -> Result { let workspace_dir = if let Ok(env_workspace) = std::env::var("G3_WORKSPACE") { PathBuf::from(env_workspace) } else { @@ -1302,7 +1303,7 @@ fn setup_workspace_directory() -> Result { // Create the directory if it doesn't exist if !workspace_dir.exists() { std::fs::create_dir_all(&workspace_dir)?; - let output = SimpleOutput::new(); + let output = SimpleOutput::new_with_mode(machine_mode); output.print(&format!( "📁 Created workspace directory: {}", workspace_dir.display() diff --git a/crates/g3-cli/src/machine_ui_writer.rs b/crates/g3-cli/src/machine_ui_writer.rs new file mode 100644 index 0000000..bc4e61b --- /dev/null +++ b/crates/g3-cli/src/machine_ui_writer.rs @@ -0,0 +1,93 @@ +use g3_core::ui_writer::UiWriter; +use std::io::{self, Write}; + +/// Machine-mode implementation of UiWriter that prints plain, unformatted output +/// This is designed for programmatic consumption and outputs everything verbatim +pub struct MachineUiWriter; + +impl MachineUiWriter { + pub fn new() -> Self { + Self + } +} + +impl UiWriter for MachineUiWriter { + fn print(&self, message: &str) { + print!("{}", message); + } + + fn println(&self, message: &str) { + println!("{}", message); + } + + fn print_inline(&self, message: &str) { + print!("{}", message); + let _ = io::stdout().flush(); + } + + fn print_system_prompt(&self, prompt: &str) { + println!("SYSTEM_PROMPT:"); + println!("{}", prompt); + println!("END_SYSTEM_PROMPT"); + println!(); + } + + fn print_context_status(&self, message: &str) { + println!("CONTEXT_STATUS: {}", message); + } + + fn print_context_thinning(&self, message: &str) { + println!("CONTEXT_THINNING: {}", message); + } + + fn print_tool_header(&self, tool_name: &str) { + println!("TOOL_CALL: {}", tool_name); + } + + fn print_tool_arg(&self, key: &str, value: &str) { + println!("TOOL_ARG: {} = {}", key, value); + } + + fn print_tool_output_header(&self) { + println!("TOOL_OUTPUT:"); + } + + fn update_tool_output_line(&self, line: &str) { + println!("{}", line); + } + + fn print_tool_output_line(&self, line: &str) { + println!("{}", line); + } + + fn print_tool_output_summary(&self, count: usize) { + println!("TOOL_OUTPUT_LINES: {}", count); + } + + fn print_tool_timing(&self, duration_str: &str) { + println!("TOOL_DURATION: {}", duration_str); + println!("END_TOOL_OUTPUT"); + println!(); + } + + fn print_agent_prompt(&self) { + let _ = io::stdout().flush(); + } + + fn print_agent_response(&self, content: &str) { + print!("{}", content); + let _ = io::stdout().flush(); + } + + fn notify_sse_received(&self) { + // No-op for machine mode + } + + fn flush(&self) { + let _ = io::stdout().flush(); + } + + fn wants_full_output(&self) -> bool { + true // Machine mode wants complete, untruncated output + } +} diff --git a/crates/g3-cli/src/simple_output.rs b/crates/g3-cli/src/simple_output.rs new file mode 100644 index 0000000..456da9e --- /dev/null +++ b/crates/g3-cli/src/simple_output.rs @@ -0,0 +1,32 @@ +/// Simple output helper for printing messages +pub struct SimpleOutput { + machine_mode: bool, +} + +impl SimpleOutput { + pub fn new() -> Self { + SimpleOutput { machine_mode: false } + } + + pub fn new_with_mode(machine_mode: bool) -> Self { + SimpleOutput { machine_mode } + } + + pub fn print(&self, message: &str) { + if !self.machine_mode { + println!("{}", message); + } + } + + pub fn print_smart(&self, message: &str) { + if !self.machine_mode { + println!("{}", message); + } + } +} + +impl Default for SimpleOutput { + fn default() -> Self { + Self::new() + } +} diff --git a/crates/g3-cli/src/ui_writer_impl.rs b/crates/g3-cli/src/ui_writer_impl.rs index ec1a203..2f336fd 100644 --- a/crates/g3-cli/src/ui_writer_impl.rs +++ b/crates/g3-cli/src/ui_writer_impl.rs @@ -1,8 +1,6 @@ -use crate::retro_tui::RetroTui; use g3_core::ui_writer::UiWriter; use std::io::{self, Write}; use std::sync::Mutex; -use std::time::Instant; /// Console implementation of UiWriter that prints to stdout pub struct ConsoleUiWriter { @@ -347,241 +345,3 @@ impl UiWriter for ConsoleUiWriter { } } -/// RetroTui implementation of UiWriter that sends output to the TUI -pub struct RetroTuiWriter { - tui: RetroTui, - current_tool_name: Mutex>, - current_tool_output: Mutex>, - current_tool_start: Mutex>, - current_tool_caption: Mutex, -} - -impl RetroTuiWriter { - pub fn new(tui: RetroTui) -> Self { - Self { - tui, - current_tool_name: Mutex::new(None), - current_tool_output: Mutex::new(Vec::new()), - current_tool_start: Mutex::new(None), - current_tool_caption: Mutex::new(String::new()), - } - } -} - -impl UiWriter for RetroTuiWriter { - fn print(&self, message: &str) { - self.tui.output(message); - } - - fn println(&self, message: &str) { - self.tui.output(message); - } - - fn print_inline(&self, message: &str) { - // For inline printing, we'll just append to the output - self.tui.output(message); - } - - fn print_system_prompt(&self, prompt: &str) { - self.tui.output("🔍 System Prompt:"); - self.tui.output("================"); - for line in prompt.lines() { - self.tui.output(line); - } - self.tui.output("================"); - self.tui.output(""); - } - - fn print_context_status(&self, message: &str) { - self.tui.output(message); - } - - fn print_context_thinning(&self, message: &str) { - // For TUI, we'll use a highlighted output with special formatting - // The TUI will handle the visual presentation - - // Add visual separators and emphasis - self.tui.output(""); - self.tui.output("═══════════════════════════════════════════════════════════"); - self.tui.output(&format!("✨ {} ✨", message)); - self.tui.output(" └─ Context optimized successfully"); - self.tui.output("═══════════════════════════════════════════════════════════"); - self.tui.output(""); - } - - fn print_tool_header(&self, tool_name: &str) { - // Start collecting tool output - *self.current_tool_start.lock().unwrap() = Some(Instant::now()); - *self.current_tool_name.lock().unwrap() = Some(tool_name.to_string()); - self.current_tool_output.lock().unwrap().clear(); - self.current_tool_output - .lock() - .unwrap() - .push(format!("Tool: {}", tool_name)); - - // Initialize caption - *self.current_tool_caption.lock().unwrap() = String::new(); - } - - fn print_tool_arg(&self, key: &str, value: &str) { - // Filter out any keys that look like they might be agent message content - // (e.g., keys that are suspiciously long or contain message-like content) - let is_valid_arg_key = key.len() < 50 - && !key.contains('\n') - && !key.contains("I'll") - && !key.contains("Let me") - && !key.contains("Here's") - && !key.contains("I can"); - - if is_valid_arg_key { - self.current_tool_output - .lock() - .unwrap() - .push(format!("{}: {}", key, value)); - } - - // Build caption from first argument (usually the most important one) - let mut caption = self.current_tool_caption.lock().unwrap(); - if caption.is_empty() && (key == "file_path" || key == "command" || key == "path") { - // Truncate long values for the caption - let truncated = if value.len() > 50 { - // Use char_indices to safely truncate at character boundary - let truncate_at = value.char_indices() - .nth(47) - .map(|(i, _)| i) - .unwrap_or(value.len()); - format!("{}...", &value[..truncate_at]) - } else { - value.to_string() - }; - - // Add range information for read_file tool calls - let tool_name = self.current_tool_name.lock().unwrap(); - let range_suffix = if tool_name.as_ref().is_some_and(|name| name == "read_file") { - // We need to check if start/end args will be provided - for now just check if this is a partial read - // This is a simplified approach since we're building the caption incrementally - String::new() // We'll handle this in print_tool_output_header instead - } else { - String::new() - }; - - *caption = format!("{}{}", truncated, range_suffix); - } - } - - fn print_tool_output_header(&self) { - // This is called right before tool execution starts - // Send the initial tool header to the TUI now - if let Some(tool_name) = self.current_tool_name.lock().unwrap().as_ref() { - let mut caption = self.current_tool_caption.lock().unwrap().clone(); - - // Add range information for read_file tool calls - if tool_name == "read_file" { - // Check the tool output for start/end parameters - let output = self.current_tool_output.lock().unwrap(); - let has_start = output.iter().any(|line| line.starts_with("start:")); - let has_end = output.iter().any(|line| line.starts_with("end:")); - - if has_start || has_end { - let start_val = output.iter().find(|line| line.starts_with("start:")).map(|line| line.split(':').nth(1).unwrap_or("0").trim()).unwrap_or("0"); - let end_val = output.iter().find(|line| line.starts_with("end:")).map(|line| line.split(':').nth(1).unwrap_or("end").trim()).unwrap_or("end"); - caption = format!("{} [{}..{}]", caption, start_val, end_val); - } - } - - // Send the tool output with initial header - self.tui.tool_output(tool_name, &caption, ""); - } - - self.current_tool_output.lock().unwrap().push(String::new()); - self.current_tool_output - .lock() - .unwrap() - .push("Output:".to_string()); - } - - fn update_tool_output_line(&self, line: &str) { - // For retro mode, we'll just add to the output buffer - self.current_tool_output - .lock() - .unwrap() - .push(line.to_string()); - } - - fn print_tool_output_line(&self, line: &str) { - self.current_tool_output - .lock() - .unwrap() - .push(line.to_string()); - } - - fn print_tool_output_summary(&self, hidden_count: usize) { - self.current_tool_output.lock().unwrap().push(format!( - "... ({} more line{})", - hidden_count, - if hidden_count == 1 { "" } else { "s" } - )); - } - - fn print_tool_timing(&self, duration_str: &str) { - self.current_tool_output - .lock() - .unwrap() - .push(format!("⚡️ {}", duration_str)); - - // Calculate the actual duration - let duration_ms = if let Some(start) = *self.current_tool_start.lock().unwrap() { - start.elapsed().as_millis() - } else { - 0 - }; - - // Get the tool name and caption - if let Some(tool_name) = self.current_tool_name.lock().unwrap().as_ref() { - let content = self.current_tool_output.lock().unwrap().join("\n"); - let caption = self.current_tool_caption.lock().unwrap().clone(); - let caption = if caption.is_empty() { - "Completed".to_string() - } else { - caption - }; - - // Update the tool detail panel with the complete output without adding a new header - // This keeps the original header in place to be updated by tool_complete - self.tui.update_tool_detail(tool_name, &content); - - // Determine success based on whether there's an error in the output - // This is a simple heuristic - you might want to make this more sophisticated - let success = !content.contains("error") - && !content.contains("Error") - && !content.contains("ERROR"); - - // Send the completion status to update the header - self.tui - .tool_complete(tool_name, success, duration_ms, &caption); - } - - // Clear the buffers - *self.current_tool_name.lock().unwrap() = None; - self.current_tool_output.lock().unwrap().clear(); - *self.current_tool_start.lock().unwrap() = None; - *self.current_tool_caption.lock().unwrap() = String::new(); - } - - fn print_agent_prompt(&self) { - self.tui.output("\n💬 "); - } - - fn print_agent_response(&self, content: &str) { - self.tui.output(content); - } - - fn notify_sse_received(&self) { - // Notify the TUI that an SSE was received - self.tui.sse_received(); - } - - fn flush(&self) { - // No-op for TUI since it handles its own rendering - } -} diff --git a/crates/g3-core/src/lib.rs b/crates/g3-core/src/lib.rs index b32dce9..6b3d991 100644 --- a/crates/g3-core/src/lib.rs +++ b/crates/g3-core/src/lib.rs @@ -2677,12 +2677,19 @@ Template: if tool_call.tool != "final_output" { let output_lines: Vec<&str> = tool_result.lines().collect(); + // Check if UI wants full output (machine mode) or truncated (human mode) + let wants_full = self.ui_writer.wants_full_output(); + // Helper function to safely truncate strings at character boundaries - let truncate_line = |line: &str, max_width: usize| -> String { - let char_count = line.chars().count(); - if char_count <= max_width { + let truncate_line = |line: &str, max_width: usize, truncate: bool| -> String { + if !truncate { + // Machine mode - return full line + line.to_string() + } else if line.chars().count() <= max_width { + // Human mode - line fits within limit line.to_string() } else { + // Human mode - truncate long line let truncated: String = line .chars() .take(max_width.saturating_sub(3)) @@ -2697,18 +2704,18 @@ Template: // For todo tools, show all lines without truncation let is_todo_tool = tool_call.tool == "todo_read" || tool_call.tool == "todo_write"; - let max_lines_to_show = if is_todo_tool { output_len } else { MAX_LINES }; + let max_lines_to_show = if is_todo_tool || wants_full { output_len } else { MAX_LINES }; for (idx, line) in output_lines.iter().enumerate() { - if !is_todo_tool && idx >= max_lines_to_show { + if !is_todo_tool && !wants_full && idx >= max_lines_to_show { break; } // Clip line to max width - let clipped_line = truncate_line(line, MAX_LINE_WIDTH); + let clipped_line = truncate_line(line, MAX_LINE_WIDTH, !wants_full); self.ui_writer.update_tool_output_line(&clipped_line); } - if !is_todo_tool && output_len > MAX_LINES { + if !is_todo_tool && !wants_full && output_len > MAX_LINES { self.ui_writer.print_tool_output_summary(output_len); } } diff --git a/crates/g3-core/src/ui_writer.rs b/crates/g3-core/src/ui_writer.rs index b907ea6..49e29b9 100644 --- a/crates/g3-core/src/ui_writer.rs +++ b/crates/g3-core/src/ui_writer.rs @@ -52,6 +52,10 @@ pub trait UiWriter: Send + Sync { /// Flush any buffered output fn flush(&self); + + /// Returns true if this UI writer wants full, untruncated output + /// Default is false (truncate for human readability) + fn wants_full_output(&self) -> bool { false } } /// A no-op implementation for when UI output is not needed @@ -75,4 +79,5 @@ impl UiWriter for NullUiWriter { fn print_agent_response(&self, _content: &str) {} fn notify_sse_received(&self) {} fn flush(&self) {} + fn wants_full_output(&self) -> bool { false } } \ No newline at end of file From a4476a555cfce71a084d069e7e70ba7c88248f8f Mon Sep 17 00:00:00 2001 From: Dhanji Prasanna Date: Mon, 27 Oct 2025 13:32:14 +1100 Subject: [PATCH 14/16] minor --- Cargo.lock | 75 +++++++++++++------------- crates/g3-cli/src/machine_ui_writer.rs | 1 + crates/g3-core/src/lib.rs | 22 +++++--- 3 files changed, 55 insertions(+), 43 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a09efd0..7ec765c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -318,9 +318,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.41" +version = "1.2.43" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac9fe6cdbb24b6ade63616c0a0688e45bb56732262c158df3c0c4bea4ca47cb7" +checksum = "739eb0f94557554b3ca9a86d2d37bebd49c5e6d0c1d2bda35ba5bdac830befc2" dependencies = [ "find-msvc-tools", "jobserver", @@ -900,9 +900,9 @@ dependencies = [ [[package]] name = "deranged" -version = "0.5.4" +version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a41953f86f8a05768a6cda24def994fd2f424b04ec5c719cf89989779f199071" +checksum = "ececcb659e7ba858fb4f10388c250a7252eb0a27373f1a72b8748afdd248e587" dependencies = [ "powerfmt", ] @@ -990,7 +990,7 @@ dependencies = [ "libc", "option-ext", "redox_users 0.5.2", - "windows-sys 0.61.2", + "windows-sys 0.59.0", ] [[package]] @@ -1015,9 +1015,9 @@ dependencies = [ [[package]] name = "document-features" -version = "0.2.11" +version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95249b50c6c185bee49034bcb378a49dc2b5dff0be90ff6616d31d64febab05d" +checksum = "d4b8a88685455ed29a21542a33abd9cb6510b6b129abadabdcef0f4c55bc8f61" dependencies = [ "litrs", ] @@ -1062,7 +1062,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.61.2", + "windows-sys 0.52.0", ] [[package]] @@ -1144,9 +1144,9 @@ checksum = "52051878f80a721bb68ebfbc930e07b65ba72f2da88968ea5c06fd6ca3d3a127" [[package]] name = "flate2" -version = "1.1.4" +version = "1.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc5a4e564e38c699f2880d3fda590bedc2e69f3f84cd48b457bd892ce61d0aa9" +checksum = "bfe33edd8e85a12a67454e37f8c75e730830d83e313556ab9ebf9ee7fbeb3bfb" dependencies = [ "crc32fast", "miniz_oxide", @@ -1571,11 +1571,11 @@ checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" [[package]] name = "home" -version = "0.5.11" +version = "0.5.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "589533453244b0995c858700322199b2becb13b627df2851f64a2775d024abcf" +checksum = "e3d1354bf6b7235cb4a0576c2619fd4ed18183f689b12b006a0ee7329eeff9a5" dependencies = [ - "windows-sys 0.59.0", + "windows-sys 0.52.0", ] [[package]] @@ -1922,9 +1922,12 @@ dependencies = [ [[package]] name = "indoc" -version = "2.0.6" +version = "2.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4c7245a08504955605670dbf141fceab975f15ca21570696aebe9d2e71576bd" +checksum = "79cf5c93f93228cf8efb3ba362535fb11199ac548a09ce117c9b1adc3030d706" +dependencies = [ + "rustversion", +] [[package]] name = "instability" @@ -1947,9 +1950,9 @@ checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130" [[package]] name = "is_terminal_polyfill" -version = "1.70.1" +version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" [[package]] name = "itertools" @@ -2133,9 +2136,9 @@ checksum = "241eaef5fd12c88705a01fc1066c48c4b36e0dd4377dcdc7ec3942cea7a69956" [[package]] name = "litrs" -version = "0.4.2" +version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f5e54036fe321fd421e10d732f155734c4e4afd610dd556d9a82833ab3ee0bed" +checksum = "11d3d7f243d5c5a8b9bb5d6dd2b1602c0cb0b9db1621bafc7ed66e35ff9fe092" [[package]] name = "llama_cpp" @@ -2251,14 +2254,14 @@ dependencies = [ [[package]] name = "mio" -version = "1.0.4" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78bed444cc8a2160f01cbcf811ef18cac863ad68ae8ca62092e8db51d51c761c" +checksum = "69d83b0086dc8ecf3ce9ae2874b2d1290252e2a30720bea58a5c6639b0092873" dependencies = [ "libc", "log", "wasi", - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] @@ -2330,7 +2333,7 @@ version = "0.50.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" dependencies = [ - "windows-sys 0.61.2", + "windows-sys 0.59.0", ] [[package]] @@ -2406,9 +2409,9 @@ checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" [[package]] name = "once_cell_polyfill" -version = "1.70.1" +version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" [[package]] name = "openssl" @@ -2627,9 +2630,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.101" +version = "1.0.103" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89ae43fd86e4158d6db51ad8e2b80f313af9cc74f5c0e03ccb87de09998732de" +checksum = "5ee95bc4ef87b8d5ba32e8b7714ccc834865276eab0aed5c9958d00ec45f49e8" dependencies = [ "unicode-ident", ] @@ -2901,7 +2904,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys 0.11.0", - "windows-sys 0.61.2", + "windows-sys 0.52.0", ] [[package]] @@ -3122,9 +3125,9 @@ dependencies = [ [[package]] name = "signal-hook-mio" -version = "0.2.4" +version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34db1a06d485c9142248b7a054f034b349b212551f3dfd19c94d45a754a217cd" +checksum = "b75a19a7a740b25bc7944bdee6172368f988763b744e3d4dfe753f6b4ece40cc" dependencies = [ "libc", "mio", @@ -3226,9 +3229,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.107" +version = "2.0.108" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a26dbd934e5451d21ef060c018dae56fc073894c5a7896f882928a76e6d081b" +checksum = "da58917d35242480a05c2897064da0a80589a2a0476c9a3f2fdc83b53502e917" dependencies = [ "proc-macro2", "quote", @@ -3289,7 +3292,7 @@ dependencies = [ "getrandom 0.3.4", "once_cell", "rustix 1.1.2", - "windows-sys 0.61.2", + "windows-sys 0.52.0", ] [[package]] @@ -3631,9 +3634,9 @@ checksum = "2896d95c02a80c6d6a5d6e953d479f5ddf2dfdb6a244441010e373ac0fb88971" [[package]] name = "unicode-ident" -version = "1.0.19" +version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f63a545481291138910575129486daeaf8ac54aee4387fe7906919f7830c7d9d" +checksum = "462eeb75aeb73aea900253ce739c8e18a67423fadf006037cd3ff27e82748a06" [[package]] name = "unicode-segmentation" @@ -3932,7 +3935,7 @@ version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" dependencies = [ - "windows-sys 0.61.2", + "windows-sys 0.48.0", ] [[package]] diff --git a/crates/g3-cli/src/machine_ui_writer.rs b/crates/g3-cli/src/machine_ui_writer.rs index bc4e61b..0d97292 100644 --- a/crates/g3-cli/src/machine_ui_writer.rs +++ b/crates/g3-cli/src/machine_ui_writer.rs @@ -71,6 +71,7 @@ impl UiWriter for MachineUiWriter { } fn print_agent_prompt(&self) { + println!("AGENT_RESPONSE:"); let _ = io::stdout().flush(); } diff --git a/crates/g3-core/src/lib.rs b/crates/g3-core/src/lib.rs index 6b3d991..4f2ab08 100644 --- a/crates/g3-core/src/lib.rs +++ b/crates/g3-core/src/lib.rs @@ -2675,7 +2675,12 @@ Template: // Display tool execution result with proper indentation if tool_call.tool != "final_output" { - let output_lines: Vec<&str> = tool_result.lines().collect(); + // Skip displaying output for shell tool since it was already streamed + let should_display_output = tool_call.tool != "shell"; + + let output_lines: Vec<&str> = if should_display_output { + tool_result.lines().collect() + } else { vec![] }; // Check if UI wants full output (machine mode) or truncated (human mode) let wants_full = self.ui_writer.wants_full_output(); @@ -3186,13 +3191,16 @@ Template: { Ok(result) => { if result.success { - Ok(if result.stdout.is_empty() { - "✅ Command executed successfully".to_string() - } else { - result.stdout.trim().to_string() - }) + // Don't return stdout - it was already streamed to the UI + // Returning it would cause duplicate output + Ok("✅ Command executed successfully".to_string()) } else { - Ok(format!("❌ Command failed: {}", result.stderr.trim())) + // For errors, return stderr since it wasn't streamed + Ok(if result.stderr.is_empty() { + "❌ Command failed".to_string() + } else { + format!("❌ Command failed: {}", result.stderr.trim()) + }) } } Err(e) => Ok(format!("❌ Execution error: {}", e)), From 98f4220544c7e6395580238f07c740d277e438b6 Mon Sep 17 00:00:00 2001 From: Dhanji Prasanna Date: Mon, 27 Oct 2025 13:48:46 +1100 Subject: [PATCH 15/16] Fix duplicate dump at end --- crates/g3-core/src/lib.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crates/g3-core/src/lib.rs b/crates/g3-core/src/lib.rs index 4f2ab08..bad045f 100644 --- a/crates/g3-core/src/lib.rs +++ b/crates/g3-core/src/lib.rs @@ -2727,7 +2727,8 @@ Template: // Check if this was a final_output tool call if tool_call.tool == "final_output" { - full_response.push_str(final_display_content); + // Don't add final_display_content here - it was already added before tool execution + // Adding it again would duplicate the output if let Some(summary) = tool_call.args.get("summary") { if let Some(summary_str) = summary.as_str() { full_response.push_str(&format!("\n\n{}", summary_str)); From 7c2c43374635218f298a38808487202481fdcb74 Mon Sep 17 00:00:00 2001 From: Dhanji Prasanna Date: Tue, 28 Oct 2025 12:35:58 +1100 Subject: [PATCH 16/16] control commands for machine mode --- crates/g3-cli/src/lib.rs | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/crates/g3-cli/src/lib.rs b/crates/g3-cli/src/lib.rs index 5b64b65..355cffc 100644 --- a/crates/g3-cli/src/lib.rs +++ b/crates/g3-cli/src/lib.rs @@ -1145,6 +1145,27 @@ async fn run_interactive_machine( println!("{}", summary); continue; } + "/readme" => { + println!("COMMAND: readme"); + match agent.reload_readme() { + Ok(true) => println!("RESULT: README content reloaded successfully"), + Ok(false) => println!("RESULT: No README was loaded at startup, cannot reload"), + Err(e) => println!("ERROR: {}", e), + } + continue; + } + "/stats" => { + println!("COMMAND: stats"); + let stats = agent.get_stats(); + // Emit stats as structured data (name: value pairs) + println!("{}", stats); + continue; + } + "/help" => { + println!("COMMAND: help"); + println!("AVAILABLE_COMMANDS: /compact /thinnify /readme /stats /help"); + continue; + } _ => { println!("ERROR: Unknown command: {}", input); continue;