diff --git a/crates/g3-core/src/paths.rs b/crates/g3-core/src/paths.rs index 66d7d15..a52d799 100644 --- a/crates/g3-core/src/paths.rs +++ b/crates/g3-core/src/paths.rs @@ -110,6 +110,17 @@ pub fn get_context_summary_file(session_id: &str) -> PathBuf { get_session_logs_dir(session_id).join("context_summary.txt") } +/// Get the tools output directory for a session. +/// Returns .g3/sessions//tools/ +pub fn get_tools_output_dir(session_id: &str) -> PathBuf { + get_session_logs_dir(session_id).join("tools") +} + +/// Generate a short unique ID (first 8 chars of UUID v4). +pub fn generate_short_id() -> String { + uuid::Uuid::new_v4().to_string()[..8].to_string() +} + #[cfg(test)] mod tests { use super::*; diff --git a/crates/g3-core/src/tools/file_ops.rs b/crates/g3-core/src/tools/file_ops.rs index 26539d0..8219c7b 100644 --- a/crates/g3-core/src/tools/file_ops.rs +++ b/crates/g3-core/src/tools/file_ops.rs @@ -1,6 +1,9 @@ //! File operation tools: read_file, write_file, str_replace, read_image. 
use anyhow::Result; +use std::fs::File; +use std::io::{BufReader, Read, Seek, SeekFrom}; +use std::path::Path; use tracing::debug; use crate::ui_writer::UiWriter; @@ -55,7 +58,7 @@ pub async fn execute_read_file( tool_call: &ToolCall, ctx: &ToolContext<'_, W>, ) -> Result { - debug!("Processing read_file tool call"); + debug!("Processing read_file tool call (optimized with seek)"); let file_path = match tool_call.args.get("file_path").and_then(|v| v.as_str()) { Some(p) => p, @@ -85,101 +88,164 @@ pub async fn execute_read_file( path_str, start_char, end_char ); - match std::fs::read_to_string(path_str) { - Ok(content) => { - let total_file_len = content.len(); - - // Calculate token-aware limit for the content we're about to read - let read_limit = calculate_read_limit( - total_file_len, - ctx.context_total_tokens, - ctx.context_used_tokens, - ); + // Get file metadata for size without reading content + let path = Path::new(path_str); + let metadata = match std::fs::metadata(path) { + Ok(m) => m, + Err(e) => return Ok(format!("❌ Failed to read file '{}': {}", path_str, e)), + }; + let total_file_len = metadata.len() as usize; - // Validate user-specified range - let user_start = start_char.unwrap_or(0); - if user_start > total_file_len { - // Start exceeds file length - read last 100 chars instead - let fallback_start = total_file_len.saturating_sub(100); - let fallback_content = &content[fallback_start..]; - let line_count = fallback_content.lines().count(); - return Ok(format!( - "{}\nπŸ” {} lines read (start {} exceeded length {}, showing last {} chars)", - fallback_content, line_count, user_start, total_file_len, total_file_len - fallback_start - )); - } - - let user_end = end_char.unwrap_or(total_file_len); - // Clamp end position to file length (don't error, just read what's available) - let (user_end, end_was_clamped) = if user_end > total_file_len { - (total_file_len, true) - } else { - (user_end, false) - }; - - if user_start > user_end { - return 
Ok(format!( - "❌ Start position {} is greater than end position {}", - user_start, user_end - )); - } + // Calculate token-aware limit + let read_limit = calculate_read_limit( + total_file_len, + ctx.context_total_tokens, + ctx.context_used_tokens, + ); - // Calculate the range we'll actually read - let user_range_len = user_end - user_start; - - // Determine if we need to apply token-aware limiting - let (effective_end, was_truncated) = match read_limit { - Some(max_bytes) if user_range_len > max_bytes => { - // Truncate to max_bytes from the start position - (user_start + max_bytes, true) - } - _ => (user_end, false), - }; - - // Extract the requested portion, ensuring we're at char boundaries - let start_boundary = if user_start == 0 { - 0 - } else { - content - .char_indices() - .find(|(i, _)| *i >= user_start) - .map(|(i, _)| i) - .unwrap_or(user_start) - }; - let end_boundary = content - .char_indices() - .find(|(i, _)| *i >= effective_end) - .map(|(i, _)| i) - .unwrap_or(total_file_len); - - let partial_content = &content[start_boundary..end_boundary]; - let line_count = partial_content.lines().count(); - - // Format output based on whether truncation occurred - if was_truncated { - // Token-aware truncation header - let context_pct = (ctx.context_used_tokens as f32 / ctx.context_total_tokens as f32 * 100.0) as u32; - Ok(format!( - "{}\nπŸ” {} lines read (truncated, chars {}-{} of {}, context {}%)", - partial_content, line_count, start_boundary, end_boundary, total_file_len, context_pct - )) - } else if end_was_clamped { - // End position exceeded file length, clamped to actual length - Ok(format!( - "{}\nπŸ” {} lines read (chars {}-{}, end clamped from {} to file length {})", - partial_content, line_count, start_boundary, end_boundary, end_char.unwrap(), total_file_len - )) - } else if start_char.is_some() || end_char.is_some() { - Ok(format!( - "{}\nπŸ” {} lines read (chars {}-{})", - partial_content, line_count, start_boundary, end_boundary - )) - } 
else { - Ok(format!("{}\nπŸ” {} lines read", content, line_count)) - } - } - Err(e) => Ok(format!("❌ Failed to read file '{}': {}", path_str, e)), + // Validate user-specified range + let user_start = start_char.unwrap_or(0); + let user_end = end_char.unwrap_or(total_file_len); + + // Clamp end position to file length + let (user_end, end_was_clamped) = if user_end > total_file_len { + (total_file_len, true) + } else { + (user_end, false) + }; + + if user_start > user_end { + return Ok(format!( + "❌ Start position {} is greater than end position {}", + user_start, user_end + )); } + + // Calculate the range we'll actually read + let user_range_len = user_end - user_start; + + // Determine if we need to apply token-aware limiting + let (effective_end, was_truncated) = match read_limit { + Some(max_bytes) if user_range_len > max_bytes => { + (user_start + max_bytes, true) + } + _ => (user_end, false), + }; + + // Handle start exceeding file length + if user_start >= total_file_len { + // Read last 100 bytes instead + let fallback_start = total_file_len.saturating_sub(100); + let content = read_file_range(path, fallback_start, total_file_len)?; + let line_count = content.lines().count(); + return Ok(format!( + "{}\nπŸ” {} lines read (start {} exceeded length {}, showing last {} chars)", + content, line_count, user_start, total_file_len, total_file_len - fallback_start + )); + } + + // Use optimized seek-based reading + let content = read_file_range(path, user_start, effective_end)?; + let line_count = content.lines().count(); + + // Format output based on whether truncation occurred + if was_truncated { + let context_pct = (ctx.context_used_tokens as f32 / ctx.context_total_tokens as f32 * 100.0) as u32; + Ok(format!( + "{}\nπŸ” {} lines read (truncated, chars {}-{} of {}, context {}%)", + content, line_count, user_start, effective_end, total_file_len, context_pct + )) + } else if end_was_clamped { + Ok(format!( + "{}\nπŸ” {} lines read (chars {}-{}, end clamped 
from {} to file length {})", + content, line_count, user_start, effective_end, end_char.unwrap(), total_file_len + )) + } else if start_char.is_some() || end_char.is_some() { + Ok(format!( + "{}\nπŸ” {} lines read (chars {}-{})", + content, line_count, user_start, effective_end + )) + } else { + Ok(format!("{}\nπŸ” {} lines read", content, line_count)) + } +} + +/// Read a specific byte range from a file using seek (O(1) seek + O(n) read where n = range size). +/// Handles UTF-8 boundary issues by extending the read slightly and trimming invalid chars. +fn read_file_range(path: &Path, start: usize, end: usize) -> Result { + let file = File::open(path)?; + let mut reader = BufReader::new(file); + + // For UTF-8 safety, we may need to adjust boundaries. + // UTF-8 characters are 1-4 bytes, so we read up to 3 extra bytes at start + // to find a valid character boundary. + + // Calculate how far back we might need to look for a char boundary + let safe_start = start.saturating_sub(3); + let extra_at_start = start - safe_start; + + // Read a few extra bytes at the end to complete any partial char + let extra_at_end = 3; + + // Seek to safe start position + reader.seek(SeekFrom::Start(safe_start as u64))?; + + // Read the extended range + let bytes_to_read = (end - safe_start) + extra_at_end; + let mut buffer = vec![0u8; bytes_to_read]; + let bytes_read = reader.read(&mut buffer)?; + buffer.truncate(bytes_read); + + // Convert to string - this should work since we read the whole file originally as UTF-8 + // But we need to find valid boundaries within our extended read + let full_str = match std::str::from_utf8(&buffer) { + Ok(s) => s.to_string(), + Err(_) => { + // If the whole buffer isn't valid UTF-8, try to find valid boundaries + // This can happen with binary files or corrupted data + return Ok(String::from_utf8_lossy(&buffer).into_owned()); + } + }; + + // Now we need to trim to the actual requested range + // We read from safe_start, but user wants from start + 
// So we need to skip `extra_at_start` bytes worth of characters + + if extra_at_start == 0 && bytes_read <= (end - start) + extra_at_end { + // Simple case: we started at the right place + // Just trim any extra at the end + let target_len = end - start; + if full_str.len() <= target_len { + return Ok(full_str); + } + // Find char boundary at target_len + let end_idx = full_str + .char_indices() + .take_while(|(i, _)| *i < target_len) + .last() + .map(|(i, c)| i + c.len_utf8()) + .unwrap_or(full_str.len()); + return Ok(full_str[..end_idx].to_string()); + } + + // Complex case: we read extra at the start, need to skip those bytes + // Find the character that starts at or after `extra_at_start` bytes + let start_idx = full_str + .char_indices() + .find(|(i, _)| *i >= extra_at_start) + .map(|(i, _)| i) + .unwrap_or(0); + + // Calculate target end based on original request + let target_byte_len = end - start; + let end_idx = full_str + .char_indices() + .take_while(|(i, _)| *i < start_idx + target_byte_len) + .last() + .map(|(i, c)| i + c.len_utf8()) + .unwrap_or(full_str.len()); + + Ok(full_str[start_idx..end_idx.min(full_str.len())].to_string()) } /// Execute the `read_image` tool. diff --git a/crates/g3-core/src/tools/shell.rs b/crates/g3-core/src/tools/shell.rs index 4fa665c..58e0b60 100644 --- a/crates/g3-core/src/tools/shell.rs +++ b/crates/g3-core/src/tools/shell.rs @@ -1,8 +1,10 @@ //! Shell command execution tools. 
use anyhow::Result; +use std::fs; use tracing::debug; +use crate::paths::{generate_short_id, get_tools_output_dir}; use crate::ui_writer::UiWriter; use crate::utils::resolve_paths_in_shell_command; use crate::utils::shell_escape_command; @@ -10,6 +12,62 @@ use crate::ToolCall; use super::executor::ToolContext; +/// Threshold for truncating output (8KB) +const OUTPUT_TRUNCATE_THRESHOLD: usize = 8 * 1024; + +/// Number of characters to show in truncated output head +const TRUNCATED_HEAD_SIZE: usize = 500; + +/// Truncate output if it exceeds the threshold, saving full content to a file. +/// +/// If the output is larger than OUTPUT_TRUNCATE_THRESHOLD: +/// 1. Saves the full output to `.g3/sessions//tools/__.txt` +/// 2. Returns the first TRUNCATED_HEAD_SIZE chars with a message pointing to the file +/// +/// If session_id is None, returns the original output unchanged. +fn truncate_large_output( + output: &str, + session_id: Option<&str>, + tool_name: &str, + stream_name: &str, // "stdout" or "stderr" +) -> String { + // If output is small enough or no session, return as-is + if output.len() <= OUTPUT_TRUNCATE_THRESHOLD || session_id.is_none() { + return output.to_string(); + } + + let session_id = session_id.unwrap(); + let output_id = generate_short_id(); + let tools_dir = get_tools_output_dir(session_id); + + // Create tools directory if needed + if let Err(e) = fs::create_dir_all(&tools_dir) { + debug!("Failed to create tools output dir: {}", e); + return output.to_string(); + } + + let filename = format!("{}_{}.txt", tool_name, output_id); + let file_path = tools_dir.join(&filename); + + // Save full output to file + if let Err(e) = fs::write(&file_path, output) { + debug!("Failed to save large output to file: {}", e); + return output.to_string(); + } + + // Truncate to first TRUNCATED_HEAD_SIZE chars (UTF-8 safe) + let head: String = output.chars().take(TRUNCATED_HEAD_SIZE).collect(); + let total_chars = output.chars().count(); + + format!( + "{}\n\n[[ {} 
TRUNCATED ({} total chars) ]]\nFull output saved to: {}\nUse read_file to see more.", + head, + stream_name.to_uppercase(), + total_chars, + file_path.display() + ) +} + /// Execute the `shell` tool. pub async fn execute_shell(tool_call: &ToolCall, ctx: &ToolContext<'_, W>) -> Result { debug!("Processing shell tool call"); @@ -55,20 +113,40 @@ pub async fn execute_shell(tool_call: &ToolCall, ctx: &ToolContext< { Ok(result) => { if result.success { - Ok(if result.stdout.is_empty() { - "⚑️ ran successfully".to_string() + if result.stdout.is_empty() { + Ok("⚑️ ran successfully".to_string()) } else { - result.stdout.trim().to_string() - }) + let stdout = result.stdout.trim(); + let truncated = truncate_large_output( + stdout, + ctx.session_id, + "shell_stdout", + "stdout", + ); + Ok(truncated) + } } else { // Build error message with available information let stderr = result.stderr.trim(); let stdout = result.stdout.trim(); + if !stderr.is_empty() { - Ok(format!("❌ {}", stderr)) + let truncated = truncate_large_output( + stderr, + ctx.session_id, + "shell_stderr", + "stderr", + ); + Ok(format!("❌ {}", truncated)) } else if !stdout.is_empty() { // Sometimes error info is in stdout - Ok(format!("❌ Exit code {}: {}", result.exit_code, stdout)) + let truncated = truncate_large_output( + stdout, + ctx.session_id, + "shell_stdout", + "stdout", + ); + Ok(format!("❌ Exit code {}: {}", result.exit_code, truncated)) } else { Ok(format!("❌ Command failed with exit code {}", result.exit_code)) } @@ -127,3 +205,64 @@ pub async fn execute_background_process( Err(e) => Ok(format!("❌ Failed to start background process: {}", e)), } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_truncate_small_output() { + let output = "small output"; + let result = truncate_large_output(output, Some("test-session"), "shell", "stdout"); + assert_eq!(result, output); + } + + #[test] + fn test_truncate_no_session() { + let output = "x".repeat(10000); + let result = 
truncate_large_output(&output, None, "shell", "stdout"); + assert_eq!(result, output); + } + + #[test] + fn test_truncate_large_output_format() { + let large_output = "x".repeat(10000); + + assert!(large_output.len() > OUTPUT_TRUNCATE_THRESHOLD); + + // Test UTF-8 safe truncation + let head: String = large_output.chars().take(TRUNCATED_HEAD_SIZE).collect(); + assert_eq!(head.len(), TRUNCATED_HEAD_SIZE); + } + + #[test] + fn test_truncate_utf8_safe() { + // Test with multi-byte characters + let emoji_output = "πŸŽ‰".repeat(5000); // Each emoji is 4 bytes + let head: String = emoji_output.chars().take(TRUNCATED_HEAD_SIZE).collect(); + + // Should have exactly TRUNCATED_HEAD_SIZE characters (emojis) + assert_eq!(head.chars().count(), TRUNCATED_HEAD_SIZE); + } + + #[test] + fn test_truncate_saves_to_file() { + use tempfile::TempDir; + use std::env; + + // Create a temp directory and set it as the workspace + let temp_dir = TempDir::new().unwrap(); + let old_dir = env::current_dir().unwrap(); + env::set_current_dir(temp_dir.path()).unwrap(); + + let large_output = "y".repeat(10000); + let result = truncate_large_output(&large_output, Some("test-sess"), "shell_stdout", "stdout"); + + // Should be truncated + assert!(result.contains("[[ STDOUT TRUNCATED")); + assert!(result.contains("Use read_file to see more.")); + assert!(result.starts_with(&"y".repeat(500))); + + env::set_current_dir(old_dir).unwrap(); + } +} diff --git a/crates/g3-core/tests/read_file_utf8_test.rs b/crates/g3-core/tests/read_file_utf8_test.rs new file mode 100644 index 0000000..882fbb0 --- /dev/null +++ b/crates/g3-core/tests/read_file_utf8_test.rs @@ -0,0 +1,81 @@ +//! Tests for UTF-8 safe file reading with seek optimization. + +use std::fs; +use std::io::Write; +use tempfile::TempDir; + +/// Test that reading a file with multi-byte UTF-8 characters works correctly +/// when the byte range falls in the middle of a character. 
#[test]
fn test_read_file_range_utf8_boundary() {
    let temp_dir = TempDir::new().unwrap();
    let file_path = temp_dir.path().join("utf8_test.txt");

    // "πŸŽ‰" is 4 bytes: F0 9F 8E 89.
    // Byte layout: hello = 0..5, πŸŽ‰ = 5..9, world = 9..14, πŸŽ‰ = 14..18, test = 18..22
    let content = "helloπŸŽ‰worldπŸŽ‰test";
    fs::write(&file_path, content).unwrap();

    // Verify the byte layout the seek-based reader relies on.
    let bytes = fs::read(&file_path).unwrap();
    assert_eq!(bytes.len(), 22); // 5 + 4 + 5 + 4 + 4
    assert_eq!(&bytes[5..9], "πŸŽ‰".as_bytes());
    assert_eq!(&bytes[14..18], "πŸŽ‰".as_bytes());

    // Round-trip read must be lossless.
    let result = fs::read_to_string(&file_path).unwrap();
    assert_eq!(result, content);
}

/// Files mixing 1-, 2-, 3- and 4-byte UTF-8 sequences round-trip intact.
#[test]
fn test_utf8_various_chars() {
    let temp_dir = TempDir::new().unwrap();
    let file_path = temp_dir.path().join("utf8_various.txt");

    // 1-byte (ASCII), 2-byte (Γ©), 3-byte (δΈ­), and 4-byte (πŸŽ‰) characters.
    let content = "cafΓ©δΈ­ζ–‡πŸŽ‰done";
    fs::write(&file_path, content).unwrap();

    let bytes = fs::read(&file_path).unwrap();
    // c+a+f (3) + Γ© (2) + δΈ­ (3) + ζ–‡ (3) + πŸŽ‰ (4) + d+o+n+e (4) = 19 bytes
    assert_eq!(bytes.len(), 19);
    assert_eq!(content.chars().count(), 11);

    let result = fs::read_to_string(&file_path).unwrap();
    assert_eq!(result, content);
}

/// Larger multi-line UTF-8 files read back completely and in order.
#[test]
fn test_read_middle_of_utf8_file() {
    let temp_dir = TempDir::new().unwrap();
    let file_path = temp_dir.path().join("utf8_middle.txt");

    let mut content = String::new();
    for i in 0..100 {
        content.push_str(&format!("line{}πŸŽ‰\n", i));
    }
    fs::write(&file_path, &content).unwrap();

    let full = fs::read_to_string(&file_path).unwrap();
    // Lossless round-trip, and a middle line is present exactly once.
    assert_eq!(full, content);
    assert_eq!(full.matches("line50πŸŽ‰").count(), 1);
}

/// Binary (non-UTF-8) data must not panic: strict decoding fails cleanly and
/// lossy decoding substitutes U+FFFD — the premise behind `read_file_range`'s
/// fallback path. (The previous version only asserted `fs::read(...).is_ok()`
/// and never exercised any string conversion.)
#[test]
fn test_binary_file_no_panic() {
    let temp_dir = TempDir::new().unwrap();
    let file_path = temp_dir.path().join("binary.bin");

    // Bytes forming invalid UTF-8 sequences.
    let data = [0xFF, 0xFE, 0x00, 0x01, 0x80, 0x81, 0x82];
    let mut file = fs::File::create(&file_path).unwrap();
    file.write_all(&data).unwrap();
    drop(file); // flush before reading back

    let bytes = fs::read(&file_path).unwrap();
    assert_eq!(bytes, data);

    // Strict UTF-8 decoding must fail on this data...
    assert!(std::str::from_utf8(&bytes).is_err());
    // ...while lossy decoding succeeds, inserting replacement characters.
    let lossy = String::from_utf8_lossy(&bytes);
    assert!(lossy.contains('\u{FFFD}'));
}