//! Utility functions for diff parsing, shell escaping, and JSON fixing. //! //! This module contains helper functions used by the agent for: //! - String truncation utilities //! - Applying unified diffs to strings //! - Shell command escaping //! - JSON quote fixing use anyhow::Result; use tracing::debug; /// Truncate a string to approximately max_len characters, ending at a word boundary. /// /// This function attempts to break at a space character for cleaner display. /// If no suitable word boundary is found (or it would result in too short a string), /// it falls back to character-based truncation. /// /// # Arguments /// * `s` - The string to truncate /// * `max_len` - Maximum number of characters (approximate) /// /// # Returns /// The truncated string with "..." appended if truncation occurred pub fn truncate_to_word_boundary(s: &str, max_len: usize) -> String { let char_count = s.chars().count(); if char_count <= max_len { return s.to_string(); } // Get the byte index of the max_len-th character let byte_index: usize = s.char_indices() .nth(max_len) .map(|(i, _)| i) .unwrap_or(s.len()); // Find the last space before the character limit let truncated = &s[..byte_index]; if let Some(last_space_byte) = truncated.rfind(' ') { if truncated[..last_space_byte].chars().count() > max_len / 2 { // Only use word boundary if it's not too short (in characters) return format!("{}...", &s[..last_space_byte]); } } // Fall back to truncation at character boundary format!("{}...", truncated) } /// Normalize Unicode space characters in a file path to regular ASCII spaces. /// /// macOS uses special Unicode space characters in certain filenames: /// - U+202F (Narrow No-Break Space) in screenshot filenames before "am"/"pm" /// - U+00A0 (No-Break Space) in some contexts /// /// This function replaces these with regular ASCII spaces (0x20) so that /// file paths typed or copied by users will match the actual filenames. 
///
/// Besides the two code points above, the figure, punctuation, thin and hair
/// spaces (U+2007..=U+200A) are converted as well. The zero-width characters
/// U+200B and U+FEFF are also turned into an ASCII space; they are not dropped.
///
/// # Arguments
/// * `path` - The file path that may contain Unicode space characters
///
/// # Returns
/// A new string with Unicode spaces normalized to ASCII spaces
pub fn normalize_path_unicode_spaces(path: &str) -> String {
    // Every code point in this table is rewritten to U+0020.
    const SPACE_LIKE: [char; 8] = [
        '\u{202F}', // Narrow No-Break Space
        '\u{00A0}', // No-Break Space
        '\u{2007}', // Figure Space
        '\u{2008}', // Punctuation Space
        '\u{2009}', // Thin Space
        '\u{200A}', // Hair Space
        '\u{200B}', // Zero Width Space (replaced by a space, not removed)
        '\u{FEFF}', // Zero Width No-Break Space / BOM
    ];

    let mut normalized = String::with_capacity(path.len());
    for ch in path.chars() {
        normalized.push(if SPACE_LIKE.contains(&ch) { ' ' } else { ch });
    }
    normalized
}

/// Find the on-disk spelling of `path`, allowing for macOS narrow no-break spaces.
///
/// Lookup order:
/// 1. `path` exactly as given.
/// 2. If that does not exist and `path` contains an ASCII space, the variant in
///    which the space in front of `am.` / `pm.` / `AM.` / `PM.` is replaced by
///    U+202F (the form macOS uses in screenshot filenames).
///
/// # Arguments
/// * `path` - The file path to resolve
///
/// # Returns
/// `Cow::Owned` with the U+202F variant when only that variant exists on disk;
/// otherwise `Cow::Borrowed(path)` (including when neither spelling exists).
pub fn resolve_path_with_unicode_fallback(path: &str) -> std::borrow::Cow<'_, str> {
    use std::borrow::Cow;
    use std::path::Path;

    const NARROW_NBSP: char = '\u{202F}';
    const MERIDIEM_SUFFIXES: [&str; 4] = ["am.", "pm.", "AM.", "PM."];

    // Nothing to do when the path already resolves, and no variant to try when
    // there is no ASCII space that could be swapped.
    if Path::new(path).exists() || !path.contains(' ') {
        return Cow::Borrowed(path);
    }

    let mut candidate = path.to_owned();
    for suffix in MERIDIEM_SUFFIXES.iter() {
        let typed = format!(" {}", suffix);
        let on_disk = format!("{}{}", NARROW_NBSP, suffix);
        candidate = candidate.replace(&typed, &on_disk);
    }

    if candidate != path && Path::new(&candidate).exists() {
        Cow::Owned(candidate)
    } else {
        Cow::Borrowed(path)
    }
}

/// Resolve file paths within a shell command, handling Unicode space normalization.
///
/// This function finds quoted file paths in a shell command and resolves them
/// using Unicode space fallback (for macOS screenshot filenames with U+202F).
///
/// Only double-quoted strings are inspected, and only those whose content
/// starts with `/` or `~`. A backslash inside the quotes skips the following
/// character, so `\"` does not end the string. An unterminated quote ends the
/// scan and leaves the rest of the command untouched.
///
/// NOTE(review): `~` is not expanded before the existence check, so a `~/...`
/// path is looked up literally — confirm whether tilde expansion is expected
/// to have happened before this function is called.
///
/// # Arguments
/// * `command` - The shell command that may contain file paths
///
/// # Returns
/// The command with file paths resolved to their actual filesystem paths
pub fn resolve_paths_in_shell_command(command: &str) -> String {
    use std::borrow::Cow;

    let mut result = String::with_capacity(command.len());
    // Byte offset in `command` up to which text has already been copied to `result`.
    let mut copied_to = 0usize;
    let mut chars = command.char_indices();

    while let Some((open, ch)) = chars.next() {
        if ch != '"' {
            continue;
        }

        // Walk to the matching closing quote, honouring backslash escapes.
        let mut close = None;
        while let Some((idx, c)) = chars.next() {
            match c {
                '\\' => {
                    // Skip whatever character is escaped (if any).
                    chars.next();
                }
                '"' => {
                    close = Some(idx);
                    break;
                }
                _ => {}
            }
        }
        let close = match close {
            Some(idx) => idx,
            // Unterminated quote: nothing further can be parsed reliably.
            None => break,
        };

        let quoted_content = &command[open + 1..close];
        let looks_like_path = quoted_content.starts_with('/') || quoted_content.starts_with('~');
        if !looks_like_path {
            continue;
        }

        // The fallback returns `Owned` only when the path as typed does not
        // exist and a U+202F variant does, so no separate existence check is
        // needed here.
        if let Cow::Owned(resolved) = resolve_path_with_unicode_fallback(quoted_content) {
            // Copy everything up to and including the opening quote, then the
            // corrected path. The closing quote is copied with the next chunk.
            result.push_str(&command[copied_to..=open]);
            result.push_str(&resolved);
            copied_to = close;
        }
    }

    result.push_str(&command[copied_to..]);
    result
}

/// Apply unified diff to an input string with optional [start, end) bounds.
/// /// # Arguments /// * `file_content` - The original file content /// * `diff` - The unified diff to apply /// * `start_char` - Optional start character position (0-indexed, inclusive) /// * `end_char` - Optional end character position (0-indexed, exclusive) /// /// # Returns /// The modified content with the diff applied pub fn apply_unified_diff_to_string( file_content: &str, diff: &str, start_char: Option, end_char: Option, ) -> Result { // Parse full unified diff into hunks and apply sequentially. let hunks = parse_unified_diff_hunks(diff); if hunks.is_empty() { anyhow::bail!( "Invalid diff format. Expected unified diff with @@ hunks or +/- with context lines" ); } // Normalize line endings to avoid CRLF/CR mismatches let content_norm = file_content.replace("\r\n", "\n").replace('\r', "\n"); // Determine and validate the search range let search_start = start_char.unwrap_or(0); let search_end = end_char.unwrap_or(content_norm.len()); if search_start > content_norm.len() { anyhow::bail!( "start position {} exceeds file length {}", search_start, content_norm.len() ); } if search_end > content_norm.len() { anyhow::bail!( "end position {} exceeds file length {}", search_end, content_norm.len() ); } if search_start > search_end { anyhow::bail!( "start position {} is greater than end position {}", search_start, search_end ); } // Extract the region we're going to modify, ensuring we're at char boundaries // Find the nearest valid char boundaries let start_boundary = if search_start == 0 { 0 } else { content_norm .char_indices() .find(|(i, _)| *i >= search_start) .map(|(i, _)| i) .unwrap_or(search_start) }; let end_boundary = content_norm .char_indices() .find(|(i, _)| *i >= search_end) .map(|(i, _)| i) .unwrap_or(content_norm.len()); let mut region_content = content_norm[start_boundary..end_boundary].to_string(); // Apply hunks in order for (idx, (old_block, new_block)) in hunks.iter().enumerate() { debug!( "Applying hunk {}: old_len={}, new_len={}", idx + 1, 
old_block.len(), new_block.len() ); if let Some(pos) = region_content.find(old_block) { let endpos = pos + old_block.len(); region_content.replace_range(pos..endpos, new_block); } else { // Not found; provide helpful diagnostics with a short preview // Use character-based slicing to avoid splitting multi-byte UTF-8 characters let max_chars = 200; let preview_len = old_block.chars().count().min(max_chars); let mut old_preview: String = old_block.chars().take(preview_len).collect(); let was_truncated = old_block.chars().count() > max_chars; if was_truncated { old_preview.push_str("..."); } let range_note = if start_char.is_some() || end_char.is_some() { format!( " (within character range {}:{})", start_boundary, end_boundary ) } else { String::new() }; anyhow::bail!( "Pattern not found in file{}\nHunk {} failed. Searched for:\n{}", range_note, idx + 1, old_preview ); } } // Reconstruct the full content with the modified region let mut result = String::with_capacity(content_norm.len() + region_content.len()); result.push_str(&content_norm[..start_boundary]); result.push_str(®ion_content); result.push_str(&content_norm[end_boundary..]); Ok(result) } /// Parse a unified diff into a list of hunks as (old_block, new_block). /// Each hunk contains the exact text to search for and the replacement text including context lines. 
pub fn parse_unified_diff_hunks(diff: &str) -> Vec<(String, String)> { let mut hunks: Vec<(String, String)> = Vec::new(); let mut old_lines: Vec = Vec::new(); let mut new_lines: Vec = Vec::new(); let mut in_hunk = false; for raw_line in diff.lines() { let line = raw_line; // Skip common diff headers if line.starts_with("diff ") || line.starts_with("index ") || line.starts_with("new file mode") || line.starts_with("deleted file mode") { continue; } if line.starts_with("--- ") || line.starts_with("+++ ") { // File header lines — ignore continue; } if line.starts_with("@@") { // Starting a new hunk — flush previous if present if in_hunk && (!old_lines.is_empty() || !new_lines.is_empty()) { hunks.push((old_lines.join("\n"), new_lines.join("\n"))); old_lines.clear(); new_lines.clear(); } in_hunk = true; continue; } if !in_hunk { // Some minimal diffs may omit @@; start collecting once we see diff markers if line.starts_with(' ') || (line.starts_with('-') && !line.starts_with("---")) || (line.starts_with('+') && !line.starts_with("+++")) { in_hunk = true; } else { continue; } } if let Some(content) = line.strip_prefix(' ') { old_lines.push(content.to_string()); new_lines.push(content.to_string()); } else if line.starts_with('+') && !line.starts_with("+++") { new_lines.push(line[1..].to_string()); } else if line.starts_with('-') && !line.starts_with("---") { old_lines.push(line[1..].to_string()); } else if line.starts_with('\\') { // Example: "\\ No newline at end of file" — ignore continue; } else { // Unknown line type — ignore } } if in_hunk && (!old_lines.is_empty() || !new_lines.is_empty()) { hunks.push((old_lines.join("\n"), new_lines.join("\n"))); } hunks } /// Helper function to properly escape shell commands. /// Handles file paths with spaces and other special characters. 
#[allow(dead_code)] pub fn shell_escape_command(command: &str) -> String { let parts: Vec<&str> = command.split_whitespace().collect(); if parts.is_empty() { return command.to_string(); } let cmd = parts[0]; // Commands that typically take file paths as arguments let file_commands = [ "cat", "ls", "cp", "mv", "rm", "chmod", "chown", "file", "head", "tail", "wc", "grep", ]; if file_commands.contains(&cmd) { // For file commands, we need to be smarter about escaping // Check if the command already has proper quoting if command.contains('"') || command.contains('\'') { // Already has some quoting, use as-is return command.to_string(); } // Look for file paths that need escaping (contain spaces but aren't quoted) let mut escaped_command = String::new(); let mut in_quotes = false; let mut current_word = String::new(); let mut words = Vec::new(); for ch in command.chars() { match ch { ' ' if !in_quotes => { if !current_word.is_empty() { words.push(current_word.clone()); current_word.clear(); } } '"' => { in_quotes = !in_quotes; current_word.push(ch); } _ => { current_word.push(ch); } } } if !current_word.is_empty() { words.push(current_word); } // Reconstruct the command with proper escaping for (i, word) in words.iter().enumerate() { if i > 0 { escaped_command.push(' '); } // If this word looks like a file path (contains / or ~) and has spaces, quote it if word.contains('/') || word.starts_with('~') { if word.contains(' ') && !word.starts_with('"') && !word.starts_with('\'') { escaped_command.push_str(&format!("\"{}\"", word)); } else { escaped_command.push_str(word); } } else { escaped_command.push_str(word); } } escaped_command } else { // For non-file commands, use the original command command.to_string() } } /// Helper function to fix nested quotes in shell commands within JSON. 
#[allow(dead_code)] pub fn fix_nested_quotes_in_shell_command(json_str: &str) -> String { // Look for the pattern: "command": " if let Some(command_start) = json_str.find(r#""command": ""#) { let command_value_start = command_start + r#""command": ""#.len(); // Find the end of the command string by looking for the pattern "} if let Some(end_marker) = json_str[command_value_start..].find(r#"" }"#) { let command_end = command_value_start + end_marker; let before = &json_str[..command_value_start]; let command_content = &json_str[command_value_start..command_end]; let after = &json_str[command_end..]; // Fix the command content by properly escaping quotes let mut fixed_command = String::new(); let mut chars = command_content.chars().peekable(); while let Some(ch) = chars.next() { match ch { '"' => { // Check if this quote is already escaped if fixed_command.ends_with('\\') { fixed_command.push(ch); // Already escaped, keep as-is } else { fixed_command.push_str(r#"\""#); // Escape the quote } } '\\' => { // Check what follows the backslash if let Some(&next_ch) = chars.peek() { if next_ch == '"' { // This is an escaped quote, keep the backslash fixed_command.push(ch); } else { // Regular backslash, escape it fixed_command.push_str(r#"\\"#); } } else { // Backslash at end, escape it fixed_command.push_str(r#"\\"#); } } _ => fixed_command.push(ch), } } return format!("{}{}{}", before, fixed_command, after); } } // Fallback: if we can't parse the structure, return as-is json_str.to_string() } /// Helper function to fix mixed quotes in JSON (single quotes where double quotes should be). 
///
/// The input is scanned once, tracking whether the scan is inside a string and
/// which quote character opened it:
/// * a single-quoted string has both delimiters rewritten to `"`;
/// * a `"` inside a single-quoted string is escaped as `\"`;
/// * `\'` inside a single-quoted string becomes a plain `'`, because the
///   escape is no longer needed and `\'` is not a valid JSON escape;
/// * every other backslash escape inside a string is copied together with the
///   character it escapes;
/// * text outside strings, and double-quoted strings, are copied unchanged.
#[allow(dead_code)]
pub fn fix_mixed_quotes_in_json(json_str: &str) -> String {
    let mut result = String::with_capacity(json_str.len());
    let mut chars = json_str.chars();
    // `Some(quote)` while inside a string opened by `quote`, `None` otherwise.
    let mut open_delimiter: Option<char> = None;

    while let Some(ch) = chars.next() {
        match (open_delimiter, ch) {
            // Start of a string; single quotes are converted to double quotes.
            (None, '"') | (None, '\'') => {
                open_delimiter = Some(ch);
                result.push('"');
            }
            (None, _) => result.push(ch),
            // End of the current string; always closed with a double quote.
            (Some(delimiter), c) if c == delimiter => {
                result.push('"');
                open_delimiter = None;
            }
            // Double quote inside a single-quoted string - escape it.
            (Some('\''), '"') => result.push_str(r#"\""#),
            // Escape sequence: consume the escaped character as well so that it
            // can never be mistaken for a closing delimiter.
            (Some(delimiter), '\\') => match chars.next() {
                Some('\'') if delimiter == '\'' => result.push('\''),
                Some(escaped) => {
                    result.push('\\');
                    result.push(escaped);
                }
                None => result.push('\\'),
            },
            (Some(_), _) => result.push(ch),
        }
    }

    result
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn parses_minimal_unified_diff_without_hunk_header() {
        let diff = "--- old\n-old text\n+++ new\n+new text\n";
        let hunks = parse_unified_diff_hunks(diff);
        assert_eq!(hunks.len(), 1);
        assert_eq!(hunks[0].0, "old text");
        assert_eq!(hunks[0].1, "new text");
    }

    #[test]
    fn parses_diff_with_context_and_hunk_headers() {
        let diff = "@@ -1,3 +1,3 @@\n common\n-old\n+new\n common2\n";
        let hunks = parse_unified_diff_hunks(diff);
        assert_eq!(hunks.len(), 1);
        assert_eq!(hunks[0].0, "common\nold\ncommon2");
        assert_eq!(hunks[0].1, "common\nnew\ncommon2");
    }

    #[test]
    fn apply_multi_hunk_unified_diff_to_string() {
        let original = "line 1\nkeep\nold A\nkeep 2\nold B\nkeep 3\n";
        let diff = "@@ -1,6 +1,6 @@\n line 1\n keep\n-old A\n+new A\n keep 2\n-old B\n+new B\n keep 3\n";
        let result = apply_unified_diff_to_string(original, diff, None, None).unwrap();
        let expected = "line 1\nkeep\nnew A\nkeep 2\nnew B\nkeep 3\n";
        assert_eq!(result, expected);
    }

    #[test]
    fn apply_diff_within_range_only() {
        let original = "A\nold\nB\nold\nC\n";
        // Only the first 'old' should be replaced due to range
        let diff = "@@ -1,3 +1,3 @@\n A\n-old\n+NEW\n B\n";
        let start = 0usize; // Start of file
        let end = original.find("B\n").unwrap() + 2; // up to end of line 'B\n'
        let result = apply_unified_diff_to_string(original, diff, Some(start), Some(end)).unwrap();
        let expected = "A\nNEW\nB\nold\nC\n";
        assert_eq!(result, expected);
    }

    #[test]
    fn shell_escape_preserves_simple_commands() {
        assert_eq!(shell_escape_command("ls -la"), "ls -la");
        assert_eq!(shell_escape_command("echo hello"), "echo hello");
    }

    #[test]
    fn fix_mixed_quotes_converts_single_to_double() {
        let input = "{'key': 'value'}";
        let result = fix_mixed_quotes_in_json(input);
        assert_eq!(result, "{\"key\": \"value\"}");
    }

    #[test]
    fn fix_mixed_quotes_unescapes_apostrophe_in_single_quoted_string() {
        let input = r#"{'msg': 'it\'s'}"#;
        let result = fix_mixed_quotes_in_json(input);
        assert_eq!(result, r#"{"msg": "it's"}"#);
    }

    #[test]
    fn normalize_path_unicode_spaces_converts_narrow_no_break_space() {
        // U+202F is Narrow No-Break Space (used by macOS in screenshot filenames)
        let path_with_unicode = "/Users/test/Screenshot 2025-01-03 at 4.41.27\u{202F}pm.png";
        let normalized = normalize_path_unicode_spaces(path_with_unicode);
        assert_eq!(normalized, "/Users/test/Screenshot 2025-01-03 at 4.41.27 pm.png");
    }

    #[test]
    fn normalize_path_unicode_spaces_converts_no_break_space() {
        // U+00A0 is No-Break Space
        let path_with_unicode = "/Users/test/file\u{00A0}name.txt";
        let normalized = normalize_path_unicode_spaces(path_with_unicode);
        assert_eq!(normalized, "/Users/test/file name.txt");
    }

    #[test]
    fn normalize_path_unicode_spaces_preserves_regular_spaces() {
        let path = "/Users/test/file with spaces.txt";
        let normalized = normalize_path_unicode_spaces(path);
        assert_eq!(normalized, path);
    }

    #[test]
    fn normalize_path_unicode_spaces_handles_multiple_unicode_spaces() {
        // Multiple different Unicode space types
        let path = "/Users/test/a\u{202F}b\u{00A0}c\u{2009}d.txt";
        let normalized = normalize_path_unicode_spaces(path);
        assert_eq!(normalized, "/Users/test/a b c d.txt");
    }

    #[test]
    fn resolve_paths_in_shell_command_preserves_commands_without_paths() {
        let cmd = "echo hello world";
        assert_eq!(resolve_paths_in_shell_command(cmd), cmd);
    }

    #[test]
    fn resolve_paths_in_shell_command_preserves_existing_paths() {
        let cmd = "cat \"/etc/hosts\"";
        assert_eq!(resolve_paths_in_shell_command(cmd), cmd);
    }

    #[test]
    fn truncate_to_word_boundary_short_string_unchanged() {
        assert_eq!(truncate_to_word_boundary("hello", 10), "hello");
        assert_eq!(truncate_to_word_boundary("hello world", 20), "hello world");
    }

    #[test]
    fn truncate_to_word_boundary_breaks_at_space() {
        // Should break at word boundary
        let result = truncate_to_word_boundary("hello world this is a long string", 15);
        assert_eq!(result, "hello world...");
    }

    #[test]
    fn truncate_to_word_boundary_falls_back_to_char_limit() {
        // When word boundary would be too short, fall back to char limit
        let result = truncate_to_word_boundary("a verylongwordwithoutspaces", 10);
        assert_eq!(result, "a verylong...");
    }

    #[test]
    fn truncate_to_word_boundary_handles_unicode() {
        // Should handle unicode characters correctly
        let result = truncate_to_word_boundary("héllo wörld this is long", 12);
        assert!(result.ends_with("..."));
    }
}