Rewrite JSON tool call filter with clean state machine
Fixes bug where JSON tool calls were printed as text due to chunking issues.
Changes:
- Complete rewrite of filter_json.rs with 3-state machine:
- Streaming: normal pass-through, watches for newline + whitespace + {
- Buffering: confirms/denies tool pattern with ~20 char buffer
- Suppressing: string-aware brace counting until balanced
- Character-by-character processing eliminates chunk boundary issues
- Proper handling of } inside JSON strings (was causing premature exit)
- Detects truncated JSON followed by complete JSON (LLM retry case)
- Removed regex dependency, simpler pattern matching
- Added 59 stress tests covering malformed JSON, partial patterns,
streaming edge cases, adversarial inputs, and real-world patterns
All 86 filter_json tests pass.
This commit is contained in:
@@ -1,78 +1,160 @@
|
|||||||
//! JSON tool call filtering for streaming LLM responses.
|
//! JSON tool call filtering for streaming LLM responses.
|
||||||
//!
|
//!
|
||||||
//! This module filters out JSON tool calls from LLM output streams while preserving
|
//! This module filters out JSON tool calls from LLM output streams while preserving
|
||||||
//! regular text content. It uses a state machine to handle streaming chunks.
|
//! regular text content. It uses a simple state machine optimized for streaming.
|
||||||
//!
|
//!
|
||||||
//! # Design
|
//! # Design
|
||||||
//!
|
//!
|
||||||
//! The filter detects tool calls by looking for JSON objects that start with `{"tool":`
|
//! The filter uses three states:
|
||||||
//! at the beginning of a line. It uses brace counting to find the complete JSON object
|
//! - **Streaming**: Normal pass-through mode. Watches for newline + whitespace + `{`
|
||||||
//! and removes it from the output stream.
|
//! - **Buffering**: Saw potential tool call start, buffering to confirm/deny
|
||||||
|
//! - **Suppressing**: Confirmed tool call, counting braces (string-aware) to find end
|
||||||
//!
|
//!
|
||||||
//! # Known Edge Cases
|
//! The key insight is that we only need to buffer a small amount (capped at roughly 20 bytes)
|
||||||
//!
|
//! to confirm whether `{` starts a tool call pattern like `{"tool":`.
|
||||||
//! 1. **Brace counting without string awareness in main loop**: The main filtering loop
|
|
||||||
//! counts braces without considering whether they're inside JSON strings. This can
|
|
||||||
//! cause premature exit from suppression mode if a string contains `}`.
|
|
||||||
//!
|
|
||||||
//! 2. **Tool calls not at line start**: Tool calls that don't start at the beginning
|
|
||||||
//! of a line (after optional whitespace) won't be detected.
|
|
||||||
//!
|
|
||||||
//! 3. **Streaming chunk boundaries**: If a tool call pattern is split across chunks
|
|
||||||
//! (e.g., `{"to` in one chunk and `ol":` in the next), detection may fail.
|
|
||||||
|
|
||||||
use regex::Regex;
|
|
||||||
use std::cell::RefCell;
|
use std::cell::RefCell;
|
||||||
use tracing::debug;
|
use tracing::debug;
|
||||||
|
|
||||||
|
/// Maximum chars needed to confirm/deny a tool call pattern.
|
||||||
|
/// Pattern is: { + optional whitespace + "tool" + optional whitespace + : + optional whitespace + "
|
||||||
|
/// Realistically: `{"tool":"` = 9 chars, with whitespace maybe 15 max
|
||||||
|
const MAX_BUFFER_FOR_DETECTION: usize = 20;
|
||||||
|
|
||||||
// Thread-local state for tracking JSON tool call suppression
|
// Thread-local state for tracking JSON tool call suppression
|
||||||
thread_local! {
|
thread_local! {
|
||||||
static JSON_TOOL_STATE: RefCell<JsonToolState> = RefCell::new(JsonToolState::new());
|
static JSON_TOOL_STATE: RefCell<FilterState> = RefCell::new(FilterState::new());
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Internal state for tracking JSON tool call filtering across streaming chunks.
|
/// The three possible states of the filter.
///
/// The enum carries no data, so it is `Copy` and `Eq`: values can be compared
/// and passed around freely without cloning or borrowing.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum State {
    /// Normal streaming - pass through content, watch for newline + whitespace + `{`
    Streaming,
    /// Saw potential start, buffering to confirm/deny tool pattern
    Buffering,
    /// Confirmed tool call, suppressing until braces balance
    Suppressing,
}
|
||||||
|
|
||||||
|
/// Internal state for the filter
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
struct JsonToolState {
|
struct FilterState {
|
||||||
/// True when actively suppressing a confirmed tool call
|
state: State,
|
||||||
suppression_mode: bool,
|
/// Buffer for potential tool call detection (Buffering state)
|
||||||
/// True when buffering potential JSON (saw { but not yet confirmed as tool call)
|
|
||||||
potential_json_mode: bool,
|
|
||||||
/// Tracks nesting depth of braces within JSON
|
|
||||||
brace_depth: i32,
|
|
||||||
buffer: String,
|
buffer: String,
|
||||||
json_start_in_buffer: Option<usize>, // Position where confirmed JSON tool call starts
|
/// Brace depth for JSON tracking (Suppressing state) - string-aware
|
||||||
content_returned_up_to: usize, // Track how much content we've already returned
|
brace_depth: i32,
|
||||||
potential_json_start: Option<usize>, // Where the potential JSON started
|
/// Are we inside a JSON string? (for proper brace counting)
|
||||||
|
in_string: bool,
|
||||||
|
/// Was the previous char a backslash? (for escape handling)
|
||||||
|
escape_next: bool,
|
||||||
|
/// Track if we just saw a newline (to detect line-start patterns)
|
||||||
|
at_line_start: bool,
|
||||||
|
/// Whitespace seen after newline (before potential {)
|
||||||
|
pending_whitespace: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl JsonToolState {
|
impl FilterState {
|
||||||
fn new() -> Self {
|
fn new() -> Self {
|
||||||
Self {
|
Self {
|
||||||
suppression_mode: false,
|
state: State::Streaming,
|
||||||
potential_json_mode: false,
|
|
||||||
brace_depth: 0,
|
|
||||||
buffer: String::new(),
|
buffer: String::new(),
|
||||||
json_start_in_buffer: None,
|
brace_depth: 0,
|
||||||
content_returned_up_to: 0,
|
in_string: false,
|
||||||
potential_json_start: None,
|
escape_next: false,
|
||||||
|
at_line_start: true, // Start of input counts as line start
|
||||||
|
pending_whitespace: String::new(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn reset(&mut self) {
|
fn reset(&mut self) {
|
||||||
self.suppression_mode = false;
|
self.state = State::Streaming;
|
||||||
self.potential_json_mode = false;
|
|
||||||
self.brace_depth = 0;
|
|
||||||
self.buffer.clear();
|
self.buffer.clear();
|
||||||
self.json_start_in_buffer = None;
|
self.brace_depth = 0;
|
||||||
self.content_returned_up_to = 0;
|
self.in_string = false;
|
||||||
self.potential_json_start = None;
|
self.escape_next = false;
|
||||||
|
self.at_line_start = true;
|
||||||
|
self.pending_whitespace.clear();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Check if buffer matches the tool call pattern.
///
/// Pattern: `{` followed by optional whitespace, `"tool"`, optional whitespace,
/// `:`, optional whitespace, `"`.
///
/// The decision is made as early as the data allows: as soon as any received
/// byte contradicts the pattern the buffer is rejected, so ordinary text that
/// merely starts with `{` is not held back longer than necessary.
///
/// # Arguments
/// * `buffer` - Text accumulated so far, expected to begin at the candidate `{`
///
/// # Returns
/// - `Some(true)` if confirmed as tool call
/// - `Some(false)` if confirmed NOT a tool call
/// - `None` if need more data
fn check_tool_pattern(buffer: &str) -> Option<bool> {
    /// The quoted key that must follow the opening brace.
    const KEY: &str = "\"tool\"";

    // Must start with {
    let after_brace = match buffer.strip_prefix('{') {
        Some(rest) => rest,
        None => return Some(false),
    };

    // Skip leading whitespace after {
    let rest = after_brace.trim_start();

    // Compare whatever part of the key has arrived so far. Working on bytes
    // keeps this safe when `rest` contains multi-byte characters: a mismatch is
    // reported without ever slicing the string in the middle of a character.
    let seen = rest.len().min(KEY.len());
    if rest.as_bytes()[..seen] != KEY.as_bytes()[..seen] {
        return Some(false);
    }
    if rest.len() < KEY.len() {
        return None; // Key is still incomplete - need more data
    }

    // `rest` starts with the ASCII key, so KEY.len() is a char boundary.
    let after_key = rest[KEY.len()..].trim_start();
    if after_key.is_empty() {
        return None; // Need more data
    }

    let after_colon = match after_key.strip_prefix(':') {
        Some(tail) => tail.trim_start(),
        None => return Some(false), // Key is not followed by ':'
    };
    if after_colon.is_empty() {
        return None; // Need more data
    }

    // A tool call's value is a string; anything else (number, object, ...) is not one.
    Some(after_colon.starts_with('"'))
}
|
||||||
|
|
||||||
/// Filters JSON tool calls from streaming LLM content.
|
/// Filters JSON tool calls from streaming LLM content.
|
||||||
///
|
///
|
||||||
/// Processes content chunks and removes JSON tool calls while preserving regular text.
|
/// Processes content character-by-character and removes JSON tool calls
|
||||||
/// Maintains state across calls to handle tool calls spanning multiple chunks.
|
/// while preserving regular text. Maintains state across calls.
|
||||||
///
|
///
|
||||||
/// # Arguments
|
/// # Arguments
|
||||||
/// * `content` - A chunk of streaming content from the LLM
|
/// * `content` - A chunk of streaming content from the LLM
|
||||||
@@ -86,383 +168,165 @@ pub fn filter_json_tool_calls(content: &str) -> String {
|
|||||||
|
|
||||||
JSON_TOOL_STATE.with(|state| {
|
JSON_TOOL_STATE.with(|state| {
|
||||||
let mut state = state.borrow_mut();
|
let mut state = state.borrow_mut();
|
||||||
|
let mut output = String::new();
|
||||||
// Add new content to buffer
|
|
||||||
state.buffer.push_str(content);
|
|
||||||
|
|
||||||
// If we're already in suppression mode, continue brace counting
|
|
||||||
if state.suppression_mode {
|
|
||||||
// Count braces in the new content only
|
|
||||||
for ch in content.chars() {
|
|
||||||
match ch {
|
|
||||||
'{' => state.brace_depth += 1,
|
|
||||||
'}' => {
|
|
||||||
state.brace_depth -= 1;
|
|
||||||
// Exit suppression mode when all braces are closed
|
|
||||||
if state.brace_depth <= 0 {
|
|
||||||
debug!("JSON tool call completed - exiting suppression mode");
|
|
||||||
|
|
||||||
// Extract the complete result with JSON filtered out
|
|
||||||
let result = extract_content_without_json(
|
|
||||||
&state.buffer,
|
|
||||||
state.json_start_in_buffer.unwrap_or(0),
|
|
||||||
);
|
|
||||||
|
|
||||||
// Return only the part we haven't returned yet
|
|
||||||
let new_content = if result.len() > state.content_returned_up_to {
|
|
||||||
result[state.content_returned_up_to..].to_string()
|
|
||||||
} else {
|
|
||||||
String::new()
|
|
||||||
};
|
|
||||||
|
|
||||||
state.reset();
|
|
||||||
return new_content;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
_ => {}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// After counting braces, if still in suppression mode,
|
|
||||||
// check if a new tool call pattern appears. This handles truncated JSON
|
|
||||||
// followed by complete JSON.
|
|
||||||
if state.suppression_mode {
|
|
||||||
let current_json_start = state.json_start_in_buffer.unwrap();
|
|
||||||
// Don't require newline - the new JSON might be concatenated directly
|
|
||||||
let tool_call_regex = Regex::new(r#"\{\s*"tool"\s*:\s*""#).unwrap();
|
|
||||||
|
|
||||||
// Look for new tool call patterns after the current one
|
|
||||||
if let Some(captures) = tool_call_regex.find(&state.buffer[current_json_start + 1..]) {
|
|
||||||
let new_json_start = current_json_start + 1 + captures.start() + captures.as_str().find('{').unwrap();
|
|
||||||
|
|
||||||
debug!("Detected new tool call at position {} while processing incomplete one at {} - discarding old", new_json_start, current_json_start);
|
|
||||||
|
|
||||||
// The previous JSON was incomplete/malformed
|
|
||||||
// Return content before the old JSON (if any)
|
|
||||||
let content_before_old_json = if current_json_start > state.content_returned_up_to {
|
|
||||||
state.buffer[state.content_returned_up_to..current_json_start].to_string()
|
|
||||||
} else {
|
|
||||||
String::new()
|
|
||||||
};
|
|
||||||
|
|
||||||
// Update state to skip the incomplete JSON and position at the new one
|
|
||||||
// We'll process the new JSON on the next call
|
|
||||||
state.content_returned_up_to = new_json_start;
|
|
||||||
state.suppression_mode = false;
|
|
||||||
state.json_start_in_buffer = None;
|
|
||||||
state.brace_depth = 0;
|
|
||||||
|
|
||||||
return content_before_old_json;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Still in suppression mode, return empty string (content is being accumulated)
|
|
||||||
return String::new();
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check if we're in potential JSON mode (saw { but waiting to confirm it's a tool call)
|
|
||||||
if state.potential_json_mode {
|
|
||||||
// Check if the buffer contains a confirmed tool call pattern
|
|
||||||
let tool_call_regex = Regex::new(r#"(?m)^\s*\{\s*"tool"\s*:\s*""#).unwrap();
|
|
||||||
|
|
||||||
if let Some(captures) = tool_call_regex.find(&state.buffer) {
|
|
||||||
// Confirmed! This is a tool call - enter suppression mode
|
|
||||||
let match_text = captures.as_str();
|
|
||||||
if let Some(brace_offset) = match_text.find('{') {
|
|
||||||
let json_start = captures.start() + brace_offset;
|
|
||||||
|
|
||||||
debug!("Confirmed JSON tool call at position {} - entering suppression mode", json_start);
|
|
||||||
|
|
||||||
state.potential_json_mode = false;
|
|
||||||
state.suppression_mode = true;
|
|
||||||
state.brace_depth = 0;
|
|
||||||
state.json_start_in_buffer = Some(json_start);
|
|
||||||
|
|
||||||
// Count braces from json_start to see if JSON is complete
|
|
||||||
let buffer_slice = state.buffer[json_start..].to_string();
|
|
||||||
for ch in buffer_slice.chars() {
|
|
||||||
match ch {
|
|
||||||
'{' => state.brace_depth += 1,
|
|
||||||
'}' => {
|
|
||||||
state.brace_depth -= 1;
|
|
||||||
if state.brace_depth <= 0 {
|
|
||||||
debug!("JSON tool call completed immediately");
|
|
||||||
let result = extract_content_without_json(&state.buffer, json_start);
|
|
||||||
let new_content = if result.len() > state.content_returned_up_to {
|
|
||||||
result[state.content_returned_up_to..].to_string()
|
|
||||||
} else {
|
|
||||||
String::new()
|
|
||||||
};
|
|
||||||
state.reset();
|
|
||||||
return new_content;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
_ => {}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// JSON incomplete, stay in suppression mode, return nothing
|
|
||||||
return String::new();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check if we can rule out this being a tool call
|
|
||||||
// If we have enough content after the { and it doesn't match the pattern, release it
|
|
||||||
if let Some(potential_start) = state.potential_json_start {
|
|
||||||
let content_after_brace = &state.buffer[potential_start..];
|
|
||||||
|
|
||||||
// Rule out as a tool call if:
|
|
||||||
// 1. Closing } appears before we see the full pattern
|
|
||||||
// 2. Content clearly doesn't match the tool call pattern
|
|
||||||
// 3. Newline appears after the opening brace (tool calls should be compact)
|
|
||||||
|
|
||||||
let has_closing_brace = content_after_brace.contains('}');
|
|
||||||
let has_newline = content_after_brace[1..].contains('\n'); // Skip first char which is {
|
|
||||||
let long_enough = content_after_brace.len() >= 10;
|
|
||||||
|
|
||||||
// Detect non-tool JSON patterns:
|
|
||||||
// - { followed by " and a key that doesn't start with "tool"
|
|
||||||
// - { followed by "t" but not "to"
|
|
||||||
// - { followed by "to" but not "too", etc.
|
|
||||||
let not_tool_pattern = Regex::new(r#"^\{\s*"(?:[^t]|t(?:[^o]|o(?:[^o]|o(?:[^l]|l[^"\s:]))))"#).unwrap();
|
|
||||||
let definitely_not_tool = not_tool_pattern.is_match(content_after_brace);
|
|
||||||
|
|
||||||
if has_closing_brace || has_newline || (long_enough && definitely_not_tool) {
|
|
||||||
debug!("Potential JSON ruled out - not a tool call");
|
|
||||||
state.potential_json_mode = false;
|
|
||||||
state.potential_json_start = None;
|
|
||||||
|
|
||||||
// Return the buffered content we've been holding
|
|
||||||
let new_content = if state.buffer.len() > state.content_returned_up_to {
|
|
||||||
state.buffer[state.content_returned_up_to..].to_string()
|
|
||||||
} else {
|
|
||||||
String::new()
|
|
||||||
};
|
|
||||||
state.content_returned_up_to = state.buffer.len();
|
|
||||||
return new_content;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Still in potential mode, keep buffering
|
|
||||||
return String::new();
|
|
||||||
}
|
|
||||||
|
|
||||||
// Detect potential JSON start: { at the beginning of a line
|
|
||||||
let potential_json_regex = Regex::new(r"(?m)^\s*\{\s*").unwrap();
|
|
||||||
|
|
||||||
if let Some(captures) = potential_json_regex.find(&state.buffer[state.content_returned_up_to..]) {
|
for ch in content.chars() {
|
||||||
let match_start = state.content_returned_up_to + captures.start();
|
match state.state {
|
||||||
let brace_pos = match_start + captures.as_str().find('{').unwrap();
|
State::Streaming => {
|
||||||
|
handle_streaming_char(&mut state, ch, &mut output);
|
||||||
debug!("Potential JSON detected at position {} - entering buffering mode", brace_pos);
|
|
||||||
|
|
||||||
// Fast path: check if this is already a confirmed tool call
|
|
||||||
let tool_call_regex = Regex::new(r#"(?m)^\s*\{\s*"tool"\s*:\s*""#).unwrap();
|
|
||||||
if tool_call_regex.is_match(&state.buffer[brace_pos..]) {
|
|
||||||
// This is a confirmed tool call! Process it immediately
|
|
||||||
let json_start = brace_pos;
|
|
||||||
debug!("Immediately confirmed tool call at position {}", json_start);
|
|
||||||
|
|
||||||
// Return content before JSON
|
|
||||||
let content_before = if json_start > state.content_returned_up_to {
|
|
||||||
state.buffer[state.content_returned_up_to..json_start].to_string()
|
|
||||||
} else {
|
|
||||||
String::new()
|
|
||||||
};
|
|
||||||
|
|
||||||
state.content_returned_up_to = json_start;
|
|
||||||
state.suppression_mode = true;
|
|
||||||
state.brace_depth = 0;
|
|
||||||
state.json_start_in_buffer = Some(json_start);
|
|
||||||
|
|
||||||
// Count braces to see if JSON is complete
|
|
||||||
let buffer_slice = state.buffer[json_start..].to_string();
|
|
||||||
for ch in buffer_slice.chars() {
|
|
||||||
match ch {
|
|
||||||
'{' => state.brace_depth += 1,
|
|
||||||
'}' => {
|
|
||||||
state.brace_depth -= 1;
|
|
||||||
if state.brace_depth <= 0 {
|
|
||||||
debug!("JSON tool call completed in same chunk");
|
|
||||||
let result = extract_content_without_json(&state.buffer, json_start);
|
|
||||||
let content_after = if result.len() > json_start {
|
|
||||||
&result[json_start..]
|
|
||||||
} else {
|
|
||||||
""
|
|
||||||
};
|
|
||||||
let final_result = format!("{}{}", content_before, content_after);
|
|
||||||
state.reset();
|
|
||||||
return final_result;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
_ => {}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
// JSON incomplete, return content before and stay in suppression mode
|
State::Buffering => {
|
||||||
return content_before;
|
handle_buffering_char(&mut state, ch, &mut output);
|
||||||
}
|
}
|
||||||
|
State::Suppressing => {
|
||||||
// Return content before the potential JSON
|
handle_suppressing_char(&mut state, ch, &mut output);
|
||||||
let content_before = if brace_pos > state.content_returned_up_to {
|
|
||||||
state.buffer[state.content_returned_up_to..brace_pos].to_string()
|
|
||||||
} else {
|
|
||||||
String::new()
|
|
||||||
};
|
|
||||||
|
|
||||||
state.content_returned_up_to = brace_pos;
|
|
||||||
state.potential_json_mode = true;
|
|
||||||
state.potential_json_start = Some(brace_pos);
|
|
||||||
|
|
||||||
// Optimization: immediately check if we can rule this out for single-chunk processing
|
|
||||||
let content_after_brace = &state.buffer[brace_pos..];
|
|
||||||
let has_closing_brace = content_after_brace.contains('}');
|
|
||||||
let has_newline = content_after_brace.len() > 1 && content_after_brace[1..].contains('\n');
|
|
||||||
let long_enough = content_after_brace.len() >= 10;
|
|
||||||
|
|
||||||
let not_tool_pattern = Regex::new(r#"^\{\s*"(?:[^t]|t(?:[^o]|o(?:[^o]|o(?:[^l]|l[^"\s:]))))"#).unwrap();
|
|
||||||
let definitely_not_tool = not_tool_pattern.is_match(content_after_brace);
|
|
||||||
|
|
||||||
if has_closing_brace || has_newline || (long_enough && definitely_not_tool) {
|
|
||||||
debug!("Immediately ruled out as not a tool call");
|
|
||||||
state.potential_json_mode = false;
|
|
||||||
state.potential_json_start = None;
|
|
||||||
|
|
||||||
// Return all the buffered content
|
|
||||||
let new_content = if state.buffer.len() > state.content_returned_up_to {
|
|
||||||
state.buffer[state.content_returned_up_to..].to_string()
|
|
||||||
} else {
|
|
||||||
String::new()
|
|
||||||
};
|
|
||||||
state.content_returned_up_to = state.buffer.len();
|
|
||||||
return format!("{}{}", content_before, new_content);
|
|
||||||
}
|
|
||||||
|
|
||||||
return content_before;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check for tool call pattern using corrected regex
|
|
||||||
let tool_call_regex = Regex::new(r#"(?m)^\s*\{\s*"tool"\s*:\s*"[^"]*""#).unwrap();
|
|
||||||
|
|
||||||
if let Some(captures) = tool_call_regex.find(&state.buffer) {
|
|
||||||
let match_text = captures.as_str();
|
|
||||||
|
|
||||||
// Find the position of the opening brace in the match
|
|
||||||
if let Some(brace_offset) = match_text.find('{') {
|
|
||||||
let json_start = captures.start() + brace_offset;
|
|
||||||
|
|
||||||
debug!(
|
|
||||||
"Detected JSON tool call at position {} - entering suppression mode",
|
|
||||||
json_start
|
|
||||||
);
|
|
||||||
|
|
||||||
// Return content before JSON that we haven't returned yet
|
|
||||||
let content_before_json = if json_start >= state.content_returned_up_to {
|
|
||||||
state.buffer[state.content_returned_up_to..json_start].to_string()
|
|
||||||
} else {
|
|
||||||
String::new()
|
|
||||||
};
|
|
||||||
|
|
||||||
state.content_returned_up_to = json_start;
|
|
||||||
|
|
||||||
// Enter suppression mode
|
|
||||||
state.suppression_mode = true;
|
|
||||||
state.brace_depth = 0;
|
|
||||||
state.json_start_in_buffer = Some(json_start);
|
|
||||||
|
|
||||||
// Count braces from the JSON start to see if it's complete
|
|
||||||
let buffer_clone = state.buffer.clone();
|
|
||||||
for ch in buffer_clone[json_start..].chars() {
|
|
||||||
match ch {
|
|
||||||
'{' => state.brace_depth += 1,
|
|
||||||
'}' => {
|
|
||||||
state.brace_depth -= 1;
|
|
||||||
if state.brace_depth <= 0 {
|
|
||||||
// JSON is complete in this chunk
|
|
||||||
debug!("JSON tool call completed in same chunk");
|
|
||||||
let result = extract_content_without_json(&buffer_clone, json_start);
|
|
||||||
|
|
||||||
// Return content before JSON plus content after JSON
|
|
||||||
let content_after_json = if result.len() > json_start {
|
|
||||||
&result[json_start..]
|
|
||||||
} else {
|
|
||||||
""
|
|
||||||
};
|
|
||||||
|
|
||||||
let final_result =
|
|
||||||
format!("{}{}", content_before_json, content_after_json);
|
|
||||||
state.reset();
|
|
||||||
return final_result;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
_ => {}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// JSON is incomplete, return only the content before JSON
|
|
||||||
return content_before_json;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// No JSON tool call detected, return only the new content we haven't returned yet
|
output
|
||||||
if state.buffer.len() > state.content_returned_up_to {
|
|
||||||
let result = state.buffer[state.content_returned_up_to..].to_string();
|
|
||||||
state.content_returned_up_to = state.buffer.len();
|
|
||||||
result
|
|
||||||
} else {
|
|
||||||
String::new()
|
|
||||||
}
|
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Extracts content from buffer, removing the JSON tool call.
|
/// Handle a character in Streaming state
|
||||||
///
|
fn handle_streaming_char(state: &mut FilterState, ch: char, output: &mut String) {
|
||||||
/// Given a buffer and the start position of a JSON tool call, this function:
|
match ch {
|
||||||
/// 1. Extracts all content before the JSON
|
'\n' => {
|
||||||
/// 2. Finds the end of the JSON (matching closing brace)
|
// Output the newline and any pending whitespace
|
||||||
/// 3. Extracts all content after the JSON
|
output.push_str(&state.pending_whitespace);
|
||||||
/// 4. Returns the concatenation of before + after (JSON removed)
|
output.push(ch);
|
||||||
///
|
state.pending_whitespace.clear();
|
||||||
/// # Arguments
|
state.at_line_start = true;
|
||||||
/// * `full_content` - The full content buffer
|
|
||||||
/// * `json_start` - Position where the JSON tool call begins
|
|
||||||
fn extract_content_without_json(full_content: &str, json_start: usize) -> String {
|
|
||||||
// Find the end of the JSON using proper brace counting with string handling
|
|
||||||
let mut brace_depth = 0;
|
|
||||||
let mut json_end = json_start;
|
|
||||||
let mut in_string = false;
|
|
||||||
let mut escape_next = false;
|
|
||||||
|
|
||||||
for (i, ch) in full_content[json_start..].char_indices() {
|
|
||||||
if escape_next {
|
|
||||||
escape_next = false;
|
|
||||||
continue;
|
|
||||||
}
|
}
|
||||||
|
' ' | '\t' if state.at_line_start => {
|
||||||
match ch {
|
// Accumulate whitespace at line start
|
||||||
'\\' if in_string => escape_next = true,
|
state.pending_whitespace.push(ch);
|
||||||
'"' if !escape_next => in_string = !in_string,
|
}
|
||||||
'{' if !in_string => {
|
'{' if state.at_line_start => {
|
||||||
brace_depth += 1;
|
// Potential tool call! Enter buffering mode
|
||||||
}
|
debug!("Potential tool call detected - entering Buffering state");
|
||||||
'}' if !in_string => {
|
state.state = State::Buffering;
|
||||||
brace_depth -= 1;
|
state.buffer.clear();
|
||||||
if brace_depth == 0 {
|
state.buffer.push(ch);
|
||||||
json_end = json_start + i + 1; // +1 to include the closing brace
|
// Don't output pending_whitespace yet - we might need to suppress it
|
||||||
break;
|
}
|
||||||
}
|
_ => {
|
||||||
}
|
// Regular character - output any pending whitespace first
|
||||||
_ => {}
|
output.push_str(&state.pending_whitespace);
|
||||||
|
state.pending_whitespace.clear();
|
||||||
|
output.push(ch);
|
||||||
|
state.at_line_start = false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Return content before and after the JSON (excluding the JSON itself)
|
/// Handle a character in Buffering state
|
||||||
let before = &full_content[..json_start];
|
fn handle_buffering_char(state: &mut FilterState, ch: char, output: &mut String) {
|
||||||
let after = if json_end < full_content.len() {
|
state.buffer.push(ch);
|
||||||
&full_content[json_end..]
|
|
||||||
} else {
|
// Check if we can determine tool call status
|
||||||
""
|
match check_tool_pattern(&state.buffer) {
|
||||||
};
|
Some(true) => {
|
||||||
|
// Confirmed tool call! Enter suppression mode
|
||||||
|
debug!("Confirmed tool call - entering Suppressing state");
|
||||||
|
state.state = State::Suppressing;
|
||||||
|
state.brace_depth = 1; // We already have the opening {
|
||||||
|
state.in_string = true; // We're inside the "tool" value string
|
||||||
|
state.escape_next = false;
|
||||||
|
// Discard pending_whitespace (it's part of the tool call line)
|
||||||
|
state.pending_whitespace.clear();
|
||||||
|
state.buffer.clear();
|
||||||
|
}
|
||||||
|
Some(false) => {
|
||||||
|
// Not a tool call - release buffered content
|
||||||
|
debug!("Not a tool call - releasing buffer");
|
||||||
|
output.push_str(&state.pending_whitespace);
|
||||||
|
output.push_str(&state.buffer);
|
||||||
|
state.pending_whitespace.clear();
|
||||||
|
state.buffer.clear();
|
||||||
|
state.state = State::Streaming;
|
||||||
|
state.at_line_start = ch == '\n';
|
||||||
|
}
|
||||||
|
None => {
|
||||||
|
// Need more data - check if buffer is getting too long
|
||||||
|
if state.buffer.len() > MAX_BUFFER_FOR_DETECTION {
|
||||||
|
// Too long without confirmation - not a tool call
|
||||||
|
debug!("Buffer exceeded max length - not a tool call");
|
||||||
|
output.push_str(&state.pending_whitespace);
|
||||||
|
output.push_str(&state.buffer);
|
||||||
|
state.pending_whitespace.clear();
|
||||||
|
state.buffer.clear();
|
||||||
|
state.state = State::Streaming;
|
||||||
|
state.at_line_start = false;
|
||||||
|
}
|
||||||
|
// Otherwise keep buffering
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
format!("{}{}", before, after)
|
/// Handle a character in Suppressing state (string-aware brace counting)
|
||||||
|
fn handle_suppressing_char(state: &mut FilterState, ch: char, _output: &mut String) {
|
||||||
|
// Track chars to detect if we see a new tool call pattern while suppressing
|
||||||
|
// This handles truncated JSON followed by complete JSON
|
||||||
|
state.buffer.push(ch);
|
||||||
|
|
||||||
|
// Handle escape sequences
|
||||||
|
if state.escape_next {
|
||||||
|
state.escape_next = false;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
match ch {
|
||||||
|
'\\' if state.in_string => {
|
||||||
|
state.escape_next = true;
|
||||||
|
}
|
||||||
|
'"' => {
|
||||||
|
state.in_string = !state.in_string;
|
||||||
|
}
|
||||||
|
'{' if !state.in_string => {
|
||||||
|
state.brace_depth += 1;
|
||||||
|
}
|
||||||
|
'}' if !state.in_string => {
|
||||||
|
state.brace_depth -= 1;
|
||||||
|
if state.brace_depth <= 0 {
|
||||||
|
// JSON complete! Return to streaming
|
||||||
|
debug!("Tool call complete - returning to Streaming state");
|
||||||
|
state.state = State::Streaming;
|
||||||
|
state.at_line_start = false; // We're right after the }
|
||||||
|
state.in_string = false;
|
||||||
|
state.escape_next = false;
|
||||||
|
state.buffer.clear();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if we're seeing a new tool call pattern (truncated JSON case)
|
||||||
|
// This can happen with or without a newline before the new {
|
||||||
|
// Look for { followed by tool pattern in the buffer
|
||||||
|
if state.buffer.len() >= 10 {
|
||||||
|
// Find the last { that could start a new tool call
|
||||||
|
for (i, c) in state.buffer.char_indices().rev() {
|
||||||
|
if c == '{' && i > 0 {
|
||||||
|
let potential_tool = &state.buffer[i..];
|
||||||
|
if let Some(true) = check_tool_pattern(potential_tool) {
|
||||||
|
// New tool call detected! Restart suppression from here
|
||||||
|
debug!("New tool call detected while suppressing - restarting");
|
||||||
|
state.brace_depth = 1;
|
||||||
|
state.in_string = true;
|
||||||
|
// Keep only the part after the new { for continued tracking
|
||||||
|
state.buffer = potential_tool.to_string();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Limit buffer size to prevent unbounded growth
|
||||||
|
if state.buffer.len() > 200 {
|
||||||
|
let keep_from = state.buffer.len() - 100;
|
||||||
|
state.buffer = state.buffer[keep_from..].to_string();
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Resets the global JSON filtering state.
|
/// Resets the global JSON filtering state.
|
||||||
@@ -475,3 +339,78 @@ pub fn reset_json_tool_state() {
|
|||||||
state.reset();
|
state.reset();
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Prefixes that already show `"tool"` followed by a string value are confirmed.
    #[test]
    fn test_check_tool_pattern_confirmed() {
        let confirmed_prefixes = [
            r#"{"tool":""
"#,
            r#"{"tool": "shell""#,
            r#"{ "tool" : "test""#,
        ];
        for prefix in confirmed_prefixes {
            assert_eq!(check_tool_pattern(prefix), Some(true));
        }
    }

    /// Objects whose first key is not exactly `"tool"` with a string value are rejected.
    #[test]
    fn test_check_tool_pattern_rejected() {
        let rejected_prefixes = [
            r#"{"other": "value"}"#,
            r#"{"tools": "value"}"#,
            r#"{"tool": 123}"#, // number not string
        ];
        for prefix in rejected_prefixes {
            assert_eq!(check_tool_pattern(prefix), Some(false));
        }
    }

    /// Prefixes too short to decide either way yield `None`.
    #[test]
    fn test_check_tool_pattern_need_more() {
        let undecided_prefixes = [r#"{"#, r#"{"tool"#, r#"{"tool":"#];
        for prefix in undecided_prefixes {
            assert_eq!(check_tool_pattern(prefix), None);
        }
    }

    /// Plain text without any JSON is returned unchanged.
    #[test]
    fn test_passthrough_no_tool() {
        reset_json_tool_state();
        let plain = "Hello world";
        let filtered = filter_json_tool_calls(plain);
        assert_eq!(filtered, plain);
    }

    /// A tool call on its own line is removed; the surrounding newlines remain.
    #[test]
    fn test_simple_tool_filtered() {
        reset_json_tool_state();
        let stream = "Before\n{\"tool\": \"shell\", \"args\": {}}\nAfter";
        assert_eq!(filter_json_tool_calls(stream), "Before\n\nAfter");
    }

    /// A `}` inside a JSON string value is expected not to end the tool call early.
    #[test]
    fn test_tool_with_braces_in_string() {
        reset_json_tool_state();
        let stream = "Text\n{\"tool\": \"shell\", \"args\": {\"cmd\": \"echo }\"}}\nMore";
        assert_eq!(filter_json_tool_calls(stream), "Text\n\nMore");
    }

    /// JSON that is not a tool call is passed through verbatim.
    #[test]
    fn test_non_tool_json_passes_through() {
        reset_json_tool_state();
        let stream = "Text\n{\"other\": \"value\"}\nMore";
        let filtered = filter_json_tool_calls(stream);
        assert_eq!(filtered, stream);
    }

    /// A tool call split across several chunks is filtered just like a whole one.
    #[test]
    fn test_streaming_chunks() {
        reset_json_tool_state();
        let pieces = vec![
            "Before\n",
            "{\"tool\": \"",
            "shell\", \"args\": {}",
            "}\nAfter",
        ];
        // Feed the pieces in order and concatenate whatever the filter emits.
        let filtered: String = pieces
            .into_iter()
            .map(|piece| filter_json_tool_calls(piece))
            .collect();
        assert_eq!(filtered, "Before\n\nAfter");
    }
}
|
||||||
|
|||||||
639
crates/g3-cli/tests/filter_json_stress_test.rs
Normal file
639
crates/g3-cli/tests/filter_json_stress_test.rs
Normal file
@@ -0,0 +1,639 @@
|
|||||||
|
//! Stress tests for JSON tool call filtering.
|
||||||
|
//!
|
||||||
|
//! These tests hammer the filter with malformed JSON, partial tool calls,
|
||||||
|
//! edge cases, and adversarial inputs to ensure robustness.
|
||||||
|
|
||||||
|
use g3_cli::filter_json::{filter_json_tool_calls, reset_json_tool_state};
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// Malformed JSON Tests
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
/// A tool call whose braces never close is expected to be suppressed to end of input.
#[test]
fn test_unclosed_brace_at_end() {
    reset_json_tool_state();
    let truncated = "Text\n{\"tool\": \"shell\", \"args\": {\"cmd\": \"ls\"";
    // Only the text preceding the incomplete tool call should come out.
    assert_eq!(filter_json_tool_calls(truncated), "Text\n");
}
|
||||||
|
|
||||||
|
/// An unterminated string inside the tool call keeps the filter suppressing.
#[test]
fn test_missing_closing_quote() {
    reset_json_tool_state();
    let unbalanced = "Text\n{\"tool\": \"shell\", \"args\": {\"cmd\": \"ls}}\nMore";
    // With the quote left open the braces never balance, so everything from
    // the tool call onwards is expected to be swallowed.
    let filtered = filter_json_tool_calls(unbalanced);
    assert_eq!(filtered, "Text\n");
}
|
||||||
|
|
||||||
|
/// Surplus `}` characters after a balanced tool call are ordinary text.
#[test]
fn test_extra_closing_braces() {
    reset_json_tool_state();
    let stream = "Text\n{\"tool\": \"shell\", \"args\": {}}}}}\nMore";
    // The three braces beyond the balanced object are expected in the output.
    assert_eq!(filter_json_tool_calls(stream), "Text\n}}}\nMore");
}
|
||||||
|
|
||||||
|
/// Deeply nested (and syntactically invalid) brace runs are consumed with the tool call.
#[test]
fn test_deeply_nested_malformed() {
    reset_json_tool_state();
    let stream = "Text\n{\"tool\": \"x\", \"args\": {{{{{{}}}}}}}\nMore";
    // The nested braces count as part of the suppressed tool call.
    assert_eq!(filter_json_tool_calls(stream), "Text\n\nMore");
}
|
||||||
|
|
||||||
|
/// A NUL byte inside a string value must not disturb filtering.
#[test]
fn test_null_bytes_in_json() {
    reset_json_tool_state();
    let stream = "Text\n{\"tool\": \"shell\0\", \"args\": {}}\nMore";
    let filtered = filter_json_tool_calls(stream);
    assert_eq!(filtered, "Text\n\nMore");
}
|
||||||
|
|
||||||
|
/// A non-ASCII tool name still matches the tool call pattern.
#[test]
fn test_unicode_in_tool_name() {
    reset_json_tool_state();
    let stream = "Text\n{\"tool\": \"shëll\", \"args\": {}}\nMore";
    let filtered = filter_json_tool_calls(stream);
    assert_eq!(filtered, "Text\n\nMore");
}
|
||||||
|
|
||||||
|
/// Multi-byte characters (emoji) in the arguments are filtered along with the call.
#[test]
fn test_emoji_in_args() {
    reset_json_tool_state();
    let stream = "Text\n{\"tool\": \"shell\", \"args\": {\"msg\": \"Hello 🎉\"}}\nMore";
    assert_eq!(filter_json_tool_calls(stream), "Text\n\nMore");
}
|
||||||
|
|
||||||
|
/// A 10,000-character string value is suppressed in full.
#[test]
fn test_very_long_string_value() {
    reset_json_tool_state();
    let stream = format!(
        "Text\n{{\"tool\": \"shell\", \"args\": {{\"data\": \"{}\"}}}}\nMore",
        "x".repeat(10000)
    );
    let filtered = filter_json_tool_calls(&stream);
    assert_eq!(filtered, "Text\n\nMore");
}
|
||||||
|
|
||||||
|
/// Many escaped quotes inside one string value must not end the string early.
#[test]
fn test_many_escaped_quotes() {
    reset_json_tool_state();
    let stream = r#"Text
{"tool": "shell", "args": {"cmd": "echo \"a\" \"b\" \"c\" \"d\" \"e\""}}
More"#;
    assert_eq!(filter_json_tool_calls(stream), "Text\n\nMore");
}
|
||||||
|
|
||||||
|
/// An escaped backslash right before the closing quote still lets the string end.
#[test]
fn test_escaped_backslash_before_quote() {
    reset_json_tool_state();
    // On the wire this reads: {"tool": "shell", "args": {"path": "C:\\"}}
    let stream = "Text\n{\"tool\": \"shell\", \"args\": {\"path\": \"C:\\\\\"}}\nMore";
    let filtered = filter_json_tool_calls(stream);
    assert_eq!(filtered, "Text\n\nMore");
}
|
||||||
|
|
||||||
|
/// An escaped `\n` sequence inside a string value stays part of the tool call.
#[test]
fn test_newlines_inside_string() {
    reset_json_tool_state();
    let stream = "Text\n{\"tool\": \"shell\", \"args\": {\"cmd\": \"echo\\nworld\"}}\nMore";
    assert_eq!(filter_json_tool_calls(stream), "Text\n\nMore");
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// Partial Tool Call Pattern Tests
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
/// A lone `{` at line start is held back until the next chunk settles what it is.
#[test]
fn test_just_opening_brace() {
    reset_json_tool_state();

    // The brace is buffered, so only the preceding text is emitted.
    let first = filter_json_tool_calls("Text\n{");
    assert_eq!(first, "Text\n");

    // The follow-up is not a tool call: the withheld brace is released with it.
    let second = filter_json_tool_calls("\"other\": 1}\nMore");
    assert_eq!(second, "{\"other\": 1}\nMore");
}
|
||||||
|
|
||||||
|
/// The `"tool"` keyword and its value may arrive split across many tiny chunks.
#[test]
fn test_partial_tool_keyword() {
    reset_json_tool_state();
    let pieces = vec!["Text\n{", "\"to", "ol", "\": ", "\"sh", "ell\"", ", \"args\": {}", "}\nMore"];
    let filtered: String = pieces
        .into_iter()
        .map(|piece| filter_json_tool_calls(piece))
        .collect();
    assert_eq!(filtered, "Text\n\nMore");
}
|
||||||
|
|
||||||
|
/// `"tool"` without a following colon is not a tool call and passes through.
#[test]
fn test_tool_then_not_colon() {
    reset_json_tool_state();
    let stream = "Text\n{\"tool\" \"shell\"}\nMore"; // Missing colon
    let filtered = filter_json_tool_calls(stream);
    assert_eq!(filtered, stream);
}
|
||||||
|
|
||||||
|
/// A numeric `"tool"` value does not match the pattern and passes through.
#[test]
fn test_tool_colon_then_number() {
    reset_json_tool_state();
    let stream = "Text\n{\"tool\": 123}\nMore"; // Number instead of string
    let filtered = filter_json_tool_calls(stream);
    assert_eq!(filtered, stream);
}
|
||||||
|
|
||||||
|
/// A `null` `"tool"` value does not match the pattern and passes through.
#[test]
fn test_tool_colon_then_null() {
    reset_json_tool_state();
    let stream = "Text\n{\"tool\": null}\nMore";
    let filtered = filter_json_tool_calls(stream);
    assert_eq!(filtered, stream);
}
|
||||||
|
|
||||||
|
/// An array `"tool"` value does not match the pattern and passes through.
#[test]
fn test_tool_colon_then_array() {
    reset_json_tool_state();
    let stream = "Text\n{\"tool\": []}\nMore";
    let filtered = filter_json_tool_calls(stream);
    assert_eq!(filtered, stream);
}
|
||||||
|
|
||||||
|
/// An object `"tool"` value does not match the pattern and passes through.
#[test]
fn test_tool_colon_then_object() {
    reset_json_tool_state();
    let stream = "Text\n{\"tool\": {}}\nMore";
    let filtered = filter_json_tool_calls(stream);
    assert_eq!(filtered, stream);
}
|
||||||
|
|
||||||
|
/// The key `"tools"` is not `"tool"`, so the object passes through.
#[test]
fn test_tools_plural() {
    reset_json_tool_state();
    let stream = "Text\n{\"tools\": \"shell\"}\nMore";
    let filtered = filter_json_tool_calls(stream);
    assert_eq!(filtered, stream);
}
|
||||||
|
|
||||||
|
/// The key `"mytool"` is not `"tool"`, so the object passes through.
#[test]
fn test_tool_with_prefix() {
    reset_json_tool_state();
    let stream = "Text\n{\"mytool\": \"shell\"}\nMore";
    let filtered = filter_json_tool_calls(stream);
    assert_eq!(filtered, stream);
}
|
||||||
|
|
||||||
|
/// Key matching is case-sensitive: `"TOOL"` passes through.
#[test]
fn test_tool_uppercase() {
    reset_json_tool_state();
    let stream = "Text\n{\"TOOL\": \"shell\"}\nMore";
    let filtered = filter_json_tool_calls(stream);
    assert_eq!(filtered, stream);
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// Streaming Edge Cases
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
/// Feeding the stream one character per call gives the same result as one big call.
#[test]
fn test_single_char_streaming() {
    reset_json_tool_state();
    let stream = "Hi\n{\"tool\": \"x\", \"args\": {}}\nBye";
    let filtered: String = stream
        .chars()
        .map(|ch| filter_json_tool_calls(&ch.to_string()))
        .collect();
    assert_eq!(filtered, "Hi\n\nBye");
}
|
||||||
|
|
||||||
|
/// Feeding the stream two characters per call gives the same result as one big call.
#[test]
fn test_two_char_streaming() {
    reset_json_tool_state();
    let stream = "Hi\n{\"tool\": \"x\", \"args\": {}}\nBye";
    let symbols: Vec<char> = stream.chars().collect();
    let mut filtered = String::new();
    for pair in symbols.chunks(2) {
        let piece = pair.iter().collect::<String>();
        filtered.push_str(&filter_json_tool_calls(&piece));
    }
    assert_eq!(filtered, "Hi\n\nBye");
}
|
||||||
|
|
||||||
|
/// The same input, cut into fixed-size chunks of several widths, always filters identically.
#[test]
fn test_random_chunk_sizes() {
    reset_json_tool_state();
    let stream = "Before\n{\"tool\": \"shell\", \"args\": {\"cmd\": \"ls -la\"}}\nAfter";
    let total = stream.len();

    // Replay the whole stream once per chunk width, starting from a clean state.
    for &size in &[1, 3, 7, 11, 13, 17] {
        reset_json_tool_state();
        let mut filtered = String::new();
        for start in (0..total).step_by(size) {
            let stop = total.min(start + size);
            filtered.push_str(&filter_json_tool_calls(&stream[start..stop]));
        }
        assert_eq!(filtered, "Before\n\nAfter", "Failed with chunk size {}", size);
    }
}
|
||||||
|
|
||||||
|
/// Chunk boundaries that isolate the opening and closing braces do not break filtering.
#[test]
fn test_chunk_boundary_at_brace() {
    reset_json_tool_state();
    let pieces = vec!["Text\n", "{", "\"tool\": \"x\", \"args\": {}", "}", "\nMore"];
    let filtered: String = pieces
        .into_iter()
        .map(|piece| filter_json_tool_calls(piece))
        .collect();
    assert_eq!(filtered, "Text\n\nMore");
}
|
||||||
|
|
||||||
|
/// Chunk boundaries that fall right at string quotes do not break filtering.
#[test]
fn test_chunk_boundary_at_quote() {
    reset_json_tool_state();
    let pieces = vec!["Text\n{\"tool\": \"", "shell", "\", \"args\": {}}", "\nMore"];
    let filtered: String = pieces
        .into_iter()
        .map(|piece| filter_json_tool_calls(piece))
        .collect();
    assert_eq!(filtered, "Text\n\nMore");
}
|
||||||
|
|
||||||
|
/// A chunk holding only the colon after `"tool"` does not break filtering.
#[test]
fn test_chunk_boundary_at_colon() {
    reset_json_tool_state();
    let pieces = vec!["Text\n{\"tool\"", ":", " \"shell\", \"args\": {}}\nMore"];
    let filtered: String = pieces
        .into_iter()
        .map(|piece| filter_json_tool_calls(piece))
        .collect();
    assert_eq!(filtered, "Text\n\nMore");
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// Multiple Tool Calls
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
/// Pins the accepted limitation for two back-to-back tool calls on one line.
#[test]
fn test_two_tool_calls_same_line() {
    reset_json_tool_state();
    // No newline separates the two calls.
    let stream = "Text\n{\"tool\": \"a\", \"args\": {}}{\"tool\": \"b\", \"args\": {}}\nMore";
    let filtered = filter_json_tool_calls(stream);
    // The first call begins at line start and is removed. The second begins
    // right after the first's closing brace, not at line start, so it is
    // emitted. That is tolerated: LLMs normally put tool calls on separate lines.
    assert_eq!(filtered, "Text\n{\"tool\": \"b\", \"args\": {}}\nMore");
}
|
||||||
|
|
||||||
|
/// Three tool calls, each on its own line, are all removed.
#[test]
fn test_three_tool_calls_separate_lines() {
    reset_json_tool_state();
    let stream = "A\n{\"tool\": \"x\", \"args\": {}}\nB\n{\"tool\": \"y\", \"args\": {}}\nC\n{\"tool\": \"z\", \"args\": {}}\nD";
    assert_eq!(filter_json_tool_calls(stream), "A\n\nB\n\nC\n\nD");
}
|
||||||
|
|
||||||
|
/// After a tool call has been filtered, ordinary JSON later in the stream is kept.
#[test]
fn test_tool_call_then_regular_json() {
    reset_json_tool_state();
    let stream = "A\n{\"tool\": \"x\", \"args\": {}}\nB\n{\"data\": 123}\nC";
    // Tool call dropped, plain JSON object preserved.
    assert_eq!(filter_json_tool_calls(stream), "A\n\nB\n{\"data\": 123}\nC");
}
|
||||||
|
|
||||||
|
/// Ordinary JSON earlier in the stream does not stop a later tool call being filtered.
#[test]
fn test_regular_json_then_tool_call() {
    reset_json_tool_state();
    let stream = "A\n{\"data\": 123}\nB\n{\"tool\": \"x\", \"args\": {}}\nC";
    assert_eq!(filter_json_tool_calls(stream), "A\n{\"data\": 123}\nB\n\nC");
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// Adversarial Inputs
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
/// A tool-call-shaped snippet embedded in a string value must not trigger filtering.
#[test]
fn test_fake_tool_in_string() {
    reset_json_tool_state();
    // The outer object starts with "message"; the tool pattern only appears
    // inside its string value.
    let stream = r#"Text
{"message": "{\"tool\": \"shell\"}"}
More"#;
    let filtered = filter_json_tool_calls(stream);
    assert_eq!(filtered, stream);
}
|
||||||
|
|
||||||
|
/// A `"tool"` key in a nested object does not make the outer object a tool call.
#[test]
fn test_nested_json_with_tool_key() {
    reset_json_tool_state();
    // The outer object's first key is "outer", so it must pass through.
    let stream = "Text\n{\"outer\": {\"tool\": \"inner\"}}\nMore";
    let filtered = filter_json_tool_calls(stream);
    assert_eq!(filtered, stream);
}
|
||||||
|
|
||||||
|
/// Several levels of properly nested objects are counted correctly.
#[test]
fn test_brace_bomb() {
    reset_json_tool_state();
    // Deep nesting exercises the brace counter.
    let stream = "Text\n{\"tool\": \"x\", \"args\": {\"a\": {\"b\": {\"c\": {\"d\": {\"e\": {}}}}}}}\nMore";
    assert_eq!(filter_json_tool_calls(stream), "Text\n\nMore");
}
|
||||||
|
|
||||||
|
/// Braces inside a string value are not counted towards nesting depth.
#[test]
fn test_string_with_many_braces() {
    reset_json_tool_state();
    let stream = "Text\n{\"tool\": \"x\", \"args\": {\"code\": \"{{{{}}}}\"}}\nMore";
    assert_eq!(filter_json_tool_calls(stream), "Text\n\nMore");
}
|
||||||
|
|
||||||
|
/// Unbalanced, alternating braces inside a string value are ignored by the counter.
#[test]
fn test_alternating_braces_in_string() {
    reset_json_tool_state();
    let stream = "Text\n{\"tool\": \"x\", \"args\": {\"pat\": \"}{}{}{\"}}\nMore";
    assert_eq!(filter_json_tool_calls(stream), "Text\n\nMore");
}
|
||||||
|
|
||||||
|
/// An escaped quote (`\"`) inside a string value does not terminate the string.
#[test]
fn test_quote_after_backslash_in_string() {
    reset_json_tool_state();
    let stream = r#"Text
{"tool": "x", "args": {"msg": "say \"hi\""}}
More"#;
    assert_eq!(filter_json_tool_calls(stream), "Text\n\nMore");
}
|
||||||
|
|
||||||
|
/// After an escaped backslash (`\\`), the following quote really closes the string.
#[test]
fn test_double_backslash_then_quote() {
    reset_json_tool_state();
    let stream = "Text\n{\"tool\": \"x\", \"args\": {\"path\": \"C:\\\\\"}}\nMore";
    assert_eq!(filter_json_tool_calls(stream), "Text\n\nMore");
}
|
||||||
|
|
||||||
|
/// `\\\"` is an escaped backslash followed by an escaped quote; the string continues.
#[test]
fn test_triple_backslash_then_quote() {
    reset_json_tool_state();
    let stream = "Text\n{\"tool\": \"x\", \"args\": {\"s\": \"a\\\\\\\"b\"}}\nMore";
    assert_eq!(filter_json_tool_calls(stream), "Text\n\nMore");
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// Whitespace Variations
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
/// Leading tabs before the opening brace still count as line start.
#[test]
fn test_tabs_before_brace() {
    reset_json_tool_state();
    let stream = "Text\n\t\t{\"tool\": \"x\", \"args\": {}}\nMore";
    assert_eq!(filter_json_tool_calls(stream), "Text\n\nMore");
}
|
||||||
|
|
||||||
|
/// Leading spaces before the opening brace still count as line start.
#[test]
fn test_spaces_before_brace() {
    reset_json_tool_state();
    let stream = "Text\n {\"tool\": \"x\", \"args\": {}}\nMore";
    assert_eq!(filter_json_tool_calls(stream), "Text\n\nMore");
}
|
||||||
|
|
||||||
|
/// A mix of spaces and tabs before the opening brace still counts as line start.
#[test]
fn test_mixed_whitespace_before_brace() {
    reset_json_tool_state();
    let stream = "Text\n \t \t {\"tool\": \"x\", \"args\": {}}\nMore";
    assert_eq!(filter_json_tool_calls(stream), "Text\n\nMore");
}
|
||||||
|
|
||||||
|
/// Whitespace between `{` and the `"tool"` key is tolerated.
#[test]
fn test_space_after_opening_brace() {
    reset_json_tool_state();
    let stream = "Text\n{ \"tool\": \"x\", \"args\": {}}\nMore";
    assert_eq!(filter_json_tool_calls(stream), "Text\n\nMore");
}
|
||||||
|
|
||||||
|
/// Whitespace around every JSON token is tolerated.
#[test]
fn test_lots_of_space_in_json() {
    reset_json_tool_state();
    let stream = "Text\n{ \"tool\" : \"x\" , \"args\" : { } }\nMore";
    assert_eq!(filter_json_tool_calls(stream), "Text\n\nMore");
}
|
||||||
|
|
||||||
|
/// Tool calls are still found and removed when lines end in `\r\n`.
#[test]
fn test_crlf_line_endings() {
    reset_json_tool_state();
    let stream = "Text\r\n{\"tool\": \"x\", \"args\": {}}\r\nMore";
    let filtered = filter_json_tool_calls(stream);
    // It is the `\n` that marks a line start; the `\r` is an ordinary
    // character and is expected to survive on both sides of the removed call.
    assert_eq!(filtered, "Text\r\n\r\nMore");
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// Empty and Minimal Cases
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
/// An empty chunk produces empty output.
#[test]
fn test_empty_input() {
    reset_json_tool_state();
    let filtered = filter_json_tool_calls("");
    assert_eq!(filtered, "");
}
|
||||||
|
|
||||||
|
/// A lone newline is emitted unchanged.
#[test]
fn test_just_newline() {
    reset_json_tool_state();
    let filtered = filter_json_tool_calls("\n");
    assert_eq!(filtered, "\n");
}
|
||||||
|
|
||||||
|
/// A `{` as the very first character is buffered, then released once it proves harmless.
#[test]
fn test_just_brace() {
    reset_json_tool_state();

    // Start of input counts as line start, so the brace is withheld.
    let held = filter_json_tool_calls("{");
    assert_eq!(held, "");

    // The continuation is not a tool call: the brace is flushed with the rest.
    let released = filter_json_tool_calls("}\n");
    assert_eq!(released, "{}\n");
}
|
||||||
|
|
||||||
|
/// A compact tool call with no surrounding text filters down to nothing.
#[test]
fn test_minimal_tool_call() {
    reset_json_tool_state();
    let stream = "{\"tool\":\"x\",\"args\":{}}";
    assert_eq!(filter_json_tool_calls(stream), "");
}
|
||||||
|
|
||||||
|
/// A tool call at the very beginning of the stream is removed; later text is kept.
#[test]
fn test_tool_call_at_very_start() {
    reset_json_tool_state();
    let stream = "{\"tool\": \"x\", \"args\": {}}\nAfter";
    assert_eq!(filter_json_tool_calls(stream), "\nAfter");
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// State Reset Tests
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
/// Resetting while a `{` is being buffered discards that pending state.
#[test]
fn test_reset_clears_buffering_state() {
    reset_json_tool_state();

    // Leave the filter mid-way through a possible tool call.
    let _ = filter_json_tool_calls("Text\n{");

    reset_json_tool_state();

    // Nothing from the abandoned stream may leak into the new one.
    let fresh = filter_json_tool_calls("Fresh start");
    assert_eq!(fresh, "Fresh start");
}
|
||||||
|
|
||||||
|
/// Resetting while a tool call is being suppressed returns the filter to pass-through.
#[test]
fn test_reset_clears_suppressing_state() {
    reset_json_tool_state();

    // Leave the filter inside an unfinished, confirmed tool call.
    let _ = filter_json_tool_calls("Text\n{\"tool\": \"x\", \"args\": {");

    reset_json_tool_state();

    // Text sent after the reset must not be swallowed.
    let fresh = filter_json_tool_calls("Fresh start");
    assert_eq!(fresh, "Fresh start");
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// Real-World Patterns from Bug Reports
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
/// A `str_replace` call carrying a unified diff (braces, escaped quotes) is removed whole.
#[test]
fn test_str_replace_with_diff() {
    reset_json_tool_state();
    let stream = r#"I'll update the file:
{"tool": "str_replace", "args": {"file_path": "src/main.rs", "diff": "@@ -1,3 +1,4 @@\n fn main() {\n+ println!(\"Hello\");\n }"}}
Done!"#;
    assert_eq!(filter_json_tool_calls(stream), "I'll update the file:\n\nDone!");
}
|
||||||
|
|
||||||
|
/// A shell command containing `{}` and `\;` inside its string is removed whole.
#[test]
fn test_shell_with_complex_command() {
    reset_json_tool_state();
    let stream = r#"Running command:
{"tool": "shell", "args": {"command": "find . -name '*.rs' -exec grep -l 'TODO' {} \;"}}
Results above."#;
    assert_eq!(filter_json_tool_calls(stream), "Running command:\n\nResults above.");
}
|
||||||
|
|
||||||
|
/// A `write_file` call whose content is itself escaped JSON is removed whole.
#[test]
fn test_write_file_with_json_content() {
    reset_json_tool_state();
    let stream = r#"Creating config:
{"tool": "write_file", "args": {"file_path": "config.json", "content": "{\"key\": \"value\"}"}}
File created."#;
    assert_eq!(filter_json_tool_calls(stream), "Creating config:\n\nFile created.");
}
|
||||||
|
|
||||||
|
/// A plain `read_file` call between two sentences is removed.
#[test]
fn test_read_file_simple() {
    reset_json_tool_state();
    let stream = "Let me check:\n{\"tool\": \"read_file\", \"args\": {\"file_path\": \"README.md\"}}\nHere's what I found:";
    assert_eq!(filter_json_tool_calls(stream), "Let me check:\n\nHere's what I found:");
}
|
||||||
|
|
||||||
|
/// A `final_output` call with a markdown summary full of escaped newlines is removed.
#[test]
fn test_final_output() {
    reset_json_tool_state();
    let stream = "Task complete.\n{\"tool\": \"final_output\", \"args\": {\"summary\": \"# Summary\\n\\nI completed the task.\\n\\n## Details\\n- Item 1\\n- Item 2\"}}\n";
    assert_eq!(filter_json_tool_calls(stream), "Task complete.\n\n");
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// Truncated JSON followed by Complete JSON (the original bug)
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
/// A truncated tool call followed by a complete retry: neither may be printed.
#[test]
fn test_truncated_then_complete_streaming() {
    reset_json_tool_state();

    // Each step pairs an incoming chunk with the output expected for it.
    let steps = [
        // Plain text.
        ("Some text\n", "Some text\n"),
        // Truncated tool call.
        (r#"{"tool": "str_replace", "args": {"diff":"partial"#, ""),
        // New, complete tool call (LLM retry).
        (
            r#"{"tool": "str_replace", "args": {"diff":"complete", "file_path":"x.rs"}}"#,
            "",
        ),
        // Text after the tool call.
        ("\nMore text", "\nMore text"),
    ];

    for (chunk, expected) in steps {
        assert_eq!(filter_json_tool_calls(chunk), expected);
    }
}
|
||||||
|
|
||||||
|
/// Two truncated attempts followed by a complete tool call are all suppressed.
#[test]
fn test_multiple_truncated_then_complete() {
    reset_json_tool_state();

    let pieces = vec![
        "Start\n",
        r#"{"tool": "a", "args": {"x": "trunc"#, // truncated
        r#"{"tool": "b", "args": {"y": "also_trunc"#, // another truncated
        r#"{"tool": "c", "args": {"z": "complete"}}"#, // finally complete
        "\nEnd",
    ];

    let filtered: String = pieces
        .into_iter()
        .map(|piece| filter_json_tool_calls(piece))
        .collect();

    assert_eq!(filtered, "Start\n\nEnd");
}
|
||||||
Reference in New Issue
Block a user