Fix JSON filter to preserve code-fenced and indented content
Two cosmetic bugs fixed:
1. JSON inside code fences was being filtered. The filter now tracks fence
   state and passes through all content inside ``` ... ``` blocks.
2. Indented JSON was being filtered. The filter now relies on the fact that
   real tool calls are never indented, so indented JSON is always treated as
   documentation.
Changes:
- Added in_code_fence and fence_buffer fields to FilterState
- Added track_code_fence() to detect ``` markers (with/without language)
- Added pass_through_char() for content inside code fences
- Modified '{' handling to only filter when no leading whitespace
- Added 4 new unit tests for code fence and indentation cases
- Updated 3 stress tests to expect new (correct) behavior
All 16 filter_json unit tests and 59 stress tests pass.
This commit is contained in:
@@ -54,6 +54,10 @@ struct FilterState {
|
|||||||
state: State,
|
state: State,
|
||||||
/// Buffer for potential tool call detection (Buffering state)
|
/// Buffer for potential tool call detection (Buffering state)
|
||||||
buffer: String,
|
buffer: String,
|
||||||
|
/// Are we inside a code fence? (``` ... ```)
|
||||||
|
in_code_fence: bool,
|
||||||
|
/// Buffer for detecting code fence markers
|
||||||
|
fence_buffer: String,
|
||||||
/// Brace depth for JSON tracking (Suppressing state) - string-aware
|
/// Brace depth for JSON tracking (Suppressing state) - string-aware
|
||||||
brace_depth: i32,
|
brace_depth: i32,
|
||||||
/// Are we inside a JSON string? (for proper brace counting)
|
/// Are we inside a JSON string? (for proper brace counting)
|
||||||
@@ -73,6 +77,8 @@ impl FilterState {
|
|||||||
Self {
|
Self {
|
||||||
state: State::Streaming,
|
state: State::Streaming,
|
||||||
buffer: String::new(),
|
buffer: String::new(),
|
||||||
|
in_code_fence: false,
|
||||||
|
fence_buffer: String::new(),
|
||||||
brace_depth: 0,
|
brace_depth: 0,
|
||||||
in_string: false,
|
in_string: false,
|
||||||
escape_next: false,
|
escape_next: false,
|
||||||
@@ -85,6 +91,8 @@ impl FilterState {
|
|||||||
fn reset(&mut self) {
|
fn reset(&mut self) {
|
||||||
self.state = State::Streaming;
|
self.state = State::Streaming;
|
||||||
self.buffer.clear();
|
self.buffer.clear();
|
||||||
|
self.in_code_fence = false;
|
||||||
|
self.fence_buffer.clear();
|
||||||
self.brace_depth = 0;
|
self.brace_depth = 0;
|
||||||
self.in_string = false;
|
self.in_string = false;
|
||||||
self.escape_next = false;
|
self.escape_next = false;
|
||||||
@@ -185,6 +193,15 @@ pub fn filter_json_tool_calls(content: &str) -> String {
|
|||||||
|
|
||||||
/// Handle a character in Streaming state
|
/// Handle a character in Streaming state
|
||||||
fn handle_streaming_char(state: &mut FilterState, ch: char, output: &mut String) {
|
fn handle_streaming_char(state: &mut FilterState, ch: char, output: &mut String) {
|
||||||
|
// Track code fence state
|
||||||
|
track_code_fence(state, ch);
|
||||||
|
|
||||||
|
// If inside a code fence, pass through everything
|
||||||
|
if state.in_code_fence {
|
||||||
|
pass_through_char(state, ch, output);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
match ch {
|
match ch {
|
||||||
'\n' => {
|
'\n' => {
|
||||||
// Buffer extra newlines at line start - they may precede a tool call
|
// Buffer extra newlines at line start - they may precede a tool call
|
||||||
@@ -202,14 +219,24 @@ fn handle_streaming_char(state: &mut FilterState, ch: char, output: &mut String)
|
|||||||
// Accumulate whitespace at line start
|
// Accumulate whitespace at line start
|
||||||
state.pending_whitespace.push(ch);
|
state.pending_whitespace.push(ch);
|
||||||
}
|
}
|
||||||
'{' if state.at_line_start => {
|
'{' if state.at_line_start && state.pending_whitespace.is_empty() => {
|
||||||
// Potential tool call! Enter buffering mode
|
// Potential tool call! Enter buffering mode
|
||||||
|
// BUT only if there's no leading whitespace (indented JSON is not a tool call)
|
||||||
debug!("Potential tool call detected - entering Buffering state");
|
debug!("Potential tool call detected - entering Buffering state");
|
||||||
state.state = State::Buffering;
|
state.state = State::Buffering;
|
||||||
state.buffer.clear();
|
state.buffer.clear();
|
||||||
state.buffer.push(ch);
|
state.buffer.push(ch);
|
||||||
// Don't output pending_newlines or pending_whitespace yet - we might need to suppress them
|
// Don't output pending_newlines or pending_whitespace yet - we might need to suppress them
|
||||||
}
|
}
|
||||||
|
'{' if state.at_line_start && !state.pending_whitespace.is_empty() => {
|
||||||
|
// Indented JSON - not a tool call, pass through
|
||||||
|
output.push_str(&state.pending_newlines);
|
||||||
|
output.push_str(&state.pending_whitespace);
|
||||||
|
state.pending_newlines.clear();
|
||||||
|
state.pending_whitespace.clear();
|
||||||
|
output.push(ch);
|
||||||
|
state.at_line_start = false;
|
||||||
|
}
|
||||||
_ => {
|
_ => {
|
||||||
// Regular character - output any pending newlines and whitespace first
|
// Regular character - output any pending newlines and whitespace first
|
||||||
output.push_str(&state.pending_newlines);
|
output.push_str(&state.pending_newlines);
|
||||||
@@ -222,6 +249,45 @@ fn handle_streaming_char(state: &mut FilterState, ch: char, output: &mut String)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Pass through a character without filtering (used inside code fences)
|
||||||
|
fn pass_through_char(state: &mut FilterState, ch: char, output: &mut String) {
|
||||||
|
// Output any pending content first
|
||||||
|
output.push_str(&state.pending_newlines);
|
||||||
|
output.push_str(&state.pending_whitespace);
|
||||||
|
state.pending_newlines.clear();
|
||||||
|
state.pending_whitespace.clear();
|
||||||
|
output.push(ch);
|
||||||
|
state.at_line_start = ch == '\n';
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Track code fence state (``` markers)
|
||||||
|
fn track_code_fence(state: &mut FilterState, ch: char) {
|
||||||
|
match ch {
|
||||||
|
'`' => {
|
||||||
|
state.fence_buffer.push(ch);
|
||||||
|
}
|
||||||
|
'\n' => {
|
||||||
|
// Check if we have a fence marker
|
||||||
|
if state.fence_buffer.starts_with("```") {
|
||||||
|
// Toggle fence state
|
||||||
|
state.in_code_fence = !state.in_code_fence;
|
||||||
|
debug!("Code fence toggled: in_code_fence={}", state.in_code_fence);
|
||||||
|
}
|
||||||
|
state.fence_buffer.clear();
|
||||||
|
}
|
||||||
|
_ => {
|
||||||
|
// If we were accumulating backticks but got something else,
|
||||||
|
// check if we have a fence marker (for opening fences with language)
|
||||||
|
if state.fence_buffer.starts_with("```") && !state.in_code_fence {
|
||||||
|
// Opening fence with language specifier (e.g., ```json)
|
||||||
|
state.in_code_fence = true;
|
||||||
|
debug!("Code fence opened with language: in_code_fence=true");
|
||||||
|
}
|
||||||
|
state.fence_buffer.clear();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Handle a character in Buffering state
|
/// Handle a character in Buffering state
|
||||||
fn handle_buffering_char(state: &mut FilterState, ch: char, output: &mut String) {
|
fn handle_buffering_char(state: &mut FilterState, ch: char, output: &mut String) {
|
||||||
state.buffer.push(ch);
|
state.buffer.push(ch);
|
||||||
@@ -508,4 +574,40 @@ mod tests {
|
|||||||
let result = filter_json_tool_calls(input);
|
let result = filter_json_tool_calls(input);
|
||||||
assert_eq!(result, input, "Tool calls not at line start should pass through");
|
assert_eq!(result, input, "Tool calls not at line start should pass through");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn test_tool_json_in_code_fence_passes_through() {
    // A ```json fenced block is documentation: tool-shaped JSON inside it must survive
    reset_json_tool_state();
    let fenced = "Before\n```json\n{\"tool\": \"shell\", \"args\": {}}\n```\nAfter";
    assert_eq!(
        filter_json_tool_calls(fenced),
        fenced,
        "Tool JSON inside code fence should pass through"
    );
}
|
||||||
|
|
||||||
|
#[test]
fn test_tool_json_in_plain_code_fence_passes_through() {
    // Same expectation when the fence carries no language specifier
    reset_json_tool_state();
    let fenced = "Before\n```\n{\"tool\": \"shell\", \"args\": {}}\n```\nAfter";
    assert_eq!(
        filter_json_tool_calls(fenced),
        fenced,
        "Tool JSON inside plain code fence should pass through"
    );
}
|
||||||
|
|
||||||
|
#[test]
fn test_indented_tool_json_passes_through() {
    // Space-indented JSON is never a real tool call, so it must be left untouched
    reset_json_tool_state();
    let indented = "Before\n    {\"tool\": \"shell\", \"args\": {}}\nAfter";
    assert_eq!(
        filter_json_tool_calls(indented),
        indented,
        "Indented tool JSON should pass through"
    );
}
|
||||||
|
|
||||||
|
#[test]
fn test_tab_indented_tool_json_passes_through() {
    // A leading tab counts as indentation just like spaces do
    reset_json_tool_state();
    let indented = "Before\n\t{\"tool\": \"shell\", \"args\": {}}\nAfter";
    assert_eq!(
        filter_json_tool_calls(indented),
        indented,
        "Tab-indented tool JSON should pass through"
    );
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -420,7 +420,8 @@ fn test_tabs_before_brace() {
|
|||||||
reset_json_tool_state();
|
reset_json_tool_state();
|
||||||
let input = "Text\n\t\t{\"tool\": \"x\", \"args\": {}}\nMore";
|
let input = "Text\n\t\t{\"tool\": \"x\", \"args\": {}}\nMore";
|
||||||
let result = filter_json_tool_calls(input);
|
let result = filter_json_tool_calls(input);
|
||||||
assert_eq!(result, "Text\n\nMore");
|
// Indented JSON should NOT be filtered - real tool calls are never indented
|
||||||
|
assert_eq!(result, input);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@@ -428,7 +429,8 @@ fn test_spaces_before_brace() {
|
|||||||
reset_json_tool_state();
|
reset_json_tool_state();
|
||||||
let input = "Text\n {\"tool\": \"x\", \"args\": {}}\nMore";
|
let input = "Text\n {\"tool\": \"x\", \"args\": {}}\nMore";
|
||||||
let result = filter_json_tool_calls(input);
|
let result = filter_json_tool_calls(input);
|
||||||
assert_eq!(result, "Text\n\nMore");
|
// Indented JSON should NOT be filtered - real tool calls are never indented
|
||||||
|
assert_eq!(result, input);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@@ -436,7 +438,8 @@ fn test_mixed_whitespace_before_brace() {
|
|||||||
reset_json_tool_state();
|
reset_json_tool_state();
|
||||||
let input = "Text\n \t \t {\"tool\": \"x\", \"args\": {}}\nMore";
|
let input = "Text\n \t \t {\"tool\": \"x\", \"args\": {}}\nMore";
|
||||||
let result = filter_json_tool_calls(input);
|
let result = filter_json_tool_calls(input);
|
||||||
assert_eq!(result, "Text\n\nMore");
|
// Indented JSON should NOT be filtered - real tool calls are never indented
|
||||||
|
assert_eq!(result, input);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
|||||||
Reference in New Issue
Block a user