Fix inline JSON being incorrectly detected as tool call

The bug was caused by mark_tool_calls_consumed() being called after
displaying each chunk, which advanced last_consumed_position to the
end of the current buffer. When the next chunk arrived with JSON,
the unchecked_buffer started at position 0 of the slice, causing
is_on_own_line() to return true (position 0 is always "on its own line").

Removed the problematic mark_tool_calls_consumed() call from the
"no tool executed" branch. The remaining call after actual tool
execution is correct and necessary.

Added integration test that verifies inline JSON in prose is not
detected as a tool call.
This commit is contained in:
Dhanji R. Prasanna
2026-01-19 14:35:01 +05:30
parent 292a3aa48d
commit 5caa101b84
4 changed files with 311 additions and 7 deletions

View File

@@ -1,5 +1,5 @@
# Project Memory # Project Memory
> Updated: 2026-01-15T00:50:41Z | Size: 12.9k chars > Updated: 2026-01-19T08:32:03Z | Size: 14.0k chars
### Remember Tool Wiring ### Remember Tool Wiring
- `crates/g3-core/src/tools/memory.rs` [0..5000] - `execute_remember()`, `get_memory_path()`, `merge_memory()` - `crates/g3-core/src/tools/memory.rs` [0..5000] - `execute_remember()`, `get_memory_path()`, `merge_memory()`
@@ -229,4 +229,27 @@ Injects agent+language-specific guidance when running in agent mode in a workspa
To add a new agent+lang prompt: To add a new agent+lang prompt:
1. Create `prompts/langs/<agent>.<lang>.md` 1. Create `prompts/langs/<agent>.<lang>.md`
2. Add entry to `AGENT_LANGUAGE_PROMPTS` in `language_prompts.rs` with `include_str!` 2. Add entry to `AGENT_LANGUAGE_PROMPTS` in `language_prompts.rs` with `include_str!`
### MockProvider for Testing
Configurable mock LLM provider for integration testing without real API calls.
- `crates/g3-providers/src/mock.rs`
- `MockProvider` [220..320] - mock provider with response queue, request tracking
- `MockResponse` [35..200] - configurable response with chunks and usage
- `MockChunk` [45..100] - individual streaming chunk (content, finished, tool_calls)
- `scenarios` module [410..480] - preset scenarios: `text_only_response()`, `multi_turn()`, `tool_then_response()`
- `crates/g3-core/tests/mock_provider_integration_test.rs`
- `test_butler_bug_scenario()` - reproduces consecutive user messages bug
- `test_text_only_response_saves_to_context()` - verifies text responses saved
- `test_multi_turn_text_only_maintains_alternation()` - verifies user/assistant alternation
Usage pattern:
```rust
let provider = MockProvider::new()
.with_response(MockResponse::text("Hello!"));
let mut registry = ProviderRegistry::new();
registry.register(provider);
let agent = Agent::new_for_test(config, NullUiWriter, registry).await?;
```

View File

@@ -2432,11 +2432,6 @@ Skip if nothing new. Be brief."#;
self.ui_writer.print_agent_response(&filtered_content); self.ui_writer.print_agent_response(&filtered_content);
self.ui_writer.flush(); self.ui_writer.flush();
current_response.push_str(&filtered_content); current_response.push_str(&filtered_content);
// Mark parser buffer as consumed up to current position
// This prevents tool-call-like patterns in displayed text
// from triggering false positives in has_unexecuted_tool_call()
parser.mark_tool_calls_consumed();
} }
} }
} }

View File

@@ -226,3 +226,239 @@ async fn test_butler_bug_scenario() {
assistant_count assistant_count
); );
} }
// =============================================================================
// Parser Poisoning Tests (commits 999ac6f, d68f059, 4c36cc0)
// =============================================================================
/// Test the parser directly with the same chunks to isolate the issue
#[tokio::test]
async fn test_parser_directly_with_inline_json_chunks() {
    use g3_core::streaming_parser::StreamingToolParser;
    use g3_providers::CompletionChunk;

    // Helper: a non-final text chunk with no tool-call metadata attached.
    fn text_chunk(content: &str) -> CompletionChunk {
        CompletionChunk {
            content: content.to_string(),
            tool_calls: None,
            finished: false,
            stop_reason: None,
            tool_call_streaming: None,
            usage: None,
        }
    }

    let mut parser = StreamingToolParser::new();

    // Simulate the exact chunks from the mock provider: prose first,
    // then the JSON continuing on the SAME line (inline, not standalone).
    let prose = text_chunk("To run a command, you can use the format ");
    let inline_json = text_chunk(r#"{"tool": "shell", "args": {"command": "ls"}}"#);

    let first = parser.process_chunk(&prose);
    assert!(first.is_empty(), "Chunk 1 should not produce tools");

    let second = parser.process_chunk(&inline_json);
    assert!(second.is_empty(), "Chunk 2 should NOT produce tools - JSON is inline, not on its own line");

    // Also check has_unexecuted_tool_call and has_incomplete_tool_call
    assert!(!parser.has_unexecuted_tool_call(), "Should NOT have unexecuted tool call - JSON is inline");
    assert!(!parser.has_incomplete_tool_call(), "Should NOT have incomplete tool call");
}
// These tests verify that inline JSON patterns in prose don't trigger
// false tool call detection, which would cause the agent to return
// control mid-task.
/// Test: Inline JSON in prose should NOT trigger tool call detection
///
/// Bug: When the LLM explained tool call format in prose like:
/// "You can use {"tool": "shell", ...} to run commands"
/// The parser would incorrectly detect this as a tool call.
///
/// Fix: Only detect tool calls that appear on their own line.
#[tokio::test]
async fn test_inline_json_in_prose_not_detected_as_tool() {
    // Prose with a tool-call-shaped JSON fragment embedded mid-sentence.
    let chunks = vec![
        "To run a command, you can use the format ",
        r#"{"tool": "shell", "args": {"command": "ls"}}"#,
        " in your request. ",
        "Let me know if you need help!",
    ];
    // Add a default response in case auto-continue is triggered (which would be a bug)
    let provider = MockProvider::new()
        .with_response(MockResponse::streaming(chunks))
        .with_default_response(MockResponse::text(
            "[BUG: Auto-continue was triggered - inline JSON was detected as tool call]",
        ));

    let (mut agent, _temp_dir) = create_agent_with_mock(provider).await;
    let result = agent.execute_task("How do I run commands?", None, false).await;
    assert!(result.is_ok(), "Task should succeed: {:?}", result.err());

    // The response should be saved as text, not executed as a tool
    let history = &agent.get_context_window().conversation_history;
    let assistant_msg = history
        .iter()
        .rev()
        .find(|m| matches!(m.role, MessageRole::Assistant))
        .expect("Should have an assistant message");

    // The inline JSON should be preserved in the response
    let json_preserved =
        assistant_msg.content.contains("tool") && assistant_msg.content.contains("shell");
    assert!(
        json_preserved,
        "Response should contain the inline JSON example: {}",
        assistant_msg.content
    );
}
/// Test: JSON tool call on its own line SHOULD be detected
///
/// This is the normal case - real tool calls from LLMs appear on their own line.
#[tokio::test]
async fn test_json_on_own_line_detected_as_tool() {
    // This test uses native tool calling to verify tool detection works
    let provider = MockProvider::new()
        .with_native_tool_calling(true)
        .with_response(MockResponse::native_tool_call(
            "shell",
            serde_json::json!({"command": "echo hello"}),
        ));
    let (mut agent, _temp_dir) = create_agent_with_mock(provider).await;
    // The task should detect the tool call.
    // Note: This may fail because we don't have a real shell, but that's OK -
    // we just want to verify the tool call was detected. The result is
    // intentionally ignored (underscore binding avoids an unused-variable warning).
    let _result = agent.execute_task("Run echo hello", None, false).await;
    // We can check whether a tool was attempted by looking at context
    let history = &agent.get_context_window().conversation_history;
    // Should have user message at minimum
    assert!(
        history.iter().any(|m| matches!(m.role, MessageRole::User)),
        "Should have user message"
    );
}
/// Test: Response with emoji and special characters doesn't crash
///
/// Bug: UTF-8 multi-byte characters caused panics when truncating strings
/// using byte indices instead of character indices.
#[tokio::test]
async fn test_utf8_response_handling() {
    let provider = MockProvider::new()
        .with_response(MockResponse::text(
            "Here's the result! 🎉\n\n\
            • First item with bullet\n\
            • Second item 中文字符\n\
            • Third item émojis: 🚀🔥💯\n\n\
            Done! ✅"
        ));
    let (mut agent, _temp_dir) = create_agent_with_mock(provider).await;
    let result = agent.execute_task("Show me a list", None, false).await;
    assert!(result.is_ok(), "Task should succeed with UTF-8 content: {:?}", result.err());
    let history = &agent.get_context_window().conversation_history;
    let assistant_msg = history
        .iter()
        .rev()
        .find(|m| matches!(m.role, MessageRole::Assistant))
        .expect("Should have an assistant message");
    // Verify the UTF-8 content is preserved
    assert!(assistant_msg.content.contains("🎉"), "Should contain emoji");
    assert!(assistant_msg.content.contains("中文"), "Should contain CJK characters");
    // Fixed: `contains("")` is trivially true (the empty pattern matches any
    // string), so the bullet-point assertion previously tested nothing.
    assert!(assistant_msg.content.contains("•"), "Should contain bullet points");
}
/// Test: Very long response with UTF-8 doesn't panic on truncation
#[tokio::test]
async fn test_long_utf8_response_no_panic() {
    // Build a response dominated by multi-byte characters: a long emoji run
    // followed by repeated mixed-script prose.
    let emoji_run = "🔥".repeat(1000);
    let prose_run = "Test content with emoji 🎉 and more 中文字符 here. ".repeat(100);
    let long_content = emoji_run + &prose_run;

    let provider = MockProvider::new().with_response(MockResponse::text(&long_content));
    let (mut agent, _temp_dir) = create_agent_with_mock(provider).await;

    // Must not panic even with lots of multi-byte characters
    let result = agent.execute_task("Generate long content", None, false).await;
    assert!(result.is_ok(), "Task should succeed: {:?}", result.err());
}
/// Test: Response is not duplicated in output
///
/// Bug (cebec23): Response was printed twice - once during streaming
/// and again after task completion.
#[tokio::test]
async fn test_response_not_duplicated() {
    let provider = MockProvider::new()
        .with_response(MockResponse::text("This is a unique response that should appear once."));
    let (mut agent, _temp_dir) = create_agent_with_mock(provider).await;
    let result = agent.execute_task("Say something unique", None, false).await;
    assert!(result.is_ok(), "Task should succeed: {:?}", result.err());
    // Check the TaskResult - it should have the response. The value itself is
    // not inspected (underscore binding avoids an unused-variable warning);
    // the unwrap also re-confirms the task succeeded.
    let _task_result = result.unwrap();
    // The response field might be empty (content was streamed) or contain the response
    // Either way, the context should have exactly one assistant message with this content
    let history = &agent.get_context_window().conversation_history;
    let assistant_messages: Vec<_> = history
        .iter()
        .filter(|m| matches!(m.role, MessageRole::Assistant))
        .filter(|m| m.content.contains("unique response"))
        .collect();
    assert_eq!(
        assistant_messages.len(), 1,
        "Should have exactly one assistant message with the response, got {}",
        assistant_messages.len()
    );
}
/// Test: Multiple chunks streamed correctly without duplication
#[tokio::test]
async fn test_streaming_no_chunk_duplication() {
    let provider = MockProvider::new()
        .with_response(MockResponse::streaming(vec![
            "Part 1. ",
            "Part 2. ",
            "Part 3. ",
            "Part 4.",
        ]));
    let (mut agent, _temp_dir) = create_agent_with_mock(provider).await;
    let result = agent.execute_task("Stream something", None, false).await;
    assert!(result.is_ok(), "Task should succeed: {:?}", result.err());
    let history = &agent.get_context_window().conversation_history;
    let assistant_msg = history
        .iter()
        .rev()
        .find(|m| matches!(m.role, MessageRole::Assistant))
        .expect("Should have an assistant message");
    // Each part should appear exactly once.
    // Improved: previously only Parts 1-2 were checked; now all four streamed
    // chunks are verified so duplication of any chunk is caught.
    let content = &assistant_msg.content;
    for needle in ["Part 1", "Part 2", "Part 3", "Part 4"] {
        assert_eq!(
            content.matches(needle).count(), 1,
            "{} should appear exactly once in: {}",
            needle,
            content
        );
    }
}

View File

@@ -432,3 +432,53 @@ mod streaming_repro {
assert_eq!(tools[0].tool, "shell"); assert_eq!(tools[0].tool, "shell");
} }
} }
/// Test that inline JSON is not detected even when stream finishes
/// This tests the try_parse_all_json_tool_calls_from_buffer path
#[test]
fn test_inline_json_not_detected_at_stream_end() {
    use g3_core::StreamingToolParser;
    use g3_providers::CompletionChunk;

    // Build a chunk the way MockProvider would emit it; a finished chunk
    // carries an "end_turn" stop reason.
    let make_chunk = |content: &str, finished: bool| CompletionChunk {
        content: content.to_string(),
        finished,
        tool_calls: None,
        usage: None,
        stop_reason: finished.then(|| "end_turn".to_string()),
        tool_call_streaming: None,
    };

    let mut parser = StreamingToolParser::new();

    // Send chunks exactly as MockProvider would: prose, inline JSON on the
    // same line, then more prose.
    let script = [
        ("To run a command, you can use the format ", "Chunk 1: no tools"),
        (
            r#"{"tool": "shell", "args": {"command": "ls"}}"#,
            "Chunk 2: inline JSON should not trigger tool detection",
        ),
        (" in your request. ", "Chunk 3: no tools"),
        ("Let me know if you need help!", "Chunk 4: no tools"),
    ];
    for (content, message) in script {
        let tools = parser.process_chunk(&make_chunk(content, false));
        assert!(tools.is_empty(), "{}", message);
    }

    // Finish chunk - this triggers try_parse_all_json_tool_calls_from_buffer
    let tools = parser.process_chunk(&make_chunk("", true));
    assert!(
        tools.is_empty(),
        "Finish chunk: inline JSON should NOT be detected as tool call. Found: {:?}",
        tools.iter().map(|t| &t.tool).collect::<Vec<_>>()
    );

    // Verify the full buffer content
    let buffer = parser.get_text_content();
    assert!(
        buffer.contains(r#"{"tool": "shell"#),
        "Buffer should contain the inline JSON: {}",
        buffer
    );
}