Compare commits
23 Commits
jochen-deb
...
jochen-fix
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4aa84e2144 | ||
|
|
2283d9ddbf | ||
|
|
fb2cf6f898 | ||
|
|
696c441a47 | ||
|
|
48e6d594bc | ||
|
|
678403da35 | ||
|
|
0970e4f356 | ||
|
|
758a313de0 | ||
|
|
0327a6dfdf | ||
|
|
928f2bfa9d | ||
|
|
21af6ba574 | ||
|
|
ae16243f49 | ||
|
|
9ee0468b87 | ||
|
|
d9ad244197 | ||
|
|
a6537e4dba | ||
|
|
df3f25f2f0 | ||
|
|
f8f989d4c6 | ||
|
|
0e4c935a70 | ||
|
|
1b4ea93ba4 | ||
|
|
4496eee046 | ||
|
|
8928fb92be | ||
|
|
81fd2ab92f | ||
|
|
af7fb8f7f1 |
1
Cargo.lock
generated
1
Cargo.lock
generated
@@ -1377,6 +1377,7 @@ dependencies = [
|
|||||||
"serde",
|
"serde",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
"sha2",
|
"sha2",
|
||||||
|
"tempfile",
|
||||||
"termimad",
|
"termimad",
|
||||||
"tokio",
|
"tokio",
|
||||||
"tokio-util",
|
"tokio-util",
|
||||||
|
|||||||
@@ -76,6 +76,7 @@ G3 includes robust error handling with automatic retry logic:
|
|||||||
G3's interactive CLI includes control commands for manual context management:
|
G3's interactive CLI includes control commands for manual context management:
|
||||||
- **`/compact`**: Manually trigger summarization to compact conversation history
|
- **`/compact`**: Manually trigger summarization to compact conversation history
|
||||||
- **`/thinnify`**: Manually trigger context thinning to replace large tool results with file references
|
- **`/thinnify`**: Manually trigger context thinning to replace large tool results with file references
|
||||||
|
- **`/skinnify`**: Manually trigger full context thinning (like `/thinnify` but processes the entire context window, not just the first third)
|
||||||
- **`/readme`**: Reload README.md and AGENTS.md from disk without restarting
|
- **`/readme`**: Reload README.md and AGENTS.md from disk without restarting
|
||||||
- **`/stats`**: Show detailed context and performance statistics
|
- **`/stats`**: Show detailed context and performance statistics
|
||||||
- **`/help`**: Display all available control commands
|
- **`/help`**: Display all available control commands
|
||||||
|
|||||||
@@ -24,6 +24,8 @@ temperature = 0.3 # Slightly higher temperature for more creative implementatio
|
|||||||
# Options: "ephemeral", "5minute", "1hour"
|
# Options: "ephemeral", "5minute", "1hour"
|
||||||
# Reduces costs and latency for repeated prompts. Uses Anthropic's prompt caching with different TTLs.
|
# Reduces costs and latency for repeated prompts. Uses Anthropic's prompt caching with different TTLs.
|
||||||
# enable_1m_context = true # optional, more expensive
|
# enable_1m_context = true # optional, more expensive
|
||||||
|
# thinking_budget_tokens = 10000 # Optional: Enable extended thinking mode with token budget
|
||||||
|
# Allows the model to "think" before responding. Useful for complex reasoning tasks.
|
||||||
|
|
||||||
|
|
||||||
# Multiple OpenAI-compatible providers can be configured with custom names
|
# Multiple OpenAI-compatible providers can be configured with custom names
|
||||||
|
|||||||
@@ -27,3 +27,6 @@ chrono = { version = "0.4", features = ["serde"] }
|
|||||||
crossterm = "0.29.0"
|
crossterm = "0.29.0"
|
||||||
ratatui = "0.29"
|
ratatui = "0.29"
|
||||||
termimad = "0.34.0"
|
termimad = "0.34.0"
|
||||||
|
|
||||||
|
[dev-dependencies]
|
||||||
|
tempfile = "3.8"
|
||||||
|
|||||||
@@ -163,15 +163,66 @@ fn extract_coach_feedback_from_logs(
|
|||||||
if let Some(context_window) = log_json.get("context_window") {
|
if let Some(context_window) = log_json.get("context_window") {
|
||||||
if let Some(conversation_history) = context_window.get("conversation_history") {
|
if let Some(conversation_history) = context_window.get("conversation_history") {
|
||||||
if let Some(messages) = conversation_history.as_array() {
|
if let Some(messages) = conversation_history.as_array() {
|
||||||
// Simply get the last message content - this is the coach's final feedback
|
// Go backwards through the conversation to find the last tool result
|
||||||
if let Some(last_message) = messages.last() {
|
// that corresponds to a final_output tool call
|
||||||
if let Some(content) = last_message.get("content") {
|
for i in (0..messages.len()).rev() {
|
||||||
if let Some(content_str) = content.as_str() {
|
let msg = &messages[i];
|
||||||
output.print(&format!(
|
|
||||||
"✅ Extracted coach feedback from session: {}",
|
// Check if this is a User message with "Tool result:"
|
||||||
session_id
|
if let Some(role) = msg.get("role") {
|
||||||
));
|
if let Some(role_str) = role.as_str() {
|
||||||
return Ok(content_str.to_string());
|
if role_str == "User" || role_str == "user" {
|
||||||
|
if let Some(content) = msg.get("content") {
|
||||||
|
if let Some(content_str) = content.as_str() {
|
||||||
|
if content_str.starts_with("Tool result:") {
|
||||||
|
// Found a tool result, now check the preceding message
|
||||||
|
// to verify it was a final_output tool call
|
||||||
|
if i > 0 {
|
||||||
|
let prev_msg = &messages[i - 1];
|
||||||
|
if let Some(prev_role) = prev_msg.get("role") {
|
||||||
|
if let Some(prev_role_str) = prev_role.as_str() {
|
||||||
|
if prev_role_str == "assistant" || prev_role_str == "Assistant" {
|
||||||
|
if let Some(prev_content) = prev_msg.get("content") {
|
||||||
|
if let Some(prev_content_str) = prev_content.as_str() {
|
||||||
|
// Check if the previous assistant message contains a final_output tool call
|
||||||
|
if prev_content_str.contains("\"tool\": \"final_output\"") {
|
||||||
|
// This is a final_output tool result
|
||||||
|
let feedback = if content_str.starts_with("Tool result: ") {
|
||||||
|
content_str.strip_prefix("Tool result: ")
|
||||||
|
.unwrap_or(content_str)
|
||||||
|
.to_string()
|
||||||
|
} else {
|
||||||
|
content_str.to_string()
|
||||||
|
};
|
||||||
|
|
||||||
|
output.print(&format!(
|
||||||
|
"Coach feedback extracted: {} characters (from {} total)",
|
||||||
|
feedback.len(),
|
||||||
|
content_str.len()
|
||||||
|
));
|
||||||
|
output.print(&format!("Coach feedback:\n{}", feedback));
|
||||||
|
|
||||||
|
output.print(&format!(
|
||||||
|
"✅ Extracted coach feedback from session: {} (verified final_output tool)",
|
||||||
|
session_id
|
||||||
|
));
|
||||||
|
return Ok(feedback);
|
||||||
|
} else {
|
||||||
|
output.print(&format!(
|
||||||
|
"⚠️ Skipping tool result at index {} - not a final_output tool call",
|
||||||
|
i
|
||||||
|
));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -187,7 +238,7 @@ fn extract_coach_feedback_from_logs(
|
|||||||
"CRITICAL: Could not extract coach feedback from session: {}\n\
|
"CRITICAL: Could not extract coach feedback from session: {}\n\
|
||||||
Log file path: {:?}\n\
|
Log file path: {:?}\n\
|
||||||
Log file exists: {}\n\
|
Log file exists: {}\n\
|
||||||
This indicates the coach did not call any tool or the log is corrupted.\n\
|
This indicates the coach did not call final_output tool or the log is corrupted.\n\
|
||||||
Coach result response length: {} chars",
|
Coach result response length: {} chars",
|
||||||
session_id,
|
session_id,
|
||||||
log_file_path,
|
log_file_path,
|
||||||
@@ -1283,6 +1334,7 @@ async fn run_interactive<W: UiWriter>(
|
|||||||
output.print("📖 Control Commands:");
|
output.print("📖 Control Commands:");
|
||||||
output.print(" /compact - Trigger auto-summarization (compacts conversation history)");
|
output.print(" /compact - Trigger auto-summarization (compacts conversation history)");
|
||||||
output.print(" /thinnify - Trigger context thinning (replaces large tool results with file references)");
|
output.print(" /thinnify - Trigger context thinning (replaces large tool results with file references)");
|
||||||
|
output.print(" /skinnify - Trigger full context thinning (like /thinnify but for entire context, not just first third)");
|
||||||
output.print(
|
output.print(
|
||||||
" /readme - Reload README.md and AGENTS.md from disk",
|
" /readme - Reload README.md and AGENTS.md from disk",
|
||||||
);
|
);
|
||||||
@@ -1315,6 +1367,11 @@ async fn run_interactive<W: UiWriter>(
|
|||||||
println!("{}", summary);
|
println!("{}", summary);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
"/skinnify" => {
|
||||||
|
let summary = agent.force_thin_all();
|
||||||
|
println!("{}", summary);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
"/readme" => {
|
"/readme" => {
|
||||||
output.print("📚 Reloading README.md and AGENTS.md...");
|
output.print("📚 Reloading README.md and AGENTS.md...");
|
||||||
match agent.reload_readme() {
|
match agent.reload_readme() {
|
||||||
@@ -1524,6 +1581,12 @@ async fn run_interactive_machine(
|
|||||||
println!("{}", summary);
|
println!("{}", summary);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
"/skinnify" => {
|
||||||
|
println!("COMMAND: skinnify");
|
||||||
|
let summary = agent.force_thin_all();
|
||||||
|
println!("{}", summary);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
"/readme" => {
|
"/readme" => {
|
||||||
println!("COMMAND: readme");
|
println!("COMMAND: readme");
|
||||||
match agent.reload_readme() {
|
match agent.reload_readme() {
|
||||||
@@ -1546,7 +1609,7 @@ async fn run_interactive_machine(
|
|||||||
}
|
}
|
||||||
"/help" => {
|
"/help" => {
|
||||||
println!("COMMAND: help");
|
println!("COMMAND: help");
|
||||||
println!("AVAILABLE_COMMANDS: /compact /thinnify /readme /stats /help");
|
println!("AVAILABLE_COMMANDS: /compact /thinnify /skinnify /readme /stats /help");
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
_ => {
|
_ => {
|
||||||
|
|||||||
@@ -105,4 +105,9 @@ impl UiWriter for MachineUiWriter {
|
|||||||
// Default to first option (index 0) for automation
|
// Default to first option (index 0) for automation
|
||||||
0
|
0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn print_final_output(&self, summary: &str) {
|
||||||
|
println!("FINAL_OUTPUT:");
|
||||||
|
println!("{}", summary);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,78 +1,22 @@
|
|||||||
use g3_core::ui_writer::UiWriter;
|
use g3_core::ui_writer::UiWriter;
|
||||||
use std::io::{self, Write};
|
use std::io::{self, Write};
|
||||||
use std::sync::Mutex;
|
use termimad::MadSkin;
|
||||||
|
|
||||||
/// Console implementation of UiWriter that prints to stdout
|
/// Console implementation of UiWriter that prints to stdout
|
||||||
pub struct ConsoleUiWriter {
|
pub struct ConsoleUiWriter {
|
||||||
current_tool_name: Mutex<Option<String>>,
|
current_tool_name: std::sync::Mutex<Option<String>>,
|
||||||
current_tool_args: Mutex<Vec<(String, String)>>,
|
current_tool_args: std::sync::Mutex<Vec<(String, String)>>,
|
||||||
current_output_line: Mutex<Option<String>>,
|
current_output_line: std::sync::Mutex<Option<String>>,
|
||||||
output_line_printed: Mutex<bool>,
|
output_line_printed: std::sync::Mutex<bool>,
|
||||||
in_todo_tool: Mutex<bool>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl ConsoleUiWriter {
|
impl ConsoleUiWriter {
|
||||||
pub fn new() -> Self {
|
pub fn new() -> Self {
|
||||||
Self {
|
Self {
|
||||||
current_tool_name: Mutex::new(None),
|
current_tool_name: std::sync::Mutex::new(None),
|
||||||
current_tool_args: Mutex::new(Vec::new()),
|
current_tool_args: std::sync::Mutex::new(Vec::new()),
|
||||||
current_output_line: Mutex::new(None),
|
current_output_line: std::sync::Mutex::new(None),
|
||||||
output_line_printed: Mutex::new(false),
|
output_line_printed: std::sync::Mutex::new(false),
|
||||||
in_todo_tool: Mutex::new(false),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn print_todo_line(&self, line: &str) {
|
|
||||||
// Transform and print todo list lines elegantly
|
|
||||||
let trimmed = line.trim();
|
|
||||||
|
|
||||||
// Skip the "📝 TODO list:" prefix line
|
|
||||||
if trimmed.starts_with("📝 TODO list:") || trimmed == "📝 TODO list is empty" {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Handle empty lines
|
|
||||||
if trimmed.is_empty() {
|
|
||||||
println!();
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Detect indentation level
|
|
||||||
let indent_count = line.chars().take_while(|c| c.is_whitespace()).count();
|
|
||||||
let indent = " ".repeat(indent_count / 2); // Convert spaces to visual indent
|
|
||||||
|
|
||||||
// Format based on line type
|
|
||||||
if trimmed.starts_with("- [ ]") {
|
|
||||||
// Incomplete task
|
|
||||||
let task = trimmed.strip_prefix("- [ ]").unwrap_or(trimmed).trim();
|
|
||||||
println!("{}☐ {}", indent, task);
|
|
||||||
} else if trimmed.starts_with("- [x]") || trimmed.starts_with("- [X]") {
|
|
||||||
// Completed task
|
|
||||||
let task = trimmed
|
|
||||||
.strip_prefix("- [x]")
|
|
||||||
.or_else(|| trimmed.strip_prefix("- [X]"))
|
|
||||||
.unwrap_or(trimmed)
|
|
||||||
.trim();
|
|
||||||
println!("{}\x1b[2m☑ {}\x1b[0m", indent, task);
|
|
||||||
} else if trimmed.starts_with("- ") {
|
|
||||||
// Regular bullet point
|
|
||||||
let item = trimmed.strip_prefix("- ").unwrap_or(trimmed).trim();
|
|
||||||
println!("{}• {}", indent, item);
|
|
||||||
} else if trimmed.starts_with("# ") {
|
|
||||||
// Heading
|
|
||||||
let heading = trimmed.strip_prefix("# ").unwrap_or(trimmed).trim();
|
|
||||||
println!("\n\x1b[1m{}\x1b[0m", heading);
|
|
||||||
} else if trimmed.starts_with("## ") {
|
|
||||||
// Subheading
|
|
||||||
let subheading = trimmed.strip_prefix("## ").unwrap_or(trimmed).trim();
|
|
||||||
println!("\n\x1b[1m{}\x1b[0m", subheading);
|
|
||||||
} else if trimmed.starts_with("**") && trimmed.ends_with("**") {
|
|
||||||
// Bold text (section marker)
|
|
||||||
let text = trimmed.trim_start_matches("**").trim_end_matches("**");
|
|
||||||
println!("{}\x1b[1m{}\x1b[0m", indent, text);
|
|
||||||
} else {
|
|
||||||
// Regular text or note
|
|
||||||
println!("{}{}", indent, trimmed);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -138,13 +82,6 @@ impl UiWriter for ConsoleUiWriter {
|
|||||||
// Store the tool name and clear args for collection
|
// Store the tool name and clear args for collection
|
||||||
*self.current_tool_name.lock().unwrap() = Some(tool_name.to_string());
|
*self.current_tool_name.lock().unwrap() = Some(tool_name.to_string());
|
||||||
self.current_tool_args.lock().unwrap().clear();
|
self.current_tool_args.lock().unwrap().clear();
|
||||||
|
|
||||||
// Check if this is a todo tool call
|
|
||||||
let is_todo = tool_name == "todo_read" || tool_name == "todo_write";
|
|
||||||
*self.in_todo_tool.lock().unwrap() = is_todo;
|
|
||||||
|
|
||||||
// For todo tools, we'll skip the normal header and print a custom one later
|
|
||||||
if is_todo {}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn print_tool_arg(&self, key: &str, value: &str) {
|
fn print_tool_arg(&self, key: &str, value: &str) {
|
||||||
@@ -167,13 +104,10 @@ impl UiWriter for ConsoleUiWriter {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn print_tool_output_header(&self) {
|
fn print_tool_output_header(&self) {
|
||||||
// Skip normal header for todo tools
|
|
||||||
if *self.in_todo_tool.lock().unwrap() {
|
|
||||||
println!(); // Just add a newline
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
println!();
|
println!();
|
||||||
|
// Reset output_line_printed at the start of a new tool output
|
||||||
|
// This ensures the header isn't cleared by update_tool_output_line
|
||||||
|
*self.output_line_printed.lock().unwrap() = false;
|
||||||
// Now print the tool header with the most important arg in bold green
|
// Now print the tool header with the most important arg in bold green
|
||||||
if let Some(tool_name) = self.current_tool_name.lock().unwrap().as_ref() {
|
if let Some(tool_name) = self.current_tool_name.lock().unwrap().as_ref() {
|
||||||
let args = self.current_tool_args.lock().unwrap();
|
let args = self.current_tool_args.lock().unwrap();
|
||||||
@@ -259,21 +193,14 @@ impl UiWriter for ConsoleUiWriter {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn print_tool_output_line(&self, line: &str) {
|
fn print_tool_output_line(&self, line: &str) {
|
||||||
// Special handling for todo tools
|
// Skip the TODO list header line
|
||||||
if *self.in_todo_tool.lock().unwrap() {
|
if line.starts_with("📝 TODO list:") {
|
||||||
self.print_todo_line(line);
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
println!("│ \x1b[2m{}\x1b[0m", line);
|
println!("│ \x1b[2m{}\x1b[0m", line);
|
||||||
}
|
}
|
||||||
|
|
||||||
fn print_tool_output_summary(&self, count: usize) {
|
fn print_tool_output_summary(&self, count: usize) {
|
||||||
// Skip for todo tools
|
|
||||||
if *self.in_todo_tool.lock().unwrap() {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
println!(
|
println!(
|
||||||
"│ \x1b[2m({} line{})\x1b[0m",
|
"│ \x1b[2m({} line{})\x1b[0m",
|
||||||
count,
|
count,
|
||||||
@@ -282,13 +209,6 @@ impl UiWriter for ConsoleUiWriter {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn print_tool_timing(&self, duration_str: &str) {
|
fn print_tool_timing(&self, duration_str: &str) {
|
||||||
// For todo tools, just print a simple completion message
|
|
||||||
if *self.in_todo_tool.lock().unwrap() {
|
|
||||||
println!();
|
|
||||||
*self.in_todo_tool.lock().unwrap() = false;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Parse the duration string to determine color
|
// Parse the duration string to determine color
|
||||||
// Format is like "1.5s", "500ms", "2m 30.0s"
|
// Format is like "1.5s", "500ms", "2m 30.0s"
|
||||||
let color_code = if duration_str.ends_with("ms") {
|
let color_code = if duration_str.ends_with("ms") {
|
||||||
@@ -390,4 +310,44 @@ impl UiWriter for ConsoleUiWriter {
|
|||||||
let _ = io::stdout().flush();
|
let _ = io::stdout().flush();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn print_final_output(&self, summary: &str) {
|
||||||
|
// Show spinner while "formatting"
|
||||||
|
let spinner_frames = ['⠋', '⠙', '⠹', '⠸', '⠼', '⠴', '⠦', '⠧', '⠇', '⠏'];
|
||||||
|
let message = "summarizing work done...";
|
||||||
|
|
||||||
|
// Brief spinner animation (about 0.5 seconds)
|
||||||
|
for i in 0..5 {
|
||||||
|
let frame = spinner_frames[i % spinner_frames.len()];
|
||||||
|
print!("\r\x1b[36m{} {}\x1b[0m", frame, message);
|
||||||
|
let _ = io::stdout().flush();
|
||||||
|
std::thread::sleep(std::time::Duration::from_millis(100));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Clear the spinner line
|
||||||
|
print!("\r\x1b[2K");
|
||||||
|
let _ = io::stdout().flush();
|
||||||
|
|
||||||
|
// Create a styled markdown skin
|
||||||
|
let mut skin = MadSkin::default();
|
||||||
|
// Customize colors for better terminal appearance
|
||||||
|
skin.bold.set_fg(termimad::crossterm::style::Color::Green);
|
||||||
|
skin.italic.set_fg(termimad::crossterm::style::Color::Cyan);
|
||||||
|
skin.headers[0].set_fg(termimad::crossterm::style::Color::Magenta);
|
||||||
|
skin.headers[1].set_fg(termimad::crossterm::style::Color::Magenta);
|
||||||
|
skin.code_block.set_fg(termimad::crossterm::style::Color::Yellow);
|
||||||
|
skin.inline_code.set_fg(termimad::crossterm::style::Color::Yellow);
|
||||||
|
|
||||||
|
// Print a header separator
|
||||||
|
println!("\x1b[1;35m━━━ Summary ━━━\x1b[0m");
|
||||||
|
println!();
|
||||||
|
|
||||||
|
// Render the markdown
|
||||||
|
let rendered = skin.term_text(summary);
|
||||||
|
print!("{}", rendered);
|
||||||
|
|
||||||
|
// Print a footer separator
|
||||||
|
println!();
|
||||||
|
println!("\x1b[1;35m━━━━━━━━━━━━━━━\x1b[0m");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
336
crates/g3-cli/tests/coach_feedback_extraction_test.rs
Normal file
336
crates/g3-cli/tests/coach_feedback_extraction_test.rs
Normal file
@@ -0,0 +1,336 @@
|
|||||||
|
use serde_json::json;
|
||||||
|
use std::fs;
|
||||||
|
use tempfile::TempDir;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_extract_coach_feedback_with_timing_message() {
|
||||||
|
// Create a temporary directory for logs
|
||||||
|
let temp_dir = TempDir::new().unwrap();
|
||||||
|
let logs_dir = temp_dir.path().join("logs");
|
||||||
|
fs::create_dir(&logs_dir).unwrap();
|
||||||
|
|
||||||
|
// Create a mock session log with the problematic conversation history
|
||||||
|
// where timing message appears after the tool result
|
||||||
|
let session_id = "test_session_123";
|
||||||
|
let log_file_path = logs_dir.join(format!("g3_session_{}.json", session_id));
|
||||||
|
|
||||||
|
let log_content = json!({
|
||||||
|
"session_id": session_id,
|
||||||
|
"context_window": {
|
||||||
|
"conversation_history": [
|
||||||
|
{
|
||||||
|
"role": "assistant",
|
||||||
|
"content": "{\"tool\": \"final_output\", \"args\": {\"summary\":\"IMPLEMENTATION_APPROVED\"}}"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": "Tool result: IMPLEMENTATION_APPROVED"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"role": "assistant",
|
||||||
|
"content": "🕝 27.7s | 💭 7.5s"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
fs::write(&log_file_path, serde_json::to_string_pretty(&log_content).unwrap()).unwrap();
|
||||||
|
|
||||||
|
// Now test the extraction logic
|
||||||
|
let log_content_str = fs::read_to_string(&log_file_path).unwrap();
|
||||||
|
let log_json: serde_json::Value = serde_json::from_str(&log_content_str).unwrap();
|
||||||
|
|
||||||
|
if let Some(context_window) = log_json.get("context_window") {
|
||||||
|
if let Some(conversation_history) = context_window.get("conversation_history") {
|
||||||
|
if let Some(messages) = conversation_history.as_array() {
|
||||||
|
// This is the key logic we're testing - find the last USER message with "Tool result:"
|
||||||
|
let last_tool_result = messages.iter().rev().find(|msg| {
|
||||||
|
if let Some(role) = msg.get("role") {
|
||||||
|
if let Some(role_str) = role.as_str() {
|
||||||
|
if role_str == "User" || role_str == "user" {
|
||||||
|
if let Some(content) = msg.get("content") {
|
||||||
|
if let Some(content_str) = content.as_str() {
|
||||||
|
return content_str.starts_with("Tool result:");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
false
|
||||||
|
});
|
||||||
|
|
||||||
|
// Verify we found the correct message
|
||||||
|
assert!(last_tool_result.is_some(), "Should find the tool result message");
|
||||||
|
|
||||||
|
if let Some(last_message) = last_tool_result {
|
||||||
|
if let Some(content) = last_message.get("content") {
|
||||||
|
if let Some(content_str) = content.as_str() {
|
||||||
|
let feedback = if content_str.starts_with("Tool result: ") {
|
||||||
|
content_str.strip_prefix("Tool result: ").unwrap_or(content_str)
|
||||||
|
} else {
|
||||||
|
content_str
|
||||||
|
};
|
||||||
|
|
||||||
|
// Verify we extracted the correct feedback
|
||||||
|
assert_eq!(feedback, "IMPLEMENTATION_APPROVED", "Should extract the actual feedback, not timing");
|
||||||
|
|
||||||
|
// Verify the feedback is NOT the timing message
|
||||||
|
assert!(!feedback.contains("🕝"), "Feedback should not be the timing message");
|
||||||
|
|
||||||
|
println!("✅ Successfully extracted coach feedback: {}", feedback);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
panic!("Failed to extract coach feedback");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_extract_only_final_output_tool_results() {
|
||||||
|
// Test that we only extract tool results from final_output, not from other tools
|
||||||
|
let temp_dir = TempDir::new().unwrap();
|
||||||
|
let logs_dir = temp_dir.path().join("logs");
|
||||||
|
fs::create_dir(&logs_dir).unwrap();
|
||||||
|
|
||||||
|
let session_id = "test_session_final_output_only";
|
||||||
|
let log_file_path = logs_dir.join(format!("g3_session_{}.json", session_id));
|
||||||
|
|
||||||
|
let log_content = json!({
|
||||||
|
"session_id": session_id,
|
||||||
|
"context_window": {
|
||||||
|
"conversation_history": [
|
||||||
|
{
|
||||||
|
"role": "assistant",
|
||||||
|
"content": "{\"tool\": \"shell\", \"args\": {\"command\":\"ls\"}}"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": "Tool result: file1.txt\nfile2.txt"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"role": "assistant",
|
||||||
|
"content": "{\"tool\": \"read_file\", \"args\": {\"file_path\":\"test.txt\"}}"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": "Tool result: This is test content"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"role": "assistant",
|
||||||
|
"content": "{\"tool\": \"final_output\", \"args\": {\"summary\":\"APPROVED_RESULT\"}}"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": "Tool result: APPROVED_RESULT"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"role": "assistant",
|
||||||
|
"content": "🕝 20.5s | 💭 5.2s"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
fs::write(&log_file_path, serde_json::to_string_pretty(&log_content).unwrap()).unwrap();
|
||||||
|
|
||||||
|
// Test the new extraction logic that verifies the tool is final_output
|
||||||
|
let log_content_str = fs::read_to_string(&log_file_path).unwrap();
|
||||||
|
let log_json: serde_json::Value = serde_json::from_str(&log_content_str).unwrap();
|
||||||
|
|
||||||
|
if let Some(context_window) = log_json.get("context_window") {
|
||||||
|
if let Some(conversation_history) = context_window.get("conversation_history") {
|
||||||
|
if let Some(messages) = conversation_history.as_array() {
|
||||||
|
// Go backwards through messages to find final_output tool result
|
||||||
|
for i in (0..messages.len()).rev() {
|
||||||
|
let msg = &messages[i];
|
||||||
|
|
||||||
|
if let Some(role) = msg.get("role") {
|
||||||
|
if let Some(role_str) = role.as_str() {
|
||||||
|
if role_str == "User" || role_str == "user" {
|
||||||
|
if let Some(content) = msg.get("content") {
|
||||||
|
if let Some(content_str) = content.as_str() {
|
||||||
|
if content_str.starts_with("Tool result:") {
|
||||||
|
// Check if preceding message was final_output
|
||||||
|
if i > 0 {
|
||||||
|
let prev_msg = &messages[i - 1];
|
||||||
|
if let Some(prev_content) = prev_msg.get("content") {
|
||||||
|
if let Some(prev_content_str) = prev_content.as_str() {
|
||||||
|
if prev_content_str.contains("\"tool\": \"final_output\"") {
|
||||||
|
let feedback = content_str.strip_prefix("Tool result: ").unwrap_or(content_str);
|
||||||
|
assert_eq!(feedback, "APPROVED_RESULT", "Should extract only final_output result");
|
||||||
|
println!("✅ Correctly extracted only final_output tool result: {}", feedback);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
panic!("Failed to extract final_output tool result");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_extract_coach_feedback_without_timing_message() {
|
||||||
|
// Create a temporary directory for logs
|
||||||
|
let temp_dir = TempDir::new().unwrap();
|
||||||
|
let logs_dir = temp_dir.path().join("logs");
|
||||||
|
fs::create_dir(&logs_dir).unwrap();
|
||||||
|
|
||||||
|
// Test the case where there's no timing message (backward compatibility)
|
||||||
|
let session_id = "test_session_456";
|
||||||
|
let log_file_path = logs_dir.join(format!("g3_session_{}.json", session_id));
|
||||||
|
|
||||||
|
let log_content = json!({
|
||||||
|
"session_id": session_id,
|
||||||
|
"context_window": {
|
||||||
|
"conversation_history": [
|
||||||
|
{
|
||||||
|
"role": "assistant",
|
||||||
|
"content": "{\"tool\": \"final_output\", \"args\": {\"summary\":\"TEST_FEEDBACK\"}}"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": "Tool result: TEST_FEEDBACK"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
fs::write(&log_file_path, serde_json::to_string_pretty(&log_content).unwrap()).unwrap();
|
||||||
|
|
||||||
|
// Test extraction
|
||||||
|
let log_content_str = fs::read_to_string(&log_file_path).unwrap();
|
||||||
|
let log_json: serde_json::Value = serde_json::from_str(&log_content_str).unwrap();
|
||||||
|
|
||||||
|
if let Some(context_window) = log_json.get("context_window") {
|
||||||
|
if let Some(conversation_history) = context_window.get("conversation_history") {
|
||||||
|
if let Some(messages) = conversation_history.as_array() {
|
||||||
|
let last_tool_result = messages.iter().rev().find(|msg| {
|
||||||
|
if let Some(role) = msg.get("role") {
|
||||||
|
if let Some(role_str) = role.as_str() {
|
||||||
|
if role_str == "User" || role_str == "user" {
|
||||||
|
if let Some(content) = msg.get("content") {
|
||||||
|
if let Some(content_str) = content.as_str() {
|
||||||
|
return content_str.starts_with("Tool result:");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
false
|
||||||
|
});
|
||||||
|
|
||||||
|
assert!(last_tool_result.is_some());
|
||||||
|
|
||||||
|
if let Some(last_message) = last_tool_result {
|
||||||
|
if let Some(content) = last_message.get("content") {
|
||||||
|
if let Some(content_str) = content.as_str() {
|
||||||
|
let feedback = content_str.strip_prefix("Tool result: ").unwrap_or(content_str);
|
||||||
|
assert_eq!(feedback, "TEST_FEEDBACK");
|
||||||
|
println!("✅ Successfully extracted coach feedback without timing: {}", feedback);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
panic!("Failed to extract coach feedback");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_extract_coach_feedback_with_multiple_tool_results() {
|
||||||
|
// Test that we get the LAST tool result when there are multiple
|
||||||
|
let temp_dir = TempDir::new().unwrap();
|
||||||
|
let logs_dir = temp_dir.path().join("logs");
|
||||||
|
fs::create_dir(&logs_dir).unwrap();
|
||||||
|
|
||||||
|
let session_id = "test_session_789";
|
||||||
|
let log_file_path = logs_dir.join(format!("g3_session_{}.json", session_id));
|
||||||
|
|
||||||
|
let log_content = json!({
|
||||||
|
"session_id": session_id,
|
||||||
|
"context_window": {
|
||||||
|
"conversation_history": [
|
||||||
|
{
|
||||||
|
"role": "assistant",
|
||||||
|
"content": "{\"tool\": \"shell\", \"args\": {\"command\":\"ls\"}}"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": "Tool result: file1.txt\nfile2.txt"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"role": "assistant",
|
||||||
|
"content": "{\"tool\": \"final_output\", \"args\": {\"summary\":\"FINAL_RESULT\"}}"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": "Tool result: FINAL_RESULT"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"role": "assistant",
|
||||||
|
"content": "🕝 15.2s | 💭 3.1s"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
fs::write(&log_file_path, serde_json::to_string_pretty(&log_content).unwrap()).unwrap();
|
||||||
|
|
||||||
|
// Test extraction
|
||||||
|
let log_content_str = fs::read_to_string(&log_file_path).unwrap();
|
||||||
|
let log_json: serde_json::Value = serde_json::from_str(&log_content_str).unwrap();
|
||||||
|
|
||||||
|
if let Some(context_window) = log_json.get("context_window") {
|
||||||
|
if let Some(conversation_history) = context_window.get("conversation_history") {
|
||||||
|
if let Some(messages) = conversation_history.as_array() {
|
||||||
|
let last_tool_result = messages.iter().rev().find(|msg| {
|
||||||
|
if let Some(role) = msg.get("role") {
|
||||||
|
if let Some(role_str) = role.as_str() {
|
||||||
|
if role_str == "User" || role_str == "user" {
|
||||||
|
if let Some(content) = msg.get("content") {
|
||||||
|
if let Some(content_str) = content.as_str() {
|
||||||
|
return content_str.starts_with("Tool result:");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
false
|
||||||
|
});
|
||||||
|
|
||||||
|
assert!(last_tool_result.is_some());
|
||||||
|
|
||||||
|
if let Some(last_message) = last_tool_result {
|
||||||
|
if let Some(content) = last_message.get("content") {
|
||||||
|
if let Some(content_str) = content.as_str() {
|
||||||
|
let feedback = content_str.strip_prefix("Tool result: ").unwrap_or(content_str);
|
||||||
|
// Should get the LAST tool result (final_output), not the first one (shell)
|
||||||
|
assert_eq!(feedback, "FINAL_RESULT", "Should extract the last tool result");
|
||||||
|
assert!(!feedback.contains("file1.txt"), "Should not extract earlier tool results");
|
||||||
|
println!("✅ Successfully extracted last tool result: {}", feedback);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
panic!("Failed to extract coach feedback");
|
||||||
|
}
|
||||||
@@ -42,6 +42,7 @@ pub struct AnthropicConfig {
|
|||||||
pub temperature: Option<f32>,
|
pub temperature: Option<f32>,
|
||||||
pub cache_config: Option<String>, // "ephemeral", "5minute", "1hour", or None to disable
|
pub cache_config: Option<String>, // "ephemeral", "5minute", "1hour", or None to disable
|
||||||
pub enable_1m_context: Option<bool>, // Enable 1m context window (costs extra)
|
pub enable_1m_context: Option<bool>, // Enable 1m context window (costs extra)
|
||||||
|
pub thinking_budget_tokens: Option<u32>, // Budget tokens for extended thinking
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -65,6 +65,10 @@ pub trait UiWriter: Send + Sync {
|
|||||||
/// Prompt the user to choose from a list of options
|
/// Prompt the user to choose from a list of options
|
||||||
/// Returns the index of the selected option
|
/// Returns the index of the selected option
|
||||||
fn prompt_user_choice(&self, message: &str, options: &[&str]) -> usize;
|
fn prompt_user_choice(&self, message: &str, options: &[&str]) -> usize;
|
||||||
|
|
||||||
|
/// Print the final output summary with markdown formatting
|
||||||
|
/// Shows a spinner while formatting, then renders the markdown
|
||||||
|
fn print_final_output(&self, summary: &str);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// A no-op implementation for when UI output is not needed
|
/// A no-op implementation for when UI output is not needed
|
||||||
@@ -97,4 +101,7 @@ impl UiWriter for NullUiWriter {
|
|||||||
fn prompt_user_choice(&self, _message: &str, _options: &[&str]) -> usize {
|
fn prompt_user_choice(&self, _message: &str, _options: &[&str]) -> usize {
|
||||||
0
|
0
|
||||||
}
|
}
|
||||||
|
fn print_final_output(&self, _summary: &str) {
|
||||||
|
// No-op for null writer
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
188
crates/g3-core/tests/test_preflight_max_tokens.rs
Normal file
188
crates/g3-core/tests/test_preflight_max_tokens.rs
Normal file
@@ -0,0 +1,188 @@
|
|||||||
|
//! Tests for the pre-flight max_tokens validation with thinking.budget_tokens constraint
|
||||||
|
//!
|
||||||
|
//! These tests verify that when using Anthropic with extended thinking enabled,
|
||||||
|
//! the max_tokens calculation properly accounts for the budget_tokens constraint.
|
||||||
|
|
||||||
|
use g3_config::Config;
|
||||||
|
use g3_core::ContextWindow;
|
||||||
|
|
||||||
|
/// Helper function to create a minimal config for testing
|
||||||
|
fn create_test_config_with_thinking(thinking_budget: Option<u32>) -> Config {
|
||||||
|
let mut config = Config::default();
|
||||||
|
|
||||||
|
// Set up Anthropic provider with optional thinking budget
|
||||||
|
config.providers.anthropic = Some(g3_config::AnthropicConfig {
|
||||||
|
api_key: "test-key".to_string(),
|
||||||
|
model: "claude-sonnet-4-5".to_string(),
|
||||||
|
max_tokens: Some(16000),
|
||||||
|
temperature: Some(0.1),
|
||||||
|
cache_config: None,
|
||||||
|
enable_1m_context: None,
|
||||||
|
thinking_budget_tokens: thinking_budget,
|
||||||
|
});
|
||||||
|
|
||||||
|
config.providers.default_provider = "anthropic".to_string();
|
||||||
|
config
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Test that when thinking is disabled, max_tokens passes through unchanged.
///
/// There is no `thinking_budget_tokens` to compare against in this
/// configuration, so the test only verifies that the helper leaves it unset.
#[test]
fn test_no_thinking_budget_passes_through() {
    let config = create_test_config_with_thinking(None);

    // Without a thinking budget, any proposed max_tokens should be fine: the
    // constraint check is expected to hand the proposed value back unchanged.
    let anthropic = config
        .providers
        .anthropic
        .as_ref()
        .expect("test helper always configures the Anthropic provider");
    assert!(anthropic.thinking_budget_tokens.is_none());
}
|
||||||
|
|
||||||
|
/// A proposed `max_tokens` at or above `budget_tokens + 1024` needs no reduction.
#[test]
fn test_sufficient_max_tokens_no_reduction_needed() {
    let config = create_test_config_with_thinking(Some(10000));
    let anthropic = config.providers.anthropic.as_ref().unwrap();

    // 10000 budget + 1024 output buffer = 11024
    let minimum_required = anthropic.thinking_budget_tokens.unwrap() + 1024;

    // 15000 clears the minimum, so nothing has to be reduced.
    let proposed_max = 15000;
    assert!(proposed_max >= minimum_required);
}
|
||||||
|
|
||||||
|
/// A proposed `max_tokens` below `budget_tokens + 1024` signals that reduction is needed.
#[test]
fn test_insufficient_max_tokens_needs_reduction() {
    let config = create_test_config_with_thinking(Some(10000));
    let anthropic = config.providers.anthropic.as_ref().unwrap();

    // 10000 budget + 1024 output buffer = 11024
    let minimum_required = anthropic.thinking_budget_tokens.unwrap() + 1024;

    // 5000 falls short of the minimum, so reduction IS needed.
    let proposed_max = 5000;
    assert!(proposed_max < minimum_required);
}
|
||||||
|
|
||||||
|
/// The minimum `max_tokens` is the thinking budget plus a 1024-token output buffer.
#[test]
fn test_minimum_required_calculation() {
    const OUTPUT_BUFFER: u32 = 1024;

    // A budget of 10000 needs at least 11024 tokens.
    let small_budget: u32 = 10000;
    assert_eq!(small_budget + OUTPUT_BUFFER, 11024);

    // The same buffer applies to a larger budget.
    let large_budget: u32 = 32000;
    assert_eq!(large_budget + OUTPUT_BUFFER, 33024);
}
|
||||||
|
|
||||||
|
/// Walks through the available-token arithmetic used for the summary `max_tokens`
/// on a 200k window that is 90% full.
#[test]
fn test_context_window_available_tokens() {
    let mut window = ContextWindow::new(200000); // 200k context window
    window.used_tokens = 180000; // heavy usage: 90% used

    // 2.5% of the window, bounded to [1000, 10000]: 200000 / 40 = 5000
    let reserve = (window.total_tokens / 40).clamp(1000, 10000);
    assert_eq!(reserve, 5000);

    // 200000 - 180000 - 5000 = 15000
    let headroom = window
        .total_tokens
        .saturating_sub(window.used_tokens)
        .saturating_sub(reserve);
    assert_eq!(headroom, 15000);

    // The summary request is capped at 10000 tokens.
    assert_eq!(headroom.min(10_000), 10000);
}
|
||||||
|
|
||||||
|
/// With the window 98% full, the available tokens saturate to zero and fall
/// below the minimum required by a 10000-token thinking budget.
#[test]
fn test_context_nearly_full_triggers_reduction() {
    let mut window = ContextWindow::new(200000);
    window.used_tokens = 196000; // very heavy usage: 98% used

    let reserve = (window.total_tokens / 40).clamp(1000, 10000); // 5000

    // 200000 - 196000 - 5000 would be -1000; saturating_sub pins it at 0.
    let headroom = window
        .total_tokens
        .saturating_sub(window.used_tokens)
        .saturating_sub(reserve);
    assert_eq!(headroom, 0);

    // A thinking budget of 10000 requires 11024, far more than what is left.
    let minimum_required: u32 = 10000u32 + 1024;
    assert!(headroom < minimum_required);
}
|
||||||
|
|
||||||
|
/// Sanity-checks the hard-coded last-resort `max_tokens` value (5000).
#[test]
fn test_hardcoded_fallback_value() {
    // Last resort when every other calculation fails.
    let fallback_max_tokens: u32 = 5000;

    // It only has to be a positive value the API can be sent; output will be
    // limited when thinking is enabled.
    assert!(fallback_max_tokens > 0);

    // Note: 5000 is still below the 11024 minimum implied by a 10000 thinking
    // budget; it is sent anyway in the hope that basic operations still work.
}
|
||||||
|
|
||||||
|
/// Checks that capping a proposed `max_tokens` with `min` yields each provider's cap.
#[test]
fn test_provider_specific_caps() {
    // (provider cap, proposed max_tokens, expected value after capping)
    let cases: [(u32, u32, u32); 3] = [
        (10000, 15000, 10000), // Anthropic/Databricks: cap at 10000
        (3000, 5000, 3000),    // Embedded: cap at 3000
        (5000, 8000, 5000),    // Default: cap at 5000
    ];

    for &(cap, proposed, expected) in cases.iter() {
        assert_eq!(proposed.min(cap), expected);
    }
}
|
||||||
|
|
||||||
|
/// The warning text must carry the proposed value, the required minimum, the
/// thinking budget, and the "Context reduction needed" hint.
#[test]
fn test_error_message_content() {
    let (proposed_max_tokens, budget_tokens) = (5000u32, 10000u32);
    let minimum_required = budget_tokens + 1024;

    let warning = format!(
        "max_tokens ({}) is below required minimum ({}) for thinking.budget_tokens ({}). Context reduction needed.",
        proposed_max_tokens, minimum_required, budget_tokens
    );

    // Checked in the same order as the values appear in the requirement list.
    let expected_fragments = ["5000", "11024", "10000", "Context reduction needed"];
    for fragment in expected_fragments.iter() {
        assert!(warning.contains(*fragment));
    }
}
|
||||||
159
crates/g3-core/tests/test_reset_with_summary.rs
Normal file
159
crates/g3-core/tests/test_reset_with_summary.rs
Normal file
@@ -0,0 +1,159 @@
|
|||||||
|
//! Tests for reset_with_summary to ensure system prompt is preserved after compaction
|
||||||
|
|
||||||
|
use g3_core::ContextWindow;
|
||||||
|
use g3_providers::{Message, MessageRole};
|
||||||
|
|
||||||
|
/// Test that reset_with_summary preserves the original system prompt.
///
/// After compaction the first message must still be the system prompt, and
/// the history must contain both the summary and the latest user message.
#[test]
fn test_reset_with_summary_preserves_system_prompt() {
    let mut context = ContextWindow::new(10000);

    // Add the system prompt as the first message (simulating agent initialization)
    let system_prompt = "You are G3, an AI programming agent...";
    context.add_message(Message::new(MessageRole::System, system_prompt.to_string()));

    // Add some conversation history
    context.add_message(Message::new(MessageRole::User, "Task: Write a function".to_string()));
    context.add_message(Message::new(MessageRole::Assistant, "I'll help you write that function.".to_string()));
    context.add_message(Message::new(MessageRole::User, "Thanks, now add tests".to_string()));
    context.add_message(Message::new(MessageRole::Assistant, "Here are the tests.".to_string()));

    // Verify we have 5 messages before reset
    assert_eq!(context.conversation_history.len(), 5);

    // Reset with summary
    let summary = "We discussed writing a function and adding tests.".to_string();
    let latest_user_msg = Some("Continue with the implementation".to_string());
    context.reset_with_summary(summary, latest_user_msg);

    // Verify the first message is still the system prompt
    assert!(!context.conversation_history.is_empty(), "Conversation history should not be empty");

    let first_message = &context.conversation_history[0];
    assert!(
        matches!(first_message.role, MessageRole::System),
        "First message should be a System message, got {:?}",
        first_message.role
    );
    assert!(
        first_message.content.contains("You are G3"),
        "First message should contain the system prompt 'You are G3', got: {}",
        // Truncate by characters, not bytes: slicing `content[..100]` panics when
        // byte 100 falls inside a multi-byte character (e.g. an emoji), which
        // would replace this diagnostic with an unrelated slicing panic.
        first_message.content.chars().take(100).collect::<String>()
    );

    // Verify the summary was added as a separate system message
    let has_summary = context.conversation_history.iter().any(|m| {
        matches!(m.role, MessageRole::System) && m.content.contains("Previous conversation summary")
    });
    assert!(has_summary, "Should have a summary message");

    // Verify the latest user message was added
    let has_user_msg = context.conversation_history.iter().any(|m| {
        matches!(m.role, MessageRole::User) && m.content.contains("Continue with the implementation")
    });
    assert!(has_user_msg, "Should have the latest user message");
}
|
||||||
|
|
||||||
|
/// After `reset_with_summary`, the system prompt must stay first and the README
/// system message must stay second.
#[test]
fn test_reset_with_summary_preserves_readme() {
    let mut window = ContextWindow::new(10000);

    // System prompt first, README as the second system message.
    window.add_message(Message::new(
        MessageRole::System,
        "You are G3, an AI programming agent...".to_string(),
    ));
    window.add_message(Message::new(
        MessageRole::System,
        "# Project README\n\nThis is a test project.".to_string(),
    ));

    // One short exchange of conversation history.
    window.add_message(Message::new(MessageRole::User, "Task: Write a function".to_string()));
    window.add_message(Message::new(MessageRole::Assistant, "Done.".to_string()));

    // Four messages before the reset.
    assert_eq!(window.conversation_history.len(), 4);

    window.reset_with_summary("We wrote a function.".to_string(), None);

    let history = &window.conversation_history;

    // Index 0: still the system prompt.
    assert!(
        history[0].content.contains("You are G3"),
        "First message should be the system prompt"
    );

    // Index 1: the README, still a System message.
    let readme = &history[1];
    assert!(
        matches!(readme.role, MessageRole::System),
        "Second message should be a System message"
    );
    assert!(
        readme.content.contains("Project README"),
        "Second message should be the README"
    );
}
|
||||||
|
|
||||||
|
/// Without a README, `reset_with_summary` must leave the system prompt first
/// and put the summary directly after it.
#[test]
fn test_reset_with_summary_without_readme() {
    let mut window = ContextWindow::new(10000);

    // Only the system prompt is present before the conversation (no README).
    window.add_message(Message::new(
        MessageRole::System,
        "You are G3, an AI programming agent...".to_string(),
    ));
    window.add_message(Message::new(MessageRole::User, "Hello".to_string()));
    window.add_message(Message::new(MessageRole::Assistant, "Hi there!".to_string()));

    window.reset_with_summary("Greeted the user.".to_string(), None);

    let history = &window.conversation_history;

    // Index 0: still the system prompt.
    assert!(
        history[0].content.contains("You are G3"),
        "First message should be the system prompt"
    );

    // Index 1: the summary, since there is no README to sit in between.
    assert!(
        history[1].content.contains("Previous conversation summary"),
        "Second message should be the summary when no README exists"
    );
}
|
||||||
|
|
||||||
|
/// An "Agent Configuration" system message in second position must survive
/// `reset_with_summary`, just as a README does.
#[test]
fn test_reset_with_summary_preserves_agent_configuration() {
    let mut window = ContextWindow::new(10000);

    // System prompt, then Agent Configuration as the second system message.
    window.add_message(Message::new(
        MessageRole::System,
        "You are G3, an AI programming agent...".to_string(),
    ));
    window.add_message(Message::new(
        MessageRole::System,
        "# Agent Configuration\n\nSpecial instructions for this project.".to_string(),
    ));

    // A single user turn of history.
    window.add_message(Message::new(MessageRole::User, "Task: Do something".to_string()));

    window.reset_with_summary("Did something.".to_string(), None);

    // Index 1 must still hold the Agent Configuration.
    let preserved = &window.conversation_history[1];
    assert!(
        preserved.content.contains("Agent Configuration"),
        "Second message should be the Agent Configuration"
    );
}
|
||||||
263
crates/g3-core/tests/test_system_message_loading.rs
Normal file
263
crates/g3-core/tests/test_system_message_loading.rs
Normal file
@@ -0,0 +1,263 @@
|
|||||||
|
//! Tests for verifying system message loading with README content
|
||||||
|
//!
|
||||||
|
//! This test verifies that when a README is present, the system message
|
||||||
|
//! is correctly loaded and structured in the context window.
|
||||||
|
|
||||||
|
use g3_core::ContextWindow;
|
||||||
|
use g3_providers::{Message, MessageRole};
|
||||||
|
|
||||||
|
/// The system prompt added at initialization must be the first history entry.
#[test]
fn test_system_prompt_is_first_message() {
    let mut window = ContextWindow::new(10000);

    // Agent initialization adds the system prompt before anything else.
    window.add_message(Message::new(
        MessageRole::System,
        "You are G3, an AI programming agent of the same skill level...".to_string(),
    ));

    let history = &window.conversation_history;
    assert!(!history.is_empty());

    let head = &history[0];
    assert!(
        matches!(head.role, MessageRole::System),
        "First message should be a System message"
    );
    assert!(
        head.content.contains("You are G3"),
        "First message should contain the system prompt"
    );
}
|
||||||
|
|
||||||
|
/// A README added right after the system prompt must be the second system message.
#[test]
fn test_readme_is_second_message_after_system_prompt() {
    let mut window = ContextWindow::new(10000);

    // System prompt first, mirroring agent initialization.
    window.add_message(Message::new(
        MessageRole::System,
        "You are G3, an AI programming agent of the same skill level...".to_string(),
    ));

    // README second (simulating what Agent::new_with_readme does).
    window.add_message(Message::new(
        MessageRole::System,
        "📚 Project README (from README.md):\n\n# My Project\n\nThis is a test project.".to_string(),
    ));

    let history = &window.conversation_history;
    assert_eq!(history.len(), 2);

    // Index 0: the system prompt.
    let prompt = &history[0];
    assert!(
        matches!(prompt.role, MessageRole::System),
        "First message should be a System message"
    );
    assert!(
        prompt.content.contains("You are G3"),
        "First message should contain the system prompt"
    );

    // Index 1: the README, header and body both present.
    let readme = &history[1];
    assert!(
        matches!(readme.role, MessageRole::System),
        "Second message should be a System message"
    );
    assert!(
        readme.content.contains("Project README"),
        "Second message should contain the README content"
    );
    assert!(
        readme.content.contains("My Project"),
        "Second message should contain the actual README content"
    );
}
|
||||||
|
|
||||||
|
/// The system prompt and the README must be stored as two distinct messages,
/// neither containing the other's text.
#[test]
fn test_system_prompt_and_readme_are_separate() {
    let mut window = ContextWindow::new(10000);

    // Same order as agent initialization: prompt, then README.
    window.add_message(Message::new(
        MessageRole::System,
        "You are G3, an AI programming agent...".to_string(),
    ));
    window.add_message(Message::new(
        MessageRole::System,
        "📚 Project README (from README.md):\n\n# Test Project".to_string(),
    ));

    let history = &window.conversation_history;
    assert_eq!(history.len(), 2);

    // The prompt message must not have absorbed the README.
    assert!(
        !history[0].content.contains("Project README"),
        "System prompt should not contain README content"
    );

    // The README message must not have absorbed the prompt.
    assert!(
        !history[1].content.contains("You are G3"),
        "README message should not contain system prompt"
    );
}
|
||||||
|
|
||||||
|
/// Initialization order is system prompt, README, then TODO list.
#[test]
fn test_todo_is_third_message_after_readme() {
    let mut window = ContextWindow::new(10000);

    // 1. System prompt
    window.add_message(Message::new(
        MessageRole::System,
        "You are G3, an AI programming agent...".to_string(),
    ));

    // 2. README
    window.add_message(Message::new(
        MessageRole::System,
        "📚 Project README (from README.md):\n\n# Test Project".to_string(),
    ));

    // 3. TODO (if present)
    window.add_message(Message::new(
        MessageRole::System,
        "📋 Existing TODO list (from todo.g3.md):\n\n- [ ] Task 1\n- [x] Task 2".to_string(),
    ));

    let history = &window.conversation_history;
    assert_eq!(history.len(), 3);

    // Each slot must hold the message that was added in that position.
    assert!(
        history[0].content.contains("You are G3"),
        "First message should be system prompt"
    );
    assert!(
        history[1].content.contains("Project README"),
        "Second message should be README"
    );
    assert!(
        history[2].content.contains("TODO list"),
        "Third message should be TODO"
    );
}
|
||||||
|
|
||||||
|
/// AGENTS.md and README content delivered as one combined system message must
/// both be present in the second history entry.
#[test]
fn test_agents_and_readme_combined() {
    let mut window = ContextWindow::new(10000);

    window.add_message(Message::new(
        MessageRole::System,
        "You are G3, an AI programming agent...".to_string(),
    ));

    // One message carrying both documents (as done in g3-cli).
    window.add_message(Message::new(
        MessageRole::System,
        "# Agent Configuration\n\nSpecial instructions.\n\n# Project README\n\nProject description.".to_string(),
    ));

    let history = &window.conversation_history;
    assert_eq!(history.len(), 2);

    let combined = &history[1];
    assert!(
        combined.content.contains("Agent Configuration"),
        "Combined message should contain AGENTS.md content"
    );
    assert!(
        combined.content.contains("Project README"),
        "Combined message should contain README content"
    );
}
|
||||||
|
|
||||||
|
/// A user message added after the two system messages must land in third position.
#[test]
fn test_user_messages_after_system_messages() {
    let mut window = ContextWindow::new(10000);

    // Two system messages: prompt and README.
    window.add_message(Message::new(
        MessageRole::System,
        "You are G3, an AI programming agent...".to_string(),
    ));
    window.add_message(Message::new(
        MessageRole::System,
        "📚 Project README (from README.md):\n\n# Test Project".to_string(),
    ));

    // Then the user's request.
    window.add_message(Message::new(
        MessageRole::User,
        "Please help me with this task.".to_string(),
    ));

    let history = &window.conversation_history;
    assert_eq!(history.len(), 3);

    // Roles in order: System, System, User.
    assert!(matches!(history[0].role, MessageRole::System));
    assert!(matches!(history[1].role, MessageRole::System));
    assert!(matches!(history[2].role, MessageRole::User));
}
|
||||||
|
|
||||||
|
/// A whitespace-only README message is expected to be dropped by `add_message`,
/// leaving only the system prompt in the history.
#[test]
fn test_empty_readme_not_added() {
    let mut window = ContextWindow::new(10000);

    window.add_message(Message::new(
        MessageRole::System,
        "You are G3, an AI programming agent...".to_string(),
    ));

    // Whitespace-only content: should be skipped by the empty-content check.
    window.add_message(Message::new(MessageRole::System, " ".to_string()));

    let stored = window.conversation_history.len();
    assert_eq!(
        stored,
        1,
        "Empty README should not be added to conversation history"
    );
}
|
||||||
|
|
||||||
|
/// The reload detection logic must recognise a README sitting at index 1.
#[test]
fn test_readme_detection_for_reload() {
    let mut window = ContextWindow::new(10000);

    window.add_message(Message::new(
        MessageRole::System,
        "You are G3, an AI programming agent...".to_string(),
    ));

    // README carrying one of the expected markers.
    window.add_message(Message::new(
        MessageRole::System,
        "# Project README\n\nThis is the project description.".to_string(),
    ));

    // Index 1 counts as a README when it is a System message containing
    // either marker; a missing index counts as "no README".
    let has_readme = match window.conversation_history.get(1) {
        Some(candidate) => {
            let is_system = matches!(candidate.role, MessageRole::System);
            is_system
                && (candidate.content.contains("Project README")
                    || candidate.content.contains("Agent Configuration"))
        }
        None => false,
    };

    assert!(has_readme, "Should detect README at index 1");
}
|
||||||
|
|
||||||
|
/// The reload detection logic must report no README when index 1 holds a user message.
#[test]
fn test_readme_detection_without_readme() {
    let mut window = ContextWindow::new(10000);

    // Initialization without a README: prompt, then straight to a user turn.
    window.add_message(Message::new(
        MessageRole::System,
        "You are G3, an AI programming agent...".to_string(),
    ));
    window.add_message(Message::new(MessageRole::User, "Hello".to_string()));

    // Same predicate as the positive case: System role plus one of the markers.
    let has_readme = match window.conversation_history.get(1) {
        Some(candidate) => {
            let is_system = matches!(candidate.role, MessageRole::System);
            is_system
                && (candidate.content.contains("Project README")
                    || candidate.content.contains("Agent Configuration"))
        }
        None => false,
    };

    assert!(!has_readme, "Should not detect README when none exists");
}
|
||||||
78
crates/g3-core/tests/test_todo_completion.rs
Normal file
78
crates/g3-core/tests/test_todo_completion.rs
Normal file
@@ -0,0 +1,78 @@
|
|||||||
|
//! Tests for TODO completion detection and file deletion behavior
|
||||||
|
|
||||||
|
/// Reports whether a TODO document is fully checked off (mirrors the logic in lib.rs).
///
/// Returns `true` only when no line, after trimming, starts with `- [ ]` and
/// the text contains at least one `- [x]` or `- [X]` marker anywhere.
fn all_todos_complete(content: &str) -> bool {
    // At least one completed marker must exist, in either letter case.
    let any_done = ["- [x]", "- [X]"]
        .iter()
        .any(|marker| content.contains(*marker));

    // Every line must be free of an open checkbox, regardless of indentation.
    let none_open = content
        .lines()
        .all(|line| !line.trim().starts_with("- [ ]"));

    none_open && any_done
}
|
||||||
|
|
||||||
|
/// Lowercase `[x]` on every item counts as complete.
#[test]
fn test_all_complete_lowercase() {
    assert!(all_todos_complete("# Test\n\n- [x] Done 1\n- [x] Done 2"));
}
|
||||||
|
|
||||||
|
/// Uppercase `[X]` on every item counts as complete.
#[test]
fn test_all_complete_uppercase() {
    assert!(all_todos_complete("# Test\n\n- [X] Done 1\n- [X] Done 2"));
}
|
||||||
|
|
||||||
|
/// A mix of `[x]` and `[X]` items counts as complete.
#[test]
fn test_all_complete_mixed_case() {
    assert!(all_todos_complete("# Test\n\n- [x] Done 1\n- [X] Done 2"));
}
|
||||||
|
|
||||||
|
/// One unchecked item alongside a checked one keeps the list incomplete.
#[test]
fn test_has_incomplete() {
    let todo_list = "# Test\n\n- [x] Done 1\n- [ ] Not done";
    let complete = all_todos_complete(todo_list);
    assert!(!complete);
}
|
||||||
|
|
||||||
|
/// A list with only unchecked items is not complete.
#[test]
fn test_all_incomplete() {
    let todo_list = "# Test\n\n- [ ] Not done 1\n- [ ] Not done 2";
    let complete = all_todos_complete(todo_list);
    assert!(!complete);
}
|
||||||
|
|
||||||
|
/// Text without any checkbox is not treated as a completed TODO list.
#[test]
fn test_no_checkboxes() {
    let plain_text = "# Just a header\n\nSome text without checkboxes";
    let complete = all_todos_complete(plain_text);
    assert!(!complete);
}
|
||||||
|
|
||||||
|
/// Checked child items under a checked parent still yield a complete list.
#[test]
fn test_nested_complete() {
    assert!(all_todos_complete(
        "# Test\n\n- [x] Parent\n - [x] Child 1\n - [x] Child 2"
    ));
}
|
||||||
|
|
||||||
|
/// An unchecked child item makes the whole list incomplete.
#[test]
fn test_nested_incomplete() {
    let todo_list = "# Test\n\n- [x] Parent\n - [x] Child 1\n - [ ] Child 2";
    let complete = all_todos_complete(todo_list);
    assert!(!complete);
}
|
||||||
|
|
||||||
|
/// Leading whitespace must not hide an unchecked item from detection.
#[test]
fn test_indented_incomplete() {
    // The unchecked entry is indented; it should still be detected.
    let todo_list = "# Test\n\n- [x] Done\n - [ ] Indented incomplete";
    let complete = all_todos_complete(todo_list);
    assert!(!complete);
}
|
||||||
|
|
||||||
|
/// Empty input has no checked items and therefore is not complete.
#[test]
fn test_empty_content() {
    let complete = all_todos_complete("");
    assert!(!complete);
}
|
||||||
|
|
||||||
|
/// Whitespace-only input has no checked items and therefore is not complete.
#[test]
fn test_whitespace_only() {
    let blank = " \n\n ";
    let complete = all_todos_complete(blank);
    assert!(!complete);
}
|
||||||
@@ -1,103 +1,170 @@
|
|||||||
use g3_core::ContextWindow;
|
use g3_core::ContextWindow;
|
||||||
use g3_providers::Usage;
|
use g3_providers::{Message, MessageRole, Usage};
|
||||||
|
|
||||||
|
/// Test that used_tokens is tracked via add_message, not update_usage_from_response.
|
||||||
|
/// This is critical for the 80% summarization threshold to work correctly.
|
||||||
#[test]
|
#[test]
|
||||||
fn test_token_accumulation() {
|
fn test_used_tokens_tracked_via_messages() {
|
||||||
let mut window = ContextWindow::new(10000);
|
let mut window = ContextWindow::new(10000);
|
||||||
|
|
||||||
// First API call: 100 prompt + 50 completion = 150 total
|
// Add a user message - this should update used_tokens
|
||||||
let usage1 = Usage {
|
let user_msg = Message::new(MessageRole::User, "Hello, how are you?".to_string());
|
||||||
|
window.add_message(user_msg);
|
||||||
|
|
||||||
|
// used_tokens should be non-zero after adding a message
|
||||||
|
assert!(window.used_tokens > 0, "used_tokens should increase after add_message");
|
||||||
|
let tokens_after_user_msg = window.used_tokens;
|
||||||
|
|
||||||
|
// Add an assistant message
|
||||||
|
let assistant_msg = Message::new(MessageRole::Assistant, "I'm doing well, thank you!".to_string());
|
||||||
|
window.add_message(assistant_msg);
|
||||||
|
|
||||||
|
// used_tokens should increase further
|
||||||
|
assert!(window.used_tokens > tokens_after_user_msg, "used_tokens should increase after adding assistant message");
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Test that update_usage_from_response only updates cumulative_tokens, not used_tokens.
|
||||||
|
/// This prevents double-counting which was causing the 80% threshold to be reached at 200%+.
|
||||||
|
#[test]
|
||||||
|
fn test_update_usage_only_affects_cumulative() {
|
||||||
|
let mut window = ContextWindow::new(10000);
|
||||||
|
|
||||||
|
// Initial state
|
||||||
|
assert_eq!(window.used_tokens, 0);
|
||||||
|
assert_eq!(window.cumulative_tokens, 0);
|
||||||
|
|
||||||
|
// Simulate API response with usage data
|
||||||
|
let usage = Usage {
|
||||||
prompt_tokens: 100,
|
prompt_tokens: 100,
|
||||||
completion_tokens: 50,
|
completion_tokens: 50,
|
||||||
total_tokens: 150,
|
total_tokens: 150,
|
||||||
};
|
};
|
||||||
window.update_usage_from_response(&usage1);
|
window.update_usage_from_response(&usage);
|
||||||
assert_eq!(window.used_tokens, 150, "First call should have 150 tokens");
|
|
||||||
assert_eq!(window.cumulative_tokens, 150, "Cumulative should be 150");
|
|
||||||
|
|
||||||
// Second API call: 200 prompt + 75 completion = 275 total
|
// used_tokens should NOT change - it's tracked via add_message
|
||||||
|
assert_eq!(window.used_tokens, 0, "used_tokens should not be updated by update_usage_from_response");
|
||||||
|
|
||||||
|
// cumulative_tokens SHOULD be updated for API usage tracking
|
||||||
|
assert_eq!(window.cumulative_tokens, 150, "cumulative_tokens should track total API usage");
|
||||||
|
|
||||||
|
// Another API call
|
||||||
let usage2 = Usage {
|
let usage2 = Usage {
|
||||||
prompt_tokens: 200,
|
prompt_tokens: 200,
|
||||||
completion_tokens: 75,
|
completion_tokens: 75,
|
||||||
total_tokens: 275,
|
total_tokens: 275,
|
||||||
};
|
};
|
||||||
window.update_usage_from_response(&usage2);
|
window.update_usage_from_response(&usage2);
|
||||||
assert_eq!(
|
|
||||||
window.used_tokens, 425,
|
|
||||||
"Second call should accumulate to 425 tokens"
|
|
||||||
);
|
|
||||||
assert_eq!(window.cumulative_tokens, 425, "Cumulative should be 425");
|
|
||||||
|
|
||||||
// Third API call with SMALLER token count: 50 prompt + 25 completion = 75 total
|
// used_tokens still unchanged
|
||||||
let usage3 = Usage {
|
assert_eq!(window.used_tokens, 0, "used_tokens should remain unchanged");
|
||||||
prompt_tokens: 50,
|
|
||||||
completion_tokens: 25,
|
|
||||||
total_tokens: 75,
|
|
||||||
};
|
|
||||||
window.update_usage_from_response(&usage3);
|
|
||||||
assert_eq!(
|
|
||||||
window.used_tokens, 500,
|
|
||||||
"Third call should accumulate to 500 tokens"
|
|
||||||
);
|
|
||||||
assert_eq!(window.cumulative_tokens, 500, "Cumulative should be 500");
|
|
||||||
|
|
||||||
// Verify tokens never decrease
|
// cumulative_tokens accumulates
|
||||||
assert!(
|
assert_eq!(window.cumulative_tokens, 425, "cumulative_tokens should accumulate");
|
||||||
window.used_tokens >= 425,
|
|
||||||
"Token count should never decrease!"
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Test that add_streaming_tokens only updates cumulative_tokens.
|
||||||
|
/// The assistant message will be added via add_message which tracks used_tokens.
|
||||||
#[test]
|
#[test]
|
||||||
fn test_add_streaming_tokens() {
|
fn test_add_streaming_tokens_only_affects_cumulative() {
|
||||||
let mut window = ContextWindow::new(10000);
|
let mut window = ContextWindow::new(10000);
|
||||||
|
|
||||||
// Add some streaming tokens
|
// Add streaming tokens (fallback when no usage data available)
|
||||||
window.add_streaming_tokens(100);
|
window.add_streaming_tokens(100);
|
||||||
assert_eq!(window.used_tokens, 100);
|
|
||||||
assert_eq!(window.cumulative_tokens, 100);
|
|
||||||
|
|
||||||
// Add more
|
// used_tokens should NOT change
|
||||||
|
assert_eq!(window.used_tokens, 0, "used_tokens should not be updated by add_streaming_tokens");
|
||||||
|
|
||||||
|
// cumulative_tokens SHOULD be updated
|
||||||
|
assert_eq!(window.cumulative_tokens, 100, "cumulative_tokens should be updated");
|
||||||
|
|
||||||
|
// Add more streaming tokens
|
||||||
window.add_streaming_tokens(50);
|
window.add_streaming_tokens(50);
|
||||||
assert_eq!(window.used_tokens, 150);
|
assert_eq!(window.used_tokens, 0);
|
||||||
assert_eq!(window.cumulative_tokens, 150);
|
assert_eq!(window.cumulative_tokens, 150);
|
||||||
|
|
||||||
// Now update from provider response
|
|
||||||
let usage = Usage {
|
|
||||||
prompt_tokens: 80,
|
|
||||||
completion_tokens: 40,
|
|
||||||
total_tokens: 120,
|
|
||||||
};
|
|
||||||
window.update_usage_from_response(&usage);
|
|
||||||
|
|
||||||
// Should ADD to existing, not replace
|
|
||||||
assert_eq!(window.used_tokens, 270, "Should add 120 to existing 150");
|
|
||||||
assert_eq!(window.cumulative_tokens, 270);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Test percentage calculation is based on used_tokens (actual context content).
|
||||||
#[test]
|
#[test]
|
||||||
fn test_percentage_calculation() {
|
fn test_percentage_based_on_used_tokens() {
|
||||||
let mut window = ContextWindow::new(1000);
|
let mut window = ContextWindow::new(1000);
|
||||||
|
|
||||||
// Add tokens via provider response
|
// Initially 0%
|
||||||
|
assert_eq!(window.percentage_used(), 0.0);
|
||||||
|
assert_eq!(window.remaining_tokens(), 1000);
|
||||||
|
|
||||||
|
// Add messages to increase used_tokens
|
||||||
|
// A message with ~100 chars should be roughly 25-30 tokens
|
||||||
|
let msg = Message::new(MessageRole::User, "x".repeat(400)); // ~100 tokens estimated
|
||||||
|
window.add_message(msg);
|
||||||
|
|
||||||
|
// Percentage should be based on used_tokens
|
||||||
|
let percentage = window.percentage_used();
|
||||||
|
assert!(percentage > 0.0, "percentage should be > 0 after adding message");
|
||||||
|
assert!(percentage < 100.0, "percentage should be < 100");
|
||||||
|
|
||||||
|
// remaining_tokens should decrease
|
||||||
|
assert!(window.remaining_tokens() < 1000, "remaining tokens should decrease");
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Exercises `should_summarize` around the 80% usage mark.
/// This was the original bug - used_tokens was being double/triple counted.
#[test]
fn test_should_summarize_threshold() {
    // Each filler message is ~320 chars, roughly 80 tokens at 4 chars/token.
    fn filler() -> Message {
        Message::new(MessageRole::User, "x".repeat(320))
    }

    let mut window = ContextWindow::new(1000);

    // Nine messages should land around 720 tokens (72%), not yet at the threshold.
    // Note: the actual token count depends on the estimation algorithm.
    for msg in std::iter::repeat_with(filler).take(9) {
        window.add_message(msg);
    }

    let percentage = window.percentage_used();
    println!("After 9 messages: {}% used ({} tokens)", percentage, window.used_tokens);

    // A tenth message is meant to push usage over 80%.
    window.add_message(filler());

    let percentage_after = window.percentage_used();
    println!("After 10 messages: {}% used ({} tokens)", percentage_after, window.used_tokens);

    // Only assert when the estimator actually reports 80% or more.
    if percentage_after >= 80.0 {
        assert!(window.should_summarize(), "should_summarize should be true at 80%+");
    }
}
|
||||||
|
|
||||||
|
/// Test that cumulative_tokens and used_tokens are independent.
|
||||||
|
#[test]
|
||||||
|
fn test_cumulative_vs_used_independence() {
|
||||||
|
let mut window = ContextWindow::new(10000);
|
||||||
|
|
||||||
|
// Add a message (affects used_tokens)
|
||||||
|
let msg = Message::new(MessageRole::User, "Hello world".to_string());
|
||||||
|
window.add_message(msg);
|
||||||
|
let used_after_msg = window.used_tokens;
|
||||||
|
let cumulative_after_msg = window.cumulative_tokens;
|
||||||
|
|
||||||
|
// Both should be equal at this point (message adds to both)
|
||||||
|
assert_eq!(used_after_msg, cumulative_after_msg);
|
||||||
|
|
||||||
|
// Now simulate API response (only affects cumulative_tokens)
|
||||||
let usage = Usage {
|
let usage = Usage {
|
||||||
prompt_tokens: 150,
|
prompt_tokens: 500,
|
||||||
completion_tokens: 100,
|
completion_tokens: 200,
|
||||||
total_tokens: 250,
|
total_tokens: 700,
|
||||||
};
|
};
|
||||||
window.update_usage_from_response(&usage);
|
window.update_usage_from_response(&usage);
|
||||||
|
|
||||||
assert_eq!(window.percentage_used(), 25.0);
|
// used_tokens unchanged
|
||||||
assert_eq!(window.remaining_tokens(), 750);
|
assert_eq!(window.used_tokens, used_after_msg, "used_tokens should not change from API response");
|
||||||
|
|
||||||
// Add more tokens
|
// cumulative_tokens increased
|
||||||
let usage2 = Usage {
|
assert_eq!(window.cumulative_tokens, cumulative_after_msg + 700, "cumulative_tokens should increase");
|
||||||
prompt_tokens: 300,
|
|
||||||
completion_tokens: 200,
|
|
||||||
total_tokens: 500,
|
|
||||||
};
|
|
||||||
window.update_usage_from_response(&usage2);
|
|
||||||
|
|
||||||
assert_eq!(window.percentage_used(), 75.0);
|
// They should now be different
|
||||||
assert_eq!(window.remaining_tokens(), 250);
|
assert!(window.cumulative_tokens > window.used_tokens, "cumulative should be greater than used");
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -81,6 +81,9 @@ impl UiWriter for MockUiWriter {
|
|||||||
.push(format!("CHOICE: {} Options: {:?}", message, options));
|
.push(format!("CHOICE: {} Options: {:?}", message, options));
|
||||||
self.choice_responses.lock().unwrap().pop().unwrap_or(0)
|
self.choice_responses.lock().unwrap().pop().unwrap_or(0)
|
||||||
}
|
}
|
||||||
|
fn print_final_output(&self, summary: &str) {
|
||||||
|
self.output.lock().unwrap().push(format!("FINAL: {}", summary));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
|
|||||||
@@ -85,6 +85,7 @@ pub async fn get_initial_discovery_messages(
|
|||||||
temperature: Some(provider.temperature()),
|
temperature: Some(provider.temperature()),
|
||||||
stream: false,
|
stream: false,
|
||||||
tools: None,
|
tools: None,
|
||||||
|
disable_thinking: false,
|
||||||
};
|
};
|
||||||
|
|
||||||
status("🤖 Calling LLM for discovery commands...");
|
status("🤖 Calling LLM for discovery commands...");
|
||||||
|
|||||||
@@ -26,6 +26,7 @@
|
|||||||
//! Some(0.1),
|
//! Some(0.1),
|
||||||
//! None, // cache_config
|
//! None, // cache_config
|
||||||
//! None, // enable_1m_context
|
//! None, // enable_1m_context
|
||||||
|
//! None, // thinking_budget_tokens
|
||||||
//! )?;
|
//! )?;
|
||||||
//!
|
//!
|
||||||
//! // Create a completion request
|
//! // Create a completion request
|
||||||
@@ -38,6 +39,7 @@
|
|||||||
//! temperature: Some(0.7),
|
//! temperature: Some(0.7),
|
||||||
//! stream: false,
|
//! stream: false,
|
||||||
//! tools: None,
|
//! tools: None,
|
||||||
|
//! disable_thinking: false,
|
||||||
//! };
|
//! };
|
||||||
//!
|
//!
|
||||||
//! // Get a completion
|
//! // Get a completion
|
||||||
@@ -63,6 +65,7 @@
|
|||||||
//! None,
|
//! None,
|
||||||
//! None, // cache_config
|
//! None, // cache_config
|
||||||
//! None, // enable_1m_context
|
//! None, // enable_1m_context
|
||||||
|
//! None, // thinking_budget_tokens
|
||||||
//! )?;
|
//! )?;
|
||||||
//!
|
//!
|
||||||
//! let request = CompletionRequest {
|
//! let request = CompletionRequest {
|
||||||
@@ -73,6 +76,7 @@
|
|||||||
//! temperature: Some(0.7),
|
//! temperature: Some(0.7),
|
||||||
//! stream: true,
|
//! stream: true,
|
||||||
//! tools: None,
|
//! tools: None,
|
||||||
|
//! disable_thinking: false,
|
||||||
//! };
|
//! };
|
||||||
//!
|
//!
|
||||||
//! let mut stream = provider.stream(request).await?;
|
//! let mut stream = provider.stream(request).await?;
|
||||||
@@ -103,7 +107,7 @@ use serde::{Deserialize, Serialize};
|
|||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
use tokio::sync::mpsc;
|
use tokio::sync::mpsc;
|
||||||
use tokio_stream::wrappers::ReceiverStream;
|
use tokio_stream::wrappers::ReceiverStream;
|
||||||
use tracing::{debug, error, warn};
|
use tracing::{debug, error};
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
CompletionChunk, CompletionRequest, CompletionResponse, CompletionStream, LLMProvider, Message,
|
CompletionChunk, CompletionRequest, CompletionResponse, CompletionStream, LLMProvider, Message,
|
||||||
@@ -122,6 +126,7 @@ pub struct AnthropicProvider {
|
|||||||
temperature: f32,
|
temperature: f32,
|
||||||
cache_config: Option<String>,
|
cache_config: Option<String>,
|
||||||
enable_1m_context: bool,
|
enable_1m_context: bool,
|
||||||
|
thinking_budget_tokens: Option<u32>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl AnthropicProvider {
|
impl AnthropicProvider {
|
||||||
@@ -132,6 +137,7 @@ impl AnthropicProvider {
|
|||||||
temperature: Option<f32>,
|
temperature: Option<f32>,
|
||||||
cache_config: Option<String>,
|
cache_config: Option<String>,
|
||||||
enable_1m_context: Option<bool>,
|
enable_1m_context: Option<bool>,
|
||||||
|
thinking_budget_tokens: Option<u32>,
|
||||||
) -> Result<Self> {
|
) -> Result<Self> {
|
||||||
let client = Client::builder()
|
let client = Client::builder()
|
||||||
.timeout(Duration::from_secs(300))
|
.timeout(Duration::from_secs(300))
|
||||||
@@ -150,6 +156,7 @@ impl AnthropicProvider {
|
|||||||
temperature: temperature.unwrap_or(0.1),
|
temperature: temperature.unwrap_or(0.1),
|
||||||
cache_config,
|
cache_config,
|
||||||
enable_1m_context: enable_1m_context.unwrap_or(false),
|
enable_1m_context: enable_1m_context.unwrap_or(false),
|
||||||
|
thinking_budget_tokens,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -223,10 +230,12 @@ impl AnthropicProvider {
|
|||||||
for message in messages {
|
for message in messages {
|
||||||
match message.role {
|
match message.role {
|
||||||
MessageRole::System => {
|
MessageRole::System => {
|
||||||
if system_message.is_some() {
|
if let Some(existing) = system_message {
|
||||||
warn!("Multiple system messages found, using the last one");
|
// Concatenate system messages instead of replacing
|
||||||
|
system_message = Some(format!("{}\n\n{}", existing, message.content));
|
||||||
|
} else {
|
||||||
|
system_message = Some(message.content.clone());
|
||||||
}
|
}
|
||||||
system_message = Some(message.content.clone());
|
|
||||||
}
|
}
|
||||||
MessageRole::User => {
|
MessageRole::User => {
|
||||||
anthropic_messages.push(AnthropicMessage {
|
anthropic_messages.push(AnthropicMessage {
|
||||||
@@ -265,6 +274,7 @@ impl AnthropicProvider {
|
|||||||
streaming: bool,
|
streaming: bool,
|
||||||
max_tokens: u32,
|
max_tokens: u32,
|
||||||
temperature: f32,
|
temperature: f32,
|
||||||
|
disable_thinking: bool,
|
||||||
) -> Result<AnthropicRequest> {
|
) -> Result<AnthropicRequest> {
|
||||||
let (system, anthropic_messages) = self.convert_messages(messages)?;
|
let (system, anthropic_messages) = self.convert_messages(messages)?;
|
||||||
|
|
||||||
@@ -277,6 +287,33 @@ impl AnthropicProvider {
|
|||||||
// Convert tools if provided
|
// Convert tools if provided
|
||||||
let anthropic_tools = tools.map(|t| self.convert_tools(t));
|
let anthropic_tools = tools.map(|t| self.convert_tools(t));
|
||||||
|
|
||||||
|
// Add thinking configuration if budget_tokens is set AND max_tokens is sufficient AND not explicitly disabled
|
||||||
|
// Anthropic requires: max_tokens > thinking.budget_tokens
|
||||||
|
// We add 1024 as minimum buffer for actual response content
|
||||||
|
tracing::debug!("create_request_body called: max_tokens={}, disable_thinking={}, thinking_budget_tokens={:?}", max_tokens, disable_thinking, self.thinking_budget_tokens);
|
||||||
|
|
||||||
|
let thinking = if disable_thinking {
|
||||||
|
tracing::info!(
|
||||||
|
"Thinking mode explicitly disabled for this request (max_tokens={})",
|
||||||
|
max_tokens
|
||||||
|
);
|
||||||
|
None
|
||||||
|
} else {
|
||||||
|
self.thinking_budget_tokens.and_then(|budget| {
|
||||||
|
let min_required = budget + 1024;
|
||||||
|
if max_tokens > min_required {
|
||||||
|
Some(ThinkingConfig::enabled(budget))
|
||||||
|
} else {
|
||||||
|
tracing::warn!(
|
||||||
|
"Disabling thinking mode: max_tokens ({}) is not greater than thinking.budget_tokens ({}) + 1024 buffer. \
|
||||||
|
Required: max_tokens > {}",
|
||||||
|
max_tokens, budget, min_required
|
||||||
|
);
|
||||||
|
None
|
||||||
|
}
|
||||||
|
})
|
||||||
|
};
|
||||||
|
|
||||||
let request = AnthropicRequest {
|
let request = AnthropicRequest {
|
||||||
model: self.model.clone(),
|
model: self.model.clone(),
|
||||||
max_tokens,
|
max_tokens,
|
||||||
@@ -285,6 +322,7 @@ impl AnthropicProvider {
|
|||||||
system,
|
system,
|
||||||
tools: anthropic_tools,
|
tools: anthropic_tools,
|
||||||
stream: streaming,
|
stream: streaming,
|
||||||
|
thinking,
|
||||||
};
|
};
|
||||||
|
|
||||||
// Ensure the conversation starts with a user message
|
// Ensure the conversation starts with a user message
|
||||||
@@ -624,6 +662,7 @@ impl LLMProvider for AnthropicProvider {
|
|||||||
false,
|
false,
|
||||||
max_tokens,
|
max_tokens,
|
||||||
temperature,
|
temperature,
|
||||||
|
request.disable_thinking,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
debug!(
|
debug!(
|
||||||
@@ -697,6 +736,7 @@ impl LLMProvider for AnthropicProvider {
|
|||||||
true,
|
true,
|
||||||
max_tokens,
|
max_tokens,
|
||||||
temperature,
|
temperature,
|
||||||
|
request.disable_thinking,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
debug!(
|
debug!(
|
||||||
@@ -775,6 +815,19 @@ impl LLMProvider for AnthropicProvider {
|
|||||||
|
|
||||||
// Anthropic API request/response structures
|
// Anthropic API request/response structures
|
||||||
|
|
||||||
|
#[derive(Debug, Serialize)]
|
||||||
|
struct ThinkingConfig {
|
||||||
|
#[serde(rename = "type")]
|
||||||
|
thinking_type: String,
|
||||||
|
budget_tokens: u32,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ThinkingConfig {
|
||||||
|
fn enabled(budget_tokens: u32) -> Self {
|
||||||
|
Self { thinking_type: "enabled".to_string(), budget_tokens }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Debug, Serialize)]
|
#[derive(Debug, Serialize)]
|
||||||
struct AnthropicRequest {
|
struct AnthropicRequest {
|
||||||
model: String,
|
model: String,
|
||||||
@@ -786,6 +839,8 @@ struct AnthropicRequest {
|
|||||||
#[serde(skip_serializing_if = "Option::is_none")]
|
#[serde(skip_serializing_if = "Option::is_none")]
|
||||||
tools: Option<Vec<AnthropicTool>>,
|
tools: Option<Vec<AnthropicTool>>,
|
||||||
stream: bool,
|
stream: bool,
|
||||||
|
#[serde(skip_serializing_if = "Option::is_none")]
|
||||||
|
thinking: Option<ThinkingConfig>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Serialize)]
|
#[derive(Debug, Serialize)]
|
||||||
@@ -819,6 +874,12 @@ enum AnthropicContent {
|
|||||||
#[serde(skip_serializing_if = "Option::is_none")]
|
#[serde(skip_serializing_if = "Option::is_none")]
|
||||||
cache_control: Option<crate::CacheControl>,
|
cache_control: Option<crate::CacheControl>,
|
||||||
},
|
},
|
||||||
|
#[serde(rename = "thinking")]
|
||||||
|
Thinking {
|
||||||
|
thinking: String,
|
||||||
|
#[serde(default)]
|
||||||
|
signature: Option<String>,
|
||||||
|
},
|
||||||
#[serde(rename = "tool_use")]
|
#[serde(rename = "tool_use")]
|
||||||
ToolUse {
|
ToolUse {
|
||||||
id: String,
|
id: String,
|
||||||
@@ -884,7 +945,7 @@ mod tests {
|
|||||||
#[test]
|
#[test]
|
||||||
fn test_message_conversion() {
|
fn test_message_conversion() {
|
||||||
let provider =
|
let provider =
|
||||||
AnthropicProvider::new("test-key".to_string(), None, None, None, None, None).unwrap();
|
AnthropicProvider::new("test-key".to_string(), None, None, None, None, None, None).unwrap();
|
||||||
|
|
||||||
let messages = vec![
|
let messages = vec![
|
||||||
Message::new(
|
Message::new(
|
||||||
@@ -912,13 +973,14 @@ mod tests {
|
|||||||
Some(0.5),
|
Some(0.5),
|
||||||
None,
|
None,
|
||||||
None,
|
None,
|
||||||
|
None,
|
||||||
)
|
)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
let messages = vec![Message::new(MessageRole::User, "Test message".to_string())];
|
let messages = vec![Message::new(MessageRole::User, "Test message".to_string())];
|
||||||
|
|
||||||
let request_body = provider
|
let request_body = provider
|
||||||
.create_request_body(&messages, None, false, 1000, 0.5)
|
.create_request_body(&messages, None, false, 1000, 0.5, false)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
assert_eq!(request_body.model, "claude-3-haiku-20240307");
|
assert_eq!(request_body.model, "claude-3-haiku-20240307");
|
||||||
@@ -932,7 +994,7 @@ mod tests {
|
|||||||
#[test]
|
#[test]
|
||||||
fn test_tool_conversion() {
|
fn test_tool_conversion() {
|
||||||
let provider =
|
let provider =
|
||||||
AnthropicProvider::new("test-key".to_string(), None, None, None, None, None).unwrap();
|
AnthropicProvider::new("test-key".to_string(), None, None, None, None, None, None).unwrap();
|
||||||
|
|
||||||
let tools = vec![Tool {
|
let tools = vec![Tool {
|
||||||
name: "get_weather".to_string(),
|
name: "get_weather".to_string(),
|
||||||
@@ -965,7 +1027,7 @@ mod tests {
|
|||||||
#[test]
|
#[test]
|
||||||
fn test_cache_control_serialization() {
|
fn test_cache_control_serialization() {
|
||||||
let provider =
|
let provider =
|
||||||
AnthropicProvider::new("test-key".to_string(), None, None, None, None, None).unwrap();
|
AnthropicProvider::new("test-key".to_string(), None, None, None, None, None, None).unwrap();
|
||||||
|
|
||||||
// Test message WITHOUT cache_control
|
// Test message WITHOUT cache_control
|
||||||
let messages_without = vec![Message::new(MessageRole::User, "Hello".to_string())];
|
let messages_without = vec![Message::new(MessageRole::User, "Hello".to_string())];
|
||||||
@@ -1007,4 +1069,114 @@ mod tests {
|
|||||||
"JSON should not contain 'cache_control' field or null values when not configured"
|
"JSON should not contain 'cache_control' field or null values when not configured"
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Checks when the `thinking` field appears in the serialized request body:
/// omitted without a budget, present with a budget and sufficient `max_tokens`,
/// and omitted again when `max_tokens` is too small for the budget.
#[test]
fn test_thinking_parameter_serialization() {
    let messages = vec![Message::new(MessageRole::User, "Test message".to_string())];

    // Builds the request body for `provider` at the given max_tokens and renders it as JSON.
    let to_json = |provider: &AnthropicProvider, max_tokens: u32| {
        let body = provider
            .create_request_body(&messages, None, false, max_tokens, 0.5, false)
            .unwrap();
        serde_json::to_string(&body).unwrap()
    };

    // Case 1: provider WITHOUT a thinking budget.
    let provider_without = AnthropicProvider::new(
        "test-key".to_string(),
        Some("claude-sonnet-4-5".to_string()),
        Some(1000),
        Some(0.5),
        None,
        None,
        None, // No thinking budget
    )
    .unwrap();
    let json_without = to_json(&provider_without, 1000);
    assert!(!json_without.contains("thinking"), "JSON should not contain 'thinking' field when not configured");

    // Case 2: provider WITH a thinking budget. max_tokens must be > budget_tokens + 1024,
    // so budget=10000 requires max_tokens > 11024.
    let provider_with = AnthropicProvider::new(
        "test-key".to_string(),
        Some("claude-sonnet-4-5".to_string()),
        Some(20000), // Sufficient for thinking budget
        Some(0.5),
        None,
        None,
        Some(10000), // With thinking budget
    )
    .unwrap();
    let json_with = to_json(&provider_with, 20000);
    assert!(json_with.contains("thinking"), "JSON should contain 'thinking' field when configured");
    assert!(json_with.contains("\"type\":\"enabled\""), "JSON should contain type: enabled");
    assert!(json_with.contains("\"budget_tokens\":10000"), "JSON should contain budget_tokens: 10000");

    // Case 3: same budget but INSUFFICIENT max_tokens (less than budget + 1024),
    // so thinking should be dropped from the request.
    let json_insufficient = to_json(&provider_with, 5000);
    assert!(!json_insufficient.contains("thinking"), "JSON should NOT contain 'thinking' field when max_tokens is insufficient");
}
|
||||||
|
|
||||||
|
/// Checks that `disable_thinking = true` suppresses the `thinking` field even when
/// a budget is configured and `max_tokens` is sufficient.
#[test]
fn test_disable_thinking_flag() {
    let messages = vec![Message::new(MessageRole::User, "Test message".to_string())];

    let provider = AnthropicProvider::new(
        "test-key".to_string(),
        Some("claude-sonnet-4-5".to_string()),
        Some(20000),
        Some(0.5),
        None,
        None,
        Some(10000), // With thinking budget
    )
    .unwrap();

    // Renders the request body as JSON for the given disable_thinking flag.
    let to_json = |disable_thinking: bool| {
        let body = provider
            .create_request_body(&messages, None, false, 20000, 0.5, disable_thinking)
            .unwrap();
        serde_json::to_string(&body).unwrap()
    };

    // Flag off: thinking should be enabled because max_tokens is sufficient.
    let json_with = to_json(false);
    assert!(json_with.contains("thinking"), "JSON should contain 'thinking' field when not disabled");

    // Flag on: thinking should be disabled despite sufficient max_tokens.
    let json_without = to_json(true);
    assert!(!json_without.contains("thinking"), "JSON should NOT contain 'thinking' field when explicitly disabled");
}
|
||||||
|
|
||||||
|
/// Checks that a response containing a `thinking` content block deserializes and
/// that only the text block remains once non-text blocks are filtered out.
#[test]
fn test_thinking_content_block_deserialization() {
    // This is the shape Anthropic returns when extended thinking is enabled.
    let json_response = r#"{
"content": [
{"type": "thinking", "thinking": "Let me analyze this...", "signature": "abc123"},
{"type": "text", "text": "Here is my response."}
],
"model": "claude-sonnet-4-5",
"usage": {"input_tokens": 100, "output_tokens": 50}
}"#;

    let parsed: AnthropicResponse = serde_json::from_str(json_response)
        .expect("Should be able to deserialize response with thinking block");

    assert_eq!(parsed.content.len(), 2);
    assert_eq!(parsed.model, "claude-sonnet-4-5");

    // Keep only text blocks; the thinking block should be filtered out.
    let mut text_content = Vec::new();
    for block in &parsed.content {
        if let AnthropicContent::Text { text, .. } = block {
            text_content.push(text.as_str());
        }
    }

    assert_eq!(text_content.len(), 1);
    assert_eq!(text_content[0], "Here is my response.");
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -45,6 +45,7 @@
|
|||||||
//! temperature: Some(0.7),
|
//! temperature: Some(0.7),
|
||||||
//! stream: false,
|
//! stream: false,
|
||||||
//! tools: None,
|
//! tools: None,
|
||||||
|
//! disable_thinking: false,
|
||||||
//! };
|
//! };
|
||||||
//!
|
//!
|
||||||
//! // Get a completion
|
//! // Get a completion
|
||||||
|
|||||||
@@ -42,6 +42,8 @@ pub struct CompletionRequest {
|
|||||||
pub temperature: Option<f32>,
|
pub temperature: Option<f32>,
|
||||||
pub stream: bool,
|
pub stream: bool,
|
||||||
pub tools: Option<Vec<Tool>>,
|
pub tools: Option<Vec<Tool>>,
|
||||||
|
/// Force disable thinking mode for this request (used when max_tokens is too low)
|
||||||
|
pub disable_thinking: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
|||||||
Reference in New Issue
Block a user