Compare commits: jochen-son ... jochen-fix (10 commits)

| Author | SHA1 | Date |
|---|---|---|
| | 4aa84e2144 | |
| | 2283d9ddbf | |
| | fb2cf6f898 | |
| | 696c441a47 | |
| | 48e6d594bc | |
| | 678403da35 | |
| | 0970e4f356 | |
| | 758a313de0 | |
| | 0327a6dfdf | |
| | 928f2bfa9d | |
Cargo.lock (generated): 1 change

@@ -1377,6 +1377,7 @@ dependencies = [
 "serde",
 "serde_json",
 "sha2",
 "tempfile",
 "termimad",
 "tokio",
 "tokio-util",
@@ -76,6 +76,7 @@ G3 includes robust error handling with automatic retry logic:

G3's interactive CLI includes control commands for manual context management:

- **`/compact`**: Manually trigger summarization to compact conversation history
- **`/thinnify`**: Manually trigger context thinning to replace large tool results with file references
- **`/skinnify`**: Manually trigger full context thinning (like `/thinnify` but processes the entire context window, not just the first third)
- **`/readme`**: Reload README.md and AGENTS.md from disk without restarting
- **`/stats`**: Show detailed context and performance statistics
- **`/help`**: Display all available control commands
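Both `/thinnify` and `/skinnify` offload oversized tool results to files on disk, keeping only a pointer in the conversation. The sketch below illustrates that core transformation on a single message; the `Message` struct, the function name, and the temp-path layout are simplified placeholders for illustration, not the actual G3 types:

```rust
use std::fs;

// Simplified, hypothetical stand-in for G3's conversation message type.
struct Message {
    role: String,
    content: String,
}

// Replace a large "Tool result:" message with a reference to a file on disk.
// Returns the number of characters saved, mirroring the bookkeeping in the diff.
fn skinnify_message(msg: &mut Message, index: usize, tmp_dir: &str) -> std::io::Result<usize> {
    if msg.role == "user" && msg.content.starts_with("Tool result:") && msg.content.len() > 500 {
        let file_path = format!("{}/skinny_tool_result_{}.txt", tmp_dir, index);
        fs::write(&file_path, &msg.content)?; // persist the full content for later inspection
        let saved = msg.content.len();
        msg.content = format!("Tool result saved to {}", file_path);
        return Ok(saved.saturating_sub(msg.content.len()));
    }
    Ok(0)
}
```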
@@ -27,3 +27,6 @@ chrono = { version = "0.4", features = ["serde"] }
crossterm = "0.29.0"
ratatui = "0.29"
termimad = "0.34.0"

[dev-dependencies]
tempfile = "3.8"
@@ -163,15 +163,66 @@ fn extract_coach_feedback_from_logs(
|
||||
if let Some(context_window) = log_json.get("context_window") {
|
||||
if let Some(conversation_history) = context_window.get("conversation_history") {
|
||||
if let Some(messages) = conversation_history.as_array() {
|
||||
// Simply get the last message content - this is the coach's final feedback
|
||||
if let Some(last_message) = messages.last() {
|
||||
if let Some(content) = last_message.get("content") {
|
||||
if let Some(content_str) = content.as_str() {
|
||||
output.print(&format!(
|
||||
"✅ Extracted coach feedback from session: {}",
|
||||
session_id
|
||||
));
|
||||
return Ok(content_str.to_string());
|
||||
// Go backwards through the conversation to find the last tool result
|
||||
// that corresponds to a final_output tool call
|
||||
for i in (0..messages.len()).rev() {
|
||||
let msg = &messages[i];
|
||||
|
||||
// Check if this is a User message with "Tool result:"
|
||||
if let Some(role) = msg.get("role") {
|
||||
if let Some(role_str) = role.as_str() {
|
||||
if role_str == "User" || role_str == "user" {
|
||||
if let Some(content) = msg.get("content") {
|
||||
if let Some(content_str) = content.as_str() {
|
||||
if content_str.starts_with("Tool result:") {
|
||||
// Found a tool result, now check the preceding message
|
||||
// to verify it was a final_output tool call
|
||||
if i > 0 {
|
||||
let prev_msg = &messages[i - 1];
|
||||
if let Some(prev_role) = prev_msg.get("role") {
|
||||
if let Some(prev_role_str) = prev_role.as_str() {
|
||||
if prev_role_str == "assistant" || prev_role_str == "Assistant" {
|
||||
if let Some(prev_content) = prev_msg.get("content") {
|
||||
if let Some(prev_content_str) = prev_content.as_str() {
|
||||
// Check if the previous assistant message contains a final_output tool call
|
||||
if prev_content_str.contains("\"tool\": \"final_output\"") {
|
||||
// This is a final_output tool result
|
||||
let feedback = if content_str.starts_with("Tool result: ") {
|
||||
content_str.strip_prefix("Tool result: ")
|
||||
.unwrap_or(content_str)
|
||||
.to_string()
|
||||
} else {
|
||||
content_str.to_string()
|
||||
};
|
||||
|
||||
output.print(&format!(
|
||||
"Coach feedback extracted: {} characters (from {} total)",
|
||||
feedback.len(),
|
||||
content_str.len()
|
||||
));
|
||||
output.print(&format!("Coach feedback:\n{}", feedback));
|
||||
|
||||
output.print(&format!(
|
||||
"✅ Extracted coach feedback from session: {} (verified final_output tool)",
|
||||
session_id
|
||||
));
|
||||
return Ok(feedback);
|
||||
} else {
|
||||
output.print(&format!(
|
||||
"⚠️ Skipping tool result at index {} - not a final_output tool call",
|
||||
i
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
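The nested `if let` chain in the hunk above runs many levels deep; an equivalent, flatter formulation using iterator combinators is sketched below. This is our own refactoring sketch, not code from the diff, and the function name is hypothetical:

```rust
use serde_json::Value;

// Scan the conversation from the end for a "Tool result:" user message whose
// preceding assistant message contains a final_output tool call.
fn find_final_output_feedback(messages: &[Value]) -> Option<String> {
    (1..messages.len()).rev().find_map(|i| {
        let role = messages[i].get("role")?.as_str()?;
        let content = messages[i].get("content")?.as_str()?;
        let prev_role = messages[i - 1].get("role")?.as_str()?;
        let prev_content = messages[i - 1].get("content")?.as_str()?;

        let is_tool_result =
            role.eq_ignore_ascii_case("user") && content.starts_with("Tool result:");
        let follows_final_output = prev_role.eq_ignore_ascii_case("assistant")
            && prev_content.contains("\"tool\": \"final_output\"");

        if is_tool_result && follows_final_output {
            Some(content.strip_prefix("Tool result: ").unwrap_or(content).to_string())
        } else {
            None
        }
    })
}
```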
@@ -187,7 +238,7 @@ fn extract_coach_feedback_from_logs(
"CRITICAL: Could not extract coach feedback from session: {}\n\
Log file path: {:?}\n\
Log file exists: {}\n\
This indicates the coach did not call any tool or the log is corrupted.\n\
This indicates the coach did not call final_output tool or the log is corrupted.\n\
Coach result response length: {} chars",
session_id,
log_file_path,
@@ -1283,6 +1334,7 @@ async fn run_interactive<W: UiWriter>(
output.print("📖 Control Commands:");
output.print(" /compact - Trigger auto-summarization (compacts conversation history)");
output.print(" /thinnify - Trigger context thinning (replaces large tool results with file references)");
output.print(" /skinnify - Trigger full context thinning (like /thinnify but for entire context, not just first third)");
output.print(
    " /readme - Reload README.md and AGENTS.md from disk",
);
@@ -1315,6 +1367,11 @@ async fn run_interactive<W: UiWriter>(
println!("{}", summary);
continue;
}
"/skinnify" => {
    let summary = agent.force_thin_all();
    println!("{}", summary);
    continue;
}
"/readme" => {
    output.print("📚 Reloading README.md and AGENTS.md...");
    match agent.reload_readme() {
@@ -1524,6 +1581,12 @@ async fn run_interactive_machine(
println!("{}", summary);
continue;
}
"/skinnify" => {
    println!("COMMAND: skinnify");
    let summary = agent.force_thin_all();
    println!("{}", summary);
    continue;
}
"/readme" => {
    println!("COMMAND: readme");
    match agent.reload_readme() {
@@ -1546,7 +1609,7 @@ async fn run_interactive_machine(
}
"/help" => {
    println!("COMMAND: help");
    println!("AVAILABLE_COMMANDS: /compact /thinnify /readme /stats /help");
    println!("AVAILABLE_COMMANDS: /compact /thinnify /skinnify /readme /stats /help");
    continue;
}
_ => {
crates/g3-cli/tests/coach_feedback_extraction_test.rs (new file, 336 additions)
@@ -0,0 +1,336 @@
|
||||
use serde_json::json;
|
||||
use std::fs;
|
||||
use tempfile::TempDir;
|
||||
|
||||
#[test]
|
||||
fn test_extract_coach_feedback_with_timing_message() {
|
||||
// Create a temporary directory for logs
|
||||
let temp_dir = TempDir::new().unwrap();
|
||||
let logs_dir = temp_dir.path().join("logs");
|
||||
fs::create_dir(&logs_dir).unwrap();
|
||||
|
||||
// Create a mock session log with the problematic conversation history
|
||||
// where timing message appears after the tool result
|
||||
let session_id = "test_session_123";
|
||||
let log_file_path = logs_dir.join(format!("g3_session_{}.json", session_id));
|
||||
|
||||
let log_content = json!({
|
||||
"session_id": session_id,
|
||||
"context_window": {
|
||||
"conversation_history": [
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "{\"tool\": \"final_output\", \"args\": {\"summary\":\"IMPLEMENTATION_APPROVED\"}}"
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Tool result: IMPLEMENTATION_APPROVED"
|
||||
},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "🕝 27.7s | 💭 7.5s"
|
||||
}
|
||||
]
|
||||
}
|
||||
});
|
||||
|
||||
fs::write(&log_file_path, serde_json::to_string_pretty(&log_content).unwrap()).unwrap();
|
||||
|
||||
// Now test the extraction logic
|
||||
let log_content_str = fs::read_to_string(&log_file_path).unwrap();
|
||||
let log_json: serde_json::Value = serde_json::from_str(&log_content_str).unwrap();
|
||||
|
||||
if let Some(context_window) = log_json.get("context_window") {
|
||||
if let Some(conversation_history) = context_window.get("conversation_history") {
|
||||
if let Some(messages) = conversation_history.as_array() {
|
||||
// This is the key logic we're testing - find the last USER message with "Tool result:"
|
||||
let last_tool_result = messages.iter().rev().find(|msg| {
|
||||
if let Some(role) = msg.get("role") {
|
||||
if let Some(role_str) = role.as_str() {
|
||||
if role_str == "User" || role_str == "user" {
|
||||
if let Some(content) = msg.get("content") {
|
||||
if let Some(content_str) = content.as_str() {
|
||||
return content_str.starts_with("Tool result:");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
false
|
||||
});
|
||||
|
||||
// Verify we found the correct message
|
||||
assert!(last_tool_result.is_some(), "Should find the tool result message");
|
||||
|
||||
if let Some(last_message) = last_tool_result {
|
||||
if let Some(content) = last_message.get("content") {
|
||||
if let Some(content_str) = content.as_str() {
|
||||
let feedback = if content_str.starts_with("Tool result: ") {
|
||||
content_str.strip_prefix("Tool result: ").unwrap_or(content_str)
|
||||
} else {
|
||||
content_str
|
||||
};
|
||||
|
||||
// Verify we extracted the correct feedback
|
||||
assert_eq!(feedback, "IMPLEMENTATION_APPROVED", "Should extract the actual feedback, not timing");
|
||||
|
||||
// Verify the feedback is NOT the timing message
|
||||
assert!(!feedback.contains("🕝"), "Feedback should not be the timing message");
|
||||
|
||||
println!("✅ Successfully extracted coach feedback: {}", feedback);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
panic!("Failed to extract coach feedback");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_extract_only_final_output_tool_results() {
|
||||
// Test that we only extract tool results from final_output, not from other tools
|
||||
let temp_dir = TempDir::new().unwrap();
|
||||
let logs_dir = temp_dir.path().join("logs");
|
||||
fs::create_dir(&logs_dir).unwrap();
|
||||
|
||||
let session_id = "test_session_final_output_only";
|
||||
let log_file_path = logs_dir.join(format!("g3_session_{}.json", session_id));
|
||||
|
||||
let log_content = json!({
|
||||
"session_id": session_id,
|
||||
"context_window": {
|
||||
"conversation_history": [
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "{\"tool\": \"shell\", \"args\": {\"command\":\"ls\"}}"
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Tool result: file1.txt\nfile2.txt"
|
||||
},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "{\"tool\": \"read_file\", \"args\": {\"file_path\":\"test.txt\"}}"
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Tool result: This is test content"
|
||||
},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "{\"tool\": \"final_output\", \"args\": {\"summary\":\"APPROVED_RESULT\"}}"
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Tool result: APPROVED_RESULT"
|
||||
},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "🕝 20.5s | 💭 5.2s"
|
||||
}
|
||||
]
|
||||
}
|
||||
});
|
||||
|
||||
fs::write(&log_file_path, serde_json::to_string_pretty(&log_content).unwrap()).unwrap();
|
||||
|
||||
// Test the new extraction logic that verifies the tool is final_output
|
||||
let log_content_str = fs::read_to_string(&log_file_path).unwrap();
|
||||
let log_json: serde_json::Value = serde_json::from_str(&log_content_str).unwrap();
|
||||
|
||||
if let Some(context_window) = log_json.get("context_window") {
|
||||
if let Some(conversation_history) = context_window.get("conversation_history") {
|
||||
if let Some(messages) = conversation_history.as_array() {
|
||||
// Go backwards through messages to find final_output tool result
|
||||
for i in (0..messages.len()).rev() {
|
||||
let msg = &messages[i];
|
||||
|
||||
if let Some(role) = msg.get("role") {
|
||||
if let Some(role_str) = role.as_str() {
|
||||
if role_str == "User" || role_str == "user" {
|
||||
if let Some(content) = msg.get("content") {
|
||||
if let Some(content_str) = content.as_str() {
|
||||
if content_str.starts_with("Tool result:") {
|
||||
// Check if preceding message was final_output
|
||||
if i > 0 {
|
||||
let prev_msg = &messages[i - 1];
|
||||
if let Some(prev_content) = prev_msg.get("content") {
|
||||
if let Some(prev_content_str) = prev_content.as_str() {
|
||||
if prev_content_str.contains("\"tool\": \"final_output\"") {
|
||||
let feedback = content_str.strip_prefix("Tool result: ").unwrap_or(content_str);
|
||||
assert_eq!(feedback, "APPROVED_RESULT", "Should extract only final_output result");
|
||||
println!("✅ Correctly extracted only final_output tool result: {}", feedback);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
panic!("Failed to extract final_output tool result");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_extract_coach_feedback_without_timing_message() {
|
||||
// Create a temporary directory for logs
|
||||
let temp_dir = TempDir::new().unwrap();
|
||||
let logs_dir = temp_dir.path().join("logs");
|
||||
fs::create_dir(&logs_dir).unwrap();
|
||||
|
||||
// Test the case where there's no timing message (backward compatibility)
|
||||
let session_id = "test_session_456";
|
||||
let log_file_path = logs_dir.join(format!("g3_session_{}.json", session_id));
|
||||
|
||||
let log_content = json!({
|
||||
"session_id": session_id,
|
||||
"context_window": {
|
||||
"conversation_history": [
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "{\"tool\": \"final_output\", \"args\": {\"summary\":\"TEST_FEEDBACK\"}}"
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Tool result: TEST_FEEDBACK"
|
||||
}
|
||||
]
|
||||
}
|
||||
});
|
||||
|
||||
fs::write(&log_file_path, serde_json::to_string_pretty(&log_content).unwrap()).unwrap();
|
||||
|
||||
// Test extraction
|
||||
let log_content_str = fs::read_to_string(&log_file_path).unwrap();
|
||||
let log_json: serde_json::Value = serde_json::from_str(&log_content_str).unwrap();
|
||||
|
||||
if let Some(context_window) = log_json.get("context_window") {
|
||||
if let Some(conversation_history) = context_window.get("conversation_history") {
|
||||
if let Some(messages) = conversation_history.as_array() {
|
||||
let last_tool_result = messages.iter().rev().find(|msg| {
|
||||
if let Some(role) = msg.get("role") {
|
||||
if let Some(role_str) = role.as_str() {
|
||||
if role_str == "User" || role_str == "user" {
|
||||
if let Some(content) = msg.get("content") {
|
||||
if let Some(content_str) = content.as_str() {
|
||||
return content_str.starts_with("Tool result:");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
false
|
||||
});
|
||||
|
||||
assert!(last_tool_result.is_some());
|
||||
|
||||
if let Some(last_message) = last_tool_result {
|
||||
if let Some(content) = last_message.get("content") {
|
||||
if let Some(content_str) = content.as_str() {
|
||||
let feedback = content_str.strip_prefix("Tool result: ").unwrap_or(content_str);
|
||||
assert_eq!(feedback, "TEST_FEEDBACK");
|
||||
println!("✅ Successfully extracted coach feedback without timing: {}", feedback);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
panic!("Failed to extract coach feedback");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_extract_coach_feedback_with_multiple_tool_results() {
|
||||
// Test that we get the LAST tool result when there are multiple
|
||||
let temp_dir = TempDir::new().unwrap();
|
||||
let logs_dir = temp_dir.path().join("logs");
|
||||
fs::create_dir(&logs_dir).unwrap();
|
||||
|
||||
let session_id = "test_session_789";
|
||||
let log_file_path = logs_dir.join(format!("g3_session_{}.json", session_id));
|
||||
|
||||
let log_content = json!({
|
||||
"session_id": session_id,
|
||||
"context_window": {
|
||||
"conversation_history": [
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "{\"tool\": \"shell\", \"args\": {\"command\":\"ls\"}}"
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Tool result: file1.txt\nfile2.txt"
|
||||
},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "{\"tool\": \"final_output\", \"args\": {\"summary\":\"FINAL_RESULT\"}}"
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Tool result: FINAL_RESULT"
|
||||
},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "🕝 15.2s | 💭 3.1s"
|
||||
}
|
||||
]
|
||||
}
|
||||
});
|
||||
|
||||
fs::write(&log_file_path, serde_json::to_string_pretty(&log_content).unwrap()).unwrap();
|
||||
|
||||
// Test extraction
|
||||
let log_content_str = fs::read_to_string(&log_file_path).unwrap();
|
||||
let log_json: serde_json::Value = serde_json::from_str(&log_content_str).unwrap();
|
||||
|
||||
if let Some(context_window) = log_json.get("context_window") {
|
||||
if let Some(conversation_history) = context_window.get("conversation_history") {
|
||||
if let Some(messages) = conversation_history.as_array() {
|
||||
let last_tool_result = messages.iter().rev().find(|msg| {
|
||||
if let Some(role) = msg.get("role") {
|
||||
if let Some(role_str) = role.as_str() {
|
||||
if role_str == "User" || role_str == "user" {
|
||||
if let Some(content) = msg.get("content") {
|
||||
if let Some(content_str) = content.as_str() {
|
||||
return content_str.starts_with("Tool result:");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
false
|
||||
});
|
||||
|
||||
assert!(last_tool_result.is_some());
|
||||
|
||||
if let Some(last_message) = last_tool_result {
|
||||
if let Some(content) = last_message.get("content") {
|
||||
if let Some(content_str) = content.as_str() {
|
||||
let feedback = content_str.strip_prefix("Tool result: ").unwrap_or(content_str);
|
||||
// Should get the LAST tool result (final_output), not the first one (shell)
|
||||
assert_eq!(feedback, "FINAL_RESULT", "Should extract the last tool result");
|
||||
assert!(!feedback.contains("file1.txt"), "Should not extract earlier tool results");
|
||||
println!("✅ Successfully extracted last tool result: {}", feedback);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
panic!("Failed to extract coach feedback");
|
||||
}
|
||||
@@ -737,6 +737,233 @@ Format this as a detailed but concise summary that can be used to resume the con
|
||||
}
|
||||
}
|
||||
|
||||
/// Perform context thinning on the ENTIRE conversation history (not just first third)
|
||||
/// This is the "skinnify" variant that processes all messages
|
||||
/// Returns a summary message about what was thinned
|
||||
pub fn thin_context_all(&mut self) -> (String, usize) {
|
||||
let current_percentage = self.percentage_used() as u32;
|
||||
|
||||
// Calculate the total messages - process ALL of them
|
||||
let total_messages = self.conversation_history.len();
|
||||
|
||||
let mut leaned_count = 0;
|
||||
let mut tool_call_leaned_count = 0;
|
||||
let mut chars_saved = 0;
|
||||
|
||||
// Create ~/tmp directory if it doesn't exist
|
||||
let tmp_dir = shellexpand::tilde("~/tmp").to_string();
|
||||
if let Err(e) = std::fs::create_dir_all(&tmp_dir) {
|
||||
warn!("Failed to create ~/tmp directory: {}", e);
|
||||
return (
|
||||
"⚠️ Context skinnifying failed: could not create ~/tmp directory".to_string(),
|
||||
0,
|
||||
);
|
||||
}
|
||||
|
||||
// Scan ALL messages (not just first third)
|
||||
for i in 0..total_messages {
|
||||
// Check if the previous message was a TODO tool call (before getting mutable reference)
|
||||
let is_todo_result = if i > 0 {
|
||||
if let Some(prev_message) = self.conversation_history.get(i - 1) {
|
||||
if matches!(prev_message.role, MessageRole::Assistant) {
|
||||
prev_message.content.contains(r#""tool":"todo_read""#)
|
||||
|| prev_message.content.contains(r#""tool":"todo_write""#)
|
||||
|| prev_message.content.contains(r#""tool": "todo_read""#)
|
||||
|| prev_message.content.contains(r#""tool": "todo_write""#)
|
||||
} else {
|
||||
false
|
||||
}
|
||||
} else {
|
||||
false
|
||||
}
|
||||
} else {
|
||||
false
|
||||
};
|
||||
|
||||
if let Some(message) = self.conversation_history.get_mut(i) {
|
||||
// Process User messages that look like tool results
|
||||
if matches!(message.role, MessageRole::User)
|
||||
&& message.content.starts_with("Tool result:")
|
||||
{
|
||||
let content_len = message.content.len();
|
||||
|
||||
// Only thin if the content is greater than 500 chars and not a TODO tool result
|
||||
if !is_todo_result && content_len > 500 {
|
||||
// Generate a unique filename based on timestamp and index
|
||||
let timestamp = std::time::SystemTime::now()
|
||||
.duration_since(std::time::UNIX_EPOCH)
|
||||
.unwrap_or_default()
|
||||
.as_secs();
|
||||
let filename = format!("skinny_tool_result_{}_{}.txt", timestamp, i);
|
||||
let file_path = format!("{}/{}", tmp_dir, filename);
|
||||
|
||||
// Write the content to file
|
||||
if let Err(e) = std::fs::write(&file_path, &message.content) {
|
||||
warn!("Failed to write skinnified content to {}: {}", file_path, e);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Replace the message content with a note
|
||||
let original_len = message.content.len();
|
||||
message.content = format!("Tool result saved to {}", file_path);
|
||||
|
||||
leaned_count += 1;
|
||||
chars_saved += original_len - message.content.len();
|
||||
|
||||
debug!(
|
||||
"Skinnified tool result {} ({} chars) to {}",
|
||||
i, original_len, file_path
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Process Assistant messages that contain tool calls with large arguments
|
||||
if matches!(message.role, MessageRole::Assistant) {
|
||||
// Try to parse the message content as JSON to find tool calls
|
||||
let content = &message.content;
|
||||
|
||||
// Look for JSON tool call patterns
|
||||
if let Some(tool_call_start) = content
|
||||
.find(r#"{"tool":"#)
|
||||
.or_else(|| content.find(r#"{ "tool":"#))
|
||||
.or_else(|| content.find(r#"{"tool" :"#))
|
||||
.or_else(|| content.find(r#"{ "tool" :"#))
|
||||
{
|
||||
// Try to extract and parse the JSON tool call
|
||||
let json_portion = &content[tool_call_start..];
|
||||
|
||||
// Find the end of the JSON object
|
||||
if let Some(json_end) = Self::find_json_end(json_portion) {
|
||||
let json_str = &json_portion[..=json_end];
|
||||
|
||||
// Try to parse as ToolCall
|
||||
if let Ok(mut tool_call) = serde_json::from_str::<ToolCall>(json_str) {
|
||||
let mut modified = false;
|
||||
|
||||
// Handle write_file tool calls
|
||||
if tool_call.tool == "write_file" {
|
||||
if let Some(args_obj) = tool_call.args.as_object_mut() {
|
||||
let content_info = args_obj
|
||||
.get("content")
|
||||
.and_then(|v| v.as_str())
|
||||
.map(|s| (s.to_string(), s.len()));
|
||||
|
||||
if let Some((content_str, content_len)) = content_info {
|
||||
if content_len > 500 {
|
||||
let timestamp = std::time::SystemTime::now()
|
||||
.duration_since(std::time::UNIX_EPOCH)
|
||||
.unwrap_or_default()
|
||||
.as_secs();
|
||||
let filename = format!(
|
||||
"skinny_write_file_content_{}_{}.txt",
|
||||
timestamp, i
|
||||
);
|
||||
let file_path = format!("{}/{}", tmp_dir, filename);
|
||||
|
||||
if std::fs::write(&file_path, &content_str).is_ok() {
|
||||
args_obj.insert(
|
||||
"content".to_string(),
|
||||
serde_json::Value::String(format!(
|
||||
"<content saved to {}>",
|
||||
file_path
|
||||
)),
|
||||
);
|
||||
modified = true;
|
||||
chars_saved += content_len;
|
||||
tool_call_leaned_count += 1;
|
||||
debug!("Skinnified write_file content {} ({} chars) to {}", i, content_len, file_path);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Handle str_replace tool calls
|
||||
if tool_call.tool == "str_replace" {
|
||||
if let Some(args_obj) = tool_call.args.as_object_mut() {
|
||||
let diff_info = args_obj
|
||||
.get("diff")
|
||||
.and_then(|v| v.as_str())
|
||||
.map(|s| (s.to_string(), s.len()));
|
||||
|
||||
if let Some((diff_str, diff_len)) = diff_info {
|
||||
if diff_len > 500 {
|
||||
let timestamp = std::time::SystemTime::now()
|
||||
.duration_since(std::time::UNIX_EPOCH)
|
||||
.unwrap_or_default()
|
||||
.as_secs();
|
||||
let filename = format!(
|
||||
"skinny_str_replace_diff_{}_{}.txt",
|
||||
timestamp, i
|
||||
);
|
||||
let file_path = format!("{}/{}", tmp_dir, filename);
|
||||
|
||||
if std::fs::write(&file_path, &diff_str).is_ok() {
|
||||
args_obj.insert(
|
||||
"diff".to_string(),
|
||||
serde_json::Value::String(format!(
|
||||
"<diff saved to {}>",
|
||||
file_path
|
||||
)),
|
||||
);
|
||||
modified = true;
|
||||
chars_saved += diff_len;
|
||||
tool_call_leaned_count += 1;
|
||||
debug!("Skinnified str_replace diff {} ({} chars) to {}", i, diff_len, file_path);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If we modified the tool call, reconstruct the message
|
||||
if modified {
|
||||
let prefix = &content[..tool_call_start];
|
||||
let suffix = &content[tool_call_start + json_str.len()..];
|
||||
|
||||
// Serialize the modified tool call
|
||||
if let Ok(new_json) = serde_json::to_string(&tool_call) {
|
||||
message.content =
|
||||
format!("{}{}{}", prefix, new_json, suffix);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Recalculate token usage after thinning
|
||||
self.recalculate_tokens();
|
||||
|
||||
if leaned_count > 0 {
|
||||
if tool_call_leaned_count > 0 {
|
||||
(format!("🦴 Context skinnified at {}%: {} tool results + {} tool calls across entire history, ~{} chars saved",
|
||||
current_percentage, leaned_count, tool_call_leaned_count, chars_saved), chars_saved)
|
||||
} else {
|
||||
(
|
||||
format!(
|
||||
"🦴 Context skinnified at {}%: {} tool results across entire history, ~{} chars saved",
|
||||
current_percentage, leaned_count, chars_saved
|
||||
),
|
||||
chars_saved,
|
||||
)
|
||||
}
|
||||
} else if tool_call_leaned_count > 0 {
|
||||
(
|
||||
format!(
|
||||
"🦴 Context skinnified at {}%: {} tool calls across entire history, ~{} chars saved",
|
||||
current_percentage, tool_call_leaned_count, chars_saved
|
||||
),
|
||||
chars_saved,
|
||||
)
|
||||
} else {
|
||||
(format!("ℹ Context skinnifying triggered at {}% but no large tool results or tool calls found in entire history",
|
||||
current_percentage), 0)
|
||||
}
|
||||
}
|
||||
|
||||
/// Recalculate token usage based on current conversation history
|
||||
fn recalculate_tokens(&mut self) {
|
||||
let mut total = 0;
|
||||
@@ -1181,14 +1408,237 @@ impl<W: UiWriter> Agent<W> {
|
||||
|
||||
/// Resolve the max_tokens to use for a given provider, applying fallbacks
|
||||
fn resolve_max_tokens(&self, provider_name: &str) -> u32 {
|
||||
match provider_name {
|
||||
let base = match provider_name {
|
||||
"databricks" => Self::provider_max_tokens(&self.config, "databricks")
|
||||
.or(Some(self.config.agent.fallback_default_max_tokens as u32))
|
||||
.unwrap_or(32000),
|
||||
other => Self::provider_max_tokens(&self.config, other)
|
||||
.or(Some(self.config.agent.fallback_default_max_tokens as u32))
|
||||
.unwrap_or(16000),
|
||||
};
|
||||
|
||||
// For Anthropic with thinking enabled, ensure max_tokens is sufficient
|
||||
// Anthropic requires: max_tokens > thinking.budget_tokens
|
||||
if provider_name == "anthropic" {
|
||||
if let Some(budget) = self.get_thinking_budget_tokens() {
|
||||
let minimum_for_thinking = budget + 1024;
|
||||
return base.max(minimum_for_thinking);
|
||||
}
|
||||
}
|
||||
|
||||
base
|
||||
}
|
||||
|
||||
/// Get the thinking budget tokens for Anthropic provider, if configured
|
||||
fn get_thinking_budget_tokens(&self) -> Option<u32> {
|
||||
self.config
|
||||
.providers
|
||||
.anthropic
|
||||
.as_ref()
|
||||
.and_then(|c| c.thinking_budget_tokens)
|
||||
}
|
||||
|
||||
/// Pre-flight check to validate and adjust max_tokens for the thinking.budget_tokens constraint.
|
||||
/// Returns the adjusted max_tokens that satisfies: max_tokens > thinking.budget_tokens
|
||||
/// Also returns whether we need to apply fallback actions (thinnify/skinnify).
|
||||
///
|
||||
/// Returns: (adjusted_max_tokens, needs_context_reduction)
|
||||
fn preflight_validate_max_tokens(
|
||||
&self,
|
||||
provider_name: &str,
|
||||
proposed_max_tokens: u32,
|
||||
) -> (u32, bool) {
|
||||
// Only applies to Anthropic provider with thinking enabled
|
||||
if provider_name != "anthropic" {
|
||||
return (proposed_max_tokens, false);
|
||||
}
|
||||
|
||||
let budget_tokens = match self.get_thinking_budget_tokens() {
|
||||
Some(budget) => budget,
|
||||
None => return (proposed_max_tokens, false), // No thinking enabled
|
||||
};
|
||||
|
||||
// Anthropic requires: max_tokens > budget_tokens
|
||||
// We add a minimum output buffer of 1024 tokens for actual response content
|
||||
let minimum_required = budget_tokens + 1024;
|
||||
|
||||
if proposed_max_tokens >= minimum_required {
|
||||
// We have enough headroom
|
||||
(proposed_max_tokens, false)
|
||||
} else {
|
||||
// max_tokens is too low - need to either adjust or reduce context
|
||||
warn!(
|
||||
"max_tokens ({}) is below required minimum ({}) for thinking.budget_tokens ({}). Context reduction needed.",
|
||||
proposed_max_tokens, minimum_required, budget_tokens
|
||||
);
|
||||
// Return the minimum required, but flag that we need context reduction
|
||||
(minimum_required, true)
|
||||
}
|
||||
}
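To make the constraint above concrete (worked numbers of our own, not taken from the diff): with `thinking.budget_tokens = 10000`, the minimum acceptable `max_tokens` is 10000 + 1024 = 11024, so a proposed value of 8000 is raised to 11024 and flagged for context reduction, while 16000 passes through unchanged. A standalone sketch of that rule:

```rust
// Illustrative sketch of the preflight rule, assuming a fixed thinking budget.
fn preflight(proposed: u32, budget: u32) -> (u32, bool) {
    let minimum_required = budget + 1024; // Anthropic needs max_tokens > thinking.budget_tokens
    if proposed >= minimum_required {
        (proposed, false)        // enough headroom, no context reduction needed
    } else {
        (minimum_required, true) // too low: raise to the floor and flag reduction
    }
}

fn main() {
    assert_eq!(preflight(16000, 10000), (16000, false));
    assert_eq!(preflight(8000, 10000), (11024, true));
}
```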
|
||||
|
||||
/// Calculate max_tokens for a summary request, ensuring it satisfies the thinking constraint.
|
||||
/// Applies fallback sequence: thinnify -> skinnify -> hard-coded minimum
|
||||
/// Returns (max_tokens, whether_fallback_was_used)
|
||||
fn calculate_summary_max_tokens(
|
||||
&mut self,
|
||||
provider_name: &str,
|
||||
) -> (u32, bool) {
|
||||
let model_limit = self.context_window.total_tokens;
|
||||
let current_usage = self.context_window.used_tokens;
|
||||
|
||||
// Get the configured max_tokens for this provider
|
||||
let configured_max_tokens = self.resolve_max_tokens(provider_name);
|
||||
|
||||
// Calculate available tokens with buffer
|
||||
let buffer = (model_limit / 40).clamp(1000, 10000); // 2.5% buffer
|
||||
let available = model_limit
|
||||
.saturating_sub(current_usage)
|
||||
.saturating_sub(buffer);
|
||||
// Use the smaller of available tokens or configured max_tokens,
|
||||
// but ensure we don't go below thinking budget floor for Anthropic
|
||||
let proposed_max_tokens = available.min(configured_max_tokens);
|
||||
let proposed_max_tokens = if provider_name == "anthropic" {
|
||||
if let Some(budget) = self.get_thinking_budget_tokens() {
|
||||
proposed_max_tokens.max(budget + 1024)
|
||||
} else {
|
||||
proposed_max_tokens
|
||||
}
|
||||
} else {
|
||||
proposed_max_tokens
|
||||
};
|
||||
|
||||
// Validate against thinking budget constraint
|
||||
let (adjusted, needs_reduction) = self.preflight_validate_max_tokens(provider_name, proposed_max_tokens);
|
||||
|
||||
if !needs_reduction {
|
||||
return (adjusted, false);
|
||||
}
|
||||
|
||||
// We need more headroom - the context is too full
|
||||
// Return the adjusted value but flag that fallbacks are needed
|
||||
(adjusted, true)
|
||||
}
|
||||
|
||||
/// Apply the fallback sequence to free up context space for thinking budget.
|
||||
/// Sequence: thinnify (first third) → skinnify (all) → hard-coded minimum
|
||||
/// Returns the validated max_tokens that satisfies thinking.budget_tokens constraint.
|
||||
fn apply_max_tokens_fallback_sequence(
|
||||
&mut self,
|
||||
provider_name: &str,
|
||||
initial_max_tokens: u32,
|
||||
hard_coded_minimum: u32,
|
||||
) -> u32 {
|
||||
let (mut max_tokens, needs_reduction) = self.preflight_validate_max_tokens(provider_name, initial_max_tokens);
|
||||
|
||||
if !needs_reduction {
|
||||
return max_tokens;
|
||||
}
|
||||
|
||||
self.ui_writer.print_context_status(
|
||||
"⚠️ Context window too full for thinking budget. Applying fallback sequence...\n",
|
||||
);
|
||||
|
||||
// Step 1: Try thinnify (first third of context)
|
||||
self.ui_writer.print_context_status("🥒 Step 1: Trying thinnify...\n");
|
||||
let (thin_msg, thin_saved) = self.context_window.thin_context();
|
||||
self.thinning_events.push(thin_saved);
|
||||
self.ui_writer.print_context_thinning(&thin_msg);
|
||||
|
||||
// Recalculate max_tokens after thinnify
|
||||
let recalc_max = self.resolve_max_tokens(provider_name);
|
||||
let (new_max, still_needs_reduction) = self.preflight_validate_max_tokens(provider_name, recalc_max);
|
||||
max_tokens = new_max;
|
||||
|
||||
if !still_needs_reduction {
|
||||
self.ui_writer.print_context_status(
|
||||
"✅ Thinnify resolved capacity issue. Continuing...\n",
|
||||
);
|
||||
return max_tokens;
|
||||
}
|
||||
|
||||
// Step 2: Try skinnify (entire context)
|
||||
self.ui_writer.print_context_status("🦴 Step 2: Trying skinnify...\n");
|
||||
let (skinny_msg, skinny_saved) = self.context_window.thin_context_all();
|
||||
self.thinning_events.push(skinny_saved);
|
||||
self.ui_writer.print_context_thinning(&skinny_msg);
|
||||
|
||||
// Recalculate max_tokens after skinnify
|
||||
let recalc_max = self.resolve_max_tokens(provider_name);
|
||||
let (final_max, final_needs_reduction) = self.preflight_validate_max_tokens(provider_name, recalc_max);
|
||||
max_tokens = final_max;
|
||||
|
||||
if !final_needs_reduction {
|
||||
self.ui_writer.print_context_status(
|
||||
"✅ Skinnify resolved capacity issue. Continuing...\n",
|
||||
);
|
||||
return max_tokens;
|
||||
}
|
||||
|
||||
// Step 3: Nothing worked, use hard-coded minimum as last resort
|
||||
self.ui_writer.print_context_status(&format!(
|
||||
"⚠️ Step 3: Context reduction insufficient. Using hard-coded max_tokens={} as last resort...\n",
|
||||
hard_coded_minimum
|
||||
));
|
||||
|
||||
hard_coded_minimum
|
||||
}
|
||||
|
||||
/// Apply the fallback sequence for summary requests to free up context space.
|
||||
/// Uses calculate_summary_max_tokens for recalculation (based on available space).
|
||||
/// Returns the validated max_tokens for summary requests.
|
||||
fn apply_summary_fallback_sequence(
|
||||
&mut self,
|
||||
provider_name: &str,
|
||||
) -> u32 {
|
||||
let (mut summary_max_tokens, needs_reduction) = self.calculate_summary_max_tokens(provider_name);
|
||||
|
||||
if !needs_reduction {
|
||||
return summary_max_tokens;
|
||||
}
|
||||
|
||||
self.ui_writer.print_context_status(
|
||||
"⚠️ Context window too full for thinking budget. Applying fallback sequence...\n",
|
||||
);
|
||||
|
||||
// Step 1: Try thinnify (first third of context)
|
||||
self.ui_writer.print_context_status("🥒 Step 1: Trying thinnify...\n");
|
||||
let (thin_msg, thin_saved) = self.context_window.thin_context();
|
||||
self.thinning_events.push(thin_saved);
|
||||
self.ui_writer.print_context_thinning(&thin_msg);
|
||||
|
||||
// Recalculate max_tokens after thinnify
|
||||
let (new_max, still_needs_reduction) = self.calculate_summary_max_tokens(provider_name);
|
||||
summary_max_tokens = new_max;
|
||||
|
||||
if !still_needs_reduction {
|
||||
self.ui_writer.print_context_status(
|
||||
"✅ Thinnify resolved capacity issue. Continuing...\n",
|
||||
);
|
||||
return summary_max_tokens;
|
||||
}
|
||||
|
||||
// Step 2: Try skinnify (entire context)
|
||||
self.ui_writer.print_context_status("🦴 Step 2: Trying skinnify...\n");
|
||||
let (skinny_msg, skinny_saved) = self.context_window.thin_context_all();
|
||||
self.thinning_events.push(skinny_saved);
|
||||
self.ui_writer.print_context_thinning(&skinny_msg);
|
||||
|
||||
// Recalculate max_tokens after skinnify
|
||||
let (final_max, final_needs_reduction) = self.calculate_summary_max_tokens(provider_name);
|
||||
summary_max_tokens = final_max;
|
||||
|
||||
if !final_needs_reduction {
|
||||
self.ui_writer.print_context_status(
|
||||
"✅ Skinnify resolved capacity issue. Continuing...\n",
|
||||
);
|
||||
return summary_max_tokens;
|
||||
}
|
||||
|
||||
// Step 3: Nothing worked, use hard-coded minimum
|
||||
self.ui_writer.print_context_status(
|
||||
"⚠️ Step 3: Context reduction insufficient. Using hard-coded max_tokens=5000 as last resort...\n",
|
||||
);
|
||||
5000
|
||||
}
|
||||
|
||||
/// Resolve the temperature to use for a given provider, applying fallbacks
|
||||
@@ -1578,8 +2028,14 @@ impl<W: UiWriter> Agent<W> {
|
||||
};
|
||||
let _ = provider; // Drop the provider reference to avoid borrowing issues
|
||||
|
||||
// Get max_tokens from provider configuration, falling back to sensible defaults
|
||||
let max_tokens = Some(self.resolve_max_tokens(&provider_name));
|
||||
// Get max_tokens from provider configuration with preflight validation
|
||||
// This ensures max_tokens > thinking.budget_tokens for Anthropic with extended thinking
|
||||
let initial_max_tokens = self.resolve_max_tokens(&provider_name);
|
||||
let max_tokens = Some(self.apply_max_tokens_fallback_sequence(
|
||||
&provider_name,
|
||||
initial_max_tokens,
|
||||
16000, // Hard-coded minimum for main API calls (higher than summary's 5000)
|
||||
));
|
||||
|
||||
let request = CompletionRequest {
|
||||
messages,
|
||||
@@ -1587,6 +2043,7 @@ impl<W: UiWriter> Agent<W> {
|
||||
temperature: Some(self.resolve_temperature(&provider_name)),
|
||||
stream: true, // Enable streaming
|
||||
tools,
|
||||
disable_thinking: false,
|
||||
};
|
||||
|
||||
// Time the LLM call with cancellation support and streaming
|
||||
@@ -1984,6 +2441,32 @@ impl<W: UiWriter> Agent<W> {
|
||||
self.context_window.percentage_used() as u32
|
||||
));
|
||||
|
||||
let provider = self.providers.get(None)?;
|
||||
let provider_name = provider.name().to_string();
|
||||
let _ = provider; // Release borrow early
|
||||
|
||||
// Apply fallback sequence: thinnify -> skinnify -> hard-coded 5000
|
||||
let mut summary_max_tokens = self.apply_summary_fallback_sequence(&provider_name);
|
||||
|
||||
// Apply provider-specific caps
|
||||
// For Anthropic with thinking enabled, we need max_tokens > thinking.budget_tokens
|
||||
// So we set a higher cap when thinking is configured
|
||||
let anthropic_cap = match self.get_thinking_budget_tokens() {
|
||||
Some(budget) => (budget + 2000).max(10_000), // At least budget + 2000 for response
|
||||
None => 10_000,
|
||||
};
|
||||
summary_max_tokens = match provider_name.as_str() {
|
||||
"anthropic" => summary_max_tokens.min(anthropic_cap),
|
||||
"databricks" => summary_max_tokens.min(10_000),
|
||||
"embedded" => summary_max_tokens.min(3000),
|
||||
_ => summary_max_tokens.min(5000),
|
||||
};
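As a quick illustration of the cap arithmetic above (our own numbers): with `thinking_budget_tokens = 10000`, `anthropic_cap` becomes `(10000 + 2000).max(10_000) = 12000`; with a small budget of 5000 it stays at the 10000 floor; and with no budget configured it is simply 10000. A minimal sketch:

```rust
// Illustrative only: the provider cap used when clamping summary_max_tokens.
fn anthropic_cap(thinking_budget: Option<u32>) -> u32 {
    match thinking_budget {
        Some(budget) => (budget + 2000).max(10_000), // leave room for the response on top of thinking
        None => 10_000,
    }
}

fn main() {
    assert_eq!(anthropic_cap(Some(10_000)), 12_000);
    assert_eq!(anthropic_cap(Some(5_000)), 10_000);
    assert_eq!(anthropic_cap(None), 10_000);
}
```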
|
||||
|
||||
debug!(
|
||||
"Requesting summary with max_tokens: {} (current usage: {} tokens)",
|
||||
summary_max_tokens, self.context_window.used_tokens
|
||||
);
|
||||
|
||||
// Create summary request with FULL history
|
||||
let summary_prompt = self.context_window.create_summary_prompt();
|
||||
|
||||
@@ -2012,41 +2495,26 @@ impl<W: UiWriter> Agent<W> {
|
||||
|
||||
let provider = self.providers.get(None)?;
|
||||
|
||||
// Dynamically calculate max_tokens for summary based on what's left
|
||||
let summary_max_tokens = match provider.name() {
|
||||
"databricks" | "anthropic" => {
|
||||
let model_limit = self.context_window.total_tokens;
|
||||
let current_usage = self.context_window.used_tokens;
|
||||
let available = model_limit
|
||||
.saturating_sub(current_usage)
|
||||
.saturating_sub(5000);
|
||||
Some(available.min(10_000))
|
||||
// Determine if we need to disable thinking mode for this request
|
||||
// Anthropic requires: max_tokens > thinking.budget_tokens + 1024
|
||||
let disable_thinking = self.get_thinking_budget_tokens().map_or(false, |budget| {
|
||||
let minimum_for_thinking = budget + 1024;
|
||||
let should_disable = summary_max_tokens <= minimum_for_thinking;
|
||||
if should_disable {
|
||||
tracing::warn!("Disabling thinking mode for summary: max_tokens ({}) <= minimum_for_thinking ({})", summary_max_tokens, minimum_for_thinking);
|
||||
}
|
||||
"embedded" => {
|
||||
let model_limit = self.context_window.total_tokens;
|
||||
let current_usage = self.context_window.used_tokens;
|
||||
let available = model_limit
|
||||
.saturating_sub(current_usage)
|
||||
.saturating_sub(1000);
|
||||
Some(available.min(3000))
|
||||
}
|
||||
_ => {
|
||||
let available = self.context_window.remaining_tokens().saturating_sub(2000);
|
||||
Some(available.min(5000))
|
||||
}
|
||||
};
|
||||
should_disable
|
||||
});
|
||||
|
||||
debug!(
|
||||
"Requesting summary with max_tokens: {:?} (current usage: {} tokens)",
|
||||
summary_max_tokens, self.context_window.used_tokens
|
||||
);
|
||||
tracing::debug!("Creating summary request: max_tokens={}, disable_thinking={}", summary_max_tokens, disable_thinking);
|
||||
|
||||
let summary_request = CompletionRequest {
|
||||
messages: summary_messages,
|
||||
max_tokens: summary_max_tokens,
|
||||
max_tokens: Some(summary_max_tokens),
|
||||
temperature: Some(self.resolve_temperature(provider.name())),
|
||||
stream: false,
|
||||
tools: None,
|
||||
disable_thinking,
|
||||
};
|
||||
|
||||
// Get the summary
|
||||
@@ -2090,6 +2558,15 @@ impl<W: UiWriter> Agent<W> {
|
||||
message
|
||||
}
|
||||
|
||||
/// Manually trigger context thinning for the ENTIRE context window
|
||||
/// Unlike force_thin which only processes the first third, this processes all messages
|
||||
pub fn force_thin_all(&mut self) -> String {
|
||||
info!("Manual full context skinnifying triggered");
|
||||
let (message, chars_saved) = self.context_window.thin_context_all();
|
||||
self.thinning_events.push(chars_saved);
|
||||
message
|
||||
}
|
||||
|
||||
/// Reload README.md and AGENTS.md and replace the first system message
|
||||
/// Returns Ok(true) if README was found and reloaded, Ok(false) if no README was present initially
|
||||
pub fn reload_readme(&mut self) -> Result<bool> {
|
||||
@@ -2998,6 +3475,32 @@ impl<W: UiWriter> Agent<W> {
|
||||
self.context_window.percentage_used() as u32
|
||||
));
|
||||
|
||||
let provider = self.providers.get(None)?;
|
||||
let provider_name = provider.name().to_string();
|
||||
let _ = provider; // Release borrow early
|
||||
|
||||
// Apply fallback sequence: thinnify -> skinnify -> hard-coded 5000
|
||||
let mut summary_max_tokens = self.apply_summary_fallback_sequence(&provider_name);
|
||||
|
||||
// Apply provider-specific caps
|
||||
// For Anthropic with thinking enabled, we need max_tokens > thinking.budget_tokens
|
||||
// So we set a higher cap when thinking is configured
|
||||
let anthropic_cap = match self.get_thinking_budget_tokens() {
|
||||
Some(budget) => (budget + 2000).max(10_000), // At least budget + 2000 for response
|
||||
None => 10_000,
|
||||
};
|
||||
summary_max_tokens = match provider_name.as_str() {
|
||||
"anthropic" => summary_max_tokens.min(anthropic_cap),
|
||||
"databricks" => summary_max_tokens.min(10_000),
|
||||
"embedded" => summary_max_tokens.min(3000),
|
||||
_ => summary_max_tokens.min(5000),
|
||||
};
|
||||
|
||||
debug!(
|
||||
"Requesting summary with max_tokens: {} (current usage: {} tokens)",
|
||||
summary_max_tokens, self.context_window.used_tokens
|
||||
);
|
||||
|
||||
// Create summary request with FULL history
|
||||
let summary_prompt = self.context_window.create_summary_prompt();
|
||||
|
||||
@@ -3026,85 +3529,26 @@ impl<W: UiWriter> Agent<W> {
|
||||
|
||||
let provider = self.providers.get(None)?;
|
||||
|
||||
// Dynamically calculate max_tokens for summary based on what's left
|
||||
// We need to ensure: used_tokens + max_tokens <= total_context_limit
|
||||
let summary_max_tokens = match provider.name() {
|
||||
"databricks" | "anthropic" => {
|
||||
// Use the actual configured context window size
|
||||
let model_limit = self.context_window.total_tokens;
|
||||
let current_usage = self.context_window.used_tokens;
|
||||
|
||||
// Check if we have enough capacity for summarization
|
||||
if current_usage >= model_limit.saturating_sub(1000) {
|
||||
error!("Context window at capacity ({}%), cannot summarize. Current: {}, Limit: {}",
|
||||
self.context_window.percentage_used(), current_usage, model_limit);
|
||||
return Err(anyhow::anyhow!("Context window at capacity. Try using /thinnify or /compact commands to reduce context size, or start a new session."));
|
||||
}
|
||||
|
||||
// Leave buffer proportional to model size (min 1k, max 10k)
|
||||
let buffer = (model_limit / 40).clamp(1000, 10000); // 2.5% buffer
|
||||
let available = model_limit
|
||||
.saturating_sub(current_usage)
|
||||
.saturating_sub(buffer);
|
||||
// Cap at a reasonable summary size (10k tokens max)
|
||||
Some(available.min(10_000))
|
||||
// Determine if we need to disable thinking mode for this request
|
||||
// Anthropic requires: max_tokens > thinking.budget_tokens + 1024
|
||||
let disable_thinking = self.get_thinking_budget_tokens().map_or(false, |budget| {
|
||||
let minimum_for_thinking = budget + 1024;
|
||||
let should_disable = summary_max_tokens <= minimum_for_thinking;
|
||||
if should_disable {
|
||||
tracing::warn!("Disabling thinking mode for summary: max_tokens ({}) <= minimum_for_thinking ({})", summary_max_tokens, minimum_for_thinking);
|
||||
}
|
||||
"embedded" => {
|
||||
// For smaller context models, be more conservative
|
||||
let model_limit = self.context_window.total_tokens;
|
||||
let current_usage = self.context_window.used_tokens;
|
||||
should_disable
|
||||
});
|
||||
|
||||
// Check capacity for embedded models too
|
||||
if current_usage >= model_limit.saturating_sub(500) {
|
||||
error!(
|
||||
"Embedded model context window at capacity ({}%)",
|
||||
self.context_window.percentage_used()
|
||||
);
|
||||
return Err(anyhow::anyhow!("Context window at capacity. Try using /thinnify command to reduce context size, or start a new session."));
|
||||
}
|
||||
|
||||
// Leave 1k buffer
|
||||
let available = model_limit
|
||||
.saturating_sub(current_usage)
|
||||
.saturating_sub(1000);
|
||||
// Cap at 3k for embedded models
|
||||
Some(available.min(3000))
|
||||
}
|
||||
_ => {
|
||||
// Default: conservative approach
|
||||
let model_limit = self.context_window.total_tokens;
|
||||
let current_usage = self.context_window.used_tokens;
|
||||
|
||||
if current_usage >= model_limit.saturating_sub(1000) {
|
||||
error!(
|
||||
"Context window at capacity ({}%)",
|
||||
self.context_window.percentage_used()
|
||||
);
|
||||
return Err(anyhow::anyhow!("Context window at capacity. Try using /thinnify or /compact commands, or start a new session."));
|
||||
}
|
||||
|
||||
let available = self.context_window.remaining_tokens().saturating_sub(2000);
|
||||
Some(available.min(5000))
|
||||
}
|
||||
};
|
||||
|
||||
debug!(
|
||||
"Requesting summary with max_tokens: {:?} (current usage: {} tokens)",
|
||||
summary_max_tokens, self.context_window.used_tokens
|
||||
);
|
||||
|
||||
// Final safety check
|
||||
if summary_max_tokens.unwrap_or(0) == 0 {
|
||||
error!("No tokens available for summarization");
|
||||
return Err(anyhow::anyhow!("No context window capacity left for summarization. Use /thinnify to reduce context size or start a new session."));
|
||||
}
|
||||
tracing::debug!("Creating auto-summary request: max_tokens={}, disable_thinking={}", summary_max_tokens, disable_thinking);
|
||||
|
||||
let summary_request = CompletionRequest {
|
||||
messages: summary_messages,
|
||||
max_tokens: summary_max_tokens,
|
||||
max_tokens: Some(summary_max_tokens),
|
||||
temperature: Some(self.resolve_temperature(provider.name())),
|
||||
stream: false,
|
||||
tools: None,
|
||||
disable_thinking,
|
||||
};
|
||||
|
||||
// Get the summary
|
||||
@@ -3604,40 +4048,6 @@ impl<W: UiWriter> Agent<W> {
|
||||
}
|
||||
}
|
||||
|
||||
// Check if this was a final_output tool call
|
||||
if tool_call.tool == "final_output" {
|
||||
// The summary was already displayed via print_final_output
|
||||
// Don't add it to full_response to avoid duplicate printing
|
||||
// full_response is intentionally left empty/unchanged
|
||||
self.ui_writer.println("");
|
||||
let _ttft =
|
||||
first_token_time.unwrap_or_else(|| stream_start.elapsed());
|
||||
|
||||
// Add timing if needed
|
||||
let final_response = if show_timing {
|
||||
format!(
|
||||
"🕝 {} | 💭 {}",
|
||||
Self::format_duration(stream_start.elapsed()),
|
||||
Self::format_duration(_ttft)
|
||||
)
|
||||
} else {
|
||||
// Return empty string since content was already displayed
|
||||
String::new()
|
||||
};
|
||||
|
||||
return Ok(TaskResult::new(
|
||||
final_response,
|
||||
self.context_window.clone(),
|
||||
));
|
||||
}
|
||||
|
||||
// Closure marker with timing
|
||||
if tool_call.tool != "final_output" {
|
||||
self.ui_writer
|
||||
.print_tool_timing(&Self::format_duration(exec_duration));
|
||||
self.ui_writer.print_agent_prompt();
|
||||
}
|
||||
|
||||
// Add the tool call and result to the context window using RAW unfiltered content
|
||||
// This ensures the log file contains the true raw content including JSON tool calls
|
||||
let tool_message = if !raw_content_for_log.trim().is_empty() {
|
||||
@@ -3701,6 +4111,43 @@ impl<W: UiWriter> Agent<W> {
|
||||
self.context_window.add_message(tool_message);
|
||||
self.context_window.add_message(result_message);
|
||||
|
||||
// Check if this was a final_output tool call
|
||||
if tool_call.tool == "final_output" {
|
||||
// Save context window BEFORE returning so the session log includes final_output
|
||||
self.save_context_window("completed");
|
||||
|
||||
// The summary was already displayed via print_final_output
|
||||
// Don't add it to full_response to avoid duplicate printing
|
||||
// full_response is intentionally left empty/unchanged
|
||||
self.ui_writer.println("");
|
||||
let _ttft =
|
||||
first_token_time.unwrap_or_else(|| stream_start.elapsed());
|
||||
|
||||
// Add timing if needed
|
||||
let final_response = if show_timing {
|
||||
format!(
|
||||
"🕝 {} | 💭 {}",
|
||||
Self::format_duration(stream_start.elapsed()),
|
||||
Self::format_duration(_ttft)
|
||||
)
|
||||
} else {
|
||||
// Return empty string since content was already displayed
|
||||
String::new()
|
||||
};
|
||||
|
||||
return Ok(TaskResult::new(
|
||||
final_response,
|
||||
self.context_window.clone(),
|
||||
));
|
||||
}
|
||||
|
||||
// Closure marker with timing
|
||||
if tool_call.tool != "final_output" {
|
||||
self.ui_writer
|
||||
.print_tool_timing(&Self::format_duration(exec_duration));
|
||||
self.ui_writer.print_agent_prompt();
|
||||
}
|
||||
|
||||
// Update the request with the new context for next iteration
|
||||
request.messages = self.context_window.conversation_history.clone();
|
||||
|
||||
@@ -3922,6 +4369,9 @@ impl<W: UiWriter> Agent<W> {
|
||||
full_response = String::new();
|
||||
|
||||
self.ui_writer.println("");
|
||||
|
||||
// Save context window BEFORE returning
|
||||
self.save_context_window("completed");
|
||||
let _ttft =
|
||||
first_token_time.unwrap_or_else(|| stream_start.elapsed());
|
||||
|
||||
@@ -4060,6 +4510,9 @@ impl<W: UiWriter> Agent<W> {
|
||||
}
|
||||
}
|
||||
|
||||
// Save context window BEFORE returning
|
||||
self.save_context_window("completed");
|
||||
|
||||
// Add timing if needed
|
||||
let final_response = if show_timing {
|
||||
format!(
|
||||
@@ -4786,7 +5239,14 @@ impl<W: UiWriter> Agent<W> {
|
||||
Ok(_) => {
|
||||
let mut todo = self.todo_content.write().await;
|
||||
*todo = String::new();
|
||||
return Ok("✅ All TODOs completed! Removed todo.g3.md".to_string());
|
||||
// Show the final completed TODOs before deletion
|
||||
let mut result = String::from("✅ All TODOs completed! Removed todo.g3.md\n\nFinal status:\n");
|
||||
for line in content_str.lines() {
|
||||
self.ui_writer.print_tool_output_line(line);
|
||||
result.push_str(line);
|
||||
result.push('\n');
|
||||
}
|
||||
return Ok(result);
|
||||
}
|
||||
Err(e) => return Ok(format!("❌ Failed to remove todo.g3.md: {}", e)),
|
||||
}
|
||||
@@ -4801,11 +5261,7 @@ impl<W: UiWriter> Agent<W> {
|
||||
// Also update in-memory content to stay in sync
|
||||
let mut todo = self.todo_content.write().await;
|
||||
*todo = content_str.to_string();
|
||||
// Print the TODO content to the console
|
||||
self.ui_writer.print_context_status(&format!(
|
||||
"✅ TODO list updated ({} chars) and saved to todo.g3.md:",
|
||||
char_count
|
||||
));
|
||||
// Print the TODO content to the console (inside the tool frame)
|
||||
for line in content_str.lines() {
|
||||
self.ui_writer.print_tool_output_line(line);
|
||||
}
|
||||
|
||||
crates/g3-core/tests/test_preflight_max_tokens.rs (new file, 188 additions)
@@ -0,0 +1,188 @@
|
||||
//! Tests for the pre-flight max_tokens validation with thinking.budget_tokens constraint
|
||||
//!
|
||||
//! These tests verify that when using Anthropic with extended thinking enabled,
|
||||
//! the max_tokens calculation properly accounts for the budget_tokens constraint.
|
||||
|
||||
use g3_config::Config;
|
||||
use g3_core::ContextWindow;
|
||||
|
||||
/// Helper function to create a minimal config for testing
|
||||
fn create_test_config_with_thinking(thinking_budget: Option<u32>) -> Config {
|
||||
let mut config = Config::default();
|
||||
|
||||
// Set up Anthropic provider with optional thinking budget
|
||||
config.providers.anthropic = Some(g3_config::AnthropicConfig {
|
||||
api_key: "test-key".to_string(),
|
||||
model: "claude-sonnet-4-5".to_string(),
|
||||
max_tokens: Some(16000),
|
||||
temperature: Some(0.1),
|
||||
cache_config: None,
|
||||
enable_1m_context: None,
|
||||
thinking_budget_tokens: thinking_budget,
|
||||
});
|
||||
|
||||
config.providers.default_provider = "anthropic".to_string();
|
||||
config
|
||||
}
|
||||
|
||||
/// Test that when thinking is disabled, max_tokens passes through unchanged
|
||||
#[test]
|
||||
fn test_no_thinking_budget_passes_through() {
|
||||
let config = create_test_config_with_thinking(None);
|
||||
|
||||
// Without thinking budget, any max_tokens should be fine
|
||||
let proposed_max = 5000;
|
||||
|
||||
// The constraint check would return (proposed_max, false)
|
||||
// since there's no thinking_budget_tokens configured
|
||||
assert!(config.providers.anthropic.as_ref().unwrap().thinking_budget_tokens.is_none());
|
||||
}
|
||||
|
||||
/// Test that when max_tokens > budget_tokens + buffer, no reduction is needed
|
||||
#[test]
|
||||
fn test_sufficient_max_tokens_no_reduction_needed() {
|
||||
let config = create_test_config_with_thinking(Some(10000));
|
||||
let budget_tokens = config.providers.anthropic.as_ref().unwrap().thinking_budget_tokens.unwrap();
|
||||
|
||||
// minimum_required = budget_tokens + 1024 = 11024
|
||||
let minimum_required = budget_tokens + 1024;
|
||||
|
||||
// If proposed_max >= minimum_required, no reduction is needed
|
||||
let proposed_max = 15000;
|
||||
assert!(proposed_max >= minimum_required);
|
||||
}
|
||||
|
||||
/// Test that when max_tokens < budget_tokens + buffer, reduction is needed
|
||||
#[test]
|
||||
fn test_insufficient_max_tokens_needs_reduction() {
|
||||
let config = create_test_config_with_thinking(Some(10000));
|
||||
let budget_tokens = config.providers.anthropic.as_ref().unwrap().thinking_budget_tokens.unwrap();
|
||||
|
||||
// minimum_required = budget_tokens + 1024 = 11024
|
||||
let minimum_required = budget_tokens + 1024;
|
||||
|
||||
// If proposed_max < minimum_required, reduction IS needed
|
||||
let proposed_max = 5000;
|
||||
assert!(proposed_max < minimum_required);
|
||||
}
|
||||
|
||||
/// Test the minimum required calculation
|
||||
#[test]
|
||||
fn test_minimum_required_calculation() {
|
||||
// For a budget of 10000, we need at least 11024 tokens
|
||||
let budget_tokens = 10000u32;
|
||||
let output_buffer = 1024u32;
|
||||
let minimum_required = budget_tokens + output_buffer;
|
||||
|
||||
assert_eq!(minimum_required, 11024);
|
||||
|
||||
// For a larger budget
|
||||
let budget_tokens = 32000u32;
|
||||
let minimum_required = budget_tokens + output_buffer;
|
||||
assert_eq!(minimum_required, 33024);
|
||||
}
|
||||
|
||||

/// Test context window usage calculation for summary max_tokens
#[test]
fn test_context_window_available_tokens() {
    let mut context = ContextWindow::new(200000); // 200k context window

    // Simulate heavy usage
    context.used_tokens = 180000; // 90% used

    let model_limit = context.total_tokens;
    let current_usage = context.used_tokens;

    // 2.5% buffer calculation
    let buffer = (model_limit / 40).clamp(1000, 10000);
    assert_eq!(buffer, 5000); // 200000/40 = 5000

    let available = model_limit
        .saturating_sub(current_usage)
        .saturating_sub(buffer);

    // 200000 - 180000 - 5000 = 15000
    assert_eq!(available, 15000);

    // Capped at 10000 for summary
    let summary_max = available.min(10_000);
    assert_eq!(summary_max, 10000);
}

/// Test that when context is nearly full, available tokens may be below thinking budget
#[test]
fn test_context_nearly_full_triggers_reduction() {
    let mut context = ContextWindow::new(200000);

    // Very heavy usage - 98% used
    context.used_tokens = 196000;

    let model_limit = context.total_tokens;
    let current_usage = context.used_tokens;
    let buffer = (model_limit / 40).clamp(1000, 10000); // 5000

    let available = model_limit
        .saturating_sub(current_usage)
        .saturating_sub(buffer);

    // 200000 - 196000 - 5000 = -1000 -> saturates to 0
    assert_eq!(available, 0);

    // With thinking_budget of 10000, this would definitely need reduction
    let thinking_budget = 10000u32;
    let minimum_required = thinking_budget + 1024;
    assert!(available < minimum_required);
}
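For reference, the buffer/available/cap arithmetic that both context-window tests walk through can be folded into one function. This is only a restatement under assumed names; `summary_max_tokens` does not exist in the codebase.

```rust
// Sketch only: consolidates the arithmetic from the two tests above.
fn summary_max_tokens(model_limit: u32, used_tokens: u32) -> u32 {
    // 2.5% of the window, clamped to a sane range, is reserved as a safety buffer.
    let buffer = (model_limit / 40).clamp(1000, 10000);
    // Whatever is left after current usage and the buffer, never going negative...
    let available = model_limit
        .saturating_sub(used_tokens)
        .saturating_sub(buffer);
    // ...and never more than the 10000-token cap used for summaries.
    available.min(10_000)
}

fn main() {
    assert_eq!(summary_max_tokens(200_000, 180_000), 10_000); // 90% used
    assert_eq!(summary_max_tokens(200_000, 196_000), 0);      // 98% used: saturates to 0
}
```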

/// Test the hard-coded fallback value
#[test]
fn test_hardcoded_fallback_value() {
    // When all else fails, we use 5000 as the hard-coded max_tokens
    let hardcoded_fallback = 5000u32;

    // This should be a reasonable value that Anthropic will accept
    // even with thinking enabled (though output will be limited)
    assert!(hardcoded_fallback > 0);

    // Note: With a 10000 thinking budget, 5000 is still below the
    // minimum required (11024), but we send it anyway as a "last resort"
    // hoping the API might still work for basic operations
}

/// Test provider-specific caps
#[test]
fn test_provider_specific_caps() {
    // Anthropic/Databricks: cap at 10000
    let anthropic_cap = 10000u32;
    let proposed = 15000u32;
    assert_eq!(proposed.min(anthropic_cap), 10000);

    // Embedded: cap at 3000
    let embedded_cap = 3000u32;
    let proposed = 5000u32;
    assert_eq!(proposed.min(embedded_cap), 3000);

    // Default: cap at 5000
    let default_cap = 5000u32;
    let proposed = 8000u32;
    assert_eq!(proposed.min(default_cap), 5000);
}
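The per-provider caps exercised above amount to a lookup followed by a `min`. A possible shape is sketched below; `cap_for_provider` and the string keys are assumptions made for illustration, not the project's actual dispatch code.

```rust
// Hypothetical helper showing how the caps from the test could be applied.
fn cap_for_provider(provider: &str, proposed: u32) -> u32 {
    let cap = match provider {
        "anthropic" | "databricks" => 10_000, // Anthropic/Databricks cap
        "embedded" => 3_000,                  // embedded models get a tighter cap
        _ => 5_000,                           // default cap
    };
    proposed.min(cap)
}

fn main() {
    assert_eq!(cap_for_provider("anthropic", 15_000), 10_000);
    assert_eq!(cap_for_provider("embedded", 5_000), 3_000);
    assert_eq!(cap_for_provider("other", 8_000), 5_000);
}
```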

/// Test that the error message mentions the thinking budget constraint
#[test]
fn test_error_message_content() {
    // Verify the warning message format contains useful information
    let proposed_max_tokens = 5000u32;
    let budget_tokens = 10000u32;
    let minimum_required = budget_tokens + 1024;

    let warning = format!(
        "max_tokens ({}) is below required minimum ({}) for thinking.budget_tokens ({}). Context reduction needed.",
        proposed_max_tokens, minimum_required, budget_tokens
    );

    assert!(warning.contains("5000"));
    assert!(warning.contains("11024"));
    assert!(warning.contains("10000"));
    assert!(warning.contains("Context reduction needed"));
}

@@ -85,6 +85,7 @@ pub async fn get_initial_discovery_messages(
        temperature: Some(provider.temperature()),
        stream: false,
        tools: None,
        disable_thinking: false,
    };

    status("🤖 Calling LLM for discovery commands...");

@@ -39,6 +39,7 @@
//! temperature: Some(0.7),
//! stream: false,
//! tools: None,
//! disable_thinking: false,
//! };
//!
//! // Get a completion

@@ -75,6 +76,7 @@
//! temperature: Some(0.7),
//! stream: true,
//! tools: None,
//! disable_thinking: false,
//! };
//!
//! let mut stream = provider.stream(request).await?;

@@ -272,6 +274,7 @@ impl AnthropicProvider {
        streaming: bool,
        max_tokens: u32,
        temperature: f32,
        disable_thinking: bool,
    ) -> Result<AnthropicRequest> {
        let (system, anthropic_messages) = self.convert_messages(messages)?;

@@ -284,10 +287,32 @@ impl AnthropicProvider {
        // Convert tools if provided
        let anthropic_tools = tools.map(|t| self.convert_tools(t));

        // Add thinking configuration if budget_tokens is set
        let thinking = self.thinking_budget_tokens.map(|budget| {
            ThinkingConfig::enabled(budget)
        });
        // Add thinking configuration if budget_tokens is set AND max_tokens is sufficient AND not explicitly disabled
        // Anthropic requires: max_tokens > thinking.budget_tokens
        // We add 1024 as minimum buffer for actual response content
        tracing::debug!("create_request_body called: max_tokens={}, disable_thinking={}, thinking_budget_tokens={:?}", max_tokens, disable_thinking, self.thinking_budget_tokens);

        let thinking = if disable_thinking {
            tracing::info!(
                "Thinking mode explicitly disabled for this request (max_tokens={})",
                max_tokens
            );
            None
        } else {
            self.thinking_budget_tokens.and_then(|budget| {
                let min_required = budget + 1024;
                if max_tokens > min_required {
                    Some(ThinkingConfig::enabled(budget))
                } else {
                    tracing::warn!(
                        "Disabling thinking mode: max_tokens ({}) is not greater than thinking.budget_tokens ({}) + 1024 buffer. \
                         Required: max_tokens > {}",
                        max_tokens, budget, min_required
                    );
                    None
                }
            })
        };

        let request = AnthropicRequest {
            model: self.model.clone(),
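Read on its own, the new selection logic in the hunk above reduces to a small decision: an explicit opt-out always wins, and otherwise thinking is only attached when max_tokens clears budget + 1024. The restatement below is a sketch; the free function and its bool return stand in for the provider's actual `ThinkingConfig` handling.

```rust
// Standalone restatement of the decision in the hunk above, for illustration only.
// In the real provider this lives inside create_request_body and produces an
// Option<ThinkingConfig>; here a bool stands in for "thinking block attached or not".
fn thinking_enabled(disable_thinking: bool, budget: Option<u32>, max_tokens: u32) -> bool {
    if disable_thinking {
        return false; // explicit opt-out always wins
    }
    match budget {
        None => false,                    // no budget configured
        Some(b) => max_tokens > b + 1024, // must leave room for actual output
    }
}

fn main() {
    assert!(thinking_enabled(false, Some(10_000), 20_000)); // enough headroom
    assert!(!thinking_enabled(false, Some(10_000), 5_000)); // below 11_024: silently dropped
    assert!(!thinking_enabled(true, Some(10_000), 20_000)); // disable_thinking overrides
    assert!(!thinking_enabled(false, None, 20_000));        // no budget, no thinking
}
```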

@@ -637,6 +662,7 @@ impl LLMProvider for AnthropicProvider {
            false,
            max_tokens,
            temperature,
            request.disable_thinking,
        )?;

        debug!(

@@ -710,6 +736,7 @@ impl LLMProvider for AnthropicProvider {
            true,
            max_tokens,
            temperature,
            request.disable_thinking,
        )?;

        debug!(

@@ -847,6 +874,12 @@ enum AnthropicContent {
        #[serde(skip_serializing_if = "Option::is_none")]
        cache_control: Option<crate::CacheControl>,
    },
    #[serde(rename = "thinking")]
    Thinking {
        thinking: String,
        #[serde(default)]
        signature: Option<String>,
    },
    #[serde(rename = "tool_use")]
    ToolUse {
        id: String,

@@ -947,7 +980,7 @@ mod tests {
        let messages = vec![Message::new(MessageRole::User, "Test message".to_string())];

        let request_body = provider
            .create_request_body(&messages, None, false, 1000, 0.5)
            .create_request_body(&messages, None, false, 1000, 0.5, false)
            .unwrap();

        assert_eq!(request_body.model, "claude-3-haiku-20240307");

@@ -1053,16 +1086,17 @@ mod tests {

        let messages = vec![Message::new(MessageRole::User, "Test message".to_string())];
        let request_without = provider_without
            .create_request_body(&messages, None, false, 1000, 0.5)
            .create_request_body(&messages, None, false, 1000, 0.5, false)
            .unwrap();
        let json_without = serde_json::to_string(&request_without).unwrap();
        assert!(!json_without.contains("thinking"), "JSON should not contain 'thinking' field when not configured");

        // Test WITH thinking parameter
        // Test WITH thinking parameter - max_tokens must be > budget_tokens + 1024
        // Using budget=10000 requires max_tokens > 11024
        let provider_with = AnthropicProvider::new(
            "test-key".to_string(),
            Some("claude-sonnet-4-5".to_string()),
            Some(1000),
            Some(20000), // Sufficient for thinking budget
            Some(0.5),
            None,
            None,

@@ -1071,11 +1105,78 @@ mod tests {
        .unwrap();

        let request_with = provider_with
            .create_request_body(&messages, None, false, 1000, 0.5)
            .create_request_body(&messages, None, false, 20000, 0.5, false)
            .unwrap();
        let json_with = serde_json::to_string(&request_with).unwrap();
        assert!(json_with.contains("thinking"), "JSON should contain 'thinking' field when configured");
        assert!(json_with.contains("\"type\":\"enabled\""), "JSON should contain type: enabled");
        assert!(json_with.contains("\"budget_tokens\":10000"), "JSON should contain budget_tokens: 10000");

        // Test WITH thinking parameter but INSUFFICIENT max_tokens - thinking should be disabled
        let request_insufficient = provider_with
            .create_request_body(&messages, None, false, 5000, 0.5, false) // Less than budget + 1024
            .unwrap();
        let json_insufficient = serde_json::to_string(&request_insufficient).unwrap();
        assert!(!json_insufficient.contains("thinking"), "JSON should NOT contain 'thinking' field when max_tokens is insufficient");
    }

    #[test]
    fn test_disable_thinking_flag() {
        // Test that disable_thinking=true prevents thinking even with sufficient max_tokens
        let provider = AnthropicProvider::new(
            "test-key".to_string(),
            Some("claude-sonnet-4-5".to_string()),
            Some(20000),
            Some(0.5),
            None,
            None,
            Some(10000), // With thinking budget
        )
        .unwrap();

        let messages = vec![Message::new(MessageRole::User, "Test message".to_string())];

        // With disable_thinking=false, thinking should be enabled (max_tokens is sufficient)
        let request_with_thinking = provider
            .create_request_body(&messages, None, false, 20000, 0.5, false)
            .unwrap();
        let json_with = serde_json::to_string(&request_with_thinking).unwrap();
        assert!(json_with.contains("thinking"), "JSON should contain 'thinking' field when not disabled");

        // With disable_thinking=true, thinking should be disabled even with sufficient max_tokens
        let request_without_thinking = provider
            .create_request_body(&messages, None, false, 20000, 0.5, true)
            .unwrap();
        let json_without = serde_json::to_string(&request_without_thinking).unwrap();
        assert!(!json_without.contains("thinking"), "JSON should NOT contain 'thinking' field when explicitly disabled");
    }

    #[test]
    fn test_thinking_content_block_deserialization() {
        // Test that we can deserialize a response containing a "thinking" content block
        // This is what Anthropic returns when extended thinking is enabled
        let json_response = r#"{
            "content": [
                {"type": "thinking", "thinking": "Let me analyze this...", "signature": "abc123"},
                {"type": "text", "text": "Here is my response."}
            ],
            "model": "claude-sonnet-4-5",
            "usage": {"input_tokens": 100, "output_tokens": 50}
        }"#;

        let response: AnthropicResponse = serde_json::from_str(json_response)
            .expect("Should be able to deserialize response with thinking block");

        assert_eq!(response.content.len(), 2);
        assert_eq!(response.model, "claude-sonnet-4-5");

        // Extract only text content (thinking should be filtered out)
        let text_content: Vec<_> = response.content.iter().filter_map(|c| match c {
            AnthropicContent::Text { text, .. } => Some(text.as_str()),
            _ => None,
        }).collect();

        assert_eq!(text_content.len(), 1);
        assert_eq!(text_content[0], "Here is my response.");
    }
}

@@ -45,6 +45,7 @@
//! temperature: Some(0.7),
//! stream: false,
//! tools: None,
//! disable_thinking: false,
//! };
//!
//! // Get a completion

@@ -42,6 +42,8 @@ pub struct CompletionRequest {
    pub temperature: Option<f32>,
    pub stream: bool,
    pub tools: Option<Vec<Tool>>,
    /// Force disable thinking mode for this request (used when max_tokens is too low)
    pub disable_thinking: bool,
}

#[derive(Debug, Clone, Serialize, Deserialize)]