Compare commits: jochen-son ... jochen-fix (10 commits)

| Author | SHA1 | Date |
|---|---|---|
| | 4aa84e2144 | |
| | 2283d9ddbf | |
| | fb2cf6f898 | |
| | 696c441a47 | |
| | 48e6d594bc | |
| | 678403da35 | |
| | 0970e4f356 | |
| | 758a313de0 | |
| | 0327a6dfdf | |
| | 928f2bfa9d | |
Cargo.lock (generated): 1 change

@@ -1377,6 +1377,7 @@ dependencies = [
 "serde",
 "serde_json",
 "sha2",
 "tempfile",
 "termimad",
 "tokio",
 "tokio-util",
@@ -76,6 +76,7 @@ G3 includes robust error handling with automatic retry logic:

G3's interactive CLI includes control commands for manual context management:

- **`/compact`**: Manually trigger summarization to compact conversation history
- **`/thinnify`**: Manually trigger context thinning to replace large tool results with file references
- **`/skinnify`**: Manually trigger full context thinning (like `/thinnify` but processes the entire context window, not just the first third)
- **`/readme`**: Reload README.md and AGENTS.md from disk without restarting
- **`/stats`**: Show detailed context and performance statistics
- **`/help`**: Display all available control commands
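Both `/thinnify` and `/skinnify` offload oversized tool results to files on disk, keeping only a pointer in the conversation. The sketch below illustrates that core transformation on a single message; the `Message` struct, the function name, and the temp-path layout are simplified placeholders for illustration, not the actual G3 types:

```rust
use std::fs;

// Simplified, hypothetical stand-in for G3's conversation message type.
struct Message {
    role: String,
    content: String,
}

// Replace a large "Tool result:" message with a reference to a file on disk.
// Returns the number of characters saved, mirroring the bookkeeping in the diff.
fn skinnify_message(msg: &mut Message, index: usize, tmp_dir: &str) -> std::io::Result<usize> {
    if msg.role == "user" && msg.content.starts_with("Tool result:") && msg.content.len() > 500 {
        let file_path = format!("{}/skinny_tool_result_{}.txt", tmp_dir, index);
        fs::write(&file_path, &msg.content)?; // persist the full content for later inspection
        let saved = msg.content.len();
        msg.content = format!("Tool result saved to {}", file_path);
        return Ok(saved.saturating_sub(msg.content.len()));
    }
    Ok(0)
}
```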
@@ -27,3 +27,6 @@ chrono = { version = "0.4", features = ["serde"] }
crossterm = "0.29.0"
ratatui = "0.29"
termimad = "0.34.0"

[dev-dependencies]
tempfile = "3.8"
@@ -163,15 +163,66 @@ fn extract_coach_feedback_from_logs(
|
||||
if let Some(context_window) = log_json.get("context_window") {
|
||||
if let Some(conversation_history) = context_window.get("conversation_history") {
|
||||
if let Some(messages) = conversation_history.as_array() {
|
||||
// Simply get the last message content - this is the coach's final feedback
|
||||
if let Some(last_message) = messages.last() {
|
||||
if let Some(content) = last_message.get("content") {
|
||||
if let Some(content_str) = content.as_str() {
|
||||
output.print(&format!(
|
||||
"✅ Extracted coach feedback from session: {}",
|
||||
session_id
|
||||
));
|
||||
return Ok(content_str.to_string());
|
||||
// Go backwards through the conversation to find the last tool result
|
||||
// that corresponds to a final_output tool call
|
||||
for i in (0..messages.len()).rev() {
|
||||
let msg = &messages[i];
|
||||
|
||||
// Check if this is a User message with "Tool result:"
|
||||
if let Some(role) = msg.get("role") {
|
||||
if let Some(role_str) = role.as_str() {
|
||||
if role_str == "User" || role_str == "user" {
|
||||
if let Some(content) = msg.get("content") {
|
||||
if let Some(content_str) = content.as_str() {
|
||||
if content_str.starts_with("Tool result:") {
|
||||
// Found a tool result, now check the preceding message
|
||||
// to verify it was a final_output tool call
|
||||
if i > 0 {
|
||||
let prev_msg = &messages[i - 1];
|
||||
if let Some(prev_role) = prev_msg.get("role") {
|
||||
if let Some(prev_role_str) = prev_role.as_str() {
|
||||
if prev_role_str == "assistant" || prev_role_str == "Assistant" {
|
||||
if let Some(prev_content) = prev_msg.get("content") {
|
||||
if let Some(prev_content_str) = prev_content.as_str() {
|
||||
// Check if the previous assistant message contains a final_output tool call
|
||||
if prev_content_str.contains("\"tool\": \"final_output\"") {
|
||||
// This is a final_output tool result
|
||||
let feedback = if content_str.starts_with("Tool result: ") {
|
||||
content_str.strip_prefix("Tool result: ")
|
||||
.unwrap_or(content_str)
|
||||
.to_string()
|
||||
} else {
|
||||
content_str.to_string()
|
||||
};
|
||||
|
||||
output.print(&format!(
|
||||
"Coach feedback extracted: {} characters (from {} total)",
|
||||
feedback.len(),
|
||||
content_str.len()
|
||||
));
|
||||
output.print(&format!("Coach feedback:\n{}", feedback));
|
||||
|
||||
output.print(&format!(
|
||||
"✅ Extracted coach feedback from session: {} (verified final_output tool)",
|
||||
session_id
|
||||
));
|
||||
return Ok(feedback);
|
||||
} else {
|
||||
output.print(&format!(
|
||||
"⚠️ Skipping tool result at index {} - not a final_output tool call",
|
||||
i
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
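The nested `if let` chain in the hunk above runs many levels deep; an equivalent, flatter formulation using iterator combinators is sketched below. This is our own refactoring sketch, not code from the diff, and the function name is hypothetical:

```rust
use serde_json::Value;

// Scan the conversation from the end for a "Tool result:" user message whose
// preceding assistant message contains a final_output tool call.
fn find_final_output_feedback(messages: &[Value]) -> Option<String> {
    (1..messages.len()).rev().find_map(|i| {
        let role = messages[i].get("role")?.as_str()?;
        let content = messages[i].get("content")?.as_str()?;
        let prev_role = messages[i - 1].get("role")?.as_str()?;
        let prev_content = messages[i - 1].get("content")?.as_str()?;

        let is_tool_result =
            role.eq_ignore_ascii_case("user") && content.starts_with("Tool result:");
        let follows_final_output = prev_role.eq_ignore_ascii_case("assistant")
            && prev_content.contains("\"tool\": \"final_output\"");

        if is_tool_result && follows_final_output {
            Some(content.strip_prefix("Tool result: ").unwrap_or(content).to_string())
        } else {
            None
        }
    })
}
```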
@@ -187,7 +238,7 @@ fn extract_coach_feedback_from_logs(
"CRITICAL: Could not extract coach feedback from session: {}\n\
Log file path: {:?}\n\
Log file exists: {}\n\
This indicates the coach did not call any tool or the log is corrupted.\n\
This indicates the coach did not call final_output tool or the log is corrupted.\n\
Coach result response length: {} chars",
session_id,
log_file_path,
@@ -1283,6 +1334,7 @@ async fn run_interactive<W: UiWriter>(
output.print("📖 Control Commands:");
output.print(" /compact - Trigger auto-summarization (compacts conversation history)");
output.print(" /thinnify - Trigger context thinning (replaces large tool results with file references)");
output.print(" /skinnify - Trigger full context thinning (like /thinnify but for entire context, not just first third)");
output.print(
    " /readme - Reload README.md and AGENTS.md from disk",
);
@@ -1315,6 +1367,11 @@ async fn run_interactive<W: UiWriter>(
println!("{}", summary);
continue;
}
"/skinnify" => {
    let summary = agent.force_thin_all();
    println!("{}", summary);
    continue;
}
"/readme" => {
    output.print("📚 Reloading README.md and AGENTS.md...");
    match agent.reload_readme() {
@@ -1524,6 +1581,12 @@ async fn run_interactive_machine(
println!("{}", summary);
continue;
}
"/skinnify" => {
    println!("COMMAND: skinnify");
    let summary = agent.force_thin_all();
    println!("{}", summary);
    continue;
}
"/readme" => {
    println!("COMMAND: readme");
    match agent.reload_readme() {
@@ -1546,7 +1609,7 @@ async fn run_interactive_machine(
}
"/help" => {
    println!("COMMAND: help");
    println!("AVAILABLE_COMMANDS: /compact /thinnify /readme /stats /help");
    println!("AVAILABLE_COMMANDS: /compact /thinnify /skinnify /readme /stats /help");
    continue;
}
_ => {
crates/g3-cli/tests/coach_feedback_extraction_test.rs (new file, 336 additions)
@@ -0,0 +1,336 @@
|
||||
use serde_json::json;
|
||||
use std::fs;
|
||||
use tempfile::TempDir;
|
||||
|
||||
#[test]
|
||||
fn test_extract_coach_feedback_with_timing_message() {
|
||||
// Create a temporary directory for logs
|
||||
let temp_dir = TempDir::new().unwrap();
|
||||
let logs_dir = temp_dir.path().join("logs");
|
||||
fs::create_dir(&logs_dir).unwrap();
|
||||
|
||||
// Create a mock session log with the problematic conversation history
|
||||
// where timing message appears after the tool result
|
||||
let session_id = "test_session_123";
|
||||
let log_file_path = logs_dir.join(format!("g3_session_{}.json", session_id));
|
||||
|
||||
let log_content = json!({
|
||||
"session_id": session_id,
|
||||
"context_window": {
|
||||
"conversation_history": [
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "{\"tool\": \"final_output\", \"args\": {\"summary\":\"IMPLEMENTATION_APPROVED\"}}"
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Tool result: IMPLEMENTATION_APPROVED"
|
||||
},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "🕝 27.7s | 💭 7.5s"
|
||||
}
|
||||
]
|
||||
}
|
||||
});
|
||||
|
||||
fs::write(&log_file_path, serde_json::to_string_pretty(&log_content).unwrap()).unwrap();
|
||||
|
||||
// Now test the extraction logic
|
||||
let log_content_str = fs::read_to_string(&log_file_path).unwrap();
|
||||
let log_json: serde_json::Value = serde_json::from_str(&log_content_str).unwrap();
|
||||
|
||||
if let Some(context_window) = log_json.get("context_window") {
|
||||
if let Some(conversation_history) = context_window.get("conversation_history") {
|
||||
if let Some(messages) = conversation_history.as_array() {
|
||||
// This is the key logic we're testing - find the last USER message with "Tool result:"
|
||||
let last_tool_result = messages.iter().rev().find(|msg| {
|
||||
if let Some(role) = msg.get("role") {
|
||||
if let Some(role_str) = role.as_str() {
|
||||
if role_str == "User" || role_str == "user" {
|
||||
if let Some(content) = msg.get("content") {
|
||||
if let Some(content_str) = content.as_str() {
|
||||
return content_str.starts_with("Tool result:");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
false
|
||||
});
|
||||
|
||||
// Verify we found the correct message
|
||||
assert!(last_tool_result.is_some(), "Should find the tool result message");
|
||||
|
||||
if let Some(last_message) = last_tool_result {
|
||||
if let Some(content) = last_message.get("content") {
|
||||
if let Some(content_str) = content.as_str() {
|
||||
let feedback = if content_str.starts_with("Tool result: ") {
|
||||
content_str.strip_prefix("Tool result: ").unwrap_or(content_str)
|
||||
} else {
|
||||
content_str
|
||||
};
|
||||
|
||||
// Verify we extracted the correct feedback
|
||||
assert_eq!(feedback, "IMPLEMENTATION_APPROVED", "Should extract the actual feedback, not timing");
|
||||
|
||||
// Verify the feedback is NOT the timing message
|
||||
assert!(!feedback.contains("🕝"), "Feedback should not be the timing message");
|
||||
|
||||
println!("✅ Successfully extracted coach feedback: {}", feedback);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
panic!("Failed to extract coach feedback");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_extract_only_final_output_tool_results() {
|
||||
// Test that we only extract tool results from final_output, not from other tools
|
||||
let temp_dir = TempDir::new().unwrap();
|
||||
let logs_dir = temp_dir.path().join("logs");
|
||||
fs::create_dir(&logs_dir).unwrap();
|
||||
|
||||
let session_id = "test_session_final_output_only";
|
||||
let log_file_path = logs_dir.join(format!("g3_session_{}.json", session_id));
|
||||
|
||||
let log_content = json!({
|
||||
"session_id": session_id,
|
||||
"context_window": {
|
||||
"conversation_history": [
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "{\"tool\": \"shell\", \"args\": {\"command\":\"ls\"}}"
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Tool result: file1.txt\nfile2.txt"
|
||||
},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "{\"tool\": \"read_file\", \"args\": {\"file_path\":\"test.txt\"}}"
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Tool result: This is test content"
|
||||
},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "{\"tool\": \"final_output\", \"args\": {\"summary\":\"APPROVED_RESULT\"}}"
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Tool result: APPROVED_RESULT"
|
||||
},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "🕝 20.5s | 💭 5.2s"
|
||||
}
|
||||
]
|
||||
}
|
||||
});
|
||||
|
||||
fs::write(&log_file_path, serde_json::to_string_pretty(&log_content).unwrap()).unwrap();
|
||||
|
||||
// Test the new extraction logic that verifies the tool is final_output
|
||||
let log_content_str = fs::read_to_string(&log_file_path).unwrap();
|
||||
let log_json: serde_json::Value = serde_json::from_str(&log_content_str).unwrap();
|
||||
|
||||
if let Some(context_window) = log_json.get("context_window") {
|
||||
if let Some(conversation_history) = context_window.get("conversation_history") {
|
||||
if let Some(messages) = conversation_history.as_array() {
|
||||
// Go backwards through messages to find final_output tool result
|
||||
for i in (0..messages.len()).rev() {
|
||||
let msg = &messages[i];
|
||||
|
||||
if let Some(role) = msg.get("role") {
|
||||
if let Some(role_str) = role.as_str() {
|
||||
if role_str == "User" || role_str == "user" {
|
||||
if let Some(content) = msg.get("content") {
|
||||
if let Some(content_str) = content.as_str() {
|
||||
if content_str.starts_with("Tool result:") {
|
||||
// Check if preceding message was final_output
|
||||
if i > 0 {
|
||||
let prev_msg = &messages[i - 1];
|
||||
if let Some(prev_content) = prev_msg.get("content") {
|
||||
if let Some(prev_content_str) = prev_content.as_str() {
|
||||
if prev_content_str.contains("\"tool\": \"final_output\"") {
|
||||
let feedback = content_str.strip_prefix("Tool result: ").unwrap_or(content_str);
|
||||
assert_eq!(feedback, "APPROVED_RESULT", "Should extract only final_output result");
|
||||
println!("✅ Correctly extracted only final_output tool result: {}", feedback);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
panic!("Failed to extract final_output tool result");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_extract_coach_feedback_without_timing_message() {
|
||||
// Create a temporary directory for logs
|
||||
let temp_dir = TempDir::new().unwrap();
|
||||
let logs_dir = temp_dir.path().join("logs");
|
||||
fs::create_dir(&logs_dir).unwrap();
|
||||
|
||||
// Test the case where there's no timing message (backward compatibility)
|
||||
let session_id = "test_session_456";
|
||||
let log_file_path = logs_dir.join(format!("g3_session_{}.json", session_id));
|
||||
|
||||
let log_content = json!({
|
||||
"session_id": session_id,
|
||||
"context_window": {
|
||||
"conversation_history": [
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "{\"tool\": \"final_output\", \"args\": {\"summary\":\"TEST_FEEDBACK\"}}"
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Tool result: TEST_FEEDBACK"
|
||||
}
|
||||
]
|
||||
}
|
||||
});
|
||||
|
||||
fs::write(&log_file_path, serde_json::to_string_pretty(&log_content).unwrap()).unwrap();
|
||||
|
||||
// Test extraction
|
||||
let log_content_str = fs::read_to_string(&log_file_path).unwrap();
|
||||
let log_json: serde_json::Value = serde_json::from_str(&log_content_str).unwrap();
|
||||
|
||||
if let Some(context_window) = log_json.get("context_window") {
|
||||
if let Some(conversation_history) = context_window.get("conversation_history") {
|
||||
if let Some(messages) = conversation_history.as_array() {
|
||||
let last_tool_result = messages.iter().rev().find(|msg| {
|
||||
if let Some(role) = msg.get("role") {
|
||||
if let Some(role_str) = role.as_str() {
|
||||
if role_str == "User" || role_str == "user" {
|
||||
if let Some(content) = msg.get("content") {
|
||||
if let Some(content_str) = content.as_str() {
|
||||
return content_str.starts_with("Tool result:");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
false
|
||||
});
|
||||
|
||||
assert!(last_tool_result.is_some());
|
||||
|
||||
if let Some(last_message) = last_tool_result {
|
||||
if let Some(content) = last_message.get("content") {
|
||||
if let Some(content_str) = content.as_str() {
|
||||
let feedback = content_str.strip_prefix("Tool result: ").unwrap_or(content_str);
|
||||
assert_eq!(feedback, "TEST_FEEDBACK");
|
||||
println!("✅ Successfully extracted coach feedback without timing: {}", feedback);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
panic!("Failed to extract coach feedback");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_extract_coach_feedback_with_multiple_tool_results() {
|
||||
// Test that we get the LAST tool result when there are multiple
|
||||
let temp_dir = TempDir::new().unwrap();
|
||||
let logs_dir = temp_dir.path().join("logs");
|
||||
fs::create_dir(&logs_dir).unwrap();
|
||||
|
||||
let session_id = "test_session_789";
|
||||
let log_file_path = logs_dir.join(format!("g3_session_{}.json", session_id));
|
||||
|
||||
let log_content = json!({
|
||||
"session_id": session_id,
|
||||
"context_window": {
|
||||
"conversation_history": [
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "{\"tool\": \"shell\", \"args\": {\"command\":\"ls\"}}"
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Tool result: file1.txt\nfile2.txt"
|
||||
},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "{\"tool\": \"final_output\", \"args\": {\"summary\":\"FINAL_RESULT\"}}"
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Tool result: FINAL_RESULT"
|
||||
},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "🕝 15.2s | 💭 3.1s"
|
||||
}
|
||||
]
|
||||
}
|
||||
});
|
||||
|
||||
fs::write(&log_file_path, serde_json::to_string_pretty(&log_content).unwrap()).unwrap();
|
||||
|
||||
// Test extraction
|
||||
let log_content_str = fs::read_to_string(&log_file_path).unwrap();
|
||||
let log_json: serde_json::Value = serde_json::from_str(&log_content_str).unwrap();
|
||||
|
||||
if let Some(context_window) = log_json.get("context_window") {
|
||||
if let Some(conversation_history) = context_window.get("conversation_history") {
|
||||
if let Some(messages) = conversation_history.as_array() {
|
||||
let last_tool_result = messages.iter().rev().find(|msg| {
|
||||
if let Some(role) = msg.get("role") {
|
||||
if let Some(role_str) = role.as_str() {
|
||||
if role_str == "User" || role_str == "user" {
|
||||
if let Some(content) = msg.get("content") {
|
||||
if let Some(content_str) = content.as_str() {
|
||||
return content_str.starts_with("Tool result:");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
false
|
||||
});
|
||||
|
||||
assert!(last_tool_result.is_some());
|
||||
|
||||
if let Some(last_message) = last_tool_result {
|
||||
if let Some(content) = last_message.get("content") {
|
||||
if let Some(content_str) = content.as_str() {
|
||||
let feedback = content_str.strip_prefix("Tool result: ").unwrap_or(content_str);
|
||||
// Should get the LAST tool result (final_output), not the first one (shell)
|
||||
assert_eq!(feedback, "FINAL_RESULT", "Should extract the last tool result");
|
||||
assert!(!feedback.contains("file1.txt"), "Should not extract earlier tool results");
|
||||
println!("✅ Successfully extracted last tool result: {}", feedback);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
panic!("Failed to extract coach feedback");
|
||||
}
|
||||
@@ -737,6 +737,233 @@ Format this as a detailed but concise summary that can be used to resume the con
|
||||
}
|
||||
}
|
||||
|
||||
/// Perform context thinning on the ENTIRE conversation history (not just first third)
|
||||
/// This is the "skinnify" variant that processes all messages
|
||||
/// Returns a summary message about what was thinned
|
||||
pub fn thin_context_all(&mut self) -> (String, usize) {
|
||||
let current_percentage = self.percentage_used() as u32;
|
||||
|
||||
// Calculate the total messages - process ALL of them
|
||||
let total_messages = self.conversation_history.len();
|
||||
|
||||
let mut leaned_count = 0;
|
||||
let mut tool_call_leaned_count = 0;
|
||||
let mut chars_saved = 0;
|
||||
|
||||
// Create ~/tmp directory if it doesn't exist
|
||||
let tmp_dir = shellexpand::tilde("~/tmp").to_string();
|
||||
if let Err(e) = std::fs::create_dir_all(&tmp_dir) {
|
||||
warn!("Failed to create ~/tmp directory: {}", e);
|
||||
return (
|
||||
"⚠️ Context skinnifying failed: could not create ~/tmp directory".to_string(),
|
||||
0,
|
||||
);
|
||||
}
|
||||
|
||||
// Scan ALL messages (not just first third)
|
||||
for i in 0..total_messages {
|
||||
// Check if the previous message was a TODO tool call (before getting mutable reference)
|
||||
let is_todo_result = if i > 0 {
|
||||
if let Some(prev_message) = self.conversation_history.get(i - 1) {
|
||||
if matches!(prev_message.role, MessageRole::Assistant) {
|
||||
prev_message.content.contains(r#""tool":"todo_read""#)
|
||||
|| prev_message.content.contains(r#""tool":"todo_write""#)
|
||||
|| prev_message.content.contains(r#""tool": "todo_read""#)
|
||||
|| prev_message.content.contains(r#""tool": "todo_write""#)
|
||||
} else {
|
||||
false
|
||||
}
|
||||
} else {
|
||||
false
|
||||
}
|
||||
} else {
|
||||
false
|
||||
};
|
||||
|
||||
if let Some(message) = self.conversation_history.get_mut(i) {
|
||||
// Process User messages that look like tool results
|
||||
if matches!(message.role, MessageRole::User)
|
||||
&& message.content.starts_with("Tool result:")
|
||||
{
|
||||
let content_len = message.content.len();
|
||||
|
||||
// Only thin if the content is greater than 500 chars and not a TODO tool result
|
||||
if !is_todo_result && content_len > 500 {
|
||||
// Generate a unique filename based on timestamp and index
|
||||
let timestamp = std::time::SystemTime::now()
|
||||
.duration_since(std::time::UNIX_EPOCH)
|
||||
.unwrap_or_default()
|
||||
.as_secs();
|
||||
let filename = format!("skinny_tool_result_{}_{}.txt", timestamp, i);
|
||||
let file_path = format!("{}/{}", tmp_dir, filename);
|
||||
|
||||
// Write the content to file
|
||||
if let Err(e) = std::fs::write(&file_path, &message.content) {
|
||||
warn!("Failed to write skinnified content to {}: {}", file_path, e);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Replace the message content with a note
|
||||
let original_len = message.content.len();
|
||||
message.content = format!("Tool result saved to {}", file_path);
|
||||
|
||||
leaned_count += 1;
|
||||
chars_saved += original_len - message.content.len();
|
||||
|
||||
debug!(
|
||||
"Skinnified tool result {} ({} chars) to {}",
|
||||
i, original_len, file_path
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Process Assistant messages that contain tool calls with large arguments
|
||||
if matches!(message.role, MessageRole::Assistant) {
|
||||
// Try to parse the message content as JSON to find tool calls
|
||||
let content = &message.content;
|
||||
|
||||
// Look for JSON tool call patterns
|
||||
if let Some(tool_call_start) = content
|
||||
.find(r#"{"tool":"#)
|
||||
.or_else(|| content.find(r#"{ "tool":"#))
|
||||
.or_else(|| content.find(r#"{"tool" :"#))
|
||||
.or_else(|| content.find(r#"{ "tool" :"#))
|
||||
{
|
||||
// Try to extract and parse the JSON tool call
|
||||
let json_portion = &content[tool_call_start..];
|
||||
|
||||
// Find the end of the JSON object
|
||||
if let Some(json_end) = Self::find_json_end(json_portion) {
|
||||
let json_str = &json_portion[..=json_end];
|
||||
|
||||
// Try to parse as ToolCall
|
||||
if let Ok(mut tool_call) = serde_json::from_str::<ToolCall>(json_str) {
|
||||
let mut modified = false;
|
||||
|
||||
// Handle write_file tool calls
|
||||
if tool_call.tool == "write_file" {
|
||||
if let Some(args_obj) = tool_call.args.as_object_mut() {
|
||||
let content_info = args_obj
|
||||
.get("content")
|
||||
.and_then(|v| v.as_str())
|
||||
.map(|s| (s.to_string(), s.len()));
|
||||
|
||||
if let Some((content_str, content_len)) = content_info {
|
||||
if content_len > 500 {
|
||||
let timestamp = std::time::SystemTime::now()
|
||||
.duration_since(std::time::UNIX_EPOCH)
|
||||
.unwrap_or_default()
|
||||
.as_secs();
|
||||
let filename = format!(
|
||||
"skinny_write_file_content_{}_{}.txt",
|
||||
timestamp, i
|
||||
);
|
||||
let file_path = format!("{}/{}", tmp_dir, filename);
|
||||
|
||||
if std::fs::write(&file_path, &content_str).is_ok() {
|
||||
args_obj.insert(
|
||||
"content".to_string(),
|
||||
serde_json::Value::String(format!(
|
||||
"<content saved to {}>",
|
||||
file_path
|
||||
)),
|
||||
);
|
||||
modified = true;
|
||||
chars_saved += content_len;
|
||||
tool_call_leaned_count += 1;
|
||||
debug!("Skinnified write_file content {} ({} chars) to {}", i, content_len, file_path);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Handle str_replace tool calls
|
||||
if tool_call.tool == "str_replace" {
|
||||
if let Some(args_obj) = tool_call.args.as_object_mut() {
|
||||
let diff_info = args_obj
|
||||
.get("diff")
|
||||
.and_then(|v| v.as_str())
|
||||
.map(|s| (s.to_string(), s.len()));
|
||||
|
||||
if let Some((diff_str, diff_len)) = diff_info {
|
||||
if diff_len > 500 {
|
||||
let timestamp = std::time::SystemTime::now()
|
||||
.duration_since(std::time::UNIX_EPOCH)
|
||||
.unwrap_or_default()
|
||||
.as_secs();
|
||||
let filename = format!(
|
||||
"skinny_str_replace_diff_{}_{}.txt",
|
||||
timestamp, i
|
||||
);
|
||||
let file_path = format!("{}/{}", tmp_dir, filename);
|
||||
|
||||
if std::fs::write(&file_path, &diff_str).is_ok() {
|
||||
args_obj.insert(
|
||||
"diff".to_string(),
|
||||
serde_json::Value::String(format!(
|
||||
"<diff saved to {}>",
|
||||
file_path
|
||||
)),
|
||||
);
|
||||
modified = true;
|
||||
chars_saved += diff_len;
|
||||
tool_call_leaned_count += 1;
|
||||
debug!("Skinnified str_replace diff {} ({} chars) to {}", i, diff_len, file_path);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If we modified the tool call, reconstruct the message
|
||||
if modified {
|
||||
let prefix = &content[..tool_call_start];
|
||||
let suffix = &content[tool_call_start + json_str.len()..];
|
||||
|
||||
// Serialize the modified tool call
|
||||
if let Ok(new_json) = serde_json::to_string(&tool_call) {
|
||||
message.content =
|
||||
format!("{}{}{}", prefix, new_json, suffix);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Recalculate token usage after thinning
|
||||
self.recalculate_tokens();
|
||||
|
||||
if leaned_count > 0 {
|
||||
if tool_call_leaned_count > 0 {
|
||||
(format!("🦴 Context skinnified at {}%: {} tool results + {} tool calls across entire history, ~{} chars saved",
|
||||
current_percentage, leaned_count, tool_call_leaned_count, chars_saved), chars_saved)
|
||||
} else {
|
||||
(
|
||||
format!(
|
||||
"🦴 Context skinnified at {}%: {} tool results across entire history, ~{} chars saved",
|
||||
current_percentage, leaned_count, chars_saved
|
||||
),
|
||||
chars_saved,
|
||||
)
|
||||
}
|
||||
} else if tool_call_leaned_count > 0 {
|
||||
(
|
||||
format!(
|
||||
"🦴 Context skinnified at {}%: {} tool calls across entire history, ~{} chars saved",
|
||||
current_percentage, tool_call_leaned_count, chars_saved
|
||||
),
|
||||
chars_saved,
|
||||
)
|
||||
} else {
|
||||
(format!("ℹ Context skinnifying triggered at {}% but no large tool results or tool calls found in entire history",
|
||||
current_percentage), 0)
|
||||
}
|
||||
}
|
||||
|
||||
/// Recalculate token usage based on current conversation history
|
||||
fn recalculate_tokens(&mut self) {
|
||||
let mut total = 0;
|
||||
@@ -1181,14 +1408,237 @@ impl<W: UiWriter> Agent<W> {
|
||||
|
||||
/// Resolve the max_tokens to use for a given provider, applying fallbacks
|
||||
fn resolve_max_tokens(&self, provider_name: &str) -> u32 {
|
||||
match provider_name {
|
||||
let base = match provider_name {
|
||||
"databricks" => Self::provider_max_tokens(&self.config, "databricks")
|
||||
.or(Some(self.config.agent.fallback_default_max_tokens as u32))
|
||||
.unwrap_or(32000),
|
||||
other => Self::provider_max_tokens(&self.config, other)
|
||||
.or(Some(self.config.agent.fallback_default_max_tokens as u32))
|
||||
.unwrap_or(16000),
|
||||
};
|
||||
|
||||
// For Anthropic with thinking enabled, ensure max_tokens is sufficient
|
||||
// Anthropic requires: max_tokens > thinking.budget_tokens
|
||||
if provider_name == "anthropic" {
|
||||
if let Some(budget) = self.get_thinking_budget_tokens() {
|
||||
let minimum_for_thinking = budget + 1024;
|
||||
return base.max(minimum_for_thinking);
|
||||
}
|
||||
}
|
||||
|
||||
base
|
||||
}
|
||||
|
||||
/// Get the thinking budget tokens for Anthropic provider, if configured
|
||||
fn get_thinking_budget_tokens(&self) -> Option<u32> {
|
||||
self.config
|
||||
.providers
|
||||
.anthropic
|
||||
.as_ref()
|
||||
.and_then(|c| c.thinking_budget_tokens)
|
||||
}
|
||||
|
||||
/// Pre-flight check to validate and adjust max_tokens for the thinking.budget_tokens constraint.
|
||||
/// Returns the adjusted max_tokens that satisfies: max_tokens > thinking.budget_tokens
|
||||
/// Also returns whether we need to apply fallback actions (thinnify/skinnify).
|
||||
///
|
||||
/// Returns: (adjusted_max_tokens, needs_context_reduction)
|
||||
fn preflight_validate_max_tokens(
|
||||
&self,
|
||||
provider_name: &str,
|
||||
proposed_max_tokens: u32,
|
||||
) -> (u32, bool) {
|
||||
// Only applies to Anthropic provider with thinking enabled
|
||||
if provider_name != "anthropic" {
|
||||
return (proposed_max_tokens, false);
|
||||
}
|
||||
|
||||
let budget_tokens = match self.get_thinking_budget_tokens() {
|
||||
Some(budget) => budget,
|
||||
None => return (proposed_max_tokens, false), // No thinking enabled
|
||||
};
|
||||
|
||||
// Anthropic requires: max_tokens > budget_tokens
|
||||
// We add a minimum output buffer of 1024 tokens for actual response content
|
||||
let minimum_required = budget_tokens + 1024;
|
||||
|
||||
if proposed_max_tokens >= minimum_required {
|
||||
// We have enough headroom
|
||||
(proposed_max_tokens, false)
|
||||
} else {
|
||||
// max_tokens is too low - need to either adjust or reduce context
|
||||
warn!(
|
||||
"max_tokens ({}) is below required minimum ({}) for thinking.budget_tokens ({}). Context reduction needed.",
|
||||
proposed_max_tokens, minimum_required, budget_tokens
|
||||
);
|
||||
// Return the minimum required, but flag that we need context reduction
|
||||
(minimum_required, true)
|
||||
}
|
||||
}
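To make the constraint above concrete (worked numbers of our own, not taken from the diff): with `thinking.budget_tokens = 10000`, the minimum acceptable `max_tokens` is 10000 + 1024 = 11024, so a proposed value of 8000 is raised to 11024 and flagged for context reduction, while 16000 passes through unchanged. A standalone sketch of that rule:

```rust
// Illustrative sketch of the preflight rule, assuming a fixed thinking budget.
fn preflight(proposed: u32, budget: u32) -> (u32, bool) {
    let minimum_required = budget + 1024; // Anthropic needs max_tokens > thinking.budget_tokens
    if proposed >= minimum_required {
        (proposed, false)        // enough headroom, no context reduction needed
    } else {
        (minimum_required, true) // too low: raise to the floor and flag reduction
    }
}

fn main() {
    assert_eq!(preflight(16000, 10000), (16000, false));
    assert_eq!(preflight(8000, 10000), (11024, true));
}
```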
|
||||
|
||||
/// Calculate max_tokens for a summary request, ensuring it satisfies the thinking constraint.
|
||||
/// Applies fallback sequence: thinnify -> skinnify -> hard-coded minimum
|
||||
/// Returns (max_tokens, whether_fallback_was_used)
|
||||
fn calculate_summary_max_tokens(
|
||||
&mut self,
|
||||
provider_name: &str,
|
||||
) -> (u32, bool) {
|
||||
let model_limit = self.context_window.total_tokens;
|
||||
let current_usage = self.context_window.used_tokens;
|
||||
|
||||
// Get the configured max_tokens for this provider
|
||||
let configured_max_tokens = self.resolve_max_tokens(provider_name);
|
||||
|
||||
// Calculate available tokens with buffer
|
||||
let buffer = (model_limit / 40).clamp(1000, 10000); // 2.5% buffer
|
||||
let available = model_limit
|
||||
.saturating_sub(current_usage)
|
||||
.saturating_sub(buffer);
|
||||
// Use the smaller of available tokens or configured max_tokens,
|
||||
// but ensure we don't go below thinking budget floor for Anthropic
|
||||
let proposed_max_tokens = available.min(configured_max_tokens);
|
||||
let proposed_max_tokens = if provider_name == "anthropic" {
|
||||
if let Some(budget) = self.get_thinking_budget_tokens() {
|
||||
proposed_max_tokens.max(budget + 1024)
|
||||
} else {
|
||||
proposed_max_tokens
|
||||
}
|
||||
} else {
|
||||
proposed_max_tokens
|
||||
};
|
||||
|
||||
// Validate against thinking budget constraint
|
||||
let (adjusted, needs_reduction) = self.preflight_validate_max_tokens(provider_name, proposed_max_tokens);
|
||||
|
||||
if !needs_reduction {
|
||||
return (adjusted, false);
|
||||
}
|
||||
|
||||
// We need more headroom - the context is too full
|
||||
// Return the adjusted value but flag that fallbacks are needed
|
||||
(adjusted, true)
|
||||
}
|
||||
|
||||
/// Apply the fallback sequence to free up context space for thinking budget.
|
||||
/// Sequence: thinnify (first third) → skinnify (all) → hard-coded minimum
|
||||
/// Returns the validated max_tokens that satisfies thinking.budget_tokens constraint.
|
||||
fn apply_max_tokens_fallback_sequence(
|
||||
&mut self,
|
||||
provider_name: &str,
|
||||
initial_max_tokens: u32,
|
||||
hard_coded_minimum: u32,
|
||||
) -> u32 {
|
||||
let (mut max_tokens, needs_reduction) = self.preflight_validate_max_tokens(provider_name, initial_max_tokens);
|
||||
|
||||
if !needs_reduction {
|
||||
return max_tokens;
|
||||
}
|
||||
|
||||
self.ui_writer.print_context_status(
|
||||
"⚠️ Context window too full for thinking budget. Applying fallback sequence...\n",
|
||||
);
|
||||
|
||||
// Step 1: Try thinnify (first third of context)
|
||||
self.ui_writer.print_context_status("🥒 Step 1: Trying thinnify...\n");
|
||||
let (thin_msg, thin_saved) = self.context_window.thin_context();
|
||||
self.thinning_events.push(thin_saved);
|
||||
self.ui_writer.print_context_thinning(&thin_msg);
|
||||
|
||||
// Recalculate max_tokens after thinnify
|
||||
let recalc_max = self.resolve_max_tokens(provider_name);
|
||||
let (new_max, still_needs_reduction) = self.preflight_validate_max_tokens(provider_name, recalc_max);
|
||||
max_tokens = new_max;
|
||||
|
||||
if !still_needs_reduction {
|
||||
self.ui_writer.print_context_status(
|
||||
"✅ Thinnify resolved capacity issue. Continuing...\n",
|
||||
);
|
||||
return max_tokens;
|
||||
}
|
||||
|
||||
// Step 2: Try skinnify (entire context)
|
||||
self.ui_writer.print_context_status("🦴 Step 2: Trying skinnify...\n");
|
||||
let (skinny_msg, skinny_saved) = self.context_window.thin_context_all();
|
||||
self.thinning_events.push(skinny_saved);
|
||||
self.ui_writer.print_context_thinning(&skinny_msg);
|
||||
|
||||
// Recalculate max_tokens after skinnify
|
||||
let recalc_max = self.resolve_max_tokens(provider_name);
|
||||
let (final_max, final_needs_reduction) = self.preflight_validate_max_tokens(provider_name, recalc_max);
|
||||
max_tokens = final_max;
|
||||
|
||||
if !final_needs_reduction {
|
||||
self.ui_writer.print_context_status(
|
||||
"✅ Skinnify resolved capacity issue. Continuing...\n",
|
||||
);
|
||||
return max_tokens;
|
||||
}
|
||||
|
||||
// Step 3: Nothing worked, use hard-coded minimum as last resort
|
||||
self.ui_writer.print_context_status(&format!(
|
||||
"⚠️ Step 3: Context reduction insufficient. Using hard-coded max_tokens={} as last resort...\n",
|
||||
hard_coded_minimum
|
||||
));
|
||||
|
||||
hard_coded_minimum
|
||||
}
|
||||
|
||||
/// Apply the fallback sequence for summary requests to free up context space.
|
||||
/// Uses calculate_summary_max_tokens for recalculation (based on available space).
|
||||
/// Returns the validated max_tokens for summary requests.
|
||||
fn apply_summary_fallback_sequence(
|
||||
&mut self,
|
||||
provider_name: &str,
|
||||
) -> u32 {
|
||||
let (mut summary_max_tokens, needs_reduction) = self.calculate_summary_max_tokens(provider_name);
|
||||
|
||||
if !needs_reduction {
|
||||
return summary_max_tokens;
|
||||
}
|
||||
|
||||
self.ui_writer.print_context_status(
|
||||
"⚠️ Context window too full for thinking budget. Applying fallback sequence...\n",
|
||||
);
|
||||
|
||||
// Step 1: Try thinnify (first third of context)
|
||||
self.ui_writer.print_context_status("🥒 Step 1: Trying thinnify...\n");
|
||||
let (thin_msg, thin_saved) = self.context_window.thin_context();
|
||||
self.thinning_events.push(thin_saved);
|
||||
self.ui_writer.print_context_thinning(&thin_msg);
|
||||
|
||||
// Recalculate max_tokens after thinnify
|
||||
let (new_max, still_needs_reduction) = self.calculate_summary_max_tokens(provider_name);
|
||||
summary_max_tokens = new_max;
|
||||
|
||||
if !still_needs_reduction {
|
||||
self.ui_writer.print_context_status(
|
||||
"✅ Thinnify resolved capacity issue. Continuing...\n",
|
||||
);
|
||||
return summary_max_tokens;
|
||||
}
|
||||
|
||||
// Step 2: Try skinnify (entire context)
|
||||
self.ui_writer.print_context_status("🦴 Step 2: Trying skinnify...\n");
|
||||
let (skinny_msg, skinny_saved) = self.context_window.thin_context_all();
|
||||
self.thinning_events.push(skinny_saved);
|
||||
self.ui_writer.print_context_thinning(&skinny_msg);
|
||||
|
||||
// Recalculate max_tokens after skinnify
|
||||
let (final_max, final_needs_reduction) = self.calculate_summary_max_tokens(provider_name);
|
||||
summary_max_tokens = final_max;
|
||||
|
||||
if !final_needs_reduction {
|
||||
self.ui_writer.print_context_status(
|
||||
"✅ Skinnify resolved capacity issue. Continuing...\n",
|
||||
);
|
||||
return summary_max_tokens;
|
||||
}
|
||||
|
||||
// Step 3: Nothing worked, use hard-coded minimum
|
||||
self.ui_writer.print_context_status(
|
||||
"⚠️ Step 3: Context reduction insufficient. Using hard-coded max_tokens=5000 as last resort...\n",
|
||||
);
|
||||
5000
|
||||
}
|
||||
|
||||
/// Resolve the temperature to use for a given provider, applying fallbacks
|
||||
@@ -1578,8 +2028,14 @@ impl<W: UiWriter> Agent<W> {
|
||||
};
|
||||
let _ = provider; // Drop the provider reference to avoid borrowing issues
|
||||
|
||||
// Get max_tokens from provider configuration, falling back to sensible defaults
|
||||
let max_tokens = Some(self.resolve_max_tokens(&provider_name));
|
||||
// Get max_tokens from provider configuration with preflight validation
|
||||
// This ensures max_tokens > thinking.budget_tokens for Anthropic with extended thinking
|
||||
let initial_max_tokens = self.resolve_max_tokens(&provider_name);
|
||||
let max_tokens = Some(self.apply_max_tokens_fallback_sequence(
|
||||
&provider_name,
|
||||
initial_max_tokens,
|
||||
16000, // Hard-coded minimum for main API calls (higher than summary's 5000)
|
||||
));
|
||||
|
||||
let request = CompletionRequest {
|
||||
messages,
|
||||
@@ -1587,6 +2043,7 @@ impl<W: UiWriter> Agent<W> {
|
||||
temperature: Some(self.resolve_temperature(&provider_name)),
|
||||
stream: true, // Enable streaming
|
||||
tools,
|
||||
disable_thinking: false,
|
||||
};
|
||||
|
||||
// Time the LLM call with cancellation support and streaming
|
||||
@@ -1984,6 +2441,32 @@ impl<W: UiWriter> Agent<W> {
|
||||
self.context_window.percentage_used() as u32
|
||||
));
|
||||
|
||||
let provider = self.providers.get(None)?;
|
||||
let provider_name = provider.name().to_string();
|
||||
let _ = provider; // Release borrow early
|
||||
|
||||
// Apply fallback sequence: thinnify -> skinnify -> hard-coded 5000
|
||||
let mut summary_max_tokens = self.apply_summary_fallback_sequence(&provider_name);
|
||||
|
||||
// Apply provider-specific caps
|
||||
// For Anthropic with thinking enabled, we need max_tokens > thinking.budget_tokens
|
||||
// So we set a higher cap when thinking is configured
|
||||
let anthropic_cap = match self.get_thinking_budget_tokens() {
|
||||
Some(budget) => (budget + 2000).max(10_000), // At least budget + 2000 for response
|
||||
None => 10_000,
|
||||
};
|
||||
summary_max_tokens = match provider_name.as_str() {
|
||||
"anthropic" => summary_max_tokens.min(anthropic_cap),
|
||||
"databricks" => summary_max_tokens.min(10_000),
|
||||
"embedded" => summary_max_tokens.min(3000),
|
||||
_ => summary_max_tokens.min(5000),
|
||||
};
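As a quick illustration of the cap arithmetic above (our own numbers): with `thinking_budget_tokens = 10000`, `anthropic_cap` becomes `(10000 + 2000).max(10_000) = 12000`; with a small budget of 5000 it stays at the 10000 floor; and with no budget configured it is simply 10000. A minimal sketch:

```rust
// Illustrative only: the provider cap used when clamping summary_max_tokens.
fn anthropic_cap(thinking_budget: Option<u32>) -> u32 {
    match thinking_budget {
        Some(budget) => (budget + 2000).max(10_000), // leave room for the response on top of thinking
        None => 10_000,
    }
}

fn main() {
    assert_eq!(anthropic_cap(Some(10_000)), 12_000);
    assert_eq!(anthropic_cap(Some(5_000)), 10_000);
    assert_eq!(anthropic_cap(None), 10_000);
}
```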
|
||||
|
||||
debug!(
|
||||
"Requesting summary with max_tokens: {} (current usage: {} tokens)",
|
||||
summary_max_tokens, self.context_window.used_tokens
|
||||
);
|
||||
|
||||
// Create summary request with FULL history
|
||||
let summary_prompt = self.context_window.create_summary_prompt();
|
||||
|
||||
@@ -2012,41 +2495,26 @@ impl<W: UiWriter> Agent<W> {
|
||||
|
||||
let provider = self.providers.get(None)?;
|
||||
|
||||
// Dynamically calculate max_tokens for summary based on what's left
|
||||
let summary_max_tokens = match provider.name() {
|
||||
"databricks" | "anthropic" => {
|
||||
let model_limit = self.context_window.total_tokens;
|
||||
let current_usage = self.context_window.used_tokens;
|
||||
let available = model_limit
|
||||
.saturating_sub(current_usage)
|
||||
.saturating_sub(5000);
|
||||
Some(available.min(10_000))
|
||||
// Determine if we need to disable thinking mode for this request
|
||||
// Anthropic requires: max_tokens > thinking.budget_tokens + 1024
|
||||
let disable_thinking = self.get_thinking_budget_tokens().map_or(false, |budget| {
|
||||
let minimum_for_thinking = budget + 1024;
|
||||
let should_disable = summary_max_tokens <= minimum_for_thinking;
|
||||
if should_disable {
|
||||
tracing::warn!("Disabling thinking mode for summary: max_tokens ({}) <= minimum_for_thinking ({})", summary_max_tokens, minimum_for_thinking);
|
||||
}
|
||||
"embedded" => {
|
||||
let model_limit = self.context_window.total_tokens;
|
||||
let current_usage = self.context_window.used_tokens;
|
||||
let available = model_limit
|
||||
.saturating_sub(current_usage)
|
||||
.saturating_sub(1000);
|
||||
Some(available.min(3000))
|
||||
}
|
||||
_ => {
|
||||
let available = self.context_window.remaining_tokens().saturating_sub(2000);
|
||||
Some(available.min(5000))
|
||||
}
|
||||
};
|
||||
should_disable
|
||||
});
|
||||
|
||||
debug!(
|
||||
"Requesting summary with max_tokens: {:?} (current usage: {} tokens)",
|
||||
summary_max_tokens, self.context_window.used_tokens
|
||||
);
|
||||
tracing::debug!("Creating summary request: max_tokens={}, disable_thinking={}", summary_max_tokens, disable_thinking);
|
||||
|
||||
let summary_request = CompletionRequest {
|
||||
messages: summary_messages,
|
||||
max_tokens: summary_max_tokens,
|
||||
max_tokens: Some(summary_max_tokens),
|
||||
temperature: Some(self.resolve_temperature(provider.name())),
|
||||
stream: false,
|
||||
tools: None,
|
||||
disable_thinking,
|
||||
};
|
||||
|
||||
// Get the summary
|
||||
@@ -2090,6 +2558,15 @@ impl<W: UiWriter> Agent<W> {
|
||||
message
|
||||
}
|
||||
|
||||
/// Manually trigger context thinning for the ENTIRE context window
|
||||
/// Unlike force_thin which only processes the first third, this processes all messages
|
||||
pub fn force_thin_all(&mut self) -> String {
|
||||
info!("Manual full context skinnifying triggered");
|
||||
let (message, chars_saved) = self.context_window.thin_context_all();
|
||||
self.thinning_events.push(chars_saved);
|
||||
message
|
||||
}
|
||||
|
||||
/// Reload README.md and AGENTS.md and replace the first system message
|
||||
/// Returns Ok(true) if README was found and reloaded, Ok(false) if no README was present initially
|
||||
pub fn reload_readme(&mut self) -> Result<bool> {
|
||||
@@ -2998,6 +3475,32 @@ impl<W: UiWriter> Agent<W> {
|
||||
self.context_window.percentage_used() as u32
|
||||
));
|
||||
|
||||
let provider = self.providers.get(None)?;
|
||||
let provider_name = provider.name().to_string();
|
||||
let _ = provider; // Release borrow early
|
||||
|
||||
// Apply fallback sequence: thinnify -> skinnify -> hard-coded 5000
|
||||
let mut summary_max_tokens = self.apply_summary_fallback_sequence(&provider_name);
|
||||
|
||||
// Apply provider-specific caps
|
||||
// For Anthropic with thinking enabled, we need max_tokens > thinking.budget_tokens
|
||||
// So we set a higher cap when thinking is configured
|
||||
let anthropic_cap = match self.get_thinking_budget_tokens() {
|
||||
Some(budget) => (budget + 2000).max(10_000), // At least budget + 2000 for response
|
||||
None => 10_000,
|
||||
};
|
||||
summary_max_tokens = match provider_name.as_str() {
|
||||
"anthropic" => summary_max_tokens.min(anthropic_cap),
|
||||
"databricks" => summary_max_tokens.min(10_000),
|
||||
"embedded" => summary_max_tokens.min(3000),
|
||||
_ => summary_max_tokens.min(5000),
|
||||
};
|
||||
|
||||
debug!(
|
||||
"Requesting summary with max_tokens: {} (current usage: {} tokens)",
|
||||
summary_max_tokens, self.context_window.used_tokens
|
||||
);
|
||||
|
||||
// Create summary request with FULL history
|
||||
let summary_prompt = self.context_window.create_summary_prompt();
|
||||
|
||||
@@ -3026,85 +3529,26 @@ impl<W: UiWriter> Agent<W> {
|
||||
|
||||
let provider = self.providers.get(None)?;
|
||||
|
||||
// Dynamically calculate max_tokens for summary based on what's left
|
||||
// We need to ensure: used_tokens + max_tokens <= total_context_limit
|
||||
let summary_max_tokens = match provider.name() {
|
||||
"databricks" | "anthropic" => {
|
||||
// Use the actual configured context window size
|
||||
let model_limit = self.context_window.total_tokens;
|
||||
let current_usage = self.context_window.used_tokens;
|
||||
|
||||
// Check if we have enough capacity for summarization
|
||||
if current_usage >= model_limit.saturating_sub(1000) {
|
||||
error!("Context window at capacity ({}%), cannot summarize. Current: {}, Limit: {}",
|
||||
self.context_window.percentage_used(), current_usage, model_limit);
|
||||
return Err(anyhow::anyhow!("Context window at capacity. Try using /thinnify or /compact commands to reduce context size, or start a new session."));
|
||||
}
|
||||
|
||||
// Leave buffer proportional to model size (min 1k, max 10k)
|
||||
let buffer = (model_limit / 40).clamp(1000, 10000); // 2.5% buffer
|
||||
let available = model_limit
|
||||
.saturating_sub(current_usage)
|
||||
.saturating_sub(buffer);
|
||||
// Cap at a reasonable summary size (10k tokens max)
|
||||
Some(available.min(10_000))
|
||||
// Determine if we need to disable thinking mode for this request
|
||||
// Anthropic requires: max_tokens > thinking.budget_tokens + 1024
|
||||
let disable_thinking = self.get_thinking_budget_tokens().map_or(false, |budget| {
|
||||
let minimum_for_thinking = budget + 1024;
|
||||
let should_disable = summary_max_tokens <= minimum_for_thinking;
|
||||
if should_disable {
|
||||
tracing::warn!("Disabling thinking mode for summary: max_tokens ({}) <= minimum_for_thinking ({})", summary_max_tokens, minimum_for_thinking);
|
||||
}
|
||||
"embedded" => {
|
||||
// For smaller context models, be more conservative
|
||||
let model_limit = self.context_window.total_tokens;
|
||||
let current_usage = self.context_window.used_tokens;
|
||||
should_disable
|
||||
});
|
||||
|
||||
// Check capacity for embedded models too
|
||||
if current_usage >= model_limit.saturating_sub(500) {
|
||||
error!(
|
||||
"Embedded model context window at capacity ({}%)",
|
||||
self.context_window.percentage_used()
|
||||
);
|
||||
return Err(anyhow::anyhow!("Context window at capacity. Try using /thinnify command to reduce context size, or start a new session."));
|
||||
}
|
||||
|
||||
// Leave 1k buffer
|
||||
let available = model_limit
|
||||
.saturating_sub(current_usage)
|
||||
.saturating_sub(1000);
|
||||
// Cap at 3k for embedded models
|
||||
Some(available.min(3000))
|
||||
}
|
||||
_ => {
|
||||
// Default: conservative approach
|
||||
let model_limit = self.context_window.total_tokens;
|
||||
let current_usage = self.context_window.used_tokens;
|
||||
|
||||
if current_usage >= model_limit.saturating_sub(1000) {
|
||||
error!(
|
||||
"Context window at capacity ({}%)",
|
||||
self.context_window.percentage_used()
|
||||
);
|
||||
return Err(anyhow::anyhow!("Context window at capacity. Try using /thinnify or /compact commands, or start a new session."));
|
||||
}
|
||||
|
||||
let available = self.context_window.remaining_tokens().saturating_sub(2000);
|
||||
Some(available.min(5000))
|
||||
}
|
||||
};
|
||||
|
||||
debug!(
|
||||
"Requesting summary with max_tokens: {:?} (current usage: {} tokens)",
|
||||
summary_max_tokens, self.context_window.used_tokens
|
||||
);
|
||||
|
||||
// Final safety check
|
||||
if summary_max_tokens.unwrap_or(0) == 0 {
|
||||
error!("No tokens available for summarization");
|
||||
return Err(anyhow::anyhow!("No context window capacity left for summarization. Use /thinnify to reduce context size or start a new session."));
|
||||
}
|
||||
tracing::debug!("Creating auto-summary request: max_tokens={}, disable_thinking={}", summary_max_tokens, disable_thinking);
|
||||
|
||||
let summary_request = CompletionRequest {
|
||||
messages: summary_messages,
|
||||
max_tokens: summary_max_tokens,
|
||||
max_tokens: Some(summary_max_tokens),
|
||||
temperature: Some(self.resolve_temperature(provider.name())),
|
||||
stream: false,
|
||||
tools: None,
|
||||
disable_thinking,
|
||||
};
|
||||
|
||||
// Get the summary
|
||||
@@ -3604,40 +4048,6 @@ impl<W: UiWriter> Agent<W> {
|
||||
}
|
||||
}
|
||||
|
||||
// Check if this was a final_output tool call
|
||||
if tool_call.tool == "final_output" {
|
||||
// The summary was already displayed via print_final_output
|
||||
// Don't add it to full_response to avoid duplicate printing
|
||||
// full_response is intentionally left empty/unchanged
|
||||
self.ui_writer.println("");
|
||||
let _ttft =
|
||||
first_token_time.unwrap_or_else(|| stream_start.elapsed());
|
||||
|
||||
// Add timing if needed
|
||||
let final_response = if show_timing {
|
||||
format!(
|
||||
"🕝 {} | 💭 {}",
|
||||
Self::format_duration(stream_start.elapsed()),
|
||||
Self::format_duration(_ttft)
|
||||
)
|
||||
} else {
|
||||
// Return empty string since content was already displayed
|
||||
String::new()
|
||||
};
|
||||
|
||||
return Ok(TaskResult::new(
|
||||
final_response,
|
||||
self.context_window.clone(),
|
||||
));
|
||||
}
|
||||
|
||||
// Closure marker with timing
|
||||
if tool_call.tool != "final_output" {
|
||||
self.ui_writer
|
||||
.print_tool_timing(&Self::format_duration(exec_duration));
|
||||
self.ui_writer.print_agent_prompt();
|
||||
}
|
||||
|
||||
// Add the tool call and result to the context window using RAW unfiltered content
|
||||
// This ensures the log file contains the true raw content including JSON tool calls
|
||||
let tool_message = if !raw_content_for_log.trim().is_empty() {
|
||||
@@ -3701,6 +4111,43 @@ impl<W: UiWriter> Agent<W> {
|
||||
self.context_window.add_message(tool_message);
|
||||
self.context_window.add_message(result_message);
|
||||
|
||||
// Check if this was a final_output tool call
|
||||
if tool_call.tool == "final_output" {
|
||||
// Save context window BEFORE returning so the session log includes final_output
|
||||
self.save_context_window("completed");
|
||||
|
||||
// The summary was already displayed via print_final_output
|
||||
// Don't add it to full_response to avoid duplicate printing
|
||||
// full_response is intentionally left empty/unchanged
|
||||
self.ui_writer.println("");
|
||||
let _ttft =
|
||||
first_token_time.unwrap_or_else(|| stream_start.elapsed());
|
||||
|
||||
// Add timing if needed
|
||||
let final_response = if show_timing {
|
||||
format!(
|
||||
"🕝 {} | 💭 {}",
|
||||
Self::format_duration(stream_start.elapsed()),
|
||||
Self::format_duration(_ttft)
|
||||
)
|
||||
} else {
|
||||
// Return empty string since content was already displayed
|
||||
String::new()
|
||||
};
|
||||
|
||||
return Ok(TaskResult::new(
|
||||
final_response,
|
||||
self.context_window.clone(),
|
||||
));
|
||||
}
|
||||
|
||||
// Closure marker with timing
|
||||
if tool_call.tool != "final_output" {
|
||||
self.ui_writer
|
||||
.print_tool_timing(&Self::format_duration(exec_duration));
|
||||
self.ui_writer.print_agent_prompt();
|
||||
}
|
||||
|
||||
// Update the request with the new context for next iteration
|
||||
request.messages = self.context_window.conversation_history.clone();
|
||||
|
||||
@@ -3922,6 +4369,9 @@ impl<W: UiWriter> Agent<W> {
|
||||
full_response = String::new();
|
||||
|
||||
self.ui_writer.println("");
|
||||
|
||||
// Save context window BEFORE returning
|
||||
self.save_context_window("completed");
|
||||
let _ttft =
|
||||
first_token_time.unwrap_or_else(|| stream_start.elapsed());
|
||||
|
||||
@@ -4060,6 +4510,9 @@ impl<W: UiWriter> Agent<W> {
|
||||
}
|
||||
}
|
||||
|
||||
// Save context window BEFORE returning
|
||||
self.save_context_window("completed");
|
||||
|
||||
// Add timing if needed
|
||||
let final_response = if show_timing {
|
||||
format!(
|
||||
@@ -4786,7 +5239,14 @@ impl<W: UiWriter> Agent<W> {
|
||||
Ok(_) => {
|
||||
let mut todo = self.todo_content.write().await;
|
||||
*todo = String::new();
|
||||
return Ok("✅ All TODOs completed! Removed todo.g3.md".to_string());
|
||||
// Show the final completed TODOs before deletion
|
||||
let mut result = String::from("✅ All TODOs completed! Removed todo.g3.md\n\nFinal status:\n");
|
||||
for line in content_str.lines() {
|
||||
self.ui_writer.print_tool_output_line(line);
|
||||
result.push_str(line);
|
||||
result.push('\n');
|
||||
}
|
||||
return Ok(result);
|
||||
}
|
||||
Err(e) => return Ok(format!("❌ Failed to remove todo.g3.md: {}", e)),
|
||||
}
|
||||
@@ -4801,11 +5261,7 @@ impl<W: UiWriter> Agent<W> {
|
||||
// Also update in-memory content to stay in sync
|
||||
let mut todo = self.todo_content.write().await;
|
||||
*todo = content_str.to_string();
|
||||
// Print the TODO content to the console
|
||||
self.ui_writer.print_context_status(&format!(
|
||||
"✅ TODO list updated ({} chars) and saved to todo.g3.md:",
|
||||
char_count
|
||||
));
|
||||
// Print the TODO content to the console (inside the tool frame)
|
||||
for line in content_str.lines() {
|
||||
self.ui_writer.print_tool_output_line(line);
|
||||
}
|
||||
|
||||
crates/g3-core/tests/test_preflight_max_tokens.rs (new file, 188 additions)
@@ -0,0 +1,188 @@
|
||||
//! Tests for the pre-flight max_tokens validation with thinking.budget_tokens constraint
|
||||
//!
|
||||
//! These tests verify that when using Anthropic with extended thinking enabled,
|
||||
//! the max_tokens calculation properly accounts for the budget_tokens constraint.
|
||||
|
||||
use g3_config::Config;
|
||||
use g3_core::ContextWindow;
|
||||
|
||||
/// Helper function to create a minimal config for testing
|
||||
fn create_test_config_with_thinking(thinking_budget: Option<u32>) -> Config {
|
||||
let mut config = Config::default();
|
||||
|
||||
// Set up Anthropic provider with optional thinking budget
|
||||
config.providers.anthropic = Some(g3_config::AnthropicConfig {
|
||||
api_key: "test-key".to_string(),
|
||||
model: "claude-sonnet-4-5".to_string(),
|
||||
max_tokens: Some(16000),
|
||||
temperature: Some(0.1),
|
||||
cache_config: None,
|
||||
enable_1m_context: None,
|
||||
thinking_budget_tokens: thinking_budget,
|
||||
});
|
||||
|
||||
config.providers.default_provider = "anthropic".to_string();
|
||||
config
|
||||
}
|
||||
|
||||
/// Test that when thinking is disabled, max_tokens passes through unchanged
|
||||
#[test]
|
||||
fn test_no_thinking_budget_passes_through() {
|
||||
let config = create_test_config_with_thinking(None);
|
||||
|
||||
// Without thinking budget, any max_tokens should be fine
|
||||
let proposed_max = 5000;
|
||||
|
||||
// The constraint check would return (proposed_max, false)
|
||||
// since there's no thinking_budget_tokens configured
|
||||
assert!(config.providers.anthropic.as_ref().unwrap().thinking_budget_tokens.is_none());
|
||||
}
|
||||
|
||||
/// Test that when max_tokens > budget_tokens + buffer, no reduction is needed
|
||||
#[test]
|
||||
fn test_sufficient_max_tokens_no_reduction_needed() {
|
||||
let config = create_test_config_with_thinking(Some(10000));
|
||||
let budget_tokens = config.providers.anthropic.as_ref().unwrap().thinking_budget_tokens.unwrap();
|
||||
|
||||
// minimum_required = budget_tokens + 1024 = 11024
|
||||
let minimum_required = budget_tokens + 1024;
|
||||
|
||||
// If proposed_max >= minimum_required, no reduction is needed
|
||||
let proposed_max = 15000;
|
||||
assert!(proposed_max >= minimum_required);
|
||||
}
|
||||
|
||||
/// Test that when max_tokens < budget_tokens + buffer, reduction is needed
|
||||
#[test]
|
||||
fn test_insufficient_max_tokens_needs_reduction() {
|
||||
let config = create_test_config_with_thinking(Some(10000));
|
||||
let budget_tokens = config.providers.anthropic.as_ref().unwrap().thinking_budget_tokens.unwrap();
|
||||
|
||||
// minimum_required = budget_tokens + 1024 = 11024
|
||||
let minimum_required = budget_tokens + 1024;
|
||||
|
||||
// If proposed_max < minimum_required, reduction IS needed
|
||||
let proposed_max = 5000;
|
||||
assert!(proposed_max < minimum_required);
|
||||
}
|
||||
|
||||
/// Test the minimum required calculation
|
||||
#[test]
|
||||
fn test_minimum_required_calculation() {
|
||||
// For a budget of 10000, we need at least 11024 tokens
|
||||
let budget_tokens = 10000u32;
|
||||
let output_buffer = 1024u32;
|
||||
let minimum_required = budget_tokens + output_buffer;
|
||||
|
||||
assert_eq!(minimum_required, 11024);
|
||||
|
||||
// For a larger budget
|
||||
let budget_tokens = 32000u32;
|
||||
let minimum_required = budget_tokens + output_buffer;
|
||||
assert_eq!(minimum_required, 33024);
|
||||
}
|
||||
|
||||

/// Test context window usage calculation for summary max_tokens
#[test]
fn test_context_window_available_tokens() {
    let mut context = ContextWindow::new(200000); // 200k context window

    // Simulate heavy usage
    context.used_tokens = 180000; // 90% used

    let model_limit = context.total_tokens;
    let current_usage = context.used_tokens;

    // 2.5% buffer calculation
    let buffer = (model_limit / 40).clamp(1000, 10000);
    assert_eq!(buffer, 5000); // 200000/40 = 5000

    let available = model_limit
        .saturating_sub(current_usage)
        .saturating_sub(buffer);

    // 200000 - 180000 - 5000 = 15000
    assert_eq!(available, 15000);

    // Capped at 10000 for summary
    let summary_max = available.min(10_000);
    assert_eq!(summary_max, 10000);
}

/// Test that when context is nearly full, available tokens may be below thinking budget
#[test]
fn test_context_nearly_full_triggers_reduction() {
    let mut context = ContextWindow::new(200000);

    // Very heavy usage - 98% used
    context.used_tokens = 196000;

    let model_limit = context.total_tokens;
    let current_usage = context.used_tokens;
    let buffer = (model_limit / 40).clamp(1000, 10000); // 5000

    let available = model_limit
        .saturating_sub(current_usage)
        .saturating_sub(buffer);

    // 200000 - 196000 - 5000 = -1000 -> saturates to 0
    assert_eq!(available, 0);

    // With thinking_budget of 10000, this would definitely need reduction
    let thinking_budget = 10000u32;
    let minimum_required = thinking_budget + 1024;
    assert!(available < minimum_required);
}
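For reference, the buffer/available/cap arithmetic that both context-window tests walk through can be folded into one function. This is only a restatement under assumed names; `summary_max_tokens` does not exist in the codebase.

```rust
// Sketch only: consolidates the arithmetic from the two tests above.
fn summary_max_tokens(model_limit: u32, used_tokens: u32) -> u32 {
    // 2.5% of the window, clamped to a sane range, is reserved as a safety buffer.
    let buffer = (model_limit / 40).clamp(1000, 10000);
    // Whatever is left after current usage and the buffer, never going negative...
    let available = model_limit
        .saturating_sub(used_tokens)
        .saturating_sub(buffer);
    // ...and never more than the 10000-token cap used for summaries.
    available.min(10_000)
}

fn main() {
    assert_eq!(summary_max_tokens(200_000, 180_000), 10_000); // 90% used
    assert_eq!(summary_max_tokens(200_000, 196_000), 0);      // 98% used: saturates to 0
}
```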

/// Test the hard-coded fallback value
#[test]
fn test_hardcoded_fallback_value() {
    // When all else fails, we use 5000 as the hard-coded max_tokens
    let hardcoded_fallback = 5000u32;

    // This should be a reasonable value that Anthropic will accept
    // even with thinking enabled (though output will be limited)
    assert!(hardcoded_fallback > 0);

    // Note: With a 10000 thinking budget, 5000 is still below the
    // minimum required (11024), but we send it anyway as a "last resort"
    // hoping the API might still work for basic operations
}

/// Test provider-specific caps
#[test]
fn test_provider_specific_caps() {
    // Anthropic/Databricks: cap at 10000
    let anthropic_cap = 10000u32;
    let proposed = 15000u32;
    assert_eq!(proposed.min(anthropic_cap), 10000);

    // Embedded: cap at 3000
    let embedded_cap = 3000u32;
    let proposed = 5000u32;
    assert_eq!(proposed.min(embedded_cap), 3000);

    // Default: cap at 5000
    let default_cap = 5000u32;
    let proposed = 8000u32;
    assert_eq!(proposed.min(default_cap), 5000);
}
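The per-provider caps exercised above amount to a lookup followed by a `min`. A possible shape is sketched below; `cap_for_provider` and the string keys are assumptions made for illustration, not the project's actual dispatch code.

```rust
// Hypothetical helper showing how the caps from the test could be applied.
fn cap_for_provider(provider: &str, proposed: u32) -> u32 {
    let cap = match provider {
        "anthropic" | "databricks" => 10_000, // Anthropic/Databricks cap
        "embedded" => 3_000,                  // embedded models get a tighter cap
        _ => 5_000,                           // default cap
    };
    proposed.min(cap)
}

fn main() {
    assert_eq!(cap_for_provider("anthropic", 15_000), 10_000);
    assert_eq!(cap_for_provider("embedded", 5_000), 3_000);
    assert_eq!(cap_for_provider("other", 8_000), 5_000);
}
```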

/// Test that the error message mentions the thinking budget constraint
#[test]
fn test_error_message_content() {
    // Verify the warning message format contains useful information
    let proposed_max_tokens = 5000u32;
    let budget_tokens = 10000u32;
    let minimum_required = budget_tokens + 1024;

    let warning = format!(
        "max_tokens ({}) is below required minimum ({}) for thinking.budget_tokens ({}). Context reduction needed.",
        proposed_max_tokens, minimum_required, budget_tokens
    );

    assert!(warning.contains("5000"));
    assert!(warning.contains("11024"));
    assert!(warning.contains("10000"));
    assert!(warning.contains("Context reduction needed"));
}

@@ -85,6 +85,7 @@ pub async fn get_initial_discovery_messages(
        temperature: Some(provider.temperature()),
        stream: false,
        tools: None,
        disable_thinking: false,
    };

    status("🤖 Calling LLM for discovery commands...");

@@ -39,6 +39,7 @@
//! temperature: Some(0.7),
//! stream: false,
//! tools: None,
//! disable_thinking: false,
//! };
//!
//! // Get a completion

@@ -75,6 +76,7 @@
//! temperature: Some(0.7),
//! stream: true,
//! tools: None,
//! disable_thinking: false,
//! };
//!
//! let mut stream = provider.stream(request).await?;

@@ -272,6 +274,7 @@ impl AnthropicProvider {
        streaming: bool,
        max_tokens: u32,
        temperature: f32,
        disable_thinking: bool,
    ) -> Result<AnthropicRequest> {
        let (system, anthropic_messages) = self.convert_messages(messages)?;

@@ -284,10 +287,32 @@ impl AnthropicProvider {
        // Convert tools if provided
        let anthropic_tools = tools.map(|t| self.convert_tools(t));

        // Add thinking configuration if budget_tokens is set
        let thinking = self.thinking_budget_tokens.map(|budget| {
            ThinkingConfig::enabled(budget)
        });
        // Add thinking configuration if budget_tokens is set AND max_tokens is sufficient AND not explicitly disabled
        // Anthropic requires: max_tokens > thinking.budget_tokens
        // We add 1024 as minimum buffer for actual response content
        tracing::debug!("create_request_body called: max_tokens={}, disable_thinking={}, thinking_budget_tokens={:?}", max_tokens, disable_thinking, self.thinking_budget_tokens);

        let thinking = if disable_thinking {
            tracing::info!(
                "Thinking mode explicitly disabled for this request (max_tokens={})",
                max_tokens
            );
            None
        } else {
            self.thinking_budget_tokens.and_then(|budget| {
                let min_required = budget + 1024;
                if max_tokens > min_required {
                    Some(ThinkingConfig::enabled(budget))
                } else {
                    tracing::warn!(
                        "Disabling thinking mode: max_tokens ({}) is not greater than thinking.budget_tokens ({}) + 1024 buffer. \
                         Required: max_tokens > {}",
                        max_tokens, budget, min_required
                    );
                    None
                }
            })
        };

        let request = AnthropicRequest {
            model: self.model.clone(),
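Read on its own, the new selection logic in the hunk above reduces to a small decision: an explicit opt-out always wins, and otherwise thinking is only attached when max_tokens clears budget + 1024. The restatement below is a sketch; the free function and its bool return stand in for the provider's actual `ThinkingConfig` handling.

```rust
// Standalone restatement of the decision in the hunk above, for illustration only.
// In the real provider this lives inside create_request_body and produces an
// Option<ThinkingConfig>; here a bool stands in for "thinking block attached or not".
fn thinking_enabled(disable_thinking: bool, budget: Option<u32>, max_tokens: u32) -> bool {
    if disable_thinking {
        return false; // explicit opt-out always wins
    }
    match budget {
        None => false,                    // no budget configured
        Some(b) => max_tokens > b + 1024, // must leave room for actual output
    }
}

fn main() {
    assert!(thinking_enabled(false, Some(10_000), 20_000)); // enough headroom
    assert!(!thinking_enabled(false, Some(10_000), 5_000)); // below 11_024: silently dropped
    assert!(!thinking_enabled(true, Some(10_000), 20_000)); // disable_thinking overrides
    assert!(!thinking_enabled(false, None, 20_000));        // no budget, no thinking
}
```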

@@ -637,6 +662,7 @@ impl LLMProvider for AnthropicProvider {
            false,
            max_tokens,
            temperature,
            request.disable_thinking,
        )?;

        debug!(

@@ -710,6 +736,7 @@ impl LLMProvider for AnthropicProvider {
            true,
            max_tokens,
            temperature,
            request.disable_thinking,
        )?;

        debug!(

@@ -847,6 +874,12 @@ enum AnthropicContent {
        #[serde(skip_serializing_if = "Option::is_none")]
        cache_control: Option<crate::CacheControl>,
    },
    #[serde(rename = "thinking")]
    Thinking {
        thinking: String,
        #[serde(default)]
        signature: Option<String>,
    },
    #[serde(rename = "tool_use")]
    ToolUse {
        id: String,

@@ -947,7 +980,7 @@ mod tests {
        let messages = vec![Message::new(MessageRole::User, "Test message".to_string())];

        let request_body = provider
            .create_request_body(&messages, None, false, 1000, 0.5)
            .create_request_body(&messages, None, false, 1000, 0.5, false)
            .unwrap();

        assert_eq!(request_body.model, "claude-3-haiku-20240307");

@@ -1053,16 +1086,17 @@ mod tests {

        let messages = vec![Message::new(MessageRole::User, "Test message".to_string())];
        let request_without = provider_without
            .create_request_body(&messages, None, false, 1000, 0.5)
            .create_request_body(&messages, None, false, 1000, 0.5, false)
            .unwrap();
        let json_without = serde_json::to_string(&request_without).unwrap();
        assert!(!json_without.contains("thinking"), "JSON should not contain 'thinking' field when not configured");

        // Test WITH thinking parameter
        // Test WITH thinking parameter - max_tokens must be > budget_tokens + 1024
        // Using budget=10000 requires max_tokens > 11024
        let provider_with = AnthropicProvider::new(
            "test-key".to_string(),
            Some("claude-sonnet-4-5".to_string()),
            Some(1000),
            Some(20000), // Sufficient for thinking budget
            Some(0.5),
            None,
            None,

@@ -1071,11 +1105,78 @@ mod tests {
        .unwrap();

        let request_with = provider_with
            .create_request_body(&messages, None, false, 1000, 0.5)
            .create_request_body(&messages, None, false, 20000, 0.5, false)
            .unwrap();
        let json_with = serde_json::to_string(&request_with).unwrap();
        assert!(json_with.contains("thinking"), "JSON should contain 'thinking' field when configured");
        assert!(json_with.contains("\"type\":\"enabled\""), "JSON should contain type: enabled");
        assert!(json_with.contains("\"budget_tokens\":10000"), "JSON should contain budget_tokens: 10000");

        // Test WITH thinking parameter but INSUFFICIENT max_tokens - thinking should be disabled
        let request_insufficient = provider_with
            .create_request_body(&messages, None, false, 5000, 0.5, false) // Less than budget + 1024
            .unwrap();
        let json_insufficient = serde_json::to_string(&request_insufficient).unwrap();
        assert!(!json_insufficient.contains("thinking"), "JSON should NOT contain 'thinking' field when max_tokens is insufficient");
    }

    #[test]
    fn test_disable_thinking_flag() {
        // Test that disable_thinking=true prevents thinking even with sufficient max_tokens
        let provider = AnthropicProvider::new(
            "test-key".to_string(),
            Some("claude-sonnet-4-5".to_string()),
            Some(20000),
            Some(0.5),
            None,
            None,
            Some(10000), // With thinking budget
        )
        .unwrap();

        let messages = vec![Message::new(MessageRole::User, "Test message".to_string())];

        // With disable_thinking=false, thinking should be enabled (max_tokens is sufficient)
        let request_with_thinking = provider
            .create_request_body(&messages, None, false, 20000, 0.5, false)
            .unwrap();
        let json_with = serde_json::to_string(&request_with_thinking).unwrap();
        assert!(json_with.contains("thinking"), "JSON should contain 'thinking' field when not disabled");

        // With disable_thinking=true, thinking should be disabled even with sufficient max_tokens
        let request_without_thinking = provider
            .create_request_body(&messages, None, false, 20000, 0.5, true)
            .unwrap();
        let json_without = serde_json::to_string(&request_without_thinking).unwrap();
        assert!(!json_without.contains("thinking"), "JSON should NOT contain 'thinking' field when explicitly disabled");
    }

    #[test]
    fn test_thinking_content_block_deserialization() {
        // Test that we can deserialize a response containing a "thinking" content block
        // This is what Anthropic returns when extended thinking is enabled
        let json_response = r#"{
            "content": [
                {"type": "thinking", "thinking": "Let me analyze this...", "signature": "abc123"},
                {"type": "text", "text": "Here is my response."}
            ],
            "model": "claude-sonnet-4-5",
            "usage": {"input_tokens": 100, "output_tokens": 50}
        }"#;

        let response: AnthropicResponse = serde_json::from_str(json_response)
            .expect("Should be able to deserialize response with thinking block");

        assert_eq!(response.content.len(), 2);
        assert_eq!(response.model, "claude-sonnet-4-5");

        // Extract only text content (thinking should be filtered out)
        let text_content: Vec<_> = response.content.iter().filter_map(|c| match c {
            AnthropicContent::Text { text, .. } => Some(text.as_str()),
            _ => None,
        }).collect();

        assert_eq!(text_content.len(), 1);
        assert_eq!(text_content[0], "Here is my response.");
    }
}

@@ -45,6 +45,7 @@
//! temperature: Some(0.7),
//! stream: false,
//! tools: None,
//! disable_thinking: false,
//! };
//!
//! // Get a completion

@@ -42,6 +42,8 @@ pub struct CompletionRequest {
    pub temperature: Option<f32>,
    pub stream: bool,
    pub tools: Option<Vec<Tool>>,
    /// Force disable thinking mode for this request (used when max_tokens is too low)
    pub disable_thinking: bool,
}

#[derive(Debug, Clone, Serialize, Deserialize)]