Compare commits

...

23 Commits

Author SHA1 Message Date
Jochen
4aa84e2144 disable thinking if there is no token budget 2025-12-09 16:45:28 +11:00
Jochen
2283d9ddbf small fix to provider name check 2025-12-09 14:43:35 +11:00
Jochen
fb2cf6f898 fix for thinking budget and hardcoded max token on summary 2025-12-09 12:41:52 +11:00
Jochen
696c441a47 validate max_tokens for call, also fallbacks for summary
When the context window (CW) is full, max_tokens is often passed as 0 or a tiny value, and the LLM call will fail. For Anthropic with thinking, there is also the thinking budget to account for.
This can happen during summary attempts; in that case,
first try thinnify, skinnify, etc.
2025-12-09 10:15:32 +11:00
Dhanji R. Prasanna
48e6d594bc tweak todo tool output 2025-12-08 11:05:01 +11:00
Dhanji R. Prasanna
678403da35 add a force thinnify cmd 2025-12-05 15:32:13 +11:00
Jochen
0970e4f356 Merge pull request #40 from dhanji/jochen-fix-coach-feedback
now coach feedback works again
2025-12-03 10:55:15 +11:00
Jochen
758a313de0 Merge pull request #39 from dhanji/jochen-sonnet-thinking
Fix temperature param + add thinking for anthropic
2025-12-03 10:54:34 +11:00
Jochen
0327a6dfdf make sure coach feedback is extracted. 2025-12-02 22:00:58 +11:00
Jochen
928f2bfa9d actually record coach feedback and use it 2025-12-02 21:23:50 +11:00
Jochen
21af6ba574 fix temperature for summary request too. 2025-12-02 21:20:16 +11:00
Jochen
ae16243f49 Fix temperature param + add thinking for anthropic
The temperature param was not passed to the LLM.
Now supports Anthropic models in 'thinking' mode.
2025-12-02 17:24:55 +11:00
Dhanji R. Prasanna
9ee0468b87 test for system message 2025-12-02 14:45:12 +11:00
Dhanji R. Prasanna
d9ad244197 add markdown format only to final_output and fix todo duplication 2025-12-02 14:26:22 +11:00
Dhanji R. Prasanna
a6537e4dba todo_write outputs entire list 2025-12-02 13:48:05 +11:00
Dhanji R. Prasanna
df3f25f2f0 test for resume unfinished todos 2025-12-02 11:07:13 +11:00
Dhanji R. Prasanna
f8f989d4c6 resume unfinished TODOs 2025-12-02 11:06:58 +11:00
Dhanji R. Prasanna
0e4c935a70 clean up TODO output 2025-12-02 06:48:58 +11:00
Dhanji R. Prasanna
1b4ea93ba4 token counting bugfix 2025-12-01 14:52:10 +11:00
Dhanji R. Prasanna
4496eee046 fix compaction to restore system message 2025-12-01 14:38:21 +11:00
Dhanji R. Prasanna
8928fb92be append instead of replace system msg 2025-11-29 16:13:00 +11:00
Dhanji R. Prasanna
81fd2ab92f unused var 2025-11-29 15:44:30 +11:00
Jochen
af7fb8f7f1 Merge pull request #38 from dhanji/jochen-debug-with-ids
dumps context window for monitoring sizes, also adds message ids for internal debugging
2025-11-28 16:43:26 +11:00
21 changed files with 2217 additions and 359 deletions

Cargo.lock generated
View File

@@ -1377,6 +1377,7 @@ dependencies = [
"serde",
"serde_json",
"sha2",
"tempfile",
"termimad",
"tokio",
"tokio-util",

View File

@@ -76,6 +76,7 @@ G3 includes robust error handling with automatic retry logic:
G3's interactive CLI includes control commands for manual context management:
- **`/compact`**: Manually trigger summarization to compact conversation history
- **`/thinnify`**: Manually trigger context thinning to replace large tool results with file references
- **`/skinnify`**: Manually trigger full context thinning (like `/thinnify` but processes the entire context window, not just the first third)
- **`/readme`**: Reload README.md and AGENTS.md from disk without restarting
- **`/stats`**: Show detailed context and performance statistics
- **`/help`**: Display all available control commands

View File

@@ -24,6 +24,8 @@ temperature = 0.3 # Slightly higher temperature for more creative implementation
# Options: "ephemeral", "5minute", "1hour"
# Reduces costs and latency for repeated prompts. Uses Anthropic's prompt caching with different TTLs.
# enable_1m_context = true # optional, more expensive
# thinking_budget_tokens = 10000 # Optional: Enable extended thinking mode with token budget
# Allows the model to "think" before responding. Useful for complex reasoning tasks.
# Multiple OpenAI-compatible providers can be configured with custom names

View File

@@ -27,3 +27,6 @@ chrono = { version = "0.4", features = ["serde"] }
crossterm = "0.29.0"
ratatui = "0.29"
termimad = "0.34.0"
[dev-dependencies]
tempfile = "3.8"

View File

@@ -163,15 +163,66 @@ fn extract_coach_feedback_from_logs(
if let Some(context_window) = log_json.get("context_window") {
if let Some(conversation_history) = context_window.get("conversation_history") {
if let Some(messages) = conversation_history.as_array() {
// Simply get the last message content - this is the coach's final feedback
if let Some(last_message) = messages.last() {
if let Some(content) = last_message.get("content") {
// Go backwards through the conversation to find the last tool result
// that corresponds to a final_output tool call
for i in (0..messages.len()).rev() {
let msg = &messages[i];
// Check if this is a User message with "Tool result:"
if let Some(role) = msg.get("role") {
if let Some(role_str) = role.as_str() {
if role_str == "User" || role_str == "user" {
if let Some(content) = msg.get("content") {
if let Some(content_str) = content.as_str() {
if content_str.starts_with("Tool result:") {
// Found a tool result, now check the preceding message
// to verify it was a final_output tool call
if i > 0 {
let prev_msg = &messages[i - 1];
if let Some(prev_role) = prev_msg.get("role") {
if let Some(prev_role_str) = prev_role.as_str() {
if prev_role_str == "assistant" || prev_role_str == "Assistant" {
if let Some(prev_content) = prev_msg.get("content") {
if let Some(prev_content_str) = prev_content.as_str() {
// Check if the previous assistant message contains a final_output tool call
if prev_content_str.contains("\"tool\": \"final_output\"") {
// This is a final_output tool result
let feedback = if content_str.starts_with("Tool result: ") {
content_str.strip_prefix("Tool result: ")
.unwrap_or(content_str)
.to_string()
} else {
content_str.to_string()
};
output.print(&format!(
"✅ Extracted coach feedback from session: {}",
"Coach feedback extracted: {} characters (from {} total)",
feedback.len(),
content_str.len()
));
output.print(&format!("Coach feedback:\n{}", feedback));
output.print(&format!(
"✅ Extracted coach feedback from session: {} (verified final_output tool)",
session_id
));
return Ok(content_str.to_string());
return Ok(feedback);
} else {
output.print(&format!(
"⚠️ Skipping tool result at index {} - not a final_output tool call",
i
));
}
}
}
}
}
}
}
}
}
}
}
}
}
}
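
The nesting above is hard to follow in diff form. Here is a flattened sketch of the same backward scan (the free function and names are illustrative, not the crate's API; it assumes the messages come from serde_json):

use serde_json::Value;

/// Walk the conversation backwards and return the tool result that
/// immediately follows a final_output tool call, with its prefix stripped.
fn find_final_output_feedback(messages: &[Value]) -> Option<String> {
    for i in (1..messages.len()).rev() {
        let role = messages[i]["role"].as_str().unwrap_or("");
        let content = messages[i]["content"].as_str().unwrap_or("");
        // Only user messages carrying a tool result are candidates.
        if !role.eq_ignore_ascii_case("user") || !content.starts_with("Tool result:") {
            continue;
        }
        // The preceding assistant message must be the final_output call.
        let prev = &messages[i - 1];
        let prev_is_final_output = prev["role"]
            .as_str()
            .map_or(false, |r| r.eq_ignore_ascii_case("assistant"))
            && prev["content"]
                .as_str()
                .map_or(false, |c| c.contains("\"tool\": \"final_output\""));
        if prev_is_final_output {
            let feedback = content.strip_prefix("Tool result: ").unwrap_or(content);
            return Some(feedback.to_string());
        }
        // Otherwise skip this tool result and keep scanning backwards,
        // mirroring the "Skipping tool result" branch above.
    }
    None
}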
@@ -187,7 +238,7 @@ fn extract_coach_feedback_from_logs(
"CRITICAL: Could not extract coach feedback from session: {}\n\
Log file path: {:?}\n\
Log file exists: {}\n\
This indicates the coach did not call any tool or the log is corrupted.\n\
This indicates the coach did not call final_output tool or the log is corrupted.\n\
Coach result response length: {} chars",
session_id,
log_file_path,
@@ -1283,6 +1334,7 @@ async fn run_interactive<W: UiWriter>(
output.print("📖 Control Commands:");
output.print(" /compact - Trigger auto-summarization (compacts conversation history)");
output.print(" /thinnify - Trigger context thinning (replaces large tool results with file references)");
output.print(" /skinnify - Trigger full context thinning (like /thinnify but for entire context, not just first third)");
output.print(
" /readme - Reload README.md and AGENTS.md from disk",
);
@@ -1315,6 +1367,11 @@ async fn run_interactive<W: UiWriter>(
println!("{}", summary);
continue;
}
"/skinnify" => {
let summary = agent.force_thin_all();
println!("{}", summary);
continue;
}
"/readme" => {
output.print("📚 Reloading README.md and AGENTS.md...");
match agent.reload_readme() {
@@ -1524,6 +1581,12 @@ async fn run_interactive_machine(
println!("{}", summary);
continue;
}
"/skinnify" => {
println!("COMMAND: skinnify");
let summary = agent.force_thin_all();
println!("{}", summary);
continue;
}
"/readme" => {
println!("COMMAND: readme");
match agent.reload_readme() {
@@ -1546,7 +1609,7 @@ async fn run_interactive_machine(
}
"/help" => {
println!("COMMAND: help");
println!("AVAILABLE_COMMANDS: /compact /thinnify /readme /stats /help");
println!("AVAILABLE_COMMANDS: /compact /thinnify /skinnify /readme /stats /help");
continue;
}
_ => {

View File

@@ -105,4 +105,9 @@ impl UiWriter for MachineUiWriter {
// Default to first option (index 0) for automation
0
}
fn print_final_output(&self, summary: &str) {
println!("FINAL_OUTPUT:");
println!("{}", summary);
}
}

View File

@@ -1,78 +1,22 @@
use g3_core::ui_writer::UiWriter;
use std::io::{self, Write};
use std::sync::Mutex;
use termimad::MadSkin;
/// Console implementation of UiWriter that prints to stdout
pub struct ConsoleUiWriter {
current_tool_name: Mutex<Option<String>>,
current_tool_args: Mutex<Vec<(String, String)>>,
current_output_line: Mutex<Option<String>>,
output_line_printed: Mutex<bool>,
in_todo_tool: Mutex<bool>,
current_tool_name: std::sync::Mutex<Option<String>>,
current_tool_args: std::sync::Mutex<Vec<(String, String)>>,
current_output_line: std::sync::Mutex<Option<String>>,
output_line_printed: std::sync::Mutex<bool>,
}
impl ConsoleUiWriter {
pub fn new() -> Self {
Self {
current_tool_name: Mutex::new(None),
current_tool_args: Mutex::new(Vec::new()),
current_output_line: Mutex::new(None),
output_line_printed: Mutex::new(false),
in_todo_tool: Mutex::new(false),
}
}
fn print_todo_line(&self, line: &str) {
// Transform and print todo list lines elegantly
let trimmed = line.trim();
// Skip the "📝 TODO list:" prefix line
if trimmed.starts_with("📝 TODO list:") || trimmed == "📝 TODO list is empty" {
return;
}
// Handle empty lines
if trimmed.is_empty() {
println!();
return;
}
// Detect indentation level
let indent_count = line.chars().take_while(|c| c.is_whitespace()).count();
let indent = " ".repeat(indent_count / 2); // Convert spaces to visual indent
// Format based on line type
if trimmed.starts_with("- [ ]") {
// Incomplete task
let task = trimmed.strip_prefix("- [ ]").unwrap_or(trimmed).trim();
println!("{}{}", indent, task);
} else if trimmed.starts_with("- [x]") || trimmed.starts_with("- [X]") {
// Completed task
let task = trimmed
.strip_prefix("- [x]")
.or_else(|| trimmed.strip_prefix("- [X]"))
.unwrap_or(trimmed)
.trim();
println!("{}\x1b[2m☑ {}\x1b[0m", indent, task);
} else if trimmed.starts_with("- ") {
// Regular bullet point
let item = trimmed.strip_prefix("- ").unwrap_or(trimmed).trim();
println!("{}{}", indent, item);
} else if trimmed.starts_with("# ") {
// Heading
let heading = trimmed.strip_prefix("# ").unwrap_or(trimmed).trim();
println!("\n\x1b[1m{}\x1b[0m", heading);
} else if trimmed.starts_with("## ") {
// Subheading
let subheading = trimmed.strip_prefix("## ").unwrap_or(trimmed).trim();
println!("\n\x1b[1m{}\x1b[0m", subheading);
} else if trimmed.starts_with("**") && trimmed.ends_with("**") {
// Bold text (section marker)
let text = trimmed.trim_start_matches("**").trim_end_matches("**");
println!("{}\x1b[1m{}\x1b[0m", indent, text);
} else {
// Regular text or note
println!("{}{}", indent, trimmed);
current_tool_name: std::sync::Mutex::new(None),
current_tool_args: std::sync::Mutex::new(Vec::new()),
current_output_line: std::sync::Mutex::new(None),
output_line_printed: std::sync::Mutex::new(false),
}
}
}
@@ -138,13 +82,6 @@ impl UiWriter for ConsoleUiWriter {
// Store the tool name and clear args for collection
*self.current_tool_name.lock().unwrap() = Some(tool_name.to_string());
self.current_tool_args.lock().unwrap().clear();
// Check if this is a todo tool call
let is_todo = tool_name == "todo_read" || tool_name == "todo_write";
*self.in_todo_tool.lock().unwrap() = is_todo;
// For todo tools, we'll skip the normal header and print a custom one later
if is_todo {}
}
fn print_tool_arg(&self, key: &str, value: &str) {
@@ -167,13 +104,10 @@ impl UiWriter for ConsoleUiWriter {
}
fn print_tool_output_header(&self) {
// Skip normal header for todo tools
if *self.in_todo_tool.lock().unwrap() {
println!(); // Just add a newline
return;
}
println!();
// Reset output_line_printed at the start of a new tool output
// This ensures the header isn't cleared by update_tool_output_line
*self.output_line_printed.lock().unwrap() = false;
// Now print the tool header with the most important arg in bold green
if let Some(tool_name) = self.current_tool_name.lock().unwrap().as_ref() {
let args = self.current_tool_args.lock().unwrap();
@@ -259,21 +193,14 @@ impl UiWriter for ConsoleUiWriter {
}
fn print_tool_output_line(&self, line: &str) {
// Special handling for todo tools
if *self.in_todo_tool.lock().unwrap() {
self.print_todo_line(line);
// Skip the TODO list header line
if line.starts_with("📝 TODO list:") {
return;
}
println!("\x1b[2m{}\x1b[0m", line);
}
fn print_tool_output_summary(&self, count: usize) {
// Skip for todo tools
if *self.in_todo_tool.lock().unwrap() {
return;
}
println!(
"\x1b[2m({} line{})\x1b[0m",
count,
@@ -282,13 +209,6 @@ impl UiWriter for ConsoleUiWriter {
}
fn print_tool_timing(&self, duration_str: &str) {
// For todo tools, just print a simple completion message
if *self.in_todo_tool.lock().unwrap() {
println!();
*self.in_todo_tool.lock().unwrap() = false;
return;
}
// Parse the duration string to determine color
// Format is like "1.5s", "500ms", "2m 30.0s"
let color_code = if duration_str.ends_with("ms") {
@@ -390,4 +310,44 @@ impl UiWriter for ConsoleUiWriter {
let _ = io::stdout().flush();
}
}
fn print_final_output(&self, summary: &str) {
// Show spinner while "formatting"
let spinner_frames = ['⠋', '⠙', '⠹', '⠸', '⠼', '⠴', '⠦', '⠧', '⠇', '⠏'];
let message = "summarizing work done...";
// Brief spinner animation (about 0.5 seconds)
for i in 0..5 {
let frame = spinner_frames[i % spinner_frames.len()];
print!("\r\x1b[36m{} {}\x1b[0m", frame, message);
let _ = io::stdout().flush();
std::thread::sleep(std::time::Duration::from_millis(100));
}
// Clear the spinner line
print!("\r\x1b[2K");
let _ = io::stdout().flush();
// Create a styled markdown skin
let mut skin = MadSkin::default();
// Customize colors for better terminal appearance
skin.bold.set_fg(termimad::crossterm::style::Color::Green);
skin.italic.set_fg(termimad::crossterm::style::Color::Cyan);
skin.headers[0].set_fg(termimad::crossterm::style::Color::Magenta);
skin.headers[1].set_fg(termimad::crossterm::style::Color::Magenta);
skin.code_block.set_fg(termimad::crossterm::style::Color::Yellow);
skin.inline_code.set_fg(termimad::crossterm::style::Color::Yellow);
// Print a header separator
println!("\x1b[1;35m━━━ Summary ━━━\x1b[0m");
println!();
// Render the markdown
let rendered = skin.term_text(summary);
print!("{}", rendered);
// Print a footer separator
println!();
println!("\x1b[1;35m━━━━━━━━━━━━━━━\x1b[0m");
}
}

View File

@@ -0,0 +1,336 @@
use serde_json::json;
use std::fs;
use tempfile::TempDir;
#[test]
fn test_extract_coach_feedback_with_timing_message() {
// Create a temporary directory for logs
let temp_dir = TempDir::new().unwrap();
let logs_dir = temp_dir.path().join("logs");
fs::create_dir(&logs_dir).unwrap();
// Create a mock session log with the problematic conversation history
// where timing message appears after the tool result
let session_id = "test_session_123";
let log_file_path = logs_dir.join(format!("g3_session_{}.json", session_id));
let log_content = json!({
"session_id": session_id,
"context_window": {
"conversation_history": [
{
"role": "assistant",
"content": "{\"tool\": \"final_output\", \"args\": {\"summary\":\"IMPLEMENTATION_APPROVED\"}}"
},
{
"role": "user",
"content": "Tool result: IMPLEMENTATION_APPROVED"
},
{
"role": "assistant",
"content": "🕝 27.7s | 💭 7.5s"
}
]
}
});
fs::write(&log_file_path, serde_json::to_string_pretty(&log_content).unwrap()).unwrap();
// Now test the extraction logic
let log_content_str = fs::read_to_string(&log_file_path).unwrap();
let log_json: serde_json::Value = serde_json::from_str(&log_content_str).unwrap();
if let Some(context_window) = log_json.get("context_window") {
if let Some(conversation_history) = context_window.get("conversation_history") {
if let Some(messages) = conversation_history.as_array() {
// This is the key logic we're testing - find the last USER message with "Tool result:"
let last_tool_result = messages.iter().rev().find(|msg| {
if let Some(role) = msg.get("role") {
if let Some(role_str) = role.as_str() {
if role_str == "User" || role_str == "user" {
if let Some(content) = msg.get("content") {
if let Some(content_str) = content.as_str() {
return content_str.starts_with("Tool result:");
}
}
}
}
}
false
});
// Verify we found the correct message
assert!(last_tool_result.is_some(), "Should find the tool result message");
if let Some(last_message) = last_tool_result {
if let Some(content) = last_message.get("content") {
if let Some(content_str) = content.as_str() {
let feedback = if content_str.starts_with("Tool result: ") {
content_str.strip_prefix("Tool result: ").unwrap_or(content_str)
} else {
content_str
};
// Verify we extracted the correct feedback
assert_eq!(feedback, "IMPLEMENTATION_APPROVED", "Should extract the actual feedback, not timing");
// Verify the feedback is NOT the timing message
assert!(!feedback.contains("🕝"), "Feedback should not be the timing message");
println!("✅ Successfully extracted coach feedback: {}", feedback);
return;
}
}
}
}
}
}
panic!("Failed to extract coach feedback");
}
#[test]
fn test_extract_only_final_output_tool_results() {
// Test that we only extract tool results from final_output, not from other tools
let temp_dir = TempDir::new().unwrap();
let logs_dir = temp_dir.path().join("logs");
fs::create_dir(&logs_dir).unwrap();
let session_id = "test_session_final_output_only";
let log_file_path = logs_dir.join(format!("g3_session_{}.json", session_id));
let log_content = json!({
"session_id": session_id,
"context_window": {
"conversation_history": [
{
"role": "assistant",
"content": "{\"tool\": \"shell\", \"args\": {\"command\":\"ls\"}}"
},
{
"role": "user",
"content": "Tool result: file1.txt\nfile2.txt"
},
{
"role": "assistant",
"content": "{\"tool\": \"read_file\", \"args\": {\"file_path\":\"test.txt\"}}"
},
{
"role": "user",
"content": "Tool result: This is test content"
},
{
"role": "assistant",
"content": "{\"tool\": \"final_output\", \"args\": {\"summary\":\"APPROVED_RESULT\"}}"
},
{
"role": "user",
"content": "Tool result: APPROVED_RESULT"
},
{
"role": "assistant",
"content": "🕝 20.5s | 💭 5.2s"
}
]
}
});
fs::write(&log_file_path, serde_json::to_string_pretty(&log_content).unwrap()).unwrap();
// Test the new extraction logic that verifies the tool is final_output
let log_content_str = fs::read_to_string(&log_file_path).unwrap();
let log_json: serde_json::Value = serde_json::from_str(&log_content_str).unwrap();
if let Some(context_window) = log_json.get("context_window") {
if let Some(conversation_history) = context_window.get("conversation_history") {
if let Some(messages) = conversation_history.as_array() {
// Go backwards through messages to find final_output tool result
for i in (0..messages.len()).rev() {
let msg = &messages[i];
if let Some(role) = msg.get("role") {
if let Some(role_str) = role.as_str() {
if role_str == "User" || role_str == "user" {
if let Some(content) = msg.get("content") {
if let Some(content_str) = content.as_str() {
if content_str.starts_with("Tool result:") {
// Check if preceding message was final_output
if i > 0 {
let prev_msg = &messages[i - 1];
if let Some(prev_content) = prev_msg.get("content") {
if let Some(prev_content_str) = prev_content.as_str() {
if prev_content_str.contains("\"tool\": \"final_output\"") {
let feedback = content_str.strip_prefix("Tool result: ").unwrap_or(content_str);
assert_eq!(feedback, "APPROVED_RESULT", "Should extract only final_output result");
println!("✅ Correctly extracted only final_output tool result: {}", feedback);
return;
}
}
}
}
}
}
}
}
}
}
}
}
}
}
panic!("Failed to extract final_output tool result");
}
#[test]
fn test_extract_coach_feedback_without_timing_message() {
// Create a temporary directory for logs
let temp_dir = TempDir::new().unwrap();
let logs_dir = temp_dir.path().join("logs");
fs::create_dir(&logs_dir).unwrap();
// Test the case where there's no timing message (backward compatibility)
let session_id = "test_session_456";
let log_file_path = logs_dir.join(format!("g3_session_{}.json", session_id));
let log_content = json!({
"session_id": session_id,
"context_window": {
"conversation_history": [
{
"role": "assistant",
"content": "{\"tool\": \"final_output\", \"args\": {\"summary\":\"TEST_FEEDBACK\"}}"
},
{
"role": "user",
"content": "Tool result: TEST_FEEDBACK"
}
]
}
});
fs::write(&log_file_path, serde_json::to_string_pretty(&log_content).unwrap()).unwrap();
// Test extraction
let log_content_str = fs::read_to_string(&log_file_path).unwrap();
let log_json: serde_json::Value = serde_json::from_str(&log_content_str).unwrap();
if let Some(context_window) = log_json.get("context_window") {
if let Some(conversation_history) = context_window.get("conversation_history") {
if let Some(messages) = conversation_history.as_array() {
let last_tool_result = messages.iter().rev().find(|msg| {
if let Some(role) = msg.get("role") {
if let Some(role_str) = role.as_str() {
if role_str == "User" || role_str == "user" {
if let Some(content) = msg.get("content") {
if let Some(content_str) = content.as_str() {
return content_str.starts_with("Tool result:");
}
}
}
}
}
false
});
assert!(last_tool_result.is_some());
if let Some(last_message) = last_tool_result {
if let Some(content) = last_message.get("content") {
if let Some(content_str) = content.as_str() {
let feedback = content_str.strip_prefix("Tool result: ").unwrap_or(content_str);
assert_eq!(feedback, "TEST_FEEDBACK");
println!("✅ Successfully extracted coach feedback without timing: {}", feedback);
return;
}
}
}
}
}
}
panic!("Failed to extract coach feedback");
}
#[test]
fn test_extract_coach_feedback_with_multiple_tool_results() {
// Test that we get the LAST tool result when there are multiple
let temp_dir = TempDir::new().unwrap();
let logs_dir = temp_dir.path().join("logs");
fs::create_dir(&logs_dir).unwrap();
let session_id = "test_session_789";
let log_file_path = logs_dir.join(format!("g3_session_{}.json", session_id));
let log_content = json!({
"session_id": session_id,
"context_window": {
"conversation_history": [
{
"role": "assistant",
"content": "{\"tool\": \"shell\", \"args\": {\"command\":\"ls\"}}"
},
{
"role": "user",
"content": "Tool result: file1.txt\nfile2.txt"
},
{
"role": "assistant",
"content": "{\"tool\": \"final_output\", \"args\": {\"summary\":\"FINAL_RESULT\"}}"
},
{
"role": "user",
"content": "Tool result: FINAL_RESULT"
},
{
"role": "assistant",
"content": "🕝 15.2s | 💭 3.1s"
}
]
}
});
fs::write(&log_file_path, serde_json::to_string_pretty(&log_content).unwrap()).unwrap();
// Test extraction
let log_content_str = fs::read_to_string(&log_file_path).unwrap();
let log_json: serde_json::Value = serde_json::from_str(&log_content_str).unwrap();
if let Some(context_window) = log_json.get("context_window") {
if let Some(conversation_history) = context_window.get("conversation_history") {
if let Some(messages) = conversation_history.as_array() {
let last_tool_result = messages.iter().rev().find(|msg| {
if let Some(role) = msg.get("role") {
if let Some(role_str) = role.as_str() {
if role_str == "User" || role_str == "user" {
if let Some(content) = msg.get("content") {
if let Some(content_str) = content.as_str() {
return content_str.starts_with("Tool result:");
}
}
}
}
}
false
});
assert!(last_tool_result.is_some());
if let Some(last_message) = last_tool_result {
if let Some(content) = last_message.get("content") {
if let Some(content_str) = content.as_str() {
let feedback = content_str.strip_prefix("Tool result: ").unwrap_or(content_str);
// Should get the LAST tool result (final_output), not the first one (shell)
assert_eq!(feedback, "FINAL_RESULT", "Should extract the last tool result");
assert!(!feedback.contains("file1.txt"), "Should not extract earlier tool results");
println!("✅ Successfully extracted last tool result: {}", feedback);
return;
}
}
}
}
}
}
panic!("Failed to extract coach feedback");
}

View File

@@ -42,6 +42,7 @@ pub struct AnthropicConfig {
pub temperature: Option<f32>,
pub cache_config: Option<String>, // "ephemeral", "5minute", "1hour", or None to disable
pub enable_1m_context: Option<bool>, // Enable 1m context window (costs extra)
pub thinking_budget_tokens: Option<u32>, // Budget tokens for extended thinking
}
#[derive(Debug, Clone, Serialize, Deserialize)]

File diff suppressed because it is too large

View File

@@ -65,6 +65,10 @@ pub trait UiWriter: Send + Sync {
/// Prompt the user to choose from a list of options
/// Returns the index of the selected option
fn prompt_user_choice(&self, message: &str, options: &[&str]) -> usize;
/// Print the final output summary with markdown formatting
/// Shows a spinner while formatting, then renders the markdown
fn print_final_output(&self, summary: &str);
}
/// A no-op implementation for when UI output is not needed
@@ -97,4 +101,7 @@ impl UiWriter for NullUiWriter {
fn prompt_user_choice(&self, _message: &str, _options: &[&str]) -> usize {
0
}
fn print_final_output(&self, _summary: &str) {
// No-op for null writer
}
}

View File

@@ -0,0 +1,188 @@
//! Tests for the pre-flight max_tokens validation with thinking.budget_tokens constraint
//!
//! These tests verify that when using Anthropic with extended thinking enabled,
//! the max_tokens calculation properly accounts for the budget_tokens constraint.
use g3_config::Config;
use g3_core::ContextWindow;
/// Helper function to create a minimal config for testing
fn create_test_config_with_thinking(thinking_budget: Option<u32>) -> Config {
let mut config = Config::default();
// Set up Anthropic provider with optional thinking budget
config.providers.anthropic = Some(g3_config::AnthropicConfig {
api_key: "test-key".to_string(),
model: "claude-sonnet-4-5".to_string(),
max_tokens: Some(16000),
temperature: Some(0.1),
cache_config: None,
enable_1m_context: None,
thinking_budget_tokens: thinking_budget,
});
config.providers.default_provider = "anthropic".to_string();
config
}
/// Test that when thinking is disabled, max_tokens passes through unchanged
#[test]
fn test_no_thinking_budget_passes_through() {
let config = create_test_config_with_thinking(None);
// Without thinking budget, any max_tokens should be fine
let proposed_max = 5000;
// The constraint check would return (proposed_max, false)
// since there's no thinking_budget_tokens configured
assert!(config.providers.anthropic.as_ref().unwrap().thinking_budget_tokens.is_none());
}
/// Test that when max_tokens > budget_tokens + buffer, no reduction is needed
#[test]
fn test_sufficient_max_tokens_no_reduction_needed() {
let config = create_test_config_with_thinking(Some(10000));
let budget_tokens = config.providers.anthropic.as_ref().unwrap().thinking_budget_tokens.unwrap();
// minimum_required = budget_tokens + 1024 = 11024
let minimum_required = budget_tokens + 1024;
// If proposed_max >= minimum_required, no reduction is needed
let proposed_max = 15000;
assert!(proposed_max >= minimum_required);
}
/// Test that when max_tokens < budget_tokens + buffer, reduction is needed
#[test]
fn test_insufficient_max_tokens_needs_reduction() {
let config = create_test_config_with_thinking(Some(10000));
let budget_tokens = config.providers.anthropic.as_ref().unwrap().thinking_budget_tokens.unwrap();
// minimum_required = budget_tokens + 1024 = 11024
let minimum_required = budget_tokens + 1024;
// If proposed_max < minimum_required, reduction IS needed
let proposed_max = 5000;
assert!(proposed_max < minimum_required);
}
/// Test the minimum required calculation
#[test]
fn test_minimum_required_calculation() {
// For a budget of 10000, we need at least 11024 tokens
let budget_tokens = 10000u32;
let output_buffer = 1024u32;
let minimum_required = budget_tokens + output_buffer;
assert_eq!(minimum_required, 11024);
// For a larger budget
let budget_tokens = 32000u32;
let minimum_required = budget_tokens + output_buffer;
assert_eq!(minimum_required, 33024);
}
/// Test context window usage calculation for summary max_tokens
#[test]
fn test_context_window_available_tokens() {
let mut context = ContextWindow::new(200000); // 200k context window
// Simulate heavy usage
context.used_tokens = 180000; // 90% used
let model_limit = context.total_tokens;
let current_usage = context.used_tokens;
// 2.5% buffer calculation
let buffer = (model_limit / 40).clamp(1000, 10000);
assert_eq!(buffer, 5000); // 200000/40 = 5000
let available = model_limit
.saturating_sub(current_usage)
.saturating_sub(buffer);
// 200000 - 180000 - 5000 = 15000
assert_eq!(available, 15000);
// Capped at 10000 for summary
let summary_max = available.min(10_000);
assert_eq!(summary_max, 10000);
}
/// Test that when context is nearly full, available tokens may be below thinking budget
#[test]
fn test_context_nearly_full_triggers_reduction() {
let mut context = ContextWindow::new(200000);
// Very heavy usage - 98% used
context.used_tokens = 196000;
let model_limit = context.total_tokens;
let current_usage = context.used_tokens;
let buffer = (model_limit / 40).clamp(1000, 10000); // 5000
let available = model_limit
.saturating_sub(current_usage)
.saturating_sub(buffer);
// 200000 - 196000 - 5000 = -1000 -> saturates to 0
assert_eq!(available, 0);
// With thinking_budget of 10000, this would definitely need reduction
let thinking_budget = 10000u32;
let minimum_required = thinking_budget + 1024;
assert!(available < minimum_required);
}
/// Test the hard-coded fallback value
#[test]
fn test_hardcoded_fallback_value() {
// When all else fails, we use 5000 as the hard-coded max_tokens
let hardcoded_fallback = 5000u32;
// This should be a reasonable value that Anthropic will accept
// even with thinking enabled (though output will be limited)
assert!(hardcoded_fallback > 0);
// Note: With a 10000 thinking budget, 5000 is still below the
// minimum required (11024), but we send it anyway as a "last resort"
// hoping the API might still work for basic operations
}
/// Test provider-specific caps
#[test]
fn test_provider_specific_caps() {
// Anthropic/Databricks: cap at 10000
let anthropic_cap = 10000u32;
let proposed = 15000u32;
assert_eq!(proposed.min(anthropic_cap), 10000);
// Embedded: cap at 3000
let embedded_cap = 3000u32;
let proposed = 5000u32;
assert_eq!(proposed.min(embedded_cap), 3000);
// Default: cap at 5000
let default_cap = 5000u32;
let proposed = 8000u32;
assert_eq!(proposed.min(default_cap), 5000);
}
/// Test that the error message mentions the thinking budget constraint
#[test]
fn test_error_message_content() {
// Verify the warning message format contains useful information
let proposed_max_tokens = 5000u32;
let budget_tokens = 10000u32;
let minimum_required = budget_tokens + 1024;
let warning = format!(
"max_tokens ({}) is below required minimum ({}) for thinking.budget_tokens ({}). Context reduction needed.",
proposed_max_tokens, minimum_required, budget_tokens
);
assert!(warning.contains("5000"));
assert!(warning.contains("11024"));
assert!(warning.contains("10000"));
assert!(warning.contains("Context reduction needed"));
}
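
Taken together, these tests sketch a pre-flight check roughly like the following (a minimal sketch assuming a standalone function; the real validation in g3-core may be shaped differently):

fn validate_max_tokens_for_thinking(
    proposed_max: u32,
    thinking_budget: Option<u32>,
) -> (u32, bool) {
    // Anthropic requires max_tokens > thinking.budget_tokens; 1024 tokens
    // are reserved as a buffer for the actual response content.
    const OUTPUT_BUFFER: u32 = 1024;
    match thinking_budget {
        // No thinking configured: the proposed value passes through unchanged.
        None => (proposed_max, false),
        Some(budget) => {
            let minimum_required = budget + OUTPUT_BUFFER;
            if proposed_max >= minimum_required {
                (proposed_max, false)
            } else {
                // Too small for thinking: signal that context reduction
                // (thinnify/skinnify) or disabling thinking is needed.
                (proposed_max, true)
            }
        }
    }
}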

View File

@@ -0,0 +1,159 @@
//! Tests for reset_with_summary to ensure system prompt is preserved after compaction
use g3_core::ContextWindow;
use g3_providers::{Message, MessageRole};
/// Test that reset_with_summary preserves the original system prompt
#[test]
fn test_reset_with_summary_preserves_system_prompt() {
let mut context = ContextWindow::new(10000);
// Add the system prompt as the first message (simulating agent initialization)
let system_prompt = "You are G3, an AI programming agent...";
context.add_message(Message::new(MessageRole::System, system_prompt.to_string()));
// Add some conversation history
context.add_message(Message::new(MessageRole::User, "Task: Write a function".to_string()));
context.add_message(Message::new(MessageRole::Assistant, "I'll help you write that function.".to_string()));
context.add_message(Message::new(MessageRole::User, "Thanks, now add tests".to_string()));
context.add_message(Message::new(MessageRole::Assistant, "Here are the tests.".to_string()));
// Verify we have 5 messages before reset
assert_eq!(context.conversation_history.len(), 5);
// Reset with summary
let summary = "We discussed writing a function and adding tests.".to_string();
let latest_user_msg = Some("Continue with the implementation".to_string());
context.reset_with_summary(summary, latest_user_msg);
// Verify the first message is still the system prompt
assert!(!context.conversation_history.is_empty(), "Conversation history should not be empty");
let first_message = &context.conversation_history[0];
assert!(
matches!(first_message.role, MessageRole::System),
"First message should be a System message, got {:?}",
first_message.role
);
assert!(
first_message.content.contains("You are G3"),
"First message should contain the system prompt 'You are G3', got: {}",
&first_message.content[..first_message.content.len().min(100)]
);
// Verify the summary was added as a separate system message
let has_summary = context.conversation_history.iter().any(|m| {
matches!(m.role, MessageRole::System) && m.content.contains("Previous conversation summary")
});
assert!(has_summary, "Should have a summary message");
// Verify the latest user message was added
let has_user_msg = context.conversation_history.iter().any(|m| {
matches!(m.role, MessageRole::User) && m.content.contains("Continue with the implementation")
});
assert!(has_user_msg, "Should have the latest user message");
}
/// Test that reset_with_summary preserves README message if present
#[test]
fn test_reset_with_summary_preserves_readme() {
let mut context = ContextWindow::new(10000);
// Add the system prompt as the first message
let system_prompt = "You are G3, an AI programming agent...";
context.add_message(Message::new(MessageRole::System, system_prompt.to_string()));
// Add README as second system message
let readme_content = "# Project README\n\nThis is a test project.";
context.add_message(Message::new(MessageRole::System, readme_content.to_string()));
// Add some conversation history
context.add_message(Message::new(MessageRole::User, "Task: Write a function".to_string()));
context.add_message(Message::new(MessageRole::Assistant, "Done.".to_string()));
// Verify we have 4 messages before reset
assert_eq!(context.conversation_history.len(), 4);
// Reset with summary
let summary = "We wrote a function.".to_string();
context.reset_with_summary(summary, None);
// Verify the first message is still the system prompt
let first_message = &context.conversation_history[0];
assert!(
first_message.content.contains("You are G3"),
"First message should be the system prompt"
);
// Verify the README was preserved as the second message
let second_message = &context.conversation_history[1];
assert!(
matches!(second_message.role, MessageRole::System),
"Second message should be a System message"
);
assert!(
second_message.content.contains("Project README"),
"Second message should be the README"
);
}
/// Test that reset_with_summary works when there's no README
#[test]
fn test_reset_with_summary_without_readme() {
let mut context = ContextWindow::new(10000);
// Add only the system prompt (no README)
let system_prompt = "You are G3, an AI programming agent...";
context.add_message(Message::new(MessageRole::System, system_prompt.to_string()));
// Add conversation without README
context.add_message(Message::new(MessageRole::User, "Hello".to_string()));
context.add_message(Message::new(MessageRole::Assistant, "Hi there!".to_string()));
// Reset with summary
let summary = "Greeted the user.".to_string();
context.reset_with_summary(summary, None);
// Verify the first message is still the system prompt
let first_message = &context.conversation_history[0];
assert!(
first_message.content.contains("You are G3"),
"First message should be the system prompt"
);
// Verify we have system prompt + summary (no README)
// The second message should be the summary, not a README
let second_message = &context.conversation_history[1];
assert!(
second_message.content.contains("Previous conversation summary"),
"Second message should be the summary when no README exists"
);
}
/// Test that reset_with_summary handles Agent Configuration in addition to README
#[test]
fn test_reset_with_summary_preserves_agent_configuration() {
let mut context = ContextWindow::new(10000);
// Add the system prompt as the first message
let system_prompt = "You are G3, an AI programming agent...";
context.add_message(Message::new(MessageRole::System, system_prompt.to_string()));
// Add Agent Configuration as second system message
let agents_content = "# Agent Configuration\n\nSpecial instructions for this project.";
context.add_message(Message::new(MessageRole::System, agents_content.to_string()));
// Add some conversation history
context.add_message(Message::new(MessageRole::User, "Task: Do something".to_string()));
// Reset with summary
let summary = "Did something.".to_string();
context.reset_with_summary(summary, None);
// Verify the Agent Configuration was preserved
let second_message = &context.conversation_history[1];
assert!(
second_message.content.contains("Agent Configuration"),
"Second message should be the Agent Configuration"
);
}
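
For reference, the behavior these four tests pin down can be summarized in a sketch like this (illustrative only; the real reset_with_summary lives on ContextWindow in g3-core and may keep its leading messages differently):

use g3_providers::{Message, MessageRole};

fn reset_with_summary_sketch(
    history: &mut Vec<Message>,
    summary: String,
    latest_user_msg: Option<String>,
) {
    // Keep the leading run of System messages: the system prompt plus any
    // README / Agent Configuration message that follows it.
    let keep = history
        .iter()
        .take_while(|m| matches!(m.role, MessageRole::System))
        .count();
    history.truncate(keep);
    // Append the compaction summary as its own System message.
    history.push(Message::new(
        MessageRole::System,
        format!("Previous conversation summary:\n{}", summary),
    ));
    // Re-append the most recent user message, if one was supplied.
    if let Some(msg) = latest_user_msg {
        history.push(Message::new(MessageRole::User, msg));
    }
}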

View File

@@ -0,0 +1,263 @@
//! Tests for verifying system message loading with README content
//!
//! This test verifies that when a README is present, the system message
//! is correctly loaded and structured in the context window.
use g3_core::ContextWindow;
use g3_providers::{Message, MessageRole};
/// Test that system prompt is always the first message
#[test]
fn test_system_prompt_is_first_message() {
let mut context = ContextWindow::new(10000);
// Simulate agent initialization: system prompt first
let system_prompt = "You are G3, an AI programming agent of the same skill level...";
context.add_message(Message::new(MessageRole::System, system_prompt.to_string()));
// Verify the first message is the system prompt
assert!(!context.conversation_history.is_empty());
let first_message = &context.conversation_history[0];
assert!(
matches!(first_message.role, MessageRole::System),
"First message should be a System message"
);
assert!(
first_message.content.contains("You are G3"),
"First message should contain the system prompt"
);
}
/// Test that README is added as the second system message after the system prompt
#[test]
fn test_readme_is_second_message_after_system_prompt() {
let mut context = ContextWindow::new(10000);
// Simulate agent initialization: system prompt first
let system_prompt = "You are G3, an AI programming agent of the same skill level...";
context.add_message(Message::new(MessageRole::System, system_prompt.to_string()));
// Add README as second system message (simulating what Agent::new_with_readme does)
let readme_content = "📚 Project README (from README.md):\n\n# My Project\n\nThis is a test project.";
context.add_message(Message::new(MessageRole::System, readme_content.to_string()));
// Verify we have 2 messages
assert_eq!(context.conversation_history.len(), 2);
// Verify the first message is the system prompt
let first_message = &context.conversation_history[0];
assert!(
matches!(first_message.role, MessageRole::System),
"First message should be a System message"
);
assert!(
first_message.content.contains("You are G3"),
"First message should contain the system prompt"
);
// Verify the second message is the README
let second_message = &context.conversation_history[1];
assert!(
matches!(second_message.role, MessageRole::System),
"Second message should be a System message"
);
assert!(
second_message.content.contains("Project README"),
"Second message should contain the README content"
);
assert!(
second_message.content.contains("My Project"),
"Second message should contain the actual README content"
);
}
/// Test that system prompt and README are separate messages (not combined)
#[test]
fn test_system_prompt_and_readme_are_separate() {
let mut context = ContextWindow::new(10000);
// Simulate agent initialization
let system_prompt = "You are G3, an AI programming agent...";
context.add_message(Message::new(MessageRole::System, system_prompt.to_string()));
let readme_content = "📚 Project README (from README.md):\n\n# Test Project";
context.add_message(Message::new(MessageRole::System, readme_content.to_string()));
// Verify they are separate messages
assert_eq!(context.conversation_history.len(), 2);
// First message should NOT contain README
let first_message = &context.conversation_history[0];
assert!(
!first_message.content.contains("Project README"),
"System prompt should not contain README content"
);
// Second message should NOT contain system prompt
let second_message = &context.conversation_history[1];
assert!(
!second_message.content.contains("You are G3"),
"README message should not contain system prompt"
);
}
/// Test that TODO is added as third message after system prompt and README
#[test]
fn test_todo_is_third_message_after_readme() {
let mut context = ContextWindow::new(10000);
// Simulate agent initialization order:
// 1. System prompt
let system_prompt = "You are G3, an AI programming agent...";
context.add_message(Message::new(MessageRole::System, system_prompt.to_string()));
// 2. README
let readme_content = "📚 Project README (from README.md):\n\n# Test Project";
context.add_message(Message::new(MessageRole::System, readme_content.to_string()));
// 3. TODO (if present)
let todo_content = "📋 Existing TODO list (from todo.g3.md):\n\n- [ ] Task 1\n- [x] Task 2";
context.add_message(Message::new(MessageRole::System, todo_content.to_string()));
// Verify we have 3 messages
assert_eq!(context.conversation_history.len(), 3);
// Verify order
assert!(
context.conversation_history[0].content.contains("You are G3"),
"First message should be system prompt"
);
assert!(
context.conversation_history[1].content.contains("Project README"),
"Second message should be README"
);
assert!(
context.conversation_history[2].content.contains("TODO list"),
"Third message should be TODO"
);
}
/// Test that AGENTS.md content is combined with README in the same message
#[test]
fn test_agents_and_readme_combined() {
let mut context = ContextWindow::new(10000);
// Simulate agent initialization
let system_prompt = "You are G3, an AI programming agent...";
context.add_message(Message::new(MessageRole::System, system_prompt.to_string()));
// Combined AGENTS.md and README.md content (as done in g3-cli)
let combined_content = "# Agent Configuration\n\nSpecial instructions.\n\n# Project README\n\nProject description.";
context.add_message(Message::new(MessageRole::System, combined_content.to_string()));
// Verify we have 2 messages
assert_eq!(context.conversation_history.len(), 2);
// Verify the second message contains both AGENTS and README
let second_message = &context.conversation_history[1];
assert!(
second_message.content.contains("Agent Configuration"),
"Combined message should contain AGENTS.md content"
);
assert!(
second_message.content.contains("Project README"),
"Combined message should contain README content"
);
}
/// Test that user messages come after system messages
#[test]
fn test_user_messages_after_system_messages() {
let mut context = ContextWindow::new(10000);
// Simulate agent initialization
let system_prompt = "You are G3, an AI programming agent...";
context.add_message(Message::new(MessageRole::System, system_prompt.to_string()));
let readme_content = "📚 Project README (from README.md):\n\n# Test Project";
context.add_message(Message::new(MessageRole::System, readme_content.to_string()));
// Add user message
let user_message = "Please help me with this task.";
context.add_message(Message::new(MessageRole::User, user_message.to_string()));
// Verify order
assert_eq!(context.conversation_history.len(), 3);
assert!(matches!(context.conversation_history[0].role, MessageRole::System));
assert!(matches!(context.conversation_history[1].role, MessageRole::System));
assert!(matches!(context.conversation_history[2].role, MessageRole::User));
}
/// Test that empty README content is not added
#[test]
fn test_empty_readme_not_added() {
let mut context = ContextWindow::new(10000);
// Simulate agent initialization
let system_prompt = "You are G3, an AI programming agent...";
context.add_message(Message::new(MessageRole::System, system_prompt.to_string()));
// Try to add empty README (should be skipped due to empty content check)
let empty_readme = " "; // whitespace only
context.add_message(Message::new(MessageRole::System, empty_readme.to_string()));
// Verify only system prompt was added (empty message should be skipped)
assert_eq!(
context.conversation_history.len(),
1,
"Empty README should not be added to conversation history"
);
}
/// Test the reload_readme detection logic
#[test]
fn test_readme_detection_for_reload() {
let mut context = ContextWindow::new(10000);
// Simulate agent initialization
let system_prompt = "You are G3, an AI programming agent...";
context.add_message(Message::new(MessageRole::System, system_prompt.to_string()));
// Add README with expected markers
let readme_content = "# Project README\n\nThis is the project description.";
context.add_message(Message::new(MessageRole::System, readme_content.to_string()));
// Check if the second message (index 1) is a README
let has_readme = context
.conversation_history
.get(1)
.map(|m| {
matches!(m.role, MessageRole::System)
&& (m.content.contains("Project README")
|| m.content.contains("Agent Configuration"))
})
.unwrap_or(false);
assert!(has_readme, "Should detect README at index 1");
}
/// Test that README detection fails when no README is present
#[test]
fn test_readme_detection_without_readme() {
let mut context = ContextWindow::new(10000);
// Simulate agent initialization without README
let system_prompt = "You are G3, an AI programming agent...";
context.add_message(Message::new(MessageRole::System, system_prompt.to_string()));
// Add a user message directly (no README)
context.add_message(Message::new(MessageRole::User, "Hello".to_string()));
// Check if the second message (index 1) is a README
let has_readme = context
.conversation_history
.get(1)
.map(|m| {
matches!(m.role, MessageRole::System)
&& (m.content.contains("Project README")
|| m.content.contains("Agent Configuration"))
})
.unwrap_or(false);
assert!(!has_readme, "Should not detect README when none exists");
}

View File

@@ -0,0 +1,78 @@
//! Tests for TODO completion detection and file deletion behavior
/// Helper to check if all TODOs are complete (same logic as in lib.rs)
fn all_todos_complete(content: &str) -> bool {
let has_incomplete = content.lines().any(|line| {
let trimmed = line.trim();
trimmed.starts_with("- [ ]")
});
!has_incomplete && (content.contains("- [x]") || content.contains("- [X]"))
}
#[test]
fn test_all_complete_lowercase() {
let content = "# Test\n\n- [x] Done 1\n- [x] Done 2";
assert!(all_todos_complete(content));
}
#[test]
fn test_all_complete_uppercase() {
let content = "# Test\n\n- [X] Done 1\n- [X] Done 2";
assert!(all_todos_complete(content));
}
#[test]
fn test_all_complete_mixed_case() {
let content = "# Test\n\n- [x] Done 1\n- [X] Done 2";
assert!(all_todos_complete(content));
}
#[test]
fn test_has_incomplete() {
let content = "# Test\n\n- [x] Done 1\n- [ ] Not done";
assert!(!all_todos_complete(content));
}
#[test]
fn test_all_incomplete() {
let content = "# Test\n\n- [ ] Not done 1\n- [ ] Not done 2";
assert!(!all_todos_complete(content));
}
#[test]
fn test_no_checkboxes() {
let content = "# Just a header\n\nSome text without checkboxes";
assert!(!all_todos_complete(content));
}
#[test]
fn test_nested_complete() {
let content = "# Test\n\n- [x] Parent\n - [x] Child 1\n - [x] Child 2";
assert!(all_todos_complete(content));
}
#[test]
fn test_nested_incomplete() {
let content = "# Test\n\n- [x] Parent\n - [x] Child 1\n - [ ] Child 2";
assert!(!all_todos_complete(content));
}
#[test]
fn test_indented_incomplete() {
// Indented incomplete items should still be detected
let content = "# Test\n\n- [x] Done\n - [ ] Indented incomplete";
assert!(!all_todos_complete(content));
}
#[test]
fn test_empty_content() {
let content = "";
assert!(!all_todos_complete(content));
}
#[test]
fn test_whitespace_only() {
let content = " \n\n ";
assert!(!all_todos_complete(content));
}
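
The "file deletion behavior" half of the module title is not exercised above; a hypothetical sketch of that step (the path handling and function name are illustrative, not the lib.rs implementation):

use std::fs;
use std::io;
use std::path::Path;

fn delete_todo_file_if_complete(path: &Path) -> io::Result<()> {
    let content = fs::read_to_string(path)?;
    // Once every checkbox is ticked, remove todo.g3.md so the next
    // session starts with a clean slate.
    if all_todos_complete(&content) {
        fs::remove_file(path)?;
    }
    Ok(())
}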

View File

@@ -1,103 +1,170 @@
use g3_core::ContextWindow;
use g3_providers::Usage;
use g3_providers::{Message, MessageRole, Usage};
/// Test that used_tokens is tracked via add_message, not update_usage_from_response.
/// This is critical for the 80% summarization threshold to work correctly.
#[test]
fn test_token_accumulation() {
fn test_used_tokens_tracked_via_messages() {
let mut window = ContextWindow::new(10000);
// First API call: 100 prompt + 50 completion = 150 total
let usage1 = Usage {
// Add a user message - this should update used_tokens
let user_msg = Message::new(MessageRole::User, "Hello, how are you?".to_string());
window.add_message(user_msg);
// used_tokens should be non-zero after adding a message
assert!(window.used_tokens > 0, "used_tokens should increase after add_message");
let tokens_after_user_msg = window.used_tokens;
// Add an assistant message
let assistant_msg = Message::new(MessageRole::Assistant, "I'm doing well, thank you!".to_string());
window.add_message(assistant_msg);
// used_tokens should increase further
assert!(window.used_tokens > tokens_after_user_msg, "used_tokens should increase after adding assistant message");
}
/// Test that update_usage_from_response only updates cumulative_tokens, not used_tokens.
/// This prevents double-counting which was causing the 80% threshold to be reached at 200%+.
#[test]
fn test_update_usage_only_affects_cumulative() {
let mut window = ContextWindow::new(10000);
// Initial state
assert_eq!(window.used_tokens, 0);
assert_eq!(window.cumulative_tokens, 0);
// Simulate API response with usage data
let usage = Usage {
prompt_tokens: 100,
completion_tokens: 50,
total_tokens: 150,
};
window.update_usage_from_response(&usage1);
assert_eq!(window.used_tokens, 150, "First call should have 150 tokens");
assert_eq!(window.cumulative_tokens, 150, "Cumulative should be 150");
window.update_usage_from_response(&usage);
// Second API call: 200 prompt + 75 completion = 275 total
// used_tokens should NOT change - it's tracked via add_message
assert_eq!(window.used_tokens, 0, "used_tokens should not be updated by update_usage_from_response");
// cumulative_tokens SHOULD be updated for API usage tracking
assert_eq!(window.cumulative_tokens, 150, "cumulative_tokens should track total API usage");
// Another API call
let usage2 = Usage {
prompt_tokens: 200,
completion_tokens: 75,
total_tokens: 275,
};
window.update_usage_from_response(&usage2);
assert_eq!(
window.used_tokens, 425,
"Second call should accumulate to 425 tokens"
);
assert_eq!(window.cumulative_tokens, 425, "Cumulative should be 425");
// Third API call with SMALLER token count: 50 prompt + 25 completion = 75 total
let usage3 = Usage {
prompt_tokens: 50,
completion_tokens: 25,
total_tokens: 75,
};
window.update_usage_from_response(&usage3);
assert_eq!(
window.used_tokens, 500,
"Third call should accumulate to 500 tokens"
);
assert_eq!(window.cumulative_tokens, 500, "Cumulative should be 500");
// used_tokens still unchanged
assert_eq!(window.used_tokens, 0, "used_tokens should remain unchanged");
// Verify tokens never decrease
assert!(
window.used_tokens >= 425,
"Token count should never decrease!"
);
// cumulative_tokens accumulates
assert_eq!(window.cumulative_tokens, 425, "cumulative_tokens should accumulate");
}
/// Test that add_streaming_tokens only updates cumulative_tokens.
/// The assistant message will be added via add_message which tracks used_tokens.
#[test]
fn test_add_streaming_tokens() {
fn test_add_streaming_tokens_only_affects_cumulative() {
let mut window = ContextWindow::new(10000);
// Add some streaming tokens
// Add streaming tokens (fallback when no usage data available)
window.add_streaming_tokens(100);
assert_eq!(window.used_tokens, 100);
assert_eq!(window.cumulative_tokens, 100);
// Add more
// used_tokens should NOT change
assert_eq!(window.used_tokens, 0, "used_tokens should not be updated by add_streaming_tokens");
// cumulative_tokens SHOULD be updated
assert_eq!(window.cumulative_tokens, 100, "cumulative_tokens should be updated");
// Add more streaming tokens
window.add_streaming_tokens(50);
assert_eq!(window.used_tokens, 150);
assert_eq!(window.used_tokens, 0);
assert_eq!(window.cumulative_tokens, 150);
// Now update from provider response
let usage = Usage {
prompt_tokens: 80,
completion_tokens: 40,
total_tokens: 120,
};
window.update_usage_from_response(&usage);
// Should ADD to existing, not replace
assert_eq!(window.used_tokens, 270, "Should add 120 to existing 150");
assert_eq!(window.cumulative_tokens, 270);
}
/// Test percentage calculation is based on used_tokens (actual context content).
#[test]
fn test_percentage_calculation() {
fn test_percentage_based_on_used_tokens() {
let mut window = ContextWindow::new(1000);
// Add tokens via provider response
// Initially 0%
assert_eq!(window.percentage_used(), 0.0);
assert_eq!(window.remaining_tokens(), 1000);
// Add messages to increase used_tokens
// A message with ~100 chars should be roughly 25-30 tokens
let msg = Message::new(MessageRole::User, "x".repeat(400)); // ~100 tokens estimated
window.add_message(msg);
// Percentage should be based on used_tokens
let percentage = window.percentage_used();
assert!(percentage > 0.0, "percentage should be > 0 after adding message");
assert!(percentage < 100.0, "percentage should be < 100");
// remaining_tokens should decrease
assert!(window.remaining_tokens() < 1000, "remaining tokens should decrease");
}
/// Test that the 80% summarization threshold works correctly.
/// This was the original bug - used_tokens was being double/triple counted.
#[test]
fn test_should_summarize_threshold() {
let mut window = ContextWindow::new(1000);
// Add messages until we approach 80%
// Each message of ~320 chars is roughly 80 tokens (at 4 chars/token)
for _ in 0..9 {
let msg = Message::new(MessageRole::User, "x".repeat(320));
window.add_message(msg);
}
// Should be around 720 tokens (72%) - not yet at threshold
// Note: actual token count depends on estimation algorithm
let percentage = window.percentage_used();
println!("After 9 messages: {}% used ({} tokens)", percentage, window.used_tokens);
// Add one more message to push over 80%
let msg = Message::new(MessageRole::User, "x".repeat(320));
window.add_message(msg);
let percentage_after = window.percentage_used();
println!("After 10 messages: {}% used ({} tokens)", percentage_after, window.used_tokens);
// Now should_summarize should return true if we're at 80%+
if percentage_after >= 80.0 {
assert!(window.should_summarize(), "should_summarize should be true at 80%+");
}
}
/// Test that cumulative_tokens and used_tokens are independent.
#[test]
fn test_cumulative_vs_used_independence() {
let mut window = ContextWindow::new(10000);
// Add a message (affects used_tokens)
let msg = Message::new(MessageRole::User, "Hello world".to_string());
window.add_message(msg);
let used_after_msg = window.used_tokens;
let cumulative_after_msg = window.cumulative_tokens;
// Both should be equal at this point (message adds to both)
assert_eq!(used_after_msg, cumulative_after_msg);
// Now simulate API response (only affects cumulative_tokens)
let usage = Usage {
prompt_tokens: 150,
completion_tokens: 100,
total_tokens: 250,
prompt_tokens: 500,
completion_tokens: 200,
total_tokens: 700,
};
window.update_usage_from_response(&usage);
assert_eq!(window.percentage_used(), 25.0);
assert_eq!(window.remaining_tokens(), 750);
// used_tokens unchanged
assert_eq!(window.used_tokens, used_after_msg, "used_tokens should not change from API response");
// Add more tokens
let usage2 = Usage {
prompt_tokens: 300,
completion_tokens: 200,
total_tokens: 500,
};
window.update_usage_from_response(&usage2);
// cumulative_tokens increased
assert_eq!(window.cumulative_tokens, cumulative_after_msg + 700, "cumulative_tokens should increase");
assert_eq!(window.percentage_used(), 75.0);
assert_eq!(window.remaining_tokens(), 250);
// They should now be different
assert!(window.cumulative_tokens > window.used_tokens, "cumulative should be greater than used");
}
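
These tests lean on a character-based token estimate (~4 characters per token, per the inline comments). A minimal sketch of such an estimator (the actual one in g3-core may differ):

fn estimate_tokens(text: &str) -> usize {
    // Rough heuristic: about 4 characters per token, rounded up.
    text.len().div_ceil(4)
}

Under this heuristic a 320-character message costs 80 tokens, which is why ten such messages push a 1000-token window to the 80% threshold in test_should_summarize_threshold.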

View File

@@ -81,6 +81,9 @@ impl UiWriter for MockUiWriter {
.push(format!("CHOICE: {} Options: {:?}", message, options));
self.choice_responses.lock().unwrap().pop().unwrap_or(0)
}
fn print_final_output(&self, summary: &str) {
self.output.lock().unwrap().push(format!("FINAL: {}", summary));
}
}
#[tokio::test]

View File

@@ -85,6 +85,7 @@ pub async fn get_initial_discovery_messages(
temperature: Some(provider.temperature()),
stream: false,
tools: None,
disable_thinking: false,
};
status("🤖 Calling LLM for discovery commands...");

View File

@@ -26,6 +26,7 @@
//! Some(0.1),
//! None, // cache_config
//! None, // enable_1m_context
//! None, // thinking_budget_tokens
//! )?;
//!
//! // Create a completion request
@@ -38,6 +39,7 @@
//! temperature: Some(0.7),
//! stream: false,
//! tools: None,
//! disable_thinking: false,
//! };
//!
//! // Get a completion
@@ -63,6 +65,7 @@
//! None,
//! None, // cache_config
//! None, // enable_1m_context
//! None, // thinking_budget_tokens
//! )?;
//!
//! let request = CompletionRequest {
@@ -73,6 +76,7 @@
//! temperature: Some(0.7),
//! stream: true,
//! tools: None,
//! disable_thinking: false,
//! };
//!
//! let mut stream = provider.stream(request).await?;
@@ -103,7 +107,7 @@ use serde::{Deserialize, Serialize};
use std::time::Duration;
use tokio::sync::mpsc;
use tokio_stream::wrappers::ReceiverStream;
use tracing::{debug, error};
use crate::{
CompletionChunk, CompletionRequest, CompletionResponse, CompletionStream, LLMProvider, Message,
@@ -122,6 +126,7 @@ pub struct AnthropicProvider {
temperature: f32,
cache_config: Option<String>,
enable_1m_context: bool,
thinking_budget_tokens: Option<u32>,
}
impl AnthropicProvider {
@@ -132,6 +137,7 @@ impl AnthropicProvider {
temperature: Option<f32>,
cache_config: Option<String>,
enable_1m_context: Option<bool>,
thinking_budget_tokens: Option<u32>,
) -> Result<Self> {
let client = Client::builder()
.timeout(Duration::from_secs(300))
@@ -150,6 +156,7 @@ impl AnthropicProvider {
temperature: temperature.unwrap_or(0.1),
cache_config,
enable_1m_context: enable_1m_context.unwrap_or(false),
thinking_budget_tokens,
})
}
@@ -223,11 +230,13 @@ impl AnthropicProvider {
for message in messages {
match message.role {
MessageRole::System => {
if let Some(existing) = system_message {
// Concatenate system messages instead of replacing
system_message = Some(format!("{}\n\n{}", existing, message.content));
} else {
system_message = Some(message.content.clone());
}
}
MessageRole::User => {
anthropic_messages.push(AnthropicMessage {
role: "user".to_string(),
@@ -265,6 +274,7 @@ impl AnthropicProvider {
streaming: bool,
max_tokens: u32,
temperature: f32,
disable_thinking: bool,
) -> Result<AnthropicRequest> {
let (system, anthropic_messages) = self.convert_messages(messages)?;
@@ -277,6 +287,33 @@ impl AnthropicProvider {
// Convert tools if provided
let anthropic_tools = tools.map(|t| self.convert_tools(t));
// Add thinking configuration if budget_tokens is set AND max_tokens is sufficient AND not explicitly disabled
// Anthropic requires: max_tokens > thinking.budget_tokens
// We add 1024 as minimum buffer for actual response content
tracing::debug!("create_request_body called: max_tokens={}, disable_thinking={}, thinking_budget_tokens={:?}", max_tokens, disable_thinking, self.thinking_budget_tokens);
let thinking = if disable_thinking {
tracing::info!(
"Thinking mode explicitly disabled for this request (max_tokens={})",
max_tokens
);
None
} else {
self.thinking_budget_tokens.and_then(|budget| {
let min_required = budget + 1024;
if max_tokens > min_required {
Some(ThinkingConfig::enabled(budget))
} else {
tracing::warn!(
"Disabling thinking mode: max_tokens ({}) is not greater than thinking.budget_tokens ({}) + 1024 buffer. \
Required: max_tokens > {}",
max_tokens, budget, min_required
);
None
}
})
};
let request = AnthropicRequest {
model: self.model.clone(),
max_tokens,
@@ -285,6 +322,7 @@ impl AnthropicProvider {
system,
tools: anthropic_tools,
stream: streaming,
thinking,
};
// Ensure the conversation starts with a user message
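// (Not shown in this hunk; a typical fixup, described here as an assumption:
// if the first converted message is not a "user" turn, a minimal user message
// is prepended so the Anthropic API accepts the conversation.)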
@@ -624,6 +662,7 @@ impl LLMProvider for AnthropicProvider {
false,
max_tokens,
temperature,
request.disable_thinking,
)?;
debug!(
@@ -697,6 +736,7 @@ impl LLMProvider for AnthropicProvider {
true,
max_tokens,
temperature,
request.disable_thinking,
)?;
debug!(
@@ -775,6 +815,19 @@ impl LLMProvider for AnthropicProvider {
// Anthropic API request/response structures
#[derive(Debug, Serialize)]
struct ThinkingConfig {
#[serde(rename = "type")]
thinking_type: String,
budget_tokens: u32,
}
impl ThinkingConfig {
fn enabled(budget_tokens: u32) -> Self {
Self { thinking_type: "enabled".to_string(), budget_tokens }
}
}
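// Illustrative check of the wire shape produced by the struct above; the
// gating numbers mirror create_request_body:
//
//     let cfg = ThinkingConfig::enabled(10_000);
//     assert_eq!(
//         serde_json::to_string(&cfg).unwrap(),
//         r#"{"type":"enabled","budget_tokens":10000}"#
//     );
//     // A call with max_tokens = 5_000 drops it: 5_000 <= 10_000 + 1024.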
#[derive(Debug, Serialize)]
struct AnthropicRequest {
model: String,
@@ -786,6 +839,8 @@ struct AnthropicRequest {
#[serde(skip_serializing_if = "Option::is_none")]
tools: Option<Vec<AnthropicTool>>,
stream: bool,
#[serde(skip_serializing_if = "Option::is_none")]
thinking: Option<ThinkingConfig>,
}
#[derive(Debug, Serialize)]
@@ -819,6 +874,12 @@ enum AnthropicContent {
#[serde(skip_serializing_if = "Option::is_none")]
cache_control: Option<crate::CacheControl>,
},
#[serde(rename = "thinking")]
Thinking {
thinking: String,
#[serde(default)]
signature: Option<String>,
},
#[serde(rename = "tool_use")]
ToolUse {
id: String,
@@ -884,7 +945,7 @@ mod tests {
#[test]
fn test_message_conversion() {
let provider =
AnthropicProvider::new("test-key".to_string(), None, None, None, None, None).unwrap();
AnthropicProvider::new("test-key".to_string(), None, None, None, None, None, None).unwrap();
let messages = vec![
Message::new(
@@ -912,13 +973,14 @@ mod tests {
Some(0.5),
None,
None,
None,
)
.unwrap();
let messages = vec![Message::new(MessageRole::User, "Test message".to_string())];
let request_body = provider
.create_request_body(&messages, None, false, 1000, 0.5, false)
.unwrap();
assert_eq!(request_body.model, "claude-3-haiku-20240307");
@@ -932,7 +994,7 @@ mod tests {
#[test]
fn test_tool_conversion() {
let provider =
AnthropicProvider::new("test-key".to_string(), None, None, None, None, None).unwrap();
AnthropicProvider::new("test-key".to_string(), None, None, None, None, None, None).unwrap();
let tools = vec![Tool {
name: "get_weather".to_string(),
@@ -965,7 +1027,7 @@ mod tests {
#[test]
fn test_cache_control_serialization() {
let provider =
AnthropicProvider::new("test-key".to_string(), None, None, None, None, None).unwrap();
AnthropicProvider::new("test-key".to_string(), None, None, None, None, None, None).unwrap();
// Test message WITHOUT cache_control
let messages_without = vec![Message::new(MessageRole::User, "Hello".to_string())];
@@ -1007,4 +1069,114 @@ mod tests {
"JSON should not contain 'cache_control' field or null values when not configured"
);
}
#[test]
fn test_thinking_parameter_serialization() {
// Test WITHOUT thinking parameter
let provider_without = AnthropicProvider::new(
"test-key".to_string(),
Some("claude-sonnet-4-5".to_string()),
Some(1000),
Some(0.5),
None,
None,
None, // No thinking budget
)
.unwrap();
let messages = vec![Message::new(MessageRole::User, "Test message".to_string())];
let request_without = provider_without
.create_request_body(&messages, None, false, 1000, 0.5, false)
.unwrap();
let json_without = serde_json::to_string(&request_without).unwrap();
assert!(!json_without.contains("thinking"), "JSON should not contain 'thinking' field when not configured");
// Test WITH thinking parameter - max_tokens must be > budget_tokens + 1024
// Using budget=10000 requires max_tokens > 11024
let provider_with = AnthropicProvider::new(
"test-key".to_string(),
Some("claude-sonnet-4-5".to_string()),
Some(20000), // Sufficient for thinking budget
Some(0.5),
None,
None,
Some(10000), // With thinking budget
)
.unwrap();
let request_with = provider_with
.create_request_body(&messages, None, false, 20000, 0.5, false)
.unwrap();
let json_with = serde_json::to_string(&request_with).unwrap();
assert!(json_with.contains("thinking"), "JSON should contain 'thinking' field when configured");
assert!(json_with.contains("\"type\":\"enabled\""), "JSON should contain type: enabled");
assert!(json_with.contains("\"budget_tokens\":10000"), "JSON should contain budget_tokens: 10000");
// Test WITH thinking parameter but INSUFFICIENT max_tokens - thinking should be disabled
let request_insufficient = provider_with
.create_request_body(&messages, None, false, 5000, 0.5, false) // Less than budget + 1024
.unwrap();
let json_insufficient = serde_json::to_string(&request_insufficient).unwrap();
assert!(!json_insufficient.contains("thinking"), "JSON should NOT contain 'thinking' field when max_tokens is insufficient");
}
#[test]
fn test_disable_thinking_flag() {
// Test that disable_thinking=true prevents thinking even with sufficient max_tokens
let provider = AnthropicProvider::new(
"test-key".to_string(),
Some("claude-sonnet-4-5".to_string()),
Some(20000),
Some(0.5),
None,
None,
Some(10000), // With thinking budget
)
.unwrap();
let messages = vec![Message::new(MessageRole::User, "Test message".to_string())];
// With disable_thinking=false, thinking should be enabled (max_tokens is sufficient)
let request_with_thinking = provider
.create_request_body(&messages, None, false, 20000, 0.5, false)
.unwrap();
let json_with = serde_json::to_string(&request_with_thinking).unwrap();
assert!(json_with.contains("thinking"), "JSON should contain 'thinking' field when not disabled");
// With disable_thinking=true, thinking should be disabled even with sufficient max_tokens
let request_without_thinking = provider
.create_request_body(&messages, None, false, 20000, 0.5, true)
.unwrap();
let json_without = serde_json::to_string(&request_without_thinking).unwrap();
assert!(!json_without.contains("thinking"), "JSON should NOT contain 'thinking' field when explicitly disabled");
}
#[test]
fn test_thinking_content_block_deserialization() {
// Test that we can deserialize a response containing a "thinking" content block
// This is what Anthropic returns when extended thinking is enabled
let json_response = r#"{
"content": [
{"type": "thinking", "thinking": "Let me analyze this...", "signature": "abc123"},
{"type": "text", "text": "Here is my response."}
],
"model": "claude-sonnet-4-5",
"usage": {"input_tokens": 100, "output_tokens": 50}
}"#;
let response: AnthropicResponse = serde_json::from_str(json_response)
.expect("Should be able to deserialize response with thinking block");
assert_eq!(response.content.len(), 2);
assert_eq!(response.model, "claude-sonnet-4-5");
// Extract only text content (thinking should be filtered out)
let text_content: Vec<_> = response.content.iter().filter_map(|c| match c {
AnthropicContent::Text { text, .. } => Some(text.as_str()),
_ => None,
}).collect();
assert_eq!(text_content.len(), 1);
assert_eq!(text_content[0], "Here is my response.");
}
}

View File

@@ -45,6 +45,7 @@
//! temperature: Some(0.7),
//! stream: false,
//! tools: None,
//! disable_thinking: false,
//! };
//!
//! // Get a completion

View File

@@ -42,6 +42,8 @@ pub struct CompletionRequest {
pub temperature: Option<f32>,
pub stream: bool,
pub tools: Option<Vec<Tool>>,
/// Force disable thinking mode for this request (used when max_tokens is too low)
pub disable_thinking: bool,
}
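// A sketch of the intended call-site pattern (helper name is illustrative,
// not part of this diff): when the computed max_tokens cannot cover the
// thinking budget plus the 1024-token response buffer, flip the flag rather
// than let the API reject the request.
//
//     fn guard_thinking(req: &mut CompletionRequest, budget: Option<u32>, max_tokens: u32) {
//         if let Some(b) = budget {
//             if max_tokens <= b + 1024 {
//                 req.disable_thinking = true;
//             }
//         }
//     }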
#[derive(Debug, Clone, Serialize, Deserialize)]