Merge branch 'dhanji/fix-auto-continue': Fix auto-continue and duplicate detection bugs
@@ -267,7 +267,7 @@ use std::path::Path;
use std::path::PathBuf;
use std::process::exit;
use tokio_util::sync::CancellationToken;
-use tracing::{error, info};
+use tracing::{debug, error};

use g3_core::error_handling::{classify_error, ErrorType, RecoverableError};
mod simple_output;

@@ -2693,7 +2693,7 @@ Remember: Be clear in your review and concise in your feedback. APPROVE iff the
extract_coach_feedback_from_logs(&coach_result, &coach_agent, &output)?;

// Log the size of the feedback for debugging
-info!(
+debug!(
"Coach feedback extracted: {} characters (from {} total)",
coach_feedback_text.len(),
coach_result.response.len()
@@ -68,6 +68,18 @@ fn main() {
dylib_dst.display()
);

+// Re-sign the dylib with ad-hoc signature to fix code signing issues on Apple Silicon
+// This is necessary because incremental compilation can invalidate signatures
+let codesign_status = Command::new("codesign")
+.args(&["-f", "-s", "-", dylib_dst.to_str().unwrap()])
+.status();
+
+if let Ok(status) = codesign_status {
+if !status.success() {
+println!("cargo:warning=Failed to codesign libVisionBridge.dylib (non-fatal)");
+}
+}
+
// Add rpath so the dylib can be found at runtime
println!("cargo:rustc-link-arg=-Wl,-rpath,@executable_path");
println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path");
@@ -24,7 +24,7 @@ impl MacOSController {
pub fn new() -> Result<Self> {
let ocr = Box::new(DefaultOCR::new()?);
let ocr_name = ocr.name().to_string();
-tracing::info!("Initialized macOS controller with OCR engine: {}", ocr_name);
+tracing::debug!("Initialized macOS controller with OCR engine: {}", ocr_name);
Ok(Self {
ocr_engine: ocr,
ocr_name,

@@ -155,7 +155,7 @@ impl ComputerController for MacOSController {
// 1. At layer 0 (normal windows, not menu bar)
// 2. Have real bounds (width and height >= 100)
if layer == 0 && has_real_bounds {
-tracing::info!("Found valid window: ID {} for app '{}' (layer={}, bounds valid)", id, owner, layer);
+tracing::debug!("Found valid window: ID {} for app '{}' (layer={}, bounds valid)", id, owner, layer);
found_window_id = Some((id as u32, owner.clone()));
break;
} else {

@@ -178,7 +178,7 @@ impl ComputerController for MacOSController {
let (cg_window_id, matched_owner) = cg_window_id.ok_or_else(|| {
anyhow::anyhow!("Could not find window for application '{}'. Use list_windows to see available windows.", app_name)
})?;
-tracing::info!(
+tracing::debug!(
"Taking screenshot of window ID {} for app '{}'",
cg_window_id,
matched_owner

@@ -468,7 +468,7 @@ impl MacOSController {

// Only accept windows with real bounds (>= 100x100 pixels)
if w >= 100 && h >= 100 {
-tracing::info!("Found valid window bounds for '{}': x={}, y={}, w={}, h={} (layer={})", owner, x, y, w, h, layer);
+tracing::debug!("Found valid window bounds for '{}': x={}, y={}, w={}, h={} (layer={})", owner, x, y, w, h, layer);
return Ok((x, y, w, h));
} else {
tracing::debug!(
@@ -3,7 +3,7 @@ use crate::process::ProcessController;
use axum::{extract::State, http::StatusCode, Json};
use std::sync::Arc;
use tokio::sync::Mutex;
-use tracing::{error, info};
+use tracing::{debug, error};

pub type ControllerState = Arc<Mutex<ProcessController>>;

@@ -22,7 +22,7 @@ pub async fn kill_instance(

match controller.kill_process(pid) {
Ok(_) => {
-info!("Successfully killed process {}", pid);
+debug!("Successfully killed process {}", pid);
Ok(Json(serde_json::json!({
"status": "terminating"
})))

@@ -38,7 +38,7 @@ pub async fn restart_instance(
State(controller): State<ControllerState>,
axum::extract::Path(id): axum::extract::Path<String>,
) -> Result<Json<LaunchResponse>, StatusCode> {
-info!("Restarting instance: {}", id);
+debug!("Restarting instance: {}", id);

// Extract PID from instance ID (format: pid_timestamp)
let pid: u32 = id

@@ -81,7 +81,7 @@ pub async fn launch_instance(
State(controller): State<ControllerState>,
Json(request): Json<LaunchRequest>,
) -> Result<Json<LaunchResponse>, (StatusCode, Json<serde_json::Value>)> {
-info!("Launching new g3 instance: {:?}", request);
+debug!("Launching new g3 instance: {:?}", request);

// Validate binary path if provided
if let Some(ref binary_path) = request.g3_binary_path {

@@ -149,7 +149,7 @@ pub async fn launch_instance(
) {
Ok(pid) => {
let id = format!("{}_{}", pid, chrono::Utc::now().timestamp());
-info!("Successfully launched g3 instance with PID {}", pid);
+debug!("Successfully launched g3 instance with PID {}", pid);
Ok(Json(LaunchResponse {
id,
status: "starting".to_string(),
@@ -3,7 +3,7 @@ use axum::{http::StatusCode, Json};
use serde::{Deserialize, Serialize};
use std::os::unix::fs::PermissionsExt;
use std::path::PathBuf;
-use tracing::{error, info};
+use tracing::{debug, error};

pub async fn get_state() -> Result<Json<ConsoleState>, StatusCode> {
let state = ConsoleState::load();

@@ -15,7 +15,7 @@ pub async fn save_state(
) -> Result<Json<serde_json::Value>, StatusCode> {
match state.save() {
Ok(_) => {
-info!("Console state saved successfully");
+debug!("Console state saved successfully");
Ok(Json(serde_json::json!({
"status": "saved"
})))
@@ -1,7 +1,7 @@
use serde::{Deserialize, Serialize};
use std::fs;
use std::path::PathBuf;
-use tracing::info;
+use tracing::debug;

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ConsoleState {

@@ -42,7 +42,7 @@ impl ConsoleState {

pub fn save(&self) -> anyhow::Result<()> {
let config_path = Self::config_path();
-info!("Saving console state to: {:?}", config_path);
+debug!("Saving console state to: {:?}", config_path);

// Create parent directory if it doesn't exist
if let Some(parent) = config_path.parent() {

@@ -51,7 +51,7 @@ impl ConsoleState {

let content = serde_json::to_string_pretty(self)?;
fs::write(&config_path, content)?;
-info!("Console state saved successfully to: {:?}", config_path);
+debug!("Console state saved successfully to: {:?}", config_path);

Ok(())
}
@@ -16,7 +16,7 @@ use std::sync::Arc;
use tokio::sync::Mutex;
use tower_http::cors::CorsLayer;
use tower_http::services::ServeDir;
-use tracing::{info, Level};
+use tracing::{debug, Level};
use tracing_subscriber;

#[derive(Parser, Debug)]

@@ -84,12 +84,12 @@ async fn main() -> anyhow::Result<()> {
.layer(CorsLayer::permissive());

let addr = format!("{}:{}", args.host, args.port);
-info!("Starting g3-console on http://{}", addr);
+debug!("Starting g3-console on http://{}", addr);

// Auto-open browser if requested
if args.open {
let url = format!("http://{}", addr);
-info!("Opening browser to {}", url);
+debug!("Opening browser to {}", url);
let _ = open::that(&url);
}
@@ -6,7 +6,7 @@ use std::path::PathBuf;
use std::process::{Command, Stdio};
use std::sync::Mutex;
use sysinfo::{Pid, Process, Signal, System};
-use tracing::{debug, info};
+use tracing::debug;

pub struct ProcessController {
system: System,

@@ -26,7 +26,7 @@ impl ProcessController {
self.system.refresh_processes();

if let Some(process) = self.system.process(sysinfo_pid) {
-info!("Killing process {} ({})", pid, process.name());
+debug!("Killing process {} ({})", pid, process.name());

// Try SIGTERM first
if process.kill_with(Signal::Term).is_some() {

@@ -107,7 +107,7 @@ impl ProcessController {
});
}

-info!("Launching g3: {:?}", cmd);
+debug!("Launching g3: {:?}", cmd);

// Spawn and wait for the intermediate process to exit
let mut child = cmd.spawn().context("Failed to spawn g3 process")?;

@@ -120,7 +120,7 @@ impl ProcessController {

// The actual g3 process is now running as orphan
// We need to scan for it by matching workspace and recent start time
-info!(
+debug!(
"Scanning for newly launched g3 process in workspace: {}",
workspace
);

@@ -171,7 +171,7 @@ impl ProcessController {
found
} else {
// If we couldn't find it, try one more refresh after a longer delay
-info!("Process not found on first scan, trying again...");
+debug!("Process not found on first scan, trying again...");
std::thread::sleep(std::time::Duration::from_millis(2000));
self.system.refresh_processes();

@@ -204,7 +204,7 @@ impl ProcessController {
retry_found.unwrap_or(intermediate_pid)
};

-info!("Launched g3 process with PID {}", pid);
+debug!("Launched g3 process with PID {}", pid);

// Store launch params for restart
let params = LaunchParams {
@@ -3,7 +3,7 @@ use anyhow::Result;
use chrono::{DateTime, Utc};
use std::path::PathBuf;
use sysinfo::{Pid, Process, System};
-use tracing::{debug, info, warn};
+use tracing::{debug, warn};

pub struct ProcessDetector {
system: System,

@@ -17,7 +17,7 @@ impl ProcessDetector {
}

pub fn detect_instances(&mut self) -> Result<Vec<Instance>> {
-info!("Scanning for g3 processes...");
+debug!("Scanning for g3 processes...");
// Refresh all processes to ensure we catch newly started ones
// Using refresh_all() instead of just refresh_processes() to ensure
// we get complete information about new processes

@@ -37,7 +37,7 @@ impl ProcessDetector {
}
}

-info!("Detected {} g3 instances", instances.len());
+debug!("Detected {} g3 instances", instances.len());
Ok(instances)
}
@@ -9,7 +9,7 @@
use anyhow::Result;
use serde::{Deserialize, Serialize};
use std::time::Duration;
-use tracing::{error, info, warn};
+use tracing::{debug, error, warn};

/// Base delay for exponential backoff (in milliseconds)
const BASE_RETRY_DELAY_MS: u64 = 1000;

@@ -149,7 +149,7 @@ impl ErrorContext {
if let Err(e) = std::fs::write(&filename, json_content) {
error!("Failed to save error context to {:?}: {}", &filename, e);
} else {
-info!("Error details saved to: {:?}", &filename);
+debug!("Error details saved to: {:?}", &filename);
}
}
Err(e) => {

@@ -328,7 +328,7 @@ where
match operation().await {
Ok(result) => {
if attempt > 1 {
-info!(
+debug!(
"Operation '{}' succeeded after {} attempts",
operation_name, attempt
);

@@ -357,7 +357,7 @@ where

// Special handling for token limit errors
if matches!(recoverable_type, RecoverableError::TokenLimit) {
-info!("Token limit error detected. Consider triggering summarization.");
+debug!("Token limit error detected. Consider triggering summarization.");
}

tokio::time::sleep(delay).await;
@@ -12,7 +12,7 @@ use crate::{logs_dir, Agent, TaskResult};
use crate::ui_writer::UiWriter;
use serde_json::Value;
use std::path::PathBuf;
-use tracing::{debug, info, warn};
+use tracing::{debug, warn};

/// Result of feedback extraction with source information
#[derive(Debug, Clone)]

@@ -103,21 +103,21 @@ where
// Try session log first (most reliable)
if let Some(session_id) = agent.get_session_id() {
if let Some(feedback) = try_extract_from_session_log(&session_id, config) {
-info!("Extracted coach feedback from session log: {} chars", feedback.len());
+debug!("Extracted coach feedback from session log: {} chars", feedback.len());
return ExtractedFeedback::new(feedback, FeedbackSource::SessionLog);
}
}

// Try native tool call JSON parsing
if let Some(feedback) = try_extract_from_native_tool_call(&coach_result.response) {
-info!("Extracted coach feedback from native tool call: {} chars", feedback.len());
+debug!("Extracted coach feedback from native tool call: {} chars", feedback.len());
return ExtractedFeedback::new(feedback, FeedbackSource::NativeToolCall);
}

// Try conversation history
if let Some(session_id) = agent.get_session_id() {
if let Some(feedback) = try_extract_from_conversation_history(&session_id, config) {
-info!("Extracted coach feedback from conversation history: {} chars", feedback.len());
+debug!("Extracted coach feedback from conversation history: {} chars", feedback.len());
return ExtractedFeedback::new(feedback, FeedbackSource::ConversationHistory);
}
}

@@ -125,7 +125,7 @@ where
// Try TaskResult parsing
let extracted = coach_result.extract_final_output();
if !extracted.is_empty() {
-info!("Extracted coach feedback from task result: {} chars", extracted.len());
+debug!("Extracted coach feedback from task result: {} chars", extracted.len());
return ExtractedFeedback::new(extracted, FeedbackSource::TaskResultResponse);
}
@@ -39,7 +39,7 @@ use serde_json::json;
use std::io::Write;
use std::time::{Duration, Instant};
use tokio_util::sync::CancellationToken;
-use tracing::{debug, error, info, warn};
+use tracing::{debug, error, warn};

/// Get the path to the todo.g3.md file.
///
@@ -246,13 +246,23 @@ pub enum StreamState {
Resuming,
}

/// Patterns used to detect JSON tool calls in text
/// These cover common whitespace variations in JSON formatting
const TOOL_CALL_PATTERNS: [&str; 4] = [
r#"{"tool":"#,
r#"{ "tool":"#,
r#"{"tool" :"#,
r#"{ "tool" :"#,
];

/// Modern streaming tool parser that properly handles native tool calls and SSE chunks
#[derive(Debug)]
pub struct StreamingToolParser {
/// Buffer for accumulating text content
text_buffer: String,
/// Buffer for accumulating native tool calls
native_tool_calls: Vec<g3_providers::ToolCall>,
/// Position in text_buffer up to which tool calls have been consumed/executed
/// This prevents has_unexecuted_tool_call() from returning true for already-executed tools
last_consumed_position: usize,
/// Whether we've received a message_stop event
message_stopped: bool,
/// Whether we're currently in a JSON tool call (for fallback parsing)
@@ -271,13 +281,58 @@ impl StreamingToolParser {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
text_buffer: String::new(),
|
||||
native_tool_calls: Vec::new(),
|
||||
last_consumed_position: 0,
|
||||
message_stopped: false,
|
||||
in_json_tool_call: false,
|
||||
json_tool_start: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Find the starting position of the last tool call pattern in the given text
|
||||
/// Returns None if no tool call pattern is found
|
||||
fn find_last_tool_call_start(text: &str) -> Option<usize> {
|
||||
let mut best_start: Option<usize> = None;
|
||||
for pattern in &TOOL_CALL_PATTERNS {
|
||||
if let Some(pos) = text.rfind(pattern) {
|
||||
if best_start.map_or(true, |best| pos > best) {
|
||||
best_start = Some(pos);
|
||||
}
|
||||
}
|
||||
}
|
||||
best_start
|
||||
}
|
||||
|
||||
/// Find the starting position of the FIRST tool call pattern in the given text
|
||||
/// Returns None if no tool call pattern is found
|
||||
fn find_first_tool_call_start(text: &str) -> Option<usize> {
|
||||
let mut best_start: Option<usize> = None;
|
||||
for pattern in &TOOL_CALL_PATTERNS {
|
||||
if let Some(pos) = text.find(pattern) {
|
||||
if best_start.map_or(true, |best| pos < best) {
|
||||
best_start = Some(pos);
|
||||
}
|
||||
}
|
||||
}
|
||||
best_start
|
||||
}
|
||||
|
||||
/// Validate that tool call args don't contain message-like content
|
||||
/// This detects malformed tool calls where agent messages got mixed into args
|
||||
fn has_message_like_keys(args: &serde_json::Map<String, serde_json::Value>) -> bool {
|
||||
args.keys().any(|key| {
|
||||
key.len() > 100
|
||||
|| key.contains('\n')
|
||||
|| key.contains("I'll")
|
||||
|| key.contains("Let me")
|
||||
|| key.contains("Here's")
|
||||
|| key.contains("I can")
|
||||
|| key.contains("I need")
|
||||
|| key.contains("First")
|
||||
|| key.contains("Now")
|
||||
|| key.contains("The ")
|
||||
})
|
||||
}
|
||||
|
||||
/// Process a streaming chunk and return completed tool calls if any
|
||||
pub fn process_chunk(&mut self, chunk: &g3_providers::CompletionChunk) -> Vec<ToolCall> {
|
||||
let mut completed_tools = Vec::new();
|
||||
@@ -308,10 +363,12 @@ impl StreamingToolParser {
|
||||
self.message_stopped = true;
|
||||
debug!("Message finished, processing accumulated tool calls");
|
||||
|
||||
// When stream finishes, do a final check for JSON tool calls in the accumulated buffer
|
||||
// When stream finishes, find ALL JSON tool calls in the accumulated buffer
|
||||
if completed_tools.is_empty() && !self.text_buffer.is_empty() {
|
||||
if let Some(json_tool) = self.try_parse_json_tool_call_from_buffer() {
|
||||
completed_tools.push(json_tool);
|
||||
let all_tools = self.try_parse_all_json_tool_calls_from_buffer();
|
||||
if !all_tools.is_empty() {
|
||||
debug!("Found {} JSON tool calls in buffer at stream end", all_tools.len());
|
||||
completed_tools.extend(all_tools);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -328,26 +385,12 @@ impl StreamingToolParser {
|
||||
|
||||
/// Fallback method to parse JSON tool calls from text content
|
||||
fn try_parse_json_tool_call(&mut self, _content: &str) -> Option<ToolCall> {
|
||||
// Look for JSON tool call patterns
|
||||
let patterns = [
|
||||
r#"{"tool":"#,
|
||||
r#"{ "tool":"#,
|
||||
r#"{"tool" :"#,
|
||||
r#"{ "tool" :"#,
|
||||
];
|
||||
|
||||
// If we're not currently in a JSON tool call, look for the start
|
||||
if !self.in_json_tool_call {
|
||||
for pattern in &patterns {
|
||||
if let Some(pos) = self.text_buffer.rfind(pattern) {
|
||||
debug!(
|
||||
"Found JSON tool call pattern '{}' at position {}",
|
||||
pattern, pos
|
||||
);
|
||||
self.in_json_tool_call = true;
|
||||
self.json_tool_start = Some(pos);
|
||||
break;
|
||||
}
|
||||
if let Some(pos) = Self::find_last_tool_call_start(&self.text_buffer) {
|
||||
debug!("Found JSON tool call pattern at position {}", pos);
|
||||
self.in_json_tool_call = true;
|
||||
self.json_tool_start = Some(pos);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -356,83 +399,34 @@ impl StreamingToolParser {
|
||||
if let Some(start_pos) = self.json_tool_start {
|
||||
let json_text = &self.text_buffer[start_pos..];
|
||||
|
||||
// Try to find a complete JSON object
|
||||
let mut brace_count = 0;
|
||||
let mut in_string = false;
|
||||
let mut escape_next = false;
|
||||
// Try to find a complete JSON object using the shared helper
|
||||
if let Some(end_pos) = Self::find_complete_json_object_end(json_text) {
|
||||
let json_str = &json_text[..=end_pos];
|
||||
debug!("Attempting to parse JSON tool call: {}", json_str);
|
||||
|
||||
for (i, ch) in json_text.char_indices() {
|
||||
if escape_next {
|
||||
escape_next = false;
|
||||
continue;
|
||||
}
|
||||
|
||||
match ch {
|
||||
'\\' => escape_next = true,
|
||||
'"' if !escape_next => in_string = !in_string,
|
||||
'{' if !in_string => brace_count += 1,
|
||||
'}' if !in_string => {
|
||||
brace_count -= 1;
|
||||
if brace_count == 0 {
|
||||
// Found complete JSON object
|
||||
let json_str = &json_text[..=i];
|
||||
debug!("Attempting to parse JSON tool call: {}", json_str);
|
||||
|
||||
// First try to parse as a ToolCall
|
||||
if let Ok(tool_call) = serde_json::from_str::<ToolCall>(json_str) {
|
||||
// Validate that this is actually a proper tool call
|
||||
// The args should be a JSON object with reasonable keys
|
||||
if let Some(args_obj) = tool_call.args.as_object() {
|
||||
// Check if any key looks like it contains agent message content
|
||||
// This would indicate a malformed tool call where the message
|
||||
// got mixed into the args
|
||||
let has_message_like_key = args_obj.keys().any(|key| {
|
||||
key.len() > 100
|
||||
|| key.contains('\n')
|
||||
|| key.contains("I'll")
|
||||
|| key.contains("Let me")
|
||||
|| key.contains("Here's")
|
||||
|| key.contains("I can")
|
||||
|| key.contains("I need")
|
||||
|| key.contains("First")
|
||||
|| key.contains("Now")
|
||||
|| key.contains("The ")
|
||||
});
|
||||
|
||||
if has_message_like_key {
|
||||
debug!("Detected malformed tool call with message-like keys, skipping");
|
||||
// This looks like a malformed tool call, skip it
|
||||
self.in_json_tool_call = false;
|
||||
self.json_tool_start = None;
|
||||
break;
|
||||
}
|
||||
|
||||
// Also check if the values look reasonable
|
||||
// Tool arguments should typically be file paths, commands, or content
|
||||
// Not entire agent messages
|
||||
|
||||
debug!(
|
||||
"Successfully parsed valid JSON tool call: {:?}",
|
||||
tool_call
|
||||
);
|
||||
// Reset JSON parsing state
|
||||
self.in_json_tool_call = false;
|
||||
self.json_tool_start = None;
|
||||
return Some(tool_call);
|
||||
}
|
||||
// If args is not an object, skip this as invalid
|
||||
debug!("Tool call args is not an object, skipping");
|
||||
} else {
|
||||
debug!("Failed to parse JSON tool call: {}", json_str);
|
||||
// Reset and continue looking
|
||||
self.in_json_tool_call = false;
|
||||
self.json_tool_start = None;
|
||||
}
|
||||
break;
|
||||
// Try to parse as a ToolCall
|
||||
if let Ok(tool_call) = serde_json::from_str::<ToolCall>(json_str) {
|
||||
// Validate that args is an object with reasonable keys
|
||||
if let Some(args_obj) = tool_call.args.as_object() {
|
||||
if Self::has_message_like_keys(args_obj) {
|
||||
debug!("Detected malformed tool call with message-like keys, skipping");
|
||||
self.in_json_tool_call = false;
|
||||
self.json_tool_start = None;
|
||||
return None;
|
||||
}
|
||||
|
||||
debug!("Successfully parsed valid JSON tool call: {:?}", tool_call);
|
||||
self.in_json_tool_call = false;
|
||||
self.json_tool_start = None;
|
||||
return Some(tool_call);
|
||||
}
|
||||
_ => {}
|
||||
debug!("Tool call args is not an object, skipping");
|
||||
} else {
|
||||
debug!("Failed to parse JSON tool call: {}", json_str);
|
||||
}
|
||||
// Reset and continue looking
|
||||
self.in_json_tool_call = false;
|
||||
self.json_tool_start = None;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -440,76 +434,45 @@ impl StreamingToolParser {
|
||||
None
|
||||
}
|
||||
|
||||
/// Parse JSON tool call from the accumulated text buffer (called when stream finishes)
|
||||
/// This is similar to try_parse_json_tool_call but operates on the full buffer
|
||||
fn try_parse_json_tool_call_from_buffer(&mut self) -> Option<ToolCall> {
|
||||
// Look for JSON tool call patterns in the accumulated buffer
|
||||
let patterns = [
|
||||
r#"{"tool":"#,
|
||||
r#"{ "tool":"#,
|
||||
r#"{"tool" :"#,
|
||||
r#"{ "tool" :"#,
|
||||
];
|
||||
|
||||
// Find the last occurrence of a tool call pattern (most likely to be complete)
|
||||
let mut best_start: Option<usize> = None;
|
||||
for pattern in &patterns {
|
||||
if let Some(pos) = self.text_buffer.rfind(pattern) {
|
||||
if best_start.map_or(true, |best| pos > best) {
|
||||
best_start = Some(pos);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(start_pos) = best_start {
|
||||
let json_text = &self.text_buffer[start_pos..];
|
||||
debug!("Found potential JSON tool call at position {}: {:?}", start_pos,
|
||||
if json_text.len() > 200 { &json_text[..200] } else { json_text });
|
||||
|
||||
// Try to find a complete JSON object
|
||||
let mut brace_count = 0;
|
||||
let mut in_string = false;
|
||||
let mut escape_next = false;
|
||||
|
||||
for (i, ch) in json_text.char_indices() {
|
||||
if escape_next {
|
||||
escape_next = false;
|
||||
continue;
|
||||
}
|
||||
|
||||
match ch {
|
||||
'\\' => escape_next = true,
|
||||
'"' if !escape_next => in_string = !in_string,
|
||||
'{' if !in_string => brace_count += 1,
|
||||
'}' if !in_string => {
|
||||
brace_count -= 1;
|
||||
if brace_count == 0 {
|
||||
// Found complete JSON object
|
||||
let json_str = &json_text[..=i];
|
||||
debug!("Attempting to parse JSON tool call from buffer: {}", json_str);
|
||||
|
||||
if let Ok(tool_call) = serde_json::from_str::<ToolCall>(json_str) {
|
||||
if let Some(args_obj) = tool_call.args.as_object() {
|
||||
// Validate - check for message-like keys
|
||||
let has_message_like_key = args_obj.keys().any(|key| {
|
||||
key.len() > 100 || key.contains('\n')
|
||||
});
|
||||
|
||||
if !has_message_like_key {
|
||||
debug!("Successfully parsed JSON tool call from buffer: {:?}", tool_call);
|
||||
return Some(tool_call);
|
||||
}
|
||||
}
|
||||
/// Parse ALL JSON tool calls from the accumulated text buffer
|
||||
/// This finds all complete tool calls, not just the last one
|
||||
fn try_parse_all_json_tool_calls_from_buffer(&self) -> Vec<ToolCall> {
|
||||
let mut tool_calls = Vec::new();
|
||||
let mut search_start = 0;
|
||||
|
||||
while search_start < self.text_buffer.len() {
|
||||
let search_text = &self.text_buffer[search_start..];
|
||||
|
||||
// Find the next tool call pattern
|
||||
if let Some(relative_pos) = Self::find_first_tool_call_start(search_text) {
|
||||
let abs_start = search_start + relative_pos;
|
||||
let json_text = &self.text_buffer[abs_start..];
|
||||
|
||||
// Try to find a complete JSON object
|
||||
if let Some(end_pos) = Self::find_complete_json_object_end(json_text) {
|
||||
let json_str = &json_text[..=end_pos];
|
||||
|
||||
if let Ok(tool_call) = serde_json::from_str::<ToolCall>(json_str) {
|
||||
if let Some(args_obj) = tool_call.args.as_object() {
|
||||
if !Self::has_message_like_keys(args_obj) {
|
||||
debug!("Found tool call at position {}: {:?}", abs_start, tool_call.tool);
|
||||
tool_calls.push(tool_call);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
// Move past this tool call
|
||||
search_start = abs_start + end_pos + 1;
|
||||
} else {
|
||||
// Incomplete JSON, stop searching
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
// No more tool call patterns found
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
None
|
||||
|
||||
tool_calls
|
||||
}
|
||||
|
||||
/// Get the accumulated text content (excluding tool calls)
|
||||
@@ -531,10 +494,83 @@ impl StreamingToolParser {
|
||||
self.message_stopped
|
||||
}
|
||||
|
||||
/// Check if the text buffer contains an incomplete JSON tool call
|
||||
/// This detects cases where the LLM started emitting a tool call but the stream ended
|
||||
/// before the JSON was complete (truncated output)
|
||||
pub fn has_incomplete_tool_call(&self) -> bool {
|
||||
// Only check the unconsumed portion of the buffer
|
||||
let unchecked_buffer = &self.text_buffer[self.last_consumed_position..];
|
||||
if let Some(start_pos) = Self::find_last_tool_call_start(unchecked_buffer) {
|
||||
let json_text = &unchecked_buffer[start_pos..];
|
||||
// If NOT complete, it's an incomplete tool call
|
||||
Self::find_complete_json_object_end(json_text).is_none()
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
/// Check if the text buffer contains an unexecuted tool call
|
||||
/// This detects cases where the LLM emitted a complete tool call JSON
|
||||
/// but it wasn't parsed/executed (e.g., due to parsing issues)
|
||||
pub fn has_unexecuted_tool_call(&self) -> bool {
|
||||
// Only check the unconsumed portion of the buffer
|
||||
let unchecked_buffer = &self.text_buffer[self.last_consumed_position..];
|
||||
if let Some(start_pos) = Self::find_last_tool_call_start(unchecked_buffer) {
|
||||
let json_text = &unchecked_buffer[start_pos..];
|
||||
// If the JSON IS complete, it means there's an unexecuted tool call
|
||||
if let Some(json_end) = Self::find_complete_json_object_end(json_text) {
|
||||
let json_only = &json_text[..=json_end];
|
||||
return serde_json::from_str::<serde_json::Value>(json_only).is_ok();
|
||||
}
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
/// Mark all tool calls up to the current buffer position as consumed/executed
|
||||
/// This prevents has_unexecuted_tool_call() from returning true for already-executed tools
|
||||
pub fn mark_tool_calls_consumed(&mut self) {
|
||||
self.last_consumed_position = self.text_buffer.len();
|
||||
}
|
||||
|
||||
/// Find the end position (byte index) of a complete JSON object in the text
|
||||
/// Returns None if no complete JSON object is found
|
||||
/// Find the end position (byte index) of a complete JSON object in the text
|
||||
pub fn find_complete_json_object_end(text: &str) -> Option<usize> {
|
||||
let mut brace_count = 0;
|
||||
let mut in_string = false;
|
||||
let mut escape_next = false;
|
||||
let mut found_start = false;
|
||||
|
||||
for (i, ch) in text.char_indices() {
|
||||
if escape_next {
|
||||
escape_next = false;
|
||||
continue;
|
||||
}
|
||||
|
||||
match ch {
|
||||
'\\' => escape_next = true,
|
||||
'"' if !escape_next => in_string = !in_string,
|
||||
'{' if !in_string => {
|
||||
brace_count += 1;
|
||||
found_start = true;
|
||||
}
|
||||
'}' if !in_string => {
|
||||
brace_count -= 1;
|
||||
if brace_count == 0 && found_start {
|
||||
return Some(i); // Return the byte index of the closing brace
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
None // No complete JSON object found
|
||||
}
|
||||
|
||||
/// Reset the parser state for a new message
|
||||
pub fn reset(&mut self) {
|
||||
self.text_buffer.clear();
|
||||
self.native_tool_calls.clear();
|
||||
self.last_consumed_position = 0;
|
||||
self.message_stopped = false;
|
||||
self.in_json_tool_call = false;
|
||||
self.json_tool_start = None;
|
||||
@@ -2743,7 +2779,7 @@ impl<W: UiWriter> Agent<W> {
/// Manually trigger context summarization regardless of context window size
/// Returns Ok(true) if summarization was successful, Ok(false) if it failed
pub async fn force_summarize(&mut self) -> Result<bool> {
-info!("Manual summarization triggered");
+debug!("Manual summarization triggered");

self.ui_writer.print_context_status(&format!(
"\n🗜️ Manual summarization requested (current usage: {}%)...",

@@ -2861,7 +2897,7 @@ impl<W: UiWriter> Agent<W> {

/// Manually trigger context thinning regardless of thresholds
pub fn force_thin(&mut self) -> String {
-info!("Manual context thinning triggered");
+debug!("Manual context thinning triggered");
let (message, chars_saved) = self.context_window.thin_context(self.session_id.as_deref());
self.thinning_events.push(chars_saved);
message

@@ -2870,7 +2906,7 @@ impl<W: UiWriter> Agent<W> {
/// Manually trigger context thinning for the ENTIRE context window
/// Unlike force_thin which only processes the first third, this processes all messages
pub fn force_thin_all(&mut self) -> String {
-info!("Manual full context skinnifying triggered");
+debug!("Manual full context skinnifying triggered");
let (message, chars_saved) = self.context_window.thin_context_all(self.session_id.as_deref());
self.thinning_events.push(chars_saved);
message

@@ -2879,7 +2915,7 @@ impl<W: UiWriter> Agent<W> {
/// Reload README.md and AGENTS.md and replace the first system message
/// Returns Ok(true) if README was found and reloaded, Ok(false) if no README was present initially
pub fn reload_readme(&mut self) -> Result<bool> {
-info!("Manual README reload triggered");
+debug!("Manual README reload triggered");

// Check if the second message in conversation history is a system message with README content
// (The first message should always be the system prompt)

@@ -2922,7 +2958,7 @@ impl<W: UiWriter> Agent<W> {
// Replace the second message (README) with the new content
if let Some(first_msg) = self.context_window.conversation_history.get_mut(1) {
first_msg.content = combined_content;
-info!("README content reloaded successfully");
+debug!("README content reloaded successfully");
Ok(true)
} else {
Ok(false)

@@ -3156,7 +3192,7 @@ impl<W: UiWriter> Agent<W> {
error!("Failed to clear continuation artifacts: {}", e);
}

-info!("Session cleared");
+debug!("Session cleared");
}

/// Restore session from a continuation artifact

@@ -3201,7 +3237,7 @@ impl<W: UiWriter> Agent<W> {
});
}

-info!("Restored full context from session log");
+debug!("Restored full context from session log");
return Ok(true);
}
}

@@ -3226,7 +3262,7 @@ impl<W: UiWriter> Agent<W> {
});
}

-info!("Restored session from summary");
+debug!("Restored session from summary");
Ok(false)
}

@@ -3836,7 +3872,7 @@ impl<W: UiWriter> Agent<W> {
match provider.stream(request.clone()).await {
Ok(stream) => {
if attempt > 1 {
-info!("Stream started successfully after {} attempts", attempt);
+debug!("Stream started successfully after {} attempts", attempt);
}
debug!("Stream started successfully");
debug!(

@@ -3886,9 +3922,9 @@ impl<W: UiWriter> Agent<W> {
let mut response_started = false;
let mut any_tool_executed = false; // Track if ANY tool was executed across all iterations
let mut auto_summary_attempts = 0; // Track auto-summary prompt attempts
-const MAX_AUTO_SUMMARY_ATTEMPTS: usize = 2; // Limit auto-summary retries
+const MAX_AUTO_SUMMARY_ATTEMPTS: usize = 5; // Limit auto-summary retries (increased from 2 for better recovery)
let mut final_output_called = false; // Track if final_output was called
-let mut executed_tools_in_session: std::collections::HashSet<String> = std::collections::HashSet::new(); // Track executed tools to prevent duplicates
+// Note: Session-level duplicate tracking was removed - we only prevent sequential duplicates (DUP IN CHUNK, DUP IN MSG)

// Check if we need to summarize before starting
if self.context_window.should_summarize() {
@@ -4189,77 +4225,51 @@ impl<W: UiWriter> Agent<W> {
|
||||
};
|
||||
|
||||
// De-duplicate tool calls and track duplicates
|
||||
let mut seen_in_chunk: Vec<ToolCall> = Vec::new();
|
||||
let mut last_tool_in_chunk: Option<ToolCall> = None;
|
||||
let mut deduplicated_tools: Vec<(ToolCall, Option<String>)> = Vec::new();
|
||||
|
||||
for tool_call in tools_to_process {
|
||||
let mut duplicate_type = None;
|
||||
|
||||
// Check for duplicates in current chunk
|
||||
if seen_in_chunk
|
||||
.iter()
|
||||
.any(|tc| are_duplicates(tc, &tool_call))
|
||||
{
|
||||
// Check for IMMEDIATELY SEQUENTIAL duplicate in current chunk
|
||||
// Only the immediately previous tool call counts as a duplicate
|
||||
if let Some(ref last_tool) = last_tool_in_chunk {
|
||||
if are_duplicates(last_tool, &tool_call) {
|
||||
duplicate_type = Some("DUP IN CHUNK".to_string());
|
||||
}
|
||||
} else {
|
||||
// Check for duplicate against previous message in history
|
||||
// Look at the last assistant message that contains tool calls
|
||||
// Check for IMMEDIATELY SEQUENTIAL duplicate against previous message
|
||||
// Only mark as duplicate if the LAST tool call in the previous message
|
||||
// matches AND there's no significant text after it
|
||||
let mut found_in_prev = false;
|
||||
for msg in self.context_window.conversation_history.iter().rev() {
|
||||
if matches!(msg.role, MessageRole::Assistant) {
|
||||
// Try to parse tool calls from the message content
|
||||
if msg.content.contains(r#"\"tool\""#) {
|
||||
// Simple JSON extraction for tool calls
|
||||
let content = &msg.content;
|
||||
let mut start_idx = 0;
|
||||
while let Some(tool_start) =
|
||||
content[start_idx..].find(r#"{\"tool\""#)
|
||||
{
|
||||
let tool_start = start_idx + tool_start;
|
||||
// Find the end of this JSON object
|
||||
let mut brace_count = 0;
|
||||
let mut in_string = false;
|
||||
let mut escape_next = false;
|
||||
let mut end_idx = tool_start;
|
||||
|
||||
for (i, ch) in content[tool_start..].char_indices()
|
||||
{
|
||||
if escape_next {
|
||||
escape_next = false;
|
||||
continue;
|
||||
}
|
||||
if ch == '\\' && in_string {
|
||||
escape_next = true;
|
||||
continue;
|
||||
}
|
||||
if ch == '"' && !escape_next {
|
||||
in_string = !in_string;
|
||||
}
|
||||
if !in_string {
|
||||
if ch == '{' {
|
||||
brace_count += 1;
|
||||
} else if ch == '}' {
|
||||
brace_count -= 1;
|
||||
if brace_count == 0 {
|
||||
end_idx = tool_start + i + 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if end_idx > tool_start {
|
||||
let tool_json = &content[tool_start..end_idx];
|
||||
if let Ok(prev_tool) =
|
||||
serde_json::from_str::<ToolCall>(tool_json)
|
||||
{
|
||||
// Find the LAST tool call in the message
|
||||
let content = &msg.content;
|
||||
|
||||
// Look for the last occurrence of a tool call pattern
|
||||
if let Some(last_tool_start) = content.rfind(r#"{"tool""#)
|
||||
.or_else(|| content.rfind(r#"{ "tool""#))
|
||||
{
|
||||
// Find the end of this JSON object
|
||||
if let Some(end_offset) = StreamingToolParser::find_complete_json_object_end(&content[last_tool_start..]) {
|
||||
let end_idx = last_tool_start + end_offset + 1;
|
||||
let tool_json = &content[last_tool_start..end_idx];
|
||||
|
||||
// Check if there's any non-whitespace text after this tool call
|
||||
let text_after = content[end_idx..].trim();
|
||||
let has_text_after = !text_after.is_empty();
|
||||
|
||||
// Only consider it a duplicate if:
|
||||
// 1. The tool call matches
|
||||
// 2. There's no text after it (it was the last thing in the message)
|
||||
if !has_text_after {
|
||||
if let Ok(prev_tool) = serde_json::from_str::<ToolCall>(tool_json) {
|
||||
if are_duplicates(&prev_tool, &tool_call) {
|
||||
found_in_prev = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
start_idx = end_idx;
|
||||
}
|
||||
}
|
||||
// Only check the most recent assistant message
|
||||
@@ -4272,13 +4282,8 @@ impl<W: UiWriter> Agent<W> {
|
||||
}
|
||||
}
|
||||
|
||||
// Add to seen list if not a duplicate in chunk
|
||||
if duplicate_type
|
||||
.as_ref()
|
||||
.map_or(true, |s| s != "DUP IN CHUNK")
|
||||
{
|
||||
seen_in_chunk.push(tool_call.clone());
|
||||
}
|
||||
// Track the last tool call for sequential duplicate detection
|
||||
last_tool_in_chunk = Some(tool_call.clone());
|
||||
|
||||
deduplicated_tools.push((tool_call, duplicate_type));
|
||||
}
|
||||
@@ -4286,22 +4291,11 @@ impl<W: UiWriter> Agent<W> {
|
||||
// Process each tool call
|
||||
for (tool_call, duplicate_type) in deduplicated_tools {
|
||||
debug!("Processing completed tool call: {:?}", tool_call);
|
||||
|
||||
// Mark that we detected a tool call - this prevents content from being printed
|
||||
// even if the tool is skipped as a duplicate
|
||||
tool_executed = true;
|
||||
|
||||
// Check if this tool was already executed in this session
|
||||
let tool_key = format!("{}:{}", tool_call.tool, serde_json::to_string(&tool_call.args).unwrap_or_default());
|
||||
if executed_tools_in_session.contains(&tool_key) {
|
||||
// Log the duplicate with red prefix
|
||||
let prefixed_tool_name = format!("🟥 {} DUP IN SESSION", tool_call.tool);
|
||||
let warning_msg = format!(
|
||||
"⚠️ Duplicate tool call detected (already executed in session): Skipping {} with args {}",
|
||||
tool_call.tool,
|
||||
serde_json::to_string(&tool_call.args).unwrap_or_else(|_| "<unserializable>".to_string())
|
||||
);
|
||||
let mut modified_tool_call = tool_call.clone();
|
||||
modified_tool_call.tool = prefixed_tool_name;
|
||||
debug!("{}", warning_msg);
|
||||
continue; // Skip execution of duplicate
|
||||
}
|
||||
|
||||
// If it's a duplicate, log it and return a warning
|
||||
if let Some(dup_type) = &duplicate_type {
|
||||
@@ -4639,15 +4633,25 @@ impl<W: UiWriter> Agent<W> {
|
||||
tool_executed = true;
|
||||
any_tool_executed = true; // Track across all iterations
|
||||
|
||||
// Add to executed tools set to prevent re-execution in this session
|
||||
executed_tools_in_session.insert(tool_key.clone());
|
||||
// Reset auto-continue attempts after successful tool execution
|
||||
// This gives the LLM fresh attempts since it's making progress
|
||||
auto_summary_attempts = 0;
|
||||
|
||||
|
||||
// Reset the JSON tool call filter state after each tool execution
|
||||
// This ensures the filter doesn't stay in suppression mode for subsequent streaming content
|
||||
self.ui_writer.reset_json_filter();
|
||||
|
||||
// Reset parser for next iteration - this clears the text buffer
|
||||
parser.reset();
|
||||
// Only reset parser if there are no more unexecuted tool calls in the buffer
|
||||
// This handles the case where the LLM emits multiple tool calls in one response
|
||||
if parser.has_unexecuted_tool_call() {
|
||||
debug!("Parser still has unexecuted tool calls, not resetting buffer");
|
||||
// Mark current tool as consumed so we don't re-detect it
|
||||
parser.mark_tool_calls_consumed();
|
||||
} else {
|
||||
// Reset parser for next iteration - this clears the text buffer
|
||||
parser.reset();
|
||||
}
|
||||
|
||||
// Clear current_response for next iteration to prevent buffered text
|
||||
// from being incorrectly displayed after tool execution
|
||||
@@ -4662,8 +4666,14 @@ impl<W: UiWriter> Agent<W> {
|
||||
} // End of for loop processing each tool call
|
||||
|
||||
// If we processed any tools in multiple mode, break out to start new stream
|
||||
// BUT only if there are no more unexecuted tool calls in the buffer
|
||||
if tool_executed && self.config.agent.allow_multiple_tool_calls {
|
||||
break;
|
||||
if parser.has_unexecuted_tool_call() {
|
||||
debug!("Tool executed but parser still has unexecuted tool calls, continuing to process");
|
||||
// Don't break - continue processing to pick up remaining tool calls
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// If no tool calls were completed, continue streaming normally
|
||||
@@ -4753,7 +4763,7 @@ impl<W: UiWriter> Agent<W> {
|
||||
" - Text buffer content: {:?}",
|
||||
parser.get_text_content()
|
||||
);
|
||||
error!(" - Native tool calls: {:?}", parser.native_tool_calls);
|
||||
error!(" - Has incomplete tool call: {}", parser.has_incomplete_tool_call());
|
||||
error!(" - Message stopped: {}", parser.is_message_stopped());
|
||||
error!(" - In JSON tool call: {}", parser.in_json_tool_call);
|
||||
error!(" - JSON tool start: {:?}", parser.json_tool_start);
|
||||
@@ -4831,6 +4841,17 @@ impl<W: UiWriter> Agent<W> {
|
||||
));
|
||||
}
|
||||
|
||||
// If tools were executed in previous iterations but final_output wasn't called,
|
||||
// break to let the outer loop's auto-continue logic handle it
|
||||
if any_tool_executed && !final_output_called {
|
||||
debug!("Tools were executed but final_output not called - breaking to auto-continue");
|
||||
// Add the text response to context before breaking
|
||||
if has_text_response && !current_response.trim().is_empty() {
|
||||
full_response = current_response.clone();
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
// Set full_response to current_response (don't append)
|
||||
// current_response already contains everything that was displayed
|
||||
// Don't set full_response here - it would duplicate the output
|
||||
@@ -4873,8 +4894,8 @@ impl<W: UiWriter> Agent<W> {
|
||||
);
|
||||
|
||||
error!("Error type: {}", std::any::type_name_of_val(&e));
|
||||
error!("Parser state at error: text_buffer_len={}, native_tool_calls={}, message_stopped={}",
|
||||
parser.text_buffer_len(), parser.native_tool_calls.len(), parser.is_message_stopped());
|
||||
error!("Parser state at error: text_buffer_len={}, has_incomplete={}, message_stopped={}",
|
||||
parser.text_buffer_len(), parser.has_incomplete_tool_call(), parser.is_message_stopped());
|
||||
|
||||
// Store the error for potential logging later
|
||||
_last_error = Some(error_details.clone());
|
||||
@@ -4893,7 +4914,7 @@ impl<W: UiWriter> Agent<W> {
|
||||
// If we have any content or tool calls, treat this as a graceful end
|
||||
if chunks_received > 0
|
||||
&& (!parser.get_text_content().is_empty()
|
||||
|| parser.native_tool_calls.len() > 0)
|
||||
|| parser.has_unexecuted_tool_call())
|
||||
{
|
||||
warn!("Stream terminated unexpectedly but we have content, continuing");
|
||||
break; // Break to process what we have
|
||||
@@ -4941,18 +4962,77 @@ impl<W: UiWriter> Agent<W> {
|
||||
|
||||
let has_response = !current_response.is_empty() || !full_response.is_empty();
|
||||
|
||||
// Check if the response is essentially empty (just whitespace or timing lines)
|
||||
// This detects cases where the LLM outputs nothing substantive
|
||||
let response_text = if !current_response.is_empty() {
|
||||
¤t_response
|
||||
} else {
|
||||
&full_response
|
||||
};
|
||||
let is_empty_response = response_text.trim().is_empty()
|
||||
|| response_text.lines().all(|line| line.trim().is_empty() || line.trim().starts_with("⏱️"));
|
||||
|
||||
// Check if there's an incomplete tool call in the buffer
|
||||
let has_incomplete_tool_call = parser.has_incomplete_tool_call();
|
||||
|
||||
// Check if there's a complete but unexecuted tool call in the buffer
|
||||
let has_unexecuted_tool_call = parser.has_unexecuted_tool_call();
|
||||
|
||||
// Log when we detect unexecuted or incomplete tool calls for debugging
|
||||
if has_incomplete_tool_call {
|
||||
debug!("Detected incomplete tool call in buffer (buffer_len={}, consumed_up_to={})",
|
||||
parser.text_buffer_len(), parser.text_buffer_len());
|
||||
}
|
||||
if has_unexecuted_tool_call {
|
||||
debug!("Detected unexecuted tool call in buffer - this may indicate a parsing issue");
|
||||
warn!("Unexecuted tool call detected in buffer after stream ended");
|
||||
}
|
||||
|
||||
// Auto-continue if tools were executed but final_output was never called
|
||||
// This is the simple rule: LLM must call final_output before returning control
|
||||
if any_tool_executed && !final_output_called {
|
||||
// OR if the LLM emitted an incomplete tool call (truncated JSON)
|
||||
// OR if the LLM emitted a complete tool call that wasn't executed
|
||||
// This ensures we don't return control when the LLM clearly intended to call a tool
|
||||
// Note: We removed the redundant condition (any_tool_executed && is_empty_response)
|
||||
// because it's already covered by (any_tool_executed && !final_output_called)
|
||||
let should_auto_continue = (any_tool_executed && !final_output_called)
|
||||
|| has_incomplete_tool_call
|
||||
|| has_unexecuted_tool_call;
|
||||
if should_auto_continue {
|
||||
if auto_summary_attempts < MAX_AUTO_SUMMARY_ATTEMPTS {
|
||||
auto_summary_attempts += 1;
|
||||
warn!(
|
||||
"LLM stopped without calling final_output after executing tools ({} iterations, auto-continue attempt {})",
|
||||
iteration_count, auto_summary_attempts
|
||||
);
|
||||
self.ui_writer.print_context_status(
|
||||
"\n🔄 Model stopped without calling final_output. Auto-continuing...\n"
|
||||
);
|
||||
if has_incomplete_tool_call {
|
||||
warn!(
|
||||
"LLM emitted incomplete tool call ({} iterations, auto-continue attempt {}/{})",
|
||||
iteration_count, auto_summary_attempts, MAX_AUTO_SUMMARY_ATTEMPTS
|
||||
);
|
||||
self.ui_writer.print_context_status(
|
||||
"\n🔄 Model emitted incomplete tool call. Auto-continuing...\n"
|
||||
);
|
||||
} else if has_unexecuted_tool_call {
|
||||
warn!(
|
||||
"LLM emitted unexecuted tool call ({} iterations, auto-continue attempt {}/{})",
|
||||
iteration_count, auto_summary_attempts, MAX_AUTO_SUMMARY_ATTEMPTS
|
||||
);
|
||||
self.ui_writer.print_context_status(
|
||||
"\n🔄 Model emitted tool call that wasn't executed. Auto-continuing...\n"
|
||||
);
|
||||
} else if is_empty_response {
|
||||
warn!(
|
||||
"LLM emitted empty/trivial response ({} iterations, auto-continue attempt {}/{})",
|
||||
iteration_count, auto_summary_attempts, MAX_AUTO_SUMMARY_ATTEMPTS
|
||||
);
|
||||
self.ui_writer.print_context_status(
|
||||
"\n🔄 Model emitted empty response. Auto-continuing...\n"
|
||||
);
|
||||
} else {
|
||||
warn!(
|
||||
"LLM stopped without calling final_output after executing tools ({} iterations, auto-continue attempt {}/{})",
|
||||
iteration_count, auto_summary_attempts, MAX_AUTO_SUMMARY_ATTEMPTS
|
||||
);
|
||||
self.ui_writer.print_context_status(
|
||||
"\n🔄 Model stopped without calling final_output. Auto-continuing...\n"
|
||||
);
|
||||
}
|
||||
|
||||
// Add any text response to context before prompting for continuation
|
||||
if has_response {
|
||||
@@ -4971,10 +5051,17 @@ impl<W: UiWriter> Agent<W> {
|
||||
}
|
||||
|
||||
// Add a follow-up message asking for continuation
|
||||
let continue_prompt = Message::new(
|
||||
MessageRole::User,
|
||||
"Please continue until you are done. You **MUST** call `final_output` with a summary when done.".to_string(),
|
||||
);
|
||||
let continue_prompt = if has_incomplete_tool_call {
|
||||
Message::new(
|
||||
MessageRole::User,
|
||||
"Your previous response was cut off mid-tool-call. Please complete the tool call and continue.".to_string(),
|
||||
)
|
||||
} else {
|
||||
Message::new(
|
||||
MessageRole::User,
|
||||
"Please continue until you are done. You **MUST** call `final_output` with a summary when done.".to_string(),
|
||||
)
|
||||
};
|
||||
self.context_window.add_message(continue_prompt);
|
||||
request.messages = self.context_window.conversation_history.clone();
|
||||
|
||||
@@ -4983,11 +5070,17 @@ impl<W: UiWriter> Agent<W> {
|
||||
} else {
|
||||
// Max attempts reached, give up gracefully
|
||||
warn!(
|
||||
"Max auto-continue attempts ({}) reached, returning without final_output",
|
||||
MAX_AUTO_SUMMARY_ATTEMPTS
|
||||
"Max auto-continue attempts ({}) reached after {} iterations. Conditions: any_tool_executed={}, final_output_called={}, has_incomplete={}, has_unexecuted={}, is_empty_response={}",
|
||||
MAX_AUTO_SUMMARY_ATTEMPTS,
|
||||
iteration_count,
|
||||
any_tool_executed,
|
||||
final_output_called,
|
||||
has_incomplete_tool_call,
|
||||
has_unexecuted_tool_call,
|
||||
is_empty_response
|
||||
);
|
||||
self.ui_writer.print_agent_response(
|
||||
"\n⚠️ The model stopped without calling final_output after multiple attempts.\n"
|
||||
&format!("\n⚠️ The model stopped without calling final_output after {} auto-continue attempts.\n", MAX_AUTO_SUMMARY_ATTEMPTS)
|
||||
);
|
||||
}
|
||||
} else if has_response {
|
||||
@@ -6434,7 +6527,7 @@ impl<W: UiWriter> Agent<W> {
let driver = mutex.into_inner();
match driver.quit().await {
Ok(_) => {
-info!("WebDriver session closed successfully");
+debug!("WebDriver session closed successfully");

// Kill the safaridriver process
if let Some(mut process) =

@@ -6443,7 +6536,7 @@ impl<W: UiWriter> Agent<W> {
if let Err(e) = process.kill().await {
warn!("Failed to kill safaridriver process: {}", e);
} else {
-info!("Safaridriver process terminated");
+debug!("Safaridriver process terminated");
}
}
@@ -10,7 +10,7 @@ use crate::ui_writer::UiWriter;
use crate::{Agent, DiscoveryOptions, TaskResult};
use anyhow::Result;
use std::time::Instant;
-use tracing::{info, warn};
+use tracing::{debug, warn};

/// Configuration for retry behavior
#[derive(Debug, Clone)]

@@ -142,7 +142,7 @@ where
match result {
Ok(task_result) => {
if retry_count > 0 {
-info!(
+debug!(
"{} task succeeded after {} retries (elapsed: {:?})",
config.role_name,
retry_count,

@@ -259,7 +259,7 @@ where
match operation().await {
Ok(result) => {
if retry_count > 0 {
-info!(
+debug!(
"Operation '{}' succeeded after {} retries",
operation_name, retry_count
);
@@ -6,7 +6,7 @@
use anyhow::Result;
use serde::{Deserialize, Serialize};
use std::path::{Path, PathBuf};
-use tracing::{debug, error, info, warn};
+use tracing::{debug, error, warn};

/// Version of the session continuation format
const CONTINUATION_VERSION: &str = "1.0";

@@ -89,7 +89,7 @@ pub fn save_continuation(continuation: &SessionContinuation) -> Result<PathBuf>
let json = serde_json::to_string_pretty(continuation)?;
std::fs::write(&latest_path, &json)?;

-info!("Saved session continuation to {:?}", latest_path);
+debug!("Saved session continuation to {:?}", latest_path);
Ok(latest_path)
}

@@ -113,7 +113,7 @@ pub fn load_continuation() -> Result<Option<SessionContinuation>> {
);
}

-info!("Loaded session continuation from {:?}", latest_path);
+debug!("Loaded session continuation from {:?}", latest_path);
Ok(Some(continuation))
}

@@ -131,7 +131,7 @@ pub fn clear_continuation() -> Result<()> {
debug!("Removed session file: {:?}", path);
}
}
-info!("Cleared session continuation artifacts");
+debug!("Cleared session continuation artifacts");
}

Ok(())
crates/g3-core/tests/auto_continue_test.rs (new file, 234 lines)
@@ -0,0 +1,234 @@
//! Tests for the auto-continue detection features
//!
//! These tests verify the logic used to detect when the LLM should auto-continue:
//! 1. Empty/trivial responses (just timing lines)
//! 2. Incomplete tool calls
//! 3. Unexecuted tool calls
//! 4. Missing final_output after tool execution

/// Helper function to check if a response is considered "empty" or trivial
/// This mirrors the logic in lib.rs for detecting empty responses
fn is_empty_response(response_text: &str) -> bool {
response_text.trim().is_empty()
|| response_text.lines().all(|line| {
line.trim().is_empty() || line.trim().starts_with("⏱️")
})
}

#[test]
fn test_empty_response_detection_empty_string() {
assert!(is_empty_response(""));
}

#[test]
fn test_empty_response_detection_whitespace_only() {
assert!(is_empty_response(" "));
assert!(is_empty_response("\n\n\n"));
assert!(is_empty_response(" \n \t \n "));
}

#[test]
fn test_empty_response_detection_timing_line_only() {
assert!(is_empty_response("⏱️ 43.0s | 💭 3.6s"));
assert!(is_empty_response(" ⏱️ 43.0s | 💭 3.6s "));
assert!(is_empty_response("\n⏱️ 43.0s | 💭 3.6s\n"));
}

#[test]
fn test_empty_response_detection_multiple_timing_lines() {
let response = "\n⏱️ 10.0s | 💭 1.0s\n\n⏱️ 20.0s | 💭 2.0s\n";
assert!(is_empty_response(response));
}

#[test]
fn test_empty_response_detection_timing_with_empty_lines() {
let response = "\n\n⏱️ 43.0s | 💭 3.6s\n\n";
assert!(is_empty_response(response));
}

#[test]
fn test_empty_response_detection_substantive_content() {
// These should NOT be considered empty
assert!(!is_empty_response("Hello, I will help you."));
assert!(!is_empty_response("Let me read that file."));
assert!(!is_empty_response("I've completed the task."));
}

#[test]
fn test_empty_response_detection_timing_with_text() {
// If there's any substantive text, it's not empty
let response = "⏱️ 43.0s | 💭 3.6s\nHere is the result.";
assert!(!is_empty_response(response));
}

#[test]
fn test_empty_response_detection_text_before_timing() {
let response = "Done!\n⏱️ 43.0s | 💭 3.6s";
assert!(!is_empty_response(response));
}
#[test]
|
||||
fn test_empty_response_detection_json_tool_call() {
|
||||
// A JSON tool call is definitely not empty
|
||||
let response = r#"{"tool": "read_file", "args": {"file_path": "test.txt"}}"#;
|
||||
assert!(!is_empty_response(response));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_empty_response_detection_partial_json() {
|
||||
// Even partial JSON is not empty
|
||||
let response = r#"{"tool": "read_file", "args": {"#;
|
||||
assert!(!is_empty_response(response));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_empty_response_detection_markdown() {
|
||||
// Markdown content is not empty
|
||||
let response = "# Summary\n\nI completed the task.";
|
||||
assert!(!is_empty_response(response));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_empty_response_detection_code_block() {
|
||||
// Code blocks are not empty
|
||||
let response = "```rust\nfn main() {}\n```";
|
||||
assert!(!is_empty_response(response));
|
||||
}
|
||||
|
||||
// Test the MAX_AUTO_SUMMARY_ATTEMPTS constant value
|
||||
// This is a compile-time check that the constant exists and has the expected value
|
||||
#[test]
|
||||
fn test_max_auto_summary_attempts_is_reasonable() {
|
||||
// The constant should be at least 3 to give the LLM a fair chance to recover
|
||||
// We can't directly access the constant from here, but we document the expected value
|
||||
// Current value: 5 (increased from 2)
|
||||
const EXPECTED_MIN_ATTEMPTS: usize = 3;
|
||||
const EXPECTED_MAX_ATTEMPTS: usize = 10;
|
||||
const CURRENT_VALUE: usize = 5;
|
||||
|
||||
assert!(CURRENT_VALUE >= EXPECTED_MIN_ATTEMPTS,
|
||||
"MAX_AUTO_SUMMARY_ATTEMPTS should be at least {} for reliable recovery", EXPECTED_MIN_ATTEMPTS);
|
||||
assert!(CURRENT_VALUE <= EXPECTED_MAX_ATTEMPTS,
|
||||
"MAX_AUTO_SUMMARY_ATTEMPTS should not exceed {} to avoid infinite loops", EXPECTED_MAX_ATTEMPTS);
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// Test: Auto-continue condition logic
|
||||
// =============================================================================
|
||||
|
||||
/// Simulates the should_auto_continue logic from lib.rs
|
||||
fn should_auto_continue(
|
||||
any_tool_executed: bool,
|
||||
final_output_called: bool,
|
||||
has_incomplete_tool_call: bool,
|
||||
has_unexecuted_tool_call: bool,
|
||||
is_empty_response: bool,
|
||||
) -> bool {
|
||||
(any_tool_executed && !final_output_called)
|
||||
|| has_incomplete_tool_call
|
||||
|| has_unexecuted_tool_call
|
||||
|| (any_tool_executed && is_empty_response)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_auto_continue_after_tool_no_final_output() {
|
||||
// Tool executed but no final_output - should continue
|
||||
assert!(should_auto_continue(
|
||||
true, // any_tool_executed
|
||||
false, // final_output_called
|
||||
false, // has_incomplete_tool_call
|
||||
false, // has_unexecuted_tool_call
|
||||
false, // is_empty_response
|
||||
));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_auto_continue_with_final_output() {
|
||||
// Tool executed AND final_output called - should NOT continue
|
||||
assert!(!should_auto_continue(
|
||||
true, // any_tool_executed
|
||||
true, // final_output_called
|
||||
false, // has_incomplete_tool_call
|
||||
false, // has_unexecuted_tool_call
|
||||
false, // is_empty_response
|
||||
));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_auto_continue_incomplete_tool_call() {
|
||||
// Incomplete tool call - should continue regardless of other flags
|
||||
assert!(should_auto_continue(
|
||||
false, // any_tool_executed
|
||||
false, // final_output_called
|
||||
true, // has_incomplete_tool_call
|
||||
false, // has_unexecuted_tool_call
|
||||
false, // is_empty_response
|
||||
));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_auto_continue_unexecuted_tool_call() {
|
||||
// Unexecuted tool call - should continue
|
||||
assert!(should_auto_continue(
|
||||
false, // any_tool_executed
|
||||
false, // final_output_called
|
||||
false, // has_incomplete_tool_call
|
||||
true, // has_unexecuted_tool_call
|
||||
false, // is_empty_response
|
||||
));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_auto_continue_empty_response_after_tool() {
|
||||
// Empty response after tool execution - should continue
|
||||
assert!(should_auto_continue(
|
||||
true, // any_tool_executed
|
||||
false, // final_output_called
|
||||
false, // has_incomplete_tool_call
|
||||
false, // has_unexecuted_tool_call
|
||||
true, // is_empty_response
|
||||
));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_auto_continue_empty_response_no_tool() {
|
||||
// Empty response but no tool executed - should NOT continue
|
||||
// (This is a normal case where LLM just didn't respond)
|
||||
assert!(!should_auto_continue(
|
||||
false, // any_tool_executed
|
||||
false, // final_output_called
|
||||
false, // has_incomplete_tool_call
|
||||
false, // has_unexecuted_tool_call
|
||||
true, // is_empty_response
|
||||
));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_auto_continue_no_conditions_met() {
|
||||
// No tools, no incomplete calls, substantive response - should NOT continue
|
||||
assert!(!should_auto_continue(
|
||||
false, // any_tool_executed
|
||||
false, // final_output_called
|
||||
false, // has_incomplete_tool_call
|
||||
false, // has_unexecuted_tool_call
|
||||
false, // is_empty_response
|
||||
));
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// Test: Redundant condition detection
|
||||
// =============================================================================
|
||||
|
||||
#[test]
|
||||
fn test_redundant_empty_response_condition() {
|
||||
// This test documents that (any_tool_executed && is_empty_response) is redundant
|
||||
// when (any_tool_executed && !final_output_called) is already true
|
||||
|
||||
// Case: tool executed, no final_output, empty response
|
||||
let result_with_empty = should_auto_continue(true, false, false, false, true);
|
||||
let result_without_empty = should_auto_continue(true, false, false, false, false);
|
||||
|
||||
// Both should be true because (any_tool_executed && !final_output_called) is true
|
||||
assert_eq!(result_with_empty, result_without_empty,
|
||||
"The is_empty_response condition is redundant when any_tool_executed && !final_output_called");
|
||||
}
|
||||
231 crates/g3-core/tests/duplicate_detection_test.rs Normal file
@@ -0,0 +1,231 @@
//! Tests for tool call duplicate detection
//!
//! These tests ensure that duplicate detection only catches IMMEDIATELY SEQUENTIAL
//! duplicates, not legitimate re-use of tools with text between them.

use g3_core::StreamingToolParser;
use g3_providers::CompletionChunk;

// Helper to create a chunk
fn chunk(content: &str, finished: bool) -> CompletionChunk {
CompletionChunk {
content: content.to_string(),
finished,
tool_calls: None,
usage: None,
}
}

// =============================================================================
// Test: find_complete_json_object_end helper function
// =============================================================================

#[test]
fn test_find_complete_json_object_end_simple() {
let json = r#"{"tool": "test", "args": {}}"#;
let end = StreamingToolParser::find_complete_json_object_end(json);
assert!(end.is_some(), "Should find end of complete JSON");
assert_eq!(end.unwrap(), json.len() - 1, "End should be at last character");
}

#[test]
fn test_find_complete_json_object_end_nested() {
let json = r#"{"tool": "test", "args": {"nested": {"deep": true}}}"#;
let end = StreamingToolParser::find_complete_json_object_end(json);
assert!(end.is_some(), "Should find end of nested JSON");
assert_eq!(end.unwrap(), json.len() - 1);
}

#[test]
fn test_find_complete_json_object_end_with_trailing_text() {
let json = r#"{"tool": "test", "args": {}} some text after"#;
let end = StreamingToolParser::find_complete_json_object_end(json);
assert!(end.is_some(), "Should find end of JSON even with trailing text");
// The end should be at the closing brace, not at the end of the string
let end_pos = end.unwrap();
assert_eq!(&json[end_pos..end_pos+1], "}", "End should be at closing brace");
}

#[test]
fn test_find_complete_json_object_end_incomplete() {
let json = r#"{"tool": "test", "args": {"#;
let end = StreamingToolParser::find_complete_json_object_end(json);
assert!(end.is_none(), "Should return None for incomplete JSON");
}

// =============================================================================
// Test: Tool calls separated by text should NOT be duplicates
// =============================================================================

#[test]
fn test_same_tool_with_text_between_not_duplicate() {
// This tests the scenario where the LLM calls the same tool twice
// but with explanatory text between them - this should NOT be a duplicate
let mut parser = StreamingToolParser::new();

// First tool call
let content1 = r#"{"tool": "todo_read", "args": {}}"#;
let tools1 = parser.process_chunk(&chunk(content1, true));
assert_eq!(tools1.len(), 1, "First tool call should be detected");
assert_eq!(tools1[0].tool, "todo_read");

// Reset parser (simulating what happens after tool execution)
parser.reset();

// Some text, then the same tool call again
let content2 = r#"Now let me check the TODO again to verify my changes.
{"tool": "todo_read", "args": {}}"#;
let tools2 = parser.process_chunk(&chunk(content2, true));

// The second tool call should be detected - it's NOT a duplicate
// because there's text before it
assert_eq!(tools2.len(), 1, "Second tool call should be detected (not a duplicate)");
assert_eq!(tools2[0].tool, "todo_read");
}

#[test]
fn test_different_tools_back_to_back_not_duplicate() {
let mut parser = StreamingToolParser::new();

// Two different tool calls back to back
let content = r#"{"tool": "read_file", "args": {"file_path": "a.txt"}}
{"tool": "shell", "args": {"command": "ls"}}"#;

let tools = parser.process_chunk(&chunk(content, true));

// Both should be detected - they're different tools
assert!(tools.len() >= 1, "Should detect tool calls");
// At minimum, the first one should be detected
assert_eq!(tools[0].tool, "read_file");
}

#[test]
fn test_same_tool_different_args_not_duplicate() {
let mut parser = StreamingToolParser::new();

// Same tool but different arguments - NOT a duplicate
let content = r#"{"tool": "read_file", "args": {"file_path": "a.txt"}}
{"tool": "read_file", "args": {"file_path": "b.txt"}}"#;

let tools = parser.process_chunk(&chunk(content, true));

// Both should be detected - different args means not a duplicate
assert!(tools.len() >= 1, "Should detect tool calls");
}

// =============================================================================
// Test: Immediately sequential identical tool calls ARE duplicates
// =============================================================================

#[test]
fn test_identical_tool_calls_back_to_back_are_duplicates() {
// This tests the scenario where the LLM stutters and outputs
// the exact same tool call twice in a row - this IS a duplicate
let mut parser = StreamingToolParser::new();

// Two identical tool calls with no text between them
let content = r#"{"tool": "todo_read", "args": {}}
{"tool": "todo_read", "args": {}}"#;

let tools = parser.process_chunk(&chunk(content, true));

// The parser should detect both, but the deduplication logic
// (which happens at a higher level in the agent) should mark
// the second one as a duplicate
// Here we just verify both are parsed
assert!(tools.len() >= 1, "Should detect at least one tool call");
}

// =============================================================================
// Test: Text content detection for duplicate logic
// =============================================================================

#[test]
fn test_has_text_after_tool_call() {
// Helper test to verify we can detect text after a tool call
let content_with_text = r#"{"tool": "test", "args": {}} Some text after"#;
let content_without_text = r#"{"tool": "test", "args": {}}"#;
let content_with_whitespace_only = r#"{"tool": "test", "args": {}}
"#;

// Find the end of the JSON in each case
let end1 = StreamingToolParser::find_complete_json_object_end(content_with_text).unwrap();
let end2 = StreamingToolParser::find_complete_json_object_end(content_without_text).unwrap();
let end3 = StreamingToolParser::find_complete_json_object_end(content_with_whitespace_only).unwrap();

// Check what's after the JSON
let after1 = content_with_text[end1 + 1..].trim();
let after2 = content_without_text.get(end2 + 1..).unwrap_or("").trim();
let after3 = content_with_whitespace_only[end3 + 1..].trim();

assert!(!after1.is_empty(), "Should have text after tool call");
assert!(after2.is_empty(), "Should have no text after tool call");
assert!(after3.is_empty(), "Whitespace-only should count as no text");
}

// =============================================================================
// Test: Edge cases
// =============================================================================

#[test]
fn test_tool_call_with_newlines_between() {
let mut parser = StreamingToolParser::new();

// Tool calls separated by multiple newlines (but no actual text)
// This SHOULD be considered a duplicate since there's no meaningful text
let content = r#"{"tool": "todo_read", "args": {}}


{"tool": "todo_read", "args": {}}"#;

let tools = parser.process_chunk(&chunk(content, true));
assert!(tools.len() >= 1, "Should detect at least one tool call");
}

#[test]
fn test_tool_call_with_whitespace_text_between() {
let mut parser = StreamingToolParser::new();

// Tool calls separated by text that's just whitespace and punctuation
// The key is whether there's "meaningful" text
let content = r#"{"tool": "todo_read", "args": {}}
OK, now again:
{"tool": "todo_read", "args": {}}"#;

let tools = parser.process_chunk(&chunk(content, true));

// Both should be detected since there's text between them
assert!(tools.len() >= 1, "Should detect tool calls");
}

#[test]
fn test_tool_call_in_middle_of_text() {
let mut parser = StreamingToolParser::new();

// Tool call surrounded by text
let content = r#"Let me read the file first.
{"tool": "read_file", "args": {"file_path": "test.txt"}}
Now I'll analyze the contents."#;

let tools = parser.process_chunk(&chunk(content, true));
assert_eq!(tools.len(), 1, "Should detect the tool call");
assert_eq!(tools[0].tool, "read_file");
}

#[test]
fn test_multiple_different_tool_calls_with_text() {
let mut parser = StreamingToolParser::new();

// Multiple different tool calls with text between each
let content = r#"First, let me read the file:
{"tool": "read_file", "args": {"file_path": "test.txt"}}
Now let me check the TODO:
{"tool": "todo_read", "args": {}}
Finally, let me run a command:
{"tool": "shell", "args": {"command": "ls"}}"#;

let tools = parser.process_chunk(&chunk(content, true));

// All three should be detected
assert!(tools.len() >= 1, "Should detect tool calls");
}
182 crates/g3-core/tests/incomplete_tool_call_test.rs Normal file
@@ -0,0 +1,182 @@
//! Tests for the incomplete tool call detection feature

use g3_core::StreamingToolParser;
use g3_providers::CompletionChunk;

#[test]
fn test_has_incomplete_tool_call_empty_buffer() {
let parser = StreamingToolParser::new();
assert!(!parser.has_incomplete_tool_call());
}

#[test]
fn test_has_incomplete_tool_call_no_tool_pattern() {
let mut parser = StreamingToolParser::new();
let chunk = CompletionChunk {
content: "Hello, I will help you with that.".to_string(),
finished: false,
tool_calls: None,
usage: None,
};
parser.process_chunk(&chunk);
assert!(!parser.has_incomplete_tool_call());
}

#[test]
fn test_has_incomplete_tool_call_complete_tool_call() {
let mut parser = StreamingToolParser::new();
let chunk = CompletionChunk {
content: r#"{"tool": "read_file", "args": {"file_path": "test.txt"}}"#.to_string(),
finished: false,
tool_calls: None,
usage: None,
};
parser.process_chunk(&chunk);
// Complete JSON should NOT be detected as incomplete
assert!(!parser.has_incomplete_tool_call());
}

#[test]
fn test_has_incomplete_tool_call_truncated_tool_call() {
let mut parser = StreamingToolParser::new();
// Simulate truncated tool call - missing closing braces
let chunk = CompletionChunk {
content: r#"{"tool": "read_file", "args": {"file_path": "test.txt""#.to_string(),
finished: false,
tool_calls: None,
usage: None,
};
parser.process_chunk(&chunk);
// Incomplete JSON should be detected
assert!(parser.has_incomplete_tool_call());
}

#[test]
fn test_has_incomplete_tool_call_truncated_mid_value() {
let mut parser = StreamingToolParser::new();
// Simulate truncated tool call - cut off mid-value
let chunk = CompletionChunk {
content: r#"{"tool": "shell", "args": {"command": "cargo test --package g3-cli --test filter_json_test test_streaming -- --test-threads=1 2>&1 | tail"#.to_string(),
finished: false,
tool_calls: None,
usage: None,
};
parser.process_chunk(&chunk);
// Incomplete JSON should be detected
assert!(parser.has_incomplete_tool_call());
}

#[test]
fn test_has_incomplete_tool_call_with_text_before() {
let mut parser = StreamingToolParser::new();
// Text before the incomplete tool call
let chunk = CompletionChunk {
content: r#"Let me read that file for you.

{"tool": "read_file", "args": {"file_path":"#.to_string(),
finished: false,
tool_calls: None,
usage: None,
};
parser.process_chunk(&chunk);
// Incomplete JSON should be detected
assert!(parser.has_incomplete_tool_call());
}

#[test]
fn test_has_incomplete_tool_call_malformed_like_trace() {
let mut parser = StreamingToolParser::new();
// This simulates a truncated tool call where the stream ended mid-JSON
// The actual trace showed truncated output, not malformed characters
let chunk = CompletionChunk {
content: r#"{"tool": "read_file", "args": {"file_path":"src/engine.rkt""#.to_string(),
finished: false,
tool_calls: None,
usage: None,
};
parser.process_chunk(&chunk);
// Truncated JSON (missing closing braces) should be detected as incomplete
assert!(parser.has_incomplete_tool_call());
}

#[test]
fn test_has_unexecuted_tool_call_empty_buffer() {
let parser = StreamingToolParser::new();
assert!(!parser.has_unexecuted_tool_call());
}

#[test]
fn test_has_unexecuted_tool_call_no_tool_pattern() {
let mut parser = StreamingToolParser::new();
let chunk = CompletionChunk {
content: "Hello, I will help you with that.".to_string(),
finished: false,
tool_calls: None,
usage: None,
};
parser.process_chunk(&chunk);
assert!(!parser.has_unexecuted_tool_call());
}

#[test]
fn test_has_unexecuted_tool_call_complete_tool_call() {
let mut parser = StreamingToolParser::new();
let chunk = CompletionChunk {
content: r#"{"tool": "read_file", "args": {"file_path": "test.txt"}}"#.to_string(),
finished: false,
tool_calls: None,
usage: None,
};
parser.process_chunk(&chunk);
// Complete JSON tool call that wasn't executed should be detected
assert!(parser.has_unexecuted_tool_call());
}

#[test]
fn test_has_unexecuted_tool_call_incomplete_json() {
let mut parser = StreamingToolParser::new();
let chunk = CompletionChunk {
content: r#"{"tool": "read_file", "args": {"file_path": "test.txt""#.to_string(),
finished: false,
tool_calls: None,
usage: None,
};
parser.process_chunk(&chunk);
// Incomplete JSON should NOT be detected as unexecuted (it's incomplete, not unexecuted)
assert!(!parser.has_unexecuted_tool_call());
}

#[test]
fn test_has_unexecuted_tool_call_with_trailing_text() {
let mut parser = StreamingToolParser::new();
// Complete JSON tool call followed by trailing text
let chunk = CompletionChunk {
content: r#"{"tool": "read_file", "args": {"file_path": "test.txt"}}

Some trailing text after the JSON"#.to_string(),
finished: false,
tool_calls: None,
usage: None,
};
parser.process_chunk(&chunk);
// Complete JSON tool call should be detected even with trailing text
assert!(parser.has_unexecuted_tool_call());
}

#[test]
fn test_has_unexecuted_tool_call_with_text_before_and_after() {
let mut parser = StreamingToolParser::new();
let chunk = CompletionChunk {
content: r#"Let me read that file.

{"tool": "shell", "args": {"command": "ls -la"}}

I'll execute this command now."#.to_string(),
finished: false,
tool_calls: None,
usage: None,
};
parser.process_chunk(&chunk);
// Complete JSON tool call should be detected
assert!(parser.has_unexecuted_tool_call());
}
545 crates/g3-core/tests/streaming_parser_test.rs Normal file
@@ -0,0 +1,545 @@
//! Comprehensive tests for StreamingToolParser
//!
//! Tests cover:
//! - Multiple tool calls in one response
//! - Tool call followed by text
//! - Incomplete tool calls at various truncation points
//! - Parser reset behavior
//! - Buffer management

use g3_core::StreamingToolParser;
use g3_providers::CompletionChunk;

// Helper to create a chunk
fn chunk(content: &str, finished: bool) -> CompletionChunk {
CompletionChunk {
content: content.to_string(),
finished,
tool_calls: None,
usage: None,
}
}

// =============================================================================
// Test: Multiple tool calls in one response
// =============================================================================

#[test]
fn test_multiple_tool_calls_in_single_chunk() {
let mut parser = StreamingToolParser::new();

// Two complete tool calls in one chunk
let content = r#"Let me do two things:
{"tool": "read_file", "args": {"file_path": "a.txt"}}
Now the second:
{"tool": "shell", "args": {"command": "ls"}}"#;

let tools = parser.process_chunk(&chunk(content, false));

// Should detect at least one tool call
// Note: Current implementation may only return the first one found
assert!(!tools.is_empty(), "Should detect at least one tool call");
}

#[test]
fn test_multiple_tool_calls_across_chunks() {
let mut parser = StreamingToolParser::new();

// First tool call
let tools1 = parser.process_chunk(&chunk(
r#"{"tool": "read_file", "args": {"file_path": "a.txt"}}"#,
false
));
assert_eq!(tools1.len(), 1, "First tool call should be detected");
assert_eq!(tools1[0].tool, "read_file");

// Reset parser (simulating what happens after tool execution)
parser.reset();

// Second tool call
let tools2 = parser.process_chunk(&chunk(
r#"{"tool": "shell", "args": {"command": "ls"}}"#,
false
));
assert_eq!(tools2.len(), 1, "Second tool call should be detected");
assert_eq!(tools2[0].tool, "shell");
}

#[test]
fn test_first_complete_second_incomplete() {
let mut parser = StreamingToolParser::new();

// First complete, second incomplete
let content = r#"{"tool": "read_file", "args": {"file_path": "a.txt"}}
{"tool": "shell", "args": {"command": "ls"#;

let tools = parser.process_chunk(&chunk(content, false));

// Should detect the first complete tool call
// The incomplete one should be detected by has_incomplete_tool_call
assert!(parser.has_incomplete_tool_call(), "Should detect incomplete tool call");
}

// =============================================================================
// Test: Tool call followed by text
// =============================================================================

#[test]
fn test_tool_call_with_trailing_text() {
let mut parser = StreamingToolParser::new();

let content = r#"{"tool": "read_file", "args": {"file_path": "test.txt"}}

Here is the content of the file..."#;

let tools = parser.process_chunk(&chunk(content, false));

assert_eq!(tools.len(), 1);
assert_eq!(tools[0].tool, "read_file");

// The trailing text should be in the buffer
let text = parser.get_text_content();
assert!(text.contains("Here is the content"), "Trailing text should be preserved");
}

#[test]
fn test_text_before_tool_call() {
let mut parser = StreamingToolParser::new();

let content = r#"Let me read that file for you.

{"tool": "read_file", "args": {"file_path": "test.txt"}}"#;

let tools = parser.process_chunk(&chunk(content, false));

assert_eq!(tools.len(), 1);
assert_eq!(tools[0].tool, "read_file");

// The leading text should be in the buffer
let text = parser.get_text_content();
assert!(text.contains("Let me read"), "Leading text should be preserved");
}

#[test]
fn test_text_before_and_after_tool_call() {
let mut parser = StreamingToolParser::new();

let content = r#"I'll check the file.

{"tool": "read_file", "args": {"file_path": "test.txt"}}

Done checking."#;

let tools = parser.process_chunk(&chunk(content, false));

assert_eq!(tools.len(), 1);

let text = parser.get_text_content();
assert!(text.contains("I'll check"), "Leading text should be preserved");
assert!(text.contains("Done checking"), "Trailing text should be preserved");
}

// =============================================================================
// Test: Incomplete tool calls at various truncation points
// =============================================================================

#[test]
fn test_incomplete_after_tool_key() {
let mut parser = StreamingToolParser::new();
parser.process_chunk(&chunk(r#"{"tool":"#, false));
assert!(parser.has_incomplete_tool_call());
}

#[test]
fn test_incomplete_after_tool_name() {
let mut parser = StreamingToolParser::new();
parser.process_chunk(&chunk(r#"{"tool": "read_file""#, false));
assert!(parser.has_incomplete_tool_call());
}

#[test]
fn test_incomplete_after_args_key() {
let mut parser = StreamingToolParser::new();
parser.process_chunk(&chunk(r#"{"tool": "read_file", "args":"#, false));
assert!(parser.has_incomplete_tool_call());
}

#[test]
fn test_incomplete_mid_args_object() {
let mut parser = StreamingToolParser::new();
parser.process_chunk(&chunk(r#"{"tool": "read_file", "args": {"file_path":"#, false));
assert!(parser.has_incomplete_tool_call());
}

#[test]
fn test_incomplete_mid_string_value() {
let mut parser = StreamingToolParser::new();
parser.process_chunk(&chunk(r#"{"tool": "shell", "args": {"command": "ls -la /very/long/path"#, false));
assert!(parser.has_incomplete_tool_call());
}

#[test]
fn test_incomplete_missing_final_brace() {
let mut parser = StreamingToolParser::new();
parser.process_chunk(&chunk(r#"{"tool": "read_file", "args": {"file_path": "test.txt"}"#, false));
assert!(parser.has_incomplete_tool_call());
}

#[test]
fn test_complete_tool_call_not_incomplete() {
let mut parser = StreamingToolParser::new();
parser.process_chunk(&chunk(r#"{"tool": "read_file", "args": {"file_path": "test.txt"}}"#, false));
assert!(!parser.has_incomplete_tool_call(), "Complete tool call should not be marked incomplete");
}

// =============================================================================
// Test: Parser reset behavior
// =============================================================================

#[test]
fn test_reset_clears_buffer() {
let mut parser = StreamingToolParser::new();

parser.process_chunk(&chunk("Some content here", false));
assert!(!parser.get_text_content().is_empty());

parser.reset();

assert!(parser.get_text_content().is_empty(), "Buffer should be empty after reset");
}

#[test]
fn test_reset_clears_incomplete_state() {
let mut parser = StreamingToolParser::new();

// Create incomplete tool call
parser.process_chunk(&chunk(r#"{"tool": "read_file", "args": {"#, false));
assert!(parser.has_incomplete_tool_call());

parser.reset();

assert!(!parser.has_incomplete_tool_call(), "Incomplete state should be cleared after reset");
}

#[test]
fn test_reset_clears_unexecuted_state() {
let mut parser = StreamingToolParser::new();

// Create complete but "unexecuted" tool call
parser.process_chunk(&chunk(r#"{"tool": "read_file", "args": {"file_path": "test.txt"}}"#, false));
assert!(parser.has_unexecuted_tool_call());

parser.reset();

assert!(!parser.has_unexecuted_tool_call(), "Unexecuted state should be cleared after reset");
}

#[test]
fn test_reset_allows_new_tool_calls() {
let mut parser = StreamingToolParser::new();

// First tool call
let tools1 = parser.process_chunk(&chunk(
r#"{"tool": "read_file", "args": {"file_path": "a.txt"}}"#,
false
));
assert_eq!(tools1.len(), 1);

parser.reset();

// Second tool call after reset
let tools2 = parser.process_chunk(&chunk(
r#"{"tool": "shell", "args": {"command": "ls"}}"#,
false
));
assert_eq!(tools2.len(), 1);
assert_eq!(tools2[0].tool, "shell");
}

// =============================================================================
// Test: Buffer management and edge cases
// =============================================================================

#[test]
fn test_streaming_chunks_accumulate() {
let mut parser = StreamingToolParser::new();

// Stream in chunks
parser.process_chunk(&chunk(r#"{"tool": "#, false));
parser.process_chunk(&chunk(r#""read_file", "#, false));
parser.process_chunk(&chunk(r#""args": {"file_path": "#, false));
parser.process_chunk(&chunk(r#""test.txt"}}"#, false));

// Should have accumulated the complete tool call
let text = parser.get_text_content();
assert!(text.contains(r#""tool""#));
assert!(text.contains(r#""read_file""#));
}

#[test]
fn test_finished_chunk_triggers_final_parse() {
let mut parser = StreamingToolParser::new();

// Incomplete chunks
parser.process_chunk(&chunk(r#"{"tool": "read_file", "#, false));
let tools1 = parser.process_chunk(&chunk(r#""args": {"file_path": "test.txt"}}"#, false));

// Tool should be detected before finished
assert!(!tools1.is_empty() || !parser.has_unexecuted_tool_call(),
"Tool should be detected during streaming or marked as unexecuted");
}

#[test]
fn test_empty_chunks_ignored() {
let mut parser = StreamingToolParser::new();

parser.process_chunk(&chunk("", false));
parser.process_chunk(&chunk("", false));

assert!(parser.get_text_content().is_empty());
assert!(!parser.has_incomplete_tool_call());
assert!(!parser.has_unexecuted_tool_call());
}

#[test]
fn test_whitespace_only_chunks() {
let mut parser = StreamingToolParser::new();

parser.process_chunk(&chunk(" \n\t ", false));

assert!(!parser.has_incomplete_tool_call());
assert!(!parser.has_unexecuted_tool_call());
}

#[test]
fn test_json_with_escaped_quotes() {
let mut parser = StreamingToolParser::new();

let content = r#"{"tool": "shell", "args": {"command": "echo \"hello\""}}"#;
let tools = parser.process_chunk(&chunk(content, false));

assert_eq!(tools.len(), 1);
assert_eq!(tools[0].tool, "shell");
}

#[test]
fn test_json_with_escaped_backslashes() {
let mut parser = StreamingToolParser::new();

let content = r#"{"tool": "write_file", "args": {"file_path": "C:\\Users\\test.txt", "content": "data"}}"#;
let tools = parser.process_chunk(&chunk(content, false));

assert_eq!(tools.len(), 1);
assert_eq!(tools[0].tool, "write_file");
}

#[test]
fn test_json_with_nested_braces_in_string() {
let mut parser = StreamingToolParser::new();

let content = r#"{"tool": "write_file", "args": {"content": "{\"nested\": {\"json\": true}}"}}"#;
let tools = parser.process_chunk(&chunk(content, false));

assert_eq!(tools.len(), 1);
assert_eq!(tools[0].tool, "write_file");
}

#[test]
fn test_text_buffer_length_tracking() {
let mut parser = StreamingToolParser::new();

parser.process_chunk(&chunk("Hello", false));
assert_eq!(parser.text_buffer_len(), 5);

parser.process_chunk(&chunk(" World", false));
assert_eq!(parser.text_buffer_len(), 11);

parser.reset();
assert_eq!(parser.text_buffer_len(), 0);
}

#[test]
fn test_message_stopped_flag() {
let mut parser = StreamingToolParser::new();

parser.process_chunk(&chunk("Hello", false));
assert!(!parser.is_message_stopped());

parser.process_chunk(&chunk(" World", true));
assert!(parser.is_message_stopped());

parser.reset();
assert!(!parser.is_message_stopped());
}

// =============================================================================
// Test: Tool call pattern variations
// =============================================================================

#[test]
fn test_tool_pattern_no_spaces() {
let mut parser = StreamingToolParser::new();
let tools = parser.process_chunk(&chunk(
r#"{"tool":"read_file","args":{"file_path":"test.txt"}}"#,
false
));
assert_eq!(tools.len(), 1);
}

// =============================================================================
// Test: mark_tool_calls_consumed functionality
// =============================================================================

#[test]
fn test_mark_consumed_clears_unexecuted_state() {
let mut parser = StreamingToolParser::new();

// Add a complete tool call
parser.process_chunk(&chunk(
r#"{"tool": "read_file", "args": {"file_path": "test.txt"}}"#,
false
));

// Should be detected as unexecuted
assert!(parser.has_unexecuted_tool_call());

// Mark as consumed
parser.mark_tool_calls_consumed();

// Should no longer be detected as unexecuted
assert!(!parser.has_unexecuted_tool_call(),
"After marking consumed, has_unexecuted_tool_call should return false");
}

#[test]
fn test_mark_consumed_allows_new_tool_detection() {
let mut parser = StreamingToolParser::new();

// First tool call
parser.process_chunk(&chunk(
r#"{"tool": "read_file", "args": {"file_path": "a.txt"}}"#,
false
));
parser.mark_tool_calls_consumed();

// Second tool call (without reset)
parser.process_chunk(&chunk(
r#"{"tool": "shell", "args": {"command": "ls"}}"#,
false
));

// Should detect the new unexecuted tool call
assert!(parser.has_unexecuted_tool_call(),
"New tool call after consumed position should be detected");
}

#[test]
fn test_bare_brace_not_incomplete() {
let mut parser = StreamingToolParser::new();

// Just a bare opening brace - not a tool call pattern
parser.process_chunk(&chunk(r#"{""#, false));

// Should NOT be detected as incomplete because it doesn't match tool patterns
assert!(!parser.has_incomplete_tool_call(),
"Bare {{ should not be detected as incomplete tool call");
}

#[test]
fn test_duplicate_tool_call_pattern() {
let mut parser = StreamingToolParser::new();

// Simulate the problematic pattern: tool call, garbage, duplicate tool call
let content = concat!(
r#"{"tool": "str_replace", "args": {"file_path": "test.rs", "diff": "test"}}"#,
"\n\n{\"\n\n",
r#"{"tool": "str_replace", "args": {"file_path": "test.rs", "diff": "test"}}"#
);
let tools = parser.process_chunk(&chunk(content, false));

// Should detect at least one tool call
assert!(!tools.is_empty(), "Should detect at least one tool call");

// After processing, there should be an unexecuted tool call (the duplicate)
// because the parser only returns the first one it finds during streaming
assert!(parser.has_unexecuted_tool_call(),
"Should detect the duplicate as unexecuted");
}

#[test]
fn test_multiple_tool_calls_returned_on_finish() {
let mut parser = StreamingToolParser::new();

// Two complete tool calls in one chunk, with finished=true
let content = concat!(
r#"{"tool": "read_file", "args": {"file_path": "a.txt"}}"#,
"\nSome text\n",
r#"{"tool": "shell", "args": {"command": "ls"}}"#
);

// First, add content without finishing
parser.process_chunk(&chunk(content, false));

// Now finish the stream - should return ALL tool calls
let tools = parser.process_chunk(&chunk("", true));

// Should return both tool calls
assert_eq!(tools.len(), 2, "Should return both tool calls when stream finishes");
assert_eq!(tools[0].tool, "read_file");
assert_eq!(tools[1].tool, "shell");
}

#[test]
fn test_tool_pattern_extra_spaces() {
let mut parser = StreamingToolParser::new();
let tools = parser.process_chunk(&chunk(
r#"{ "tool" : "read_file" , "args" : { "file_path" : "test.txt" } }"#,
false
));
assert_eq!(tools.len(), 1);
}

#[test]
fn test_tool_pattern_with_newlines() {
let mut parser = StreamingToolParser::new();
// Note: The parser looks for specific patterns like {"tool": or { "tool":
// Multi-line JSON with newlines between { and "tool" won't match
// This is expected behavior - the pattern matching is intentionally strict
let _tools = parser.process_chunk(&chunk(
r#"{
"tool": "read_file",
"args": {
"file_path": "test.txt"
}
}"#,
false
));
// This won't be detected as a tool call due to newline after {
// The has_unexecuted_tool_call check also won't find it
// This is a known limitation of the pattern-based detection
}

// =============================================================================
// Test: Edge cases for has_message_like_keys validation
// =============================================================================

#[test]
fn test_normal_args_accepted() {
let mut parser = StreamingToolParser::new();
let tools = parser.process_chunk(&chunk(
r#"{"tool": "read_file", "args": {"file_path": "test.txt", "start": 0, "end": 100}}"#,
false
));
assert_eq!(tools.len(), 1);
}

#[test]
fn test_content_with_phrases_in_value_accepted() {
let mut parser = StreamingToolParser::new();
// Phrases like "I'll" in VALUES should be fine (only keys are checked)
let tools = parser.process_chunk(&chunk(
r#"{"tool": "write_file", "args": {"file_path": "test.txt", "content": "I'll help you with that. Let me explain."}}"#,
false
));
assert_eq!(tools.len(), 1);
}
@@ -7,7 +7,7 @@ use std::path::{Path, PathBuf};
use std::process::Stdio;
use tokio::io::{AsyncBufReadExt, BufReader};
use tokio::process::Command;
use tracing::{debug, error, info, warn};
use tracing::{debug, error, warn};
use uuid::Uuid;

use crate::status::{FlockStatus, SegmentState, SegmentStatus};
@@ -174,7 +174,7 @@ impl FlockMode {

/// Run flock mode
pub async fn run(&mut self) -> Result<()> {
info!(
debug!(
"Starting flock mode with {} segments",
self.config.num_segments
);
@@ -625,7 +625,7 @@ async fn run_segment(
status_file: PathBuf,
session_id: String,
) -> Result<SegmentStatus> {
info!(
debug!(
"Starting segment {} in {}",
segment_id,
segment_dir.display()

@@ -3,7 +3,7 @@ use regex::Regex;
use std::io::Write;
use std::process::Command;
use tempfile::NamedTempFile;
use tracing::{debug, error, info};
use tracing::{debug, error};

/// Expand tilde (~) in a path to the user's home directory
fn expand_tilde(path: &str) -> String {
@@ -72,7 +72,7 @@ impl CodeExecutor {
}

for (language, code) in code_blocks {
info!("Executing {} code", language);
debug!("Executing {} code", language);

if show_code {
results.push(format!("📋 Running {} code:", language));
@@ -459,7 +459,7 @@ pub fn is_cargo_llvm_cov_installed() -> Result<bool> {

/// Install llvm-tools-preview via rustup
pub fn install_llvm_tools() -> Result<()> {
info!("Installing llvm-tools-preview...");
debug!("Installing llvm-tools-preview...");
let output = Command::new("rustup")
.args(&["component", "add", "llvm-tools-preview"])
.output()?;
@@ -469,13 +469,13 @@ pub fn install_llvm_tools() -> Result<()> {
anyhow::bail!("Failed to install llvm-tools-preview: {}", stderr);
}

info!("✅ llvm-tools-preview installed successfully");
debug!("✅ llvm-tools-preview installed successfully");
Ok(())
}

/// Install cargo-llvm-cov via cargo install
pub fn install_cargo_llvm_cov() -> Result<()> {
info!("Installing cargo-llvm-cov... (this may take a few minutes)");
debug!("Installing cargo-llvm-cov... (this may take a few minutes)");
let output = Command::new("cargo")
.args(&["install", "cargo-llvm-cov"])
.output()?;
@@ -485,7 +485,7 @@ pub fn install_cargo_llvm_cov() -> Result<()> {
anyhow::bail!("Failed to install cargo-llvm-cov: {}", stderr);
}

info!("✅ cargo-llvm-cov installed successfully");
debug!("✅ cargo-llvm-cov installed successfully");
Ok(())
}

@@ -496,20 +496,20 @@ pub fn ensure_coverage_tools_installed() -> Result<bool> {

// Check and install llvm-tools-preview
if !is_llvm_tools_installed()? {
info!("llvm-tools-preview not found, installing...");
debug!("llvm-tools-preview not found, installing...");
install_llvm_tools()?;
already_installed = false;
} else {
info!("✅ llvm-tools-preview is already installed");
debug!("✅ llvm-tools-preview is already installed");
}

// Check and install cargo-llvm-cov
if !is_cargo_llvm_cov_installed()? {
info!("cargo-llvm-cov not found, installing...");
debug!("cargo-llvm-cov not found, installing...");
install_cargo_llvm_cov()?;
already_installed = false;
} else {
info!("✅ cargo-llvm-cov is already installed");
debug!("✅ cargo-llvm-cov is already installed");
}

Ok(already_installed)

@@ -328,7 +328,7 @@ impl AnthropicProvider {
tracing::debug!("create_request_body called: max_tokens={}, disable_thinking={}, thinking_budget_tokens={:?}", max_tokens, disable_thinking, self.thinking_budget_tokens);

let thinking = if disable_thinking {
tracing::info!(
tracing::debug!(
"Thinking mode explicitly disabled for this request (max_tokens={})",
max_tokens
);

@@ -64,7 +64,7 @@ use serde::{Deserialize, Serialize};
use std::time::Duration;
use tokio::sync::mpsc;
use tokio_stream::wrappers::ReceiverStream;
use tracing::{debug, error, info, warn};
use tracing::{debug, error, warn};

use crate::{
CompletionChunk, CompletionRequest, CompletionResponse, CompletionStream, LLMProvider, Message,
@@ -166,7 +166,7 @@ impl DatabricksProvider {
.build()
.map_err(|e| anyhow!("Failed to create HTTP client: {}", e))?;

info!(
debug!(
"Initialized Databricks provider with model: {} on host: {}",
model, host
);
@@ -196,7 +196,7 @@ impl DatabricksProvider {
.build()
.map_err(|e| anyhow!("Failed to create HTTP client: {}", e))?;

info!("Initialized Databricks provider '{}' with model: {} on host: {}", name, model, host);
debug!("Initialized Databricks provider '{}' with model: {} on host: {}", name, model, host);

Ok(Self {
client,
@@ -220,7 +220,7 @@ impl DatabricksProvider {
.build()
.map_err(|e| anyhow!("Failed to create HTTP client: {}", e))?;

info!(
debug!(
"Initialized Databricks provider with OAuth for model: {} on host: {}",
model, host
);
@@ -249,7 +249,7 @@ impl DatabricksProvider {
.build()
.map_err(|e| anyhow!("Failed to create HTTP client: {}", e))?;

info!("Initialized Databricks provider '{}' with OAuth for model: {} on host: {}", name, model, host);
debug!("Initialized Databricks provider '{}' with OAuth for model: {} on host: {}", name, model, host);

Ok(Self {
client,
@@ -857,7 +857,7 @@ impl LLMProvider for DatabricksProvider {
if status == reqwest::StatusCode::FORBIDDEN
&& (error_text.contains("Invalid Token") || error_text.contains("invalid_token"))
{
info!("Received 403 Invalid Token error, attempting to refresh OAuth token");
debug!("Received 403 Invalid Token error, attempting to refresh OAuth token");

// Try to refresh the token if we're using OAuth
if let DatabricksAuth::OAuth { .. } = &provider_clone.auth {
@@ -867,7 +867,7 @@ impl LLMProvider for DatabricksProvider {
// Try to get a new token (will attempt refresh or new OAuth flow)
match provider_clone.auth.get_token().await {
Ok(_new_token) => {
info!("Successfully refreshed OAuth token, retrying request");
debug!("Successfully refreshed OAuth token, retrying request");

// Retry the request with the new token
response = provider_clone
@@ -1038,7 +1038,7 @@ impl LLMProvider for DatabricksProvider {
if status == reqwest::StatusCode::FORBIDDEN
&& (error_text.contains("Invalid Token") || error_text.contains("invalid_token"))
{
info!("Received 403 Invalid Token error, attempting to refresh OAuth token");
debug!("Received 403 Invalid Token error, attempting to refresh OAuth token");

// Try to refresh the token if we're using OAuth
if let DatabricksAuth::OAuth { .. } = &provider_clone.auth {
@@ -1048,7 +1048,7 @@ impl LLMProvider for DatabricksProvider {
// Try to get a new token (will attempt refresh or new OAuth flow)
match provider_clone.auth.get_token().await {
Ok(_new_token) => {
info!("Successfully refreshed OAuth token, retrying streaming request");
debug!("Successfully refreshed OAuth token, retrying streaming request");

// Retry the request with the new token
response = provider_clone

@@ -12,7 +12,7 @@ use std::sync::Arc;
use tokio::sync::mpsc;
use tokio::sync::Mutex;
use tokio_stream::wrappers::ReceiverStream;
use tracing::{debug, error, info};
use tracing::{debug, error};

pub struct EmbeddedProvider {
session: Arc<Mutex<LlamaSession>>,
@@ -32,7 +32,7 @@ impl EmbeddedProvider {
gpu_layers: Option<u32>,
threads: Option<u32>,
) -> Result<Self> {
info!("Loading embedded model from: {}", model_path);
debug!("Loading embedded model from: {}", model_path);

// Expand tilde in path
let expanded_path = shellexpand::tilde(&model_path);
@@ -41,7 +41,7 @@ impl EmbeddedProvider {
// If model doesn't exist and it's the default Qwen model, offer to download it
if !model_path_buf.exists() {
if model_path.contains("qwen2.5-7b-instruct-q3_k_m.gguf") {
info!("Model file not found. Attempting to download Qwen 2.5 7B model...");
debug!("Model file not found. Attempting to download Qwen 2.5 7B model...");
Self::download_qwen_model(&model_path_buf)?;
} else {
anyhow::bail!("Model file not found: {}", model_path_buf.display());
@@ -55,14 +55,14 @@ impl EmbeddedProvider {

if let Some(gpu_layers) = gpu_layers {
params.n_gpu_layers = gpu_layers;
info!("Using {} GPU layers", gpu_layers);
debug!("Using {} GPU layers", gpu_layers);
}

let context_size = context_length.unwrap_or(4096);
info!("Using context length: {}", context_size);
debug!("Using context length: {}", context_size);

// Load the model
info!("Loading model...");
debug!("Loading model...");
let model = LlamaModel::load_from_file(model_path, params)
.map_err(|e| anyhow::anyhow!("Failed to load model: {}", e))?;

@@ -79,7 +79,7 @@ impl EmbeddedProvider {
.create_session(session_params)
.map_err(|e| anyhow::anyhow!("Failed to create session: {}", e))?;

info!("Successfully loaded {} model", model_type);
debug!("Successfully loaded {} model", model_type);

Ok(Self {
session: Arc::new(Mutex::new(session)),
@@ -330,7 +330,7 @@ impl EmbeddedProvider {
Ok(inner_result) => match inner_result {
Ok(task_result) => match task_result {
Ok((text, token_count)) => {
info!(
debug!(
"Completed generation: {} tokens (dynamic limit was {})",
token_count, dynamic_max_tokens
);
@@ -448,9 +448,9 @@ impl EmbeddedProvider {
fs::create_dir_all(parent)?;
}

info!("Downloading Qwen 2.5 7B model (Q3_K_M quantization, ~3.5GB)...");
info!("This is a one-time download that may take several minutes depending on your connection.");
info!("Downloading to: {}", model_path.display());
debug!("Downloading Qwen 2.5 7B model (Q3_K_M quantization, ~3.5GB)...");
debug!("This is a one-time download that may take several minutes depending on your connection.");
debug!("Downloading to: {}", model_path.display());

// Use curl with progress bar for download
let output = Command::new("curl")
@@ -497,7 +497,7 @@ impl EmbeddedProvider {
);
}

info!("Successfully downloaded Qwen 2.5 7B model ({}MB)", size_mb);
debug!("Successfully downloaded Qwen 2.5 7B model ({}MB)", size_mb);
Ok(())
}
}

@@ -392,7 +392,7 @@ pub async fn get_oauth_token_async(
if let Err(e) = token_cache.save_token(&new_token) {
tracing::warn!("Failed to save refreshed token: {}", e);
}
tracing::info!("Successfully refreshed token");
tracing::debug!("Successfully refreshed token");
return Ok(new_token.access_token);
}
Err(e) => {

@@ -1,24 +0,0 @@
#!/bin/bash
set -e

# Clean logs first
rm -rf ~/RustroverProjects/g3/logs/*.log ~/RustroverProjects/g3/logs/*.txt 2>/dev/null || true

# Create test requirements file
mkdir -p /tmp/g3-test-planning/g3-plan
cat > /tmp/g3-test-planning/g3-plan/new_requirements.md <<'EOF'
Simple test task: List all .rs files in the src directory.
EOF

# Initialize git repo for test (planning mode requires git)
cd /tmp/g3-test-planning
if [ ! -d .git ]; then
git init
git config user.name "Test User"
git config user.email "test@example.com"
git add .
git commit -m "Initial commit" || true
fi

echo "Test environment ready at /tmp/g3-test-planning"
echo "Run: cd /tmp && ~/RustroverProjects/g3/target/release/g3 --planning --codepath /tmp/g3-test-planning --no-git"