diff --git a/crates/g3-core/tests/compaction_behavior_test.rs b/crates/g3-core/tests/compaction_behavior_test.rs
new file mode 100644
index 0000000..f949dec
--- /dev/null
+++ b/crates/g3-core/tests/compaction_behavior_test.rs
@@ -0,0 +1,236 @@
+//! Compaction Behavior Integration Tests
+//!
+//! CHARACTERIZATION: These tests verify the observable behavior of context
+//! compaction through stable public interfaces.
+//!
+//! What these tests protect:
+//! - Compaction configuration calculation (token caps, thinking mode)
+//! - Summary message building from conversation history
+//! - Compaction result handling (success/failure)
+//!
+//! What these tests intentionally do NOT assert:
+//! - Internal implementation details of compaction
+//! - Specific LLM responses (mocked at provider boundary)
+//! - Exact token counts (only relative behavior)
+
+use g3_core::compaction::{
+    calculate_capped_summary_tokens, should_disable_thinking, build_summary_messages,
+    CompactionResult, SUMMARY_MIN_TOKENS,
+};
+use g3_core::ContextWindow;
+use g3_providers::{Message, MessageRole};
+
+// =============================================================================
+// Test: Token cap calculation for different providers
+// =============================================================================
+
+mod token_cap_calculation {
+    use super::*;
+
+    /// Test that Anthropic provider gets appropriate token caps
+    #[test]
+    fn test_anthropic_token_cap() {
+        let config = g3_config::Config::default();
+        
+        // Large base tokens should be capped
+        let capped = calculate_capped_summary_tokens(&config, "anthropic", 50000);
+        assert!(capped <= 10000, "Anthropic should cap at 10000 by default, got {}", capped);
+        assert!(capped >= SUMMARY_MIN_TOKENS, "Should respect minimum floor");
+    }
+
+    /// Test that Databricks provider gets appropriate token caps
+    #[test]
+    fn test_databricks_token_cap() {
+        let config = g3_config::Config::default();
+        
+        let capped = calculate_capped_summary_tokens(&config, "databricks", 50000);
+        assert!(capped <= 10000, "Databricks should cap at 10000, got {}", capped);
+        assert!(capped >= SUMMARY_MIN_TOKENS, "Should respect minimum floor");
+    }
+
+    /// Test that embedded provider gets lower token caps
+    #[test]
+    fn test_embedded_token_cap() {
+        let config = g3_config::Config::default();
+        
+        let capped = calculate_capped_summary_tokens(&config, "embedded", 50000);
+        assert!(capped <= 3000, "Embedded should cap at 3000, got {}", capped);
+        assert!(capped >= SUMMARY_MIN_TOKENS, "Should respect minimum floor");
+    }
+
+    /// Test that unknown providers get conservative caps
+    #[test]
+    fn test_unknown_provider_token_cap() {
+        let config = g3_config::Config::default();
+        
+        let capped = calculate_capped_summary_tokens(&config, "unknown_provider", 50000);
+        assert!(capped <= 5000, "Unknown providers should cap at 5000, got {}", capped);
+        assert!(capped >= SUMMARY_MIN_TOKENS, "Should respect minimum floor");
+    }
+
+    /// Test that small base tokens are preserved (not increased)
+    #[test]
+    fn test_small_base_tokens_preserved() {
+        let config = g3_config::Config::default();
+        
+        // If base is already small, it should be preserved (but not below minimum)
+        let capped = calculate_capped_summary_tokens(&config, "anthropic", 2000);
+        assert_eq!(capped, 2000, "Small base tokens should be preserved");
+    }
+
+    /// Test minimum floor is enforced
+    #[test]
+    fn test_minimum_floor_enforced() {
+        let config = g3_config::Config::default();
+        
+        // Even with very small base, minimum should be enforced
+        let capped = calculate_capped_summary_tokens(&config, "anthropic", 100);
+        assert_eq!(capped, SUMMARY_MIN_TOKENS, "Minimum floor should be enforced");
+    }
+}
+
+// =============================================================================
+// Test: Thinking mode disable logic
+// =============================================================================
+
+mod thinking_mode_disable {
+    use super::*;
+
+    /// Test that thinking mode is not disabled when no thinking config exists
+    #[test]
+    fn test_no_thinking_config_no_disable() {
+        let config = g3_config::Config::default();
+        
+        // Without thinking config, should never disable
+        let should_disable = should_disable_thinking(&config, "anthropic", 5000);
+        assert!(!should_disable, "Should not disable thinking when no config exists");
+    }
+
+    /// Test that non-Anthropic providers don't trigger thinking disable
+    #[test]
+    fn test_non_anthropic_no_thinking_disable() {
+        let config = g3_config::Config::default();
+        
+        // Non-Anthropic providers don't have thinking mode
+        let should_disable = should_disable_thinking(&config, "databricks", 1000);
+        assert!(!should_disable, "Non-Anthropic providers should not disable thinking");
+    }
+}
+
+// =============================================================================
+// Test: Summary message building
+// =============================================================================
+
+mod summary_message_building {
+    use super::*;
+
+    /// Test that summary messages are built correctly from conversation
+    #[test]
+    fn test_build_summary_messages_basic() {
+        let mut context = ContextWindow::new(10000);
+        
+        // Add a simple conversation
+        context.add_message(Message::new(
+            MessageRole::System,
+            "You are a helpful assistant.".to_string(),
+        ));
+        context.add_message(Message::new(
+            MessageRole::User,
+            "Hello, how are you?".to_string(),
+        ));
+        context.add_message(Message::new(
+            MessageRole::Assistant,
+            "I'm doing well, thank you!".to_string(),
+        ));
+        
+        let messages = build_summary_messages(&context);
+        
+        // Should have exactly 2 messages: system prompt and user request
+        assert_eq!(messages.len(), 2, "Should have system and user messages");
+        
+        // First should be system message for summarization
+        assert!(matches!(messages[0].role, MessageRole::System));
+        assert!(messages[0].content.contains("concise summaries"));
+        
+        // Second should be user message with conversation
+        assert!(matches!(messages[1].role, MessageRole::User));
+        assert!(messages[1].content.contains("Hello, how are you?"));
+        assert!(messages[1].content.contains("I'm doing well"));
+    }
+
+    /// Test that empty conversation produces valid summary request
+    #[test]
+    fn test_build_summary_messages_empty_conversation() {
+        let context = ContextWindow::new(10000);
+        
+        let messages = build_summary_messages(&context);
+        
+        // Should still produce valid structure
+        assert_eq!(messages.len(), 2);
+        assert!(matches!(messages[0].role, MessageRole::System));
+        assert!(matches!(messages[1].role, MessageRole::User));
+    }
+
+    /// Test that long conversations are included in summary request
+    #[test]
+    fn test_build_summary_messages_long_conversation() {
+        let mut context = ContextWindow::new(100000);
+        
+        // Add many messages
+        for i in 0..50 {
+            context.add_message(Message::new(
+                MessageRole::User,
+                format!("User message number {}", i),
+            ));
+            context.add_message(Message::new(
+                MessageRole::Assistant,
+                format!("Assistant response number {}", i),
+            ));
+        }
+        
+        let messages = build_summary_messages(&context);
+        
+        // Should include all conversation content
+        let user_content = &messages[1].content;
+        assert!(user_content.contains("User message number 0"));
+        assert!(user_content.contains("User message number 49"));
+        assert!(user_content.contains("Assistant response number 49"));
+    }
+}
+
+// =============================================================================
+// Test: CompactionResult behavior
+// =============================================================================
+
+mod compaction_result {
+    use super::*;
+
+    /// Test success result creation
+    #[test]
+    fn test_success_result() {
+        let result = CompactionResult::success(5000);
+        
+        assert!(result.success);
+        assert_eq!(result.chars_saved, 5000);
+        assert!(result.error.is_none());
+    }
+
+    /// Test failure result creation
+    #[test]
+    fn test_failure_result() {
+        let result = CompactionResult::failure("API error".to_string());
+        
+        assert!(!result.success);
+        assert_eq!(result.chars_saved, 0);
+        assert_eq!(result.error, Some("API error".to_string()));
+    }
+
+    /// Test zero chars saved is valid success
+    #[test]
+    fn test_zero_chars_saved_success() {
+        let result = CompactionResult::success(0);
+        
+        assert!(result.success);
+        assert_eq!(result.chars_saved, 0);
+    }
+}
diff --git a/crates/g3-core/tests/error_classification_test.rs b/crates/g3-core/tests/error_classification_test.rs
new file mode 100644
index 0000000..3944ae1
--- /dev/null
+++ b/crates/g3-core/tests/error_classification_test.rs
@@ -0,0 +1,356 @@
+//! Error Classification Integration Tests
+//!
+//! CHARACTERIZATION: These tests verify the observable behavior of error
+//! classification through stable public interfaces.
+//!
+//! What these tests protect:
+//! - Error messages are correctly classified as recoverable/non-recoverable
+//! - Specific error types (rate limit, timeout, server error) are detected
+//! - Retry delay calculation produces reasonable values
+//!
+//! What these tests intentionally do NOT assert:
+//! - Exact delay values (only ranges and relative behavior)
+//! - Internal classification implementation details
+
+use g3_core::error_handling::{
+    classify_error, calculate_retry_delay, ErrorType, RecoverableError,
+};
+
+// =============================================================================
+// Test: Error classification for recoverable errors
+// =============================================================================
+
+mod recoverable_error_classification {
+    use super::*;
+
+    /// Test rate limit errors are classified as recoverable
+    #[test]
+    fn test_rate_limit_detected() {
+        let error = anyhow::anyhow!("Rate limit exceeded");
+        let error_type = classify_error(&error);
+        
+        assert!(
+            matches!(error_type, ErrorType::Recoverable(RecoverableError::RateLimit)),
+            "Rate limit should be recoverable: {:?}", error_type
+        );
+    }
+
+    /// Test 429 status code is classified as rate limit
+    #[test]
+    fn test_429_status_detected() {
+        let error = anyhow::anyhow!("HTTP 429 Too Many Requests");
+        let error_type = classify_error(&error);
+        
+        assert!(
+            matches!(error_type, ErrorType::Recoverable(RecoverableError::RateLimit)),
+            "429 should be rate limit: {:?}", error_type
+        );
+    }
+
+    /// Test timeout errors are classified as recoverable
+    #[test]
+    fn test_timeout_detected() {
+        let error = anyhow::anyhow!("Request timed out");
+        let error_type = classify_error(&error);
+        
+        assert!(
+            matches!(error_type, ErrorType::Recoverable(RecoverableError::Timeout)),
+            "Timeout should be recoverable: {:?}", error_type
+        );
+    }
+
+    /// Test server errors (5xx) are classified as recoverable
+    #[test]
+    fn test_server_error_500_detected() {
+        let error = anyhow::anyhow!("Server error 500");
+        let error_type = classify_error(&error);
+        
+        assert!(
+            matches!(error_type, ErrorType::Recoverable(RecoverableError::ServerError)),
+            "500 should be server error: {:?}", error_type
+        );
+    }
+
+    /// Test 502 Bad Gateway is classified as server error
+    #[test]
+    fn test_server_error_502_detected() {
+        let error = anyhow::anyhow!("502 Bad Gateway");
+        let error_type = classify_error(&error);
+        
+        assert!(
+            matches!(error_type, ErrorType::Recoverable(RecoverableError::ServerError)),
+            "502 should be server error: {:?}", error_type
+        );
+    }
+
+    /// Test 503 Service Unavailable is classified as server error
+    #[test]
+    fn test_server_error_503_detected() {
+        let error = anyhow::anyhow!("503 Service Unavailable");
+        let error_type = classify_error(&error);
+        
+        assert!(
+            matches!(error_type, ErrorType::Recoverable(RecoverableError::ServerError)),
+            "503 should be server error: {:?}", error_type
+        );
+    }
+
+    /// Test network errors are classified as recoverable
+    #[test]
+    fn test_network_error_detected() {
+        let error = anyhow::anyhow!("Connection refused");
+        let error_type = classify_error(&error);
+        
+        assert!(
+            matches!(error_type, ErrorType::Recoverable(RecoverableError::NetworkError)),
+            "Connection refused should be network error: {:?}", error_type
+        );
+    }
+
+    /// Test connection reset is classified as network error
+    #[test]
+    fn test_connection_reset_detected() {
+        let error = anyhow::anyhow!("Connection reset by peer");
+        let error_type = classify_error(&error);
+        
+        assert!(
+            matches!(error_type, ErrorType::Recoverable(RecoverableError::NetworkError)),
+            "Connection reset should be network error: {:?}", error_type
+        );
+    }
+
+    /// Test "overloaded" is classified as busy
+    #[test]
+    fn test_model_busy_detected() {
+        let error = anyhow::anyhow!("Server is overloaded");
+        let error_type = classify_error(&error);
+        
+        assert!(
+            matches!(error_type, ErrorType::Recoverable(RecoverableError::ModelBusy)),
+            "Overloaded should be model busy: {:?}", error_type
+        );
+    }
+
+    /// Test context length exceeded requires 400 status code
+    /// CHARACTERIZATION: The error must contain "400" or "bad request" along with
+    /// context length keywords to be classified as ContextLengthExceeded
+    #[test]
+    fn test_context_length_exceeded_detected() {
+        let error = anyhow::anyhow!("400 Bad Request: context_length_exceeded: too many tokens");
+        let error_type = classify_error(&error);
+        
+        assert!(
+            matches!(error_type, ErrorType::Recoverable(RecoverableError::ContextLengthExceeded)),
+            "Context length exceeded should be detected: {:?}", error_type
+        );
+    }
+
+    /// Test token limit exceeded is classified correctly
+    /// CHARACTERIZATION: Must contain "token" AND ("limit" OR "exceeded")
+    #[test]
+    fn test_token_limit_detected() {
+        let error = anyhow::anyhow!("token limit exceeded");
+        let error_type = classify_error(&error);
+        
+        assert!(
+            matches!(error_type, ErrorType::Recoverable(RecoverableError::TokenLimit)),
+            "Token limit should be detected: {:?}", error_type
+        );
+    }
+}
+
+// =============================================================================
+// Test: Error classification for non-recoverable errors
+// =============================================================================
+
+mod non_recoverable_error_classification {
+    use super::*;
+
+    /// Test invalid API key is non-recoverable
+    #[test]
+    fn test_invalid_api_key_non_recoverable() {
+        let error = anyhow::anyhow!("Invalid API key");
+        let error_type = classify_error(&error);
+        
+        assert!(
+            matches!(error_type, ErrorType::NonRecoverable),
+            "Invalid API key should be non-recoverable: {:?}", error_type
+        );
+    }
+
+    /// Test authentication failure is non-recoverable
+    #[test]
+    fn test_auth_failure_non_recoverable() {
+        let error = anyhow::anyhow!("Authentication failed");
+        let error_type = classify_error(&error);
+        
+        assert!(
+            matches!(error_type, ErrorType::NonRecoverable),
+            "Auth failure should be non-recoverable: {:?}", error_type
+        );
+    }
+
+    /// Test generic errors are non-recoverable
+    #[test]
+    fn test_generic_error_non_recoverable() {
+        let error = anyhow::anyhow!("Something went wrong");
+        let error_type = classify_error(&error);
+        
+        assert!(
+            matches!(error_type, ErrorType::NonRecoverable),
+            "Generic error should be non-recoverable: {:?}", error_type
+        );
+    }
+
+    /// Test 401 Unauthorized is non-recoverable
+    #[test]
+    fn test_401_non_recoverable() {
+        let error = anyhow::anyhow!("401 Unauthorized");
+        let error_type = classify_error(&error);
+        
+        assert!(
+            matches!(error_type, ErrorType::NonRecoverable),
+            "401 should be non-recoverable: {:?}", error_type
+        );
+    }
+
+    /// Test 403 Forbidden is non-recoverable
+    #[test]
+    fn test_403_non_recoverable() {
+        let error = anyhow::anyhow!("403 Forbidden");
+        let error_type = classify_error(&error);
+        
+        assert!(
+            matches!(error_type, ErrorType::NonRecoverable),
+            "403 should be non-recoverable: {:?}", error_type
+        );
+    }
+}
+
+// =============================================================================
+// Test: Retry delay calculation
+// =============================================================================
+
+mod retry_delay_calculation {
+    use super::*;
+    use std::time::Duration;
+
+    /// Test first retry has reasonable delay
+    #[test]
+    fn test_first_retry_delay() {
+        let delay = calculate_retry_delay(1, false);
+        
+        // First retry should be around 1-2 seconds (with jitter)
+        assert!(delay >= Duration::from_millis(500), "Delay should be at least 500ms: {:?}", delay);
+        assert!(delay <= Duration::from_secs(5), "Delay should be at most 5s: {:?}", delay);
+    }
+
+    /// Test delays increase with retry count
+    #[test]
+    fn test_delays_increase() {
+        let delay1 = calculate_retry_delay(1, false);
+        let delay2 = calculate_retry_delay(2, false);
+        let delay3 = calculate_retry_delay(3, false);
+        
+        // Later retries should generally have longer delays
+        // (accounting for jitter, we check the trend)
+        assert!(delay2 >= delay1 || delay3 >= delay2, 
+            "Delays should generally increase: {:?} -> {:?} -> {:?}", delay1, delay2, delay3);
+    }
+
+    /// Test autonomous mode has different delays
+    #[test]
+    fn test_autonomous_mode_delays() {
+        let default_delay = calculate_retry_delay(3, false);
+        let autonomous_delay = calculate_retry_delay(3, true);
+        
+        // Autonomous mode should have longer delays (spread over 10 minutes)
+        // But with jitter, we just check they're both reasonable
+        assert!(default_delay <= Duration::from_secs(30), 
+            "Default delay should be reasonable: {:?}", default_delay);
+        assert!(autonomous_delay <= Duration::from_secs(180), 
+            "Autonomous delay should be reasonable: {:?}", autonomous_delay);
+    }
+
+    /// Test delays are capped at maximum
+    #[test]
+    fn test_delay_cap() {
+        // Even with high retry count, delay should be capped
+        let delay = calculate_retry_delay(10, false);
+        
+        assert!(delay <= Duration::from_secs(15), 
+            "Default mode delay should be capped: {:?}", delay);
+    }
+
+    /// Test autonomous mode delay cap
+    /// CHARACTERIZATION: Autonomous mode uses longer delays spread over 10 minutes
+    #[test]
+    fn test_autonomous_delay_cap() {
+        let delay = calculate_retry_delay(10, true);
+        
+        // Autonomous mode has longer delays (up to ~200s + jitter)
+        assert!(delay <= Duration::from_secs(300), 
+            "Autonomous delay should be capped: {:?}", delay);
+    }
+}
+
+// =============================================================================
+// Test: Edge cases and priority
+// =============================================================================
+
+mod edge_cases {
+    use super::*;
+
+    /// Test error with multiple keywords uses correct priority
+    #[test]
+    fn test_rate_limit_priority_over_timeout() {
+        // Rate limit should take priority
+        let error = anyhow::anyhow!("Rate limit exceeded after timeout");
+        let error_type = classify_error(&error);
+        
+        assert!(
+            matches!(error_type, ErrorType::Recoverable(RecoverableError::RateLimit)),
+            "Rate limit should take priority: {:?}", error_type
+        );
+    }
+
+    /// Test case insensitivity
+    #[test]
+    fn test_case_insensitive_detection() {
+        let error = anyhow::anyhow!("RATE LIMIT EXCEEDED");
+        let error_type = classify_error(&error);
+        
+        assert!(
+            matches!(error_type, ErrorType::Recoverable(RecoverableError::RateLimit)),
+            "Should detect uppercase: {:?}", error_type
+        );
+    }
+
+    /// Test empty error message
+    #[test]
+    fn test_empty_error_message() {
+        let error = anyhow::anyhow!("");
+        let error_type = classify_error(&error);
+        
+        // Empty message should be non-recoverable
+        assert!(
+            matches!(error_type, ErrorType::NonRecoverable),
+            "Empty error should be non-recoverable: {:?}", error_type
+        );
+    }
+
+    /// Test connection timeout is network error (not timeout)
+    /// Note: This documents the current behavior where "connection" keyword
+    /// takes priority over "timeout"
+    #[test]
+    fn test_connection_timeout_classification() {
+        let error = anyhow::anyhow!("Connection timeout");
+        let error_type = classify_error(&error);
+        
+        // Per memory: "Connection timeout" classifies as NetworkError due to "connection" keyword priority
+        assert!(
+            matches!(error_type, ErrorType::Recoverable(RecoverableError::NetworkError)),
+            "Connection timeout should be network error (per priority): {:?}", error_type
+        );
+    }
+}
diff --git a/crates/g3-core/tests/retry_behavior_test.rs b/crates/g3-core/tests/retry_behavior_test.rs
new file mode 100644
index 0000000..f0b1d6f
--- /dev/null
+++ b/crates/g3-core/tests/retry_behavior_test.rs
@@ -0,0 +1,300 @@
+//! Retry Behavior Integration Tests
+//!
+//! CHARACTERIZATION: These tests verify the observable behavior of retry
+//! infrastructure through stable public interfaces.
+//!
+//! What these tests protect:
+//! - RetryConfig construction and presets
+//! - RetryResult state transitions
+//! - retry_operation behavior with simulated errors
+//!
+//! What these tests intentionally do NOT assert:
+//! - Internal timing details (only that delays occur)
+//! - Specific backoff calculations (only that they increase)
+//! - Agent internals (tested via execute_with_retry separately)
+
+use g3_core::retry::{RetryConfig, RetryResult, retry_operation};
+use g3_core::ContextWindow;
+use g3_core::TaskResult;
+use std::sync::atomic::{AtomicU32, Ordering};
+use std::sync::Arc;
+
+// =============================================================================
+// Test: RetryConfig presets and customization
+// =============================================================================
+
+mod retry_config_presets {
+    use super::*;
+
+    /// Test default config values
+    #[test]
+    fn test_default_config() {
+        let config = RetryConfig::default();
+        
+        assert_eq!(config.max_retries, 3);
+        assert!(!config.is_autonomous);
+        assert_eq!(config.role_name, "agent");
+    }
+
+    /// Test player preset
+    #[test]
+    fn test_player_preset() {
+        let config = RetryConfig::player();
+        
+        assert_eq!(config.max_retries, 3);
+        assert!(config.is_autonomous, "Player should be autonomous");
+        assert_eq!(config.role_name, "player");
+    }
+
+    /// Test coach preset
+    #[test]
+    fn test_coach_preset() {
+        let config = RetryConfig::coach();
+        
+        assert_eq!(config.max_retries, 3);
+        assert!(config.is_autonomous, "Coach should be autonomous");
+        assert_eq!(config.role_name, "coach");
+    }
+
+    /// Test planning preset with custom role
+    #[test]
+    fn test_planning_preset() {
+        let config = RetryConfig::planning("reviewer");
+        
+        assert_eq!(config.max_retries, 3);
+        assert!(config.is_autonomous, "Planning should be autonomous");
+        assert_eq!(config.role_name, "reviewer");
+    }
+
+    /// Test custom max retries
+    #[test]
+    fn test_custom_max_retries() {
+        let config = RetryConfig::player().with_max_retries(10);
+        
+        assert_eq!(config.max_retries, 10);
+        // Other fields should be preserved
+        assert!(config.is_autonomous);
+        assert_eq!(config.role_name, "player");
+    }
+
+    /// Test chaining customizations
+    #[test]
+    fn test_chained_customization() {
+        let config = RetryConfig::default()
+            .with_max_retries(5);
+        
+        assert_eq!(config.max_retries, 5);
+        assert!(!config.is_autonomous); // Default is not autonomous
+    }
+}
+
+// =============================================================================
+// Test: RetryResult state handling
+// =============================================================================
+
+mod retry_result_states {
+    use super::*;
+
+    /// Test success result
+    #[test]
+    fn test_success_is_success() {
+        let ctx = ContextWindow::new(1000);
+        let result = RetryResult::Success(TaskResult::new("done".to_string(), ctx));
+        
+        assert!(result.is_success());
+    }
+
+    /// Test max retries reached is not success
+    #[test]
+    fn test_max_retries_not_success() {
+        let result = RetryResult::MaxRetriesReached("timeout".to_string());
+        
+        assert!(!result.is_success());
+    }
+
+    /// Test context length exceeded is not success
+    #[test]
+    fn test_context_exceeded_not_success() {
+        let result = RetryResult::ContextLengthExceeded("too long".to_string());
+        
+        assert!(!result.is_success());
+    }
+
+    /// Test panic is not success
+    #[test]
+    fn test_panic_not_success() {
+        let result = RetryResult::Panic(anyhow::anyhow!("panic occurred"));
+        
+        assert!(!result.is_success());
+    }
+
+    /// Test into_result extracts TaskResult on success
+    #[test]
+    fn test_into_result_success() {
+        let ctx = ContextWindow::new(1000);
+        let result = RetryResult::Success(TaskResult::new("done".to_string(), ctx));
+        
+        let task_result = result.into_result();
+        assert!(task_result.is_some());
+        assert_eq!(task_result.unwrap().response, "done");
+    }
+
+    /// Test into_result returns None on failure
+    #[test]
+    fn test_into_result_failure() {
+        let result = RetryResult::MaxRetriesReached("error".to_string());
+        
+        let task_result = result.into_result();
+        assert!(task_result.is_none());
+    }
+}
+
+// =============================================================================
+// Test: retry_operation behavior
+// =============================================================================
+
+mod retry_operation_behavior {
+    use super::*;
+
+    /// Test successful operation on first try
+    #[tokio::test]
+    async fn test_success_first_try() {
+        let call_count = Arc::new(AtomicU32::new(0));
+        let call_count_clone = call_count.clone();
+        
+        let result = retry_operation(
+            "test_op",
+            || {
+                let count = call_count_clone.clone();
+                async move {
+                    count.fetch_add(1, Ordering::SeqCst);
+                    Ok::<_, anyhow::Error>("success")
+                }
+            },
+            3,
+            false,
+            |_msg| {},
+        ).await;
+        
+        assert!(result.is_ok());
+        assert_eq!(result.unwrap(), "success");
+        assert_eq!(call_count.load(Ordering::SeqCst), 1, "Should only call once on success");
+    }
+
+    /// Test non-recoverable error fails immediately
+    #[tokio::test]
+    async fn test_non_recoverable_fails_immediately() {
+        let call_count = Arc::new(AtomicU32::new(0));
+        let call_count_clone = call_count.clone();
+        
+        let result = retry_operation(
+            "test_op",
+            || {
+                let count = call_count_clone.clone();
+                async move {
+                    count.fetch_add(1, Ordering::SeqCst);
+                    Err::<String, _>(anyhow::anyhow!("Invalid API key"))
+                }
+            },
+            3,
+            false,
+            |_msg| {},
+        ).await;
+        
+        assert!(result.is_err());
+        assert_eq!(call_count.load(Ordering::SeqCst), 1, "Non-recoverable should not retry");
+    }
+
+    /// Test recoverable error retries up to max
+    #[tokio::test]
+    async fn test_recoverable_retries_to_max() {
+        let call_count = Arc::new(AtomicU32::new(0));
+        let call_count_clone = call_count.clone();
+        
+        let result = retry_operation(
+            "test_op",
+            || {
+                let count = call_count_clone.clone();
+                async move {
+                    count.fetch_add(1, Ordering::SeqCst);
+                    // Rate limit is a recoverable error
+                    Err::<String, _>(anyhow::anyhow!("Rate limit exceeded"))
+                }
+            },
+            3, // max retries
+            false,
+            |_msg| {},
+        ).await;
+        
+        assert!(result.is_err());
+        // Should try initial + max_retries times
+        assert_eq!(call_count.load(Ordering::SeqCst), 3, "Should retry up to max");
+    }
+
+    /// Test recoverable error succeeds on retry
+    #[tokio::test]
+    async fn test_recoverable_succeeds_on_retry() {
+        let call_count = Arc::new(AtomicU32::new(0));
+        let call_count_clone = call_count.clone();
+        
+        let result = retry_operation(
+            "test_op",
+            || {
+                let count = call_count_clone.clone();
+                async move {
+                    let current = count.fetch_add(1, Ordering::SeqCst);
+                    if current < 2 {
+                        // Fail first two times with recoverable error
+                        Err(anyhow::anyhow!("Server error 500"))
+                    } else {
+                        // Succeed on third try
+                        Ok("success after retry")
+                    }
+                }
+            },
+            5, // max retries
+            false,
+            |_msg| {},
+        ).await;
+        
+        assert!(result.is_ok());
+        assert_eq!(result.unwrap(), "success after retry");
+        assert_eq!(call_count.load(Ordering::SeqCst), 3, "Should succeed on third try");
+    }
+
+    /// Test print function is called on retry
+    #[tokio::test]
+    async fn test_print_fn_called_on_retry() {
+        let messages = Arc::new(std::sync::Mutex::new(Vec::new()));
+        let messages_clone = messages.clone();
+        
+        let call_count = Arc::new(AtomicU32::new(0));
+        let call_count_clone = call_count.clone();
+        
+        let _ = retry_operation(
+            "test_op",
+            || {
+                let count = call_count_clone.clone();
+                async move {
+                    let current = count.fetch_add(1, Ordering::SeqCst);
+                    if current < 1 {
+                        Err(anyhow::anyhow!("Rate limit exceeded"))
+                    } else {
+                        Ok("success")
+                    }
+                }
+            },
+            3,
+            false,
+            |msg| {
+                messages_clone.lock().unwrap().push(msg.to_string());
+            },
+        ).await;
+        
+        let msgs = messages.lock().unwrap();
+        assert!(!msgs.is_empty(), "Should have printed retry messages");
+        // Should mention the error type
+        assert!(msgs.iter().any(|m| m.contains("RateLimit") || m.contains("rate")), 
+            "Should mention rate limit in messages: {:?}", msgs);
+    }
+}
diff --git a/crates/g3-core/tests/tool_execution_roundtrip_test.rs b/crates/g3-core/tests/tool_execution_roundtrip_test.rs
new file mode 100644
index 0000000..c4cf30a
--- /dev/null
+++ b/crates/g3-core/tests/tool_execution_roundtrip_test.rs
@@ -0,0 +1,466 @@
+//! Tool Execution Round-Trip Integration Tests
+//!
+//! CHARACTERIZATION: These tests verify that tools execute correctly through
+//! the Agent interface, testing the full round-trip from tool call to result.
+//!
+//! What these tests protect:
+//! - File operations (read, write, str_replace) work end-to-end
+//! - Shell command execution produces expected output
+//! - TODO operations persist correctly
+//! - Error handling for invalid inputs
+//!
+//! What these tests intentionally do NOT assert:
+//! - Internal implementation details of tools
+//! - Specific formatting of success messages (only key content)
+//! - UI writer behavior (uses NullUiWriter)
+
+use g3_core::ui_writer::NullUiWriter;
+use g3_core::{Agent, ToolCall};
+use serial_test::serial;
+use std::fs;
+use tempfile::TempDir;
+
+// =============================================================================
+// Test Helpers
+// =============================================================================
+
+/// Create a test agent in a temporary directory
+async fn create_test_agent(temp_dir: &TempDir) -> Agent<NullUiWriter> {
+    std::env::set_current_dir(temp_dir.path()).unwrap();
+    let config = g3_config::Config::default();
+    let ui_writer = NullUiWriter;
+    Agent::new(config, ui_writer).await.unwrap()
+}
+
+/// Create a ToolCall with the given tool name and arguments
+fn make_tool_call(tool: &str, args: serde_json::Value) -> ToolCall {
+    ToolCall {
+        tool: tool.to_string(),
+        args,
+    }
+}
+
+// =============================================================================
+// Test: read_file tool execution
+// =============================================================================
+
+mod read_file_execution {
+    use super::*;
+
+    /// Test reading an existing file
+    #[tokio::test]
+    #[serial]
+    async fn test_read_existing_file() {
+        let temp_dir = TempDir::new().unwrap();
+        let test_file = temp_dir.path().join("test.txt");
+        fs::write(&test_file, "Hello, World!\nLine 2\nLine 3").unwrap();
+        
+        let mut agent = create_test_agent(&temp_dir).await;
+        
+        let tool_call = make_tool_call(
+            "read_file",
+            serde_json::json!({ "file_path": test_file.to_string_lossy() }),
+        );
+        
+        let result = agent.execute_tool(&tool_call).await.unwrap();
+        
+        assert!(result.contains("Hello, World!"), "Should contain file content: {}", result);
+        assert!(result.contains("Line 2"), "Should contain all lines: {}", result);
+    }
+
+    /// Test reading a non-existent file returns error
+    #[tokio::test]
+    #[serial]
+    async fn test_read_nonexistent_file() {
+        let temp_dir = TempDir::new().unwrap();
+        let mut agent = create_test_agent(&temp_dir).await;
+        
+        let tool_call = make_tool_call(
+            "read_file",
+            serde_json::json!({ "file_path": "/nonexistent/path/file.txt" }),
+        );
+        
+        let result = agent.execute_tool(&tool_call).await;
+        
+        // Should return an error or error message
+        assert!(
+            result.is_err() || result.as_ref().unwrap().contains("error") || result.as_ref().unwrap().contains("not found") || result.as_ref().unwrap().contains("No such file"),
+            "Should indicate file not found: {:?}", result
+        );
+    }
+
+    /// Test reading with character range
+    #[tokio::test]
+    #[serial]
+    async fn test_read_file_with_range() {
+        let temp_dir = TempDir::new().unwrap();
+        let test_file = temp_dir.path().join("test.txt");
+        fs::write(&test_file, "0123456789ABCDEF").unwrap();
+        
+        let mut agent = create_test_agent(&temp_dir).await;
+        
+        let tool_call = make_tool_call(
+            "read_file",
+            serde_json::json!({
+                "file_path": test_file.to_string_lossy(),
+                "start": 5,
+                "end": 10
+            }),
+        );
+        
+        let result = agent.execute_tool(&tool_call).await.unwrap();
+        
+        // Should contain the substring from position 5 to 10
+        assert!(result.contains("56789"), "Should contain range content: {}", result);
+    }
+}
+
+// =============================================================================
+// Test: write_file tool execution
+// =============================================================================
+
+mod write_file_execution {
+    use super::*;
+
+    /// Test writing a new file
+    #[tokio::test]
+    #[serial]
+    async fn test_write_new_file() {
+        let temp_dir = TempDir::new().unwrap();
+        let new_file = temp_dir.path().join("new_file.txt");
+        
+        assert!(!new_file.exists(), "File should not exist initially");
+        
+        let mut agent = create_test_agent(&temp_dir).await;
+        
+        let tool_call = make_tool_call(
+            "write_file",
+            serde_json::json!({
+                "file_path": new_file.to_string_lossy(),
+                "content": "New content here"
+            }),
+        );
+        
+        let result = agent.execute_tool(&tool_call).await.unwrap();
+        
+        // Should report success
+        assert!(result.contains("✅") || result.to_lowercase().contains("success") || result.to_lowercase().contains("wrote"),
+            "Should report success: {}", result);
+        
+        // File should now exist with correct content
+        assert!(new_file.exists(), "File should exist after write");
+        let content = fs::read_to_string(&new_file).unwrap();
+        assert_eq!(content, "New content here");
+    }
+
+    /// Test overwriting an existing file
+    #[tokio::test]
+    #[serial]
+    async fn test_overwrite_existing_file() {
+        let temp_dir = TempDir::new().unwrap();
+        let test_file = temp_dir.path().join("existing.txt");
+        fs::write(&test_file, "Original content").unwrap();
+        
+        let mut agent = create_test_agent(&temp_dir).await;
+        
+        let tool_call = make_tool_call(
+            "write_file",
+            serde_json::json!({
+                "file_path": test_file.to_string_lossy(),
+                "content": "Replaced content"
+            }),
+        );
+        
+        let result = agent.execute_tool(&tool_call).await.unwrap();
+        
+        assert!(result.contains("✅") || result.to_lowercase().contains("success") || result.to_lowercase().contains("wrote"),
+            "Should report success: {}", result);
+        
+        let content = fs::read_to_string(&test_file).unwrap();
+        assert_eq!(content, "Replaced content");
+    }
+
+    /// Test writing creates parent directories
+    #[tokio::test]
+    #[serial]
+    async fn test_write_creates_parent_dirs() {
+        let temp_dir = TempDir::new().unwrap();
+        let nested_file = temp_dir.path().join("a/b/c/nested.txt");
+        
+        let mut agent = create_test_agent(&temp_dir).await;
+        
+        let tool_call = make_tool_call(
+            "write_file",
+            serde_json::json!({
+                "file_path": nested_file.to_string_lossy(),
+                "content": "Nested content"
+            }),
+        );
+        
+        let result = agent.execute_tool(&tool_call).await.unwrap();
+        
+        assert!(result.contains("✅") || result.to_lowercase().contains("success") || result.to_lowercase().contains("wrote"),
+            "Should report success: {}", result);
+        
+        assert!(nested_file.exists(), "Nested file should exist");
+        let content = fs::read_to_string(&nested_file).unwrap();
+        assert_eq!(content, "Nested content");
+    }
+}
+
+// =============================================================================
+// Test: shell tool execution
+// =============================================================================
+
+mod shell_execution {
+    use super::*;
+
+    /// Test simple echo command
+    #[tokio::test]
+    #[serial]
+    async fn test_shell_echo() {
+        let temp_dir = TempDir::new().unwrap();
+        let mut agent = create_test_agent(&temp_dir).await;
+        
+        let tool_call = make_tool_call(
+            "shell",
+            serde_json::json!({ "command": "echo 'hello world'" }),
+        );
+        
+        let result = agent.execute_tool(&tool_call).await.unwrap();
+        
+        assert!(result.contains("hello world"), "Should contain echo output: {}", result);
+    }
+
+    /// Test command that produces multi-line output
+    #[tokio::test]
+    #[serial]
+    async fn test_shell_multiline_output() {
+        let temp_dir = TempDir::new().unwrap();
+        let mut agent = create_test_agent(&temp_dir).await;
+        
+        let tool_call = make_tool_call(
+            "shell",
+            serde_json::json!({ "command": "echo 'line1'; echo 'line2'; echo 'line3'" }),
+        );
+        
+        let result = agent.execute_tool(&tool_call).await.unwrap();
+        
+        assert!(result.contains("line1"), "Should contain line1: {}", result);
+        assert!(result.contains("line2"), "Should contain line2: {}", result);
+        assert!(result.contains("line3"), "Should contain line3: {}", result);
+    }
+
+    /// Test command that fails
+    #[tokio::test]
+    #[serial]
+    async fn test_shell_failing_command() {
+        let temp_dir = TempDir::new().unwrap();
+        let mut agent = create_test_agent(&temp_dir).await;
+        
+        let tool_call = make_tool_call(
+            "shell",
+            serde_json::json!({ "command": "exit 1" }),
+        );
+        
+        let result = agent.execute_tool(&tool_call).await;
+        
+        // Should indicate failure (either error or non-zero exit)
+        assert!(
+            result.is_err() || result.as_ref().unwrap().contains("exit") || result.as_ref().unwrap().contains("failed") || result.as_ref().unwrap().contains("error"),
+            "Should indicate command failure: {:?}", result
+        );
+    }
+
+    /// Test command with working directory context
+    #[tokio::test]
+    #[serial]
+    async fn test_shell_pwd() {
+        let temp_dir = TempDir::new().unwrap();
+        let mut agent = create_test_agent(&temp_dir).await;
+        
+        let tool_call = make_tool_call(
+            "shell",
+            serde_json::json!({ "command": "pwd" }),
+        );
+        
+        let result = agent.execute_tool(&tool_call).await.unwrap();
+        
+        // Should show the temp directory path
+        let temp_path = temp_dir.path().to_string_lossy();
+        assert!(result.contains(&*temp_path) || result.contains("private"), 
+            "Should show current directory: {} (expected to contain {})", result, temp_path);
+    }
+}
+
+// =============================================================================
+// Test: str_replace tool execution
+// =============================================================================
+
+mod str_replace_execution {
+    use super::*;
+
+    /// Test applying a simple diff
+    #[tokio::test]
+    #[serial]
+    async fn test_str_replace_simple() {
+        let temp_dir = TempDir::new().unwrap();
+        let test_file = temp_dir.path().join("test.txt");
+        fs::write(&test_file, "line 1\nold line\nline 3\n").unwrap();
+        
+        let mut agent = create_test_agent(&temp_dir).await;
+        
+        let diff = "@@ -1,3 +1,3 @@\n line 1\n-old line\n+new line\n line 3\n";
+        
+        let tool_call = make_tool_call(
+            "str_replace",
+            serde_json::json!({
+                "file_path": test_file.to_string_lossy(),
+                "diff": diff
+            }),
+        );
+        
+        let result = agent.execute_tool(&tool_call).await.unwrap();
+        
+        assert!(result.contains("✅") || result.to_lowercase().contains("applied") || result.to_lowercase().contains("success"),
+            "Should report success: {}", result);
+        
+        let content = fs::read_to_string(&test_file).unwrap();
+        assert!(content.contains("new line"), "Should contain new content: {}", content);
+        assert!(!content.contains("old line"), "Should not contain old content: {}", content);
+    }
+
+    /// Test diff that adds lines
+    #[tokio::test]
+    #[serial]
+    async fn test_str_replace_add_lines() {
+        let temp_dir = TempDir::new().unwrap();
+        let test_file = temp_dir.path().join("test.txt");
+        fs::write(&test_file, "line 1\nline 3\n").unwrap();
+        
+        let mut agent = create_test_agent(&temp_dir).await;
+        
+        let diff = "@@ -1,2 +1,3 @@\n line 1\n+line 2\n line 3\n";
+        
+        let tool_call = make_tool_call(
+            "str_replace",
+            serde_json::json!({
+                "file_path": test_file.to_string_lossy(),
+                "diff": diff
+            }),
+        );
+        
+        let result = agent.execute_tool(&tool_call).await.unwrap();
+        
+        assert!(result.contains("✅") || result.to_lowercase().contains("applied"),
+            "Should report success: {}", result);
+        
+        let content = fs::read_to_string(&test_file).unwrap();
+        assert!(content.contains("line 2"), "Should contain added line: {}", content);
+    }
+
+    /// Test diff with pattern not found
+    #[tokio::test]
+    #[serial]
+    async fn test_str_replace_pattern_not_found() {
+        let temp_dir = TempDir::new().unwrap();
+        let test_file = temp_dir.path().join("test.txt");
+        fs::write(&test_file, "actual content\n").unwrap();
+        
+        let mut agent = create_test_agent(&temp_dir).await;
+        
+        let diff = "@@ -1,1 +1,1 @@\n-nonexistent pattern\n+replacement\n";
+        
+        let tool_call = make_tool_call(
+            "str_replace",
+            serde_json::json!({
+                "file_path": test_file.to_string_lossy(),
+                "diff": diff
+            }),
+        );
+        
+        let result = agent.execute_tool(&tool_call).await;
+        
+        // Should indicate pattern not found
+        assert!(
+            result.is_err() || result.as_ref().unwrap().to_lowercase().contains("not found") || result.as_ref().unwrap().to_lowercase().contains("pattern") || result.as_ref().unwrap().to_lowercase().contains("error"),
+            "Should indicate pattern not found: {:?}", result
+        );
+    }
+}
+
+// =============================================================================
+// Test: TODO tool execution
+// =============================================================================
+
+mod todo_execution {
+    use super::*;
+
+    /// Test writing and reading TODO
+    #[tokio::test]
+    #[serial]
+    async fn test_todo_write_and_read() {
+        let temp_dir = TempDir::new().unwrap();
+        let mut agent = create_test_agent(&temp_dir).await;
+        
+        // Write TODO
+        let write_call = make_tool_call(
+            "todo_write",
+            serde_json::json!({
+                "content": "- [ ] Task 1\n- [x] Task 2\n- [ ] Task 3"
+            }),
+        );
+        
+        let write_result = agent.execute_tool(&write_call).await.unwrap();
+        assert!(write_result.contains("✅") || write_result.to_lowercase().contains("success"),
+            "Write should succeed: {}", write_result);
+        
+        // Read TODO
+        let read_call = make_tool_call("todo_read", serde_json::json!({}));
+        let read_result = agent.execute_tool(&read_call).await.unwrap();
+        
+        assert!(read_result.contains("Task 1"), "Should contain Task 1: {}", read_result);
+        assert!(read_result.contains("Task 2"), "Should contain Task 2: {}", read_result);
+        assert!(read_result.contains("Task 3"), "Should contain Task 3: {}", read_result);
+    }
+
+    /// Test reading empty TODO
+    #[tokio::test]
+    #[serial]
+    async fn test_todo_read_empty() {
+        let temp_dir = TempDir::new().unwrap();
+        let mut agent = create_test_agent(&temp_dir).await;
+        
+        let read_call = make_tool_call("todo_read", serde_json::json!({}));
+        let result = agent.execute_tool(&read_call).await.unwrap();
+        
+        assert!(result.to_lowercase().contains("empty") || result.contains("no todo"),
+            "Should indicate empty: {}", result);
+    }
+
+    /// Test TODO persists to file
+    #[tokio::test]
+    #[serial]
+    async fn test_todo_persists_to_file() {
+        let temp_dir = TempDir::new().unwrap();
+        let todo_path = temp_dir.path().join("todo.g3.md");
+        
+        {
+            let mut agent = create_test_agent(&temp_dir).await;
+            
+            let write_call = make_tool_call(
+                "todo_write",
+                serde_json::json!({
+                    "content": "- [ ] Persistent task"
+                }),
+            );
+            
+            agent.execute_tool(&write_call).await.unwrap();
+        }
+        
+        // File should exist after agent is dropped
+        assert!(todo_path.exists(), "TODO file should persist");
+        let content = fs::read_to_string(&todo_path).unwrap();
+        assert!(content.contains("Persistent task"), "Content should persist: {}", content);
+    }
+}