Merge sessions/hopper/1156b5c9

2026-01-12 11:53:14 +05:30
parent 9e26d6bbf9 5dfabaf19a
commit 8d5dd9f84a
4 changed files with 1358 additions and 0 deletions
--- a/crates/g3-core/tests/compaction_behavior_test.rs
+++ b/crates/g3-core/tests/compaction_behavior_test.rs
@@ -0,0 +1,236 @@
 //! Compaction Behavior Integration Tests
 //!
 //! CHARACTERIZATION: These tests verify the observable behavior of context
 //! compaction through stable public interfaces.
 //!
 //! What these tests protect:
 //! - Compaction configuration calculation (token caps, thinking mode)
 //! - Summary message building from conversation history
 //! - Compaction result handling (success/failure)
 //!
 //! What these tests intentionally do NOT assert:
 //! - Internal implementation details of compaction
 //! - Specific LLM responses (mocked at provider boundary)
 //! - Exact token counts (only relative behavior)
 use g3_core::compaction::{
    calculate_capped_summary_tokens, should_disable_thinking, build_summary_messages,
    CompactionResult, SUMMARY_MIN_TOKENS,
 };
 use g3_core::ContextWindow;
 use g3_providers::{Message, MessageRole};
 // =============================================================================
 // Test: Token cap calculation for different providers
 // =============================================================================
 mod token_cap_calculation {
    use super::*;
    /// Anthropic: an oversized request must land between the minimum floor and 10000.
    #[test]
    fn test_anthropic_token_cap() {
        // 50000 is well above every limit asserted below, so capping has to happen.
        let summary_tokens =
            calculate_capped_summary_tokens(&g3_config::Config::default(), "anthropic", 50000);
        assert!(
            summary_tokens <= 10000,
            "Anthropic should cap at 10000 by default, got {}",
            summary_tokens
        );
        assert!(summary_tokens >= SUMMARY_MIN_TOKENS, "Should respect minimum floor");
    }
    /// Databricks: an oversized request must land between the minimum floor and 10000.
    #[test]
    fn test_databricks_token_cap() {
        let summary_tokens =
            calculate_capped_summary_tokens(&g3_config::Config::default(), "databricks", 50000);
        assert!(
            summary_tokens <= 10000,
            "Databricks should cap at 10000, got {}",
            summary_tokens
        );
        assert!(summary_tokens >= SUMMARY_MIN_TOKENS, "Should respect minimum floor");
    }
    /// Embedded: the upper bound is lower (3000) than for the hosted providers.
    #[test]
    fn test_embedded_token_cap() {
        let summary_tokens =
            calculate_capped_summary_tokens(&g3_config::Config::default(), "embedded", 50000);
        assert!(
            summary_tokens <= 3000,
            "Embedded should cap at 3000, got {}",
            summary_tokens
        );
        assert!(summary_tokens >= SUMMARY_MIN_TOKENS, "Should respect minimum floor");
    }
    /// Unrecognised provider names fall back to a conservative 5000 upper bound.
    #[test]
    fn test_unknown_provider_token_cap() {
        let summary_tokens = calculate_capped_summary_tokens(
            &g3_config::Config::default(),
            "unknown_provider",
            50000,
        );
        assert!(
            summary_tokens <= 5000,
            "Unknown providers should cap at 5000, got {}",
            summary_tokens
        );
        assert!(summary_tokens >= SUMMARY_MIN_TOKENS, "Should respect minimum floor");
    }
    /// A base value that is already within bounds is returned unchanged.
    #[test]
    fn test_small_base_tokens_preserved() {
        // 2000 is expected to sit between the minimum floor and the provider cap,
        // so the function should neither raise nor lower it.
        let summary_tokens =
            calculate_capped_summary_tokens(&g3_config::Config::default(), "anthropic", 2000);
        assert_eq!(summary_tokens, 2000, "Small base tokens should be preserved");
    }
    /// A base value below the floor is raised to `SUMMARY_MIN_TOKENS`.
    #[test]
    fn test_minimum_floor_enforced() {
        let summary_tokens =
            calculate_capped_summary_tokens(&g3_config::Config::default(), "anthropic", 100);
        assert_eq!(
            summary_tokens, SUMMARY_MIN_TOKENS,
            "Minimum floor should be enforced"
        );
    }
 }
 // =============================================================================
 // Test: Thinking mode disable logic
 // =============================================================================
 mod thinking_mode_disable {
    use super::*;
    /// With the default config (no thinking settings), Anthropic never asks
    /// for thinking to be disabled.
    #[test]
    fn test_no_thinking_config_no_disable() {
        let default_cfg = g3_config::Config::default();
        assert!(
            !should_disable_thinking(&default_cfg, "anthropic", 5000),
            "Should not disable thinking when no config exists"
        );
    }
    /// Providers other than Anthropic never ask for thinking to be disabled.
    #[test]
    fn test_non_anthropic_no_thinking_disable() {
        let default_cfg = g3_config::Config::default();
        assert!(
            !should_disable_thinking(&default_cfg, "databricks", 1000),
            "Non-Anthropic providers should not disable thinking"
        );
    }
 }
 // =============================================================================
 // Test: Summary message building
 // =============================================================================
 mod summary_message_building {
    use super::*;
    /// A short system/user/assistant exchange yields a two-message summary
    /// request whose user part embeds the conversation text.
    #[test]
    fn test_build_summary_messages_basic() {
        let mut window = ContextWindow::new(10000);
        // Seed the window with a minimal three-turn conversation, in order.
        for (role, text) in [
            (MessageRole::System, "You are a helpful assistant."),
            (MessageRole::User, "Hello, how are you?"),
            (MessageRole::Assistant, "I'm doing well, thank you!"),
        ] {
            window.add_message(Message::new(role, text.to_string()));
        }
        let summary_request = build_summary_messages(&window);
        // Exactly two messages are expected: the summarizer prompt and the request.
        assert_eq!(summary_request.len(), 2, "Should have system and user messages");
        // The system prompt instructs the model to summarize.
        let system_msg = &summary_request[0];
        assert!(matches!(system_msg.role, MessageRole::System));
        assert!(system_msg.content.contains("concise summaries"));
        // The user message carries the original conversation text.
        let user_msg = &summary_request[1];
        assert!(matches!(user_msg.role, MessageRole::User));
        assert!(user_msg.content.contains("Hello, how are you?"));
        assert!(user_msg.content.contains("I'm doing well"));
    }
    /// An empty context window still produces the system + user pair.
    #[test]
    fn test_build_summary_messages_empty_conversation() {
        let summary_request = build_summary_messages(&ContextWindow::new(10000));
        assert_eq!(summary_request.len(), 2);
        assert!(matches!(summary_request[0].role, MessageRole::System));
        assert!(matches!(summary_request[1].role, MessageRole::User));
    }
    /// A 100-message conversation shows up in the request from first turn to last.
    #[test]
    fn test_build_summary_messages_long_conversation() {
        let mut window = ContextWindow::new(100000);
        // 50 user/assistant pairs, numbered 0 through 49.
        for turn in 0..50 {
            let question = format!("User message number {}", turn);
            window.add_message(Message::new(MessageRole::User, question));
            let answer = format!("Assistant response number {}", turn);
            window.add_message(Message::new(MessageRole::Assistant, answer));
        }
        let summary_request = build_summary_messages(&window);
        // Spot-check the earliest and latest turns inside the user message.
        let transcript = &summary_request[1].content;
        assert!(transcript.contains("User message number 0"));
        assert!(transcript.contains("User message number 49"));
        assert!(transcript.contains("Assistant response number 49"));
    }
 }
 // =============================================================================
 // Test: CompactionResult behavior
 // =============================================================================
 mod compaction_result {
    use super::*;
    /// `success(n)` sets the success flag, records `n` chars saved and carries no error.
    #[test]
    fn test_success_result() {
        let outcome = CompactionResult::success(5000);
        assert!(outcome.success);
        assert_eq!(outcome.chars_saved, 5000);
        assert!(outcome.error.is_none());
    }
    /// `failure(msg)` clears the success flag, reports zero savings and keeps the message.
    #[test]
    fn test_failure_result() {
        let outcome = CompactionResult::failure(String::from("API error"));
        assert!(!outcome.success);
        assert_eq!(outcome.chars_saved, 0);
        assert_eq!(outcome.error, Some(String::from("API error")));
    }
    /// Saving zero characters still counts as a successful compaction.
    #[test]
    fn test_zero_chars_saved_success() {
        let outcome = CompactionResult::success(0);
        assert!(outcome.success);
        assert_eq!(outcome.chars_saved, 0);
    }
 }
--- a/crates/g3-core/tests/error_classification_test.rs
+++ b/crates/g3-core/tests/error_classification_test.rs
@@ -0,0 +1,356 @@
 //! Error Classification Integration Tests
 //!
 //! CHARACTERIZATION: These tests verify the observable behavior of error
 //! classification through stable public interfaces.
 //!
 //! What these tests protect:
 //! - Error messages are correctly classified as recoverable/non-recoverable
 //! - Specific error types (rate limit, timeout, server error) are detected
 //! - Retry delay calculation produces reasonable values
 //!
 //! What these tests intentionally do NOT assert:
 //! - Exact delay values (only ranges and relative behavior)
 //! - Internal classification implementation details
 use g3_core::error_handling::{
    classify_error, calculate_retry_delay, ErrorType, RecoverableError,
 };
 // =============================================================================
 // Test: Error classification for recoverable errors
 // =============================================================================
 mod recoverable_error_classification {
    use super::*;
    /// "Rate limit exceeded" maps to the recoverable `RateLimit` variant.
    #[test]
    fn test_rate_limit_detected() {
        let classification = classify_error(&anyhow::anyhow!("Rate limit exceeded"));
        let as_expected = matches!(
            classification,
            ErrorType::Recoverable(RecoverableError::RateLimit)
        );
        assert!(as_expected, "Rate limit should be recoverable: {:?}", classification);
    }
    /// An HTTP 429 status in the message is treated as a rate limit.
    #[test]
    fn test_429_status_detected() {
        let classification = classify_error(&anyhow::anyhow!("HTTP 429 Too Many Requests"));
        let as_expected = matches!(
            classification,
            ErrorType::Recoverable(RecoverableError::RateLimit)
        );
        assert!(as_expected, "429 should be rate limit: {:?}", classification);
    }
    /// "Request timed out" maps to the recoverable `Timeout` variant.
    #[test]
    fn test_timeout_detected() {
        let classification = classify_error(&anyhow::anyhow!("Request timed out"));
        let as_expected = matches!(
            classification,
            ErrorType::Recoverable(RecoverableError::Timeout)
        );
        assert!(as_expected, "Timeout should be recoverable: {:?}", classification);
    }
    /// A 500 status in the message maps to the recoverable `ServerError` variant.
    #[test]
    fn test_server_error_500_detected() {
        let classification = classify_error(&anyhow::anyhow!("Server error 500"));
        let as_expected = matches!(
            classification,
            ErrorType::Recoverable(RecoverableError::ServerError)
        );
        assert!(as_expected, "500 should be server error: {:?}", classification);
    }
    /// "502 Bad Gateway" maps to the recoverable `ServerError` variant.
    #[test]
    fn test_server_error_502_detected() {
        let classification = classify_error(&anyhow::anyhow!("502 Bad Gateway"));
        let as_expected = matches!(
            classification,
            ErrorType::Recoverable(RecoverableError::ServerError)
        );
        assert!(as_expected, "502 should be server error: {:?}", classification);
    }
    /// "503 Service Unavailable" maps to the recoverable `ServerError` variant.
    #[test]
    fn test_server_error_503_detected() {
        let classification = classify_error(&anyhow::anyhow!("503 Service Unavailable"));
        let as_expected = matches!(
            classification,
            ErrorType::Recoverable(RecoverableError::ServerError)
        );
        assert!(as_expected, "503 should be server error: {:?}", classification);
    }
    /// "Connection refused" maps to the recoverable `NetworkError` variant.
    #[test]
    fn test_network_error_detected() {
        let classification = classify_error(&anyhow::anyhow!("Connection refused"));
        let as_expected = matches!(
            classification,
            ErrorType::Recoverable(RecoverableError::NetworkError)
        );
        assert!(
            as_expected,
            "Connection refused should be network error: {:?}",
            classification
        );
    }
    /// "Connection reset by peer" maps to the recoverable `NetworkError` variant.
    #[test]
    fn test_connection_reset_detected() {
        let classification = classify_error(&anyhow::anyhow!("Connection reset by peer"));
        let as_expected = matches!(
            classification,
            ErrorType::Recoverable(RecoverableError::NetworkError)
        );
        assert!(
            as_expected,
            "Connection reset should be network error: {:?}",
            classification
        );
    }
    /// An "overloaded" message maps to the recoverable `ModelBusy` variant.
    #[test]
    fn test_model_busy_detected() {
        let classification = classify_error(&anyhow::anyhow!("Server is overloaded"));
        let as_expected = matches!(
            classification,
            ErrorType::Recoverable(RecoverableError::ModelBusy)
        );
        assert!(as_expected, "Overloaded should be model busy: {:?}", classification);
    }
    /// A 400 response naming `context_length_exceeded` maps to `ContextLengthExceeded`.
    /// CHARACTERIZATION: the message has to carry "400" or "bad request" in
    /// addition to the context-length keywords for this variant to be chosen.
    #[test]
    fn test_context_length_exceeded_detected() {
        let classification = classify_error(&anyhow::anyhow!(
            "400 Bad Request: context_length_exceeded: too many tokens"
        ));
        let as_expected = matches!(
            classification,
            ErrorType::Recoverable(RecoverableError::ContextLengthExceeded)
        );
        assert!(
            as_expected,
            "Context length exceeded should be detected: {:?}",
            classification
        );
    }
    /// "token limit exceeded" maps to the recoverable `TokenLimit` variant.
    /// CHARACTERIZATION: requires "token" together with "limit" or "exceeded".
    #[test]
    fn test_token_limit_detected() {
        let classification = classify_error(&anyhow::anyhow!("token limit exceeded"));
        let as_expected = matches!(
            classification,
            ErrorType::Recoverable(RecoverableError::TokenLimit)
        );
        assert!(as_expected, "Token limit should be detected: {:?}", classification);
    }
 }
 // =============================================================================
 // Test: Error classification for non-recoverable errors
 // =============================================================================
 mod non_recoverable_error_classification {
    use super::*;
    /// An invalid API key is not worth retrying.
    #[test]
    fn test_invalid_api_key_non_recoverable() {
        let classification = classify_error(&anyhow::anyhow!("Invalid API key"));
        let is_fatal = matches!(classification, ErrorType::NonRecoverable);
        assert!(
            is_fatal,
            "Invalid API key should be non-recoverable: {:?}",
            classification
        );
    }
    /// A failed authentication is not worth retrying.
    #[test]
    fn test_auth_failure_non_recoverable() {
        let classification = classify_error(&anyhow::anyhow!("Authentication failed"));
        let is_fatal = matches!(classification, ErrorType::NonRecoverable);
        assert!(
            is_fatal,
            "Auth failure should be non-recoverable: {:?}",
            classification
        );
    }
    /// A message with no recognised keyword defaults to non-recoverable.
    #[test]
    fn test_generic_error_non_recoverable() {
        let classification = classify_error(&anyhow::anyhow!("Something went wrong"));
        let is_fatal = matches!(classification, ErrorType::NonRecoverable);
        assert!(
            is_fatal,
            "Generic error should be non-recoverable: {:?}",
            classification
        );
    }
    /// "401 Unauthorized" is non-recoverable.
    #[test]
    fn test_401_non_recoverable() {
        let classification = classify_error(&anyhow::anyhow!("401 Unauthorized"));
        let is_fatal = matches!(classification, ErrorType::NonRecoverable);
        assert!(is_fatal, "401 should be non-recoverable: {:?}", classification);
    }
    /// "403 Forbidden" is non-recoverable.
    #[test]
    fn test_403_non_recoverable() {
        let classification = classify_error(&anyhow::anyhow!("403 Forbidden"));
        let is_fatal = matches!(classification, ErrorType::NonRecoverable);
        assert!(is_fatal, "403 should be non-recoverable: {:?}", classification);
    }
 }
 // =============================================================================
 // Test: Retry delay calculation
 // =============================================================================
 mod retry_delay_calculation {
    use super::*;
    use std::time::Duration;
    /// The first retry waits somewhere between 500ms and 5s.
    #[test]
    fn test_first_retry_delay() {
        let (lower, upper) = (Duration::from_millis(500), Duration::from_secs(5));
        let wait = calculate_retry_delay(1, false);
        // The bounds are loose on purpose: the delay includes jitter.
        assert!(wait >= lower, "Delay should be at least 500ms: {:?}", wait);
        assert!(wait <= upper, "Delay should be at most 5s: {:?}", wait);
    }
    /// Across attempts 1..=3 at least one step must not shrink the delay.
    #[test]
    fn test_delays_increase() {
        let first = calculate_retry_delay(1, false);
        let second = calculate_retry_delay(2, false);
        let third = calculate_retry_delay(3, false);
        // Jitter can reorder adjacent attempts, so only the overall trend is
        // checked: one of the two consecutive steps has to be non-decreasing.
        let trend_holds = second >= first || third >= second;
        assert!(
            trend_holds,
            "Delays should generally increase: {:?} -> {:?} -> {:?}",
            first, second, third
        );
    }
    /// Third-attempt delays stay within their per-mode bounds (30s default, 180s autonomous).
    #[test]
    fn test_autonomous_mode_delays() {
        let interactive_wait = calculate_retry_delay(3, false);
        let autonomous_wait = calculate_retry_delay(3, true);
        // Autonomous mode is meant to wait longer, but because of jitter the two
        // values are not compared with each other; each only gets an upper bound.
        assert!(
            interactive_wait <= Duration::from_secs(30),
            "Default delay should be reasonable: {:?}",
            interactive_wait
        );
        assert!(
            autonomous_wait <= Duration::from_secs(180),
            "Autonomous delay should be reasonable: {:?}",
            autonomous_wait
        );
    }
    /// In default mode a high attempt number still yields at most 15s.
    #[test]
    fn test_delay_cap() {
        let wait = calculate_retry_delay(10, false);
        assert!(
            wait <= Duration::from_secs(15),
            "Default mode delay should be capped: {:?}",
            wait
        );
    }
    /// In autonomous mode a high attempt number still yields at most 300s.
    /// CHARACTERIZATION: autonomous delays are longer, spread over about 10 minutes.
    #[test]
    fn test_autonomous_delay_cap() {
        let wait = calculate_retry_delay(10, true);
        // Expected ceiling is roughly 200s plus jitter; 300s leaves headroom.
        assert!(
            wait <= Duration::from_secs(300),
            "Autonomous delay should be capped: {:?}",
            wait
        );
    }
 }
 // =============================================================================
 // Test: Edge cases and priority
 // =============================================================================
 mod edge_cases {
    use super::*;
    /// When a message mentions both a rate limit and a timeout, rate limit wins.
    #[test]
    fn test_rate_limit_priority_over_timeout() {
        let classification = classify_error(&anyhow::anyhow!("Rate limit exceeded after timeout"));
        let is_rate_limit = matches!(
            classification,
            ErrorType::Recoverable(RecoverableError::RateLimit)
        );
        assert!(is_rate_limit, "Rate limit should take priority: {:?}", classification);
    }
    /// Keyword detection ignores letter case.
    #[test]
    fn test_case_insensitive_detection() {
        let classification = classify_error(&anyhow::anyhow!("RATE LIMIT EXCEEDED"));
        let is_rate_limit = matches!(
            classification,
            ErrorType::Recoverable(RecoverableError::RateLimit)
        );
        assert!(is_rate_limit, "Should detect uppercase: {:?}", classification);
    }
    /// An empty message matches no keyword and is therefore non-recoverable.
    #[test]
    fn test_empty_error_message() {
        let classification = classify_error(&anyhow::anyhow!(""));
        let is_fatal = matches!(classification, ErrorType::NonRecoverable);
        assert!(
            is_fatal,
            "Empty error should be non-recoverable: {:?}",
            classification
        );
    }
    /// "Connection timeout" is reported as `NetworkError`, not `Timeout`.
    /// CHARACTERIZATION: this pins the current behavior, in which the
    /// "connection" keyword takes priority over "timeout".
    #[test]
    fn test_connection_timeout_classification() {
        let classification = classify_error(&anyhow::anyhow!("Connection timeout"));
        let is_network = matches!(
            classification,
            ErrorType::Recoverable(RecoverableError::NetworkError)
        );
        assert!(
            is_network,
            "Connection timeout should be network error (per priority): {:?}",
            classification
        );
    }
 }
--- a/crates/g3-core/tests/retry_behavior_test.rs
+++ b/crates/g3-core/tests/retry_behavior_test.rs
@@ -0,0 +1,300 @@
 //! Retry Behavior Integration Tests
 //!
 //! CHARACTERIZATION: These tests verify the observable behavior of retry
 //! infrastructure through stable public interfaces.
 //!
 //! What these tests protect:
 //! - RetryConfig construction and presets
 //! - RetryResult state transitions
 //! - retry_operation behavior with simulated errors
 //!
 //! What these tests intentionally do NOT assert:
 //! - Internal timing details (only that delays occur)
 //! - Specific backoff calculations (only that they increase)
 //! - Agent internals (tested via execute_with_retry separately)
 use g3_core::retry::{RetryConfig, RetryResult, retry_operation};
 use g3_core::ContextWindow;
 use g3_core::TaskResult;
 use std::sync::atomic::{AtomicU32, Ordering};
 use std::sync::Arc;
 // =============================================================================
 // Test: RetryConfig presets and customization
 // =============================================================================
 mod retry_config_presets {
    use super::*;
    /// `RetryConfig::default()`: 3 retries, not autonomous, role "agent".
    #[test]
    fn test_default_config() {
        let preset = RetryConfig::default();
        assert_eq!(preset.max_retries, 3);
        assert!(!preset.is_autonomous);
        assert_eq!(preset.role_name, "agent");
    }
    /// `RetryConfig::player()`: 3 retries, autonomous, role "player".
    #[test]
    fn test_player_preset() {
        let preset = RetryConfig::player();
        assert_eq!(preset.max_retries, 3);
        assert!(preset.is_autonomous, "Player should be autonomous");
        assert_eq!(preset.role_name, "player");
    }
    /// `RetryConfig::coach()`: 3 retries, autonomous, role "coach".
    #[test]
    fn test_coach_preset() {
        let preset = RetryConfig::coach();
        assert_eq!(preset.max_retries, 3);
        assert!(preset.is_autonomous, "Coach should be autonomous");
        assert_eq!(preset.role_name, "coach");
    }
    /// `RetryConfig::planning(role)`: 3 retries, autonomous, caller-supplied role.
    #[test]
    fn test_planning_preset() {
        let preset = RetryConfig::planning("reviewer");
        assert_eq!(preset.max_retries, 3);
        assert!(preset.is_autonomous, "Planning should be autonomous");
        assert_eq!(preset.role_name, "reviewer");
    }
    /// `with_max_retries` overrides only the retry count of a preset.
    #[test]
    fn test_custom_max_retries() {
        let base = RetryConfig::player();
        let tuned = base.with_max_retries(10);
        assert_eq!(tuned.max_retries, 10);
        // The remaining player-preset fields must carry over untouched.
        assert!(tuned.is_autonomous);
        assert_eq!(tuned.role_name, "player");
    }
    /// Customizing the default config keeps it non-autonomous.
    #[test]
    fn test_chained_customization() {
        let tuned = RetryConfig::default().with_max_retries(5);
        assert_eq!(tuned.max_retries, 5);
        // `default()` is non-autonomous and the builder call must not flip that.
        assert!(!tuned.is_autonomous);
    }
 }
 // =============================================================================
 // Test: RetryResult state handling
 // =============================================================================
 mod retry_result_states {
    use super::*;
    /// `Success` reports itself as successful.
    #[test]
    fn test_success_is_success() {
        let task = TaskResult::new(String::from("done"), ContextWindow::new(1000));
        let outcome = RetryResult::Success(task);
        assert!(outcome.is_success());
    }
    /// `MaxRetriesReached` is a failure state.
    #[test]
    fn test_max_retries_not_success() {
        let outcome = RetryResult::MaxRetriesReached(String::from("timeout"));
        assert!(!outcome.is_success());
    }
    /// `ContextLengthExceeded` is a failure state.
    #[test]
    fn test_context_exceeded_not_success() {
        let outcome = RetryResult::ContextLengthExceeded(String::from("too long"));
        assert!(!outcome.is_success());
    }
    /// `Panic` is a failure state.
    #[test]
    fn test_panic_not_success() {
        let outcome = RetryResult::Panic(anyhow::anyhow!("panic occurred"));
        assert!(!outcome.is_success());
    }
    /// `into_result` hands back the wrapped `TaskResult` for `Success`.
    #[test]
    fn test_into_result_success() {
        let task = TaskResult::new(String::from("done"), ContextWindow::new(1000));
        let extracted = RetryResult::Success(task).into_result();
        assert!(extracted.is_some());
        assert_eq!(extracted.unwrap().response, "done");
    }
    /// `into_result` yields `None` for a failure variant.
    #[test]
    fn test_into_result_failure() {
        let extracted = RetryResult::MaxRetriesReached(String::from("error")).into_result();
        assert!(extracted.is_none());
    }
 }
 // =============================================================================
 // Test: retry_operation behavior
 // =============================================================================
 mod retry_operation_behavior {
    use super::*;
    /// An operation that succeeds immediately is invoked exactly once and its
    /// value is returned unchanged.
    #[tokio::test]
    async fn test_success_first_try() {
        // Shared counter so the closure can record how often it was invoked.
        let call_count = Arc::new(AtomicU32::new(0));
        let call_count_clone = call_count.clone();
        let result = retry_operation(
            "test_op",
            || {
                let count = call_count_clone.clone();
                async move {
                    count.fetch_add(1, Ordering::SeqCst);
                    Ok::<_, anyhow::Error>("success")
                }
            },
            3,
            false,
            |_msg| {},
        ).await;
        assert!(result.is_ok());
        assert_eq!(result.unwrap(), "success");
        assert_eq!(call_count.load(Ordering::SeqCst), 1, "Should only call once on success");
    }
    /// A non-recoverable error ("Invalid API key") is returned after a single
    /// attempt; no retry takes place.
    #[tokio::test]
    async fn test_non_recoverable_fails_immediately() {
        let call_count = Arc::new(AtomicU32::new(0));
        let call_count_clone = call_count.clone();
        let result = retry_operation(
            "test_op",
            || {
                let count = call_count_clone.clone();
                async move {
                    count.fetch_add(1, Ordering::SeqCst);
                    Err::<String, _>(anyhow::anyhow!("Invalid API key"))
                }
            },
            3,
            false,
            |_msg| {},
        ).await;
        assert!(result.is_err());
        assert_eq!(call_count.load(Ordering::SeqCst), 1, "Non-recoverable should not retry");
    }
    /// A recoverable error that never clears is retried until the attempt
    /// budget is spent, then reported as an error.
    #[tokio::test]
    async fn test_recoverable_retries_to_max() {
        let call_count = Arc::new(AtomicU32::new(0));
        let call_count_clone = call_count.clone();
        let result = retry_operation(
            "test_op",
            || {
                let count = call_count_clone.clone();
                async move {
                    count.fetch_add(1, Ordering::SeqCst);
                    // Rate limit is a recoverable error
                    Err::<String, _>(anyhow::anyhow!("Rate limit exceeded"))
                }
            },
            3, // max retries
            false,
            |_msg| {},
        ).await;
        assert!(result.is_err());
        // CHARACTERIZATION: the limit bounds the TOTAL number of attempts.
        // With a limit of 3 the operation runs 3 times, not 1 initial + 3 retries.
        assert_eq!(call_count.load(Ordering::SeqCst), 3, "Should retry up to max");
    }
    /// A recoverable error that clears on the third attempt ends in success.
    #[tokio::test]
    async fn test_recoverable_succeeds_on_retry() {
        let call_count = Arc::new(AtomicU32::new(0));
        let call_count_clone = call_count.clone();
        let result = retry_operation(
            "test_op",
            || {
                let count = call_count_clone.clone();
                async move {
                    // `fetch_add` returns the value BEFORE the increment, so
                    // `current` is the zero-based attempt index.
                    let current = count.fetch_add(1, Ordering::SeqCst);
                    if current < 2 {
                        // Fail first two times with recoverable error
                        Err(anyhow::anyhow!("Server error 500"))
                    } else {
                        // Succeed on third try
                        Ok("success after retry")
                    }
                }
            },
            5, // max retries
            false,
            |_msg| {},
        ).await;
        assert!(result.is_ok());
        assert_eq!(result.unwrap(), "success after retry");
        assert_eq!(call_count.load(Ordering::SeqCst), 3, "Should succeed on third try");
    }
    /// The print callback receives at least one message mentioning the rate
    /// limit when a retry happens, and the operation itself recovers.
    #[tokio::test]
    async fn test_print_fn_called_on_retry() {
        // Collects every message passed to the print callback.
        let messages = Arc::new(std::sync::Mutex::new(Vec::new()));
        let messages_clone = messages.clone();
        let call_count = Arc::new(AtomicU32::new(0));
        let call_count_clone = call_count.clone();
        let result = retry_operation(
            "test_op",
            || {
                let count = call_count_clone.clone();
                async move {
                    // Zero-based attempt index: fail only the very first call.
                    let current = count.fetch_add(1, Ordering::SeqCst);
                    if current < 1 {
                        Err(anyhow::anyhow!("Rate limit exceeded"))
                    } else {
                        Ok("success")
                    }
                }
            },
            3,
            false,
            |msg| {
                messages_clone.lock().unwrap().push(msg.to_string());
            },
        ).await;
        // Do not discard the outcome: the messages below are only meaningful
        // if the retry path ran and the second attempt succeeded.
        assert!(result.is_ok(), "Operation should succeed on the second attempt");
        assert_eq!(call_count.load(Ordering::SeqCst), 2, "Should fail once, then succeed");
        let msgs = messages.lock().unwrap();
        assert!(!msgs.is_empty(), "Should have printed retry messages");
        // Should mention the error type
        assert!(msgs.iter().any(|m| m.contains("RateLimit") || m.contains("rate")), 
            "Should mention rate limit in messages: {:?}", msgs);
    }
 }
--- a/crates/g3-core/tests/tool_execution_roundtrip_test.rs
+++ b/crates/g3-core/tests/tool_execution_roundtrip_test.rs
@@ -0,0 +1,466 @@
 //! Tool Execution Round-Trip Integration Tests
 //!
 //! CHARACTERIZATION: These tests verify that tools execute correctly through
 //! the Agent interface, testing the full round-trip from tool call to result.
 //!
 //! What these tests protect:
 //! - File operations (read, write, str_replace) work end-to-end
 //! - Shell command execution produces expected output
 //! - TODO operations persist correctly
 //! - Error handling for invalid inputs
 //!
 //! What these tests intentionally do NOT assert:
 //! - Internal implementation details of tools
 //! - Specific formatting of success messages (only key content)
 //! - UI writer behavior (uses NullUiWriter)
 use g3_core::ui_writer::NullUiWriter;
 use g3_core::{Agent, ToolCall};
 use serial_test::serial;
 use std::fs;
 use tempfile::TempDir;
 // =============================================================================
 // Test Helpers
 // =============================================================================
 /// Build an `Agent` that uses `NullUiWriter` and the default configuration.
 ///
 /// Before the agent is constructed, the current working directory of the
 /// whole test process is switched to `temp_dir`. That change is global to the
 /// process; the tests in this file that call this helper are all marked
 /// `#[serial]`, presumably so they do not race on it.
 ///
 /// Panics if the directory change or the agent construction fails.
 async fn create_test_agent(temp_dir: &TempDir) -> Agent<NullUiWriter> {
    let workspace = temp_dir.path();
    std::env::set_current_dir(workspace).unwrap();
    Agent::new(g3_config::Config::default(), NullUiWriter).await.unwrap()
 }
 /// Assemble a `ToolCall` from a tool name and its JSON arguments.
 ///
 /// The name is copied into an owned `String`; `args` is moved in unchanged.
 fn make_tool_call(tool: &str, args: serde_json::Value) -> ToolCall {
    let tool = String::from(tool);
    ToolCall { tool, args }
 }
 // =============================================================================
 // Test: read_file tool execution
 // =============================================================================
 mod read_file_execution {
    use super::*;
    /// Test reading an existing file
    #[tokio::test]
    #[serial]
    async fn test_read_existing_file() {
        let temp_dir = TempDir::new().unwrap();
        let test_file = temp_dir.path().join("test.txt");
        fs::write(&test_file, "Hello, World!\nLine 2\nLine 3").unwrap();
        let mut agent = create_test_agent(&temp_dir).await;
        let tool_call = make_tool_call(
            "read_file",
            serde_json::json!({ "file_path": test_file.to_string_lossy() }),
        );
        let result = agent.execute_tool(&tool_call).await.unwrap();
        assert!(result.contains("Hello, World!"), "Should contain file content: {}", result);
        assert!(result.contains("Line 2"), "Should contain all lines: {}", result);
    }
    /// Test reading a non-existent file returns error
    #[tokio::test]
    #[serial]
    async fn test_read_nonexistent_file() {
        let temp_dir = TempDir::new().unwrap();
        let mut agent = create_test_agent(&temp_dir).await;
        let tool_call = make_tool_call(
            "read_file",
            serde_json::json!({ "file_path": "/nonexistent/path/file.txt" }),
        );
        let result = agent.execute_tool(&tool_call).await;
        // Should return an error or error message
        assert!(
            result.is_err() || result.as_ref().unwrap().contains("error") || result.as_ref().unwrap().contains("not found") || result.as_ref().unwrap().contains("No such file"),
            "Should indicate file not found: {:?}", result
        );
    }
    /// Test reading with character range
    #[tokio::test]
    #[serial]
    async fn test_read_file_with_range() {
        let temp_dir = TempDir::new().unwrap();
        let test_file = temp_dir.path().join("test.txt");
        fs::write(&test_file, "0123456789ABCDEF").unwrap();
        let mut agent = create_test_agent(&temp_dir).await;
        let tool_call = make_tool_call(
            "read_file",
            serde_json::json!({
                "file_path": test_file.to_string_lossy(),
                "start": 5,
                "end": 10
            }),
        );
        let result = agent.execute_tool(&tool_call).await.unwrap();
        // Should contain the substring from position 5 to 10
        assert!(result.contains("56789"), "Should contain range content: {}", result);
    }
 }
 // =============================================================================
 // Test: write_file tool execution
 // =============================================================================
 mod write_file_execution {
    use super::*;
    /// Test writing a new file
    #[tokio::test]
    #[serial]
    async fn test_write_new_file() {
        let temp_dir = TempDir::new().unwrap();
        let new_file = temp_dir.path().join("new_file.txt");
        assert!(!new_file.exists(), "File should not exist initially");
        let mut agent = create_test_agent(&temp_dir).await;
        let tool_call = make_tool_call(
            "write_file",
            serde_json::json!({
                "file_path": new_file.to_string_lossy(),
                "content": "New content here"
            }),
        );
        let result = agent.execute_tool(&tool_call).await.unwrap();
        // Should report success
        assert!(result.contains("✅") || result.to_lowercase().contains("success") || result.to_lowercase().contains("wrote"),
            "Should report success: {}", result);
        // File should now exist with correct content
        assert!(new_file.exists(), "File should exist after write");
        let content = fs::read_to_string(&new_file).unwrap();
        assert_eq!(content, "New content here");
    }
    /// Test overwriting an existing file
    #[tokio::test]
    #[serial]
    async fn test_overwrite_existing_file() {
        let temp_dir = TempDir::new().unwrap();
        let test_file = temp_dir.path().join("existing.txt");
        fs::write(&test_file, "Original content").unwrap();
        let mut agent = create_test_agent(&temp_dir).await;
        let tool_call = make_tool_call(
            "write_file",
            serde_json::json!({
                "file_path": test_file.to_string_lossy(),
                "content": "Replaced content"
            }),
        );
        let result = agent.execute_tool(&tool_call).await.unwrap();
        assert!(result.contains("✅") || result.to_lowercase().contains("success") || result.to_lowercase().contains("wrote"),
            "Should report success: {}", result);
        let content = fs::read_to_string(&test_file).unwrap();
        assert_eq!(content, "Replaced content");
    }
    /// Test writing creates parent directories
    #[tokio::test]
    #[serial]
    async fn test_write_creates_parent_dirs() {
        let temp_dir = TempDir::new().unwrap();
        let nested_file = temp_dir.path().join("a/b/c/nested.txt");
        let mut agent = create_test_agent(&temp_dir).await;
        let tool_call = make_tool_call(
            "write_file",
            serde_json::json!({
                "file_path": nested_file.to_string_lossy(),
                "content": "Nested content"
            }),
        );
        let result = agent.execute_tool(&tool_call).await.unwrap();
        assert!(result.contains("✅") || result.to_lowercase().contains("success") || result.to_lowercase().contains("wrote"),
            "Should report success: {}", result);
        assert!(nested_file.exists(), "Nested file should exist");
        let content = fs::read_to_string(&nested_file).unwrap();
        assert_eq!(content, "Nested content");
    }
 }
 // =============================================================================
 // Test: shell tool execution
 // =============================================================================
 mod shell_execution {
    use super::*;
    /// Test simple echo command
    #[tokio::test]
    #[serial]
    async fn test_shell_echo() {
        let temp_dir = TempDir::new().unwrap();
        let mut agent = create_test_agent(&temp_dir).await;
        let tool_call = make_tool_call(
            "shell",
            serde_json::json!({ "command": "echo 'hello world'" }),
        );
        let result = agent.execute_tool(&tool_call).await.unwrap();
        assert!(result.contains("hello world"), "Should contain echo output: {}", result);
    }
    /// Test command that produces multi-line output
    #[tokio::test]
    #[serial]
    async fn test_shell_multiline_output() {
        let temp_dir = TempDir::new().unwrap();
        let mut agent = create_test_agent(&temp_dir).await;
        let tool_call = make_tool_call(
            "shell",
            serde_json::json!({ "command": "echo 'line1'; echo 'line2'; echo 'line3'" }),
        );
        let result = agent.execute_tool(&tool_call).await.unwrap();
        assert!(result.contains("line1"), "Should contain line1: {}", result);
        assert!(result.contains("line2"), "Should contain line2: {}", result);
        assert!(result.contains("line3"), "Should contain line3: {}", result);
    }
    /// Test command that fails
    #[tokio::test]
    #[serial]
    async fn test_shell_failing_command() {
        let temp_dir = TempDir::new().unwrap();
        let mut agent = create_test_agent(&temp_dir).await;
        let tool_call = make_tool_call(
            "shell",
            serde_json::json!({ "command": "exit 1" }),
        );
        let result = agent.execute_tool(&tool_call).await;
        // Should indicate failure (either error or non-zero exit)
        assert!(
            result.is_err() || result.as_ref().unwrap().contains("exit") || result.as_ref().unwrap().contains("failed") || result.as_ref().unwrap().contains("error"),
            "Should indicate command failure: {:?}", result
        );
    }
    /// Test command with working directory context
    #[tokio::test]
    #[serial]
    async fn test_shell_pwd() {
        let temp_dir = TempDir::new().unwrap();
        let mut agent = create_test_agent(&temp_dir).await;
        let tool_call = make_tool_call(
            "shell",
            serde_json::json!({ "command": "pwd" }),
        );
        let result = agent.execute_tool(&tool_call).await.unwrap();
        // Should show the temp directory path
        let temp_path = temp_dir.path().to_string_lossy();
        assert!(result.contains(&*temp_path) || result.contains("private"), 
            "Should show current directory: {} (expected to contain {})", result, temp_path);
    }
 }
 // =============================================================================
 // Test: str_replace tool execution
 // =============================================================================
 mod str_replace_execution {
    use super::*;
    /// Test applying a simple diff
    #[tokio::test]
    #[serial]
    async fn test_str_replace_simple() {
        let temp_dir = TempDir::new().unwrap();
        let test_file = temp_dir.path().join("test.txt");
        fs::write(&test_file, "line 1\nold line\nline 3\n").unwrap();
        let mut agent = create_test_agent(&temp_dir).await;
        let diff = "@@ -1,3 +1,3 @@\n line 1\n-old line\n+new line\n line 3\n";
        let tool_call = make_tool_call(
            "str_replace",
            serde_json::json!({
                "file_path": test_file.to_string_lossy(),
                "diff": diff
            }),
        );
        let result = agent.execute_tool(&tool_call).await.unwrap();
        assert!(result.contains("✅") || result.to_lowercase().contains("applied") || result.to_lowercase().contains("success"),
            "Should report success: {}", result);
        let content = fs::read_to_string(&test_file).unwrap();
        assert!(content.contains("new line"), "Should contain new content: {}", content);
        assert!(!content.contains("old line"), "Should not contain old content: {}", content);
    }
    /// Test diff that adds lines
    #[tokio::test]
    #[serial]
    async fn test_str_replace_add_lines() {
        let temp_dir = TempDir::new().unwrap();
        let test_file = temp_dir.path().join("test.txt");
        fs::write(&test_file, "line 1\nline 3\n").unwrap();
        let mut agent = create_test_agent(&temp_dir).await;
        let diff = "@@ -1,2 +1,3 @@\n line 1\n+line 2\n line 3\n";
        let tool_call = make_tool_call(
            "str_replace",
            serde_json::json!({
                "file_path": test_file.to_string_lossy(),
                "diff": diff
            }),
        );
        let result = agent.execute_tool(&tool_call).await.unwrap();
        assert!(result.contains("✅") || result.to_lowercase().contains("applied"),
            "Should report success: {}", result);
        let content = fs::read_to_string(&test_file).unwrap();
        assert!(content.contains("line 2"), "Should contain added line: {}", content);
    }
    /// Test diff with pattern not found
    #[tokio::test]
    #[serial]
    async fn test_str_replace_pattern_not_found() {
        let temp_dir = TempDir::new().unwrap();
        let test_file = temp_dir.path().join("test.txt");
        fs::write(&test_file, "actual content\n").unwrap();
        let mut agent = create_test_agent(&temp_dir).await;
        let diff = "@@ -1,1 +1,1 @@\n-nonexistent pattern\n+replacement\n";
        let tool_call = make_tool_call(
            "str_replace",
            serde_json::json!({
                "file_path": test_file.to_string_lossy(),
                "diff": diff
            }),
        );
        let result = agent.execute_tool(&tool_call).await;
        // Should indicate pattern not found
        assert!(
            result.is_err() || result.as_ref().unwrap().to_lowercase().contains("not found") || result.as_ref().unwrap().to_lowercase().contains("pattern") || result.as_ref().unwrap().to_lowercase().contains("error"),
            "Should indicate pattern not found: {:?}", result
        );
    }
 }
 // =============================================================================
 // Test: TODO tool execution
 // =============================================================================
 mod todo_execution {
    use super::*;
    /// Content written with `todo_write` is returned by a later `todo_read`.
    #[tokio::test]
    #[serial]
    async fn test_todo_write_and_read() {
        let temp_dir = TempDir::new().unwrap();
        let mut agent = create_test_agent(&temp_dir).await;
        // Write TODO
        let write_call = make_tool_call(
            "todo_write",
            serde_json::json!({
                "content": "- [ ] Task 1\n- [x] Task 2\n- [ ] Task 3"
            }),
        );
        let write_result = agent.execute_tool(&write_call).await.unwrap();
        assert!(write_result.contains("✅") || write_result.to_lowercase().contains("success"),
            "Write should succeed: {}", write_result);
        // Read TODO
        let read_call = make_tool_call("todo_read", serde_json::json!({}));
        let read_result = agent.execute_tool(&read_call).await.unwrap();
        for task in ["Task 1", "Task 2", "Task 3"] {
            assert!(read_result.contains(task), "Should contain {}: {}", task, read_result);
        }
    }
    /// `todo_read` in a fresh directory reports that there is nothing to show.
    ///
    /// Both accepted phrases are matched case-insensitively, so "No TODO" and
    /// "no todo" are treated alike.
    #[tokio::test]
    #[serial]
    async fn test_todo_read_empty() {
        let temp_dir = TempDir::new().unwrap();
        let mut agent = create_test_agent(&temp_dir).await;
        let read_call = make_tool_call("todo_read", serde_json::json!({}));
        let result = agent.execute_tool(&read_call).await.unwrap();
        // Lowercase once so both phrases are compared the same way.
        let lowered = result.to_lowercase();
        assert!(lowered.contains("empty") || lowered.contains("no todo"),
            "Should indicate empty: {}", result);
    }
    /// `todo_write` stores its content in `todo.g3.md` inside the working directory.
    #[tokio::test]
    #[serial]
    async fn test_todo_persists_to_file() {
        let temp_dir = TempDir::new().unwrap();
        let todo_path = temp_dir.path().join("todo.g3.md");
        {
            // Scope the agent so it is dropped before the file is inspected.
            let mut agent = create_test_agent(&temp_dir).await;
            let write_call = make_tool_call(
                "todo_write",
                serde_json::json!({
                    "content": "- [ ] Persistent task"
                }),
            );
            agent.execute_tool(&write_call).await.unwrap();
        }
        // File should exist after agent is dropped
        assert!(todo_path.exists(), "TODO file should persist");
        let content = fs::read_to_string(&todo_path).unwrap();
        assert!(content.contains("Persistent task"), "Content should persist: {}", content);
    }
 }