diff --git a/crates/g3-core/tests/compaction_behavior_test.rs b/crates/g3-core/tests/compaction_behavior_test.rs new file mode 100644 index 0000000..f949dec --- /dev/null +++ b/crates/g3-core/tests/compaction_behavior_test.rs @@ -0,0 +1,236 @@ +//! Compaction Behavior Integration Tests +//! +//! CHARACTERIZATION: These tests verify the observable behavior of context +//! compaction through stable public interfaces. +//! +//! What these tests protect: +//! - Compaction configuration calculation (token caps, thinking mode) +//! - Summary message building from conversation history +//! - Compaction result handling (success/failure) +//! +//! What these tests intentionally do NOT assert: +//! - Internal implementation details of compaction +//! - Specific LLM responses (mocked at provider boundary) +//! - Exact token counts (only relative behavior) + +use g3_core::compaction::{ + calculate_capped_summary_tokens, should_disable_thinking, build_summary_messages, + CompactionResult, SUMMARY_MIN_TOKENS, +}; +use g3_core::ContextWindow; +use g3_providers::{Message, MessageRole}; + +// ============================================================================= +// Test: Token cap calculation for different providers +// ============================================================================= + +mod token_cap_calculation { + use super::*; + + /// Test that Anthropic provider gets appropriate token caps + #[test] + fn test_anthropic_token_cap() { + let config = g3_config::Config::default(); + + // Large base tokens should be capped + let capped = calculate_capped_summary_tokens(&config, "anthropic", 50000); + assert!(capped <= 10000, "Anthropic should cap at 10000 by default, got {}", capped); + assert!(capped >= SUMMARY_MIN_TOKENS, "Should respect minimum floor"); + } + + /// Test that Databricks provider gets appropriate token caps + #[test] + fn test_databricks_token_cap() { + let config = g3_config::Config::default(); + + let capped = calculate_capped_summary_tokens(&config, "databricks", 50000); + assert!(capped <= 10000, "Databricks should cap at 10000, got {}", capped); + assert!(capped >= SUMMARY_MIN_TOKENS, "Should respect minimum floor"); + } + + /// Test that embedded provider gets lower token caps + #[test] + fn test_embedded_token_cap() { + let config = g3_config::Config::default(); + + let capped = calculate_capped_summary_tokens(&config, "embedded", 50000); + assert!(capped <= 3000, "Embedded should cap at 3000, got {}", capped); + assert!(capped >= SUMMARY_MIN_TOKENS, "Should respect minimum floor"); + } + + /// Test that unknown providers get conservative caps + #[test] + fn test_unknown_provider_token_cap() { + let config = g3_config::Config::default(); + + let capped = calculate_capped_summary_tokens(&config, "unknown_provider", 50000); + assert!(capped <= 5000, "Unknown providers should cap at 5000, got {}", capped); + assert!(capped >= SUMMARY_MIN_TOKENS, "Should respect minimum floor"); + } + + /// Test that small base tokens are preserved (not increased) + #[test] + fn test_small_base_tokens_preserved() { + let config = g3_config::Config::default(); + + // If base is already small, it should be preserved (but not below minimum) + let capped = calculate_capped_summary_tokens(&config, "anthropic", 2000); + assert_eq!(capped, 2000, "Small base tokens should be preserved"); + } + + /// Test minimum floor is enforced + #[test] + fn test_minimum_floor_enforced() { + let config = g3_config::Config::default(); + + // Even with very small base, minimum should be enforced + let capped = calculate_capped_summary_tokens(&config, "anthropic", 100); + assert_eq!(capped, SUMMARY_MIN_TOKENS, "Minimum floor should be enforced"); + } +} + +// ============================================================================= +// Test: Thinking mode disable logic +// ============================================================================= + +mod thinking_mode_disable { + use super::*; + + /// Test that thinking mode is not disabled when no thinking config exists + #[test] + fn test_no_thinking_config_no_disable() { + let config = g3_config::Config::default(); + + // Without thinking config, should never disable + let should_disable = should_disable_thinking(&config, "anthropic", 5000); + assert!(!should_disable, "Should not disable thinking when no config exists"); + } + + /// Test that non-Anthropic providers don't trigger thinking disable + #[test] + fn test_non_anthropic_no_thinking_disable() { + let config = g3_config::Config::default(); + + // Non-Anthropic providers don't have thinking mode + let should_disable = should_disable_thinking(&config, "databricks", 1000); + assert!(!should_disable, "Non-Anthropic providers should not disable thinking"); + } +} + +// ============================================================================= +// Test: Summary message building +// ============================================================================= + +mod summary_message_building { + use super::*; + + /// Test that summary messages are built correctly from conversation + #[test] + fn test_build_summary_messages_basic() { + let mut context = ContextWindow::new(10000); + + // Add a simple conversation + context.add_message(Message::new( + MessageRole::System, + "You are a helpful assistant.".to_string(), + )); + context.add_message(Message::new( + MessageRole::User, + "Hello, how are you?".to_string(), + )); + context.add_message(Message::new( + MessageRole::Assistant, + "I'm doing well, thank you!".to_string(), + )); + + let messages = build_summary_messages(&context); + + // Should have exactly 2 messages: system prompt and user request + assert_eq!(messages.len(), 2, "Should have system and user messages"); + + // First should be system message for summarization + assert!(matches!(messages[0].role, MessageRole::System)); + assert!(messages[0].content.contains("concise summaries")); + + // Second should be user message with conversation + assert!(matches!(messages[1].role, MessageRole::User)); + assert!(messages[1].content.contains("Hello, how are you?")); + assert!(messages[1].content.contains("I'm doing well")); + } + + /// Test that empty conversation produces valid summary request + #[test] + fn test_build_summary_messages_empty_conversation() { + let context = ContextWindow::new(10000); + + let messages = build_summary_messages(&context); + + // Should still produce valid structure + assert_eq!(messages.len(), 2); + assert!(matches!(messages[0].role, MessageRole::System)); + assert!(matches!(messages[1].role, MessageRole::User)); + } + + /// Test that long conversations are included in summary request + #[test] + fn test_build_summary_messages_long_conversation() { + let mut context = ContextWindow::new(100000); + + // Add many messages + for i in 0..50 { + context.add_message(Message::new( + MessageRole::User, + format!("User message number {}", i), + )); + context.add_message(Message::new( + MessageRole::Assistant, + format!("Assistant response number {}", i), + )); + } + + let messages = build_summary_messages(&context); + + // Should include all conversation content + let user_content = &messages[1].content; + assert!(user_content.contains("User message number 0")); + assert!(user_content.contains("User message number 49")); + assert!(user_content.contains("Assistant response number 49")); + } +} + +// ============================================================================= +// Test: CompactionResult behavior +// ============================================================================= + +mod compaction_result { + use super::*; + + /// Test success result creation + #[test] + fn test_success_result() { + let result = CompactionResult::success(5000); + + assert!(result.success); + assert_eq!(result.chars_saved, 5000); + assert!(result.error.is_none()); + } + + /// Test failure result creation + #[test] + fn test_failure_result() { + let result = CompactionResult::failure("API error".to_string()); + + assert!(!result.success); + assert_eq!(result.chars_saved, 0); + assert_eq!(result.error, Some("API error".to_string())); + } + + /// Test zero chars saved is valid success + #[test] + fn test_zero_chars_saved_success() { + let result = CompactionResult::success(0); + + assert!(result.success); + assert_eq!(result.chars_saved, 0); + } +} diff --git a/crates/g3-core/tests/error_classification_test.rs b/crates/g3-core/tests/error_classification_test.rs new file mode 100644 index 0000000..3944ae1 --- /dev/null +++ b/crates/g3-core/tests/error_classification_test.rs @@ -0,0 +1,356 @@ +//! Error Classification Integration Tests +//! +//! CHARACTERIZATION: These tests verify the observable behavior of error +//! classification through stable public interfaces. +//! +//! What these tests protect: +//! - Error messages are correctly classified as recoverable/non-recoverable +//! - Specific error types (rate limit, timeout, server error) are detected +//! - Retry delay calculation produces reasonable values +//! +//! What these tests intentionally do NOT assert: +//! - Exact delay values (only ranges and relative behavior) +//! - Internal classification implementation details + +use g3_core::error_handling::{ + classify_error, calculate_retry_delay, ErrorType, RecoverableError, +}; + +// ============================================================================= +// Test: Error classification for recoverable errors +// ============================================================================= + +mod recoverable_error_classification { + use super::*; + + /// Test rate limit errors are classified as recoverable + #[test] + fn test_rate_limit_detected() { + let error = anyhow::anyhow!("Rate limit exceeded"); + let error_type = classify_error(&error); + + assert!( + matches!(error_type, ErrorType::Recoverable(RecoverableError::RateLimit)), + "Rate limit should be recoverable: {:?}", error_type + ); + } + + /// Test 429 status code is classified as rate limit + #[test] + fn test_429_status_detected() { + let error = anyhow::anyhow!("HTTP 429 Too Many Requests"); + let error_type = classify_error(&error); + + assert!( + matches!(error_type, ErrorType::Recoverable(RecoverableError::RateLimit)), + "429 should be rate limit: {:?}", error_type + ); + } + + /// Test timeout errors are classified as recoverable + #[test] + fn test_timeout_detected() { + let error = anyhow::anyhow!("Request timed out"); + let error_type = classify_error(&error); + + assert!( + matches!(error_type, ErrorType::Recoverable(RecoverableError::Timeout)), + "Timeout should be recoverable: {:?}", error_type + ); + } + + /// Test server errors (5xx) are classified as recoverable + #[test] + fn test_server_error_500_detected() { + let error = anyhow::anyhow!("Server error 500"); + let error_type = classify_error(&error); + + assert!( + matches!(error_type, ErrorType::Recoverable(RecoverableError::ServerError)), + "500 should be server error: {:?}", error_type + ); + } + + /// Test 502 Bad Gateway is classified as server error + #[test] + fn test_server_error_502_detected() { + let error = anyhow::anyhow!("502 Bad Gateway"); + let error_type = classify_error(&error); + + assert!( + matches!(error_type, ErrorType::Recoverable(RecoverableError::ServerError)), + "502 should be server error: {:?}", error_type + ); + } + + /// Test 503 Service Unavailable is classified as server error + #[test] + fn test_server_error_503_detected() { + let error = anyhow::anyhow!("503 Service Unavailable"); + let error_type = classify_error(&error); + + assert!( + matches!(error_type, ErrorType::Recoverable(RecoverableError::ServerError)), + "503 should be server error: {:?}", error_type + ); + } + + /// Test network errors are classified as recoverable + #[test] + fn test_network_error_detected() { + let error = anyhow::anyhow!("Connection refused"); + let error_type = classify_error(&error); + + assert!( + matches!(error_type, ErrorType::Recoverable(RecoverableError::NetworkError)), + "Connection refused should be network error: {:?}", error_type + ); + } + + /// Test connection reset is classified as network error + #[test] + fn test_connection_reset_detected() { + let error = anyhow::anyhow!("Connection reset by peer"); + let error_type = classify_error(&error); + + assert!( + matches!(error_type, ErrorType::Recoverable(RecoverableError::NetworkError)), + "Connection reset should be network error: {:?}", error_type + ); + } + + /// Test "overloaded" is classified as busy + #[test] + fn test_model_busy_detected() { + let error = anyhow::anyhow!("Server is overloaded"); + let error_type = classify_error(&error); + + assert!( + matches!(error_type, ErrorType::Recoverable(RecoverableError::ModelBusy)), + "Overloaded should be model busy: {:?}", error_type + ); + } + + /// Test context length exceeded requires 400 status code + /// CHARACTERIZATION: The error must contain "400" or "bad request" along with + /// context length keywords to be classified as ContextLengthExceeded + #[test] + fn test_context_length_exceeded_detected() { + let error = anyhow::anyhow!("400 Bad Request: context_length_exceeded: too many tokens"); + let error_type = classify_error(&error); + + assert!( + matches!(error_type, ErrorType::Recoverable(RecoverableError::ContextLengthExceeded)), + "Context length exceeded should be detected: {:?}", error_type + ); + } + + /// Test token limit exceeded is classified correctly + /// CHARACTERIZATION: Must contain "token" AND ("limit" OR "exceeded") + #[test] + fn test_token_limit_detected() { + let error = anyhow::anyhow!("token limit exceeded"); + let error_type = classify_error(&error); + + assert!( + matches!(error_type, ErrorType::Recoverable(RecoverableError::TokenLimit)), + "Token limit should be detected: {:?}", error_type + ); + } +} + +// ============================================================================= +// Test: Error classification for non-recoverable errors +// ============================================================================= + +mod non_recoverable_error_classification { + use super::*; + + /// Test invalid API key is non-recoverable + #[test] + fn test_invalid_api_key_non_recoverable() { + let error = anyhow::anyhow!("Invalid API key"); + let error_type = classify_error(&error); + + assert!( + matches!(error_type, ErrorType::NonRecoverable), + "Invalid API key should be non-recoverable: {:?}", error_type + ); + } + + /// Test authentication failure is non-recoverable + #[test] + fn test_auth_failure_non_recoverable() { + let error = anyhow::anyhow!("Authentication failed"); + let error_type = classify_error(&error); + + assert!( + matches!(error_type, ErrorType::NonRecoverable), + "Auth failure should be non-recoverable: {:?}", error_type + ); + } + + /// Test generic errors are non-recoverable + #[test] + fn test_generic_error_non_recoverable() { + let error = anyhow::anyhow!("Something went wrong"); + let error_type = classify_error(&error); + + assert!( + matches!(error_type, ErrorType::NonRecoverable), + "Generic error should be non-recoverable: {:?}", error_type + ); + } + + /// Test 401 Unauthorized is non-recoverable + #[test] + fn test_401_non_recoverable() { + let error = anyhow::anyhow!("401 Unauthorized"); + let error_type = classify_error(&error); + + assert!( + matches!(error_type, ErrorType::NonRecoverable), + "401 should be non-recoverable: {:?}", error_type + ); + } + + /// Test 403 Forbidden is non-recoverable + #[test] + fn test_403_non_recoverable() { + let error = anyhow::anyhow!("403 Forbidden"); + let error_type = classify_error(&error); + + assert!( + matches!(error_type, ErrorType::NonRecoverable), + "403 should be non-recoverable: {:?}", error_type + ); + } +} + +// ============================================================================= +// Test: Retry delay calculation +// ============================================================================= + +mod retry_delay_calculation { + use super::*; + use std::time::Duration; + + /// Test first retry has reasonable delay + #[test] + fn test_first_retry_delay() { + let delay = calculate_retry_delay(1, false); + + // First retry should be around 1-2 seconds (with jitter) + assert!(delay >= Duration::from_millis(500), "Delay should be at least 500ms: {:?}", delay); + assert!(delay <= Duration::from_secs(5), "Delay should be at most 5s: {:?}", delay); + } + + /// Test delays increase with retry count + #[test] + fn test_delays_increase() { + let delay1 = calculate_retry_delay(1, false); + let delay2 = calculate_retry_delay(2, false); + let delay3 = calculate_retry_delay(3, false); + + // Later retries should generally have longer delays + // (accounting for jitter, we check the trend) + assert!(delay2 >= delay1 || delay3 >= delay2, + "Delays should generally increase: {:?} -> {:?} -> {:?}", delay1, delay2, delay3); + } + + /// Test autonomous mode has different delays + #[test] + fn test_autonomous_mode_delays() { + let default_delay = calculate_retry_delay(3, false); + let autonomous_delay = calculate_retry_delay(3, true); + + // Autonomous mode should have longer delays (spread over 10 minutes) + // But with jitter, we just check they're both reasonable + assert!(default_delay <= Duration::from_secs(30), + "Default delay should be reasonable: {:?}", default_delay); + assert!(autonomous_delay <= Duration::from_secs(180), + "Autonomous delay should be reasonable: {:?}", autonomous_delay); + } + + /// Test delays are capped at maximum + #[test] + fn test_delay_cap() { + // Even with high retry count, delay should be capped + let delay = calculate_retry_delay(10, false); + + assert!(delay <= Duration::from_secs(15), + "Default mode delay should be capped: {:?}", delay); + } + + /// Test autonomous mode delay cap + /// CHARACTERIZATION: Autonomous mode uses longer delays spread over 10 minutes + #[test] + fn test_autonomous_delay_cap() { + let delay = calculate_retry_delay(10, true); + + // Autonomous mode has longer delays (up to ~200s + jitter) + assert!(delay <= Duration::from_secs(300), + "Autonomous delay should be capped: {:?}", delay); + } +} + +// ============================================================================= +// Test: Edge cases and priority +// ============================================================================= + +mod edge_cases { + use super::*; + + /// Test error with multiple keywords uses correct priority + #[test] + fn test_rate_limit_priority_over_timeout() { + // Rate limit should take priority + let error = anyhow::anyhow!("Rate limit exceeded after timeout"); + let error_type = classify_error(&error); + + assert!( + matches!(error_type, ErrorType::Recoverable(RecoverableError::RateLimit)), + "Rate limit should take priority: {:?}", error_type + ); + } + + /// Test case insensitivity + #[test] + fn test_case_insensitive_detection() { + let error = anyhow::anyhow!("RATE LIMIT EXCEEDED"); + let error_type = classify_error(&error); + + assert!( + matches!(error_type, ErrorType::Recoverable(RecoverableError::RateLimit)), + "Should detect uppercase: {:?}", error_type + ); + } + + /// Test empty error message + #[test] + fn test_empty_error_message() { + let error = anyhow::anyhow!(""); + let error_type = classify_error(&error); + + // Empty message should be non-recoverable + assert!( + matches!(error_type, ErrorType::NonRecoverable), + "Empty error should be non-recoverable: {:?}", error_type + ); + } + + /// Test connection timeout is network error (not timeout) + /// Note: This documents the current behavior where "connection" keyword + /// takes priority over "timeout" + #[test] + fn test_connection_timeout_classification() { + let error = anyhow::anyhow!("Connection timeout"); + let error_type = classify_error(&error); + + // Per memory: "Connection timeout" classifies as NetworkError due to "connection" keyword priority + assert!( + matches!(error_type, ErrorType::Recoverable(RecoverableError::NetworkError)), + "Connection timeout should be network error (per priority): {:?}", error_type + ); + } +} diff --git a/crates/g3-core/tests/retry_behavior_test.rs b/crates/g3-core/tests/retry_behavior_test.rs new file mode 100644 index 0000000..f0b1d6f --- /dev/null +++ b/crates/g3-core/tests/retry_behavior_test.rs @@ -0,0 +1,300 @@ +//! Retry Behavior Integration Tests +//! +//! CHARACTERIZATION: These tests verify the observable behavior of retry +//! infrastructure through stable public interfaces. +//! +//! What these tests protect: +//! - RetryConfig construction and presets +//! - RetryResult state transitions +//! - retry_operation behavior with simulated errors +//! +//! What these tests intentionally do NOT assert: +//! - Internal timing details (only that delays occur) +//! - Specific backoff calculations (only that they increase) +//! - Agent internals (tested via execute_with_retry separately) + +use g3_core::retry::{RetryConfig, RetryResult, retry_operation}; +use g3_core::ContextWindow; +use g3_core::TaskResult; +use std::sync::atomic::{AtomicU32, Ordering}; +use std::sync::Arc; + +// ============================================================================= +// Test: RetryConfig presets and customization +// ============================================================================= + +mod retry_config_presets { + use super::*; + + /// Test default config values + #[test] + fn test_default_config() { + let config = RetryConfig::default(); + + assert_eq!(config.max_retries, 3); + assert!(!config.is_autonomous); + assert_eq!(config.role_name, "agent"); + } + + /// Test player preset + #[test] + fn test_player_preset() { + let config = RetryConfig::player(); + + assert_eq!(config.max_retries, 3); + assert!(config.is_autonomous, "Player should be autonomous"); + assert_eq!(config.role_name, "player"); + } + + /// Test coach preset + #[test] + fn test_coach_preset() { + let config = RetryConfig::coach(); + + assert_eq!(config.max_retries, 3); + assert!(config.is_autonomous, "Coach should be autonomous"); + assert_eq!(config.role_name, "coach"); + } + + /// Test planning preset with custom role + #[test] + fn test_planning_preset() { + let config = RetryConfig::planning("reviewer"); + + assert_eq!(config.max_retries, 3); + assert!(config.is_autonomous, "Planning should be autonomous"); + assert_eq!(config.role_name, "reviewer"); + } + + /// Test custom max retries + #[test] + fn test_custom_max_retries() { + let config = RetryConfig::player().with_max_retries(10); + + assert_eq!(config.max_retries, 10); + // Other fields should be preserved + assert!(config.is_autonomous); + assert_eq!(config.role_name, "player"); + } + + /// Test chaining customizations + #[test] + fn test_chained_customization() { + let config = RetryConfig::default() + .with_max_retries(5); + + assert_eq!(config.max_retries, 5); + assert!(!config.is_autonomous); // Default is not autonomous + } +} + +// ============================================================================= +// Test: RetryResult state handling +// ============================================================================= + +mod retry_result_states { + use super::*; + + /// Test success result + #[test] + fn test_success_is_success() { + let ctx = ContextWindow::new(1000); + let result = RetryResult::Success(TaskResult::new("done".to_string(), ctx)); + + assert!(result.is_success()); + } + + /// Test max retries reached is not success + #[test] + fn test_max_retries_not_success() { + let result = RetryResult::MaxRetriesReached("timeout".to_string()); + + assert!(!result.is_success()); + } + + /// Test context length exceeded is not success + #[test] + fn test_context_exceeded_not_success() { + let result = RetryResult::ContextLengthExceeded("too long".to_string()); + + assert!(!result.is_success()); + } + + /// Test panic is not success + #[test] + fn test_panic_not_success() { + let result = RetryResult::Panic(anyhow::anyhow!("panic occurred")); + + assert!(!result.is_success()); + } + + /// Test into_result extracts TaskResult on success + #[test] + fn test_into_result_success() { + let ctx = ContextWindow::new(1000); + let result = RetryResult::Success(TaskResult::new("done".to_string(), ctx)); + + let task_result = result.into_result(); + assert!(task_result.is_some()); + assert_eq!(task_result.unwrap().response, "done"); + } + + /// Test into_result returns None on failure + #[test] + fn test_into_result_failure() { + let result = RetryResult::MaxRetriesReached("error".to_string()); + + let task_result = result.into_result(); + assert!(task_result.is_none()); + } +} + +// ============================================================================= +// Test: retry_operation behavior +// ============================================================================= + +mod retry_operation_behavior { + use super::*; + + /// Test successful operation on first try + #[tokio::test] + async fn test_success_first_try() { + let call_count = Arc::new(AtomicU32::new(0)); + let call_count_clone = call_count.clone(); + + let result = retry_operation( + "test_op", + || { + let count = call_count_clone.clone(); + async move { + count.fetch_add(1, Ordering::SeqCst); + Ok::<_, anyhow::Error>("success") + } + }, + 3, + false, + |_msg| {}, + ).await; + + assert!(result.is_ok()); + assert_eq!(result.unwrap(), "success"); + assert_eq!(call_count.load(Ordering::SeqCst), 1, "Should only call once on success"); + } + + /// Test non-recoverable error fails immediately + #[tokio::test] + async fn test_non_recoverable_fails_immediately() { + let call_count = Arc::new(AtomicU32::new(0)); + let call_count_clone = call_count.clone(); + + let result = retry_operation( + "test_op", + || { + let count = call_count_clone.clone(); + async move { + count.fetch_add(1, Ordering::SeqCst); + Err::(anyhow::anyhow!("Invalid API key")) + } + }, + 3, + false, + |_msg| {}, + ).await; + + assert!(result.is_err()); + assert_eq!(call_count.load(Ordering::SeqCst), 1, "Non-recoverable should not retry"); + } + + /// Test recoverable error retries up to max + #[tokio::test] + async fn test_recoverable_retries_to_max() { + let call_count = Arc::new(AtomicU32::new(0)); + let call_count_clone = call_count.clone(); + + let result = retry_operation( + "test_op", + || { + let count = call_count_clone.clone(); + async move { + count.fetch_add(1, Ordering::SeqCst); + // Rate limit is a recoverable error + Err::(anyhow::anyhow!("Rate limit exceeded")) + } + }, + 3, // max retries + false, + |_msg| {}, + ).await; + + assert!(result.is_err()); + // Should try initial + max_retries times + assert_eq!(call_count.load(Ordering::SeqCst), 3, "Should retry up to max"); + } + + /// Test recoverable error succeeds on retry + #[tokio::test] + async fn test_recoverable_succeeds_on_retry() { + let call_count = Arc::new(AtomicU32::new(0)); + let call_count_clone = call_count.clone(); + + let result = retry_operation( + "test_op", + || { + let count = call_count_clone.clone(); + async move { + let current = count.fetch_add(1, Ordering::SeqCst); + if current < 2 { + // Fail first two times with recoverable error + Err(anyhow::anyhow!("Server error 500")) + } else { + // Succeed on third try + Ok("success after retry") + } + } + }, + 5, // max retries + false, + |_msg| {}, + ).await; + + assert!(result.is_ok()); + assert_eq!(result.unwrap(), "success after retry"); + assert_eq!(call_count.load(Ordering::SeqCst), 3, "Should succeed on third try"); + } + + /// Test print function is called on retry + #[tokio::test] + async fn test_print_fn_called_on_retry() { + let messages = Arc::new(std::sync::Mutex::new(Vec::new())); + let messages_clone = messages.clone(); + + let call_count = Arc::new(AtomicU32::new(0)); + let call_count_clone = call_count.clone(); + + let _ = retry_operation( + "test_op", + || { + let count = call_count_clone.clone(); + async move { + let current = count.fetch_add(1, Ordering::SeqCst); + if current < 1 { + Err(anyhow::anyhow!("Rate limit exceeded")) + } else { + Ok("success") + } + } + }, + 3, + false, + |msg| { + messages_clone.lock().unwrap().push(msg.to_string()); + }, + ).await; + + let msgs = messages.lock().unwrap(); + assert!(!msgs.is_empty(), "Should have printed retry messages"); + // Should mention the error type + assert!(msgs.iter().any(|m| m.contains("RateLimit") || m.contains("rate")), + "Should mention rate limit in messages: {:?}", msgs); + } +} diff --git a/crates/g3-core/tests/tool_execution_roundtrip_test.rs b/crates/g3-core/tests/tool_execution_roundtrip_test.rs new file mode 100644 index 0000000..c4cf30a --- /dev/null +++ b/crates/g3-core/tests/tool_execution_roundtrip_test.rs @@ -0,0 +1,466 @@ +//! Tool Execution Round-Trip Integration Tests +//! +//! CHARACTERIZATION: These tests verify that tools execute correctly through +//! the Agent interface, testing the full round-trip from tool call to result. +//! +//! What these tests protect: +//! - File operations (read, write, str_replace) work end-to-end +//! - Shell command execution produces expected output +//! - TODO operations persist correctly +//! - Error handling for invalid inputs +//! +//! What these tests intentionally do NOT assert: +//! - Internal implementation details of tools +//! - Specific formatting of success messages (only key content) +//! - UI writer behavior (uses NullUiWriter) + +use g3_core::ui_writer::NullUiWriter; +use g3_core::{Agent, ToolCall}; +use serial_test::serial; +use std::fs; +use tempfile::TempDir; + +// ============================================================================= +// Test Helpers +// ============================================================================= + +/// Create a test agent in a temporary directory +async fn create_test_agent(temp_dir: &TempDir) -> Agent { + std::env::set_current_dir(temp_dir.path()).unwrap(); + let config = g3_config::Config::default(); + let ui_writer = NullUiWriter; + Agent::new(config, ui_writer).await.unwrap() +} + +/// Create a ToolCall with the given tool name and arguments +fn make_tool_call(tool: &str, args: serde_json::Value) -> ToolCall { + ToolCall { + tool: tool.to_string(), + args, + } +} + +// ============================================================================= +// Test: read_file tool execution +// ============================================================================= + +mod read_file_execution { + use super::*; + + /// Test reading an existing file + #[tokio::test] + #[serial] + async fn test_read_existing_file() { + let temp_dir = TempDir::new().unwrap(); + let test_file = temp_dir.path().join("test.txt"); + fs::write(&test_file, "Hello, World!\nLine 2\nLine 3").unwrap(); + + let mut agent = create_test_agent(&temp_dir).await; + + let tool_call = make_tool_call( + "read_file", + serde_json::json!({ "file_path": test_file.to_string_lossy() }), + ); + + let result = agent.execute_tool(&tool_call).await.unwrap(); + + assert!(result.contains("Hello, World!"), "Should contain file content: {}", result); + assert!(result.contains("Line 2"), "Should contain all lines: {}", result); + } + + /// Test reading a non-existent file returns error + #[tokio::test] + #[serial] + async fn test_read_nonexistent_file() { + let temp_dir = TempDir::new().unwrap(); + let mut agent = create_test_agent(&temp_dir).await; + + let tool_call = make_tool_call( + "read_file", + serde_json::json!({ "file_path": "/nonexistent/path/file.txt" }), + ); + + let result = agent.execute_tool(&tool_call).await; + + // Should return an error or error message + assert!( + result.is_err() || result.as_ref().unwrap().contains("error") || result.as_ref().unwrap().contains("not found") || result.as_ref().unwrap().contains("No such file"), + "Should indicate file not found: {:?}", result + ); + } + + /// Test reading with character range + #[tokio::test] + #[serial] + async fn test_read_file_with_range() { + let temp_dir = TempDir::new().unwrap(); + let test_file = temp_dir.path().join("test.txt"); + fs::write(&test_file, "0123456789ABCDEF").unwrap(); + + let mut agent = create_test_agent(&temp_dir).await; + + let tool_call = make_tool_call( + "read_file", + serde_json::json!({ + "file_path": test_file.to_string_lossy(), + "start": 5, + "end": 10 + }), + ); + + let result = agent.execute_tool(&tool_call).await.unwrap(); + + // Should contain the substring from position 5 to 10 + assert!(result.contains("56789"), "Should contain range content: {}", result); + } +} + +// ============================================================================= +// Test: write_file tool execution +// ============================================================================= + +mod write_file_execution { + use super::*; + + /// Test writing a new file + #[tokio::test] + #[serial] + async fn test_write_new_file() { + let temp_dir = TempDir::new().unwrap(); + let new_file = temp_dir.path().join("new_file.txt"); + + assert!(!new_file.exists(), "File should not exist initially"); + + let mut agent = create_test_agent(&temp_dir).await; + + let tool_call = make_tool_call( + "write_file", + serde_json::json!({ + "file_path": new_file.to_string_lossy(), + "content": "New content here" + }), + ); + + let result = agent.execute_tool(&tool_call).await.unwrap(); + + // Should report success + assert!(result.contains("✅") || result.to_lowercase().contains("success") || result.to_lowercase().contains("wrote"), + "Should report success: {}", result); + + // File should now exist with correct content + assert!(new_file.exists(), "File should exist after write"); + let content = fs::read_to_string(&new_file).unwrap(); + assert_eq!(content, "New content here"); + } + + /// Test overwriting an existing file + #[tokio::test] + #[serial] + async fn test_overwrite_existing_file() { + let temp_dir = TempDir::new().unwrap(); + let test_file = temp_dir.path().join("existing.txt"); + fs::write(&test_file, "Original content").unwrap(); + + let mut agent = create_test_agent(&temp_dir).await; + + let tool_call = make_tool_call( + "write_file", + serde_json::json!({ + "file_path": test_file.to_string_lossy(), + "content": "Replaced content" + }), + ); + + let result = agent.execute_tool(&tool_call).await.unwrap(); + + assert!(result.contains("✅") || result.to_lowercase().contains("success") || result.to_lowercase().contains("wrote"), + "Should report success: {}", result); + + let content = fs::read_to_string(&test_file).unwrap(); + assert_eq!(content, "Replaced content"); + } + + /// Test writing creates parent directories + #[tokio::test] + #[serial] + async fn test_write_creates_parent_dirs() { + let temp_dir = TempDir::new().unwrap(); + let nested_file = temp_dir.path().join("a/b/c/nested.txt"); + + let mut agent = create_test_agent(&temp_dir).await; + + let tool_call = make_tool_call( + "write_file", + serde_json::json!({ + "file_path": nested_file.to_string_lossy(), + "content": "Nested content" + }), + ); + + let result = agent.execute_tool(&tool_call).await.unwrap(); + + assert!(result.contains("✅") || result.to_lowercase().contains("success") || result.to_lowercase().contains("wrote"), + "Should report success: {}", result); + + assert!(nested_file.exists(), "Nested file should exist"); + let content = fs::read_to_string(&nested_file).unwrap(); + assert_eq!(content, "Nested content"); + } +} + +// ============================================================================= +// Test: shell tool execution +// ============================================================================= + +mod shell_execution { + use super::*; + + /// Test simple echo command + #[tokio::test] + #[serial] + async fn test_shell_echo() { + let temp_dir = TempDir::new().unwrap(); + let mut agent = create_test_agent(&temp_dir).await; + + let tool_call = make_tool_call( + "shell", + serde_json::json!({ "command": "echo 'hello world'" }), + ); + + let result = agent.execute_tool(&tool_call).await.unwrap(); + + assert!(result.contains("hello world"), "Should contain echo output: {}", result); + } + + /// Test command that produces multi-line output + #[tokio::test] + #[serial] + async fn test_shell_multiline_output() { + let temp_dir = TempDir::new().unwrap(); + let mut agent = create_test_agent(&temp_dir).await; + + let tool_call = make_tool_call( + "shell", + serde_json::json!({ "command": "echo 'line1'; echo 'line2'; echo 'line3'" }), + ); + + let result = agent.execute_tool(&tool_call).await.unwrap(); + + assert!(result.contains("line1"), "Should contain line1: {}", result); + assert!(result.contains("line2"), "Should contain line2: {}", result); + assert!(result.contains("line3"), "Should contain line3: {}", result); + } + + /// Test command that fails + #[tokio::test] + #[serial] + async fn test_shell_failing_command() { + let temp_dir = TempDir::new().unwrap(); + let mut agent = create_test_agent(&temp_dir).await; + + let tool_call = make_tool_call( + "shell", + serde_json::json!({ "command": "exit 1" }), + ); + + let result = agent.execute_tool(&tool_call).await; + + // Should indicate failure (either error or non-zero exit) + assert!( + result.is_err() || result.as_ref().unwrap().contains("exit") || result.as_ref().unwrap().contains("failed") || result.as_ref().unwrap().contains("error"), + "Should indicate command failure: {:?}", result + ); + } + + /// Test command with working directory context + #[tokio::test] + #[serial] + async fn test_shell_pwd() { + let temp_dir = TempDir::new().unwrap(); + let mut agent = create_test_agent(&temp_dir).await; + + let tool_call = make_tool_call( + "shell", + serde_json::json!({ "command": "pwd" }), + ); + + let result = agent.execute_tool(&tool_call).await.unwrap(); + + // Should show the temp directory path + let temp_path = temp_dir.path().to_string_lossy(); + assert!(result.contains(&*temp_path) || result.contains("private"), + "Should show current directory: {} (expected to contain {})", result, temp_path); + } +} + +// ============================================================================= +// Test: str_replace tool execution +// ============================================================================= + +mod str_replace_execution { + use super::*; + + /// Test applying a simple diff + #[tokio::test] + #[serial] + async fn test_str_replace_simple() { + let temp_dir = TempDir::new().unwrap(); + let test_file = temp_dir.path().join("test.txt"); + fs::write(&test_file, "line 1\nold line\nline 3\n").unwrap(); + + let mut agent = create_test_agent(&temp_dir).await; + + let diff = "@@ -1,3 +1,3 @@\n line 1\n-old line\n+new line\n line 3\n"; + + let tool_call = make_tool_call( + "str_replace", + serde_json::json!({ + "file_path": test_file.to_string_lossy(), + "diff": diff + }), + ); + + let result = agent.execute_tool(&tool_call).await.unwrap(); + + assert!(result.contains("✅") || result.to_lowercase().contains("applied") || result.to_lowercase().contains("success"), + "Should report success: {}", result); + + let content = fs::read_to_string(&test_file).unwrap(); + assert!(content.contains("new line"), "Should contain new content: {}", content); + assert!(!content.contains("old line"), "Should not contain old content: {}", content); + } + + /// Test diff that adds lines + #[tokio::test] + #[serial] + async fn test_str_replace_add_lines() { + let temp_dir = TempDir::new().unwrap(); + let test_file = temp_dir.path().join("test.txt"); + fs::write(&test_file, "line 1\nline 3\n").unwrap(); + + let mut agent = create_test_agent(&temp_dir).await; + + let diff = "@@ -1,2 +1,3 @@\n line 1\n+line 2\n line 3\n"; + + let tool_call = make_tool_call( + "str_replace", + serde_json::json!({ + "file_path": test_file.to_string_lossy(), + "diff": diff + }), + ); + + let result = agent.execute_tool(&tool_call).await.unwrap(); + + assert!(result.contains("✅") || result.to_lowercase().contains("applied"), + "Should report success: {}", result); + + let content = fs::read_to_string(&test_file).unwrap(); + assert!(content.contains("line 2"), "Should contain added line: {}", content); + } + + /// Test diff with pattern not found + #[tokio::test] + #[serial] + async fn test_str_replace_pattern_not_found() { + let temp_dir = TempDir::new().unwrap(); + let test_file = temp_dir.path().join("test.txt"); + fs::write(&test_file, "actual content\n").unwrap(); + + let mut agent = create_test_agent(&temp_dir).await; + + let diff = "@@ -1,1 +1,1 @@\n-nonexistent pattern\n+replacement\n"; + + let tool_call = make_tool_call( + "str_replace", + serde_json::json!({ + "file_path": test_file.to_string_lossy(), + "diff": diff + }), + ); + + let result = agent.execute_tool(&tool_call).await; + + // Should indicate pattern not found + assert!( + result.is_err() || result.as_ref().unwrap().to_lowercase().contains("not found") || result.as_ref().unwrap().to_lowercase().contains("pattern") || result.as_ref().unwrap().to_lowercase().contains("error"), + "Should indicate pattern not found: {:?}", result + ); + } +} + +// ============================================================================= +// Test: TODO tool execution +// ============================================================================= + +mod todo_execution { + use super::*; + + /// Test writing and reading TODO + #[tokio::test] + #[serial] + async fn test_todo_write_and_read() { + let temp_dir = TempDir::new().unwrap(); + let mut agent = create_test_agent(&temp_dir).await; + + // Write TODO + let write_call = make_tool_call( + "todo_write", + serde_json::json!({ + "content": "- [ ] Task 1\n- [x] Task 2\n- [ ] Task 3" + }), + ); + + let write_result = agent.execute_tool(&write_call).await.unwrap(); + assert!(write_result.contains("✅") || write_result.to_lowercase().contains("success"), + "Write should succeed: {}", write_result); + + // Read TODO + let read_call = make_tool_call("todo_read", serde_json::json!({})); + let read_result = agent.execute_tool(&read_call).await.unwrap(); + + assert!(read_result.contains("Task 1"), "Should contain Task 1: {}", read_result); + assert!(read_result.contains("Task 2"), "Should contain Task 2: {}", read_result); + assert!(read_result.contains("Task 3"), "Should contain Task 3: {}", read_result); + } + + /// Test reading empty TODO + #[tokio::test] + #[serial] + async fn test_todo_read_empty() { + let temp_dir = TempDir::new().unwrap(); + let mut agent = create_test_agent(&temp_dir).await; + + let read_call = make_tool_call("todo_read", serde_json::json!({})); + let result = agent.execute_tool(&read_call).await.unwrap(); + + assert!(result.to_lowercase().contains("empty") || result.contains("no todo"), + "Should indicate empty: {}", result); + } + + /// Test TODO persists to file + #[tokio::test] + #[serial] + async fn test_todo_persists_to_file() { + let temp_dir = TempDir::new().unwrap(); + let todo_path = temp_dir.path().join("todo.g3.md"); + + { + let mut agent = create_test_agent(&temp_dir).await; + + let write_call = make_tool_call( + "todo_write", + serde_json::json!({ + "content": "- [ ] Persistent task" + }), + ); + + agent.execute_tool(&write_call).await.unwrap(); + } + + // File should exist after agent is dropped + assert!(todo_path.exists(), "TODO file should persist"); + let content = fs::read_to_string(&todo_path).unwrap(); + assert!(content.contains("Persistent task"), "Content should persist: {}", content); + } +}