Warnings fixed:
- Remove unused 'warn' import from retry.rs
- Prefix unused 'output' param with underscore
- Prefix unused 'rel_start' with underscore
- Add #[allow(dead_code)] to G3Status::info()

Message format tweaked per feedback:
- 'g3: model overloaded [error]' (no attempt info)
- 'g3: retrying in 2.2s (1/3) ... [done]' (attempt info moved here)
- Handle empty error message in Status::Error to show just '[error]'
353 lines
12 KiB
Rust
353 lines
12 KiB
Rust
//! Retry infrastructure for agent task execution
|
|
//!
|
|
//! This module provides reusable retry logic for executing agent tasks,
|
|
//! including error classification, exponential backoff, and configurable retry strategies.
|
|
//!
|
|
//! Used by both autonomous mode (g3-cli) and planning mode (g3-planner).
|
|
|
|
use crate::error_handling::{calculate_retry_delay, classify_error, ErrorType, RecoverableError};
|
|
use crate::ui_writer::UiWriter;
|
|
use crate::{Agent, DiscoveryOptions, TaskResult};
|
|
use anyhow::Result;
|
|
use std::time::Instant;
|
|
use tracing::debug;
|
|
|
|
/// Configuration for retry behavior
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RetryConfig {
    /// Retry budget for recoverable errors.
    ///
    /// `execute_with_retry` increments its failure counter before comparing it
    /// with this value, so in practice this is the maximum number of attempts
    /// in total (a value of 0 still allows the first attempt).
    pub max_retries: u32,
    /// Whether this is autonomous mode (forwarded to the backoff delay calculation)
    pub is_autonomous: bool,
    /// Role name used in status and log messages (e.g., "player", "coach")
    pub role_name: String,
}

impl Default for RetryConfig {
    /// Non-autonomous configuration with the default retry budget and the role "agent".
    fn default() -> Self {
        Self {
            max_retries: Self::DEFAULT_MAX_RETRIES,
            is_autonomous: false,
            role_name: "agent".to_string(),
        }
    }
}

impl RetryConfig {
    /// Retry budget shared by every preset in this impl.
    const DEFAULT_MAX_RETRIES: u32 = 3;

    /// Shared constructor for the autonomous presets; only the role name differs.
    fn autonomous(role_name: &str) -> Self {
        Self {
            max_retries: Self::DEFAULT_MAX_RETRIES,
            is_autonomous: true,
            role_name: role_name.to_string(),
        }
    }

    /// Create a retry config for player agent
    pub fn player() -> Self {
        Self::autonomous("player")
    }

    /// Create a retry config for coach agent
    pub fn coach() -> Self {
        Self::autonomous("coach")
    }

    /// Create a retry config for planning mode, labelled with the given role
    pub fn planning(role: &str) -> Self {
        Self::autonomous(role)
    }

    /// Set custom max retries
    ///
    /// Consumes and returns the config, so the result must be used; dropping it
    /// discards the change.
    #[must_use]
    pub fn with_max_retries(mut self, max_retries: u32) -> Self {
        self.max_retries = max_retries;
        self
    }
}
|
|
|
|
/// Result of a retry operation
|
|
#[derive(Debug)]
|
|
pub enum RetryResult {
|
|
/// Task succeeded with result
|
|
Success(TaskResult),
|
|
/// Task failed after max retries (contains last error message)
|
|
MaxRetriesReached(String),
|
|
/// Context length exceeded - should end current turn
|
|
ContextLengthExceeded(String),
|
|
/// Panic detected - should terminate
|
|
Panic(anyhow::Error),
|
|
}
|
|
|
|
impl RetryResult {
|
|
/// Check if the result is a success
|
|
pub fn is_success(&self) -> bool {
|
|
matches!(self, RetryResult::Success(_))
|
|
}
|
|
|
|
/// Get the task result if successful
|
|
pub fn into_result(self) -> Option<TaskResult> {
|
|
match self {
|
|
RetryResult::Success(result) => Some(result),
|
|
_ => None,
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Callback for handling context length exceeded errors
|
|
pub type ContextExceededCallback<W> = Box<dyn FnOnce(&Agent<W>, &anyhow::Error, u32) + Send>;
|
|
|
|
/// Execute an agent task with retry logic
|
|
///
|
|
/// This function handles:
|
|
/// - Error classification (timeout, rate limit, server error, etc.)
|
|
/// - Exponential backoff between retries
|
|
/// - Context length exceeded errors (ends turn gracefully)
|
|
/// - Panic detection (terminates execution)
|
|
///
|
|
/// # Arguments
|
|
/// * `agent` - The agent to execute the task
|
|
/// * `prompt` - The task prompt
|
|
/// * `config` - Retry configuration
|
|
/// * `show_prompt` - Whether to show the prompt
|
|
/// * `show_code` - Whether to show code in output
|
|
/// * `discovery` - Optional discovery options
|
|
/// * `print_fn` - Function to print status messages
|
|
///
|
|
/// # Returns
|
|
/// A `RetryResult` indicating success, failure, or special conditions
|
|
pub async fn execute_with_retry<W, F>(
|
|
agent: &mut Agent<W>,
|
|
prompt: &str,
|
|
config: &RetryConfig,
|
|
show_prompt: bool,
|
|
show_code: bool,
|
|
discovery: Option<DiscoveryOptions<'_>>,
|
|
mut print_fn: F,
|
|
) -> RetryResult
|
|
where
|
|
W: UiWriter + Clone + Send + Sync + 'static,
|
|
F: FnMut(&str),
|
|
{
|
|
let mut retry_count = 0;
|
|
let start_time = Instant::now();
|
|
|
|
loop {
|
|
let result = agent
|
|
.execute_task_with_timing(prompt, None, false, show_prompt, show_code, true, discovery.clone())
|
|
.await;
|
|
|
|
match result {
|
|
Ok(task_result) => {
|
|
if retry_count > 0 {
|
|
debug!(
|
|
"{} task succeeded after {} retries (elapsed: {:?})",
|
|
config.role_name,
|
|
retry_count,
|
|
start_time.elapsed()
|
|
);
|
|
}
|
|
return RetryResult::Success(task_result);
|
|
}
|
|
Err(e) => {
|
|
let error_type = classify_error(&e);
|
|
|
|
// Check for context length exceeded
|
|
if matches!(
|
|
error_type,
|
|
ErrorType::Recoverable(RecoverableError::ContextLengthExceeded)
|
|
) {
|
|
let msg = format!(
|
|
"⚠️ Context length exceeded in {} turn: {}",
|
|
config.role_name, e
|
|
);
|
|
print_fn(&msg);
|
|
print_fn("📝 Logging error to session and ending current turn...");
|
|
|
|
// Log to session with forensic context
|
|
let forensic_context = format!(
|
|
"Role: {}\nContext tokens: {}\nTotal available: {}\nPercentage used: {:.1}%\nPrompt length: {} chars\nError occurred at: {}",
|
|
config.role_name,
|
|
agent.get_context_window().used_tokens,
|
|
agent.get_context_window().total_tokens,
|
|
agent.get_context_window().percentage_used(),
|
|
prompt.len(),
|
|
chrono::Utc::now().to_rfc3339()
|
|
);
|
|
agent.log_error_to_session(&e, "assistant", Some(forensic_context));
|
|
|
|
return RetryResult::ContextLengthExceeded(e.to_string());
|
|
}
|
|
|
|
// Check for panic
|
|
if e.to_string().contains("panic") {
|
|
print_fn(&format!("💥 {} panic detected: {}", config.role_name, e));
|
|
return RetryResult::Panic(e);
|
|
}
|
|
|
|
// Check if error is recoverable
|
|
match error_type {
|
|
ErrorType::Recoverable(ref recoverable_type) => {
|
|
retry_count += 1;
|
|
|
|
if retry_count >= config.max_retries {
|
|
let msg = format!(
|
|
"g3: {} max retries reached [failed]",
|
|
config.role_name
|
|
);
|
|
print_fn(&msg);
|
|
return RetryResult::MaxRetriesReached(e.to_string());
|
|
}
|
|
|
|
// Calculate backoff delay
|
|
let delay = calculate_retry_delay(retry_count, config.is_autonomous);
|
|
let delay_secs = delay.as_secs_f64();
|
|
|
|
// Clean error message
|
|
let msg = format!("g3: {:?} [error]", recoverable_type);
|
|
print_fn(&msg);
|
|
|
|
// Retry message - note: can't show [done] here since we don't control when sleep finishes
|
|
let retry_msg = format!("g3: retrying in {:.1}s ({}/{}) ...", delay_secs, retry_count, config.max_retries);
|
|
print_fn(&retry_msg);
|
|
|
|
debug!(
|
|
"Recoverable error ({:?}) in {} (attempt {}/{}). Retrying in {:?}...",
|
|
recoverable_type, config.role_name, retry_count, config.max_retries, delay
|
|
);
|
|
|
|
tokio::time::sleep(delay).await;
|
|
}
|
|
ErrorType::NonRecoverable => {
|
|
let msg = format!(
|
|
"g3: {} error [failed]",
|
|
config.role_name
|
|
);
|
|
print_fn(&msg);
|
|
return RetryResult::MaxRetriesReached(e.to_string());
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Execute a simple async operation with retry (for non-agent tasks)
|
|
///
|
|
/// This is a simpler retry wrapper for operations like LLM API calls
|
|
/// that don't involve the full agent machinery.
|
|
pub async fn retry_operation<F, Fut, T, P>(
|
|
operation_name: &str,
|
|
mut operation: F,
|
|
max_retries: u32,
|
|
is_autonomous: bool,
|
|
mut print_fn: P,
|
|
) -> Result<T>
|
|
where
|
|
F: FnMut() -> Fut,
|
|
Fut: std::future::Future<Output = Result<T>>,
|
|
P: FnMut(&str),
|
|
{
|
|
let mut retry_count = 0;
|
|
|
|
loop {
|
|
match operation().await {
|
|
Ok(result) => {
|
|
if retry_count > 0 {
|
|
debug!(
|
|
"Operation '{}' succeeded after {} retries",
|
|
operation_name, retry_count
|
|
);
|
|
}
|
|
return Ok(result);
|
|
}
|
|
Err(e) => {
|
|
let error_type = classify_error(&e);
|
|
|
|
match error_type {
|
|
ErrorType::Recoverable(ref recoverable_type) => {
|
|
retry_count += 1;
|
|
|
|
if retry_count >= max_retries {
|
|
let msg = format!(
|
|
"g3: {} max retries reached [failed]",
|
|
operation_name
|
|
);
|
|
print_fn(&msg);
|
|
return Err(e);
|
|
}
|
|
|
|
let delay = calculate_retry_delay(retry_count, is_autonomous);
|
|
let delay_secs = delay.as_secs_f64();
|
|
|
|
// Clean error message
|
|
let msg = format!("g3: {:?} [error]", recoverable_type);
|
|
print_fn(&msg);
|
|
|
|
let retry_msg = format!("g3: retrying in {:.1}s ({}/{}) ...", delay_secs, retry_count, max_retries);
|
|
print_fn(&retry_msg);
|
|
|
|
tokio::time::sleep(delay).await;
|
|
}
|
|
ErrorType::NonRecoverable => {
|
|
let msg = format!(
|
|
"g3: {} error [failed]",
|
|
operation_name
|
|
);
|
|
print_fn(&msg);
|
|
return Err(e);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    //! Unit tests for the retry configuration presets and `RetryResult` helpers.

    use super::*;
    use crate::ContextWindow;

    /// Build a `RetryResult::Success` around a minimal task result.
    fn success_result() -> RetryResult {
        let ctx = ContextWindow::new(1000);
        RetryResult::Success(TaskResult::new("test".to_string(), ctx))
    }

    #[test]
    fn test_retry_config_defaults() {
        let config = RetryConfig::default();
        assert_eq!(config.max_retries, 3);
        assert!(!config.is_autonomous);
        assert_eq!(config.role_name, "agent");
    }

    #[test]
    fn test_retry_config_player() {
        let config = RetryConfig::player();
        assert_eq!(config.max_retries, 3);
        assert!(config.is_autonomous);
        assert_eq!(config.role_name, "player");
    }

    #[test]
    fn test_retry_config_coach() {
        let config = RetryConfig::coach();
        assert_eq!(config.max_retries, 3);
        assert!(config.is_autonomous);
        assert_eq!(config.role_name, "coach");
    }

    #[test]
    fn test_retry_config_planning() {
        // The planning preset keeps the caller-supplied role name verbatim.
        let config = RetryConfig::planning("architect");
        assert_eq!(config.max_retries, 3);
        assert!(config.is_autonomous);
        assert_eq!(config.role_name, "architect");
    }

    #[test]
    fn test_retry_config_with_max_retries() {
        let config = RetryConfig::player().with_max_retries(5);
        assert_eq!(config.max_retries, 5);
        // Only the retry limit changes; the rest of the preset is untouched.
        assert!(config.is_autonomous);
        assert_eq!(config.role_name, "player");
    }

    #[test]
    fn test_retry_result_is_success() {
        assert!(success_result().is_success());

        let failed = RetryResult::MaxRetriesReached("error".to_string());
        assert!(!failed.is_success());

        let exceeded = RetryResult::ContextLengthExceeded("too long".to_string());
        assert!(!exceeded.is_success());
    }

    #[test]
    fn test_retry_result_into_result() {
        assert!(success_result().into_result().is_some());

        // Every non-success variant yields `None`.
        assert!(RetryResult::MaxRetriesReached("error".to_string())
            .into_result()
            .is_none());
        assert!(RetryResult::ContextLengthExceeded("too long".to_string())
            .into_result()
            .is_none());
        assert!(RetryResult::Panic(anyhow::anyhow!("panic"))
            .into_result()
            .is_none());
    }
}
|