retry on errors

This commit is contained in:
Dhanji Prasanna
2025-10-07 11:20:19 +11:00
parent 5a83e1b7e0
commit e6cec5ef0f
6 changed files with 104 additions and 43 deletions

View File

@@ -133,7 +133,11 @@ pub async fn run() -> Result<()> {
// Initialize agent
let ui_writer = ConsoleUiWriter::new();
let mut agent = Agent::new(config.clone(), ui_writer).await?;
let mut agent = if cli.autonomous {
Agent::new_autonomous(config.clone(), ui_writer).await?
} else {
Agent::new(config.clone(), ui_writer).await?
};
// Execute task, autonomous mode, or start interactive mode
if cli.autonomous {
@@ -700,7 +704,7 @@ async fn run_autonomous(
// Create a new agent instance for coach mode to ensure fresh context
let config = g3_config::Config::load(None)?;
let ui_writer = ConsoleUiWriter::new();
let mut coach_agent = Agent::new(config, ui_writer).await?;
let mut coach_agent = Agent::new_autonomous(config, ui_writer).await?;
// Ensure coach agent is also in the workspace directory
project.enter_workspace()?;

View File

@@ -11,16 +11,29 @@ use serde::{Deserialize, Serialize};
use std::time::Duration;
use tracing::{error, info, warn};
/// Maximum number of retry attempts for recoverable errors
const MAX_RETRY_ATTEMPTS: u32 = 3;
/// Maximum number of retry attempts for recoverable errors (default mode)
const DEFAULT_MAX_RETRY_ATTEMPTS: u32 = 3;
/// Maximum number of retry attempts for autonomous mode
const AUTONOMOUS_MAX_RETRY_ATTEMPTS: u32 = 6;
/// Base delay for exponential backoff (in milliseconds)
const BASE_RETRY_DELAY_MS: u64 = 1000;
/// Maximum delay between retries (in milliseconds)
const MAX_RETRY_DELAY_MS: u64 = 10000;
/// Maximum delay between retries (in milliseconds) for default mode
const DEFAULT_MAX_RETRY_DELAY_MS: u64 = 10000;
/// Jitter factor (0.0 to 1.0) to randomize retry delays
/// Maximum delay between retries (in milliseconds) for autonomous mode
/// Spread over 10 minutes (600 seconds) with 6 retries
const AUTONOMOUS_MAX_RETRY_DELAY_MS: u64 = 120000; // 2 minutes max per retry
/// Total time budget for autonomous mode retries (10 minutes)
const AUTONOMOUS_RETRY_BUDGET_MS: u64 = 600000;
/// Jitter factor (0.0 to 1.0) to randomize retry delays (default)
const DEFAULT_JITTER_FACTOR: f64 = 0.3;
/// Jitter factor for autonomous mode (currently equal to the default factor, 0.3)
const JITTER_FACTOR: f64 = 0.3;
/// Error context information for detailed logging
@@ -209,13 +222,39 @@ pub fn classify_error(error: &anyhow::Error) -> ErrorType {
ErrorType::NonRecoverable
}
/// Calculate retry delay with exponential backoff and jitter
pub fn calculate_retry_delay(attempt: u32) -> Duration {
/// Compute the pause before the next retry when running in autonomous mode.
///
/// Instead of exponential backoff this follows a fixed schedule of
/// 10s, 30s, 60s, 120s, 180s and 200s (600s in total). `attempt` is 1-based;
/// `0` is treated like `1`, and attempts beyond the end of the schedule reuse
/// the final 200s entry. The scheduled value is then randomly moved up or
/// down by up to 30% so that concurrent clients do not retry in lockstep.
fn calculate_autonomous_retry_delay(attempt: u32) -> Duration {
    use rand::Rng;

    // Nominal delays in milliseconds, indexed by `attempt - 1`.
    const SCHEDULE_MS: [u64; 6] = [10_000, 30_000, 60_000, 120_000, 180_000, 200_000];
    // Used when `attempt` runs past the end of the schedule.
    const FALLBACK_MS: u64 = 200_000;

    let mut rng = rand::thread_rng();

    let slot = attempt.saturating_sub(1) as usize;
    let nominal_ms = match SCHEDULE_MS.get(slot) {
        Some(&scheduled) => scheduled,
        None => FALLBACK_MS,
    };

    // Draw the jitter magnitude first, then its direction.
    let offset_ms = (nominal_ms as f64 * 0.3 * rng.gen::<f64>()) as u64;
    let add_offset = rng.gen_bool(0.5);

    let delay_ms = if add_offset {
        nominal_ms + offset_ms
    } else {
        nominal_ms.saturating_sub(offset_ms)
    };

    Duration::from_millis(delay_ms)
}
/// Calculate retry delay with exponential backoff and jitter
pub fn calculate_retry_delay(attempt: u32, is_autonomous: bool) -> Duration {
if is_autonomous {
return calculate_autonomous_retry_delay(attempt);
}
use rand::Rng;
let max_retry_delay_ms = if is_autonomous { AUTONOMOUS_MAX_RETRY_DELAY_MS } else { DEFAULT_MAX_RETRY_DELAY_MS };
// Exponential backoff: delay = base * 2^(attempt - 1), so attempt 1 waits the base delay
let base_delay = BASE_RETRY_DELAY_MS * (2_u64.pow(attempt.saturating_sub(1)));
let capped_delay = base_delay.min(MAX_RETRY_DELAY_MS);
let capped_delay = base_delay.min(max_retry_delay_ms);
// Add jitter to prevent thundering herd
let mut rng = rand::thread_rng();
@@ -234,6 +273,7 @@ pub async fn retry_with_backoff<F, Fut, T>(
operation_name: &str,
mut operation: F,
context: &ErrorContext,
is_autonomous: bool,
) -> Result<T>
where
F: FnMut() -> Fut,
@@ -257,10 +297,11 @@ where
}
Err(error) => {
let error_type = classify_error(&error);
let max_attempts = if is_autonomous { AUTONOMOUS_MAX_RETRY_ATTEMPTS } else { DEFAULT_MAX_RETRY_ATTEMPTS };
match error_type {
ErrorType::Recoverable(recoverable_type) => {
if attempt >= MAX_RETRY_ATTEMPTS {
if attempt >= max_attempts {
error!(
"Operation '{}' failed after {} attempts. Giving up.",
operation_name, attempt
@@ -269,10 +310,10 @@ where
return Err(error);
}
let delay = calculate_retry_delay(attempt);
let delay = calculate_retry_delay(attempt, is_autonomous);
warn!(
"Recoverable error ({:?}) in '{}' (attempt {}/{}). Retrying in {:?}...",
recoverable_type, operation_name, attempt, MAX_RETRY_ATTEMPTS, delay
recoverable_type, operation_name, attempt, max_attempts, delay
);
warn!("Error details: {}", error);
@@ -380,9 +421,9 @@ mod tests {
#[test]
fn test_retry_delay_calculation() {
// Test that delays increase exponentially
let delay1 = calculate_retry_delay(1);
let delay2 = calculate_retry_delay(2);
let delay3 = calculate_retry_delay(3);
let delay1 = calculate_retry_delay(1, false);
let delay2 = calculate_retry_delay(2, false);
let delay3 = calculate_retry_delay(3, false);
// Due to jitter, we can't test exact values, but the base should increase
assert!(delay1.as_millis() >= (BASE_RETRY_DELAY_MS as f64 * 0.7) as u128);
@@ -395,8 +436,28 @@ mod tests {
assert!(delay3.as_millis() >= delay2.as_millis());
// Test max cap
let delay_max = calculate_retry_delay(10);
assert!(delay_max.as_millis() <= (MAX_RETRY_DELAY_MS as f64 * 1.3) as u128);
let delay_max = calculate_retry_delay(10, false);
assert!(delay_max.as_millis() <= (DEFAULT_MAX_RETRY_DELAY_MS as f64 * 1.3) as u128);
}
#[test]
fn test_autonomous_retry_delay_calculation() {
    // Each row is (attempt, accepted delay window in milliseconds).
    // The windows are the schedule 10s, 30s, 60s, 120s, 180s, 200s widened by
    // 30% in both directions to allow for jitter.
    let expectations: [(u32, std::ops::RangeInclusive<u128>); 6] = [
        (1, 7000..=13000),
        (2, 21000..=39000),
        (3, 42000..=78000),
        (4, 84000..=156000),
        (5, 126000..=234000),
        (6, 140000..=260000),
    ];

    // Sample every delay before checking any of them.
    let observed_ms: Vec<u128> = expectations
        .iter()
        .map(|(attempt, _)| calculate_retry_delay(*attempt, true).as_millis())
        .collect();

    for ((_, window), delay_ms) in expectations.iter().zip(observed_ms.iter()) {
        assert!(window.contains(delay_ms));
    }
}
#[test]

View File

@@ -36,6 +36,7 @@ mod tests {
}
},
&context,
false, // not autonomous mode
)
.await;
@@ -68,6 +69,7 @@ mod tests {
}
},
&context,
false, // not autonomous mode
)
.await;
@@ -99,6 +101,7 @@ mod tests {
}
},
&context,
false, // not autonomous mode
)
.await;
@@ -125,9 +128,9 @@ mod tests {
#[test]
fn test_retry_delay_increases() {
let delay1 = calculate_retry_delay(1);
let delay2 = calculate_retry_delay(2);
let delay3 = calculate_retry_delay(3);
let delay1 = calculate_retry_delay(1, false);
let delay2 = calculate_retry_delay(2, false);
let delay3 = calculate_retry_delay(3, false);
// Delays should generally increase (though jitter can affect this)
// We'll test the base delays without jitter

View File

@@ -338,10 +338,19 @@ pub struct Agent<W: UiWriter> {
session_id: Option<String>,
tool_call_metrics: Vec<(String, Duration, bool)>, // (tool_name, duration, success)
ui_writer: W,
is_autonomous: bool,
}
impl<W: UiWriter> Agent<W> {
/// Build an agent with autonomous mode switched off.
///
/// Delegates to `new_with_mode`, passing `false` for the autonomous flag,
/// and returns whatever that constructor returns (including its error).
pub async fn new(config: Config, ui_writer: W) -> Result<Self> {
    let is_autonomous = false;
    Self::new_with_mode(config, ui_writer, is_autonomous).await
}
/// Build an agent with autonomous mode switched on.
///
/// Delegates to `new_with_mode`, passing `true` for the autonomous flag,
/// and returns whatever that constructor returns (including its error).
pub async fn new_autonomous(config: Config, ui_writer: W) -> Result<Self> {
    let is_autonomous = true;
    Self::new_with_mode(config, ui_writer, is_autonomous).await
}
async fn new_with_mode(config: Config, ui_writer: W, is_autonomous: bool) -> Result<Self> {
let mut providers = ProviderRegistry::new();
// Only register providers that are configured AND selected as the default provider
@@ -431,6 +440,7 @@ impl<W: UiWriter> Agent<W> {
session_id: None,
tool_call_metrics: Vec::new(),
ui_writer,
is_autonomous,
})
}
@@ -954,7 +964,7 @@ The tool will execute immediately and you'll receive the result (success or erro
use crate::error_handling::{calculate_retry_delay, classify_error, ErrorType};
let mut attempt = 0;
const MAX_ATTEMPTS: u32 = 3;
let max_attempts = if self.is_autonomous { 6 } else { 3 };
loop {
attempt += 1;
@@ -974,12 +984,12 @@ The tool will execute immediately and you'll receive the result (success or erro
);
return Ok(stream);
}
Err(e) if attempt < MAX_ATTEMPTS => {
Err(e) if attempt < max_attempts => {
if matches!(classify_error(&e), ErrorType::Recoverable(_)) {
let delay = calculate_retry_delay(attempt);
let delay = calculate_retry_delay(attempt, self.is_autonomous);
warn!(
"Recoverable error on attempt {}/{}: {}. Retrying in {:?}...",
attempt, MAX_ATTEMPTS, e, delay
attempt, max_attempts, e, delay
);
tokio::time::sleep(delay).await;
} else {