retry on errors

This commit is contained in:
Dhanji Prasanna
2025-10-07 11:20:19 +11:00
parent 5a83e1b7e0
commit e6cec5ef0f
6 changed files with 104 additions and 43 deletions

17
Cargo.lock generated
View File

@@ -2877,16 +2877,6 @@ dependencies = [
"windows-sys 0.60.2", "windows-sys 0.60.2",
] ]
[[package]]
name = "term_size"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e4129646ca0ed8f45d09b929036bafad5377103edd06e50bf574b353d2b08d9"
dependencies = [
"libc",
"winapi",
]
[[package]] [[package]]
name = "termimad" name = "termimad"
version = "0.34.0" version = "0.34.0"
@@ -2903,13 +2893,6 @@ dependencies = [
"unicode-width 0.1.14", "unicode-width 0.1.14",
] ]
[[package]]
name = "test_scroll"
version = "0.1.0"
dependencies = [
"term_size",
]
[[package]] [[package]]
name = "thiserror" name = "thiserror"
version = "1.0.69" version = "1.0.69"

View File

@@ -5,7 +5,7 @@ members = [
"crates/g3-providers", "crates/g3-providers",
"crates/g3-config", "crates/g3-config",
"crates/g3-execution" "crates/g3-execution"
, "tmp"] ]
resolver = "2" resolver = "2"
[workspace.dependencies] [workspace.dependencies]

View File

@@ -133,7 +133,11 @@ pub async fn run() -> Result<()> {
// Initialize agent // Initialize agent
let ui_writer = ConsoleUiWriter::new(); let ui_writer = ConsoleUiWriter::new();
let mut agent = Agent::new(config.clone(), ui_writer).await?; let mut agent = if cli.autonomous {
Agent::new_autonomous(config.clone(), ui_writer).await?
} else {
Agent::new(config.clone(), ui_writer).await?
};
// Execute task, autonomous mode, or start interactive mode // Execute task, autonomous mode, or start interactive mode
if cli.autonomous { if cli.autonomous {
@@ -700,7 +704,7 @@ async fn run_autonomous(
// Create a new agent instance for coach mode to ensure fresh context // Create a new agent instance for coach mode to ensure fresh context
let config = g3_config::Config::load(None)?; let config = g3_config::Config::load(None)?;
let ui_writer = ConsoleUiWriter::new(); let ui_writer = ConsoleUiWriter::new();
let mut coach_agent = Agent::new(config, ui_writer).await?; let mut coach_agent = Agent::new_autonomous(config, ui_writer).await?;
// Ensure coach agent is also in the workspace directory // Ensure coach agent is also in the workspace directory
project.enter_workspace()?; project.enter_workspace()?;

View File

@@ -11,16 +11,29 @@ use serde::{Deserialize, Serialize};
use std::time::Duration; use std::time::Duration;
use tracing::{error, info, warn}; use tracing::{error, info, warn};
/// Maximum number of retry attempts for recoverable errors /// Maximum number of retry attempts for recoverable errors (default mode)
const MAX_RETRY_ATTEMPTS: u32 = 3; const DEFAULT_MAX_RETRY_ATTEMPTS: u32 = 3;
/// Maximum number of retry attempts for autonomous mode
const AUTONOMOUS_MAX_RETRY_ATTEMPTS: u32 = 6;
/// Base delay for exponential backoff (in milliseconds) /// Base delay for exponential backoff (in milliseconds)
const BASE_RETRY_DELAY_MS: u64 = 1000; const BASE_RETRY_DELAY_MS: u64 = 1000;
/// Maximum delay between retries (in milliseconds) /// Maximum delay between retries (in milliseconds) for default mode
const MAX_RETRY_DELAY_MS: u64 = 10000; const DEFAULT_MAX_RETRY_DELAY_MS: u64 = 10000;
/// Jitter factor (0.0 to 1.0) to randomize retry delays /// Maximum delay between retries (in milliseconds) for autonomous mode
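/// NOTE: the autonomous delay table (10s..200s base) is not clamped to this value, and
/// calculate_retry_delay returns early for autonomous mode before this cap is read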
const AUTONOMOUS_MAX_RETRY_DELAY_MS: u64 = 120000; // 2 minutes max per retry
/// Total time budget for autonomous mode retries (10 minutes)
const AUTONOMOUS_RETRY_BUDGET_MS: u64 = 600000;
/// Jitter factor (0.0 to 1.0) to randomize retry delays (default)
const DEFAULT_JITTER_FACTOR: f64 = 0.3;
/// Jitter factor (0.0 to 1.0) to randomize retry delays; currently equal to DEFAULT_JITTER_FACTOR (0.3), not higher
const JITTER_FACTOR: f64 = 0.3; const JITTER_FACTOR: f64 = 0.3;
/// Error context information for detailed logging /// Error context information for detailed logging
@@ -209,13 +222,39 @@ pub fn classify_error(error: &anyhow::Error) -> ErrorType {
ErrorType::NonRecoverable ErrorType::NonRecoverable
} }
/// Calculate retry delay with exponential backoff and jitter /// Calculate retry delay for autonomous mode with better distribution over 10 minutes
pub fn calculate_retry_delay(attempt: u32) -> Duration { fn calculate_autonomous_retry_delay(attempt: u32) -> Duration {
use rand::Rng; use rand::Rng;
let mut rng = rand::thread_rng();
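// Per-attempt base delays, indexed by attempt - 1; attempts past the table reuse 200s
// Base delays: 10s, 30s, 60s, 120s, 180s, 200s = 600s total only if all six are slept;
// a caller that gives up on attempt 6 sleeps after attempts 1-5 only (400s before jitter)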
let base_delays_ms = [10000, 30000, 60000, 120000, 180000, 200000];
let base_delay = base_delays_ms.get(attempt.saturating_sub(1) as usize).unwrap_or(&200000);
// Add jitter of ±30% to prevent thundering herd
let jitter = (*base_delay as f64 * 0.3 * rng.gen::<f64>()) as u64;
let final_delay = if rng.gen_bool(0.5) {
base_delay + jitter
} else {
base_delay.saturating_sub(jitter)
};
Duration::from_millis(final_delay)
}
/// Calculate retry delay with exponential backoff and jitter
pub fn calculate_retry_delay(attempt: u32, is_autonomous: bool) -> Duration {
if is_autonomous {
return calculate_autonomous_retry_delay(attempt);
}
use rand::Rng;
let max_retry_delay_ms = if is_autonomous { AUTONOMOUS_MAX_RETRY_DELAY_MS } else { DEFAULT_MAX_RETRY_DELAY_MS };
// Exponential backoff: delay = base * 2^(attempt - 1) // Exponential backoff: delay = base * 2^(attempt - 1)
let base_delay = BASE_RETRY_DELAY_MS * (2_u64.pow(attempt.saturating_sub(1))); let base_delay = BASE_RETRY_DELAY_MS * (2_u64.pow(attempt.saturating_sub(1)));
let capped_delay = base_delay.min(MAX_RETRY_DELAY_MS); let capped_delay = base_delay.min(max_retry_delay_ms);
// Add jitter to prevent thundering herd // Add jitter to prevent thundering herd
let mut rng = rand::thread_rng(); let mut rng = rand::thread_rng();
@@ -234,6 +273,7 @@ pub async fn retry_with_backoff<F, Fut, T>(
operation_name: &str, operation_name: &str,
mut operation: F, mut operation: F,
context: &ErrorContext, context: &ErrorContext,
is_autonomous: bool,
) -> Result<T> ) -> Result<T>
where where
F: FnMut() -> Fut, F: FnMut() -> Fut,
@@ -257,10 +297,11 @@ where
} }
Err(error) => { Err(error) => {
let error_type = classify_error(&error); let error_type = classify_error(&error);
let max_attempts = if is_autonomous { AUTONOMOUS_MAX_RETRY_ATTEMPTS } else { DEFAULT_MAX_RETRY_ATTEMPTS };
match error_type { match error_type {
ErrorType::Recoverable(recoverable_type) => { ErrorType::Recoverable(recoverable_type) => {
if attempt >= MAX_RETRY_ATTEMPTS { if attempt >= max_attempts {
error!( error!(
"Operation '{}' failed after {} attempts. Giving up.", "Operation '{}' failed after {} attempts. Giving up.",
operation_name, attempt operation_name, attempt
@@ -269,10 +310,10 @@ where
return Err(error); return Err(error);
} }
let delay = calculate_retry_delay(attempt); let delay = calculate_retry_delay(attempt, is_autonomous);
warn!( warn!(
"Recoverable error ({:?}) in '{}' (attempt {}/{}). Retrying in {:?}...", "Recoverable error ({:?}) in '{}' (attempt {}/{}). Retrying in {:?}...",
recoverable_type, operation_name, attempt, MAX_RETRY_ATTEMPTS, delay recoverable_type, operation_name, attempt, max_attempts, delay
); );
warn!("Error details: {}", error); warn!("Error details: {}", error);
@@ -380,9 +421,9 @@ mod tests {
#[test] #[test]
fn test_retry_delay_calculation() { fn test_retry_delay_calculation() {
// Test that delays increase exponentially // Test that delays increase exponentially
let delay1 = calculate_retry_delay(1); let delay1 = calculate_retry_delay(1, false);
let delay2 = calculate_retry_delay(2); let delay2 = calculate_retry_delay(2, false);
let delay3 = calculate_retry_delay(3); let delay3 = calculate_retry_delay(3, false);
// Due to jitter, we can't test exact values, but the base should increase // Due to jitter, we can't test exact values, but the base should increase
assert!(delay1.as_millis() >= (BASE_RETRY_DELAY_MS as f64 * 0.7) as u128); assert!(delay1.as_millis() >= (BASE_RETRY_DELAY_MS as f64 * 0.7) as u128);
@@ -395,8 +436,28 @@ mod tests {
assert!(delay3.as_millis() >= delay2.as_millis()); assert!(delay3.as_millis() >= delay2.as_millis());
// Test max cap // Test max cap
let delay_max = calculate_retry_delay(10); let delay_max = calculate_retry_delay(10, false);
assert!(delay_max.as_millis() <= (MAX_RETRY_DELAY_MS as f64 * 1.3) as u128); assert!(delay_max.as_millis() <= (DEFAULT_MAX_RETRY_DELAY_MS as f64 * 1.3) as u128);
}
#[test]
fn test_autonomous_retry_delay_calculation() {
// Test that each autonomous-mode delay stays within ±30% of its table entry
let delay1 = calculate_retry_delay(1, true);
let delay2 = calculate_retry_delay(2, true);
let delay3 = calculate_retry_delay(3, true);
let delay4 = calculate_retry_delay(4, true);
let delay5 = calculate_retry_delay(5, true);
let delay6 = calculate_retry_delay(6, true);
// Base delays should be around: 10s, 30s, 60s, 120s, 180s, 200s
// With ±30% jitter
assert!(delay1.as_millis() >= 7000 && delay1.as_millis() <= 13000);
assert!(delay2.as_millis() >= 21000 && delay2.as_millis() <= 39000);
assert!(delay3.as_millis() >= 42000 && delay3.as_millis() <= 78000);
assert!(delay4.as_millis() >= 84000 && delay4.as_millis() <= 156000);
assert!(delay5.as_millis() >= 126000 && delay5.as_millis() <= 234000);
assert!(delay6.as_millis() >= 140000 && delay6.as_millis() <= 260000);
} }
#[test] #[test]

View File

@@ -36,6 +36,7 @@ mod tests {
} }
}, },
&context, &context,
false, // not autonomous mode
) )
.await; .await;
@@ -68,6 +69,7 @@ mod tests {
} }
}, },
&context, &context,
false, // not autonomous mode
) )
.await; .await;
@@ -99,6 +101,7 @@ mod tests {
} }
}, },
&context, &context,
false, // not autonomous mode
) )
.await; .await;
@@ -125,9 +128,9 @@ mod tests {
#[test] #[test]
fn test_retry_delay_increases() { fn test_retry_delay_increases() {
let delay1 = calculate_retry_delay(1); let delay1 = calculate_retry_delay(1, false);
let delay2 = calculate_retry_delay(2); let delay2 = calculate_retry_delay(2, false);
let delay3 = calculate_retry_delay(3); let delay3 = calculate_retry_delay(3, false);
// Delays should generally increase (though jitter can affect this) // Delays should generally increase (though jitter can affect this)
// We'll test the base delays without jitter // We'll test the base delays without jitter

View File

@@ -338,10 +338,19 @@ pub struct Agent<W: UiWriter> {
session_id: Option<String>, session_id: Option<String>,
tool_call_metrics: Vec<(String, Duration, bool)>, // (tool_name, duration, success) tool_call_metrics: Vec<(String, Duration, bool)>, // (tool_name, duration, success)
ui_writer: W, ui_writer: W,
is_autonomous: bool,
} }
impl<W: UiWriter> Agent<W> { impl<W: UiWriter> Agent<W> {
pub async fn new(config: Config, ui_writer: W) -> Result<Self> { pub async fn new(config: Config, ui_writer: W) -> Result<Self> {
Self::new_with_mode(config, ui_writer, false).await
}
pub async fn new_autonomous(config: Config, ui_writer: W) -> Result<Self> {
Self::new_with_mode(config, ui_writer, true).await
}
async fn new_with_mode(config: Config, ui_writer: W, is_autonomous: bool) -> Result<Self> {
let mut providers = ProviderRegistry::new(); let mut providers = ProviderRegistry::new();
// Only register providers that are configured AND selected as the default provider // Only register providers that are configured AND selected as the default provider
@@ -431,6 +440,7 @@ impl<W: UiWriter> Agent<W> {
session_id: None, session_id: None,
tool_call_metrics: Vec::new(), tool_call_metrics: Vec::new(),
ui_writer, ui_writer,
is_autonomous,
}) })
} }
@@ -954,7 +964,7 @@ The tool will execute immediately and you'll receive the result (success or erro
use crate::error_handling::{calculate_retry_delay, classify_error, ErrorType}; use crate::error_handling::{calculate_retry_delay, classify_error, ErrorType};
let mut attempt = 0; let mut attempt = 0;
const MAX_ATTEMPTS: u32 = 3; let max_attempts = if self.is_autonomous { 6 } else { 3 };
loop { loop {
attempt += 1; attempt += 1;
@@ -974,12 +984,12 @@ The tool will execute immediately and you'll receive the result (success or erro
); );
return Ok(stream); return Ok(stream);
} }
Err(e) if attempt < MAX_ATTEMPTS => { Err(e) if attempt < max_attempts => {
if matches!(classify_error(&e), ErrorType::Recoverable(_)) { if matches!(classify_error(&e), ErrorType::Recoverable(_)) {
let delay = calculate_retry_delay(attempt); let delay = calculate_retry_delay(attempt, self.is_autonomous);
warn!( warn!(
"Recoverable error on attempt {}/{}: {}. Retrying in {:?}...", "Recoverable error on attempt {}/{}: {}. Retrying in {:?}...",
attempt, MAX_ATTEMPTS, e, delay attempt, max_attempts, e, delay
); );
tokio::time::sleep(delay).await; tokio::time::sleep(delay).await;
} else { } else {