retry on errors
This commit is contained in:
17
Cargo.lock
generated
17
Cargo.lock
generated
@@ -2877,16 +2877,6 @@ dependencies = [
|
|||||||
"windows-sys 0.60.2",
|
"windows-sys 0.60.2",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "term_size"
|
|
||||||
version = "0.3.2"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "1e4129646ca0ed8f45d09b929036bafad5377103edd06e50bf574b353d2b08d9"
|
|
||||||
dependencies = [
|
|
||||||
"libc",
|
|
||||||
"winapi",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "termimad"
|
name = "termimad"
|
||||||
version = "0.34.0"
|
version = "0.34.0"
|
||||||
@@ -2903,13 +2893,6 @@ dependencies = [
|
|||||||
"unicode-width 0.1.14",
|
"unicode-width 0.1.14",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "test_scroll"
|
|
||||||
version = "0.1.0"
|
|
||||||
dependencies = [
|
|
||||||
"term_size",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "thiserror"
|
name = "thiserror"
|
||||||
version = "1.0.69"
|
version = "1.0.69"
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ members = [
|
|||||||
"crates/g3-providers",
|
"crates/g3-providers",
|
||||||
"crates/g3-config",
|
"crates/g3-config",
|
||||||
"crates/g3-execution"
|
"crates/g3-execution"
|
||||||
, "tmp"]
|
]
|
||||||
resolver = "2"
|
resolver = "2"
|
||||||
|
|
||||||
[workspace.dependencies]
|
[workspace.dependencies]
|
||||||
|
|||||||
@@ -133,7 +133,11 @@ pub async fn run() -> Result<()> {
|
|||||||
|
|
||||||
// Initialize agent
|
// Initialize agent
|
||||||
let ui_writer = ConsoleUiWriter::new();
|
let ui_writer = ConsoleUiWriter::new();
|
||||||
let mut agent = Agent::new(config.clone(), ui_writer).await?;
|
let mut agent = if cli.autonomous {
|
||||||
|
Agent::new_autonomous(config.clone(), ui_writer).await?
|
||||||
|
} else {
|
||||||
|
Agent::new(config.clone(), ui_writer).await?
|
||||||
|
};
|
||||||
|
|
||||||
// Execute task, autonomous mode, or start interactive mode
|
// Execute task, autonomous mode, or start interactive mode
|
||||||
if cli.autonomous {
|
if cli.autonomous {
|
||||||
@@ -700,7 +704,7 @@ async fn run_autonomous(
|
|||||||
// Create a new agent instance for coach mode to ensure fresh context
|
// Create a new agent instance for coach mode to ensure fresh context
|
||||||
let config = g3_config::Config::load(None)?;
|
let config = g3_config::Config::load(None)?;
|
||||||
let ui_writer = ConsoleUiWriter::new();
|
let ui_writer = ConsoleUiWriter::new();
|
||||||
let mut coach_agent = Agent::new(config, ui_writer).await?;
|
let mut coach_agent = Agent::new_autonomous(config, ui_writer).await?;
|
||||||
|
|
||||||
// Ensure coach agent is also in the workspace directory
|
// Ensure coach agent is also in the workspace directory
|
||||||
project.enter_workspace()?;
|
project.enter_workspace()?;
|
||||||
|
|||||||
@@ -11,16 +11,29 @@ use serde::{Deserialize, Serialize};
|
|||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
use tracing::{error, info, warn};
|
use tracing::{error, info, warn};
|
||||||
|
|
||||||
/// Maximum number of retry attempts for recoverable errors
|
/// Maximum number of retry attempts for recoverable errors (default mode)
|
||||||
const MAX_RETRY_ATTEMPTS: u32 = 3;
|
const DEFAULT_MAX_RETRY_ATTEMPTS: u32 = 3;
|
||||||
|
|
||||||
|
/// Maximum number of retry attempts for autonomous mode
|
||||||
|
const AUTONOMOUS_MAX_RETRY_ATTEMPTS: u32 = 6;
|
||||||
|
|
||||||
/// Base delay for exponential backoff (in milliseconds)
|
/// Base delay for exponential backoff (in milliseconds)
|
||||||
const BASE_RETRY_DELAY_MS: u64 = 1000;
|
const BASE_RETRY_DELAY_MS: u64 = 1000;
|
||||||
|
|
||||||
/// Maximum delay between retries (in milliseconds)
|
/// Maximum delay between retries (in milliseconds) for default mode
|
||||||
const MAX_RETRY_DELAY_MS: u64 = 10000;
|
const DEFAULT_MAX_RETRY_DELAY_MS: u64 = 10000;
|
||||||
|
|
||||||
/// Jitter factor (0.0 to 1.0) to randomize retry delays
|
/// Maximum delay between retries (in milliseconds) for autonomous mode
|
||||||
|
/// Spread over 10 minutes (600 seconds) with 6 retries
|
||||||
|
const AUTONOMOUS_MAX_RETRY_DELAY_MS: u64 = 120000; // 2 minutes max per retry
|
||||||
|
|
||||||
|
/// Total time budget for autonomous mode retries (10 minutes)
|
||||||
|
const AUTONOMOUS_RETRY_BUDGET_MS: u64 = 600000;
|
||||||
|
|
||||||
|
/// Jitter factor (0.0 to 1.0) to randomize retry delays (default)
|
||||||
|
const DEFAULT_JITTER_FACTOR: f64 = 0.3;
|
||||||
|
|
||||||
|
/// Jitter factor for autonomous mode (NOTE: currently 0.3, the same value as DEFAULT_JITTER_FACTOR, not higher)
|
||||||
const JITTER_FACTOR: f64 = 0.3;
|
const JITTER_FACTOR: f64 = 0.3;
|
||||||
|
|
||||||
/// Error context information for detailed logging
|
/// Error context information for detailed logging
|
||||||
@@ -209,13 +222,39 @@ pub fn classify_error(error: &anyhow::Error) -> ErrorType {
|
|||||||
ErrorType::NonRecoverable
|
ErrorType::NonRecoverable
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Calculate retry delay with exponential backoff and jitter
|
/// Calculate retry delay for autonomous mode with better distribution over 10 minutes
|
||||||
pub fn calculate_retry_delay(attempt: u32) -> Duration {
|
fn calculate_autonomous_retry_delay(attempt: u32) -> Duration {
|
||||||
use rand::Rng;
|
use rand::Rng;
|
||||||
|
let mut rng = rand::thread_rng();
|
||||||
|
|
||||||
|
// Distribute 6 retries over 10 minutes (600 seconds)
|
||||||
|
// Base delays: 10s, 30s, 60s, 120s, 180s, 200s = 600s total
|
||||||
|
let base_delays_ms = [10000, 30000, 60000, 120000, 180000, 200000];
|
||||||
|
let base_delay = base_delays_ms.get(attempt.saturating_sub(1) as usize).unwrap_or(&200000);
|
||||||
|
|
||||||
|
// Add jitter of ±30% to prevent thundering herd
|
||||||
|
let jitter = (*base_delay as f64 * 0.3 * rng.gen::<f64>()) as u64;
|
||||||
|
let final_delay = if rng.gen_bool(0.5) {
|
||||||
|
base_delay + jitter
|
||||||
|
} else {
|
||||||
|
base_delay.saturating_sub(jitter)
|
||||||
|
};
|
||||||
|
|
||||||
|
Duration::from_millis(final_delay)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Calculate retry delay with exponential backoff and jitter
|
||||||
|
pub fn calculate_retry_delay(attempt: u32, is_autonomous: bool) -> Duration {
|
||||||
|
if is_autonomous {
|
||||||
|
return calculate_autonomous_retry_delay(attempt);
|
||||||
|
}
|
||||||
|
|
||||||
|
use rand::Rng;
|
||||||
|
let max_retry_delay_ms = if is_autonomous { AUTONOMOUS_MAX_RETRY_DELAY_MS } else { DEFAULT_MAX_RETRY_DELAY_MS };
|
||||||
|
|
||||||
// Exponential backoff: delay = base * 2^(attempt - 1)
|
// Exponential backoff: delay = base * 2^(attempt - 1)
|
||||||
let base_delay = BASE_RETRY_DELAY_MS * (2_u64.pow(attempt.saturating_sub(1)));
|
let base_delay = BASE_RETRY_DELAY_MS * (2_u64.pow(attempt.saturating_sub(1)));
|
||||||
let capped_delay = base_delay.min(MAX_RETRY_DELAY_MS);
|
let capped_delay = base_delay.min(max_retry_delay_ms);
|
||||||
|
|
||||||
// Add jitter to prevent thundering herd
|
// Add jitter to prevent thundering herd
|
||||||
let mut rng = rand::thread_rng();
|
let mut rng = rand::thread_rng();
|
||||||
@@ -234,6 +273,7 @@ pub async fn retry_with_backoff<F, Fut, T>(
|
|||||||
operation_name: &str,
|
operation_name: &str,
|
||||||
mut operation: F,
|
mut operation: F,
|
||||||
context: &ErrorContext,
|
context: &ErrorContext,
|
||||||
|
is_autonomous: bool,
|
||||||
) -> Result<T>
|
) -> Result<T>
|
||||||
where
|
where
|
||||||
F: FnMut() -> Fut,
|
F: FnMut() -> Fut,
|
||||||
@@ -257,10 +297,11 @@ where
|
|||||||
}
|
}
|
||||||
Err(error) => {
|
Err(error) => {
|
||||||
let error_type = classify_error(&error);
|
let error_type = classify_error(&error);
|
||||||
|
let max_attempts = if is_autonomous { AUTONOMOUS_MAX_RETRY_ATTEMPTS } else { DEFAULT_MAX_RETRY_ATTEMPTS };
|
||||||
|
|
||||||
match error_type {
|
match error_type {
|
||||||
ErrorType::Recoverable(recoverable_type) => {
|
ErrorType::Recoverable(recoverable_type) => {
|
||||||
if attempt >= MAX_RETRY_ATTEMPTS {
|
if attempt >= max_attempts {
|
||||||
error!(
|
error!(
|
||||||
"Operation '{}' failed after {} attempts. Giving up.",
|
"Operation '{}' failed after {} attempts. Giving up.",
|
||||||
operation_name, attempt
|
operation_name, attempt
|
||||||
@@ -269,10 +310,10 @@ where
|
|||||||
return Err(error);
|
return Err(error);
|
||||||
}
|
}
|
||||||
|
|
||||||
let delay = calculate_retry_delay(attempt);
|
let delay = calculate_retry_delay(attempt, is_autonomous);
|
||||||
warn!(
|
warn!(
|
||||||
"Recoverable error ({:?}) in '{}' (attempt {}/{}). Retrying in {:?}...",
|
"Recoverable error ({:?}) in '{}' (attempt {}/{}). Retrying in {:?}...",
|
||||||
recoverable_type, operation_name, attempt, MAX_RETRY_ATTEMPTS, delay
|
recoverable_type, operation_name, attempt, max_attempts, delay
|
||||||
);
|
);
|
||||||
warn!("Error details: {}", error);
|
warn!("Error details: {}", error);
|
||||||
|
|
||||||
@@ -380,9 +421,9 @@ mod tests {
|
|||||||
#[test]
|
#[test]
|
||||||
fn test_retry_delay_calculation() {
|
fn test_retry_delay_calculation() {
|
||||||
// Test that delays increase exponentially
|
// Test that delays increase exponentially
|
||||||
let delay1 = calculate_retry_delay(1);
|
let delay1 = calculate_retry_delay(1, false);
|
||||||
let delay2 = calculate_retry_delay(2);
|
let delay2 = calculate_retry_delay(2, false);
|
||||||
let delay3 = calculate_retry_delay(3);
|
let delay3 = calculate_retry_delay(3, false);
|
||||||
|
|
||||||
// Due to jitter, we can't test exact values, but the base should increase
|
// Due to jitter, we can't test exact values, but the base should increase
|
||||||
assert!(delay1.as_millis() >= (BASE_RETRY_DELAY_MS as f64 * 0.7) as u128);
|
assert!(delay1.as_millis() >= (BASE_RETRY_DELAY_MS as f64 * 0.7) as u128);
|
||||||
@@ -395,8 +436,28 @@ mod tests {
|
|||||||
assert!(delay3.as_millis() >= delay2.as_millis());
|
assert!(delay3.as_millis() >= delay2.as_millis());
|
||||||
|
|
||||||
// Test max cap
|
// Test max cap
|
||||||
let delay_max = calculate_retry_delay(10);
|
let delay_max = calculate_retry_delay(10, false);
|
||||||
assert!(delay_max.as_millis() <= (MAX_RETRY_DELAY_MS as f64 * 1.3) as u128);
|
assert!(delay_max.as_millis() <= (DEFAULT_MAX_RETRY_DELAY_MS as f64 * 1.3) as u128);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_autonomous_retry_delay_calculation() {
|
||||||
|
// Test autonomous mode delays are distributed over 10 minutes
|
||||||
|
let delay1 = calculate_retry_delay(1, true);
|
||||||
|
let delay2 = calculate_retry_delay(2, true);
|
||||||
|
let delay3 = calculate_retry_delay(3, true);
|
||||||
|
let delay4 = calculate_retry_delay(4, true);
|
||||||
|
let delay5 = calculate_retry_delay(5, true);
|
||||||
|
let delay6 = calculate_retry_delay(6, true);
|
||||||
|
|
||||||
|
// Base delays should be around: 10s, 30s, 60s, 120s, 180s, 200s
|
||||||
|
// With ±30% jitter
|
||||||
|
assert!(delay1.as_millis() >= 7000 && delay1.as_millis() <= 13000);
|
||||||
|
assert!(delay2.as_millis() >= 21000 && delay2.as_millis() <= 39000);
|
||||||
|
assert!(delay3.as_millis() >= 42000 && delay3.as_millis() <= 78000);
|
||||||
|
assert!(delay4.as_millis() >= 84000 && delay4.as_millis() <= 156000);
|
||||||
|
assert!(delay5.as_millis() >= 126000 && delay5.as_millis() <= 234000);
|
||||||
|
assert!(delay6.as_millis() >= 140000 && delay6.as_millis() <= 260000);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
|||||||
@@ -36,6 +36,7 @@ mod tests {
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
&context,
|
&context,
|
||||||
|
false, // not autonomous mode
|
||||||
)
|
)
|
||||||
.await;
|
.await;
|
||||||
|
|
||||||
@@ -68,6 +69,7 @@ mod tests {
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
&context,
|
&context,
|
||||||
|
false, // not autonomous mode
|
||||||
)
|
)
|
||||||
.await;
|
.await;
|
||||||
|
|
||||||
@@ -99,6 +101,7 @@ mod tests {
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
&context,
|
&context,
|
||||||
|
false, // not autonomous mode
|
||||||
)
|
)
|
||||||
.await;
|
.await;
|
||||||
|
|
||||||
@@ -125,9 +128,9 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_retry_delay_increases() {
|
fn test_retry_delay_increases() {
|
||||||
let delay1 = calculate_retry_delay(1);
|
let delay1 = calculate_retry_delay(1, false);
|
||||||
let delay2 = calculate_retry_delay(2);
|
let delay2 = calculate_retry_delay(2, false);
|
||||||
let delay3 = calculate_retry_delay(3);
|
let delay3 = calculate_retry_delay(3, false);
|
||||||
|
|
||||||
// Delays should generally increase (though jitter can affect this)
|
// Delays should generally increase (though jitter can affect this)
|
||||||
// We'll test the base delays without jitter
|
// We'll test the base delays without jitter
|
||||||
|
|||||||
@@ -338,10 +338,19 @@ pub struct Agent<W: UiWriter> {
|
|||||||
session_id: Option<String>,
|
session_id: Option<String>,
|
||||||
tool_call_metrics: Vec<(String, Duration, bool)>, // (tool_name, duration, success)
|
tool_call_metrics: Vec<(String, Duration, bool)>, // (tool_name, duration, success)
|
||||||
ui_writer: W,
|
ui_writer: W,
|
||||||
|
is_autonomous: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<W: UiWriter> Agent<W> {
|
impl<W: UiWriter> Agent<W> {
|
||||||
pub async fn new(config: Config, ui_writer: W) -> Result<Self> {
|
pub async fn new(config: Config, ui_writer: W) -> Result<Self> {
|
||||||
|
Self::new_with_mode(config, ui_writer, false).await
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn new_autonomous(config: Config, ui_writer: W) -> Result<Self> {
|
||||||
|
Self::new_with_mode(config, ui_writer, true).await
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn new_with_mode(config: Config, ui_writer: W, is_autonomous: bool) -> Result<Self> {
|
||||||
let mut providers = ProviderRegistry::new();
|
let mut providers = ProviderRegistry::new();
|
||||||
|
|
||||||
// Only register providers that are configured AND selected as the default provider
|
// Only register providers that are configured AND selected as the default provider
|
||||||
@@ -431,6 +440,7 @@ impl<W: UiWriter> Agent<W> {
|
|||||||
session_id: None,
|
session_id: None,
|
||||||
tool_call_metrics: Vec::new(),
|
tool_call_metrics: Vec::new(),
|
||||||
ui_writer,
|
ui_writer,
|
||||||
|
is_autonomous,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -954,7 +964,7 @@ The tool will execute immediately and you'll receive the result (success or erro
|
|||||||
use crate::error_handling::{calculate_retry_delay, classify_error, ErrorType};
|
use crate::error_handling::{calculate_retry_delay, classify_error, ErrorType};
|
||||||
|
|
||||||
let mut attempt = 0;
|
let mut attempt = 0;
|
||||||
const MAX_ATTEMPTS: u32 = 3;
|
let max_attempts = if self.is_autonomous { 6 } else { 3 };
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
attempt += 1;
|
attempt += 1;
|
||||||
@@ -974,12 +984,12 @@ The tool will execute immediately and you'll receive the result (success or erro
|
|||||||
);
|
);
|
||||||
return Ok(stream);
|
return Ok(stream);
|
||||||
}
|
}
|
||||||
Err(e) if attempt < MAX_ATTEMPTS => {
|
Err(e) if attempt < max_attempts => {
|
||||||
if matches!(classify_error(&e), ErrorType::Recoverable(_)) {
|
if matches!(classify_error(&e), ErrorType::Recoverable(_)) {
|
||||||
let delay = calculate_retry_delay(attempt);
|
let delay = calculate_retry_delay(attempt, self.is_autonomous);
|
||||||
warn!(
|
warn!(
|
||||||
"Recoverable error on attempt {}/{}: {}. Retrying in {:?}...",
|
"Recoverable error on attempt {}/{}: {}. Retrying in {:?}...",
|
||||||
attempt, MAX_ATTEMPTS, e, delay
|
attempt, max_attempts, e, delay
|
||||||
);
|
);
|
||||||
tokio::time::sleep(delay).await;
|
tokio::time::sleep(delay).await;
|
||||||
} else {
|
} else {
|
||||||
|
|||||||
Reference in New Issue
Block a user