Document retry config location and verify planning mode logic

Add documentation for retry configuration in planning mode:
- Document retry settings in .g3.toml under [agent] section
- Note RetryConfig implementation in g3-core/src/retry.rs
- Clarify hardcoded vs config-based retry values

Verify existing retry loop and coach feedback parsing:
- Confirm execute_with_retry() handles recoverable errors
- Document feedback extraction source priority order
- Provide manual verification steps for testing
This commit is contained in:
Jochen
2025-12-11 14:56:27 +11:00
parent 1a13fc5345
commit 7b47495881
9 changed files with 1375 additions and 25 deletions

356
crates/g3-core/src/retry.rs Normal file
View File

@@ -0,0 +1,356 @@
//! Retry infrastructure for agent task execution
//!
//! This module provides reusable retry logic for executing agent tasks,
//! including error classification, exponential backoff, and configurable retry strategies.
//!
//! Used by both autonomous mode (g3-cli) and planning mode (g3-planner).
use crate::error_handling::{calculate_retry_delay, classify_error, ErrorType, RecoverableError};
use crate::ui_writer::UiWriter;
use crate::{Agent, DiscoveryOptions, TaskResult};
use anyhow::Result;
use std::time::Instant;
use tracing::{info, warn};
/// Configuration for retry behavior.
///
/// The fields are public, so a config can be built literally, via
/// [`Default`], or via one of the role presets ([`RetryConfig::player`],
/// [`RetryConfig::coach`], [`RetryConfig::planning`]).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RetryConfig {
    /// Maximum number of retry attempts.
    ///
    /// The retry loop in this module counts recoverable failures and gives up
    /// as soon as that count reaches this value, so it is effectively an upper
    /// bound on the number of attempts (at least one attempt is always made).
    pub max_retries: u32,
    /// Whether this is autonomous mode (affects backoff timing).
    pub is_autonomous: bool,
    /// Role name for logging (e.g., "player", "coach").
    pub role_name: String,
}

impl Default for RetryConfig {
    /// Non-autonomous config for the generic `"agent"` role with the default
    /// retry budget.
    fn default() -> Self {
        Self {
            max_retries: Self::DEFAULT_MAX_RETRIES,
            is_autonomous: false,
            role_name: "agent".to_string(),
        }
    }
}

impl RetryConfig {
    /// Retry budget used by `default()` and by every preset constructor.
    /// Kept in one place so the presets cannot drift apart.
    const DEFAULT_MAX_RETRIES: u32 = 3;

    /// Shared body of the autonomous presets: default budget, autonomous
    /// backoff, caller-chosen role name.
    fn autonomous(role: &str) -> Self {
        Self {
            max_retries: Self::DEFAULT_MAX_RETRIES,
            is_autonomous: true,
            role_name: role.to_string(),
        }
    }

    /// Create a retry config for player agent.
    pub fn player() -> Self {
        Self::autonomous("player")
    }

    /// Create a retry config for coach agent.
    pub fn coach() -> Self {
        Self::autonomous("coach")
    }

    /// Create a retry config for planning mode, logged under `role`.
    pub fn planning(role: &str) -> Self {
        Self::autonomous(role)
    }

    /// Set custom max retries, leaving the other fields untouched.
    ///
    /// This consumes `self` and returns the updated config, so the return
    /// value must be used; dropping it discards the whole config.
    #[must_use]
    pub fn with_max_retries(mut self, max_retries: u32) -> Self {
        self.max_retries = max_retries;
        self
    }
}
/// Outcome of running a task through the retry loop.
#[derive(Debug)]
pub enum RetryResult {
    /// The task completed; carries its result.
    Success(TaskResult),
    /// The task was given up on; carries the last error message.
    ///
    /// `execute_with_retry` returns this both when the retry budget is
    /// exhausted and when it encounters a non-recoverable error.
    MaxRetriesReached(String),
    /// The context length was exceeded; the current turn should end.
    ContextLengthExceeded(String),
    /// A panic was detected; execution should terminate.
    Panic(anyhow::Error),
}

impl RetryResult {
    /// Returns `true` for `Success` and `false` for every other variant.
    pub fn is_success(&self) -> bool {
        match self {
            RetryResult::Success(_) => true,
            RetryResult::MaxRetriesReached(_)
            | RetryResult::ContextLengthExceeded(_)
            | RetryResult::Panic(_) => false,
        }
    }

    /// Consumes the value, yielding the task result for `Success` and `None`
    /// for every other variant.
    pub fn into_result(self) -> Option<TaskResult> {
        if let RetryResult::Success(task_result) = self {
            Some(task_result)
        } else {
            None
        }
    }
}
/// Callback for handling context length exceeded errors.
///
/// A boxed, `Send` closure that can be invoked at most once (`FnOnce`). It is
/// handed a shared reference to the agent, a reference to the error, and a
/// `u32`, and returns nothing.
// NOTE(review): the meaning of the `u32` argument is not established in this
// file — presumably a retry or turn counter; confirm against the callers.
// NOTE(review): nothing in the visible part of this file uses this alias.
pub type ContextExceededCallback<W> = Box<dyn FnOnce(&Agent<W>, &anyhow::Error, u32) + Send>;
/// Run an agent task, retrying recoverable failures with a backoff delay.
///
/// Each failed attempt is classified with `classify_error` and handled in
/// this order:
/// 1. Context length exceeded: reported through `print_fn`, logged to the
///    agent's session together with context-window figures, and returned as
///    [`RetryResult::ContextLengthExceeded`] without retrying.
/// 2. Error text containing `"panic"`: reported and returned as
///    [`RetryResult::Panic`] without retrying.
/// 3. Non-recoverable error: reported and returned as
///    [`RetryResult::MaxRetriesReached`] without retrying.
/// 4. Any other recoverable error: the failure counter is incremented; once
///    it reaches `config.max_retries` the function returns
///    [`RetryResult::MaxRetriesReached`], otherwise it sleeps for the delay
///    given by `calculate_retry_delay` and tries again.
///
/// # Arguments
/// * `agent` - The agent that executes the task
/// * `prompt` - The task prompt, resent unchanged on every attempt
/// * `config` - Retry budget, backoff mode and role name used in messages
/// * `show_prompt` - Forwarded to the agent: whether to show the prompt
/// * `show_code` - Forwarded to the agent: whether to show code in output
/// * `discovery` - Optional discovery options, cloned for each attempt
/// * `print_fn` - Sink for human-readable status messages
///
/// # Returns
/// [`RetryResult::Success`] with the task result, or one of the failure
/// variants described above.
pub async fn execute_with_retry<W, F>(
    agent: &mut Agent<W>,
    prompt: &str,
    config: &RetryConfig,
    show_prompt: bool,
    show_code: bool,
    discovery: Option<DiscoveryOptions<'_>>,
    mut print_fn: F,
) -> RetryResult
where
    W: UiWriter + Clone + Send + Sync + 'static,
    F: FnMut(&str),
{
    let role = config.role_name.as_str();
    let limit = config.max_retries;
    let started = Instant::now();
    let mut failures: u32 = 0;

    loop {
        // The `None` / `false` / `true` positional arguments are fixed by this
        // wrapper; their meaning is defined by `execute_task_with_timing`.
        let outcome = agent
            .execute_task_with_timing(prompt, None, false, show_prompt, show_code, true, discovery.clone())
            .await;

        let err = match outcome {
            Ok(task_result) => {
                // Only worth logging when at least one retry was needed.
                if failures > 0 {
                    info!(
                        "{} task succeeded after {} retries (elapsed: {:?})",
                        role,
                        failures,
                        started.elapsed()
                    );
                }
                return RetryResult::Success(task_result);
            }
            Err(err) => err,
        };

        let classification = classify_error(&err);

        // Context overflow: resending the same prompt cannot help, so record
        // the details and hand control back to the caller.
        if let ErrorType::Recoverable(RecoverableError::ContextLengthExceeded) = classification {
            print_fn(&format!(
                "⚠️ Context length exceeded in {} turn: {}",
                role, err
            ));
            print_fn("📝 Logging error to session and ending current turn...");

            // Snapshot of the context window at the time of failure.
            let forensic_context = format!(
                "Role: {}\nContext tokens: {}\nTotal available: {}\nPercentage used: {:.1}%\nPrompt length: {} chars\nError occurred at: {}",
                role,
                agent.get_context_window().used_tokens,
                agent.get_context_window().total_tokens,
                agent.get_context_window().percentage_used(),
                prompt.len(),
                chrono::Utc::now().to_rfc3339()
            );
            agent.log_error_to_session(&err, "assistant", Some(forensic_context));
            return RetryResult::ContextLengthExceeded(err.to_string());
        }

        // Panics are recognised purely by the error text.
        if err.to_string().contains("panic") {
            print_fn(&format!("💥 {} panic detected: {}", role, err));
            return RetryResult::Panic(err);
        }

        // Anything that is not recoverable ends the loop immediately.
        let kind = match &classification {
            ErrorType::Recoverable(kind) => kind,
            ErrorType::NonRecoverable => {
                print_fn(&format!(
                    "❌ Non-recoverable error in {}: {}",
                    role, err
                ));
                return RetryResult::MaxRetriesReached(err.to_string());
            }
        };

        // Recoverable: spend one unit of the budget, then give up or back off.
        failures += 1;
        if failures >= limit {
            print_fn(&format!(
                "🔄 Max retries ({}) reached for {}",
                limit, role
            ));
            return RetryResult::MaxRetriesReached(err.to_string());
        }

        let delay = calculate_retry_delay(failures, config.is_autonomous);
        print_fn(&format!(
            "⚠️ {} error (attempt {}/{}): {:?} - {}",
            role, failures, limit, kind, err
        ));
        print_fn(&format!("🔄 Retrying {} in {:?}...", role, delay));
        warn!(
            "Recoverable error ({:?}) in {} (attempt {}/{}). Retrying in {:?}...",
            kind, role, failures, limit, delay
        );
        tokio::time::sleep(delay).await;
    }
}
/// Execute a simple async operation with retry (for non-agent tasks).
///
/// This is a simpler retry wrapper for operations like LLM API calls
/// that don't involve the full agent machinery.
///
/// `operation` is called to produce a fresh future for every attempt. Each
/// failure is classified with `classify_error`:
/// - a recoverable error increments the failure counter; once the counter
///   reaches `max_retries` the error is returned, otherwise the function
///   sleeps for the delay given by `calculate_retry_delay` and tries again;
/// - a non-recoverable error is returned immediately.
///
/// Because the counter is checked right after it is incremented,
/// `max_retries` is effectively the maximum number of attempts, and at least
/// one attempt is always made.
///
/// # Arguments
/// * `operation_name` - Label used in status and log messages
/// * `operation` - Factory producing the future to await on each attempt
/// * `max_retries` - Failure count at which the function gives up
/// * `is_autonomous` - Forwarded to `calculate_retry_delay`
/// * `print_fn` - Sink for human-readable status messages
///
/// # Errors
/// Returns the last error produced by `operation` when it is non-recoverable
/// or when the failure budget is exhausted.
pub async fn retry_operation<F, Fut, T, P>(
    operation_name: &str,
    mut operation: F,
    max_retries: u32,
    is_autonomous: bool,
    mut print_fn: P,
) -> Result<T>
where
    F: FnMut() -> Fut,
    Fut: std::future::Future<Output = Result<T>>,
    P: FnMut(&str),
{
    let mut retry_count = 0;
    loop {
        match operation().await {
            Ok(result) => {
                // Only worth logging when at least one retry was needed.
                if retry_count > 0 {
                    info!(
                        "Operation '{}' succeeded after {} retries",
                        operation_name, retry_count
                    );
                }
                return Ok(result);
            }
            Err(e) => {
                let error_type = classify_error(&e);
                match error_type {
                    ErrorType::Recoverable(ref recoverable_type) => {
                        retry_count += 1;
                        if retry_count >= max_retries {
                            // `retry_count` counts failed attempts here (the
                            // first try included), so report it as attempts.
                            let msg = format!(
                                "❌ Operation '{}' failed after {} attempts: {}",
                                operation_name, retry_count, e
                            );
                            print_fn(&msg);
                            return Err(e);
                        }
                        let delay = calculate_retry_delay(retry_count, is_autonomous);
                        // Debug-format the error kind in place rather than
                        // through a throwaway intermediate `String`.
                        let msg = format!(
                            "⚠️ {:?} error in '{}' (attempt {}/{}), retrying in {:?}...",
                            recoverable_type, operation_name, retry_count, max_retries, delay
                        );
                        print_fn(&msg);
                        tokio::time::sleep(delay).await;
                    }
                    ErrorType::NonRecoverable => {
                        let msg = format!(
                            "❌ Non-recoverable error in '{}': {}",
                            operation_name, e
                        );
                        print_fn(&msg);
                        return Err(e);
                    }
                }
            }
        }
    }
}
#[cfg(test)]
mod tests {
    //! Unit tests for the retry configuration presets and the `RetryResult`
    //! helpers. The async retry loops are not exercised here.
    use super::*;

    #[test]
    fn test_retry_config_defaults() {
        let config = RetryConfig::default();
        assert_eq!(config.max_retries, 3);
        assert!(!config.is_autonomous);
        assert_eq!(config.role_name, "agent");
    }

    #[test]
    fn test_retry_config_player() {
        let config = RetryConfig::player();
        assert_eq!(config.max_retries, 3);
        assert!(config.is_autonomous);
        assert_eq!(config.role_name, "player");
    }

    #[test]
    fn test_retry_config_coach() {
        let config = RetryConfig::coach();
        assert_eq!(config.max_retries, 3);
        assert!(config.is_autonomous);
        assert_eq!(config.role_name, "coach");
    }

    #[test]
    fn test_retry_config_planning() {
        // The planning preset takes its role name from the caller.
        let config = RetryConfig::planning("planner");
        assert_eq!(config.max_retries, 3);
        assert!(config.is_autonomous);
        assert_eq!(config.role_name, "planner");
    }

    #[test]
    fn test_retry_config_with_max_retries() {
        let config = RetryConfig::player().with_max_retries(5);
        assert_eq!(config.max_retries, 5);
        // The builder must not disturb the other fields.
        assert!(config.is_autonomous);
        assert_eq!(config.role_name, "player");
    }

    #[test]
    fn test_retry_result_is_success() {
        use crate::ContextWindow;
        let ctx = ContextWindow::new(1000);
        let result = RetryResult::Success(TaskResult::new("test".to_string(), ctx));
        assert!(result.is_success());

        // Every non-success variant must report failure.
        let failed = RetryResult::MaxRetriesReached("error".to_string());
        assert!(!failed.is_success());
        let overflow = RetryResult::ContextLengthExceeded("too long".to_string());
        assert!(!overflow.is_success());
        let panicked = RetryResult::Panic(anyhow::anyhow!("panic"));
        assert!(!panicked.is_success());
    }

    #[test]
    fn test_retry_result_into_result() {
        use crate::ContextWindow;
        let ctx = ContextWindow::new(1000);
        let result = RetryResult::Success(TaskResult::new("test".to_string(), ctx));
        assert!(result.into_result().is_some());

        // Failure variants carry no task result.
        assert!(RetryResult::MaxRetriesReached("error".to_string())
            .into_result()
            .is_none());
        assert!(RetryResult::ContextLengthExceeded("too long".to_string())
            .into_result()
            .is_none());
        assert!(RetryResult::Panic(anyhow::anyhow!("panic"))
            .into_result()
            .is_none());
    }
}