Files
g3/crates/g3-core/src/error_handling.rs
Dhanji R. Prasanna c2aa80647a Remove legacy logs/ directory, consolidate all data under .g3/
This change removes the legacy logs/ directory and consolidates all
session data, error logs, and discovery files under the .g3/ directory.

New directory structure:
- .g3/sessions/<session_id>/session.json - session logs
- .g3/errors/ - error logs (was logs/errors/)
- .g3/background_processes/ - background process logs
- .g3/discovery/ - planner discovery files (was workspace/logs/)

Changes:
- paths.rs: Remove get_logs_dir()/logs_dir(), add get_errors_dir(),
  get_background_processes_dir(), get_discovery_dir()
- session.rs: Anonymous sessions now use .g3/sessions/anonymous_<ts>/
- error_handling.rs: Errors now saved to .g3/errors/
- project.rs: Remove logs_dir() and ensure_logs_dir() methods
- feedback_extraction.rs: Remove logs_dir field and fallback logic
- planner: Use .g3/ for workspace data and .g3/discovery/ for reports
- flock.rs: Look for session metrics in .g3/sessions/
- coach_feedback.rs: Remove fallback to logs/ path
- Update all tests to use new paths
- Update README.md and .gitignore
2026-01-12 18:20:08 +05:30

567 lines
19 KiB
Rust

//! Error handling module for G3 with retry logic and detailed logging
//!
//! This module provides:
//! - Classification of errors as recoverable or non-recoverable
//! - Retry logic with exponential backoff and jitter for recoverable errors
//! - Detailed error logging with context information
//! - Request/response capture for debugging
use anyhow::Result;
use serde::{Deserialize, Serialize};
use std::time::Duration;
use tracing::{debug, error, warn};
/// Base delay for exponential backoff (in milliseconds)
const BASE_RETRY_DELAY_MS: u64 = 1000;
/// Maximum delay between retries (in milliseconds) for default mode
const DEFAULT_MAX_RETRY_DELAY_MS: u64 = 10000;
/// Maximum delay between retries (in milliseconds) for autonomous mode
/// Spread over 10 minutes (600 seconds) with 6 retries
const AUTONOMOUS_MAX_RETRY_DELAY_MS: u64 = 120000; // 2 minutes max per retry
// Removed unused constants AUTONOMOUS_RETRY_BUDGET_MS and DEFAULT_JITTER_FACTOR
/// Jitter factor for autonomous mode (higher for better distribution)
const JITTER_FACTOR: f64 = 0.3;
/// Error context information for detailed logging
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ErrorContext {
/// The operation that was being performed
pub operation: String,
/// The provider being used
pub provider: String,
/// The model being used
pub model: String,
/// The last prompt sent (truncated for logging)
pub last_prompt: String,
/// Raw request data (if available)
pub raw_request: Option<String>,
/// Raw response data (if available)
pub raw_response: Option<String>,
/// Stack trace
pub stack_trace: String,
/// Timestamp
pub timestamp: u64,
/// Number of tokens in context
pub context_tokens: u32,
/// Session ID if available
pub session_id: Option<String>,
/// Whether to skip file logging (quiet mode)
pub quiet: bool,
}
impl ErrorContext {
pub fn new(
operation: String,
provider: String,
model: String,
last_prompt: String,
session_id: Option<String>,
context_tokens: u32,
quiet: bool,
) -> Self {
let timestamp = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap_or_default()
.as_secs();
// Capture stack trace
let stack_trace = std::backtrace::Backtrace::force_capture().to_string();
Self {
operation,
provider,
model,
last_prompt: truncate_for_logging(&last_prompt, 1000),
raw_request: None,
raw_response: None,
stack_trace,
timestamp,
context_tokens,
session_id,
quiet,
}
}
pub fn with_request(mut self, request: String) -> Self {
self.raw_request = Some(truncate_for_logging(&request, 5000));
self
}
pub fn with_response(mut self, response: String) -> Self {
self.raw_response = Some(truncate_for_logging(&response, 5000));
self
}
/// Log the error context with ERROR level
pub fn log_error(&self, error: &anyhow::Error) {
error!("=== G3 ERROR DETAILS ===");
error!("Operation: {}", self.operation);
error!("Provider: {} | Model: {}", self.provider, self.model);
error!("Error: {}", error);
error!("Timestamp: {}", self.timestamp);
error!("Session ID: {:?}", self.session_id);
error!("Context Tokens: {}", self.context_tokens);
error!("Last Prompt: {}", self.last_prompt);
if let Some(ref req) = self.raw_request {
error!("Raw Request: {}", req);
}
if let Some(ref resp) = self.raw_response {
error!("Raw Response: {}", resp);
}
error!("Stack Trace:\n{}", self.stack_trace);
error!("=== END ERROR DETAILS ===");
// Also save to error log file
self.save_to_file();
}
/// Save error context to a file for later analysis
fn save_to_file(&self) {
// Skip file logging if quiet mode is enabled
if self.quiet {
return;
}
let logs_dir = crate::paths::get_errors_dir();
if !logs_dir.exists() {
if let Err(e) = std::fs::create_dir_all(&logs_dir) {
error!("Failed to create error logs directory: {}", e);
return;
}
}
let filename = logs_dir.join(format!(
"error_{}_{}.json",
self.timestamp,
self.session_id.as_deref().unwrap_or("unknown")
));
match serde_json::to_string_pretty(self) {
Ok(json_content) => {
if let Err(e) = std::fs::write(&filename, json_content) {
error!("Failed to save error context to {:?}: {}", &filename, e);
} else {
debug!("Error details saved to: {:?}", &filename);
}
}
Err(e) => {
error!("Failed to serialize error context: {}", e);
}
}
}
}
/// Classification of error types
#[derive(Debug, Clone, PartialEq)]
pub enum ErrorType {
/// Recoverable errors that should be retried
Recoverable(RecoverableError),
/// Non-recoverable errors that should terminate execution
NonRecoverable,
}
/// Types of recoverable errors
#[derive(Debug, Clone, PartialEq)]
pub enum RecoverableError {
/// Rate limit exceeded
RateLimit,
/// Temporary network error
NetworkError,
/// Server error (5xx)
ServerError,
/// Model is busy/overloaded
ModelBusy,
/// Timeout
Timeout,
/// Token limit exceeded (might be recoverable with compaction)
TokenLimit,
/// Context length exceeded (prompt too long) - should end current turn in autonomous mode
ContextLengthExceeded,
}
/// Classify an error as recoverable or non-recoverable
pub fn classify_error(error: &anyhow::Error) -> ErrorType {
let error_str = error.to_string().to_lowercase();
// Check for recoverable error patterns
if error_str.contains("rate limit")
|| error_str.contains("rate_limit")
|| error_str.contains("429")
{
return ErrorType::Recoverable(RecoverableError::RateLimit);
}
if error_str.contains("network")
|| error_str.contains("connection")
|| error_str.contains("dns")
|| error_str.contains("refused")
{
return ErrorType::Recoverable(RecoverableError::NetworkError);
}
if error_str.contains("500")
|| error_str.contains("502")
|| error_str.contains("503")
|| error_str.contains("504")
|| error_str.contains("server error")
|| error_str.contains("internal error")
{
return ErrorType::Recoverable(RecoverableError::ServerError);
}
if error_str.contains("busy")
|| error_str.contains("overloaded")
|| error_str.contains("capacity")
|| error_str.contains("unavailable")
{
return ErrorType::Recoverable(RecoverableError::ModelBusy);
}
// Enhanced timeout detection - check for various timeout patterns
if error_str.contains("timeout") ||
error_str.contains("timed out") ||
error_str.contains("operation timed out") ||
error_str.contains("request or response body error") || // Common timeout pattern
error_str.contains("stream error") && error_str.contains("timed out")
{
return ErrorType::Recoverable(RecoverableError::Timeout);
}
// Check for context length exceeded errors (HTTP 400 with specific messages)
if (error_str.contains("400") || error_str.contains("bad request"))
&& (error_str.contains("context length")
|| error_str.contains("prompt is too long")
|| error_str.contains("maximum context length")
|| error_str.contains("context_length_exceeded"))
{
return ErrorType::Recoverable(RecoverableError::ContextLengthExceeded);
}
if error_str.contains("token")
&& (error_str.contains("limit") || error_str.contains("exceeded"))
{
return ErrorType::Recoverable(RecoverableError::TokenLimit);
}
// Default to non-recoverable for unknown errors
ErrorType::NonRecoverable
}
/// Calculate retry delay for autonomous mode with better distribution over 10 minutes
fn calculate_autonomous_retry_delay(attempt: u32) -> Duration {
use rand::Rng;
let mut rng = rand::thread_rng();
// Distribute 6 retries over 10 minutes (600 seconds)
// Base delays: 10s, 30s, 60s, 120s, 180s, 200s = 600s total
let base_delays_ms = [10000, 30000, 60000, 120000, 180000, 200000];
let base_delay = base_delays_ms
.get(attempt.saturating_sub(1) as usize)
.unwrap_or(&200000);
// Add jitter of ±30% to prevent thundering herd
let jitter = (*base_delay as f64 * 0.3 * rng.gen::<f64>()) as u64;
let final_delay = if rng.gen_bool(0.5) {
base_delay + jitter
} else {
base_delay.saturating_sub(jitter)
};
Duration::from_millis(final_delay)
}
/// Calculate retry delay with exponential backoff and jitter
pub fn calculate_retry_delay(attempt: u32, is_autonomous: bool) -> Duration {
if is_autonomous {
return calculate_autonomous_retry_delay(attempt);
}
use rand::Rng;
let max_retry_delay_ms = if is_autonomous {
AUTONOMOUS_MAX_RETRY_DELAY_MS
} else {
DEFAULT_MAX_RETRY_DELAY_MS
};
// Exponential backoff: delay = base * 2^attempt
let base_delay = BASE_RETRY_DELAY_MS * (2_u64.pow(attempt.saturating_sub(1)));
let capped_delay = base_delay.min(max_retry_delay_ms);
// Add jitter to prevent thundering herd
let mut rng = rand::thread_rng();
let jitter = (capped_delay as f64 * JITTER_FACTOR * rng.gen::<f64>()) as u64;
let final_delay = if rng.gen_bool(0.5) {
capped_delay + jitter
} else {
capped_delay.saturating_sub(jitter)
};
Duration::from_millis(final_delay)
}
/// Retry logic for async operations
pub async fn retry_with_backoff<F, Fut, T>(
operation_name: &str,
mut operation: F,
context: &ErrorContext,
is_autonomous: bool,
max_attempts: u32,
) -> Result<T>
where
F: FnMut() -> Fut,
Fut: std::future::Future<Output = Result<T>>,
{
let mut attempt = 0;
let mut _last_error = None;
loop {
attempt += 1;
match operation().await {
Ok(result) => {
if attempt > 1 {
debug!(
"Operation '{}' succeeded after {} attempts",
operation_name, attempt
);
}
return Ok(result);
}
Err(error) => {
let error_type = classify_error(&error);
match error_type {
ErrorType::Recoverable(recoverable_type) => {
if attempt >= max_attempts {
error!(
"Operation '{}' failed after {} attempts. Giving up.",
operation_name, attempt
);
context.clone().log_error(&error);
return Err(error);
}
let delay = calculate_retry_delay(attempt, is_autonomous);
warn!(
"Recoverable error ({:?}) in '{}' (attempt {}/{}). Retrying in {:?}...",
recoverable_type, operation_name, attempt, max_attempts, delay
);
warn!("Error details: {}", error);
// Special handling for token limit errors
if matches!(recoverable_type, RecoverableError::TokenLimit) {
debug!("Token limit error detected. Consider triggering compaction.");
}
tokio::time::sleep(delay).await;
_last_error = Some(error);
}
ErrorType::NonRecoverable => {
error!(
"Non-recoverable error in '{}' (attempt {}). Terminating.",
operation_name, attempt
);
context.clone().log_error(&error);
return Err(error);
}
}
}
}
}
}
/// Helper function to truncate strings for logging
fn truncate_for_logging(s: &str, max_len: usize) -> String {
if s.len() <= max_len {
s.to_string()
} else {
// Find a safe UTF-8 boundary to truncate at
// We need to ensure we don't cut in the middle of a multi-byte character
let mut truncate_at = max_len;
// Walk backwards from max_len to find a character boundary
while truncate_at > 0 && !s.is_char_boundary(truncate_at) {
truncate_at -= 1;
}
// If we couldn't find a boundary (shouldn't happen), use a safe default
if truncate_at == 0 {
truncate_at = max_len.min(s.len());
}
format!(
"{}... (truncated, {} total bytes)",
&s[..truncate_at],
s.len()
)
}
}
/// Macro for creating error context easily
#[macro_export]
macro_rules! error_context {
($operation:expr, $provider:expr, $model:expr, $prompt:expr, $session_id:expr, $tokens:expr) => {
$crate::error_handling::ErrorContext::new(
$operation.to_string(),
$provider.to_string(),
$model.to_string(),
$prompt.to_string(),
$session_id,
$tokens,
)
};
}
#[cfg(test)]
mod tests {
use super::*;
use anyhow::anyhow;
#[test]
fn test_error_classification() {
// Rate limit errors
let error = anyhow!("Rate limit exceeded");
assert_eq!(
classify_error(&error),
ErrorType::Recoverable(RecoverableError::RateLimit)
);
let error = anyhow!("HTTP 429 Too Many Requests");
assert_eq!(
classify_error(&error),
ErrorType::Recoverable(RecoverableError::RateLimit)
);
// Network errors
let error = anyhow!("Network connection failed");
assert_eq!(
classify_error(&error),
ErrorType::Recoverable(RecoverableError::NetworkError)
);
// Server errors
let error = anyhow!("HTTP 503 Service Unavailable");
assert_eq!(
classify_error(&error),
ErrorType::Recoverable(RecoverableError::ServerError)
);
// Model busy
let error = anyhow!("Model is busy, please try again");
assert_eq!(
classify_error(&error),
ErrorType::Recoverable(RecoverableError::ModelBusy)
);
// Timeout
let error = anyhow!("Request timed out");
assert_eq!(
classify_error(&error),
ErrorType::Recoverable(RecoverableError::Timeout)
);
// Token limit
let error = anyhow!("Token limit exceeded");
assert_eq!(
classify_error(&error),
ErrorType::Recoverable(RecoverableError::TokenLimit)
);
// Context length exceeded
let error = anyhow!("HTTP 400 Bad Request: context length exceeded");
assert_eq!(
classify_error(&error),
ErrorType::Recoverable(RecoverableError::ContextLengthExceeded)
);
let error = anyhow!("Error 400: prompt is too long");
assert_eq!(
classify_error(&error),
ErrorType::Recoverable(RecoverableError::ContextLengthExceeded)
);
// Non-recoverable
let error = anyhow!("Invalid API key");
assert_eq!(classify_error(&error), ErrorType::NonRecoverable);
let error = anyhow!("Malformed request");
assert_eq!(classify_error(&error), ErrorType::NonRecoverable);
}
#[test]
fn test_retry_delay_calculation() {
// Test that delays increase exponentially
let delay1 = calculate_retry_delay(1, false);
let delay2 = calculate_retry_delay(2, false);
let delay3 = calculate_retry_delay(3, false);
// Due to jitter, we can't test exact values, but the base should increase
assert!(delay1.as_millis() >= (BASE_RETRY_DELAY_MS as f64 * 0.7) as u128);
assert!(delay1.as_millis() <= (BASE_RETRY_DELAY_MS as f64 * 1.3) as u128);
// Delay 2 should be roughly 2x delay 1 (minus jitter)
assert!(delay2.as_millis() >= delay1.as_millis());
// Delay 3 should be roughly 2x delay 2 (minus jitter)
assert!(delay3.as_millis() >= delay2.as_millis());
// Test max cap
let delay_max = calculate_retry_delay(10, false);
assert!(delay_max.as_millis() <= (DEFAULT_MAX_RETRY_DELAY_MS as f64 * 1.3) as u128);
}
#[test]
fn test_autonomous_retry_delay_calculation() {
// Test autonomous mode delays are distributed over 10 minutes
let delay1 = calculate_retry_delay(1, true);
let delay2 = calculate_retry_delay(2, true);
let delay3 = calculate_retry_delay(3, true);
let delay4 = calculate_retry_delay(4, true);
let delay5 = calculate_retry_delay(5, true);
let delay6 = calculate_retry_delay(6, true);
// Base delays should be around: 10s, 30s, 60s, 120s, 180s, 200s
// With ±30% jitter
assert!(delay1.as_millis() >= 7000 && delay1.as_millis() <= 13000);
assert!(delay2.as_millis() >= 21000 && delay2.as_millis() <= 39000);
assert!(delay3.as_millis() >= 42000 && delay3.as_millis() <= 78000);
assert!(delay4.as_millis() >= 84000 && delay4.as_millis() <= 156000);
assert!(delay5.as_millis() >= 126000 && delay5.as_millis() <= 234000);
assert!(delay6.as_millis() >= 140000 && delay6.as_millis() <= 260000);
}
#[test]
fn test_truncate_for_logging() {
let short_text = "Hello, world!";
assert_eq!(truncate_for_logging(short_text, 20), "Hello, world!");
let long_text = "This is a very long text that should be truncated for logging purposes";
let truncated = truncate_for_logging(long_text, 20);
assert!(truncated.starts_with("This is a very long "));
assert!(truncated.contains("truncated"));
assert!(truncated.contains("total bytes"));
}
#[test]
fn test_truncate_with_multibyte_chars() {
// Test with multi-byte UTF-8 characters
let text_with_emoji = "Hello 👋 World 🌍 Test ✨ More text here";
let truncated = truncate_for_logging(text_with_emoji, 10);
// Should truncate at a valid UTF-8 boundary
assert!(truncated.starts_with("Hello "));
// Test with box-drawing characters like the one causing the panic
let text_with_box = "Some text ┌─────┐ more text";
let truncated = truncate_for_logging(text_with_box, 12);
// Should not panic and should truncate at a valid boundary
assert!(truncated.contains("Some text"));
assert!(truncated.contains("truncated"));
}
}