Extract a new g3_status module in g3-cli that provides consistent formatting for all 'g3:' prefixed system status messages. Key changes: - Add G3Status struct with methods for progress, done, failed, error, etc. - Add Status enum with Done, Failed, Error, Resolved, Insufficient, NoChanges - Add ThinResult struct in g3-core for semantic thinning data - Update UiWriter trait with print_thin_result() method - Refactor context thinning to return ThinResult instead of formatted strings - Update all callers to use the new centralized formatting - Session resume/decline messages now use G3Status - Compaction status messages now use G3Status This maintains clean separation of concerns: g3-core emits semantic data, g3-cli handles all terminal formatting and colors.
331 lines
11 KiB
Rust
331 lines
11 KiB
Rust
//! Context compaction logic.
//!
//! This module provides unified compaction functionality used by both
//! manual compaction (`/compact` command) and automatic compaction
//! (when context window reaches capacity during streaming).
|
|
|
|
use anyhow::Result;
|
|
use g3_providers::{CompletionRequest, Message, MessageRole, ProviderRegistry};
|
|
use tracing::{debug, error, warn};
|
|
|
|
use crate::context_window::ContextWindow;
|
|
use crate::provider_config;
|
|
use crate::ui_writer::UiWriter;
|
|
|
|
/// Lower bound on `max_tokens` for summary requests.
///
/// Keeping a floor avoids API errors when the context is nearly full and the
/// computed budget would otherwise be too small.
pub const SUMMARY_MIN_TOKENS: u32 = 1_000;
|
|
|
|
/// Outcome of a single compaction attempt.
#[derive(Debug)]
pub struct CompactionResult {
    /// `true` when the compaction completed, `false` when it failed.
    pub success: bool,
    /// Number of characters reclaimed; `failure` sets this to `0`.
    pub chars_saved: usize,
    /// Description of what went wrong; `success` sets this to `None`.
    pub error: Option<String>,
}

impl CompactionResult {
    /// Builds the result for a compaction that reclaimed `chars_saved` characters.
    pub fn success(chars_saved: usize) -> Self {
        let error = None;
        CompactionResult {
            success: true,
            chars_saved,
            error,
        }
    }

    /// Builds the result for a compaction that failed with the message `error`.
    pub fn failure(error: String) -> Self {
        let error = Some(error);
        CompactionResult {
            success: false,
            chars_saved: 0,
            error,
        }
    }
}
|
|
|
|
/// Per-call settings for one compaction run.
#[derive(Clone, Debug)]
pub struct CompactionConfig<'a> {
    /// Name of the provider the summary is sized for (e.g., "anthropic", "openai").
    pub provider_name: &'a str,
    /// Most recent user message, to be preserved after compaction (if any).
    pub latest_user_msg: Option<String>,
}
|
|
|
|
/// Calculate the summary max_tokens with provider-specific caps applied.
|
|
///
|
|
/// This is the canonical implementation - both manual and auto compaction use this.
|
|
pub fn calculate_capped_summary_tokens(
|
|
config: &g3_config::Config,
|
|
provider_name: &str,
|
|
base_max_tokens: u32,
|
|
) -> u32 {
|
|
// Apply provider-specific caps
|
|
// For Anthropic with thinking enabled, we need max_tokens > thinking.budget_tokens
|
|
// So we set a higher cap when thinking is configured
|
|
let anthropic_cap = match provider_config::get_thinking_budget_tokens(config, provider_name) {
|
|
Some(budget) => (budget + 2000).max(10_000), // At least budget + 2000 for response
|
|
None => 10_000,
|
|
};
|
|
|
|
let capped = match provider_name {
|
|
name if name.starts_with("anthropic") => base_max_tokens.min(anthropic_cap),
|
|
name if name.starts_with("databricks") => base_max_tokens.min(10_000),
|
|
name if name.starts_with("embedded") => base_max_tokens.min(3000),
|
|
_ => base_max_tokens.min(5000),
|
|
};
|
|
|
|
// Ensure minimum floor as defense-in-depth
|
|
capped.max(SUMMARY_MIN_TOKENS)
|
|
}
|
|
|
|
/// Check if thinking mode should be disabled for a summary request.
|
|
///
|
|
/// Anthropic requires: max_tokens > thinking.budget_tokens + 1024
|
|
pub fn should_disable_thinking(
|
|
config: &g3_config::Config,
|
|
provider_name: &str,
|
|
summary_max_tokens: u32,
|
|
) -> bool {
|
|
provider_config::get_thinking_budget_tokens(config, provider_name).map_or(false, |budget| {
|
|
let minimum_for_thinking = budget + 1024;
|
|
let should_disable = summary_max_tokens <= minimum_for_thinking;
|
|
if should_disable {
|
|
warn!(
|
|
"Disabling thinking mode for summary: max_tokens ({}) <= minimum_for_thinking ({})",
|
|
summary_max_tokens, minimum_for_thinking
|
|
);
|
|
}
|
|
should_disable
|
|
})
|
|
}
|
|
|
|
/// Build the summary request messages from conversation history.
|
|
pub fn build_summary_messages(context_window: &ContextWindow) -> Vec<Message> {
|
|
let summary_prompt = context_window.create_summary_prompt();
|
|
|
|
let conversation_text = context_window
|
|
.conversation_history
|
|
.iter()
|
|
.map(|m| format!("{:?}: {}", m.role, m.content))
|
|
.collect::<Vec<_>>()
|
|
.join("\n\n");
|
|
|
|
vec![
|
|
Message::new(
|
|
MessageRole::System,
|
|
"You are a helpful assistant that creates concise summaries.".to_string(),
|
|
),
|
|
Message::new(
|
|
MessageRole::User,
|
|
format!(
|
|
"Based on this conversation history, {}\n\nConversation:\n{}",
|
|
summary_prompt, conversation_text
|
|
),
|
|
),
|
|
]
|
|
}
|
|
|
|
/// Perform context compaction by summarizing conversation history.
|
|
///
|
|
/// This is the unified implementation used by both:
|
|
/// - `force_compact()` - manual compaction via `/compact` command
|
|
/// - `stream_completion_with_tools()` - automatic compaction when context is full
|
|
///
|
|
/// # Arguments
|
|
/// * `providers` - Provider registry to get the LLM provider
|
|
/// * `context_window` - Context window to compact
|
|
/// * `config` - Application config for provider settings
|
|
/// * `compaction_config` - Configuration for this compaction operation
|
|
/// * `ui_writer` - UI writer for status messages
|
|
///
|
|
/// # Returns
|
|
/// `CompactionResult` indicating success/failure and chars saved
|
|
pub async fn perform_compaction<W: UiWriter>(
|
|
providers: &ProviderRegistry,
|
|
context_window: &mut ContextWindow,
|
|
config: &g3_config::Config,
|
|
compaction_config: CompactionConfig<'_>,
|
|
ui_writer: &W,
|
|
thinning_events: &mut Vec<usize>,
|
|
) -> Result<CompactionResult> {
|
|
let provider_name = compaction_config.provider_name;
|
|
|
|
// Apply fallback sequence: thinnify -> skinnify -> hard-coded 5000
|
|
let base_max_tokens = apply_summary_fallback_sequence(
|
|
context_window,
|
|
config,
|
|
provider_name,
|
|
ui_writer,
|
|
thinning_events,
|
|
);
|
|
|
|
let summary_max_tokens = calculate_capped_summary_tokens(config, provider_name, base_max_tokens);
|
|
|
|
debug!(
|
|
"Requesting summary with max_tokens: {} (current usage: {} tokens)",
|
|
summary_max_tokens, context_window.used_tokens
|
|
);
|
|
|
|
// Build summary request
|
|
let summary_messages = build_summary_messages(context_window);
|
|
let provider = providers.get(None)?;
|
|
|
|
let disable_thinking = should_disable_thinking(config, provider.name(), summary_max_tokens);
|
|
|
|
debug!(
|
|
"Creating summary request: max_tokens={}, disable_thinking={}",
|
|
summary_max_tokens, disable_thinking
|
|
);
|
|
|
|
let summary_request = CompletionRequest {
|
|
messages: summary_messages,
|
|
max_tokens: Some(summary_max_tokens),
|
|
temperature: Some(provider_config::resolve_temperature(config, provider.name())),
|
|
stream: false,
|
|
tools: None,
|
|
disable_thinking,
|
|
};
|
|
|
|
// Execute summary request
|
|
match provider.complete(summary_request).await {
|
|
Ok(summary_response) => {
|
|
// Note: ACD dehydration now happens at the end of each turn in Agent::dehydrate_context()
|
|
// Compaction just does lossy summarization of the existing stubs + summaries
|
|
let chars_saved = context_window.reset_with_summary(
|
|
summary_response.content,
|
|
compaction_config.latest_user_msg,
|
|
);
|
|
Ok(CompactionResult::success(chars_saved))
|
|
}
|
|
Err(e) => {
|
|
error!("Failed to create summary: {}", e);
|
|
Ok(CompactionResult::failure(e.to_string()))
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Apply the fallback sequence for summary requests to free up context space.
|
|
///
|
|
/// Sequence: thinnify -> skinnify -> hard-coded minimum
|
|
fn apply_summary_fallback_sequence<W: UiWriter>(
|
|
context_window: &mut ContextWindow,
|
|
config: &g3_config::Config,
|
|
provider_name: &str,
|
|
ui_writer: &W,
|
|
thinning_events: &mut Vec<usize>,
|
|
) -> u32 {
|
|
// Initial validation
|
|
let (mut max_tokens, needs_reduction) = provider_config::calculate_summary_max_tokens(
|
|
config,
|
|
provider_name,
|
|
context_window.total_tokens,
|
|
context_window.used_tokens,
|
|
);
|
|
|
|
if !needs_reduction {
|
|
return max_tokens;
|
|
}
|
|
|
|
ui_writer.print_context_status(
|
|
"⚠️ Context window too full for thinking budget. Applying fallback sequence...\n",
|
|
);
|
|
|
|
// Step 1: Try thinnify (first third of context)
|
|
ui_writer.print_context_status("🥒 Step 1: Trying thinnify...\n");
|
|
let thin_result = context_window.thin_context(None);
|
|
thinning_events.push(thin_result.chars_saved);
|
|
ui_writer.print_thin_result(&thin_result);
|
|
|
|
// Recalculate after thinnify
|
|
let (new_max, still_needs_reduction) = provider_config::calculate_summary_max_tokens(
|
|
config,
|
|
provider_name,
|
|
context_window.total_tokens,
|
|
context_window.used_tokens,
|
|
);
|
|
max_tokens = new_max;
|
|
if !still_needs_reduction {
|
|
ui_writer.print_context_status("✅ Thinnify resolved capacity issue. Continuing...\n");
|
|
return max_tokens;
|
|
}
|
|
|
|
// Step 2: Try skinnify (entire context)
|
|
ui_writer.print_context_status("🦴 Step 2: Trying skinnify...\n");
|
|
let skinny_result = context_window.thin_context_all(None);
|
|
thinning_events.push(skinny_result.chars_saved);
|
|
ui_writer.print_thin_result(&skinny_result);
|
|
|
|
// Recalculate after skinnify
|
|
let (final_max, final_needs_reduction) = provider_config::calculate_summary_max_tokens(
|
|
config,
|
|
provider_name,
|
|
context_window.total_tokens,
|
|
context_window.used_tokens,
|
|
);
|
|
if !final_needs_reduction {
|
|
ui_writer.print_context_status("✅ Skinnify resolved capacity issue. Continuing...\n");
|
|
return final_max;
|
|
}
|
|
|
|
// Step 3: Nothing worked, use hard-coded minimum
|
|
const HARD_CODED_MINIMUM: u32 = 5000;
|
|
ui_writer.print_context_status(&format!(
|
|
"⚠️ Step 3: Context reduction insufficient. Using hard-coded max_tokens={} as last resort...\n",
|
|
HARD_CODED_MINIMUM
|
|
));
|
|
HARD_CODED_MINIMUM
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Caps `requested` for `provider` using the default application config.
    fn capped_with_defaults(provider: &str, requested: u32) -> u32 {
        let config = g3_config::Config::default();
        calculate_capped_summary_tokens(&config, provider, requested)
    }

    #[test]
    fn test_calculate_capped_summary_tokens_anthropic() {
        // Without thinking budget, should cap at 10_000
        assert_eq!(capped_with_defaults("anthropic", 20_000), 10_000);
    }

    #[test]
    fn test_calculate_capped_summary_tokens_databricks() {
        assert_eq!(capped_with_defaults("databricks", 20_000), 10_000);
    }

    #[test]
    fn test_calculate_capped_summary_tokens_embedded() {
        assert_eq!(capped_with_defaults("embedded", 20_000), 3000);
    }

    #[test]
    fn test_calculate_capped_summary_tokens_minimum_floor() {
        // Even with very low input, should return at least SUMMARY_MIN_TOKENS
        assert_eq!(capped_with_defaults("embedded", 100), SUMMARY_MIN_TOKENS);
    }

    #[test]
    fn test_compaction_result_success() {
        let outcome = CompactionResult::success(5000);
        assert!(outcome.success);
        assert_eq!(outcome.chars_saved, 5000);
        assert!(outcome.error.is_none());
    }

    #[test]
    fn test_compaction_result_failure() {
        let outcome = CompactionResult::failure("test error".to_string());
        assert!(!outcome.success);
        assert_eq!(outcome.chars_saved, 0);
        assert_eq!(outcome.error.as_deref(), Some("test error"));
    }
}
|