Add status line showing loaded context in agent mode
Shows checkmarks for README, AGENTS.md, and Memory if loaded, or dots if not found. Displayed below the working directory line.
This commit is contained in:
328
crates/g3-core/src/compaction.rs
Normal file
328
crates/g3-core/src/compaction.rs
Normal file
@@ -0,0 +1,328 @@
|
||||
//! Context compaction logic.
|
||||
//!
|
||||
//! This module provides unified compaction functionality used by both
|
||||
//! manual compaction (`/compact` command) and automatic compaction
|
||||
//! (when context window reaches capacity during streaming).
|
||||
|
||||
use anyhow::Result;
|
||||
use g3_providers::{CompletionRequest, Message, MessageRole, ProviderRegistry};
|
||||
use tracing::{debug, error, warn};
|
||||
|
||||
use crate::context_window::ContextWindow;
|
||||
use crate::provider_config;
|
||||
use crate::ui_writer::UiWriter;
|
||||
|
||||
/// Lower bound on `max_tokens` for summary requests.
///
/// Exists to avoid API errors when the context window is nearly full;
/// `calculate_capped_summary_tokens` never returns less than this value.
pub const SUMMARY_MIN_TOKENS: u32 = 1_000;
|
||||
|
||||
/// Outcome of a single compaction attempt.
#[derive(Debug)]
pub struct CompactionResult {
    /// `true` when the compaction completed.
    pub success: bool,
    /// Number of characters reclaimed by the compaction (`failure` sets this to 0).
    pub chars_saved: usize,
    /// Description of what went wrong (`success` sets this to `None`).
    pub error: Option<String>,
}

impl CompactionResult {
    /// Builds the result for a compaction that succeeded and freed `chars_saved` characters.
    pub fn success(chars_saved: usize) -> Self {
        CompactionResult {
            error: None,
            chars_saved,
            success: true,
        }
    }

    /// Builds the result for a compaction that failed with the message `error`.
    pub fn failure(error: String) -> Self {
        let reason = Some(error);
        CompactionResult {
            error: reason,
            chars_saved: 0,
            success: false,
        }
    }
}
|
||||
|
||||
/// Per-call settings for one compaction run.
#[derive(Clone, Debug)]
pub struct CompactionConfig<'a> {
    /// Name of the provider, for example "anthropic" or "openai".
    pub provider_name: &'a str,
    /// Most recent user message, if any, to keep once the history has been compacted.
    pub latest_user_msg: Option<String>,
}
|
||||
|
||||
/// Calculate the summary max_tokens with provider-specific caps applied.
|
||||
///
|
||||
/// This is the canonical implementation - both manual and auto compaction use this.
|
||||
pub fn calculate_capped_summary_tokens(
|
||||
config: &g3_config::Config,
|
||||
provider_name: &str,
|
||||
base_max_tokens: u32,
|
||||
) -> u32 {
|
||||
// Apply provider-specific caps
|
||||
// For Anthropic with thinking enabled, we need max_tokens > thinking.budget_tokens
|
||||
// So we set a higher cap when thinking is configured
|
||||
let anthropic_cap = match provider_config::get_thinking_budget_tokens(config, provider_name) {
|
||||
Some(budget) => (budget + 2000).max(10_000), // At least budget + 2000 for response
|
||||
None => 10_000,
|
||||
};
|
||||
|
||||
let capped = match provider_name {
|
||||
name if name.starts_with("anthropic") => base_max_tokens.min(anthropic_cap),
|
||||
name if name.starts_with("databricks") => base_max_tokens.min(10_000),
|
||||
name if name.starts_with("embedded") => base_max_tokens.min(3000),
|
||||
_ => base_max_tokens.min(5000),
|
||||
};
|
||||
|
||||
// Ensure minimum floor as defense-in-depth
|
||||
capped.max(SUMMARY_MIN_TOKENS)
|
||||
}
|
||||
|
||||
/// Check if thinking mode should be disabled for a summary request.
|
||||
///
|
||||
/// Anthropic requires: max_tokens > thinking.budget_tokens + 1024
|
||||
pub fn should_disable_thinking(
|
||||
config: &g3_config::Config,
|
||||
provider_name: &str,
|
||||
summary_max_tokens: u32,
|
||||
) -> bool {
|
||||
provider_config::get_thinking_budget_tokens(config, provider_name).map_or(false, |budget| {
|
||||
let minimum_for_thinking = budget + 1024;
|
||||
let should_disable = summary_max_tokens <= minimum_for_thinking;
|
||||
if should_disable {
|
||||
warn!(
|
||||
"Disabling thinking mode for summary: max_tokens ({}) <= minimum_for_thinking ({})",
|
||||
summary_max_tokens, minimum_for_thinking
|
||||
);
|
||||
}
|
||||
should_disable
|
||||
})
|
||||
}
|
||||
|
||||
/// Build the summary request messages from conversation history.
|
||||
pub fn build_summary_messages(context_window: &ContextWindow) -> Vec<Message> {
|
||||
let summary_prompt = context_window.create_summary_prompt();
|
||||
|
||||
let conversation_text = context_window
|
||||
.conversation_history
|
||||
.iter()
|
||||
.map(|m| format!("{:?}: {}", m.role, m.content))
|
||||
.collect::<Vec<_>>()
|
||||
.join("\n\n");
|
||||
|
||||
vec![
|
||||
Message::new(
|
||||
MessageRole::System,
|
||||
"You are a helpful assistant that creates concise summaries.".to_string(),
|
||||
),
|
||||
Message::new(
|
||||
MessageRole::User,
|
||||
format!(
|
||||
"Based on this conversation history, {}\n\nConversation:\n{}",
|
||||
summary_prompt, conversation_text
|
||||
),
|
||||
),
|
||||
]
|
||||
}
|
||||
|
||||
/// Perform context compaction by summarizing conversation history.
|
||||
///
|
||||
/// This is the unified implementation used by both:
|
||||
/// - `force_compact()` - manual compaction via `/compact` command
|
||||
/// - `stream_completion_with_tools()` - automatic compaction when context is full
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `providers` - Provider registry to get the LLM provider
|
||||
/// * `context_window` - Context window to compact
|
||||
/// * `config` - Application config for provider settings
|
||||
/// * `compaction_config` - Configuration for this compaction operation
|
||||
/// * `ui_writer` - UI writer for status messages
|
||||
///
|
||||
/// # Returns
|
||||
/// `CompactionResult` indicating success/failure and chars saved
|
||||
pub async fn perform_compaction<W: UiWriter>(
|
||||
providers: &ProviderRegistry,
|
||||
context_window: &mut ContextWindow,
|
||||
config: &g3_config::Config,
|
||||
compaction_config: CompactionConfig<'_>,
|
||||
ui_writer: &W,
|
||||
thinning_events: &mut Vec<usize>,
|
||||
) -> Result<CompactionResult> {
|
||||
let provider_name = compaction_config.provider_name;
|
||||
|
||||
// Apply fallback sequence: thinnify -> skinnify -> hard-coded 5000
|
||||
let base_max_tokens = apply_summary_fallback_sequence(
|
||||
context_window,
|
||||
config,
|
||||
provider_name,
|
||||
ui_writer,
|
||||
thinning_events,
|
||||
);
|
||||
|
||||
let summary_max_tokens = calculate_capped_summary_tokens(config, provider_name, base_max_tokens);
|
||||
|
||||
debug!(
|
||||
"Requesting summary with max_tokens: {} (current usage: {} tokens)",
|
||||
summary_max_tokens, context_window.used_tokens
|
||||
);
|
||||
|
||||
// Build summary request
|
||||
let summary_messages = build_summary_messages(context_window);
|
||||
let provider = providers.get(None)?;
|
||||
|
||||
let disable_thinking = should_disable_thinking(config, provider.name(), summary_max_tokens);
|
||||
|
||||
debug!(
|
||||
"Creating summary request: max_tokens={}, disable_thinking={}",
|
||||
summary_max_tokens, disable_thinking
|
||||
);
|
||||
|
||||
let summary_request = CompletionRequest {
|
||||
messages: summary_messages,
|
||||
max_tokens: Some(summary_max_tokens),
|
||||
temperature: Some(provider_config::resolve_temperature(config, provider.name())),
|
||||
stream: false,
|
||||
tools: None,
|
||||
disable_thinking,
|
||||
};
|
||||
|
||||
// Execute summary request
|
||||
match provider.complete(summary_request).await {
|
||||
Ok(summary_response) => {
|
||||
let chars_saved = context_window.reset_with_summary(
|
||||
summary_response.content,
|
||||
compaction_config.latest_user_msg,
|
||||
);
|
||||
Ok(CompactionResult::success(chars_saved))
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Failed to create summary: {}", e);
|
||||
Ok(CompactionResult::failure(e.to_string()))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Apply the fallback sequence for summary requests to free up context space.
|
||||
///
|
||||
/// Sequence: thinnify -> skinnify -> hard-coded minimum
|
||||
fn apply_summary_fallback_sequence<W: UiWriter>(
|
||||
context_window: &mut ContextWindow,
|
||||
config: &g3_config::Config,
|
||||
provider_name: &str,
|
||||
ui_writer: &W,
|
||||
thinning_events: &mut Vec<usize>,
|
||||
) -> u32 {
|
||||
// Initial validation
|
||||
let (mut max_tokens, needs_reduction) = provider_config::calculate_summary_max_tokens(
|
||||
config,
|
||||
provider_name,
|
||||
context_window.total_tokens,
|
||||
context_window.used_tokens,
|
||||
);
|
||||
|
||||
if !needs_reduction {
|
||||
return max_tokens;
|
||||
}
|
||||
|
||||
ui_writer.print_context_status(
|
||||
"⚠️ Context window too full for thinking budget. Applying fallback sequence...\n",
|
||||
);
|
||||
|
||||
// Step 1: Try thinnify (first third of context)
|
||||
ui_writer.print_context_status("🥒 Step 1: Trying thinnify...\n");
|
||||
let (thin_msg, chars_saved) = context_window.thin_context(None);
|
||||
thinning_events.push(chars_saved);
|
||||
ui_writer.print_context_thinning(&thin_msg);
|
||||
|
||||
// Recalculate after thinnify
|
||||
let (new_max, still_needs_reduction) = provider_config::calculate_summary_max_tokens(
|
||||
config,
|
||||
provider_name,
|
||||
context_window.total_tokens,
|
||||
context_window.used_tokens,
|
||||
);
|
||||
max_tokens = new_max;
|
||||
if !still_needs_reduction {
|
||||
ui_writer.print_context_status("✅ Thinnify resolved capacity issue. Continuing...\n");
|
||||
return max_tokens;
|
||||
}
|
||||
|
||||
// Step 2: Try skinnify (entire context)
|
||||
ui_writer.print_context_status("🦴 Step 2: Trying skinnify...\n");
|
||||
let (skinny_msg, chars_saved) = context_window.thin_context_all(None);
|
||||
thinning_events.push(chars_saved);
|
||||
ui_writer.print_context_thinning(&skinny_msg);
|
||||
|
||||
// Recalculate after skinnify
|
||||
let (final_max, final_needs_reduction) = provider_config::calculate_summary_max_tokens(
|
||||
config,
|
||||
provider_name,
|
||||
context_window.total_tokens,
|
||||
context_window.used_tokens,
|
||||
);
|
||||
if !final_needs_reduction {
|
||||
ui_writer.print_context_status("✅ Skinnify resolved capacity issue. Continuing...\n");
|
||||
return final_max;
|
||||
}
|
||||
|
||||
// Step 3: Nothing worked, use hard-coded minimum
|
||||
const HARD_CODED_MINIMUM: u32 = 5000;
|
||||
ui_writer.print_context_status(&format!(
|
||||
"⚠️ Step 3: Context reduction insufficient. Using hard-coded max_tokens={} as last resort...\n",
|
||||
HARD_CODED_MINIMUM
|
||||
));
|
||||
HARD_CODED_MINIMUM
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `calculate_capped_summary_tokens` against a default `g3_config::Config`.
    fn capped_with_default_config(provider: &str, requested: u32) -> u32 {
        let config = g3_config::Config::default();
        calculate_capped_summary_tokens(&config, provider, requested)
    }

    #[test]
    fn test_calculate_capped_summary_tokens_anthropic() {
        // The default config is expected to carry no thinking budget, so the
        // Anthropic cap stays at 10_000.
        assert_eq!(capped_with_default_config("anthropic", 20_000), 10_000);
    }

    #[test]
    fn test_calculate_capped_summary_tokens_databricks() {
        assert_eq!(capped_with_default_config("databricks", 20_000), 10_000);
    }

    #[test]
    fn test_calculate_capped_summary_tokens_embedded() {
        assert_eq!(capped_with_default_config("embedded", 20_000), 3000);
    }

    #[test]
    fn test_calculate_capped_summary_tokens_minimum_floor() {
        // A request below the floor is raised to SUMMARY_MIN_TOKENS.
        assert_eq!(capped_with_default_config("embedded", 100), SUMMARY_MIN_TOKENS);
    }

    #[test]
    fn test_compaction_result_success() {
        let outcome = CompactionResult::success(5000);
        assert!(outcome.success);
        assert_eq!(outcome.chars_saved, 5000);
        assert!(outcome.error.is_none());
    }

    #[test]
    fn test_compaction_result_failure() {
        let outcome = CompactionResult::failure(String::from("test error"));
        assert!(!outcome.success);
        assert_eq!(outcome.chars_saved, 0);
        assert_eq!(outcome.error.as_deref(), Some("test error"));
    }
}
|
||||
Reference in New Issue
Block a user