diff --git a/crates/g3-cli/src/lib.rs b/crates/g3-cli/src/lib.rs
index e9fa0fd..6331507 100644
--- a/crates/g3-cli/src/lib.rs
+++ b/crates/g3-cli/src/lib.rs
@@ -562,20 +562,27 @@ async fn run_agent_mode(
     let system_prompt = get_agent_system_prompt(&agent_prompt, true);
 
     // Load AGENTS.md, README, and memory - same as normal mode
-    let agents_content = read_agents_config(&workspace_dir);
-    let readme_content = read_project_readme(&workspace_dir);
-    let memory_content = read_project_memory(&workspace_dir);
+    let agents_content_opt = read_agents_config(&workspace_dir);
+    let readme_content_opt = read_project_readme(&workspace_dir);
+    let memory_content_opt = read_project_memory(&workspace_dir);
+
+    // Show what was loaded
+    let readme_status = if readme_content_opt.is_some() { "✓" } else { "·" };
+    let agents_status = if agents_content_opt.is_some() { "✓" } else { "·" };
+    let memory_status = if memory_content_opt.is_some() { "✓" } else { "·" };
+    output.print(&format!("  {} README | {} AGENTS.md | {} Memory",
+        readme_status, agents_status, memory_status));
 
     // Combine all content for the agent's context
     let combined_content = {
         let mut parts = Vec::new();
-        if let Some(agents) = agents_content {
+        if let Some(agents) = agents_content_opt {
             parts.push(agents);
         }
-        if let Some(readme) = readme_content {
+        if let Some(readme) = readme_content_opt {
             parts.push(readme);
         }
-        if let Some(memory) = memory_content {
+        if let Some(memory) = memory_content_opt {
             parts.push(memory);
         }
         if parts.is_empty() {
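Note: the three status markers above repeat the same `is_some()` check; a possible follow-up would factor it out. A minimal sketch (hypothetical; `status_mark` is not part of this diff):

```rust
/// Map an optionally loaded context source to the ✓/· marker printed above.
fn status_mark<T>(content: &Option<T>) -> &'static str {
    if content.is_some() { "✓" } else { "·" }
}

// Usage: output.print(&format!("  {} README", status_mark(&readme_content_opt)));
```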
diff --git a/crates/g3-core/src/compaction.rs b/crates/g3-core/src/compaction.rs
new file mode 100644
index 0000000..a998b8c
--- /dev/null
+++ b/crates/g3-core/src/compaction.rs
@@ -0,0 +1,328 @@
+//! Context compaction logic.
+//!
+//! This module provides unified compaction functionality used by both
+//! manual compaction (the `/compact` command) and automatic compaction
+//! (when the context window reaches capacity during streaming).
+
+use anyhow::Result;
+use g3_providers::{CompletionRequest, Message, MessageRole, ProviderRegistry};
+use tracing::{debug, error, warn};
+
+use crate::context_window::ContextWindow;
+use crate::provider_config;
+use crate::ui_writer::UiWriter;
+
+/// Minimum tokens for summary requests to avoid API errors when context is nearly full.
+pub const SUMMARY_MIN_TOKENS: u32 = 1000;
+
+/// Result of a compaction operation.
+#[derive(Debug)]
+pub struct CompactionResult {
+    /// Whether compaction succeeded
+    pub success: bool,
+    /// Characters saved by compaction (if successful)
+    pub chars_saved: usize,
+    /// Error message (if failed)
+    pub error: Option<String>,
+}
+
+impl CompactionResult {
+    pub fn success(chars_saved: usize) -> Self {
+        Self {
+            success: true,
+            chars_saved,
+            error: None,
+        }
+    }
+
+    pub fn failure(error: String) -> Self {
+        Self {
+            success: false,
+            chars_saved: 0,
+            error: Some(error),
+        }
+    }
+}
+
+/// Configuration for a compaction operation.
+#[derive(Debug, Clone)]
+pub struct CompactionConfig<'a> {
+    /// Provider name (e.g., "anthropic", "openai")
+    pub provider_name: &'a str,
+    /// Latest user message to preserve after compaction
+    pub latest_user_msg: Option<String>,
+}
+
+/// Calculate the summary max_tokens with provider-specific caps applied.
+///
+/// This is the canonical implementation - both manual and auto compaction use this.
+pub fn calculate_capped_summary_tokens(
+    config: &g3_config::Config,
+    provider_name: &str,
+    base_max_tokens: u32,
+) -> u32 {
+    // Apply provider-specific caps.
+    // For Anthropic with thinking enabled, we need max_tokens > thinking.budget_tokens,
+    // so we set a higher cap when thinking is configured.
+    let anthropic_cap = match provider_config::get_thinking_budget_tokens(config, provider_name) {
+        Some(budget) => (budget + 2000).max(10_000), // at least budget + 2000 for the response
+        None => 10_000,
+    };
+
+    let capped = match provider_name {
+        name if name.starts_with("anthropic") => base_max_tokens.min(anthropic_cap),
+        name if name.starts_with("databricks") => base_max_tokens.min(10_000),
+        name if name.starts_with("embedded") => base_max_tokens.min(3000),
+        _ => base_max_tokens.min(5000),
+    };
+
+    // Ensure the minimum floor as defense-in-depth
+    capped.max(SUMMARY_MIN_TOKENS)
+}
+
+/// Check if thinking mode should be disabled for a summary request.
+///
+/// Anthropic requires: max_tokens > thinking.budget_tokens + 1024
+pub fn should_disable_thinking(
+    config: &g3_config::Config,
+    provider_name: &str,
+    summary_max_tokens: u32,
+) -> bool {
+    provider_config::get_thinking_budget_tokens(config, provider_name).map_or(false, |budget| {
+        let minimum_for_thinking = budget + 1024;
+        let should_disable = summary_max_tokens <= minimum_for_thinking;
+        if should_disable {
+            warn!(
+                "Disabling thinking mode for summary: max_tokens ({}) <= minimum_for_thinking ({})",
+                summary_max_tokens, minimum_for_thinking
+            );
+        }
+        should_disable
+    })
+}
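A worked example of the cap and thinking-mode arithmetic, written as a free-standing test; the 8,000-token thinking budget is an assumed value, not a g3 default:

```rust
#[test]
fn summary_cap_arithmetic_with_assumed_budget() {
    // Assumed thinking budget (hypothetical value).
    let budget: u32 = 8_000;

    // calculate_capped_summary_tokens: the Anthropic cap is at least
    // budget + 2000, with a 10_000 floor.
    let anthropic_cap = (budget + 2000).max(10_000); // = 10_000
    assert_eq!(20_000u32.min(anthropic_cap), 10_000); // a 20_000 request is capped

    // should_disable_thinking: thinking needs max_tokens > budget + 1024.
    let minimum_for_thinking = budget + 1024; // = 9_024
    assert!(10_000 > minimum_for_thinking); // 10_000 keeps thinking enabled
    assert!(9_000 <= minimum_for_thinking); // 9_000 would disable thinking
}
```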
+/// Build the summary request messages from conversation history.
+pub fn build_summary_messages(context_window: &ContextWindow) -> Vec<Message> {
+    let summary_prompt = context_window.create_summary_prompt();
+
+    let conversation_text = context_window
+        .conversation_history
+        .iter()
+        .map(|m| format!("{:?}: {}", m.role, m.content))
+        .collect::<Vec<_>>()
+        .join("\n\n");
+
+    vec![
+        Message::new(
+            MessageRole::System,
+            "You are a helpful assistant that creates concise summaries.".to_string(),
+        ),
+        Message::new(
+            MessageRole::User,
+            format!(
+                "Based on this conversation history, {}\n\nConversation:\n{}",
+                summary_prompt, conversation_text
+            ),
+        ),
+    ]
+}
+
+/// Perform context compaction by summarizing conversation history.
+///
+/// This is the unified implementation used by both:
+/// - `force_compact()` - manual compaction via the `/compact` command
+/// - `stream_completion_with_tools()` - automatic compaction when the context is full
+///
+/// # Arguments
+/// * `providers` - Provider registry to get the LLM provider
+/// * `context_window` - Context window to compact
+/// * `config` - Application config for provider settings
+/// * `compaction_config` - Configuration for this compaction operation
+/// * `ui_writer` - UI writer for status messages
+/// * `thinning_events` - Accumulates character counts saved by context thinning
+///
+/// # Returns
+/// `CompactionResult` indicating success/failure and chars saved
+pub async fn perform_compaction<W: UiWriter>(
+    providers: &ProviderRegistry,
+    context_window: &mut ContextWindow,
+    config: &g3_config::Config,
+    compaction_config: CompactionConfig<'_>,
+    ui_writer: &W,
+    thinning_events: &mut Vec<usize>,
+) -> Result<CompactionResult> {
+    let provider_name = compaction_config.provider_name;
+
+    // Apply the fallback sequence: thinnify -> skinnify -> hard-coded 5000
+    let base_max_tokens = apply_summary_fallback_sequence(
+        context_window,
+        config,
+        provider_name,
+        ui_writer,
+        thinning_events,
+    );
+
+    let summary_max_tokens =
+        calculate_capped_summary_tokens(config, provider_name, base_max_tokens);
+
+    debug!(
+        "Requesting summary with max_tokens: {} (current usage: {} tokens)",
+        summary_max_tokens, context_window.used_tokens
+    );
+
+    // Build the summary request
+    let summary_messages = build_summary_messages(context_window);
+    let provider = providers.get(None)?;
+
+    let disable_thinking = should_disable_thinking(config, provider.name(), summary_max_tokens);
+
+    debug!(
+        "Creating summary request: max_tokens={}, disable_thinking={}",
+        summary_max_tokens, disable_thinking
+    );
+
+    let summary_request = CompletionRequest {
+        messages: summary_messages,
+        max_tokens: Some(summary_max_tokens),
+        temperature: Some(provider_config::resolve_temperature(config, provider.name())),
+        stream: false,
+        tools: None,
+        disable_thinking,
+    };
+
+    // Execute the summary request
+    match provider.complete(summary_request).await {
+        Ok(summary_response) => {
+            let chars_saved = context_window.reset_with_summary(
+                summary_response.content,
+                compaction_config.latest_user_msg,
+            );
+            Ok(CompactionResult::success(chars_saved))
+        }
+        Err(e) => {
+            error!("Failed to create summary: {}", e);
+            Ok(CompactionResult::failure(e.to_string()))
+        }
+    }
+}
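A minimal sketch of a call site, such as the `/compact` handler (assumed to run inside an async fn returning `anyhow::Result`, with `providers`, `context_window`, `config`, and `ui` in scope):

```rust
let mut thinning_events: Vec<usize> = Vec::new();
let result = compaction::perform_compaction(
    &providers,
    &mut context_window,
    &config,
    compaction::CompactionConfig {
        provider_name: "anthropic", // example value; pass the active provider's name
        latest_user_msg: None,      // or Some(msg) to re-append the latest user message
    },
    &ui,
    &mut thinning_events,
)
.await?;

if result.success {
    ui.print_context_status(&format!("Compacted; saved {} chars\n", result.chars_saved));
} else if let Some(err) = result.error {
    ui.print_context_status(&format!("Compaction failed: {}\n", err));
}
```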
+/// Apply the fallback sequence for summary requests to free up context space.
+///
+/// Sequence: thinnify -> skinnify -> hard-coded minimum
+fn apply_summary_fallback_sequence<W: UiWriter>(
+    context_window: &mut ContextWindow,
+    config: &g3_config::Config,
+    provider_name: &str,
+    ui_writer: &W,
+    thinning_events: &mut Vec<usize>,
+) -> u32 {
+    // Initial validation
+    let (mut max_tokens, needs_reduction) = provider_config::calculate_summary_max_tokens(
+        config,
+        provider_name,
+        context_window.total_tokens,
+        context_window.used_tokens,
+    );
+
+    if !needs_reduction {
+        return max_tokens;
+    }
+
+    ui_writer.print_context_status(
+        "⚠️ Context window too full for thinking budget. Applying fallback sequence...\n",
+    );
+
+    // Step 1: Try thinnify (first third of context)
+    ui_writer.print_context_status("🥒 Step 1: Trying thinnify...\n");
+    let (thin_msg, chars_saved) = context_window.thin_context(None);
+    thinning_events.push(chars_saved);
+    ui_writer.print_context_thinning(&thin_msg);
+
+    // Recalculate after thinnify
+    let (new_max, still_needs_reduction) = provider_config::calculate_summary_max_tokens(
+        config,
+        provider_name,
+        context_window.total_tokens,
+        context_window.used_tokens,
+    );
+    max_tokens = new_max;
+    if !still_needs_reduction {
+        ui_writer.print_context_status("✅ Thinnify resolved capacity issue. Continuing...\n");
+        return max_tokens;
+    }
+
+    // Step 2: Try skinnify (entire context)
+    ui_writer.print_context_status("🦴 Step 2: Trying skinnify...\n");
+    let (skinny_msg, chars_saved) = context_window.thin_context_all(None);
+    thinning_events.push(chars_saved);
+    ui_writer.print_context_thinning(&skinny_msg);
+
+    // Recalculate after skinnify
+    let (final_max, final_needs_reduction) = provider_config::calculate_summary_max_tokens(
+        config,
+        provider_name,
+        context_window.total_tokens,
+        context_window.used_tokens,
+    );
+    if !final_needs_reduction {
+        ui_writer.print_context_status("✅ Skinnify resolved capacity issue. Continuing...\n");
+        return final_max;
+    }
+
+    // Step 3: Nothing worked; use the hard-coded minimum
+    const HARD_CODED_MINIMUM: u32 = 5000;
+    ui_writer.print_context_status(&format!(
+        "⚠️ Step 3: Context reduction insufficient. Using hard-coded max_tokens={} as a last resort...\n",
+        HARD_CODED_MINIMUM
+    ));
+    HARD_CODED_MINIMUM
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_calculate_capped_summary_tokens_anthropic() {
+        let config = g3_config::Config::default();
+        // Without a thinking budget, this should cap at 10_000
+        let result = calculate_capped_summary_tokens(&config, "anthropic", 20_000);
+        assert_eq!(result, 10_000);
+    }
+
+    #[test]
+    fn test_calculate_capped_summary_tokens_databricks() {
+        let config = g3_config::Config::default();
+        let result = calculate_capped_summary_tokens(&config, "databricks", 20_000);
+        assert_eq!(result, 10_000);
+    }
+
+    #[test]
+    fn test_calculate_capped_summary_tokens_embedded() {
+        let config = g3_config::Config::default();
+        let result = calculate_capped_summary_tokens(&config, "embedded", 20_000);
+        assert_eq!(result, 3000);
+    }
+
+    #[test]
+    fn test_calculate_capped_summary_tokens_minimum_floor() {
+        let config = g3_config::Config::default();
+        // Even with very low input, this should return at least SUMMARY_MIN_TOKENS
+        let result = calculate_capped_summary_tokens(&config, "embedded", 100);
+        assert_eq!(result, SUMMARY_MIN_TOKENS);
+    }
+
+    #[test]
+    fn test_compaction_result_success() {
+        let result = CompactionResult::success(5000);
+        assert!(result.success);
+        assert_eq!(result.chars_saved, 5000);
+        assert!(result.error.is_none());
+    }
+
+    #[test]
+    fn test_compaction_result_failure() {
+        let result = CompactionResult::failure("test error".to_string());
+        assert!(!result.success);
+        assert_eq!(result.chars_saved, 0);
+        assert_eq!(result.error, Some("test error".to_string()));
+    }
+}
diff --git a/crates/g3-core/src/lib.rs b/crates/g3-core/src/lib.rs
index 428e03d..8aa97b6 100644
--- a/crates/g3-core/src/lib.rs
+++ b/crates/g3-core/src/lib.rs
@@ -1,5 +1,6 @@
 pub mod context_window;
 pub mod background_process;
+pub mod compaction;
 pub mod code_search;
 pub mod error_handling;
 pub mod feedback_extraction;
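Finally, a worked walkthrough of the early-exit logic in `apply_summary_fallback_sequence`, using assumed numbers; the exact free-space formula lives in `provider_config::calculate_summary_max_tokens`, which this sketch only approximates:

```rust
#[test]
fn fallback_sequence_walkthrough_with_assumed_numbers() {
    // Assumed: a 200k-token window that is 97.5% full, and an 8_000-token
    // thinking budget (hypothetical values, not g3 defaults).
    let total_tokens: u32 = 200_000;
    let mut used_tokens: u32 = 195_000;
    // Rough space needed: thinking budget + Anthropic's 1024 headroom
    // + the 1_000-token summary floor.
    let needed: u32 = 8_000 + 1_024 + 1_000;

    // Step 0: only 5_000 tokens free, so a reduction is needed.
    assert!(total_tokens - used_tokens < needed);

    // Step 1: thinnify drops roughly the first third of the context;
    // suppose usage falls to 150_000, freeing 50_000 tokens.
    used_tokens = 150_000;
    assert!(total_tokens - used_tokens >= needed); // capacity restored, exit early

    // Steps 2 and 3 (skinnify, then max_tokens = 5_000) are only reached
    // when the earlier reductions are insufficient.
}
```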