Validate max_tokens for API calls, with fallbacks for summary requests

When the context window is full, max_tokens is often passed as 0 or a tiny value, and the LLM call fails. For Anthropic with extended thinking, max_tokens must also exceed the thinking budget (thinking.budget_tokens).
This can also happen during summary attempts; in that case, first try thinnify, then skinnify, and only then fall back to a hard-coded minimum.
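
A minimal sketch of the preflight rule this commit enforces (illustrative only; the function name, test values, and main() harness are made up for the example, while the 1024-token output buffer matches the implementation below):

// Sketch: when the context window is nearly full, the proposed max_tokens
// collapses toward 0; with Anthropic extended thinking it must instead stay
// strictly above thinking.budget_tokens, so we bump it up and flag that the
// context needs reduction (thinnify -> skinnify -> hard-coded minimum).

/// Returns (adjusted_max_tokens, needs_context_reduction).
fn preflight(proposed_max_tokens: u32, thinking_budget: Option<u32>) -> (u32, bool) {
    let Some(budget) = thinking_budget else {
        return (proposed_max_tokens, false); // no extended thinking configured
    };
    // Require >= budget + 1024 so the model has room for actual output.
    let minimum_required = budget + 1024;
    if proposed_max_tokens >= minimum_required {
        (proposed_max_tokens, false)
    } else {
        (minimum_required, true) // caller must reduce context, then retry
    }
}

fn main() {
    // Near-full context window: only 500 tokens of headroom were proposed.
    assert_eq!(preflight(500, Some(8_000)), (9_024, true));
    // Plenty of headroom: the proposal passes through unchanged.
    assert_eq!(preflight(16_000, Some(8_000)), (16_000, false));
}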
This commit is contained in:
Jochen
2025-12-09 10:15:32 +11:00
parent 48e6d594bc
commit 696c441a47
2 changed files with 434 additions and 106 deletions

@@ -1418,6 +1418,204 @@ impl<W: UiWriter> Agent<W> {
        }
    }
    /// Get the thinking budget tokens for the Anthropic provider, if configured
    fn get_thinking_budget_tokens(&self) -> Option<u32> {
        self.config
            .providers
            .anthropic
            .as_ref()
            .and_then(|c| c.thinking_budget_tokens)
    }
    /// Pre-flight check to validate and adjust max_tokens for the thinking.budget_tokens constraint.
    /// Returns the adjusted max_tokens that satisfies: max_tokens > thinking.budget_tokens.
    /// Also returns whether we need to apply fallback actions (thinnify/skinnify).
    ///
    /// Returns: (adjusted_max_tokens, needs_context_reduction)
    fn preflight_validate_max_tokens(
        &self,
        provider_name: &str,
        proposed_max_tokens: u32,
    ) -> (u32, bool) {
        // Only applies to the Anthropic provider with thinking enabled
        if provider_name != "anthropic" {
            return (proposed_max_tokens, false);
        }

        let budget_tokens = match self.get_thinking_budget_tokens() {
            Some(budget) => budget,
            None => return (proposed_max_tokens, false), // No thinking enabled
        };

        // Anthropic requires: max_tokens > budget_tokens.
        // We add a minimum output buffer of 1024 tokens for actual response content.
        let minimum_required = budget_tokens + 1024;

        if proposed_max_tokens >= minimum_required {
            // We have enough headroom
            (proposed_max_tokens, false)
        } else {
            // max_tokens is too low - we need to either adjust or reduce context
            warn!(
                "max_tokens ({}) is below required minimum ({}) for thinking.budget_tokens ({}). Context reduction needed.",
                proposed_max_tokens, minimum_required, budget_tokens
            );
            // Return the minimum required, but flag that we need context reduction
            (minimum_required, true)
        }
    }
    /// Calculate max_tokens for a summary request, ensuring it satisfies the thinking constraint.
    /// Returns (max_tokens, needs_context_reduction); when reduction is needed, the caller
    /// applies the fallback sequence: thinnify -> skinnify -> hard-coded minimum.
    fn calculate_summary_max_tokens(
        &mut self,
        provider_name: &str,
    ) -> (u32, bool) {
        let model_limit = self.context_window.total_tokens;
        let current_usage = self.context_window.used_tokens;

        // Calculate available tokens with buffer
        let buffer = (model_limit / 40).clamp(1000, 10000); // 2.5% buffer
        let available = model_limit
            .saturating_sub(current_usage)
            .saturating_sub(buffer);
        let proposed_max_tokens = available.min(10_000);

        // Validate against the thinking budget constraint
        let (adjusted, needs_reduction) =
            self.preflight_validate_max_tokens(provider_name, proposed_max_tokens);
        if !needs_reduction {
            return (adjusted, false);
        }

        // We need more headroom - the context is too full.
        // Return the adjusted value but flag that fallbacks are needed.
        (adjusted, true)
    }
    /// Apply the fallback sequence to free up context space for the thinking budget.
    /// Sequence: thinnify (first third) → skinnify (all) → hard-coded minimum.
    /// Returns the validated max_tokens that satisfies the thinking.budget_tokens constraint.
    fn apply_max_tokens_fallback_sequence(
        &mut self,
        provider_name: &str,
        initial_max_tokens: u32,
        hard_coded_minimum: u32,
    ) -> u32 {
        let (mut max_tokens, needs_reduction) =
            self.preflight_validate_max_tokens(provider_name, initial_max_tokens);
        if !needs_reduction {
            return max_tokens;
        }

        self.ui_writer.print_context_status(
            "⚠️ Context window too full for thinking budget. Applying fallback sequence...\n",
        );

        // Step 1: Try thinnify (first third of context)
        self.ui_writer.print_context_status("🥒 Step 1: Trying thinnify...\n");
        let (thin_msg, thin_saved) = self.context_window.thin_context();
        self.thinning_events.push(thin_saved);
        self.ui_writer.print_context_thinning(&thin_msg);

        // Recalculate max_tokens after thinnify
        let recalc_max = self.resolve_max_tokens(provider_name);
        let (new_max, still_needs_reduction) =
            self.preflight_validate_max_tokens(provider_name, recalc_max);
        max_tokens = new_max;
        if !still_needs_reduction {
            self.ui_writer.print_context_status(
                "✅ Thinnify resolved capacity issue. Continuing...\n",
            );
            return max_tokens;
        }

        // Step 2: Try skinnify (entire context)
        self.ui_writer.print_context_status("🦴 Step 2: Trying skinnify...\n");
        let (skinny_msg, skinny_saved) = self.context_window.thin_context_all();
        self.thinning_events.push(skinny_saved);
        self.ui_writer.print_context_thinning(&skinny_msg);

        // Recalculate max_tokens after skinnify
        let recalc_max = self.resolve_max_tokens(provider_name);
        let (final_max, final_needs_reduction) =
            self.preflight_validate_max_tokens(provider_name, recalc_max);
        max_tokens = final_max;
        if !final_needs_reduction {
            self.ui_writer.print_context_status(
                "✅ Skinnify resolved capacity issue. Continuing...\n",
            );
            return max_tokens;
        }

        // Step 3: Nothing worked, use hard-coded minimum as a last resort
        self.ui_writer.print_context_status(&format!(
            "⚠️ Step 3: Context reduction insufficient. Using hard-coded max_tokens={} as last resort...\n",
            hard_coded_minimum
        ));
        hard_coded_minimum
    }
    /// Apply the fallback sequence for summary requests to free up context space.
    /// Uses calculate_summary_max_tokens for recalculation (based on available space).
    /// Returns the validated max_tokens for summary requests.
    fn apply_summary_fallback_sequence(
        &mut self,
        provider_name: &str,
    ) -> u32 {
        let (mut summary_max_tokens, needs_reduction) =
            self.calculate_summary_max_tokens(provider_name);
        if !needs_reduction {
            return summary_max_tokens;
        }

        self.ui_writer.print_context_status(
            "⚠️ Context window too full for thinking budget. Applying fallback sequence...\n",
        );

        // Step 1: Try thinnify (first third of context)
        self.ui_writer.print_context_status("🥒 Step 1: Trying thinnify...\n");
        let (thin_msg, thin_saved) = self.context_window.thin_context();
        self.thinning_events.push(thin_saved);
        self.ui_writer.print_context_thinning(&thin_msg);

        // Recalculate max_tokens after thinnify
        let (new_max, still_needs_reduction) = self.calculate_summary_max_tokens(provider_name);
        summary_max_tokens = new_max;
        if !still_needs_reduction {
            self.ui_writer.print_context_status(
                "✅ Thinnify resolved capacity issue. Continuing...\n",
            );
            return summary_max_tokens;
        }

        // Step 2: Try skinnify (entire context)
        self.ui_writer.print_context_status("🦴 Step 2: Trying skinnify...\n");
        let (skinny_msg, skinny_saved) = self.context_window.thin_context_all();
        self.thinning_events.push(skinny_saved);
        self.ui_writer.print_context_thinning(&skinny_msg);

        // Recalculate max_tokens after skinnify
        let (final_max, final_needs_reduction) = self.calculate_summary_max_tokens(provider_name);
        summary_max_tokens = final_max;
        if !final_needs_reduction {
            self.ui_writer.print_context_status(
                "✅ Skinnify resolved capacity issue. Continuing...\n",
            );
            return summary_max_tokens;
        }

        // Step 3: Nothing worked, use hard-coded minimum
        self.ui_writer.print_context_status(
            "⚠️ Step 3: Context reduction insufficient. Using hard-coded max_tokens=5000 as last resort...\n",
        );
        5000
    }
    /// Resolve the temperature to use for a given provider, applying fallbacks
    fn resolve_temperature(&self, provider_name: &str) -> f32 {
        match provider_name {
@@ -1805,8 +2003,14 @@ impl<W: UiWriter> Agent<W> {
        };
        let _ = provider; // Drop the provider reference to avoid borrowing issues

        // Get max_tokens from provider configuration, falling back to sensible defaults
        let max_tokens = Some(self.resolve_max_tokens(&provider_name));

        // Get max_tokens from provider configuration with preflight validation.
        // This ensures max_tokens > thinking.budget_tokens for Anthropic with extended thinking.
        let initial_max_tokens = self.resolve_max_tokens(&provider_name);
        let max_tokens = Some(self.apply_max_tokens_fallback_sequence(
            &provider_name,
            initial_max_tokens,
            16000, // Hard-coded minimum for main API calls (higher than summary's 5000)
        ));

        let request = CompletionRequest {
            messages,
@@ -2211,6 +2415,25 @@ impl<W: UiWriter> Agent<W> {
            self.context_window.percentage_used() as u32
        ));

        let provider = self.providers.get(None)?;
        let provider_name = provider.name().to_string();
        let _ = provider; // Release borrow early

        // Apply fallback sequence: thinnify -> skinnify -> hard-coded 5000
        let mut summary_max_tokens = self.apply_summary_fallback_sequence(&provider_name);

        // Apply provider-specific caps
        summary_max_tokens = match provider_name.as_str() {
            "databricks" | "anthropic" => summary_max_tokens.min(10_000),
            "embedded" => summary_max_tokens.min(3000),
            _ => summary_max_tokens.min(5000),
        };

        debug!(
            "Requesting summary with max_tokens: {} (current usage: {} tokens)",
            summary_max_tokens, self.context_window.used_tokens
        );

        // Create summary request with FULL history
        let summary_prompt = self.context_window.create_summary_prompt();
@@ -2239,38 +2462,9 @@ impl<W: UiWriter> Agent<W> {
        let provider = self.providers.get(None)?;

        // Dynamically calculate max_tokens for summary based on what's left
        let summary_max_tokens = match provider.name() {
            "databricks" | "anthropic" => {
                let model_limit = self.context_window.total_tokens;
                let current_usage = self.context_window.used_tokens;
                let available = model_limit
                    .saturating_sub(current_usage)
                    .saturating_sub(5000);
                Some(available.min(10_000))
            }
            "embedded" => {
                let model_limit = self.context_window.total_tokens;
                let current_usage = self.context_window.used_tokens;
                let available = model_limit
                    .saturating_sub(current_usage)
                    .saturating_sub(1000);
                Some(available.min(3000))
            }
            _ => {
                let available = self.context_window.remaining_tokens().saturating_sub(2000);
                Some(available.min(5000))
            }
        };

        debug!(
            "Requesting summary with max_tokens: {:?} (current usage: {} tokens)",
            summary_max_tokens, self.context_window.used_tokens
        );

        let summary_request = CompletionRequest {
            messages: summary_messages,
            max_tokens: summary_max_tokens,
            max_tokens: Some(summary_max_tokens),
            temperature: Some(self.resolve_temperature(provider.name())),
            stream: false,
            tools: None,
@@ -3234,6 +3428,25 @@ impl<W: UiWriter> Agent<W> {
            self.context_window.percentage_used() as u32
        ));

        let provider = self.providers.get(None)?;
        let provider_name = provider.name().to_string();
        let _ = provider; // Release borrow early

        // Apply fallback sequence: thinnify -> skinnify -> hard-coded 5000
        let mut summary_max_tokens = self.apply_summary_fallback_sequence(&provider_name);

        // Apply provider-specific caps
        summary_max_tokens = match provider_name.as_str() {
            "databricks" | "anthropic" => summary_max_tokens.min(10_000),
            "embedded" => summary_max_tokens.min(3000),
            _ => summary_max_tokens.min(5000),
        };

        debug!(
            "Requesting summary with max_tokens: {} (current usage: {} tokens)",
            summary_max_tokens, self.context_window.used_tokens
        );

        // Create summary request with FULL history
        let summary_prompt = self.context_window.create_summary_prompt();
@@ -3262,82 +3475,9 @@ impl<W: UiWriter> Agent<W> {
        let provider = self.providers.get(None)?;

        // Dynamically calculate max_tokens for summary based on what's left.
        // We need to ensure: used_tokens + max_tokens <= total_context_limit
        let summary_max_tokens = match provider.name() {
            "databricks" | "anthropic" => {
                // Use the actual configured context window size
                let model_limit = self.context_window.total_tokens;
                let current_usage = self.context_window.used_tokens;

                // Check if we have enough capacity for summarization
                if current_usage >= model_limit.saturating_sub(1000) {
                    error!("Context window at capacity ({}%), cannot summarize. Current: {}, Limit: {}",
                        self.context_window.percentage_used(), current_usage, model_limit);
                    return Err(anyhow::anyhow!("Context window at capacity. Try using /thinnify or /compact commands to reduce context size, or start a new session."));
                }

                // Leave a buffer proportional to model size (min 1k, max 10k)
                let buffer = (model_limit / 40).clamp(1000, 10000); // 2.5% buffer
                let available = model_limit
                    .saturating_sub(current_usage)
                    .saturating_sub(buffer);

                // Cap at a reasonable summary size (10k tokens max)
                Some(available.min(10_000))
            }
            "embedded" => {
                // For smaller context models, be more conservative
                let model_limit = self.context_window.total_tokens;
                let current_usage = self.context_window.used_tokens;

                // Check capacity for embedded models too
                if current_usage >= model_limit.saturating_sub(500) {
                    error!(
                        "Embedded model context window at capacity ({}%)",
                        self.context_window.percentage_used()
                    );
                    return Err(anyhow::anyhow!("Context window at capacity. Try using /thinnify command to reduce context size, or start a new session."));
                }

                // Leave a 1k buffer
                let available = model_limit
                    .saturating_sub(current_usage)
                    .saturating_sub(1000);

                // Cap at 3k for embedded models
                Some(available.min(3000))
            }
            _ => {
                // Default: conservative approach
                let model_limit = self.context_window.total_tokens;
                let current_usage = self.context_window.used_tokens;
                if current_usage >= model_limit.saturating_sub(1000) {
                    error!(
                        "Context window at capacity ({}%)",
                        self.context_window.percentage_used()
                    );
                    return Err(anyhow::anyhow!("Context window at capacity. Try using /thinnify or /compact commands, or start a new session."));
                }
                let available = self.context_window.remaining_tokens().saturating_sub(2000);
                Some(available.min(5000))
            }
        };

        debug!(
            "Requesting summary with max_tokens: {:?} (current usage: {} tokens)",
            summary_max_tokens, self.context_window.used_tokens
        );

        // Final safety check
        if summary_max_tokens.unwrap_or(0) == 0 {
            error!("No tokens available for summarization");
            return Err(anyhow::anyhow!("No context window capacity left for summarization. Use /thinnify to reduce context size or start a new session."));
        }

        let summary_request = CompletionRequest {
            messages: summary_messages,
            max_tokens: summary_max_tokens,
            max_tokens: Some(summary_max_tokens),
            temperature: Some(self.resolve_temperature(provider.name())),
            stream: false,
            tools: None,