max_tokens fix

Author: Dhanji Prasanna
Date:   2025-09-29 11:05:57 +10:00
Parent: ce273ba3fb
Commit: 69fc3e90dc

2 changed files with 284 additions and 824 deletions

File diff suppressed because it is too large

@@ -238,12 +238,26 @@ impl ContextWindow {
             return;
         }
-        // Simple token estimation: ~4 characters per token
-        let estimated_tokens = (message.content.len() as f32 / 4.0).ceil() as u32;
+        // Better token estimation based on content type
+        let estimated_tokens = Self::estimate_tokens(&message.content);
         self.used_tokens += estimated_tokens;
         self.conversation_history.push(message);
     }
 
+    /// More accurate token estimation
+    fn estimate_tokens(text: &str) -> u32 {
+        // Better heuristic:
+        // - Average English text: ~4 characters per token
+        // - Code/JSON: ~3 characters per token (more symbols)
+        // - Add 10% buffer for safety
+        let base_estimate = if text.contains("{") || text.contains("```") || text.contains("fn ") {
+            (text.len() as f32 / 3.0).ceil() as u32 // Code/JSON
+        } else {
+            (text.len() as f32 / 4.0).ceil() as u32 // Regular text
+        };
+        (base_estimate as f32 * 1.1).ceil() as u32 // Add 10% buffer
+    }
+
     pub fn update_usage(&mut self, usage: &g3_providers::Usage) {
         // Update with actual token usage from the provider
         self.used_tokens = usage.total_tokens;
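
The new heuristic is pure arithmetic, so it is easy to check by hand: 400 characters of prose come out to ceil(400 / 4) × 1.1 = 110 tokens, while 400 characters of code come out to ceil(ceil(400 / 3) × 1.1) = 148. A self-contained sketch of the same logic (a standalone reproduction for illustration, not the crate's actual module):

````rust
// Standalone sketch mirroring estimate_tokens from the diff above.
fn estimate_tokens(text: &str) -> u32 {
    let base_estimate = if text.contains("{") || text.contains("```") || text.contains("fn ") {
        (text.len() as f32 / 3.0).ceil() as u32 // code/JSON: denser in symbols
    } else {
        (text.len() as f32 / 4.0).ceil() as u32 // average English prose
    };
    (base_estimate as f32 * 1.1).ceil() as u32 // 10% safety buffer
}

fn main() {
    let prose = "word ".repeat(80); // 400 chars, no code markers
    let code = format!("fn main() {{{}}}", " ".repeat(388)); // 400 chars, trips the code path
    println!("prose: {} tokens", estimate_tokens(&prose)); // 110
    println!("code:  {} tokens", estimate_tokens(&code));  // 148
}
````

The content sniffing is deliberately crude (a single `{` anywhere routes the whole message through the 3-characters-per-token path), but since `update_usage` overwrites the estimate with the provider's actual counts on every response, the heuristic only needs to err on the high side.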
@@ -261,15 +275,25 @@ impl ContextWindow {
self.total_tokens.saturating_sub(self.used_tokens) self.total_tokens.saturating_sub(self.used_tokens)
} }
/// Check if we should trigger summarization (at 80% capacity) /// Check if we should trigger summarization (at 80% capacity)
pub fn should_summarize(&self) -> bool { pub fn should_summarize(&self) -> bool {
self.percentage_used() >= 80.0 // Trigger at 80% OR if we're getting close to absolute limits
// This prevents issues with models that have large contexts but still hit limits
let percentage_trigger = self.percentage_used() >= 80.0;
// Also trigger if we're approaching common token limits
// Most models start having issues around 150k tokens
let absolute_trigger = self.used_tokens > 150_000;
percentage_trigger || absolute_trigger
} }
/// Create a summary request prompt for the current conversation /// Create a summary request prompt for the current conversation
pub fn create_summary_prompt(&self) -> String { pub fn create_summary_prompt(&self) -> String {
"Please provide a comprehensive summary of our conversation so far. Include: "Please provide a comprehensive summary of our conversation so far. Include:
1. **Main Topic/Goal**: What is the primary task or objective being worked on? 1. **Main Topic/Goal**: What is the primary task or objective being worked on?
2. **Key Decisions**: What important decisions have been made? 2. **Key Decisions**: What important decisions have been made?
3. **Actions Taken**: What specific actions, commands, or code changes have been completed? 3. **Actions Taken**: What specific actions, commands, or code changes have been completed?
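
The absolute cutoff matters most for large-context models: with a 1,000,000-token window, 160,000 used tokens is only 16%, so the percentage check alone would never fire even though the request is already past the 150k danger zone the comment mentions. A minimal sketch of the combined trigger (struct shape assumed from this diff, everything else omitted):

```rust
// Minimal sketch of the dual trigger; field names taken from the diff above.
struct ContextWindow {
    total_tokens: u32,
    used_tokens: u32,
}

impl ContextWindow {
    fn percentage_used(&self) -> f32 {
        (self.used_tokens as f32 / self.total_tokens as f32) * 100.0
    }

    fn should_summarize(&self) -> bool {
        self.percentage_used() >= 80.0 || self.used_tokens > 150_000
    }
}

fn main() {
    // Small context: the 80% rule fires first.
    let small = ContextWindow { total_tokens: 8_192, used_tokens: 7_000 };
    assert!(small.should_summarize()); // ~85% used

    // Huge context: only 16% used, but past the 150k absolute limit.
    let large = ContextWindow { total_tokens: 1_000_000, used_tokens: 160_000 };
    assert!(large.should_summarize());
}
```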
@@ -897,43 +921,77 @@ The tool will execute immediately and you'll receive the result (success or erro
         // Check if we need to summarize before starting
         if self.context_window.should_summarize() {
             info!(
-                "Context window at {}%, triggering auto-summarization",
-                self.context_window.percentage_used() as u32
+                "Context window at {}% ({}/{} tokens), triggering auto-summarization",
+                self.context_window.percentage_used() as u32,
+                self.context_window.used_tokens,
+                self.context_window.total_tokens
             );
 
             // Notify user about summarization
-            println!(
-                "\n📊 Context window reaching capacity ({}%). Creating summary...",
-                self.context_window.percentage_used() as u32
-            );
+            println!("\n📊 Context window reaching capacity ({}%). Creating summary...",
+                self.context_window.percentage_used() as u32);
 
-            // Create summary request
+            // Create summary request with FULL history
             let summary_prompt = self.context_window.create_summary_prompt();
 
+            // Get the full conversation history
+            let conversation_text = self.context_window.conversation_history
+                .iter()
+                .map(|m| format!("{:?}: {}", m.role, m.content))
+                .collect::<Vec<_>>()
+                .join("\n\n");
+
             let summary_messages = vec![
                 Message {
                     role: MessageRole::System,
-                    content: "You are a helpful assistant that creates concise summaries."
-                        .to_string(),
+                    content: "You are a helpful assistant that creates concise summaries.".to_string(),
                 },
                 Message {
                     role: MessageRole::User,
-                    content: format!(
-                        "Based on this conversation history, {}\n\nConversation:\n{}",
+                    content: format!("Based on this conversation history, {}\n\nConversation:\n{}",
                         summary_prompt,
-                        self.context_window
-                            .conversation_history
-                            .iter()
-                            .map(|m| format!("{:?}: {}", m.role, m.content))
-                            .collect::<Vec<_>>()
-                            .join("\n\n")
+                        conversation_text
                     ),
                 },
             ];
 
             let provider = self.providers.get(None)?;
 
+            // Dynamically calculate max_tokens for summary based on what's left
+            // We need to ensure: used_tokens + max_tokens <= total_context_limit
+            let summary_max_tokens = match provider.name() {
+                "databricks" | "anthropic" => {
+                    // Claude models have 200k context
+                    // Calculate how much room we have left
+                    let model_limit = 200_000u32;
+                    let current_usage = self.context_window.used_tokens;
+                    // Leave some buffer (5k tokens) for safety
+                    let available = model_limit.saturating_sub(current_usage).saturating_sub(5000);
+                    // Cap at a reasonable summary size (10k tokens max)
+                    Some(available.min(10_000))
+                }
+                "embedded" => {
+                    // For smaller context models, be more conservative
+                    let model_limit = self.context_window.total_tokens;
+                    let current_usage = self.context_window.used_tokens;
+                    // Leave 1k buffer
+                    let available = model_limit.saturating_sub(current_usage).saturating_sub(1000);
+                    // Cap at 3k for embedded models
+                    Some(available.min(3000))
+                }
+                _ => {
+                    // Default: conservative approach
+                    let available = self.context_window.remaining_tokens().saturating_sub(2000);
+                    Some(available.min(5000))
+                }
+            };
+
+            info!("Requesting summary with max_tokens: {:?} (current usage: {} tokens)",
+                summary_max_tokens, self.context_window.used_tokens);
+
             let summary_request = CompletionRequest {
                 messages: summary_messages,
-                max_tokens: Some(4000), // Reasonable size for summary
+                max_tokens: summary_max_tokens,
                 temperature: Some(0.3), // Lower temperature for factual summary
                 stream: false,
                 tools: None,
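
All three match arms compute the same shape of budget: available = limit - used - buffer, clamped to a per-provider cap, with `saturating_sub` keeping the arithmetic from underflowing once usage exceeds the limit. Worked through as a standalone sketch (constants taken from the arms above):

```rust
// Standalone sketch of the budget arithmetic behind summary_max_tokens.
fn summary_budget(model_limit: u32, used: u32, buffer: u32, cap: u32) -> u32 {
    model_limit.saturating_sub(used).saturating_sub(buffer).min(cap)
}

fn main() {
    // Claude-style 200k context, 5k buffer, 10k cap:
    assert_eq!(summary_budget(200_000, 170_000, 5_000, 10_000), 10_000); // room to spare
    assert_eq!(summary_budget(200_000, 188_000, 5_000, 10_000), 7_000);  // budget shrinks
    assert_eq!(summary_budget(200_000, 199_000, 5_000, 10_000), 0);      // saturates at zero

    // Embedded model: 8k window, 1k buffer, 3k cap:
    assert_eq!(summary_budget(8_192, 6_000, 1_000, 3_000), 1_192);
}
```

Note the zero case: once usage exceeds the limit minus the buffer, the budget saturates to `Some(0)`, which the provider layer would presumably need to reject or raise to a small floor rather than send as-is.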
@@ -962,7 +1020,11 @@ The tool will execute immediately and you'll receive the result (success or erro
                     println!("🔄 Context reset complete. Continuing with your request...\n");
                 }
                 Err(e) => {
-                    warn!("Failed to create summary: {}. Continuing without reset.", e);
+                    error!("Failed to create summary: {}", e);
+                    println!("⚠️ Unable to create summary. Consider starting a new session if you continue to see errors.\n");
+                    // Don't continue with the original request if summarization failed
+                    // as we're likely at token limit
+                    return Err(anyhow::anyhow!("Context window at capacity and summarization failed. Please start a new session."));
                 }
             }
         }
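
Since the `Err` arm now returns instead of falling through, the failure propagates to whatever drives the agent loop. A hypothetical caller (the `run_turn` name is illustrative, not from this codebase):

```rust
use anyhow::Result;

// Stand-in for the agent entry point shown in the diff.
fn run_turn() -> Result<String> {
    // ... summarization and completion logic elided ...
    Err(anyhow::anyhow!(
        "Context window at capacity and summarization failed. Please start a new session."
    ))
}

fn main() {
    match run_turn() {
        Ok(reply) => println!("{reply}"),
        Err(e) => {
            // With this commit the loop stops here, instead of sending a request
            // the provider would reject for exceeding its token limit.
            eprintln!("session error: {e}");
        }
    }
}
```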