max_tokens fix

Author: Dhanji Prasanna
Date:   2025-09-29 11:05:57 +10:00
Parent: ce273ba3fb
Commit: 69fc3e90dc

2 changed files with 284 additions and 824 deletions

File diff suppressed because it is too large

@@ -238,12 +238,26 @@ impl ContextWindow {
             return;
         }
-        // Simple token estimation: ~4 characters per token
-        let estimated_tokens = (message.content.len() as f32 / 4.0).ceil() as u32;
+        // Better token estimation based on content type
+        let estimated_tokens = Self::estimate_tokens(&message.content);
         self.used_tokens += estimated_tokens;
         self.conversation_history.push(message);
     }
 
+    /// More accurate token estimation
+    fn estimate_tokens(text: &str) -> u32 {
+        // Better heuristic:
+        // - Average English text: ~4 characters per token
+        // - Code/JSON: ~3 characters per token (more symbols)
+        // - Add 10% buffer for safety
+        let base_estimate = if text.contains("{") || text.contains("```") || text.contains("fn ") {
+            (text.len() as f32 / 3.0).ceil() as u32 // Code/JSON
+        } else {
+            (text.len() as f32 / 4.0).ceil() as u32 // Regular text
+        };
+        (base_estimate as f32 * 1.1).ceil() as u32 // Add 10% buffer
+    }
+
     pub fn update_usage(&mut self, usage: &g3_providers::Usage) {
         // Update with actual token usage from the provider
         self.used_tokens = usage.total_tokens;
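
The new heuristic is pure arithmetic, so it is easy to check by hand: 400 characters of prose come out to ceil(400 / 4) × 1.1 = 110 tokens, while 400 characters of code come out to ceil(ceil(400 / 3) × 1.1) = 148. A self-contained sketch of the same logic (a standalone reproduction for illustration, not the crate's actual module):

````rust
// Standalone sketch mirroring estimate_tokens from the diff above.
fn estimate_tokens(text: &str) -> u32 {
    let base_estimate = if text.contains("{") || text.contains("```") || text.contains("fn ") {
        (text.len() as f32 / 3.0).ceil() as u32 // code/JSON: denser in symbols
    } else {
        (text.len() as f32 / 4.0).ceil() as u32 // average English prose
    };
    (base_estimate as f32 * 1.1).ceil() as u32 // 10% safety buffer
}

fn main() {
    let prose = "word ".repeat(80); // 400 chars, no code markers
    let code = format!("fn main() {{{}}}", " ".repeat(388)); // 400 chars, trips the code path
    println!("prose: {} tokens", estimate_tokens(&prose)); // 110
    println!("code:  {} tokens", estimate_tokens(&code));  // 148
}
````

The content sniffing is deliberately crude (a single `{` anywhere routes the whole message through the 3-characters-per-token path), but since `update_usage` overwrites the estimate with the provider's actual counts on every response, the heuristic only needs to err on the high side.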
@@ -261,15 +275,25 @@ impl ContextWindow {
self.total_tokens.saturating_sub(self.used_tokens) self.total_tokens.saturating_sub(self.used_tokens)
} }
/// Check if we should trigger summarization (at 80% capacity) /// Check if we should trigger summarization (at 80% capacity)
pub fn should_summarize(&self) -> bool { pub fn should_summarize(&self) -> bool {
self.percentage_used() >= 80.0 // Trigger at 80% OR if we're getting close to absolute limits
// This prevents issues with models that have large contexts but still hit limits
let percentage_trigger = self.percentage_used() >= 80.0;
// Also trigger if we're approaching common token limits
// Most models start having issues around 150k tokens
let absolute_trigger = self.used_tokens > 150_000;
percentage_trigger || absolute_trigger
} }
/// Create a summary request prompt for the current conversation /// Create a summary request prompt for the current conversation
pub fn create_summary_prompt(&self) -> String { pub fn create_summary_prompt(&self) -> String {
"Please provide a comprehensive summary of our conversation so far. Include: "Please provide a comprehensive summary of our conversation so far. Include:
1. **Main Topic/Goal**: What is the primary task or objective being worked on? 1. **Main Topic/Goal**: What is the primary task or objective being worked on?
2. **Key Decisions**: What important decisions have been made? 2. **Key Decisions**: What important decisions have been made?
3. **Actions Taken**: What specific actions, commands, or code changes have been completed? 3. **Actions Taken**: What specific actions, commands, or code changes have been completed?
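
The absolute cutoff matters most for large-context models: with a 1,000,000-token window, 160,000 used tokens is only 16%, so the percentage check alone would never fire even though the request is already past the 150k danger zone the comment mentions. A minimal sketch of the combined trigger (struct shape assumed from this diff, everything else omitted):

```rust
// Minimal sketch of the dual trigger; field names taken from the diff above.
struct ContextWindow {
    total_tokens: u32,
    used_tokens: u32,
}

impl ContextWindow {
    fn percentage_used(&self) -> f32 {
        (self.used_tokens as f32 / self.total_tokens as f32) * 100.0
    }

    fn should_summarize(&self) -> bool {
        self.percentage_used() >= 80.0 || self.used_tokens > 150_000
    }
}

fn main() {
    // Small context: the 80% rule fires first.
    let small = ContextWindow { total_tokens: 8_192, used_tokens: 7_000 };
    assert!(small.should_summarize()); // ~85% used

    // Huge context: only 16% used, but past the 150k absolute limit.
    let large = ContextWindow { total_tokens: 1_000_000, used_tokens: 160_000 };
    assert!(large.should_summarize());
}
```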
@@ -897,43 +921,77 @@ The tool will execute immediately and you'll receive the result (success or erro
         // Check if we need to summarize before starting
         if self.context_window.should_summarize() {
             info!(
-                "Context window at {}%, triggering auto-summarization",
-                self.context_window.percentage_used() as u32
+                "Context window at {}% ({}/{} tokens), triggering auto-summarization",
+                self.context_window.percentage_used() as u32,
+                self.context_window.used_tokens,
+                self.context_window.total_tokens
             );
 
             // Notify user about summarization
-            println!(
-                "\n📊 Context window reaching capacity ({}%). Creating summary...",
-                self.context_window.percentage_used() as u32
-            );
+            println!("\n📊 Context window reaching capacity ({}%). Creating summary...",
+                self.context_window.percentage_used() as u32);
 
-            // Create summary request
+            // Create summary request with FULL history
             let summary_prompt = self.context_window.create_summary_prompt();
 
+            // Get the full conversation history
+            let conversation_text = self.context_window.conversation_history
+                .iter()
+                .map(|m| format!("{:?}: {}", m.role, m.content))
+                .collect::<Vec<_>>()
+                .join("\n\n");
+
             let summary_messages = vec![
                 Message {
                     role: MessageRole::System,
-                    content: "You are a helpful assistant that creates concise summaries."
-                        .to_string(),
+                    content: "You are a helpful assistant that creates concise summaries.".to_string(),
                 },
                 Message {
                     role: MessageRole::User,
-                    content: format!(
-                        "Based on this conversation history, {}\n\nConversation:\n{}",
+                    content: format!("Based on this conversation history, {}\n\nConversation:\n{}",
                         summary_prompt,
-                        self.context_window
-                            .conversation_history
-                            .iter()
-                            .map(|m| format!("{:?}: {}", m.role, m.content))
-                            .collect::<Vec<_>>()
-                            .join("\n\n")
+                        conversation_text
                     ),
                 },
             ];
 
             let provider = self.providers.get(None)?;
 
+            // Dynamically calculate max_tokens for summary based on what's left
+            // We need to ensure: used_tokens + max_tokens <= total_context_limit
+            let summary_max_tokens = match provider.name() {
+                "databricks" | "anthropic" => {
+                    // Claude models have 200k context
+                    // Calculate how much room we have left
+                    let model_limit = 200_000u32;
+                    let current_usage = self.context_window.used_tokens;
+                    // Leave some buffer (5k tokens) for safety
+                    let available = model_limit.saturating_sub(current_usage).saturating_sub(5000);
+                    // Cap at a reasonable summary size (10k tokens max)
+                    Some(available.min(10_000))
+                }
+                "embedded" => {
+                    // For smaller context models, be more conservative
+                    let model_limit = self.context_window.total_tokens;
+                    let current_usage = self.context_window.used_tokens;
+                    // Leave 1k buffer
+                    let available = model_limit.saturating_sub(current_usage).saturating_sub(1000);
+                    // Cap at 3k for embedded models
+                    Some(available.min(3000))
+                }
+                _ => {
+                    // Default: conservative approach
+                    let available = self.context_window.remaining_tokens().saturating_sub(2000);
+                    Some(available.min(5000))
+                }
+            };
+
+            info!("Requesting summary with max_tokens: {:?} (current usage: {} tokens)",
+                summary_max_tokens, self.context_window.used_tokens);
+
             let summary_request = CompletionRequest {
                 messages: summary_messages,
-                max_tokens: Some(4000), // Reasonable size for summary
+                max_tokens: summary_max_tokens,
                 temperature: Some(0.3), // Lower temperature for factual summary
                 stream: false,
                 tools: None,
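
All three match arms compute the same shape of budget: available = limit - used - buffer, clamped to a per-provider cap, with `saturating_sub` keeping the arithmetic from underflowing once usage exceeds the limit. Worked through as a standalone sketch (constants taken from the arms above):

```rust
// Standalone sketch of the budget arithmetic behind summary_max_tokens.
fn summary_budget(model_limit: u32, used: u32, buffer: u32, cap: u32) -> u32 {
    model_limit.saturating_sub(used).saturating_sub(buffer).min(cap)
}

fn main() {
    // Claude-style 200k context, 5k buffer, 10k cap:
    assert_eq!(summary_budget(200_000, 170_000, 5_000, 10_000), 10_000); // room to spare
    assert_eq!(summary_budget(200_000, 188_000, 5_000, 10_000), 7_000);  // budget shrinks
    assert_eq!(summary_budget(200_000, 199_000, 5_000, 10_000), 0);      // saturates at zero

    // Embedded model: 8k window, 1k buffer, 3k cap:
    assert_eq!(summary_budget(8_192, 6_000, 1_000, 3_000), 1_192);
}
```

Note the zero case: once usage exceeds the limit minus the buffer, the budget saturates to `Some(0)`, which the provider layer would presumably need to reject or raise to a small floor rather than send as-is.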
@@ -962,7 +1020,11 @@ The tool will execute immediately and you'll receive the result (success or erro
                     println!("🔄 Context reset complete. Continuing with your request...\n");
                 }
                 Err(e) => {
-                    warn!("Failed to create summary: {}. Continuing without reset.", e);
+                    error!("Failed to create summary: {}", e);
+                    println!("⚠️ Unable to create summary. Consider starting a new session if you continue to see errors.\n");
+                    // Don't continue with the original request if summarization failed
+                    // as we're likely at token limit
+                    return Err(anyhow::anyhow!("Context window at capacity and summarization failed. Please start a new session."));
                 }
             }
         }
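
Since the `Err` arm now returns instead of falling through, the failure propagates to whatever drives the agent loop. A hypothetical caller (the `run_turn` name is illustrative, not from this codebase):

```rust
use anyhow::Result;

// Stand-in for the agent entry point shown in the diff.
fn run_turn() -> Result<String> {
    // ... summarization and completion logic elided ...
    Err(anyhow::anyhow!(
        "Context window at capacity and summarization failed. Please start a new session."
    ))
}

fn main() {
    match run_turn() {
        Ok(reply) => println!("{reply}"),
        Err(e) => {
            // With this commit the loop stops here, instead of sending a request
            // the provider would reject for exceeding its token limit.
            eprintln!("session error: {e}");
        }
    }
}
```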