Streaming token support

This commit is contained in:
Dhanji Prasanna
2025-09-08 13:24:39 +10:00
parent 33d4cef00b
commit 1e06b9fea3
3 changed files with 358 additions and 202 deletions

View File

@@ -6,8 +6,7 @@ use serde::{Deserialize, Serialize};
use std::path::Path;
use std::time::{Duration, Instant};
use tokio_util::sync::CancellationToken;
use tracing::field::debug;
use tracing::info;
use tracing::{error, field::debug, info};
#[derive(Debug, Clone)]
pub struct ContextWindow {
@@ -94,8 +93,9 @@ impl Agent {
// Set default provider
providers.set_default(&config.providers.default_provider)?;
// Initialize context window with configured max context length
let context_window = ContextWindow::new(config.agent.max_context_length as u32);
// Determine context window size based on active provider
let context_length = Self::determine_context_length(&config, &providers)?;
let context_window = ContextWindow::new(context_length);
Ok(Self {
providers,
@@ -104,6 +104,62 @@ impl Agent {
})
}
fn determine_context_length(config: &Config, providers: &ProviderRegistry) -> Result<u32> {
// Get the active provider to determine context length
let provider = providers.get(None)?;
let provider_name = provider.name();
let model_name = provider.model();
// Use provider-specific context length if available, otherwise fall back to agent config
let context_length = match provider_name {
"embedded" => {
// For embedded models, use the configured context_length or model-specific defaults
if let Some(embedded_config) = &config.providers.embedded {
embedded_config.context_length.unwrap_or_else(|| {
// Model-specific defaults for embedded models
match embedded_config.model_type.to_lowercase().as_str() {
"codellama" => 16384, // CodeLlama supports 16k context
"llama" => 4096, // Base Llama models
"mistral" => 8192, // Mistral models
_ => 4096, // Conservative default
}
})
} else {
config.agent.max_context_length as u32
}
}
"openai" => {
// OpenAI model-specific context lengths
match model_name {
m if m.contains("gpt-4") => 128000, // GPT-4 models have 128k context
m if m.contains("gpt-3.5") => 16384, // GPT-3.5-turbo has 16k context
_ => 4096, // Conservative default
}
}
"anthropic" => {
// Anthropic model-specific context lengths
match model_name {
m if m.contains("claude-3") => 200000, // Claude-3 has 200k context
m if m.contains("claude-2") => 100000, // Claude-2 has 100k context
_ => 100000, // Conservative default for Claude
}
}
_ => config.agent.max_context_length as u32,
};
info!(
"Using context length: {} tokens for provider: {} (model: {})",
context_length, provider_name, model_name
);
Ok(context_length)
}
/// Return the active provider's name and model as an owned `(name, model)` pair.
///
/// # Errors
/// Returns an error if no active provider can be resolved from the registry.
pub fn get_provider_info(&self) -> Result<(String, String)> {
    let provider = self.providers.get(None)?;
    let name = provider.name().to_string();
    let model = provider.model().to_string();
    Ok((name, model))
}
pub async fn execute_task(
&mut self,
description: &str,
@@ -168,41 +224,67 @@ impl Agent {
) -> Result<String> {
info!("Executing task: {}", description);
let total_start = Instant::now();
let provider = self.providers.get(None)?;
let system_prompt = format!(
"You are G3, a code-first AI agent. Your goal is to solve problems by writing code that completes the desired task.
"You are G3, a general-purpose AI agent. Your goal is to analyze and write code to solve given problems.
When given a task:
1. Analyze what needs to be done
2. Rate the difficulty of the task from 1 (easy, file operations) to 10 (difficult, build complex applications like Firefox)
3. Choose the most appropriate programming language{}
4. Include any necessary imports/dependencies
5. Add error handling where appropriate
6. Generate code to complete the task, or ask for more details, but no other output
G3 uses LLMs with tool calling capability.
Tools allow external systems to provide context and data to G3. You solve higher level problems using
tools, and can interact with multiple at once. When you want to perform an action, use 'I' as the pronoun.
Prefer these languages:
- Bash/Shell: File operations, system administration, simple tasks
- Python: Complex data processing, when libraries are needed
- Rust: Performance-critical tasks, system programming
# Available Tools
- shell:
Execute a command in the shell.
Only use Rust/Python when you need libraries or complex logic that bash can't handle easily.
This will return the output and error concatenated into a single string, as
you would see from running on the command line. There will also be an indication
of if the command succeeded or failed.
Format your code response in markdown backticks as follows:
difficulty rating: [X]
```[language]
[code]
```
Avoid commands that produce a large amount of output, and consider piping those outputs to files.
with nothing afterwards.",
if let Some(lang) = language {
format!(" (prefer {})", lang)
} else {
" based on the task type".to_string()
}
);
**Important**: Each shell command runs in its own process. Things like directory changes or
sourcing files do not persist between tool calls. So you may need to repeat them each time by
stringing together commands, e.g. `cd example && ls` or `source env/bin/activate && pip install numpy`
Multiple commands: Use ; or && to chain commands, avoid newlines
Pathnames: Use absolute paths and avoid cd unless explicitly requested
Usage:
- Call the `shell` tool with the desired bash/shell commands.
- search:
Search the web for information about any topic.
- final_output:
This tool signals the final output for a user in a conversation and MUST be used for the final message to the user. You must
pass in a detailed summary of the work done to this tool call.
Purpose:
- Collects the final output for a user
- Provides clear validation feedback when output isn't valid
Usage:
- Call the `final_output` tool with a summary of the work performed.
# Response Guidelines
- Use Markdown formatting for all responses.
- Follow best practices for Markdown, including:
- Using headers for organization.
- Bullet points for lists.
- Links formatted correctly, either as linked text (e.g., [this is linked text](https://example.com)) or automatic links using angle brackets (e.g., <http://example.com/>).
- For code, use fenced code blocks by placing triple backticks (` ``` `) before and after the code. Include the language identifier after the opening backticks (e.g., ` ```python `) to enable syntax highlighting.
- Ensure clarity, conciseness, and proper formatting to enhance readability and usability.
IMPORTANT INSTRUCTIONS:
Please keep going until the user's query is completely resolved, before ending your turn and yielding back to the user.
Only terminate your turn when you are sure that the problem is solved.
If you are not sure about file content or codebase structure, or other information pertaining to the user's request,
use your tools to read files and gather the relevant information: do NOT guess or make up an answer. It is important
you use tools that can assist with providing the right context.
");
if show_prompt {
println!("🔍 System Prompt:");
@@ -232,26 +314,33 @@ with nothing afterwards.",
messages,
max_tokens: Some(2048),
temperature: Some(0.2),
stream: false,
stream: true, // Enable streaming
};
// Time the LLM call with cancellation support
// Time the LLM call with cancellation support and streaming
let llm_start = Instant::now();
let response = tokio::select! {
result = provider.complete(request) => result?,
let response_content = tokio::select! {
result = self.stream_completion(request) => result?,
_ = cancellation_token.cancelled() => {
return Err(anyhow::anyhow!("Operation cancelled by user"));
}
};
let llm_duration = llm_start.elapsed();
// Update context window with actual token usage
self.context_window.update_usage(&response.usage);
// Create a mock usage for now (we'll need to track this during streaming)
let mock_usage = g3_providers::Usage {
prompt_tokens: 100, // Estimate
completion_tokens: response_content.len() as u32 / 4, // Rough estimate
total_tokens: 100 + (response_content.len() as u32 / 4),
};
// Update context window with estimated token usage
self.context_window.update_usage(&mock_usage);
// Add assistant response to context window
let assistant_message = Message {
role: MessageRole::Assistant,
content: response.content.clone(),
content: response_content.clone(),
};
self.context_window.add_message(assistant_message);
@@ -259,19 +348,16 @@ with nothing afterwards.",
let exec_start = Instant::now();
let executor = CodeExecutor::new();
let result = tokio::select! {
result = executor.execute_from_response_with_options(&response.content, show_code) => result?,
result = executor.execute_from_response_with_options(&response_content, show_code) => result?,
_ = cancellation_token.cancelled() => {
return Err(anyhow::anyhow!("Operation cancelled by user"));
}
};
let exec_duration = exec_start.elapsed();
let total_duration = total_start.elapsed();
if show_timing {
let timing_summary = format!(
"\n{} [💡: {} ⚡️: {}]",
Self::format_duration(total_duration),
"\n💭 {} | ⚡️ {}",
Self::format_duration(llm_duration),
Self::format_duration(exec_duration)
);
@@ -285,6 +371,39 @@ with nothing afterwards.",
&self.context_window
}
/// Stream a completion from the active provider, echoing each chunk to stdout
/// as it arrives, and return the fully accumulated response text.
///
/// # Errors
/// Returns an error if the provider cannot be resolved, the stream setup or a
/// chunk fails, or stdout cannot be flushed.
async fn stream_completion(&self, request: CompletionRequest) -> Result<String> {
    use std::io::{self, Write};
    use tokio_stream::StreamExt;

    let provider = self.providers.get(None)?;
    let mut stream = provider.stream(request).await?;
    let mut full_content = String::new();

    print!("🤖 "); // Start the response indicator
    io::stdout().flush()?;

    while let Some(chunk_result) = stream.next().await {
        match chunk_result {
            Ok(chunk) => {
                // Echo immediately (and flush) so the user sees progress as
                // tokens arrive rather than after buffering.
                print!("{}", chunk.content);
                io::stdout().flush()?;
                full_content.push_str(&chunk.content);
                if chunk.finished {
                    break;
                }
            }
            Err(e) => {
                // Terminate the partially printed line so the terminal is not
                // left mid-stream before reporting the failure.
                println!();
                error!("Streaming error: {}", e);
                return Err(e);
            }
        }
    }

    println!(); // New line after streaming completes
    Ok(full_content)
}
fn format_duration(duration: Duration) -> String {
let total_ms = duration.as_millis();