Streaming token support
@@ -6,8 +6,7 @@ use serde::{Deserialize, Serialize};
 use std::path::Path;
 use std::time::{Duration, Instant};
 use tokio_util::sync::CancellationToken;
-use tracing::field::debug;
-use tracing::info;
+use tracing::{error, field::debug, info};

 #[derive(Debug, Clone)]
 pub struct ContextWindow {
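The hunk cuts off before the `ContextWindow` body. Below is a minimal sketch of a shape consistent with how the rest of this diff uses it (`ContextWindow::new`, `update_usage`, `add_message`); every field name is an assumption, and the stand-in `Usage` and `Message` types only mirror the fields this diff actually touches in `g3_providers`.

```rust
// Hypothetical stand-ins for the g3_providers types; only the fields
// this diff touches are included.
#[derive(Debug, Clone)]
pub struct Usage {
    pub prompt_tokens: u32,
    pub completion_tokens: u32,
    pub total_tokens: u32,
}

#[derive(Debug, Clone)]
pub enum MessageRole { User, Assistant }

#[derive(Debug, Clone)]
pub struct Message {
    pub role: MessageRole,
    pub content: String,
}

// Assumed shape only: the real struct body is not shown in this hunk.
#[derive(Debug, Clone)]
pub struct ContextWindow {
    max_tokens: u32,        // capacity passed to new()
    used_tokens: u32,       // running total fed by update_usage()
    messages: Vec<Message>, // history appended by add_message()
}

impl ContextWindow {
    pub fn new(max_tokens: u32) -> Self {
        Self { max_tokens, used_tokens: 0, messages: Vec::new() }
    }

    pub fn update_usage(&mut self, usage: &Usage) {
        self.used_tokens += usage.total_tokens;
    }

    pub fn add_message(&mut self, message: Message) {
        self.messages.push(message);
    }
}
```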
@@ -94,8 +93,9 @@ impl Agent {
         // Set default provider
         providers.set_default(&config.providers.default_provider)?;

-        // Initialize context window with configured max context length
-        let context_window = ContextWindow::new(config.agent.max_context_length as u32);
+        // Determine context window size based on active provider
+        let context_length = Self::determine_context_length(&config, &providers)?;
+        let context_window = ContextWindow::new(context_length);

         Ok(Self {
             providers,
@@ -104,6 +104,62 @@ impl Agent {
         })
     }

+    fn determine_context_length(config: &Config, providers: &ProviderRegistry) -> Result<u32> {
+        // Get the active provider to determine context length
+        let provider = providers.get(None)?;
+        let provider_name = provider.name();
+        let model_name = provider.model();
+
+        // Use provider-specific context length if available, otherwise fall back to agent config
+        let context_length = match provider_name {
+            "embedded" => {
+                // For embedded models, use the configured context_length or model-specific defaults
+                if let Some(embedded_config) = &config.providers.embedded {
+                    embedded_config.context_length.unwrap_or_else(|| {
+                        // Model-specific defaults for embedded models
+                        match embedded_config.model_type.to_lowercase().as_str() {
+                            "codellama" => 16384, // CodeLlama supports 16k context
+                            "llama" => 4096,      // Base Llama models
+                            "mistral" => 8192,    // Mistral models
+                            _ => 4096,            // Conservative default
+                        }
+                    })
+                } else {
+                    config.agent.max_context_length as u32
+                }
+            }
+            "openai" => {
+                // OpenAI model-specific context lengths
+                match model_name {
+                    m if m.contains("gpt-4") => 128000,  // GPT-4 models have 128k context
+                    m if m.contains("gpt-3.5") => 16384, // GPT-3.5-turbo has 16k context
+                    _ => 4096,                           // Conservative default
+                }
+            }
+            "anthropic" => {
+                // Anthropic model-specific context lengths
+                match model_name {
+                    m if m.contains("claude-3") => 200000, // Claude-3 has 200k context
+                    m if m.contains("claude-2") => 100000, // Claude-2 has 100k context
+                    _ => 100000,                           // Conservative default for Claude
+                }
+            }
+            _ => config.agent.max_context_length as u32,
+        };
+
+        info!(
+            "Using context length: {} tokens for provider: {} (model: {})",
+            context_length, provider_name, model_name
+        );
+
+        Ok(context_length)
+    }
+
+    pub fn get_provider_info(&self) -> Result<(String, String)> {
+        let provider = self.providers.get(None)?;
+        Ok((provider.name().to_string(), provider.model().to_string()))
+    }
+
     pub async fn execute_task(
         &mut self,
         description: &str,
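The provider match above keys on substring checks, so arm order matters: a model named "gpt-4-turbo" must hit the 128k arm before the conservative default. A sketch of a table-driven test for the OpenAI arm, assuming the match is mirrored in a free function for testability (a hypothetical helper, not part of this commit; `determine_context_length` itself needs a full `Config`):

```rust
// Hypothetical mirror of the "openai" match arm above, pulled out so the
// fallback logic can be tested without constructing a full Config.
fn openai_context_length(model_name: &str) -> u32 {
    match model_name {
        m if m.contains("gpt-4") => 128_000,
        m if m.contains("gpt-3.5") => 16_384,
        _ => 4_096, // conservative default, as in the diff
    }
}

#[cfg(test)]
mod context_length_tests {
    use super::*;

    #[test]
    fn openai_model_fallbacks() {
        assert_eq!(openai_context_length("gpt-4-turbo"), 128_000);
        assert_eq!(openai_context_length("gpt-3.5-turbo-16k"), 16_384);
        assert_eq!(openai_context_length("text-davinci-003"), 4_096);
    }
}
```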
@@ -168,41 +224,67 @@ impl Agent {
     ) -> Result<String> {
         info!("Executing task: {}", description);

         let total_start = Instant::now();

         let provider = self.providers.get(None)?;

         let system_prompt = format!(
-            "You are G3, a code-first AI agent. Your goal is to solve problems by writing code that completes the desired task.
-
-When given a task:
-1. Analyze what needs to be done
-2. Rate the difficulty of the task from 1 (easy, file operations) to 10 (difficult, build complex applications like Firefox)
-3. Choose the most appropriate programming language{}
-4. Include any necessary imports/dependencies
-5. Add error handling where appropriate
-6. Generate code to complete the task, or ask for more details, but no other output
-
-Prefer these languages:
-- Bash/Shell: File operations, system administration, simple tasks
-- Python: Complex data processing, when libraries are needed
-- Rust: Performance-critical tasks, system programming
-
-Only use Rust/Python when you need libraries or complex logic that bash can't handle easily.
-
-Format your code response in markdown backticks as follows:
-difficulty rating: [X]
-```[language]
-[code]
-```
-
-with nothing afterwards.",
-            if let Some(lang) = language {
-                format!(" (prefer {})", lang)
-            } else {
-                " based on the task type".to_string()
-            }
-        );
+            "You are G3, a general-purpose AI agent. Your goal is to analyze and write code to solve given problems.
+G3 uses LLMs with tool calling capability.
+Tools allow external systems to provide context and data to G3. You solve higher level problems using
+tools, and can interact with multiple at once. When you want to perform an action, use 'I' as the pronoun.
+
+# Available Tools
+- shell:
+  Execute a command in the shell.
+  This will return the output and error concatenated into a single string, as
+  you would see from running on the command line. There will also be an indication
+  of if the command succeeded or failed.
+
+  Avoid commands that produce a large amount of output, and consider piping those outputs to files.
+
+  **Important**: Each shell command runs in its own process. Things like directory changes or
+  sourcing files do not persist between tool calls. So you may need to repeat them each time by
+  stringing together commands, e.g. `cd example && ls` or `source env/bin/activate && pip install numpy`
+
+  Multiple commands: Use ; or && to chain commands, avoid newlines
+  Pathnames: Use absolute paths and avoid cd unless explicitly requested
+
+  Usage:
+  - Call the `shell` tool with the desired bash/shell commands.
+
+- search:
+  Search the web for information about any topic.
+
+- final_output:
+  This tool signals the final output for a user in a conversation and MUST be used for the final message to the user. You must
+  pass in a detailed summary of the work done to this tool call.
+
+  Purpose:
+  - Collects the final output for a user
+  - Provides clear validation feedback when output isn't valid
+
+  Usage:
+  - Call the `final_output` tool with a summary of the work performed.
+
+# Response Guidelines
+- Use Markdown formatting for all responses.
+- Follow best practices for Markdown, including:
+  - Using headers for organization.
+  - Bullet points for lists.
+  - Links formatted correctly, either as linked text (e.g., [this is linked text](https://example.com)) or automatic links using angle brackets (e.g., <http://example.com/>).
+  - For code, use fenced code blocks by placing triple backticks (` ``` `) before and after the code. Include the language identifier after the opening backticks (e.g., ` ```python `) to enable syntax highlighting.
+- Ensure clarity, conciseness, and proper formatting to enhance readability and usability.
+
+IMPORTANT INSTRUCTIONS:
+
+Please keep going until the user's query is completely resolved, before ending your turn and yielding back to the user.
+Only terminate your turn when you are sure that the problem is solved.
+
+If you are not sure about file content or codebase structure, or other information pertaining to the user's request,
+use your tools to read files and gather the relevant information: do NOT guess or make up an answer. It is important
+you use tools that can assist with providing the right context.
+");

         if show_prompt {
             println!("🔍 System Prompt:");
@@ -232,26 +314,33 @@ with nothing afterwards.",
             messages,
             max_tokens: Some(2048),
             temperature: Some(0.2),
-            stream: false,
+            stream: true, // Enable streaming
         };

-        // Time the LLM call with cancellation support
+        // Time the LLM call with cancellation support and streaming
         let llm_start = Instant::now();
-        let response = tokio::select! {
-            result = provider.complete(request) => result?,
+        let response_content = tokio::select! {
+            result = self.stream_completion(request) => result?,
             _ = cancellation_token.cancelled() => {
                 return Err(anyhow::anyhow!("Operation cancelled by user"));
             }
         };
         let llm_duration = llm_start.elapsed();

-        // Update context window with actual token usage
-        self.context_window.update_usage(&response.usage);
+        // Create a mock usage for now (we'll need to track this during streaming)
+        let mock_usage = g3_providers::Usage {
+            prompt_tokens: 100, // Estimate
+            completion_tokens: response_content.len() as u32 / 4, // Rough estimate
+            total_tokens: 100 + (response_content.len() as u32 / 4),
+        };
+
+        // Update context window with estimated token usage
+        self.context_window.update_usage(&mock_usage);

         // Add assistant response to context window
         let assistant_message = Message {
             role: MessageRole::Assistant,
-            content: response.content.clone(),
+            content: response_content.clone(),
         };
         self.context_window.add_message(assistant_message);

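The hard-coded `prompt_tokens: 100` above is an acknowledged stopgap. One way to make both sides of the estimate consistent, assuming no usage data arrives on streamed chunks, is to apply the same chars/4 heuristic to the outgoing messages as well. A sketch; `estimate_tokens` and `estimate_usage` are illustrative helpers, not part of this commit:

```rust
// Rough chars/4 heuristic, matching the completion-side estimate in the diff.
fn estimate_tokens(text: &str) -> u32 {
    (text.len() as u32 / 4).max(1) // ~4 chars per token, floor of 1
}

// Hypothetical call site, assuming each request message exposes `content: String`:
// sum the prompt side, estimate the completion side, and derive the total.
fn estimate_usage(message_contents: &[String], response_content: &str) -> (u32, u32, u32) {
    let prompt_tokens: u32 = message_contents.iter().map(|m| estimate_tokens(m)).sum();
    let completion_tokens = estimate_tokens(response_content);
    (prompt_tokens, completion_tokens, prompt_tokens + completion_tokens)
}
```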
@@ -259,19 +348,16 @@ with nothing afterwards.",
         let exec_start = Instant::now();
         let executor = CodeExecutor::new();
         let result = tokio::select! {
-            result = executor.execute_from_response_with_options(&response.content, show_code) => result?,
+            result = executor.execute_from_response_with_options(&response_content, show_code) => result?,
             _ = cancellation_token.cancelled() => {
                 return Err(anyhow::anyhow!("Operation cancelled by user"));
             }
         };
         let exec_duration = exec_start.elapsed();

-        let total_duration = total_start.elapsed();
-
         if show_timing {
             let timing_summary = format!(
-                "\n{} [💡: {} ⚡️: {}]",
-                Self::format_duration(total_duration),
+                "\n💭 {} | ⚡️ {}",
                 Self::format_duration(llm_duration),
                 Self::format_duration(exec_duration)
             );
@@ -285,6 +371,39 @@ with nothing afterwards.",
         &self.context_window
     }

+    async fn stream_completion(&self, request: CompletionRequest) -> Result<String> {
+        use tokio_stream::StreamExt;
+
+        let provider = self.providers.get(None)?;
+        let mut stream = provider.stream(request).await?;
+
+        let mut full_content = String::new();
+        print!("🤖 "); // Start the response indicator
+        use std::io::{self, Write};
+        io::stdout().flush()?;
+
+        while let Some(chunk_result) = stream.next().await {
+            match chunk_result {
+                Ok(chunk) => {
+                    print!("{}", chunk.content);
+                    io::stdout().flush()?;
+                    full_content.push_str(&chunk.content);
+
+                    if chunk.finished {
+                        break;
+                    }
+                }
+                Err(e) => {
+                    error!("Streaming error: {}", e);
+                    return Err(e);
+                }
+            }
+        }
+
+        println!(); // New line after streaming completes
+        Ok(full_content)
+    }
+
     fn format_duration(duration: Duration) -> String {
         let total_ms = duration.as_millis();
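For reference, a self-contained sketch of the chunk contract `stream_completion` relies on: `chunk.content` as a text delta and `chunk.finished` as the end marker, driven here by a mock stream. The `Chunk` struct is a stand-in assumed from the fields used above, not the actual `g3_providers` type:

```rust
use anyhow::Result;
use tokio_stream::StreamExt;

// Stand-in chunk type, assumed from the fields stream_completion reads.
struct Chunk {
    content: String,
    finished: bool,
}

#[tokio::main]
async fn main() -> Result<()> {
    // Mock three-chunk stream mimicking what provider.stream() might yield.
    let chunks: Vec<Result<Chunk>> = vec![
        Ok(Chunk { content: "Hello".into(), finished: false }),
        Ok(Chunk { content: ", world".into(), finished: false }),
        Ok(Chunk { content: "!".into(), finished: true }),
    ];
    let mut stream = tokio_stream::iter(chunks);

    // Same accumulate-print-break loop as stream_completion above.
    let mut full_content = String::new();
    while let Some(chunk_result) = stream.next().await {
        let chunk = chunk_result?;
        print!("{}", chunk.content);
        full_content.push_str(&chunk.content);
        if chunk.finished {
            break;
        }
    }
    println!();
    assert_eq!(full_content, "Hello, world!");
    Ok(())
}
```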