Streaming token support

This commit is contained in:
Dhanji Prasanna
2025-09-08 13:24:39 +10:00
parent 33d4cef00b
commit 1e06b9fea3
3 changed files with 358 additions and 202 deletions

View File

@@ -6,8 +6,7 @@ use serde::{Deserialize, Serialize};
use std::path::Path;
use std::time::{Duration, Instant};
use tokio_util::sync::CancellationToken;
use tracing::field::debug;
use tracing::info;
use tracing::{error, field::debug, info};
#[derive(Debug, Clone)]
pub struct ContextWindow {
@@ -94,8 +93,9 @@ impl Agent {
// Set default provider
providers.set_default(&config.providers.default_provider)?;
// Initialize context window with configured max context length
let context_window = ContextWindow::new(config.agent.max_context_length as u32);
// Determine context window size based on active provider
let context_length = Self::determine_context_length(&config, &providers)?;
let context_window = ContextWindow::new(context_length);
Ok(Self {
providers,
@@ -104,6 +104,62 @@ impl Agent {
})
}
fn determine_context_length(config: &Config, providers: &ProviderRegistry) -> Result<u32> {
// Get the active provider to determine context length
let provider = providers.get(None)?;
let provider_name = provider.name();
let model_name = provider.model();
// Use provider-specific context length if available, otherwise fall back to agent config
let context_length = match provider_name {
"embedded" => {
// For embedded models, use the configured context_length or model-specific defaults
if let Some(embedded_config) = &config.providers.embedded {
embedded_config.context_length.unwrap_or_else(|| {
// Model-specific defaults for embedded models
match embedded_config.model_type.to_lowercase().as_str() {
"codellama" => 16384, // CodeLlama supports 16k context
"llama" => 4096, // Base Llama models
"mistral" => 8192, // Mistral models
_ => 4096, // Conservative default
}
})
} else {
config.agent.max_context_length as u32
}
}
"openai" => {
// OpenAI model-specific context lengths
match model_name {
m if m.contains("gpt-4") => 128000, // GPT-4 models have 128k context
m if m.contains("gpt-3.5") => 16384, // GPT-3.5-turbo has 16k context
_ => 4096, // Conservative default
}
}
"anthropic" => {
// Anthropic model-specific context lengths
match model_name {
m if m.contains("claude-3") => 200000, // Claude-3 has 200k context
m if m.contains("claude-2") => 100000, // Claude-2 has 100k context
_ => 100000, // Conservative default for Claude
}
}
_ => config.agent.max_context_length as u32,
};
info!(
"Using context length: {} tokens for provider: {} (model: {})",
context_length, provider_name, model_name
);
Ok(context_length)
}
/// Return the active provider's name and model as an owned `(name, model)` pair.
///
/// # Errors
/// Returns an error if no active provider can be resolved from the registry.
pub fn get_provider_info(&self) -> Result<(String, String)> {
    let provider = self.providers.get(None)?;
    let name = provider.name().to_string();
    let model = provider.model().to_string();
    Ok((name, model))
}
pub async fn execute_task(
&mut self,
description: &str,
@@ -168,41 +224,67 @@ impl Agent {
) -> Result<String> {
info!("Executing task: {}", description);
let total_start = Instant::now();
let provider = self.providers.get(None)?;
let system_prompt = format!(
"You are G3, a code-first AI agent. Your goal is to solve problems by writing code that completes the desired task.
"You are G3, a general-purpose AI agent. Your goal is to analyze and write code to solve given problems.
When given a task:
1. Analyze what needs to be done
2. Rate the difficulty of the task from 1 (easy, file operations) to 10 (difficult, build complex applications like Firefox)
3. Choose the most appropriate programming language{}
4. Include any necessary imports/dependencies
5. Add error handling where appropriate
6. Generate code to complete the task, or ask for more details, but no other output
G3 uses LLMs with tool calling capability.
Tools allow external systems to provide context and data to G3. You solve higher level problems using
tools, and can interact with multiple at once. When you want to perform an action, use 'I' as the pronoun.
Prefer these languages:
- Bash/Shell: File operations, system administration, simple tasks
- Python: Complex data processing, when libraries are needed
- Rust: Performance-critical tasks, system programming
# Available Tools
- shell:
Execute a command in the shell.
Only use Rust/Python when you need libraries or complex logic that bash can't handle easily.
This will return the output and error concatenated into a single string, as
you would see from running on the command line. There will also be an indication
of if the command succeeded or failed.
Format your code response in markdown backticks as follows:
difficulty rating: [X]
```[language]
[code]
```
Avoid commands that produce a large amount of output, and consider piping those outputs to files.
with nothing afterwards.",
if let Some(lang) = language {
format!(" (prefer {})", lang)
} else {
" based on the task type".to_string()
}
);
**Important**: Each shell command runs in its own process. Things like directory changes or
sourcing files do not persist between tool calls. So you may need to repeat them each time by
stringing together commands, e.g. `cd example && ls` or `source env/bin/activate && pip install numpy`
Multiple commands: Use ; or && to chain commands, avoid newlines
Pathnames: Use absolute paths and avoid cd unless explicitly requested
Usage:
- Call the `shell` tool with the desired bash/shell commands.
- search:
Search the web for information about any topic.
- final_output:
This tool signals the final output for a user in a conversation and MUST be used for the final message to the user. You must
pass in a detailed summary of the work done to this tool call.
Purpose:
- Collects the final output for a user
- Provides clear validation feedback when output isn't valid
Usage:
- Call the `final_output` tool with a summary of the work performed.
# Response Guidelines
- Use Markdown formatting for all responses.
- Follow best practices for Markdown, including:
- Using headers for organization.
- Bullet points for lists.
- Links formatted correctly, either as linked text (e.g., [this is linked text](https://example.com)) or automatic links using angle brackets (e.g., <http://example.com/>).
- For code, use fenced code blocks by placing triple backticks (` ``` `) before and after the code. Include the language identifier after the opening backticks (e.g., ` ```python `) to enable syntax highlighting.
- Ensure clarity, conciseness, and proper formatting to enhance readability and usability.
IMPORTANT INSTRUCTIONS:
Please keep going until the user's query is completely resolved, before ending your turn and yielding back to the user.
Only terminate your turn when you are sure that the problem is solved.
If you are not sure about file content or codebase structure, or other information pertaining to the user's request,
use your tools to read files and gather the relevant information: do NOT guess or make up an answer. It is important
you use tools that can assist with providing the right context.
");
if show_prompt {
println!("🔍 System Prompt:");
@@ -232,26 +314,33 @@ with nothing afterwards.",
messages,
max_tokens: Some(2048),
temperature: Some(0.2),
stream: false,
stream: true, // Enable streaming
};
// Time the LLM call with cancellation support
// Time the LLM call with cancellation support and streaming
let llm_start = Instant::now();
let response = tokio::select! {
result = provider.complete(request) => result?,
let response_content = tokio::select! {
result = self.stream_completion(request) => result?,
_ = cancellation_token.cancelled() => {
return Err(anyhow::anyhow!("Operation cancelled by user"));
}
};
let llm_duration = llm_start.elapsed();
// Update context window with actual token usage
self.context_window.update_usage(&response.usage);
// Create a mock usage for now (we'll need to track this during streaming)
let mock_usage = g3_providers::Usage {
prompt_tokens: 100, // Estimate
completion_tokens: response_content.len() as u32 / 4, // Rough estimate
total_tokens: 100 + (response_content.len() as u32 / 4),
};
// Update context window with estimated token usage
self.context_window.update_usage(&mock_usage);
// Add assistant response to context window
let assistant_message = Message {
role: MessageRole::Assistant,
content: response.content.clone(),
content: response_content.clone(),
};
self.context_window.add_message(assistant_message);
@@ -259,19 +348,16 @@ with nothing afterwards.",
let exec_start = Instant::now();
let executor = CodeExecutor::new();
let result = tokio::select! {
result = executor.execute_from_response_with_options(&response.content, show_code) => result?,
result = executor.execute_from_response_with_options(&response_content, show_code) => result?,
_ = cancellation_token.cancelled() => {
return Err(anyhow::anyhow!("Operation cancelled by user"));
}
};
let exec_duration = exec_start.elapsed();
let total_duration = total_start.elapsed();
if show_timing {
let timing_summary = format!(
"\n{} [💡: {} ⚡️: {}]",
Self::format_duration(total_duration),
"\n💭 {} | ⚡️ {}",
Self::format_duration(llm_duration),
Self::format_duration(exec_duration)
);
@@ -285,6 +371,39 @@ with nothing afterwards.",
&self.context_window
}
/// Stream a completion from the active provider, echoing each chunk to stdout
/// as it arrives, and return the fully accumulated response text.
///
/// # Errors
/// Returns an error if the provider cannot be resolved, the stream setup or a
/// chunk fails, or stdout cannot be flushed.
async fn stream_completion(&self, request: CompletionRequest) -> Result<String> {
    use std::io::{self, Write};
    use tokio_stream::StreamExt;

    let provider = self.providers.get(None)?;
    let mut stream = provider.stream(request).await?;
    let mut full_content = String::new();

    print!("🤖 "); // Start the response indicator
    io::stdout().flush()?;

    while let Some(chunk_result) = stream.next().await {
        match chunk_result {
            Ok(chunk) => {
                // Echo immediately (and flush) so the user sees progress as
                // tokens arrive rather than after buffering.
                print!("{}", chunk.content);
                io::stdout().flush()?;
                full_content.push_str(&chunk.content);
                if chunk.finished {
                    break;
                }
            }
            Err(e) => {
                // Terminate the partially printed line so the terminal is not
                // left mid-stream before reporting the failure.
                println!();
                error!("Streaming error: {}", e);
                return Err(e);
            }
        }
    }

    println!(); // New line after streaming completes
    Ok(full_content)
}
fn format_duration(duration: Duration) -> String {
let total_ms = duration.as_millis();