Streaming token support

This commit is contained in:
Dhanji Prasanna
2025-09-08 13:24:39 +10:00
parent 33d4cef00b
commit 1e06b9fea3
3 changed files with 358 additions and 202 deletions

View File

@@ -75,9 +75,20 @@ async fn run_interactive(mut agent: Agent, show_prompt: bool, show_code: bool) -
println!( println!(
"I solve problems by writing and executing code. Tell me what you need to accomplish!" "I solve problems by writing and executing code. Tell me what you need to accomplish!"
); );
println!();
// Display provider and model information
match agent.get_provider_info() {
Ok((provider, model)) => {
println!("🔧 Provider: {} | Model: {}", provider, model);
}
Err(e) => {
error!("Failed to get provider info: {}", e);
}
}
println!(); println!();
println!("Type 'exit' or 'quit' to exit, use Up/Down arrows for command history"); println!("Type 'exit' or 'quit' to exit, use Up/Down arrows for command history");
println!("Press ESC during operations to cancel the current request");
println!(); println!();
// Initialize rustyline editor with history // Initialize rustyline editor with history

View File

@@ -6,8 +6,7 @@ use serde::{Deserialize, Serialize};
use std::path::Path; use std::path::Path;
use std::time::{Duration, Instant}; use std::time::{Duration, Instant};
use tokio_util::sync::CancellationToken; use tokio_util::sync::CancellationToken;
use tracing::field::debug; use tracing::{error, field::debug, info};
use tracing::info;
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
pub struct ContextWindow { pub struct ContextWindow {
@@ -94,8 +93,9 @@ impl Agent {
// Set default provider // Set default provider
providers.set_default(&config.providers.default_provider)?; providers.set_default(&config.providers.default_provider)?;
// Initialize context window with configured max context length // Determine context window size based on active provider
let context_window = ContextWindow::new(config.agent.max_context_length as u32); let context_length = Self::determine_context_length(&config, &providers)?;
let context_window = ContextWindow::new(context_length);
Ok(Self { Ok(Self {
providers, providers,
@@ -104,6 +104,62 @@ impl Agent {
}) })
} }
/// Pick the context-window size (in tokens) for the currently active provider.
///
/// Provider/model-specific defaults are used where known; otherwise the
/// agent-level `max_context_length` from the config is the fallback.
fn determine_context_length(config: &Config, providers: &ProviderRegistry) -> Result<u32> {
    // Inspect the active provider to decide which defaults apply.
    let active = providers.get(None)?;
    let provider_name = active.name();
    let model_name = active.model();

    // Fallback shared by several arms below.
    let configured_default = config.agent.max_context_length as u32;

    let tokens = match provider_name {
        "embedded" => match &config.providers.embedded {
            // An explicitly configured context_length wins; otherwise pick a
            // per-model default for the embedded backend.
            Some(embedded) => embedded.context_length.unwrap_or_else(|| {
                match embedded.model_type.to_lowercase().as_str() {
                    "codellama" => 16384, // CodeLlama supports 16k context
                    "llama" => 4096,      // Base Llama models
                    "mistral" => 8192,    // Mistral models
                    _ => 4096,            // Conservative default
                }
            }),
            None => configured_default,
        },
        // OpenAI model-specific context lengths
        "openai" if model_name.contains("gpt-4") => 128000, // GPT-4 models have 128k context
        "openai" if model_name.contains("gpt-3.5") => 16384, // GPT-3.5-turbo has 16k context
        "openai" => 4096,                                   // Conservative default
        // Anthropic model-specific context lengths
        "anthropic" if model_name.contains("claude-3") => 200000, // Claude-3 has 200k context
        "anthropic" if model_name.contains("claude-2") => 100000, // Claude-2 has 100k context
        "anthropic" => 100000, // Conservative default for Claude
        // Unknown providers fall back to the agent-level configuration.
        _ => configured_default,
    };

    info!(
        "Using context length: {} tokens for provider: {} (model: {})",
        tokens, provider_name, model_name
    );

    Ok(tokens)
}
/// Returns the `(provider_name, model_name)` pair for the active provider.
pub fn get_provider_info(&self) -> Result<(String, String)> {
    let active = self.providers.get(None)?;
    let name = active.name().to_string();
    let model = active.model().to_string();
    Ok((name, model))
}
pub async fn execute_task( pub async fn execute_task(
&mut self, &mut self,
description: &str, description: &str,
@@ -168,41 +224,67 @@ impl Agent {
) -> Result<String> { ) -> Result<String> {
info!("Executing task: {}", description); info!("Executing task: {}", description);
let total_start = Instant::now();
let provider = self.providers.get(None)?; let provider = self.providers.get(None)?;
let system_prompt = format!( let system_prompt = format!(
"You are G3, a code-first AI agent. Your goal is to solve problems by writing code that completes the desired task. "You are G3, a general-purpose AI agent. Your goal is to analyze and write code to solve given problems.
When given a task: G3 uses LLMs with tool calling capability.
1. Analyze what needs to be done Tools allow external systems to provide context and data to G3. You solve higher level problems using
2. Rate the difficulty of the task from 1 (easy, file operations) to 10 (difficult, build complex applications like Firefox) tools, and can interact with multiple at once. When you want to perform an action, use 'I' as the pronoun.
3. Choose the most appropriate programming language{}
4. Include any necessary imports/dependencies
5. Add error handling where appropriate
6. Generate code to complete the task, or ask for more details, but no other output
Prefer these languages: # Available Tools
- Bash/Shell: File operations, system administration, simple tasks - shell:
- Python: Complex data processing, when libraries are needed Execute a command in the shell.
- Rust: Performance-critical tasks, system programming
Only use Rust/Python when you need libraries or complex logic that bash can't handle easily. This will return the output and error concatenated into a single string, as
you would see from running on the command line. There will also be an indication
of if the command succeeded or failed.
Format your code response in markdown backticks as follows: Avoid commands that produce a large amount of output, and consider piping those outputs to files.
difficulty rating: [X]
```[language]
[code]
```
with nothing afterwards.", **Important**: Each shell command runs in its own process. Things like directory changes or
if let Some(lang) = language { sourcing files do not persist between tool calls. So you may need to repeat them each time by
format!(" (prefer {})", lang) stringing together commands, e.g. `cd example && ls` or `source env/bin/activate && pip install numpy`
} else {
" based on the task type".to_string() Multiple commands: Use ; or && to chain commands, avoid newlines
} Pathnames: Use absolute paths and avoid cd unless explicitly requested
);
Usage:
- Call the `shell` tool with the desired bash/shell commands.
- search:
Search the web for information about any topic.
- final_output:
This tool signals the final output for a user in a conversation and MUST be used for the final message to the user. You must
pass in a detailed summary of the work done to this tool call.
Purpose:
- Collects the final output for a user
- Provides clear validation feedback when output isn't valid
Usage:
- Call the `final_output` tool with a summary of the work performed.
# Response Guidelines
- Use Markdown formatting for all responses.
- Follow best practices for Markdown, including:
- Using headers for organization.
- Bullet points for lists.
- Links formatted correctly, either as linked text (e.g., [this is linked text](https://example.com)) or automatic links using angle brackets (e.g., <http://example.com/>).
- For code, use fenced code blocks by placing triple backticks (` ``` `) before and after the code. Include the language identifier after the opening backticks (e.g., ` ```python `) to enable syntax highlighting.
- Ensure clarity, conciseness, and proper formatting to enhance readability and usability.
IMPORTANT INSTRUCTIONS:
Please keep going until the user's query is completely resolved, before ending your turn and yielding back to the user.
Only terminate your turn when you are sure that the problem is solved.
If you are not sure about file content or codebase structure, or other information pertaining to the user's request,
use your tools to read files and gather the relevant information: do NOT guess or make up an answer. It is important
you use tools that can assist with providing the right context.
");
if show_prompt { if show_prompt {
println!("🔍 System Prompt:"); println!("🔍 System Prompt:");
@@ -232,26 +314,33 @@ with nothing afterwards.",
messages, messages,
max_tokens: Some(2048), max_tokens: Some(2048),
temperature: Some(0.2), temperature: Some(0.2),
stream: false, stream: true, // Enable streaming
}; };
// Time the LLM call with cancellation support // Time the LLM call with cancellation support and streaming
let llm_start = Instant::now(); let llm_start = Instant::now();
let response = tokio::select! { let response_content = tokio::select! {
result = provider.complete(request) => result?, result = self.stream_completion(request) => result?,
_ = cancellation_token.cancelled() => { _ = cancellation_token.cancelled() => {
return Err(anyhow::anyhow!("Operation cancelled by user")); return Err(anyhow::anyhow!("Operation cancelled by user"));
} }
}; };
let llm_duration = llm_start.elapsed(); let llm_duration = llm_start.elapsed();
// Update context window with actual token usage // Create a mock usage for now (we'll need to track this during streaming)
self.context_window.update_usage(&response.usage); let mock_usage = g3_providers::Usage {
prompt_tokens: 100, // Estimate
completion_tokens: response_content.len() as u32 / 4, // Rough estimate
total_tokens: 100 + (response_content.len() as u32 / 4),
};
// Update context window with estimated token usage
self.context_window.update_usage(&mock_usage);
// Add assistant response to context window // Add assistant response to context window
let assistant_message = Message { let assistant_message = Message {
role: MessageRole::Assistant, role: MessageRole::Assistant,
content: response.content.clone(), content: response_content.clone(),
}; };
self.context_window.add_message(assistant_message); self.context_window.add_message(assistant_message);
@@ -259,19 +348,16 @@ with nothing afterwards.",
let exec_start = Instant::now(); let exec_start = Instant::now();
let executor = CodeExecutor::new(); let executor = CodeExecutor::new();
let result = tokio::select! { let result = tokio::select! {
result = executor.execute_from_response_with_options(&response.content, show_code) => result?, result = executor.execute_from_response_with_options(&response_content, show_code) => result?,
_ = cancellation_token.cancelled() => { _ = cancellation_token.cancelled() => {
return Err(anyhow::anyhow!("Operation cancelled by user")); return Err(anyhow::anyhow!("Operation cancelled by user"));
} }
}; };
let exec_duration = exec_start.elapsed(); let exec_duration = exec_start.elapsed();
let total_duration = total_start.elapsed();
if show_timing { if show_timing {
let timing_summary = format!( let timing_summary = format!(
"\n{} [💡: {} ⚡️: {}]", "\n💭 {} | ⚡️ {}",
Self::format_duration(total_duration),
Self::format_duration(llm_duration), Self::format_duration(llm_duration),
Self::format_duration(exec_duration) Self::format_duration(exec_duration)
); );
@@ -285,6 +371,39 @@ with nothing afterwards.",
&self.context_window &self.context_window
} }
/// Streams a completion from the active provider, echoing each chunk to
/// stdout as it arrives, and returns the fully accumulated response text.
///
/// Stops early when a chunk reports `finished`; propagates the first
/// streaming error after logging it.
async fn stream_completion(&self, request: CompletionRequest) -> Result<String> {
    use std::io::{self, Write};
    use tokio_stream::StreamExt;

    let provider = self.providers.get(None)?;
    let mut chunks = provider.stream(request).await?;
    let mut accumulated = String::new();

    // Response indicator shown before the first token; flush so it is
    // visible immediately rather than buffered.
    print!("🤖 ");
    io::stdout().flush()?;

    while let Some(next) = chunks.next().await {
        let chunk = match next {
            Ok(c) => c,
            Err(e) => {
                error!("Streaming error: {}", e);
                return Err(e);
            }
        };

        // Echo the token and keep a copy for the caller.
        print!("{}", chunk.content);
        io::stdout().flush()?;
        accumulated.push_str(&chunk.content);

        if chunk.finished {
            break;
        }
    }

    println!(); // New line after streaming completes
    Ok(accumulated)
}
fn format_duration(duration: Duration) -> String { fn format_duration(duration: Duration) -> String {
let total_ms = duration.as_millis(); let total_ms = duration.as_millis();

View File

@@ -1,13 +1,19 @@
use g3_providers::{LLMProvider, CompletionRequest, CompletionResponse, CompletionStream, CompletionChunk, Usage, Message, MessageRole};
use anyhow::Result; use anyhow::Result;
use llama_cpp::{LlamaModel, LlamaSession, LlamaParams, SessionParams, standard_sampler::{StandardSampler, SamplerStage}}; use g3_providers::{
CompletionChunk, CompletionRequest, CompletionResponse, CompletionStream, LLMProvider, Message,
MessageRole, Usage,
};
use llama_cpp::{
standard_sampler::{SamplerStage, StandardSampler},
LlamaModel, LlamaParams, LlamaSession, SessionParams,
};
use std::path::Path; use std::path::Path;
use std::sync::Arc;
use tokio::sync::Mutex;
use tracing::{debug, info, error, warn};
use tokio::sync::mpsc;
use tokio_stream::wrappers::ReceiverStream;
use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use tokio::sync::mpsc;
use tokio::sync::Mutex;
use tokio_stream::wrappers::ReceiverStream;
use tracing::{debug, error, info, warn};
pub struct EmbeddedProvider { pub struct EmbeddedProvider {
model: Arc<LlamaModel>, model: Arc<LlamaModel>,
@@ -62,7 +68,8 @@ impl EmbeddedProvider {
session_params.n_threads = threads; session_params.n_threads = threads;
} }
let session = model.create_session(session_params) let session = model
.create_session(session_params)
.map_err(|e| anyhow::anyhow!("Failed to create session: {}", e))?; .map_err(|e| anyhow::anyhow!("Failed to create session: {}", e))?;
info!("Successfully loaded {} model", model_type); info!("Successfully loaded {} model", model_type);
@@ -85,7 +92,10 @@ impl EmbeddedProvider {
for message in messages { for message in messages {
match message.role { match message.role {
MessageRole::System => { MessageRole::System => {
formatted.push_str(&format!("[INST] <<SYS>>\n{}\n<</SYS>>\n\n", message.content)); formatted.push_str(&format!(
"[INST] <<SYS>>\n{}\n<</SYS>>\n\n",
message.content
));
} }
MessageRole::User => { MessageRole::User => {
formatted.push_str(&format!("{} [/INST] ", message.content)); formatted.push_str(&format!("{} [/INST] ", message.content));
@@ -99,13 +109,21 @@ impl EmbeddedProvider {
formatted formatted
} }
async fn generate_completion(&self, prompt: &str, max_tokens: u32, temperature: f32) -> Result<String> { async fn generate_completion(
&self,
prompt: &str,
max_tokens: u32,
temperature: f32,
) -> Result<String> {
let session = self.session.clone(); let session = self.session.clone();
let prompt = prompt.to_string(); let prompt = prompt.to_string();
// Calculate dynamic max tokens based on available context headroom // Calculate dynamic max tokens based on available context headroom
let prompt_tokens = self.estimate_tokens(&prompt); let prompt_tokens = self.estimate_tokens(&prompt);
let available_tokens = self.context_length.saturating_sub(prompt_tokens).saturating_sub(50); // Reserve 50 tokens for safety let available_tokens = self
.context_length
.saturating_sub(prompt_tokens)
.saturating_sub(50); // Reserve 50 tokens for safety
let dynamic_max_tokens = std::cmp::min(max_tokens as usize, available_tokens as usize); let dynamic_max_tokens = std::cmp::min(max_tokens as usize, available_tokens as usize);
debug!("Context calculation: prompt_tokens={}, context_length={}, available_tokens={}, dynamic_max_tokens={}", debug!("Context calculation: prompt_tokens={}, context_length={}, available_tokens={}, dynamic_max_tokens={}",
@@ -114,89 +132,100 @@ impl EmbeddedProvider {
// Add timeout to the entire operation // Add timeout to the entire operation
let timeout_duration = std::time::Duration::from_secs(30); // Increased timeout for larger contexts let timeout_duration = std::time::Duration::from_secs(30); // Increased timeout for larger contexts
let result = tokio::time::timeout(timeout_duration, tokio::task::spawn_blocking(move || { let result = tokio::time::timeout(
let mut session = match session.try_lock() { timeout_duration,
Ok(ctx) => ctx, tokio::task::spawn_blocking(move || {
Err(_) => return Err(anyhow::anyhow!("Model is busy, please try again")), let mut session = match session.try_lock() {
}; Ok(ctx) => ctx,
Err(_) => return Err(anyhow::anyhow!("Model is busy, please try again")),
};
debug!("Starting inference with prompt length: {} chars, estimated {} tokens", prompt.len(), prompt_tokens); debug!(
"Starting inference with prompt length: {} chars, estimated {} tokens",
prompt.len(),
prompt_tokens
);
// Set context to the prompt // Set context to the prompt
debug!("About to call set_context..."); debug!("About to call set_context...");
session.set_context(&prompt) session
.map_err(|e| anyhow::anyhow!("Failed to set context: {}", e))?; .set_context(&prompt)
debug!("set_context completed successfully"); .map_err(|e| anyhow::anyhow!("Failed to set context: {}", e))?;
debug!("set_context completed successfully");
// Create sampler with temperature // Create sampler with temperature
debug!("Creating sampler..."); debug!("Creating sampler...");
let stages = vec![ let stages = vec![
SamplerStage::Temperature(temperature), SamplerStage::Temperature(temperature),
SamplerStage::TopK(40), SamplerStage::TopK(40),
SamplerStage::TopP(0.9), SamplerStage::TopP(0.9),
]; ];
let sampler = StandardSampler::new_softmax(stages, 1); let sampler = StandardSampler::new_softmax(stages, 1);
debug!("Sampler created successfully"); debug!("Sampler created successfully");
// Start completion with dynamic max tokens // Start completion with dynamic max tokens
debug!("About to call start_completing_with with {} max tokens...", dynamic_max_tokens); debug!(
let mut completion_handle = session.start_completing_with(sampler, dynamic_max_tokens) "About to call start_completing_with with {} max tokens...",
.map_err(|e| anyhow::anyhow!("Failed to start completion: {}", e))?; dynamic_max_tokens
debug!("start_completing_with completed successfully"); );
let mut completion_handle = session
.start_completing_with(sampler, dynamic_max_tokens)
.map_err(|e| anyhow::anyhow!("Failed to start completion: {}", e))?;
debug!("start_completing_with completed successfully");
let mut generated_text = String::new(); let mut generated_text = String::new();
let mut token_count = 0; let mut token_count = 0;
let start_time = std::time::Instant::now(); let start_time = std::time::Instant::now();
debug!("Starting token generation loop..."); debug!("Starting token generation loop...");
// Generate tokens with dynamic limits // Generate tokens with dynamic limits
while let Some(token) = completion_handle.next_token() { while let Some(token) = completion_handle.next_token() {
// Check for timeout on each token // Check for timeout on each token
if start_time.elapsed() > std::time::Duration::from_secs(25) { if start_time.elapsed() > std::time::Duration::from_secs(25) {
debug!("Token generation timeout after {} tokens", token_count); debug!("Token generation timeout after {} tokens", token_count);
break; break;
}
let token_string = session.model().token_to_piece(token);
generated_text.push_str(&token_string);
token_count += 1;
if token_count <= 10 || token_count % 50 == 0 {
debug!("Generated token {}: '{}'", token_count, token_string);
}
// Use dynamic token limit
if token_count >= dynamic_max_tokens {
debug!("Reached dynamic token limit: {}", dynamic_max_tokens);
break;
}
// Stop on completion markers
if generated_text.contains("</s>") || generated_text.contains("[/INST]") {
debug!("Hit CodeLlama stop sequence at {} tokens", token_count);
break;
}
} }
let token_string = session.model().token_to_piece(token); debug!(
generated_text.push_str(&token_string); "Token generation loop completed. Generated {} tokens in {:?}",
token_count += 1; token_count,
start_time.elapsed()
if token_count <= 10 || token_count % 50 == 0 { );
debug!("Generated token {}: '{}'", token_count, token_string); Ok((generated_text.trim().to_string(), token_count))
} }),
)
// Use dynamic token limit .await;
if token_count >= dynamic_max_tokens {
debug!("Reached dynamic token limit: {}", dynamic_max_tokens);
break;
}
// Stop on completion markers
if generated_text.contains("</s>") || generated_text.contains("[/INST]") {
debug!("Hit CodeLlama stop sequence at {} tokens", token_count);
break;
}
// Stop on natural completion points after reasonable generation
if token_count >= 20 && (
generated_text.trim().ends_with("```") ||
(generated_text.contains("```") && generated_text.matches("```").count() % 2 == 0) // Complete code blocks
) {
debug!("Hit code block completion at {} tokens", token_count);
break;
}
}
debug!("Token generation loop completed. Generated {} tokens in {:?}", token_count, start_time.elapsed());
Ok((generated_text.trim().to_string(), token_count))
})).await;
match result { match result {
Ok(inner_result) => match inner_result { Ok(inner_result) => match inner_result {
Ok(task_result) => match task_result { Ok(task_result) => match task_result {
Ok((text, token_count)) => { Ok((text, token_count)) => {
info!("Completed generation: {} tokens (dynamic limit was {})", token_count, dynamic_max_tokens); info!(
"Completed generation: {} tokens (dynamic limit was {})",
token_count, dynamic_max_tokens
);
Ok(text) Ok(text)
} }
Err(e) => Err(e), Err(e) => Err(e),
@@ -221,7 +250,10 @@ impl EmbeddedProvider {
#[async_trait::async_trait] #[async_trait::async_trait]
impl LLMProvider for EmbeddedProvider { impl LLMProvider for EmbeddedProvider {
async fn complete(&self, request: CompletionRequest) -> Result<CompletionResponse> { async fn complete(&self, request: CompletionRequest) -> Result<CompletionResponse> {
debug!("Processing completion request with {} messages", request.messages.len()); debug!(
"Processing completion request with {} messages",
request.messages.len()
);
let prompt = self.format_messages(&request.messages); let prompt = self.format_messages(&request.messages);
let max_tokens = request.max_tokens.unwrap_or(self.max_tokens); let max_tokens = request.max_tokens.unwrap_or(self.max_tokens);
@@ -229,7 +261,9 @@ impl LLMProvider for EmbeddedProvider {
debug!("Formatted prompt length: {} chars", prompt.len()); debug!("Formatted prompt length: {} chars", prompt.len());
let content = self.generate_completion(&prompt, max_tokens, temperature).await?; let content = self
.generate_completion(&prompt, max_tokens, temperature)
.await?;
// Estimate token usage (rough approximation) // Estimate token usage (rough approximation)
let prompt_tokens = (prompt.len() / 4) as u32; // Rough estimate: 4 chars per token let prompt_tokens = (prompt.len() / 4) as u32; // Rough estimate: 4 chars per token
@@ -247,7 +281,10 @@ impl LLMProvider for EmbeddedProvider {
} }
async fn stream(&self, request: CompletionRequest) -> Result<CompletionStream> { async fn stream(&self, request: CompletionRequest) -> Result<CompletionStream> {
debug!("Processing streaming request with {} messages", request.messages.len()); debug!(
"Processing streaming request with {} messages",
request.messages.len()
);
let prompt = self.format_messages(&request.messages); let prompt = self.format_messages(&request.messages);
let max_tokens = request.max_tokens.unwrap_or(self.max_tokens); let max_tokens = request.max_tokens.unwrap_or(self.max_tokens);
@@ -262,7 +299,8 @@ impl LLMProvider for EmbeddedProvider {
let mut session = match session.try_lock() { let mut session = match session.try_lock() {
Ok(ctx) => ctx, Ok(ctx) => ctx,
Err(_) => { Err(_) => {
let _ = tx.blocking_send(Err(anyhow::anyhow!("Model is busy, please try again"))); let _ =
tx.blocking_send(Err(anyhow::anyhow!("Model is busy, please try again")));
return; return;
} }
}; };
@@ -282,10 +320,13 @@ impl LLMProvider for EmbeddedProvider {
let sampler = StandardSampler::new_softmax(stages, 1); let sampler = StandardSampler::new_softmax(stages, 1);
// Start completion // Start completion
let mut completion_handle = match session.start_completing_with(sampler, max_tokens as usize) { let mut completion_handle = match session
.start_completing_with(sampler, max_tokens as usize)
{
Ok(handle) => handle, Ok(handle) => handle,
Err(e) => { Err(e) => {
let _ = tx.blocking_send(Err(anyhow::anyhow!("Failed to start completion: {}", e))); let _ =
tx.blocking_send(Err(anyhow::anyhow!("Failed to start completion: {}", e)));
return; return;
} }
}; };
@@ -316,29 +357,14 @@ impl LLMProvider for EmbeddedProvider {
} }
// Stop if we hit common stop sequences // Stop if we hit common stop sequences
if accumulated_text.contains("### Human") || if accumulated_text.contains("### Human")
accumulated_text.contains("### System") || || accumulated_text.contains("### System")
accumulated_text.contains("<|end|>") || || accumulated_text.contains("<|end|>")
accumulated_text.contains("</s>") || || accumulated_text.contains("</s>")
accumulated_text.trim().ends_with("```") { {
debug!("Hit stop sequence in streaming, stopping generation"); debug!("Hit stop sequence in streaming, stopping generation");
break; break;
} }
// Emergency brake for streaming too
if token_count > 0 && token_count % 100 == 0 {
debug!("Streaming: Generated {} tokens so far", token_count);
if accumulated_text.trim().len() > 50 &&
(accumulated_text.contains('\n') || accumulated_text.len() > 200) {
if accumulated_text.trim().ends_with('.') ||
accumulated_text.trim().ends_with('!') ||
accumulated_text.trim().ends_with('?') ||
accumulated_text.trim().ends_with('\n') {
debug!("Found natural stopping point in streaming at {} tokens", token_count);
break;
}
}
}
} }
// Send final chunk // Send final chunk