Time to First Token

This commit is contained in:
Dhanji Prasanna
2025-09-08 14:04:07 +10:00
parent 1e06b9fea3
commit a69054cb2b

View File

@@ -234,7 +234,11 @@ impl Agent {
tools, and can interact with multiple at once. When you want to perform an action, use 'I' as the pronoun. tools, and can interact with multiple at once. When you want to perform an action, use 'I' as the pronoun.
# Available Tools # Available Tools
- shell:
- name: shell
type: function
usage: shell [command]
description: \"
Execute a command in the shell. Execute a command in the shell.
This will return the output and error concatenated into a single string, as This will return the output and error concatenated into a single string, as
@@ -249,23 +253,14 @@ impl Agent {
Multiple commands: Use ; or && to chain commands, avoid newlines Multiple commands: Use ; or && to chain commands, avoid newlines
Pathnames: Use absolute paths and avoid cd unless explicitly requested Pathnames: Use absolute paths and avoid cd unless explicitly requested
\"
Usage: - name: final_output
- Call the `shell` tool with the desired bash/shell commands. type: function
usage: final_output [summary]
- search: description: \"
Search the web for information about any topic.
- final_output:
This tool signals the final output for a user in a conversation and MUST be used for the final message to the user. You must This tool signals the final output for a user in a conversation and MUST be used for the final message to the user. You must
pass in a detailed summary of the work done to this tool call. pass in a detailed summary of the work done so far.\"
Purpose:
- Collects the final output for a user
- Provides clear validation feedback when output isn't valid
Usage:
- Call the `final_output` tool with a summary of the work performed.
# Response Guidelines # Response Guidelines
- Use Markdown formatting for all responses. - Use Markdown formatting for all responses.
@@ -278,12 +273,10 @@ impl Agent {
IMPORTANT INSTRUCTIONS: IMPORTANT INSTRUCTIONS:
Please keep going until the user's query is completely resolved, before ending your turn and yielding back to the user. Break down your task into smaller steps and do one step and tool call at a time.
Only terminate your turn when you are sure that the problem is solved. Do not try to use multiple tools at once.
**After you get the tool result back, consider the result and then proceed to do
If you are not sure about file content or codebase structure, or other information pertaining to the user's request, the next step and tool call if required.**
use your tools to read files and gather the relevant information: do NOT guess or make up an answer. It is important
you use tools that can assist with providing the right context.
"); ");
if show_prompt { if show_prompt {
@@ -319,7 +312,7 @@ you use tools that can assist with providing the right context.
// Time the LLM call with cancellation support and streaming // Time the LLM call with cancellation support and streaming
let llm_start = Instant::now(); let llm_start = Instant::now();
let response_content = tokio::select! { let (response_content, think_time) = tokio::select! {
result = self.stream_completion(request) => result?, result = self.stream_completion(request) => result?,
_ = cancellation_token.cancelled() => { _ = cancellation_token.cancelled() => {
return Err(anyhow::anyhow!("Operation cancelled by user")); return Err(anyhow::anyhow!("Operation cancelled by user"));
@@ -357,8 +350,9 @@ you use tools that can assist with providing the right context.
if show_timing { if show_timing {
let timing_summary = format!( let timing_summary = format!(
"\n💭 {} | ⚡️ {}", "\n⏱️ {} | 💭 {} | ⚡️ {}",
Self::format_duration(llm_duration), Self::format_duration(llm_duration),
Self::format_duration(think_time),
Self::format_duration(exec_duration) Self::format_duration(exec_duration)
); );
Ok(format!("{}\n{}", result, timing_summary)) Ok(format!("{}\n{}", result, timing_summary))
@@ -371,13 +365,16 @@ you use tools that can assist with providing the right context.
&self.context_window &self.context_window
} }
async fn stream_completion(&self, request: CompletionRequest) -> Result<String> { async fn stream_completion(&self, request: CompletionRequest) -> Result<(String, Duration)> {
use tokio_stream::StreamExt; use tokio_stream::StreamExt;
let provider = self.providers.get(None)?; let provider = self.providers.get(None)?;
let mut stream = provider.stream(request).await?; let mut stream = provider.stream(request).await?;
let mut full_content = String::new(); let mut full_content = String::new();
let mut first_token_time: Option<Duration> = None;
let stream_start = Instant::now();
print!("🤖 "); // Start the response indicator print!("🤖 "); // Start the response indicator
use std::io::{self, Write}; use std::io::{self, Write};
io::stdout().flush()?; io::stdout().flush()?;
@@ -385,6 +382,11 @@ you use tools that can assist with providing the right context.
while let Some(chunk_result) = stream.next().await { while let Some(chunk_result) = stream.next().await {
match chunk_result { match chunk_result {
Ok(chunk) => { Ok(chunk) => {
// Record time to first token
if first_token_time.is_none() && !chunk.content.is_empty() {
first_token_time = Some(stream_start.elapsed());
}
print!("{}", chunk.content); print!("{}", chunk.content);
io::stdout().flush()?; io::stdout().flush()?;
full_content.push_str(&chunk.content); full_content.push_str(&chunk.content);
@@ -401,7 +403,8 @@ you use tools that can assist with providing the right context.
} }
println!(); // New line after streaming completes println!(); // New line after streaming completes
Ok(full_content) let ttft = first_token_time.unwrap_or_else(|| stream_start.elapsed());
Ok((full_content, ttft))
} }
fn format_duration(duration: Duration) -> String { fn format_duration(duration: Duration) -> String {