Add token usage and context % to timing footer

Added a quality-of-life feature that displays:
- Tokens used in the current turn (from LLM response, not estimated)
- Current context window usage percentage

These are displayed dimmed after the timing info:
  ⏱️ 1.2s | 💭 0.3s  1234tk | 45% ctx

The token count comes directly from the LLM's usage response data,
not from any estimation. If no usage data is available from the LLM,
only the context percentage is shown.
This commit is contained in:
Dhanji R. Prasanna
2025-12-22 17:22:54 +11:00
parent 720ad8cad7
commit 743d622468

View File

@@ -205,6 +205,7 @@ impl g3_computer_control::WebDriverController for WebDriverSession {
WebDriverSession::Chrome(driver) => driver.quit().await, WebDriverSession::Chrome(driver) => driver.quit().await,
} }
} }
} }
// Additional methods for WebDriverSession that aren't part of the WebDriverController trait // Additional methods for WebDriverSession that aren't part of the WebDriverController trait
@@ -3925,6 +3926,7 @@ impl<W: UiWriter> Agent<W> {
const MAX_AUTO_SUMMARY_ATTEMPTS: usize = 5; // Limit auto-summary retries (increased from 2 for better recovery) const MAX_AUTO_SUMMARY_ATTEMPTS: usize = 5; // Limit auto-summary retries (increased from 2 for better recovery)
let mut final_output_called = false; // Track if final_output was called let mut final_output_called = false; // Track if final_output was called
// Note: Session-level duplicate tracking was removed - we only prevent sequential duplicates (DUP IN CHUNK, DUP IN MSG) // Note: Session-level duplicate tracking was removed - we only prevent sequential duplicates (DUP IN CHUNK, DUP IN MSG)
let mut turn_accumulated_usage: Option<g3_providers::Usage> = None; // Track token usage for timing footer
// Check if we need to summarize before starting // Check if we need to summarize before starting
if self.context_window.should_summarize() { if self.context_window.should_summarize() {
@@ -4170,6 +4172,7 @@ impl<W: UiWriter> Agent<W> {
// Capture usage data if available // Capture usage data if available
if let Some(ref usage) = chunk.usage { if let Some(ref usage) = chunk.usage {
accumulated_usage = Some(usage.clone()); accumulated_usage = Some(usage.clone());
turn_accumulated_usage = Some(usage.clone());
debug!( debug!(
"Received usage data - prompt: {}, completion: {}, total: {}", "Received usage data - prompt: {}, completion: {}, total: {}",
usage.prompt_tokens, usage.completion_tokens, usage.total_tokens usage.prompt_tokens, usage.completion_tokens, usage.total_tokens
@@ -4866,11 +4869,17 @@ impl<W: UiWriter> Agent<W> {
// Add timing if needed // Add timing if needed
let final_response = if show_timing { let final_response = if show_timing {
let turn_tokens = turn_accumulated_usage.as_ref().map(|u| u.total_tokens);
let timing_footer = Self::format_timing_footer(
stream_start.elapsed(),
_ttft,
turn_tokens,
self.context_window.percentage_used(),
);
format!( format!(
"{}\n\n⏱️ {} | 💭 {}", "{}\n\n{}",
full_response, full_response,
Self::format_duration(stream_start.elapsed()), timing_footer
Self::format_duration(_ttft)
) )
} else { } else {
full_response full_response
@@ -5119,11 +5128,17 @@ impl<W: UiWriter> Agent<W> {
// Add timing if needed // Add timing if needed
let final_response = if show_timing { let final_response = if show_timing {
let turn_tokens = turn_accumulated_usage.as_ref().map(|u| u.total_tokens);
let timing_footer = Self::format_timing_footer(
stream_start.elapsed(),
_ttft,
turn_tokens,
self.context_window.percentage_used(),
);
format!( format!(
"{}\n\n⏱️ {} | 💭 {}", "{}\n\n{}",
full_response, full_response,
Self::format_duration(stream_start.elapsed()), timing_footer
Self::format_duration(_ttft)
) )
} else { } else {
full_response full_response
@@ -5140,11 +5155,17 @@ impl<W: UiWriter> Agent<W> {
// Add timing if needed // Add timing if needed
let final_response = if show_timing { let final_response = if show_timing {
let turn_tokens = turn_accumulated_usage.as_ref().map(|u| u.total_tokens);
let timing_footer = Self::format_timing_footer(
stream_start.elapsed(),
_ttft,
turn_tokens,
self.context_window.percentage_used(),
);
format!( format!(
"{}\n\n⏱️ {} | 💭 {}", "{}\n\n{}",
full_response, full_response,
Self::format_duration(stream_start.elapsed()), timing_footer
Self::format_duration(_ttft)
) )
} else { } else {
full_response full_response
@@ -6977,6 +6998,23 @@ impl<W: UiWriter> Agent<W> {
format!("{}m {:.1}s", minutes, remaining_seconds) format!("{}m {:.1}s", minutes, remaining_seconds)
} }
} }
/// Build the per-turn footer: elapsed time and time-to-first-token,
/// followed by a dimmed stats segment (token count when the provider
/// reported usage, plus context-window percentage).
fn format_timing_footer(
    elapsed: Duration,
    ttft: Duration,
    turn_tokens: Option<u32>,
    context_percentage: f32,
) -> String {
    let timing = format!(
        "⏱️ {} | 💭 {}",
        Self::format_duration(elapsed),
        Self::format_duration(ttft)
    );
    // \x1b[2m / \x1b[0m wrap the stats in ANSI "dim" styling; the token
    // count is included only when usage data actually arrived from the LLM.
    match turn_tokens {
        Some(tokens) => {
            format!("{} \x1b[2m{}tk | {:.0}% ctx\x1b[0m", timing, tokens, context_percentage)
        }
        None => format!("{} \x1b[2m{:.0}% ctx\x1b[0m", timing, context_percentage),
    }
}
} }
// Note: JSON tool call filtering is now handled by UiWriter::filter_json_tool_calls (implemented in g3-cli) // Note: JSON tool call filtering is now handled by UiWriter::filter_json_tool_calls (implemented in g3-cli)