diff --git a/crates/g3-cli/src/utils.rs b/crates/g3-cli/src/utils.rs
index 06ed395..8d892f9 100644
--- a/crates/g3-cli/src/utils.rs
+++ b/crates/g3-cli/src/utils.rs
@@ -139,9 +139,10 @@ pub fn load_config_with_cli_overrides(cli: &Cli) -> Result {
     // Validate provider if specified
     if let Some(ref provider) = cli.provider {
         let valid_providers = ["anthropic", "databricks", "embedded", "openai"];
-        if !valid_providers.contains(&provider.as_str()) {
+        let provider_type = provider.split('.').next().unwrap_or(provider);
+        if !valid_providers.contains(&provider_type) {
             return Err(anyhow::anyhow!(
-                "Invalid provider '{}'. Valid options: {:?}",
+                "Invalid provider '{}'. Provider type must be one of: {:?}",
                 provider,
                 valid_providers
             ));
diff --git a/crates/g3-providers/src/embedded.rs b/crates/g3-providers/src/embedded.rs
index 526e0fd..ac99e03 100644
--- a/crates/g3-providers/src/embedded.rs
+++ b/crates/g3-providers/src/embedded.rs
@@ -14,10 +14,46 @@ use llama_cpp_2::{
 use std::num::NonZeroU32;
 use std::path::PathBuf;
 use std::sync::Arc;
+use std::sync::OnceLock;
 use tokio::sync::mpsc;
 use tokio_stream::wrappers::ReceiverStream;
 use tracing::{debug, error};
 
+/// Global llama.cpp backend - can only be initialized once per process
+static LLAMA_BACKEND: OnceLock<Arc<LlamaBackend>> = OnceLock::new();
+
+/// Get or initialize the global llama.cpp backend
+fn get_or_init_backend() -> Result<Arc<LlamaBackend>> {
+    // Check if already initialized
+    if let Some(backend) = LLAMA_BACKEND.get() {
+        return Ok(Arc::clone(backend));
+    }
+
+    // Suppress llama.cpp's verbose logging to stderr before initialization
+    unsafe {
+        unsafe extern "C" fn void_log(
+            _level: std::ffi::c_int,
+            _text: *const std::os::raw::c_char,
+            _user_data: *mut std::os::raw::c_void,
+        ) {
+            // Intentionally empty - suppress all llama.cpp logging
+        }
+        // Call the underlying C function directly
+        extern "C" { fn llama_log_set(log_callback: Option<unsafe extern "C" fn(std::ffi::c_int, *const std::os::raw::c_char, *mut std::os::raw::c_void)>, user_data: *mut std::os::raw::c_void); }
+        llama_log_set(Some(void_log), std::ptr::null_mut());
+    }
+
+    // Try to initialize
+    debug!("Initializing llama.cpp backend...");
+    let backend = LlamaBackend::init()
+        .map_err(|e| anyhow::anyhow!("Failed to initialize llama.cpp backend: {:?}", e))?;
+
+    // Store it (ignore if another thread beat us to it)
+    let _ = LLAMA_BACKEND.set(Arc::new(backend));
+    let backend = LLAMA_BACKEND.get().expect("backend was just set");
+    Ok(Arc::clone(backend))
+}
+
 /// Embedded LLM provider using llama.cpp with Metal acceleration on macOS.
 ///
 /// Supports multiple model families with their native chat templates:
@@ -103,9 +139,8 @@ impl EmbeddedProvider {
             anyhow::bail!("Model file not found: {}", model_path_buf.display());
         }
 
-        // Initialize the llama.cpp backend
-        let backend = LlamaBackend::init()
-            .map_err(|e| anyhow::anyhow!("Failed to initialize llama.cpp backend: {:?}", e))?;
+        // Get or initialize the global llama.cpp backend
+        let backend = get_or_init_backend()?;
 
         // Set up model parameters
        let n_gpu_layers = gpu_layers.unwrap_or(99);
@@ -130,7 +165,7 @@ impl EmbeddedProvider {
         Ok(Self {
             name,
             model: Arc::new(model),
-            backend: Arc::new(backend),
+            backend,
             model_type: model_type.to_lowercase(),
             model_name: format!("embedded-{}", model_type),
             max_tokens,
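
Note (not part of the diff): the new get_or_init_backend() relies on std::sync::OnceLock so that LlamaBackend::init() runs at most once per process no matter how many EmbeddedProvider instances are constructed. Below is a minimal, self-contained sketch of that get-or-init pattern, with a plain String standing in for LlamaBackend (the stand-in type and names are illustrative only, not code from this repo):

use std::sync::{Arc, OnceLock};

// Stand-in for LlamaBackend: a resource that must be created at most once per process.
static BACKEND: OnceLock<Arc<String>> = OnceLock::new();

fn get_or_init_backend() -> Arc<String> {
    // get_or_init runs the closure at most once, even under concurrent calls;
    // every caller afterwards receives a clone of the same Arc.
    Arc::clone(BACKEND.get_or_init(|| Arc::new(String::from("backend"))))
}

fn main() {
    let a = get_or_init_backend();
    let b = get_or_init_backend();
    // Both handles refer to the single shared instance.
    assert!(Arc::ptr_eq(&a, &b));
}

The diff itself uses set() followed by get() instead of get_or_init(), presumably because LlamaBackend::init() is fallible and get_or_init() only accepts an infallible closure; ignoring the set() result means that if another thread initialized the backend first, its instance simply wins and the caller clones the stored Arc.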