Refactor system prompts to eliminate duplication; upgrade embedded provider

- Refactor prompts.rs: extract shared sections (intro, TODO, workspace memory,
  web research, response guidelines) used by both native and non-native prompts
- Fix typo in native prompt: "save them.." -> "save them."
- Fix non-native prompt: add missing closing braces in JSON examples,
  add IMPORTANT steps section, align with native prompt quality
- Add 9 unit tests to verify both prompts contain required sections
- Upgrade llama-cpp-2 dependency and refactor embedded provider
- Update config.example.toml with embedded model examples
- Update workspace memory
This commit is contained in:
Dhanji R. Prasanna
2026-01-28 09:56:39 +11:00
parent 585684a86e
commit a902be1562
9 changed files with 1027 additions and 851 deletions

171
Cargo.lock generated
View File

@@ -229,16 +229,14 @@ dependencies = [
[[package]]
name = "bindgen"
version = "0.69.5"
version = "0.72.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088"
checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895"
dependencies = [
"bitflags 2.10.0",
"cexpr",
"clang-sys",
"itertools 0.12.1",
"lazy_static",
"lazycell",
"itertools",
"log",
"prettyplease",
"proc-macro2",
@@ -247,7 +245,6 @@ dependencies = [
"rustc-hash",
"shlex",
"syn",
"which",
]
[[package]]
@@ -438,6 +435,15 @@ dependencies = [
"error-code",
]
[[package]]
name = "cmake"
version = "0.1.54"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e7caa3f9de89ddbe2c607f4101924c5abec803763ae9534e4f4d7d8f84aa81f0"
dependencies = [
"cc",
]
[[package]]
name = "cocoa"
version = "0.25.0"
@@ -605,12 +611,6 @@ dependencies = [
"unicode-xid",
]
[[package]]
name = "convert_case"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e"
[[package]]
name = "convert_case"
version = "0.6.0"
@@ -857,7 +857,7 @@ checksum = "d8b9f2e4c67f833b660cdb0a3523065869fb35570177239812ed4c905aeff87b"
dependencies = [
"bitflags 2.10.0",
"crossterm_winapi",
"derive_more 2.0.1",
"derive_more",
"document-features",
"mio",
"parking_lot",
@@ -936,19 +936,6 @@ dependencies = [
"powerfmt",
]
[[package]]
name = "derive_more"
version = "0.99.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6edb4b64a43d977b8e99788fe3a04d483834fba1215a7e02caa415b626497f7f"
dependencies = [
"convert_case 0.4.0",
"proc-macro2",
"quote",
"rustc_version",
"syn",
]
[[package]]
name = "derive_more"
version = "2.0.1"
@@ -1078,6 +1065,26 @@ version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c34f04666d835ff5d62e058c3995147c06f42fe86ff053337632bca83e42702d"
[[package]]
name = "enumflags2"
version = "0.7.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1027f7680c853e056ebcec683615fb6fbbc07dbaa13b4d5d9442b146ded4ecef"
dependencies = [
"enumflags2_derive",
]
[[package]]
name = "enumflags2_derive"
version = "0.7.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "67c78a4d8fdf9953a5c9d458f9efe940fd97a0cab0941c075a813ac594733827"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "equivalent"
version = "1.0.2"
@@ -1171,6 +1178,15 @@ version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "52051878f80a721bb68ebfbc930e07b65ba72f2da88968ea5c06fd6ca3d3a127"
[[package]]
name = "find_cuda_helper"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f9f9e65c593dd01ac77daad909ea4ad17f0d6d1776193fc8ea766356177abdad"
dependencies = [
"glob",
]
[[package]]
name = "flate2"
version = "1.1.5"
@@ -1504,7 +1520,7 @@ dependencies = [
"chrono",
"dirs 5.0.1",
"futures-util",
"llama_cpp",
"llama-cpp-2",
"nanoid",
"rand",
"reqwest",
@@ -1643,12 +1659,6 @@ version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
[[package]]
name = "hermit-abi"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c"
[[package]]
name = "hex"
version = "0.4.3"
@@ -2040,15 +2050,6 @@ version = "1.70.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695"
[[package]]
name = "itertools"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569"
dependencies = [
"either",
]
[[package]]
name = "itertools"
version = "0.13.0"
@@ -2155,12 +2156,6 @@ version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
[[package]]
name = "lazycell"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
[[package]]
name = "lebe"
version = "0.5.3"
@@ -2193,15 +2188,6 @@ dependencies = [
"libc",
]
[[package]]
name = "link-cplusplus"
version = "1.0.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f78c730aaa7d0b9336a299029ea49f9ee53b0ed06e9202e8cb7db9bae7b8c82"
dependencies = [
"cc",
]
[[package]]
name = "linked-hash-map"
version = "0.5.6"
@@ -2233,30 +2219,30 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "11d3d7f243d5c5a8b9bb5d6dd2b1602c0cb0b9db1621bafc7ed66e35ff9fe092"
[[package]]
name = "llama_cpp"
version = "0.3.2"
name = "llama-cpp-2"
version = "0.1.125"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f126770a2ed5e0e4596119479dc56f56b99037246bf0e36c544f7581a9458fd"
checksum = "14cc99d19a12f372957e1ad1cb33c5459e6080c7914389e52f2464d8fb043175"
dependencies = [
"derive_more 0.99.20",
"futures",
"llama_cpp_sys",
"num_cpus",
"enumflags2",
"llama-cpp-sys-2",
"thiserror 1.0.69",
"tokio",
"tracing",
"tracing-core",
]
[[package]]
name = "llama_cpp_sys"
version = "0.3.2"
name = "llama-cpp-sys-2"
version = "0.1.125"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "037a1881ada3592c6a922224d5177b4b4f452e6b2979eb97393b71989e48357f"
checksum = "cc9443103277a9808b0e7055966a39fd2de14c7877fecdec4daf7b8770c46ec3"
dependencies = [
"bindgen",
"cc",
"link-cplusplus",
"once_cell",
"cmake",
"find_cuda_helper",
"glob",
"walkdir",
]
[[package]]
@@ -2443,16 +2429,6 @@ dependencies = [
"autocfg",
]
[[package]]
name = "num_cpus"
version = "1.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "91df4bbde75afed763b708b7eee1e8e7651e02d97f6d5dd763e89367e957b23b"
dependencies = [
"hermit-abi",
"libc",
]
[[package]]
name = "number_prefix"
version = "0.4.0"
@@ -2860,7 +2836,7 @@ dependencies = [
"crossterm 0.28.1",
"indoc",
"instability",
"itertools 0.13.0",
"itertools",
"lru",
"paste",
"strum",
@@ -3015,18 +2991,9 @@ dependencies = [
[[package]]
name = "rustc-hash"
version = "1.1.0"
version = "2.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
[[package]]
name = "rustc_version"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92"
dependencies = [
"semver",
]
checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d"
[[package]]
name = "rustix"
@@ -3171,12 +3138,6 @@ dependencies = [
"libc",
]
[[package]]
name = "semver"
version = "1.0.27"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2"
[[package]]
name = "serde"
version = "1.0.228"
@@ -4048,7 +4009,7 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b3644627a5af5fa321c95b9b235a72fd24cd29c648c2c379431e6628655627bf"
dependencies = [
"itertools 0.13.0",
"itertools",
"unicode-segmentation",
"unicode-width 0.1.14",
]
@@ -4311,18 +4272,6 @@ version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a751b3277700db47d3e574514de2eced5e54dc8a5436a3bf7a0b248b2cee16f3"
[[package]]
name = "which"
version = "4.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7"
dependencies = [
"either",
"home",
"once_cell",
"rustix 0.38.44",
]
[[package]]
name = "winapi"
version = "0.3.9"

View File

@@ -1,5 +1,5 @@
# Workspace Memory
> Updated: 2026-01-27T00:12:18Z | Size: 19.5k chars
> Updated: 2026-01-27T02:55:20Z | Size: 21.9k chars
### Remember Tool Wiring
- `crates/g3-core/src/tools/memory.rs` [0..5000] - `execute_remember()`, `get_memory_path()`, `merge_memory()`
@@ -347,3 +347,57 @@ Tracks prompt/prefix caching efficacy across Anthropic and OpenAI providers.
- `crates/g3-core/src/stats.rs`
- `AgentStatsSnapshot.cache_stats` [20] - reference to cache stats for formatting
- `format_cache_stats()` [189..230] - formats cache statistics section with hit rate and efficiency metrics
### Embedded Provider (Local LLM via llama.cpp)
Local model inference using llama-cpp-rs bindings with Metal acceleration on macOS.
- `crates/g3-providers/src/embedded.rs`
- `EmbeddedProvider` [22..85] - struct with session, model_name, max_tokens, temperature, context_length
- `new()` [26..85] - constructor, handles tilde expansion, auto-downloads Qwen if missing
- `format_messages()` [87..175] - converts Message[] to prompt string, supports Qwen/Mistral/Llama templates
- `get_stop_sequences()` [280..340] - returns model-specific stop tokens
- `generate_completion()` [177..278] - non-streaming inference with timeout
- `stream()` [560..780] - streaming inference via spawn_blocking + mpsc channel
**Known Issues (as of 2026-01):**
- Provider name hardcoded as `"embedded"` instead of `"embedded.{name}"` format
- Missing GLM-4 chat template (uses `[gMASK]<sop><|role|>` NOT ChatML)
- Missing `has_native_tool_calling()` override (defaults to false)
- No streaming usage tracking (unlike Anthropic)
- No tool streaming hints (`make_tool_streaming_hint()` not used)
### Chat Template Formats (Embedded Provider)
| Model | Format | Start Token | End Token |
|-------|--------|-------------|----------|
| Qwen | ChatML | `<\|im_start\|>role\n` | `<\|im_end\|>` |
| GLM-4 | ChatGLM4 | `[gMASK]<sop><\|role\|>\n` | `<\|endoftext\|>` |
| Mistral | Instruct | `<s>[INST]` | `[/INST]` |
| Llama | Llama2 | `<<SYS>>` | `<</SYS>>` |
### GLM-4 Model Downloads
Recommended GGUF models for Mac M4 Max with 128GB unified memory.
**Download commands:**
```bash
# GLM-4 9B Q8_0 (~10GB) - Very capable, fast
python3 -m huggingface_hub.commands.huggingface_cli download bartowski/THUDM_GLM-4-9B-0414-GGUF \
--include "THUDM_GLM-4-9B-0414-Q8_0.gguf" --local-dir ~/.g3/models/
# GLM-4 32B Q6_K_L (~27GB) - TOP TIER for coding/reasoning
python3 -m huggingface_hub.commands.huggingface_cli download bartowski/THUDM_GLM-4-32B-0414-GGUF \
--include "THUDM_GLM-4-32B-0414-Q6_K_L.gguf" --local-dir ~/.g3/models/
# Qwen3 4B Q4_K_M (~2.3GB) - Small but rivals 72B performance
python3 -m huggingface_hub.commands.huggingface_cli download Qwen/Qwen3-4B-GGUF \
--include "qwen3-4b-q4_k_m.gguf" --local-dir ~/.g3/models/
```
**Config example:**
```toml
[providers.embedded.glm4]
model_path = "~/.g3/models/THUDM_GLM-4-32B-0414-Q6_K_L.gguf"
model_type = "glm4"
context_length = 32768
max_tokens = 4096
gpu_layers = 99
```

View File

@@ -52,6 +52,35 @@ model = "claude-sonnet-4-5"
# model = "anthropic/claude-3.5-sonnet"
# base_url = "https://openrouter.ai/api/v1"
# =============================================================================
# Embedded providers (local models via llama.cpp with Metal acceleration)
# =============================================================================
# Download models from Hugging Face:
# huggingface-cli download bartowski/THUDM_GLM-4-32B-0414-GGUF \
# --include "THUDM_GLM-4-32B-0414-Q6_K_L.gguf" --local-dir ~/.g3/models/
#
# GLM-4 32B - Top-tier local model for coding/reasoning (context_length auto-detected from GGUF)
# [providers.embedded.glm4]
# model_path = "~/.g3/models/THUDM_GLM-4-32B-0414-Q6_K_L.gguf"
# model_type = "glm4" # Required: glm4, qwen, mistral, llama, codellama
# context_length = 32768 # Optional: auto-detected from GGUF (GLM-4 = 32K)
# max_tokens = 4096 # Optional: defaults to min(4096, context/4)
# temperature = 0.1
# gpu_layers = 99 # Use all GPU layers on Apple Silicon
# threads = 8
# GLM-4 9B - Smaller but very capable (minimal config - most settings auto-detected)
# [providers.embedded.glm4-9b]
# model_path = "~/.g3/models/THUDM_GLM-4-9B-0414-Q8_0.gguf"
# model_type = "glm4"
# gpu_layers = 99 # Optional but recommended for Apple Silicon
# Qwen3 4B - Small but powerful, good for ensemble usage (minimal config)
# [providers.embedded.qwen3]
# model_path = "~/.g3/models/qwen3-4b-q4_k_m.gguf"
# model_type = "qwen"
# gpu_layers = 99 # Optional but recommended for Apple Silicon
# =============================================================================
# Agent settings (all optional - these are the defaults)
# =============================================================================

View File

@@ -54,7 +54,7 @@ mod prompts;
use anyhow::Result;
use g3_config::Config;
use g3_providers::{CacheControl, CompletionRequest, Message, MessageRole, ProviderRegistry};
use prompts::{get_system_prompt_for_native, SYSTEM_PROMPT_FOR_NON_NATIVE_TOOL_USE};
use prompts::{get_system_prompt_for_native, get_system_prompt_for_non_native};
use serde::{Deserialize, Serialize};
use std::time::{Duration, Instant};
use tokio_util::sync::CancellationToken;
@@ -354,7 +354,7 @@ impl<W: UiWriter> Agent<W> {
get_system_prompt_for_native()
} else {
// For non-native providers (embedded models), use JSON format instructions
SYSTEM_PROMPT_FOR_NON_NATIVE_TOOL_USE.to_string()
get_system_prompt_for_non_native()
}
};
@@ -426,12 +426,13 @@ impl<W: UiWriter> Agent<W> {
}
// Check for system prompt markers that are present in both standard and agent mode
// Agent mode replaces the identity line but keeps all other instructions
// Check for system prompt markers that are present in both native and non-native prompts
// Both prompts contain "You have access to tools" as a common marker
let has_tool_instructions = first_message
.content
.contains("IMPORTANT: You must call tools to achieve goals");
.contains("You have access to tools");
if !has_tool_instructions {
panic!("FATAL: First system message does not contain the system prompt. This likely means the README was added before the system prompt.");
panic!("FATAL: First system message does not contain the system prompt marker 'You have access to tools'. This likely means the README was added before the system prompt.");
}
}

View File

@@ -1,5 +1,10 @@
const SYSTEM_NATIVE_TOOL_CALLS: &'static str =
"You are G3, an AI programming agent of the same skill level as a seasoned engineer at a major technology company. You analyze given tasks and write code to achieve goals.
// ============================================================================
// SHARED PROMPT SECTIONS
// These are used by both native and non-native tool calling prompts
// ============================================================================
const SHARED_INTRO: &str = "\
You are G3, an AI programming agent of the same skill level as a seasoned engineer at a major technology company. You analyze given tasks and write code to achieve goals.
You have access to tools. When you need to accomplish a task, you MUST use the appropriate tool. Do not just describe what you would do - actually use the tools.
@@ -11,8 +16,9 @@ IMPORTANT: You must call tools to achieve goals. When you receive a request:
5. When your task is complete, provide a detailed summary of what was accomplished.
For shell commands: Use the shell tool with the exact command needed. Always use `rg` (ripgrep) instead of `grep` - it's faster, has better defaults, and respects .gitignore. Avoid commands that produce a large amount of output, and consider piping those outputs to files. Example: If asked to list files, immediately call the shell tool with command parameter \"ls\".
If you create temporary files for verification, place these in a subdir named 'tmp'. Do NOT pollute the current dir.
If you create temporary files for verification, place these in a subdir named 'tmp'. Do NOT pollute the current dir.";
const SHARED_TODO_SECTION: &str = "\
# Task Management with TODO Tools
**REQUIRED for multi-step tasks.** Use TODO tools when your task involves ANY of:
@@ -75,12 +81,14 @@ Keep items short, specific, and action-oriented.
✓ Helps recover from interruptions
✓ Creates better summaries
If you can complete it with 1-2 tool calls, skip TODO.
If you can complete it with 1-2 tool calls, skip TODO.";
const SHARED_TEMPORARY_FILES: &str = "\
# Temporary files
If you create temporary files for verification or investigation, place these in a subdir named 'tmp'. Do NOT pollute the current dir.
If you create temporary files for verification or investigation, place these in a subdir named 'tmp'. Do NOT pollute the current dir.";
const SHARED_WEB_RESEARCH: &str = "\
# Web Research
When you need to look up documentation, search for resources, find data online, or research a topic to complete your task, use the `research` tool.
@@ -95,13 +103,14 @@ Simply call `research` with a specific query describing what you need to know. T
IMPORTANT: If the user asks you to just respond with text (like \"just say hello\" or \"tell me about X\"), do NOT use tools. Simply respond with the requested text directly. Only use tools when you need to execute commands or complete tasks that require action.
Do not explain what you're going to do - just do it by calling the tools.
Do not explain what you're going to do - just do it by calling the tools.";
const SHARED_WORKSPACE_MEMORY: &str = "\
# Workspace Memory
Workspace memory is automatically loaded at startup alongside README.md and AGENTS.md. It contains an index of features -> code locations, patterns, and entry points. If you need to re-read memory from disk (e.g., after another agent updates it), use `read_file analysis/memory.md`.
**IMPORTANT**: After completing a task where you discovered code locations, you **MUST** call the `remember` tool to save them..
**IMPORTANT**: After completing a task where you discovered code locations, you **MUST** call the `remember` tool to save them.
## Memory Format
@@ -143,33 +152,27 @@ After discovering how session continuation works:
After discovering a useful pattern:
{\"tool\": \"remember\", \"args\": {\"notes\": \"### UTF-8 Safe String Slicing\\nRust string slices use byte indices. Multi-byte chars (emoji, CJK) cause panics if sliced mid-character.\\n\\n1. Use `s.char_indices().nth(n)` to get byte index of Nth character\\n2. Use `s.chars().count()` for length, not `s.len()`\\n3. Danger zones: display truncation, user input, any non-ASCII text\"}}
{\"tool\": \"remember\", \"args\": {\"notes\": \"### UTF-8 Safe String Slicing\\nRust string slices use byte indices. Multi-byte chars (emoji, CJK) cause panics if sliced mid-character.\\n\\n1. Use `s.char_indices().nth(n)` to get byte index of Nth character\\n2. Use `s.chars().count()` for length, not `s.len()`\\n3. Danger zones: display truncation, user input, any non-ASCII text\"}}";
const SHARED_RESPONSE_GUIDELINES: &str = "\
# Response Guidelines
- Use Markdown formatting for all responses except tool calls.
- Whenever taking actions, use the pronoun 'I'
- When you discover features, patterns and code locations, call `remember` to save them.
- When showing example tool call JSON in prose or code blocks, use the fullwidth left curly bracket `` (U+FF5B) instead of `{` to prevent parser confusion.
";
- When showing example tool call JSON in prose or code blocks, use the fullwidth left curly bracket `` (U+FF5B) instead of `{` to prevent parser confusion.";
pub const SYSTEM_PROMPT_FOR_NATIVE_TOOL_USE: &'static str = SYSTEM_NATIVE_TOOL_CALLS;
/// Generate system prompt based on whether multiple tool calls are allowed
pub fn get_system_prompt_for_native() -> String {
SYSTEM_PROMPT_FOR_NATIVE_TOOL_USE.to_string()
}
const SYSTEM_NON_NATIVE_TOOL_USE: &'static str =
"You are G3, a general-purpose AI agent. Your goal is to analyze and solve problems by writing code.
You have access to tools. When you need to accomplish a task, you MUST use the appropriate tool. Do not just describe what you would do - actually use the tools.
// ============================================================================
// NON-NATIVE SPECIFIC SECTIONS
// These are only used by providers without native tool calling
// ============================================================================
const NON_NATIVE_TOOL_FORMAT: &str = "\
# Tool Call Format
When you need to execute a tool, write ONLY the JSON tool call on a new line:
{\"tool\": \"tool_name\", \"args\": {\"param\": \"value\"}
{\"tool\": \"tool_name\", \"args\": {\"param\": \"value\"}}
The tool will execute immediately and you'll receive the result (success or error) to continue with.
@@ -178,8 +181,8 @@ The tool will execute immediately and you'll receive the result (success or erro
Short description for providers without native calling specs:
- **shell**: Execute shell commands
- Format: {\"tool\": \"shell\", \"args\": {\"command\": \"your_command_here\"}
- Example: {\"tool\": \"shell\", \"args\": {\"command\": \"ls ~/Downloads\"}
- Format: {\"tool\": \"shell\", \"args\": {\"command\": \"your_command_here\"}}
- Example: {\"tool\": \"shell\", \"args\": {\"command\": \"ls ~/Downloads\"}}
- Always use `rg` (ripgrep) instead of `grep` - it's faster and respects .gitignore
- **background_process**: Launch a long-running process in the background (e.g., game servers, dev servers)
@@ -189,21 +192,21 @@ Short description for providers without native calling specs:
- Note: Process runs independently; logs are captured to a file for later inspection
- **read_file**: Read the contents of a file (supports partial reads via start/end)
- Format: {\"tool\": \"read_file\", \"args\": {\"file_path\": \"path/to/file\", \"start\": 0, \"end\": 100}
- Example: {\"tool\": \"read_file\", \"args\": {\"file_path\": \"src/main.rs\"}
- Example (partial): {\"tool\": \"read_file\", \"args\": {\"file_path\": \"large.log\", \"start\": 0, \"end\": 1000}
- Format: {\"tool\": \"read_file\", \"args\": {\"file_path\": \"path/to/file\", \"start\": 0, \"end\": 100}}
- Example: {\"tool\": \"read_file\", \"args\": {\"file_path\": \"src/main.rs\"}}
- Example (partial): {\"tool\": \"read_file\", \"args\": {\"file_path\": \"large.log\", \"start\": 0, \"end\": 1000}}
- **read_image**: Read an image file for visual analysis (PNG, JPEG, GIF, WebP)
- Format: {\"tool\": \"read_image\", \"args\": {\"file_paths\": [\"path/to/image.png\"]}}
- Example: {\"tool\": \"read_image\", \"args\": {\"file_paths\": [\"sprites/fairy.png\"]}}
- **write_file**: Write content to a file (creates or overwrites)
- Format: {\"tool\": \"write_file\", \"args\": {\"file_path\": \"path/to/file\", \"content\": \"file content\"}
- Example: {\"tool\": \"write_file\", \"args\": {\"file_path\": \"src/lib.rs\", \"content\": \"pub fn hello() {}\"}
- Format: {\"tool\": \"write_file\", \"args\": {\"file_path\": \"path/to/file\", \"content\": \"file content\"}}
- Example: {\"tool\": \"write_file\", \"args\": {\"file_path\": \"src/lib.rs\", \"content\": \"pub fn hello() {}\"}}
- **str_replace**: Replace text in a file using a diff
- Format: {\"tool\": \"str_replace\", \"args\": {\"file_path\": \"path/to/file\", \"diff\": \"--- old\\n-old text\\n+++ new\\n+new text\"}
- Example: {\"tool\": \"str_replace\", \"args\": {\"file_path\": \"src/main.rs\", \"diff\": \"--- old\\n-old_code();\\n+++ new\\n+new_code();\"}
- Format: {\"tool\": \"str_replace\", \"args\": {\"file_path\": \"path/to/file\", \"diff\": \"--- old\\n-old text\\n+++ new\\n+new text\"}}
- Example: {\"tool\": \"str_replace\", \"args\": {\"file_path\": \"src/main.rs\", \"diff\": \"--- old\\n-old_code();\\n+++ new\\n+new_code();\"}}
- **todo_read**: Read the current session's TODO list from todo.g3.md (session-scoped)
- Format: {\"tool\": \"todo_read\", \"args\": {}}
@@ -220,8 +223,6 @@ Short description for providers without native calling specs:
- Find structs: {\"tool\": \"code_search\", \"args\": {\"searches\": [{\"name\": \"structs\", \"query\": \"(struct_item name: (type_identifier) @name)\", \"language\": \"rust\"}]}}
- Multiple searches: {\"tool\": \"code_search\", \"args\": {\"searches\": [{\"name\": \"funcs\", \"query\": \"(function_item name: (identifier) @name)\", \"language\": \"rust\"}, {\"name\": \"structs\", \"query\": \"(struct_item name: (type_identifier) @name)\", \"language\": \"rust\"}]}}
- With context lines: {\"tool\": \"code_search\", \"args\": {\"searches\": [{\"name\": \"funcs\", \"query\": \"(function_item name: (identifier) @name)\", \"language\": \"rust\", \"context_lines\": 3}]}}
- \"context\": 3 (show surrounding lines),
- \"json_style\": \"stream\" (for large results)
- **research**: Perform web-based research and return a structured report
- Format: {\"tool\": \"research\", \"args\": {\"query\": \"your research question\"}}
@@ -230,9 +231,10 @@ Short description for providers without native calling specs:
- **remember**: Save discovered code locations to workspace memory
- Format: {\"tool\": \"remember\", \"args\": {\"notes\": \"markdown notes\"}}
- Example: {\"tool\": \"remember\", \"args\": {\"notes\": \"### Feature Name\\n- `file.rs` [0..100] - `function_name()`\"}}
- Use at the END of your turn after discovering code locations via search tools
- Example: {\"tool\": \"remember\", \"args\": {\"notes\": \"### Feature Name\\n- `file.rs` [0..100] - `function_name()`\"}}
- Use at the END of your turn after discovering code locations via search tools";
const NON_NATIVE_INSTRUCTIONS: &str = "\
# Instructions
1. Analyze the request and break down into smaller tasks if appropriate
@@ -240,6 +242,10 @@ Short description for providers without native calling specs:
3. STOP when the original request was satisfied
4. When your task is complete, provide a detailed summary of what was accomplished
IMPORTANT: If the user asks you to just respond with text (like \"just say hello\" or \"tell me about X\"), do NOT use tools. Simply respond with the requested text directly. Only use tools when you need to execute commands or complete tasks that require action.
Do not explain what you're going to do - just do it by calling the tools.
For reading files, prioritize use of code_search tool use with multiple search requests per call instead of read_file, if it makes sense.
Exception to using ONE tool at a time:
@@ -256,104 +262,53 @@ But NOT:
write_file(\"helper.rs\", \"...\")
write_file(\"file2.txt\", \"...\")
write_file(\"helper.rs\", \"...\")
[DONE]
[DONE]";
# Task Management with TODO Tools
**REQUIRED for multi-step tasks.** Use TODO tools when your task involves ANY of:
- Multiple files to create/modify (2+)
- Multiple distinct steps (3+)
- Dependencies between steps
- Testing or verification needed
- Uncertainty about approach
## Workflow
Every multi-step task follows this pattern:
1. **Start**: Call todo_read, then todo_write to create your plan
2. **During**: Execute steps, then todo_read and todo_write to mark progress
3. **End**: Call todo_read to verify all items complete
Note: todo_write replaces the entire list, so always read first to preserve content.
const NON_NATIVE_TODO_ADDENDUM: &str = "
IMPORTANT: If you are provided with a SHA256 hash of the requirements file, you MUST include it as the very first line of the todo.g3.md file in the following format:
`{{Based on the requirements file with SHA256: <SHA>}}`
This ensures the TODO list is tracked against the specific version of requirements it was generated from.
This ensures the TODO list is tracked against the specific version of requirements it was generated from.";
## Examples
// ============================================================================
// COMPOSED PROMPTS
// ============================================================================
**Example 1: Feature Implementation**
User asks: \"Add user authentication with tests\"
/// System prompt for providers with native tool calling (Anthropic, OpenAI, etc.)
/// Note: This is kept for backwards compatibility but the function is preferred
pub const SYSTEM_PROMPT_FOR_NATIVE_TOOL_USE: &str = "";
First action:
{\"tool\": \"todo_read\", \"args\": {}}
/// Generate system prompt for native tool calling providers
pub fn get_system_prompt_for_native() -> String {
format!(
"{}\n\n{}\n\n{}\n\n{}\n\n{}\n\n{}",
SHARED_INTRO,
SHARED_TODO_SECTION,
SHARED_TEMPORARY_FILES,
SHARED_WEB_RESEARCH,
SHARED_WORKSPACE_MEMORY,
SHARED_RESPONSE_GUIDELINES
)
}
Then create plan:
{\"tool\": \"todo_write\", \"args\": {\"content\": \"- [ ] Add user authentication\\n - [ ] Create User struct\\n - [ ] Add login endpoint\\n - [ ] Add password hashing\\n - [ ] Write unit tests\\n - [ ] Write integration tests\"}}
/// System prompt for providers without native tool calling (embedded models)
/// Note: This is kept for backwards compatibility but the function is preferred
pub const SYSTEM_PROMPT_FOR_NON_NATIVE_TOOL_USE: &str = "";
After completing User struct:
{\"tool\": \"todo_read\", \"args\": {}}
{\"tool\": \"todo_write\", \"args\": {\"content\": \"- [ ] Add user authentication\\n - [x] Create User struct\\n - [ ] Add login endpoint\\n - [ ] Add password hashing\\n - [ ] Write unit tests\\n - [ ] Write integration tests\"}}
**Example 2: Bug Fix**
User asks: \"Fix the memory leak in cache module\"
{\"tool\": \"todo_read\", \"args\": {}}
{\"tool\": \"todo_write\", \"args\": {\"content\": \"- [ ] Fix memory leak\\n - [ ] Review cache.rs\\n - [ ] Check for unclosed resources\\n - [ ] Add drop implementation\\n - [ ] Write test to verify fix\"}}
**Example 3: Refactoring**
User asks: \"Refactor database layer to use async/await\"
{\"tool\": \"todo_read\", \"args\": {}}
{\"tool\": \"todo_write\", \"args\": {\"content\": \"- [ ] Refactor to async\\n - [ ] Update function signatures\\n - [ ] Replace blocking calls\\n - [ ] Update all callers\\n - [ ] Update tests\"}}
## Format
Use markdown checkboxes:
- \"- [ ]\" for incomplete tasks
- \"- [x]\" for completed tasks
- Indent with 2 spaces for subtasks
Keep items short, specific, and action-oriented.
## Benefits
✓ Prevents missed steps
✓ Makes progress visible
✓ Helps recover from interruptions
✓ Creates better summaries
## When NOT to Use
Skip TODO tools for simple single-step tasks:
- \"List files\" → just use shell
- \"Read config.json\" → just use read_file
- \"Search for functions\" → just use code_search
If you can complete it with 1-2 tool calls, skip TODO.
# Workspace Memory
Workspace memory (if available) is automatically loaded at startup. It contains feature locations and patterns discovered in previous sessions. If you need to re-read memory from disk (e.g., after another agent updates it), use `read_file analysis/memory.md`.
**ALWAYS** call `remember` at the END of your turn when you discovered:
- A feature's location (file + char range + function/struct names)
- A useful pattern or workflow
- An entry point for a subsystem
This applies whenever you use search tools like `code_search`, `rg`, `grep`, `find`, or `read_file` to locate code.
Do NOT save duplicates - check the Workspace Memory section (loaded at startup) to see what's already known.
# Response Guidelines
- Use Markdown formatting for all responses except tool calls.
- Whenever taking actions, use the pronoun 'I'
- After discovering code locations via search tools, call `remember` to save them.
- When showing example tool call JSON in prose or code blocks, use the fullwidth left curly bracket `` (U+FF5B) instead of `{` to prevent parser confusion.
";
pub const SYSTEM_PROMPT_FOR_NON_NATIVE_TOOL_USE: &'static str = SYSTEM_NON_NATIVE_TOOL_USE;
/// Generate system prompt for non-native tool calling providers (embedded models)
///
/// Assembles the shared prompt sections with the non-native tool-call format
/// and instructions, separating top-level sections with blank lines.
pub fn get_system_prompt_for_non_native() -> String {
    // The TODO section and its non-native addendum are concatenated directly
    // (no blank-line separator): the addendum continues the same section.
    let todo_with_addendum = format!("{}{}", SHARED_TODO_SECTION, NON_NATIVE_TODO_ADDENDUM);
    [
        SHARED_INTRO,
        NON_NATIVE_TOOL_FORMAT,
        NON_NATIVE_INSTRUCTIONS,
        todo_with_addendum.as_str(),
        SHARED_WEB_RESEARCH,
        SHARED_WORKSPACE_MEMORY,
        SHARED_RESPONSE_GUIDELINES,
    ]
    .join("\n\n")
}
/// The G3 identity line that gets replaced in agent mode.
///
/// `get_agent_system_prompt` substitutes a custom agent prompt for this exact
/// sentence via `str::replace`, so this constant must stay byte-identical to
/// the identity line embedded in the system prompt text, or the replacement
/// silently becomes a no-op.
const G3_IDENTITY_LINE: &str = "You are G3, an AI programming agent of the same skill level as a seasoned engineer at a major technology company. You analyze given tasks and write code to achieve goals.";
@@ -371,3 +326,80 @@ pub fn get_agent_system_prompt(agent_prompt: &str, allow_multiple_tool_calls: bo
// Replace only the G3 identity line with the custom agent prompt
full_prompt.replace(G3_IDENTITY_LINE, agent_prompt.trim())
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that both the native and non-native prompts contain `section`.
    ///
    /// The "both prompts have ..." tests previously duplicated this
    /// fetch-and-assert logic verbatim and panicked without a message;
    /// centralizing it here also makes failures name the missing section.
    fn assert_section_in_both_prompts(section: &str) {
        let native = get_system_prompt_for_native();
        let non_native = get_system_prompt_for_non_native();
        assert!(
            native.contains(section),
            "Native prompt must contain section: {}",
            section
        );
        assert!(
            non_native.contains(section),
            "Non-native prompt must contain section: {}",
            section
        );
    }

    #[test]
    fn test_native_prompt_contains_validation_string() {
        // Downstream code validates the system prompt by this marker string.
        let prompt = get_system_prompt_for_native();
        assert!(
            prompt.contains("You have access to tools"),
            "Native prompt must contain validation string"
        );
    }

    #[test]
    fn test_non_native_prompt_contains_validation_string() {
        let prompt = get_system_prompt_for_non_native();
        assert!(
            prompt.contains("You have access to tools"),
            "Non-native prompt must contain validation string"
        );
    }

    #[test]
    fn test_native_prompt_contains_important_directive() {
        let prompt = get_system_prompt_for_native();
        assert!(
            prompt.contains("IMPORTANT: You must call tools to achieve goals"),
            "Native prompt must contain IMPORTANT directive"
        );
    }

    #[test]
    fn test_non_native_prompt_contains_important_directive() {
        let prompt = get_system_prompt_for_non_native();
        assert!(
            prompt.contains("IMPORTANT: You must call tools to achieve goals"),
            "Non-native prompt must contain IMPORTANT directive"
        );
    }

    #[test]
    fn test_non_native_prompt_contains_tool_format() {
        // Only the non-native prompt documents the JSON tool-call protocol.
        let prompt = get_system_prompt_for_non_native();
        assert!(
            prompt.contains("# Tool Call Format"),
            "Non-native prompt must contain tool format section"
        );
        assert!(
            prompt.contains("# Available Tools"),
            "Non-native prompt must contain available tools section"
        );
    }

    #[test]
    fn test_agent_prompt_replaces_identity() {
        // The custom identity must fully replace the G3 identity line.
        let custom = "You are TestAgent, a specialized testing assistant.";
        let prompt = get_agent_system_prompt(custom, true);
        assert!(
            prompt.contains(custom),
            "Agent prompt should contain custom identity"
        );
        assert!(
            !prompt.contains(G3_IDENTITY_LINE),
            "Agent prompt should not contain G3 identity"
        );
    }

    #[test]
    fn test_both_prompts_have_todo_section() {
        assert_section_in_both_prompts("# Task Management with TODO Tools");
    }

    #[test]
    fn test_both_prompts_have_workspace_memory() {
        assert_section_in_both_prompts("# Workspace Memory");
    }

    #[test]
    fn test_both_prompts_have_web_research() {
        assert_section_in_both_prompts("# Web Research");
    }
}

View File

@@ -78,7 +78,8 @@ fn register_embedded_providers(
) -> Result<()> {
for (name, embedded_config) in &config.providers.embedded {
if should_register(providers_to_register, "embedded", name) {
let embedded_provider = g3_providers::EmbeddedProvider::new(
let embedded_provider = g3_providers::EmbeddedProvider::new_with_name(
format!("embedded.{}", name),
embedded_config.model_path.clone(),
embedded_config.model_type.clone(),
embedded_config.context_length,

View File

@@ -40,7 +40,7 @@ async fn test_context_window_initial_structure() {
// First message should be system prompt
let system_msg = &context.conversation_history[0];
assert!(system_msg.content.contains("IMPORTANT: You must call tools to achieve goals"),
assert!(system_msg.content.contains("You have access to tools"),
"First message should be system prompt with tool instructions");
// Second message should be README content
@@ -285,7 +285,7 @@ async fn test_full_context_order() {
// Message 0: System prompt
let system = &context.conversation_history[0].content;
assert!(system.contains("IMPORTANT: You must call tools"),
assert!(system.contains("You have access to tools"),
"Message 0 should be system prompt");
// Message 1: Combined content with project appended

View File

@@ -27,6 +27,6 @@ nanoid = "0.4"
serde_urlencoded = "0.7"
tokio-util = "0.7"
dirs = "5.0"
llama_cpp = { version = "0.3.2", features = ["metal"] }
llama-cpp-2 = { version = "0.1", features = ["metal"] }
shellexpand = "3.1"
rand = "0.8"

File diff suppressed because it is too large Load Diff