fix: use consistent max_tokens defaults across providers

- Fix aliasing issue where resolve_max_tokens() used fallback_default_max_tokens
  (8192) instead of provider-specific defaults
- Update fallback_default_max_tokens from 8192 to 32000
- Set provider-specific max_tokens defaults:
  - Anthropic: 32000
  - OpenAI: 32000 (was 16000)
  - Databricks: 32000 (was 50000; now matches Anthropic, since Databricks is a passthrough)
  - Embedded: 2048
- Context window lengths unchanged:
  - OpenAI: 400,000
  - Anthropic: 200,000
  - Databricks (Claude): 200,000

This fixes the 'LLM response was cut off due to max_tokens limit' error
in agent mode, which occurred because 8192 was being used instead of 32000.
This commit is contained in:
Dhanji R. Prasanna
2026-01-16 07:05:57 +05:30
parent 65e0217c68
commit 01cb4f6691
5 changed files with 23 additions and 21 deletions

View File

@@ -44,7 +44,7 @@ model_path = "test.gguf"
model_type = "llama"
[agent]
fallback_default_max_tokens = 8192
fallback_default_max_tokens = 32000
enable_streaming = true
timeout_seconds = 60
auto_compact = true
@@ -88,7 +88,7 @@ token = "test-token"
model = "test-model"
[agent]
fallback_default_max_tokens = 8192
fallback_default_max_tokens = 32000
enable_streaming = true
timeout_seconds = 60
auto_compact = true
@@ -132,7 +132,7 @@ token = "test-token"
model = "test-model"
[agent]
fallback_default_max_tokens = 8192
fallback_default_max_tokens = 32000
enable_streaming = true
timeout_seconds = 60
auto_compact = true
@@ -169,7 +169,7 @@ api_key = "test-key"
model = "claude-3"
[agent]
fallback_default_max_tokens = 8192
fallback_default_max_tokens = 32000
enable_streaming = true
timeout_seconds = 60
auto_compact = true
@@ -210,7 +210,7 @@ model = "claude-opus"
thinking_budget_tokens = 16000
[agent]
fallback_default_max_tokens = 8192
fallback_default_max_tokens = 32000
enable_streaming = true
timeout_seconds = 60
auto_compact = true
@@ -248,7 +248,7 @@ token = "test-token"
model = "test-model"
[agent]
fallback_default_max_tokens = 8192
fallback_default_max_tokens = 32000
enable_streaming = true
timeout_seconds = 60
auto_compact = true