Mirror of https://github.com/mudler/LocalAI.git (synced 2025-12-20 08:50:38 -06:00)
fix: properly terminate llama.cpp kv_overrides array with empty key + updated doc (#6672)
* fix: properly terminate kv_overrides array with empty key

The llama model loading function expects KV overrides to be terminated with an empty key (key[0] == 0). Previously, the kv_overrides vector was not being properly terminated, causing an assertion failure.

This commit ensures that after parsing all KV override strings, we add a final terminating entry with an empty key to satisfy the C-style array termination requirement.

This fixes the assertion error and allows the model to load correctly with custom KV overrides.

Fixes #6643

- Also included a reference to the usage of the `overrides` option in the advanced-usage section.

Signed-off-by: blob42 <contact@blob42.xyz>

* doc: document the `overrides` option

---------

Signed-off-by: blob42 <contact@blob42.xyz>
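For context, the sentinel pattern the commit relies on can be illustrated with a minimal, self-contained sketch. The names below (`kv_override_sketch`, `apply_overrides`, the string parsing) are hypothetical stand-ins and not the actual llama.cpp `llama_model_kv_override` API; the point is that the consumer receives only a raw pointer with no length, so it scans entries until it hits one whose key starts with `'\0'`, which is exactly the terminator the fix appends.

```cpp
// Minimal, self-contained sketch (hypothetical types, not llama.cpp's):
// the consumer gets only a raw pointer, no length, so it scans until it
// reaches an entry whose key begins with '\0'.
#include <cstdio>
#include <string>
#include <vector>

struct kv_override_sketch {      // stand-in for a C-style override entry
    char key[128] = {0};         // key[0] == 0 marks the end of the array
    std::string value;           // the real entry carries a typed value instead
};

// Consumer-side loop: walk entries until the empty-key terminator.
static void apply_overrides(const kv_override_sketch * overrides) {
    if (!overrides) return;
    for (const kv_override_sketch * p = overrides; p->key[0] != 0; ++p) {
        std::printf("override %s = %s\n", p->key, p->value.c_str());
    }
}

int main() {
    std::vector<kv_override_sketch> kv_overrides;

    // Parse a "KEY=TYPE:VALUE"-style string (value kept as a raw string here).
    const std::string s = "qwen3moe.expert_used_count=int:10";
    kv_override_sketch o;
    std::snprintf(o.key, sizeof(o.key), "%s", s.substr(0, s.find('=')).c_str());
    o.value = s.substr(s.find('=') + 1);
    kv_overrides.push_back(o);

    // The fix: append a terminating entry with an empty key so the consumer's
    // pointer loop knows where to stop instead of reading past the end.
    if (!kv_overrides.empty()) {
        kv_overrides.emplace_back();
        kv_overrides.back().key[0] = 0;
    }

    apply_overrides(kv_overrides.empty() ? nullptr : kv_overrides.data());
    return 0;
}
```

Without the trailing entry, such a loop walks past the end of the vector's buffer, which is the failure mode the commit message describes.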
@@ -291,6 +291,11 @@ static void params_parse(server_context& ctx_server, const backend::ModelOptions
         }
     }
 
+    if (!params.kv_overrides.empty()) {
+        params.kv_overrides.emplace_back();
+        params.kv_overrides.back().key[0] = 0;
+    }
+
     // TODO: Add yarn
 
     if (!request->tensorsplit().empty()) {
@@ -233,6 +233,15 @@ n_draft: 0
 # Quantization settings for the model, impacting memory and processing speed.
 quantization: ""
 
+# List of KV Overrides for llama.cpp (--override-kv flag)
+# Format: KEY=TYPE:VALUE
+# Example: `qwen3moe.expert_used_count=int:10`
+# Use this to override model configuration values at runtime.
+# Supported types include: int, float, string, bool.
+# Multiple overrides can be specified as a list.
+overrides:
+- KEY=TYPE:VALUE
+
 # Utilization percentage of GPU memory to allocate for the model. (vLLM)
 gpu_memory_utilization: 0