From 32c0ab3a7fe957c0ae0639c8ea6a3083ff66b263 Mon Sep 17 00:00:00 2001 From: Chakib Benziane Date: Thu, 23 Oct 2025 09:31:55 +0200 Subject: [PATCH] fix: properly terminate llama.cpp kv_overrides array with empty key + updated doc (#6672) * fix: properly terminate kv_overrides array with empty key The llama model loading function expects KV overrides to be terminated with an empty key (key[0] == 0). Previously, the kv_overrides vector was not being properly terminated, causing an assertion failure. This commit ensures that after parsing all KV override strings, we add a final terminating entry with an empty key to satisfy the C-style array termination requirement. This fixes the assertion error and allows the model to load correctly with custom KV overrides. Fixes #6643 - Also included a reference to the usage of the `overrides` option in the advanced-usage section. Signed-off-by: blob42 * doc: document the `overrides` option --------- Signed-off-by: blob42 --- backend/cpp/llama-cpp/grpc-server.cpp | 5 +++++ docs/content/docs/advanced/advanced-usage.md | 9 +++++++++ 2 files changed, 14 insertions(+) diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp index 46066202e..386aa78d0 100644 --- a/backend/cpp/llama-cpp/grpc-server.cpp +++ b/backend/cpp/llama-cpp/grpc-server.cpp @@ -291,6 +291,11 @@ static void params_parse(server_context& ctx_server, const backend::ModelOptions } } + if (!params.kv_overrides.empty()) { + params.kv_overrides.emplace_back(); + params.kv_overrides.back().key[0] = 0; + } + // TODO: Add yarn if (!request->tensorsplit().empty()) { diff --git a/docs/content/docs/advanced/advanced-usage.md b/docs/content/docs/advanced/advanced-usage.md index 8feaf9518..77e6ef63e 100644 --- a/docs/content/docs/advanced/advanced-usage.md +++ b/docs/content/docs/advanced/advanced-usage.md @@ -233,6 +233,15 @@ n_draft: 0 # Quantization settings for the model, impacting memory and processing speed. 
quantization: "" +# List of KV Overrides for llama.cpp (--override-kv flag) +# Format: KEY=TYPE:VALUE +# Example: `qwen3moe.expert_used_count=int:10` +# Use this to override model configuration values at runtime. +# Supported types include: int, float, str, bool. +# Multiple overrides can be specified as a list. +overrides: + - KEY=TYPE:VALUE + # Utilization percentage of GPU memory to allocate for the model. (vLLM) gpu_memory_utilization: 0