From 32c0ab3a7fe957c0ae0639c8ea6a3083ff66b263 Mon Sep 17 00:00:00 2001
From: Chakib Benziane <contact@blob42.xyz>
Date: Thu, 23 Oct 2025 09:31:55 +0200
Subject: [PATCH] fix: properly terminate llama.cpp kv_overrides array with
 empty key + updated doc (#6672)

* fix: properly terminate kv_overrides array with empty key

The llama model loading function expects KV overrides to be terminated
with an empty key (key[0] == 0). Previously, the kv_overrides vector was
not being properly terminated, causing an assertion failure.

This commit ensures that after parsing all KV override strings, we add a
final terminating entry with an empty key to satisfy the C-style array
termination requirement. This fixes the assertion error and allows the
model to load correctly with custom KV overrides.

Fixes #6643

- Also document the `overrides` option in the advanced-usage section
  of the docs.

Signed-off-by: blob42 <contact@blob42.xyz>

* doc: document the `overrides` option

---------

Signed-off-by: blob42 <contact@blob42.xyz>
---
 backend/cpp/llama-cpp/grpc-server.cpp        | 5 +++++
 docs/content/docs/advanced/advanced-usage.md | 9 +++++++++
 2 files changed, 14 insertions(+)

diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp
index 46066202e..386aa78d0 100644
--- a/backend/cpp/llama-cpp/grpc-server.cpp
+++ b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -291,6 +291,11 @@ static void params_parse(server_context& ctx_server, const backend::ModelOptions
         }
     }
 
+    if (!params.kv_overrides.empty()) {
+        params.kv_overrides.emplace_back();
+        params.kv_overrides.back().key[0] = 0;
+    }
+
     // TODO: Add yarn
 
     if (!request->tensorsplit().empty()) {
diff --git a/docs/content/docs/advanced/advanced-usage.md b/docs/content/docs/advanced/advanced-usage.md
index 8feaf9518..77e6ef63e 100644
--- a/docs/content/docs/advanced/advanced-usage.md
+++ b/docs/content/docs/advanced/advanced-usage.md
@@ -233,6 +233,15 @@ n_draft: 0
 # Quantization settings for the model, impacting memory and processing speed.
 quantization: ""
 
+# List of KV Overrides for llama.cpp (--override-kv flag)
+# Format: KEY=TYPE:VALUE
+# Example: `qwen3moe.expert_used_count=int:10`
+# Use this to override model configuration values at runtime.
+# Supported types: int, float, bool, str (strings use the `str:` prefix).
+# Multiple overrides can be specified as a list.
+overrides:
+  - KEY=TYPE:VALUE
+
 # Utilization percentage of GPU memory to allocate for the model. (vLLM)
 gpu_memory_utilization: 0