feat(flash_attention): set auto for flash_attention in llama.cpp (#6168)

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Author: Ettore Di Giacinto
Date: 2025-08-31 17:59:09 +02:00
Committed by: GitHub
parent dbdf2908ad
commit 739573e41b
5 changed files with 22 additions and 8 deletions

@@ -1,5 +1,5 @@
-LLAMA_VERSION?=3d16b29c3bb1ec816ac0e782f20d169097063919
+LLAMA_VERSION?=4d74393bcc956ccd7df68a6a06d1a0575cfa712c
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
 CMAKE_ARGS?=
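
Because the version pin uses ?=, the pinned llama.cpp revision can still be overridden from the environment or on the make command line; a hypothetical invocation:

    make LLAMA_VERSION=4d74393bcc956ccd7df68a6a06d1a0575cfa712c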

@@ -304,7 +304,15 @@ static void params_parse(const backend::ModelOptions* request,
     }
     params.use_mlock = request->mlock();
     params.use_mmap = request->mmap();
-    params.flash_attn = request->flashattention();
+    if (request->flashattention() == "on" || request->flashattention() == "enabled") {
+        params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
+    } else if (request->flashattention() == "off" || request->flashattention() == "disabled") {
+        params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
+    } else if (request->flashattention() == "auto") {
+        params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
+    }
     params.no_kv_offload = request->nokvoffload();
     params.ctx_shift = false; // We control context-shifting in any case (and we disable it as it could just lead to infinite loops)
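
The same tri-state mapping can also be read as a small standalone helper. A minimal sketch in C++, assuming llama.cpp's llama_flash_attn_type enum behind the LLAMA_FLASH_ATTN_TYPE_* constants used above (the helper name is hypothetical), and falling back to AUTO for unrecognized values, whereas the hunk above simply leaves the field at its default:

    #include <string>
    #include "llama.h" // assumed to declare llama_flash_attn_type and the LLAMA_FLASH_ATTN_TYPE_* constants

    // Map the textual flash-attention setting ("on"/"enabled", "off"/"disabled", "auto")
    // to llama.cpp's flash-attention mode; anything unrecognized is treated as "auto".
    static enum llama_flash_attn_type parse_flash_attn_type(const std::string & value) {
        if (value == "on"  || value == "enabled")  return LLAMA_FLASH_ATTN_TYPE_ENABLED;
        if (value == "off" || value == "disabled") return LLAMA_FLASH_ATTN_TYPE_DISABLED;
        return LLAMA_FLASH_ATTN_TYPE_AUTO;
    }

With such a helper, the branch above would collapse to a single assignment, e.g. params.flash_attn_type = parse_flash_attn_type(request->flashattention());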