From 739573e41bb5c8dd1f3ff1ff305ba83c0d51f002 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Sun, 31 Aug 2025 17:59:09 +0200
Subject: [PATCH] feat(flash_attention): set auto for flash_attention in llama.cpp (#6168)

Signed-off-by: Ettore Di Giacinto
---
 backend/backend.proto                 |  2 +-
 backend/cpp/llama-cpp/Makefile        |  2 +-
 backend/cpp/llama-cpp/grpc-server.cpp | 10 +++++++++-
 core/backend/options.go               |  8 +++++++-
 core/config/backend_config.go         |  8 ++++----
 5 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/backend/backend.proto b/backend/backend.proto
index 77791b7ee..b020553fe 100644
--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -242,7 +242,7 @@ message ModelOptions {
 
   string Type = 49;
 
-  bool FlashAttention = 56;
+  string FlashAttention = 56;
   bool NoKVOffload = 57;
 
   string ModelPath = 59;
diff --git a/backend/cpp/llama-cpp/Makefile b/backend/cpp/llama-cpp/Makefile
index 359080474..291a2d956 100644
--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -1,5 +1,5 @@
 
-LLAMA_VERSION?=3d16b29c3bb1ec816ac0e782f20d169097063919
+LLAMA_VERSION?=4d74393bcc956ccd7df68a6a06d1a0575cfa712c
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
 
 CMAKE_ARGS?=
diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp
index 184a8913f..2c7c2e52d 100644
--- a/backend/cpp/llama-cpp/grpc-server.cpp
+++ b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -304,7 +304,15 @@ static void params_parse(const backend::ModelOptions* request,
     }
     params.use_mlock = request->mlock();
     params.use_mmap = request->mmap();
-    params.flash_attn = request->flashattention();
+
+    if (request->flashattention() == "on" || request->flashattention() == "enabled") {
+        params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
+    } else if (request->flashattention() == "off" || request->flashattention() == "disabled") {
+        params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
+    } else if (request->flashattention() == "auto") {
+        params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
+    }
+
     params.no_kv_offload = request->nokvoffload();
     params.ctx_shift = false; // We control context-shifting in any case (and we disable it as it could just lead to infinite loops)
 
diff --git a/core/backend/options.go b/core/backend/options.go
index a64fbb74b..bd3fbd74b 100644
--- a/core/backend/options.go
+++ b/core/backend/options.go
@@ -78,6 +78,12 @@ func grpcModelOpts(c config.ModelConfig) *pb.ModelOptions {
 		b = c.Batch
 	}
 
+	flashAttention := "auto"
+
+	if c.FlashAttention != nil {
+		flashAttention = *c.FlashAttention
+	}
+
 	f16 := false
 	if c.F16 != nil {
 		f16 = *c.F16
@@ -166,7 +172,7 @@
 		LimitVideoPerPrompt: int32(c.LimitMMPerPrompt.LimitVideoPerPrompt),
 		LimitAudioPerPrompt: int32(c.LimitMMPerPrompt.LimitAudioPerPrompt),
 		MMProj:              c.MMProj,
-		FlashAttention:      c.FlashAttention,
+		FlashAttention:      flashAttention,
 		CacheTypeKey:        c.CacheTypeK,
 		CacheTypeValue:      c.CacheTypeV,
 		NoKVOffload:         c.NoKVOffloading,
diff --git a/core/config/backend_config.go b/core/config/backend_config.go
index 123462fc6..31173ecf8 100644
--- a/core/config/backend_config.go
+++ b/core/config/backend_config.go
@@ -164,10 +164,10 @@
 	LimitMMPerPrompt LimitMMPerPrompt `yaml:"limit_mm_per_prompt" json:"limit_mm_per_prompt"` // vLLM
 	MMProj           string           `yaml:"mmproj" json:"mmproj"`
-	FlashAttention   bool             `yaml:"flash_attention" json:"flash_attention"`
-	NoKVOffloading   bool             `yaml:"no_kv_offloading" json:"no_kv_offloading"`
-	CacheTypeK       string           `yaml:"cache_type_k" json:"cache_type_k"`
-	CacheTypeV       string           `yaml:"cache_type_v" json:"cache_type_v"`
+	FlashAttention   *string          `yaml:"flash_attention" json:"flash_attention"`
+	NoKVOffloading   bool             `yaml:"no_kv_offloading" json:"no_kv_offloading"`
+	CacheTypeK       string           `yaml:"cache_type_k" json:"cache_type_k"`
+	CacheTypeV       string           `yaml:"cache_type_v" json:"cache_type_v"`
 	RopeScaling      string           `yaml:"rope_scaling" json:"rope_scaling"`
 	ModelType        string           `yaml:"type" json:"type"`
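
Note (not part of the patch): with this change, flash_attention in the model config becomes an optional string; when it is left unset, the Go side now sends "auto" and llama.cpp chooses the flash-attention mode itself, while the gRPC server maps "on"/"enabled", "off"/"disabled" and "auto" to the corresponding flash_attn_type. A minimal standalone Go sketch of that defaulting logic, mirroring core/backend/options.go:

package main

import "fmt"

// resolveFlashAttention mirrors the defaulting added in core/backend/options.go:
// a nil (unset) flash_attention falls back to "auto". The values understood by
// grpc-server.cpp are "on"/"enabled", "off"/"disabled" and "auto".
// Standalone illustration only; not code from this patch.
func resolveFlashAttention(flashAttention *string) string {
	if flashAttention == nil {
		return "auto"
	}
	return *flashAttention
}

func main() {
	enabled := "on"
	fmt.Println(resolveFlashAttention(nil))      // prints "auto"
	fmt.Println(resolveFlashAttention(&enabled)) // prints "on"
}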