From 739573e41bb5c8dd1f3ff1ff305ba83c0d51f002 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Sun, 31 Aug 2025 17:59:09 +0200
Subject: [PATCH] feat(flash_attention): set auto for flash_attention in llama.cpp (#6168)

Signed-off-by: Ettore Di Giacinto
---
 backend/backend.proto                 |  2 +-
 backend/cpp/llama-cpp/Makefile        |  2 +-
 backend/cpp/llama-cpp/grpc-server.cpp | 10 +++++++++-
 core/backend/options.go               |  8 +++++++-
 core/config/backend_config.go         |  8 ++++----
 5 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/backend/backend.proto b/backend/backend.proto
index 77791b7ee..b020553fe 100644
--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -242,7 +242,7 @@ message ModelOptions {
 
   string Type = 49;
 
-  bool FlashAttention = 56;
+  string FlashAttention = 56;
   bool NoKVOffload = 57;
 
   string ModelPath = 59;
diff --git a/backend/cpp/llama-cpp/Makefile b/backend/cpp/llama-cpp/Makefile
index 359080474..291a2d956 100644
--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -1,5 +1,5 @@
 
-LLAMA_VERSION?=3d16b29c3bb1ec816ac0e782f20d169097063919
+LLAMA_VERSION?=4d74393bcc956ccd7df68a6a06d1a0575cfa712c
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
 
 CMAKE_ARGS?=
diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp
index 184a8913f..2c7c2e52d 100644
--- a/backend/cpp/llama-cpp/grpc-server.cpp
+++ b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -304,7 +304,15 @@ static void params_parse(const backend::ModelOptions* request,
     }
     params.use_mlock = request->mlock();
     params.use_mmap = request->mmap();
-    params.flash_attn = request->flashattention();
+
+    if (request->flashattention() == "on" || request->flashattention() == "enabled") {
+        params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
+    } else if (request->flashattention() == "off" || request->flashattention() == "disabled") {
+        params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
+    } else if (request->flashattention() == "auto") {
+        params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
+    }
+
     params.no_kv_offload = request->nokvoffload();
     params.ctx_shift = false; // We control context-shifting in any case (and we disable it as it could just lead to infinite loops)
 
diff --git a/core/backend/options.go b/core/backend/options.go
index a64fbb74b..bd3fbd74b 100644
--- a/core/backend/options.go
+++ b/core/backend/options.go
@@ -78,6 +78,12 @@ func grpcModelOpts(c config.ModelConfig) *pb.ModelOptions {
 		b = c.Batch
 	}
 
+	flashAttention := "auto"
+
+	if c.FlashAttention != nil {
+		flashAttention = *c.FlashAttention
+	}
+
 	f16 := false
 	if c.F16 != nil {
 		f16 = *c.F16
@@ -166,7 +172,7 @@
 		LimitVideoPerPrompt: int32(c.LimitMMPerPrompt.LimitVideoPerPrompt),
 		LimitAudioPerPrompt: int32(c.LimitMMPerPrompt.LimitAudioPerPrompt),
 		MMProj:              c.MMProj,
-		FlashAttention:      c.FlashAttention,
+		FlashAttention:      flashAttention,
 		CacheTypeKey:        c.CacheTypeK,
 		CacheTypeValue:      c.CacheTypeV,
 		NoKVOffload:         c.NoKVOffloading,
diff --git a/core/config/backend_config.go b/core/config/backend_config.go
index 123462fc6..31173ecf8 100644
--- a/core/config/backend_config.go
+++ b/core/config/backend_config.go
@@ -164,10 +164,10 @@
 	LimitMMPerPrompt LimitMMPerPrompt `yaml:"limit_mm_per_prompt" json:"limit_mm_per_prompt"` // vLLM
 	MMProj           string           `yaml:"mmproj" json:"mmproj"`
-	FlashAttention   bool             `yaml:"flash_attention" json:"flash_attention"`
-	NoKVOffloading   bool             `yaml:"no_kv_offloading" json:"no_kv_offloading"`
-	CacheTypeK       string           `yaml:"cache_type_k" json:"cache_type_k"`
-	CacheTypeV       string           `yaml:"cache_type_v" json:"cache_type_v"`
+	FlashAttention   *string          `yaml:"flash_attention" json:"flash_attention"`
+	NoKVOffloading   bool             `yaml:"no_kv_offloading" json:"no_kv_offloading"`
+	CacheTypeK       string           `yaml:"cache_type_k" json:"cache_type_k"`
+	CacheTypeV       string           `yaml:"cache_type_v" json:"cache_type_v"`
 	RopeScaling      string           `yaml:"rope_scaling" json:"rope_scaling"`
 	ModelType        string           `yaml:"type" json:"type"`
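
Note (not part of the patch): with this change, flash_attention in the model config becomes an optional string; when it is left unset, the Go side now sends "auto" and llama.cpp chooses the flash-attention mode itself, while the gRPC server maps "on"/"enabled", "off"/"disabled" and "auto" to the corresponding flash_attn_type. A minimal standalone Go sketch of that defaulting logic, mirroring core/backend/options.go:

package main

import "fmt"

// resolveFlashAttention mirrors the defaulting added in core/backend/options.go:
// a nil (unset) flash_attention falls back to "auto". The values understood by
// grpc-server.cpp are "on"/"enabled", "off"/"disabled" and "auto".
// Standalone illustration only; not code from this patch.
func resolveFlashAttention(flashAttention *string) string {
	if flashAttention == nil {
		return "auto"
	}
	return *flashAttention
}

func main() {
	enabled := "on"
	fmt.Println(resolveFlashAttention(nil))      // prints "auto"
	fmt.Println(resolveFlashAttention(&enabled)) // prints "on"
}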