feat(loader): enhance single active backend to support LRU eviction (#7535)

* feat(loader): refactor single active backend support to LRU

This changeset introduces LRU management of loaded backends. Users can
now set a maximum number of models to be loaded concurrently, and when
LocalAI is set to single active backend mode the LRU limit is set to 1
for backward compatibility.
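
A minimal standalone sketch of the eviction policy, with hypothetical
names (LocalAI's actual loader manages gRPC backend processes; this only
illustrates the LRU bookkeeping):

```go
package main

import (
	"container/list"
	"fmt"
)

// backendLRU tracks usage order and reports which backends to evict
// once maxActive is exceeded (0 means unlimited). Hypothetical type,
// for illustration only.
type backendLRU struct {
	maxActive int
	order     *list.List               // front = most recently used
	entries   map[string]*list.Element // backend name -> node in order
}

func newBackendLRU(maxActive int) *backendLRU {
	return &backendLRU{
		maxActive: maxActive,
		order:     list.New(),
		entries:   map[string]*list.Element{},
	}
}

// touch marks a backend as just used, inserting it if new, and returns
// the least recently used backends that must be stopped to respect the limit.
func (l *backendLRU) touch(name string) (evicted []string) {
	if el, ok := l.entries[name]; ok {
		l.order.MoveToFront(el)
		return nil
	}
	l.entries[name] = l.order.PushFront(name)
	for l.maxActive > 0 && l.order.Len() > l.maxActive {
		victim := l.order.Remove(l.order.Back()).(string)
		delete(l.entries, victim)
		evicted = append(evicted, victim)
	}
	return evicted
}

func main() {
	lru := newBackendLRU(2)             // e.g. --max-active-backends=2
	fmt.Println(lru.touch("llama-cpp")) // []
	fmt.Println(lru.touch("whisper"))   // []
	fmt.Println(lru.touch("llama-cpp")) // [] (refreshed, now most recent)
	fmt.Println(lru.touch("piper"))     // [whisper]
}
```

With single active backend mode the same loop runs with a limit of 1,
so loading any model first evicts whatever was loaded before.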

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* chore: add tests

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Update docs

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Fixups

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Author: Ettore Di Giacinto
Date: 2025-12-12 12:28:38 +01:00
Committed by: GitHub
Parent: c141a40e00
Commit: fc5b9ebfcc

39 changed files with 836 additions and 131 deletions


@@ -102,7 +102,7 @@ func (bi *BackendsInstall) Run(ctx *cliContext.Context) error {
 		}
 	}
-	modelLoader := model.NewModelLoader(systemState, true)
+	modelLoader := model.NewModelLoader(systemState)
 	err = startup.InstallExternalBackends(context.Background(), galleries, systemState, modelLoader, progressCallback, bi.BackendArgs, bi.Name, bi.Alias)
 	if err != nil {
 		return err
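
The hunks in the other CLI commands below repeat this same mechanical
change: `NewModelLoader` loses its single-backend boolean, since the
limit is now carried by the application options rather than the loader
constructor. A hedged sketch of the new calling convention (import
paths and the `AppOption`/`ModelLoader` type names are assumed from the
LocalAI tree, not shown in this diff):

```go
package cli // illustrative placement

import (
	"github.com/mudler/LocalAI/core/config" // import paths assumed
	"github.com/mudler/LocalAI/pkg/model"
	"github.com/mudler/LocalAI/pkg/system"
)

// newLoader shows the new calling convention. Before this commit the
// boolean second argument selected single-backend mode:
//
//	ml := model.NewModelLoader(systemState, true)
//
// Now the loader is constructed plainly and the limit, if any, is an
// application option handled by the run command.
func newLoader(systemState *system.SystemState) (*model.ModelLoader, []config.AppOption) {
	ml := model.NewModelLoader(systemState)
	opts := []config.AppOption{config.SetMaxActiveBackends(1)} // illustrative limit
	return ml, opts
}
```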


@@ -80,7 +80,7 @@ func (mi *ModelsInstall) Run(ctx *cliContext.Context) error {
 		return err
 	}
-	galleryService := services.NewGalleryService(&config.ApplicationConfig{}, model.NewModelLoader(systemState, true))
+	galleryService := services.NewGalleryService(&config.ApplicationConfig{}, model.NewModelLoader(systemState))
 	err = galleryService.Start(context.Background(), config.NewModelConfigLoader(mi.ModelsPath), systemState)
 	if err != nil {
 		return err
@@ -134,7 +134,7 @@ func (mi *ModelsInstall) Run(ctx *cliContext.Context) error {
 		log.Info().Str("model", modelName).Str("license", model.License).Msg("installing model")
 	}
-	modelLoader := model.NewModelLoader(systemState, true)
+	modelLoader := model.NewModelLoader(systemState)
 	err = startup.InstallModels(context.Background(), galleryService, galleries, backendGalleries, systemState, modelLoader, !mi.DisablePredownloadScan, mi.AutoloadBackendGalleries, progressCallback, modelName)
 	if err != nil {
 		return err


@@ -64,7 +64,8 @@ type RunCMD struct {
 	Peer2PeerToken string `env:"LOCALAI_P2P_TOKEN,P2P_TOKEN,TOKEN" name:"p2ptoken" help:"Token for P2P mode (optional)" group:"p2p"`
 	Peer2PeerNetworkID string `env:"LOCALAI_P2P_NETWORK_ID,P2P_NETWORK_ID" help:"Network ID for P2P mode, can be set arbitrarly by the user for grouping a set of instances" group:"p2p"`
 	ParallelRequests bool `env:"LOCALAI_PARALLEL_REQUESTS,PARALLEL_REQUESTS" help:"Enable backends to handle multiple requests in parallel if they support it (e.g.: llama.cpp or vllm)" group:"backends"`
-	SingleActiveBackend bool `env:"LOCALAI_SINGLE_ACTIVE_BACKEND,SINGLE_ACTIVE_BACKEND" help:"Allow only one backend to be run at a time" group:"backends"`
+	SingleActiveBackend bool `env:"LOCALAI_SINGLE_ACTIVE_BACKEND,SINGLE_ACTIVE_BACKEND" help:"Allow only one backend to be run at a time (deprecated: use --max-active-backends=1 instead)" group:"backends"`
+	MaxActiveBackends int `env:"LOCALAI_MAX_ACTIVE_BACKENDS,MAX_ACTIVE_BACKENDS" default:"0" help:"Maximum number of backends to keep loaded at once (0 = unlimited, 1 = single backend mode). Least recently used backends are evicted when limit is reached" group:"backends"`
 	PreloadBackendOnly bool `env:"LOCALAI_PRELOAD_BACKEND_ONLY,PRELOAD_BACKEND_ONLY" default:"false" help:"Do not launch the API services, only the preloaded models / backends are started (useful for multi-node setups)" group:"backends"`
 	ExternalGRPCBackends []string `env:"LOCALAI_EXTERNAL_GRPC_BACKENDS,EXTERNAL_GRPC_BACKENDS" help:"A list of external grpc backends" group:"backends"`
 	EnableWatchdogIdle bool `env:"LOCALAI_WATCHDOG_IDLE,WATCHDOG_IDLE" default:"false" help:"Enable watchdog for stopping backends that are idle longer than the watchdog-idle-timeout" group:"backends"`
@@ -202,7 +203,13 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
 	if r.ParallelRequests {
 		opts = append(opts, config.EnableParallelBackendRequests)
 	}
-	if r.SingleActiveBackend {
+	// Handle max active backends (LRU eviction)
+	// MaxActiveBackends takes precedence over SingleActiveBackend
+	if r.MaxActiveBackends > 0 {
+		opts = append(opts, config.SetMaxActiveBackends(r.MaxActiveBackends))
+	} else if r.SingleActiveBackend {
+		// Backward compatibility: --single-active-backend is equivalent to --max-active-backends=1
 		opts = append(opts, config.EnableSingleBackend)
 	}
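
The precedence logic above collapses to a single effective limit. A
minimal standalone sketch (the helper name is hypothetical, not part of
this changeset):

```go
package main

import "fmt"

// effectiveBackendLimit mirrors the precedence in the hunk above:
// an explicit --max-active-backends wins; otherwise the deprecated
// --single-active-backend maps to a limit of 1; 0 means unlimited.
func effectiveBackendLimit(maxActiveBackends int, singleActiveBackend bool) int {
	if maxActiveBackends > 0 {
		return maxActiveBackends
	}
	if singleActiveBackend {
		return 1
	}
	return 0
}

func main() {
	fmt.Println(effectiveBackendLimit(0, false)) // 0: unlimited (default)
	fmt.Println(effectiveBackendLimit(0, true))  // 1: legacy single-backend mode
	fmt.Println(effectiveBackendLimit(4, true))  // 4: the explicit flag wins
}
```

In environment-variable terms, `LOCALAI_MAX_ACTIVE_BACKENDS=4` therefore
overrides a lingering `LOCALAI_SINGLE_ACTIVE_BACKEND=true`.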


@@ -79,7 +79,7 @@ func (t *SoundGenerationCMD) Run(ctx *cliContext.Context) error {
 		GeneratedContentDir: outputDir,
 		ExternalGRPCBackends: externalBackends,
 	}
-	ml := model.NewModelLoader(systemState, opts.SingleBackend)
+	ml := model.NewModelLoader(systemState)
 	defer func() {
 		err := ml.StopAllGRPC()


@@ -38,7 +38,7 @@ func (t *TranscriptCMD) Run(ctx *cliContext.Context) error {
 	}
 	cl := config.NewModelConfigLoader(t.ModelsPath)
-	ml := model.NewModelLoader(systemState, opts.SingleBackend)
+	ml := model.NewModelLoader(systemState)
 	if err := cl.LoadModelConfigsFromPath(t.ModelsPath); err != nil {
 		return err
 	}


@@ -48,7 +48,7 @@ func (t *TTSCMD) Run(ctx *cliContext.Context) error {
 		GeneratedContentDir: outputDir,
 	}
-	ml := model.NewModelLoader(systemState, opts.SingleBackend)
+	ml := model.NewModelLoader(systemState)
 	defer func() {
 		err := ml.StopAllGRPC()


@@ -37,7 +37,7 @@ func findLLamaCPPBackend(galleries string, systemState *system.SystemState) (str
 	backend, ok := backends.Get(llamaCPPGalleryName)
 	if !ok {
-		ml := model.NewModelLoader(systemState, true)
+		ml := model.NewModelLoader(systemState)
 		var gals []config.Gallery
 		if err := json.Unmarshal([]byte(galleries), &gals); err != nil {
 			log.Error().Err(err).Msg("failed loading galleries")