feat(loader): enhance single active backend to support LRU eviction (#7535)

* feat(loader): refactor single active backend support to LRU

This changeset introduces LRU management of loaded backends. Users can
now set a maximum number of models to be loaded concurrently, and when
LocalAI is set to single active backend mode the LRU limit is set to 1
for backward compatibility.
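
A minimal standalone sketch of the eviction policy, with hypothetical
names (LocalAI's actual loader manages gRPC backend processes; this only
illustrates the LRU bookkeeping):

```go
package main

import (
	"container/list"
	"fmt"
)

// backendLRU tracks usage order and reports which backends to evict
// once maxActive is exceeded (0 means unlimited). Hypothetical type,
// for illustration only.
type backendLRU struct {
	maxActive int
	order     *list.List               // front = most recently used
	entries   map[string]*list.Element // backend name -> node in order
}

func newBackendLRU(maxActive int) *backendLRU {
	return &backendLRU{
		maxActive: maxActive,
		order:     list.New(),
		entries:   map[string]*list.Element{},
	}
}

// touch marks a backend as just used, inserting it if new, and returns
// the least recently used backends that must be stopped to respect the limit.
func (l *backendLRU) touch(name string) (evicted []string) {
	if el, ok := l.entries[name]; ok {
		l.order.MoveToFront(el)
		return nil
	}
	l.entries[name] = l.order.PushFront(name)
	for l.maxActive > 0 && l.order.Len() > l.maxActive {
		victim := l.order.Remove(l.order.Back()).(string)
		delete(l.entries, victim)
		evicted = append(evicted, victim)
	}
	return evicted
}

func main() {
	lru := newBackendLRU(2)             // e.g. --max-active-backends=2
	fmt.Println(lru.touch("llama-cpp")) // []
	fmt.Println(lru.touch("whisper"))   // []
	fmt.Println(lru.touch("llama-cpp")) // [] (refreshed, now most recent)
	fmt.Println(lru.touch("piper"))     // [whisper]
}
```

With single active backend mode the same loop runs with a limit of 1,
so loading any model first evicts whatever was loaded before.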

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* chore: add tests

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Update docs

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Fixups

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Author: Ettore Di Giacinto
Date: 2025-12-12 12:28:38 +01:00
Committed by: GitHub
Parent: c141a40e00
Commit: fc5b9ebfcc

39 changed files with 836 additions and 131 deletions


@@ -102,7 +102,7 @@ func (bi *BackendsInstall) Run(ctx *cliContext.Context) error {
 		}
 	}
-	modelLoader := model.NewModelLoader(systemState, true)
+	modelLoader := model.NewModelLoader(systemState)
 	err = startup.InstallExternalBackends(context.Background(), galleries, systemState, modelLoader, progressCallback, bi.BackendArgs, bi.Name, bi.Alias)
 	if err != nil {
 		return err
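
The hunks in the other CLI commands below repeat this same mechanical
change: `NewModelLoader` loses its single-backend boolean, since the
limit is now carried by the application options rather than the loader
constructor. A hedged sketch of the new calling convention (import
paths and the `AppOption`/`ModelLoader` type names are assumed from the
LocalAI tree, not shown in this diff):

```go
package cli // illustrative placement

import (
	"github.com/mudler/LocalAI/core/config" // import paths assumed
	"github.com/mudler/LocalAI/pkg/model"
	"github.com/mudler/LocalAI/pkg/system"
)

// newLoader shows the new calling convention. Before this commit the
// boolean second argument selected single-backend mode:
//
//	ml := model.NewModelLoader(systemState, true)
//
// Now the loader is constructed plainly and the limit, if any, is an
// application option handled by the run command.
func newLoader(systemState *system.SystemState) (*model.ModelLoader, []config.AppOption) {
	ml := model.NewModelLoader(systemState)
	opts := []config.AppOption{config.SetMaxActiveBackends(1)} // illustrative limit
	return ml, opts
}
```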


@@ -80,7 +80,7 @@ func (mi *ModelsInstall) Run(ctx *cliContext.Context) error {
 		return err
 	}
-	galleryService := services.NewGalleryService(&config.ApplicationConfig{}, model.NewModelLoader(systemState, true))
+	galleryService := services.NewGalleryService(&config.ApplicationConfig{}, model.NewModelLoader(systemState))
 	err = galleryService.Start(context.Background(), config.NewModelConfigLoader(mi.ModelsPath), systemState)
 	if err != nil {
 		return err
@@ -134,7 +134,7 @@ func (mi *ModelsInstall) Run(ctx *cliContext.Context) error {
 		log.Info().Str("model", modelName).Str("license", model.License).Msg("installing model")
 	}
-	modelLoader := model.NewModelLoader(systemState, true)
+	modelLoader := model.NewModelLoader(systemState)
 	err = startup.InstallModels(context.Background(), galleryService, galleries, backendGalleries, systemState, modelLoader, !mi.DisablePredownloadScan, mi.AutoloadBackendGalleries, progressCallback, modelName)
 	if err != nil {
 		return err


@@ -64,7 +64,8 @@ type RunCMD struct {
 	Peer2PeerToken string `env:"LOCALAI_P2P_TOKEN,P2P_TOKEN,TOKEN" name:"p2ptoken" help:"Token for P2P mode (optional)" group:"p2p"`
 	Peer2PeerNetworkID string `env:"LOCALAI_P2P_NETWORK_ID,P2P_NETWORK_ID" help:"Network ID for P2P mode, can be set arbitrarly by the user for grouping a set of instances" group:"p2p"`
 	ParallelRequests bool `env:"LOCALAI_PARALLEL_REQUESTS,PARALLEL_REQUESTS" help:"Enable backends to handle multiple requests in parallel if they support it (e.g.: llama.cpp or vllm)" group:"backends"`
-	SingleActiveBackend bool `env:"LOCALAI_SINGLE_ACTIVE_BACKEND,SINGLE_ACTIVE_BACKEND" help:"Allow only one backend to be run at a time" group:"backends"`
+	SingleActiveBackend bool `env:"LOCALAI_SINGLE_ACTIVE_BACKEND,SINGLE_ACTIVE_BACKEND" help:"Allow only one backend to be run at a time (deprecated: use --max-active-backends=1 instead)" group:"backends"`
+	MaxActiveBackends int `env:"LOCALAI_MAX_ACTIVE_BACKENDS,MAX_ACTIVE_BACKENDS" default:"0" help:"Maximum number of backends to keep loaded at once (0 = unlimited, 1 = single backend mode). Least recently used backends are evicted when limit is reached" group:"backends"`
 	PreloadBackendOnly bool `env:"LOCALAI_PRELOAD_BACKEND_ONLY,PRELOAD_BACKEND_ONLY" default:"false" help:"Do not launch the API services, only the preloaded models / backends are started (useful for multi-node setups)" group:"backends"`
 	ExternalGRPCBackends []string `env:"LOCALAI_EXTERNAL_GRPC_BACKENDS,EXTERNAL_GRPC_BACKENDS" help:"A list of external grpc backends" group:"backends"`
 	EnableWatchdogIdle bool `env:"LOCALAI_WATCHDOG_IDLE,WATCHDOG_IDLE" default:"false" help:"Enable watchdog for stopping backends that are idle longer than the watchdog-idle-timeout" group:"backends"`
@@ -202,7 +203,13 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
 	if r.ParallelRequests {
 		opts = append(opts, config.EnableParallelBackendRequests)
 	}
-	if r.SingleActiveBackend {
+	// Handle max active backends (LRU eviction)
+	// MaxActiveBackends takes precedence over SingleActiveBackend
+	if r.MaxActiveBackends > 0 {
+		opts = append(opts, config.SetMaxActiveBackends(r.MaxActiveBackends))
+	} else if r.SingleActiveBackend {
+		// Backward compatibility: --single-active-backend is equivalent to --max-active-backends=1
 		opts = append(opts, config.EnableSingleBackend)
 	}
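
The precedence logic above collapses to a single effective limit. A
minimal standalone sketch (the helper name is hypothetical, not part of
this changeset):

```go
package main

import "fmt"

// effectiveBackendLimit mirrors the precedence in the hunk above:
// an explicit --max-active-backends wins; otherwise the deprecated
// --single-active-backend maps to a limit of 1; 0 means unlimited.
func effectiveBackendLimit(maxActiveBackends int, singleActiveBackend bool) int {
	if maxActiveBackends > 0 {
		return maxActiveBackends
	}
	if singleActiveBackend {
		return 1
	}
	return 0
}

func main() {
	fmt.Println(effectiveBackendLimit(0, false)) // 0: unlimited (default)
	fmt.Println(effectiveBackendLimit(0, true))  // 1: legacy single-backend mode
	fmt.Println(effectiveBackendLimit(4, true))  // 4: the explicit flag wins
}
```

In environment-variable terms, `LOCALAI_MAX_ACTIVE_BACKENDS=4` therefore
overrides a lingering `LOCALAI_SINGLE_ACTIVE_BACKEND=true`.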


@@ -79,7 +79,7 @@ func (t *SoundGenerationCMD) Run(ctx *cliContext.Context) error {
 		GeneratedContentDir: outputDir,
 		ExternalGRPCBackends: externalBackends,
 	}
-	ml := model.NewModelLoader(systemState, opts.SingleBackend)
+	ml := model.NewModelLoader(systemState)
 	defer func() {
 		err := ml.StopAllGRPC()


@@ -38,7 +38,7 @@ func (t *TranscriptCMD) Run(ctx *cliContext.Context) error {
 	}
 	cl := config.NewModelConfigLoader(t.ModelsPath)
-	ml := model.NewModelLoader(systemState, opts.SingleBackend)
+	ml := model.NewModelLoader(systemState)
 	if err := cl.LoadModelConfigsFromPath(t.ModelsPath); err != nil {
 		return err
 	}


@@ -48,7 +48,7 @@ func (t *TTSCMD) Run(ctx *cliContext.Context) error {
 		GeneratedContentDir: outputDir,
 	}
-	ml := model.NewModelLoader(systemState, opts.SingleBackend)
+	ml := model.NewModelLoader(systemState)
 	defer func() {
 		err := ml.StopAllGRPC()


@@ -37,7 +37,7 @@ func findLLamaCPPBackend(galleries string, systemState *system.SystemState) (str
 	backend, ok := backends.Get(llamaCPPGalleryName)
 	if !ok {
-		ml := model.NewModelLoader(systemState, true)
+		ml := model.NewModelLoader(systemState)
 		var gals []config.Gallery
 		if err := json.Unmarshal([]byte(galleries), &gals); err != nil {
 			log.Error().Err(err).Msg("failed loading galleries")