diff --git a/core/application/application.go b/core/application/application.go
index 3e241c698..38a9d2cf9 100644
--- a/core/application/application.go
+++ b/core/application/application.go
@@ -29,7 +29,7 @@ type Application struct {
 func newApplication(appConfig *config.ApplicationConfig) *Application {
     return &Application{
         backendLoader:      config.NewModelConfigLoader(appConfig.SystemState.Model.ModelsPath),
-        modelLoader:        model.NewModelLoader(appConfig.SystemState, appConfig.SingleBackend),
+        modelLoader:        model.NewModelLoader(appConfig.SystemState),
         applicationConfig:  appConfig,
         templatesEvaluator: templates.NewEvaluator(appConfig.SystemState.Model.ModelsPath),
     }
diff --git a/core/application/config_file_watcher.go b/core/application/config_file_watcher.go
index 999d29aec..30b3e5ad6 100644
--- a/core/application/config_file_watcher.go
+++ b/core/application/config_file_watcher.go
@@ -191,7 +191,8 @@ type runtimeSettings struct {
     WatchdogBusyEnabled     *bool   `json:"watchdog_busy_enabled,omitempty"`
     WatchdogIdleTimeout     *string `json:"watchdog_idle_timeout,omitempty"`
     WatchdogBusyTimeout     *string `json:"watchdog_busy_timeout,omitempty"`
-    SingleBackend           *bool   `json:"single_backend,omitempty"`
+    SingleBackend           *bool   `json:"single_backend,omitempty"` // Deprecated: use MaxActiveBackends = 1 instead
+    MaxActiveBackends       *int    `json:"max_active_backends,omitempty"` // Maximum number of active backends (0 = unlimited, 1 = single backend mode)
     ParallelBackendRequests *bool   `json:"parallel_backend_requests,omitempty"`
     Threads                 *int    `json:"threads,omitempty"`
     ContextSize             *int    `json:"context_size,omitempty"`
@@ -224,6 +225,7 @@ func readRuntimeSettingsJson(startupAppConfig config.ApplicationConfig) fileHand
     envWatchdogIdleTimeout := appConfig.WatchDogIdleTimeout == startupAppConfig.WatchDogIdleTimeout
     envWatchdogBusyTimeout := appConfig.WatchDogBusyTimeout == startupAppConfig.WatchDogBusyTimeout
     envSingleBackend := appConfig.SingleBackend == startupAppConfig.SingleBackend
+    envMaxActiveBackends := appConfig.MaxActiveBackends == startupAppConfig.MaxActiveBackends
     envParallelRequests := appConfig.ParallelBackendRequests == startupAppConfig.ParallelBackendRequests
     envThreads := appConfig.Threads == startupAppConfig.Threads
     envContextSize := appConfig.ContextSize == startupAppConfig.ContextSize
@@ -275,8 +277,19 @@ func readRuntimeSettingsJson(startupAppConfig config.ApplicationConfig) fileHand
             log.Warn().Err(err).Str("timeout", *settings.WatchdogBusyTimeout).Msg("invalid watchdog busy timeout in runtime_settings.json")
         }
     }
-    if settings.SingleBackend != nil && !envSingleBackend {
+    // Handle MaxActiveBackends (new) and SingleBackend (deprecated)
+    if settings.MaxActiveBackends != nil && !envMaxActiveBackends {
+        appConfig.MaxActiveBackends = *settings.MaxActiveBackends
+        // For backward compatibility, also set SingleBackend if MaxActiveBackends == 1
+        appConfig.SingleBackend = (*settings.MaxActiveBackends == 1)
+    } else if settings.SingleBackend != nil && !envSingleBackend {
+        // Legacy: SingleBackend maps to MaxActiveBackends = 1
         appConfig.SingleBackend = *settings.SingleBackend
+        if *settings.SingleBackend {
+            appConfig.MaxActiveBackends = 1
+        } else {
+            appConfig.MaxActiveBackends = 0
+        }
     }
     if settings.ParallelBackendRequests != nil && !envParallelRequests {
         appConfig.ParallelBackendRequests = *settings.ParallelBackendRequests
diff --git a/core/application/startup.go b/core/application/startup.go
index 2bbbdfac7..490fee24e 100644
--- a/core/application/startup.go
+++ b/core/application/startup.go
@@ -224,7 +224,8 @@ func loadRuntimeSettingsFromFile(options *config.ApplicationConfig) {
         WatchdogBusyEnabled     *bool   `json:"watchdog_busy_enabled,omitempty"`
         WatchdogIdleTimeout     *string `json:"watchdog_idle_timeout,omitempty"`
         WatchdogBusyTimeout     *string `json:"watchdog_busy_timeout,omitempty"`
-        SingleBackend           *bool   `json:"single_backend,omitempty"`
+        SingleBackend           *bool   `json:"single_backend,omitempty"` // Deprecated: use MaxActiveBackends = 1 instead
+        MaxActiveBackends       *int    `json:"max_active_backends,omitempty"` // Maximum number of active backends (0 = unlimited)
         ParallelBackendRequests *bool   `json:"parallel_backend_requests,omitempty"`
         AgentJobRetentionDays   *int    `json:"agent_job_retention_days,omitempty"`
     }
@@ -280,9 +281,21 @@ func loadRuntimeSettingsFromFile(options *config.ApplicationConfig) {
             }
         }
     }
-    if settings.SingleBackend != nil {
+    // Handle MaxActiveBackends (new) and SingleBackend (deprecated)
+    if settings.MaxActiveBackends != nil {
+        // Only apply if current value is default (0), suggesting it wasn't set from env var
+        if options.MaxActiveBackends == 0 {
+            options.MaxActiveBackends = *settings.MaxActiveBackends
+            // For backward compatibility, also set SingleBackend if MaxActiveBackends == 1
+            options.SingleBackend = (*settings.MaxActiveBackends == 1)
+        }
+    } else if settings.SingleBackend != nil {
+        // Legacy: SingleBackend maps to MaxActiveBackends = 1
         if !options.SingleBackend {
             options.SingleBackend = *settings.SingleBackend
+            if *settings.SingleBackend {
+                options.MaxActiveBackends = 1
+            }
         }
     }
     if settings.ParallelBackendRequests != nil {
@@ -307,15 +320,25 @@ func loadRuntimeSettingsFromFile(options *config.ApplicationConfig) {
 
 // initializeWatchdog initializes the watchdog with current ApplicationConfig settings
 func initializeWatchdog(application *Application, options *config.ApplicationConfig) {
-    if options.WatchDog {
+    // Get effective max active backends (considers both MaxActiveBackends and deprecated SingleBackend)
+    lruLimit := options.GetEffectiveMaxActiveBackends()
+
+    // Create watchdog if enabled OR if LRU limit is set
+    if options.WatchDog || lruLimit > 0 {
         wd := model.NewWatchDog(
             application.ModelLoader(),
             options.WatchDogBusyTimeout,
             options.WatchDogIdleTimeout,
             options.WatchDogBusy,
-            options.WatchDogIdle)
+            options.WatchDogIdle,
+            lruLimit)
         application.ModelLoader().SetWatchDog(wd)
-        go wd.Run()
+
+        // Start watchdog goroutine only if busy/idle checks are enabled
+        if options.WatchDogBusy || options.WatchDogIdle {
+            go wd.Run()
+        }
+
         go func() {
             <-options.Context.Done()
             log.Debug().Msgf("Context canceled, shutting down")
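Taken together, the two settings readers above implement one precedence rule: `max_active_backends` from `runtime_settings.json` wins over the deprecated `single_backend`, and neither overrides a value that was already set from an env var or flag. A minimal, self-contained sketch of that rule — the harness and helper name here are illustrative, not LocalAI's actual code:

```go
package main

import (
	"encoding/json"
	"fmt"
)

// Mirrors the runtimeSettings fields touched by this patch.
type runtimeSettings struct {
	SingleBackend     *bool `json:"single_backend,omitempty"`      // deprecated
	MaxActiveBackends *int  `json:"max_active_backends,omitempty"` // 0 = unlimited
}

// applySettings reproduces the precedence from loadRuntimeSettingsFromFile:
// max_active_backends (if present) wins; otherwise the legacy single_backend
// flag is mapped to a limit of 1.
func applySettings(raw []byte, maxActive int, single bool) (int, bool, error) {
	var s runtimeSettings
	if err := json.Unmarshal(raw, &s); err != nil {
		return maxActive, single, err
	}
	if s.MaxActiveBackends != nil {
		if maxActive == 0 { // 0 means "not already set from env/flag"
			maxActive = *s.MaxActiveBackends
			single = (*s.MaxActiveBackends == 1)
		}
	} else if s.SingleBackend != nil && !single {
		single = *s.SingleBackend
		if single {
			maxActive = 1
		}
	}
	return maxActive, single, nil
}

func main() {
	// A runtime_settings.json that still uses the legacy flag.
	legacy := []byte(`{"single_backend": true}`)
	maxActive, single, _ := applySettings(legacy, 0, false)
	fmt.Println(maxActive, single) // 1 true

	// The new field takes precedence and supports limits > 1.
	modern := []byte(`{"single_backend": false, "max_active_backends": 3}`)
	maxActive, single, _ = applySettings(modern, 0, false)
	fmt.Println(maxActive, single) // 3 false
}
```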
diff --git a/core/application/watchdog.go b/core/application/watchdog.go
index 20acf0b7a..e82ac28dc 100644
--- a/core/application/watchdog.go
+++ b/core/application/watchdog.go
@@ -20,21 +20,29 @@ func (a *Application) StopWatchdog() error {
 func (a *Application) startWatchdog() error {
     appConfig := a.ApplicationConfig()
 
-    // Create new watchdog if enabled
-    if appConfig.WatchDog {
+    // Get effective max active backends (considers both MaxActiveBackends and deprecated SingleBackend)
+    lruLimit := appConfig.GetEffectiveMaxActiveBackends()
+
+    // Create watchdog if enabled OR if LRU limit is set
+    // LRU eviction requires watchdog infrastructure even without busy/idle checks
+    if appConfig.WatchDog || lruLimit > 0 {
         wd := model.NewWatchDog(
             a.modelLoader,
             appConfig.WatchDogBusyTimeout,
             appConfig.WatchDogIdleTimeout,
             appConfig.WatchDogBusy,
-            appConfig.WatchDogIdle)
+            appConfig.WatchDogIdle,
+            lruLimit)
         a.modelLoader.SetWatchDog(wd)
 
         // Create new stop channel
         a.watchdogStop = make(chan bool, 1)
 
-        // Start watchdog goroutine
-        go wd.Run()
+        // Start watchdog goroutine only if busy/idle checks are enabled
+        // LRU eviction doesn't need the Run() loop - it's triggered on model load
+        if appConfig.WatchDogBusy || appConfig.WatchDogIdle {
+            go wd.Run()
+        }
 
         // Setup shutdown handler
         go func() {
@@ -48,7 +56,7 @@ func (a *Application) startWatchdog() error {
             }
         }()
 
-        log.Info().Msg("Watchdog started with new settings")
+        log.Info().Int("lruLimit", lruLimit).Bool("busyCheck", appConfig.WatchDogBusy).Bool("idleCheck", appConfig.WatchDogIdle).Msg("Watchdog started with new settings")
     } else {
         log.Info().Msg("Watchdog disabled")
     }
diff --git a/core/backend/detection.go b/core/backend/detection.go
index a3a443952..1b1991824 100644
--- a/core/backend/detection.go
+++ b/core/backend/detection.go
@@ -20,7 +20,6 @@ func Detection(
     if err != nil {
         return nil, err
     }
-    defer loader.Close()
 
     if detectionModel == nil {
         return nil, fmt.Errorf("could not load detection model")
diff --git a/core/backend/embeddings.go b/core/backend/embeddings.go
index c809992a4..2383023c0 100644
--- a/core/backend/embeddings.go
+++ b/core/backend/embeddings.go
@@ -17,7 +17,6 @@ func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, modelConf
     if err != nil {
         return nil, err
     }
-    defer loader.Close()
 
     var fn func() ([]float32, error)
     switch model := inferenceModel.(type) {
diff --git a/core/backend/image.go b/core/backend/image.go
index 796c71979..b6bb4f8a7 100644
--- a/core/backend/image.go
+++ b/core/backend/image.go
@@ -16,7 +16,6 @@ func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negat
     if err != nil {
         return nil, err
     }
-    defer loader.Close()
 
     fn := func() error {
         _, err := inferenceModel.GenerateImage(
diff --git a/core/backend/llm.go b/core/backend/llm.go
index c00f5876d..92ba91839 100644
--- a/core/backend/llm.go
+++ b/core/backend/llm.go
@@ -60,7 +60,6 @@ func ModelInference(ctx context.Context, s string, messages schema.Messages, ima
     if err != nil {
         return nil, err
     }
-    defer loader.Close()
 
     var protoMessages []*proto.Message
     // if we are using the tokenizer template, we need to convert the messages to proto messages
diff --git a/core/backend/rerank.go b/core/backend/rerank.go
index 068d05e68..bcfad7382 100644
--- a/core/backend/rerank.go
+++ b/core/backend/rerank.go
@@ -15,7 +15,6 @@ func Rerank(request *proto.RerankRequest, loader *model.ModelLoader, appConfig *
     if err != nil {
         return nil, err
     }
-    defer loader.Close()
 
     if rerankModel == nil {
         return nil, fmt.Errorf("could not load rerank model")
diff --git a/core/backend/soundgeneration.go b/core/backend/soundgeneration.go
index 2c91958cf..ca78b2db9 100644
--- a/core/backend/soundgeneration.go
+++ b/core/backend/soundgeneration.go
@@ -29,7 +29,6 @@ func SoundGeneration(
     if err != nil {
         return "", nil, err
     }
-    defer loader.Close()
 
     if soundGenModel == nil {
         return "", nil, fmt.Errorf("could not load sound generation model")
diff --git a/core/backend/token_metrics.go b/core/backend/token_metrics.go
index c3e15d773..c81f57cab 100644
--- a/core/backend/token_metrics.go
+++ b/core/backend/token_metrics.go
@@ -20,7 +20,6 @@ func TokenMetrics(
     if err != nil {
         return nil, err
     }
-    defer loader.Close()
 
     if model == nil {
         return nil, fmt.Errorf("could not loadmodel model")
diff --git a/core/backend/tokenize.go b/core/backend/tokenize.go
index e85958b27..5803e44be 100644
--- a/core/backend/tokenize.go
+++ b/core/backend/tokenize.go
@@ -17,7 +17,6 @@ func ModelTokenize(s string, loader *model.ModelLoader, modelConfig config.Model
     if err != nil {
         return schema.TokenizeResponse{}, err
     }
-    defer loader.Close()
 
     predictOptions := gRPCPredictOpts(modelConfig, loader.ModelPath)
     predictOptions.Prompt = s
diff --git a/core/backend/transcript.go b/core/backend/transcript.go
index 576458250..9781e26fd 100644
--- a/core/backend/transcript.go
+++ b/core/backend/transcript.go
@@ -24,7 +24,6 @@ func ModelTranscription(audio, language string, translate bool, diarize bool, ml
     if err != nil {
         return nil, err
     }
-    defer ml.Close()
 
     if transcriptionModel == nil {
         return nil, fmt.Errorf("could not load transcription model")
diff --git a/core/backend/tts.go b/core/backend/tts.go
index 7b478a5fc..9c75cb37a 100644
--- a/core/backend/tts.go
+++ b/core/backend/tts.go
@@ -26,7 +26,6 @@ func ModelTTS(
     if err != nil {
         return "", nil, err
     }
-    defer loader.Close()
 
     if ttsModel == nil {
         return "", nil, fmt.Errorf("could not load tts model %q", modelConfig.Model)
diff --git a/core/backend/vad.go b/core/backend/vad.go
index 91f70bbc3..37859931d 100644
--- a/core/backend/vad.go
+++ b/core/backend/vad.go
@@ -19,7 +19,6 @@ func VAD(request *schema.VADRequest,
     if err != nil {
         return nil, err
     }
-    defer ml.Close()
 
     req := proto.VADRequest{
         Audio: request.Audio,
diff --git a/core/backend/video.go b/core/backend/video.go
index a7a39bf24..666a76252 100644
--- a/core/backend/video.go
+++ b/core/backend/video.go
@@ -16,7 +16,6 @@ func VideoGeneration(height, width int32, prompt, negativePrompt, startImage, en
     if err != nil {
         return nil, err
     }
-    defer loader.Close()
 
     fn := func() error {
         _, err := inferenceModel.GenerateVideo(
diff --git a/core/cli/backends.go b/core/cli/backends.go
index 6ccc6496a..aa32a40e4 100644
--- a/core/cli/backends.go
+++ b/core/cli/backends.go
@@ -102,7 +102,7 @@ func (bi *BackendsInstall) Run(ctx *cliContext.Context) error {
         }
     }
 
-    modelLoader := model.NewModelLoader(systemState, true)
+    modelLoader := model.NewModelLoader(systemState)
     err = startup.InstallExternalBackends(context.Background(), galleries, systemState, modelLoader, progressCallback, bi.BackendArgs, bi.Name, bi.Alias)
     if err != nil {
         return err
diff --git a/core/cli/models.go b/core/cli/models.go
index bcbb60d48..ba76e8527 100644
--- a/core/cli/models.go
+++ b/core/cli/models.go
@@ -80,7 +80,7 @@ func (mi *ModelsInstall) Run(ctx *cliContext.Context) error {
         return err
     }
 
-    galleryService := services.NewGalleryService(&config.ApplicationConfig{}, model.NewModelLoader(systemState, true))
+    galleryService := services.NewGalleryService(&config.ApplicationConfig{}, model.NewModelLoader(systemState))
     err = galleryService.Start(context.Background(), config.NewModelConfigLoader(mi.ModelsPath), systemState)
     if err != nil {
         return err
@@ -134,7 +134,7 @@ func (mi *ModelsInstall) Run(ctx *cliContext.Context) error {
         log.Info().Str("model", modelName).Str("license", model.License).Msg("installing model")
     }
 
-    modelLoader := model.NewModelLoader(systemState, true)
+    modelLoader := model.NewModelLoader(systemState)
     err = startup.InstallModels(context.Background(), galleryService, galleries, backendGalleries, systemState, modelLoader, !mi.DisablePredownloadScan, mi.AutoloadBackendGalleries, progressCallback, modelName)
     if err != nil {
         return err
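All of the `defer loader.Close()` / `defer ml.Close()` removals above follow from the same design change: per-request teardown is gone, and backend lifetime is instead bounded centrally by the watchdog's LRU limit, triggered on model load. As a rough sketch of the eviction policy only — the real `model.WatchDog` manages gRPC backend processes and busy/idle state, not strings:

```go
package main

import (
	"container/list"
	"fmt"
)

// lruLimiter is an illustrative stand-in for "evict the least recently used
// backend when the limit is reached".
type lruLimiter struct {
	limit int // 0 = unlimited
	order *list.List
	items map[string]*list.Element
	stop  func(name string) // eviction callback, e.g. shut down the backend
}

func newLRULimiter(limit int, stop func(string)) *lruLimiter {
	return &lruLimiter{limit: limit, order: list.New(), items: map[string]*list.Element{}, stop: stop}
}

// Touch marks a backend as most recently used, registering it if needed and
// evicting the LRU entry when the limit would otherwise be exceeded.
func (l *lruLimiter) Touch(name string) {
	if el, ok := l.items[name]; ok {
		l.order.MoveToFront(el)
		return
	}
	if l.limit > 0 && l.order.Len() >= l.limit {
		oldest := l.order.Back()
		victim := oldest.Value.(string)
		l.order.Remove(oldest)
		delete(l.items, victim)
		l.stop(victim)
	}
	l.items[name] = l.order.PushFront(name)
}

func main() {
	lru := newLRULimiter(2, func(name string) { fmt.Println("evicting", name) })
	lru.Touch("llama")   // load llama
	lru.Touch("whisper") // load whisper
	lru.Touch("llama")   // llama becomes most recently used
	lru.Touch("piper")   // limit reached: evicts whisper
}
```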
diff --git a/core/cli/run.go b/core/cli/run.go
index 3cc77baf1..4df4fbdf3 100644
--- a/core/cli/run.go
+++ b/core/cli/run.go
@@ -64,7 +64,8 @@ type RunCMD struct {
     Peer2PeerToken       string   `env:"LOCALAI_P2P_TOKEN,P2P_TOKEN,TOKEN" name:"p2ptoken" help:"Token for P2P mode (optional)" group:"p2p"`
     Peer2PeerNetworkID   string   `env:"LOCALAI_P2P_NETWORK_ID,P2P_NETWORK_ID" help:"Network ID for P2P mode, can be set arbitrarly by the user for grouping a set of instances" group:"p2p"`
     ParallelRequests     bool     `env:"LOCALAI_PARALLEL_REQUESTS,PARALLEL_REQUESTS" help:"Enable backends to handle multiple requests in parallel if they support it (e.g.: llama.cpp or vllm)" group:"backends"`
-    SingleActiveBackend  bool     `env:"LOCALAI_SINGLE_ACTIVE_BACKEND,SINGLE_ACTIVE_BACKEND" help:"Allow only one backend to be run at a time" group:"backends"`
+    SingleActiveBackend  bool     `env:"LOCALAI_SINGLE_ACTIVE_BACKEND,SINGLE_ACTIVE_BACKEND" help:"Allow only one backend to be run at a time (deprecated: use --max-active-backends=1 instead)" group:"backends"`
+    MaxActiveBackends    int      `env:"LOCALAI_MAX_ACTIVE_BACKENDS,MAX_ACTIVE_BACKENDS" default:"0" help:"Maximum number of backends to keep loaded at once (0 = unlimited, 1 = single backend mode). Least recently used backends are evicted when limit is reached" group:"backends"`
     PreloadBackendOnly   bool     `env:"LOCALAI_PRELOAD_BACKEND_ONLY,PRELOAD_BACKEND_ONLY" default:"false" help:"Do not launch the API services, only the preloaded models / backends are started (useful for multi-node setups)" group:"backends"`
     ExternalGRPCBackends []string `env:"LOCALAI_EXTERNAL_GRPC_BACKENDS,EXTERNAL_GRPC_BACKENDS" help:"A list of external grpc backends" group:"backends"`
     EnableWatchdogIdle   bool     `env:"LOCALAI_WATCHDOG_IDLE,WATCHDOG_IDLE" default:"false" help:"Enable watchdog for stopping backends that are idle longer than the watchdog-idle-timeout" group:"backends"`
@@ -202,7 +203,13 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
     if r.ParallelRequests {
         opts = append(opts, config.EnableParallelBackendRequests)
     }
-    if r.SingleActiveBackend {
+
+    // Handle max active backends (LRU eviction)
+    // MaxActiveBackends takes precedence over SingleActiveBackend
+    if r.MaxActiveBackends > 0 {
+        opts = append(opts, config.SetMaxActiveBackends(r.MaxActiveBackends))
+    } else if r.SingleActiveBackend {
+        // Backward compatibility: --single-active-backend is equivalent to --max-active-backends=1
         opts = append(opts, config.EnableSingleBackend)
     }
diff --git a/core/cli/soundgeneration.go b/core/cli/soundgeneration.go
index a0f96b4fb..c94bb294d 100644
--- a/core/cli/soundgeneration.go
+++ b/core/cli/soundgeneration.go
@@ -79,7 +79,7 @@ func (t *SoundGenerationCMD) Run(ctx *cliContext.Context) error {
         GeneratedContentDir:  outputDir,
         ExternalGRPCBackends: externalBackends,
     }
-    ml := model.NewModelLoader(systemState, opts.SingleBackend)
+    ml := model.NewModelLoader(systemState)
 
     defer func() {
         err := ml.StopAllGRPC()
diff --git a/core/cli/transcript.go b/core/cli/transcript.go
index 30a686aab..2beb00944 100644
--- a/core/cli/transcript.go
+++ b/core/cli/transcript.go
@@ -38,7 +38,7 @@ func (t *TranscriptCMD) Run(ctx *cliContext.Context) error {
     }
 
     cl := config.NewModelConfigLoader(t.ModelsPath)
-    ml := model.NewModelLoader(systemState, opts.SingleBackend)
+    ml := model.NewModelLoader(systemState)
     if err := cl.LoadModelConfigsFromPath(t.ModelsPath); err != nil {
         return err
     }
diff --git a/core/cli/tts.go b/core/cli/tts.go
index ed0266714..43c1749e8 100644
--- a/core/cli/tts.go
+++ b/core/cli/tts.go
@@ -48,7 +48,7 @@ func (t *TTSCMD) Run(ctx *cliContext.Context) error {
         GeneratedContentDir: outputDir,
     }
 
-    ml := model.NewModelLoader(systemState, opts.SingleBackend)
+    ml := model.NewModelLoader(systemState)
 
     defer func() {
         err := ml.StopAllGRPC()
diff --git a/core/cli/worker/worker_llamacpp.go b/core/cli/worker/worker_llamacpp.go
index 1b4be6736..a7ecec406 100644
--- a/core/cli/worker/worker_llamacpp.go
+++ b/core/cli/worker/worker_llamacpp.go
@@ -37,7 +37,7 @@ func findLLamaCPPBackend(galleries string, systemState *system.SystemState) (str
     backend, ok := backends.Get(llamaCPPGalleryName)
     if !ok {
-        ml := model.NewModelLoader(systemState, true)
+        ml := model.NewModelLoader(systemState)
         var gals []config.Gallery
         if err := json.Unmarshal([]byte(galleries), &gals); err != nil {
             log.Error().Err(err).Msg("failed loading galleries")
diff --git a/core/config/application_config.go b/core/config/application_config.go
index 4d770179b..c67e24f5c 100644
--- a/core/config/application_config.go
+++ b/core/config/application_config.go
@@ -52,7 +52,8 @@ type ApplicationConfig struct {
     AutoloadGalleries,
     AutoloadBackendGalleries bool
 
-    SingleBackend           bool
+    SingleBackend           bool // Deprecated: use MaxActiveBackends = 1 instead
+    MaxActiveBackends       int  // Maximum number of active backends (0 = unlimited, 1 = single backend mode)
     ParallelBackendRequests bool
 
     WatchDogIdle bool
@@ -186,8 +187,38 @@ func SetWatchDogIdleTimeout(t time.Duration) AppOption {
     }
 }
 
+// EnableSingleBackend is deprecated: use SetMaxActiveBackends(1) instead.
+// This is kept for backward compatibility.
 var EnableSingleBackend = func(o *ApplicationConfig) {
     o.SingleBackend = true
+    o.MaxActiveBackends = 1
+}
+
+// SetMaxActiveBackends sets the maximum number of active backends.
+// 0 = unlimited, 1 = single backend mode (replaces EnableSingleBackend)
+func SetMaxActiveBackends(n int) AppOption {
+    return func(o *ApplicationConfig) {
+        o.MaxActiveBackends = n
+        // For backward compatibility, also set SingleBackend if n == 1
+        if n == 1 {
+            o.SingleBackend = true
+        }
+    }
+}
+
+// GetEffectiveMaxActiveBackends returns the effective max active backends limit.
+// It considers both MaxActiveBackends and the deprecated SingleBackend setting.
+// If MaxActiveBackends is set (> 0), it takes precedence.
+// If SingleBackend is true and MaxActiveBackends is 0, returns 1.
+// Otherwise returns 0 (unlimited).
+func (o *ApplicationConfig) GetEffectiveMaxActiveBackends() int {
+    if o.MaxActiveBackends > 0 {
+        return o.MaxActiveBackends
+    }
+    if o.SingleBackend {
+        return 1
+    }
+    return 0
 }
 
 var EnableParallelBackendRequests = func(o *ApplicationConfig) {
diff --git a/core/gallery/backends_test.go b/core/gallery/backends_test.go
index 756d2e7a2..3799dc682 100644
--- a/core/gallery/backends_test.go
+++ b/core/gallery/backends_test.go
@@ -108,7 +108,7 @@ var _ = Describe("Gallery Backends", func() {
         }
         systemState, err = system.GetSystemState(system.WithBackendPath(tempDir))
         Expect(err).NotTo(HaveOccurred())
-        ml = model.NewModelLoader(systemState, true)
+        ml = model.NewModelLoader(systemState)
     })
 
     AfterEach(func() {
diff --git a/core/http/endpoints/localai/settings.go b/core/http/endpoints/localai/settings.go
index d5c5cd7db..dee77646e 100644
--- a/core/http/endpoints/localai/settings.go
+++ b/core/http/endpoints/localai/settings.go
@@ -27,7 +27,8 @@ type RuntimeSettings struct {
     WatchdogBusyEnabled     *bool   `json:"watchdog_busy_enabled,omitempty"`
     WatchdogIdleTimeout     *string `json:"watchdog_idle_timeout,omitempty"`
     WatchdogBusyTimeout     *string `json:"watchdog_busy_timeout,omitempty"`
-    SingleBackend           *bool   `json:"single_backend,omitempty"`
+    SingleBackend           *bool   `json:"single_backend,omitempty"` // Deprecated: use MaxActiveBackends = 1 instead
+    MaxActiveBackends       *int    `json:"max_active_backends,omitempty"` // Maximum number of active backends (0 = unlimited, 1 = single backend mode)
     ParallelBackendRequests *bool   `json:"parallel_backend_requests,omitempty"`
     Threads                 *int    `json:"threads,omitempty"`
     ContextSize             *int    `json:"context_size,omitempty"`
@@ -65,6 +66,7 @@ func GetSettingsEndpoint(app *application.Application) echo.HandlerFunc {
     watchdogBusy := appConfig.WatchDogBusy
     watchdogEnabled := appConfig.WatchDog
     singleBackend := appConfig.SingleBackend
+    maxActiveBackends := appConfig.MaxActiveBackends
     parallelBackendRequests := appConfig.ParallelBackendRequests
     threads := appConfig.Threads
     contextSize := appConfig.ContextSize
@@ -87,6 +89,7 @@ func GetSettingsEndpoint(app *application.Application) echo.HandlerFunc {
     settings.WatchdogBusyEnabled = &watchdogBusy
     settings.WatchdogEnabled = &watchdogEnabled
     settings.SingleBackend = &singleBackend
+    settings.MaxActiveBackends = &maxActiveBackends
    settings.ParallelBackendRequests = &parallelBackendRequests
     settings.Threads = &threads
     settings.ContextSize = &contextSize
@@ -223,8 +226,20 @@ func UpdateSettingsEndpoint(app *application.Application) echo.HandlerFunc {
         appConfig.WatchDogBusyTimeout = dur
         watchdogChanged = true
     }
-    if settings.SingleBackend != nil {
+    if settings.MaxActiveBackends != nil {
+        appConfig.MaxActiveBackends = *settings.MaxActiveBackends
+        // For backward compatibility, update SingleBackend too
+        appConfig.SingleBackend = (*settings.MaxActiveBackends == 1)
+        watchdogChanged = true // LRU limit is managed by watchdog
+    } else if settings.SingleBackend != nil {
+        // Legacy support: SingleBackend maps to MaxActiveBackends = 1
         appConfig.SingleBackend = *settings.SingleBackend
+        if *settings.SingleBackend {
+            appConfig.MaxActiveBackends = 1
+        } else {
+            appConfig.MaxActiveBackends = 0
+        }
+        watchdogChanged = true // LRU limit is managed by watchdog
     }
     if settings.ParallelBackendRequests != nil {
         appConfig.ParallelBackendRequests = *settings.ParallelBackendRequests
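To make the compatibility matrix concrete, here is a small hypothetical harness exercising the option setters and `GetEffectiveMaxActiveBackends` as defined in `application_config.go`, with the config type reduced to the two fields this patch touches:

```go
package main

import "fmt"

// Reduced ApplicationConfig with only the fields this patch touches.
type ApplicationConfig struct {
	SingleBackend     bool // Deprecated: use MaxActiveBackends = 1 instead
	MaxActiveBackends int  // 0 = unlimited
}

type AppOption func(*ApplicationConfig)

// EnableSingleBackend (deprecated) now also sets the LRU limit.
var EnableSingleBackend = func(o *ApplicationConfig) {
	o.SingleBackend = true
	o.MaxActiveBackends = 1
}

func SetMaxActiveBackends(n int) AppOption {
	return func(o *ApplicationConfig) {
		o.MaxActiveBackends = n
		if n == 1 {
			o.SingleBackend = true // keep the legacy flag coherent
		}
	}
}

func (o *ApplicationConfig) GetEffectiveMaxActiveBackends() int {
	if o.MaxActiveBackends > 0 {
		return o.MaxActiveBackends
	}
	if o.SingleBackend {
		return 1
	}
	return 0
}

func main() {
	for _, tc := range []struct {
		name string
		opts []AppOption
	}{
		{"unlimited", nil},
		{"legacy single backend", []AppOption{EnableSingleBackend}},
		{"limit of 3", []AppOption{SetMaxActiveBackends(3)}},
	} {
		cfg := &ApplicationConfig{}
		for _, opt := range tc.opts {
			opt(cfg)
		}
		fmt.Printf("%s -> effective limit %d\n", tc.name, cfg.GetEffectiveMaxActiveBackends())
	}
}
```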
diff --git a/core/http/endpoints/localai/stores.go b/core/http/endpoints/localai/stores.go
index 033334375..8074da9e0 100644
--- a/core/http/endpoints/localai/stores.go
+++ b/core/http/endpoints/localai/stores.go
@@ -21,7 +21,6 @@ func StoresSetEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationConfi
     if err != nil {
         return err
     }
-    defer sl.Close()
 
     vals := make([][]byte, len(input.Values))
     for i, v := range input.Values {
@@ -49,7 +48,6 @@ func StoresDeleteEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationCo
     if err != nil {
         return err
     }
-    defer sl.Close()
 
     if err := store.DeleteCols(c.Request().Context(), sb, input.Keys); err != nil {
         return err
@@ -71,7 +69,6 @@ func StoresGetEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationConfi
     if err != nil {
         return err
     }
-    defer sl.Close()
 
     keys, vals, err := store.GetCols(c.Request().Context(), sb, input.Keys)
     if err != nil {
@@ -103,7 +100,6 @@ func StoresFindEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationConf
     if err != nil {
         return err
     }
-    defer sl.Close()
 
     keys, vals, similarities, err := store.Find(c.Request().Context(), sb, input.Key, input.Topk)
     if err != nil {
diff --git a/core/http/views/settings.html b/core/http/views/settings.html
index 95bec85bb..37292007e 100644
--- a/core/http/views/settings.html
+++ b/core/http/views/settings.html
@@ -138,17 +138,15 @@
[settings.html markup lost in extraction. Recoverable content: the hunk removes the single-backend toggle, whose help text read "Allow only one backend to be active at a time", and adds a numeric max-active-backends input described as "Maximum number of models to keep loaded at once (0 = unlimited, 1 = single backend mode). Least recently used models are evicted when limit is reached."]
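For clients of the settings endpoint, the new field rides in the same JSON document as before. A sketch of the two payload shapes `UpdateSettingsEndpoint` now accepts — how the payload is POSTed is outside this diff and assumed:

```go
package main

import (
	"encoding/json"
	"fmt"
)

// Subset of the RuntimeSettings wire format from settings.go.
type RuntimeSettings struct {
	SingleBackend     *bool `json:"single_backend,omitempty"` // deprecated
	MaxActiveBackends *int  `json:"max_active_backends,omitempty"`
}

func main() {
	// New style: an explicit LRU limit of 2.
	limit := 2
	payload, _ := json.Marshal(RuntimeSettings{MaxActiveBackends: &limit})
	fmt.Println(string(payload)) // {"max_active_backends":2}

	// Legacy style: still accepted, interpreted as max_active_backends = 1.
	legacy := true
	payload, _ = json.Marshal(RuntimeSettings{SingleBackend: &legacy})
	fmt.Println(string(payload)) // {"single_backend":true}
}
```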