feat: disable force eviction (#7725)

* feat: allow forcing backend eviction while requests are in flight

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat: try to make the request sit and retry if eviction couldn't be done

Otherwise, calls that would need to shut down other backends in order
to succeed would simply fail.

In this way instead we make the request sit and retry eviction until it
succeeds. The thresholds can be configured by the user.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* add tests

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* expose settings to CLI

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Update docs

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
Ettore Di Giacinto
2025-12-25 14:26:18 +01:00
committed by GitHub
parent bb459e671f
commit c844b7ac58
18 changed files with 739 additions and 41 deletions

View File

@@ -214,6 +214,9 @@ func readRuntimeSettingsJson(startupAppConfig config.ApplicationConfig) fileHand
envAutoloadGalleries := appConfig.AutoloadGalleries == startupAppConfig.AutoloadGalleries
envAutoloadBackendGalleries := appConfig.AutoloadBackendGalleries == startupAppConfig.AutoloadBackendGalleries
envAgentJobRetentionDays := appConfig.AgentJobRetentionDays == startupAppConfig.AgentJobRetentionDays
envForceEvictionWhenBusy := appConfig.ForceEvictionWhenBusy == startupAppConfig.ForceEvictionWhenBusy
envLRUEvictionMaxRetries := appConfig.LRUEvictionMaxRetries == startupAppConfig.LRUEvictionMaxRetries
envLRUEvictionRetryInterval := appConfig.LRUEvictionRetryInterval == startupAppConfig.LRUEvictionRetryInterval
if len(fileContent) > 0 {
var settings config.RuntimeSettings
@@ -277,6 +280,20 @@ func readRuntimeSettingsJson(startupAppConfig config.ApplicationConfig) fileHand
if settings.MemoryReclaimerThreshold != nil && !envMemoryReclaimerThreshold {
appConfig.MemoryReclaimerThreshold = *settings.MemoryReclaimerThreshold
}
if settings.ForceEvictionWhenBusy != nil && !envForceEvictionWhenBusy {
appConfig.ForceEvictionWhenBusy = *settings.ForceEvictionWhenBusy
}
if settings.LRUEvictionMaxRetries != nil && !envLRUEvictionMaxRetries {
appConfig.LRUEvictionMaxRetries = *settings.LRUEvictionMaxRetries
}
if settings.LRUEvictionRetryInterval != nil && !envLRUEvictionRetryInterval {
dur, err := time.ParseDuration(*settings.LRUEvictionRetryInterval)
if err == nil {
appConfig.LRUEvictionRetryInterval = dur
} else {
xlog.Warn("invalid LRU eviction retry interval in runtime_settings.json", "error", err, "interval", *settings.LRUEvictionRetryInterval)
}
}
if settings.Threads != nil && !envThreads {
appConfig.Threads = *settings.Threads
}

View File

@@ -350,9 +350,16 @@ func initializeWatchdog(application *Application, options *config.ApplicationCon
model.WithIdleCheck(options.WatchDogIdle),
model.WithLRULimit(lruLimit),
model.WithMemoryReclaimer(options.MemoryReclaimerEnabled, options.MemoryReclaimerThreshold),
model.WithForceEvictionWhenBusy(options.ForceEvictionWhenBusy),
)
application.ModelLoader().SetWatchDog(wd)
// Initialize ModelLoader LRU eviction retry settings
application.ModelLoader().SetLRUEvictionRetrySettings(
options.LRUEvictionMaxRetries,
options.LRUEvictionRetryInterval,
)
// Start watchdog goroutine if any periodic checks are enabled
// LRU eviction doesn't need the Run() loop - it's triggered on model load
// But memory reclaimer needs the Run() loop for periodic checking

View File

@@ -35,6 +35,7 @@ func (a *Application) startWatchdog() error {
model.WithIdleCheck(appConfig.WatchDogIdle),
model.WithLRULimit(lruLimit),
model.WithMemoryReclaimer(appConfig.MemoryReclaimerEnabled, appConfig.MemoryReclaimerThreshold),
model.WithForceEvictionWhenBusy(appConfig.ForceEvictionWhenBusy),
)
a.modelLoader.SetWatchDog(wd)

View File

@@ -73,6 +73,9 @@ type RunCMD struct {
WatchdogBusyTimeout string `env:"LOCALAI_WATCHDOG_BUSY_TIMEOUT,WATCHDOG_BUSY_TIMEOUT" default:"5m" help:"Threshold beyond which a busy backend should be stopped" group:"backends"`
EnableMemoryReclaimer bool `env:"LOCALAI_MEMORY_RECLAIMER,MEMORY_RECLAIMER,LOCALAI_GPU_RECLAIMER,GPU_RECLAIMER" default:"false" help:"Enable memory threshold monitoring to auto-evict backends when memory usage exceeds threshold (uses GPU VRAM if available, otherwise RAM)" group:"backends"`
MemoryReclaimerThreshold float64 `env:"LOCALAI_MEMORY_RECLAIMER_THRESHOLD,MEMORY_RECLAIMER_THRESHOLD,LOCALAI_GPU_RECLAIMER_THRESHOLD,GPU_RECLAIMER_THRESHOLD" default:"0.95" help:"Memory usage threshold (0.0-1.0) that triggers backend eviction (default 0.95 = 95%%)" group:"backends"`
ForceEvictionWhenBusy bool `env:"LOCALAI_FORCE_EVICTION_WHEN_BUSY,FORCE_EVICTION_WHEN_BUSY" default:"false" help:"Force eviction even when models have active API calls (default: false for safety)" group:"backends"`
LRUEvictionMaxRetries int `env:"LOCALAI_LRU_EVICTION_MAX_RETRIES,LRU_EVICTION_MAX_RETRIES" default:"30" help:"Maximum number of retries when waiting for busy models to become idle before eviction (default: 30)" group:"backends"`
LRUEvictionRetryInterval string `env:"LOCALAI_LRU_EVICTION_RETRY_INTERVAL,LRU_EVICTION_RETRY_INTERVAL" default:"1s" help:"Interval between retries when waiting for busy models to become idle (e.g., 1s, 2s) (default: 1s)" group:"backends"`
Federated bool `env:"LOCALAI_FEDERATED,FEDERATED" help:"Enable federated instance" group:"federated"`
DisableGalleryEndpoint bool `env:"LOCALAI_DISABLE_GALLERY_ENDPOINT,DISABLE_GALLERY_ENDPOINT" help:"Disable the gallery endpoints" group:"api"`
MachineTag string `env:"LOCALAI_MACHINE_TAG,MACHINE_TAG" help:"Add Machine-Tag header to each response which is useful to track the machine in the P2P network" group:"api"`
@@ -220,6 +223,21 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
opts = append(opts, config.EnableSingleBackend)
}
// Handle LRU eviction settings
if r.ForceEvictionWhenBusy {
opts = append(opts, config.WithForceEvictionWhenBusy(true))
}
if r.LRUEvictionMaxRetries > 0 {
opts = append(opts, config.WithLRUEvictionMaxRetries(r.LRUEvictionMaxRetries))
}
if r.LRUEvictionRetryInterval != "" {
dur, err := time.ParseDuration(r.LRUEvictionRetryInterval)
if err != nil {
return fmt.Errorf("invalid LRU eviction retry interval: %w", err)
}
opts = append(opts, config.WithLRUEvictionRetryInterval(dur))
}
// split ":" to get backend name and the uri
for _, v := range r.ExternalGRPCBackends {
backend := v[:strings.IndexByte(v, ':')]

View File

@@ -64,6 +64,11 @@ type ApplicationConfig struct {
MemoryReclaimerEnabled bool // Enable memory threshold monitoring
MemoryReclaimerThreshold float64 // Threshold 0.0-1.0 (e.g., 0.95 = 95%)
// Eviction settings
ForceEvictionWhenBusy bool // Force eviction even when models have active API calls (default: false for safety)
LRUEvictionMaxRetries int // Maximum number of retries when waiting for busy models to become idle (default: 30)
LRUEvictionRetryInterval time.Duration // Interval between retries when waiting for busy models (default: 1s)
ModelsURL []string
WatchDogBusyTimeout, WatchDogIdleTimeout time.Duration
@@ -86,10 +91,12 @@ type AppOption func(*ApplicationConfig)
func NewApplicationConfig(o ...AppOption) *ApplicationConfig {
opt := &ApplicationConfig{
Context: context.Background(),
UploadLimitMB: 15,
Debug: true,
AgentJobRetentionDays: 30, // Default: 30 days
Context: context.Background(),
UploadLimitMB: 15,
Debug: true,
AgentJobRetentionDays: 30, // Default: 30 days
LRUEvictionMaxRetries: 30, // Default: 30 retries
LRUEvictionRetryInterval: 1 * time.Second, // Default: 1 second
PathWithoutAuth: []string{
"/static/",
"/generated-audio/",
@@ -259,6 +266,31 @@ func (o *ApplicationConfig) GetEffectiveMaxActiveBackends() int {
return 0
}
// WithForceEvictionWhenBusy toggles whether models may be evicted even while
// they still have in-flight API calls. It is disabled by default for safety.
func WithForceEvictionWhenBusy(enabled bool) AppOption {
	return func(cfg *ApplicationConfig) {
		cfg.ForceEvictionWhenBusy = enabled
	}
}
// WithLRUEvictionMaxRetries configures how many times eviction is retried
// while waiting for busy models to become idle. Non-positive values are
// ignored so the existing default is preserved.
func WithLRUEvictionMaxRetries(maxRetries int) AppOption {
	return func(cfg *ApplicationConfig) {
		if maxRetries <= 0 {
			return
		}
		cfg.LRUEvictionMaxRetries = maxRetries
	}
}
// WithLRUEvictionRetryInterval configures the pause between eviction retries
// while waiting for busy models. Non-positive durations are ignored so the
// existing default is preserved.
func WithLRUEvictionRetryInterval(interval time.Duration) AppOption {
	return func(cfg *ApplicationConfig) {
		if interval <= 0 {
			return
		}
		cfg.LRUEvictionRetryInterval = interval
	}
}
var EnableParallelBackendRequests = func(o *ApplicationConfig) {
o.ParallelBackendRequests = true
}
@@ -505,6 +537,8 @@ func (o *ApplicationConfig) ToRuntimeSettings() RuntimeSettings {
parallelBackendRequests := o.ParallelBackendRequests
memoryReclaimerEnabled := o.MemoryReclaimerEnabled
memoryReclaimerThreshold := o.MemoryReclaimerThreshold
forceEvictionWhenBusy := o.ForceEvictionWhenBusy
lruEvictionMaxRetries := o.LRUEvictionMaxRetries
threads := o.Threads
contextSize := o.ContextSize
f16 := o.F16
@@ -539,6 +573,12 @@ func (o *ApplicationConfig) ToRuntimeSettings() RuntimeSettings {
} else {
watchdogInterval = "2s" // default
}
var lruEvictionRetryInterval string
if o.LRUEvictionRetryInterval > 0 {
lruEvictionRetryInterval = o.LRUEvictionRetryInterval.String()
} else {
lruEvictionRetryInterval = "1s" // default
}
return RuntimeSettings{
WatchdogEnabled: &watchdogEnabled,
@@ -552,6 +592,9 @@ func (o *ApplicationConfig) ToRuntimeSettings() RuntimeSettings {
ParallelBackendRequests: &parallelBackendRequests,
MemoryReclaimerEnabled: &memoryReclaimerEnabled,
MemoryReclaimerThreshold: &memoryReclaimerThreshold,
ForceEvictionWhenBusy: &forceEvictionWhenBusy,
LRUEvictionMaxRetries: &lruEvictionMaxRetries,
LRUEvictionRetryInterval: &lruEvictionRetryInterval,
Threads: &threads,
ContextSize: &contextSize,
F16: &f16,
@@ -644,6 +687,20 @@ func (o *ApplicationConfig) ApplyRuntimeSettings(settings *RuntimeSettings) (req
requireRestart = true
}
}
if settings.ForceEvictionWhenBusy != nil {
o.ForceEvictionWhenBusy = *settings.ForceEvictionWhenBusy
// This setting doesn't require restart, can be updated dynamically
}
if settings.LRUEvictionMaxRetries != nil {
o.LRUEvictionMaxRetries = *settings.LRUEvictionMaxRetries
// This setting doesn't require restart, can be updated dynamically
}
if settings.LRUEvictionRetryInterval != nil {
if dur, err := time.ParseDuration(*settings.LRUEvictionRetryInterval); err == nil {
o.LRUEvictionRetryInterval = dur
// This setting doesn't require restart, can be updated dynamically
}
}
if settings.Threads != nil {
o.Threads = *settings.Threads
}

View File

@@ -26,6 +26,11 @@ type RuntimeSettings struct {
MemoryReclaimerEnabled *bool `json:"memory_reclaimer_enabled,omitempty"` // Enable memory threshold monitoring
MemoryReclaimerThreshold *float64 `json:"memory_reclaimer_threshold,omitempty"` // Threshold 0.0-1.0 (e.g., 0.95 = 95%)
// Eviction settings
ForceEvictionWhenBusy *bool `json:"force_eviction_when_busy,omitempty"` // Force eviction even when models have active API calls (default: false for safety)
LRUEvictionMaxRetries *int `json:"lru_eviction_max_retries,omitempty"` // Maximum number of retries when waiting for busy models to become idle (default: 30)
LRUEvictionRetryInterval *string `json:"lru_eviction_retry_interval,omitempty"` // Interval between retries when waiting for busy models (e.g., 1s, 2s) (default: 1s)
// Performance settings
Threads *int `json:"threads,omitempty"`
ContextSize *int `json:"context_size,omitempty"`

View File

@@ -76,6 +76,14 @@ func UpdateSettingsEndpoint(app *application.Application) echo.HandlerFunc {
})
}
}
if settings.LRUEvictionRetryInterval != nil {
if _, err := time.ParseDuration(*settings.LRUEvictionRetryInterval); err != nil {
return c.JSON(http.StatusBadRequest, schema.SettingsResponse{
Success: false,
Error: "Invalid lru_eviction_retry_interval format: " + err.Error(),
})
}
}
// Save to file
if appConfig.DynamicConfigsDir == "" {
@@ -111,6 +119,31 @@ func UpdateSettingsEndpoint(app *application.Application) echo.HandlerFunc {
appConfig.ApiKeys = append(envKeys, runtimeKeys...)
}
// Update watchdog dynamically for settings that don't require restart
if settings.ForceEvictionWhenBusy != nil {
currentWD := app.ModelLoader().GetWatchDog()
if currentWD != nil {
currentWD.SetForceEvictionWhenBusy(*settings.ForceEvictionWhenBusy)
xlog.Info("Updated watchdog force eviction when busy setting", "forceEvictionWhenBusy", *settings.ForceEvictionWhenBusy)
}
}
// Update ModelLoader LRU eviction retry settings dynamically
maxRetries := appConfig.LRUEvictionMaxRetries
retryInterval := appConfig.LRUEvictionRetryInterval
if settings.LRUEvictionMaxRetries != nil {
maxRetries = *settings.LRUEvictionMaxRetries
}
if settings.LRUEvictionRetryInterval != nil {
if dur, err := time.ParseDuration(*settings.LRUEvictionRetryInterval); err == nil {
retryInterval = dur
}
}
if settings.LRUEvictionMaxRetries != nil || settings.LRUEvictionRetryInterval != nil {
app.ModelLoader().SetLRUEvictionRetrySettings(maxRetries, retryInterval)
xlog.Info("Updated LRU eviction retry settings", "maxRetries", maxRetries, "retryInterval", retryInterval)
}
// Check if agent job retention changed
agentJobChanged := settings.AgentJobRetentionDays != nil

View File

@@ -136,6 +136,43 @@
:class="!settings.watchdog_enabled ? 'opacity-50 cursor-not-allowed' : ''">
</div>
<!-- Force Eviction When Busy -->
<div class="flex items-center justify-between">
<div>
<label class="text-sm font-medium text-[var(--color-text-primary)]">Force Eviction When Busy</label>
<p class="text-xs text-[var(--color-text-secondary)] mt-1">Allow evicting models even when they have active API calls (default: disabled for safety)</p>
</div>
<label class="relative inline-flex items-center cursor-pointer">
<input type="checkbox" x-model="settings.force_eviction_when_busy"
:disabled="!settings.watchdog_enabled"
class="sr-only peer" :class="!settings.watchdog_enabled ? 'opacity-50' : ''">
<div class="w-11 h-6 bg-[var(--color-bg-primary)] peer-focus:outline-none peer-focus:ring-4 peer-focus:ring-[var(--color-primary-light)] rounded-full peer peer-checked:after:translate-x-full peer-checked:after:border-white after:content-[''] after:absolute after:top-[2px] after:left-[2px] after:bg-white after:border-gray-300 after:border after:rounded-full after:h-5 after:w-5 after:transition-all peer-checked:bg-[var(--color-primary)]"></div>
</label>
</div>
<!-- LRU Eviction Max Retries -->
<div>
<label class="block text-sm font-medium text-[var(--color-text-primary)] mb-2">LRU Eviction Max Retries</label>
<p class="text-xs text-[var(--color-text-secondary)] mb-2">Maximum number of retries when waiting for busy models to become idle (default: 30)</p>
<input type="number" x-model="settings.lru_eviction_max_retries"
:disabled="!settings.watchdog_enabled"
min="1"
placeholder="30"
class="w-full px-3 py-2 bg-[var(--color-bg-primary)] border border-[var(--color-primary-border)]/20 rounded text-sm text-[var(--color-text-primary)] focus:outline-none focus:ring-2 focus:ring-[var(--color-primary-border)]"
:class="!settings.watchdog_enabled ? 'opacity-50 cursor-not-allowed' : ''">
</div>
<!-- LRU Eviction Retry Interval -->
<div>
<label class="block text-sm font-medium text-[var(--color-text-primary)] mb-2">LRU Eviction Retry Interval</label>
<p class="text-xs text-[var(--color-text-secondary)] mb-2">Interval between retries when waiting for busy models (e.g., 1s, 2s) (default: 1s)</p>
<input type="text" x-model="settings.lru_eviction_retry_interval"
:disabled="!settings.watchdog_enabled"
placeholder="1s"
class="w-full px-3 py-2 bg-[var(--color-bg-primary)] border border-[var(--color-primary-border)]/20 rounded text-sm text-[var(--color-text-primary)] focus:outline-none focus:ring-2 focus:ring-[var(--color-primary-border)]"
:class="!settings.watchdog_enabled ? 'opacity-50 cursor-not-allowed' : ''">
</div>
<!-- Memory Reclaimer Subsection -->
<div class="mt-6 pt-4 border-t border-[var(--color-primary-border)]/20">
<h3 class="text-md font-medium text-[var(--color-text-primary)] mb-3 flex items-center">
@@ -545,6 +582,9 @@ function settingsDashboard() {
watchdog_idle_timeout: '15m',
watchdog_busy_timeout: '5m',
watchdog_interval: '2s',
force_eviction_when_busy: false,
lru_eviction_max_retries: 30,
lru_eviction_retry_interval: '1s',
max_active_backends: 0,
parallel_backend_requests: false,
memory_reclaimer_enabled: false,
@@ -587,6 +627,9 @@ function settingsDashboard() {
watchdog_idle_timeout: data.watchdog_idle_timeout || '15m',
watchdog_busy_timeout: data.watchdog_busy_timeout || '5m',
watchdog_interval: data.watchdog_interval || '2s',
force_eviction_when_busy: data.force_eviction_when_busy || false,
lru_eviction_max_retries: data.lru_eviction_max_retries || 30,
lru_eviction_retry_interval: data.lru_eviction_retry_interval || '1s',
max_active_backends: data.max_active_backends || 0,
parallel_backend_requests: data.parallel_backend_requests,
memory_reclaimer_enabled: data.memory_reclaimer_enabled || false,
@@ -660,6 +703,15 @@ function settingsDashboard() {
if (this.settings.watchdog_interval) {
payload.watchdog_interval = this.settings.watchdog_interval;
}
if (this.settings.force_eviction_when_busy !== undefined) {
payload.force_eviction_when_busy = this.settings.force_eviction_when_busy;
}
if (this.settings.lru_eviction_max_retries !== undefined) {
payload.lru_eviction_max_retries = parseInt(this.settings.lru_eviction_max_retries) || 30;
}
if (this.settings.lru_eviction_retry_interval) {
payload.lru_eviction_retry_interval = this.settings.lru_eviction_retry_interval;
}
if (this.settings.max_active_backends !== undefined) {
payload.max_active_backends = parseInt(this.settings.max_active_backends) || 0;
}

View File

@@ -52,6 +52,49 @@ Setting the limit to `1` is equivalent to single active backend mode (see below)
3. The LRU model(s) are automatically unloaded to make room for the new model
4. Concurrent requests for loading different models are handled safely - the system accounts for models currently being loaded when calculating evictions
### Eviction Behavior with Active Requests
By default, LocalAI will **skip evicting models that have active API calls** to prevent interrupting ongoing requests. This means:
- If all models are busy (have active requests), eviction will be skipped and the system will wait for models to become idle
- The loading request will retry eviction with configurable retry settings
- This ensures data integrity and prevents request failures
You can configure this behavior via WebUI or using the following settings:
#### Force Eviction When Busy
To allow evicting models even when they have active API calls (not recommended for production):
```bash
# Via CLI
./local-ai --force-eviction-when-busy
# Via environment variable
LOCALAI_FORCE_EVICTION_WHEN_BUSY=true ./local-ai
```
> **Warning:** Enabling force eviction can interrupt active requests and cause errors. Only use this if you understand the implications.
#### LRU Eviction Retry Settings
When models are busy and cannot be evicted, LocalAI will retry eviction with configurable settings:
```bash
# Configure maximum retries (default: 30)
./local-ai --lru-eviction-max-retries=50
# Configure retry interval (default: 1s)
./local-ai --lru-eviction-retry-interval=2s
# Using environment variables
LOCALAI_LRU_EVICTION_MAX_RETRIES=50 \
LOCALAI_LRU_EVICTION_RETRY_INTERVAL=2s \
./local-ai
```
These settings control how long the system will wait for busy models to become idle before giving up. The retry mechanism allows busy models to complete their requests before being evicted, preventing request failures.
### Example
```bash
@@ -207,6 +250,33 @@ This configuration:
- Automatically unloads any model that hasn't been used for 15 minutes
- Provides both hard limits and time-based cleanup
### Example with Retry Settings
You can also configure retry behavior when models are busy:
```bash
# Allow up to 2 active backends with custom retry settings
LOCALAI_MAX_ACTIVE_BACKENDS=2 \
LOCALAI_LRU_EVICTION_MAX_RETRIES=50 \
LOCALAI_LRU_EVICTION_RETRY_INTERVAL=2s \
./local-ai
```
Or using command line flags:
```bash
./local-ai \
--max-active-backends=2 \
--lru-eviction-max-retries=50 \
--lru-eviction-retry-interval=2s
```
This configuration:
- Limits to 2 active backends
- Will retry eviction up to 50 times if models are busy
- Waits 2 seconds between retry attempts
- Ensures busy models have time to complete their requests before eviction
## Limitations and Considerations
### VRAM Usage Estimation

View File

@@ -29,9 +29,23 @@ Changes to watchdog settings are applied immediately by restarting the watchdog
- **Max Active Backends**: Maximum number of active backends (loaded models). When exceeded, the least recently used model is automatically evicted. Set to `0` for unlimited, `1` for single-backend mode
- **Parallel Backend Requests**: Enable backends to handle multiple requests in parallel if supported
- **Force Eviction When Busy**: Allow evicting models even when they have active API calls (default: disabled for safety). **Warning:** Enabling this can interrupt active requests
- **LRU Eviction Max Retries**: Maximum number of retries when waiting for busy models to become idle before eviction (default: 30)
- **LRU Eviction Retry Interval**: Interval between retries when waiting for busy models (default: `1s`)
> **Note:** The "Single Backend" setting is deprecated. Use "Max Active Backends" set to `1` for single-backend behavior.
#### LRU Eviction Behavior
By default, LocalAI will skip evicting models that have active API calls to prevent interrupting ongoing requests. When all models are busy and eviction is needed:
1. The system will wait for models to become idle
2. It will retry eviction up to the configured maximum number of retries
3. The retry interval determines how long to wait between attempts
4. If all retries are exhausted, the system will proceed (which may cause out-of-memory errors if resources are truly exhausted)
You can configure these settings via the web UI or through environment variables. See [VRAM Management]({{%relref "advanced/vram-management" %}}) for more details.
### Performance Settings
- **Threads**: Number of threads used for parallel computation (recommended: number of physical cores)
@@ -94,6 +108,9 @@ The `runtime_settings.json` file follows this structure:
"watchdog_busy_timeout": "5m",
"max_active_backends": 0,
"parallel_backend_requests": true,
"force_eviction_when_busy": false,
"lru_eviction_max_retries": 30,
"lru_eviction_retry_interval": "1s",
"threads": 8,
"context_size": 2048,
"f16": false,

View File

@@ -128,7 +128,7 @@ Future versions of LocalAI will expose additional control over audio generation
#### Setup
Install the `vibevoice` model in the Model gallery.
Install the `vibevoice` model in the Model gallery or run `local-ai run models install vibevoice`.
#### Usage

View File

@@ -46,6 +46,9 @@ Complete reference for all LocalAI command-line interface (CLI) parameters and e
| `--watchdog-idle-timeout` | `15m` | Threshold beyond which an idle backend should be stopped | `$LOCALAI_WATCHDOG_IDLE_TIMEOUT`, `$WATCHDOG_IDLE_TIMEOUT` |
| `--enable-watchdog-busy` | `false` | Enable watchdog for stopping backends that are busy longer than the watchdog-busy-timeout | `$LOCALAI_WATCHDOG_BUSY`, `$WATCHDOG_BUSY` |
| `--watchdog-busy-timeout` | `5m` | Threshold beyond which a busy backend should be stopped | `$LOCALAI_WATCHDOG_BUSY_TIMEOUT`, `$WATCHDOG_BUSY_TIMEOUT` |
| `--force-eviction-when-busy` | `false` | Force eviction even when models have active API calls (default: false for safety). **Warning:** Enabling this can interrupt active requests | `$LOCALAI_FORCE_EVICTION_WHEN_BUSY`, `$FORCE_EVICTION_WHEN_BUSY` |
| `--lru-eviction-max-retries` | `30` | Maximum number of retries when waiting for busy models to become idle before eviction | `$LOCALAI_LRU_EVICTION_MAX_RETRIES`, `$LRU_EVICTION_MAX_RETRIES` |
| `--lru-eviction-retry-interval` | `1s` | Interval between retries when waiting for busy models to become idle (e.g., `1s`, `2s`) | `$LOCALAI_LRU_EVICTION_RETRY_INTERVAL`, `$LRU_EVICTION_RETRY_INTERVAL` |
For more information on VRAM management, see [VRAM and Memory Management]({{%relref "advanced/vram-management" %}}).

View File

@@ -9,8 +9,8 @@ import (
"time"
grpc "github.com/mudler/LocalAI/pkg/grpc"
"github.com/phayes/freeport"
"github.com/mudler/xlog"
"github.com/phayes/freeport"
)
const (
@@ -173,7 +173,7 @@ func (ml *ModelLoader) backendLoader(opts ...Option) (client grpc.Backend, err e
model, err := ml.LoadModel(o.modelID, o.model, ml.grpcModel(backend, o))
if err != nil {
if stopErr := ml.StopGRPC(only(o.modelID));stopErr != nil {
if stopErr := ml.StopGRPC(only(o.modelID)); stopErr != nil {
xlog.Error("error stopping model", "error", stopErr, "model", o.modelID)
}
xlog.Error("Failed to load model", "modelID", o.modelID, "error", err, "backend", o.backendString)
@@ -186,13 +186,47 @@ func (ml *ModelLoader) backendLoader(opts ...Option) (client grpc.Backend, err e
// enforceLRULimit enforces the LRU limit before loading a new model.
// This is called before loading a model to ensure we don't exceed the limit.
// It accounts for models that are currently being loaded by other goroutines.
// If models are busy and can't be evicted, it will wait and retry until space is available.
func (ml *ModelLoader) enforceLRULimit() {
	if ml.wd == nil {
		// No watchdog configured: LRU enforcement is disabled entirely.
		return
	}

	// Account for models currently being loaded by other goroutines so
	// concurrent load requests don't collectively overshoot the limit.
	pendingLoads := ml.GetLoadingCount()

	// Snapshot the retry settings under the lock; they may be updated
	// dynamically via SetLRUEvictionRetrySettings.
	ml.mu.Lock()
	maxRetries := ml.lruEvictionMaxRetries
	retryInterval := ml.lruEvictionRetryInterval
	ml.mu.Unlock()

	// NOTE(review): the previous version issued an extra unconditional
	// EnforceLRULimit call here whose result was discarded, causing a
	// duplicate eviction pass on every load; the loop below is the single
	// enforcement path now.
	for attempt := 0; attempt < maxRetries; attempt++ {
		result := ml.wd.EnforceLRULimit(pendingLoads)
		if !result.NeedMore {
			// Successfully evicted enough models (or no eviction needed).
			if result.EvictedCount > 0 {
				xlog.Info("[ModelLoader] LRU enforcement complete", "evicted", result.EvictedCount)
			}
			return
		}
		// More evictions are needed but models are busy - wait and retry.
		if attempt < maxRetries-1 {
			xlog.Info("[ModelLoader] Waiting for busy models to become idle before eviction",
				"evicted", result.EvictedCount,
				"attempt", attempt+1,
				"maxRetries", maxRetries,
				"retryIn", retryInterval)
			time.Sleep(retryInterval)
		} else {
			// Last attempt - log a warning but proceed; the subsequent load
			// may still fail, but the request was given every chance.
			xlog.Warn("[ModelLoader] LRU enforcement incomplete after max retries",
				"evicted", result.EvictedCount,
				"reason", "models are still busy with active API calls")
		}
	}
}
// updateModelLastUsed updates the last used time for a model (for LRU tracking)

View File

@@ -20,22 +20,26 @@ import (
// TODO: Split ModelLoader and TemplateLoader? Just to keep things more organized. Left together to share a mutex until I look into that. Would split if we separate directories for .bin/.yaml and .tmpl
type ModelLoader struct {
ModelPath string
mu sync.Mutex
models map[string]*Model
loading map[string]chan struct{} // tracks models currently being loaded
wd *WatchDog
externalBackends map[string]string
ModelPath string
mu sync.Mutex
models map[string]*Model
loading map[string]chan struct{} // tracks models currently being loaded
wd *WatchDog
externalBackends map[string]string
lruEvictionMaxRetries int // Maximum number of retries when waiting for busy models
lruEvictionRetryInterval time.Duration // Interval between retries when waiting for busy models
}
// NewModelLoader creates a new ModelLoader instance.
// LRU eviction is now managed through the WatchDog component.
func NewModelLoader(system *system.SystemState) *ModelLoader {
nml := &ModelLoader{
ModelPath: system.Model.ModelsPath,
models: make(map[string]*Model),
loading: make(map[string]chan struct{}),
externalBackends: make(map[string]string),
ModelPath: system.Model.ModelsPath,
models: make(map[string]*Model),
loading: make(map[string]chan struct{}),
externalBackends: make(map[string]string),
lruEvictionMaxRetries: 30, // Default: 30 retries
lruEvictionRetryInterval: 1 * time.Second, // Default: 1 second
}
return nml
@@ -56,6 +60,14 @@ func (ml *ModelLoader) GetWatchDog() *WatchDog {
return ml.wd
}
// SetLRUEvictionRetrySettings replaces both LRU eviction retry parameters
// (retry count and pause between attempts) under the loader lock, so the
// next enforcement cycle observes a consistent pair of values.
func (ml *ModelLoader) SetLRUEvictionRetrySettings(maxRetries int, retryInterval time.Duration) {
	ml.mu.Lock()
	ml.lruEvictionRetryInterval = retryInterval
	ml.lruEvictionMaxRetries = maxRetries
	ml.mu.Unlock()
}
func (ml *ModelLoader) ExistsInModelPath(s string) bool {
return utils.ExistsInPath(ml.ModelPath, s)
}

View File

@@ -262,4 +262,13 @@ var _ = Describe("ModelLoader", func() {
Expect(modelLoader.GetLoadingCount()).To(Equal(0))
})
})
Context("LRU Eviction Retry Settings", func() {
It("should allow updating retry settings", func() {
modelLoader.SetLRUEvictionRetrySettings(50, 2*time.Second)
// Settings are updated - we can verify through behavior if needed
// For now, just verify the call doesn't panic
Expect(modelLoader).ToNot(BeNil())
})
})
})

View File

@@ -41,6 +41,9 @@ type WatchDog struct {
memoryReclaimerEnabled bool // Enable memory threshold monitoring
memoryReclaimerThreshold float64 // Threshold 0.0-1.0 (e.g., 0.95 = 95%)
watchdogInterval time.Duration
// Eviction settings
forceEvictionWhenBusy bool // Force eviction even when models have active API calls (default: false for safety)
}
type ProcessManager interface {
@@ -78,6 +81,7 @@ func NewWatchDog(opts ...WatchDogOption) *WatchDog {
memoryReclaimerEnabled: o.memoryReclaimerEnabled,
memoryReclaimerThreshold: o.memoryReclaimerThreshold,
watchdogInterval: o.watchdogInterval,
forceEvictionWhenBusy: o.forceEvictionWhenBusy,
}
}
@@ -110,6 +114,13 @@ func (wd *WatchDog) GetMemoryReclaimerSettings() (enabled bool, threshold float6
return wd.memoryReclaimerEnabled, wd.memoryReclaimerThreshold
}
// SetForceEvictionWhenBusy dynamically updates whether busy models may be
// evicted, taking the watchdog lock so concurrent eviction passes read a
// consistent value.
func (wd *WatchDog) SetForceEvictionWhenBusy(enabled bool) {
	wd.Lock()
	wd.forceEvictionWhenBusy = enabled
	wd.Unlock()
}
func (wd *WatchDog) Shutdown() {
wd.Lock()
defer wd.Unlock()
@@ -169,13 +180,19 @@ type modelUsageInfo struct {
lastUsed time.Time
}
// EnforceLRULimitResult contains the outcome of one LRU-enforcement pass,
// letting the caller decide whether to wait and retry eviction.
type EnforceLRULimitResult struct {
EvictedCount int // Number of models successfully evicted in this pass
NeedMore bool // True if further evictions were required but could not be performed (e.g., remaining models are busy with active API calls)
}
// EnforceLRULimit ensures we're under the LRU limit by evicting least recently used models.
// This should be called before loading a new model.
// pendingLoads is the number of models currently being loaded (to account for concurrent loads).
// Returns the number of models evicted.
func (wd *WatchDog) EnforceLRULimit(pendingLoads int) int {
// Returns the result containing evicted count and whether more evictions are needed.
func (wd *WatchDog) EnforceLRULimit(pendingLoads int) EnforceLRULimitResult {
if wd.lruLimit <= 0 {
return 0 // LRU disabled
return EnforceLRULimitResult{EvictedCount: 0, NeedMore: false} // LRU disabled
}
wd.Lock()
@@ -186,9 +203,10 @@ func (wd *WatchDog) EnforceLRULimit(pendingLoads int) int {
// We need: currentCount + pendingLoads + 1 <= lruLimit
// So evict: currentCount + pendingLoads + 1 - lruLimit = currentCount - lruLimit + pendingLoads + 1
modelsToEvict := currentCount - wd.lruLimit + pendingLoads + 1
forceEvictionWhenBusy := wd.forceEvictionWhenBusy
if modelsToEvict <= 0 {
wd.Unlock()
return 0
return EnforceLRULimitResult{EvictedCount: 0, NeedMore: false}
}
xlog.Debug("[WatchDog] LRU enforcement triggered", "current", currentCount, "pendingLoads", pendingLoads, "limit", wd.lruLimit, "toEvict", modelsToEvict)
@@ -215,13 +233,25 @@ func (wd *WatchDog) EnforceLRULimit(pendingLoads int) int {
// Collect models to evict (the oldest ones)
var modelsToShutdown []string
for i := 0; i < modelsToEvict && i < len(models); i++ {
evictedCount := 0
skippedBusyCount := 0
for i := 0; evictedCount < modelsToEvict && i < len(models); i++ {
m := models[i]
xlog.Info("[WatchDog] LRU evicting model", "model", m.model, "lastUsed", m.lastUsed)
// Check if model is busy
_, isBusy := wd.busyTime[m.address]
if isBusy && !forceEvictionWhenBusy {
// Skip eviction for busy models when forceEvictionWhenBusy is false
xlog.Warn("[WatchDog] Skipping LRU eviction for busy model", "model", m.model, "reason", "model has active API calls")
skippedBusyCount++
continue
}
xlog.Info("[WatchDog] LRU evicting model", "model", m.model, "lastUsed", m.lastUsed, "busy", isBusy)
modelsToShutdown = append(modelsToShutdown, m.model)
// Clean up the maps while we have the lock
wd.untrack(m.address)
evictedCount++
}
needMore := evictedCount < modelsToEvict && skippedBusyCount > 0
wd.Unlock()
// Now shutdown models without holding the watchdog lock to prevent deadlock
@@ -232,7 +262,14 @@ func (wd *WatchDog) EnforceLRULimit(pendingLoads int) int {
xlog.Debug("[WatchDog] LRU eviction complete", "model", model)
}
return len(modelsToShutdown)
if needMore {
xlog.Warn("[WatchDog] LRU eviction incomplete", "evicted", evictedCount, "needed", modelsToEvict, "skippedBusy", skippedBusyCount, "reason", "some models are busy with active API calls")
}
return EnforceLRULimitResult{
EvictedCount: len(modelsToShutdown),
NeedMore: needMore,
}
}
func (wd *WatchDog) Run() {
@@ -376,6 +413,8 @@ func (wd *WatchDog) evictLRUModel() {
return
}
forceEvictionWhenBusy := wd.forceEvictionWhenBusy
// Build a list of models sorted by last used time (oldest first)
var models []modelUsageInfo
for address, model := range wd.addressModelMap {
@@ -400,8 +439,27 @@ func (wd *WatchDog) evictLRUModel() {
return models[i].lastUsed.Before(models[j].lastUsed)
})
// Get the LRU model
lruModel := models[0]
// Find the first non-busy model (or first model if forceEvictionWhenBusy is true)
var lruModel *modelUsageInfo
for i := 0; i < len(models); i++ {
m := models[i]
_, isBusy := wd.busyTime[m.address]
if isBusy && !forceEvictionWhenBusy {
// Skip busy models when forceEvictionWhenBusy is false
xlog.Warn("[WatchDog] Skipping memory reclaimer eviction for busy model", "model", m.model, "reason", "model has active API calls")
continue
}
lruModel = &m
break
}
if lruModel == nil {
// All models are busy and forceEvictionWhenBusy is false
wd.Unlock()
xlog.Warn("[WatchDog] Memory reclaimer cannot evict: all models are busy with active API calls")
return
}
xlog.Info("[WatchDog] Memory reclaimer evicting LRU model", "model", lruModel.model, "lastUsed", lruModel.lastUsed)
// Untrack the model

View File

@@ -28,6 +28,9 @@ type WatchDogOptions struct {
// Memory reclaimer settings (works with GPU if available, otherwise RAM)
memoryReclaimerEnabled bool // Enable memory threshold monitoring
memoryReclaimerThreshold float64 // Threshold 0.0-1.0 (e.g., 0.95 = 95%)
// Eviction settings
forceEvictionWhenBusy bool // Force eviction even when models have active API calls (default: false for safety)
}
// WatchDogOption is a function that configures WatchDogOptions
@@ -105,6 +108,14 @@ func WithMemoryReclaimerThreshold(threshold float64) WatchDogOption {
}
}
// WithForceEvictionWhenBusy controls whether the watchdog may evict a model
// that still has active API calls in flight.
// Default: false — busy models are skipped for safety.
func WithForceEvictionWhenBusy(force bool) WatchDogOption {
	return func(opts *WatchDogOptions) {
		opts.forceEvictionWhenBusy = force
	}
}
// DefaultWatchDogOptions returns default options for the watchdog
func DefaultWatchDogOptions() *WatchDogOptions {
return &WatchDogOptions{
@@ -116,6 +127,7 @@ func DefaultWatchDogOptions() *WatchDogOptions {
lruLimit: 0,
memoryReclaimerEnabled: false,
memoryReclaimerThreshold: DefaultMemoryReclaimerThreshold,
forceEvictionWhenBusy: false, // Default: skip eviction when busy for safety
}
}

View File

@@ -170,15 +170,18 @@ var _ = Describe("WatchDog", func() {
model.WithBusyTimeout(5*time.Minute),
model.WithIdleTimeout(15*time.Minute),
model.WithLRULimit(2),
model.WithForceEvictionWhenBusy(true), // Enable force eviction for these tests to match old behavior
)
})
It("should not evict when under limit", func() {
wd.AddAddressModelMap("addr1", "model1")
wd.Mark("addr1")
wd.UnMark("addr1") // Unmark to make it idle (not busy)
evicted := wd.EnforceLRULimit(0)
Expect(evicted).To(Equal(0))
result := wd.EnforceLRULimit(0)
Expect(result.EvictedCount).To(Equal(0))
Expect(result.NeedMore).To(BeFalse())
Expect(pm.getShutdownCalls()).To(BeEmpty())
})
@@ -186,14 +189,17 @@ var _ = Describe("WatchDog", func() {
// Add two models
wd.AddAddressModelMap("addr1", "model1")
wd.Mark("addr1")
wd.UnMark("addr1") // Unmark to make it idle
time.Sleep(10 * time.Millisecond)
wd.AddAddressModelMap("addr2", "model2")
wd.Mark("addr2")
wd.UnMark("addr2") // Unmark to make it idle
// Enforce LRU with limit of 2 (need to make room for 1 new model)
evicted := wd.EnforceLRULimit(0)
Expect(evicted).To(Equal(1))
result := wd.EnforceLRULimit(0)
Expect(result.EvictedCount).To(Equal(1))
Expect(result.NeedMore).To(BeFalse())
Expect(pm.getShutdownCalls()).To(ContainElement("model1")) // oldest should be evicted
})
@@ -201,19 +207,23 @@ var _ = Describe("WatchDog", func() {
// Add three models
wd.AddAddressModelMap("addr1", "model1")
wd.Mark("addr1")
wd.UnMark("addr1") // Unmark to make it idle
time.Sleep(10 * time.Millisecond)
wd.AddAddressModelMap("addr2", "model2")
wd.Mark("addr2")
wd.UnMark("addr2") // Unmark to make it idle
time.Sleep(10 * time.Millisecond)
wd.AddAddressModelMap("addr3", "model3")
wd.Mark("addr3")
wd.UnMark("addr3") // Unmark to make it idle
// Set limit to 1, should evict 2 oldest + 1 for new = 3 evictions
wd.SetLRULimit(1)
evicted := wd.EnforceLRULimit(0)
Expect(evicted).To(Equal(3))
result := wd.EnforceLRULimit(0)
Expect(result.EvictedCount).To(Equal(3))
Expect(result.NeedMore).To(BeFalse())
shutdowns := pm.getShutdownCalls()
Expect(shutdowns).To(ContainElement("model1"))
Expect(shutdowns).To(ContainElement("model2"))
@@ -224,15 +234,18 @@ var _ = Describe("WatchDog", func() {
// Add two models (at limit)
wd.AddAddressModelMap("addr1", "model1")
wd.Mark("addr1")
wd.UnMark("addr1") // Unmark to make it idle
time.Sleep(10 * time.Millisecond)
wd.AddAddressModelMap("addr2", "model2")
wd.Mark("addr2")
wd.UnMark("addr2") // Unmark to make it idle
// With 1 pending load, we need to evict 2 (current=2, pending=1, new=1, limit=2)
// total after = 2 + 1 + 1 = 4, need to evict 4 - 2 = 2
evicted := wd.EnforceLRULimit(1)
Expect(evicted).To(Equal(2))
result := wd.EnforceLRULimit(1)
Expect(result.EvictedCount).To(Equal(2))
Expect(result.NeedMore).To(BeFalse())
})
It("should not evict when LRU is disabled", func() {
@@ -242,8 +255,9 @@ var _ = Describe("WatchDog", func() {
wd.AddAddressModelMap("addr2", "model2")
wd.AddAddressModelMap("addr3", "model3")
evicted := wd.EnforceLRULimit(0)
Expect(evicted).To(Equal(0))
result := wd.EnforceLRULimit(0)
Expect(result.EvictedCount).To(Equal(0))
Expect(result.NeedMore).To(BeFalse())
Expect(pm.getShutdownCalls()).To(BeEmpty())
})
@@ -253,10 +267,12 @@ var _ = Describe("WatchDog", func() {
// Add models with different lastUsed times
wd.AddAddressModelMap("addr1", "model1")
wd.Mark("addr1")
wd.UnMark("addr1") // Unmark to make it idle
time.Sleep(20 * time.Millisecond)
wd.AddAddressModelMap("addr2", "model2")
wd.Mark("addr2")
wd.UnMark("addr2") // Unmark to make it idle
time.Sleep(20 * time.Millisecond)
// Touch model1 again to make it more recent
@@ -265,10 +281,12 @@ var _ = Describe("WatchDog", func() {
wd.AddAddressModelMap("addr3", "model3")
wd.Mark("addr3")
wd.UnMark("addr3") // Unmark to make it idle
// Now model2 is the oldest, should be evicted first
evicted := wd.EnforceLRULimit(0)
Expect(evicted).To(BeNumerically(">=", 1))
result := wd.EnforceLRULimit(0)
Expect(result.EvictedCount).To(BeNumerically(">=", 1))
Expect(result.NeedMore).To(BeFalse())
shutdowns := pm.getShutdownCalls()
// model2 should be evicted first (it's the oldest)
@@ -285,16 +303,19 @@ var _ = Describe("WatchDog", func() {
model.WithBusyTimeout(5*time.Minute),
model.WithIdleTimeout(15*time.Minute),
model.WithLRULimit(1),
model.WithForceEvictionWhenBusy(true), // Enable force eviction for these tests
)
})
It("should evict existing model when loading new one", func() {
wd.AddAddressModelMap("addr1", "model1")
wd.Mark("addr1")
wd.UnMark("addr1") // Unmark to make it idle
// With limit=1, loading a new model should evict the existing one
evicted := wd.EnforceLRULimit(0)
Expect(evicted).To(Equal(1))
result := wd.EnforceLRULimit(0)
Expect(result.EvictedCount).To(Equal(1))
Expect(result.NeedMore).To(BeFalse())
Expect(pm.getShutdownCalls()).To(ContainElement("model1"))
})
@@ -302,6 +323,7 @@ var _ = Describe("WatchDog", func() {
for i := 0; i < 5; i++ {
wd.AddAddressModelMap("addr", "model")
wd.Mark("addr")
wd.UnMark("addr") // Unmark to make it idle
wd.EnforceLRULimit(0)
}
// All previous models should have been evicted
@@ -309,6 +331,233 @@ var _ = Describe("WatchDog", func() {
})
})
Context("Force Eviction When Busy", func() {
BeforeEach(func() {
wd = model.NewWatchDog(
model.WithProcessManager(pm),
model.WithLRULimit(2),
model.WithForceEvictionWhenBusy(false), // Default: skip eviction when busy
)
})
It("should skip eviction for busy models when forceEvictionWhenBusy is false", func() {
// Add two models (at limit of 2, need to evict 1 for new model)
wd.AddAddressModelMap("addr1", "model1")
wd.Mark("addr1")
time.Sleep(10 * time.Millisecond)
wd.AddAddressModelMap("addr2", "model2")
wd.Mark("addr2")
wd.UnMark("addr2") // Make model2 idle
// Keep model1 as busy (simulating active API call)
// model1 is already marked as busy from the first Mark call
// Try to enforce LRU - should skip busy model1, evict model2
result := wd.EnforceLRULimit(0)
// Should evict model2 (not busy) but skip model1 (busy)
// Since we evicted 1 (which is what we needed), NeedMore should be false
Expect(result.EvictedCount).To(Equal(1))
Expect(result.NeedMore).To(BeFalse()) // We evicted enough, even though we skipped model1
Expect(pm.getShutdownCalls()).To(ContainElement("model2"))
Expect(pm.getShutdownCalls()).ToNot(ContainElement("model1"))
})
It("should evict busy models when forceEvictionWhenBusy is true", func() {
wd.SetForceEvictionWhenBusy(true)
// Add two models
wd.AddAddressModelMap("addr1", "model1")
wd.Mark("addr1")
time.Sleep(10 * time.Millisecond)
wd.AddAddressModelMap("addr2", "model2")
wd.Mark("addr2")
// Keep model1 as busy (already marked from first Mark call)
// Try to enforce LRU - should evict model1 even though busy
result := wd.EnforceLRULimit(0)
Expect(result.EvictedCount).To(Equal(1))
Expect(result.NeedMore).To(BeFalse())
Expect(pm.getShutdownCalls()).To(ContainElement("model1"))
})
It("should set NeedMore when all models are busy and forceEvictionWhenBusy is false", func() {
// Add two models
wd.AddAddressModelMap("addr1", "model1")
wd.Mark("addr1")
time.Sleep(10 * time.Millisecond)
wd.AddAddressModelMap("addr2", "model2")
wd.Mark("addr2")
// Mark both as busy
wd.Mark("addr1")
wd.Mark("addr2")
// Try to enforce LRU - should skip both busy models
result := wd.EnforceLRULimit(0)
Expect(result.EvictedCount).To(Equal(0))
Expect(result.NeedMore).To(BeTrue())
Expect(pm.getShutdownCalls()).To(BeEmpty())
})
It("should allow updating forceEvictionWhenBusy dynamically", func() {
// Start with false
Expect(wd).ToNot(BeNil())
// Add models
wd.AddAddressModelMap("addr1", "model1")
wd.Mark("addr1")
time.Sleep(10 * time.Millisecond)
wd.AddAddressModelMap("addr2", "model2")
wd.Mark("addr2")
wd.UnMark("addr2") // Make model2 idle
// Keep model1 busy (already marked)
// With forceEvictionWhenBusy=false, should skip busy model1, evict model2
result := wd.EnforceLRULimit(0)
Expect(result.NeedMore).To(BeFalse()) // We evicted enough (1 model)
Expect(result.EvictedCount).To(Equal(1)) // Should evict model2 (not busy)
// Now enable force eviction
wd.SetForceEvictionWhenBusy(true)
// Add models again
wd.AddAddressModelMap("addr1", "model1")
wd.Mark("addr1")
time.Sleep(10 * time.Millisecond)
wd.AddAddressModelMap("addr2", "model2")
wd.Mark("addr2")
// Keep model1 busy (already marked)
// With forceEvictionWhenBusy=true, should evict busy model1
result = wd.EnforceLRULimit(0)
Expect(result.NeedMore).To(BeFalse())
Expect(result.EvictedCount).To(Equal(1))
})
It("should continue to next LRU model when busy model is skipped", func() {
// Add three models
wd.AddAddressModelMap("addr1", "model1")
wd.Mark("addr1")
time.Sleep(10 * time.Millisecond)
wd.AddAddressModelMap("addr2", "model2")
wd.Mark("addr2")
wd.UnMark("addr2") // Make model2 idle
time.Sleep(10 * time.Millisecond)
wd.AddAddressModelMap("addr3", "model3")
wd.Mark("addr3")
wd.UnMark("addr3") // Make model3 idle
// Keep model1 as busy (oldest, already marked)
// Need to evict 2 models (limit=2, current=3, need room for 1 new)
// Should skip model1 (busy), evict model2 and model3 (not busy)
result := wd.EnforceLRULimit(0)
// Should evict model2 and model3 (2 models, which is what we needed)
Expect(result.EvictedCount).To(Equal(2))
Expect(result.NeedMore).To(BeFalse()) // We evicted enough (2 models)
Expect(pm.getShutdownCalls()).To(ContainElement("model2"))
Expect(pm.getShutdownCalls()).To(ContainElement("model3"))
})
})
Context("EnforceLRULimitResult", func() {
BeforeEach(func() {
wd = model.NewWatchDog(
model.WithProcessManager(pm),
model.WithLRULimit(2),
model.WithForceEvictionWhenBusy(false),
)
})
It("should return NeedMore=false when eviction is successful", func() {
wd.AddAddressModelMap("addr1", "model1")
wd.Mark("addr1")
wd.UnMark("addr1") // Make idle
time.Sleep(10 * time.Millisecond)
wd.AddAddressModelMap("addr2", "model2")
wd.Mark("addr2")
wd.UnMark("addr2") // Make idle
result := wd.EnforceLRULimit(0)
Expect(result.NeedMore).To(BeFalse())
Expect(result.EvictedCount).To(Equal(1))
})
It("should return NeedMore=true when not enough models can be evicted", func() {
// Add two models (at limit of 2, need to evict 1 for new model)
wd.AddAddressModelMap("addr1", "model1")
wd.Mark("addr1")
time.Sleep(10 * time.Millisecond)
wd.AddAddressModelMap("addr2", "model2")
wd.Mark("addr2")
// Mark both as busy (keep them busy)
// Both are already marked as busy from the Mark calls above
// Need to evict 1, but both are busy
result := wd.EnforceLRULimit(0)
Expect(result.NeedMore).To(BeTrue())
Expect(result.EvictedCount).To(Equal(0))
})
It("should return NeedMore=true when need to evict multiple but some are busy", func() {
// Set limit to 1, add 3 models (need to evict 2 for new model)
wd.SetLRULimit(1)
wd.AddAddressModelMap("addr1", "model1")
wd.Mark("addr1")
time.Sleep(10 * time.Millisecond)
wd.AddAddressModelMap("addr2", "model2")
wd.Mark("addr2")
wd.UnMark("addr2") // Make model2 idle
time.Sleep(10 * time.Millisecond)
wd.AddAddressModelMap("addr3", "model3")
wd.Mark("addr3")
// Keep model1 and model3 busy
// Need to evict 2 models, but model1 and model3 are busy, only model2 is idle
// Should evict model2 (1 model), but NeedMore=true because we needed 2
result := wd.EnforceLRULimit(0)
Expect(result.EvictedCount).To(Equal(1))
Expect(result.NeedMore).To(BeTrue())
})
It("should return correct EvictedCount when some models are evicted", func() {
// Add three models
wd.AddAddressModelMap("addr1", "model1")
wd.Mark("addr1")
time.Sleep(10 * time.Millisecond)
wd.AddAddressModelMap("addr2", "model2")
wd.Mark("addr2")
wd.UnMark("addr2") // Make model2 idle
time.Sleep(10 * time.Millisecond)
wd.AddAddressModelMap("addr3", "model3")
wd.Mark("addr3")
wd.UnMark("addr3") // Make model3 idle
// Keep model1 as busy (already marked)
// Need to evict 2 models, but model1 is busy
// Should evict model2 and model3 (2 models, which is what we needed)
result := wd.EnforceLRULimit(0)
Expect(result.EvictedCount).To(Equal(2))
Expect(result.NeedMore).To(BeFalse()) // We evicted enough (2 models)
})
})
Context("Functional Options", func() {
It("should use default options when none provided", func() {
wd = model.NewWatchDog(
@@ -331,6 +580,7 @@ var _ = Describe("WatchDog", func() {
model.WithLRULimit(5),
model.WithMemoryReclaimerEnabled(true),
model.WithMemoryReclaimerThreshold(0.80),
model.WithForceEvictionWhenBusy(true),
)
Expect(wd.GetLRULimit()).To(Equal(5))
@@ -339,5 +589,48 @@ var _ = Describe("WatchDog", func() {
Expect(enabled).To(BeTrue())
Expect(threshold).To(Equal(0.80))
})
It("should use default forceEvictionWhenBusy (false) when not specified", func() {
wd = model.NewWatchDog(
model.WithProcessManager(pm),
)
// Default should be false - we can test this by checking behavior
// Add a busy model and verify it's skipped
wd.AddAddressModelMap("addr1", "model1")
wd.Mark("addr1")
time.Sleep(10 * time.Millisecond)
wd.AddAddressModelMap("addr2", "model2")
wd.Mark("addr2")
wd.Mark("addr1") // Keep model1 busy
wd.SetLRULimit(1)
result := wd.EnforceLRULimit(0)
// Should skip busy model1, evict model2, but NeedMore=true
Expect(result.NeedMore).To(BeTrue())
})
It("should allow setting forceEvictionWhenBusy via option", func() {
wd = model.NewWatchDog(
model.WithProcessManager(pm),
model.WithLRULimit(2),
model.WithForceEvictionWhenBusy(true),
)
// Add models
wd.AddAddressModelMap("addr1", "model1")
wd.Mark("addr1")
time.Sleep(10 * time.Millisecond)
wd.AddAddressModelMap("addr2", "model2")
wd.Mark("addr2")
// Keep model1 busy (already marked from first Mark call)
// Should evict busy model1
result := wd.EnforceLRULimit(0)
Expect(result.NeedMore).To(BeFalse())
Expect(result.EvictedCount).To(Equal(1))
Expect(pm.getShutdownCalls()).To(ContainElement("model1"))
})
})
})