feat: disable force eviction (#7725)

* feat: allow to set forcing backends eviction while requests are in flight Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * feat: try to make the request sit and retry if eviction couldn't be done Otherwise calls that in order to pass would need to shutdown other backends would just fail. In this way instead we make the request sit and retry eviction until it succeeds. The thresholds can be configured by the user. Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * add tests Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * expose settings to CLI Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Update docs Signed-off-by: Ettore Di Giacinto <mudler@localai.io> --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-01-16 23:49:44 -06:00 · 2025-12-25 14:26:18 +01:00
parent bb459e671f
commit c844b7ac58
18 changed files with 739 additions and 41 deletions
--- a/core/application/config_file_watcher.go
+++ b/core/application/config_file_watcher.go
@@ -214,6 +214,9 @@ func readRuntimeSettingsJson(startupAppConfig config.ApplicationConfig) fileHand
 		envAutoloadGalleries := appConfig.AutoloadGalleries == startupAppConfig.AutoloadGalleries
 		envAutoloadBackendGalleries := appConfig.AutoloadBackendGalleries == startupAppConfig.AutoloadBackendGalleries
 		envAgentJobRetentionDays := appConfig.AgentJobRetentionDays == startupAppConfig.AgentJobRetentionDays
+		envForceEvictionWhenBusy := appConfig.ForceEvictionWhenBusy == startupAppConfig.ForceEvictionWhenBusy
+		envLRUEvictionMaxRetries := appConfig.LRUEvictionMaxRetries == startupAppConfig.LRUEvictionMaxRetries
+		envLRUEvictionRetryInterval := appConfig.LRUEvictionRetryInterval == startupAppConfig.LRUEvictionRetryInterval

 		if len(fileContent) > 0 {
 			var settings config.RuntimeSettings
@@ -277,6 +280,20 @@ func readRuntimeSettingsJson(startupAppConfig config.ApplicationConfig) fileHand
 			if settings.MemoryReclaimerThreshold != nil && !envMemoryReclaimerThreshold {
 				appConfig.MemoryReclaimerThreshold = *settings.MemoryReclaimerThreshold
 			}
+			if settings.ForceEvictionWhenBusy != nil && !envForceEvictionWhenBusy {
+				appConfig.ForceEvictionWhenBusy = *settings.ForceEvictionWhenBusy
+			}
+			if settings.LRUEvictionMaxRetries != nil && !envLRUEvictionMaxRetries {
+				appConfig.LRUEvictionMaxRetries = *settings.LRUEvictionMaxRetries
+			}
+			if settings.LRUEvictionRetryInterval != nil && !envLRUEvictionRetryInterval {
+				dur, err := time.ParseDuration(*settings.LRUEvictionRetryInterval)
+				if err == nil {
+					appConfig.LRUEvictionRetryInterval = dur
+				} else {
+					xlog.Warn("invalid LRU eviction retry interval in runtime_settings.json", "error", err, "interval", *settings.LRUEvictionRetryInterval)
+				}
+			}
 			if settings.Threads != nil && !envThreads {
 				appConfig.Threads = *settings.Threads
 			}
--- a/core/application/startup.go
+++ b/core/application/startup.go
@@ -350,9 +350,16 @@ func initializeWatchdog(application *Application, options *config.ApplicationCon
 			model.WithIdleCheck(options.WatchDogIdle),
 			model.WithLRULimit(lruLimit),
 			model.WithMemoryReclaimer(options.MemoryReclaimerEnabled, options.MemoryReclaimerThreshold),
+			model.WithForceEvictionWhenBusy(options.ForceEvictionWhenBusy),
 		)
 		application.ModelLoader().SetWatchDog(wd)

+		// Initialize ModelLoader LRU eviction retry settings
+		application.ModelLoader().SetLRUEvictionRetrySettings(
+			options.LRUEvictionMaxRetries,
+			options.LRUEvictionRetryInterval,
+		)
+
 		// Start watchdog goroutine if any periodic checks are enabled
 		// LRU eviction doesn't need the Run() loop - it's triggered on model load
 		// But memory reclaimer needs the Run() loop for periodic checking
--- a/core/application/watchdog.go
+++ b/core/application/watchdog.go
@@ -35,6 +35,7 @@ func (a *Application) startWatchdog() error {
 			model.WithIdleCheck(appConfig.WatchDogIdle),
 			model.WithLRULimit(lruLimit),
 			model.WithMemoryReclaimer(appConfig.MemoryReclaimerEnabled, appConfig.MemoryReclaimerThreshold),
+			model.WithForceEvictionWhenBusy(appConfig.ForceEvictionWhenBusy),
 		)
 		a.modelLoader.SetWatchDog(wd)

--- a/core/cli/run.go
+++ b/core/cli/run.go
@@ -73,6 +73,9 @@ type RunCMD struct {
 	WatchdogBusyTimeout                string   `env:"LOCALAI_WATCHDOG_BUSY_TIMEOUT,WATCHDOG_BUSY_TIMEOUT" default:"5m" help:"Threshold beyond which a busy backend should be stopped" group:"backends"`
 	EnableMemoryReclaimer              bool     `env:"LOCALAI_MEMORY_RECLAIMER,MEMORY_RECLAIMER,LOCALAI_GPU_RECLAIMER,GPU_RECLAIMER" default:"false" help:"Enable memory threshold monitoring to auto-evict backends when memory usage exceeds threshold (uses GPU VRAM if available, otherwise RAM)" group:"backends"`
 	MemoryReclaimerThreshold           float64  `env:"LOCALAI_MEMORY_RECLAIMER_THRESHOLD,MEMORY_RECLAIMER_THRESHOLD,LOCALAI_GPU_RECLAIMER_THRESHOLD,GPU_RECLAIMER_THRESHOLD" default:"0.95" help:"Memory usage threshold (0.0-1.0) that triggers backend eviction (default 0.95 = 95%%)" group:"backends"`
+	ForceEvictionWhenBusy              bool     `env:"LOCALAI_FORCE_EVICTION_WHEN_BUSY,FORCE_EVICTION_WHEN_BUSY" default:"false" help:"Force eviction even when models have active API calls (default: false for safety)" group:"backends"`
+	LRUEvictionMaxRetries              int      `env:"LOCALAI_LRU_EVICTION_MAX_RETRIES,LRU_EVICTION_MAX_RETRIES" default:"30" help:"Maximum number of retries when waiting for busy models to become idle before eviction (default: 30)" group:"backends"`
+	LRUEvictionRetryInterval           string   `env:"LOCALAI_LRU_EVICTION_RETRY_INTERVAL,LRU_EVICTION_RETRY_INTERVAL" default:"1s" help:"Interval between retries when waiting for busy models to become idle (e.g., 1s, 2s) (default: 1s)" group:"backends"`
 	Federated                          bool     `env:"LOCALAI_FEDERATED,FEDERATED" help:"Enable federated instance" group:"federated"`
 	DisableGalleryEndpoint             bool     `env:"LOCALAI_DISABLE_GALLERY_ENDPOINT,DISABLE_GALLERY_ENDPOINT" help:"Disable the gallery endpoints" group:"api"`
 	MachineTag                         string   `env:"LOCALAI_MACHINE_TAG,MACHINE_TAG" help:"Add Machine-Tag header to each response which is useful to track the machine in the P2P network" group:"api"`
@@ -220,6 +223,21 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
 		opts = append(opts, config.EnableSingleBackend)
 	}

+	// Handle LRU eviction settings
+	if r.ForceEvictionWhenBusy {
+		opts = append(opts, config.WithForceEvictionWhenBusy(true))
+	}
+	if r.LRUEvictionMaxRetries > 0 {
+		opts = append(opts, config.WithLRUEvictionMaxRetries(r.LRUEvictionMaxRetries))
+	}
+	if r.LRUEvictionRetryInterval != "" {
+		dur, err := time.ParseDuration(r.LRUEvictionRetryInterval)
+		if err != nil {
+			return fmt.Errorf("invalid LRU eviction retry interval: %w", err)
+		}
+		opts = append(opts, config.WithLRUEvictionRetryInterval(dur))
+	}
+
 	// split ":" to get backend name and the uri
 	for _, v := range r.ExternalGRPCBackends {
 		backend := v[:strings.IndexByte(v, ':')]
--- a/core/config/application_config.go
+++ b/core/config/application_config.go
@@ -64,6 +64,11 @@ type ApplicationConfig struct {
 	MemoryReclaimerEnabled   bool    // Enable memory threshold monitoring
 	MemoryReclaimerThreshold float64 // Threshold 0.0-1.0 (e.g., 0.95 = 95%)

+	// Eviction settings
+	ForceEvictionWhenBusy    bool          // Force eviction even when models have active API calls (default: false for safety)
+	LRUEvictionMaxRetries    int           // Maximum number of retries when waiting for busy models to become idle (default: 30)
+	LRUEvictionRetryInterval time.Duration // Interval between retries when waiting for busy models (default: 1s)
+
 	ModelsURL []string

 	WatchDogBusyTimeout, WatchDogIdleTimeout time.Duration
@@ -86,10 +91,12 @@ type AppOption func(*ApplicationConfig)

 func NewApplicationConfig(o ...AppOption) *ApplicationConfig {
 	opt := &ApplicationConfig{
-		Context:               context.Background(),
-		UploadLimitMB:         15,
-		Debug:                 true,
-		AgentJobRetentionDays: 30, // Default: 30 days
+		Context:                  context.Background(),
+		UploadLimitMB:            15,
+		Debug:                    true,
+		AgentJobRetentionDays:    30,              // Default: 30 days
+		LRUEvictionMaxRetries:    30,              // Default: 30 retries
+		LRUEvictionRetryInterval: 1 * time.Second, // Default: 1 second
 		PathWithoutAuth: []string{
 			"/static/",
 			"/generated-audio/",
@@ -259,6 +266,31 @@ func (o *ApplicationConfig) GetEffectiveMaxActiveBackends() int {
 	return 0
 }

+// WithForceEvictionWhenBusy returns an AppOption that sets ForceEvictionWhenBusy, i.e. whether eviction is forced even when models have active API calls.
+func WithForceEvictionWhenBusy(enabled bool) AppOption {
+	return func(o *ApplicationConfig) {
+		o.ForceEvictionWhenBusy = enabled
+	}
+}
+
+// WithLRUEvictionMaxRetries sets the maximum number of retries when waiting for busy models to become idle; values <= 0 are ignored and leave the current setting unchanged.
+func WithLRUEvictionMaxRetries(maxRetries int) AppOption {
+	return func(o *ApplicationConfig) {
+		if maxRetries > 0 {
+			o.LRUEvictionMaxRetries = maxRetries
+		}
+	}
+}
+
+// WithLRUEvictionRetryInterval sets the interval between retries when waiting for busy models; durations <= 0 are ignored and leave the current setting unchanged.
+func WithLRUEvictionRetryInterval(interval time.Duration) AppOption {
+	return func(o *ApplicationConfig) {
+		if interval > 0 {
+			o.LRUEvictionRetryInterval = interval
+		}
+	}
+}
+
 var EnableParallelBackendRequests = func(o *ApplicationConfig) {
 	o.ParallelBackendRequests = true
 }
@@ -505,6 +537,8 @@ func (o *ApplicationConfig) ToRuntimeSettings() RuntimeSettings {
 	parallelBackendRequests := o.ParallelBackendRequests
 	memoryReclaimerEnabled := o.MemoryReclaimerEnabled
 	memoryReclaimerThreshold := o.MemoryReclaimerThreshold
+	forceEvictionWhenBusy := o.ForceEvictionWhenBusy
+	lruEvictionMaxRetries := o.LRUEvictionMaxRetries
 	threads := o.Threads
 	contextSize := o.ContextSize
 	f16 := o.F16
@@ -539,6 +573,12 @@ func (o *ApplicationConfig) ToRuntimeSettings() RuntimeSettings {
 	} else {
 		watchdogInterval = "2s" // default
 	}
+	var lruEvictionRetryInterval string
+	if o.LRUEvictionRetryInterval > 0 {
+		lruEvictionRetryInterval = o.LRUEvictionRetryInterval.String()
+	} else {
+		lruEvictionRetryInterval = "1s" // default
+	}

 	return RuntimeSettings{
 		WatchdogEnabled:          &watchdogEnabled,
@@ -552,6 +592,9 @@ func (o *ApplicationConfig) ToRuntimeSettings() RuntimeSettings {
 		ParallelBackendRequests:  &parallelBackendRequests,
 		MemoryReclaimerEnabled:   &memoryReclaimerEnabled,
 		MemoryReclaimerThreshold: &memoryReclaimerThreshold,
+		ForceEvictionWhenBusy:    &forceEvictionWhenBusy,
+		LRUEvictionMaxRetries:    &lruEvictionMaxRetries,
+		LRUEvictionRetryInterval: &lruEvictionRetryInterval,
 		Threads:                  &threads,
 		ContextSize:              &contextSize,
 		F16:                      &f16,
@@ -644,6 +687,20 @@ func (o *ApplicationConfig) ApplyRuntimeSettings(settings *RuntimeSettings) (req
 			requireRestart = true
 		}
 	}
+	if settings.ForceEvictionWhenBusy != nil {
+		o.ForceEvictionWhenBusy = *settings.ForceEvictionWhenBusy
+		// This setting doesn't require restart, can be updated dynamically
+	}
+	if settings.LRUEvictionMaxRetries != nil {
+		o.LRUEvictionMaxRetries = *settings.LRUEvictionMaxRetries
+		// This setting doesn't require restart, can be updated dynamically
+	}
+	if settings.LRUEvictionRetryInterval != nil {
+		if dur, err := time.ParseDuration(*settings.LRUEvictionRetryInterval); err == nil {
+			o.LRUEvictionRetryInterval = dur
+			// This setting doesn't require restart, can be updated dynamically
+		}
+	}
 	if settings.Threads != nil {
 		o.Threads = *settings.Threads
 	}
--- a/core/config/runtime_settings.go
+++ b/core/config/runtime_settings.go
@@ -26,6 +26,11 @@ type RuntimeSettings struct {
 	MemoryReclaimerEnabled   *bool    `json:"memory_reclaimer_enabled,omitempty"`   // Enable memory threshold monitoring
 	MemoryReclaimerThreshold *float64 `json:"memory_reclaimer_threshold,omitempty"` // Threshold 0.0-1.0 (e.g., 0.95 = 95%)

+	// Eviction settings
+	ForceEvictionWhenBusy      *bool   `json:"force_eviction_when_busy,omitempty"`      // Force eviction even when models have active API calls (default: false for safety)
+	LRUEvictionMaxRetries      *int    `json:"lru_eviction_max_retries,omitempty"`      // Maximum number of retries when waiting for busy models to become idle (default: 30)
+	LRUEvictionRetryInterval   *string `json:"lru_eviction_retry_interval,omitempty"`   // Interval between retries when waiting for busy models (e.g., 1s, 2s) (default: 1s)
+
 	// Performance settings
 	Threads     *int  `json:"threads,omitempty"`
 	ContextSize *int  `json:"context_size,omitempty"`
--- a/core/http/endpoints/localai/settings.go
+++ b/core/http/endpoints/localai/settings.go
@@ -76,6 +76,14 @@ func UpdateSettingsEndpoint(app *application.Application) echo.HandlerFunc {
 				})
 			}
 		}
+		if settings.LRUEvictionRetryInterval != nil {
+			if _, err := time.ParseDuration(*settings.LRUEvictionRetryInterval); err != nil {
+				return c.JSON(http.StatusBadRequest, schema.SettingsResponse{
+					Success: false,
+					Error:   "Invalid lru_eviction_retry_interval format: " + err.Error(),
+				})
+			}
+		}

 		// Save to file
 		if appConfig.DynamicConfigsDir == "" {
@@ -111,6 +119,31 @@ func UpdateSettingsEndpoint(app *application.Application) echo.HandlerFunc {
 			appConfig.ApiKeys = append(envKeys, runtimeKeys...)
 		}

+		// Update watchdog dynamically for settings that don't require restart
+		if settings.ForceEvictionWhenBusy != nil {
+			currentWD := app.ModelLoader().GetWatchDog()
+			if currentWD != nil {
+				currentWD.SetForceEvictionWhenBusy(*settings.ForceEvictionWhenBusy)
+				xlog.Info("Updated watchdog force eviction when busy setting", "forceEvictionWhenBusy", *settings.ForceEvictionWhenBusy)
+			}
+		}
+
+		// Update ModelLoader LRU eviction retry settings dynamically
+		maxRetries := appConfig.LRUEvictionMaxRetries
+		retryInterval := appConfig.LRUEvictionRetryInterval
+		if settings.LRUEvictionMaxRetries != nil {
+			maxRetries = *settings.LRUEvictionMaxRetries
+		}
+		if settings.LRUEvictionRetryInterval != nil {
+			if dur, err := time.ParseDuration(*settings.LRUEvictionRetryInterval); err == nil {
+				retryInterval = dur
+			}
+		}
+		if settings.LRUEvictionMaxRetries != nil || settings.LRUEvictionRetryInterval != nil {
+			app.ModelLoader().SetLRUEvictionRetrySettings(maxRetries, retryInterval)
+			xlog.Info("Updated LRU eviction retry settings", "maxRetries", maxRetries, "retryInterval", retryInterval)
+		}
+
 		// Check if agent job retention changed
 		agentJobChanged := settings.AgentJobRetentionDays != nil

--- a/core/http/views/settings.html
+++ b/core/http/views/settings.html
@@ -136,6 +136,43 @@
                               :class="!settings.watchdog_enabled ? 'opacity-50 cursor-not-allowed' : ''">
                    </div>

+                    <!-- Force Eviction When Busy -->
+                    <div class="flex items-center justify-between">
+                        <div>
+                            <label class="text-sm font-medium text-[var(--color-text-primary)]">Force Eviction When Busy</label>
+                            <p class="text-xs text-[var(--color-text-secondary)] mt-1">Allow evicting models even when they have active API calls (default: disabled for safety)</p>
+                        </div>
+                        <label class="relative inline-flex items-center cursor-pointer">
+                            <input type="checkbox" x-model="settings.force_eviction_when_busy" 
+                                   :disabled="!settings.watchdog_enabled"
+                                   class="sr-only peer" :class="!settings.watchdog_enabled ? 'opacity-50' : ''">
+                            <div class="w-11 h-6 bg-[var(--color-bg-primary)] peer-focus:outline-none peer-focus:ring-4 peer-focus:ring-[var(--color-primary-light)] rounded-full peer peer-checked:after:translate-x-full peer-checked:after:border-white after:content-[''] after:absolute after:top-[2px] after:left-[2px] after:bg-white after:border-gray-300 after:border after:rounded-full after:h-5 after:w-5 after:transition-all peer-checked:bg-[var(--color-primary)]"></div>
+                        </label>
+                    </div>
+
+                    <!-- LRU Eviction Max Retries -->
+                    <div>
+                        <label class="block text-sm font-medium text-[var(--color-text-primary)] mb-2">LRU Eviction Max Retries</label>
+                        <p class="text-xs text-[var(--color-text-secondary)] mb-2">Maximum number of retries when waiting for busy models to become idle (default: 30)</p>
+                        <input type="number" x-model="settings.lru_eviction_max_retries" 
+                               :disabled="!settings.watchdog_enabled"
+                               min="1"
+                               placeholder="30"
+                               class="w-full px-3 py-2 bg-[var(--color-bg-primary)] border border-[var(--color-primary-border)]/20 rounded text-sm text-[var(--color-text-primary)] focus:outline-none focus:ring-2 focus:ring-[var(--color-primary-border)]"
+                               :class="!settings.watchdog_enabled ? 'opacity-50 cursor-not-allowed' : ''">
+                    </div>
+
+                    <!-- LRU Eviction Retry Interval -->
+                    <div>
+                        <label class="block text-sm font-medium text-[var(--color-text-primary)] mb-2">LRU Eviction Retry Interval</label>
+                        <p class="text-xs text-[var(--color-text-secondary)] mb-2">Interval between retries when waiting for busy models (e.g., 1s, 2s) (default: 1s)</p>
+                        <input type="text" x-model="settings.lru_eviction_retry_interval" 
+                               :disabled="!settings.watchdog_enabled"
+                               placeholder="1s"
+                               class="w-full px-3 py-2 bg-[var(--color-bg-primary)] border border-[var(--color-primary-border)]/20 rounded text-sm text-[var(--color-text-primary)] focus:outline-none focus:ring-2 focus:ring-[var(--color-primary-border)]"
+                               :class="!settings.watchdog_enabled ? 'opacity-50 cursor-not-allowed' : ''">
+                    </div>
+
                    <!-- Memory Reclaimer Subsection -->
                    <div class="mt-6 pt-4 border-t border-[var(--color-primary-border)]/20">
                        <h3 class="text-md font-medium text-[var(--color-text-primary)] mb-3 flex items-center">
@@ -545,6 +582,9 @@ function settingsDashboard() {
            watchdog_idle_timeout: '15m',
            watchdog_busy_timeout: '5m',
            watchdog_interval: '2s',
+            force_eviction_when_busy: false,
+            lru_eviction_max_retries: 30,
+            lru_eviction_retry_interval: '1s',
            max_active_backends: 0,
            parallel_backend_requests: false,
            memory_reclaimer_enabled: false,
@@ -587,6 +627,9 @@ function settingsDashboard() {
                        watchdog_idle_timeout: data.watchdog_idle_timeout || '15m',
                        watchdog_busy_timeout: data.watchdog_busy_timeout || '5m',
                        watchdog_interval: data.watchdog_interval || '2s',
+                        force_eviction_when_busy: data.force_eviction_when_busy || false,
+                        lru_eviction_max_retries: data.lru_eviction_max_retries || 30,
+                        lru_eviction_retry_interval: data.lru_eviction_retry_interval || '1s',
                        max_active_backends: data.max_active_backends || 0,
                        parallel_backend_requests: data.parallel_backend_requests,
                        memory_reclaimer_enabled: data.memory_reclaimer_enabled || false,
@@ -660,6 +703,15 @@ function settingsDashboard() {
                if (this.settings.watchdog_interval) {
                    payload.watchdog_interval = this.settings.watchdog_interval;
                }
+                if (this.settings.force_eviction_when_busy !== undefined) {
+                    payload.force_eviction_when_busy = this.settings.force_eviction_when_busy;
+                }
+                if (this.settings.lru_eviction_max_retries !== undefined) {
+                    payload.lru_eviction_max_retries = parseInt(this.settings.lru_eviction_max_retries) || 30;
+                }
+                if (this.settings.lru_eviction_retry_interval) {
+                    payload.lru_eviction_retry_interval = this.settings.lru_eviction_retry_interval;
+                }
                if (this.settings.max_active_backends !== undefined) {
                    payload.max_active_backends = parseInt(this.settings.max_active_backends) || 0;
                }
--- a/docs/content/advanced/vram-management.md
+++ b/docs/content/advanced/vram-management.md
@@ -52,6 +52,49 @@ Setting the limit to `1` is equivalent to single active backend mode (see below)
 3. The LRU model(s) are automatically unloaded to make room for the new model
 4. Concurrent requests for loading different models are handled safely - the system accounts for models currently being loaded when calculating evictions

+### Eviction Behavior with Active Requests
+
+By default, LocalAI will **skip evicting models that have active API calls** to prevent interrupting ongoing requests. This means:
+
+- If all models are busy (have active requests), eviction will be skipped and the system will wait for models to become idle
+- The loading request will retry eviction with configurable retry settings
+- This avoids interrupting in-flight requests; if the models stay busy for the whole retry window, loading proceeds anyway once the retries are exhausted
+
+You can configure this behavior via WebUI or using the following settings:
+
+#### Force Eviction When Busy
+
+To allow evicting models even when they have active API calls (not recommended for production):
+
+```bash
+# Via CLI
+./local-ai --force-eviction-when-busy
+
+# Via environment variable
+LOCALAI_FORCE_EVICTION_WHEN_BUSY=true ./local-ai
+```
+
+> **Warning:** Enabling force eviction can interrupt active requests and cause errors. Only use this if you understand the implications.
+
+#### LRU Eviction Retry Settings
+
+When models are busy and cannot be evicted, LocalAI will retry eviction with configurable settings:
+
+```bash
+# Configure maximum retries (default: 30)
+./local-ai --lru-eviction-max-retries=50
+
+# Configure retry interval (default: 1s)
+./local-ai --lru-eviction-retry-interval=2s
+
+# Using environment variables
+LOCALAI_LRU_EVICTION_MAX_RETRIES=50 \
+LOCALAI_LRU_EVICTION_RETRY_INTERVAL=2s \
+./local-ai
+```
+
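+These settings control how long the system will wait for busy models to become idle before giving up. The total wait is roughly the maximum number of retries multiplied by the retry interval (about 30 seconds with the defaults). The retry mechanism gives busy models time to complete their requests before being evicted, which helps avoid request failures. If models are still busy after the last retry, LocalAI logs a warning and continues loading the new model without evicting them.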
+
 ### Example

 ```bash
@@ -207,6 +250,33 @@ This configuration:
 - Automatically unloads any model that hasn't been used for 15 minutes
 - Provides both hard limits and time-based cleanup

+### Example with Retry Settings
+
+You can also configure retry behavior when models are busy:
+
+```bash
+# Allow up to 2 active backends with custom retry settings
+LOCALAI_MAX_ACTIVE_BACKENDS=2 \
+LOCALAI_LRU_EVICTION_MAX_RETRIES=50 \
+LOCALAI_LRU_EVICTION_RETRY_INTERVAL=2s \
+./local-ai
+```
+
+Or using command line flags:
+
+```bash
+./local-ai \
+  --max-active-backends=2 \
+  --lru-eviction-max-retries=50 \
+  --lru-eviction-retry-interval=2s
+```
+
+This configuration:
+- Limits to 2 active backends
+- Will retry eviction up to 50 times if models are busy
+- Waits 2 seconds between retry attempts
+- Gives busy models up to roughly 100 seconds (50 retries × 2s) to complete their requests before LocalAI stops waiting for them
+
 ## Limitations and Considerations

 ### VRAM Usage Estimation
--- a/docs/content/features/runtime-settings.md
+++ b/docs/content/features/runtime-settings.md
@@ -29,9 +29,23 @@ Changes to watchdog settings are applied immediately by restarting the watchdog

 - **Max Active Backends**: Maximum number of active backends (loaded models). When exceeded, the least recently used model is automatically evicted. Set to `0` for unlimited, `1` for single-backend mode
 - **Parallel Backend Requests**: Enable backends to handle multiple requests in parallel if supported
+- **Force Eviction When Busy**: Allow evicting models even when they have active API calls (default: disabled for safety). **Warning:** Enabling this can interrupt active requests
+- **LRU Eviction Max Retries**: Maximum number of retries when waiting for busy models to become idle before eviction (default: 30)
+- **LRU Eviction Retry Interval**: Interval between retries when waiting for busy models (default: `1s`)

 > **Note:** The "Single Backend" setting is deprecated. Use "Max Active Backends" set to `1` for single-backend behavior.

+#### LRU Eviction Behavior
+
+By default, LocalAI will skip evicting models that have active API calls to prevent interrupting ongoing requests. When all models are busy and eviction is needed:
+
+1. The system will wait for models to become idle
+2. It will retry eviction up to the configured maximum number of retries
+3. The retry interval determines how long to wait between attempts
+4. If all retries are exhausted, the system will proceed (which may cause out-of-memory errors if resources are truly exhausted)
+
+You can configure these settings via the web UI or through environment variables. See [VRAM Management]({{%relref "advanced/vram-management" %}}) for more details.
+
 ### Performance Settings

 - **Threads**: Number of threads used for parallel computation (recommended: number of physical cores)
@@ -94,6 +108,9 @@ The `runtime_settings.json` file follows this structure:
  "watchdog_busy_timeout": "5m",
  "max_active_backends": 0,
  "parallel_backend_requests": true,
+  "force_eviction_when_busy": false,
+  "lru_eviction_max_retries": 30,
+  "lru_eviction_retry_interval": "1s",
  "threads": 8,
  "context_size": 2048,
  "f16": false,
--- a/docs/content/features/text-to-audio.md
+++ b/docs/content/features/text-to-audio.md
@@ -128,7 +128,7 @@ Future versions of LocalAI will expose additional control over audio generation

 #### Setup

-Install the `vibevoice` model in the Model gallery.
+Install the `vibevoice` model in the Model gallery or run `local-ai run models install vibevoice`.

 #### Usage

--- a/docs/content/reference/cli-reference.md
+++ b/docs/content/reference/cli-reference.md
@@ -46,6 +46,9 @@ Complete reference for all LocalAI command-line interface (CLI) parameters and e
 | `--watchdog-idle-timeout` | `15m` | Threshold beyond which an idle backend should be stopped | `$LOCALAI_WATCHDOG_IDLE_TIMEOUT`, `$WATCHDOG_IDLE_TIMEOUT` |
 | `--enable-watchdog-busy` | `false` | Enable watchdog for stopping backends that are busy longer than the watchdog-busy-timeout | `$LOCALAI_WATCHDOG_BUSY`, `$WATCHDOG_BUSY` |
 | `--watchdog-busy-timeout` | `5m` | Threshold beyond which a busy backend should be stopped | `$LOCALAI_WATCHDOG_BUSY_TIMEOUT`, `$WATCHDOG_BUSY_TIMEOUT` |
+| `--force-eviction-when-busy` | `false` | Force eviction even when models have active API calls (default: false for safety). **Warning:** Enabling this can interrupt active requests | `$LOCALAI_FORCE_EVICTION_WHEN_BUSY`, `$FORCE_EVICTION_WHEN_BUSY` |
+| `--lru-eviction-max-retries` | `30` | Maximum number of retries when waiting for busy models to become idle before eviction | `$LOCALAI_LRU_EVICTION_MAX_RETRIES`, `$LRU_EVICTION_MAX_RETRIES` |
+| `--lru-eviction-retry-interval` | `1s` | Interval between retries when waiting for busy models to become idle (e.g., `1s`, `2s`) | `$LOCALAI_LRU_EVICTION_RETRY_INTERVAL`, `$LRU_EVICTION_RETRY_INTERVAL` |

 For more information on VRAM management, see [VRAM and Memory Management]({{%relref "advanced/vram-management" %}}).

--- a/pkg/model/initializers.go
+++ b/pkg/model/initializers.go
@@ -9,8 +9,8 @@ import (
 	"time"

 	grpc "github.com/mudler/LocalAI/pkg/grpc"
-	"github.com/phayes/freeport"
 	"github.com/mudler/xlog"
+	"github.com/phayes/freeport"
 )

 const (
@@ -173,7 +173,7 @@ func (ml *ModelLoader) backendLoader(opts ...Option) (client grpc.Backend, err e

 	model, err := ml.LoadModel(o.modelID, o.model, ml.grpcModel(backend, o))
 	if err != nil {
-		if stopErr := ml.StopGRPC(only(o.modelID));stopErr != nil {
+		if stopErr := ml.StopGRPC(only(o.modelID)); stopErr != nil {
 			xlog.Error("error stopping model", "error", stopErr, "model", o.modelID)
 		}
 		xlog.Error("Failed to load model", "modelID", o.modelID, "error", err, "backend", o.backendString)
@@ -186,13 +186,54 @@ func (ml *ModelLoader) backendLoader(opts ...Option) (client grpc.Backend, err e
 // enforceLRULimit enforces the LRU limit before loading a new model.
 // This is called before loading a model to ensure we don't exceed the limit.
 // It accounts for models that are currently being loaded by other goroutines.
+// If models are busy and can't be evicted, it waits and retries up to the configured
+// number of attempts, then logs a warning and returns so the load can proceed anyway.
 func (ml *ModelLoader) enforceLRULimit() {
 	if ml.wd == nil {
 		return
 	}
+
 	// Get the count of models currently being loaded to account for concurrent requests
 	pendingLoads := ml.GetLoadingCount()
-	ml.wd.EnforceLRULimit(pendingLoads)
+
+	// Get retry settings from ModelLoader
+	ml.mu.Lock()
+	maxRetries := ml.lruEvictionMaxRetries
+	retryInterval := ml.lruEvictionRetryInterval
+	ml.mu.Unlock()
+
+	// Always make at least one eviction attempt: a zero or negative retry count
+	// (e.g. set through runtime settings) must not silently disable LRU enforcement.
+	if maxRetries < 1 {
+		maxRetries = 1
+	}
+
+	for attempt := 0; attempt < maxRetries; attempt++ {
+		result := ml.wd.EnforceLRULimit(pendingLoads)
+
+		if !result.NeedMore {
+			// Successfully evicted enough models (or no eviction needed)
+			if result.EvictedCount > 0 {
+				xlog.Info("[ModelLoader] LRU enforcement complete", "evicted", result.EvictedCount)
+			}
+			return
+		}
+
+		// Need more evictions but models are busy - wait and retry
+		if attempt < maxRetries-1 {
+			xlog.Info("[ModelLoader] Waiting for busy models to become idle before eviction",
+				"evicted", result.EvictedCount,
+				"attempt", attempt+1,
+				"maxRetries", maxRetries,
+				"retryIn", retryInterval)
+			time.Sleep(retryInterval)
+		} else {
+			// Last attempt - log warning but proceed (might fail to load, but at least we tried)
+			xlog.Warn("[ModelLoader] LRU enforcement incomplete after max retries",
+				"evicted", result.EvictedCount,
+				"reason", "models are still busy with active API calls")
+		}
+	}
 }

 // updateModelLastUsed updates the last used time for a model (for LRU tracking)
--- a/pkg/model/loader.go
+++ b/pkg/model/loader.go
@@ -20,22 +20,26 @@ import (

 // TODO: Split ModelLoader and TemplateLoader? Just to keep things more organized. Left together to share a mutex until I look into that. Would split if we separate directories for .bin/.yaml and .tmpl
 type ModelLoader struct {
-	ModelPath        string
-	mu               sync.Mutex
-	models           map[string]*Model
-	loading          map[string]chan struct{} // tracks models currently being loaded
-	wd               *WatchDog
-	externalBackends map[string]string
+	ModelPath                string
+	mu                       sync.Mutex
+	models                   map[string]*Model
+	loading                  map[string]chan struct{} // tracks models currently being loaded
+	wd                       *WatchDog
+	externalBackends         map[string]string
+	lruEvictionMaxRetries    int           // Maximum number of retries when waiting for busy models
+	lruEvictionRetryInterval time.Duration // Interval between retries when waiting for busy models
 }

 // NewModelLoader creates a new ModelLoader instance.
 // LRU eviction is now managed through the WatchDog component.
 func NewModelLoader(system *system.SystemState) *ModelLoader {
 	nml := &ModelLoader{
-		ModelPath:        system.Model.ModelsPath,
-		models:           make(map[string]*Model),
-		loading:          make(map[string]chan struct{}),
-		externalBackends: make(map[string]string),
+		ModelPath:                system.Model.ModelsPath,
+		models:                   make(map[string]*Model),
+		loading:                  make(map[string]chan struct{}),
+		externalBackends:         make(map[string]string),
+		lruEvictionMaxRetries:    30,              // Default: 30 retries
+		lruEvictionRetryInterval: 1 * time.Second, // Default: 1 second
 	}

 	return nml
@@ -56,6 +60,14 @@ func (ml *ModelLoader) GetWatchDog() *WatchDog {
 	return ml.wd
 }

+// SetLRUEvictionRetrySettings updates the LRU eviction retry settings
+func (ml *ModelLoader) SetLRUEvictionRetrySettings(maxRetries int, retryInterval time.Duration) {
+	ml.mu.Lock()
+	defer ml.mu.Unlock()
+	ml.lruEvictionMaxRetries = maxRetries
+	ml.lruEvictionRetryInterval = retryInterval
+}
+
 func (ml *ModelLoader) ExistsInModelPath(s string) bool {
 	return utils.ExistsInPath(ml.ModelPath, s)
 }
--- a/pkg/model/loader_test.go
+++ b/pkg/model/loader_test.go
@@ -262,4 +262,13 @@ var _ = Describe("ModelLoader", func() {
 			Expect(modelLoader.GetLoadingCount()).To(Equal(0))
 		})
 	})
+
+	Context("LRU Eviction Retry Settings", func() {
+		It("should allow updating retry settings", func() {
+			modelLoader.SetLRUEvictionRetrySettings(50, 2*time.Second)
+			// Settings are updated - we can verify through behavior if needed
+			// For now, just verify the call doesn't panic
+			Expect(modelLoader).ToNot(BeNil())
+		})
+	})
 })
--- a/pkg/model/watchdog.go
+++ b/pkg/model/watchdog.go
@@ -41,6 +41,9 @@ type WatchDog struct {
 	memoryReclaimerEnabled   bool    // Enable memory threshold monitoring
 	memoryReclaimerThreshold float64 // Threshold 0.0-1.0 (e.g., 0.95 = 95%)
 	watchdogInterval         time.Duration
+
+	// Eviction settings
+	forceEvictionWhenBusy bool // Force eviction even when models have active API calls (default: false for safety)
 }

 type ProcessManager interface {
@@ -78,6 +81,7 @@ func NewWatchDog(opts ...WatchDogOption) *WatchDog {
 		memoryReclaimerEnabled:   o.memoryReclaimerEnabled,
 		memoryReclaimerThreshold: o.memoryReclaimerThreshold,
 		watchdogInterval:         o.watchdogInterval,
+		forceEvictionWhenBusy:    o.forceEvictionWhenBusy,
 	}
 }

@@ -110,6 +114,13 @@ func (wd *WatchDog) GetMemoryReclaimerSettings() (enabled bool, threshold float6
 	return wd.memoryReclaimerEnabled, wd.memoryReclaimerThreshold
 }

+// SetForceEvictionWhenBusy updates the force eviction when busy setting dynamically
+func (wd *WatchDog) SetForceEvictionWhenBusy(force bool) {
+	wd.Lock()
+	defer wd.Unlock()
+	wd.forceEvictionWhenBusy = force
+}
+
 func (wd *WatchDog) Shutdown() {
 	wd.Lock()
 	defer wd.Unlock()
@@ -169,13 +180,19 @@ type modelUsageInfo struct {
 	lastUsed time.Time
 }

+// EnforceLRULimitResult contains the result of LRU enforcement
+type EnforceLRULimitResult struct {
+	EvictedCount int  // Number of models successfully evicted
+	NeedMore     bool // True if fewer models than required were evicted because busy models were skipped
+}
+
 // EnforceLRULimit ensures we're under the LRU limit by evicting least recently used models.
 // This should be called before loading a new model.
 // pendingLoads is the number of models currently being loaded (to account for concurrent loads).
-// Returns the number of models evicted.
-func (wd *WatchDog) EnforceLRULimit(pendingLoads int) int {
+// Returns the result containing evicted count and whether more evictions are needed.
+func (wd *WatchDog) EnforceLRULimit(pendingLoads int) EnforceLRULimitResult {
 	if wd.lruLimit <= 0 {
-		return 0 // LRU disabled
+		return EnforceLRULimitResult{EvictedCount: 0, NeedMore: false} // LRU disabled
 	}

 	wd.Lock()
@@ -186,9 +203,10 @@ func (wd *WatchDog) EnforceLRULimit(pendingLoads int) int {
 	// We need: currentCount + pendingLoads + 1 <= lruLimit
 	// So evict: currentCount + pendingLoads + 1 - lruLimit = currentCount - lruLimit + pendingLoads + 1
 	modelsToEvict := currentCount - wd.lruLimit + pendingLoads + 1
+	forceEvictionWhenBusy := wd.forceEvictionWhenBusy
 	if modelsToEvict <= 0 {
 		wd.Unlock()
-		return 0
+		return EnforceLRULimitResult{EvictedCount: 0, NeedMore: false}
 	}

 	xlog.Debug("[WatchDog] LRU enforcement triggered", "current", currentCount, "pendingLoads", pendingLoads, "limit", wd.lruLimit, "toEvict", modelsToEvict)
@@ -215,13 +233,25 @@ func (wd *WatchDog) EnforceLRULimit(pendingLoads int) int {

 	// Collect models to evict (the oldest ones)
 	var modelsToShutdown []string
-	for i := 0; i < modelsToEvict && i < len(models); i++ {
+	evictedCount := 0
+	skippedBusyCount := 0
+	for i := 0; evictedCount < modelsToEvict && i < len(models); i++ {
 		m := models[i]
-		xlog.Info("[WatchDog] LRU evicting model", "model", m.model, "lastUsed", m.lastUsed)
+		// Check if model is busy
+		_, isBusy := wd.busyTime[m.address]
+		if isBusy && !forceEvictionWhenBusy {
+			// Skip eviction for busy models when forceEvictionWhenBusy is false
+			xlog.Warn("[WatchDog] Skipping LRU eviction for busy model", "model", m.model, "reason", "model has active API calls")
+			skippedBusyCount++
+			continue
+		}
+		xlog.Info("[WatchDog] LRU evicting model", "model", m.model, "lastUsed", m.lastUsed, "busy", isBusy)
 		modelsToShutdown = append(modelsToShutdown, m.model)
 		// Clean up the maps while we have the lock
 		wd.untrack(m.address)
+		evictedCount++
 	}
+	needMore := evictedCount < modelsToEvict && skippedBusyCount > 0
 	wd.Unlock()

 	// Now shutdown models without holding the watchdog lock to prevent deadlock
@@ -232,7 +262,14 @@ func (wd *WatchDog) EnforceLRULimit(pendingLoads int) int {
 		xlog.Debug("[WatchDog] LRU eviction complete", "model", model)
 	}

-	return len(modelsToShutdown)
+	if needMore {
+		xlog.Warn("[WatchDog] LRU eviction incomplete", "evicted", evictedCount, "needed", modelsToEvict, "skippedBusy", skippedBusyCount, "reason", "some models are busy with active API calls")
+	}
+
+	return EnforceLRULimitResult{
+		EvictedCount: len(modelsToShutdown),
+		NeedMore:     needMore,
+	}
 }

 func (wd *WatchDog) Run() {
@@ -376,6 +413,8 @@ func (wd *WatchDog) evictLRUModel() {
 		return
 	}

+	forceEvictionWhenBusy := wd.forceEvictionWhenBusy
+
 	// Build a list of models sorted by last used time (oldest first)
 	var models []modelUsageInfo
 	for address, model := range wd.addressModelMap {
@@ -400,8 +439,27 @@ func (wd *WatchDog) evictLRUModel() {
 		return models[i].lastUsed.Before(models[j].lastUsed)
 	})

-	// Get the LRU model
-	lruModel := models[0]
+	// Find the first non-busy model (or first model if forceEvictionWhenBusy is true)
+	var lruModel *modelUsageInfo
+	for i := 0; i < len(models); i++ {
+		m := models[i]
+		_, isBusy := wd.busyTime[m.address]
+		if isBusy && !forceEvictionWhenBusy {
+			// Skip busy models when forceEvictionWhenBusy is false
+			xlog.Warn("[WatchDog] Skipping memory reclaimer eviction for busy model", "model", m.model, "reason", "model has active API calls")
+			continue
+		}
+		lruModel = &m
+		break
+	}
+
+	if lruModel == nil {
+		// All models are busy and forceEvictionWhenBusy is false
+		wd.Unlock()
+		xlog.Warn("[WatchDog] Memory reclaimer cannot evict: all models are busy with active API calls")
+		return
+	}
+
 	xlog.Info("[WatchDog] Memory reclaimer evicting LRU model", "model", lruModel.model, "lastUsed", lruModel.lastUsed)

 	// Untrack the model
--- a/pkg/model/watchdog_options.go
+++ b/pkg/model/watchdog_options.go
@@ -28,6 +28,9 @@ type WatchDogOptions struct {
 	// Memory reclaimer settings (works with GPU if available, otherwise RAM)
 	memoryReclaimerEnabled   bool    // Enable memory threshold monitoring
 	memoryReclaimerThreshold float64 // Threshold 0.0-1.0 (e.g., 0.95 = 95%)
+
+	// Eviction settings
+	forceEvictionWhenBusy bool // Force eviction even when models have active API calls (default: false for safety)
 }

 // WatchDogOption is a function that configures WatchDogOptions
@@ -105,6 +108,14 @@ func WithMemoryReclaimerThreshold(threshold float64) WatchDogOption {
 	}
 }

+// WithForceEvictionWhenBusy sets whether to force eviction even when models have active API calls
+// Default: false (skip eviction when busy for safety)
+func WithForceEvictionWhenBusy(force bool) WatchDogOption {
+	return func(o *WatchDogOptions) {
+		o.forceEvictionWhenBusy = force
+	}
+}
+
 // DefaultWatchDogOptions returns default options for the watchdog
 func DefaultWatchDogOptions() *WatchDogOptions {
 	return &WatchDogOptions{
@@ -116,6 +127,7 @@ func DefaultWatchDogOptions() *WatchDogOptions {
 		lruLimit:                 0,
 		memoryReclaimerEnabled:   false,
 		memoryReclaimerThreshold: DefaultMemoryReclaimerThreshold,
+		forceEvictionWhenBusy:    false, // Default: skip eviction when busy for safety
 	}
 }

--- a/pkg/model/watchdog_test.go
+++ b/pkg/model/watchdog_test.go
@@ -170,15 +170,18 @@ var _ = Describe("WatchDog", func() {
 				model.WithBusyTimeout(5*time.Minute),
 				model.WithIdleTimeout(15*time.Minute),
 				model.WithLRULimit(2),
+				model.WithForceEvictionWhenBusy(true), // Enable force eviction for these tests to match old behavior
 			)
 		})

 		It("should not evict when under limit", func() {
 			wd.AddAddressModelMap("addr1", "model1")
 			wd.Mark("addr1")
+			wd.UnMark("addr1") // Unmark to make it idle (not busy)

-			evicted := wd.EnforceLRULimit(0)
-			Expect(evicted).To(Equal(0))
+			result := wd.EnforceLRULimit(0)
+			Expect(result.EvictedCount).To(Equal(0))
+			Expect(result.NeedMore).To(BeFalse())
 			Expect(pm.getShutdownCalls()).To(BeEmpty())
 		})

@@ -186,14 +189,17 @@ var _ = Describe("WatchDog", func() {
 			// Add two models
 			wd.AddAddressModelMap("addr1", "model1")
 			wd.Mark("addr1")
+			wd.UnMark("addr1") // Unmark to make it idle
 			time.Sleep(10 * time.Millisecond)

 			wd.AddAddressModelMap("addr2", "model2")
 			wd.Mark("addr2")
+			wd.UnMark("addr2") // Unmark to make it idle

 			// Enforce LRU with limit of 2 (need to make room for 1 new model)
-			evicted := wd.EnforceLRULimit(0)
-			Expect(evicted).To(Equal(1))
+			result := wd.EnforceLRULimit(0)
+			Expect(result.EvictedCount).To(Equal(1))
+			Expect(result.NeedMore).To(BeFalse())
 			Expect(pm.getShutdownCalls()).To(ContainElement("model1")) // oldest should be evicted
 		})

@@ -201,19 +207,23 @@ var _ = Describe("WatchDog", func() {
 			// Add three models
 			wd.AddAddressModelMap("addr1", "model1")
 			wd.Mark("addr1")
+			wd.UnMark("addr1") // Unmark to make it idle
 			time.Sleep(10 * time.Millisecond)

 			wd.AddAddressModelMap("addr2", "model2")
 			wd.Mark("addr2")
+			wd.UnMark("addr2") // Unmark to make it idle
 			time.Sleep(10 * time.Millisecond)

 			wd.AddAddressModelMap("addr3", "model3")
 			wd.Mark("addr3")
+			wd.UnMark("addr3") // Unmark to make it idle

 			// Set limit to 1, should evict 2 oldest + 1 for new = 3 evictions
 			wd.SetLRULimit(1)
-			evicted := wd.EnforceLRULimit(0)
-			Expect(evicted).To(Equal(3))
+			result := wd.EnforceLRULimit(0)
+			Expect(result.EvictedCount).To(Equal(3))
+			Expect(result.NeedMore).To(BeFalse())
 			shutdowns := pm.getShutdownCalls()
 			Expect(shutdowns).To(ContainElement("model1"))
 			Expect(shutdowns).To(ContainElement("model2"))
@@ -224,15 +234,18 @@ var _ = Describe("WatchDog", func() {
 			// Add two models (at limit)
 			wd.AddAddressModelMap("addr1", "model1")
 			wd.Mark("addr1")
+			wd.UnMark("addr1") // Unmark to make it idle
 			time.Sleep(10 * time.Millisecond)

 			wd.AddAddressModelMap("addr2", "model2")
 			wd.Mark("addr2")
+			wd.UnMark("addr2") // Unmark to make it idle

 			// With 1 pending load, we need to evict 2 (current=2, pending=1, new=1, limit=2)
 			// total after = 2 + 1 + 1 = 4, need to evict 4 - 2 = 2
-			evicted := wd.EnforceLRULimit(1)
-			Expect(evicted).To(Equal(2))
+			result := wd.EnforceLRULimit(1)
+			Expect(result.EvictedCount).To(Equal(2))
+			Expect(result.NeedMore).To(BeFalse())
 		})

 		It("should not evict when LRU is disabled", func() {
@@ -242,8 +255,9 @@ var _ = Describe("WatchDog", func() {
 			wd.AddAddressModelMap("addr2", "model2")
 			wd.AddAddressModelMap("addr3", "model3")

-			evicted := wd.EnforceLRULimit(0)
-			Expect(evicted).To(Equal(0))
+			result := wd.EnforceLRULimit(0)
+			Expect(result.EvictedCount).To(Equal(0))
+			Expect(result.NeedMore).To(BeFalse())
 			Expect(pm.getShutdownCalls()).To(BeEmpty())
 		})

@@ -253,10 +267,12 @@ var _ = Describe("WatchDog", func() {
 			// Add models with different lastUsed times
 			wd.AddAddressModelMap("addr1", "model1")
 			wd.Mark("addr1")
+			wd.UnMark("addr1") // Unmark to make it idle
 			time.Sleep(20 * time.Millisecond)

 			wd.AddAddressModelMap("addr2", "model2")
 			wd.Mark("addr2")
+			wd.UnMark("addr2") // Unmark to make it idle
 			time.Sleep(20 * time.Millisecond)

 			// Touch model1 again to make it more recent
@@ -265,10 +281,12 @@ var _ = Describe("WatchDog", func() {

 			wd.AddAddressModelMap("addr3", "model3")
 			wd.Mark("addr3")
+			wd.UnMark("addr3") // Unmark to make it idle

 			// Now model2 is the oldest, should be evicted first
-			evicted := wd.EnforceLRULimit(0)
-			Expect(evicted).To(BeNumerically(">=", 1))
+			result := wd.EnforceLRULimit(0)
+			Expect(result.EvictedCount).To(BeNumerically(">=", 1))
+			Expect(result.NeedMore).To(BeFalse())

 			shutdowns := pm.getShutdownCalls()
 			// model2 should be evicted first (it's the oldest)
@@ -285,16 +303,19 @@ var _ = Describe("WatchDog", func() {
 				model.WithBusyTimeout(5*time.Minute),
 				model.WithIdleTimeout(15*time.Minute),
 				model.WithLRULimit(1),
+				model.WithForceEvictionWhenBusy(true), // Enable force eviction for these tests
 			)
 		})

 		It("should evict existing model when loading new one", func() {
 			wd.AddAddressModelMap("addr1", "model1")
 			wd.Mark("addr1")
+			wd.UnMark("addr1") // Unmark to make it idle

 			// With limit=1, loading a new model should evict the existing one
-			evicted := wd.EnforceLRULimit(0)
-			Expect(evicted).To(Equal(1))
+			result := wd.EnforceLRULimit(0)
+			Expect(result.EvictedCount).To(Equal(1))
+			Expect(result.NeedMore).To(BeFalse())
 			Expect(pm.getShutdownCalls()).To(ContainElement("model1"))
 		})

@@ -302,6 +323,7 @@ var _ = Describe("WatchDog", func() {
 			for i := 0; i < 5; i++ {
 				wd.AddAddressModelMap("addr", "model")
 				wd.Mark("addr")
+				wd.UnMark("addr") // Unmark to make it idle
 				wd.EnforceLRULimit(0)
 			}
 			// All previous models should have been evicted
@@ -309,6 +331,233 @@ var _ = Describe("WatchDog", func() {
 		})
 	})

+	Context("Force Eviction When Busy", func() {
+		BeforeEach(func() {
+			wd = model.NewWatchDog(
+				model.WithProcessManager(pm),
+				model.WithLRULimit(2),
+				model.WithForceEvictionWhenBusy(false), // Default: skip eviction when busy
+			)
+		})
+
+		It("should skip eviction for busy models when forceEvictionWhenBusy is false", func() {
+			// Add two models (at limit of 2, need to evict 1 for new model)
+			wd.AddAddressModelMap("addr1", "model1")
+			wd.Mark("addr1")
+			time.Sleep(10 * time.Millisecond)
+
+			wd.AddAddressModelMap("addr2", "model2")
+			wd.Mark("addr2")
+			wd.UnMark("addr2") // Make model2 idle
+
+			// Keep model1 as busy (simulating active API call)
+			// model1 is already marked as busy from the first Mark call
+
+			// Try to enforce LRU - should skip busy model1, evict model2
+			result := wd.EnforceLRULimit(0)
+			// Should evict model2 (not busy) but skip model1 (busy)
+			// Since we evicted 1 (which is what we needed), NeedMore should be false
+			Expect(result.EvictedCount).To(Equal(1))
+			Expect(result.NeedMore).To(BeFalse()) // We evicted enough, even though we skipped model1
+			Expect(pm.getShutdownCalls()).To(ContainElement("model2"))
+			Expect(pm.getShutdownCalls()).ToNot(ContainElement("model1"))
+		})
+
+		It("should evict busy models when forceEvictionWhenBusy is true", func() {
+			wd.SetForceEvictionWhenBusy(true)
+
+			// Add two models
+			wd.AddAddressModelMap("addr1", "model1")
+			wd.Mark("addr1")
+			time.Sleep(10 * time.Millisecond)
+
+			wd.AddAddressModelMap("addr2", "model2")
+			wd.Mark("addr2")
+
+			// Keep model1 as busy (already marked from first Mark call)
+
+			// Try to enforce LRU - should evict model1 even though busy
+			result := wd.EnforceLRULimit(0)
+			Expect(result.EvictedCount).To(Equal(1))
+			Expect(result.NeedMore).To(BeFalse())
+			Expect(pm.getShutdownCalls()).To(ContainElement("model1"))
+		})
+
+		It("should set NeedMore when all models are busy and forceEvictionWhenBusy is false", func() {
+			// Add two models
+			wd.AddAddressModelMap("addr1", "model1")
+			wd.Mark("addr1")
+			time.Sleep(10 * time.Millisecond)
+
+			wd.AddAddressModelMap("addr2", "model2")
+			wd.Mark("addr2")
+
+			// Mark both as busy
+			wd.Mark("addr1")
+			wd.Mark("addr2")
+
+			// Try to enforce LRU - should skip both busy models
+			result := wd.EnforceLRULimit(0)
+			Expect(result.EvictedCount).To(Equal(0))
+			Expect(result.NeedMore).To(BeTrue())
+			Expect(pm.getShutdownCalls()).To(BeEmpty())
+		})
+
+		It("should allow updating forceEvictionWhenBusy dynamically", func() {
+			// Start with false
+			Expect(wd).ToNot(BeNil())
+
+			// Add models
+			wd.AddAddressModelMap("addr1", "model1")
+			wd.Mark("addr1")
+			time.Sleep(10 * time.Millisecond)
+
+			wd.AddAddressModelMap("addr2", "model2")
+			wd.Mark("addr2")
+			wd.UnMark("addr2") // Make model2 idle
+			// Keep model1 busy (already marked)
+
+			// With forceEvictionWhenBusy=false, should skip busy model1, evict model2
+			result := wd.EnforceLRULimit(0)
+			Expect(result.NeedMore).To(BeFalse())    // We evicted enough (1 model)
+			Expect(result.EvictedCount).To(Equal(1)) // Should evict model2 (not busy)
+
+			// Now enable force eviction
+			wd.SetForceEvictionWhenBusy(true)
+
+			// Add models again
+			wd.AddAddressModelMap("addr1", "model1")
+			wd.Mark("addr1")
+			time.Sleep(10 * time.Millisecond)
+
+			wd.AddAddressModelMap("addr2", "model2")
+			wd.Mark("addr2")
+			// Keep model1 busy (already marked)
+
+			// With forceEvictionWhenBusy=true, should evict busy model1
+			result = wd.EnforceLRULimit(0)
+			Expect(result.NeedMore).To(BeFalse())
+			Expect(result.EvictedCount).To(Equal(1))
+		})
+
+		It("should continue to next LRU model when busy model is skipped", func() {
+			// Add three models
+			wd.AddAddressModelMap("addr1", "model1")
+			wd.Mark("addr1")
+			time.Sleep(10 * time.Millisecond)
+
+			wd.AddAddressModelMap("addr2", "model2")
+			wd.Mark("addr2")
+			wd.UnMark("addr2") // Make model2 idle
+			time.Sleep(10 * time.Millisecond)
+
+			wd.AddAddressModelMap("addr3", "model3")
+			wd.Mark("addr3")
+			wd.UnMark("addr3") // Make model3 idle
+
+			// Keep model1 as busy (oldest, already marked)
+
+			// Need to evict 2 models (limit=2, current=3, need room for 1 new)
+			// Should skip model1 (busy), evict model2 and model3 (not busy)
+			result := wd.EnforceLRULimit(0)
+			// Should evict model2 and model3 (2 models, which is what we needed)
+			Expect(result.EvictedCount).To(Equal(2))
+			Expect(result.NeedMore).To(BeFalse()) // We evicted enough (2 models)
+			Expect(pm.getShutdownCalls()).To(ContainElement("model2"))
+			Expect(pm.getShutdownCalls()).To(ContainElement("model3"))
+		})
+	})
+
+	Context("EnforceLRULimitResult", func() {
+		BeforeEach(func() {
+			wd = model.NewWatchDog(
+				model.WithProcessManager(pm),
+				model.WithLRULimit(2),
+				model.WithForceEvictionWhenBusy(false),
+			)
+		})
+
+		It("should return NeedMore=false when eviction is successful", func() {
+			wd.AddAddressModelMap("addr1", "model1")
+			wd.Mark("addr1")
+			wd.UnMark("addr1") // Make idle
+			time.Sleep(10 * time.Millisecond)
+
+			wd.AddAddressModelMap("addr2", "model2")
+			wd.Mark("addr2")
+			wd.UnMark("addr2") // Make idle
+
+			result := wd.EnforceLRULimit(0)
+			Expect(result.NeedMore).To(BeFalse())
+			Expect(result.EvictedCount).To(Equal(1))
+		})
+
+		It("should return NeedMore=true when not enough models can be evicted", func() {
+			// Add two models (at limit of 2, need to evict 1 for new model)
+			wd.AddAddressModelMap("addr1", "model1")
+			wd.Mark("addr1")
+			time.Sleep(10 * time.Millisecond)
+
+			wd.AddAddressModelMap("addr2", "model2")
+			wd.Mark("addr2")
+
+			// Mark both as busy (keep them busy)
+			// Both are already marked as busy from the Mark calls above
+
+			// Need to evict 1, but both are busy
+			result := wd.EnforceLRULimit(0)
+			Expect(result.NeedMore).To(BeTrue())
+			Expect(result.EvictedCount).To(Equal(0))
+		})
+
+		It("should return NeedMore=true when need to evict multiple but some are busy", func() {
+			// Set limit to 1, add 3 models (need to evict 3: 2 over the limit + 1 for the new model)
+			wd.SetLRULimit(1)
+			wd.AddAddressModelMap("addr1", "model1")
+			wd.Mark("addr1")
+			time.Sleep(10 * time.Millisecond)
+
+			wd.AddAddressModelMap("addr2", "model2")
+			wd.Mark("addr2")
+			wd.UnMark("addr2") // Make model2 idle
+			time.Sleep(10 * time.Millisecond)
+
+			wd.AddAddressModelMap("addr3", "model3")
+			wd.Mark("addr3")
+			// Keep model1 and model3 busy
+
+			// Need to evict 3 models, but model1 and model3 are busy, only model2 is idle
+			// Should evict model2 (1 model), but NeedMore=true because we needed 3
+			result := wd.EnforceLRULimit(0)
+			Expect(result.EvictedCount).To(Equal(1))
+			Expect(result.NeedMore).To(BeTrue())
+		})
+
+		It("should return correct EvictedCount when some models are evicted", func() {
+			// Add three models
+			wd.AddAddressModelMap("addr1", "model1")
+			wd.Mark("addr1")
+			time.Sleep(10 * time.Millisecond)
+
+			wd.AddAddressModelMap("addr2", "model2")
+			wd.Mark("addr2")
+			wd.UnMark("addr2") // Make model2 idle
+			time.Sleep(10 * time.Millisecond)
+
+			wd.AddAddressModelMap("addr3", "model3")
+			wd.Mark("addr3")
+			wd.UnMark("addr3") // Make model3 idle
+
+			// Keep model1 as busy (already marked)
+
+			// Need to evict 2 models, but model1 is busy
+			// Should evict model2 and model3 (2 models, which is what we needed)
+			result := wd.EnforceLRULimit(0)
+			Expect(result.EvictedCount).To(Equal(2))
+			Expect(result.NeedMore).To(BeFalse()) // We evicted enough (2 models)
+		})
+	})
+
 	Context("Functional Options", func() {
 		It("should use default options when none provided", func() {
 			wd = model.NewWatchDog(
@@ -331,6 +580,7 @@ var _ = Describe("WatchDog", func() {
 				model.WithLRULimit(5),
 				model.WithMemoryReclaimerEnabled(true),
 				model.WithMemoryReclaimerThreshold(0.80),
+				model.WithForceEvictionWhenBusy(true),
 			)

 			Expect(wd.GetLRULimit()).To(Equal(5))
@@ -339,5 +589,48 @@ var _ = Describe("WatchDog", func() {
 			Expect(enabled).To(BeTrue())
 			Expect(threshold).To(Equal(0.80))
 		})
+
+		It("should use default forceEvictionWhenBusy (false) when not specified", func() {
+			wd = model.NewWatchDog(
+				model.WithProcessManager(pm),
+			)
+			// Default should be false - we can test this by checking behavior
+			// Add a busy model and verify it's skipped
+			wd.AddAddressModelMap("addr1", "model1")
+			wd.Mark("addr1")
+			time.Sleep(10 * time.Millisecond)
+
+			wd.AddAddressModelMap("addr2", "model2")
+			wd.Mark("addr2")
+			wd.Mark("addr1") // Keep model1 busy
+
+			wd.SetLRULimit(1)
+			result := wd.EnforceLRULimit(0)
+			// Both models are busy, so neither can be evicted and NeedMore=true
+			Expect(result.NeedMore).To(BeTrue())
+		})
+
+		It("should allow setting forceEvictionWhenBusy via option", func() {
+			wd = model.NewWatchDog(
+				model.WithProcessManager(pm),
+				model.WithLRULimit(2),
+				model.WithForceEvictionWhenBusy(true),
+			)
+
+			// Add models
+			wd.AddAddressModelMap("addr1", "model1")
+			wd.Mark("addr1")
+			time.Sleep(10 * time.Millisecond)
+
+			wd.AddAddressModelMap("addr2", "model2")
+			wd.Mark("addr2")
+			// Keep model1 busy (already marked from first Mark call)
+
+			// Should evict busy model1
+			result := wd.EnforceLRULimit(0)
+			Expect(result.NeedMore).To(BeFalse())
+			Expect(result.EvictedCount).To(Equal(1))
+			Expect(pm.getShutdownCalls()).To(ContainElement("model1"))
+		})
 	})
 })