fix: improve API startup reliability with timeout budget tracking (#1824)

## Summary

- Add startup budget tracking to prevent silent hangs during API boot
- Add timeout wrappers around startup operations with graceful
degradation
- Add detailed logging for startup progress and failures

## Background

A user reported being unable to start their array on v7.2-beta.1 due to
the API failing to start. The root cause was a leftover
`dynamix.my.servers` folder from a previously uninstalled Connect
plugin. The API would hang during startup with no error messages, and
PM2 would eventually kill it after 15 seconds with no diagnostic
information.

**Original syslog:**
```
Aug 2 11:55:48 Vault root: Starting Unraid API service...
Aug 2 11:55:48 Vault root: Backup file not found at '/boot/config/plugins/dynamix.my.servers/node_modules-for-v4.12.0.tar.xz'. Skipping restore.
Aug 2 11:55:52 Vault root: Starting the Unraid API
[API never completes - PM2 times out waiting for 'ready' signal]
```

## Solution

### Startup Budget Tracking

Instead of fixed timeouts per operation (which could exceed PM2's
15-second limit in aggregate), we now track a total startup budget:

- **Total budget:** 13 seconds (2 seconds before PM2's 15-second kill
timeout)
- **Bootstrap reserve:** 8 seconds reserved for NestJS bootstrap
- **Per-operation max:** 2 seconds for pre-bootstrap operations

The `StartupBudget` class dynamically calculates timeouts based on
remaining time, ensuring we never exceed PM2's limit and always provide
clear timeout messages.

### Graceful Degradation

Non-critical operations now fail gracefully with warnings instead of
crashing:
- `loadStateFiles()` - continues with default state
- `loadRegistrationKey()` - continues without registration key  
- `loadDynamixConfig()` - continues with default config
- `StateManager` - continues without file watching
- `setupRegistrationKeyWatch()` - continues without key watching

Critical operations still fail fast:
- Config directory creation
- NestJS server bootstrap

### Improved Logging

Each startup phase now logs its completion, making it easy to identify
where hangs occur:
```
Config directory ready
Emhttp state loaded
Registration key loaded
Dynamix config loaded
State manager initialized
Registration key watch active
Bootstrapping NestJS server (budget: 11234ms)...
Startup complete in 1766ms
```

## Test plan

- [x] Verify API starts normally with all startup logs visible
- [x] Verify startup completes within PM2's 15-second timeout
- [ ] Test with missing/corrupted config files to verify graceful
degradation
- [ ] Verify timeout messages appear before PM2 kills the process

---------

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Pujit Mehrotra
2025-12-08 14:59:30 -05:00
committed by GitHub
parent 23a71207dd
commit 51f025b105
6 changed files with 481 additions and 20 deletions

View File

@@ -1,5 +1,5 @@
{
"version": "4.25.3",
"version": "4.27.2",
"extraOrigins": [],
"sandbox": true,
"ssoSubIds": [],

View File

@@ -0,0 +1,231 @@
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
import { TimeoutBudget } from '@app/core/utils/misc/timeout-budget.js';
describe('TimeoutBudget', () => {
beforeEach(() => {
vi.useFakeTimers();
});
afterEach(() => {
vi.useRealTimers();
});
describe('constructor', () => {
it('initializes with the given budget', () => {
const budget = new TimeoutBudget(10000);
expect(budget.remaining()).toBe(10000);
expect(budget.elapsed()).toBe(0);
});
});
describe('remaining', () => {
it('returns full budget immediately after construction', () => {
const budget = new TimeoutBudget(5000);
expect(budget.remaining()).toBe(5000);
});
it('decreases as time passes', () => {
const budget = new TimeoutBudget(5000);
vi.advanceTimersByTime(1000);
expect(budget.remaining()).toBe(4000);
vi.advanceTimersByTime(2000);
expect(budget.remaining()).toBe(2000);
});
it('never returns negative values', () => {
const budget = new TimeoutBudget(1000);
vi.advanceTimersByTime(5000); // Well past the budget
expect(budget.remaining()).toBe(0);
});
it('returns zero when budget is exactly exhausted', () => {
const budget = new TimeoutBudget(1000);
vi.advanceTimersByTime(1000);
expect(budget.remaining()).toBe(0);
});
});
describe('elapsed', () => {
it('returns zero immediately after construction', () => {
const budget = new TimeoutBudget(5000);
expect(budget.elapsed()).toBe(0);
});
it('increases as time passes', () => {
const budget = new TimeoutBudget(5000);
vi.advanceTimersByTime(1000);
expect(budget.elapsed()).toBe(1000);
vi.advanceTimersByTime(500);
expect(budget.elapsed()).toBe(1500);
});
it('continues increasing past the budget limit', () => {
const budget = new TimeoutBudget(1000);
vi.advanceTimersByTime(2000);
expect(budget.elapsed()).toBe(2000);
});
});
describe('getTimeout', () => {
it('returns maxMs when plenty of budget remains', () => {
const budget = new TimeoutBudget(10000);
expect(budget.getTimeout(2000)).toBe(2000);
});
it('returns maxMs when budget minus reserve is sufficient', () => {
const budget = new TimeoutBudget(10000);
expect(budget.getTimeout(2000, 5000)).toBe(2000);
});
it('caps timeout to available budget minus reserve', () => {
const budget = new TimeoutBudget(10000);
vi.advanceTimersByTime(5000); // 5000ms remaining
// Want 2000ms but reserve 4000ms, only 1000ms available
expect(budget.getTimeout(2000, 4000)).toBe(1000);
});
it('caps timeout to remaining budget when no reserve', () => {
const budget = new TimeoutBudget(1000);
vi.advanceTimersByTime(800); // 200ms remaining
expect(budget.getTimeout(500)).toBe(200);
});
it('returns minimum of 100ms even when budget is exhausted', () => {
const budget = new TimeoutBudget(1000);
vi.advanceTimersByTime(2000); // Budget exhausted
expect(budget.getTimeout(500)).toBe(100);
});
it('returns minimum of 100ms when reserve exceeds remaining', () => {
const budget = new TimeoutBudget(5000);
vi.advanceTimersByTime(4000); // 1000ms remaining
// Reserve 2000ms but only 1000ms remaining
expect(budget.getTimeout(500, 2000)).toBe(100);
});
it('uses default reserve of 0 when not specified', () => {
const budget = new TimeoutBudget(1000);
vi.advanceTimersByTime(500); // 500ms remaining
expect(budget.getTimeout(1000)).toBe(500); // Capped to remaining
});
});
describe('hasTimeFor', () => {
it('returns true when enough time remains', () => {
const budget = new TimeoutBudget(5000);
expect(budget.hasTimeFor(3000)).toBe(true);
});
it('returns true when exactly enough time remains', () => {
const budget = new TimeoutBudget(5000);
expect(budget.hasTimeFor(5000)).toBe(true);
});
it('returns false when not enough time remains', () => {
const budget = new TimeoutBudget(5000);
expect(budget.hasTimeFor(6000)).toBe(false);
});
it('accounts for elapsed time', () => {
const budget = new TimeoutBudget(5000);
vi.advanceTimersByTime(3000); // 2000ms remaining
expect(budget.hasTimeFor(2000)).toBe(true);
expect(budget.hasTimeFor(3000)).toBe(false);
});
it('returns false when budget is exhausted', () => {
const budget = new TimeoutBudget(1000);
vi.advanceTimersByTime(2000);
expect(budget.hasTimeFor(1)).toBe(false);
});
it('returns true for zero required time', () => {
const budget = new TimeoutBudget(1000);
vi.advanceTimersByTime(2000); // Budget exhausted
expect(budget.hasTimeFor(0)).toBe(true);
});
});
describe('integration scenarios', () => {
it('simulates a typical startup sequence', () => {
const budget = new TimeoutBudget(13000); // 13 second budget
const BOOTSTRAP_RESERVE = 8000;
const MAX_OP_TIMEOUT = 2000;
// First operation - should get full 2000ms
const op1Timeout = budget.getTimeout(MAX_OP_TIMEOUT, BOOTSTRAP_RESERVE);
expect(op1Timeout).toBe(2000);
// Simulate operation taking 500ms
vi.advanceTimersByTime(500);
// Second operation - still have plenty of budget
const op2Timeout = budget.getTimeout(MAX_OP_TIMEOUT, BOOTSTRAP_RESERVE);
expect(op2Timeout).toBe(2000);
// Simulate operation taking 1000ms
vi.advanceTimersByTime(1000);
// Third operation
const op3Timeout = budget.getTimeout(MAX_OP_TIMEOUT, BOOTSTRAP_RESERVE);
expect(op3Timeout).toBe(2000);
// Simulate slow operation taking 2000ms
vi.advanceTimersByTime(2000);
// Now 3500ms elapsed, 9500ms remaining
// After reserve, only 1500ms available - less than max
const op4Timeout = budget.getTimeout(MAX_OP_TIMEOUT, BOOTSTRAP_RESERVE);
expect(op4Timeout).toBe(1500);
// Simulate operation completing
vi.advanceTimersByTime(1000);
// Bootstrap phase - use all remaining time
const bootstrapTimeout = budget.remaining();
expect(bootstrapTimeout).toBe(8500);
expect(budget.hasTimeFor(8000)).toBe(true);
});
it('handles worst-case scenario where all operations timeout', () => {
const budget = new TimeoutBudget(13000);
const BOOTSTRAP_RESERVE = 8000;
const MAX_OP_TIMEOUT = 2000;
// Each operation times out at its limit
// Available for operations: 13000 - 8000 = 5000ms
// Op 1: gets 2000ms, times out
budget.getTimeout(MAX_OP_TIMEOUT, BOOTSTRAP_RESERVE);
vi.advanceTimersByTime(2000);
// Op 2: gets 2000ms, times out
budget.getTimeout(MAX_OP_TIMEOUT, BOOTSTRAP_RESERVE);
vi.advanceTimersByTime(2000);
// Op 3: only 1000ms available (5000 - 4000), times out
const op3Timeout = budget.getTimeout(MAX_OP_TIMEOUT, BOOTSTRAP_RESERVE);
expect(op3Timeout).toBe(1000);
vi.advanceTimersByTime(1000);
// Bootstrap: should still have 8000ms
expect(budget.remaining()).toBe(8000);
});
});
});

View File

@@ -0,0 +1,65 @@
import { describe, expect, it } from 'vitest';
import { withTimeout } from '@app/core/utils/misc/with-timeout.js';
describe('withTimeout', () => {
it('resolves when promise completes before timeout', async () => {
const promise = Promise.resolve('success');
const result = await withTimeout(promise, 1000, 'testOp');
expect(result).toBe('success');
});
it('resolves with correct value for delayed promise within timeout', async () => {
const promise = new Promise<number>((resolve) => setTimeout(() => resolve(42), 50));
const result = await withTimeout(promise, 1000, 'testOp');
expect(result).toBe(42);
});
it('rejects when promise takes longer than timeout', async () => {
const promise = new Promise<string>((resolve) => setTimeout(() => resolve('late'), 500));
await expect(withTimeout(promise, 50, 'slowOp')).rejects.toThrow('slowOp timed out after 50ms');
});
it('includes operation name in timeout error message', async () => {
const promise = new Promise<void>(() => {}); // Never resolves
await expect(withTimeout(promise, 10, 'myCustomOperation')).rejects.toThrow(
'myCustomOperation timed out after 10ms'
);
});
it('propagates rejection from the original promise', async () => {
const promise = Promise.reject(new Error('original error'));
await expect(withTimeout(promise, 1000, 'testOp')).rejects.toThrow('original error');
});
it('resolves immediately for already-resolved promises', async () => {
const promise = Promise.resolve('immediate');
const start = Date.now();
const result = await withTimeout(promise, 1000, 'testOp');
const elapsed = Date.now() - start;
expect(result).toBe('immediate');
expect(elapsed).toBeLessThan(50); // Should be nearly instant
});
it('works with zero timeout (immediately times out for pending promises)', async () => {
const promise = new Promise<void>(() => {}); // Never resolves
await expect(withTimeout(promise, 0, 'zeroTimeout')).rejects.toThrow(
'zeroTimeout timed out after 0ms'
);
});
it('preserves the type of the resolved value', async () => {
interface TestType {
id: number;
name: string;
}
const testObj: TestType = { id: 1, name: 'test' };
const promise = Promise.resolve(testObj);
const result = await withTimeout(promise, 1000, 'testOp');
expect(result.id).toBe(1);
expect(result.name).toBe('test');
});
});

View File

@@ -0,0 +1,70 @@
/**
* Tracks remaining time budget to ensure we don't exceed external timeouts (e.g., PM2's listen_timeout).
*
* This class helps coordinate multiple async operations by:
* - Tracking elapsed time from construction
* - Calculating dynamic timeouts based on remaining budget
* - Reserving time for critical operations (like server bootstrap)
*
* @example
* ```typescript
* const budget = new TimeoutBudget(15000); // 15 second total budget
*
* // Each operation gets a timeout capped by remaining budget
* await withTimeout(loadConfig(), budget.getTimeout(2000, 8000), 'loadConfig');
* await withTimeout(loadState(), budget.getTimeout(2000, 8000), 'loadState');
*
* // Bootstrap gets all remaining time
* await withTimeout(bootstrap(), budget.remaining(), 'bootstrap');
*
* console.log(`Completed in ${budget.elapsed()}ms`);
* ```
*/
export class TimeoutBudget {
private startTime: number;
private budgetMs: number;
/**
* Creates a new startup budget tracker.
* @param budgetMs Total time budget in milliseconds
*/
constructor(budgetMs: number) {
this.startTime = Date.now();
this.budgetMs = budgetMs;
}
/**
* Returns remaining time in milliseconds.
* Never returns negative values.
*/
remaining(): number {
return Math.max(0, this.budgetMs - (Date.now() - this.startTime));
}
/**
* Returns elapsed time in milliseconds since construction.
*/
elapsed(): number {
return Date.now() - this.startTime;
}
/**
* Returns timeout for an operation, capped by remaining budget.
*
* @param maxMs Maximum timeout for this operation
* @param reserveMs Time to reserve for future operations (e.g., server bootstrap)
* @returns Timeout in milliseconds (minimum 100ms to avoid instant failures)
*/
getTimeout(maxMs: number, reserveMs: number = 0): number {
const available = this.remaining() - reserveMs;
return Math.max(100, Math.min(maxMs, available));
}
/**
* Checks if there's enough time remaining for an operation.
* @param requiredMs Time required in milliseconds
*/
hasTimeFor(requiredMs: number): boolean {
return this.remaining() >= requiredMs;
}
}

View File

@@ -0,0 +1,25 @@
/**
* Wraps a promise with a timeout to prevent hangs.
* If the operation takes longer than timeoutMs, it rejects with a timeout error.
*
* @param promise The promise to wrap with a timeout
* @param timeoutMs Maximum time in milliseconds before timing out
* @param operationName Name of the operation for the error message
* @returns The result of the promise if it completes in time
* @throws Error if the operation times out
*/
export const withTimeout = <T>(
promise: Promise<T>,
timeoutMs: number,
operationName: string
): Promise<T> => {
return Promise.race([
promise,
new Promise<never>((_, reject) =>
setTimeout(
() => reject(new Error(`${operationName} timed out after ${timeoutMs}ms`)),
timeoutMs
)
),
]);
};

View File

@@ -4,7 +4,7 @@ import '@app/dotenv.js';
import { type NestFastifyApplication } from '@nestjs/platform-fastify';
import { unlinkSync } from 'fs';
import { mkdir, readFile } from 'fs/promises';
import { mkdir } from 'fs/promises';
import http from 'http';
import https from 'https';
@@ -15,6 +15,8 @@ import { WebSocket } from 'ws';
import { logger } from '@app/core/log.js';
import { fileExistsSync } from '@app/core/utils/files/file-exists.js';
import { TimeoutBudget } from '@app/core/utils/misc/timeout-budget.js';
import { withTimeout } from '@app/core/utils/misc/with-timeout.js';
import { getServerIdentifier } from '@app/core/utils/server-identifier.js';
import { environment, PATHS_CONFIG_MODULES, PORT } from '@app/environment.js';
import * as envVars from '@app/environment.js';
@@ -28,13 +30,23 @@ import { StateManager } from '@app/store/watch/state-watch.js';
let server: NestFastifyApplication<RawServerDefault> | null = null;
// PM2 listen_timeout is 15 seconds (ecosystem.config.json)
// We use 13 seconds as our total budget to ensure our timeout triggers before PM2 kills us
const TOTAL_STARTUP_BUDGET_MS = 13_000;
// Reserve time for the NestJS bootstrap (the most critical and time-consuming operation)
const BOOTSTRAP_RESERVED_MS = 8_000;
// Maximum time for any single pre-bootstrap operation
const MAX_OPERATION_TIMEOUT_MS = 2_000;
const unlinkUnixPort = () => {
if (isNaN(parseInt(PORT, 10))) {
if (fileExistsSync(PORT)) unlinkSync(PORT);
}
};
export const viteNodeApp = async () => {
export const viteNodeApp = async (): Promise<NestFastifyApplication<RawServerDefault>> => {
const budget = new TimeoutBudget(TOTAL_STARTUP_BUDGET_MS);
try {
await import('json-bigint-patch');
environment.IS_MAIN_PROCESS = true;
@@ -42,15 +54,15 @@ export const viteNodeApp = async () => {
/**------------------------------------------------------------------------
* Attaching getServerIdentifier to globalThis
* getServerIdentifier is tightly coupled to the deprecated redux store,
* getServerIdentifier is tightly coupled to the deprecated redux store,
* which we don't want to share with other packages or plugins.
*
*
* At the same time, we need to use it in @unraid/shared as a building block,
* where it's used & available outside of NestJS's DI context.
*
* Attaching to globalThis is a temporary solution to avoid refactoring
*
* Attaching to globalThis is a temporary solution to avoid refactoring
* config sync & management outside of NestJS's DI context.
*
*
* Plugin authors should import getServerIdentifier from @unraid/shared instead,
* to avoid breaking changes to their code.
*------------------------------------------------------------------------**/
@@ -58,7 +70,18 @@ export const viteNodeApp = async () => {
logger.info('ENV %o', envVars);
logger.info('PATHS %o', store.getState().paths);
await mkdir(PATHS_CONFIG_MODULES, { recursive: true });
// Note: we use logger.info for checkpoints instead of a lower log level
// to ensure emission during an unraid server's boot,
// where the log level will be set to INFO by default.
// Create config directory
try {
await mkdir(PATHS_CONFIG_MODULES, { recursive: true });
logger.info('Config directory ready');
} catch (error) {
logger.error(error, 'Failed to create config directory');
throw error;
}
const cacheable = new CacheableLookup();
@@ -68,29 +91,73 @@ export const viteNodeApp = async () => {
cacheable.install(https.globalAgent);
// Load emhttp state into store
await store.dispatch(loadStateFiles());
try {
const timeout = budget.getTimeout(MAX_OPERATION_TIMEOUT_MS, BOOTSTRAP_RESERVED_MS);
await withTimeout(store.dispatch(loadStateFiles()), timeout, 'loadStateFiles');
logger.info('Emhttp state loaded');
} catch (error) {
logger.error(error, 'Failed to load emhttp state files');
logger.warn('Continuing with default state');
}
// Load initial registration key into store
await store.dispatch(loadRegistrationKey());
try {
const timeout = budget.getTimeout(MAX_OPERATION_TIMEOUT_MS, BOOTSTRAP_RESERVED_MS);
await withTimeout(store.dispatch(loadRegistrationKey()), timeout, 'loadRegistrationKey');
logger.info('Registration key loaded');
} catch (error) {
logger.error(error, 'Failed to load registration key');
logger.warn('Continuing without registration key');
}
// Load my dynamix config file into store
loadDynamixConfig();
try {
loadDynamixConfig();
logger.info('Dynamix config loaded');
} catch (error) {
logger.error(error, 'Failed to load dynamix config');
logger.warn('Continuing with default dynamix config');
}
// Start listening to file updates
StateManager.getInstance();
try {
StateManager.getInstance();
logger.info('State manager initialized');
} catch (error) {
logger.error(error, 'Failed to initialize state manager');
logger.warn('Continuing without state watching');
}
// Start listening to key file changes
setupRegistrationKeyWatch();
try {
setupRegistrationKeyWatch();
logger.info('Registration key watch active');
} catch (error) {
logger.error(error, 'Failed to setup registration key watch');
logger.warn('Continuing without key file watching');
}
// If port is unix socket, delete old socket before starting http server
unlinkUnixPort();
startMiddlewareListeners();
// Start webserver
const { bootstrapNestServer } = await import('@app/unraid-api/main.js');
server = await bootstrapNestServer();
// Start webserver - use all remaining budget
try {
const bootstrapTimeout = budget.remaining();
if (bootstrapTimeout < 1000) {
logger.warn(
`Insufficient startup budget remaining (${bootstrapTimeout}ms) for NestJS bootstrap`
);
}
logger.info('Bootstrapping NestJS server (budget: %dms)...', bootstrapTimeout);
const { bootstrapNestServer } = await import('@app/unraid-api/main.js');
server = await withTimeout(bootstrapNestServer(), bootstrapTimeout, 'bootstrapNestServer');
logger.info('Startup complete in %dms', budget.elapsed());
} catch (error) {
logger.error(error, 'Failed to start NestJS server');
throw error; // This is critical - must rethrow to trigger graceful exit
}
asyncExitHook(
async (signal) => {
@@ -103,8 +170,10 @@ export const viteNodeApp = async () => {
gracefulExit();
},
{ wait: 9999 }
{ wait: 10_000 }
);
return server;
} catch (error: unknown) {
if (error instanceof Error) {
logger.error(error, 'API-ERROR');
@@ -115,8 +184,9 @@ export const viteNodeApp = async () => {
await server?.close?.();
}
shutdownApiEvent();
// Kill application
// Kill application - gracefulExit calls process.exit but TS doesn't know it never returns
gracefulExit(1);
throw new Error('Unreachable');
}
};