refactor handleIncidents

This commit is contained in:
Alex Holliday
2026-02-15 18:20:42 +00:00
parent 7061eb8eb0
commit 8020bfde21
3 changed files with 46 additions and 101 deletions
@@ -120,17 +120,15 @@ class SuperSimpleQueueHelper {
// Step 6. Handle notifications (best effort, continue even in event of failure, don't wait)
if (decision.shouldSendNotification) {
this.notificationsService
.handleNotifications(statusChangeResult.monitor, status, statusChangeResult.prevStatus, statusChangeResult.statusChanged)
.catch((error: any) => {
this.logger.error({
message: error.message,
service: SERVICE_NAME,
method: "getMonitorJob",
details: `Error sending notifications for job ${statusChangeResult.monitor.id}: ${error.message}`,
stack: error.stack,
});
this.notificationsService.handleNotifications(statusChangeResult.monitor, status, decision).catch((error: any) => {
this.logger.error({
message: error.message,
service: SERVICE_NAME,
method: "getMonitorJob",
details: `Error sending notifications for job ${statusChangeResult.monitor.id}: ${error.message}`,
stack: error.stack,
});
});
}
// Step 7. Handle incidents (best effort, don't wait)
@@ -27,15 +27,22 @@ export const buildHardwareAlerts = (
const cpuThreshold = monitor.cpuAlertThreshold !== undefined ? monitor.cpuAlertThreshold / 100 : -1;
const memoryThreshold = monitor.memoryAlertThreshold !== undefined ? monitor.memoryAlertThreshold / 100 : -1;
const diskThreshold = monitor.diskAlertThreshold !== undefined ? monitor.diskAlertThreshold / 100 : -1;
const tempThreshold = monitor.tempAlertThreshold !== undefined ? monitor.tempAlertThreshold : -1;
const payload = networkResponse?.payload as HardwareStatusPayload;
const metrics = payload.data || {};
const { cpu: { usage_percent: cpuUsage = -1 } = {}, memory: { usage_percent: memoryUsage = -1 } = {}, disk = [] } = metrics;
const { cpu = {}, memory = {}, disk = [] } = metrics;
const cpuUsage = cpu.usage_percent ?? -1;
const memoryUsage = memory.usage_percent ?? -1;
// Get max temperature from CPU temperature sensors array
const temps = cpu.temperature ?? [];
const maxTemp = temps.length > 0 ? Math.max(...temps) : -1;
const alerts: Record<string, boolean> = {
cpu: cpuThreshold !== -1 && cpuUsage > cpuThreshold ? true : false,
memory: memoryThreshold !== -1 && memoryUsage > memoryThreshold ? true : false,
disk: disk?.some((d) => diskThreshold !== -1 && typeof d?.usage_percent === "number" && d?.usage_percent > diskThreshold) ?? false,
temp: tempThreshold !== -1 && maxTemp > tempThreshold ? true : false,
};
const alertsToSend = [];
@@ -45,6 +52,7 @@ export const buildHardwareAlerts = (
{ name: "URL", value: monitor.url, inline: false },
];
const goToIncidentField = { name: `Go to incident`, value: `${clientHost}/infrastructure/${monitor.id}` };
const formatDiscordAlert = {
cpu: () => ({
title: "CPU alert",
@@ -77,38 +85,33 @@ export const buildHardwareAlerts = (
goToIncidentField,
],
}),
temp: () => ({
title: "Temperature alert",
description: `Your current temperature (${maxTemp.toFixed(0)}°C) is above your threshold (${tempThreshold.toFixed(0)}°C)`,
color: 15548997,
fields: [...monitorInfoFields, goToIncidentField],
footer: { text: "Checkmate" },
}),
};
const alertTypes = ["cpu", "memory", "disk"] as const;
const alertThresholdKeyMap: Record<(typeof alertTypes)[number], "cpuAlertThreshold" | "memoryAlertThreshold" | "diskAlertThreshold"> = {
cpu: "cpuAlertThreshold",
memory: "memoryAlertThreshold",
disk: "diskAlertThreshold",
};
const alertTypes = ["cpu", "memory", "disk", "temp"] as const;
for (const type of alertTypes) {
const thresholdKey = alertThresholdKeyMap[type];
// Iterate over each alert type to see if any need to be decremented
if (alerts[type] === true) {
const nextValue = monitor[thresholdKey] - 1;
monitor[thresholdKey] = nextValue; // Decrement threshold if an alert is triggered
if (monitor[thresholdKey] <= 0) {
// If threshold drops below 0, reset and send notification
monitor[thresholdKey] = 100;
const formatAlert = {
cpu: () => `Your current CPU usage (${(cpuUsage * 100).toFixed(0)}%) is above your threshold (${(cpuThreshold * 100).toFixed(0)}%)`,
memory: () =>
`Your current memory usage (${(memoryUsage * 100).toFixed(0)}%) is above your threshold (${(memoryThreshold * 100).toFixed(0)}%)`,
disk: () =>
`Your current disk usage: ${disk
.map((d, idx) => `(Disk${idx}: ${(d?.usage_percent ?? 0 * 100).toFixed(0)}%)`)
.join(", ")} is above your threshold (${(diskThreshold * 100).toFixed(0)}%)`,
};
alertsToSend.push(formatAlert[type]());
discordEmbeds.push(formatDiscordAlert[type]());
}
// Plain-text alert message builders, keyed by alert type.
// cpu/memory/disk usage values and thresholds are fractions here (the thresholds above are
// divided by 100), so they are multiplied by 100 for display. Temperature is printed as-is in °C.
const formatAlert = {
	cpu: () => `Your current CPU usage (${(cpuUsage * 100).toFixed(0)}%) is above your threshold (${(cpuThreshold * 100).toFixed(0)}%)`,
	memory: () =>
		`Your current memory usage (${(memoryUsage * 100).toFixed(0)}%) is above your threshold (${(memoryThreshold * 100).toFixed(0)}%)`,
	disk: () => {
		// Parenthesise the fallback before scaling: `a ?? 0 * 100` parses as `a ?? (0 * 100)`,
		// which would print the raw fraction instead of a percentage.
		const perDisk = disk.map((d, idx) => `(Disk${idx}: ${((d?.usage_percent ?? 0) * 100).toFixed(0)}%)`).join(", ");
		return `Your current disk usage: ${perDisk} is above your threshold (${(diskThreshold * 100).toFixed(0)}%)`;
	},
	temp: () => `Your current temperature (${maxTemp.toFixed(0)}°C) is above your threshold (${tempThreshold.toFixed(0)}°C)`,
};
alertsToSend.push(formatAlert[type]());
discordEmbeds.push(formatDiscordAlert[type]());
}
}
const discordPayload = discordEmbeds.length ? { embeds: discordEmbeds } : null;
return { alertsToSend, discordPayload };
};
@@ -125,34 +128,6 @@ export const buildHardwareWebhookBody = (alerts: string[], monitor: Monitor): st
return content;
};
/**
 * Decides whether a hardware monitor's latest metrics should trigger an alert.
 *
 * For each of CPU, memory and disk, an alert is due only when BOTH hold:
 *  - the reported usage is strictly above the configured threshold, and
 *  - the stored threshold value minus one is <= 0.
 * NOTE(review): the second condition looks like it pairs with a countdown that decrements the
 * stored threshold elsewhere — confirm against the caller before relying on it.
 *
 * Temperature is not evaluated here.
 *
 * @param monitor - Monitor whose cpu/memory/disk alert thresholds are read.
 * @param networkResponse - Latest status response; a missing response, payload or `data` is
 *   treated as "no metrics" and yields `false`.
 * @returns `true` if any of the three resources satisfies both conditions, otherwise `false`.
 */
export const shouldSendHardwareAlert = (monitor: Monitor, networkResponse: MonitorStatusResponse): boolean => {
	// Thresholds are stored as percentages (0-100), convert to decimal (0-1) for comparison.
	// -1 is the "threshold not configured" sentinel.
	const cpuThreshold = monitor.cpuAlertThreshold !== undefined ? monitor.cpuAlertThreshold / 100 : -1;
	const memoryThreshold = monitor.memoryAlertThreshold !== undefined ? monitor.memoryAlertThreshold / 100 : -1;
	const diskThreshold = monitor.diskAlertThreshold !== undefined ? monitor.diskAlertThreshold / 100 : -1;

	// The payload may be absent (the response is already read with `?.`), so guard the `data`
	// access as well instead of throwing a TypeError.
	const payload = networkResponse?.payload as HardwareStatusPayload | undefined;
	const metrics = payload?.data || {};
	// Missing usage values default to -1 so they can never exceed a configured threshold.
	const { cpu: { usage_percent: cpuUsage = -1 } = {}, memory: { usage_percent: memoryUsage = -1 } = {}, disk = [] } = metrics;

	const cpuBreach = cpuThreshold !== -1 && cpuUsage > cpuThreshold;
	if (cpuBreach && monitor.cpuAlertThreshold - 1 <= 0) {
		return true;
	}

	const memoryBreach = memoryThreshold !== -1 && memoryUsage > memoryThreshold;
	if (memoryBreach && monitor.memoryAlertThreshold - 1 <= 0) {
		return true;
	}

	// Any single disk above the threshold counts as a breach; non-numeric readings are ignored.
	const diskBreach = disk?.some((d) => diskThreshold !== -1 && typeof d?.usage_percent === "number" && d?.usage_percent > diskThreshold);
	if (diskBreach && monitor.diskAlertThreshold - 1 <= 0) {
		return true;
	}

	return false;
};
export const buildWebhookBody = (monitor: Monitor, monitorStatusResponse: MonitorStatusResponse) => {
const { status, code } = monitorStatusResponse;
const { localTimeZone, localTime, utcTime } = getTime();
@@ -1,7 +1,7 @@
import type { HardwareStatusPayload, Monitor, MonitorStatusResponse, Notification, MonitorStatus } from "@/types/index.js";
import { shouldSendHardwareAlert } from "@/service/infrastructure/notificationProviders/utils.js";
import { IMonitorsRepository, INotificationsRepository } from "@/repositories/index.js";
import { INotificationProvider } from "./notificationProviders/INotificationProvider.js";
import type { MonitorActionDecision } from "@/service/infrastructure/SuperSimpleQueue/SuperSimpleQueueHelper.js";
export interface INotificationsService {
createNotification: (notificationData: Partial<Notification>) => Promise<Notification>;
@@ -9,12 +9,7 @@ export interface INotificationsService {
findNotificationsByTeamId: (teamId: string) => Promise<Notification[]>;
updateById(id: string, teamId: string, updateData: Partial<Notification>): Promise<Notification>;
deleteById: (id: string, teamId: string) => Promise<Notification>;
handleNotifications: (
monitor: Monitor,
monitorStatusResponse: MonitorStatusResponse,
prevStatus: MonitorStatus,
statusChanged: boolean
) => Promise<boolean>;
handleNotifications: (monitor: Monitor, monitorStatusResponse: MonitorStatusResponse, decision: MonitorActionDecision) => Promise<boolean>;
sendTestNotification: (notification: Notification) => Promise<boolean>;
testAllNotifications: (notificationIds: string[]) => Promise<boolean>;
@@ -229,36 +224,13 @@ export class NotificationsService implements INotificationsService {
return await this.emailProvider.sendAlert(notification, syntheticMonitor, baseStatus);
};
handleNotifications = async (monitor: Monitor, monitorStatusResponse: MonitorStatusResponse, prevStatus: MonitorStatus, statusChanged: boolean) => {
const { type } = monitor;
const payload = monitorStatusResponse.payload as HardwareStatusPayload;
// If this is a non-hardware type monitor and status did not change, we're done
if (type !== "hardware" && statusChanged === false) return false;
// if prevStatus is undefined, monitor is resuming, we're done
if (type !== "hardware" && prevStatus === undefined) return false;
// Deal with hardware thresholds
if (type === "hardware") {
// Check if any thresholds are set
const hasThresholds =
monitor.cpuAlertThreshold !== undefined ||
monitor.memoryAlertThreshold !== undefined ||
monitor.diskAlertThreshold !== undefined ||
monitor.tempAlertThreshold !== undefined;
if (!hasThresholds) return false; // No thresholds set, we're done
const metrics = payload?.data ?? null;
if (metrics === null) return false; // No metrics, we're done
// We should send a notification
const shouldSend = shouldSendHardwareAlert(monitor, monitorStatusResponse);
if (shouldSend === false) return false;
return await this.sendNotifications(monitor, monitorStatusResponse);
/**
 * Sends notifications for a monitor when the supplied decision asks for it.
 *
 * @param monitor - Monitor the notifications are about; forwarded to `sendNotifications`.
 * @param monitorStatusResponse - Latest status response; forwarded to `sendNotifications`.
 * @param decision - Action decision; only `shouldSendNotification` is read here.
 * @returns `false` when the decision does not request a notification, otherwise the result
 *   of `sendNotifications`.
 */
handleNotifications = async (monitor: Monitor, monitorStatusResponse: MonitorStatusResponse, decision: MonitorActionDecision) => {
	const notificationRequested = decision.shouldSendNotification;
	if (notificationRequested) {
		// The decision asks for a notification: hand off to the sender
		return await this.sendNotifications(monitor, monitorStatusResponse);
	}
	// Nothing requested, nothing sent
	return false;
};