mirror of
https://github.com/bluewave-labs/Checkmate.git
synced 2026-05-24 11:59:39 -05:00
refactor handleIncidents
This commit is contained in:
@@ -120,17 +120,15 @@ class SuperSimpleQueueHelper {
|
||||
|
||||
// Step 6. Handle notifications (best effort, continue even in event of failure, don't wait)
|
||||
if (decision.shouldSendNotification) {
|
||||
this.notificationsService
|
||||
.handleNotifications(statusChangeResult.monitor, status, statusChangeResult.prevStatus, statusChangeResult.statusChanged)
|
||||
.catch((error: any) => {
|
||||
this.logger.error({
|
||||
message: error.message,
|
||||
service: SERVICE_NAME,
|
||||
method: "getMonitorJob",
|
||||
details: `Error sending notifications for job ${statusChangeResult.monitor.id}: ${error.message}`,
|
||||
stack: error.stack,
|
||||
});
|
||||
this.notificationsService.handleNotifications(statusChangeResult.monitor, status, decision).catch((error: any) => {
|
||||
this.logger.error({
|
||||
message: error.message,
|
||||
service: SERVICE_NAME,
|
||||
method: "getMonitorJob",
|
||||
details: `Error sending notifications for job ${statusChangeResult.monitor.id}: ${error.message}`,
|
||||
stack: error.stack,
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
// Step 7. Handle incidents (best effort, don't wait)
|
||||
|
||||
@@ -27,15 +27,22 @@ export const buildHardwareAlerts = (
|
||||
const cpuThreshold = monitor.cpuAlertThreshold !== undefined ? monitor.cpuAlertThreshold / 100 : -1;
|
||||
const memoryThreshold = monitor.memoryAlertThreshold !== undefined ? monitor.memoryAlertThreshold / 100 : -1;
|
||||
const diskThreshold = monitor.diskAlertThreshold !== undefined ? monitor.diskAlertThreshold / 100 : -1;
|
||||
const tempThreshold = monitor.tempAlertThreshold !== undefined ? monitor.tempAlertThreshold : -1;
|
||||
|
||||
const payload = networkResponse?.payload as HardwareStatusPayload;
|
||||
const metrics = payload.data || {};
|
||||
const { cpu: { usage_percent: cpuUsage = -1 } = {}, memory: { usage_percent: memoryUsage = -1 } = {}, disk = [] } = metrics;
|
||||
const { cpu = {}, memory = {}, disk = [] } = metrics;
|
||||
const cpuUsage = cpu.usage_percent ?? -1;
|
||||
const memoryUsage = memory.usage_percent ?? -1;
|
||||
// Get max temperature from CPU temperature sensors array
|
||||
const temps = cpu.temperature ?? [];
|
||||
const maxTemp = temps.length > 0 ? Math.max(...temps) : -1;
|
||||
|
||||
const alerts: Record<string, boolean> = {
|
||||
cpu: cpuThreshold !== -1 && cpuUsage > cpuThreshold ? true : false,
|
||||
memory: memoryThreshold !== -1 && memoryUsage > memoryThreshold ? true : false,
|
||||
disk: disk?.some((d) => diskThreshold !== -1 && typeof d?.usage_percent === "number" && d?.usage_percent > diskThreshold) ?? false,
|
||||
temp: tempThreshold !== -1 && maxTemp > tempThreshold ? true : false,
|
||||
};
|
||||
|
||||
const alertsToSend = [];
|
||||
@@ -45,6 +52,7 @@ export const buildHardwareAlerts = (
|
||||
{ name: "URL", value: monitor.url, inline: false },
|
||||
];
|
||||
const goToIncidentField = { name: `Go to incident`, value: `${clientHost}/infrastructure/${monitor.id}` };
|
||||
|
||||
const formatDiscordAlert = {
|
||||
cpu: () => ({
|
||||
title: "CPU alert",
|
||||
@@ -77,38 +85,33 @@ export const buildHardwareAlerts = (
|
||||
goToIncidentField,
|
||||
],
|
||||
}),
|
||||
|
||||
temp: () => ({
|
||||
title: "Temperature alert",
|
||||
description: `Your current temperature (${maxTemp.toFixed(0)}°C) is above your threshold (${tempThreshold.toFixed(0)}°C)`,
|
||||
color: 15548997,
|
||||
fields: [...monitorInfoFields, goToIncidentField],
|
||||
footer: { text: "Checkmate" },
|
||||
}),
|
||||
};
|
||||
const alertTypes = ["cpu", "memory", "disk"] as const;
|
||||
const alertThresholdKeyMap: Record<(typeof alertTypes)[number], "cpuAlertThreshold" | "memoryAlertThreshold" | "diskAlertThreshold"> = {
|
||||
cpu: "cpuAlertThreshold",
|
||||
memory: "memoryAlertThreshold",
|
||||
disk: "diskAlertThreshold",
|
||||
};
|
||||
|
||||
const alertTypes = ["cpu", "memory", "disk", "temp"] as const;
|
||||
|
||||
for (const type of alertTypes) {
|
||||
const thresholdKey = alertThresholdKeyMap[type];
|
||||
// Iterate over each alert type to see if any need to be decremented
|
||||
if (alerts[type] === true) {
|
||||
const nextValue = monitor[thresholdKey] - 1;
|
||||
monitor[thresholdKey] = nextValue; // Decrement threshold if an alert is triggered
|
||||
|
||||
if (monitor[thresholdKey] <= 0) {
|
||||
// If threshold drops to 0 or below, reset and send notification
|
||||
monitor[thresholdKey] = 100;
|
||||
|
||||
// Plain-text alert messages, one formatter per alert type.
// CPU and memory usage are multiplied by 100 for display; disk usage is scaled the same way.
const formatAlert = {
	cpu: () => `Your current CPU usage (${(cpuUsage * 100).toFixed(0)}%) is above your threshold (${(cpuThreshold * 100).toFixed(0)}%)`,
	memory: () =>
		`Your current memory usage (${(memoryUsage * 100).toFixed(0)}%) is above your threshold (${(memoryThreshold * 100).toFixed(0)}%)`,
	// The fallback must be parenthesized: `*` binds tighter than `??`, so
	// `usage_percent ?? 0 * 100` evaluated as `usage_percent ?? 0` and never scaled the value.
	disk: () =>
		`Your current disk usage: ${disk
			.map((d, idx) => `(Disk${idx}: ${((d?.usage_percent ?? 0) * 100).toFixed(0)}%)`)
			.join(", ")} is above your threshold (${(diskThreshold * 100).toFixed(0)}%)`,
};
|
||||
alertsToSend.push(formatAlert[type]());
|
||||
discordEmbeds.push(formatDiscordAlert[type]());
|
||||
}
|
||||
// Plain-text alert messages, one formatter per alert type.
// CPU and memory usage are multiplied by 100 for display; disk usage is scaled the same way.
// Temperature values are printed as-is in °C.
const formatAlert = {
	cpu: () => `Your current CPU usage (${(cpuUsage * 100).toFixed(0)}%) is above your threshold (${(cpuThreshold * 100).toFixed(0)}%)`,
	memory: () =>
		`Your current memory usage (${(memoryUsage * 100).toFixed(0)}%) is above your threshold (${(memoryThreshold * 100).toFixed(0)}%)`,
	// The fallback must be parenthesized: `*` binds tighter than `??`, so
	// `usage_percent ?? 0 * 100` evaluated as `usage_percent ?? 0` and never scaled the value.
	disk: () =>
		`Your current disk usage: ${disk.map((d, idx) => `(Disk${idx}: ${((d?.usage_percent ?? 0) * 100).toFixed(0)}%)`).join(", ")} is above your threshold (${(diskThreshold * 100).toFixed(0)}%)`,
	temp: () => `Your current temperature (${maxTemp.toFixed(0)}°C) is above your threshold (${tempThreshold.toFixed(0)}°C)`,
};
|
||||
alertsToSend.push(formatAlert[type]());
|
||||
discordEmbeds.push(formatDiscordAlert[type]());
|
||||
}
|
||||
}
|
||||
|
||||
const discordPayload = discordEmbeds.length ? { embeds: discordEmbeds } : null;
|
||||
return { alertsToSend, discordPayload };
|
||||
};
|
||||
@@ -125,34 +128,6 @@ export const buildHardwareWebhookBody = (alerts: string[], monitor: Monitor): st
|
||||
return content;
|
||||
};
|
||||
|
||||
/**
 * Reports whether a hardware check result should trigger an alert notification.
 *
 * A metric (CPU, memory, or any disk) qualifies when its usage exceeds the
 * configured threshold AND the stored threshold value is at most 1
 * (`threshold - 1 <= 0`). Metrics whose threshold is unset are skipped.
 *
 * @param monitor - Monitor whose `cpuAlertThreshold`, `memoryAlertThreshold` and
 *   `diskAlertThreshold` are read; the monitor is not modified here.
 * @param networkResponse - Check result; `payload.data` supplies the metrics.
 *   A missing response, payload or data is treated as "no metrics reported".
 * @returns `true` if at least one metric qualifies, otherwise `false`.
 */
export const shouldSendHardwareAlert = (monitor: Monitor, networkResponse: MonitorStatusResponse): boolean => {
	// Thresholds are stored as percentages (0-100), convert to decimal (0-1) for comparison
	const cpuThreshold = monitor.cpuAlertThreshold !== undefined ? monitor.cpuAlertThreshold / 100 : -1;
	const memoryThreshold = monitor.memoryAlertThreshold !== undefined ? monitor.memoryAlertThreshold / 100 : -1;
	const diskThreshold = monitor.diskAlertThreshold !== undefined ? monitor.diskAlertThreshold / 100 : -1;

	// The payload may be absent; guard the dereference so a response without
	// metrics yields "no alert" instead of throwing a TypeError.
	const payload = networkResponse?.payload as HardwareStatusPayload | undefined;
	const metrics = payload?.data || {};
	// Missing usage values default to -1 so they can never exceed a real threshold.
	const { cpu: { usage_percent: cpuUsage = -1 } = {}, memory: { usage_percent: memoryUsage = -1 } = {}, disk = [] } = metrics;

	const cpuBreach = cpuThreshold !== -1 && cpuUsage > cpuThreshold;
	if (cpuBreach && monitor.cpuAlertThreshold - 1 <= 0) {
		return true;
	}

	const memoryBreach = memoryThreshold !== -1 && memoryUsage > memoryThreshold;
	if (memoryBreach && monitor.memoryAlertThreshold - 1 <= 0) {
		return true;
	}

	// Any single disk over the threshold counts as a disk breach.
	const diskBreach = disk?.some((d) => diskThreshold !== -1 && typeof d?.usage_percent === "number" && d?.usage_percent > diskThreshold);
	if (diskBreach && monitor.diskAlertThreshold - 1 <= 0) {
		return true;
	}

	return false;
};
|
||||
|
||||
export const buildWebhookBody = (monitor: Monitor, monitorStatusResponse: MonitorStatusResponse) => {
|
||||
const { status, code } = monitorStatusResponse;
|
||||
const { localTimeZone, localTime, utcTime } = getTime();
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import type { HardwareStatusPayload, Monitor, MonitorStatusResponse, Notification, MonitorStatus } from "@/types/index.js";
|
||||
import { shouldSendHardwareAlert } from "@/service/infrastructure/notificationProviders/utils.js";
|
||||
import { IMonitorsRepository, INotificationsRepository } from "@/repositories/index.js";
|
||||
import { INotificationProvider } from "./notificationProviders/INotificationProvider.js";
|
||||
import type { MonitorActionDecision } from "@/service/infrastructure/SuperSimpleQueue/SuperSimpleQueueHelper.js";
|
||||
|
||||
export interface INotificationsService {
|
||||
createNotification: (notificationData: Partial<Notification>) => Promise<Notification>;
|
||||
@@ -9,12 +9,7 @@ export interface INotificationsService {
|
||||
findNotificationsByTeamId: (teamId: string) => Promise<Notification[]>;
|
||||
updateById(id: string, teamId: string, updateData: Partial<Notification>): Promise<Notification>;
|
||||
deleteById: (id: string, teamId: string) => Promise<Notification>;
|
||||
handleNotifications: (
|
||||
monitor: Monitor,
|
||||
monitorStatusResponse: MonitorStatusResponse,
|
||||
prevStatus: MonitorStatus,
|
||||
statusChanged: boolean
|
||||
) => Promise<boolean>;
|
||||
handleNotifications: (monitor: Monitor, monitorStatusResponse: MonitorStatusResponse, decision: MonitorActionDecision) => Promise<boolean>;
|
||||
|
||||
sendTestNotification: (notification: Notification) => Promise<boolean>;
|
||||
testAllNotifications: (notificationIds: string[]) => Promise<boolean>;
|
||||
@@ -229,36 +224,13 @@ export class NotificationsService implements INotificationsService {
|
||||
return await this.emailProvider.sendAlert(notification, syntheticMonitor, baseStatus);
|
||||
};
|
||||
|
||||
handleNotifications = async (monitor: Monitor, monitorStatusResponse: MonitorStatusResponse, prevStatus: MonitorStatus, statusChanged: boolean) => {
|
||||
const { type } = monitor;
|
||||
const payload = monitorStatusResponse.payload as HardwareStatusPayload;
|
||||
// If this is a non-hardware type monitor and status did not change, we're done
|
||||
if (type !== "hardware" && statusChanged === false) return false;
|
||||
// if prevStatus is undefined, monitor is resuming, we're done
|
||||
if (type !== "hardware" && prevStatus === undefined) return false;
|
||||
|
||||
// Deal with hardware thresholds
|
||||
if (type === "hardware") {
|
||||
// Check if any thresholds are set
|
||||
const hasThresholds =
|
||||
monitor.cpuAlertThreshold !== undefined ||
|
||||
monitor.memoryAlertThreshold !== undefined ||
|
||||
monitor.diskAlertThreshold !== undefined ||
|
||||
monitor.tempAlertThreshold !== undefined;
|
||||
|
||||
if (!hasThresholds) return false; // No thresholds set, we're done
|
||||
const metrics = payload?.data ?? null;
|
||||
if (metrics === null) return false; // No metrics, we're done
|
||||
|
||||
// We should send a notification
|
||||
|
||||
const shouldSend = shouldSendHardwareAlert(monitor, monitorStatusResponse);
|
||||
if (shouldSend === false) return false;
|
||||
|
||||
return await this.sendNotifications(monitor, monitorStatusResponse);
|
||||
handleNotifications = async (monitor: Monitor, monitorStatusResponse: MonitorStatusResponse, decision: MonitorActionDecision) => {
|
||||
// Early return if no notification should be sent
|
||||
if (!decision.shouldSendNotification) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// We should send a notification for non-hardware monitor status change
|
||||
// Send notifications based on decision
|
||||
return await this.sendNotifications(monitor, monitorStatusResponse);
|
||||
};
|
||||
|
||||
|
||||
Reference in New Issue
Block a user