mirror of
https://github.com/SigNoz/signoz.git
synced 2025-12-16 18:44:36 -06:00
feat: add support for recovery threshold (#9428)
This commit is contained in:
committed by
GitHub
parent
79988b448f
commit
52228bc6c4
2
Makefile
2
Makefile
@@ -86,7 +86,7 @@ go-run-enterprise: ## Runs the enterprise go backend server
|
||||
SIGNOZ_TELEMETRYSTORE_CLICKHOUSE_DSN=tcp://127.0.0.1:9000 \
|
||||
SIGNOZ_TELEMETRYSTORE_CLICKHOUSE_CLUSTER=cluster \
|
||||
go run -race \
|
||||
$(GO_BUILD_CONTEXT_ENTERPRISE)/*.go
|
||||
$(GO_BUILD_CONTEXT_ENTERPRISE)/*.go server
|
||||
|
||||
.PHONY: go-test
|
||||
go-test: ## Runs go unit tests
|
||||
|
||||
@@ -246,7 +246,9 @@ func (r *AnomalyRule) buildAndRunQuery(ctx context.Context, orgID valuer.UUID, t
|
||||
continue
|
||||
}
|
||||
}
|
||||
results, err := r.Threshold.ShouldAlert(*series, r.Unit())
|
||||
results, err := r.Threshold.Eval(*series, r.Unit(), ruletypes.EvalData{
|
||||
ActiveAlerts: r.ActiveAlertsLabelFP(),
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -296,7 +298,9 @@ func (r *AnomalyRule) buildAndRunQueryV5(ctx context.Context, orgID valuer.UUID,
|
||||
continue
|
||||
}
|
||||
}
|
||||
results, err := r.Threshold.ShouldAlert(*series, r.Unit())
|
||||
results, err := r.Threshold.Eval(*series, r.Unit(), ruletypes.EvalData{
|
||||
ActiveAlerts: r.ActiveAlertsLabelFP(),
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -410,6 +414,7 @@ func (r *AnomalyRule) Eval(ctx context.Context, ts time.Time) (interface{}, erro
|
||||
GeneratorURL: r.GeneratorURL(),
|
||||
Receivers: ruleReceiverMap[lbs.Map()[ruletypes.LabelThresholdName]],
|
||||
Missing: smpl.IsMissing,
|
||||
IsRecovering: smpl.IsRecovering,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -422,6 +427,9 @@ func (r *AnomalyRule) Eval(ctx context.Context, ts time.Time) (interface{}, erro
|
||||
|
||||
alert.Value = a.Value
|
||||
alert.Annotations = a.Annotations
|
||||
// Update the recovering and missing state of existing alert
|
||||
alert.IsRecovering = a.IsRecovering
|
||||
alert.Missing = a.Missing
|
||||
if v, ok := alert.Labels.Map()[ruletypes.LabelThresholdName]; ok {
|
||||
alert.Receivers = ruleReceiverMap[v]
|
||||
}
|
||||
@@ -480,6 +488,30 @@ func (r *AnomalyRule) Eval(ctx context.Context, ts time.Time) (interface{}, erro
|
||||
Value: a.Value,
|
||||
})
|
||||
}
|
||||
|
||||
// We need to change firing alert to recovering if the returned sample meets recovery threshold
|
||||
changeFiringToRecovering := a.State == model.StateFiring && a.IsRecovering
|
||||
// We need to change recovering alerts to firing if the returned sample meets target threshold
|
||||
changeRecoveringToFiring := a.State == model.StateRecovering && !a.IsRecovering && !a.Missing
|
||||
// in any of the above case we need to update the status of alert
|
||||
if changeFiringToRecovering || changeRecoveringToFiring {
|
||||
state := model.StateRecovering
|
||||
if changeRecoveringToFiring {
|
||||
state = model.StateFiring
|
||||
}
|
||||
a.State = state
|
||||
r.logger.DebugContext(ctx, "converting alert state", "name", r.Name(), "state", state)
|
||||
itemsToAdd = append(itemsToAdd, model.RuleStateHistory{
|
||||
RuleID: r.ID(),
|
||||
RuleName: r.Name(),
|
||||
State: state,
|
||||
StateChanged: true,
|
||||
UnixMilli: ts.UnixMilli(),
|
||||
Labels: model.LabelsString(labelsJSON),
|
||||
Fingerprint: a.QueryResultLables.Hash(),
|
||||
Value: a.Value,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
currentState := r.State()
|
||||
|
||||
@@ -12,9 +12,12 @@ import (
|
||||
// AlertState denotes the state of an active alert.
|
||||
type AlertState int
|
||||
|
||||
// The enum values are ordered by priority (lowest to highest).
|
||||
// When determining overall rule state, higher numeric values take precedence.
|
||||
const (
|
||||
StateInactive AlertState = iota
|
||||
StatePending
|
||||
StateRecovering
|
||||
StateFiring
|
||||
StateNoData
|
||||
StateDisabled
|
||||
@@ -32,6 +35,8 @@ func (s AlertState) String() string {
|
||||
return "nodata"
|
||||
case StateDisabled:
|
||||
return "disabled"
|
||||
case StateRecovering:
|
||||
return "recovering"
|
||||
}
|
||||
panic(errors.Errorf("unknown alert state: %d", s))
|
||||
}
|
||||
@@ -58,6 +63,8 @@ func (s *AlertState) UnmarshalJSON(b []byte) error {
|
||||
*s = StateNoData
|
||||
case "disabled":
|
||||
*s = StateDisabled
|
||||
case "recovering":
|
||||
*s = StateRecovering
|
||||
default:
|
||||
*s = StateInactive
|
||||
}
|
||||
@@ -83,6 +90,8 @@ func (s *AlertState) Scan(value interface{}) error {
|
||||
*s = StateNoData
|
||||
case "disabled":
|
||||
*s = StateDisabled
|
||||
case "recovering":
|
||||
*s = StateRecovering
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -191,6 +191,26 @@ func (r *BaseRule) currentAlerts() []*ruletypes.Alert {
|
||||
return alerts
|
||||
}
|
||||
|
||||
// ActiveAlertsLabelFP returns a map of active alert labels fingerprint and
|
||||
// the fingerprint is computed using the QueryResultLables.Hash() method.
|
||||
// We use the QueryResultLables instead of labels as these labels are raw labels
|
||||
// that we get from the sample.
|
||||
// This is useful in cases where we want to check if an alert is still active
|
||||
// based on the labels we have.
|
||||
func (r *BaseRule) ActiveAlertsLabelFP() map[uint64]struct{} {
|
||||
r.mtx.Lock()
|
||||
defer r.mtx.Unlock()
|
||||
|
||||
activeAlerts := make(map[uint64]struct{}, len(r.Active))
|
||||
for _, alert := range r.Active {
|
||||
if alert == nil || alert.QueryResultLables == nil {
|
||||
continue
|
||||
}
|
||||
activeAlerts[alert.QueryResultLables.Hash()] = struct{}{}
|
||||
}
|
||||
return activeAlerts
|
||||
}
|
||||
|
||||
func (r *BaseRule) EvalDelay() time.Duration {
|
||||
return r.evalDelay
|
||||
}
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
package rules
|
||||
|
||||
import (
|
||||
"github.com/stretchr/testify/require"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
v3 "github.com/SigNoz/signoz/pkg/query-service/model/v3"
|
||||
ruletypes "github.com/SigNoz/signoz/pkg/types/ruletypes"
|
||||
)
|
||||
@@ -74,7 +75,7 @@ func TestBaseRule_RequireMinPoints(t *testing.T) {
|
||||
|
||||
for _, test := range tests {
|
||||
t.Run(test.name, func(t *testing.T) {
|
||||
_, err := test.rule.Threshold.ShouldAlert(*test.series, "")
|
||||
_, err := test.rule.Threshold.Eval(*test.series, "", ruletypes.EvalData{})
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, len(test.series.Points) >= test.rule.ruleCondition.RequiredNumPoints, test.shouldAlert)
|
||||
})
|
||||
|
||||
@@ -4,13 +4,14 @@ import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"github.com/SigNoz/signoz/pkg/query-service/utils/labels"
|
||||
"log/slog"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/SigNoz/signoz/pkg/query-service/utils/labels"
|
||||
|
||||
"go.uber.org/zap"
|
||||
|
||||
"github.com/go-openapi/strfmt"
|
||||
|
||||
@@ -159,7 +159,9 @@ func (r *PromRule) Eval(ctx context.Context, ts time.Time) (interface{}, error)
|
||||
continue
|
||||
}
|
||||
|
||||
results, err := r.Threshold.ShouldAlert(toCommonSeries(series), r.Unit())
|
||||
results, err := r.Threshold.Eval(toCommonSeries(series), r.Unit(), ruletypes.EvalData{
|
||||
ActiveAlerts: r.ActiveAlertsLabelFP(),
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -233,6 +235,7 @@ func (r *PromRule) Eval(ctx context.Context, ts time.Time) (interface{}, error)
|
||||
Value: result.V,
|
||||
GeneratorURL: r.GeneratorURL(),
|
||||
Receivers: ruleReceiverMap[lbs.Map()[ruletypes.LabelThresholdName]],
|
||||
IsRecovering: result.IsRecovering,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -245,6 +248,9 @@ func (r *PromRule) Eval(ctx context.Context, ts time.Time) (interface{}, error)
|
||||
if alert, ok := r.Active[h]; ok && alert.State != model.StateInactive {
|
||||
alert.Value = a.Value
|
||||
alert.Annotations = a.Annotations
|
||||
// Update the recovering and missing state of existing alert
|
||||
alert.IsRecovering = a.IsRecovering
|
||||
alert.Missing = a.Missing
|
||||
if v, ok := alert.Labels.Map()[ruletypes.LabelThresholdName]; ok {
|
||||
alert.Receivers = ruleReceiverMap[v]
|
||||
}
|
||||
@@ -304,6 +310,29 @@ func (r *PromRule) Eval(ctx context.Context, ts time.Time) (interface{}, error)
|
||||
})
|
||||
}
|
||||
|
||||
// We need to change firing alert to recovering if the returned sample meets recovery threshold
|
||||
changeAlertingToRecovering := a.State == model.StateFiring && a.IsRecovering
|
||||
// We need to change recovering alerts to firing if the returned sample meets target threshold
|
||||
changeRecoveringToFiring := a.State == model.StateRecovering && !a.IsRecovering && !a.Missing
|
||||
// in any of the above case we need to update the status of alert
|
||||
if changeAlertingToRecovering || changeRecoveringToFiring {
|
||||
state := model.StateRecovering
|
||||
if changeRecoveringToFiring {
|
||||
state = model.StateFiring
|
||||
}
|
||||
a.State = state
|
||||
r.logger.DebugContext(ctx, "converting alert state", "name", r.Name(), "state", state)
|
||||
itemsToAdd = append(itemsToAdd, model.RuleStateHistory{
|
||||
RuleID: r.ID(),
|
||||
RuleName: r.Name(),
|
||||
State: state,
|
||||
StateChanged: true,
|
||||
UnixMilli: ts.UnixMilli(),
|
||||
Labels: model.LabelsString(labelsJSON),
|
||||
Fingerprint: a.QueryResultLables.Hash(),
|
||||
Value: a.Value,
|
||||
})
|
||||
}
|
||||
}
|
||||
r.health = ruletypes.HealthGood
|
||||
r.lastError = err
|
||||
|
||||
@@ -23,7 +23,7 @@ func getVectorValues(vectors []ruletypes.Sample) []float64 {
|
||||
return values
|
||||
}
|
||||
|
||||
func TestPromRuleShouldAlert(t *testing.T) {
|
||||
func TestPromRuleEval(t *testing.T) {
|
||||
postableRule := ruletypes.PostableRule{
|
||||
AlertName: "Test Rule",
|
||||
AlertType: ruletypes.AlertTypeMetric,
|
||||
@@ -696,7 +696,7 @@ func TestPromRuleShouldAlert(t *testing.T) {
|
||||
assert.NoError(t, err)
|
||||
}
|
||||
|
||||
resultVectors, err := rule.Threshold.ShouldAlert(toCommonSeries(c.values), rule.Unit())
|
||||
resultVectors, err := rule.Threshold.Eval(toCommonSeries(c.values), rule.Unit(), ruletypes.EvalData{})
|
||||
assert.NoError(t, err)
|
||||
|
||||
// Compare full result vector with expected vector
|
||||
|
||||
@@ -24,6 +24,8 @@ type Rule interface {
|
||||
HoldDuration() time.Duration
|
||||
State() model.AlertState
|
||||
ActiveAlerts() []*ruletypes.Alert
|
||||
// ActiveAlertsLabelFP returns a map of active alert labels fingerprint
|
||||
ActiveAlertsLabelFP() map[uint64]struct{}
|
||||
|
||||
PreferredChannels() []string
|
||||
|
||||
|
||||
@@ -488,7 +488,9 @@ func (r *ThresholdRule) buildAndRunQuery(ctx context.Context, orgID valuer.UUID,
|
||||
continue
|
||||
}
|
||||
}
|
||||
resultSeries, err := r.Threshold.ShouldAlert(*series, r.Unit())
|
||||
resultSeries, err := r.Threshold.Eval(*series, r.Unit(), ruletypes.EvalData{
|
||||
ActiveAlerts: r.ActiveAlertsLabelFP(),
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -565,7 +567,9 @@ func (r *ThresholdRule) buildAndRunQueryV5(ctx context.Context, orgID valuer.UUI
|
||||
continue
|
||||
}
|
||||
}
|
||||
resultSeries, err := r.Threshold.ShouldAlert(*series, r.Unit())
|
||||
resultSeries, err := r.Threshold.Eval(*series, r.Unit(), ruletypes.EvalData{
|
||||
ActiveAlerts: r.ActiveAlertsLabelFP(),
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -666,13 +670,14 @@ func (r *ThresholdRule) Eval(ctx context.Context, ts time.Time) (interface{}, er
|
||||
// Links with timestamps should go in annotations since labels
|
||||
// is used alert grouping, and we want to group alerts with the same
|
||||
// label set, but different timestamps, together.
|
||||
if r.typ == ruletypes.AlertTypeTraces {
|
||||
switch r.typ {
|
||||
case ruletypes.AlertTypeTraces:
|
||||
link := r.prepareLinksToTraces(ctx, ts, smpl.Metric)
|
||||
if link != "" && r.hostFromSource() != "" {
|
||||
r.logger.InfoContext(ctx, "adding traces link to annotations", "link", fmt.Sprintf("%s/traces-explorer?%s", r.hostFromSource(), link))
|
||||
annotations = append(annotations, labels.Label{Name: "related_traces", Value: fmt.Sprintf("%s/traces-explorer?%s", r.hostFromSource(), link)})
|
||||
}
|
||||
} else if r.typ == ruletypes.AlertTypeLogs {
|
||||
case ruletypes.AlertTypeLogs:
|
||||
link := r.prepareLinksToLogs(ctx, ts, smpl.Metric)
|
||||
if link != "" && r.hostFromSource() != "" {
|
||||
r.logger.InfoContext(ctx, "adding logs link to annotations", "link", fmt.Sprintf("%s/logs/logs-explorer?%s", r.hostFromSource(), link))
|
||||
@@ -698,6 +703,7 @@ func (r *ThresholdRule) Eval(ctx context.Context, ts time.Time) (interface{}, er
|
||||
GeneratorURL: r.GeneratorURL(),
|
||||
Receivers: ruleReceiverMap[lbs.Map()[ruletypes.LabelThresholdName]],
|
||||
Missing: smpl.IsMissing,
|
||||
IsRecovering: smpl.IsRecovering,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -711,6 +717,9 @@ func (r *ThresholdRule) Eval(ctx context.Context, ts time.Time) (interface{}, er
|
||||
|
||||
alert.Value = a.Value
|
||||
alert.Annotations = a.Annotations
|
||||
// Update the recovering and missing state of existing alert
|
||||
alert.IsRecovering = a.IsRecovering
|
||||
alert.Missing = a.Missing
|
||||
if v, ok := alert.Labels.Map()[ruletypes.LabelThresholdName]; ok {
|
||||
alert.Receivers = ruleReceiverMap[v]
|
||||
}
|
||||
@@ -735,6 +744,7 @@ func (r *ThresholdRule) Eval(ctx context.Context, ts time.Time) (interface{}, er
|
||||
delete(r.Active, fp)
|
||||
}
|
||||
if a.State != model.StateInactive {
|
||||
r.logger.DebugContext(ctx, "converting firing alert to inActive", "name", r.Name())
|
||||
a.State = model.StateInactive
|
||||
a.ResolvedAt = ts
|
||||
itemsToAdd = append(itemsToAdd, model.RuleStateHistory{
|
||||
@@ -752,6 +762,7 @@ func (r *ThresholdRule) Eval(ctx context.Context, ts time.Time) (interface{}, er
|
||||
}
|
||||
|
||||
if a.State == model.StatePending && ts.Sub(a.ActiveAt) >= r.holdDuration {
|
||||
r.logger.DebugContext(ctx, "converting pending alert to firing", "name", r.Name())
|
||||
a.State = model.StateFiring
|
||||
a.FiredAt = ts
|
||||
state := model.StateFiring
|
||||
@@ -769,6 +780,30 @@ func (r *ThresholdRule) Eval(ctx context.Context, ts time.Time) (interface{}, er
|
||||
Value: a.Value,
|
||||
})
|
||||
}
|
||||
|
||||
// We need to change firing alert to recovering if the returned sample meets recovery threshold
|
||||
changeAlertingToRecovering := a.State == model.StateFiring && a.IsRecovering
|
||||
// We need to change recovering alerts to firing if the returned sample meets target threshold
|
||||
changeRecoveringToFiring := a.State == model.StateRecovering && !a.IsRecovering && !a.Missing
|
||||
// in any of the above case we need to update the status of alert
|
||||
if changeAlertingToRecovering || changeRecoveringToFiring {
|
||||
state := model.StateRecovering
|
||||
if changeRecoveringToFiring {
|
||||
state = model.StateFiring
|
||||
}
|
||||
a.State = state
|
||||
r.logger.DebugContext(ctx, "converting alert state", "name", r.Name(), "state", state)
|
||||
itemsToAdd = append(itemsToAdd, model.RuleStateHistory{
|
||||
RuleID: r.ID(),
|
||||
RuleName: r.Name(),
|
||||
State: state,
|
||||
StateChanged: true,
|
||||
UnixMilli: ts.UnixMilli(),
|
||||
Labels: model.LabelsString(labelsJSON),
|
||||
Fingerprint: a.QueryResultLables.Hash(),
|
||||
Value: a.Value,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
currentState := r.State()
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -59,7 +59,8 @@ type Alert struct {
|
||||
LastSentAt time.Time
|
||||
ValidUntil time.Time
|
||||
|
||||
Missing bool
|
||||
Missing bool
|
||||
IsRecovering bool
|
||||
}
|
||||
|
||||
func (a *Alert) NeedsSending(ts time.Time, resendDelay time.Duration) bool {
|
||||
|
||||
@@ -621,10 +621,10 @@ func TestParseIntoRuleThresholdGeneration(t *testing.T) {
|
||||
}
|
||||
|
||||
// Test that threshold can evaluate properly
|
||||
vector, err := threshold.ShouldAlert(v3.Series{
|
||||
vector, err := threshold.Eval(v3.Series{
|
||||
Points: []v3.Point{{Value: 0.15, Timestamp: 1000}}, // 150ms in seconds
|
||||
Labels: map[string]string{"test": "label"},
|
||||
}, "")
|
||||
}, "", EvalData{})
|
||||
if err != nil {
|
||||
t.Fatalf("Unexpected error in shouldAlert: %v", err)
|
||||
}
|
||||
@@ -698,20 +698,20 @@ func TestParseIntoRuleMultipleThresholds(t *testing.T) {
|
||||
}
|
||||
|
||||
// Test with a value that should trigger both WARNING and CRITICAL thresholds
|
||||
vector, err := threshold.ShouldAlert(v3.Series{
|
||||
vector, err := threshold.Eval(v3.Series{
|
||||
Points: []v3.Point{{Value: 95.0, Timestamp: 1000}}, // 95% CPU usage
|
||||
Labels: map[string]string{"service": "test"},
|
||||
}, "")
|
||||
}, "", EvalData{})
|
||||
if err != nil {
|
||||
t.Fatalf("Unexpected error in shouldAlert: %v", err)
|
||||
}
|
||||
|
||||
assert.Equal(t, 2, len(vector))
|
||||
|
||||
vector, err = threshold.ShouldAlert(v3.Series{
|
||||
vector, err = threshold.Eval(v3.Series{
|
||||
Points: []v3.Point{{Value: 75.0, Timestamp: 1000}}, // 75% CPU usage
|
||||
Labels: map[string]string{"service": "test"},
|
||||
}, "")
|
||||
}, "", EvalData{})
|
||||
if err != nil {
|
||||
t.Fatalf("Unexpected error in shouldAlert: %v", err)
|
||||
}
|
||||
@@ -719,7 +719,7 @@ func TestParseIntoRuleMultipleThresholds(t *testing.T) {
|
||||
assert.Equal(t, 1, len(vector))
|
||||
}
|
||||
|
||||
func TestAnomalyNegationShouldAlert(t *testing.T) {
|
||||
func TestAnomalyNegationEval(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
ruleJSON []byte
|
||||
@@ -1046,9 +1046,9 @@ func TestAnomalyNegationShouldAlert(t *testing.T) {
|
||||
t.Fatalf("unexpected error from GetRuleThreshold: %v", err)
|
||||
}
|
||||
|
||||
resultVector, err := ruleThreshold.ShouldAlert(tt.series, "")
|
||||
resultVector, err := ruleThreshold.Eval(tt.series, "", EvalData{})
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error from ShouldAlert: %v", err)
|
||||
t.Fatalf("unexpected error from Eval: %v", err)
|
||||
}
|
||||
|
||||
shouldAlert := len(resultVector) > 0
|
||||
|
||||
@@ -19,7 +19,11 @@ type Sample struct {
|
||||
|
||||
IsMissing bool
|
||||
|
||||
Target float64
|
||||
// IsRecovering is true if the sample is part of a recovering alert.
|
||||
IsRecovering bool
|
||||
|
||||
Target float64
|
||||
RecoveryTarget *float64
|
||||
|
||||
TargetUnit string
|
||||
}
|
||||
|
||||
@@ -57,8 +57,28 @@ type RuleReceivers struct {
|
||||
Name string `json:"name"`
|
||||
}
|
||||
|
||||
// EvalData are other dependent values used to evaluate the threshold rules.
|
||||
type EvalData struct {
|
||||
// ActiveAlerts is a map of active alert fingerprints
|
||||
// used to check if a sample is part of an active alert
|
||||
// when evaluating the recovery threshold.
|
||||
ActiveAlerts map[uint64]struct{}
|
||||
}
|
||||
|
||||
// HasActiveAlert checks if the given sample figerprint is active
|
||||
// as an alert.
|
||||
func (eval EvalData) HasActiveAlert(sampleLabelFp uint64) bool {
|
||||
if len(eval.ActiveAlerts) == 0 {
|
||||
return false
|
||||
}
|
||||
_, ok := eval.ActiveAlerts[sampleLabelFp]
|
||||
return ok
|
||||
}
|
||||
|
||||
type RuleThreshold interface {
|
||||
ShouldAlert(series v3.Series, unit string) (Vector, error)
|
||||
// Eval runs the given series through the threshold rules
|
||||
// using the given EvalData and returns the matching series
|
||||
Eval(series v3.Series, unit string, evalData EvalData) (Vector, error)
|
||||
GetRuleReceivers() []RuleReceivers
|
||||
}
|
||||
|
||||
@@ -97,7 +117,7 @@ func (r BasicRuleThresholds) Validate() error {
|
||||
return errors.Join(errs...)
|
||||
}
|
||||
|
||||
func (r BasicRuleThresholds) ShouldAlert(series v3.Series, unit string) (Vector, error) {
|
||||
func (r BasicRuleThresholds) Eval(series v3.Series, unit string, evalData EvalData) (Vector, error) {
|
||||
var resultVector Vector
|
||||
thresholds := []BasicRuleThreshold(r)
|
||||
sortThresholds(thresholds)
|
||||
@@ -105,8 +125,31 @@ func (r BasicRuleThresholds) ShouldAlert(series v3.Series, unit string) (Vector,
|
||||
smpl, shouldAlert := threshold.shouldAlert(series, unit)
|
||||
if shouldAlert {
|
||||
smpl.Target = *threshold.TargetValue
|
||||
if threshold.RecoveryTarget != nil {
|
||||
smpl.RecoveryTarget = threshold.RecoveryTarget
|
||||
}
|
||||
smpl.TargetUnit = threshold.TargetUnit
|
||||
resultVector = append(resultVector, smpl)
|
||||
continue
|
||||
}
|
||||
|
||||
// Prepare alert hash from series labels and threshold name if recovery target option was provided
|
||||
if threshold.RecoveryTarget == nil {
|
||||
continue
|
||||
}
|
||||
sampleLabels := PrepareSampleLabelsForRule(series.Labels, threshold.Name)
|
||||
alertHash := sampleLabels.Hash()
|
||||
// check if alert is active and then check if recovery threshold matches
|
||||
if evalData.HasActiveAlert(alertHash) {
|
||||
smpl, matchesRecoveryThrehold := threshold.matchesRecoveryThreshold(series, unit)
|
||||
if matchesRecoveryThrehold {
|
||||
smpl.Target = *threshold.TargetValue
|
||||
smpl.RecoveryTarget = threshold.RecoveryTarget
|
||||
smpl.TargetUnit = threshold.TargetUnit
|
||||
// IsRecovering to notify that metrics is in recovery stage
|
||||
smpl.IsRecovering = true
|
||||
resultVector = append(resultVector, smpl)
|
||||
}
|
||||
}
|
||||
}
|
||||
return resultVector, nil
|
||||
@@ -133,16 +176,27 @@ func sortThresholds(thresholds []BasicRuleThreshold) {
|
||||
})
|
||||
}
|
||||
|
||||
func (b BasicRuleThreshold) target(ruleUnit string) float64 {
|
||||
// convertToRuleUnit converts the given value from the target unit to the rule unit
|
||||
func (b BasicRuleThreshold) convertToRuleUnit(val float64, ruleUnit string) float64 {
|
||||
unitConverter := converter.FromUnit(converter.Unit(b.TargetUnit))
|
||||
// convert the target value to the y-axis unit
|
||||
value := unitConverter.Convert(converter.Value{
|
||||
F: *b.TargetValue,
|
||||
F: val,
|
||||
U: converter.Unit(b.TargetUnit),
|
||||
}, converter.Unit(ruleUnit))
|
||||
return value.F
|
||||
}
|
||||
|
||||
// target returns the target value in the rule unit
|
||||
func (b BasicRuleThreshold) target(ruleUnit string) float64 {
|
||||
return b.convertToRuleUnit(*b.TargetValue, ruleUnit)
|
||||
}
|
||||
|
||||
// recoveryTarget returns the recovery target value in the rule unit
|
||||
func (b BasicRuleThreshold) recoveryTarget(ruleUnit string) float64 {
|
||||
return b.convertToRuleUnit(*b.RecoveryTarget, ruleUnit)
|
||||
}
|
||||
|
||||
func (b BasicRuleThreshold) getCompareOp() CompareOp {
|
||||
return b.CompareOp
|
||||
}
|
||||
@@ -178,6 +232,13 @@ func (b BasicRuleThreshold) Validate() error {
|
||||
return errors.Join(errs...)
|
||||
}
|
||||
|
||||
func (b BasicRuleThreshold) matchesRecoveryThreshold(series v3.Series, ruleUnit string) (Sample, bool) {
|
||||
return b.shouldAlertWithTarget(series, b.recoveryTarget(ruleUnit))
|
||||
}
|
||||
func (b BasicRuleThreshold) shouldAlert(series v3.Series, ruleUnit string) (Sample, bool) {
|
||||
return b.shouldAlertWithTarget(series, b.target(ruleUnit))
|
||||
}
|
||||
|
||||
func removeGroupinSetPoints(series v3.Series) []v3.Point {
|
||||
var result []v3.Point
|
||||
for _, s := range series.Points {
|
||||
@@ -188,21 +249,22 @@ func removeGroupinSetPoints(series v3.Series) []v3.Point {
|
||||
return result
|
||||
}
|
||||
|
||||
func (b BasicRuleThreshold) shouldAlert(series v3.Series, ruleUnit string) (Sample, bool) {
|
||||
// PrepareSampleLabelsForRule prepares the labels for the sample to be used in the alerting.
|
||||
// It accepts seriesLabels and thresholdName as input and returns the labels with the threshold name label added.
|
||||
func PrepareSampleLabelsForRule(seriesLabels map[string]string, thresholdName string) (lbls labels.Labels) {
|
||||
lb := labels.NewBuilder(labels.Labels{})
|
||||
for name, value := range seriesLabels {
|
||||
lb.Set(name, value)
|
||||
}
|
||||
lb.Set(LabelThresholdName, thresholdName)
|
||||
lb.Set(LabelSeverityName, strings.ToLower(thresholdName))
|
||||
return lb.Labels()
|
||||
}
|
||||
|
||||
func (b BasicRuleThreshold) shouldAlertWithTarget(series v3.Series, target float64) (Sample, bool) {
|
||||
var shouldAlert bool
|
||||
var alertSmpl Sample
|
||||
var lbls labels.Labels
|
||||
|
||||
for name, value := range series.Labels {
|
||||
lbls = append(lbls, labels.Label{Name: name, Value: value})
|
||||
}
|
||||
|
||||
target := b.target(ruleUnit)
|
||||
|
||||
// TODO(srikanthccv): is it better to move the logic to notifier instead of
|
||||
// adding two labels?
|
||||
lbls = append(lbls, labels.Label{Name: LabelThresholdName, Value: b.Name})
|
||||
lbls = append(lbls, labels.Label{Name: LabelSeverityName, Value: strings.ToLower(b.Name)})
|
||||
lbls := PrepareSampleLabelsForRule(series.Labels, b.Name)
|
||||
|
||||
series.Points = removeGroupinSetPoints(series)
|
||||
|
||||
|
||||
@@ -9,7 +9,7 @@ import (
|
||||
v3 "github.com/SigNoz/signoz/pkg/query-service/model/v3"
|
||||
)
|
||||
|
||||
func TestBasicRuleThresholdShouldAlert_UnitConversion(t *testing.T) {
|
||||
func TestBasicRuleThresholdEval_UnitConversion(t *testing.T) {
|
||||
target := 100.0
|
||||
|
||||
tests := []struct {
|
||||
@@ -270,7 +270,7 @@ func TestBasicRuleThresholdShouldAlert_UnitConversion(t *testing.T) {
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
thresholds := BasicRuleThresholds{tt.threshold}
|
||||
vector, err := thresholds.ShouldAlert(tt.series, tt.ruleUnit)
|
||||
vector, err := thresholds.Eval(tt.series, tt.ruleUnit, EvalData{})
|
||||
assert.NoError(t, err)
|
||||
|
||||
alert := len(vector) > 0
|
||||
@@ -301,3 +301,31 @@ func TestBasicRuleThresholdShouldAlert_UnitConversion(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestPrepareSampleLabelsForRule(t *testing.T) {
|
||||
alertAllHashes := make(map[uint64]struct{})
|
||||
thresholdName := "test"
|
||||
for range 50_000 {
|
||||
sampleLabels := map[string]string{
|
||||
"service": "test",
|
||||
"env": "prod",
|
||||
"tier": "backend",
|
||||
"namespace": "default",
|
||||
"pod": "test-pod",
|
||||
"container": "test-container",
|
||||
"node": "test-node",
|
||||
"cluster": "test-cluster",
|
||||
"region": "test-region",
|
||||
"az": "test-az",
|
||||
"hostname": "test-hostname",
|
||||
"ip": "192.168.1.1",
|
||||
"port": "8080",
|
||||
}
|
||||
lbls := PrepareSampleLabelsForRule(sampleLabels, thresholdName)
|
||||
assert.True(t, lbls.Has(LabelThresholdName), "LabelThresholdName not found in labels")
|
||||
alertAllHashes[lbls.Hash()] = struct{}{}
|
||||
}
|
||||
t.Logf("Total hashes: %d", len(alertAllHashes))
|
||||
// there should be only one hash for all the samples
|
||||
assert.Equal(t, 1, len(alertAllHashes), "Expected only one hash for all the samples")
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user