mirror of
https://github.com/portainer/portainer.git
synced 2026-06-23 04:10:29 +00:00
feat(alerting): expand tiered rules into per-severity evaluators with state aggregation [R8S-1003] (#2586)
This commit is contained in:
@@ -2,6 +2,7 @@ package libprometheus
|
||||
|
||||
import (
|
||||
"context"
|
||||
"slices"
|
||||
"strconv"
|
||||
"time"
|
||||
|
||||
@@ -50,7 +51,14 @@ func ReloadRules(mgr *rules.Manager, evalInterval time.Duration, alertsFilePath
|
||||
// ExtractAlertStates returns the current evaluation state for each alerting rule
|
||||
// managed by the given rules.Manager.
|
||||
func ExtractAlertStates(mgr *rules.Manager) []pkgmetrics.EdgeAlertRuleState {
|
||||
var states []pkgmetrics.EdgeAlertRuleState
|
||||
type aggregateState struct {
|
||||
state pkgmetrics.AlertRuleStateType
|
||||
lastEvaluation int64
|
||||
lastError string
|
||||
severity string
|
||||
}
|
||||
|
||||
aggregated := make(map[int]aggregateState)
|
||||
|
||||
for _, group := range mgr.RuleGroups() {
|
||||
for _, rule := range group.Rules() {
|
||||
@@ -82,14 +90,86 @@ func ExtractAlertStates(mgr *rules.Manager) []pkgmetrics.EdgeAlertRuleState {
|
||||
lastErr = alertRule.LastError().Error()
|
||||
}
|
||||
|
||||
states = append(states, pkgmetrics.EdgeAlertRuleState{
|
||||
RuleID: ruleID,
|
||||
State: state,
|
||||
LastEvaluation: alertRule.GetEvaluationTimestamp().UnixMilli(),
|
||||
LastError: lastErr,
|
||||
})
|
||||
tierSeverity := alertRule.Labels().Get(pkgmetrics.AlertTierLabel)
|
||||
statePriority := alertStatePriority(state)
|
||||
evalMillis := alertRule.GetEvaluationTimestamp().UnixMilli()
|
||||
|
||||
existing, exists := aggregated[ruleID]
|
||||
if !exists {
|
||||
aggregated[ruleID] = aggregateState{
|
||||
state: state,
|
||||
lastEvaluation: evalMillis,
|
||||
lastError: lastErr,
|
||||
severity: tierSeverity,
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
existingStatePriority := alertStatePriority(existing.state)
|
||||
winsState := statePriority > existingStatePriority
|
||||
tiebreakWins := statePriority == existingStatePriority && tierSeverityPriority(tierSeverity) > tierSeverityPriority(existing.severity)
|
||||
|
||||
if winsState || tiebreakWins {
|
||||
existing.state = state
|
||||
existing.severity = tierSeverity
|
||||
existing.lastError = lastErr
|
||||
}
|
||||
|
||||
if evalMillis > existing.lastEvaluation {
|
||||
existing.lastEvaluation = evalMillis
|
||||
}
|
||||
|
||||
aggregated[ruleID] = existing
|
||||
}
|
||||
}
|
||||
|
||||
if len(aggregated) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
ruleIDs := make([]int, 0, len(aggregated))
|
||||
for ruleID := range aggregated {
|
||||
ruleIDs = append(ruleIDs, ruleID)
|
||||
}
|
||||
slices.Sort(ruleIDs)
|
||||
|
||||
states := make([]pkgmetrics.EdgeAlertRuleState, 0, len(ruleIDs))
|
||||
for _, ruleID := range ruleIDs {
|
||||
state := aggregated[ruleID]
|
||||
states = append(states, pkgmetrics.EdgeAlertRuleState{
|
||||
RuleID: ruleID,
|
||||
State: state.state,
|
||||
LastEvaluation: state.lastEvaluation,
|
||||
LastError: state.lastError,
|
||||
})
|
||||
}
|
||||
|
||||
return states
|
||||
}
|
||||
|
||||
func alertStatePriority(state pkgmetrics.AlertRuleStateType) int {
|
||||
switch state {
|
||||
case pkgmetrics.AlertRuleStateFiring:
|
||||
return 2
|
||||
case pkgmetrics.AlertRuleStatePending:
|
||||
return 1
|
||||
default:
|
||||
return 0
|
||||
}
|
||||
}
|
||||
|
||||
// tierSeverityPriority mirrors the canonical severity ordering from the EE
|
||||
// alertexpr package. CE cannot import EE, so this is a
|
||||
// deliberate duplicate — keep the values aligned if the canonical list changes.
|
||||
func tierSeverityPriority(severity string) int {
|
||||
switch severity {
|
||||
case "critical":
|
||||
return 2
|
||||
case "warning":
|
||||
return 1
|
||||
case "info":
|
||||
return 0
|
||||
default:
|
||||
return -1
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,10 +3,10 @@ package libprometheus_test
|
||||
import (
|
||||
"context"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/portainer/portainer/api/filesystem"
|
||||
libprom "github.com/portainer/portainer/pkg/libprometheus"
|
||||
pkgmetrics "github.com/portainer/portainer/pkg/metrics"
|
||||
prometheusreg "github.com/prometheus/client_golang/prometheus"
|
||||
@@ -64,7 +64,7 @@ func TestReloadRules(t *testing.T) {
|
||||
|
||||
t.Run("valid rule file loads successfully", func(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
alertsFile := filepath.Join(dir, "alerts.yaml")
|
||||
alertsFile := filesystem.JoinPaths(dir, "alerts.yaml")
|
||||
|
||||
rulesYAML := `groups:
|
||||
- name: test-group
|
||||
@@ -90,30 +90,38 @@ func TestReloadRules(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestExtractAlertStates(t *testing.T) {
|
||||
reg := prometheusreg.NewRegistry()
|
||||
db, err := libprom.NewInMemoryTSDB(reg)
|
||||
require.NoError(t, err)
|
||||
defer func() { require.NoError(t, db.Close()) }()
|
||||
newTestRuleManager := func(t *testing.T) *rules.Manager {
|
||||
t.Helper()
|
||||
|
||||
engine := libprom.NewEngine()
|
||||
reg := prometheusreg.NewRegistry()
|
||||
db, err := libprom.NewInMemoryTSDB(reg)
|
||||
require.NoError(t, err)
|
||||
t.Cleanup(func() {
|
||||
require.NoError(t, db.Close())
|
||||
})
|
||||
|
||||
mgr := libprom.NewRuleManager(libprom.RuleManagerConfig{
|
||||
Engine: engine,
|
||||
Queryable: db,
|
||||
Appendable: db,
|
||||
NotifyFunc: func(_ context.Context, _ string, _ ...*rules.Alert) {},
|
||||
Context: context.Background(),
|
||||
Registerer: reg,
|
||||
})
|
||||
engine := libprom.NewEngine()
|
||||
|
||||
return libprom.NewRuleManager(libprom.RuleManagerConfig{
|
||||
Engine: engine,
|
||||
Queryable: db,
|
||||
Appendable: db,
|
||||
NotifyFunc: func(_ context.Context, _ string, _ ...*rules.Alert) {},
|
||||
Context: context.Background(),
|
||||
Registerer: reg,
|
||||
})
|
||||
}
|
||||
|
||||
t.Run("no rules returns nil", func(t *testing.T) {
|
||||
mgr := newTestRuleManager(t)
|
||||
states := libprom.ExtractAlertStates(mgr)
|
||||
assert.Nil(t, states)
|
||||
})
|
||||
|
||||
t.Run("loaded rules return states", func(t *testing.T) {
|
||||
mgr := newTestRuleManager(t)
|
||||
dir := t.TempDir()
|
||||
alertsFile := filepath.Join(dir, "alerts.yaml")
|
||||
alertsFile := filesystem.JoinPaths(dir, "alerts.yaml")
|
||||
|
||||
rulesYAML := `groups:
|
||||
- name: test-group
|
||||
@@ -133,4 +141,60 @@ func TestExtractAlertStates(t *testing.T) {
|
||||
assert.Equal(t, 7, states[0].RuleID)
|
||||
assert.Equal(t, pkgmetrics.AlertRuleStateOK, states[0].State)
|
||||
})
|
||||
|
||||
t.Run("duplicate alert_rule_id values are aggregated", func(t *testing.T) {
|
||||
mgr := newTestRuleManager(t)
|
||||
dir := t.TempDir()
|
||||
alertsFile := filesystem.JoinPaths(dir, "alerts.yaml")
|
||||
|
||||
rulesYAML := `groups:
|
||||
- name: test-group
|
||||
rules:
|
||||
- alert: CpuWarning
|
||||
expr: up == 0
|
||||
labels:
|
||||
alert_rule_id: "42"
|
||||
severity: warning
|
||||
- alert: CpuCritical
|
||||
expr: up >= 0
|
||||
labels:
|
||||
alert_rule_id: "42"
|
||||
severity: critical
|
||||
`
|
||||
require.NoError(t, os.WriteFile(alertsFile, []byte(rulesYAML), 0o644))
|
||||
require.NoError(t, libprom.ReloadRules(mgr, 15*time.Second, alertsFile))
|
||||
|
||||
states := libprom.ExtractAlertStates(mgr)
|
||||
require.Len(t, states, 1)
|
||||
assert.Equal(t, 42, states[0].RuleID)
|
||||
assert.Equal(t, pkgmetrics.AlertRuleStateOK, states[0].State)
|
||||
})
|
||||
|
||||
t.Run("states are returned in stable rule ID order", func(t *testing.T) {
|
||||
mgr := newTestRuleManager(t)
|
||||
dir := t.TempDir()
|
||||
alertsFile := filesystem.JoinPaths(dir, "alerts.yaml")
|
||||
|
||||
rulesYAML := `groups:
|
||||
- name: test-group
|
||||
rules:
|
||||
- alert: RuleTen
|
||||
expr: up == 0
|
||||
labels:
|
||||
alert_rule_id: "10"
|
||||
severity: warning
|
||||
- alert: RuleThree
|
||||
expr: up >= 0
|
||||
labels:
|
||||
alert_rule_id: "3"
|
||||
severity: critical
|
||||
`
|
||||
require.NoError(t, os.WriteFile(alertsFile, []byte(rulesYAML), 0o644))
|
||||
require.NoError(t, libprom.ReloadRules(mgr, 15*time.Second, alertsFile))
|
||||
|
||||
states := libprom.ExtractAlertStates(mgr)
|
||||
require.Len(t, states, 2)
|
||||
assert.Equal(t, 3, states[0].RuleID)
|
||||
assert.Equal(t, 10, states[1].RuleID)
|
||||
})
|
||||
}
|
||||
|
||||
@@ -4,6 +4,8 @@ const (
|
||||
// AlertRuleIDLabel is the Prometheus label key used to correlate alerts
|
||||
// with their Portainer alert rule ID across agent and server packages.
|
||||
AlertRuleIDLabel = "alert_rule_id"
|
||||
// AlertTierLabel marks generated evaluator rules that belong to a parent tiered rule.
|
||||
AlertTierLabel = "portainer_alert_tier"
|
||||
|
||||
ClusterCPUUsageCoresMetric = "portainer_edge_agent_cluster_cpu_usage_cores"
|
||||
ClusterCPUCapacityCoresMetric = "portainer_edge_agent_cluster_cpu_capacity_cores"
|
||||
@@ -99,5 +101,6 @@ type EdgeAlertRule struct {
|
||||
Severity string `json:"severity"`
|
||||
PromqlExpr string `json:"promql_expr"`
|
||||
ForDurationMinutes int `json:"for_duration_minutes,omitempty"`
|
||||
Labels map[string]string `json:"labels,omitempty"`
|
||||
Annotations map[string]string `json:"annotations,omitempty"`
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user