feat(alerting): expand tiered rules into per-severity evaluators with state aggregation [R8S-1003] (#2586)

This commit is contained in:
RHCowan
2026-05-08 14:50:59 +12:00
committed by GitHub
parent ed7f074380
commit ff169ed356
3 changed files with 170 additions and 23 deletions
+87 -7
View File
@@ -2,6 +2,7 @@ package libprometheus
import (
"context"
"slices"
"strconv"
"time"
@@ -50,7 +51,14 @@ func ReloadRules(mgr *rules.Manager, evalInterval time.Duration, alertsFilePath
// ExtractAlertStates returns the current evaluation state for each alerting rule
// managed by the given rules.Manager.
func ExtractAlertStates(mgr *rules.Manager) []pkgmetrics.EdgeAlertRuleState {
var states []pkgmetrics.EdgeAlertRuleState
type aggregateState struct {
state pkgmetrics.AlertRuleStateType
lastEvaluation int64
lastError string
severity string
}
aggregated := make(map[int]aggregateState)
for _, group := range mgr.RuleGroups() {
for _, rule := range group.Rules() {
@@ -82,14 +90,86 @@ func ExtractAlertStates(mgr *rules.Manager) []pkgmetrics.EdgeAlertRuleState {
lastErr = alertRule.LastError().Error()
}
states = append(states, pkgmetrics.EdgeAlertRuleState{
RuleID: ruleID,
State: state,
LastEvaluation: alertRule.GetEvaluationTimestamp().UnixMilli(),
LastError: lastErr,
})
tierSeverity := alertRule.Labels().Get(pkgmetrics.AlertTierLabel)
statePriority := alertStatePriority(state)
evalMillis := alertRule.GetEvaluationTimestamp().UnixMilli()
existing, exists := aggregated[ruleID]
if !exists {
aggregated[ruleID] = aggregateState{
state: state,
lastEvaluation: evalMillis,
lastError: lastErr,
severity: tierSeverity,
}
continue
}
existingStatePriority := alertStatePriority(existing.state)
winsState := statePriority > existingStatePriority
tiebreakWins := statePriority == existingStatePriority && tierSeverityPriority(tierSeverity) > tierSeverityPriority(existing.severity)
if winsState || tiebreakWins {
existing.state = state
existing.severity = tierSeverity
existing.lastError = lastErr
}
if evalMillis > existing.lastEvaluation {
existing.lastEvaluation = evalMillis
}
aggregated[ruleID] = existing
}
}
if len(aggregated) == 0 {
return nil
}
ruleIDs := make([]int, 0, len(aggregated))
for ruleID := range aggregated {
ruleIDs = append(ruleIDs, ruleID)
}
slices.Sort(ruleIDs)
states := make([]pkgmetrics.EdgeAlertRuleState, 0, len(ruleIDs))
for _, ruleID := range ruleIDs {
state := aggregated[ruleID]
states = append(states, pkgmetrics.EdgeAlertRuleState{
RuleID: ruleID,
State: state.state,
LastEvaluation: state.lastEvaluation,
LastError: state.lastError,
})
}
return states
}
// alertStatePriority ranks an alert rule state so that competing evaluators
// sharing one rule ID can be compared: firing outranks pending, and every
// other state shares the lowest rank.
func alertStatePriority(state pkgmetrics.AlertRuleStateType) int {
	if state == pkgmetrics.AlertRuleStateFiring {
		return 2
	}
	if state == pkgmetrics.AlertRuleStatePending {
		return 1
	}
	// Anything that is neither firing nor pending ranks lowest.
	return 0
}
// tierSeverityPriority ranks a tier severity label: "critical" (2) outranks
// "warning" (1), which outranks "info" (0). Any other value, including the
// empty string, yields -1 so it ranks below every known severity.
//
// The ordering mirrors the canonical severity list in the EE alertexpr
// package. CE cannot import EE, so this copy is intentional — keep the
// values aligned if the canonical list changes.
func tierSeverityPriority(severity string) int {
	if severity == "critical" {
		return 2
	}
	if severity == "warning" {
		return 1
	}
	if severity == "info" {
		return 0
	}
	// Unknown or missing severity: rank below every known level.
	return -1
}
+80 -16
View File
@@ -3,10 +3,10 @@ package libprometheus_test
import (
"context"
"os"
"path/filepath"
"testing"
"time"
"github.com/portainer/portainer/api/filesystem"
libprom "github.com/portainer/portainer/pkg/libprometheus"
pkgmetrics "github.com/portainer/portainer/pkg/metrics"
prometheusreg "github.com/prometheus/client_golang/prometheus"
@@ -64,7 +64,7 @@ func TestReloadRules(t *testing.T) {
t.Run("valid rule file loads successfully", func(t *testing.T) {
dir := t.TempDir()
alertsFile := filepath.Join(dir, "alerts.yaml")
alertsFile := filesystem.JoinPaths(dir, "alerts.yaml")
rulesYAML := `groups:
- name: test-group
@@ -90,30 +90,38 @@ func TestReloadRules(t *testing.T) {
}
func TestExtractAlertStates(t *testing.T) {
reg := prometheusreg.NewRegistry()
db, err := libprom.NewInMemoryTSDB(reg)
require.NoError(t, err)
defer func() { require.NoError(t, db.Close()) }()
newTestRuleManager := func(t *testing.T) *rules.Manager {
t.Helper()
engine := libprom.NewEngine()
reg := prometheusreg.NewRegistry()
db, err := libprom.NewInMemoryTSDB(reg)
require.NoError(t, err)
t.Cleanup(func() {
require.NoError(t, db.Close())
})
mgr := libprom.NewRuleManager(libprom.RuleManagerConfig{
Engine: engine,
Queryable: db,
Appendable: db,
NotifyFunc: func(_ context.Context, _ string, _ ...*rules.Alert) {},
Context: context.Background(),
Registerer: reg,
})
engine := libprom.NewEngine()
return libprom.NewRuleManager(libprom.RuleManagerConfig{
Engine: engine,
Queryable: db,
Appendable: db,
NotifyFunc: func(_ context.Context, _ string, _ ...*rules.Alert) {},
Context: context.Background(),
Registerer: reg,
})
}
t.Run("no rules returns nil", func(t *testing.T) {
mgr := newTestRuleManager(t)
states := libprom.ExtractAlertStates(mgr)
assert.Nil(t, states)
})
t.Run("loaded rules return states", func(t *testing.T) {
mgr := newTestRuleManager(t)
dir := t.TempDir()
alertsFile := filepath.Join(dir, "alerts.yaml")
alertsFile := filesystem.JoinPaths(dir, "alerts.yaml")
rulesYAML := `groups:
- name: test-group
@@ -133,4 +141,60 @@ func TestExtractAlertStates(t *testing.T) {
assert.Equal(t, 7, states[0].RuleID)
assert.Equal(t, pkgmetrics.AlertRuleStateOK, states[0].State)
})
t.Run("duplicate alert_rule_id values are aggregated", func(t *testing.T) {
mgr := newTestRuleManager(t)
dir := t.TempDir()
alertsFile := filesystem.JoinPaths(dir, "alerts.yaml")
rulesYAML := `groups:
- name: test-group
rules:
- alert: CpuWarning
expr: up == 0
labels:
alert_rule_id: "42"
severity: warning
- alert: CpuCritical
expr: up >= 0
labels:
alert_rule_id: "42"
severity: critical
`
require.NoError(t, os.WriteFile(alertsFile, []byte(rulesYAML), 0o644))
require.NoError(t, libprom.ReloadRules(mgr, 15*time.Second, alertsFile))
states := libprom.ExtractAlertStates(mgr)
require.Len(t, states, 1)
assert.Equal(t, 42, states[0].RuleID)
assert.Equal(t, pkgmetrics.AlertRuleStateOK, states[0].State)
})
t.Run("states are returned in stable rule ID order", func(t *testing.T) {
mgr := newTestRuleManager(t)
dir := t.TempDir()
alertsFile := filesystem.JoinPaths(dir, "alerts.yaml")
rulesYAML := `groups:
- name: test-group
rules:
- alert: RuleTen
expr: up == 0
labels:
alert_rule_id: "10"
severity: warning
- alert: RuleThree
expr: up >= 0
labels:
alert_rule_id: "3"
severity: critical
`
require.NoError(t, os.WriteFile(alertsFile, []byte(rulesYAML), 0o644))
require.NoError(t, libprom.ReloadRules(mgr, 15*time.Second, alertsFile))
states := libprom.ExtractAlertStates(mgr)
require.Len(t, states, 2)
assert.Equal(t, 3, states[0].RuleID)
assert.Equal(t, 10, states[1].RuleID)
})
}
+3
View File
@@ -4,6 +4,8 @@ const (
// AlertRuleIDLabel is the Prometheus label key used to correlate alerts
// with their Portainer alert rule ID across agent and server packages.
AlertRuleIDLabel = "alert_rule_id"
// AlertTierLabel marks generated evaluator rules that belong to a parent tiered rule.
AlertTierLabel = "portainer_alert_tier"
ClusterCPUUsageCoresMetric = "portainer_edge_agent_cluster_cpu_usage_cores"
ClusterCPUCapacityCoresMetric = "portainer_edge_agent_cluster_cpu_capacity_cores"
@@ -99,5 +101,6 @@ type EdgeAlertRule struct {
Severity string `json:"severity"`
PromqlExpr string `json:"promql_expr"`
ForDurationMinutes int `json:"for_duration_minutes,omitempty"`
Labels map[string]string `json:"labels,omitempty"`
Annotations map[string]string `json:"annotations,omitempty"`
}