From ff169ed356b90cec6cbd4918a5b93f445a9e2ff9 Mon Sep 17 00:00:00 2001 From: RHCowan <50324595+RHCowan@users.noreply.github.com> Date: Fri, 8 May 2026 14:50:59 +1200 Subject: [PATCH] feat(alerting): expand tiered rules into per-severity evaluators with state aggregation [R8S-1003] (#2586) --- pkg/libprometheus/rules.go | 94 +++++++++++++++++++++++++++++--- pkg/libprometheus/rules_test.go | 96 +++++++++++++++++++++++++++------ pkg/metrics/types.go | 3 ++ 3 files changed, 170 insertions(+), 23 deletions(-) diff --git a/pkg/libprometheus/rules.go b/pkg/libprometheus/rules.go index 891c1ccf86..553b83b246 100644 --- a/pkg/libprometheus/rules.go +++ b/pkg/libprometheus/rules.go @@ -2,6 +2,7 @@ package libprometheus import ( "context" + "slices" "strconv" "time" @@ -50,7 +51,14 @@ func ReloadRules(mgr *rules.Manager, evalInterval time.Duration, alertsFilePath // ExtractAlertStates returns the current evaluation state for each alerting rule // managed by the given rules.Manager. func ExtractAlertStates(mgr *rules.Manager) []pkgmetrics.EdgeAlertRuleState { - var states []pkgmetrics.EdgeAlertRuleState + type aggregateState struct { + state pkgmetrics.AlertRuleStateType + lastEvaluation int64 + lastError string + severity string + } + + aggregated := make(map[int]aggregateState) for _, group := range mgr.RuleGroups() { for _, rule := range group.Rules() { @@ -82,14 +90,86 @@ func ExtractAlertStates(mgr *rules.Manager) []pkgmetrics.EdgeAlertRuleState { lastErr = alertRule.LastError().Error() } - states = append(states, pkgmetrics.EdgeAlertRuleState{ - RuleID: ruleID, - State: state, - LastEvaluation: alertRule.GetEvaluationTimestamp().UnixMilli(), - LastError: lastErr, - }) + tierSeverity := alertRule.Labels().Get(pkgmetrics.AlertTierLabel) + statePriority := alertStatePriority(state) + evalMillis := alertRule.GetEvaluationTimestamp().UnixMilli() + + existing, exists := aggregated[ruleID] + if !exists { + aggregated[ruleID] = aggregateState{ + state: state, + lastEvaluation: evalMillis, + lastError: lastErr, + severity: tierSeverity, + } + continue + } + + existingStatePriority := alertStatePriority(existing.state) + winsState := statePriority > existingStatePriority + tiebreakWins := statePriority == existingStatePriority && tierSeverityPriority(tierSeverity) > tierSeverityPriority(existing.severity) + + if winsState || tiebreakWins { + existing.state = state + existing.severity = tierSeverity + existing.lastError = lastErr + } + + if evalMillis > existing.lastEvaluation { + existing.lastEvaluation = evalMillis + } + + aggregated[ruleID] = existing } } + if len(aggregated) == 0 { + return nil + } + + ruleIDs := make([]int, 0, len(aggregated)) + for ruleID := range aggregated { + ruleIDs = append(ruleIDs, ruleID) + } + slices.Sort(ruleIDs) + + states := make([]pkgmetrics.EdgeAlertRuleState, 0, len(ruleIDs)) + for _, ruleID := range ruleIDs { + state := aggregated[ruleID] + states = append(states, pkgmetrics.EdgeAlertRuleState{ + RuleID: ruleID, + State: state.state, + LastEvaluation: state.lastEvaluation, + LastError: state.lastError, + }) + } + return states } + +func alertStatePriority(state pkgmetrics.AlertRuleStateType) int { + switch state { + case pkgmetrics.AlertRuleStateFiring: + return 2 + case pkgmetrics.AlertRuleStatePending: + return 1 + default: + return 0 + } +} + +// tierSeverityPriority mirrors the canonical severity ordering from the EE +// alertexpr package. CE cannot import EE, so this is a +// deliberate duplicate — keep the values aligned if the canonical list changes. +func tierSeverityPriority(severity string) int { + switch severity { + case "critical": + return 2 + case "warning": + return 1 + case "info": + return 0 + default: + return -1 + } +} diff --git a/pkg/libprometheus/rules_test.go b/pkg/libprometheus/rules_test.go index ce81458072..f0b29241d9 100644 --- a/pkg/libprometheus/rules_test.go +++ b/pkg/libprometheus/rules_test.go @@ -3,10 +3,10 @@ package libprometheus_test import ( "context" "os" - "path/filepath" "testing" "time" + "github.com/portainer/portainer/api/filesystem" libprom "github.com/portainer/portainer/pkg/libprometheus" pkgmetrics "github.com/portainer/portainer/pkg/metrics" prometheusreg "github.com/prometheus/client_golang/prometheus" @@ -64,7 +64,7 @@ func TestReloadRules(t *testing.T) { t.Run("valid rule file loads successfully", func(t *testing.T) { dir := t.TempDir() - alertsFile := filepath.Join(dir, "alerts.yaml") + alertsFile := filesystem.JoinPaths(dir, "alerts.yaml") rulesYAML := `groups: - name: test-group @@ -90,30 +90,38 @@ func TestReloadRules(t *testing.T) { } func TestExtractAlertStates(t *testing.T) { - reg := prometheusreg.NewRegistry() - db, err := libprom.NewInMemoryTSDB(reg) - require.NoError(t, err) - defer func() { require.NoError(t, db.Close()) }() + newTestRuleManager := func(t *testing.T) *rules.Manager { + t.Helper() - engine := libprom.NewEngine() + reg := prometheusreg.NewRegistry() + db, err := libprom.NewInMemoryTSDB(reg) + require.NoError(t, err) + t.Cleanup(func() { + require.NoError(t, db.Close()) + }) - mgr := libprom.NewRuleManager(libprom.RuleManagerConfig{ - Engine: engine, - Queryable: db, - Appendable: db, - NotifyFunc: func(_ context.Context, _ string, _ ...*rules.Alert) {}, - Context: context.Background(), - Registerer: reg, - }) + engine := libprom.NewEngine() + + return libprom.NewRuleManager(libprom.RuleManagerConfig{ + Engine: engine, + Queryable: db, + Appendable: db, + NotifyFunc: func(_ context.Context, _ string, _ ...*rules.Alert) {}, + Context: context.Background(), + Registerer: reg, + }) + } t.Run("no rules returns nil", func(t *testing.T) { + mgr := newTestRuleManager(t) states := libprom.ExtractAlertStates(mgr) assert.Nil(t, states) }) t.Run("loaded rules return states", func(t *testing.T) { + mgr := newTestRuleManager(t) dir := t.TempDir() - alertsFile := filepath.Join(dir, "alerts.yaml") + alertsFile := filesystem.JoinPaths(dir, "alerts.yaml") rulesYAML := `groups: - name: test-group @@ -133,4 +141,60 @@ func TestExtractAlertStates(t *testing.T) { assert.Equal(t, 7, states[0].RuleID) assert.Equal(t, pkgmetrics.AlertRuleStateOK, states[0].State) }) + + t.Run("duplicate alert_rule_id values are aggregated", func(t *testing.T) { + mgr := newTestRuleManager(t) + dir := t.TempDir() + alertsFile := filesystem.JoinPaths(dir, "alerts.yaml") + + rulesYAML := `groups: + - name: test-group + rules: + - alert: CpuWarning + expr: up == 0 + labels: + alert_rule_id: "42" + severity: warning + - alert: CpuCritical + expr: up >= 0 + labels: + alert_rule_id: "42" + severity: critical +` + require.NoError(t, os.WriteFile(alertsFile, []byte(rulesYAML), 0o644)) + require.NoError(t, libprom.ReloadRules(mgr, 15*time.Second, alertsFile)) + + states := libprom.ExtractAlertStates(mgr) + require.Len(t, states, 1) + assert.Equal(t, 42, states[0].RuleID) + assert.Equal(t, pkgmetrics.AlertRuleStateOK, states[0].State) + }) + + t.Run("states are returned in stable rule ID order", func(t *testing.T) { + mgr := newTestRuleManager(t) + dir := t.TempDir() + alertsFile := filesystem.JoinPaths(dir, "alerts.yaml") + + rulesYAML := `groups: + - name: test-group + rules: + - alert: RuleTen + expr: up == 0 + labels: + alert_rule_id: "10" + severity: warning + - alert: RuleThree + expr: up >= 0 + labels: + alert_rule_id: "3" + severity: critical +` + require.NoError(t, os.WriteFile(alertsFile, []byte(rulesYAML), 0o644)) + require.NoError(t, libprom.ReloadRules(mgr, 15*time.Second, alertsFile)) + + states := libprom.ExtractAlertStates(mgr) + require.Len(t, states, 2) + assert.Equal(t, 3, states[0].RuleID) + assert.Equal(t, 10, states[1].RuleID) + }) } diff --git a/pkg/metrics/types.go b/pkg/metrics/types.go index a9433f4fcb..2033ae2e54 100644 --- a/pkg/metrics/types.go +++ b/pkg/metrics/types.go @@ -4,6 +4,8 @@ const ( // AlertRuleIDLabel is the Prometheus label key used to correlate alerts // with their Portainer alert rule ID across agent and server packages. AlertRuleIDLabel = "alert_rule_id" + // AlertTierLabel marks generated evaluator rules that belong to a parent tiered rule. + AlertTierLabel = "portainer_alert_tier" ClusterCPUUsageCoresMetric = "portainer_edge_agent_cluster_cpu_usage_cores" ClusterCPUCapacityCoresMetric = "portainer_edge_agent_cluster_cpu_capacity_cores" @@ -99,5 +101,6 @@ type EdgeAlertRule struct { Severity string `json:"severity"` PromqlExpr string `json:"promql_expr"` ForDurationMinutes int `json:"for_duration_minutes,omitempty"` + Labels map[string]string `json:"labels,omitempty"` Annotations map[string]string `json:"annotations,omitempty"` }