feat: disk-fill default rule + mountUsedPercent metric expression (#4713)

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Amir Raminfar
2026-05-19 08:46:54 -07:00
committed by GitHub
parent 1c7e804138
commit 0201f813ba
25 changed files with 254 additions and 21 deletions
+27 -10
View File
@@ -35,6 +35,8 @@ const i18n = createI18n({
"oom-desc": "Fires when Docker reports an out-of-memory kill.",
restart: "Container restarted",
"restart-desc": "Off by default — noisy on its own; Cloud also uses this for loop detection.",
disk: "Disk space running low on any volume",
"disk-desc": "Fires when any mounted volume is over 85% full.",
},
},
},
@@ -98,23 +100,38 @@ describe("<WelcomeModal /> Create First Alert", () => {
await flushPromises();
const ruleCalls = fetchMock.mock.calls.filter((c) => String(c[0]).includes("/api/notifications/rules"));
expect(ruleCalls).toHaveLength(3); // exited + unhealthy + oom on by default; restart off
expect(ruleCalls).toHaveLength(4); // exited + unhealthy + oom + disk on by default; restart off
const expressions = ruleCalls.map((c) => JSON.parse((c[1] as RequestInit).body as string).eventExpression);
expect(expressions).toContain('name == "die" && attributes["exitCode"] != "0"');
expect(expressions).toContain('name == "health_status" && attributes["healthStatus"] == "unhealthy"');
expect(expressions).toContain('name == "oom"');
expect(expressions).not.toContain('name == "restart"');
const bodies = ruleCalls.map((c) => JSON.parse((c[1] as RequestInit).body as string));
const eventExpressions = bodies.map((b) => b.eventExpression).filter(Boolean);
expect(eventExpressions).toContain('name == "die" && attributes["exitCode"] != "0"');
expect(eventExpressions).toContain('name == "health_status" && attributes["healthStatus"] == "unhealthy"');
expect(eventExpressions).toContain('name == "oom"');
expect(eventExpressions).not.toContain('name == "restart"');
// every POST uses cloud dispatcher id
for (const c of ruleCalls) {
const body = JSON.parse((c[1] as RequestInit).body as string);
expect(body).toMatchObject({
const metricExpressions = bodies.map((b) => b.metricExpression).filter(Boolean);
expect(metricExpressions).toContain("any(mounts, .usedPercent >= 85)");
// disk rule should carry its own cooldown/sampleWindow; event rules should remain at 0
const diskBody = bodies.find((b) => b.metricExpression === "any(mounts, .usedPercent >= 85)");
expect(diskBody).toMatchObject({
enabled: true,
dispatcherId: 7,
cooldown: 3600,
sampleWindow: 60,
containerExpression: "true",
eventExpression: "",
});
// event-based POSTs use cloud dispatcher id with no cooldown
for (const b of bodies.filter((x) => x.eventExpression)) {
expect(b).toMatchObject({
enabled: true,
dispatcherId: 7,
cooldown: 0,
sampleWindow: 0,
containerExpression: "true",
metricExpression: "",
});
}
+22 -5
View File
@@ -110,10 +110,12 @@ const chipOptions = [
{ value: "something_else", label: t("cloud.welcome.chip-other") },
];
type SignalKey = "exited" | "unhealthy" | "oom" | "restart";
type SignalKey = "exited" | "unhealthy" | "oom" | "restart" | "disk";
type SignalKind = "event" | "metric";
interface SignalDef {
key: SignalKey;
kind: SignalKind;
label: string;
description: string;
// ruleName is intentionally English/stable so the rule stays recognizable
@@ -126,6 +128,7 @@ interface SignalDef {
const signals = computed<SignalDef[]>(() => [
{
key: "exited",
kind: "event",
label: t("cloud.welcome.signals.exited"),
description: t("cloud.welcome.signals.exited-desc"),
ruleName: "Container exited with an error",
@@ -134,6 +137,7 @@ const signals = computed<SignalDef[]>(() => [
},
{
key: "unhealthy",
kind: "event",
label: t("cloud.welcome.signals.unhealthy"),
description: t("cloud.welcome.signals.unhealthy-desc"),
ruleName: "Container became unhealthy",
@@ -142,6 +146,7 @@ const signals = computed<SignalDef[]>(() => [
},
{
key: "oom",
kind: "event",
label: t("cloud.welcome.signals.oom"),
description: t("cloud.welcome.signals.oom-desc"),
ruleName: "Container killed (OOM)",
@@ -150,12 +155,22 @@ const signals = computed<SignalDef[]>(() => [
},
{
key: "restart",
kind: "event",
label: t("cloud.welcome.signals.restart"),
description: t("cloud.welcome.signals.restart-desc"),
ruleName: "Container restarted",
expression: 'name == "restart"',
defaultOn: false,
},
{
key: "disk",
kind: "metric",
label: t("cloud.welcome.signals.disk"),
description: t("cloud.welcome.signals.disk-desc"),
ruleName: "Volume running out of space",
expression: "any(mounts, .usedPercent >= 85)",
defaultOn: true,
},
]);
const selectedSignals = ref<SignalKey[]>([]);
@@ -241,10 +256,12 @@ async function createDefaultAlerts() {
dispatcherId: cloud.id,
logExpression: "",
containerExpression: "true",
eventExpression: signal.expression,
metricExpression: "",
cooldown: 0,
sampleWindow: 0,
eventExpression: signal.kind === "event" ? signal.expression : "",
metricExpression: signal.kind === "metric" ? signal.expression : "",
// Metric alerts: don't re-fire more than once an hour per container,
// and evaluate the threshold over an explicit 60-second sample window.
cooldown: signal.kind === "metric" ? 3600 : 0,
sampleWindow: signal.kind === "metric" ? 60 : 0,
}),
}).then((res) => {
if (!res.ok) throw new Error("rule POST failed");
+11
View File
@@ -80,6 +80,11 @@ export function createMetricHints(): Completion[] {
{ label: "cpu", detail: "CPU usage percent", type: "property" },
{ label: "memory", detail: "memory usage percent", type: "property" },
{ label: "memoryUsage", detail: "memory usage bytes", type: "property" },
{ label: "mounts", detail: "list of container mounts with free-space info", type: "property" },
{ label: ".usedPercent", detail: "mount field: % of mount used", type: "property" },
{ label: ".availableBytes", detail: "mount field: free bytes on mount", type: "property" },
{ label: ".destination", detail: "mount field: in-container mount path", type: "property" },
{ label: "any(mounts, ...)", detail: "true if any mount matches the predicate", type: "keyword" },
...exprOperators,
{ label: ">", detail: "greater than", type: "operator" },
{ label: "<", detail: "less than", type: "operator" },
@@ -88,6 +93,12 @@ export function createMetricHints(): Completion[] {
{ label: "cpu > 80", detail: "CPU over 80%", type: "text", boost: 10 },
{ label: "memory > 90", detail: "memory over 90%", type: "text", boost: 10 },
{ label: "cpu > 80 || memory > 90", detail: "CPU or memory high", type: "text", boost: 10 },
{
label: "any(mounts, .usedPercent >= 85)",
detail: "alert when any mount is over 85% full",
type: "text",
boost: 10,
},
];
}
+1 -1
View File
@@ -112,7 +112,7 @@ func (m *MockedClientService) UpdateContainer(ctx context.Context, c container.C
var wantedContainer = container.Container{}
func init() {
faker.FakeData(&wantedContainer, options.WithFieldsToIgnore("Stats", "MountStats"))
faker.FakeData(&wantedContainer, options.WithFieldsToIgnore("Stats", "MountStats", "Ports"))
wantedContainer.FinishedAt = wantedContainer.FinishedAt.UTC()
wantedContainer.Created = wantedContainer.Created.UTC()
wantedContainer.StartedAt = wantedContainer.StartedAt.UTC()
+1
View File
@@ -118,6 +118,7 @@ func (m *Manager) processStatEvent(event *ContainerStatEvent) {
CPUPercent: event.Stat.CPUPercent,
MemoryPercent: event.Stat.MemoryPercent,
MemoryUsage: event.Stat.MemoryUsage,
Mounts: FromContainerMounts(event.Container),
}
notificationContainer := FromContainerModel(event.Container, event.Host)
+5 -1
View File
@@ -40,7 +40,11 @@ func NewContainerStatsListener(ctx context.Context, clients []container_support.
clients: clients,
channel: make(chan *ContainerStatEvent, 1000),
parentCtx: ctx,
cache: NewTTLCache[string, containerInfo](ctx, 30*time.Second),
// 5s TTL: the cache exists to avoid re-resolving the container+host on every
// per-second stat tick, but mount free-space (Container.MountStats) is refreshed
// out-of-band by the volume monitor and we want metric expressions that read
// `mounts[*].usedPercent` to see fresh values within a few seconds.
cache: NewTTLCache[string, containerInfo](ctx, 5*time.Second),
}
}
+31 -1
View File
@@ -21,6 +21,36 @@ func isDozzleContainer(c container.Container) bool {
return strings.Contains(c.Image, "amir20/dozzle")
}
// FromContainerMounts builds the `mounts` slice that metric expressions read
// from the container's MountStats map.
//
// An entry is left out when its Available flag is false (e.g. Windows volumes
// or permission errors) or when its Total is zero, so an expression such as
// `any(mounts, .usedPercent >= 85)` is only evaluated against mounts that have
// a usable size and the percentage never divides by zero.
//
// Returns nil when the container has no MountStats entries; otherwise a slice
// that may be empty if every entry was filtered out. MountStats is a map, so
// the order of the returned slice follows Go's map iteration order and is not
// stable between calls.
func FromContainerMounts(c container.Container) []types.NotificationMount {
	count := len(c.MountStats)
	if count == 0 {
		return nil
	}

	result := make([]types.NotificationMount, 0, count)
	for _, stat := range c.MountStats {
		measurable := stat.Available && stat.Total != 0
		if !measurable {
			continue
		}

		// A Used value of zero is treated as "not reported" and is derived from
		// Total - Free instead. The Free <= Total guard keeps the unsigned
		// subtraction from wrapping around.
		usedBytes := stat.Used
		if usedBytes == 0 && stat.Free <= stat.Total {
			usedBytes = stat.Total - stat.Free
		}

		entry := types.NotificationMount{
			Destination:    stat.Destination,
			TotalBytes:     stat.Total,
			FreeBytes:      stat.Free,
			UsedBytes:      usedBytes,
			UsedPercent:    float64(usedBytes) / float64(stat.Total) * 100.0,
			AvailableBytes: stat.Free, // same value as FreeBytes
		}
		result = append(result, entry)
	}

	return result
}
// FromContainerModel converts internal container.Container to types.NotificationContainer
func FromContainerModel(c container.Container, host container.Host) types.NotificationContainer {
return types.NotificationContainer{
@@ -96,7 +126,7 @@ type Subscription struct {
ContainerExpression string `json:"containerExpression" yaml:"containerExpression"`
MetricExpression string `json:"metricExpression,omitempty" yaml:"metricExpression,omitempty"`
EventExpression string `json:"eventExpression,omitempty" yaml:"eventExpression,omitempty"`
Cooldown int `json:"cooldown,omitempty" yaml:"cooldown,omitempty"` // seconds between metric notifications, default 300
Cooldown int `json:"cooldown,omitempty" yaml:"cooldown,omitempty"` // seconds between metric notifications, default 300
SampleWindow int `json:"sampleWindow,omitempty" yaml:"sampleWindow,omitempty"` // seconds of samples to evaluate, default 15
// Compiled filter expressions
+106
View File
@@ -433,3 +433,109 @@ func TestFromLogEvent_OrderedMapConversion(t *testing.T) {
})
}
}
// TestSubscription_MatchesMetric_Mounts compiles metric expressions that read
// the `mounts` field against types.NotificationStat and checks what
// Subscription.MatchesMetric returns for each sample stat.
func TestSubscription_MatchesMetric_Mounts(t *testing.T) {
	// Threshold expression shared by the first three cases.
	const overThreshold = `any(mounts, .usedPercent >= 85)`

	type mountCase struct {
		name       string
		expression string
		stat       types.NotificationStat
		want       bool
	}

	cases := []mountCase{
		{
			name:       "any mount over 85 percent matches",
			expression: overThreshold,
			stat: types.NotificationStat{
				Mounts: []types.NotificationMount{
					{Destination: "/data", TotalBytes: 100, UsedBytes: 50, FreeBytes: 50, UsedPercent: 50},
					{Destination: "/logs", TotalBytes: 100, UsedBytes: 90, FreeBytes: 10, UsedPercent: 90},
				},
			},
			want: true,
		},
		{
			name:       "no mount over 85 percent does not match",
			expression: overThreshold,
			stat: types.NotificationStat{
				Mounts: []types.NotificationMount{
					{Destination: "/data", TotalBytes: 100, UsedBytes: 50, FreeBytes: 50, UsedPercent: 50},
					{Destination: "/logs", TotalBytes: 100, UsedBytes: 80, FreeBytes: 20, UsedPercent: 80},
				},
			},
			want: false,
		},
		{
			// Zero-value stat: Mounts is nil.
			name:       "empty mounts does not match",
			expression: overThreshold,
			stat:       types.NotificationStat{},
			want:       false,
		},
		{
			name:       "available bytes filter",
			expression: `any(mounts, .availableBytes < 1024)`,
			stat: types.NotificationStat{
				Mounts: []types.NotificationMount{
					{Destination: "/data", FreeBytes: 500, AvailableBytes: 500},
				},
			},
			want: true,
		},
		{
			// CPU is below its threshold, so only the mount clause can match.
			name:       "combined cpu and mount expression",
			expression: `cpu > 80 || any(mounts, .usedPercent >= 85)`,
			stat: types.NotificationStat{
				CPUPercent: 10,
				Mounts: []types.NotificationMount{
					{Destination: "/data", UsedPercent: 95},
				},
			},
			want: true,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			compiled, err := expr.Compile(tc.expression, expr.Env(types.NotificationStat{}))
			require.NoError(t, err, "failed to compile expression")

			sub := &Subscription{
				MetricExpression: tc.expression,
				MetricProgram:    compiled,
			}

			assert.Equal(t, tc.want, sub.MatchesMetric(tc.stat))
		})
	}
}
// TestFromContainerMounts covers the three behaviours of FromContainerMounts:
// filtering out unmeasurable mounts, deriving used bytes when Used is zero,
// and returning nil for a container with no mount stats.
func TestFromContainerMounts(t *testing.T) {
	t.Run("skips unavailable mounts", func(t *testing.T) {
		stats := map[string]container.MountStat{
			"/data": {Destination: "/data", Total: 100, Used: 80, Free: 20, Available: true},
			"/win":  {Destination: "/win", Total: 0, Available: false},
		}

		mounts := FromContainerMounts(container.Container{MountStats: stats})
		require.Len(t, mounts, 1)

		only := mounts[0]
		assert.Equal(t, "/data", only.Destination)
		assert.InDelta(t, 80.0, only.UsedPercent, 0.01)
		assert.Equal(t, uint64(20), only.AvailableBytes)
	})

	t.Run("derives used from total minus free", func(t *testing.T) {
		// Used is zero here, so the converter should fall back to Total - Free.
		stats := map[string]container.MountStat{
			"/data": {Destination: "/data", Total: 100, Used: 0, Free: 25, Available: true},
		}

		mounts := FromContainerMounts(container.Container{MountStats: stats})
		require.Len(t, mounts, 1)

		only := mounts[0]
		assert.Equal(t, uint64(75), only.UsedBytes)
		assert.InDelta(t, 75.0, only.UsedPercent, 0.01)
	})

	t.Run("nil for empty input", func(t *testing.T) {
		var empty container.Container
		assert.Nil(t, FromContainerMounts(empty))
	})
}
+2
View File
@@ -344,6 +344,8 @@ cloud:
oom-desc: "Udløses, når Docker rapporterer et out-of-memory-kill."
restart: "Container genstartede"
restart-desc: "Slået fra som standard — støjende alene; Cloud bruger det også til loop-detektion."
disk: "Diskplads på et volumen er ved at slippe op"
disk-desc: "Udløses, når et monteret volumen er over 85% fyldt."
create-alerts: "Slå valgte signaler til"
later: "Det gør jeg senere"
cloud-search:
+2
View File
@@ -344,6 +344,8 @@ cloud:
oom-desc: "Wird ausgelöst, wenn Docker einen Out-of-Memory-Kill meldet."
restart: "Container neu gestartet"
restart-desc: "Standardmäßig aus — für sich allein laut; Cloud nutzt es auch zur Loop-Erkennung."
disk: "Speicherplatz auf einem Volume wird knapp"
disk-desc: "Wird ausgelöst, wenn ein eingebundenes Volume zu über 85% belegt ist."
create-alerts: "Ausgewählte Signale aktivieren"
later: "Das mache ich später"
cloud-search:
+2
View File
@@ -385,6 +385,8 @@ cloud:
oom-desc: "Fires when Docker reports an out-of-memory kill."
restart: Container restarted
restart-desc: "Off by default — noisy on its own; Cloud also uses this for loop detection."
disk: Disk space running low on any volume
disk-desc: "Fires when any mounted volume is over 85% full."
create-alert: Create Your First Alert
create-alerts: Turn on selected signals
later: "I'll do this later"
+2
View File
@@ -372,5 +372,7 @@ cloud:
oom-desc: "Se dispara cuando Docker reporta un kill por falta de memoria."
restart: "Contenedor reiniciado"
restart-desc: "Desactivado por defecto — ruidoso por sí solo; Cloud también lo usa para detectar bucles."
disk: "Espacio en disco bajo en algún volumen"
disk-desc: "Se dispara cuando algún volumen montado supera el 85% de uso."
create-alerts: "Activar señales seleccionadas"
later: "Lo haré después"
+2
View File
@@ -344,6 +344,8 @@ cloud:
oom-desc: "Se déclenche quand Docker signale un kill par manque de mémoire."
restart: "Conteneur redémarré"
restart-desc: "Désactivé par défaut — bruyant seul ; Cloud l'utilise aussi pour détecter les boucles."
disk: "Espace disque faible sur un volume"
disk-desc: "Se déclenche quand un volume monté est rempli à plus de 85%."
create-alerts: "Activer les signaux sélectionnés"
later: "Je ferai ça plus tard"
cloud-search:
+2
View File
@@ -356,6 +356,8 @@ cloud:
oom-desc: "Dipicu ketika Docker melaporkan kill karena kehabisan memori."
restart: "Kontainer dimulai ulang"
restart-desc: "Mati secara default — berisik sendirian; Cloud juga memakainya untuk deteksi loop."
disk: "Ruang disk hampir habis di salah satu volume"
disk-desc: "Dipicu ketika ada volume yang terpasang lebih dari 85% terisi."
create-alerts: "Aktifkan sinyal terpilih"
later: "Nanti saja"
cloud-search:
+2
View File
@@ -344,6 +344,8 @@ cloud:
oom-desc: "Si attiva quando Docker segnala un kill per esaurimento memoria."
restart: "Container riavviato"
restart-desc: "Disattivo di default — rumoroso da solo; Cloud lo usa anche per rilevare loop."
disk: "Spazio su disco in esaurimento su un volume"
disk-desc: "Si attiva quando un volume montato supera l'85% di utilizzo."
create-alerts: "Attiva i segnali selezionati"
later: "Lo farò dopo"
cloud-search:
+2
View File
@@ -347,6 +347,8 @@ cloud:
oom-desc: "Docker가 메모리 부족 종료를 보고할 때 발동합니다."
restart: "컨테이너 재시작됨"
restart-desc: "기본 꺼짐 — 단독으로는 시끄러움; Cloud는 루프 감지에도 사용합니다."
disk: "볼륨의 디스크 공간 부족"
disk-desc: "마운트된 볼륨의 사용량이 85%를 초과하면 발동합니다."
create-alerts: "선택한 신호 켜기"
later: "나중에 할게요"
cloud-search:
+2
View File
@@ -345,6 +345,8 @@ cloud:
oom-desc: "Wordt geactiveerd wanneer Docker een out-of-memory kill meldt."
restart: "Container herstart"
restart-desc: "Standaard uit — luidruchtig op zichzelf; Cloud gebruikt het ook voor loopdetectie."
disk: "Schijfruimte raakt op op een volume"
disk-desc: "Wordt geactiveerd wanneer een aangekoppeld volume voor meer dan 85% vol is."
create-alerts: "Geselecteerde signalen aanzetten"
later: "Dat doe ik later"
cloud-search:
+2
View File
@@ -351,6 +351,8 @@ cloud:
oom-desc: "Wyzwala się, gdy Docker zgłasza zabicie z powodu braku pamięci."
restart: "Kontener zrestartowany"
restart-desc: "Domyślnie wyłączone — głośne samo w sobie; Cloud używa tego też do wykrywania pętli."
disk: "Mało miejsca na dysku na jednym z wolumenów"
disk-desc: "Wyzwala się, gdy jakikolwiek zamontowany wolumen jest zapełniony w ponad 85%."
create-alerts: "Włącz wybrane sygnały"
later: "Zrobię to później"
cloud-search:
+2
View File
@@ -343,6 +343,8 @@ cloud:
oom-desc: "É disparado quando o Docker reporta um kill por falta de memória."
restart: "Contentor reiniciado"
restart-desc: "Desativado por omissão — ruidoso por si só; o Cloud também o usa para deteção de loops."
disk: "Espaço em disco a esgotar-se num volume"
disk-desc: "É disparado quando algum volume montado está com mais de 85% de utilização."
create-alerts: "Ativar sinais selecionados"
later: "Farei isso depois"
cloud-search:
+2
View File
@@ -344,6 +344,8 @@ cloud:
oom-desc: "Срабатывает, когда Docker сообщает о завершении из-за нехватки памяти."
restart: "Контейнер перезапущен"
restart-desc: "Выключено по умолчанию — шумно само по себе; Cloud также использует это для обнаружения циклов."
disk: "Заканчивается место на одном из томов"
disk-desc: "Срабатывает, когда любой смонтированный том заполнен более чем на 85%."
create-alerts: "Включить выбранные сигналы"
later: "Сделаю это позже"
cloud-search:
+2
View File
@@ -349,6 +349,8 @@ cloud:
oom-desc: "Sproži se, ko Docker poroča o prekinitvi zaradi pomanjkanja pomnilnika."
restart: "Vsebnik znova zagnan"
restart-desc: "Privzeto izklopljeno — sam po sebi hrupen; Cloud ga uporablja tudi za zaznavanje zank."
disk: "Na enem od nosilcev zmanjkuje prostora na disku"
disk-desc: "Sproži se, ko je kateri koli priklopljen nosilec napolnjen več kot 85%."
create-alerts: "Vklopi izbrane signale"
later: "To bom naredil pozneje"
cloud-search:
+2
View File
@@ -344,6 +344,8 @@ cloud:
oom-desc: "Docker bellek yetersizliği nedeniyle bir sonlandırma bildirdiğinde tetiklenir."
restart: "Container yeniden başlatıldı"
restart-desc: "Varsayılan olarak kapalı — tek başına gürültülü; Cloud bunu döngü tespiti için de kullanır."
disk: "Bir bağlamada disk alanı azalıyor"
disk-desc: "Bağlı bir birim %85 doluluğu aştığında tetiklenir."
create-alerts: "Seçili sinyalleri aç"
later: "Bunu daha sonra yapacağım"
cloud-search:
+2
View File
@@ -347,6 +347,8 @@ cloud:
oom-desc: "當 Docker 回報記憶體不足而終止時觸發。"
restart: "容器已重新啟動"
restart-desc: "預設關閉 — 單獨使用會很吵;Cloud 也用它來偵測循環。"
disk: "某個磁碟區的磁碟空間即將用盡"
disk-desc: "當任一掛載的磁碟區使用率超過 85% 時觸發。"
create-alerts: "啟用所選訊號"
later: "稍後再說"
cloud-search:
+2
View File
@@ -344,6 +344,8 @@ cloud:
oom-desc: "当 Docker 报告内存不足终止时触发。"
restart: "容器已重启"
restart-desc: "默认关闭 — 单独使用会很吵;Cloud 也用它来检测循环。"
disk: "某个卷的磁盘空间即将耗尽"
disk-desc: "当任一已挂载的卷使用率超过 85% 时触发。"
create-alerts: "启用所选信号"
later: "稍后再说"
cloud-search:
+18 -3
View File
@@ -48,9 +48,24 @@ type NotificationLog struct {
// NotificationStat represents container resource metrics for metric-based alerts
type NotificationStat struct {
CPUPercent float64 `json:"cpu" expr:"cpu"`
MemoryPercent float64 `json:"memory" expr:"memory"`
MemoryUsage float64 `json:"memoryUsage" expr:"memoryUsage"`
CPUPercent float64 `json:"cpu" expr:"cpu"`
MemoryPercent float64 `json:"memory" expr:"memory"`
MemoryUsage float64 `json:"memoryUsage" expr:"memoryUsage"`
Mounts []NotificationMount `json:"mounts,omitempty" expr:"mounts"`
}
// NotificationMount represents a single container mount's free-space stats,
// exposed to metric expressions via the `mounts` field (e.g. `any(mounts, .usedPercent >= 85)`).
// Only mounts where free-space reporting succeeded (Available == true on the source MountStat)
// are included — mounts that can't be measured (Windows volumes, permission errors) are skipped
// so they never trigger or suppress an alert spuriously.
//
// Values are populated by FromContainerMounts, which also drops mounts whose
// reported total size is zero. The `expr` tags are the field names used inside
// a metric expression predicate (e.g. `.usedPercent`, `.availableBytes`).
type NotificationMount struct {
// Destination is the in-container mount path.
Destination string `json:"destination" expr:"destination"`
// TotalBytes is copied from the source MountStat's Total.
TotalBytes uint64 `json:"totalBytes" expr:"totalBytes"`
// FreeBytes is copied from the source MountStat's Free.
FreeBytes uint64 `json:"freeBytes" expr:"freeBytes"`
// UsedBytes is the source MountStat's Used, or Total - Free when Used is
// reported as zero and Free does not exceed Total.
UsedBytes uint64 `json:"usedBytes" expr:"usedBytes"`
// UsedPercent is UsedBytes / TotalBytes * 100.
UsedPercent float64 `json:"usedPercent" expr:"usedPercent"`
AvailableBytes uint64 `json:"availableBytes" expr:"availableBytes"` // alias of FreeBytes for expression ergonomics
}
// NotificationEvent represents a Docker container lifecycle event for event-based alerts