diff --git a/docs/adr/0009-confirmation-threshold-write-time-backfill.md b/docs/adr/0009-confirmation-threshold-write-time-backfill.md index ed201849..455369ca 100644 --- a/docs/adr/0009-confirmation-threshold-write-time-backfill.md +++ b/docs/adr/0009-confirmation-threshold-write-time-backfill.md @@ -8,6 +8,8 @@ Damping happens at **write time**, not read time. The rejected alternative — r The flip is **retroactive**, not forward-only. Forward-only (first N−1 failing samples stay `UP` forever, `DOWN` begins at confirmation) is simpler — history would stay append-only — but it systematically shaves N−1 minutes off the front of every real outage, permanently flattering uptime and misreporting outage start times. Retroactive backfill rewrites a bounded window (at most threshold−1 rows, one `UPDATE`) at the moment of crossing; it also makes the threshold compose with the alert Failure Threshold by `max` rather than `+`, because once backfilled the alert lookback sees N consecutive confirmed rows immediately. The cost accepted: `monitoring_data` is no longer append-only over the pending window, so the last-value cache and any reader that observed those rows mid-window saw the pre-flip side until convergence. +A held (pending) row keeps the check's real measured latency and observed error text — no diagnostic information is discarded — tagged with `| Status held during grace period`. On confirmation the backfill preserves that text and appends `| Down confirmed after N consecutive checks` (rather than overwriting it); a confirmed recovery clears the error, since the row becomes the UP side. `raw_status` remains the canonical record of what each check observed. + Deliberately accepted boundaries, chosen for one uniform rule rather than special cases: - **Count, not minutes.** The threshold is consecutive observations, well-defined for any cron, matching the alerting thresholds' unit; for the common every-minute cron, count and minutes coincide. diff --git a/src/lib/server/db/repositories/monitoring.ts b/src/lib/server/db/repositories/monitoring.ts index 3e12b3b8..9a5decf4 100644 --- a/src/lib/server/db/repositories/monitoring.ts +++ b/src/lib/server/db/repositories/monitoring.ts @@ -362,14 +362,45 @@ export class MonitoringRepository extends BaseRepository { message: string | null, ): Promise { if (timestamps.length === 0) return 0; - return await this.knex("monitoring_data") + + // Recovery (confirmed UP): rows become the UP side — clear any held error text in one update. + if (message === null) { + return await this.knex("monitoring_data") + .where("monitor_tag", monitor_tag) + .whereIn("timestamp", timestamps) + .whereNotNull("raw_status") + .update({ + status: this.knex.ref("raw_status"), + error_message: null, + }); + } + + // Confirmed unhealthy: set each row's status from its observed raw_status and APPEND the + // confirmation note to the existing error text (preserving the observed failure reason). + // Done per-row for portable string concatenation (|| vs CONCAT differ across SQLite/PG/MySQL) + // and to stay idempotent if the backfill is ever replayed. + const rows = await this.knex("monitoring_data") + .select("timestamp", "error_message", "raw_status") .where("monitor_tag", monitor_tag) .whereIn("timestamp", timestamps) - .whereNotNull("raw_status") - .update({ - status: this.knex.ref("raw_status"), - error_message: message, - }); + .whereNotNull("raw_status"); + + let updated = 0; + for (const row of rows) { + const existing: string | null = row.error_message; + let nextMessage: string; + if (!existing) { + nextMessage = message; + } else if (existing.indexOf(message) !== -1) { + nextMessage = existing; // already appended — keep idempotent + } else { + nextMessage = `${existing} | ${message}`; + } + updated += await this.knex("monitoring_data") + .where({ monitor_tag, timestamp: row.timestamp }) + .update({ status: row.raw_status, error_message: nextMessage }); + } + return updated; } async updateMonitoringData( diff --git a/src/lib/server/queues/monitorExecuteQueue.ts b/src/lib/server/queues/monitorExecuteQueue.ts index 579e7ea1..52db457d 100644 --- a/src/lib/server/queues/monitorExecuteQueue.ts +++ b/src/lib/server/queues/monitorExecuteQueue.ts @@ -140,10 +140,13 @@ const addWorker = () => { }); realtimeData[ts].status = resolved.status; if (resolved.pendingHold) { - // Hold the confirmed side for display, but keep the real measured latency — zeroing it - // would lose data and dent the latency chart during every grace window. Only the error - // text is dropped so a held row never shows a status-contradicting failure message. - delete realtimeData[ts].error_message; + // Hold the confirmed side for display, but PRESERVE the observed latency and error text — + // no diagnostic info is discarded. Tag the row to record that the status is being held + // during the grace period; on confirmation the backfill appends the confirmation note (#756). + const observedError = realtimeData[ts].error_message; + realtimeData[ts].error_message = observedError + ? `${observedError} | Status held during grace period` + : "Status held during grace period"; } } }