mirror of
https://github.com/rajnandan1/kener.git
synced 2026-06-23 04:10:22 +00:00
fix(confirmation): address PR #757 review comments
- PATCH: confirmation_threshold:null resets to 1 (off); undefined keeps existing (Copilot) - backfill note is per-row severity-aware: 'Down'/'Degraded confirmed after N…' (Copilot) - enforce 1–60 at the data layer via clampConfirmationThreshold on insert/update, covering all app write paths incl. the manage API (coderabbit) - anchor via dedicated getLastObservedStatus query so a long incident/maintenance window can no longer push the anchor out of the lookback and bypass damping (coderabbit) - overlays fetched AFTER execute() and keyed by job ts, making the freeze gate timestamp-safe and catching mid-check overlays (coderabbit + greptile) - use Array.includes over indexOf!==-1 (greptile); refresh pendingHold doc (coderabbit) Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
@@ -65,6 +65,7 @@ class DbImpl {
|
||||
consecutivelyLatencyGreaterThan!: MonitoringRepository["consecutivelyLatencyGreaterThan"];
|
||||
consecutivelyLatencyLessThan!: MonitoringRepository["consecutivelyLatencyLessThan"];
|
||||
getRecentSamplesForConfirmation!: MonitoringRepository["getRecentSamplesForConfirmation"];
|
||||
getLastObservedStatus!: MonitoringRepository["getLastObservedStatus"];
|
||||
backfillConfirmedStatus!: MonitoringRepository["backfillConfirmedStatus"];
|
||||
updateMonitoringData!: MonitoringRepository["updateMonitoringData"];
|
||||
deleteMonitorDataByTag!: MonitoringRepository["deleteMonitorDataByTag"];
|
||||
@@ -427,6 +428,7 @@ class DbImpl {
|
||||
this.consecutivelyLatencyGreaterThan = this.monitoring.consecutivelyLatencyGreaterThan.bind(this.monitoring);
|
||||
this.consecutivelyLatencyLessThan = this.monitoring.consecutivelyLatencyLessThan.bind(this.monitoring);
|
||||
this.getRecentSamplesForConfirmation = this.monitoring.getRecentSamplesForConfirmation.bind(this.monitoring);
|
||||
this.getLastObservedStatus = this.monitoring.getLastObservedStatus.bind(this.monitoring);
|
||||
this.backfillConfirmedStatus = this.monitoring.backfillConfirmedStatus.bind(this.monitoring);
|
||||
this.updateMonitoringData = this.monitoring.updateMonitoringData.bind(this.monitoring);
|
||||
this.deleteMonitorDataByTag = this.monitoring.deleteMonitorDataByTag.bind(this.monitoring);
|
||||
|
||||
@@ -352,19 +352,40 @@ export class MonitoringRepository extends BaseRepository {
|
||||
}
|
||||
|
||||
/**
|
||||
* Backfill a confirmed status flip: set each row's committed status to its observed
|
||||
* raw_status. `message` is applied to error_message (use a generic string for confirmed
|
||||
* outages, null for confirmed recoveries).
|
||||
* The committed status of the most recent real scheduled-check observation before `beforeTs`
|
||||
* — the Confirmation Threshold "anchor" (the side currently shown). Looks past overlays,
|
||||
* MANUAL/DEFAULT, and NO_DATA so a long incident/maintenance window can never hide the anchor
|
||||
* (issue #712). Returns null when there is no prior observation (cold start).
|
||||
*/
|
||||
async getLastObservedStatus(monitor_tag: string, beforeTs: number): Promise<string | null> {
|
||||
const row = await this.knex("monitoring_data")
|
||||
.select("status")
|
||||
.where("monitor_tag", monitor_tag)
|
||||
.where("timestamp", "<", beforeTs)
|
||||
.whereIn("type", OBSERVED_CHECK_TYPES)
|
||||
.whereNot("status", GC.NO_DATA)
|
||||
.orderBy("timestamp", "desc")
|
||||
.limit(1)
|
||||
.first();
|
||||
return row ? (row.status ?? null) : null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Backfill a confirmed status flip: set each row's committed status to its observed raw_status.
|
||||
* `confirmThreshold` is the number of consecutive checks that confirmed the flip — when it is a
|
||||
* number the run resolved to an unhealthy side and a per-row note ("Down"/"Degraded confirmed
|
||||
* after N consecutive checks", matching each row's own severity) is appended to the existing
|
||||
* error text; when it is null the run resolved to UP (recovery) and the error text is cleared.
|
||||
*/
|
||||
async backfillConfirmedStatus(
|
||||
monitor_tag: string,
|
||||
timestamps: number[],
|
||||
message: string | null,
|
||||
confirmThreshold: number | null,
|
||||
): Promise<number> {
|
||||
if (timestamps.length === 0) return 0;
|
||||
|
||||
// Recovery (confirmed UP): rows become the UP side — clear any held error text in one update.
|
||||
if (message === null) {
|
||||
if (confirmThreshold === null) {
|
||||
return await this.knex("monitoring_data")
|
||||
.where("monitor_tag", monitor_tag)
|
||||
.whereIn("timestamp", timestamps)
|
||||
@@ -375,10 +396,10 @@ export class MonitoringRepository extends BaseRepository {
|
||||
});
|
||||
}
|
||||
|
||||
// Confirmed unhealthy: set each row's status from its observed raw_status and APPEND the
|
||||
// confirmation note to the existing error text (preserving the observed failure reason).
|
||||
// Done per-row for portable string concatenation (|| vs CONCAT differ across SQLite/PG/MySQL)
|
||||
// and to stay idempotent if the backfill is ever replayed.
|
||||
// Confirmed unhealthy: set each row's status from its observed raw_status and APPEND a
|
||||
// severity-matched confirmation note to the existing error text (preserving the observed
|
||||
// failure reason). Done per-row for portable string concatenation (|| vs CONCAT differ across
|
||||
// SQLite/PG/MySQL), per-row severity wording, and idempotency if the backfill is replayed.
|
||||
const rows = await this.knex("monitoring_data")
|
||||
.select("timestamp", "error_message", "raw_status")
|
||||
.where("monitor_tag", monitor_tag)
|
||||
@@ -387,14 +408,16 @@ export class MonitoringRepository extends BaseRepository {
|
||||
|
||||
let updated = 0;
|
||||
for (const row of rows) {
|
||||
const severity = row.raw_status === GC.DEGRADED ? "Degraded" : "Down";
|
||||
const note = `${severity} confirmed after ${confirmThreshold} consecutive checks`;
|
||||
const existing: string | null = row.error_message;
|
||||
let nextMessage: string;
|
||||
if (!existing) {
|
||||
nextMessage = message;
|
||||
} else if (existing.indexOf(message) !== -1) {
|
||||
nextMessage = note;
|
||||
} else if (existing.indexOf(note) !== -1) {
|
||||
nextMessage = existing; // already appended — keep idempotent
|
||||
} else {
|
||||
nextMessage = `${existing} | ${message}`;
|
||||
nextMessage = `${existing} | ${note}`;
|
||||
}
|
||||
updated += await this.knex("monitoring_data")
|
||||
.where({ monitor_tag, timestamp: row.timestamp })
|
||||
|
||||
@@ -2,6 +2,17 @@ import type { Knex as KnexType } from "knex";
|
||||
import { BaseRepository, type MonitorFilter, type CountResult } from "./base.js";
|
||||
import type { MonitorRecord, MonitorRecordInsert } from "../../types/db.js";
|
||||
|
||||
/**
|
||||
* Clamp the Confirmation Threshold to its 1–60 invariant at the data layer, so the bound holds
|
||||
* for every app write path (v4 API, manage API, clone, group), not only the v4 API validator.
|
||||
* A non-finite/missing value defaults to 1 (off).
|
||||
*/
|
||||
function clampConfirmationThreshold(value: number | null | undefined): number {
|
||||
const n = Math.round(Number(value));
|
||||
if (!Number.isFinite(n)) return 1;
|
||||
return Math.min(60, Math.max(1, n));
|
||||
}
|
||||
|
||||
/**
|
||||
* Repository for monitors CRUD operations
|
||||
*/
|
||||
@@ -28,7 +39,7 @@ export class MonitorsRepository extends BaseRepository {
|
||||
type_data: data.type_data,
|
||||
day_degraded_minimum_count: data.day_degraded_minimum_count,
|
||||
day_down_minimum_count: data.day_down_minimum_count,
|
||||
confirmation_threshold: data.confirmation_threshold ?? 1,
|
||||
confirmation_threshold: clampConfirmationThreshold(data.confirmation_threshold),
|
||||
include_degraded_in_downtime: data.include_degraded_in_downtime,
|
||||
is_hidden: data.is_hidden || "NO",
|
||||
monitor_settings_json: data.monitor_settings_json,
|
||||
@@ -52,7 +63,7 @@ export class MonitorsRepository extends BaseRepository {
|
||||
type_data: data.type_data,
|
||||
day_degraded_minimum_count: data.day_degraded_minimum_count,
|
||||
day_down_minimum_count: data.day_down_minimum_count,
|
||||
confirmation_threshold: data.confirmation_threshold ?? 1,
|
||||
confirmation_threshold: clampConfirmationThreshold(data.confirmation_threshold),
|
||||
include_degraded_in_downtime: data.include_degraded_in_downtime,
|
||||
is_hidden: data.is_hidden,
|
||||
monitor_settings_json: data.monitor_settings_json,
|
||||
|
||||
@@ -26,8 +26,13 @@ const getQueue = () => {
|
||||
return monitorExecuteQueue;
|
||||
};
|
||||
|
||||
async function manualMaintenance(monitor: MonitorRecordTyped): Promise<{ [timestamp: number]: MonitoringResult }> {
|
||||
let startTs = GetMinuteStartNowTimestampUTC();
|
||||
async function manualMaintenance(
|
||||
monitor: MonitorRecordTyped,
|
||||
ts?: number,
|
||||
): Promise<{ [timestamp: number]: MonitoringResult }> {
|
||||
// Key by the job's `ts` (already a minute-start) so the overlay aligns with the realtime/default
|
||||
// rows and the freeze gate; fall back to "now" only when called without a ts.
|
||||
let startTs = ts !== undefined ? ts : GetMinuteStartNowTimestampUTC();
|
||||
let maintenanceArr = await db.getMaintenancesByMonitorTagRealtime(monitor.tag, startTs);
|
||||
|
||||
let impact = "";
|
||||
@@ -67,8 +72,13 @@ async function manualMaintenance(monitor: MonitorRecordTyped): Promise<{ [timest
|
||||
return manualData;
|
||||
}
|
||||
|
||||
async function manualIncident(monitor: MonitorRecordTyped): Promise<{ [timestamp: number]: MonitoringResult }> {
|
||||
let startTs = GetMinuteStartNowTimestampUTC();
|
||||
async function manualIncident(
|
||||
monitor: MonitorRecordTyped,
|
||||
ts?: number,
|
||||
): Promise<{ [timestamp: number]: MonitoringResult }> {
|
||||
// Key by the job's `ts` (already a minute-start) so the overlay aligns with the realtime/default
|
||||
// rows and the freeze gate; fall back to "now" only when called without a ts.
|
||||
let startTs = ts !== undefined ? ts : GetMinuteStartNowTimestampUTC();
|
||||
let incidentArr = await db.getIncidentsByMonitorTagRealtime(monitor.tag, startTs);
|
||||
|
||||
let impact = "";
|
||||
@@ -114,11 +124,14 @@ const addWorker = () => {
|
||||
const { monitor, ts } = job.data as JobData;
|
||||
const serviceClient = new Service(monitor as MonitorWithType);
|
||||
|
||||
let incidentData: MonitoringResultTS = await manualIncident(monitor);
|
||||
let maintenanceData: MonitoringResultTS = await manualMaintenance(monitor);
|
||||
|
||||
const exeResult = await serviceClient.execute(ts);
|
||||
|
||||
// Fetch overlays AFTER the check runs so a maintenance/incident that starts mid-check is still
|
||||
// detected, and key them by the job's `ts` so the freeze gate (incidentData[ts]) is
|
||||
// timestamp-safe even if the job is delayed or retried (#756).
|
||||
let incidentData: MonitoringResultTS = await manualIncident(monitor, ts);
|
||||
let maintenanceData: MonitoringResultTS = await manualMaintenance(monitor, ts);
|
||||
|
||||
let realtimeData: MonitoringResultTS = {};
|
||||
if (exeResult) {
|
||||
realtimeData[ts] = exeResult;
|
||||
@@ -127,7 +140,7 @@ const addWorker = () => {
|
||||
|
||||
// Confirmation Threshold damping (#712): scheduled checks only.
|
||||
const threshold = Number(monitor.confirmation_threshold ?? 1);
|
||||
const isScheduledCheck = ([GC.REALTIME, GC.TIMEOUT, GC.ERROR] as string[]).indexOf(exeResult.type) !== -1;
|
||||
const isScheduledCheck = ([GC.REALTIME, GC.TIMEOUT, GC.ERROR] as string[]).includes(exeResult.type);
|
||||
// Confirmation Threshold freezes while an incident/maintenance overlay is active for this
|
||||
// minute: the overlay wins display and the count must neither advance nor backfill (#756).
|
||||
const overlayActive = incidentData[ts] !== undefined || maintenanceData[ts] !== undefined;
|
||||
|
||||
@@ -20,21 +20,26 @@ export interface ResolveInput {
|
||||
export interface ResolveResult {
|
||||
/** Effective status to commit for this minute. */
|
||||
status: string;
|
||||
/** True while holding the confirmed side (pending) — the caller must blank latency/error for this row. */
|
||||
/**
|
||||
* True while the displayed status is the held confirmed side (pending confirmation) rather
|
||||
* than the observed side. The caller keeps the observed latency and error text (tagging the
|
||||
* row) — it does not blank them.
|
||||
*/
|
||||
pendingHold: boolean;
|
||||
}
|
||||
|
||||
/** Minimal data access this resolver needs; defaults to the db singleton, injectable for tests. */
|
||||
export interface ConfirmationDeps {
|
||||
getRecentSamplesForConfirmation: typeof db.getRecentSamplesForConfirmation;
|
||||
getLastObservedStatus: typeof db.getLastObservedStatus;
|
||||
backfillConfirmedStatus: typeof db.backfillConfirmedStatus;
|
||||
}
|
||||
|
||||
// Overlay sample types that freeze the count (must match OVERLAY_TYPES in the monitoring repository).
|
||||
const OVERLAY_TYPES: string[] = [GC.INCIDENT, GC.MAINTENANCE];
|
||||
// Extra lookback rows beyond the threshold: headroom for the anchor row and any interleaved
|
||||
// overlay rows. NO_DATA observations are excluded by the query (neutral), so they never
|
||||
// consume slots — the buffer does not need to scale with NO_DATA density.
|
||||
// Extra lookback rows beyond the threshold for the pending-run scan: headroom for interleaved
|
||||
// overlay rows. NO_DATA observations are excluded by the query, and the anchor is fetched
|
||||
// separately, so the buffer need not scale with NO_DATA density or overlay-window length.
|
||||
const LOOKBACK_BUFFER = 10;
|
||||
|
||||
/**
|
||||
@@ -43,10 +48,10 @@ const LOOKBACK_BUFFER = 10;
|
||||
*
|
||||
* IMPORTANT ordering contract: this MUST be called BEFORE the current row at `ts` is
|
||||
* persisted, and only when no incident/maintenance overlay is active for `ts` (the caller
|
||||
* gates that — overlays freeze the count). It anchors on the most recent stored sample
|
||||
* gates that — overlays freeze the count). It anchors on the most recent stored observation
|
||||
* (timestamp < ts).
|
||||
*
|
||||
* Neutral (`NO_DATA`) observations are skipped in the count (neither advance nor reset).
|
||||
* Neutral (`NO_DATA`) observations are excluded from the count (neither advance nor reset).
|
||||
* Overlay rows act as a hard boundary: the pending run never crosses one, so monitoring
|
||||
* resumes with a fresh count after an incident/maintenance window.
|
||||
*/
|
||||
@@ -62,34 +67,24 @@ export async function resolveConfirmedStatus(
|
||||
return { status: rawStatus, pendingHold: false };
|
||||
}
|
||||
|
||||
const recent = await deps.getRecentSamplesForConfirmation(monitor_tag, ts, threshold + LOOKBACK_BUFFER);
|
||||
|
||||
// Confirmed side = the most recent non-neutral scheduled-check observation's committed
|
||||
// status (overlays and NO_DATA are skipped — they don't define the confirmed side).
|
||||
let confirmedSide: Side = null;
|
||||
for (const row of recent) {
|
||||
if (row.type !== null && OVERLAY_TYPES.indexOf(row.type) !== -1) continue; // overlay: skip for anchor
|
||||
const s = sideOf(row.status);
|
||||
if (s !== null) {
|
||||
confirmedSide = s;
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Anchor = the side currently shown = the most recent real observation's committed status.
|
||||
// Fetched with a dedicated query (not from the windowed scan) so a long incident/maintenance
|
||||
// window can never push the anchor out of range and bypass damping.
|
||||
const confirmedStatus = await deps.getLastObservedStatus(monitor_tag, ts);
|
||||
const confirmedSide = sideOf(confirmedStatus);
|
||||
|
||||
// Cold start / no usable anchor / same side: commit immediately.
|
||||
if (confirmedSide === null || observedSide === confirmedSide) {
|
||||
if (confirmedStatus === null || confirmedSide === null || observedSide === confirmedSide) {
|
||||
return { status: rawStatus, pendingHold: false };
|
||||
}
|
||||
|
||||
// The displayed status to hold while pending = the confirmed side's actual stored status.
|
||||
const confirmedStatus = confirmedSideStatus(recent, confirmedSide);
|
||||
|
||||
// Opposite side: count the trailing pending run (incl. current = 1), skipping NO_DATA
|
||||
// (neutral) and stopping at an overlay boundary (freeze) or a confirmed-side observation.
|
||||
// Opposite side: count the trailing pending run (incl. current = 1), stopping at an overlay
|
||||
// boundary (freeze) or at a confirmed-side observation. NO_DATA rows are excluded by the query.
|
||||
const recent = await deps.getRecentSamplesForConfirmation(monitor_tag, ts, threshold + LOOKBACK_BUFFER);
|
||||
let pendingRun = 1;
|
||||
const pendingTimestamps: number[] = [];
|
||||
for (const row of recent) {
|
||||
if (row.type !== null && OVERLAY_TYPES.indexOf(row.type) !== -1) break; // freeze boundary
|
||||
if (row.type !== null && OVERLAY_TYPES.includes(row.type)) break; // freeze boundary
|
||||
const rawSide = sideOf(row.raw_status);
|
||||
if (rawSide === null) continue; // NO_DATA: neutral (excluded by the query; this is a defensive guard)
|
||||
if (rawSide === observedSide && sideOf(row.status) === confirmedSide) {
|
||||
@@ -101,25 +96,12 @@ export async function resolveConfirmedStatus(
|
||||
}
|
||||
|
||||
if (pendingRun >= threshold) {
|
||||
const message =
|
||||
observedSide === "DOWN" ? `Down confirmed after ${threshold} consecutive checks` : null;
|
||||
await deps.backfillConfirmedStatus(monitor_tag, pendingTimestamps, message);
|
||||
// Unhealthy confirm passes the count so the backfill can write a per-row severity note
|
||||
// ("Down"/"Degraded confirmed after N…"); recovery passes null (clears the held error text).
|
||||
await deps.backfillConfirmedStatus(monitor_tag, pendingTimestamps, observedSide === "DOWN" ? threshold : null);
|
||||
return { status: rawStatus, pendingHold: false };
|
||||
}
|
||||
|
||||
// Still pending: hold the confirmed side, display clean.
|
||||
// Still pending: hold the confirmed side.
|
||||
return { status: confirmedStatus, pendingHold: true };
|
||||
}
|
||||
|
||||
/** The displayed status of the confirmed side — the most recent non-neutral, non-overlay row on it. */
|
||||
function confirmedSideStatus(
|
||||
recent: Array<{ status: string | null; raw_status: string | null; type: string | null }>,
|
||||
confirmedSide: Side,
|
||||
): string {
|
||||
for (const row of recent) {
|
||||
if (row.type !== null && OVERLAY_TYPES.indexOf(row.type) !== -1) continue;
|
||||
if (sideOf(row.status) === confirmedSide && row.status) return row.status;
|
||||
}
|
||||
// Defensive fallback (unreachable when an anchor was found above); side is correct, severity may coarsen.
|
||||
return confirmedSide === "UP" ? GC.UP : GC.DOWN;
|
||||
}
|
||||
|
||||
@@ -80,7 +80,10 @@ export const PATCH: RequestHandler = async ({ locals, request }) => {
|
||||
|
||||
updateData.is_hidden = body.is_hidden !== undefined ? body.is_hidden : existingMonitor.is_hidden;
|
||||
|
||||
if (body.confirmation_threshold !== undefined && body.confirmation_threshold !== null) {
|
||||
if (body.confirmation_threshold === null) {
|
||||
// Explicit null resets the grace period to the default (1 = off); undefined keeps the existing value.
|
||||
updateData.confirmation_threshold = 1;
|
||||
} else if (body.confirmation_threshold !== undefined) {
|
||||
const ct = Number(body.confirmation_threshold);
|
||||
if (!Number.isInteger(ct) || ct < 1 || ct > 60) {
|
||||
const errorResponse: BadRequestResponse = {
|
||||
|
||||
Reference in New Issue
Block a user