added group

This commit is contained in:
Raj Nandan Sharma
2026-02-09 17:35:18 +05:30
parent 228808a267
commit eed71d3633
12 changed files with 697 additions and 160 deletions
@@ -81,15 +81,12 @@ const insertStatusQueue = new Queue({
autostart: true, // Automatically start the queue (optional)
});
export const InsertMonitoringData = async (data: MonitoringDataInput): Promise<number[]> => {
export const InsertMonitoringData = async (data: MonitoringDataInput): Promise<MonitoringData | null> => {
//do validation if present all fields below
if (!data.monitor_tag || !data.timestamp || !data.status || !data.type) {
throw new Error("Invalid data");
}
// insertStatusQueue.push(async (cb) => {
// await ProcessGroupUpdate(data as GroupUpdateData);
// if (cb) cb();
// });
return await db.insertMonitoringData({
monitor_tag: data.monitor_tag,
timestamp: data.timestamp,
@@ -324,7 +321,7 @@ export const GetLastHeartbeat = async (monitor_tag: string): Promise<MonitoringD
return await db.getLastHeartbeat(monitor_tag);
};
export const RegisterHeartbeat = async (tag: string, secret: string): Promise<number[] | null> => {
export const RegisterHeartbeat = async (tag: string, secret: string): Promise<MonitoringData | null> => {
let monitor = await db.getMonitorByTag(tag);
if (!monitor) {
return null;
+8 -1
View File
@@ -9,6 +9,7 @@ import type { MonitorRecordTyped } from "./types/db.js";
import type { MonitoringResult } from "./types/monitor.js";
import { MONITOR_TYPES, type MonitorType } from "../types/monitor.js";
import type { JobsOptions } from "bullmq";
dotenv.config();
const Minuter = async (monitor: MonitorRecordTyped) => {
@@ -16,7 +17,13 @@ const Minuter = async (monitor: MonitorRecordTyped) => {
throw new Error(`Invalid monitor type: ${monitor.monitor_type}. Valid types are: ${MONITOR_TYPES.join(", ")}`);
}
const startOfMinute = GetMinuteStartNowTimestampUTC();
await monitorExecuteQueue.push(monitor, startOfMinute);
let options: JobsOptions | undefined = undefined;
if (monitor.monitor_type === "GROUP") {
options = {
delay: parseInt(String(monitor.type_data?.executionDelay)) || 1000, // default 1 second delay for group monitors
};
}
await monitorExecuteQueue.push(monitor, startOfMinute, options);
};
export { Minuter };
+14 -11
View File
@@ -13,12 +13,22 @@ import type {
* Repository for monitoring data operations
*/
export class MonitoringRepository extends BaseRepository {
async insertMonitoringData(data: MonitoringDataInsert): Promise<number[]> {
async insertMonitoringData(data: MonitoringDataInsert): Promise<MonitoringData | null> {
const { monitor_tag, timestamp, status, latency, type, error_message } = data;
return await this.knex("monitoring_data")
// Perform insert/update - works across PostgreSQL, MySQL, and SQLite
await this.knex("monitoring_data")
.insert({ monitor_tag, timestamp, status, latency, type, error_message })
.onConflict(["monitor_tag", "timestamp"])
.merge({ status, latency, type, error_message });
// Query and return the inserted/updated record (works consistently across all databases)
const record = await this.knex("monitoring_data")
.where("monitor_tag", monitor_tag)
.where("timestamp", timestamp)
.first();
return record as MonitoringData | null;
}
async getMonitoringData(monitor_tag: string, start: number, end: number): Promise<MonitoringData[]> {
@@ -296,11 +306,7 @@ export class MonitoringRepository extends BaseRepository {
return result.is_affected === 1;
}
async consecutivelyLatencyLessThan(
monitor_tag: string,
latencyThreshold: number,
lastX: number,
): Promise<boolean> {
async consecutivelyLatencyLessThan(monitor_tag: string, latencyThreshold: number, lastX: number): Promise<boolean> {
const result = await this.knex
.with("last_records", (qb: KnexType.QueryBuilder) => {
qb.select("*")
@@ -462,10 +468,7 @@ export class MonitoringRepository extends BaseRepository {
* @param lastX - Number of most recent rows to include
* @returns Object with ts=0, counts of each status, and average latency
*/
async getStatusCountsForLastN(
monitorTag: string | string[],
lastX: number,
): Promise<TimestampStatusCount> {
async getStatusCountsForLastN(monitorTag: string | string[], lastX: number): Promise<TimestampStatusCount> {
const tags = Array.isArray(monitorTag) ? monitorTag : [monitorTag];
const result = await this.knex
+2 -14
View File
@@ -16,12 +16,6 @@ const jobNamePrefix = "monitorExecuteJob";
interface JobData {
monitor: MonitorRecordTyped;
ts: number;
executeOptions?: ExecuteOptions;
}
interface ExecuteOptions {
countTimeoutRetries?: number;
maxTimeoutRetries?: number;
}
const getQueue = () => {
@@ -116,7 +110,7 @@ const addWorker = () => {
if (worker) return worker;
worker = q.createWorker(getQueue(), async (job: Job): Promise<MonitoringResultTS> => {
const { monitor, ts, executeOptions } = job.data as JobData;
const { monitor, ts } = job.data as JobData;
const serviceClient = new Service(monitor as MonitorWithType);
const exeResult = await serviceClient.execute(ts);
@@ -191,12 +185,7 @@ const addWorker = () => {
return worker;
};
export const push = async (
monitor: MonitorRecordTyped,
ts: number,
executeOptions?: ExecuteOptions,
options?: JobsOptions,
) => {
export const push = async (monitor: MonitorRecordTyped, ts: number, options?: JobsOptions) => {
const deDupId = `${monitor.tag}-${ts}`;
if (!options) {
options = {};
@@ -213,7 +202,6 @@ export const push = async (
{
monitor,
ts,
executeOptions,
},
options,
);
+14 -11
View File
@@ -4,6 +4,7 @@ import q from "./q.js";
import { InsertMonitoringData } from "../controllers/controller.js";
import { SetLastMonitoringValue } from "../cache/setGet.js";
import alertingQueue from "./alertingQueue.js";
import type { MonitoringData } from "../types/db.js";
let monitorResponseQueue: Queue | null = null;
let worker: Worker | null = null;
const queueName = "monitorResponseQueue";
@@ -28,7 +29,7 @@ const getQueue = () => {
const addWorker = () => {
if (worker) return worker;
worker = q.createWorker(getQueue(), async (job: Job): Promise<number[]> => {
worker = q.createWorker(getQueue(), async (job: Job): Promise<MonitoringData | null> => {
const { monitorTag, ts, status, latency, type, error_message } = job.data as JobData;
const dbRes = await InsertMonitoringData({
@@ -40,18 +41,20 @@ const addWorker = () => {
error_message: error_message,
});
if (dbRes.length > 0) {
await SetLastMonitoringValue(monitorTag, {
monitor_tag: monitorTag,
timestamp: ts,
status: status,
latency: latency,
type: type,
});
alertingQueue.push(monitorTag, ts, status);
if (!dbRes) {
throw new Error("Failed to insert monitoring data");
}
await SetLastMonitoringValue(monitorTag, {
monitor_tag: monitorTag,
timestamp: ts,
status: status,
latency: latency,
type: type,
});
alertingQueue.push(monitorTag, ts, status);
return dbRes;
});
+59 -52
View File
@@ -1,84 +1,91 @@
import axios from "axios";
import { GetRequiredSecrets, ReplaceAllOccurrences, Wait, GetMinuteStartNowTimestampUTC } from "../tool.js";
import { GetMinuteStartNowTimestampUTC } from "../tool.js";
import GC from "../../global-constants.js";
import db from "../db/db.js";
import type { GroupMonitor, MonitoringResult } from "../types/monitor.js";
import { GetLastMonitoringValue } from "../cache/setGet.js";
import { GetLatestMonitoringData } from "../controllers/controller.js";
async function waitForDataAndReturn(tag: string): Promise<MonitoringResult> {
let res = await db.getLatestMonitoringData(tag);
if (!!res) {
return {
status: res.status || GC.DOWN,
latency: res.latency || 0,
type: GC.REALTIME,
};
}
return {
status: GC.DOWN,
latency: 0,
type: GC.REALTIME,
};
}
/**
* Numeric score assigned to each monitor status, keyed by the status string.
* UP = 0, DEGRADED = 1, DOWN = 2, MAINTENANCE = 3
* GroupCall.execute multiplies these scores by the member weights; a status
* that is missing from this table is scored as 0 there.
*/
const STATUS_SCORE: Record<string, number> = {
[GC.UP]: 0,
[GC.DEGRADED]: 1,
[GC.DOWN]: 2,
[GC.MAINTENANCE]: 3,
};
//implement a function that takes array of statuses
//UP, DEGRADED, MAINTENANCE, DOWN
// takes a parameter called statusCalculationType which can be "ANY", "ALL", "MAJORITY"
//MAINTENANCE > DOWN > DEGRADED > UP
/**
 * Map a weighted score back to a status string.
 *
 * Thresholds: >= 3 is MAINTENANCE, >= 2 is DOWN, >= 1 is DEGRADED, anything
 * lower is UP.
 *
 * The score is a floating-point weighted average, so a value that is
 * mathematically on a threshold can arrive a few ULPs below it (for example
 * 1.9999999999999998 when every member is DOWN). A tiny tolerance is added
 * before comparing so such scores are not reported one level too low.
 *
 * @param score - Weighted status score, normally in the range 0..3.
 * @returns The status constant matching the score.
 */
function scoreToStatus(score: number): string {
  // Far larger than accumulated rounding error, far smaller than any
  // meaningful difference between weighted scores.
  const tolerance = 1e-9;
  const adjusted = score + tolerance;

  if (adjusted >= 3) return GC.MAINTENANCE;
  if (adjusted >= 2) return GC.DOWN;
  if (adjusted >= 1) return GC.DEGRADED;
  return GC.UP;
}
class GroupCall {
monitor: GroupMonitor;
constructor(monitor: GroupMonitor, timestamp?: number) {
constructor(monitor: GroupMonitor, _timestamp?: number) {
this.monitor = monitor;
}
async execute(startOfMinute?: number): Promise<MonitoringResult> {
if (!!!startOfMinute) {
if (!startOfMinute) {
startOfMinute = GetMinuteStartNowTimestampUTC();
}
const dataPoints: Array<{ status: string; latency: number }> = [];
const monitorTagArr = this.monitor.type_data.monitors.map((m) => m.tag);
for (const tag of monitorTagArr) {
let lastObj = (await GetLastMonitoringValue(tag, () => GetLatestMonitoringData(tag))) as {
const members = this.monitor.type_data.monitors;
// Collect each member's latest status + latency
const statusMap = new Map<string, { status: string; latency: number }>();
for (const member of members) {
const lastObj = (await GetLastMonitoringValue(member.tag, () => GetLatestMonitoringData(member.tag))) as {
status: string;
latency: number;
} | null;
if (!!lastObj) {
dataPoints.push(lastObj);
if (lastObj) {
statusMap.set(member.tag, lastObj);
}
}
// --- Latency calculation ---
const latencyValues = [...statusMap.values()].map((v) => v.latency);
const latencyCalculationType = this.monitor.type_data.latencyCalculation || "AVG";
let latency = 0;
if (latencyCalculationType === "AVG") {
latency = dataPoints.reduce((acc, val) => acc + val.latency, 0) / dataPoints.length;
} else if (latencyCalculationType === "MAX") {
latency = Math.max(...dataPoints.map((d) => d.latency));
} else if (latencyCalculationType === "MIN") {
latency = Math.min(...dataPoints.map((d) => d.latency));
if (latencyValues.length > 0) {
if (latencyCalculationType === "AVG") {
latency = latencyValues.reduce((a, b) => a + b, 0) / latencyValues.length;
} else if (latencyCalculationType === "MAX") {
latency = Math.max(...latencyValues);
} else if (latencyCalculationType === "MIN") {
latency = Math.min(...latencyValues);
}
}
//status calculation: if any monitor is MAINTENANCE, then group monitor is MAINTENANCE.
// else if any monitor is DOWN, then group monitor is DOWN.
// else if any monitor is DEGRADED, then group monitor is DEGRADED.
// else group monitor is UP.
// --- Status calculation via weighted scores ---
// Each status has a numeric score: UP=0, DEGRADED=1, DOWN=2, MAINTENANCE=3
// Weighted sum = Σ(weight × score), weights should sum to 1.
// Result is mapped back: <1 → UP, ≥1 → DEGRADED, ≥2 → DOWN, ≥3 → MAINTENANCE
let weightedScore = 0;
let totalWeight = 0;
let status: string = GC.UP;
if (dataPoints.some((d) => d.status === GC.MAINTENANCE)) {
status = GC.MAINTENANCE;
} else if (dataPoints.some((d) => d.status === GC.DOWN)) {
status = GC.DOWN;
} else if (dataPoints.some((d) => d.status === GC.DEGRADED)) {
status = GC.DEGRADED;
for (const member of members) {
const data = statusMap.get(member.tag);
if (!data) continue;
const score = STATUS_SCORE[data.status] ?? 0;
weightedScore += member.weight * score;
totalWeight += member.weight;
}
return {
status,
latency,
type: GC.REALTIME,
};
// Normalize if weights don't sum to 1 (safety net)
if (totalWeight > 0 && totalWeight !== 1) {
weightedScore = weightedScore / totalWeight;
}
const status = scoreToStatus(weightedScore);
return { status, latency, type: GC.REALTIME };
}
}
+8 -2
View File
@@ -73,9 +73,15 @@ export interface HeartbeatMonitorTypeData {
degradedRemainingMinutes: number;
}
/** A single member of a group monitor: the member monitor's tag plus its weight. */
export interface GroupMonitorMember {
/** Tag identifying the member monitor. */
tag: string;
/** Weight for this monitor. All weights in a group must sum to 1. */
weight: number;
}
export interface GroupMonitorTypeData extends Record<string, unknown> {
monitors: Array<{ tag: string }>;
timeout: number;
monitors: GroupMonitorMember[];
executionDelay: number;
latencyCalculation: "AVG" | "MAX" | "MIN";
}
+4
View File
@@ -106,6 +106,10 @@
{
"title": "GameDig Monitors",
"slug": "monitors/gamedig"
},
{
"title": "Group Monitors",
"slug": "monitors/group"
}
]
},
@@ -0,0 +1,434 @@
---
title: Group Monitor
description: Aggregate multiple monitors into a single status view using weighted scoring
---
Group monitors allow you to combine multiple monitors into a unified status view. Instead of checking individual services, a group monitor aggregates the status of its member monitors using a weighted scoring system. This is ideal for representing complex systems where different components have varying levels of importance to the overall service health.
## How Group Monitoring Works {#how-group-monitoring-works}
Kener's Group monitoring follows this workflow:
1. **Collect Member Status**: Group monitor retrieves the latest status of each configured member monitor.
2. **Calculate Weighted Score**: Each status (UP, DEGRADED, DOWN, MAINTENANCE) has a numeric score. The group calculates a weighted average based on member weights.
3. **Map to Status**: The weighted score is mapped back to a group status (UP, DEGRADED, DOWN, or MAINTENANCE).
4. **Aggregate Latency**: Member latencies are combined using the selected calculation method (AVG, MAX, or MIN).
### Status Scoring System {#status-scoring-system}
Each monitor status has a numeric score used in weighted calculations:
| Status | Score | Meaning |
| :-------------- | :---- | :------------------------------- |
| **UP** | `0` | Service operating normally |
| **DEGRADED** | `1` | Service experiencing issues |
| **DOWN** | `2` | Service unavailable |
| **MAINTENANCE** | `3` | Service in scheduled maintenance |
### Weighted Score Calculation {#weighted-score-calculation}
The group status is determined by:
```
Weighted Score = Σ(monitor_weight × monitor_status_score)
where weights must sum to 1.0
```
The weighted score is then mapped back to a status:
| Score Range | Group Status |
| :---------------- | :-------------- |
| `< 1.0` | **UP** |
| `≥ 1.0 and < 2.0` | **DEGRADED** |
| `≥ 2.0 and < 3.0` | **DOWN** |
| `≥ 3.0` | **MAINTENANCE** |
### Example Calculation {#example-calculation}
Given three monitors with weights:
- **API** (weight: 0.5): DOWN (score: 2)
- **Database** (weight: 0.3): UP (score: 0)
- **Cache** (weight: 0.2): UP (score: 0)
```
Weighted Score = (0.5 × 2) + (0.3 × 0) + (0.2 × 0) = 1.0
Result: DEGRADED
```
The group is marked **DEGRADED** because the high-weight API monitor is down, even though two other services are up.
## Configuration Options {#configuration-options}
| Field | Type | Description | Default |
| :---------------------- | :------- | :---------------------------------------------------------------------------------------------------------------- | :------ |
| **Monitors** | `array` | List of member monitors with their tags and weights. Weights must sum to 1. | `[]` |
| **Execution Delay** | `number` | Delay in milliseconds before the group monitor executes, allowing member monitors to complete their checks first. | `1000` |
| **Latency Calculation** | `string` | How to calculate group latency from members: `AVG` (average), `MAX`, or `MIN`. | `AVG` |
### Execution Delay {#execution-delay}
The **executionDelay** parameter is critical for group monitors:
- **Purpose**: Delays the group monitor execution to ensure all member monitors have completed their checks for the current minute.
- **Why It Matters**: Group monitors aggregate status from member monitors. If the group runs too early, it may read stale data from the previous minute, resulting in incorrect status calculations.
- **Timing**: Member monitors run at the start of each minute (cron schedule). Setting executionDelay to 1000ms (1 second) or higher ensures member data is fresh.
- **Recommendation**: Set executionDelay higher than the slowest member monitor's expected execution time. For example, if your API monitor has a 500ms timeout, use 1000ms or more for the group.
> [!WARNING]
> Setting executionDelay too low (e.g., 100ms) can cause the group to aggregate stale data, making the group status lag behind actual member statuses.
### Monitor Weights {#monitor-weights}
Each member monitor in a group has a **weight** between `0` and `1`. The weights determine how much each monitor influences the overall group status.
- **Higher weight** = Greater impact on group status
- **Lower weight** = Lesser impact on group status
- **All weights must sum to 1.0**
> [!NOTE]
> Groups cannot contain other group monitors. Only non-group, active monitors can be added to a group.
## Weight Assignment Strategies {#weight-assignment-strategies}
### Equal Weighting {#equal-weighting}
All monitors have equal importance:
```json
{
"monitors": [
{ "tag": "api", "weight": 0.333 },
{ "tag": "database", "weight": 0.333 },
{ "tag": "cache", "weight": 0.334 }
]
}
```
Each monitor contributes equally to the group status.
### Critical Component Weighting {#critical-component-weighting}
Primary service has higher weight:
```json
{
"monitors": [
{ "tag": "api", "weight": 0.6 },
{ "tag": "database", "weight": 0.25 },
{ "tag": "cache", "weight": 0.15 }
]
}
```
The API has more influence on overall status than supporting services.
### Tiered Importance {#tiered-importance}
```json
{
"monitors": [
{ "tag": "core-api", "weight": 0.5 },
{ "tag": "database", "weight": 0.3 },
{ "tag": "cdn", "weight": 0.15 },
{ "tag": "analytics", "weight": 0.05 }
]
}
```
Core services have higher weights, while auxiliary services have minimal impact.
## Latency Calculation Methods {#latency-calculation-methods}
Group monitors can aggregate member latencies using different methods:
| Method | Description | Use Case |
| :------ | :-------------------------------------- | :------------------------------ |
| **AVG** | Average latency of all member monitors | General performance overview |
| **MAX** | Highest latency among members (slowest) | Worst-case performance tracking |
| **MIN** | Lowest latency among members (fastest) | Best-case performance tracking |
## Examples {#examples}
### 1. Basic Web Application Stack {#basic-web-application-stack}
Monitor a typical web app with equal weights:
```json
{
"tag": "webapp-stack",
"name": "Web Application",
"type": "GROUP",
"cron": "* * * * *",
"type_data": {
"monitors": [
{ "tag": "frontend", "weight": 0.333 },
{ "tag": "api-backend", "weight": 0.333 },
{ "tag": "database", "weight": 0.334 }
],
"executionDelay": 1000,
"latencyCalculation": "AVG"
}
}
```
**Behavior:**
- All three components equally affect the group status
- If any one component goes DOWN (score 2), weighted score is ~0.667 → Group remains **UP** but approaching **DEGRADED**
- If two components go DOWN, weighted score is ~1.333 → Group becomes **DEGRADED**
### 2. Critical Service with Dependencies {#critical-service-with-dependencies}
Primary API is critical, supporting services are less impactful:
```json
{
"tag": "payment-service",
"name": "Payment Processing",
"type": "GROUP",
"cron": "* * * * *",
"type_data": {
"monitors": [
{ "tag": "payment-api", "weight": 0.7 },
{ "tag": "fraud-check", "weight": 0.2 },
{ "tag": "notification", "weight": 0.1 }
],
"executionDelay": 2000,
"latencyCalculation": "MAX"
}
}
```
**Behavior:**
- If payment-api goes DOWN (score 2): `0.7 × 2 = 1.4` → Group is **DEGRADED**
- If only notification goes DOWN: `0.1 × 2 = 0.2` → Group remains **UP**
- Uses MAX latency to track the slowest component
### 3. E-commerce Platform {#e-commerce-platform}
Multi-tier application with varying weights:
```json
{
"tag": "ecommerce-platform",
"name": "E-Commerce Platform",
"type": "GROUP",
"cron": "*/2 * * * *",
"type_data": {
"monitors": [
{ "tag": "storefront", "weight": 0.4 },
{ "tag": "product-api", "weight": 0.25 },
{ "tag": "checkout", "weight": 0.2 },
{ "tag": "search", "weight": 0.1 },
{ "tag": "recommendations", "weight": 0.05 }
],
"executionDelay": 1500,
"latencyCalculation": "AVG"
}
}
```
**Behavior:**
- Storefront and core APIs have high weight
- Search being down only contributes `0.1 × 2 = 0.2` to the score
- Recommendations being down adds just `0.05 × 2 = 0.1` (minimal impact)
### 4. Gradual Degradation Example {#gradual-degradation-example}
Understanding how multiple failures affect group status:
With equal weights (0.25 each for 4 monitors):
| Failed Monitors | Calculation | Score | Status |
| :-------------- | :--------------------------- | :----- | :------- |
| None | `0` | `0.0` | UP |
| 1 DOWN | `0.25 × 2 = 0.5` | `0.5` | UP |
| 2 DOWN | `0.5 × 2 = 1.0` | `1.0` | DEGRADED |
| 3 DOWN | `0.75 × 2 = 1.5` | `1.5` | DEGRADED |
| 4 DOWN | `1.0 × 2 = 2.0` | `2.0` | DOWN |
| 2 DEGRADED | `0.5 × 1 = 0.5` | `0.5` | UP |
| 1 DOWN + 1 DEG | `0.25 × 2 + 0.25 × 1 = 0.75` | `0.75` | UP |
## Best Practices {#best-practices}
### Weight Assignment {#best-practices-weight-assignment}
1. **Identify Critical Components**: Services that can bring down the entire system should have higher weights (0.5-0.7).
2. **Sum to 1.0**: Always ensure weights add up to exactly 1.0.
3. **Use the "Distribute Equally" Button**: The UI provides a button to auto-calculate equal weights.
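4. **Round to 2-3 Decimals**: Avoid overly precise weights like `0.142857` — round to `0.143`, then adjust one weight so the total is still 1.0 (e.g., six monitors at `0.143` and one at `0.142`).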
### Group Organization {#best-practices-group-organization}
1. **Logical Grouping**: Group monitors that actually belong together (e.g., all microservices for one product).
2. **Avoid Deep Nesting**: Groups cannot contain other groups — keep hierarchies flat.
3. **Limit Group Size**: 3-7 monitors per group is ideal. Too many makes weights hard to reason about.
4. **Name Clearly**: Use descriptive names like "Payment Stack" not "Group 1".
### Monitoring Strategy {#best-practices-monitoring-strategy}
1. **Check Frequency**: Group monitors should run at least as often as their slowest member.
2. **Execution Delay**: Set executionDelay higher than your slowest member monitor's timeout + processing time. For example:
- If API monitor has 500ms timeout: use 1000ms+ executionDelay
- If SQL monitor has 2000ms timeout: use 2500ms+ executionDelay
- Default 1000ms works for most monitors with timeouts under 500ms
3. **Latency Method**:
- Use **AVG** for general overview
- Use **MAX** when you care about the slowest component
- Use **MIN** rarely (only when fastest matters)
### Weight Scenarios {#best-practices-weight-scenarios}
**Scenario: Primary + Backup**
```json
{ "primary": 0.8, "backup": 0.2 }
```
Backup has low weight since it only matters when primary fails.
**Scenario: Load-Balanced Services**
```json
{ "server1": 0.5, "server2": 0.5 }
```
Both servers equally important for availability.
**Scenario: Microservices with Shared Database**
```json
{ "api1": 0.25, "api2": 0.25, "api3": 0.25, "database": 0.25 }
```
All components equally critical.
## Common Patterns {#common-patterns}
### Pattern 1: Frontend + Backend + Database {#pattern-frontend-backend-database}
```
Frontend (0.3) ──► Backend (0.5) ──► Database (0.2)
```
The backend has the highest weight because it contains the core business logic. The database is weighted lower because its issues often manifest in the backend first.
### Pattern 2: Microservices Architecture {#pattern-microservices}
```
Gateway (0.4)
├─► Service A (0.2)
├─► Service B (0.2)
└─► Service C (0.2)
```
Gateway is critical as it's the entry point. Services have equal weight.
### Pattern 3: CDN + Origin {#pattern-cdn-origin}
```
CDN (0.3) ──► Origin (0.7)
```
Origin is more critical since CDN failures can be worked around, but origin failures are total.
## Troubleshooting {#troubleshooting}
### Group Stays UP When Members Are DOWN {#troubleshooting-up-when-down}
**Problem**: Group shows UP even though some members are DOWN.
**Solution**: Check total weighted score. With low weights, a single DOWN monitor may not push the score to 1.0:
- `0.2 × 2 = 0.4` (still < 1.0, so UP)
- Increase the weights of critical monitors so that their failure alone pushes the score to 1.0 or above (the score thresholds themselves are fixed).
### Group Always Shows DEGRADED {#troubleshooting-always-degraded}
**Problem**: Group constantly shows DEGRADED status.
**Solution**:
- Check if member monitors frequently show DEGRADED status
- Verify weight distribution — heavily weighted monitors have outsized impact
- Review member monitor configurations for sensitivity
### Weights Don't Sum to 1.0 {#troubleshooting-weights-sum}
**Problem**: Cannot save group configuration.
**Solution**: Use the "Distribute Equally" button or manually adjust weights:
```
Total: 0.99 → Add 0.01 to one monitor
Total: 1.01 → Subtract 0.01 from one monitor
```
### Missing Member Data {#troubleshooting-missing-data}
**Problem**: Group uses partial data or shows NO_DATA.
**Solution**:
- Ensure member monitors are running on schedule
- Check that member monitors are active (not paused)
- **Increase executionDelay** if member monitors take longer than expected to complete
- Member monitors must complete before group aggregates
- If a member has a 1000ms timeout, set group executionDelay to at least 1500ms
## Advanced Use Cases {#advanced-use-cases}
### Blue-Green Deployment Monitoring {#blue-green-deployment}
Monitor both environments with dynamic weighting:
**During Normal Operation:**
```json
{ "blue": 1.0, "green": 0.0 }
```
**During Deployment (50/50 traffic):**
```json
{ "blue": 0.5, "green": 0.5 }
```
**After Cutover:**
```json
{ "blue": 0.0, "green": 1.0 }
```
### Multi-Region Service {#multi-region-service}
Weight by traffic distribution:
```json
{
"us-east": 0.4,
"us-west": 0.3,
"eu-west": 0.2,
"ap-south": 0.1
}
```
Reflects actual user impact if a region goes down.
### SLA-Based Weighting {#sla-based-weighting}
Weight by contractual importance:
```json
{
"enterprise-api": 0.6,
"standard-api": 0.3,
"free-api": 0.1
}
```
Prioritizes monitoring of revenue-generating tiers.
+1 -1
View File
@@ -70,7 +70,7 @@
<link rel="stylesheet" href={data.font.cssSrc} />
{/if}
{@html `
<style id="dynamic-styles">
<style>
.kener-manage {
--up: ${data.siteStatusColors.UP};
--degraded: ${data.siteStatusColors.DEGRADED};
@@ -49,7 +49,7 @@
function createDefaultGroupTypeData(): GroupMonitorTypeData {
return {
monitors: [],
timeout: GROUP_MIN_TIMEOUT_MS,
executionDelay: GROUP_MIN_TIMEOUT_MS,
latencyCalculation: "AVG"
};
}
@@ -57,29 +57,33 @@
function normalizeGroupTypeData(raw: unknown): GroupMonitorTypeData {
const candidate = (raw ?? {}) as Record<string, unknown>;
const monitors = Array.isArray(candidate.monitors)
? (candidate.monitors as Array<{ tag?: string }>).reduce<Array<{ tag: string }>>((acc, monitor) => {
if (monitor && typeof monitor.tag === "string" && monitor.tag.trim().length > 0) {
acc.push({ tag: monitor.tag });
}
return acc;
}, [])
? (candidate.monitors as Array<{ tag?: string; weight?: number }>).reduce<Array<{ tag: string; weight: number }>>(
(acc, monitor) => {
if (monitor && typeof monitor.tag === "string" && monitor.tag.trim().length > 0) {
const weight = typeof monitor.weight === "number" && Number.isFinite(monitor.weight) ? monitor.weight : 0;
acc.push({ tag: monitor.tag, weight });
}
return acc;
},
[]
)
: [];
const timeout =
typeof candidate.timeout === "number" &&
Number.isFinite(candidate.timeout) &&
candidate.timeout >= GROUP_MIN_TIMEOUT_MS
? candidate.timeout
: GROUP_MIN_TIMEOUT_MS;
const latencyCalculation = isGroupLatencyCalculation(candidate.latencyCalculation)
? candidate.latencyCalculation
: "AVG";
const executionDelay =
typeof candidate.executionDelay === "number" &&
Number.isFinite(candidate.executionDelay) &&
candidate.executionDelay >= GROUP_MIN_TIMEOUT_MS
? candidate.executionDelay
: GROUP_MIN_TIMEOUT_MS;
return {
monitors,
timeout,
latencyCalculation
latencyCalculation,
executionDelay
};
}
@@ -155,8 +159,11 @@
case "GROUP": {
const data = typeData as Partial<GroupMonitorTypeData>;
if (!data.monitors || !Array.isArray(data.monitors) || data.monitors.length < GROUP_MIN_MONITORS) return false;
if (typeof data.timeout !== "number" || data.timeout < GROUP_MIN_TIMEOUT_MS) return false;
if (typeof data.executionDelay !== "number" || data.executionDelay < GROUP_MIN_TIMEOUT_MS) return false;
if (!isGroupLatencyCalculation((data as GroupMonitorTypeData).latencyCalculation)) return false;
// Weights must sum to 1
const totalWeight = data.monitors.reduce((sum, m) => sum + (m.weight ?? 0), 0);
if (Math.abs(totalWeight - 1) >= 0.01) return false;
return true;
}
@@ -308,7 +315,7 @@
{/if}
</div>
</Card.Content>
<Card.Footer class="flex justify-between gap-2">
<Card.Footer class=" flex justify-between gap-2">
<Dialog.Root
onOpenChange={(e) => {
if (e) testMonitor();
@@ -334,7 +341,7 @@
{/if}
</Dialog.Title>
</Dialog.Header>
<div class="flex flex-col justify-center gap-2">
<div class="kener-manage flex flex-col justify-center gap-2">
{#if testingMonitor}
<div class="flex flex-col items-center gap-2 py-8">
<Loader class="size-8 animate-spin" />
@@ -4,8 +4,10 @@
import * as Select from "$lib/components/ui/select/index.js";
import { Switch } from "$lib/components/ui/switch/index.js";
import type { MonitorRecord } from "$lib/server/types/db.js";
import type { GroupMonitorMember } from "$lib/server/types/monitor.js";
import clientResolver from "$lib/client/resolver.js";
import { resolve } from "$app/paths";
const LATENCY_CALCULATION_OPTIONS = ["AVG", "MAX", "MIN"] as const;
type LatencyCalculationOption = (typeof LATENCY_CALCULATION_OPTIONS)[number];
const LATENCY_CALCULATION_LABELS: Record<LatencyCalculationOption, string> = {
@@ -15,8 +17,8 @@
};
type GroupMonitorFormData = {
monitors: Array<{ tag: string }>;
delay: number;
monitors: GroupMonitorMember[];
executionDelay: number;
latencyCalculation: LatencyCalculationOption;
};
@@ -25,43 +27,74 @@
availableMonitors = [],
tag = ""
}: { data: Record<string, unknown>; availableMonitors: MonitorRecord[]; tag: string } = $props();
console.log(">>>>>>---- monitor-group:27 ", data);
const formData = data as GroupMonitorFormData;
const MIN_SELECTED_MONITORS = 2;
const MIN_DELAY_MS = 1000;
console.log(">>>>>>---- monitor-group:35 ", formData);
// Initialize defaults if not set
if (!Array.isArray(formData.monitors)) formData.monitors = [];
if (typeof formData.delay !== "number" || !Number.isFinite(formData.delay) || formData.delay < MIN_DELAY_MS) {
formData.delay = MIN_DELAY_MS;
if (
typeof formData.executionDelay !== "number" ||
!Number.isFinite(formData.executionDelay) ||
formData.executionDelay < MIN_DELAY_MS
) {
formData.executionDelay = MIN_DELAY_MS;
}
if (!LATENCY_CALCULATION_OPTIONS.includes(formData.latencyCalculation)) {
formData.latencyCalculation = "AVG";
}
let delayInput = $state(String(formData.delay));
const parsedDelay = $derived.by(() => {
const value = Number(delayInput);
let executionDelayInput = $state(String(formData.executionDelay));
const parsedExecutionDelay = $derived.by(() => {
const value = Number(executionDelayInput);
return Number.isFinite(value) ? value : MIN_DELAY_MS;
});
$effect(() => {
formData.delay = parsedDelay;
formData.executionDelay = parsedExecutionDelay;
});
// Filter out GROUP monitors - groups can't contain other groups
let eligibleMonitors = $derived(availableMonitors.filter((m) => m.monitor_type !== "GROUP" && m.status === "ACTIVE"));
function isSelected(tag: string): boolean {
return formData.monitors.some((m: { tag: string }) => m.tag === tag);
let totalWeight = $derived(Math.round(formData.monitors.reduce((sum, m) => sum + m.weight, 0) * 1000) / 1000);
let weightsValid = $derived(Math.abs(totalWeight - 1) < 0.001 || formData.monitors.length === 0);
/**
 * Look up the selected group member for a monitor tag.
 * Returns the first entry of formData.monitors whose tag matches, or
 * undefined when that monitor is not part of the group.
 */
function findMember(monitorTag: string): GroupMonitorMember | undefined {
  const matchesTag = (member: { tag: string }) => member.tag === monitorTag;
  const match = formData.monitors.find(matchesTag);
  return match;
}
function toggleMonitor(tag: string) {
if (isSelected(tag)) {
formData.monitors = formData.monitors.filter((m: { tag: string }) => m.tag !== tag);
/** Report whether a monitor with the given tag is currently part of the group. */
function isSelected(monitorTag: string): boolean {
  // "At least one member has this tag" expressed as "not every member has a different tag".
  const hasDifferentTag = (member: { tag: string }) => member.tag !== monitorTag;
  return !formData.monitors.every(hasDifferentTag);
}
/**
 * Give every selected monitor the same weight, rounded to three decimals.
 * The last monitor receives whatever remains after the others have been
 * assigned, so the weights still total 1. Does nothing when no monitor is
 * selected.
 */
function distributeEqually() {
  const total = formData.monitors.length;
  if (total === 0) {
    return;
  }

  const lastIndex = total - 1;
  const share = Math.round((1 / total) * 1000) / 1000;
  const remainder = Math.round((1 - share * lastIndex) * 1000) / 1000;

  formData.monitors = formData.monitors.map((member, index) => {
    const weight = index === lastIndex ? remainder : share;
    return { ...member, weight };
  });
}
/**
 * Adds the monitor to the group when it is not selected, or removes it when it is.
 * After either change the weights of all remaining members are redistributed equally,
 * which replaces any weights that were set by hand.
 */
function toggleMonitor(monitorTag: string) {
  if (isSelected(monitorTag)) {
    formData.monitors = formData.monitors.filter((m) => m.tag !== monitorTag);
  } else {
    // Append exactly one member; the placeholder weight is overwritten by distributeEqually() below.
    formData.monitors = [...formData.monitors, { tag: monitorTag, weight: 0 }];
  }
  distributeEqually();
}
/**
 * Sets the weight of the selected monitor identified by `monitorTag`.
 * The value is rounded to three decimals and clamped to the range 0..1.
 * A NaN weight is ignored so that an unparseable value cannot be stored and turn the total into NaN.
 * Tags that are not part of the selection leave every member unchanged.
 */
function setWeight(monitorTag: string, weight: number) {
  if (Number.isNaN(weight)) return;
  const rounded = Math.round(weight * 1000) / 1000;
  const clamped = Math.min(1, Math.max(0, rounded));
  formData.monitors = formData.monitors.map((m) => {
    if (m.tag !== monitorTag) return m;
    return { ...m, weight: clamped };
  });
}
</script>
@@ -70,39 +103,60 @@
<div class="flex flex-col gap-1">
<Label>Select Monitors to Group</Label>
<p class="text-muted-foreground text-xs">
Group monitors aggregate the status of multiple monitors. Only active non-group monitors can be added.
Group monitors aggregate the status of multiple monitors using weighted scores. Each status has a score: UP=0,
DEGRADED=1, DOWN=2, MAINTENANCE=3. The weighted sum determines the group status.
</p>
<p class="text-muted-foreground text-xs">
Select at least {MIN_SELECTED_MONITORS} monitors to enable saving.
Select at least {MIN_SELECTED_MONITORS} monitors. Weights must sum to 1.
</p>
</div>
{#if eligibleMonitors.length > 0}
<div class="grid gap-2">
{#each eligibleMonitors.filter((m) => m.tag !== tag) as monitor (monitor.id ?? monitor.tag)}
<div
class="flex items-center justify-between rounded-lg border p-3 transition-colors {isSelected(monitor.tag)
? 'bg-primary/5'
: ''}"
>
<div class="flex items-center gap-3">
{#if monitor.image}
<img
src={clientResolver(resolve, monitor.image)}
alt={monitor.name}
class="size-8 rounded object-cover"
/>
{:else}
<div class="bg-muted flex size-8 items-center justify-center rounded text-xs font-medium">
{monitor.name.charAt(0).toUpperCase()}
{@const member = findMember(monitor.tag)}
{@const selected = !!member}
<div class="rounded-lg border p-3 transition-colors {selected ? 'bg-primary/5' : ''}">
<div class="flex items-center justify-between">
<div class="flex items-center gap-3">
{#if monitor.image}
<img
src={clientResolver(resolve, monitor.image)}
alt={monitor.name}
class="size-8 rounded object-cover"
/>
{:else}
<div class="bg-muted flex size-8 items-center justify-center rounded text-xs font-medium">
{monitor.name.charAt(0).toUpperCase()}
</div>
{/if}
<div>
<p class="text-sm font-medium">{monitor.name}</p>
<p class="text-muted-foreground text-xs">{monitor.tag}</p>
</div>
{/if}
<div>
<p class="text-sm font-medium">{monitor.name}</p>
<p class="text-muted-foreground text-xs">{monitor.tag}</p>
</div>
<Switch checked={selected} onCheckedChange={() => toggleMonitor(monitor.tag)} />
</div>
<Switch checked={isSelected(monitor.tag)} onCheckedChange={() => toggleMonitor(monitor.tag)} />
{#if selected && member}
<div class="mt-3 flex items-end gap-4 border-t pt-3">
<div class="space-y-1">
<Label class="text-xs">Weight</Label>
<Input
type="number"
min={0}
max={1}
step={0.01}
value={String(member.weight)}
class="h-8 w-[100px] text-xs"
onchange={(e) => {
const target = e.currentTarget as HTMLInputElement;
setWeight(monitor.tag, Number(target.value));
}}
/>
</div>
</div>
{/if}
</div>
{/each}
</div>
@@ -111,13 +165,35 @@
{/if}
</div>
{#if formData.monitors.length > 0}
<div class="flex items-center gap-3">
<div
class="flex items-center gap-2 rounded-md border px-3 py-1.5 text-sm {weightsValid
? 'border-green-500/50 bg-green-50 text-green-700 dark:bg-green-950 dark:text-green-300'
: 'border-destructive/50 text-destructive bg-red-50 dark:bg-red-950'}"
>
Total weight: {totalWeight}
{#if !weightsValid}
<span class="text-xs">(must equal 1)</span>
{/if}
</div>
<button
type="button"
class="text-muted-foreground hover:text-foreground text-xs underline underline-offset-2"
onclick={distributeEqually}
>
Distribute equally
</button>
</div>
{/if}
<div class="grid gap-4 md:grid-cols-2">
<div class="space-y-2">
<div class="flex items-center justify-between">
<Label for="group-timeout">Timeout (ms)</Label>
<Label for="group-execution-delay">Execution Delay (ms)</Label>
<span class="text-muted-foreground text-xs">Minimum {MIN_DELAY_MS}ms</span>
</div>
<Input id="group-timeout" type="number" min={MIN_DELAY_MS} step={100} bind:value={delayInput} />
<Input id="group-execution-delay" type="number" min={MIN_DELAY_MS} step={100} bind:value={executionDelayInput} />
<p class="text-muted-foreground text-xs">
Determines how long to wait for all child monitors before aggregating results.
</p>
@@ -150,9 +226,14 @@
{#if formData.monitors.length > 0}
<div class="bg-muted/50 rounded-lg p-3">
<p class="text-sm font-medium">Selected: {formData.monitors.length} monitor(s)</p>
<p class="text-muted-foreground text-xs">
{formData.monitors.map((m: { tag: string }) => m.tag).join(", ")}
</p>
<div class="mt-1 space-y-0.5">
{#each formData.monitors as m (m.tag)}
<p class="text-muted-foreground text-xs">
{m.tag}
<span class="ml-1 text-[10px]">weight {m.weight}</span>
</p>
{/each}
</div>
</div>
{/if}
</div>