mirror of
https://github.com/ultravioletrs/cocos.git
synced 2026-06-23 04:10:25 +00:00
8eb1fac9ad
* Refactor and update dependencies in the project - Updated go.sum to replace `github.com/absmach/magistrala` with `github.com/absmach/supermq` across various modules. - Removed VSock configuration from environment variables and QEMU arguments. - Updated QEMU configuration and related tests to remove references to guest CID and VSock. - Added new HTTP transport layer for API endpoints in the manager. - Introduced Prometheus monitoring configuration with alert rules and Alertmanager setup. - Updated service and VM interfaces to remove unused methods and references. - Refactored tests to align with the new structure and dependencies. Signed-off-by: Sammy Oina <sammyoina@gmail.com> * Add MaxVMs configuration and enforce limit on VM creation Signed-off-by: Sammy Oina <sammyoina@gmail.com> * Add comprehensive tests for HTTP transport handlers and endpoints Signed-off-by: Sammy Oina <sammyoina@gmail.com> * Add test case for exceeding maximum number of VMs in TestRun Signed-off-by: Sammy Oina <sammyoina@gmail.com> * Improve error handling in TestHandlerWithCustomRouter to ensure response writing is checked Signed-off-by: Sammy Oina <sammyoina@gmail.com> * Update dependencies to latest versions - Upgrade cel.dev/expr from v0.23.0 to v0.24.0 - Upgrade github.com/absmach/supermq from v0.16.0 to v0.17.0 - Upgrade github.com/cenkalti/backoff from v4.3.0 to v5.0.2 - Upgrade github.com/cncf/xds/go to v0.0.0-20250501225837-2ac532fd4443 - Upgrade github.com/go-chi/chi/v5 from v5.2.1 to v5.2.2 - Upgrade github.com/go-jose/go-jose/v3 from v3.0.3 to v3.0.4 - Upgrade github.com/gofrs/uuid/v5 from v5.3.0 to v5.3.2 - Upgrade github.com/prometheus/client_golang from v1.22.0 to v1.23.0 - Upgrade github.com/prometheus/client_model from v0.6.1 to v0.6.2 - Upgrade github.com/prometheus/common from v0.62.0 to v0.65.0 - Upgrade github.com/prometheus/procfs from v0.15.1 to v0.16.1 - Upgrade go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp from v0.60.0 to v0.62.0 - Upgrade 
go.opentelemetry.io/otel/exporters/otlp/otlptrace from v1.36.0 to v1.37.0 - Upgrade golang.org/x/crypto from v0.39.0 to v0.40.0 - Upgrade golang.org/x/sys from v0.33.0 to v0.34.0 - Upgrade golang.org/x/text from v0.26.0 to v0.27.0 - Upgrade golang.org/x/time from v0.11.0 to v0.12.0 - Upgrade google.golang.org/grpc from v1.73.0 to v1.74.2 Signed-off-by: Sammy Oina <sammyoina@gmail.com> --------- Signed-off-by: Sammy Oina <sammyoina@gmail.com>
58 lines
2.2 KiB
YAML
58 lines
2.2 KiB
YAML
# Prometheus alerting rules for the cocos-manager scrape job.
# Every rule carries a `service: cocos-manager` label plus a `severity` label
# (critical / warning).
groups:
  - name: cocos-manager-alerts
    rules:
      # Service down alert - primary method using the `up` metric.
      # Fires when the scrape target exists but has reported up == 0 for 1m.
      - alert: CocosManagerDown
        expr: up{job="cocos-manager"} == 0
        for: 1m
        labels:
          severity: critical
          service: cocos-manager
        annotations:
          summary: "Cocos Manager service is down"
          description: "Cocos Manager service has been down for more than 1 minute. Instance: {{ $labels.instance }}"

      # Alternative: no metrics received (for cases where the `up` metric might
      # not be reliable). absent() returns a result only when no `up` series
      # exists for the job, so there is no instance label to report here.
      - alert: CocosManagerNoMetrics
        expr: absent(up{job="cocos-manager"})
        for: 2m
        labels:
          severity: critical
          service: cocos-manager
        annotations:
          summary: "No metrics received from Cocos Manager"
          description: "No metrics have been received from Cocos Manager for more than 2 minutes"

      # Health-endpoint specific alert (only effective if health is exposed as
      # a metric).
      # NOTE(review): assumes a `health_check` gauge is exported by the job,
      # with 0 meaning unhealthy - confirm against the service's metrics.
      - alert: CocosManagerUnhealthy
        expr: health_check{job="cocos-manager"} == 0
        for: 30s
        labels:
          severity: warning
          service: cocos-manager
        annotations:
          summary: "Cocos Manager health check failing"
          description: "Cocos Manager health check has been failing for more than 30 seconds. Instance: {{ $labels.instance }}"

      # High latency alert: 95th percentile over a 5m rate window above 2.
      # NOTE(review): the description reports the value in seconds; this
      # assumes cocos_manager_latency is recorded in seconds - confirm.
      - alert: CocosManagerHighLatency
        expr: histogram_quantile(0.95, rate(cocos_manager_latency_bucket[5m])) > 2
        for: 5m
        labels:
          severity: warning
          service: cocos-manager
        annotations:
          summary: "Cocos Manager high latency"
          description: "Cocos Manager 95th percentile latency is above 2 seconds for more than 5 minutes. Current value: {{ $value }}s"

      # High error rate alert.
      # The expression is an absolute per-second rate of the error counter,
      # not a ratio of errors to requests, so the threshold 0.1 means
      # 0.1 errors per second (the description is worded accordingly).
      - alert: CocosManagerHighErrorRate
        expr: rate(cocos_manager_errors_total[5m]) > 0.1
        for: 2m
        labels:
          severity: warning
          service: cocos-manager
        annotations:
          summary: "Cocos Manager high error rate"
          description: "Cocos Manager error rate is above 0.1 errors per second for more than 2 minutes. Current rate: {{ $value }} errors/s"