feat(gpu-stats): add gpu stats to environments [C9S-200] (#2735)

This commit is contained in:
bernard-portainer
2026-06-22 09:21:43 +12:00
committed by GitHub
parent 217fe870ef
commit 5395dee4c6
9 changed files with 292 additions and 13 deletions
@@ -135,6 +135,10 @@ func (handler *Handler) parseHeaders(r *http.Request, endpoint *portainer.Endpoi
version := r.Header.Get(portainer.PortainerAgentHeader) version := r.Header.Get(portainer.PortainerAgentHeader)
endpoint.Agent.Version = version endpoint.Agent.Version = version
if gpuOperatorHeader := r.Header.Get(portainer.HTTPResponseAgentGPUOperator); gpuOperatorHeader != "" {
endpoint.Kubernetes.Flags.GPUOperator = gpuOperatorHeader == "true"
}
return nil return nil
} }
+1 -1
View File
@@ -24,5 +24,5 @@ func (snapshotter *Snapshotter) CreateSnapshot(endpoint *portainer.Endpoint) (*p
return nil, err return nil, err
} }
return snapshot.CreateKubernetesSnapshot(client) return snapshot.CreateKubernetesSnapshot(client, endpoint.Kubernetes.Flags.GPUOperator)
} }
+5
View File
@@ -830,6 +830,7 @@ type (
IsServerMetricsDetected bool `json:"IsServerMetricsDetected" validate:"required"` IsServerMetricsDetected bool `json:"IsServerMetricsDetected" validate:"required"`
IsServerIngressClassDetected bool `json:"IsServerIngressClassDetected" validate:"required"` IsServerIngressClassDetected bool `json:"IsServerIngressClassDetected" validate:"required"`
IsServerStorageDetected bool `json:"IsServerStorageDetected" validate:"required"` IsServerStorageDetected bool `json:"IsServerStorageDetected" validate:"required"`
GPUOperator bool `json:"GPUOperator,omitempty"`
} }
// KubernetesSnapshot represents a snapshot of a specific Kubernetes environment(endpoint) at a specific time // KubernetesSnapshot represents a snapshot of a specific Kubernetes environment(endpoint) at a specific time
@@ -840,6 +841,8 @@ type (
TotalCPU int64 `json:"TotalCPU" validate:"required"` TotalCPU int64 `json:"TotalCPU" validate:"required"`
TotalMemory int64 `json:"TotalMemory" validate:"required"` TotalMemory int64 `json:"TotalMemory" validate:"required"`
ClusterType string `json:"ClusterType,omitempty"` ClusterType string `json:"ClusterType,omitempty"`
GPUNodeCount int `json:"GPUNodeCount,omitempty"`
TotalGPU map[string]int64 `json:"TotalGPU,omitempty"`
DiagnosticsData *DiagnosticsData `json:"DiagnosticsData,omitempty"` DiagnosticsData *DiagnosticsData `json:"DiagnosticsData,omitempty"`
PerformanceMetrics *PerformanceMetrics `json:"PerformanceMetrics,omitempty"` PerformanceMetrics *PerformanceMetrics `json:"PerformanceMetrics,omitempty"`
} }
@@ -2078,6 +2081,8 @@ const (
PortainerAgentKubernetesSATokenHeader = "X-PortainerAgent-SA-Token" PortainerAgentKubernetesSATokenHeader = "X-PortainerAgent-SA-Token"
// HTTPAlertStateHeaderName is the name of the header used to transmit edge alert evaluation state // HTTPAlertStateHeaderName is the name of the header used to transmit edge alert evaluation state
HTTPAlertStateHeaderName = "X-PortainerAgent-AlertState" HTTPAlertStateHeaderName = "X-PortainerAgent-AlertState"
// HTTPResponseAgentGPUOperator represents the name of the header indicating whether the GPU operator is enabled on the agent
HTTPResponseAgentGPUOperator = "Portainer-Agent-GPU-Operator"
// PortainerAgentSignatureMessage represents the message used to create a digital signature // PortainerAgentSignatureMessage represents the message used to create a digital signature
// to be used when communicating with an agent // to be used when communicating with an agent
PortainerAgentSignatureMessage = "Portainer-App" PortainerAgentSignatureMessage = "Portainer-App"
+15 -1
View File
@@ -1,6 +1,12 @@
import { render, screen } from '@testing-library/react'; import { render, screen } from '@testing-library/react';
import { ContainerStats, CPUStats, MemoryStats, NodeStats } from './StatsItem'; import {
ContainerStats,
CPUStats,
GpuStats,
MemoryStats,
NodeStats,
} from './StatsItem';
describe('StatsItem', () => { describe('StatsItem', () => {
describe('NodeStats', () => { describe('NodeStats', () => {
@@ -27,6 +33,14 @@ describe('StatsItem', () => {
}); });
}); });
describe('GpuStats', () => {
  // The tile must show both the numeric value and its "GPUS" caption.
  it('renders GPU count with GPUS label', () => {
    render(<GpuStats value={4} />);

    ['4', 'GPUS'].forEach((text) => {
      expect(screen.getByText(text)).toBeVisible();
    });
  });
});
describe('ContainerStats', () => { describe('ContainerStats', () => {
it('renders running/total containers with a progress bar', () => { it('renders running/total containers with a progress bar', () => {
render(<ContainerStats total={5} running={3} stopped={2} />); render(<ContainerStats total={5} running={3} stopped={2} />);
+9 -1
View File
@@ -1,6 +1,6 @@
import clsx from 'clsx'; import clsx from 'clsx';
import { PropsWithChildren } from 'react'; import { PropsWithChildren } from 'react';
import { Cpu, Hexagon, LaptopMinimal, MemoryStick } from 'lucide-react'; import { Cpu, Gpu, Hexagon, LaptopMinimal, MemoryStick } from 'lucide-react';
import { Icon, IconProps } from '@/react/components/Icon'; import { Icon, IconProps } from '@/react/components/Icon';
@@ -65,6 +65,14 @@ export function MemoryStats({ value }: StatsProps) {
); );
} }
/**
 * Renders `value` in bold inside a `StatsItem` that uses the `Gpu` icon
 * and the title "GPUS".
 */
export function GpuStats(props: StatsProps) {
  const { value } = props;
  const content = (
    <span className="text-left font-bold leading-none">{value}</span>
  );

  return (
    <StatsItem icon={Gpu} title="GPUS">
      {content}
    </StatsItem>
  );
}
interface ContainerStatsProps { interface ContainerStatsProps {
total: number; total: number;
running: number; running: number;
@@ -1,7 +1,7 @@
import { KubernetesSnapshot } from '@/react/portainer/environments/types'; import { KubernetesSnapshot } from '@/react/portainer/environments/types';
import { humanize } from '@/portainer/filters/filters'; import { humanize } from '@/portainer/filters/filters';
import { CPUStats, MemoryStats, NodeStats } from '@@/StatsItem'; import { CPUStats, GpuStats, MemoryStats, NodeStats } from '@@/StatsItem';
interface Props { interface Props {
snapshot?: KubernetesSnapshot; snapshot?: KubernetesSnapshot;
@@ -12,8 +12,15 @@ export function EnvironmentStatsKubernetes({ snapshot }: Props) {
return <>No snapshot available</>; return <>No snapshot available</>;
} }
const totalGpuCount = Object.values(snapshot.TotalGPU ?? {}).reduce(
(sum, v) => sum + v,
0
);
return ( return (
<> <>
{totalGpuCount > 0 && <GpuStats value={totalGpuCount} />}
<NodeStats value={snapshot.NodeCount} /> <NodeStats value={snapshot.NodeCount} />
<CPUStats value={snapshot.TotalCPU} /> <CPUStats value={snapshot.TotalCPU} />
@@ -58,6 +58,8 @@ export interface KubernetesSnapshot {
TotalMemory: number; TotalMemory: number;
Time: number; Time: number;
NodeCount: number; NodeCount: number;
GPUNodeCount?: number;
TotalGPU?: Record<string, number>;
} }
export type IngressClass = { export type IngressClass = {
+27 -3
View File
@@ -22,14 +22,14 @@ import (
"k8s.io/client-go/kubernetes" "k8s.io/client-go/kubernetes"
) )
func CreateKubernetesSnapshot(cli *kubernetes.Clientset) (*portainer.KubernetesSnapshot, error) { func CreateKubernetesSnapshot(cli *kubernetes.Clientset, gpuOperator bool) (*portainer.KubernetesSnapshot, error) {
kubernetesSnapshot := &portainer.KubernetesSnapshot{} kubernetesSnapshot := &portainer.KubernetesSnapshot{}
err := kubernetesSnapshotVersion(kubernetesSnapshot, cli) err := kubernetesSnapshotVersion(kubernetesSnapshot, cli)
if err != nil { if err != nil {
log.Warn().Err(err).Msg("unable to snapshot cluster version") log.Warn().Err(err).Msg("unable to snapshot cluster version")
} }
err = kubernetesSnapshotNodes(kubernetesSnapshot, cli) err = kubernetesSnapshotNodes(kubernetesSnapshot, cli, gpuOperator)
if err != nil { if err != nil {
log.Warn().Err(err).Msg("unable to snapshot cluster nodes") log.Warn().Err(err).Msg("unable to snapshot cluster nodes")
} }
@@ -48,7 +48,7 @@ func kubernetesSnapshotVersion(snapshot *portainer.KubernetesSnapshot, cli kuber
return nil return nil
} }
func kubernetesSnapshotNodes(snapshot *portainer.KubernetesSnapshot, cli kubernetes.Interface) error { func kubernetesSnapshotNodes(snapshot *portainer.KubernetesSnapshot, cli kubernetes.Interface, gpuOperator bool) error {
nodeList, err := cli.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{}) nodeList, err := cli.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{})
if err != nil { if err != nil {
return err return err
@@ -58,16 +58,40 @@ func kubernetesSnapshotNodes(snapshot *portainer.KubernetesSnapshot, cli kuberne
return nil return nil
} }
totalGPU := make(map[string]int64)
var totalCPUs, totalMemory int64 var totalCPUs, totalMemory int64
var gpuNodeCount int
for _, node := range nodeList.Items { for _, node := range nodeList.Items {
totalCPUs += node.Status.Capacity.Cpu().Value() totalCPUs += node.Status.Capacity.Cpu().Value()
totalMemory += node.Status.Capacity.Memory().Value() totalMemory += node.Status.Capacity.Memory().Value()
if gpuOperator {
nodeHasGPU := false
for resourceName, quantity := range node.Status.Capacity {
if strings.HasPrefix(string(resourceName), "nvidia.com/") {
totalGPU[string(resourceName)] += quantity.Value()
nodeHasGPU = true
}
}
if nodeHasGPU {
gpuNodeCount++
}
}
} }
snapshot.TotalCPU = totalCPUs snapshot.TotalCPU = totalCPUs
snapshot.TotalMemory = totalMemory snapshot.TotalMemory = totalMemory
snapshot.NodeCount = len(nodeList.Items) snapshot.NodeCount = len(nodeList.Items)
snapshot.ClusterType = clusterTypeFromProviderID(nodeList.Items[0].Spec.ProviderID) snapshot.ClusterType = clusterTypeFromProviderID(nodeList.Items[0].Spec.ProviderID)
if gpuOperator {
snapshot.GPUNodeCount = gpuNodeCount
if len(totalGPU) > 0 {
snapshot.TotalGPU = totalGPU
}
}
return nil return nil
} }
+221 -6
View File
@@ -2,6 +2,7 @@ package snapshot
import ( import (
"errors" "errors"
"fmt"
"testing" "testing"
portainer "github.com/portainer/portainer/api" portainer "github.com/portainer/portainer/api"
@@ -94,7 +95,7 @@ func TestKubernetesSnapshotNodes(t *testing.T) {
snapshot := &portainer.KubernetesSnapshot{} snapshot := &portainer.KubernetesSnapshot{}
// Use the actual function now that it accepts kubernetes.Interface // Use the actual function now that it accepts kubernetes.Interface
err = kubernetesSnapshotNodes(snapshot, fakeClient) err = kubernetesSnapshotNodes(snapshot, fakeClient, false)
require.NoError(t, err) require.NoError(t, err)
// Verify the results - these should match what kubernetesSnapshotNodes would produce // Verify the results - these should match what kubernetesSnapshotNodes would produce
@@ -103,6 +104,8 @@ func TestKubernetesSnapshotNodes(t *testing.T) {
require.Equal(t, int64(25769803776), snapshot.TotalMemory) // 12GB + 8GB + 4GB = 24GB in bytes require.Equal(t, int64(25769803776), snapshot.TotalMemory) // 12GB + 8GB + 4GB = 24GB in bytes
require.Equal(t, ClusterTypeGKEAutopilot, snapshot.ClusterType) // detected from node1's ProviderID require.Equal(t, ClusterTypeGKEAutopilot, snapshot.ClusterType) // detected from node1's ProviderID
require.Nil(t, snapshot.PerformanceMetrics) // Performance metrics are no longer collected server-side require.Nil(t, snapshot.PerformanceMetrics) // Performance metrics are no longer collected server-side
require.Equal(t, 0, snapshot.GPUNodeCount)
require.Nil(t, snapshot.TotalGPU)
t.Logf("kubernetesSnapshotNodes test result: Nodes=%d, CPUs=%d, Memory=%d bytes", t.Logf("kubernetesSnapshotNodes test result: Nodes=%d, CPUs=%d, Memory=%d bytes",
snapshot.NodeCount, snapshot.TotalCPU, snapshot.TotalMemory) snapshot.NodeCount, snapshot.TotalCPU, snapshot.TotalMemory)
@@ -114,7 +117,7 @@ func TestKubernetesSnapshotNodesEmptyCluster(t *testing.T) {
fakeClient := kfake.NewClientset() fakeClient := kfake.NewClientset()
snapshot := &portainer.KubernetesSnapshot{} snapshot := &portainer.KubernetesSnapshot{}
err := kubernetesSnapshotNodes(snapshot, fakeClient) err := kubernetesSnapshotNodes(snapshot, fakeClient, false)
require.NoError(t, err) require.NoError(t, err)
// Values should remain at their zero state when no nodes exist // Values should remain at their zero state when no nodes exist
@@ -170,7 +173,7 @@ func TestCreateKubernetesSnapshotIntegration(t *testing.T) {
// Test that kubernetesSnapshotNodes logic works // Test that kubernetesSnapshotNodes logic works
snapshot := &portainer.KubernetesSnapshot{} snapshot := &portainer.KubernetesSnapshot{}
err = kubernetesSnapshotNodes(snapshot, fakeClient) err = kubernetesSnapshotNodes(snapshot, fakeClient, false)
require.NoError(t, err) require.NoError(t, err)
// Verify the integration results // Verify the integration results
@@ -198,7 +201,7 @@ func TestKubernetesSnapshotNodesWithAPIError(t *testing.T) {
}) })
snapshot := &portainer.KubernetesSnapshot{} snapshot := &portainer.KubernetesSnapshot{}
err := kubernetesSnapshotNodes(snapshot, fakeClient) err := kubernetesSnapshotNodes(snapshot, fakeClient, false)
// Should return the API error // Should return the API error
require.Error(t, err) require.Error(t, err)
@@ -234,7 +237,7 @@ func TestKubernetesSnapshotNodesSingleNode(t *testing.T) {
require.NoError(t, err) require.NoError(t, err)
snapshot := &portainer.KubernetesSnapshot{} snapshot := &portainer.KubernetesSnapshot{}
err = kubernetesSnapshotNodes(snapshot, fakeClient) err = kubernetesSnapshotNodes(snapshot, fakeClient, false)
require.NoError(t, err) require.NoError(t, err)
require.Equal(t, 1, snapshot.NodeCount) require.Equal(t, 1, snapshot.NodeCount)
@@ -246,6 +249,123 @@ func TestKubernetesSnapshotNodesSingleNode(t *testing.T) {
snapshot.NodeCount, snapshot.TotalCPU, snapshot.TotalMemory) snapshot.NodeCount, snapshot.TotalCPU, snapshot.TotalMemory)
} }
// TestKubernetesSnapshotNodesWithGPU creates one node exposing 4
// "nvidia.com/gpu" plus one CPU-only node, runs kubernetesSnapshotNodes with
// gpuOperator=true, and checks the node count, the GPU node count and the
// aggregated GPU total.
func TestKubernetesSnapshotNodesWithGPU(t *testing.T) {
	t.Parallel()

	cluster := kfake.NewClientset()

	// Created in this order: the GPU node first, then the CPU-only node.
	fixtures := []*corev1.Node{
		{
			ObjectMeta: metav1.ObjectMeta{Name: "gpu-node"},
			Status: corev1.NodeStatus{
				Capacity: corev1.ResourceList{
					corev1.ResourceCPU:    resource.MustParse("8"),
					corev1.ResourceMemory: resource.MustParse("16Gi"),
					"nvidia.com/gpu":      resource.MustParse("4"),
				},
			},
		},
		{
			ObjectMeta: metav1.ObjectMeta{Name: "cpu-node"},
			Status: corev1.NodeStatus{
				Capacity: corev1.ResourceList{
					corev1.ResourceCPU:    resource.MustParse("4"),
					corev1.ResourceMemory: resource.MustParse("8Gi"),
				},
			},
		},
	}
	for _, n := range fixtures {
		_, createErr := cluster.CoreV1().Nodes().Create(t.Context(), n, metav1.CreateOptions{})
		require.NoError(t, createErr)
	}

	got := &portainer.KubernetesSnapshot{}
	require.NoError(t, kubernetesSnapshotNodes(got, cluster, true))

	require.Equal(t, 2, got.NodeCount)
	require.Equal(t, 1, got.GPUNodeCount)
	require.Equal(t, int64(4), got.TotalGPU["nvidia.com/gpu"])
}
// TestKubernetesSnapshotNodesMultipleGPUTypes checks that a single node
// advertising two different "nvidia.com/" resources is counted once as a GPU
// node while each resource name keeps its own total.
func TestKubernetesSnapshotNodesMultipleGPUTypes(t *testing.T) {
	t.Parallel()

	cluster := kfake.NewClientset()

	capacity := corev1.ResourceList{
		corev1.ResourceCPU:       resource.MustParse("8"),
		corev1.ResourceMemory:    resource.MustParse("16Gi"),
		"nvidia.com/gpu":         resource.MustParse("2"),
		"nvidia.com/mig-2g.10gb": resource.MustParse("4"),
	}
	migNode := &corev1.Node{
		ObjectMeta: metav1.ObjectMeta{Name: "mig-node"},
		Status:     corev1.NodeStatus{Capacity: capacity},
	}
	_, createErr := cluster.CoreV1().Nodes().Create(t.Context(), migNode, metav1.CreateOptions{})
	require.NoError(t, createErr)

	got := &portainer.KubernetesSnapshot{}
	require.NoError(t, kubernetesSnapshotNodes(got, cluster, true))

	require.Equal(t, 1, got.GPUNodeCount)

	// A slice (not a map) keeps the assertion order fixed.
	expected := []struct {
		resourceName string
		total        int64
	}{
		{"nvidia.com/gpu", 2},
		{"nvidia.com/mig-2g.10gb", 4},
	}
	for _, e := range expected {
		require.Equal(t, e.total, got.TotalGPU[e.resourceName])
	}
}
// TestKubernetesSnapshotNodesGPUAggregatedAcrossNodes creates two nodes with
// 2 and 4 "nvidia.com/gpu" respectively and checks that the snapshot reports
// two GPU nodes and a summed total of 6.
func TestKubernetesSnapshotNodesGPUAggregatedAcrossNodes(t *testing.T) {
	t.Parallel()

	cluster := kfake.NewClientset()

	perNodeGPUs := []int{2, 4}
	for idx := range perNodeGPUs {
		nodeName := fmt.Sprintf("gpu-node-%d", idx)
		gpuQty := resource.NewQuantity(int64(perNodeGPUs[idx]), resource.DecimalSI)

		gpuNode := &corev1.Node{
			ObjectMeta: metav1.ObjectMeta{Name: nodeName},
			Status: corev1.NodeStatus{
				Capacity: corev1.ResourceList{
					corev1.ResourceCPU:    resource.MustParse("8"),
					corev1.ResourceMemory: resource.MustParse("16Gi"),
					"nvidia.com/gpu":      *gpuQty,
				},
			},
		}
		_, createErr := cluster.CoreV1().Nodes().Create(t.Context(), gpuNode, metav1.CreateOptions{})
		require.NoError(t, createErr)
	}

	got := &portainer.KubernetesSnapshot{}
	require.NoError(t, kubernetesSnapshotNodes(got, cluster, true))

	require.Equal(t, 2, got.GPUNodeCount)
	require.Equal(t, int64(6), got.TotalGPU["nvidia.com/gpu"]) // 2 + 4
}
// TestKubernetesSnapshotNodesNoGPULeavesTotalGPUNil snapshots a single
// CPU-only node and checks that GPUNodeCount stays 0 and TotalGPU stays nil.
// Note that the snapshot is taken with gpuOperator=false.
func TestKubernetesSnapshotNodesNoGPULeavesTotalGPUNil(t *testing.T) {
	t.Parallel()

	cluster := kfake.NewClientset()

	capacity := corev1.ResourceList{
		corev1.ResourceCPU:    resource.MustParse("4"),
		corev1.ResourceMemory: resource.MustParse("8Gi"),
	}
	cpuOnly := &corev1.Node{
		ObjectMeta: metav1.ObjectMeta{Name: "cpu-only-node"},
		Status:     corev1.NodeStatus{Capacity: capacity},
	}
	_, createErr := cluster.CoreV1().Nodes().Create(t.Context(), cpuOnly, metav1.CreateOptions{})
	require.NoError(t, createErr)

	got := &portainer.KubernetesSnapshot{}
	require.NoError(t, kubernetesSnapshotNodes(got, cluster, false))

	require.Equal(t, 0, got.GPUNodeCount)
	require.Nil(t, got.TotalGPU)
}
func TestKubernetesSnapshotNodesZeroResources(t *testing.T) { func TestKubernetesSnapshotNodesZeroResources(t *testing.T) {
t.Parallel() t.Parallel()
// Test with nodes that have zero or very small resources // Test with nodes that have zero or very small resources
@@ -267,7 +387,7 @@ func TestKubernetesSnapshotNodesZeroResources(t *testing.T) {
require.NoError(t, err) require.NoError(t, err)
snapshot := &portainer.KubernetesSnapshot{} snapshot := &portainer.KubernetesSnapshot{}
err = kubernetesSnapshotNodes(snapshot, fakeClient) err = kubernetesSnapshotNodes(snapshot, fakeClient, false)
require.NoError(t, err) require.NoError(t, err)
require.Equal(t, 1, snapshot.NodeCount) require.Equal(t, 1, snapshot.NodeCount)
@@ -277,3 +397,98 @@ func TestKubernetesSnapshotNodesZeroResources(t *testing.T) {
t.Log("Zero resources test passed - handles edge case correctly") t.Log("Zero resources test passed - handles edge case correctly")
} }
// TestKubernetesSnapshotNodesGPUOperator runs kubernetesSnapshotNodes against
// the same two-node cluster (one GPU node, one CPU-only node) with the
// gpuOperator flag off and on. Node, CPU and memory totals must be identical
// in both cases; only the GPU fields are expected to differ.
func TestKubernetesSnapshotNodesGPUOperator(t *testing.T) {
	t.Parallel()

	// Total node memory in bytes: 16GiB (gpu-node) + 8GiB (cpu-node).
	const totalMemoryBytes = int64(25769803776)

	type testCase struct {
		name         string
		gpuOperator  bool
		wantGPUCount int
		wantTotalGPU map[string]int64
	}

	cases := []testCase{
		{
			name:         "disabled does not populate GPU fields even when GPU nodes exist",
			gpuOperator:  false,
			wantGPUCount: 0,
			wantTotalGPU: nil,
		},
		{
			name:         "enabled populates GPU fields from GPU nodes",
			gpuOperator:  true,
			wantGPUCount: 1,
			wantTotalGPU: map[string]int64{"nvidia.com/gpu": 4},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			t.Parallel()

			// Each subtest gets its own fake cluster so parallel runs do not share state.
			cluster := kfake.NewClientset()

			gpuNode := &corev1.Node{
				ObjectMeta: metav1.ObjectMeta{Name: "gpu-node"},
				Status: corev1.NodeStatus{
					Capacity: corev1.ResourceList{
						corev1.ResourceCPU:    resource.MustParse("8"),
						corev1.ResourceMemory: resource.MustParse("16Gi"),
						"nvidia.com/gpu":      resource.MustParse("4"),
					},
				},
			}
			cpuNode := &corev1.Node{
				ObjectMeta: metav1.ObjectMeta{Name: "cpu-node"},
				Status: corev1.NodeStatus{
					Capacity: corev1.ResourceList{
						corev1.ResourceCPU:    resource.MustParse("4"),
						corev1.ResourceMemory: resource.MustParse("8Gi"),
					},
				},
			}
			for _, node := range []*corev1.Node{gpuNode, cpuNode} {
				_, createErr := cluster.CoreV1().Nodes().Create(t.Context(), node, metav1.CreateOptions{})
				require.NoError(t, createErr)
			}

			got := &portainer.KubernetesSnapshot{}
			require.NoError(t, kubernetesSnapshotNodes(got, cluster, tc.gpuOperator))

			require.Equal(t, 2, got.NodeCount)
			require.Equal(t, int64(12), got.TotalCPU)
			require.Equal(t, totalMemoryBytes, got.TotalMemory)
			require.Equal(t, tc.wantGPUCount, got.GPUNodeCount)
			require.Equal(t, tc.wantTotalGPU, got.TotalGPU)
		})
	}
}
// TestKubernetesSnapshotNodesGPUOperatorEnabledNoGPUNodes snapshots a cluster
// with a single CPU-only node while gpuOperator=true and checks that the GPU
// fields stay at their zero values (count 0, nil map).
func TestKubernetesSnapshotNodesGPUOperatorEnabledNoGPUNodes(t *testing.T) {
	t.Parallel()

	cluster := kfake.NewClientset()

	capacity := corev1.ResourceList{
		corev1.ResourceCPU:    resource.MustParse("4"),
		corev1.ResourceMemory: resource.MustParse("8Gi"),
	}
	cpuNode := &corev1.Node{
		ObjectMeta: metav1.ObjectMeta{Name: "cpu-node"},
		Status:     corev1.NodeStatus{Capacity: capacity},
	}
	_, createErr := cluster.CoreV1().Nodes().Create(t.Context(), cpuNode, metav1.CreateOptions{})
	require.NoError(t, createErr)

	got := &portainer.KubernetesSnapshot{}
	require.NoError(t, kubernetesSnapshotNodes(got, cluster, true))

	require.Equal(t, 1, got.NodeCount)
	require.Equal(t, int64(4), got.TotalCPU)
	require.Equal(t, 0, got.GPUNodeCount)
	require.Nil(t, got.TotalGPU)
}