From 5395dee4c6da5aeeff5367c470916fc192545936 Mon Sep 17 00:00:00 2001 From: bernard-portainer Date: Mon, 22 Jun 2026 09:21:43 +1200 Subject: [PATCH] feat(gpu-stats): add gpu stats to environments [C9S-200] (#2735) --- .../endpointedge_status_inspect.go | 4 + api/kubernetes/snapshot.go | 2 +- api/portainer.go | 5 + app/react/components/StatsItem.test.tsx | 16 +- app/react/components/StatsItem.tsx | 10 +- .../EnvironmentStatsKubernetes.tsx | 9 +- app/react/portainer/environments/types.ts | 2 + pkg/snapshot/kubernetes.go | 30 ++- pkg/snapshot/kubernetes_test.go | 227 +++++++++++++++++- 9 files changed, 292 insertions(+), 13 deletions(-) diff --git a/api/http/handler/endpointedge/endpointedge_status_inspect.go b/api/http/handler/endpointedge/endpointedge_status_inspect.go index 65f39c7e4b..23d5434fab 100644 --- a/api/http/handler/endpointedge/endpointedge_status_inspect.go +++ b/api/http/handler/endpointedge/endpointedge_status_inspect.go @@ -135,6 +135,10 @@ func (handler *Handler) parseHeaders(r *http.Request, endpoint *portainer.Endpoi version := r.Header.Get(portainer.PortainerAgentHeader) endpoint.Agent.Version = version + if gpuOperatorHeader := r.Header.Get(portainer.HTTPResponseAgentGPUOperator); gpuOperatorHeader != "" { + endpoint.Kubernetes.Flags.GPUOperator = gpuOperatorHeader == "true" + } + return nil } diff --git a/api/kubernetes/snapshot.go b/api/kubernetes/snapshot.go index c8f6c2e598..c75d510846 100644 --- a/api/kubernetes/snapshot.go +++ b/api/kubernetes/snapshot.go @@ -24,5 +24,5 @@ func (snapshotter *Snapshotter) CreateSnapshot(endpoint *portainer.Endpoint) (*p return nil, err } - return snapshot.CreateKubernetesSnapshot(client) + return snapshot.CreateKubernetesSnapshot(client, endpoint.Kubernetes.Flags.GPUOperator) } diff --git a/api/portainer.go b/api/portainer.go index 1bca4c04ea..5b925e9cda 100644 --- a/api/portainer.go +++ b/api/portainer.go @@ -830,6 +830,7 @@ type ( IsServerMetricsDetected bool `json:"IsServerMetricsDetected" validate:"required"` IsServerIngressClassDetected bool `json:"IsServerIngressClassDetected" validate:"required"` IsServerStorageDetected bool `json:"IsServerStorageDetected" validate:"required"` + GPUOperator bool `json:"GPUOperator,omitempty"` } // KubernetesSnapshot represents a snapshot of a specific Kubernetes environment(endpoint) at a specific time @@ -840,6 +841,8 @@ type ( TotalCPU int64 `json:"TotalCPU" validate:"required"` TotalMemory int64 `json:"TotalMemory" validate:"required"` ClusterType string `json:"ClusterType,omitempty"` + GPUNodeCount int `json:"GPUNodeCount,omitempty"` + TotalGPU map[string]int64 `json:"TotalGPU,omitempty"` DiagnosticsData *DiagnosticsData `json:"DiagnosticsData,omitempty"` PerformanceMetrics *PerformanceMetrics `json:"PerformanceMetrics,omitempty"` } @@ -2078,6 +2081,8 @@ const ( PortainerAgentKubernetesSATokenHeader = "X-PortainerAgent-SA-Token" // HTTPAlertStateHeaderName is the name of the header used to transmit edge alert evaluation state HTTPAlertStateHeaderName = "X-PortainerAgent-AlertState" + // HTTPResponseAgentGPUOperator represents the name of the header indicating whether the GPU operator is enabled on the agent + HTTPResponseAgentGPUOperator = "Portainer-Agent-GPU-Operator" // PortainerAgentSignatureMessage represents the message used to create a digital signature // to be used when communicating with an agent PortainerAgentSignatureMessage = "Portainer-App" diff --git a/app/react/components/StatsItem.test.tsx b/app/react/components/StatsItem.test.tsx index 254fc8d86c..e9a922bfd7 100644 --- a/app/react/components/StatsItem.test.tsx +++ b/app/react/components/StatsItem.test.tsx @@ -1,6 +1,12 @@ import { render, screen } from '@testing-library/react'; -import { ContainerStats, CPUStats, MemoryStats, NodeStats } from './StatsItem'; +import { + ContainerStats, + CPUStats, + GpuStats, + MemoryStats, + NodeStats, +} from './StatsItem'; describe('StatsItem', () => { describe('NodeStats', () => { @@ -27,6 +33,14 @@ describe('StatsItem', () => { }); }); + describe('GpuStats', () => { + it('renders GPU count with GPUS label', () => { + render(); + expect(screen.getByText('4')).toBeVisible(); + expect(screen.getByText('GPUS')).toBeVisible(); + }); + }); + describe('ContainerStats', () => { it('renders running/total containers with a progress bar', () => { render(); diff --git a/app/react/components/StatsItem.tsx b/app/react/components/StatsItem.tsx index fdbaefe002..3555de1e62 100644 --- a/app/react/components/StatsItem.tsx +++ b/app/react/components/StatsItem.tsx @@ -1,6 +1,6 @@ import clsx from 'clsx'; import { PropsWithChildren } from 'react'; -import { Cpu, Hexagon, LaptopMinimal, MemoryStick } from 'lucide-react'; +import { Cpu, Gpu, Hexagon, LaptopMinimal, MemoryStick } from 'lucide-react'; import { Icon, IconProps } from '@/react/components/Icon'; @@ -65,6 +65,14 @@ export function MemoryStats({ value }: StatsProps) { ); } +export function GpuStats({ value }: StatsProps) { + return ( + + {value} + + ); +} + interface ContainerStatsProps { total: number; running: number; diff --git a/app/react/portainer/HomeView/EnvironmentList/EnvironmentItem/EnvironmentStatsKubernetes.tsx b/app/react/portainer/HomeView/EnvironmentList/EnvironmentItem/EnvironmentStatsKubernetes.tsx index 1be4cd870e..855c94f7c0 100644 --- a/app/react/portainer/HomeView/EnvironmentList/EnvironmentItem/EnvironmentStatsKubernetes.tsx +++ b/app/react/portainer/HomeView/EnvironmentList/EnvironmentItem/EnvironmentStatsKubernetes.tsx @@ -1,7 +1,7 @@ import { KubernetesSnapshot } from '@/react/portainer/environments/types'; import { humanize } from '@/portainer/filters/filters'; -import { CPUStats, MemoryStats, NodeStats } from '@@/StatsItem'; +import { CPUStats, GpuStats, MemoryStats, NodeStats } from '@@/StatsItem'; interface Props { snapshot?: KubernetesSnapshot; @@ -12,8 +12,15 @@ export function EnvironmentStatsKubernetes({ snapshot }: Props) { return <>No snapshot available; } + const totalGpuCount = Object.values(snapshot.TotalGPU ?? {}).reduce( + (sum, v) => sum + v, + 0 + ); + return ( <> + {totalGpuCount > 0 && } + diff --git a/app/react/portainer/environments/types.ts b/app/react/portainer/environments/types.ts index d8efb8a9ae..195db28d12 100644 --- a/app/react/portainer/environments/types.ts +++ b/app/react/portainer/environments/types.ts @@ -58,6 +58,8 @@ export interface KubernetesSnapshot { TotalMemory: number; Time: number; NodeCount: number; + GPUNodeCount?: number; + TotalGPU?: Record; } export type IngressClass = { diff --git a/pkg/snapshot/kubernetes.go b/pkg/snapshot/kubernetes.go index 1fd2b36f42..c7b5e125f9 100644 --- a/pkg/snapshot/kubernetes.go +++ b/pkg/snapshot/kubernetes.go @@ -22,14 +22,14 @@ import ( "k8s.io/client-go/kubernetes" ) -func CreateKubernetesSnapshot(cli *kubernetes.Clientset) (*portainer.KubernetesSnapshot, error) { +func CreateKubernetesSnapshot(cli *kubernetes.Clientset, gpuOperator bool) (*portainer.KubernetesSnapshot, error) { kubernetesSnapshot := &portainer.KubernetesSnapshot{} err := kubernetesSnapshotVersion(kubernetesSnapshot, cli) if err != nil { log.Warn().Err(err).Msg("unable to snapshot cluster version") } - err = kubernetesSnapshotNodes(kubernetesSnapshot, cli) + err = kubernetesSnapshotNodes(kubernetesSnapshot, cli, gpuOperator) if err != nil { log.Warn().Err(err).Msg("unable to snapshot cluster nodes") } @@ -48,7 +48,7 @@ func kubernetesSnapshotVersion(snapshot *portainer.KubernetesSnapshot, cli kuber return nil } -func kubernetesSnapshotNodes(snapshot *portainer.KubernetesSnapshot, cli kubernetes.Interface) error { +func kubernetesSnapshotNodes(snapshot *portainer.KubernetesSnapshot, cli kubernetes.Interface, gpuOperator bool) error { nodeList, err := cli.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{}) if err != nil { return err @@ -58,16 +58,40 @@ func kubernetesSnapshotNodes(snapshot *portainer.KubernetesSnapshot, cli kuberne return nil } + totalGPU := make(map[string]int64) var totalCPUs, totalMemory int64 + var gpuNodeCount int + for _, node := range nodeList.Items { totalCPUs += node.Status.Capacity.Cpu().Value() totalMemory += node.Status.Capacity.Memory().Value() + + if gpuOperator { + nodeHasGPU := false + for resourceName, quantity := range node.Status.Capacity { + if strings.HasPrefix(string(resourceName), "nvidia.com/") { + totalGPU[string(resourceName)] += quantity.Value() + nodeHasGPU = true + } + } + if nodeHasGPU { + gpuNodeCount++ + } + } } + snapshot.TotalCPU = totalCPUs snapshot.TotalMemory = totalMemory snapshot.NodeCount = len(nodeList.Items) snapshot.ClusterType = clusterTypeFromProviderID(nodeList.Items[0].Spec.ProviderID) + if gpuOperator { + snapshot.GPUNodeCount = gpuNodeCount + if len(totalGPU) > 0 { + snapshot.TotalGPU = totalGPU + } + } + return nil } diff --git a/pkg/snapshot/kubernetes_test.go b/pkg/snapshot/kubernetes_test.go index 03795a59db..79b9df87b1 100644 --- a/pkg/snapshot/kubernetes_test.go +++ b/pkg/snapshot/kubernetes_test.go @@ -2,6 +2,7 @@ package snapshot import ( "errors" + "fmt" "testing" portainer "github.com/portainer/portainer/api" @@ -94,7 +95,7 @@ func TestKubernetesSnapshotNodes(t *testing.T) { snapshot := &portainer.KubernetesSnapshot{} // Use the actual function now that it accepts kubernetes.Interface - err = kubernetesSnapshotNodes(snapshot, fakeClient) + err = kubernetesSnapshotNodes(snapshot, fakeClient, false) require.NoError(t, err) // Verify the results - these should match what kubernetesSnapshotNodes would produce @@ -103,6 +104,8 @@ func TestKubernetesSnapshotNodes(t *testing.T) { require.Equal(t, int64(25769803776), snapshot.TotalMemory) // 12GB + 8GB + 4GB = 24GB in bytes require.Equal(t, ClusterTypeGKEAutopilot, snapshot.ClusterType) // detected from node1's ProviderID require.Nil(t, snapshot.PerformanceMetrics) // Performance metrics are no longer collected server-side + require.Equal(t, 0, snapshot.GPUNodeCount) + require.Nil(t, snapshot.TotalGPU) t.Logf("kubernetesSnapshotNodes test result: Nodes=%d, CPUs=%d, Memory=%d bytes", snapshot.NodeCount, snapshot.TotalCPU, snapshot.TotalMemory) @@ -114,7 +117,7 @@ func TestKubernetesSnapshotNodesEmptyCluster(t *testing.T) { fakeClient := kfake.NewClientset() snapshot := &portainer.KubernetesSnapshot{} - err := kubernetesSnapshotNodes(snapshot, fakeClient) + err := kubernetesSnapshotNodes(snapshot, fakeClient, false) require.NoError(t, err) // Values should remain at their zero state when no nodes exist @@ -170,7 +173,7 @@ func TestCreateKubernetesSnapshotIntegration(t *testing.T) { // Test that kubernetesSnapshotNodes logic works snapshot := &portainer.KubernetesSnapshot{} - err = kubernetesSnapshotNodes(snapshot, fakeClient) + err = kubernetesSnapshotNodes(snapshot, fakeClient, false) require.NoError(t, err) // Verify the integration results @@ -198,7 +201,7 @@ func TestKubernetesSnapshotNodesWithAPIError(t *testing.T) { }) snapshot := &portainer.KubernetesSnapshot{} - err := kubernetesSnapshotNodes(snapshot, fakeClient) + err := kubernetesSnapshotNodes(snapshot, fakeClient, false) // Should return the API error require.Error(t, err) @@ -234,7 +237,7 @@ func TestKubernetesSnapshotNodesSingleNode(t *testing.T) { require.NoError(t, err) snapshot := &portainer.KubernetesSnapshot{} - err = kubernetesSnapshotNodes(snapshot, fakeClient) + err = kubernetesSnapshotNodes(snapshot, fakeClient, false) require.NoError(t, err) require.Equal(t, 1, snapshot.NodeCount) @@ -246,6 +249,123 @@ func TestKubernetesSnapshotNodesSingleNode(t *testing.T) { snapshot.NodeCount, snapshot.TotalCPU, snapshot.TotalMemory) } +func TestKubernetesSnapshotNodesWithGPU(t *testing.T) { + t.Parallel() + fakeClient := kfake.NewClientset() + + gpuNode := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "gpu-node"}, + Status: corev1.NodeStatus{ + Capacity: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("8"), + corev1.ResourceMemory: resource.MustParse("16Gi"), + "nvidia.com/gpu": resource.MustParse("4"), + }, + }, + } + cpuNode := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "cpu-node"}, + Status: corev1.NodeStatus{ + Capacity: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("4"), + corev1.ResourceMemory: resource.MustParse("8Gi"), + }, + }, + } + + _, err := fakeClient.CoreV1().Nodes().Create(t.Context(), gpuNode, metav1.CreateOptions{}) + require.NoError(t, err) + _, err = fakeClient.CoreV1().Nodes().Create(t.Context(), cpuNode, metav1.CreateOptions{}) + require.NoError(t, err) + + snapshot := &portainer.KubernetesSnapshot{} + err = kubernetesSnapshotNodes(snapshot, fakeClient, true) + require.NoError(t, err) + + require.Equal(t, 2, snapshot.NodeCount) + require.Equal(t, 1, snapshot.GPUNodeCount) + require.Equal(t, int64(4), snapshot.TotalGPU["nvidia.com/gpu"]) +} + +func TestKubernetesSnapshotNodesMultipleGPUTypes(t *testing.T) { + t.Parallel() + fakeClient := kfake.NewClientset() + + node := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "mig-node"}, + Status: corev1.NodeStatus{ + Capacity: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("8"), + corev1.ResourceMemory: resource.MustParse("16Gi"), + "nvidia.com/gpu": resource.MustParse("2"), + "nvidia.com/mig-2g.10gb": resource.MustParse("4"), + }, + }, + } + + _, err := fakeClient.CoreV1().Nodes().Create(t.Context(), node, metav1.CreateOptions{}) + require.NoError(t, err) + + snapshot := &portainer.KubernetesSnapshot{} + err = kubernetesSnapshotNodes(snapshot, fakeClient, true) + require.NoError(t, err) + + require.Equal(t, 1, snapshot.GPUNodeCount) + require.Equal(t, int64(2), snapshot.TotalGPU["nvidia.com/gpu"]) + require.Equal(t, int64(4), snapshot.TotalGPU["nvidia.com/mig-2g.10gb"]) +} + +func TestKubernetesSnapshotNodesGPUAggregatedAcrossNodes(t *testing.T) { + t.Parallel() + fakeClient := kfake.NewClientset() + + for i, gpuCount := range []int{2, 4} { + node := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: fmt.Sprintf("gpu-node-%d", i)}, + Status: corev1.NodeStatus{ + Capacity: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("8"), + corev1.ResourceMemory: resource.MustParse("16Gi"), + "nvidia.com/gpu": *resource.NewQuantity(int64(gpuCount), resource.DecimalSI), + }, + }, + } + _, err := fakeClient.CoreV1().Nodes().Create(t.Context(), node, metav1.CreateOptions{}) + require.NoError(t, err) + } + + snapshot := &portainer.KubernetesSnapshot{} + err := kubernetesSnapshotNodes(snapshot, fakeClient, true) + require.NoError(t, err) + + require.Equal(t, 2, snapshot.GPUNodeCount) + require.Equal(t, int64(6), snapshot.TotalGPU["nvidia.com/gpu"]) // 2 + 4 +} + +func TestKubernetesSnapshotNodesNoGPULeavesTotalGPUNil(t *testing.T) { + t.Parallel() + fakeClient := kfake.NewClientset() + + node := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "cpu-only-node"}, + Status: corev1.NodeStatus{ + Capacity: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("4"), + corev1.ResourceMemory: resource.MustParse("8Gi"), + }, + }, + } + _, err := fakeClient.CoreV1().Nodes().Create(t.Context(), node, metav1.CreateOptions{}) + require.NoError(t, err) + + snapshot := &portainer.KubernetesSnapshot{} + err = kubernetesSnapshotNodes(snapshot, fakeClient, false) + require.NoError(t, err) + + require.Equal(t, 0, snapshot.GPUNodeCount) + require.Nil(t, snapshot.TotalGPU) +} + func TestKubernetesSnapshotNodesZeroResources(t *testing.T) { t.Parallel() // Test with nodes that have zero or very small resources @@ -267,7 +387,7 @@ func TestKubernetesSnapshotNodesZeroResources(t *testing.T) { require.NoError(t, err) snapshot := &portainer.KubernetesSnapshot{} - err = kubernetesSnapshotNodes(snapshot, fakeClient) + err = kubernetesSnapshotNodes(snapshot, fakeClient, false) require.NoError(t, err) require.Equal(t, 1, snapshot.NodeCount) @@ -277,3 +397,98 @@ func TestKubernetesSnapshotNodesZeroResources(t *testing.T) { t.Log("Zero resources test passed - handles edge case correctly") } + +func TestKubernetesSnapshotNodesGPUOperator(t *testing.T) { + t.Parallel() + + const gpuMemoryBytes = int64(25769803776) // 16GiB + 8GiB + + tests := []struct { + name string + gpuOperator bool + wantGPUCount int + wantTotalGPU map[string]int64 + }{ + { + name: "disabled does not populate GPU fields even when GPU nodes exist", + gpuOperator: false, + wantGPUCount: 0, + wantTotalGPU: nil, + }, + { + name: "enabled populates GPU fields from GPU nodes", + gpuOperator: true, + wantGPUCount: 1, + wantTotalGPU: map[string]int64{"nvidia.com/gpu": 4}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + fakeClient := kfake.NewClientset() + nodes := []*corev1.Node{ + { + ObjectMeta: metav1.ObjectMeta{Name: "gpu-node"}, + Status: corev1.NodeStatus{ + Capacity: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("8"), + corev1.ResourceMemory: resource.MustParse("16Gi"), + "nvidia.com/gpu": resource.MustParse("4"), + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{Name: "cpu-node"}, + Status: corev1.NodeStatus{ + Capacity: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("4"), + corev1.ResourceMemory: resource.MustParse("8Gi"), + }, + }, + }, + } + for _, n := range nodes { + _, err := fakeClient.CoreV1().Nodes().Create(t.Context(), n, metav1.CreateOptions{}) + require.NoError(t, err) + } + + snap := &portainer.KubernetesSnapshot{} + err := kubernetesSnapshotNodes(snap, fakeClient, tt.gpuOperator) + require.NoError(t, err) + + require.Equal(t, 2, snap.NodeCount) + require.Equal(t, int64(12), snap.TotalCPU) + require.Equal(t, gpuMemoryBytes, snap.TotalMemory) + require.Equal(t, tt.wantGPUCount, snap.GPUNodeCount) + require.Equal(t, tt.wantTotalGPU, snap.TotalGPU) + }) + } +} + +func TestKubernetesSnapshotNodesGPUOperatorEnabledNoGPUNodes(t *testing.T) { + t.Parallel() + + fakeClient := kfake.NewClientset() + node := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "cpu-node"}, + Status: corev1.NodeStatus{ + Capacity: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("4"), + corev1.ResourceMemory: resource.MustParse("8Gi"), + }, + }, + } + _, err := fakeClient.CoreV1().Nodes().Create(t.Context(), node, metav1.CreateOptions{}) + require.NoError(t, err) + + snap := &portainer.KubernetesSnapshot{} + err = kubernetesSnapshotNodes(snap, fakeClient, true) + require.NoError(t, err) + + require.Equal(t, 1, snap.NodeCount) + require.Equal(t, int64(4), snap.TotalCPU) + require.Equal(t, 0, snap.GPUNodeCount) + require.Nil(t, snap.TotalGPU) +}