diff --git a/api/http/handler/endpointedge/endpointedge_status_inspect.go b/api/http/handler/endpointedge/endpointedge_status_inspect.go
index 65f39c7e4b..23d5434fab 100644
--- a/api/http/handler/endpointedge/endpointedge_status_inspect.go
+++ b/api/http/handler/endpointedge/endpointedge_status_inspect.go
@@ -135,6 +135,10 @@ func (handler *Handler) parseHeaders(r *http.Request, endpoint *portainer.Endpoi
version := r.Header.Get(portainer.PortainerAgentHeader)
endpoint.Agent.Version = version
+ if gpuOperatorHeader := r.Header.Get(portainer.HTTPResponseAgentGPUOperator); gpuOperatorHeader != "" {
+ endpoint.Kubernetes.Flags.GPUOperator = gpuOperatorHeader == "true"
+ }
+
return nil
}
diff --git a/api/kubernetes/snapshot.go b/api/kubernetes/snapshot.go
index c8f6c2e598..c75d510846 100644
--- a/api/kubernetes/snapshot.go
+++ b/api/kubernetes/snapshot.go
@@ -24,5 +24,5 @@ func (snapshotter *Snapshotter) CreateSnapshot(endpoint *portainer.Endpoint) (*p
return nil, err
}
- return snapshot.CreateKubernetesSnapshot(client)
+ return snapshot.CreateKubernetesSnapshot(client, endpoint.Kubernetes.Flags.GPUOperator)
}
diff --git a/api/portainer.go b/api/portainer.go
index 1bca4c04ea..5b925e9cda 100644
--- a/api/portainer.go
+++ b/api/portainer.go
@@ -830,6 +830,7 @@ type (
IsServerMetricsDetected bool `json:"IsServerMetricsDetected" validate:"required"`
IsServerIngressClassDetected bool `json:"IsServerIngressClassDetected" validate:"required"`
IsServerStorageDetected bool `json:"IsServerStorageDetected" validate:"required"`
+ GPUOperator bool `json:"GPUOperator,omitempty"`
}
// KubernetesSnapshot represents a snapshot of a specific Kubernetes environment(endpoint) at a specific time
@@ -840,6 +841,8 @@ type (
TotalCPU int64 `json:"TotalCPU" validate:"required"`
TotalMemory int64 `json:"TotalMemory" validate:"required"`
ClusterType string `json:"ClusterType,omitempty"`
+ GPUNodeCount int `json:"GPUNodeCount,omitempty"`
+ TotalGPU map[string]int64 `json:"TotalGPU,omitempty"`
DiagnosticsData *DiagnosticsData `json:"DiagnosticsData,omitempty"`
PerformanceMetrics *PerformanceMetrics `json:"PerformanceMetrics,omitempty"`
}
@@ -2078,6 +2081,8 @@ const (
PortainerAgentKubernetesSATokenHeader = "X-PortainerAgent-SA-Token"
// HTTPAlertStateHeaderName is the name of the header used to transmit edge alert evaluation state
HTTPAlertStateHeaderName = "X-PortainerAgent-AlertState"
+ // HTTPResponseAgentGPUOperator is the name of the agent header reporting whether the GPU operator is enabled; the value "true" means enabled, any other non-empty value means disabled
+ HTTPResponseAgentGPUOperator = "Portainer-Agent-GPU-Operator"
// PortainerAgentSignatureMessage represents the message used to create a digital signature
// to be used when communicating with an agent
PortainerAgentSignatureMessage = "Portainer-App"
diff --git a/app/react/components/StatsItem.test.tsx b/app/react/components/StatsItem.test.tsx
index 254fc8d86c..e9a922bfd7 100644
--- a/app/react/components/StatsItem.test.tsx
+++ b/app/react/components/StatsItem.test.tsx
@@ -1,6 +1,12 @@
import { render, screen } from '@testing-library/react';
-import { ContainerStats, CPUStats, MemoryStats, NodeStats } from './StatsItem';
+import {
+ ContainerStats,
+ CPUStats,
+ GpuStats,
+ MemoryStats,
+ NodeStats,
+} from './StatsItem';
describe('StatsItem', () => {
describe('NodeStats', () => {
@@ -27,6 +33,14 @@ describe('StatsItem', () => {
});
});
+ describe('GpuStats', () => {
+ it('renders GPU count with GPUS label', () => {
+ render(<GpuStats value={4} />);
+ expect(screen.getByText('4')).toBeVisible();
+ expect(screen.getByText('GPUS')).toBeVisible();
+ });
+ });
+
describe('ContainerStats', () => {
it('renders running/total containers with a progress bar', () => {
render();
diff --git a/app/react/components/StatsItem.tsx b/app/react/components/StatsItem.tsx
index fdbaefe002..3555de1e62 100644
--- a/app/react/components/StatsItem.tsx
+++ b/app/react/components/StatsItem.tsx
@@ -1,6 +1,6 @@
import clsx from 'clsx';
import { PropsWithChildren } from 'react';
-import { Cpu, Hexagon, LaptopMinimal, MemoryStick } from 'lucide-react';
+import { Cpu, Gpu, Hexagon, LaptopMinimal, MemoryStick } from 'lucide-react';
import { Icon, IconProps } from '@/react/components/Icon';
@@ -65,6 +65,14 @@ export function MemoryStats({ value }: StatsProps) {
);
}
+export function GpuStats({ value }: StatsProps) {
+ return (
+
+ {value}
+
+ );
+}
+
interface ContainerStatsProps {
total: number;
running: number;
diff --git a/app/react/portainer/HomeView/EnvironmentList/EnvironmentItem/EnvironmentStatsKubernetes.tsx b/app/react/portainer/HomeView/EnvironmentList/EnvironmentItem/EnvironmentStatsKubernetes.tsx
index 1be4cd870e..855c94f7c0 100644
--- a/app/react/portainer/HomeView/EnvironmentList/EnvironmentItem/EnvironmentStatsKubernetes.tsx
+++ b/app/react/portainer/HomeView/EnvironmentList/EnvironmentItem/EnvironmentStatsKubernetes.tsx
@@ -1,7 +1,7 @@
import { KubernetesSnapshot } from '@/react/portainer/environments/types';
import { humanize } from '@/portainer/filters/filters';
-import { CPUStats, MemoryStats, NodeStats } from '@@/StatsItem';
+import { CPUStats, GpuStats, MemoryStats, NodeStats } from '@@/StatsItem';
interface Props {
snapshot?: KubernetesSnapshot;
@@ -12,8 +12,15 @@ export function EnvironmentStatsKubernetes({ snapshot }: Props) {
return <>No snapshot available>;
}
+ const totalGpuCount = Object.values(snapshot.TotalGPU ?? {}).reduce(
+ (sum, v) => sum + v,
+ 0
+ );
+
return (
<>
+ {totalGpuCount > 0 && <GpuStats value={totalGpuCount} />}
+
diff --git a/app/react/portainer/environments/types.ts b/app/react/portainer/environments/types.ts
index d8efb8a9ae..195db28d12 100644
--- a/app/react/portainer/environments/types.ts
+++ b/app/react/portainer/environments/types.ts
@@ -58,6 +58,8 @@ export interface KubernetesSnapshot {
TotalMemory: number;
Time: number;
NodeCount: number;
+ GPUNodeCount?: number;
+ TotalGPU?: Record<string, number>;
}
export type IngressClass = {
diff --git a/pkg/snapshot/kubernetes.go b/pkg/snapshot/kubernetes.go
index 1fd2b36f42..c7b5e125f9 100644
--- a/pkg/snapshot/kubernetes.go
+++ b/pkg/snapshot/kubernetes.go
@@ -22,14 +22,14 @@ import (
"k8s.io/client-go/kubernetes"
)
-func CreateKubernetesSnapshot(cli *kubernetes.Clientset) (*portainer.KubernetesSnapshot, error) {
+func CreateKubernetesSnapshot(cli *kubernetes.Clientset, gpuOperator bool) (*portainer.KubernetesSnapshot, error) {
kubernetesSnapshot := &portainer.KubernetesSnapshot{}
err := kubernetesSnapshotVersion(kubernetesSnapshot, cli)
if err != nil {
log.Warn().Err(err).Msg("unable to snapshot cluster version")
}
- err = kubernetesSnapshotNodes(kubernetesSnapshot, cli)
+ err = kubernetesSnapshotNodes(kubernetesSnapshot, cli, gpuOperator)
if err != nil {
log.Warn().Err(err).Msg("unable to snapshot cluster nodes")
}
@@ -48,7 +48,7 @@ func kubernetesSnapshotVersion(snapshot *portainer.KubernetesSnapshot, cli kuber
return nil
}
-func kubernetesSnapshotNodes(snapshot *portainer.KubernetesSnapshot, cli kubernetes.Interface) error {
+func kubernetesSnapshotNodes(snapshot *portainer.KubernetesSnapshot, cli kubernetes.Interface, gpuOperator bool) error {
nodeList, err := cli.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{})
if err != nil {
return err
@@ -58,16 +58,40 @@ func kubernetesSnapshotNodes(snapshot *portainer.KubernetesSnapshot, cli kuberne
return nil
}
+ totalGPU := make(map[string]int64)
var totalCPUs, totalMemory int64
+ var gpuNodeCount int
+
for _, node := range nodeList.Items {
totalCPUs += node.Status.Capacity.Cpu().Value()
totalMemory += node.Status.Capacity.Memory().Value()
+
+ if gpuOperator {
+ nodeHasGPU := false
+ for resourceName, quantity := range node.Status.Capacity {
+ if strings.HasPrefix(string(resourceName), "nvidia.com/") {
+ totalGPU[string(resourceName)] += quantity.Value()
+ nodeHasGPU = true
+ }
+ }
+ if nodeHasGPU {
+ gpuNodeCount++
+ }
+ }
}
+
snapshot.TotalCPU = totalCPUs
snapshot.TotalMemory = totalMemory
snapshot.NodeCount = len(nodeList.Items)
snapshot.ClusterType = clusterTypeFromProviderID(nodeList.Items[0].Spec.ProviderID)
+ if gpuOperator {
+ snapshot.GPUNodeCount = gpuNodeCount
+ if len(totalGPU) > 0 {
+ snapshot.TotalGPU = totalGPU
+ }
+ }
+
return nil
}
diff --git a/pkg/snapshot/kubernetes_test.go b/pkg/snapshot/kubernetes_test.go
index 03795a59db..79b9df87b1 100644
--- a/pkg/snapshot/kubernetes_test.go
+++ b/pkg/snapshot/kubernetes_test.go
@@ -2,6 +2,7 @@ package snapshot
import (
"errors"
+ "fmt"
"testing"
portainer "github.com/portainer/portainer/api"
@@ -94,7 +95,7 @@ func TestKubernetesSnapshotNodes(t *testing.T) {
snapshot := &portainer.KubernetesSnapshot{}
// Use the actual function now that it accepts kubernetes.Interface
- err = kubernetesSnapshotNodes(snapshot, fakeClient)
+ err = kubernetesSnapshotNodes(snapshot, fakeClient, false)
require.NoError(t, err)
// Verify the results - these should match what kubernetesSnapshotNodes would produce
@@ -103,6 +104,8 @@ func TestKubernetesSnapshotNodes(t *testing.T) {
require.Equal(t, int64(25769803776), snapshot.TotalMemory) // 12GB + 8GB + 4GB = 24GB in bytes
require.Equal(t, ClusterTypeGKEAutopilot, snapshot.ClusterType) // detected from node1's ProviderID
require.Nil(t, snapshot.PerformanceMetrics) // Performance metrics are no longer collected server-side
+ require.Equal(t, 0, snapshot.GPUNodeCount)
+ require.Nil(t, snapshot.TotalGPU)
t.Logf("kubernetesSnapshotNodes test result: Nodes=%d, CPUs=%d, Memory=%d bytes",
snapshot.NodeCount, snapshot.TotalCPU, snapshot.TotalMemory)
@@ -114,7 +117,7 @@ func TestKubernetesSnapshotNodesEmptyCluster(t *testing.T) {
fakeClient := kfake.NewClientset()
snapshot := &portainer.KubernetesSnapshot{}
- err := kubernetesSnapshotNodes(snapshot, fakeClient)
+ err := kubernetesSnapshotNodes(snapshot, fakeClient, false)
require.NoError(t, err)
// Values should remain at their zero state when no nodes exist
@@ -170,7 +173,7 @@ func TestCreateKubernetesSnapshotIntegration(t *testing.T) {
// Test that kubernetesSnapshotNodes logic works
snapshot := &portainer.KubernetesSnapshot{}
- err = kubernetesSnapshotNodes(snapshot, fakeClient)
+ err = kubernetesSnapshotNodes(snapshot, fakeClient, false)
require.NoError(t, err)
// Verify the integration results
@@ -198,7 +201,7 @@ func TestKubernetesSnapshotNodesWithAPIError(t *testing.T) {
})
snapshot := &portainer.KubernetesSnapshot{}
- err := kubernetesSnapshotNodes(snapshot, fakeClient)
+ err := kubernetesSnapshotNodes(snapshot, fakeClient, false)
// Should return the API error
require.Error(t, err)
@@ -234,7 +237,7 @@ func TestKubernetesSnapshotNodesSingleNode(t *testing.T) {
require.NoError(t, err)
snapshot := &portainer.KubernetesSnapshot{}
- err = kubernetesSnapshotNodes(snapshot, fakeClient)
+ err = kubernetesSnapshotNodes(snapshot, fakeClient, false)
require.NoError(t, err)
require.Equal(t, 1, snapshot.NodeCount)
@@ -246,6 +249,123 @@ func TestKubernetesSnapshotNodesSingleNode(t *testing.T) {
snapshot.NodeCount, snapshot.TotalCPU, snapshot.TotalMemory)
}
+func TestKubernetesSnapshotNodesWithGPU(t *testing.T) {
+ t.Parallel()
+ fakeClient := kfake.NewClientset()
+
+ gpuNode := &corev1.Node{
+ ObjectMeta: metav1.ObjectMeta{Name: "gpu-node"},
+ Status: corev1.NodeStatus{
+ Capacity: corev1.ResourceList{
+ corev1.ResourceCPU: resource.MustParse("8"),
+ corev1.ResourceMemory: resource.MustParse("16Gi"),
+ "nvidia.com/gpu": resource.MustParse("4"),
+ },
+ },
+ }
+ cpuNode := &corev1.Node{
+ ObjectMeta: metav1.ObjectMeta{Name: "cpu-node"},
+ Status: corev1.NodeStatus{
+ Capacity: corev1.ResourceList{
+ corev1.ResourceCPU: resource.MustParse("4"),
+ corev1.ResourceMemory: resource.MustParse("8Gi"),
+ },
+ },
+ }
+
+ _, err := fakeClient.CoreV1().Nodes().Create(t.Context(), gpuNode, metav1.CreateOptions{})
+ require.NoError(t, err)
+ _, err = fakeClient.CoreV1().Nodes().Create(t.Context(), cpuNode, metav1.CreateOptions{})
+ require.NoError(t, err)
+
+ snapshot := &portainer.KubernetesSnapshot{}
+ err = kubernetesSnapshotNodes(snapshot, fakeClient, true)
+ require.NoError(t, err)
+
+ require.Equal(t, 2, snapshot.NodeCount)
+ require.Equal(t, 1, snapshot.GPUNodeCount)
+ require.Equal(t, int64(4), snapshot.TotalGPU["nvidia.com/gpu"])
+}
+
+func TestKubernetesSnapshotNodesMultipleGPUTypes(t *testing.T) {
+ t.Parallel()
+ fakeClient := kfake.NewClientset()
+
+ node := &corev1.Node{
+ ObjectMeta: metav1.ObjectMeta{Name: "mig-node"},
+ Status: corev1.NodeStatus{
+ Capacity: corev1.ResourceList{
+ corev1.ResourceCPU: resource.MustParse("8"),
+ corev1.ResourceMemory: resource.MustParse("16Gi"),
+ "nvidia.com/gpu": resource.MustParse("2"),
+ "nvidia.com/mig-2g.10gb": resource.MustParse("4"),
+ },
+ },
+ }
+
+ _, err := fakeClient.CoreV1().Nodes().Create(t.Context(), node, metav1.CreateOptions{})
+ require.NoError(t, err)
+
+ snapshot := &portainer.KubernetesSnapshot{}
+ err = kubernetesSnapshotNodes(snapshot, fakeClient, true)
+ require.NoError(t, err)
+
+ require.Equal(t, 1, snapshot.GPUNodeCount)
+ require.Equal(t, int64(2), snapshot.TotalGPU["nvidia.com/gpu"])
+ require.Equal(t, int64(4), snapshot.TotalGPU["nvidia.com/mig-2g.10gb"])
+}
+
+func TestKubernetesSnapshotNodesGPUAggregatedAcrossNodes(t *testing.T) {
+ t.Parallel()
+ fakeClient := kfake.NewClientset()
+
+ for i, gpuCount := range []int{2, 4} {
+ node := &corev1.Node{
+ ObjectMeta: metav1.ObjectMeta{Name: fmt.Sprintf("gpu-node-%d", i)},
+ Status: corev1.NodeStatus{
+ Capacity: corev1.ResourceList{
+ corev1.ResourceCPU: resource.MustParse("8"),
+ corev1.ResourceMemory: resource.MustParse("16Gi"),
+ "nvidia.com/gpu": *resource.NewQuantity(int64(gpuCount), resource.DecimalSI),
+ },
+ },
+ }
+ _, err := fakeClient.CoreV1().Nodes().Create(t.Context(), node, metav1.CreateOptions{})
+ require.NoError(t, err)
+ }
+
+ snapshot := &portainer.KubernetesSnapshot{}
+ err := kubernetesSnapshotNodes(snapshot, fakeClient, true)
+ require.NoError(t, err)
+
+ require.Equal(t, 2, snapshot.GPUNodeCount)
+ require.Equal(t, int64(6), snapshot.TotalGPU["nvidia.com/gpu"]) // 2 + 4
+}
+
+func TestKubernetesSnapshotNodesNoGPULeavesTotalGPUNil(t *testing.T) {
+ t.Parallel()
+ fakeClient := kfake.NewClientset()
+
+ node := &corev1.Node{
+ ObjectMeta: metav1.ObjectMeta{Name: "cpu-only-node"},
+ Status: corev1.NodeStatus{
+ Capacity: corev1.ResourceList{
+ corev1.ResourceCPU: resource.MustParse("4"),
+ corev1.ResourceMemory: resource.MustParse("8Gi"),
+ },
+ },
+ }
+ _, err := fakeClient.CoreV1().Nodes().Create(t.Context(), node, metav1.CreateOptions{})
+ require.NoError(t, err)
+
+ snapshot := &portainer.KubernetesSnapshot{}
+ err = kubernetesSnapshotNodes(snapshot, fakeClient, false)
+ require.NoError(t, err)
+
+ require.Equal(t, 0, snapshot.GPUNodeCount)
+ require.Nil(t, snapshot.TotalGPU)
+}
+
func TestKubernetesSnapshotNodesZeroResources(t *testing.T) {
t.Parallel()
// Test with nodes that have zero or very small resources
@@ -267,7 +387,7 @@ func TestKubernetesSnapshotNodesZeroResources(t *testing.T) {
require.NoError(t, err)
snapshot := &portainer.KubernetesSnapshot{}
- err = kubernetesSnapshotNodes(snapshot, fakeClient)
+ err = kubernetesSnapshotNodes(snapshot, fakeClient, false)
require.NoError(t, err)
require.Equal(t, 1, snapshot.NodeCount)
@@ -277,3 +397,98 @@ func TestKubernetesSnapshotNodesZeroResources(t *testing.T) {
t.Log("Zero resources test passed - handles edge case correctly")
}
+
+func TestKubernetesSnapshotNodesGPUOperator(t *testing.T) {
+ t.Parallel()
+
+ const gpuMemoryBytes = int64(25769803776) // total memory of both nodes (gpu-node 16GiB + cpu-node 8GiB = 24GiB), not GPU memory
+
+ tests := []struct {
+ name string
+ gpuOperator bool
+ wantGPUCount int
+ wantTotalGPU map[string]int64
+ }{
+ {
+ name: "disabled does not populate GPU fields even when GPU nodes exist",
+ gpuOperator: false,
+ wantGPUCount: 0,
+ wantTotalGPU: nil,
+ },
+ {
+ name: "enabled populates GPU fields from GPU nodes",
+ gpuOperator: true,
+ wantGPUCount: 1,
+ wantTotalGPU: map[string]int64{"nvidia.com/gpu": 4},
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ t.Parallel()
+
+ fakeClient := kfake.NewClientset()
+ nodes := []*corev1.Node{
+ {
+ ObjectMeta: metav1.ObjectMeta{Name: "gpu-node"},
+ Status: corev1.NodeStatus{
+ Capacity: corev1.ResourceList{
+ corev1.ResourceCPU: resource.MustParse("8"),
+ corev1.ResourceMemory: resource.MustParse("16Gi"),
+ "nvidia.com/gpu": resource.MustParse("4"),
+ },
+ },
+ },
+ {
+ ObjectMeta: metav1.ObjectMeta{Name: "cpu-node"},
+ Status: corev1.NodeStatus{
+ Capacity: corev1.ResourceList{
+ corev1.ResourceCPU: resource.MustParse("4"),
+ corev1.ResourceMemory: resource.MustParse("8Gi"),
+ },
+ },
+ },
+ }
+ for _, n := range nodes {
+ _, err := fakeClient.CoreV1().Nodes().Create(t.Context(), n, metav1.CreateOptions{})
+ require.NoError(t, err)
+ }
+
+ snap := &portainer.KubernetesSnapshot{}
+ err := kubernetesSnapshotNodes(snap, fakeClient, tt.gpuOperator)
+ require.NoError(t, err)
+
+ require.Equal(t, 2, snap.NodeCount)
+ require.Equal(t, int64(12), snap.TotalCPU)
+ require.Equal(t, gpuMemoryBytes, snap.TotalMemory)
+ require.Equal(t, tt.wantGPUCount, snap.GPUNodeCount)
+ require.Equal(t, tt.wantTotalGPU, snap.TotalGPU)
+ })
+ }
+}
+
+func TestKubernetesSnapshotNodesGPUOperatorEnabledNoGPUNodes(t *testing.T) {
+ t.Parallel()
+
+ fakeClient := kfake.NewClientset()
+ node := &corev1.Node{
+ ObjectMeta: metav1.ObjectMeta{Name: "cpu-node"},
+ Status: corev1.NodeStatus{
+ Capacity: corev1.ResourceList{
+ corev1.ResourceCPU: resource.MustParse("4"),
+ corev1.ResourceMemory: resource.MustParse("8Gi"),
+ },
+ },
+ }
+ _, err := fakeClient.CoreV1().Nodes().Create(t.Context(), node, metav1.CreateOptions{})
+ require.NoError(t, err)
+
+ snap := &portainer.KubernetesSnapshot{}
+ err = kubernetesSnapshotNodes(snap, fakeClient, true)
+ require.NoError(t, err)
+
+ require.Equal(t, 1, snap.NodeCount)
+ require.Equal(t, int64(4), snap.TotalCPU)
+ require.Equal(t, 0, snap.GPUNodeCount)
+ require.Nil(t, snap.TotalGPU)
+}