mirror of
https://github.com/portainer/portainer.git
synced 2026-06-23 04:30:16 +00:00
feat(gpu-stats): add gpu stats to environments [C9S-200] (#2735)
This commit is contained in:
committed by
GitHub
parent
217fe870ef
commit
5395dee4c6
@@ -135,6 +135,10 @@ func (handler *Handler) parseHeaders(r *http.Request, endpoint *portainer.Endpoi
|
|||||||
version := r.Header.Get(portainer.PortainerAgentHeader)
|
version := r.Header.Get(portainer.PortainerAgentHeader)
|
||||||
endpoint.Agent.Version = version
|
endpoint.Agent.Version = version
|
||||||
|
|
||||||
|
if gpuOperatorHeader := r.Header.Get(portainer.HTTPResponseAgentGPUOperator); gpuOperatorHeader != "" {
|
||||||
|
endpoint.Kubernetes.Flags.GPUOperator = gpuOperatorHeader == "true"
|
||||||
|
}
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -24,5 +24,5 @@ func (snapshotter *Snapshotter) CreateSnapshot(endpoint *portainer.Endpoint) (*p
|
|||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
return snapshot.CreateKubernetesSnapshot(client)
|
return snapshot.CreateKubernetesSnapshot(client, endpoint.Kubernetes.Flags.GPUOperator)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -830,6 +830,7 @@ type (
|
|||||||
IsServerMetricsDetected bool `json:"IsServerMetricsDetected" validate:"required"`
|
IsServerMetricsDetected bool `json:"IsServerMetricsDetected" validate:"required"`
|
||||||
IsServerIngressClassDetected bool `json:"IsServerIngressClassDetected" validate:"required"`
|
IsServerIngressClassDetected bool `json:"IsServerIngressClassDetected" validate:"required"`
|
||||||
IsServerStorageDetected bool `json:"IsServerStorageDetected" validate:"required"`
|
IsServerStorageDetected bool `json:"IsServerStorageDetected" validate:"required"`
|
||||||
|
GPUOperator bool `json:"GPUOperator,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// KubernetesSnapshot represents a snapshot of a specific Kubernetes environment(endpoint) at a specific time
|
// KubernetesSnapshot represents a snapshot of a specific Kubernetes environment(endpoint) at a specific time
|
||||||
@@ -840,6 +841,8 @@ type (
|
|||||||
TotalCPU int64 `json:"TotalCPU" validate:"required"`
|
TotalCPU int64 `json:"TotalCPU" validate:"required"`
|
||||||
TotalMemory int64 `json:"TotalMemory" validate:"required"`
|
TotalMemory int64 `json:"TotalMemory" validate:"required"`
|
||||||
ClusterType string `json:"ClusterType,omitempty"`
|
ClusterType string `json:"ClusterType,omitempty"`
|
||||||
|
GPUNodeCount int `json:"GPUNodeCount,omitempty"`
|
||||||
|
TotalGPU map[string]int64 `json:"TotalGPU,omitempty"`
|
||||||
DiagnosticsData *DiagnosticsData `json:"DiagnosticsData,omitempty"`
|
DiagnosticsData *DiagnosticsData `json:"DiagnosticsData,omitempty"`
|
||||||
PerformanceMetrics *PerformanceMetrics `json:"PerformanceMetrics,omitempty"`
|
PerformanceMetrics *PerformanceMetrics `json:"PerformanceMetrics,omitempty"`
|
||||||
}
|
}
|
||||||
@@ -2078,6 +2081,8 @@ const (
|
|||||||
PortainerAgentKubernetesSATokenHeader = "X-PortainerAgent-SA-Token"
|
PortainerAgentKubernetesSATokenHeader = "X-PortainerAgent-SA-Token"
|
||||||
// HTTPAlertStateHeaderName is the name of the header used to transmit edge alert evaluation state
|
// HTTPAlertStateHeaderName is the name of the header used to transmit edge alert evaluation state
|
||||||
HTTPAlertStateHeaderName = "X-PortainerAgent-AlertState"
|
HTTPAlertStateHeaderName = "X-PortainerAgent-AlertState"
|
||||||
|
// HTTPResponseAgentGPUOperator represents the name of the header indicating whether the GPU operator is enabled on the agent
|
||||||
|
HTTPResponseAgentGPUOperator = "Portainer-Agent-GPU-Operator"
|
||||||
// PortainerAgentSignatureMessage represents the message used to create a digital signature
|
// PortainerAgentSignatureMessage represents the message used to create a digital signature
|
||||||
// to be used when communicating with an agent
|
// to be used when communicating with an agent
|
||||||
PortainerAgentSignatureMessage = "Portainer-App"
|
PortainerAgentSignatureMessage = "Portainer-App"
|
||||||
|
|||||||
@@ -1,6 +1,12 @@
|
|||||||
import { render, screen } from '@testing-library/react';
|
import { render, screen } from '@testing-library/react';
|
||||||
|
|
||||||
import { ContainerStats, CPUStats, MemoryStats, NodeStats } from './StatsItem';
|
import {
|
||||||
|
ContainerStats,
|
||||||
|
CPUStats,
|
||||||
|
GpuStats,
|
||||||
|
MemoryStats,
|
||||||
|
NodeStats,
|
||||||
|
} from './StatsItem';
|
||||||
|
|
||||||
describe('StatsItem', () => {
|
describe('StatsItem', () => {
|
||||||
describe('NodeStats', () => {
|
describe('NodeStats', () => {
|
||||||
@@ -27,6 +33,14 @@ describe('StatsItem', () => {
|
|||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
describe('GpuStats', () => {
|
||||||
|
it('renders GPU count with GPUS label', () => {
|
||||||
|
render(<GpuStats value={4} />);
|
||||||
|
expect(screen.getByText('4')).toBeVisible();
|
||||||
|
expect(screen.getByText('GPUS')).toBeVisible();
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
describe('ContainerStats', () => {
|
describe('ContainerStats', () => {
|
||||||
it('renders running/total containers with a progress bar', () => {
|
it('renders running/total containers with a progress bar', () => {
|
||||||
render(<ContainerStats total={5} running={3} stopped={2} />);
|
render(<ContainerStats total={5} running={3} stopped={2} />);
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
import clsx from 'clsx';
|
import clsx from 'clsx';
|
||||||
import { PropsWithChildren } from 'react';
|
import { PropsWithChildren } from 'react';
|
||||||
import { Cpu, Hexagon, LaptopMinimal, MemoryStick } from 'lucide-react';
|
import { Cpu, Gpu, Hexagon, LaptopMinimal, MemoryStick } from 'lucide-react';
|
||||||
|
|
||||||
import { Icon, IconProps } from '@/react/components/Icon';
|
import { Icon, IconProps } from '@/react/components/Icon';
|
||||||
|
|
||||||
@@ -65,6 +65,14 @@ export function MemoryStats({ value }: StatsProps) {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
export function GpuStats({ value }: StatsProps) {
|
||||||
|
return (
|
||||||
|
<StatsItem icon={Gpu} title="GPUS">
|
||||||
|
<span className="text-left font-bold leading-none">{value}</span>
|
||||||
|
</StatsItem>
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
interface ContainerStatsProps {
|
interface ContainerStatsProps {
|
||||||
total: number;
|
total: number;
|
||||||
running: number;
|
running: number;
|
||||||
|
|||||||
+8
-1
@@ -1,7 +1,7 @@
|
|||||||
import { KubernetesSnapshot } from '@/react/portainer/environments/types';
|
import { KubernetesSnapshot } from '@/react/portainer/environments/types';
|
||||||
import { humanize } from '@/portainer/filters/filters';
|
import { humanize } from '@/portainer/filters/filters';
|
||||||
|
|
||||||
import { CPUStats, MemoryStats, NodeStats } from '@@/StatsItem';
|
import { CPUStats, GpuStats, MemoryStats, NodeStats } from '@@/StatsItem';
|
||||||
|
|
||||||
interface Props {
|
interface Props {
|
||||||
snapshot?: KubernetesSnapshot;
|
snapshot?: KubernetesSnapshot;
|
||||||
@@ -12,8 +12,15 @@ export function EnvironmentStatsKubernetes({ snapshot }: Props) {
|
|||||||
return <>No snapshot available</>;
|
return <>No snapshot available</>;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const totalGpuCount = Object.values(snapshot.TotalGPU ?? {}).reduce(
|
||||||
|
(sum, v) => sum + v,
|
||||||
|
0
|
||||||
|
);
|
||||||
|
|
||||||
return (
|
return (
|
||||||
<>
|
<>
|
||||||
|
{totalGpuCount > 0 && <GpuStats value={totalGpuCount} />}
|
||||||
|
|
||||||
<NodeStats value={snapshot.NodeCount} />
|
<NodeStats value={snapshot.NodeCount} />
|
||||||
|
|
||||||
<CPUStats value={snapshot.TotalCPU} />
|
<CPUStats value={snapshot.TotalCPU} />
|
||||||
|
|||||||
@@ -58,6 +58,8 @@ export interface KubernetesSnapshot {
|
|||||||
TotalMemory: number;
|
TotalMemory: number;
|
||||||
Time: number;
|
Time: number;
|
||||||
NodeCount: number;
|
NodeCount: number;
|
||||||
|
GPUNodeCount?: number;
|
||||||
|
TotalGPU?: Record<string, number>;
|
||||||
}
|
}
|
||||||
|
|
||||||
export type IngressClass = {
|
export type IngressClass = {
|
||||||
|
|||||||
@@ -22,14 +22,14 @@ import (
|
|||||||
"k8s.io/client-go/kubernetes"
|
"k8s.io/client-go/kubernetes"
|
||||||
)
|
)
|
||||||
|
|
||||||
func CreateKubernetesSnapshot(cli *kubernetes.Clientset) (*portainer.KubernetesSnapshot, error) {
|
func CreateKubernetesSnapshot(cli *kubernetes.Clientset, gpuOperator bool) (*portainer.KubernetesSnapshot, error) {
|
||||||
kubernetesSnapshot := &portainer.KubernetesSnapshot{}
|
kubernetesSnapshot := &portainer.KubernetesSnapshot{}
|
||||||
err := kubernetesSnapshotVersion(kubernetesSnapshot, cli)
|
err := kubernetesSnapshotVersion(kubernetesSnapshot, cli)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Warn().Err(err).Msg("unable to snapshot cluster version")
|
log.Warn().Err(err).Msg("unable to snapshot cluster version")
|
||||||
}
|
}
|
||||||
|
|
||||||
err = kubernetesSnapshotNodes(kubernetesSnapshot, cli)
|
err = kubernetesSnapshotNodes(kubernetesSnapshot, cli, gpuOperator)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Warn().Err(err).Msg("unable to snapshot cluster nodes")
|
log.Warn().Err(err).Msg("unable to snapshot cluster nodes")
|
||||||
}
|
}
|
||||||
@@ -48,7 +48,7 @@ func kubernetesSnapshotVersion(snapshot *portainer.KubernetesSnapshot, cli kuber
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func kubernetesSnapshotNodes(snapshot *portainer.KubernetesSnapshot, cli kubernetes.Interface) error {
|
func kubernetesSnapshotNodes(snapshot *portainer.KubernetesSnapshot, cli kubernetes.Interface, gpuOperator bool) error {
|
||||||
nodeList, err := cli.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{})
|
nodeList, err := cli.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
@@ -58,16 +58,40 @@ func kubernetesSnapshotNodes(snapshot *portainer.KubernetesSnapshot, cli kuberne
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
totalGPU := make(map[string]int64)
|
||||||
var totalCPUs, totalMemory int64
|
var totalCPUs, totalMemory int64
|
||||||
|
var gpuNodeCount int
|
||||||
|
|
||||||
for _, node := range nodeList.Items {
|
for _, node := range nodeList.Items {
|
||||||
totalCPUs += node.Status.Capacity.Cpu().Value()
|
totalCPUs += node.Status.Capacity.Cpu().Value()
|
||||||
totalMemory += node.Status.Capacity.Memory().Value()
|
totalMemory += node.Status.Capacity.Memory().Value()
|
||||||
|
|
||||||
|
if gpuOperator {
|
||||||
|
nodeHasGPU := false
|
||||||
|
for resourceName, quantity := range node.Status.Capacity {
|
||||||
|
if strings.HasPrefix(string(resourceName), "nvidia.com/") {
|
||||||
|
totalGPU[string(resourceName)] += quantity.Value()
|
||||||
|
nodeHasGPU = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if nodeHasGPU {
|
||||||
|
gpuNodeCount++
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
snapshot.TotalCPU = totalCPUs
|
snapshot.TotalCPU = totalCPUs
|
||||||
snapshot.TotalMemory = totalMemory
|
snapshot.TotalMemory = totalMemory
|
||||||
snapshot.NodeCount = len(nodeList.Items)
|
snapshot.NodeCount = len(nodeList.Items)
|
||||||
snapshot.ClusterType = clusterTypeFromProviderID(nodeList.Items[0].Spec.ProviderID)
|
snapshot.ClusterType = clusterTypeFromProviderID(nodeList.Items[0].Spec.ProviderID)
|
||||||
|
|
||||||
|
if gpuOperator {
|
||||||
|
snapshot.GPUNodeCount = gpuNodeCount
|
||||||
|
if len(totalGPU) > 0 {
|
||||||
|
snapshot.TotalGPU = totalGPU
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ package snapshot
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"errors"
|
"errors"
|
||||||
|
"fmt"
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
portainer "github.com/portainer/portainer/api"
|
portainer "github.com/portainer/portainer/api"
|
||||||
@@ -94,7 +95,7 @@ func TestKubernetesSnapshotNodes(t *testing.T) {
|
|||||||
snapshot := &portainer.KubernetesSnapshot{}
|
snapshot := &portainer.KubernetesSnapshot{}
|
||||||
|
|
||||||
// Use the actual function now that it accepts kubernetes.Interface
|
// Use the actual function now that it accepts kubernetes.Interface
|
||||||
err = kubernetesSnapshotNodes(snapshot, fakeClient)
|
err = kubernetesSnapshotNodes(snapshot, fakeClient, false)
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
|
|
||||||
// Verify the results - these should match what kubernetesSnapshotNodes would produce
|
// Verify the results - these should match what kubernetesSnapshotNodes would produce
|
||||||
@@ -103,6 +104,8 @@ func TestKubernetesSnapshotNodes(t *testing.T) {
|
|||||||
require.Equal(t, int64(25769803776), snapshot.TotalMemory) // 12GB + 8GB + 4GB = 24GB in bytes
|
require.Equal(t, int64(25769803776), snapshot.TotalMemory) // 12GB + 8GB + 4GB = 24GB in bytes
|
||||||
require.Equal(t, ClusterTypeGKEAutopilot, snapshot.ClusterType) // detected from node1's ProviderID
|
require.Equal(t, ClusterTypeGKEAutopilot, snapshot.ClusterType) // detected from node1's ProviderID
|
||||||
require.Nil(t, snapshot.PerformanceMetrics) // Performance metrics are no longer collected server-side
|
require.Nil(t, snapshot.PerformanceMetrics) // Performance metrics are no longer collected server-side
|
||||||
|
require.Equal(t, 0, snapshot.GPUNodeCount)
|
||||||
|
require.Nil(t, snapshot.TotalGPU)
|
||||||
|
|
||||||
t.Logf("kubernetesSnapshotNodes test result: Nodes=%d, CPUs=%d, Memory=%d bytes",
|
t.Logf("kubernetesSnapshotNodes test result: Nodes=%d, CPUs=%d, Memory=%d bytes",
|
||||||
snapshot.NodeCount, snapshot.TotalCPU, snapshot.TotalMemory)
|
snapshot.NodeCount, snapshot.TotalCPU, snapshot.TotalMemory)
|
||||||
@@ -114,7 +117,7 @@ func TestKubernetesSnapshotNodesEmptyCluster(t *testing.T) {
|
|||||||
fakeClient := kfake.NewClientset()
|
fakeClient := kfake.NewClientset()
|
||||||
snapshot := &portainer.KubernetesSnapshot{}
|
snapshot := &portainer.KubernetesSnapshot{}
|
||||||
|
|
||||||
err := kubernetesSnapshotNodes(snapshot, fakeClient)
|
err := kubernetesSnapshotNodes(snapshot, fakeClient, false)
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
|
|
||||||
// Values should remain at their zero state when no nodes exist
|
// Values should remain at their zero state when no nodes exist
|
||||||
@@ -170,7 +173,7 @@ func TestCreateKubernetesSnapshotIntegration(t *testing.T) {
|
|||||||
|
|
||||||
// Test that kubernetesSnapshotNodes logic works
|
// Test that kubernetesSnapshotNodes logic works
|
||||||
snapshot := &portainer.KubernetesSnapshot{}
|
snapshot := &portainer.KubernetesSnapshot{}
|
||||||
err = kubernetesSnapshotNodes(snapshot, fakeClient)
|
err = kubernetesSnapshotNodes(snapshot, fakeClient, false)
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
|
|
||||||
// Verify the integration results
|
// Verify the integration results
|
||||||
@@ -198,7 +201,7 @@ func TestKubernetesSnapshotNodesWithAPIError(t *testing.T) {
|
|||||||
})
|
})
|
||||||
|
|
||||||
snapshot := &portainer.KubernetesSnapshot{}
|
snapshot := &portainer.KubernetesSnapshot{}
|
||||||
err := kubernetesSnapshotNodes(snapshot, fakeClient)
|
err := kubernetesSnapshotNodes(snapshot, fakeClient, false)
|
||||||
|
|
||||||
// Should return the API error
|
// Should return the API error
|
||||||
require.Error(t, err)
|
require.Error(t, err)
|
||||||
@@ -234,7 +237,7 @@ func TestKubernetesSnapshotNodesSingleNode(t *testing.T) {
|
|||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
|
|
||||||
snapshot := &portainer.KubernetesSnapshot{}
|
snapshot := &portainer.KubernetesSnapshot{}
|
||||||
err = kubernetesSnapshotNodes(snapshot, fakeClient)
|
err = kubernetesSnapshotNodes(snapshot, fakeClient, false)
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
|
|
||||||
require.Equal(t, 1, snapshot.NodeCount)
|
require.Equal(t, 1, snapshot.NodeCount)
|
||||||
@@ -246,6 +249,123 @@ func TestKubernetesSnapshotNodesSingleNode(t *testing.T) {
|
|||||||
snapshot.NodeCount, snapshot.TotalCPU, snapshot.TotalMemory)
|
snapshot.NodeCount, snapshot.TotalCPU, snapshot.TotalMemory)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestKubernetesSnapshotNodesWithGPU(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
fakeClient := kfake.NewClientset()
|
||||||
|
|
||||||
|
gpuNode := &corev1.Node{
|
||||||
|
ObjectMeta: metav1.ObjectMeta{Name: "gpu-node"},
|
||||||
|
Status: corev1.NodeStatus{
|
||||||
|
Capacity: corev1.ResourceList{
|
||||||
|
corev1.ResourceCPU: resource.MustParse("8"),
|
||||||
|
corev1.ResourceMemory: resource.MustParse("16Gi"),
|
||||||
|
"nvidia.com/gpu": resource.MustParse("4"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
cpuNode := &corev1.Node{
|
||||||
|
ObjectMeta: metav1.ObjectMeta{Name: "cpu-node"},
|
||||||
|
Status: corev1.NodeStatus{
|
||||||
|
Capacity: corev1.ResourceList{
|
||||||
|
corev1.ResourceCPU: resource.MustParse("4"),
|
||||||
|
corev1.ResourceMemory: resource.MustParse("8Gi"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
_, err := fakeClient.CoreV1().Nodes().Create(t.Context(), gpuNode, metav1.CreateOptions{})
|
||||||
|
require.NoError(t, err)
|
||||||
|
_, err = fakeClient.CoreV1().Nodes().Create(t.Context(), cpuNode, metav1.CreateOptions{})
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
snapshot := &portainer.KubernetesSnapshot{}
|
||||||
|
err = kubernetesSnapshotNodes(snapshot, fakeClient, true)
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
require.Equal(t, 2, snapshot.NodeCount)
|
||||||
|
require.Equal(t, 1, snapshot.GPUNodeCount)
|
||||||
|
require.Equal(t, int64(4), snapshot.TotalGPU["nvidia.com/gpu"])
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestKubernetesSnapshotNodesMultipleGPUTypes(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
fakeClient := kfake.NewClientset()
|
||||||
|
|
||||||
|
node := &corev1.Node{
|
||||||
|
ObjectMeta: metav1.ObjectMeta{Name: "mig-node"},
|
||||||
|
Status: corev1.NodeStatus{
|
||||||
|
Capacity: corev1.ResourceList{
|
||||||
|
corev1.ResourceCPU: resource.MustParse("8"),
|
||||||
|
corev1.ResourceMemory: resource.MustParse("16Gi"),
|
||||||
|
"nvidia.com/gpu": resource.MustParse("2"),
|
||||||
|
"nvidia.com/mig-2g.10gb": resource.MustParse("4"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
_, err := fakeClient.CoreV1().Nodes().Create(t.Context(), node, metav1.CreateOptions{})
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
snapshot := &portainer.KubernetesSnapshot{}
|
||||||
|
err = kubernetesSnapshotNodes(snapshot, fakeClient, true)
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
require.Equal(t, 1, snapshot.GPUNodeCount)
|
||||||
|
require.Equal(t, int64(2), snapshot.TotalGPU["nvidia.com/gpu"])
|
||||||
|
require.Equal(t, int64(4), snapshot.TotalGPU["nvidia.com/mig-2g.10gb"])
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestKubernetesSnapshotNodesGPUAggregatedAcrossNodes(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
fakeClient := kfake.NewClientset()
|
||||||
|
|
||||||
|
for i, gpuCount := range []int{2, 4} {
|
||||||
|
node := &corev1.Node{
|
||||||
|
ObjectMeta: metav1.ObjectMeta{Name: fmt.Sprintf("gpu-node-%d", i)},
|
||||||
|
Status: corev1.NodeStatus{
|
||||||
|
Capacity: corev1.ResourceList{
|
||||||
|
corev1.ResourceCPU: resource.MustParse("8"),
|
||||||
|
corev1.ResourceMemory: resource.MustParse("16Gi"),
|
||||||
|
"nvidia.com/gpu": *resource.NewQuantity(int64(gpuCount), resource.DecimalSI),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
_, err := fakeClient.CoreV1().Nodes().Create(t.Context(), node, metav1.CreateOptions{})
|
||||||
|
require.NoError(t, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
snapshot := &portainer.KubernetesSnapshot{}
|
||||||
|
err := kubernetesSnapshotNodes(snapshot, fakeClient, true)
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
require.Equal(t, 2, snapshot.GPUNodeCount)
|
||||||
|
require.Equal(t, int64(6), snapshot.TotalGPU["nvidia.com/gpu"]) // 2 + 4
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestKubernetesSnapshotNodesNoGPULeavesTotalGPUNil(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
fakeClient := kfake.NewClientset()
|
||||||
|
|
||||||
|
node := &corev1.Node{
|
||||||
|
ObjectMeta: metav1.ObjectMeta{Name: "cpu-only-node"},
|
||||||
|
Status: corev1.NodeStatus{
|
||||||
|
Capacity: corev1.ResourceList{
|
||||||
|
corev1.ResourceCPU: resource.MustParse("4"),
|
||||||
|
corev1.ResourceMemory: resource.MustParse("8Gi"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
_, err := fakeClient.CoreV1().Nodes().Create(t.Context(), node, metav1.CreateOptions{})
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
snapshot := &portainer.KubernetesSnapshot{}
|
||||||
|
err = kubernetesSnapshotNodes(snapshot, fakeClient, false)
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
require.Equal(t, 0, snapshot.GPUNodeCount)
|
||||||
|
require.Nil(t, snapshot.TotalGPU)
|
||||||
|
}
|
||||||
|
|
||||||
func TestKubernetesSnapshotNodesZeroResources(t *testing.T) {
|
func TestKubernetesSnapshotNodesZeroResources(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
// Test with nodes that have zero or very small resources
|
// Test with nodes that have zero or very small resources
|
||||||
@@ -267,7 +387,7 @@ func TestKubernetesSnapshotNodesZeroResources(t *testing.T) {
|
|||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
|
|
||||||
snapshot := &portainer.KubernetesSnapshot{}
|
snapshot := &portainer.KubernetesSnapshot{}
|
||||||
err = kubernetesSnapshotNodes(snapshot, fakeClient)
|
err = kubernetesSnapshotNodes(snapshot, fakeClient, false)
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
|
|
||||||
require.Equal(t, 1, snapshot.NodeCount)
|
require.Equal(t, 1, snapshot.NodeCount)
|
||||||
@@ -277,3 +397,98 @@ func TestKubernetesSnapshotNodesZeroResources(t *testing.T) {
|
|||||||
|
|
||||||
t.Log("Zero resources test passed - handles edge case correctly")
|
t.Log("Zero resources test passed - handles edge case correctly")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestKubernetesSnapshotNodesGPUOperator(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
const gpuMemoryBytes = int64(25769803776) // 16GiB + 8GiB
|
||||||
|
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
gpuOperator bool
|
||||||
|
wantGPUCount int
|
||||||
|
wantTotalGPU map[string]int64
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "disabled does not populate GPU fields even when GPU nodes exist",
|
||||||
|
gpuOperator: false,
|
||||||
|
wantGPUCount: 0,
|
||||||
|
wantTotalGPU: nil,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "enabled populates GPU fields from GPU nodes",
|
||||||
|
gpuOperator: true,
|
||||||
|
wantGPUCount: 1,
|
||||||
|
wantTotalGPU: map[string]int64{"nvidia.com/gpu": 4},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
fakeClient := kfake.NewClientset()
|
||||||
|
nodes := []*corev1.Node{
|
||||||
|
{
|
||||||
|
ObjectMeta: metav1.ObjectMeta{Name: "gpu-node"},
|
||||||
|
Status: corev1.NodeStatus{
|
||||||
|
Capacity: corev1.ResourceList{
|
||||||
|
corev1.ResourceCPU: resource.MustParse("8"),
|
||||||
|
corev1.ResourceMemory: resource.MustParse("16Gi"),
|
||||||
|
"nvidia.com/gpu": resource.MustParse("4"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
ObjectMeta: metav1.ObjectMeta{Name: "cpu-node"},
|
||||||
|
Status: corev1.NodeStatus{
|
||||||
|
Capacity: corev1.ResourceList{
|
||||||
|
corev1.ResourceCPU: resource.MustParse("4"),
|
||||||
|
corev1.ResourceMemory: resource.MustParse("8Gi"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
for _, n := range nodes {
|
||||||
|
_, err := fakeClient.CoreV1().Nodes().Create(t.Context(), n, metav1.CreateOptions{})
|
||||||
|
require.NoError(t, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
snap := &portainer.KubernetesSnapshot{}
|
||||||
|
err := kubernetesSnapshotNodes(snap, fakeClient, tt.gpuOperator)
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
require.Equal(t, 2, snap.NodeCount)
|
||||||
|
require.Equal(t, int64(12), snap.TotalCPU)
|
||||||
|
require.Equal(t, gpuMemoryBytes, snap.TotalMemory)
|
||||||
|
require.Equal(t, tt.wantGPUCount, snap.GPUNodeCount)
|
||||||
|
require.Equal(t, tt.wantTotalGPU, snap.TotalGPU)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestKubernetesSnapshotNodesGPUOperatorEnabledNoGPUNodes(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
fakeClient := kfake.NewClientset()
|
||||||
|
node := &corev1.Node{
|
||||||
|
ObjectMeta: metav1.ObjectMeta{Name: "cpu-node"},
|
||||||
|
Status: corev1.NodeStatus{
|
||||||
|
Capacity: corev1.ResourceList{
|
||||||
|
corev1.ResourceCPU: resource.MustParse("4"),
|
||||||
|
corev1.ResourceMemory: resource.MustParse("8Gi"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
_, err := fakeClient.CoreV1().Nodes().Create(t.Context(), node, metav1.CreateOptions{})
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
snap := &portainer.KubernetesSnapshot{}
|
||||||
|
err = kubernetesSnapshotNodes(snap, fakeClient, true)
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
require.Equal(t, 1, snap.NodeCount)
|
||||||
|
require.Equal(t, int64(4), snap.TotalCPU)
|
||||||
|
require.Equal(t, 0, snap.GPUNodeCount)
|
||||||
|
require.Nil(t, snap.TotalGPU)
|
||||||
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user