From a2ce53aa853c99c756ae3f84003613221ace58b8 Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Thu, 26 Sep 2024 12:53:19 +0200 Subject: [PATCH] Enable labels for ClusterUUID and CliqueId Signed-off-by: Carlos Eduardo Arango Gutierrez --- api/config/v1/flags.go | 3 + cmd/gpu-feature-discovery/main.go | 6 + .../templates/daemonset-gfd.yml | 13 ++ .../helm/nvidia-device-plugin/values.yaml | 1 + internal/lm/nvml.go | 119 ++++++++++++++++++ internal/resource/cuda-device.go | 12 ++ internal/resource/device_mock.go | 111 ++++++++++++++++ internal/resource/nvml-device.go | 33 +++++ internal/resource/nvml-mig-device.go | 13 ++ internal/resource/sysfs-device.go | 12 ++ internal/resource/types.go | 3 + 11 files changed, 326 insertions(+) diff --git a/api/config/v1/flags.go b/api/config/v1/flags.go index 2720379e2..60cc0d67c 100644 --- a/api/config/v1/flags.go +++ b/api/config/v1/flags.go @@ -107,6 +107,7 @@ type GFDCommandLineFlags struct { NoTimestamp *bool `json:"noTimestamp" yaml:"noTimestamp"` SleepInterval *Duration `json:"sleepInterval" yaml:"sleepInterval"` OutputFile *string `json:"outputFile" yaml:"outputFile"` + ImexNodesConfig *string `json:"imexNodesConfig" yaml:"imexNodesConfig"` MachineTypeFile *string `json:"machineTypeFile" yaml:"machineTypeFile"` } @@ -162,6 +163,8 @@ func (f *Flags) UpdateFromCLIFlags(c *cli.Context, flags []cli.Flag) { updateFromCLIFlag(&f.GFD.Oneshot, c, n) case "output-file": updateFromCLIFlag(&f.GFD.OutputFile, c, n) + case "imex-nodes-config": + updateFromCLIFlag(&f.GFD.ImexNodesConfig, c, n) case "sleep-interval": updateFromCLIFlag(&f.GFD.SleepInterval, c, n) case "no-timestamp": diff --git a/cmd/gpu-feature-discovery/main.go b/cmd/gpu-feature-discovery/main.go index c824ffcc2..80895a61c 100644 --- a/cmd/gpu-feature-discovery/main.go +++ b/cmd/gpu-feature-discovery/main.go @@ -86,6 +86,12 @@ func main() { Value: "/etc/kubernetes/node-feature-discovery/features.d/gfd", EnvVars: []string{"GFD_OUTPUT_FILE"}, }, + &cli.StringFlag{ + Name: "imex-nodes-config", + Usage: "the path to nvidia-imex nodes config file", + Value: "/etc/nvidia-imex/nodes_config.cfg", + EnvVars: []string{"GFD_IMEX_NODES_CONFIG"}, + }, &cli.StringFlag{ Name: "machine-type-file", Value: "/sys/class/dmi/id/product_name", diff --git a/deployments/helm/nvidia-device-plugin/templates/daemonset-gfd.yml b/deployments/helm/nvidia-device-plugin/templates/daemonset-gfd.yml index 940dcc902..e724d6ece 100644 --- a/deployments/helm/nvidia-device-plugin/templates/daemonset-gfd.yml +++ b/deployments/helm/nvidia-device-plugin/templates/daemonset-gfd.yml @@ -163,6 +163,10 @@ spec: - name: GFD_USE_NODE_FEATURE_API value: {{ .Values.nfd.enableNodeFeatureApi | quote }} {{- end }} + {{- if typeIs "string" .Values.imexNodesConfigFile }} + - name: GFD_IMEX_NODES_CONFIG + value: {{ .Values.imexNodesConfigFile | quote }} + {{- end }} {{- if $options.hasConfigMap }} - name: CONFIG_FILE value: /config/config.yaml @@ -182,6 +186,10 @@ spec: mountPath: "/etc/kubernetes/node-feature-discovery/features.d" - name: host-sys mountPath: "/sys" + {{- if typeIs "string" .Values.imexNodesConfigFile }} + - name: imex-nodes-config + mountPath: {{ .Values.imexNodesConfigFile | quote }} + {{- end }} {{- if $options.hasConfigMap }} - name: available-configs mountPath: /available-configs @@ -199,6 +207,11 @@ spec: - name: host-sys hostPath: path: "/sys" + {{- if typeIs "string" .Values.imexNodesConfigFile }} + - name: imex-nodes-config + hostPath: + path: {{ .Values.imexNodesConfigFile | quote }} + {{- end }} {{- if $options.hasConfigMap }} - name: available-configs configMap: diff --git a/deployments/helm/nvidia-device-plugin/values.yaml b/deployments/helm/nvidia-device-plugin/values.yaml index b0c624295..90d2d09dc 100644 --- a/deployments/helm/nvidia-device-plugin/values.yaml +++ b/deployments/helm/nvidia-device-plugin/values.yaml @@ -35,6 +35,7 @@ deviceIDStrategy: null nvidiaDriverRoot: null gdsEnabled: null mofedEnabled: null +imexNodesConfigFile: null deviceDiscoveryStrategy: null nameOverride: "" diff --git a/internal/lm/nvml.go b/internal/lm/nvml.go index 0b5ed6e9a..d97ec6789 100644 --- a/internal/lm/nvml.go +++ b/internal/lm/nvml.go @@ -17,8 +17,13 @@ package lm import ( + "bufio" "errors" "fmt" + "math/rand" + "net" + "os" + "sort" "strconv" "strings" @@ -28,6 +33,7 @@ import ( spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1" "github.com/NVIDIA/k8s-device-plugin/internal/resource" + "github.com/google/uuid" ) var errMPSSharingNotSupported = errors.New("MPS sharing is not supported") @@ -80,6 +86,11 @@ func NewDeviceLabeler(manager resource.Manager, config *spec.Config) (Labeler, e return nil, fmt.Errorf("error creating resource labeler: %v", err) } + imexLabeler, err := newImexDomainLabeler(*config.Flags.GFD.ImexNodesConfig, devices) + if err != nil { + return nil, fmt.Errorf("error creating imex domain labeler: %v", err) + } + l := Merge( machineTypeLabeler, versionLabeler, @@ -87,6 +98,7 @@ func NewDeviceLabeler(manager resource.Manager, config *spec.Config) (Labeler, e sharingLabeler, resourceLabeler, gpuModeLabeler, + imexLabeler, ) return l, nil @@ -218,6 +230,96 @@ func newGPUModeLabeler(devices []resource.Device) (Labeler, error) { return labels, nil } +func newImexDomainLabeler(configFile string, device []resource.Device) (Labeler, error) { + if configFile == "" { + return nil, nil + } + + // Read file and parse it + _, err := os.Stat(configFile) + if os.IsNotExist(err) { + return nil, nil + } + imexConfig, err := os.Open(configFile) + if err != nil { + return nil, fmt.Errorf("failed to open imex config file: %v", err) + } + defer imexConfig.Close() + + // check if the file is empty + stat, err := imexConfig.Stat() + if err != nil { + return nil, nil + } + if stat.Size() == 0 { + return nil, nil + } + + // Read the file line by line + var ips []string + scanner := bufio.NewScanner(imexConfig) + for scanner.Scan() { + ip := strings.TrimSpace(scanner.Text()) + if net.ParseIP(ip) == nil { + return nil, fmt.Errorf("invalid IP address in imex config file: %s", ip) + } + ips = append(ips, ip) + } + + if err := scanner.Err(); err != nil { + return nil, fmt.Errorf("failed to read imex config file: %v", err) + } + + // Sort the IP addresses + sort.Strings(ips) + + // Join the sorted IPs into a single string + sortedIPs := strings.Join(ips, "\n") + + hashedconfig := generateUUIDs(sortedIPs) + + var commonClusterUUID string + var commonCliqueID string + for _, d := range device { + // Skip non NVML devices + if ok, _ := d.IsImexCapable(); !ok { + continue + } + + clusterUUID, err := d.GetClusterUUID() + if err != nil { + return nil, fmt.Errorf("error getting cluster UUID: %v", err) + } + if commonClusterUUID == "" { + commonClusterUUID = clusterUUID + } + if commonClusterUUID != clusterUUID { + klog.Warningf("Cluster UUIDs are different: %s != %s", commonClusterUUID, clusterUUID) + return nil, nil + } + + cliqueID, err := d.GetCliqueIP() + if err != nil { + return nil, fmt.Errorf("error getting clique ID: %v", err) + } + if commonCliqueID == "" { + commonCliqueID = cliqueID + } + if commonCliqueID != cliqueID { + klog.Warningf("Clique IDs are different: %s != %s", commonCliqueID, cliqueID) + return nil, nil + } + } + + labels := Labels{ + "nvidia.com/gpu.clusteruuid": commonClusterUUID, + "nvidia.com/gpu.cliqueid": commonCliqueID, + "nvidia.com/gpu.imex-domain": hashedconfig + "-" + commonCliqueID, + } + + return labels, nil +} + func getModeForClasses(classes []uint32) string { if len(classes) == 0 { return "unknown" @@ -254,3 +356,20 @@ func getDeviceClasses(devices []resource.Device) ([]uint32, error) { } return classes, nil } + +func generateUUIDs(seed string) string { + rand := rand.New(rand.NewSource(hash(seed))) + + charset := make([]byte, 16) + rand.Read(charset) + uuid, _ := uuid.FromBytes(charset) + return uuid.String() +} + +func hash(s string) int64 { + h := int64(0) + for _, c := range s { + h = 31*h + int64(c) + } + return h +} diff --git a/internal/resource/cuda-device.go b/internal/resource/cuda-device.go index a4f4bc4a4..563fb2213 100644 --- a/internal/resource/cuda-device.go +++ b/internal/resource/cuda-device.go @@ -97,6 +97,18 @@ func (d *cudaDevice) IsMigEnabled() (bool, error) { return false, nil } +func (d *cudaDevice) IsImexCapable() (bool, error) { + return false, nil +} + func (d *cudaDevice) GetPCIClass() (uint32, error) { return 0, nil } + +func (d *cudaDevice) GetClusterUUID() (string, error) { + return "", nil +} + +func (d *cudaDevice) GetCliqueIP() (string, error) { + return "", nil +} diff --git a/internal/resource/device_mock.go b/internal/resource/device_mock.go index 1024ec96b..e08f99392 100644 --- a/internal/resource/device_mock.go +++ b/internal/resource/device_mock.go @@ -20,6 +20,12 @@ var _ Device = &DeviceMock{} // GetAttributesFunc: func() (map[string]interface{}, error) { // panic("mock out the GetAttributes method") // }, +// GetCliqueIPFunc: func() (string, error) { +// panic("mock out the GetCliqueIP method") +// }, +// GetClusterUUIDFunc: func() (string, error) { +// panic("mock out the GetClusterUUID method") +// }, // GetCudaComputeCapabilityFunc: func() (int, int, error) { // panic("mock out the GetCudaComputeCapability method") // }, @@ -38,6 +44,9 @@ var _ Device = &DeviceMock{} // GetTotalMemoryMBFunc: func() (uint64, error) { // panic("mock out the GetTotalMemoryMB method") // }, +// IsImexCapableFunc: func() (bool, error) { +// panic("mock out the IsImexCapable method") +// }, // IsMigCapableFunc: func() (bool, error) { // panic("mock out the IsMigCapable method") // }, @@ -54,6 +63,12 @@ type DeviceMock struct { // GetAttributesFunc mocks the GetAttributes method. GetAttributesFunc func() (map[string]interface{}, error) + // GetCliqueIPFunc mocks the GetCliqueIP method. + GetCliqueIPFunc func() (string, error) + + // GetClusterUUIDFunc mocks the GetClusterUUID method. + GetClusterUUIDFunc func() (string, error) + // GetCudaComputeCapabilityFunc mocks the GetCudaComputeCapability method. GetCudaComputeCapabilityFunc func() (int, int, error) @@ -72,6 +87,9 @@ type DeviceMock struct { // GetTotalMemoryMBFunc mocks the GetTotalMemoryMB method. GetTotalMemoryMBFunc func() (uint64, error) + // IsImexCapableFunc mocks the IsImexCapable method. + IsImexCapableFunc func() (bool, error) + // IsMigCapableFunc mocks the IsMigCapable method. IsMigCapableFunc func() (bool, error) @@ -83,6 +101,12 @@ type DeviceMock struct { // GetAttributes holds details about calls to the GetAttributes method. GetAttributes []struct { } + // GetCliqueIP holds details about calls to the GetCliqueIP method. + GetCliqueIP []struct { + } + // GetClusterUUID holds details about calls to the GetClusterUUID method. + GetClusterUUID []struct { + } // GetCudaComputeCapability holds details about calls to the GetCudaComputeCapability method. GetCudaComputeCapability []struct { } @@ -101,6 +125,9 @@ type DeviceMock struct { // GetTotalMemoryMB holds details about calls to the GetTotalMemoryMB method. GetTotalMemoryMB []struct { } + // IsImexCapable holds details about calls to the IsImexCapable method. + IsImexCapable []struct { + } // IsMigCapable holds details about calls to the IsMigCapable method. IsMigCapable []struct { } @@ -109,12 +136,15 @@ type DeviceMock struct { } } lockGetAttributes sync.RWMutex + lockGetCliqueIP sync.RWMutex + lockGetClusterUUID sync.RWMutex lockGetCudaComputeCapability sync.RWMutex lockGetDeviceHandleFromMigDeviceHandle sync.RWMutex lockGetMigDevices sync.RWMutex lockGetName sync.RWMutex lockGetPCIClass sync.RWMutex lockGetTotalMemoryMB sync.RWMutex + lockIsImexCapable sync.RWMutex lockIsMigCapable sync.RWMutex lockIsMigEnabled sync.RWMutex } @@ -146,6 +176,60 @@ func (mock *DeviceMock) GetAttributesCalls() []struct { return calls } +// GetCliqueIP calls GetCliqueIPFunc. +func (mock *DeviceMock) GetCliqueIP() (string, error) { + if mock.GetCliqueIPFunc == nil { + panic("DeviceMock.GetCliqueIPFunc: method is nil but Device.GetCliqueIP was just called") + } + callInfo := struct { + }{} + mock.lockGetCliqueIP.Lock() + mock.calls.GetCliqueIP = append(mock.calls.GetCliqueIP, callInfo) + mock.lockGetCliqueIP.Unlock() + return mock.GetCliqueIPFunc() +} + +// GetCliqueIPCalls gets all the calls that were made to GetCliqueIP. +// Check the length with: +// +// len(mockedDevice.GetCliqueIPCalls()) +func (mock *DeviceMock) GetCliqueIPCalls() []struct { +} { + var calls []struct { + } + mock.lockGetCliqueIP.RLock() + calls = mock.calls.GetCliqueIP + mock.lockGetCliqueIP.RUnlock() + return calls +} + +// GetClusterUUID calls GetClusterUUIDFunc. +func (mock *DeviceMock) GetClusterUUID() (string, error) { + if mock.GetClusterUUIDFunc == nil { + panic("DeviceMock.GetClusterUUIDFunc: method is nil but Device.GetClusterUUID was just called") + } + callInfo := struct { + }{} + mock.lockGetClusterUUID.Lock() + mock.calls.GetClusterUUID = append(mock.calls.GetClusterUUID, callInfo) + mock.lockGetClusterUUID.Unlock() + return mock.GetClusterUUIDFunc() +} + +// GetClusterUUIDCalls gets all the calls that were made to GetClusterUUID. +// Check the length with: +// +// len(mockedDevice.GetClusterUUIDCalls()) +func (mock *DeviceMock) GetClusterUUIDCalls() []struct { +} { + var calls []struct { + } + mock.lockGetClusterUUID.RLock() + calls = mock.calls.GetClusterUUID + mock.lockGetClusterUUID.RUnlock() + return calls +} + // GetCudaComputeCapability calls GetCudaComputeCapabilityFunc. func (mock *DeviceMock) GetCudaComputeCapability() (int, int, error) { if mock.GetCudaComputeCapabilityFunc == nil { @@ -308,6 +392,33 @@ func (mock *DeviceMock) GetTotalMemoryMBCalls() []struct { return calls } +// IsImexCapable calls IsImexCapableFunc. +func (mock *DeviceMock) IsImexCapable() (bool, error) { + if mock.IsImexCapableFunc == nil { + panic("DeviceMock.IsImexCapableFunc: method is nil but Device.IsImexCapable was just called") + } + callInfo := struct { + }{} + mock.lockIsImexCapable.Lock() + mock.calls.IsImexCapable = append(mock.calls.IsImexCapable, callInfo) + mock.lockIsImexCapable.Unlock() + return mock.IsImexCapableFunc() +} + +// IsImexCapableCalls gets all the calls that were made to IsImexCapable. +// Check the length with: +// +// len(mockedDevice.IsImexCapableCalls()) +func (mock *DeviceMock) IsImexCapableCalls() []struct { +} { + var calls []struct { + } + mock.lockIsImexCapable.RLock() + calls = mock.calls.IsImexCapable + mock.lockIsImexCapable.RUnlock() + return calls +} + // IsMigCapable calls IsMigCapableFunc. func (mock *DeviceMock) IsMigCapable() (bool, error) { if mock.IsMigCapableFunc == nil { diff --git a/internal/resource/nvml-device.go b/internal/resource/nvml-device.go index 1184657d2..bfe776804 100644 --- a/internal/resource/nvml-device.go +++ b/internal/resource/nvml-device.go @@ -18,6 +18,7 @@ package resource import ( "fmt" + "strconv" "github.com/NVIDIA/go-nvlib/pkg/nvlib/device" "github.com/NVIDIA/go-nvlib/pkg/nvpci" @@ -99,3 +100,35 @@ func (d nvmlDevice) GetPCIClass() (uint32, error) { } return nvDevice.Class, nil } + +func (d nvmlDevice) IsImexCapable() (bool, error) { + return true, nil +} + +func (d nvmlDevice) GetClusterUUID() (string, error) { + gfInfo, ret := d.GetGpuFabricInfo() + if ret != nvml.SUCCESS { + return "", ret + } + + // Convert the array to a byte slice + byteSlice := gfInfo.ClusterUuid[:] + clusterUUID := fmt.Sprintf("%08x-%04x-%04x-%04x-%012x", + byteSlice[0:4], + byteSlice[4:6], + byteSlice[6:8], + byteSlice[8:10], + byteSlice[10:16], + ) + + return clusterUUID, nil +} + +func (d nvmlDevice) GetCliqueIP() (string, error) { + gfInfo, ret := d.GetGpuFabricInfo() + if ret != nvml.SUCCESS { + return "", ret + } + + return strconv.FormatUint(uint64(gfInfo.CliqueId), 10), nil +} diff --git a/internal/resource/nvml-mig-device.go b/internal/resource/nvml-mig-device.go index 8ef933ff5..29cfb8604 100644 --- a/internal/resource/nvml-mig-device.go +++ b/internal/resource/nvml-mig-device.go @@ -82,6 +82,11 @@ func (d nvmlMigDevice) IsMigEnabled() (bool, error) { return false, fmt.Errorf("IsMigEnabled is not supported for MIG devices") } +// IsImexCapable is not supported for MIG devices +func (d nvmlMigDevice) IsImexCapable() (bool, error) { + return false, fmt.Errorf("IsImexCapable is not supported for MIG devices") +} + // GetMigDevices is not supported for MIG devices func (d nvmlMigDevice) GetMigDevices() ([]Device, error) { return nil, fmt.Errorf("GetMigDevices is not implemented for MIG devices") @@ -138,3 +143,11 @@ func (d nvmlMigDevice) GetPCIClass() (uint32, error) { // GPU devices that support MIG do not support switching mode between graphics and compute, so they are always in compute mode. return nvpci.PCI3dControllerClass, nil } + +func (d nvmlMigDevice) GetClusterUUID() (string, error) { + return "", fmt.Errorf("GetClusterUUID is not supported for MIG devices") +} + +func (d nvmlMigDevice) GetCliqueIP() (string, error) { + return "", fmt.Errorf("GetCliqueIP is not supported for MIG devices") +} diff --git a/internal/resource/sysfs-device.go b/internal/resource/sysfs-device.go index 105229fe4..0eba54def 100644 --- a/internal/resource/sysfs-device.go +++ b/internal/resource/sysfs-device.go @@ -65,6 +65,18 @@ func (d vfioDevice) IsMigCapable() (bool, error) { return false, nil } +func (d vfioDevice) IsImexCapable() (bool, error) { + return false, nil +} + func (d vfioDevice) GetPCIClass() (uint32, error) { return d.nvidiaPCIDevice.Class, nil } + +func (d vfioDevice) GetClusterUUID() (string, error) { + return "", fmt.Errorf("GetClusterUUID is not supported for vfio devices") +} + +func (d vfioDevice) GetCliqueIP() (string, error) { + return "", fmt.Errorf("GetCliqueIP is not supported for vfio devices") +} diff --git a/internal/resource/types.go b/internal/resource/types.go index ec89ec579..d182c64e2 100644 --- a/internal/resource/types.go +++ b/internal/resource/types.go @@ -31,6 +31,7 @@ type Manager interface { // //go:generate moq -out device_mock.go . Device type Device interface { + IsImexCapable() (bool, error) IsMigEnabled() (bool, error) IsMigCapable() (bool, error) GetMigDevices() ([]Device, error) @@ -40,4 +41,6 @@ type Device interface { GetDeviceHandleFromMigDeviceHandle() (Device, error) GetCudaComputeCapability() (int, int, error) GetPCIClass() (uint32, error) + GetClusterUUID() (string, error) + GetCliqueIP() (string, error) }