Skip to content

Commit

Permalink
Enable labels for ClusterUUID and CliqueId
Browse files Browse the repository at this point in the history
Signed-off-by: Carlos Eduardo Arango Gutierrez <[email protected]>
  • Loading branch information
ArangoGutierrez committed Sep 24, 2024
1 parent 71c1fa7 commit 91aa9c4
Show file tree
Hide file tree
Showing 7 changed files with 179 additions and 0 deletions.
45 changes: 45 additions & 0 deletions internal/lm/nvml.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,13 +80,19 @@ func NewDeviceLabeler(manager resource.Manager, config *spec.Config) (Labeler, e
return nil, fmt.Errorf("error creating resource labeler: %v", err)
}

imexLabeler, err := newImexDomainLabeler(devices)
if err != nil {
return nil, fmt.Errorf("error creating imex domain labeler: %v", err)
}

l := Merge(
machineTypeLabeler,
versionLabeler,
migCapabilityLabeler,
sharingLabeler,
resourceLabeler,
gpuModeLabeler,
imexLabeler,
)

return l, nil
Expand Down Expand Up @@ -218,6 +224,45 @@ func newGPUModeLabeler(devices []resource.Device) (Labeler, error) {
return labels, nil
}

// newImexDomainLabeler creates a Labeler for the IMEX domain shared by the
// supplied devices. The cluster UUID and clique ID of every device are queried
// and must be identical across all devices; any difference is reported as an
// error, as is any failure to query a device.
//
// If there are no devices, or the devices report an empty cluster UUID or an
// empty clique ID, an empty set of labels is returned. Emitting the labels in
// that case would produce empty values and a domain of the form "-", "<uuid>-"
// or "-<id>", none of which is a valid Kubernetes label value.
func newImexDomainLabeler(devices []resource.Device) (Labeler, error) {
	var commonClusterUUID, commonCliqueID string

	for i, d := range devices {
		clusterUUID, err := d.GetClusterUUID()
		if err != nil {
			return nil, fmt.Errorf("error getting cluster UUID: %w", err)
		}
		// The first device defines the expected value. Using the index rather
		// than an empty-string sentinel means an empty value followed by a
		// non-empty one is still detected as a mismatch.
		if i == 0 {
			commonClusterUUID = clusterUUID
		}
		if commonClusterUUID != clusterUUID {
			// The error is returned to the caller, which is responsible for
			// reporting it; it is not additionally logged here.
			return nil, fmt.Errorf("cluster UUID mismatch: %s != %s", commonClusterUUID, clusterUUID)
		}

		cliqueID, err := d.GetCliqueID()
		if err != nil {
			return nil, fmt.Errorf("error getting clique ID: %w", err)
		}
		if i == 0 {
			commonCliqueID = cliqueID
		}
		if commonCliqueID != cliqueID {
			return nil, fmt.Errorf("clique ID mismatch: %s != %s", commonCliqueID, cliqueID)
		}
	}

	// Without both identifiers there is no IMEX domain to advertise.
	if commonClusterUUID == "" || commonCliqueID == "" {
		return Labels{}, nil
	}

	labels := Labels{
		"nvidia.com/gpu.clusteruuid": commonClusterUUID,
		"nvidia.com/gpu.cliqueid":    commonCliqueID,
		"nvidia.com/gpu.imex-domain": commonClusterUUID + "-" + commonCliqueID,
	}

	return labels, nil
}

func getModeForClasses(classes []uint32) string {
if len(classes) == 0 {
return "unknown"
Expand Down
8 changes: 8 additions & 0 deletions internal/resource/cuda-device.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,3 +100,11 @@ func (d *cudaDevice) IsMigEnabled() (bool, error) {
func (d *cudaDevice) GetPCIClass() (uint32, error) {
return 0, nil
}

// GetClusterUUID always returns an empty cluster UUID and a nil error; this
// device implementation does not look up any fabric information.
func (d *cudaDevice) GetClusterUUID() (string, error) {
	const clusterUUID = ""
	return clusterUUID, nil
}

// GetCliqueID always returns an empty clique ID and a nil error; this device
// implementation does not look up any fabric information.
func (d *cudaDevice) GetCliqueID() (string, error) {
	const cliqueID = ""
	return cliqueID, nil
}
74 changes: 74 additions & 0 deletions internal/resource/device_mock.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

34 changes: 34 additions & 0 deletions internal/resource/nvml-device.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@
package resource

import (
"encoding/hex"
"fmt"
"strconv"

"github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
"github.com/NVIDIA/go-nvlib/pkg/nvpci"
Expand Down Expand Up @@ -99,3 +101,35 @@ func (d nvmlDevice) GetPCIClass() (uint32, error) {
}
return nvDevice.Class, nil
}

// GetClusterUUID returns the cluster UUID taken from the device's GPU fabric
// info, rendered as lowercase hex in the dashed 8-4-4-4-12 UUID layout.
// If the fabric info query does not return nvml.SUCCESS, that NVML return
// value is returned as the error.
func (d nvmlDevice) GetClusterUUID() (string, error) {
	info, ret := d.GetGpuFabricInfo()
	if ret != nvml.SUCCESS {
		return "", ret
	}

	// Hex-encode the raw UUID bytes, then cut the string into the five
	// dash-separated groups of a textual UUID.
	raw := hex.EncodeToString(info.ClusterUuid[:])
	timeLow, timeMid, timeHi := raw[0:8], raw[8:12], raw[12:16]
	clockSeq, node := raw[16:20], raw[20:32]

	return fmt.Sprintf("%s-%s-%s-%s-%s", timeLow, timeMid, timeHi, clockSeq, node), nil
}

// GetCliqueID returns the clique ID taken from the device's GPU fabric info,
// formatted as a base-10 string. If the fabric info query does not return
// nvml.SUCCESS, that NVML return value is returned as the error.
func (d nvmlDevice) GetCliqueID() (string, error) {
	info, ret := d.GetGpuFabricInfo()
	if ret != nvml.SUCCESS {
		return "", ret
	}

	cliqueID := uint64(info.CliqueId)
	return strconv.FormatUint(cliqueID, 10), nil
}
8 changes: 8 additions & 0 deletions internal/resource/nvml-mig-device.go
Original file line number Diff line number Diff line change
Expand Up @@ -138,3 +138,11 @@ func (d nvmlMigDevice) GetPCIClass() (uint32, error) {
// GPU devices that support MIG do not support switching mode between graphics and compute, so they are always in compute mode.
return nvpci.PCI3dControllerClass, nil
}

// GetClusterUUID always returns an empty string together with an error
// stating that the query is not supported for MIG devices.
func (d nvmlMigDevice) GetClusterUUID() (string, error) {
	err := fmt.Errorf("GetClusterUUID is not supported for MIG devices")
	return "", err
}

// GetCliqueID always returns an empty string together with an error stating
// that the query is not supported for MIG devices.
func (d nvmlMigDevice) GetCliqueID() (string, error) {
	err := fmt.Errorf("GetCliqueID is not supported for MIG devices")
	return "", err
}
8 changes: 8 additions & 0 deletions internal/resource/sysfs-device.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,3 +68,11 @@ func (d vfioDevice) IsMigCapable() (bool, error) {
func (d vfioDevice) GetPCIClass() (uint32, error) {
return d.nvidiaPCIDevice.Class, nil
}

// GetClusterUUID always returns an empty cluster UUID and a nil error; this
// device implementation does not look up any fabric information.
func (d vfioDevice) GetClusterUUID() (string, error) {
	const clusterUUID = ""
	return clusterUUID, nil
}

// GetCliqueID always returns an empty clique ID and a nil error; this device
// implementation does not look up any fabric information.
func (d vfioDevice) GetCliqueID() (string, error) {
	const cliqueID = ""
	return cliqueID, nil
}
2 changes: 2 additions & 0 deletions internal/resource/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,6 @@ type Device interface {
GetDeviceHandleFromMigDeviceHandle() (Device, error)
GetCudaComputeCapability() (int, int, error)
GetPCIClass() (uint32, error)
GetClusterUUID() (string, error)
GetCliqueID() (string, error)
}

0 comments on commit 91aa9c4

Please sign in to comment.