Skip to content

Commit

Permalink
Enable labels for ClusterUUID and CliqueId
Browse files Browse the repository at this point in the history
Signed-off-by: Carlos Eduardo Arango Gutierrez <[email protected]>
  • Loading branch information
ArangoGutierrez committed Sep 26, 2024
1 parent 71c1fa7 commit a2ce53a
Show file tree
Hide file tree
Showing 11 changed files with 326 additions and 0 deletions.
3 changes: 3 additions & 0 deletions api/config/v1/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ type GFDCommandLineFlags struct {
NoTimestamp *bool `json:"noTimestamp" yaml:"noTimestamp"`
SleepInterval *Duration `json:"sleepInterval" yaml:"sleepInterval"`
OutputFile *string `json:"outputFile" yaml:"outputFile"`
ImexNodesConfig *string `json:"imexNodesConfig" yaml:"imexNodesConfig"`
MachineTypeFile *string `json:"machineTypeFile" yaml:"machineTypeFile"`
}

Expand Down Expand Up @@ -162,6 +163,8 @@ func (f *Flags) UpdateFromCLIFlags(c *cli.Context, flags []cli.Flag) {
updateFromCLIFlag(&f.GFD.Oneshot, c, n)
case "output-file":
updateFromCLIFlag(&f.GFD.OutputFile, c, n)
case "imex-nodes-config":
updateFromCLIFlag(&f.GFD.ImexNodesConfig, c, n)
case "sleep-interval":
updateFromCLIFlag(&f.GFD.SleepInterval, c, n)
case "no-timestamp":
Expand Down
6 changes: 6 additions & 0 deletions cmd/gpu-feature-discovery/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,12 @@ func main() {
Value: "/etc/kubernetes/node-feature-discovery/features.d/gfd",
EnvVars: []string{"GFD_OUTPUT_FILE"},
},
&cli.StringFlag{
Name: "imex-nodes-config",
Usage: "the path to nvidia-imex nodes config file",
Value: "/etc/nvidia-imex/nodes_config.cfg",
EnvVars: []string{"GFD_IMEX_NODES_CONFIG"},
},
&cli.StringFlag{
Name: "machine-type-file",
Value: "/sys/class/dmi/id/product_name",
Expand Down
13 changes: 13 additions & 0 deletions deployments/helm/nvidia-device-plugin/templates/daemonset-gfd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,10 @@ spec:
- name: GFD_USE_NODE_FEATURE_API
value: {{ .Values.nfd.enableNodeFeatureApi | quote }}
{{- end }}
{{- if typeIs "string" .Values.imexNodesConfigFile }}
- name: GFD_IMEX_NODES_CONFIG
value: {{ .Values.imexNodesConfigFile | quote }}
{{- end }}
{{- if $options.hasConfigMap }}
- name: CONFIG_FILE
value: /config/config.yaml
Expand All @@ -182,6 +186,10 @@ spec:
mountPath: "/etc/kubernetes/node-feature-discovery/features.d"
- name: host-sys
mountPath: "/sys"
{{- if typeIs "string" .Values.imexNodesConfigFile }}
- name: imex-nodes-config
mountPath: {{ .Values.imexNodesConfigFile | quote }}
{{- end }}
{{- if $options.hasConfigMap }}
- name: available-configs
mountPath: /available-configs
Expand All @@ -199,6 +207,11 @@ spec:
- name: host-sys
hostPath:
path: "/sys"
{{- if typeIs "string" .Values.imexNodesConfigFile }}
- name: imex-nodes-config
hostPath:
path: {{ .Values.imexNodesConfigFile | quote }}
{{- end }}
{{- if $options.hasConfigMap }}
- name: available-configs
configMap:
Expand Down
1 change: 1 addition & 0 deletions deployments/helm/nvidia-device-plugin/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ deviceIDStrategy: null
nvidiaDriverRoot: null
gdsEnabled: null
mofedEnabled: null
imexNodesConfigFile: null
deviceDiscoveryStrategy: null

nameOverride: ""
Expand Down
119 changes: 119 additions & 0 deletions internal/lm/nvml.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,13 @@
package lm

import (
"bufio"
"errors"
"fmt"
"math/rand"
"net"
"os"
"sort"
"strconv"
"strings"

Expand All @@ -28,6 +33,7 @@ import (

spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1"
"github.com/NVIDIA/k8s-device-plugin/internal/resource"
"github.com/google/uuid"
)

var errMPSSharingNotSupported = errors.New("MPS sharing is not supported")
Expand Down Expand Up @@ -80,13 +86,19 @@ func NewDeviceLabeler(manager resource.Manager, config *spec.Config) (Labeler, e
return nil, fmt.Errorf("error creating resource labeler: %v", err)
}

imexLabeler, err := newImexDomainLabeler(*config.Flags.GFD.ImexNodesConfig, devices)
if err != nil {
return nil, fmt.Errorf("error creating imex domain labeler: %v", err)
}

l := Merge(
machineTypeLabeler,
versionLabeler,
migCapabilityLabeler,
sharingLabeler,
resourceLabeler,
gpuModeLabeler,
imexLabeler,
)

return l, nil
Expand Down Expand Up @@ -218,6 +230,96 @@ func newGPUModeLabeler(devices []resource.Device) (Labeler, error) {
return labels, nil
}

// newImexDomainLabeler builds the labeler that publishes the cluster UUID,
// the clique ID and the derived IMEX domain for this node.
//
// configFile is the path to the nvidia-imex nodes config file, expected to
// hold one IP address per line. device is the list of devices to inspect;
// only devices that report themselves as IMEX capable are considered.
//
// A nil Labeler with a nil error is returned when the feature does not apply:
// configFile is empty, the file does not exist or lists no addresses, no
// device is IMEX capable, or the capable devices disagree on their cluster
// UUID or clique ID. An error is returned when the file cannot be read,
// contains a malformed address, or a device query fails.
func newImexDomainLabeler(configFile string, device []resource.Device) (Labeler, error) {
	if configFile == "" {
		return nil, nil
	}

	ips, err := readImexNodeIPs(configFile)
	if err != nil {
		return nil, err
	}
	if len(ips) == 0 {
		return nil, nil
	}

	// Sort before hashing so that the same set of nodes listed in a
	// different order still yields the same domain identifier.
	sort.Strings(ips)
	hashedconfig := generateUUIDs(strings.Join(ips, "\n"))

	var (
		commonClusterUUID string
		commonCliqueID    string
		// found tracks whether a capable device has been seen; an explicit
		// flag is used because an empty string may be a legitimate value.
		found bool
	)
	for _, d := range device {
		// Devices that are not IMEX capable (or cannot tell) are skipped.
		if ok, _ := d.IsImexCapable(); !ok {
			continue
		}

		clusterUUID, err := d.GetClusterUUID()
		if err != nil {
			return nil, fmt.Errorf("error getting cluster UUID: %w", err)
		}
		if found && commonClusterUUID != clusterUUID {
			klog.Warningf("Cluster UUIDs are different: %s != %s", commonClusterUUID, clusterUUID)
			return nil, nil
		}

		cliqueID, err := d.GetCliqueIP()
		if err != nil {
			return nil, fmt.Errorf("error getting clique ID: %w", err)
		}
		if found && commonCliqueID != cliqueID {
			klog.Warningf("Clique IDs are different: %s != %s", commonCliqueID, cliqueID)
			return nil, nil
		}

		commonClusterUUID, commonCliqueID, found = clusterUUID, cliqueID, true
	}

	// Without any IMEX capable device there is nothing meaningful to
	// publish; avoid emitting labels with empty values.
	if !found {
		return nil, nil
	}

	labels := Labels{
		"nvidia.com/gpu.clusteruuid": commonClusterUUID,
		"nvidia.com/gpu.cliqueid":    commonCliqueID,
		"nvidia.com/gpu.imex-domain": hashedconfig + "-" + commonCliqueID,
	}

	return labels, nil
}

// readImexNodeIPs returns the IP addresses listed in configFile, one per
// line, with surrounding whitespace removed and blank lines ignored. A
// missing file yields a nil slice and a nil error.
func readImexNodeIPs(configFile string) ([]string, error) {
	imexConfig, err := os.Open(configFile)
	if errors.Is(err, os.ErrNotExist) {
		return nil, nil
	}
	if err != nil {
		return nil, fmt.Errorf("failed to open imex config file: %w", err)
	}
	defer imexConfig.Close()

	var ips []string
	scanner := bufio.NewScanner(imexConfig)
	for scanner.Scan() {
		ip := strings.TrimSpace(scanner.Text())
		if ip == "" {
			// Blank lines carry no address and must not fail parsing.
			continue
		}
		if net.ParseIP(ip) == nil {
			return nil, fmt.Errorf("invalid IP address in imex config file: %s", ip)
		}
		ips = append(ips, ip)
	}
	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("failed to read imex config file: %w", err)
	}

	return ips, nil
}

func getModeForClasses(classes []uint32) string {
if len(classes) == 0 {
return "unknown"
Expand Down Expand Up @@ -254,3 +356,20 @@ func getDeviceClasses(devices []resource.Device) ([]uint32, error) {
}
return classes, nil
}

func generateUUIDs(seed string) string {
rand := rand.New(rand.NewSource(hash(seed)))

Check failure on line 361 in internal/lm/nvml.go

View workflow job for this annotation

GitHub Actions / check

G404: Use of weak random number generator (math/rand or math/rand/v2 instead of crypto/rand) (gosec)

charset := make([]byte, 16)
rand.Read(charset)
uuid, _ := uuid.FromBytes(charset)
return uuid.String()
}

// hash folds the runes of s into a 64-bit polynomial hash with multiplier
// 31. Arithmetic wraps around on overflow; the empty string hashes to 0.
func hash(s string) int64 {
	var acc int64
	for _, r := range s {
		acc = acc*31 + int64(r)
	}
	return acc
}
12 changes: 12 additions & 0 deletions internal/resource/cuda-device.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,18 @@ func (d *cudaDevice) IsMigEnabled() (bool, error) {
return false, nil
}

// IsImexCapable always reports false with a nil error for a cudaDevice.
func (d *cudaDevice) IsImexCapable() (bool, error) {
return false, nil
}

// GetPCIClass always returns 0 with a nil error for a cudaDevice.
func (d *cudaDevice) GetPCIClass() (uint32, error) {
return 0, nil
}

// GetClusterUUID always returns an empty string with a nil error for a
// cudaDevice.
func (d *cudaDevice) GetClusterUUID() (string, error) {
return "", nil
}

// GetCliqueIP always returns an empty string with a nil error for a
// cudaDevice.
// NOTE(review): callers treat the result as a clique ID, so the "IP" suffix
// looks like a naming slip — confirm against the Device interface.
func (d *cudaDevice) GetCliqueIP() (string, error) {
return "", nil
}
Loading

0 comments on commit a2ce53a

Please sign in to comment.