Skip to content

Commit

Permalink
Enable labels for ClusterUUID and CliqueId
Browse files Browse the repository at this point in the history
Signed-off-by: Carlos Eduardo Arango Gutierrez <[email protected]>
  • Loading branch information
ArangoGutierrez committed Sep 25, 2024
1 parent 71c1fa7 commit 97ea6a6
Show file tree
Hide file tree
Showing 15 changed files with 264 additions and 5 deletions.
3 changes: 3 additions & 0 deletions api/config/v1/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ type GFDCommandLineFlags struct {
NoTimestamp *bool `json:"noTimestamp" yaml:"noTimestamp"`
SleepInterval *Duration `json:"sleepInterval" yaml:"sleepInterval"`
OutputFile *string `json:"outputFile" yaml:"outputFile"`
ImexNodesConfig *string `json:"imexNodesConfig" yaml:"imexNodesConfig"`
MachineTypeFile *string `json:"machineTypeFile" yaml:"machineTypeFile"`
}

Expand Down Expand Up @@ -162,6 +163,8 @@ func (f *Flags) UpdateFromCLIFlags(c *cli.Context, flags []cli.Flag) {
updateFromCLIFlag(&f.GFD.Oneshot, c, n)
case "output-file":
updateFromCLIFlag(&f.GFD.OutputFile, c, n)
case "imex-nodes-config":
updateFromCLIFlag(&f.GFD.ImexNodesConfig, c, n)
case "sleep-interval":
updateFromCLIFlag(&f.GFD.SleepInterval, c, n)
case "no-timestamp":
Expand Down
6 changes: 6 additions & 0 deletions cmd/gpu-feature-discovery/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,12 @@ func main() {
Value: "/etc/kubernetes/node-feature-discovery/features.d/gfd",
EnvVars: []string{"GFD_OUTPUT_FILE"},
},
&cli.StringFlag{
Name: "imex-nodes-config",
Usage: "the path to nvidia-imex nodes config file",
Value: "/etc/nvidia-imex/nodes_config.cfg",
EnvVars: []string{"GFD_IMEX_NODES_CONFIG"},
},
&cli.StringFlag{
Name: "machine-type-file",
Value: "/sys/class/dmi/id/product_name",
Expand Down
4 changes: 2 additions & 2 deletions deployments/container/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

ARG GOLANG_VERSION=1.22.6
ARG GOLANG_VERSION=1.23.1
FROM nvcr.io/nvidia/cuda:12.6.0-base-ubi9 AS build

RUN yum install -y \
Expand All @@ -30,7 +30,7 @@ RUN set -eux; \
aarch64) ARCH='arm64' ;; \
*) echo "unsupported architecture" ; exit 1 ;; \
esac; \
wget -nv -O - https://storage.googleapis.com/golang/go${GOLANG_VERSION}.linux-${ARCH}.tar.gz \
wget -nv -O - https://go.dev/dl/go1.23.1.linux-arm64.tar.gz \
| tar -C /usr/local -xz

ENV GOPATH /go
Expand Down
2 changes: 1 addition & 1 deletion deployments/container/native-only.mk
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
# limitations under the License.

PUSH_ON_BUILD ?= false
DOCKER_BUILD_PLATFORM_OPTIONS = --platform=linux/amd64
DOCKER_BUILD_PLATFORM_OPTIONS = --platform=linux/aarch64

ifeq ($(PUSH_ON_BUILD),true)
$(BUILD_TARGETS): build-%: image-%
Expand Down
2 changes: 1 addition & 1 deletion deployments/devel/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

# This Dockerfile is also used to define the golang version used in this project
# This allows dependabot to manage this version in addition to other images.
FROM golang:1.22.6
FROM golang:1.23

WORKDIR /work
COPY * .
Expand Down
3 changes: 2 additions & 1 deletion deployments/devel/go.mod
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
module github.com/NVIDIA/k8s-device-plugin/deployments/devel

go 1.22
go 1.23

toolchain go1.23.0

require github.com/matryer/moq v0.5.0
Expand Down
13 changes: 13 additions & 0 deletions deployments/helm/nvidia-device-plugin/templates/daemonset-gfd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,10 @@ spec:
- name: GFD_USE_NODE_FEATURE_API
value: {{ .Values.nfd.enableNodeFeatureApi | quote }}
{{- end }}
{{- if typeIs "string" .Values.imexNodesConfigFile }}
- name: GFD_IMEX_NODES_CONFIG
value: {{ .Values.imexNodesConfigFile | quote }}
{{- end }}
{{- if $options.hasConfigMap }}
- name: CONFIG_FILE
value: /config/config.yaml
Expand All @@ -182,6 +186,10 @@ spec:
mountPath: "/etc/kubernetes/node-feature-discovery/features.d"
- name: host-sys
mountPath: "/sys"
{{- if typeIs "string" .Values.imexNodesConfigFile }}
- name: imex-nodes-config
mountPath: {{ .Values.imexNodesConfigFile | quote }}
{{- end }}
{{- if $options.hasConfigMap }}
- name: available-configs
mountPath: /available-configs
Expand All @@ -199,6 +207,11 @@ spec:
- name: host-sys
hostPath:
path: "/sys"
{{- if typeIs "string" .Values.imexNodesConfigFile }}
- name: imex-nodes-config
hostPath:
path: {{ .Values.imexNodesConfigFile | quote }}
{{- end }}
{{- if $options.hasConfigMap }}
- name: available-configs
configMap:
Expand Down
2 changes: 2 additions & 0 deletions deployments/helm/nvidia-device-plugin/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ deviceIDStrategy: null
nvidiaDriverRoot: null
gdsEnabled: null
mofedEnabled: null
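# Host path of the nvidia-imex nodes config file. When set to a string, the file is mounted into the GFD
# container and passed via GFD_IMEX_NODES_CONFIG. When null, nothing is mounted and GFD falls back to its
# built-in default path "/etc/nvidia-imex/nodes_config.cfg".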
imexNodesConfigFile: null
deviceDiscoveryStrategy: null

nameOverride: ""
Expand Down
105 changes: 105 additions & 0 deletions internal/lm/nvml.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,13 @@
package lm

import (
"bufio"
"errors"
"fmt"
"math/rand"
"net"
"os"
"sort"
"strconv"
"strings"

Expand All @@ -28,6 +33,7 @@ import (

spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1"
"github.com/NVIDIA/k8s-device-plugin/internal/resource"
"github.com/google/uuid"
)

var errMPSSharingNotSupported = errors.New("MPS sharing is not supported")
Expand Down Expand Up @@ -80,13 +86,19 @@ func NewDeviceLabeler(manager resource.Manager, config *spec.Config) (Labeler, e
return nil, fmt.Errorf("error creating resource labeler: %v", err)
}

imexLabeler, err := newImexDomainLabeler(*config.Flags.GFD.ImexNodesConfig, devices)
if err != nil {
return nil, fmt.Errorf("error creating imex domain labeler: %v", err)
}

l := Merge(
machineTypeLabeler,
versionLabeler,
migCapabilityLabeler,
sharingLabeler,
resourceLabeler,
gpuModeLabeler,
imexLabeler,
)

return l, nil
Expand Down Expand Up @@ -218,6 +230,79 @@ func newGPUModeLabeler(devices []resource.Device) (Labeler, error) {
return labels, nil
}

// newImexDomainLabeler builds the cluster UUID, clique ID and IMEX domain
// labels for this node.
//
// configFile is the path to the nvidia-imex nodes config file, which is
// expected to contain one IP address per line. Blank lines are ignored. The
// addresses are sorted before being hashed, so the resulting domain identifier
// does not depend on the order of the lines in the file.
//
// device is the list of devices that are queried for their cluster UUID and
// clique ID; all of them must report the same values.
//
// A nil Labeler and a nil error are returned when no labels should be
// generated: configFile is empty, the file cannot be opened, the devices
// disagree on the cluster UUID or clique ID, or no cluster UUID / clique ID is
// reported at all. An error is returned when the file contains an invalid IP
// address, cannot be read, or a device query fails.
func newImexDomainLabeler(configFile string, device []resource.Device) (Labeler, error) {
	if configFile == "" {
		return nil, nil
	}

	// A config file that cannot be opened is treated as "no IMEX domain on this
	// node" rather than as a fatal error.
	imexConfig, err := os.Open(configFile)
	if err != nil {
		klog.Warningf("failed to open imex config file: %v", err)
		return nil, nil
	}
	defer imexConfig.Close()

	// Read the file line by line, collecting one IP address per non-empty line.
	var ips []string
	scanner := bufio.NewScanner(imexConfig)
	for scanner.Scan() {
		ip := strings.TrimSpace(scanner.Text())
		if ip == "" {
			// Blank (or whitespace-only) lines carry no address; skipping them
			// avoids rejecting an otherwise valid file.
			continue
		}
		if net.ParseIP(ip) == nil {
			return nil, fmt.Errorf("invalid IP address in imex config file: %s", ip)
		}
		ips = append(ips, ip)
	}
	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("failed to read imex config file: %w", err)
	}

	// Sort so that the hash only depends on the set of addresses, then join
	// them into the single string that seeds the domain identifier.
	sort.Strings(ips)
	hashedconfig := generateUUIDs(strings.Join(ips, "\n"))[0]

	var commonClusterUUID string
	var commonCliqueID string
	for _, d := range device {
		clusterUUID, err := d.GetClusterUUID()
		if err != nil {
			return nil, fmt.Errorf("error getting cluster UUID: %w", err)
		}
		if commonClusterUUID == "" {
			commonClusterUUID = clusterUUID
		}
		if commonClusterUUID != clusterUUID {
			klog.Warningf("Cluster UUIDs are different: %s != %s", commonClusterUUID, clusterUUID)
			return nil, nil
		}

		cliqueID, err := d.GetCliqueIP()
		if err != nil {
			return nil, fmt.Errorf("error getting clique ID: %w", err)
		}
		if commonCliqueID == "" {
			commonCliqueID = cliqueID
		}
		if commonCliqueID != cliqueID {
			klog.Warningf("Clique IDs are different: %s != %s", commonCliqueID, cliqueID)
			return nil, nil
		}
	}

	// Without both IDs there is no IMEX domain to describe. Emitting the labels
	// anyway would produce an imex-domain value ending in "-", which is not a
	// valid Kubernetes label value.
	if commonClusterUUID == "" || commonCliqueID == "" {
		klog.Warningf("no cluster UUID or clique ID reported by the devices; skipping imex domain labels")
		return nil, nil
	}

	labels := Labels{
		"nvidia.com/gpu.clusteruuid": commonClusterUUID,
		"nvidia.com/gpu.cliqueid":    commonCliqueID,
		"nvidia.com/gpu.imex-domain": hashedconfig + "-" + commonCliqueID,
	}

	return labels, nil
}

func getModeForClasses(classes []uint32) string {
if len(classes) == 0 {
return "unknown"
Expand Down Expand Up @@ -254,3 +339,23 @@ func getDeviceClasses(devices []resource.Device) ([]uint32, error) {
}
return classes, nil
}

func generateUUIDs(seed string) []string {
rand := rand.New(rand.NewSource(hash(seed)))

Check failure on line 344 in internal/lm/nvml.go

View workflow job for this annotation

GitHub Actions / check

G404: Use of weak random number generator (math/rand or math/rand/v2 instead of crypto/rand) (gosec)

uuids := make([]string, 1)
charset := make([]byte, 16)
rand.Read(charset)
uuid, _ := uuid.FromBytes(charset)
uuids[0] = uuid.String()

return uuids
}

// hash folds the runes of s into a single int64, starting from 0 and applying
// acc = acc*31 + rune for each rune in order. The arithmetic wraps around on
// int64 overflow. An empty string hashes to 0.
func hash(s string) int64 {
	var acc int64
	for _, r := range s {
		acc = acc*31 + int64(r)
	}
	return acc
}
8 changes: 8 additions & 0 deletions internal/resource/cuda-device.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,3 +100,11 @@ func (d *cudaDevice) IsMigEnabled() (bool, error) {
func (d *cudaDevice) GetPCIClass() (uint32, error) {
return 0, nil
}

// GetClusterUUID is a stub for cudaDevice: it always returns an empty cluster
// UUID and a nil error.
func (d *cudaDevice) GetClusterUUID() (string, error) {
	const noClusterUUID = ""
	return noClusterUUID, nil
}

// GetCliqueIP is a stub for cudaDevice: it always returns an empty clique
// identifier and a nil error.
func (d *cudaDevice) GetCliqueIP() (string, error) {
	const noCliqueID = ""
	return noCliqueID, nil
}
74 changes: 74 additions & 0 deletions internal/resource/device_mock.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 97ea6a6

Please sign in to comment.