Migrate from ClusterPolicy to NVIDIADriver owned driver daemonsets #732

Draft · wants to merge 2 commits into main
167 changes: 167 additions & 0 deletions controllers/nvidiadriver_controller.go
@@ -20,12 +20,14 @@
"context"
"fmt"
"maps"
"os"
"time"

appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/util/workqueue"
@@ -57,6 +59,7 @@
stateManager state.Manager
nodeSelectorValidator validator.Validator
conditionUpdater conditions.Updater
operatorNamespace string
}

//+kubebuilder:rbac:groups=nvidia.com,resources=nvidiadrivers,verbs=get;list;watch;create;update;patch;delete
@@ -124,6 +127,8 @@
}
clusterPolicyInstance := clusterPolicyList.Items[0]

r.operatorNamespace = os.Getenv("OPERATOR_NAMESPACE")

// Create a new InfoCatalog which is a generic interface for passing information to state managers
infoCatalog := state.NewInfoCatalog()

@@ -168,6 +173,16 @@
return reconcile.Result{}, nil
}

clusterpolicyDriverLabels, err := getClusterpolicyDriverLabels(r.ClusterInfo, clusterPolicyInstance)
if err != nil {
return reconcile.Result{}, fmt.Errorf("failed to get clusterpolicy driver labels: %w", err)
}

err = updateNodesManagedByDriver(ctx, r, instance, clusterpolicyDriverLabels)
if err != nil {
return reconcile.Result{}, fmt.Errorf("failed to update nodes managed by driver: %w", err)
}

// Sync state and update status
managerStatus := r.stateManager.SyncState(ctx, instance, infoCatalog)

@@ -191,6 +206,9 @@
}
}
// if no errors are reported from any state, then we would be waiting on driver daemonset pods
// TODO: Avoid marking 'default' NVIDIADriver instances as NotReady if DesiredNumberScheduled == 0.
// This will avoid unnecessary reconciliations when the 'default' instance is overridden on all
// GPU nodes, and thus, DesiredNumberScheduled being 0 is valid.
if errorInfo == nil {
condErr = r.conditionUpdater.SetConditionsError(ctx, instance, conditions.DriverNotReady, "Waiting for driver pod to be ready")
if condErr != nil {
@@ -404,5 +422,154 @@
return fmt.Errorf("failed to add index key: %w", err)
}

if err := mgr.GetFieldIndexer().IndexField(ctx, &corev1.Pod{}, "spec.nodeName", func(rawObj client.Object) []string {
pod := rawObj.(*corev1.Pod)
return []string{pod.Spec.NodeName}
}); err != nil {
return err
}

return nil
}

func updateNodesManagedByDriver(ctx context.Context, r *NVIDIADriverReconciler, instance *nvidiav1alpha1.NVIDIADriver, clusterpolicyDriverLabels map[string]string) error {
nodes, err := getNVIDIADriverSelectedNodes(ctx, r.Client, instance)
if err != nil {
return fmt.Errorf("failed to get selected nodes for NVIDIADriver CR: %w", err)
}

// A map (used as a set) tracking which node objects need to be updated,
// e.g. which need updated labels / annotations applied.
nodesToUpdate := map[*corev1.Node]struct{}{}

for _, node := range nodes.Items {
node := node // shadow the loop variable so &node below is unique per iteration
labels := node.GetLabels()
annotations := node.GetAnnotations()

managedBy, exists := labels["nvidia.com/gpu.driver.managed-by"]
if !exists {
// if 'managed-by' label does not exist, label node with cr.Name
labels["nvidia.com/gpu.driver.managed-by"] = instance.Name
node.SetLabels(labels)
nodesToUpdate[&node] = struct{}{}
// If there is an orphan driver pod running on the node,
// indicate to the upgrade controller that an upgrade is required.
// This will occur when we are migrating from a ClusterPolicy-owned
// daemonset to an NVIDIADriver-owned daemonset.
podList := &corev1.PodList{}
err = r.Client.List(ctx, podList,
client.InNamespace(r.operatorNamespace),
client.MatchingLabels(clusterpolicyDriverLabels),
client.MatchingFields{"spec.nodeName": node.Name})
if err != nil {
return fmt.Errorf("failed to list driver pods: %w", err)
}
if len(podList.Items) == 0 {
continue
}
pod := podList.Items[0]
if len(pod.OwnerReferences) == 0 {
annotations["nvidia.com/gpu-driver-upgrade-requested"] = "true"
node.SetAnnotations(annotations)
}
continue
}

// do nothing if node is already being managed by this CR
if managedBy == instance.Name {
continue
}

// If the node is 'managed-by' another CR, there are two scenarios:
// 1) There is no driver pod running on the node, so the label is stale.
// 2) There is a driver pod running on the node, and it is not an orphan. There are
// two possible actions:
// a) If the NVIDIADriver instance we are currently reconciling is the 'default'
// instance (its node selector is empty), do nothing. All other NVIDIADriver
// instances take precedence.
// b) The running pod no longer falls into the node pool of the CR currently
// managing it, so the NVIDIADriver instance we are reconciling should take
// ownership of the node.
podList := &corev1.PodList{}
err = r.Client.List(ctx, podList,
client.InNamespace(r.operatorNamespace),
client.MatchingLabels(map[string]string{AppComponentLabelKey: AppComponentLabelValue}),
client.MatchingFields{"spec.nodeName": node.Name})
if err != nil {
return fmt.Errorf("failed to list driver pods: %w", err)
}
if len(podList.Items) == 0 {
labels["nvidia.com/gpu.driver.managed-by"] = instance.Name
node.SetLabels(labels)
nodesToUpdate[&node] = struct{}{}
continue
}
if len(instance.Spec.NodeSelector) == 0 {
// If the nodeSelector for the NVIDIADriver instance is empty, then we
// treat it as the 'default' CR which has the lowest precedence. Allow
// the existing driver pod, owned by another NVIDIADriver CR, to continue
// to run.
continue
}
pod := podList.Items[0]
if len(pod.OwnerReferences) > 0 {
err := r.Client.Patch(ctx, &pod, client.RawPatch(types.MergePatchType, []byte(`{"metadata":{"labels":{"app": null}}}`)))
if err != nil {
return fmt.Errorf("failed to patch pod in order to make it an orphan: %w", err)
}
}
labels["nvidia.com/gpu.driver.managed-by"] = instance.Name
annotations["nvidia.com/gpu-driver-upgrade-requested"] = "true"
node.SetLabels(labels)
node.SetAnnotations(annotations)
nodesToUpdate[&node] = struct{}{}
}

// Apply updated labels / annotations on node objects
for node := range nodesToUpdate {
err = r.Client.Update(ctx, node)
if err != nil {
return fmt.Errorf("failed to update node %s: %w", node.Name, err)
}
}

return nil
}

// getNVIDIADriverSelectedNodes returns the nodes selected by the nodeSelector labels of a given NVIDIADriver instance
func getNVIDIADriverSelectedNodes(ctx context.Context, k8sClient client.Client, cr *nvidiav1alpha1.NVIDIADriver) (*corev1.NodeList, error) {
nodeList := &corev1.NodeList{}

if cr.Spec.NodeSelector == nil {
cr.Spec.NodeSelector = cr.GetNodeSelector()
}

selector := labels.Set(cr.Spec.NodeSelector).AsSelector()

opts := []client.ListOption{
client.MatchingLabelsSelector{Selector: selector},
}
err := k8sClient.List(ctx, nodeList, opts...)

return nodeList, err
}

// getClusterpolicyDriverLabels returns a set of labels that can be used to identify running driver pods owned by ClusterPolicy
func getClusterpolicyDriverLabels(clusterInfo clusterinfo.Interface, clusterpolicy gpuv1.ClusterPolicy) (map[string]string, error) {
// initialize with common app=nvidia-driver-daemonset label
driverLabelKey := DriverLabelKey
driverLabelValue := DriverLabelValue

ocpVer, err := clusterInfo.GetOpenshiftVersion()
if err != nil {
return nil, fmt.Errorf("failed to get the OpenShift version: %w", err)
}
if ocpVer != "" && clusterpolicy.Spec.Operator.UseOpenShiftDriverToolkit != nil && *clusterpolicy.Spec.Operator.UseOpenShiftDriverToolkit == true {

Check failure on line 567 in controllers/nvidiadriver_controller.go

View workflow job for this annotation

GitHub Actions / go-check

S1002: should omit comparison to bool constant, can be simplified to `*clusterpolicy.Spec.Operator.UseOpenShiftDriverToolkit` (gosimple)
// For OCP, when DTK is enabled the app=nvidia-driver-daemonset label is not
// constant and changes based on the RHCOS version. Hence, use the DTK label instead.
driverLabelKey = ocpDriverToolkitIdentificationLabel
driverLabelValue = ocpDriverToolkitIdentificationValue
}

return map[string]string{driverLabelKey: driverLabelValue}, nil
}
4 changes: 2 additions & 2 deletions controllers/object_controls.go
@@ -3745,7 +3745,7 @@ func ocpHasDriverToolkitImageStream(n *ClusterPolicyController) (bool, error) {
return true, nil
}

func (n ClusterPolicyController) cleanupAllDriverDaemonSets(ctx context.Context) error {
func (n ClusterPolicyController) cleanupAllDriverDaemonSets(ctx context.Context, deleteOptions *client.DeleteOptions) error {
// Get all DaemonSets owned by ClusterPolicy
//
// (cdesiniotis) There is a limitation with the controller-runtime client where only a single field selector
@@ -3762,7 +3762,7 @@ func (n ClusterPolicyController) cleanupAllDriverDaemonSets(ctx context.Context)
// filter out DaemonSets which are not the NVIDIA driver/vgpu-manager
if strings.HasPrefix(ds.Name, commonDriverDaemonsetName) || strings.HasPrefix(ds.Name, commonVGPUManagerDaemonsetName) {
n.logger.Info("Deleting NVIDIA driver daemonset owned by ClusterPolicy", "Name", ds.Name)
err = n.client.Delete(ctx, &ds)
err = n.client.Delete(ctx, &ds, deleteOptions)
if err != nil {
return fmt.Errorf("error deleting NVIDIA driver daemonset: %w", err)
}
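
For reference, threading the options through like this works because client.DeleteOptions itself implements controller-runtime's client.DeleteOption interface, so a caller-built options struct can be passed straight into Delete. A short sketch of the call shape, assuming a generic client and DaemonSet (names illustrative):

```go
package example

import (
	"context"
	"fmt"

	appsv1 "k8s.io/api/apps/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// deleteDaemonSetOrphaningPods deletes a DaemonSet while leaving its pods
// running: the Orphan propagation policy tells the API server's garbage
// collector not to cascade the delete to dependent objects.
func deleteDaemonSetOrphaningPods(ctx context.Context, c client.Client, ds *appsv1.DaemonSet) error {
	orphan := metav1.DeletePropagationOrphan
	opts := &client.DeleteOptions{PropagationPolicy: &orphan}
	if err := c.Delete(ctx, ds, opts); err != nil {
		return fmt.Errorf("error deleting daemonset %s: %w", ds.Name, err)
	}
	return nil
}
```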
8 changes: 6 additions & 2 deletions controllers/state_manager.go
@@ -952,8 +952,12 @@ func (n *ClusterPolicyController) step() (gpuv1.State, error) {
n.singleton.Spec.Driver.UseNvdiaDriverCRDType() {
n.logger.Info("NVIDIADriver CRD is enabled, cleaning up all NVIDIA driver daemonsets owned by ClusterPolicy")
n.idx++
// Cleanup all driver daemonsets owned by ClusterPolicy.
err := n.cleanupAllDriverDaemonSets(n.ctx)
// Clean up all driver daemonsets owned by ClusterPolicy, but orphan the dependent pod objects.
// This way, switching to the new NVIDIADriver API does not cause a cluster-wide disruption.
// NVIDIA driver pods owned by ClusterPolicy daemonsets will remain running until the NVIDIADriver
// controller migrates these pods to new ones owned by NVIDIADriver daemonsets.
deletePropagationOrphan := metav1.DeletePropagationOrphan
err := n.cleanupAllDriverDaemonSets(n.ctx, &client.DeleteOptions{PropagationPolicy: &deletePropagationOrphan})
if err != nil {
return gpuv1.NotReady, fmt.Errorf("failed to cleanup all NVIDIA driver daemonsets owned by ClusterPolicy: %w", err)
}
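
The orphan delete above is also what the new NVIDIADriver controller logic keys off: when a daemonset is deleted with the Orphan policy, the garbage collector strips the daemonset ownerReference from the surviving pods, so a driver pod with an empty OwnerReferences slice is exactly a pod left behind by this migration. A small sketch of that detection (helper name is illustrative):

```go
package example

import corev1 "k8s.io/api/core/v1"

// nodesWithOrphanedDriverPods returns the names of nodes still running a
// driver pod with no owner, i.e. a pod left behind by the orphan delete of
// its ClusterPolicy-owned daemonset. The NVIDIADriver controller annotates
// such nodes with nvidia.com/gpu-driver-upgrade-requested=true.
func nodesWithOrphanedDriverPods(pods *corev1.PodList) []string {
	var nodes []string
	for _, pod := range pods.Items {
		if len(pod.OwnerReferences) == 0 {
			nodes = append(nodes, pod.Spec.NodeName)
		}
	}
	return nodes
}
```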
1 change: 1 addition & 0 deletions internal/state/driver.go
@@ -548,6 +548,7 @@ func getDriverSpec(cr *nvidiav1alpha1.NVIDIADriver, nodePool nodePool) (*driverS

return &driverSpec{
Spec: spec,
CRName: cr.Name,
AppName: nvidiaDriverAppName,
Name: nvidiaDriverName,
ImagePath: imagePath,
1 change: 1 addition & 0 deletions internal/state/driver_test.go
@@ -624,6 +624,7 @@ func getMinimalDriverRenderData() *driverRenderData {
ReadinessProbe: getDefaultContainerProbeSpec(),
DriverType: nvidiav1alpha1.GPU,
},
CRName: "test-cr",
AppName: "nvidia-gpu-driver-ubuntu22.04-7c6d7bd86b",
Name: "nvidia-gpu-driver-ubuntu22.04",
ImagePath: "nvcr.io/nvidia/driver:525.85.03-ubuntu22.04",
@@ -240,6 +240,7 @@ spec:
name: run-mellanox-drivers
nodeSelector:
nvidia.com/gpu.deploy.driver: "true"
nvidia.com/gpu.driver.managed-by: test-cr
priorityClassName: system-node-critical
serviceAccountName: nvidia-gpu-driver-ubuntu22.04
tolerations:
3 changes: 1 addition & 2 deletions internal/state/testdata/golden/driver-full-spec.yaml
@@ -254,9 +254,8 @@ spec:
mountPropagation: HostToContainer
name: run-mellanox-drivers
nodeSelector:
example.com/bar: bar
example.com/foo: foo
nvidia.com/gpu.deploy.driver: "true"
nvidia.com/gpu.driver.managed-by: test-cr
priorityClassName: custom-priority-class-name
serviceAccountName: nvidia-gpu-driver-ubuntu22.04
tolerations:
@@ -407,6 +407,7 @@ spec:
nodeSelector:
feature.node.kubernetes.io/system-os_release.OSTREE_VERSION: 413.92.202304252344-0
nvidia.com/gpu.deploy.driver: "true"
nvidia.com/gpu.driver.managed-by: test-cr
priorityClassName: system-node-critical
serviceAccountName: nvidia-gpu-driver-openshift
tolerations:
1 change: 1 addition & 0 deletions internal/state/testdata/golden/driver-gdrcopy.yaml
@@ -296,6 +296,7 @@ spec:
name: run-mellanox-drivers
nodeSelector:
nvidia.com/gpu.deploy.driver: "true"
nvidia.com/gpu.driver.managed-by: test-cr
priorityClassName: system-node-critical
serviceAccountName: nvidia-gpu-driver-ubuntu22.04
tolerations:
1 change: 1 addition & 0 deletions internal/state/testdata/golden/driver-gds.yaml
@@ -296,6 +296,7 @@ spec:
name: run-mellanox-drivers
nodeSelector:
nvidia.com/gpu.deploy.driver: "true"
nvidia.com/gpu.driver.managed-by: test-cr
priorityClassName: system-node-critical
serviceAccountName: nvidia-gpu-driver-ubuntu22.04
tolerations:
1 change: 1 addition & 0 deletions internal/state/testdata/golden/driver-minimal.yaml
@@ -231,6 +231,7 @@ spec:
name: run-mellanox-drivers
nodeSelector:
nvidia.com/gpu.deploy.driver: "true"
nvidia.com/gpu.driver.managed-by: test-cr
priorityClassName: system-node-critical
serviceAccountName: nvidia-gpu-driver-ubuntu22.04
tolerations:
@@ -341,6 +341,7 @@ spec:
nodeSelector:
feature.node.kubernetes.io/system-os_release.OSTREE_VERSION: 413.92.202304252344-0
nvidia.com/gpu.deploy.driver: "true"
nvidia.com/gpu.driver.managed-by: test-cr
priorityClassName: system-node-critical
serviceAccountName: nvidia-gpu-driver-openshift
tolerations:
1 change: 1 addition & 0 deletions internal/state/testdata/golden/driver-precompiled.yaml
@@ -234,6 +234,7 @@ spec:
nodeSelector:
feature.node.kubernetes.io/kernel-version.full: 5.4.0-150-generic
nvidia.com/gpu.deploy.driver: "true"
nvidia.com/gpu.driver.managed-by: test-cr
priorityClassName: system-node-critical
serviceAccountName: nvidia-gpu-driver-ubuntu22.04
tolerations:
@@ -307,6 +307,7 @@ spec:
name: run-mellanox-drivers
nodeSelector:
nvidia.com/gpu.deploy.driver: "true"
nvidia.com/gpu.driver.managed-by: test-cr
priorityClassName: system-node-critical
serviceAccountName: nvidia-gpu-driver-ubuntu22.04
tolerations:
1 change: 1 addition & 0 deletions internal/state/testdata/golden/driver-rdma.yaml
@@ -301,6 +301,7 @@ spec:
name: run-mellanox-drivers
nodeSelector:
nvidia.com/gpu.deploy.driver: "true"
nvidia.com/gpu.driver.managed-by: test-cr
priorityClassName: system-node-critical
serviceAccountName: nvidia-gpu-driver-ubuntu22.04
tolerations:
@@ -302,6 +302,7 @@ spec:
nodeSelector:
feature.node.kubernetes.io/system-os_release.OSTREE_VERSION: 413.92.202304252344-0
nvidia.com/gpu.deploy.vgpu-manager: "true"
nvidia.com/gpu.driver.managed-by: test-cr
priorityClassName: system-node-critical
serviceAccountName: nvidia-vgpu-manager-openshift
tolerations:
@@ -217,6 +217,7 @@ spec:
name: run-mellanox-drivers
nodeSelector:
nvidia.com/gpu.deploy.vgpu-manager: "true"
nvidia.com/gpu.driver.managed-by: test-cr
priorityClassName: system-node-critical
serviceAccountName: nvidia-vgpu-manager-ubuntu22.04
tolerations:
@@ -237,6 +237,7 @@ spec:
name: run-mellanox-drivers
nodeSelector:
nvidia.com/gpu.deploy.driver: "true"
nvidia.com/gpu.driver.managed-by: test-cr
priorityClassName: system-node-critical
serviceAccountName: nvidia-gpu-driver-ubuntu22.04
tolerations:
1 change: 1 addition & 0 deletions internal/state/types.go
@@ -30,6 +30,7 @@ type SyncingSource source.SyncingSource
// which is to be populated with the fully-qualified image path.
type driverSpec struct {
Spec *nvidiav1alpha1.NVIDIADriverSpec
CRName string
AppName string
Name string
ImagePath string
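
The new CRName field is what threads the CR's name into the rendered manifests: every golden daemonset above now carries a nvidia.com/gpu.driver.managed-by: test-cr nodeSelector, matching the node label set by updateNodesManagedByDriver, so driver pods only schedule onto nodes their CR owns. A hypothetical sketch of that rendering step — the real operator stamps this via its manifest templates, not a helper like this:

```go
package example

import appsv1 "k8s.io/api/apps/v1"

// applyManagedBySelector stamps the managed-by nodeSelector onto a rendered
// daemonset so its pods land only on nodes labeled for this NVIDIADriver CR.
// Illustrative only; shown to make the CRName -> nodeSelector flow concrete.
func applyManagedBySelector(ds *appsv1.DaemonSet, crName string) {
	podSpec := &ds.Spec.Template.Spec
	if podSpec.NodeSelector == nil {
		podSpec.NodeSelector = map[string]string{}
	}
	podSpec.NodeSelector["nvidia.com/gpu.driver.managed-by"] = crName
}
```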