wip: handle driver migrations in nvidiadriver controller
Signed-off-by: Christopher Desiniotis <[email protected]>
cdesiniotis committed Aug 9, 2024
1 parent 81640fd commit 8db1ba7
Showing 1 changed file with 111 additions and 0 deletions.
111 changes: 111 additions & 0 deletions controllers/nvidiadriver_controller.go
@@ -26,6 +26,7 @@ import (
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/util/workqueue"
@@ -168,6 +169,10 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
return reconcile.Result{}, nil
}

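// reconcile the 'managed-by' label on nodes selected by this instance before syncing state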
err = updateNodesManagedByDriver(ctx, r, instance)
if err != nil {
return reconcile.Result{}, fmt.Errorf("failed to update nodes managed by driver: %w", err)
}
// Sync state and update status
managerStatus := r.stateManager.SyncState(ctx, instance, infoCatalog)

@@ -404,5 +409,111 @@ func (r *NVIDIADriverReconciler) SetupWithManager(ctx context.Context, mgr ctrl.
return fmt.Errorf("failed to add index key: %w", err)
}

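// index pods by spec.nodeName so driver pods running on a particular node can be listed via client.MatchingFields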
if err := mgr.GetFieldIndexer().IndexField(ctx, &corev1.Pod{}, "spec.nodeName", func(rawObj client.Object) []string {
pod := rawObj.(*corev1.Pod)
return []string{pod.Spec.NodeName}
}); err != nil {
return err
}

return nil
}

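// updateNodesManagedByDriver labels nodes selected by the given NVIDIADriver instance
// with the 'nvidia.com/gpu.driver.managed-by' label if they are not already managed by
// a driver. If an orphaned driver pod is running on such a node, the
// 'nvidia.com/gpu-driver-upgrade-requested' annotation is set to signal the upgrade
// controller that a driver upgrade is required.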
func updateNodesManagedByDriver(ctx context.Context, r *NVIDIADriverReconciler, instance *nvidiav1alpha1.NVIDIADriver) error {
nodes, err := getNVIDIADriverSelectedNodes(ctx, r.Client, instance)
if err != nil {
return fmt.Errorf("failed to get selected nodes for NVIDIADriver CR: %w", err)
}

// A set tracking which node objects need to be updated, i.e. which nodes have
// label / annotation changes that still need to be applied.
nodesToUpdate := map[*corev1.Node]struct{}{}

for i := range nodes.Items {
node := &nodes.Items[i]
labels := node.GetLabels()
annotations := node.GetAnnotations()

managedBy, exists := labels["nvidia.com/gpu.driver.managed-by"]
if !exists {
// if the 'managed-by' label does not exist, label the node with instance.Name
labels["nvidia.com/gpu.driver.managed-by"] = instance.Name
nodesToUpdate[node] = struct{}{}
// if there is an orphan driver pod running on the node,
// indicate to the upgrade controller that an upgrade is required
podList := &corev1.PodList{}
err = r.Client.List(ctx, podList,
client.InNamespace("gpu-operator"),
client.MatchingLabels(map[string]string{DriverLabelKey: DriverLabelValue}),
client.MatchingFields{"spec.nodeName": node.Name})
if err != nil {
return fmt.Errorf("failed to list driver pods: %w", err)
}
if len(podList.Items) == 0 {
continue
}
if len(podList.Items) != 1 {
return fmt.Errorf("there are multiple driver pods running on node %s", node.Name)
}
pod := podList.Items[0]
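// an orphaned driver pod has no owner references; request an upgrade as described above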
if len(pod.OwnerReferences) == 0 {
annotations["nvidia.com/gpu-driver-upgrade-requested"] = "true"
}
continue
}

// do nothing if node is already being managed by this CR
if managedBy == instance.Name {
continue
}
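// TODO (wip): handle migrating a node that is currently managed by a different NVIDIADriver instance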

}

// Apply updated labels / annotations on node objects
for node := range nodesToUpdate {
err = r.Client.Update(ctx, node)
if err != nil {
return fmt.Errorf("failed to update node %s: %w", node.Name, err)
}
}

return nil
}

// getNVIDIADriverSelectedNodes returns the nodes selected by the nodeSelector labels configured for a given NVIDIADriver instance
func getNVIDIADriverSelectedNodes(ctx context.Context, k8sClient client.Client, cr *nvidiav1alpha1.NVIDIADriver) (*corev1.NodeList, error) {
nodeList := &corev1.NodeList{}

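// if no node selector is set explicitly, fall back to the selector returned by GetNodeSelector()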
if cr.Spec.NodeSelector == nil {
cr.Spec.NodeSelector = cr.GetNodeSelector()
}

selector := labels.Set(cr.Spec.NodeSelector).AsSelector()

opts := []client.ListOption{
client.MatchingLabelsSelector{Selector: selector},
}
err := k8sClient.List(ctx, nodeList, opts...)

return nodeList, err
}

/*
func getDriverPodLabelSelector(clusterPolicy gpuv1.ClusterPolicy) map[string]string {
// initialize with common app=nvidia-driver-daemonset label
driverLabelKey := DriverLabelKey
driverLabelValue := DriverLabelValue
if clusterPolicy.Spec.Driver.UseNvdiaDriverCRDType() {
// app component label is added for all new driver daemonsets deployed by NVIDIADriver controller
driverLabelKey = AppComponentLabelKey
driverLabelValue = AppComponentLabelValue
} else if clusterPolicyCtrl.openshift != "" && clusterPolicyCtrl.ocpDriverToolkit.enabled {
// For OCP, when DTK is enabled app=nvidia-driver-daemonset label is not constant and changes
// based on rhcos version. Hence use DTK label instead
driverLabelKey = ocpDriverToolkitIdentificationLabel
driverLabelValue = ocpDriverToolkitIdentificationValue
}
return map[string]string{driverLabelKey: driverLabelValue}
}
*/
