diff --git a/controllers/nvidiadriver_controller.go b/controllers/nvidiadriver_controller.go
index 829dcff6b..6b862e39d 100644
--- a/controllers/nvidiadriver_controller.go
+++ b/controllers/nvidiadriver_controller.go
@@ -26,6 +26,7 @@ import (
 	corev1 "k8s.io/api/core/v1"
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/labels"
 	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/client-go/util/workqueue"
@@ -168,6 +169,10 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
 		return reconcile.Result{}, nil
 	}
 
+	err = updateNodesManagedByDriver(ctx, r, instance)
+	if err != nil {
+		return reconcile.Result{}, fmt.Errorf("failed to update nodes managed by driver: %w", err)
+	}
 	// Sync state and update status
 	managerStatus := r.stateManager.SyncState(ctx, instance, infoCatalog)
 
@@ -404,5 +409,133 @@ func (r *NVIDIADriverReconciler) SetupWithManager(ctx context.Context, mgr ctrl.
 		return fmt.Errorf("failed to add index key: %w", err)
 	}
 
+	// Index pods by the name of the node they are scheduled on, so that pods
+	// can be listed with a "spec.nodeName" field selector.
+	if err := mgr.GetFieldIndexer().IndexField(ctx, &corev1.Pod{}, "spec.nodeName", func(rawObj client.Object) []string {
+		pod := rawObj.(*corev1.Pod)
+		return []string{pod.Spec.NodeName}
+	}); err != nil {
+		return fmt.Errorf("failed to add index key for pod node name: %w", err)
+	}
+
+	return nil
+}
+
+// updateNodesManagedByDriver labels every node selected by the given NVIDIADriver
+// instance that does not yet carry the 'managed-by' label with the name of the
+// instance. If a driver pod without owner references (an orphan) is running on
+// such a node, the node is also annotated to request a driver upgrade. Nodes that
+// already carry the 'managed-by' label are left untouched.
+//
+// It returns an error if listing nodes or driver pods fails, if more than one
+// driver pod is found on a node, or if updating a node fails.
+func updateNodesManagedByDriver(ctx context.Context, r *NVIDIADriverReconciler, instance *nvidiav1alpha1.NVIDIADriver) error {
+	const (
+		managedByLabelKey          = "nvidia.com/gpu.driver.managed-by"
+		upgradeRequestedAnnotation = "nvidia.com/gpu-driver-upgrade-requested"
+		// NOTE(review): the namespace searched for driver pods is hard-coded;
+		// confirm whether the operator's own namespace should be used instead.
+		driverPodNamespace = "gpu-operator"
+	)
+
+	nodes, err := getNVIDIADriverSelectedNodes(ctx, r.Client, instance)
+	if err != nil {
+		return fmt.Errorf("failed to get selected nodes for NVIDIADriver CR: %w", err)
+	}
+
+	// Node objects whose updated labels / annotations need to be applied.
+	var nodesToUpdate []*corev1.Node
+
+	for i := range nodes.Items {
+		// Point at the list element itself: the address of a range variable
+		// refers to one shared variable on Go versions before 1.22.
+		node := &nodes.Items[i]
+
+		if _, exists := node.Labels[managedByLabelKey]; exists {
+			// The node is already managed, by this CR or by another one.
+			continue
+		}
+
+		// The 'managed-by' label does not exist, label the node with the CR name.
+		// The label map is nil when the node carries no labels at all.
+		if node.Labels == nil {
+			node.Labels = map[string]string{}
+		}
+		node.Labels[managedByLabelKey] = instance.Name
+		nodesToUpdate = append(nodesToUpdate, node)
+
+		// If there is an orphan driver pod running on the node, indicate to the
+		// upgrade controller that an upgrade is required.
+		podList := &corev1.PodList{}
+		err = r.Client.List(ctx, podList,
+			client.InNamespace(driverPodNamespace),
+			client.MatchingLabels(map[string]string{DriverLabelKey: DriverLabelValue}),
+			client.MatchingFields{"spec.nodeName": node.Name})
+		if err != nil {
+			return fmt.Errorf("failed to list driver pods: %w", err)
+		}
+		if len(podList.Items) == 0 {
+			continue
+		}
+		if len(podList.Items) != 1 {
+			return fmt.Errorf("there are multiple driver pods running on node %s", node.Name)
+		}
+		if len(podList.Items[0].OwnerReferences) == 0 {
+			// The annotation map is nil when the node carries no annotations.
+			if node.Annotations == nil {
+				node.Annotations = map[string]string{}
+			}
+			node.Annotations[upgradeRequestedAnnotation] = "true"
+		}
+	}
+
+	// Apply updated labels / annotations on node objects
+	for _, node := range nodesToUpdate {
+		err = r.Client.Update(ctx, node)
+		if err != nil {
+			return fmt.Errorf("failed to update node %s: %w", node.Name, err)
+		}
+	}
+
 	return nil
 }
+
+// getNVIDIADriverSelectedNodes returns the nodes matching the node selector labels
+// set for a given NVIDIADriver instance. When the spec carries no node selector,
+// the selector returned by cr.GetNodeSelector() is used instead. The CR itself is
+// not modified. On error the returned list is nil.
+func getNVIDIADriverSelectedNodes(ctx context.Context, k8sClient client.Client, cr *nvidiav1alpha1.NVIDIADriver) (*corev1.NodeList, error) {
+	nodeSelector := cr.Spec.NodeSelector
+	if nodeSelector == nil {
+		nodeSelector = cr.GetNodeSelector()
+	}
+
+	nodeList := &corev1.NodeList{}
+	selector := labels.Set(nodeSelector).AsSelector()
+	if err := k8sClient.List(ctx, nodeList, client.MatchingLabelsSelector{Selector: selector}); err != nil {
+		return nil, err
+	}
+
+	return nodeList, nil
+}
+
+/*
+func getDriverPodLabelSelector(clusterPolicy gpuv1.ClusterPolicy) map[string]string {
+	// initialize with common app=nvidia-driver-daemonset label
+	driverLabelKey := DriverLabelKey
+	driverLabelValue := DriverLabelValue
+
+	if clusterPolicy.Spec.Driver.UseNvdiaDriverCRDType() {
+		// app component label is added for all new driver daemonsets deployed by NVIDIADriver controller
+		driverLabelKey = AppComponentLabelKey
+		driverLabelValue = AppComponentLabelValue
+	} else if clusterPolicyCtrl.openshift != "" && clusterPolicyCtrl.ocpDriverToolkit.enabled {
+		// For OCP, when DTK is enabled app=nvidia-driver-daemonset label is not constant and changes
+		// based on rhcos version. Hence use DTK label instead
+		driverLabelKey = ocpDriverToolkitIdentificationLabel
+		driverLabelValue = ocpDriverToolkitIdentificationValue
+	}
+
+	return map[string]string{driverLabelKey: driverLabelValue}
+}
+*/