diff --git a/api/hardwaremanagement/v1alpha1/conditions.go b/api/hardwaremanagement/v1alpha1/conditions.go
index d6a371e6..fb19f781 100644
--- a/api/hardwaremanagement/v1alpha1/conditions.go
+++ b/api/hardwaremanagement/v1alpha1/conditions.go
@@ -41,4 +41,5 @@ const (
 	Unprovisioned  ConditionReason = "Unprovisioned"
 	Failed         ConditionReason = "Failed"
 	NotInitialized ConditionReason = "NotInitialized"
+	TimedOut       ConditionReason = "TimedOut"
 )
diff --git a/api/v1alpha1/clusterrequest_types.go b/api/v1alpha1/clusterrequest_types.go
index 469b9230..a71013bf 100644
--- a/api/v1alpha1/clusterrequest_types.go
+++ b/api/v1alpha1/clusterrequest_types.go
@@ -66,6 +66,8 @@ type NodePoolRef struct {
 	Name string `json:"name,omitempty"`
 	// Contains the namespace of the created NodePool.
 	Namespace string `json:"namespace,omitempty"`
+	// Represents the timestamp of the first status check for hardware provisioning
+	HardwareProvisioningCheckStart metav1.Time `json:"hardwareProvisioningCheckStart,omitempty"`
 }
 
 type ClusterDetails struct {
diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go
index ae97c271..58e03621 100644
--- a/api/v1alpha1/zz_generated.deepcopy.go
+++ b/api/v1alpha1/zz_generated.deepcopy.go
@@ -152,7 +152,7 @@ func (in *ClusterRequestStatus) DeepCopyInto(out *ClusterRequestStatus) {
 	if in.NodePoolRef != nil {
 		in, out := &in.NodePoolRef, &out.NodePoolRef
 		*out = new(NodePoolRef)
-		**out = **in
+		(*in).DeepCopyInto(*out)
 	}
 	if in.Policies != nil {
 		in, out := &in.Policies, &out.Policies
@@ -460,6 +460,7 @@ func (in *MetadataServerConfig) DeepCopy() *MetadataServerConfig {
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *NodePoolRef) DeepCopyInto(out *NodePoolRef) {
 	*out = *in
+	in.HardwareProvisioningCheckStart.DeepCopyInto(&out.HardwareProvisioningCheckStart)
 }
 
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodePoolRef.
diff --git a/bundle/manifests/o2ims.oran.openshift.io_clusterrequests.yaml b/bundle/manifests/o2ims.oran.openshift.io_clusterrequests.yaml
index 4e7cfa81..a06c64e7 100644
--- a/bundle/manifests/o2ims.oran.openshift.io_clusterrequests.yaml
+++ b/bundle/manifests/o2ims.oran.openshift.io_clusterrequests.yaml
@@ -185,6 +185,11 @@ spec:
               nodePoolRef:
                 description: NodePoolRef references to the NodePool.
                 properties:
+                  hardwareProvisioningCheckStart:
+                    description: Represents the timestamp of the first status check
+                      for hardware provisioning
+                    format: date-time
+                    type: string
                   name:
                     description: Contains the name of the created NodePool.
                     type: string
diff --git a/config/crd/bases/o2ims.oran.openshift.io_clusterrequests.yaml b/config/crd/bases/o2ims.oran.openshift.io_clusterrequests.yaml
index 05905946..64d51909 100644
--- a/config/crd/bases/o2ims.oran.openshift.io_clusterrequests.yaml
+++ b/config/crd/bases/o2ims.oran.openshift.io_clusterrequests.yaml
@@ -185,6 +185,11 @@ spec:
               nodePoolRef:
                 description: NodePoolRef references to the NodePool.
                 properties:
+                  hardwareProvisioningCheckStart:
+                    description: Represents the timestamp of the first status check
+                      for hardware provisioning
+                    format: date-time
+                    type: string
                   name:
                     description: Contains the name of the created NodePool.
                     type: string
diff --git a/config/samples/v1alpha1_hardwaretemplate.yaml b/config/samples/v1alpha1_hardwaretemplate.yaml
new file mode 100644
index 00000000..b1cb650a
--- /dev/null
+++ b/config/samples/v1alpha1_hardwaretemplate.yaml
@@ -0,0 +1,13 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: dell-intel-cu-template-configmap-v1
+  namespace: oran-o2ims
+data:
+  hwMgrId: dell-hwmgr
+  bootInterfaceLabel: bootable-interface
+  node-pools-data: |
+    - name: master
+      hwProfile: profile-spr-single-processor-64G
+    - name: worker
+      hwProfile: profile-spr-dual-processor-128G
diff --git a/internal/controllers/clusterrequest_controller.go b/internal/controllers/clusterrequest_controller.go
index b4b51e7c..7de92e47 100644
--- a/internal/controllers/clusterrequest_controller.go
+++ b/internal/controllers/clusterrequest_controller.go
@@ -213,7 +213,15 @@ func (t *clusterRequestReconcilerTask) run(ctx context.Context) (ctrl.Result, er
 	}
 
 	// wait for the NodePool to be provisioned and update BMC details in ClusterInstance
-	if !t.waitForHardwareData(ctx, renderedClusterInstance, renderedNodePool) {
+	provisioned, timedOutOrFailed, err := t.waitForHardwareData(ctx, renderedClusterInstance, renderedNodePool)
+	if err != nil {
+		return requeueWithError(err)
+	}
+	if timedOutOrFailed {
+		// Timeout occurred or failed, stop requeuing
+		return doNotRequeue(), nil
+	}
+	if !provisioned {
 		t.logger.InfoContext(
 			ctx,
 			fmt.Sprintf(
@@ -231,6 +239,7 @@ func (t *clusterRequestReconcilerTask) run(ctx context.Context) (ctrl.Result, er
 	hwProvisionedCond := meta.FindStatusCondition(
 		t.object.Status.Conditions,
 		string(utils.CRconditionTypes.HardwareProvisioned))
+
 	if hwProvisionedCond != nil {
 		// TODO: check hwProvisionedCond.Status == metav1.ConditionTrue
 		// after hw plugin is ready
@@ -273,7 +282,10 @@ func (t *clusterRequestReconcilerTask) checkClusterDeployConfigState(ctx context
 	nodePool := &hwv1alpha1.NodePool{}
 	nodePool.SetName(t.object.Status.NodePoolRef.Name)
 	nodePool.SetNamespace(t.object.Status.NodePoolRef.Namespace)
-	t.checkNodePoolProvisionStatus(ctx, nodePool)
+	_, _, err := t.checkNodePoolProvisionStatus(ctx, nodePool)
+	if err != nil {
+		return requeueWithError(err)
+	}
 
 	hwProvisionedCond := meta.FindStatusCondition(
 		t.object.Status.Conditions,
@@ -1659,24 +1671,21 @@ func (r *ClusterRequestReconciler) handleFinalizer(
 
 // checkNodePoolProvisionStatus checks for the NodePool status to be in the provisioned state.
 func (t *clusterRequestReconcilerTask) checkNodePoolProvisionStatus(ctx context.Context,
-	nodePool *hwv1alpha1.NodePool) bool {
+	nodePool *hwv1alpha1.NodePool) (bool, bool, error) {
 
 	// Get the generated NodePool and its status.
 	exists, err := utils.DoesK8SResourceExist(ctx, t.client, nodePool.GetName(),
 		nodePool.GetNamespace(), nodePool)
 
-	if err != nil || !exists {
-		t.logger.ErrorContext(
-			ctx,
-			"Failed to get the NodePools",
-			slog.String("name", nodePool.GetName()),
-			slog.String("namespace", nodePool.GetNamespace()),
-		)
-		return false
+	if err != nil {
+		return false, false, fmt.Errorf("failed to get node pool; %w", err)
+	}
+	if !exists {
+		return false, false, fmt.Errorf("node pool does not exist")
 	}
 
 	// Update the Cluster Request Status with status from the NodePool object.
-	err = t.updateHardwareProvisioningStatus(ctx, nodePool)
+	provisioned, timedOutOrFailed, err := t.updateHardwareProvisioningStatus(ctx, nodePool)
 	if err != nil {
 		t.logger.ErrorContext(
 			ctx,
@@ -1684,20 +1693,8 @@ func (t *clusterRequestReconcilerTask) checkNodePoolProvisionStatus(ctx context.
 			slog.String("name", t.object.Name),
 		)
 	}
-	// Check if provisioning is completed
-	provisionedCondition := meta.FindStatusCondition(nodePool.Status.Conditions, string(hwv1alpha1.Provisioned))
-	if provisionedCondition != nil && provisionedCondition.Status == metav1.ConditionTrue {
-		t.logger.InfoContext(
-			ctx,
-			fmt.Sprintf(
-				"NodePool %s in the namespace %s is provisioned",
-				nodePool.GetName(),
-				nodePool.GetNamespace(),
-			),
-		)
-		return true
-	}
-	return false
+
+	return provisioned, timedOutOrFailed, err
 }
 
 // updateClusterInstance updates the given ClusterInstance object based on the provisioned nodePool.
@@ -1723,10 +1720,10 @@ func (t *clusterRequestReconcilerTask) updateClusterInstance(ctx context.Context
 // waitForHardwareData waits for the NodePool to be provisioned and update BMC details
 // and bootMacAddress in ClusterInstance.
 func (t *clusterRequestReconcilerTask) waitForHardwareData(ctx context.Context,
-	clusterInstance *siteconfig.ClusterInstance, nodePool *hwv1alpha1.NodePool) bool {
+	clusterInstance *siteconfig.ClusterInstance, nodePool *hwv1alpha1.NodePool) (bool, bool, error) {
 
-	provisioned := t.checkNodePoolProvisionStatus(ctx, nodePool)
-	if provisioned {
+	provisioned, timedOutOrFailed, err := t.checkNodePoolProvisionStatus(ctx, nodePool)
+	if provisioned && err == nil {
 		t.logger.InfoContext(
 			ctx,
 			fmt.Sprintf(
@@ -1735,9 +1732,11 @@ func (t *clusterRequestReconcilerTask) waitForHardwareData(ctx context.Context,
 				nodePool.GetNamespace(),
 			),
 		)
-		return t.updateClusterInstance(ctx, clusterInstance, nodePool)
+		if !t.updateClusterInstance(ctx, clusterInstance, nodePool) {
+			err = fmt.Errorf("failed to update the rendered cluster instance")
+		}
 	}
-	return false
+	return provisioned, timedOutOrFailed, err
 }
 
 // collectNodeDetails collects BMC and node interfaces details
@@ -1902,59 +1901,83 @@ func (t *clusterRequestReconcilerTask) updateNodeStatusWithHostname(ctx context.
 	return true
 }
 
-// updateHardwareProvisioningStatus updates the status for the created ClusterInstance
+// updateHardwareProvisioningStatus updates the status for the ClusterRequest
 func (t *clusterRequestReconcilerTask) updateHardwareProvisioningStatus(
-	ctx context.Context, nodePool *hwv1alpha1.NodePool) error {
+	ctx context.Context, nodePool *hwv1alpha1.NodePool) (bool, bool, error) {
+	var status metav1.ConditionStatus
+	var reason string
+	var message string
+	var err error
+	timedOutOrFailed := false // Default to false unless explicitly needed
 
-	if len(nodePool.Status.Conditions) > 0 {
-		provisionedCondition := meta.FindStatusCondition(
-			nodePool.Status.Conditions, string(hwv1alpha1.Provisioned))
-		if provisionedCondition != nil {
-			utils.SetStatusCondition(&t.object.Status.Conditions,
-				utils.CRconditionTypes.HardwareProvisioned,
-				utils.ConditionReason(provisionedCondition.Reason),
-				provisionedCondition.Status,
-				provisionedCondition.Message)
-		} else {
-			utils.SetStatusCondition(&t.object.Status.Conditions,
-				utils.CRconditionTypes.HardwareProvisioned,
-				utils.CRconditionReasons.Unknown,
-				metav1.ConditionUnknown,
-				"Unknown state of hardware provisioning",
-			)
-		}
+	if t.object.Status.NodePoolRef == nil {
+		t.object.Status.NodePoolRef = &oranv1alpha1.NodePoolRef{}
+	}
 
-		if err := utils.UpdateK8sCRStatus(ctx, t.client, t.object); err != nil {
-			t.logger.ErrorContext(
+	t.object.Status.NodePoolRef.Name = nodePool.GetName()
+	t.object.Status.NodePoolRef.Namespace = nodePool.GetNamespace()
+	if t.object.Status.NodePoolRef.HardwareProvisioningCheckStart.IsZero() {
+		t.object.Status.NodePoolRef.HardwareProvisioningCheckStart = metav1.Now()
+	}
+
+	provisionedCondition := meta.FindStatusCondition(
+		nodePool.Status.Conditions, string(hwv1alpha1.Provisioned))
+	if provisionedCondition != nil {
+		status = provisionedCondition.Status
+		reason = provisionedCondition.Reason
+		message = provisionedCondition.Message
+
+		if provisionedCondition.Status == metav1.ConditionFalse && reason == string(hwv1alpha1.Failed) {
+			t.logger.InfoContext(
 				ctx,
-				"Failed to update the HardwareProvisioning status for ClusterRequest",
-				slog.String("name", t.object.Name),
-				slog.Any("specificError", err),
+				fmt.Sprintf(
+					"NodePool %s in the namespace %s provisioning failed",
+					nodePool.GetName(),
+					nodePool.GetNamespace(),
+				),
 			)
-			return fmt.Errorf("failed to update HardwareProvisioning status: %w", err)
+			// Ensure a consistent message for the cluster request, regardless of which plugin is used.
+			message = "Hardware provisioning failed"
+			timedOutOrFailed = true
 		}
-	} else if nodePool.ObjectMeta.Namespace == utils.TempDellPluginNamespace || nodePool.ObjectMeta.Namespace == utils.UnitTestHwmgrNamespace {
-		// TODO: For test purposes only. Code to be removed once hwmgr plugin(s) are fully utilized
-		meta.SetStatusCondition(
-			&nodePool.Status.Conditions,
-			metav1.Condition{
-				Type:   string(hwv1alpha1.Unknown),
-				Status: metav1.ConditionUnknown,
-				Reason: string(hwv1alpha1.NotInitialized),
-			},
-		)
-		if err := utils.UpdateK8sCRStatus(ctx, t.client, nodePool); err != nil {
-			t.logger.ErrorContext(
+	} else {
+		// No provisioning condition found, set the status to unknown.
+		status = metav1.ConditionUnknown
+		reason = string(utils.CRconditionReasons.Unknown)
+		message = "Unknown state of hardware provisioning"
+	}
+
+	// Check for timeout if not already failed or provisioned
+	if status != metav1.ConditionTrue && reason != string(hwv1alpha1.Failed) {
+		elapsedTime := time.Since(t.object.Status.NodePoolRef.HardwareProvisioningCheckStart.Time)
+		if elapsedTime >= time.Duration(t.object.Spec.Timeout.HardwareProvisioning)*time.Minute {
+			t.logger.InfoContext(
 				ctx,
-				"Failed to update the NodePool status",
-				slog.String("name", nodePool.Name),
-				slog.Any("specificError", err),
+				fmt.Sprintf(
+					"NodePool %s in the namespace %s provisioning timed out",
+					nodePool.GetName(),
+					nodePool.GetNamespace(),
+				),
 			)
-			return fmt.Errorf("failed to update NodePool status: %w", err)
+			reason = string(hwv1alpha1.TimedOut)
+			message = "Hardware provisioning timed out"
+			status = metav1.ConditionFalse
+			timedOutOrFailed = true
 		}
+	}
+	// Set the status condition for hardware provisioning.
+	utils.SetStatusCondition(&t.object.Status.Conditions,
+		utils.CRconditionTypes.HardwareProvisioned,
+		utils.ConditionReason(reason),
+		status,
+		message)
+
+	// Update the CR status for the ClusterRequest.
+	if err = utils.UpdateK8sCRStatus(ctx, t.client, t.object); err != nil {
+		err = fmt.Errorf("failed to update HardwareProvisioning status: %w", err)
 	}
 
-	return nil
+	return status == metav1.ConditionTrue, timedOutOrFailed, err
 }
 
 // findClusterInstanceForClusterRequest maps the ClusterInstance created by a
diff --git a/internal/controllers/clusterrequest_controller_test.go b/internal/controllers/clusterrequest_controller_test.go
index 2fc66610..e8b56b46 100644
--- a/internal/controllers/clusterrequest_controller_test.go
+++ b/internal/controllers/clusterrequest_controller_test.go
@@ -592,6 +592,9 @@ var _ = Describe("ClusterRequestReconcile", func() {
 					Raw: []byte(testPolicyTemplateInput),
 				},
 			},
+			Timeout: oranv1alpha1.Timeout{
+				HardwareProvisioning: 1,
+			},
 		},
 	}
 
@@ -723,7 +726,7 @@ var _ = Describe("ClusterRequestReconcile", func() {
 				Name: crName, Namespace: "hwmgr"}, nodePool)).To(Succeed())
 
 			// Verify the ClusterRequest's status conditions
-			Expect(len(conditions)).To(Equal(4))
+			Expect(len(conditions)).To(Equal(6))
 			verifyStatusCondition(conditions[0], metav1.Condition{
 				Type:   string(utils.CRconditionTypes.Validated),
 				Status: metav1.ConditionTrue,
@@ -744,6 +747,18 @@ var _ = Describe("ClusterRequestReconcile", func() {
 				Status: metav1.ConditionTrue,
 				Reason: string(utils.CRconditionReasons.Completed),
 			})
+			verifyStatusCondition(conditions[4], metav1.Condition{
+				Type:   string(utils.CRconditionTypes.HardwareProvisioned),
+				Status: metav1.ConditionUnknown,
+				Reason: string(utils.CRconditionReasons.Unknown),
+			})
+			verifyStatusCondition(conditions[5], metav1.Condition{
+				Type:   string(utils.CRconditionTypes.ClusterInstanceProcessed),
+				Status: metav1.ConditionUnknown,
+				Reason: string(utils.CRconditionReasons.Unknown),
+			})
+			// Verify the start timestamp has been set for HardwareProvisioning
+			Expect(reconciledCR.Status.NodePoolRef.HardwareProvisioningCheckStart).ToNot(BeZero())
 		})
 	})
 
@@ -787,6 +802,8 @@ var _ = Describe("ClusterRequestReconcile", func() {
 				Status: metav1.ConditionFalse,
 				Reason: string(utils.CRconditionReasons.InProgress),
 			})
+			// Verify the start timestamp has been set for HardwareProvisioning
+			Expect(reconciledCR.Status.NodePoolRef.HardwareProvisioningCheckStart).ToNot(BeZero())
 		})
 
 		It("Verify ClusterInstance should be created when NodePool has provisioned", func() {
@@ -1011,6 +1028,8 @@ var _ = Describe("ClusterRequestReconcile", func() {
 				Reason:  string(utils.CRconditionReasons.ClusterNotReady),
 				Message: "The Cluster is not yet ready",
 			})
+			// Verify the start timestamp has been set for HardwareProvisioning
+			Expect(reconciledCR.Status.NodePoolRef.HardwareProvisioningCheckStart).ToNot(BeZero())
 		})
 
 		It("Verify status conditions when ClusterInstance provision has completed, ManagedCluster becomes ready and configuration policy becomes compliant", func() {
@@ -1537,6 +1556,11 @@ var _ = Describe("waitForNodePoolProvision", func() {
 			ObjectMeta: metav1.ObjectMeta{
 				Name: crName,
 			},
+			Spec: oranv1alpha1.ClusterRequestSpec{
+				Timeout: oranv1alpha1.Timeout{
+					HardwareProvisioning: 1,
+				},
+			},
 		}
 
 		// Define the node pool.
@@ -1562,9 +1586,58 @@ var _ = Describe("waitForNodePoolProvision", func() {
 		}
 	})
 
-	It("returns false when error fetching NodePool", func() {
-		rt := task.checkNodePoolProvisionStatus(ctx, np)
-		Expect(rt).To(Equal(false))
+	It("returns error when error fetching NodePool", func() {
+		provisioned, timedOutOrFailed, err := task.checkNodePoolProvisionStatus(ctx, np)
+		Expect(provisioned).To(Equal(false))
+		Expect(timedOutOrFailed).To(Equal(false))
+		Expect(err).To(HaveOccurred())
+	})
+
+	It("returns failed when NodePool provisioning failed", func() {
+		provisionedCondition := metav1.Condition{
+			Type:   "Provisioned",
+			Status: metav1.ConditionFalse,
+			Reason: string(hwv1alpha1.Failed),
+		}
+		np.Status.Conditions = append(np.Status.Conditions, provisionedCondition)
+		Expect(c.Create(ctx, np)).To(Succeed())
+		provisioned, timedOutOrFailed, err := task.checkNodePoolProvisionStatus(ctx, np)
+		Expect(provisioned).To(Equal(false))
+		Expect(timedOutOrFailed).To(Equal(true)) // It should be failed
+		Expect(err).ToNot(HaveOccurred())
+		condition := meta.FindStatusCondition(cr.Status.Conditions, string(utils.CRconditionTypes.HardwareProvisioned))
+		Expect(condition).ToNot(BeNil())
+		Expect(condition.Status).To(Equal(metav1.ConditionFalse))
+		Expect(condition.Reason).To(Equal(string(hwv1alpha1.Failed)))
+	})
+
+	It("returns timeout when NodePool provisioning timed out", func() {
+		provisionedCondition := metav1.Condition{
+			Type:   "Provisioned",
+			Status: metav1.ConditionFalse,
+		}
+		np.Status.Conditions = append(np.Status.Conditions, provisionedCondition)
+		Expect(c.Create(ctx, np)).To(Succeed())
+
+		// First call to checkNodePoolProvisionStatus (before timeout)
+		provisioned, timedOutOrFailed, err := task.checkNodePoolProvisionStatus(ctx, np)
+		Expect(provisioned).To(Equal(false))
+		Expect(timedOutOrFailed).To(Equal(false))
+		Expect(err).ToNot(HaveOccurred())
+
+		// Simulate a delay to represent timeout (greater than 1 minute)
+		time.Sleep(60 * time.Second) // This ensures the timeout occurs
+
+		// Call checkNodePoolProvisionStatus again (after timeout)
+		provisioned, timedOutOrFailed, err = task.checkNodePoolProvisionStatus(ctx, np)
+		Expect(provisioned).To(Equal(false))
+		Expect(timedOutOrFailed).To(Equal(true)) // Now it should time out
+		Expect(err).ToNot(HaveOccurred())
+
+		condition := meta.FindStatusCondition(cr.Status.Conditions, string(utils.CRconditionTypes.HardwareProvisioned))
+		Expect(condition).ToNot(BeNil())
+		Expect(condition.Status).To(Equal(metav1.ConditionFalse))
+		Expect(condition.Reason).To(Equal(string(hwv1alpha1.TimedOut)))
 	})
 
 	It("returns false when NodePool is not provisioned", func() {
@@ -1575,8 +1648,10 @@ var _ = Describe("waitForNodePoolProvision", func() {
 		}
 		np.Status.Conditions = append(np.Status.Conditions, provisionedCondition)
 		Expect(c.Create(ctx, np)).To(Succeed())
-		rt := task.checkNodePoolProvisionStatus(ctx, np)
-		Expect(rt).To(Equal(false))
+		provisioned, timedOutOrFailed, err := task.checkNodePoolProvisionStatus(ctx, np)
+		Expect(provisioned).To(Equal(false))
+		Expect(timedOutOrFailed).To(Equal(false))
+		Expect(err).ToNot(HaveOccurred())
 		condition := meta.FindStatusCondition(cr.Status.Conditions, string(utils.CRconditionTypes.HardwareProvisioned))
 		Expect(condition).ToNot(BeNil())
 		Expect(condition.Status).To(Equal(metav1.ConditionFalse))
@@ -1589,8 +1664,10 @@ var _ = Describe("waitForNodePoolProvision", func() {
 		}
 		np.Status.Conditions = append(np.Status.Conditions, provisionedCondition)
 		Expect(c.Create(ctx, np)).To(Succeed())
-		rt := task.checkNodePoolProvisionStatus(ctx, np)
-		Expect(rt).To(Equal(true))
+		provisioned, timedOutOrFailed, err := task.checkNodePoolProvisionStatus(ctx, np)
+		Expect(provisioned).To(Equal(true))
+		Expect(timedOutOrFailed).To(Equal(false))
+		Expect(err).ToNot(HaveOccurred())
 		condition := meta.FindStatusCondition(cr.Status.Conditions, string(utils.CRconditionTypes.HardwareProvisioned))
 		Expect(condition).ToNot(BeNil())
 		Expect(condition.Status).To(Equal(metav1.ConditionTrue))
@@ -1973,7 +2050,8 @@ defaultHugepagesSize: "1G"`,
 				},
 			},
 			Timeout: oranv1alpha1.Timeout{
-				Configuration: 1,
+				Configuration:        1,
+				HardwareProvisioning: 1,
 			},
 		},
 		Status: oranv1alpha1.ClusterRequestStatus{
diff --git a/vendor/github.com/openshift-kni/oran-o2ims/api/hardwaremanagement/v1alpha1/conditions.go b/vendor/github.com/openshift-kni/oran-o2ims/api/hardwaremanagement/v1alpha1/conditions.go
index d6a371e6..fb19f781 100644
--- a/vendor/github.com/openshift-kni/oran-o2ims/api/hardwaremanagement/v1alpha1/conditions.go
+++ b/vendor/github.com/openshift-kni/oran-o2ims/api/hardwaremanagement/v1alpha1/conditions.go
@@ -41,4 +41,5 @@ const (
 	Unprovisioned  ConditionReason = "Unprovisioned"
 	Failed         ConditionReason = "Failed"
 	NotInitialized ConditionReason = "NotInitialized"
+	TimedOut       ConditionReason = "TimedOut"
 )