diff --git a/api/v1alpha1/clusterrequest_types.go b/api/v1alpha1/clusterrequest_types.go
index 469b9230..b2818f81 100644
--- a/api/v1alpha1/clusterrequest_types.go
+++ b/api/v1alpha1/clusterrequest_types.go
@@ -75,6 +75,9 @@ type ClusterDetails struct {
 	// Says if ZTP has complete or not.
 	ZtpStatus string `json:"ztpStatus,omitempty"`
 
+	// A timestamp indicating the cluster provisioning has started
+	ClusterProvisionStartedAt metav1.Time `json:"clusterProvisionStartedAt,omitempty"`
+
 	// Holds the first timestamp when the configuration was found NonCompliant for the cluster.
 	NonCompliantAt metav1.Time `json:"nonCompliantAt,omitempty"`
 }
diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go
index ae97c271..f8a6ee08 100644
--- a/api/v1alpha1/zz_generated.deepcopy.go
+++ b/api/v1alpha1/zz_generated.deepcopy.go
@@ -44,6 +44,7 @@ func (in *AlarmSubscriptionServerConfig) DeepCopy() *AlarmSubscriptionServerConf
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *ClusterDetails) DeepCopyInto(out *ClusterDetails) {
 	*out = *in
+	in.ClusterProvisionStartedAt.DeepCopyInto(&out.ClusterProvisionStartedAt)
 	in.NonCompliantAt.DeepCopyInto(&out.NonCompliantAt)
 }
diff --git a/bundle/manifests/o2ims.oran.openshift.io_clusterrequests.yaml b/bundle/manifests/o2ims.oran.openshift.io_clusterrequests.yaml
index 4e7cfa81..bf0c6020 100644
--- a/bundle/manifests/o2ims.oran.openshift.io_clusterrequests.yaml
+++ b/bundle/manifests/o2ims.oran.openshift.io_clusterrequests.yaml
@@ -101,6 +101,11 @@ spec:
             clusterDetails:
               description: ClusterDetails references to the ClusterInstance.
               properties:
+                clusterProvisionStartedAt:
+                  description: A timestamp indicating the cluster provisioning has
+                    started
+                  format: date-time
+                  type: string
                 name:
                   description: Contains the name of the created ClusterInstance.
                   type: string
diff --git a/config/crd/bases/o2ims.oran.openshift.io_clusterrequests.yaml b/config/crd/bases/o2ims.oran.openshift.io_clusterrequests.yaml
index 05905946..0a385724 100644
--- a/config/crd/bases/o2ims.oran.openshift.io_clusterrequests.yaml
+++ b/config/crd/bases/o2ims.oran.openshift.io_clusterrequests.yaml
@@ -101,6 +101,11 @@ spec:
             clusterDetails:
               description: ClusterDetails references to the ClusterInstance.
               properties:
+                clusterProvisionStartedAt:
+                  description: A timestamp indicating the cluster provisioning has
+                    started
+                  format: date-time
+                  type: string
                 name:
                   description: Contains the name of the created ClusterInstance.
                   type: string
diff --git a/internal/controllers/clusterrequest_controller.go b/internal/controllers/clusterrequest_controller.go
index 0437fc97..bf8df229 100644
--- a/internal/controllers/clusterrequest_controller.go
+++ b/internal/controllers/clusterrequest_controller.go
@@ -237,25 +237,27 @@ func (t *clusterRequestReconcilerTask) run(ctx context.Context) (ctrl.Result, er
 		// after hw plugin is ready
 
 		// Handle the cluster install with ClusterInstance
-		err = t.handleClusterInstallation(ctx, renderedClusterInstance)
+		err := t.handleClusterInstallation(ctx, renderedClusterInstance)
 		if err != nil {
 			return requeueWithError(err)
 		}
 	}
 
 	// Handle policy configuration only after the cluster provisioning
-	// has started.
-	crProvisionedCond := meta.FindStatusCondition(
-		t.object.Status.Conditions,
-		string(utils.CRconditionTypes.ClusterProvisioned))
-	if crProvisionedCond != nil {
+	// has started and has not failed or timed out (i.e., it is completed, in-progress or unknown)
+	if utils.IsClusterProvisionPresent(t.object) &&
+		!utils.IsClusterProvisionTimedOutOrFailed(t.object) {
+		// Handle configuration through policies.
 		requeue, err := t.handleClusterPolicyConfiguration(ctx)
 		if err != nil {
 			return requeueWithError(err)
 		}
-		if requeue {
-			return requeueWithMediumInterval(), nil
+
+		// Requeue if cluster provisioning is not completed (in-progress or unknown)
+		// or there are enforce policies that are not Compliant
+		if !utils.IsClusterProvisionCompleted(t.object) || requeue {
+			return requeueWithLongInterval(), nil
 		}
 	}
 
@@ -291,17 +293,18 @@ func (t *clusterRequestReconcilerTask) checkClusterDeployConfigState(ctx context
 		}
 	}
 
-	// Check the policy configuration status
-	crProvisionedCond := meta.FindStatusCondition(
-		t.object.Status.Conditions,
-		string(utils.CRconditionTypes.ClusterProvisioned))
-	if crProvisionedCond != nil {
+	// Check the policy configuration status only after the cluster provisioning
+	// has started and has not failed or timed out
+	if utils.IsClusterProvisionPresent(t.object) &&
+		!utils.IsClusterProvisionTimedOutOrFailed(t.object) {
 		requeue, err := t.handleClusterPolicyConfiguration(ctx)
 		if err != nil {
 			return requeueWithError(err)
 		}
-		if requeue {
-			return requeueWithMediumInterval(), nil
+		// Requeue if cluster provisioning is not completed (in-progress or unknown)
+		// or there are enforce policies that are not Compliant
+		if !utils.IsClusterProvisionCompleted(t.object) || requeue {
+			return requeueWithLongInterval(), nil
 		}
 	}
 	return doNotRequeue(), nil
@@ -1192,6 +1195,27 @@ func (t *clusterRequestReconcilerTask) updateClusterProvisionStatus(ci *siteconf
 			ciProvisionedCondition.Message,
 		)
 	}
+
+	if utils.IsClusterProvisionPresent(t.object) {
+		// Set the start timestamp if it's not already set
+		if t.object.Status.ClusterDetails.ClusterProvisionStartedAt.IsZero() {
+			t.object.Status.ClusterDetails.ClusterProvisionStartedAt = metav1.Now()
+		}
+
+		// If provisioning has not failed or completed, check whether it has timed out
+		if !utils.IsClusterProvisionCompletedOrFailed(t.object) {
+			if time.Since(t.object.Status.ClusterDetails.ClusterProvisionStartedAt.Time) >
+				time.Duration(t.object.Spec.Timeout.ClusterProvisioning)*time.Minute {
+				// timed out
+				utils.SetStatusCondition(&t.object.Status.Conditions,
+					utils.CRconditionTypes.ClusterProvisioned,
+					utils.CRconditionReasons.TimedOut,
+					metav1.ConditionFalse,
+					"Cluster provisioning timed out",
+				)
+			}
+		}
+	}
 }
 
 // createOrUpdateClusterResources creates/updates all the resources needed for cluster deployment
diff --git a/internal/controllers/clusterrequest_controller_test.go b/internal/controllers/clusterrequest_controller_test.go
index a3dfe463..d4387819 100644
--- a/internal/controllers/clusterrequest_controller_test.go
+++ b/internal/controllers/clusterrequest_controller_test.go
@@ -593,6 +593,10 @@ var _ = Describe("ClusterRequestReconcile", func() {
 					Raw: []byte(testPolicyTemplateInput),
 				},
 			},
+			Timeout: oranv1alpha1.Timeout{
+				ClusterProvisioning: 1,
+				Configuration:       1,
+			},
 		},
 	}
 
@@ -987,7 +991,7 @@ var _ = Describe("ClusterRequestReconcile", func() {
 			result, err := reconciler.Reconcile(ctx, req)
 			// Verify the reconciliation result
 			Expect(err).ToNot(HaveOccurred())
-			Expect(result).To(Equal(requeueWithMediumInterval()))
+			Expect(result).To(Equal(requeueWithLongInterval()))
 
 			reconciledCR := &oranv1alpha1.ClusterRequest{}
 			Expect(c.Get(ctx, req.NamespacedName, reconciledCR)).To(Succeed())
@@ -1012,6 +1016,55 @@ var _ = Describe("ClusterRequestReconcile", func() {
 				Reason:  string(utils.CRconditionReasons.ClusterNotReady),
 				Message: "The Cluster is not yet ready",
 			})
+
+			// Verify the start timestamp has been set for ClusterInstance
+			Expect(reconciledCR.Status.ClusterDetails.ClusterProvisionStartedAt).ToNot(BeZero())
+			// Verify the nonCompliantAt timestamp is not set: a NonCompliant enforce policy exists,
+			// but the cluster is not yet ready
+			Expect(reconciledCR.Status.ClusterDetails.NonCompliantAt).To(BeZero())
+		})
+
+		It("Verify status conditions when ClusterInstance provision has timed out", func() {
+			// Initial reconciliation to populate ClusterProvisionStartedAt timestamp
+			_, err := reconciler.Reconcile(ctx, req)
+			Expect(err).ToNot(HaveOccurred())
+
+			cr := &oranv1alpha1.ClusterRequest{}
+			Expect(c.Get(ctx, req.NamespacedName, cr)).To(Succeed())
+			// Verify the start timestamp has been set for ClusterInstance
+			Expect(cr.Status.ClusterDetails.ClusterProvisionStartedAt).ToNot(BeZero())
+			// Verify the nonCompliantAt timestamp is not set: a NonCompliant enforce policy exists,
+			// but the cluster is not yet ready
+			Expect(cr.Status.ClusterDetails.NonCompliantAt).To(BeZero())
+
+			// Patch ClusterProvisionStartedAt timestamp to mock a timeout
+			cr.Status.ClusterDetails = &oranv1alpha1.ClusterDetails{Name: "cluster-1"}
+			cr.Status.ClusterDetails.ClusterProvisionStartedAt.Time = metav1.Now().Add(-2 * time.Minute)
+			Expect(c.Status().Update(ctx, cr)).To(Succeed())
+
+			// Start reconciliation again
+			result, err := reconciler.Reconcile(ctx, req)
+			// Verify the reconciliation result
+			Expect(err).ToNot(HaveOccurred())
+			Expect(result).To(Equal(doNotRequeue())) // stop reconciliation on ClusterProvision timeout
+
+			reconciledCR := &oranv1alpha1.ClusterRequest{}
+			Expect(c.Get(ctx, req.NamespacedName, reconciledCR)).To(Succeed())
+			conditions := reconciledCR.Status.Conditions
+
+			// Verify the ClusterRequest's status conditions
+			Expect(len(conditions)).To(Equal(8))
+			verifyStatusCondition(conditions[6], metav1.Condition{
+				Type:   string(utils.CRconditionTypes.ClusterProvisioned),
+				Status: metav1.ConditionFalse,
+				Reason: string(utils.CRconditionReasons.TimedOut),
+			})
+			verifyStatusCondition(conditions[7], metav1.Condition{
+				Type:    string(utils.CRconditionTypes.ConfigurationApplied),
+				Status:  metav1.ConditionFalse,
+				Reason:  string(utils.CRconditionReasons.ClusterNotReady),
+				Message: "The Cluster is not yet ready",
+			})
 		})
 
 		It("Verify status conditions when ClusterInstance provision has completed, ManagedCluster becomes ready and configuration policy becomes compliant", func() {
@@ -1053,6 +1106,11 @@ var _ = Describe("ClusterRequestReconcile", func() {
 				Reason:  string(utils.CRconditionReasons.Completed),
 				Message: "The configuration is up to date",
 			})
+
+			// Verify the start timestamp is not cleared even though cluster provision has completed
+			Expect(reconciledCR.Status.ClusterDetails.ClusterProvisionStartedAt).ToNot(BeZero())
+			// Verify the nonCompliantAt timestamp is not set since the enforce policy is compliant
+			Expect(reconciledCR.Status.ClusterDetails.NonCompliantAt).To(BeZero())
 		})
 
 		It("Verify status conditions when configuration change causes ClusterRequest validation to fail but ClusterInstance becomes provisioned", func() {
@@ -1076,7 +1134,7 @@ var _ = Describe("ClusterRequestReconcile", func() {
 			result, err := reconciler.Reconcile(ctx, req)
 			// Verify the reconciliation result
 			Expect(err).ToNot(HaveOccurred())
-			Expect(result).To(Equal(requeueWithMediumInterval()))
+			Expect(result).To(Equal(requeueWithLongInterval()))
 
 			reconciledCR := &oranv1alpha1.ClusterRequest{}
 			Expect(c.Get(ctx, req.NamespacedName, reconciledCR)).To(Succeed())
@@ -1102,6 +1160,61 @@ var _ = Describe("ClusterRequestReconcile", func() {
 				Reason:  string(utils.CRconditionReasons.ClusterNotReady),
 				Message: "The Cluster is not yet ready",
 			})
+
+			// Verify the start timestamp is not cleared even though cluster provision has completed
+			Expect(reconciledCR.Status.ClusterDetails.ClusterProvisionStartedAt).ToNot(BeZero())
+			// Verify the nonCompliantAt timestamp is not set: a NonCompliant enforce policy exists,
+			// but the cluster is not yet ready
+			Expect(reconciledCR.Status.ClusterDetails.NonCompliantAt).To(BeZero())
+		})
+
+		It("Verify status conditions when configuration change causes ClusterRequest validation to fail but ClusterProvision times out", func() {
+			// Initial reconciliation to populate the initial status
+			_, err := reconciler.Reconcile(ctx, req)
+			Expect(err).ToNot(HaveOccurred())
+
+			cr := &oranv1alpha1.ClusterRequest{}
+			Expect(c.Get(ctx, req.NamespacedName, cr)).To(Succeed())
+			// Verify the start timestamp has been set for ClusterInstance
+			Expect(cr.Status.ClusterDetails.ClusterProvisionStartedAt).ToNot(BeZero())
+			// Patch ClusterProvisionStartedAt timestamp to mock a timeout
+			cr.Status.ClusterDetails = &oranv1alpha1.ClusterDetails{Name: "cluster-1"}
+			cr.Status.ClusterDetails.ClusterProvisionStartedAt.Time = metav1.Now().Add(-2 * time.Minute)
+			Expect(c.Status().Update(ctx, cr)).To(Succeed())
+
+			// Remove the required field hostName to fail ClusterRequest validation
+			removeRequiredFieldFromClusterInstanceInput(ctx, c, crName, ctNamespace)
+
+			// Start reconciliation
+			result, err := reconciler.Reconcile(ctx, req)
+			// Verify the reconciliation result
+			Expect(err).ToNot(HaveOccurred())
+			Expect(result).To(Equal(doNotRequeue())) // stop reconciliation on ClusterProvision timeout
+
+			reconciledCR := &oranv1alpha1.ClusterRequest{}
+			Expect(c.Get(ctx, req.NamespacedName, reconciledCR)).To(Succeed())
+			conditions := reconciledCR.Status.Conditions
+
+			// Verify that the Validated condition fails but the ClusterProvisioned condition
+			// is also updated to reflect the timeout
+			Expect(len(conditions)).To(Equal(8))
+			verifyStatusCondition(conditions[0], metav1.Condition{
+				Type:    string(utils.CRconditionTypes.Validated),
+				Status:  metav1.ConditionFalse,
+				Reason:  string(utils.CRconditionReasons.Failed),
+				Message: "nodes.0: hostName is required",
+			})
+			verifyStatusCondition(conditions[6], metav1.Condition{
+				Type:   string(utils.CRconditionTypes.ClusterProvisioned),
+				Status: metav1.ConditionFalse,
+				Reason: string(utils.CRconditionReasons.TimedOut),
+			})
+			verifyStatusCondition(conditions[7], metav1.Condition{
+				Type:    string(utils.CRconditionTypes.ConfigurationApplied),
+				Status:  metav1.ConditionFalse,
+				Reason:  string(utils.CRconditionReasons.ClusterNotReady),
+				Message: "The Cluster is not yet ready",
+			})
 		})
 
 		It("Verify status conditions when configuration change causes ClusterInstance rendering to fail but configuration policy becomes compliant", func() {
@@ -2277,7 +2390,8 @@ defaultHugepagesSize: "1G"`,
 				},
 			},
 			Timeout: oranv1alpha1.Timeout{
-				Configuration: 1,
+				ClusterProvisioning: 1,
+				Configuration:       1,
 			},
 		},
 		Status: oranv1alpha1.ClusterRequestStatus{
@@ -2460,6 +2574,7 @@ defaultHugepagesSize: "1G"`,
 				metav1.ConditionFalse,
 				"",
 			)
+			CRTask.object.Status.ClusterDetails.ClusterProvisionStartedAt = metav1.Now()
 			Expect(c.Status().Update(ctx, CRTask.object)).To(Succeed())
 
 			// Call the Reconciliation function.
@@ -2482,7 +2597,7 @@ defaultHugepagesSize: "1G"`,
 			Expect(err).ToNot(HaveOccurred())
 			// Expect to not requeue on valid cluster request.
 			Expect(result.Requeue).To(BeFalse())
-			Expect(result.RequeueAfter).To(Equal(1 * time.Minute)) // Medium interval
+			Expect(result.RequeueAfter).To(Equal(5 * time.Minute)) // Long interval
 			Expect(CRTask.object.Status.ClusterDetails.NonCompliantAt).ToNot(BeZero())
 			Expect(CRTask.object.Status.Policies).ToNot(BeEmpty())
 		})
diff --git a/internal/controllers/utils/conditions.go b/internal/controllers/utils/conditions.go
index a34dbfa2..c7f394e5 100644
--- a/internal/controllers/utils/conditions.go
+++ b/internal/controllers/utils/conditions.go
@@ -1,6 +1,7 @@
 package utils
 
 import (
+	oranv1alpha1 "github.com/openshift-kni/oran-o2ims/api/v1alpha1"
 	"k8s.io/apimachinery/pkg/api/meta"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 )
@@ -84,3 +85,46 @@ func SetStatusCondition(existingConditions *[]metav1.Condition, conditionType Co
 		},
 	)
 }
+
+// IsClusterProvisionPresent checks if the cluster provision condition is present
+func IsClusterProvisionPresent(cr *oranv1alpha1.ClusterRequest) bool {
+	condition := meta.FindStatusCondition(cr.Status.Conditions, string(CRconditionTypes.ClusterProvisioned))
+	return condition != nil
+}
+
+// IsClusterProvisionCompleted checks if the cluster provision condition status is completed
+func IsClusterProvisionCompleted(cr *oranv1alpha1.ClusterRequest) bool {
+	condition := meta.FindStatusCondition(cr.Status.Conditions, string(CRconditionTypes.ClusterProvisioned))
+	if condition != nil {
+		if condition.Status == metav1.ConditionTrue && condition.Reason == string(CRconditionReasons.Completed) {
+			return true
+		}
+	}
+	return false
+}
+
+// IsClusterProvisionTimedOutOrFailed checks if the cluster provision condition status is timed out or failed
+func IsClusterProvisionTimedOutOrFailed(cr *oranv1alpha1.ClusterRequest) bool {
+	condition := meta.FindStatusCondition(cr.Status.Conditions, string(CRconditionTypes.ClusterProvisioned))
+	if condition != nil {
+		if condition.Status == metav1.ConditionFalse &&
+			(condition.Reason == string(CRconditionReasons.Failed) ||
+				condition.Reason == string(CRconditionReasons.TimedOut)) {
+			return true
+		}
+	}
+	return false
+}
+
+// IsClusterProvisionCompletedOrFailed checks if the cluster provision condition status is completed or failed
+func IsClusterProvisionCompletedOrFailed(cr *oranv1alpha1.ClusterRequest) bool {
+	condition := meta.FindStatusCondition(cr.Status.Conditions, string(CRconditionTypes.ClusterProvisioned))
+	if condition != nil {
+		if condition.Status == metav1.ConditionTrue ||
+			(condition.Status == metav1.ConditionFalse &&
+				condition.Reason == string(CRconditionReasons.Failed)) {
+			return true
+		}
+	}
+	return false
+}
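
Note for reviewers: the sketch below is not part of the diff; it is a minimal, self-contained illustration of how the persisted start timestamp and the new condition helpers combine into the timeout behavior this change introduces. The literal strings "ClusterProvisioned", "InProgress", "TimedOut", "Failed" and the 1-minute timeout are stand-ins for utils.CRconditionTypes, utils.CRconditionReasons, and Spec.Timeout.ClusterProvisioning; the real code also uses its own SetStatusCondition wrapper rather than meta.SetStatusCondition directly.

package main

import (
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

const clusterProvisioned = "ClusterProvisioned" // assumed condition type string

// provisionTimedOutOrFailed mirrors the shape of utils.IsClusterProvisionTimedOutOrFailed:
// true only when the condition exists, is False, and carries a terminal reason.
func provisionTimedOutOrFailed(conds []metav1.Condition) bool {
	c := meta.FindStatusCondition(conds, clusterProvisioned)
	return c != nil && c.Status == metav1.ConditionFalse &&
		(c.Reason == "Failed" || c.Reason == "TimedOut")
}

func main() {
	// Persisted start timestamp (status.clusterDetails.clusterProvisionStartedAt);
	// backdated two minutes here to force the timeout path.
	startedAt := metav1.NewTime(time.Now().Add(-2 * time.Minute))
	timeout := 1 * time.Minute // stand-in for Spec.Timeout.ClusterProvisioning (minutes)

	conds := []metav1.Condition{}
	meta.SetStatusCondition(&conds, metav1.Condition{
		Type:    clusterProvisioned,
		Status:  metav1.ConditionFalse,
		Reason:  "InProgress",
		Message: "Provisioning cluster",
	})

	// Same shape as the check added in updateClusterProvisionStatus: once the
	// start timestamp is older than the configured timeout, and provisioning is
	// neither completed nor failed, flip the reason to TimedOut.
	if time.Since(startedAt.Time) > timeout {
		meta.SetStatusCondition(&conds, metav1.Condition{
			Type:    clusterProvisioned,
			Status:  metav1.ConditionFalse,
			Reason:  "TimedOut",
			Message: "Cluster provisioning timed out",
		})
	}

	// run() and checkClusterDeployConfigState() skip policy handling and stop
	// requeueing once this returns true.
	fmt.Println("timed out or failed:", provisionTimedOutOrFailed(conds))
}

Because ClusterProvisionStartedAt is stored in status rather than in memory, the timeout clock survives requeues and controller restarts instead of resetting on every reconcile.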