Add timeout for Cluster provisioning
Missxiaoguo committed Sep 27, 2024
1 parent a617e9c commit 4a9e8d5
Showing 7 changed files with 216 additions and 19 deletions.
3 changes: 3 additions & 0 deletions api/v1alpha1/clusterrequest_types.go
@@ -75,6 +75,9 @@ type ClusterDetails struct {
// Says if ZTP has completed or not.
ZtpStatus string `json:"ztpStatus,omitempty"`

// A timestamp indicating the cluster provisioning has started
ClusterProvisionStartedAt metav1.Time `json:"clusterProvisionStartedAt,omitempty"`

// Holds the first timestamp when the configuration was found NonCompliant for the cluster.
NonCompliantAt metav1.Time `json:"nonCompliantAt,omitempty"`
}
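For context, the new ClusterProvisionStartedAt field feeds the timeout check introduced later in this commit. Below is a minimal, illustrative sketch of that check; the helper name is hypothetical, but the field accesses mirror the controller change further down.

package utils

import (
	"time"

	oranv1alpha1 "github.com/openshift-kni/oran-o2ims/api/v1alpha1"
)

// hasClusterProvisionTimedOut is an illustrative helper, not part of this
// commit. It mirrors the check added in updateClusterProvisionStatus:
// provisioning counts as timed out once the elapsed time since
// ClusterProvisionStartedAt exceeds Spec.Timeout.ClusterProvisioning minutes.
func hasClusterProvisionTimedOut(cr *oranv1alpha1.ClusterRequest) bool {
	details := cr.Status.ClusterDetails
	if details == nil || details.ClusterProvisionStartedAt.IsZero() {
		// Provisioning has not started yet, so it cannot have timed out.
		return false
	}
	timeout := time.Duration(cr.Spec.Timeout.ClusterProvisioning) * time.Minute
	return time.Since(details.ClusterProvisionStartedAt.Time) > timeout
}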
1 change: 1 addition & 0 deletions api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default.

5 changes: 5 additions & 0 deletions bundle/manifests/o2ims.oran.openshift.io_clusterrequests.yaml

Some generated files are not rendered by default.

5 changes: 5 additions & 0 deletions config/crd/bases/o2ims.oran.openshift.io_clusterrequests.yaml
@@ -101,6 +101,11 @@ spec:
clusterDetails:
description: ClusterDetails references to the ClusterInstance.
properties:
clusterProvisionStartedAt:
description: A timestamp indicating the cluster provisioning has
started
format: date-time
type: string
name:
description: Contains the name of the created ClusterInstance.
type: string
54 changes: 39 additions & 15 deletions internal/controllers/clusterrequest_controller.go
@@ -237,25 +237,27 @@ func (t *clusterRequestReconcilerTask) run(ctx context.Context) (ctrl.Result, er
// after hw plugin is ready

// Handle the cluster install with ClusterInstance
err = t.handleClusterInstallation(ctx, renderedClusterInstance)
err := t.handleClusterInstallation(ctx, renderedClusterInstance)
if err != nil {
return requeueWithError(err)
}
}

// Handle policy configuration only after the cluster provisioning
// has started.
crProvisionedCond := meta.FindStatusCondition(
t.object.Status.Conditions,
string(utils.CRconditionTypes.ClusterProvisioned))
if crProvisionedCond != nil {
// has started, and has not failed or timed out (completed, in-progress or unknown)
if utils.IsClusterProvisionPresent(t.object) &&
!utils.IsClusterProvisionTimedOutOrFailed(t.object) {

// Handle configuration through policies.
requeue, err := t.handleClusterPolicyConfiguration(ctx)
if err != nil {
return requeueWithError(err)
}
if requeue {
return requeueWithMediumInterval(), nil

// Requeue if cluster provisioning is not completed (in-progress or unknown)
// or there are enforce policies that are not Compliant
if !utils.IsClusterProvisionCompleted(t.object) || requeue {
return requeueWithLongInterval(), nil
}
}

@@ -291,17 +293,18 @@ func (t *clusterRequestReconcilerTask) checkClusterDeployConfigState(ctx context
}
}

// Check the policy configuration status
crProvisionedCond := meta.FindStatusCondition(
t.object.Status.Conditions,
string(utils.CRconditionTypes.ClusterProvisioned))
if crProvisionedCond != nil {
// Check the policy configuration status only after the cluster provisioning
// has started, and has not failed or timed out
if utils.IsClusterProvisionPresent(t.object) &&
!utils.IsClusterProvisionTimedOutOrFailed(t.object) {
requeue, err := t.handleClusterPolicyConfiguration(ctx)
if err != nil {
return requeueWithError(err)
}
if requeue {
return requeueWithMediumInterval(), nil
// Requeue if Cluster Provisioned is not completed (in-progress or unknown)
// or there are enforce policies that are not Compliant
if !utils.IsClusterProvisionCompleted(t.object) || requeue {
return requeueWithLongInterval(), nil
}
}
return doNotRequeue(), nil
@@ -1192,6 +1195,27 @@ func (t *clusterRequestReconcilerTask) updateClusterProvisionStatus(ci *siteconf
ciProvisionedCondition.Message,
)
}

if utils.IsClusterProvisionPresent(t.object) {
// Set the start timestamp if it's not already set
if t.object.Status.ClusterDetails.ClusterProvisionStartedAt.IsZero() {
t.object.Status.ClusterDetails.ClusterProvisionStartedAt = metav1.Now()
}

// If it's not failed or completed, check if it has timed out
if !utils.IsClusterProvisionCompletedOrFailed(t.object) {
if time.Since(t.object.Status.ClusterDetails.ClusterProvisionStartedAt.Time) >
time.Duration(t.object.Spec.Timeout.ClusterProvisioning)*time.Minute {
// timed out
utils.SetStatusCondition(&t.object.Status.Conditions,
utils.CRconditionTypes.ClusterProvisioned,
utils.CRconditionReasons.TimedOut,
metav1.ConditionFalse,
"Cluster provisioning timed out",
)
}
}
}
}

// createOrUpdateClusterResources creates/updates all the resources needed for cluster deployment
123 changes: 119 additions & 4 deletions internal/controllers/clusterrequest_controller_test.go
@@ -593,6 +593,10 @@ var _ = Describe("ClusterRequestReconcile", func() {
Raw: []byte(testPolicyTemplateInput),
},
},
Timeout: oranv1alpha1.Timeout{
ClusterProvisioning: 1,
Configuration: 1,
},
},
}

@@ -987,7 +991,7 @@ var _ = Describe("ClusterRequestReconcile", func() {
result, err := reconciler.Reconcile(ctx, req)
// Verify the reconciliation result
Expect(err).ToNot(HaveOccurred())
Expect(result).To(Equal(requeueWithMediumInterval()))
Expect(result).To(Equal(requeueWithLongInterval()))

reconciledCR := &oranv1alpha1.ClusterRequest{}
Expect(c.Get(ctx, req.NamespacedName, reconciledCR)).To(Succeed())
@@ -1012,6 +1016,55 @@
Reason: string(utils.CRconditionReasons.ClusterNotReady),
Message: "The Cluster is not yet ready",
})

// Verify the start timestamp has been set for ClusterInstance
Expect(reconciledCR.Status.ClusterDetails.ClusterProvisionStartedAt).ToNot(BeZero())
// Verify the nonCompliantAt timestamp is not set; even though a NonCompliant enforce policy exists,
// the Cluster is not yet ready
Expect(reconciledCR.Status.ClusterDetails.NonCompliantAt).To(BeZero())
})

It("Verify status conditions when ClusterInstance provision has timedout", func() {
// Initial reconciliation to populate ClusterProvisionStartedAt timestamp
_, err := reconciler.Reconcile(ctx, req)
Expect(err).ToNot(HaveOccurred())

cr := &oranv1alpha1.ClusterRequest{}
Expect(c.Get(ctx, req.NamespacedName, cr)).To(Succeed())
// Verify the start timestamp has been set for ClusterInstance
Expect(cr.Status.ClusterDetails.ClusterProvisionStartedAt).ToNot(BeZero())
// Verify the nonCompliantAt timestamp is not set; even though a NonCompliant enforce policy exists,
// the Cluster is not yet ready
Expect(cr.Status.ClusterDetails.NonCompliantAt).To(BeZero())

// Patch ClusterProvisionStartedAt timestamp to mock timeout
cr.Status.ClusterDetails = &oranv1alpha1.ClusterDetails{Name: "cluster-1"}
cr.Status.ClusterDetails.ClusterProvisionStartedAt.Time = metav1.Now().Add(-2 * time.Minute)
Expect(c.Status().Update(ctx, cr)).To(Succeed())

// Start reconciliation again
result, err := reconciler.Reconcile(ctx, req)
// Verify the reconciliation result
Expect(err).ToNot(HaveOccurred())
Expect(result).To(Equal(doNotRequeue())) // stop reconciliation on ClusterProvision timeout

reconciledCR := &oranv1alpha1.ClusterRequest{}
Expect(c.Get(ctx, req.NamespacedName, reconciledCR)).To(Succeed())
conditions := reconciledCR.Status.Conditions

// Verify the ClusterRequest's status conditions
Expect(len(conditions)).To(Equal(8))
verifyStatusCondition(conditions[6], metav1.Condition{
Type: string(utils.CRconditionTypes.ClusterProvisioned),
Status: metav1.ConditionFalse,
Reason: string(utils.CRconditionReasons.TimedOut),
})
verifyStatusCondition(conditions[7], metav1.Condition{
Type: string(utils.CRconditionTypes.ConfigurationApplied),
Status: metav1.ConditionFalse,
Reason: string(utils.CRconditionReasons.ClusterNotReady),
Message: "The Cluster is not yet ready",
})
})

It("Verify status conditions when ClusterInstance provision has completed, ManagedCluster becomes ready and configuration policy becomes compliant", func() {
Expand Down Expand Up @@ -1053,6 +1106,11 @@ var _ = Describe("ClusterRequestReconcile", func() {
Reason: string(utils.CRconditionReasons.Completed),
Message: "The configuration is up to date",
})

// Verify the start timestamp is not cleared even though Cluster provision has completed
Expect(reconciledCR.Status.ClusterDetails.ClusterProvisionStartedAt).ToNot(BeZero())
// Verify the nonCompliantAt timestamp is not set since enforce policy is compliant
Expect(reconciledCR.Status.ClusterDetails.NonCompliantAt).To(BeZero())
})

It("Verify status conditions when configuration change causes ClusterRequest validation to fail but ClusterInstance becomes provisioned", func() {
@@ -1076,7 +1134,7 @@
result, err := reconciler.Reconcile(ctx, req)
// Verify the reconciliation result
Expect(err).ToNot(HaveOccurred())
Expect(result).To(Equal(requeueWithMediumInterval()))
Expect(result).To(Equal(requeueWithLongInterval()))

reconciledCR := &oranv1alpha1.ClusterRequest{}
Expect(c.Get(ctx, req.NamespacedName, reconciledCR)).To(Succeed())
@@ -1102,6 +1160,61 @@
Reason: string(utils.CRconditionReasons.ClusterNotReady),
Message: "The Cluster is not yet ready",
})

// Verify the start timestamp is not cleared even though Cluster provision has completed
Expect(reconciledCR.Status.ClusterDetails.ClusterProvisionStartedAt).ToNot(BeZero())
// Verify the nonCompliantAt timestamp is not set; even though a NonCompliant enforce policy exists,
// the Cluster is not yet ready
Expect(reconciledCR.Status.ClusterDetails.NonCompliantAt).To(BeZero())
})

It("Verify status conditions when configuration change causes ClusterRequest validation to fail but ClusterProvision becomes timeout", func() {
// Initial reconciliation to populate initial status
_, err := reconciler.Reconcile(ctx, req)
Expect(err).ToNot(HaveOccurred())

cr := &oranv1alpha1.ClusterRequest{}
Expect(c.Get(ctx, req.NamespacedName, cr)).To(Succeed())
// Verify the start timestamp has been set for ClusterInstance
Expect(cr.Status.ClusterDetails.ClusterProvisionStartedAt).ToNot(BeZero())
// Patch ClusterProvisionStartedAt timestamp to mock timeout
cr.Status.ClusterDetails = &oranv1alpha1.ClusterDetails{Name: "cluster-1"}
cr.Status.ClusterDetails.ClusterProvisionStartedAt.Time = metav1.Now().Add(-2 * time.Minute)
Expect(c.Status().Update(ctx, cr)).To(Succeed())

// Remove required field hostname to fail ClusterRequest validation
removeRequiredFieldFromClusterInstanceInput(ctx, c, crName, ctNamespace)

// Start reconciliation
result, err := reconciler.Reconcile(ctx, req)
// Verify the reconciliation result
Expect(err).ToNot(HaveOccurred())
Expect(result).To(Equal(doNotRequeue())) // stop reconciliation on ClusterProvision timeout

reconciledCR := &oranv1alpha1.ClusterRequest{}
Expect(c.Get(ctx, req.NamespacedName, reconciledCR)).To(Succeed())
conditions := reconciledCR.Status.Conditions

// Verify that the Validated condition fails but the ClusterProvisioned condition
// is still updated to reflect the timeout status
Expect(len(conditions)).To(Equal(8))
verifyStatusCondition(conditions[0], metav1.Condition{
Type: string(utils.CRconditionTypes.Validated),
Status: metav1.ConditionFalse,
Reason: string(utils.CRconditionReasons.Failed),
Message: "nodes.0: hostName is required",
})
verifyStatusCondition(conditions[6], metav1.Condition{
Type: string(utils.CRconditionTypes.ClusterProvisioned),
Status: metav1.ConditionFalse,
Reason: string(utils.CRconditionReasons.TimedOut),
})
verifyStatusCondition(conditions[7], metav1.Condition{
Type: string(utils.CRconditionTypes.ConfigurationApplied),
Status: metav1.ConditionFalse,
Reason: string(utils.CRconditionReasons.ClusterNotReady),
Message: "The Cluster is not yet ready",
})
})

It("Verify status conditions when configuration change causes ClusterInstance rendering to fail but configuration policy becomes compliant", func() {
@@ -2277,7 +2390,8 @@ defaultHugepagesSize: "1G"`,
},
},
Timeout: oranv1alpha1.Timeout{
Configuration: 1,
ClusterProvisioning: 1,
Configuration: 1,
},
},
Status: oranv1alpha1.ClusterRequestStatus{
@@ -2460,6 +2574,7 @@ defaultHugepagesSize: "1G"`,
metav1.ConditionFalse,
"",
)
CRTask.object.Status.ClusterDetails.ClusterProvisionStartedAt = metav1.Now()
Expect(c.Status().Update(ctx, CRTask.object)).To(Succeed())

// Call the Reconciliation function.
@@ -2482,7 +2597,7 @@
Expect(err).ToNot(HaveOccurred())
// Expect to not requeue on valid cluster request.
Expect(result.Requeue).To(BeFalse())
Expect(result.RequeueAfter).To(Equal(1 * time.Minute)) // Medium interval
Expect(result.RequeueAfter).To(Equal(5 * time.Minute)) // Long interval
Expect(CRTask.object.Status.ClusterDetails.NonCompliantAt).ToNot(BeZero())
Expect(CRTask.object.Status.Policies).ToNot(BeEmpty())
})
44 changes: 44 additions & 0 deletions internal/controllers/utils/conditions.go
@@ -1,6 +1,7 @@
package utils

import (
oranv1alpha1 "github.com/openshift-kni/oran-o2ims/api/v1alpha1"
"k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
@@ -84,3 +85,46 @@ func SetStatusCondition(existingConditions *[]metav1.Condition, conditionType Co
},
)
}

// IsClusterProvisionPresent checks if the cluster provision condition is present
func IsClusterProvisionPresent(cr *oranv1alpha1.ClusterRequest) bool {
condition := meta.FindStatusCondition(cr.Status.Conditions, (string(CRconditionTypes.ClusterProvisioned)))
return condition != nil
}

// IsClusterProvisionCompleted checks if the cluster provision condition status is completed
func IsClusterProvisionCompleted(cr *oranv1alpha1.ClusterRequest) bool {
condition := meta.FindStatusCondition(cr.Status.Conditions, (string(CRconditionTypes.ClusterProvisioned)))
if condition != nil {
if condition.Status == metav1.ConditionTrue && condition.Reason == string(CRconditionReasons.Completed) {
return true
}
}
return false
}

// IsClusterProvisionTimedOutOrFailed checks if the cluster provision condition status is timed out or failed
func IsClusterProvisionTimedOutOrFailed(cr *oranv1alpha1.ClusterRequest) bool {
condition := meta.FindStatusCondition(cr.Status.Conditions, (string(CRconditionTypes.ClusterProvisioned)))
if condition != nil {
if condition.Status == metav1.ConditionFalse &&
(condition.Reason == string(CRconditionReasons.Failed) ||
condition.Reason == string(CRconditionReasons.TimedOut)) {
return true
}
}
return false
}

// IsClusterProvisionCompletedOrFailed checks if the cluster provision condition status is completed or failed
func IsClusterProvisionCompletedOrFailed(cr *oranv1alpha1.ClusterRequest) bool {
condition := meta.FindStatusCondition(cr.Status.Conditions, (string(CRconditionTypes.ClusterProvisioned)))
if condition != nil {
if condition.Status == metav1.ConditionTrue ||
(condition.Status == metav1.ConditionFalse &&
condition.Reason == string(CRconditionReasons.Failed)) {
return true
}
}
return false
}
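For illustration, a hypothetical snippet (not part of this commit) showing how the new helpers classify a ClusterProvisioned condition whose reason is TimedOut; note that IsClusterProvisionCompletedOrFailed deliberately excludes the TimedOut reason.

package utils

import (
	"fmt"

	oranv1alpha1 "github.com/openshift-kni/oran-o2ims/api/v1alpha1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// exampleClusterProvisionHelpers is an illustrative sketch, not part of this
// commit: it sets a timed-out ClusterProvisioned condition and prints how each
// helper classifies it.
func exampleClusterProvisionHelpers() {
	cr := &oranv1alpha1.ClusterRequest{}
	SetStatusCondition(&cr.Status.Conditions,
		CRconditionTypes.ClusterProvisioned,
		CRconditionReasons.TimedOut,
		metav1.ConditionFalse,
		"Cluster provisioning timed out",
	)

	fmt.Println(IsClusterProvisionPresent(cr))           // true: the condition exists
	fmt.Println(IsClusterProvisionCompleted(cr))         // false: status is not True/Completed
	fmt.Println(IsClusterProvisionTimedOutOrFailed(cr))  // true: False with reason TimedOut
	fmt.Println(IsClusterProvisionCompletedOrFailed(cr)) // false: TimedOut is neither Completed nor Failed
}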
