CNF-14425: Add timeout handling for hardware provisioning
This update introduces timeout handling for the node pool provisioning status.
It tracks the start time of the hardware provisioning process, allowing
the system to detect when provisioning exceeds the configured timeout. If a
timeout occurs, it is reflected in the HardwareProvisioned condition of the
cluster request, as sketched below.
Additionally, this update improves error handling by requeuing on errors and
stopping requeues once provisioning has failed.
A sample hardware template has also been included for reference.

Signed-off-by: Tao Liu <[email protected]>
tliu2021 committed Sep 26, 2024
1 parent e3d81b2 commit 15e6985
Showing 9 changed files with 212 additions and 83 deletions.
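
Before the per-file diffs, here is a minimal, self-contained Go sketch of the timeout check this commit introduces, as referenced in the commit message above. The field name HardwareProvisioningCheckStart, the reason TimedOut, and the message "Hardware provisioning timed out" follow the diff below; the standalone NodePoolRef struct, the checkProvisioningTimeout helper, the timeoutMinutes parameter, and the "InProgress" reason are simplified illustrations, not the controller's actual types (which use metav1.Time and the hwv1alpha1/utils packages).

package main

import (
	"fmt"
	"time"
)

// NodePoolRef is a simplified stand-in for the CRD field added in this commit.
type NodePoolRef struct {
	Name                           string
	Namespace                      string
	HardwareProvisioningCheckStart time.Time
}

// checkProvisioningTimeout mirrors the core of the new timeout check in
// updateHardwareProvisioningStatus: stamp the first status-check time, then on
// every later check compare the elapsed time against the configured timeout
// (expressed in minutes, as in the controller diff).
func checkProvisioningTimeout(ref *NodePoolRef, timeoutMinutes int, now time.Time) (timedOut bool, reason, message string) {
	if ref.HardwareProvisioningCheckStart.IsZero() {
		// First reconcile that observes the NodePool: record the start time.
		ref.HardwareProvisioningCheckStart = now
	}

	elapsed := now.Sub(ref.HardwareProvisioningCheckStart)
	if elapsed >= time.Duration(timeoutMinutes)*time.Minute {
		// Surfaced on the ClusterRequest as the HardwareProvisioned condition
		// with reason TimedOut; the reconciler then stops requeuing.
		return true, "TimedOut", "Hardware provisioning timed out"
	}
	return false, "InProgress", fmt.Sprintf("hardware provisioning in progress, %s elapsed", elapsed.Round(time.Second))
}

func main() {
	ref := &NodePoolRef{Name: "cluster-1-pool", Namespace: "oran-o2ims"}
	// Pretend the first check happened 95 minutes ago against a 90-minute timeout.
	ref.HardwareProvisioningCheckStart = time.Now().Add(-95 * time.Minute)

	timedOut, reason, msg := checkProvisioningTimeout(ref, 90, time.Now())
	fmt.Println(timedOut, reason, msg) // true TimedOut Hardware provisioning timed out
}

In the real controller this logic runs inside updateHardwareProvisioningStatus and is skipped once the condition is already True or the reason is Failed, as shown in the controller diff further down.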
1 change: 1 addition & 0 deletions api/hardwaremanagement/v1alpha1/conditions.go
@@ -41,4 +41,5 @@ const (
Unprovisioned ConditionReason = "Unprovisioned"
Failed ConditionReason = "Failed"
NotInitialized ConditionReason = "NotInitialized"
TimedOut ConditionReason = "TimedOut"
)
2 changes: 2 additions & 0 deletions api/v1alpha1/clusterrequest_types.go
@@ -66,6 +66,8 @@ type NodePoolRef struct {
Name string `json:"name,omitempty"`
// Contains the namespace of the created NodePool.
Namespace string `json:"namespace,omitempty"`
// Represents the timestamp of the first status check for hardware provisioning
HardwareProvisioningCheckStart metav1.Time `json:"hardwareProvisioningCheckStart,omitempty"`
}

type ClusterDetails struct {
3 changes: 2 additions & 1 deletion api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default.

5 changes: 5 additions & 0 deletions bundle/manifests/o2ims.oran.openshift.io_clusterrequests.yaml

Some generated files are not rendered by default.

5 changes: 5 additions & 0 deletions config/crd/bases/o2ims.oran.openshift.io_clusterrequests.yaml
@@ -185,6 +185,11 @@ spec:
nodePoolRef:
  description: NodePoolRef references to the NodePool.
  properties:
    hardwareProvisioningCheckStart:
      description: Represents the timestamp of the first status check
        for hardware provisioning
      format: date-time
      type: string
    name:
      description: Contains the name of the created NodePool.
      type: string
13 changes: 13 additions & 0 deletions config/samples/v1alpha1_hardwaretemplate.yaml
@@ -0,0 +1,13 @@
apiVersion: v1
kind: ConfigMap
metadata:
  name: dell-intel-cu-template-configmap-v1
  namespace: oran-o2ims
data:
  hwMgrId: dell-hwmgr
  bootInterfaceLabel: bootable-interface
  node-pools-data: |
    - name: master
      hwProfile: profile-spr-single-processor-64G
    - name: worker
      hwProfile: profile-spr-dual-processor-128G
169 changes: 96 additions & 73 deletions internal/controllers/clusterrequest_controller.go
@@ -213,7 +213,15 @@ func (t *clusterRequestReconcilerTask) run(ctx context.Context) (ctrl.Result, er
}

// wait for the NodePool to be provisioned and update BMC details in ClusterInstance
if !t.waitForHardwareData(ctx, renderedClusterInstance, renderedNodePool) {
provisioned, timedOutOrFailed, err := t.waitForHardwareData(ctx, renderedClusterInstance, renderedNodePool)
if err != nil {
return requeueWithError(err)
}
if timedOutOrFailed {
// Timeout occurred or failed, stop requeuing
return doNotRequeue(), nil
}
if !provisioned {
t.logger.InfoContext(
ctx,
fmt.Sprintf(
@@ -231,6 +239,7 @@
hwProvisionedCond := meta.FindStatusCondition(
t.object.Status.Conditions,
string(utils.CRconditionTypes.HardwareProvisioned))

if hwProvisionedCond != nil {
// TODO: check hwProvisionedCond.Status == metav1.ConditionTrue
// after hw plugin is ready
@@ -273,7 +282,10 @@ func (t *clusterRequestReconcilerTask) checkClusterDeployConfigState(ctx context
nodePool := &hwv1alpha1.NodePool{}
nodePool.SetName(t.object.Status.NodePoolRef.Name)
nodePool.SetNamespace(t.object.Status.NodePoolRef.Namespace)
t.checkNodePoolProvisionStatus(ctx, nodePool)
_, _, err := t.checkNodePoolProvisionStatus(ctx, nodePool)
if err != nil {
return requeueWithError(err)
}

hwProvisionedCond := meta.FindStatusCondition(
t.object.Status.Conditions,
@@ -1659,45 +1671,30 @@ func (r *ClusterRequestReconciler) handleFinalizer(

// checkNodePoolProvisionStatus checks for the NodePool status to be in the provisioned state.
func (t *clusterRequestReconcilerTask) checkNodePoolProvisionStatus(ctx context.Context,
nodePool *hwv1alpha1.NodePool) bool {
nodePool *hwv1alpha1.NodePool) (bool, bool, error) {

// Get the generated NodePool and its status.
exists, err := utils.DoesK8SResourceExist(ctx, t.client, nodePool.GetName(),
nodePool.GetNamespace(), nodePool)

if err != nil || !exists {
t.logger.ErrorContext(
ctx,
"Failed to get the NodePools",
slog.String("name", nodePool.GetName()),
slog.String("namespace", nodePool.GetNamespace()),
)
return false
if err != nil {
return false, false, fmt.Errorf("failed to get node pool; %w", err)
}
if !exists {
return false, false, fmt.Errorf("node pool does not exist")
}

// Update the Cluster Request Status with status from the NodePool object.
err = t.updateHardwareProvisioningStatus(ctx, nodePool)
provisioned, timedOutOrFailed, err := t.updateHardwareProvisioningStatus(ctx, nodePool)
if err != nil {
t.logger.ErrorContext(
ctx,
"Failed to update the NodePool status for ClusterRequest",
slog.String("name", t.object.Name),
)
}
// Check if provisioning is completed
provisionedCondition := meta.FindStatusCondition(nodePool.Status.Conditions, string(hwv1alpha1.Provisioned))
if provisionedCondition != nil && provisionedCondition.Status == metav1.ConditionTrue {
t.logger.InfoContext(
ctx,
fmt.Sprintf(
"NodePool %s in the namespace %s is provisioned",
nodePool.GetName(),
nodePool.GetNamespace(),
),
)
return true
}
return false

return provisioned, timedOutOrFailed, err
}

// updateClusterInstance updates the given ClusterInstance object based on the provisioned nodePool.
@@ -1723,10 +1720,10 @@ func (t *clusterRequestReconcilerTask) updateClusterInstance(ctx context.Context
// waitForHardwareData waits for the NodePool to be provisioned and update BMC details
// and bootMacAddress in ClusterInstance.
func (t *clusterRequestReconcilerTask) waitForHardwareData(ctx context.Context,
clusterInstance *siteconfig.ClusterInstance, nodePool *hwv1alpha1.NodePool) bool {
clusterInstance *siteconfig.ClusterInstance, nodePool *hwv1alpha1.NodePool) (bool, bool, error) {

provisioned := t.checkNodePoolProvisionStatus(ctx, nodePool)
if provisioned {
provisioned, timedOutOrFailed, err := t.checkNodePoolProvisionStatus(ctx, nodePool)
if provisioned && err == nil {
t.logger.InfoContext(
ctx,
fmt.Sprintf(
@@ -1735,9 +1732,11 @@ func (t *clusterRequestReconcilerTask) waitForHardwareData(ctx context.Context,
nodePool.GetNamespace(),
),
)
return t.updateClusterInstance(ctx, clusterInstance, nodePool)
if !t.updateClusterInstance(ctx, clusterInstance, nodePool) {
err = fmt.Errorf("failed to update the rendered cluster instance")
}
}
return false
return provisioned, timedOutOrFailed, err
}

// collectNodeDetails collects BMC and node interfaces details
@@ -1902,59 +1901,83 @@ func (t *clusterRequestReconcilerTask) updateNodeStatusWithHostname(ctx context.
return true
}

// updateHardwareProvisioningStatus updates the status for the created ClusterInstance
// updateHardwareProvisioningStatus updates the status for the ClusterRequest
func (t *clusterRequestReconcilerTask) updateHardwareProvisioningStatus(
ctx context.Context, nodePool *hwv1alpha1.NodePool) error {
ctx context.Context, nodePool *hwv1alpha1.NodePool) (bool, bool, error) {
var status metav1.ConditionStatus
var reason string
var message string
var err error
timedOutOrFailed := false // Default to false unless explicitly needed

if len(nodePool.Status.Conditions) > 0 {
provisionedCondition := meta.FindStatusCondition(
nodePool.Status.Conditions, string(hwv1alpha1.Provisioned))
if provisionedCondition != nil {
utils.SetStatusCondition(&t.object.Status.Conditions,
utils.CRconditionTypes.HardwareProvisioned,
utils.ConditionReason(provisionedCondition.Reason),
provisionedCondition.Status,
provisionedCondition.Message)
} else {
utils.SetStatusCondition(&t.object.Status.Conditions,
utils.CRconditionTypes.HardwareProvisioned,
utils.CRconditionReasons.Unknown,
metav1.ConditionUnknown,
"Unknown state of hardware provisioning",
)
}
if t.object.Status.NodePoolRef == nil {
t.object.Status.NodePoolRef = &oranv1alpha1.NodePoolRef{}
}

if err := utils.UpdateK8sCRStatus(ctx, t.client, t.object); err != nil {
t.logger.ErrorContext(
t.object.Status.NodePoolRef.Name = nodePool.GetName()
t.object.Status.NodePoolRef.Namespace = nodePool.GetNamespace()
if t.object.Status.NodePoolRef.HardwareProvisioningCheckStart.IsZero() {
t.object.Status.NodePoolRef.HardwareProvisioningCheckStart = metav1.Now()
}

provisionedCondition := meta.FindStatusCondition(
nodePool.Status.Conditions, string(hwv1alpha1.Provisioned))
if provisionedCondition != nil {
status = provisionedCondition.Status
reason = provisionedCondition.Reason
message = provisionedCondition.Message

if provisionedCondition.Status == metav1.ConditionFalse && reason == string(hwv1alpha1.Failed) {
t.logger.InfoContext(
ctx,
"Failed to update the HardwareProvisioning status for ClusterRequest",
slog.String("name", t.object.Name),
slog.Any("specificError", err),
fmt.Sprintf(
"NodePool %s in the namespace %s provisioning failed",
nodePool.GetName(),
nodePool.GetNamespace(),
),
)
return fmt.Errorf("failed to update HardwareProvisioning status: %w", err)
// Ensure a consistent message for the cluster request, regardless of which plugin is used.
message = "Hardware provisioning failed"
timedOutOrFailed = true
}
} else if nodePool.ObjectMeta.Namespace == utils.TempDellPluginNamespace || nodePool.ObjectMeta.Namespace == utils.UnitTestHwmgrNamespace {
// TODO: For test purposes only. Code to be removed once hwmgr plugin(s) are fully utilized
meta.SetStatusCondition(
&nodePool.Status.Conditions,
metav1.Condition{
Type: string(hwv1alpha1.Unknown),
Status: metav1.ConditionUnknown,
Reason: string(hwv1alpha1.NotInitialized),
},
)
if err := utils.UpdateK8sCRStatus(ctx, t.client, nodePool); err != nil {
t.logger.ErrorContext(
} else {
// No provisioning condition found, set the status to unknown.
status = metav1.ConditionUnknown
reason = string(utils.CRconditionReasons.Unknown)
message = "Unknown state of hardware provisioning"
}

// Check for timeout if not already failed or provisioned
if status != metav1.ConditionTrue && reason != string(hwv1alpha1.Failed) {
elapsedTime := time.Since(t.object.Status.NodePoolRef.HardwareProvisioningCheckStart.Time)
if elapsedTime >= time.Duration(t.object.Spec.Timeout.HardwareProvisioning)*time.Minute {
t.logger.InfoContext(
ctx,
"Failed to update the NodePool status",
slog.String("name", nodePool.Name),
slog.Any("specificError", err),
fmt.Sprintf(
"NodePool %s in the namespace %s provisioning timed out",
nodePool.GetName(),
nodePool.GetNamespace(),
),
)
return fmt.Errorf("failed to update NodePool status: %w", err)
reason = string(hwv1alpha1.TimedOut)
message = "Hardware provisioning timed out"
status = metav1.ConditionFalse
timedOutOrFailed = true
}
}

// Set the status condition for hardware provisioning.
utils.SetStatusCondition(&t.object.Status.Conditions,
utils.CRconditionTypes.HardwareProvisioned,
utils.ConditionReason(reason),
status,
message)

// Update the CR status for the ClusterRequest.
if err = utils.UpdateK8sCRStatus(ctx, t.client, t.object); err != nil {
err = fmt.Errorf("failed to update HardwareProvisioning status: %w", err)
}
return nil
return status == metav1.ConditionTrue, timedOutOrFailed, err
}

// findClusterInstanceForClusterRequest maps the ClusterInstance created by a