Skip to content

Commit

Permalink
Merge pull request #92 from k-keiichi-rh/ecoproject-1326
Browse files Browse the repository at this point in the history
Enable out-of-service taint in FAR
  • Loading branch information
mshitrit authored Apr 19, 2024
2 parents ec0c8f6 + 45d1fb9 commit 7f4a492
Show file tree
Hide file tree
Showing 20 changed files with 625 additions and 102 deletions.
13 changes: 13 additions & 0 deletions api/v1alpha1/fenceagentsremediation_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,14 @@ const (
FenceAgentSucceeded ConditionsChangeReason = "FenceAgentSucceeded"
// RemediationFinishedSuccessfully - The unhealthy node was fully remediated/fenced (it was tainted, fenced by FA and all of its resources have been deleted)
RemediationFinishedSuccessfully ConditionsChangeReason = "RemediationFinishedSuccessfully"

ResourceDeletionRemediationStrategy = RemediationStrategyType("ResourceDeletion")
OutOfServiceTaintRemediationStrategy = RemediationStrategyType("OutOfServiceTaint")
)

// ParameterName is the name of a fencing-agent parameter; it is the outer key of the NodeParameters map.
type ParameterName string
// NodeName is the name of a node; it is the inner key of the NodeParameters map, selecting the node-specific value.
type NodeName string
// RemediationStrategyType names the remediation method applied to an unhealthy node.
// The values declared in this file are ResourceDeletionRemediationStrategy and OutOfServiceTaintRemediationStrategy.
type RemediationStrategyType string

// FenceAgentsRemediationSpec defines the desired state of FenceAgentsRemediation
type FenceAgentsRemediationSpec struct {
Expand Down Expand Up @@ -84,6 +88,15 @@ type FenceAgentsRemediationSpec struct {
// NodeParameters are passed to the fencing agent according to the node that is fenced, since they are node specific
//+operator-sdk:csv:customresourcedefinitions:type=spec
NodeParameters map[ParameterName]map[NodeName]string `json:"nodeparameters,omitempty"`

// RemediationStrategy is the remediation method for unhealthy nodes.
// Currently, it could be either "OutOfServiceTaint" or "ResourceDeletion".
// ResourceDeletion will iterate over all pods related to the unhealthy node and delete them.
// OutOfServiceTaint will add the out-of-service taint which is a new well-known taint "node.kubernetes.io/out-of-service"
// that enables automatic deletion of pv-attached pods on failed nodes, "out-of-service" taint is only supported on clusters with k8s version 1.26+ or OCP/OKD version 4.13+.
// +kubebuilder:default:="ResourceDeletion"
// +kubebuilder:validation:Enum=ResourceDeletion;OutOfServiceTaint
RemediationStrategy RemediationStrategyType `json:"remediationStrategy,omitempty"`
}

// FenceAgentsRemediationStatus defines the observed state of FenceAgentsRemediation
Expand Down
24 changes: 22 additions & 2 deletions api/v1alpha1/fenceagentsremediation_webhook.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ var (
webhookFARLog = logf.Log.WithName("fenceagentsremediation-resource")
// verify agent existence with os.Stat function
agentValidator = validation.NewAgentValidator()
// isOutOfServiceTaintSupported will be set to true in case out-of-service taint is supported (k8s 1.26 or higher)
isOutOfServiceTaintSupported bool
)

func (r *FenceAgentsRemediation) SetupWebhookWithManager(mgr ctrl.Manager) error {
Expand All @@ -53,13 +55,13 @@ var _ webhook.Validator = &FenceAgentsRemediation{}
// ValidateCreate implements webhook.Validator so a webhook will be registered for the type.
// It logs the request and validates the spec of the object being created via validateFAR,
// which checks the agent name and then the remediation strategy.
func (far *FenceAgentsRemediation) ValidateCreate() (admission.Warnings, error) {
	webhookFARLog.Info("validate create", "name", far.Name)
	// Validate the whole spec; an agent-name-only check would let an unsupported strategy through.
	return validateFAR(&far.Spec)
}

// ValidateUpdate implements webhook.Validator so a webhook will be registered for the type.
// It logs the request and validates the updated spec via validateFAR, which checks the
// agent name and then the remediation strategy. The previous object (old) is not consulted.
func (far *FenceAgentsRemediation) ValidateUpdate(old runtime.Object) (admission.Warnings, error) {
	webhookFARLog.Info("validate update", "name", far.Name)
	// Validate the whole spec; an agent-name-only check would let an unsupported strategy through.
	return validateFAR(&far.Spec)
}

// ValidateDelete implements webhook.Validator so a webhook will be registered for the type
Expand All @@ -68,6 +70,17 @@ func (far *FenceAgentsRemediation) ValidateDelete() (admission.Warnings, error)
return nil, nil
}

// validateFAR validates a FenceAgentsRemediationSpec in two steps: the fence agent
// name is checked first, and only if that passes is the remediation strategy checked.
// An agent-name failure is returned with nil warnings; otherwise the result of
// validateStrategy is returned unchanged.
func validateFAR(farSpec *FenceAgentsRemediationSpec) (admission.Warnings, error) {
	_, agentErr := validateAgentName(farSpec.Agent)
	if agentErr != nil {
		return nil, agentErr
	}

	strategy := farSpec.RemediationStrategy
	return validateStrategy(strategy)
}

// InitOutOfServiceTaintSupportedFlag sets the package-level isOutOfServiceTaintSupported flag.
// validateStrategy reads that flag to decide whether the OutOfServiceTaint remediation strategy is accepted.
// NOTE(review): presumably called once at startup after the cluster version is detected — confirm against the caller.
func InitOutOfServiceTaintSupportedFlag(outOfServiceTaintSupported bool) {
isOutOfServiceTaintSupported = outOfServiceTaintSupported
}

func validateAgentName(agent string) (admission.Warnings, error) {
exists, err := agentValidator.ValidateAgentName(agent)
if err != nil {
Expand All @@ -78,3 +91,10 @@ func validateAgentName(agent string) (admission.Warnings, error) {
}
return nil, nil
}

// validateStrategy checks that the requested remediation strategy can be used.
// Every strategy is accepted except OutOfServiceTaint while the package-level
// isOutOfServiceTaintSupported flag is false; in that case an error is returned.
// Warnings are always nil.
func validateStrategy(farRemStrategy RemediationStrategyType) (admission.Warnings, error) {
	// Accept early: either the taint is supported, or a different strategy was requested.
	if isOutOfServiceTaintSupported || farRemStrategy != OutOfServiceTaintRemediationStrategy {
		return nil, nil
	}

	unsupportedErr := fmt.Errorf("%s remediation strategy is not supported at kubernetes version lower than 1.26, please use a different remediation strategy", OutOfServiceTaintRemediationStrategy)
	return nil, unsupportedErr
}
77 changes: 66 additions & 11 deletions api/v1alpha1/fenceagentsremediation_webhook_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,41 @@ var _ = Describe("FenceAgentsRemediation Validation", func() {
When("agent name match format and binary", func() {
It("should be accepted", func() {
far := getTestFAR(validAgentName)
_, err := far.ValidateCreate()
Expect(err).ToNot(HaveOccurred())
Expect(far.ValidateCreate()).Error().NotTo(HaveOccurred())
})
})

When("agent name was not found ", func() {
It("should be rejected", func() {
far := getTestFAR(invalidAgentName)
_, err := far.ValidateCreate()
Expect(err).To(HaveOccurred())
Expect(err.Error()).To(ContainSubstring("unsupported fence agent: %s", invalidAgentName))
Expect(far.ValidateCreate()).Error().To(MatchError(ContainSubstring("unsupported fence agent: %s", invalidAgentName)))
})
})

Context("with OutOfServiceTaint strategy", func() {
var outOfServiceStrategy *FenceAgentsRemediation

BeforeEach(func() {
orgValue := isOutOfServiceTaintSupported
DeferCleanup(func() { isOutOfServiceTaintSupported = orgValue })

outOfServiceStrategy = getFAR(validAgentName, OutOfServiceTaintRemediationStrategy)
})
When("out of service taint is supported", func() {
BeforeEach(func() {
isOutOfServiceTaintSupported = true
})
It("should be allowed", func() {
Expect(outOfServiceStrategy.ValidateCreate()).Error().NotTo(HaveOccurred())
})
})
When("out of service taint is not supported", func() {
BeforeEach(func() {
isOutOfServiceTaintSupported = false
})
It("should be denied", func() {
Expect(outOfServiceStrategy.ValidateCreate()).Error().To(MatchError(ContainSubstring(outOfServiceTaintUnsupportedMsg)))
})
})
})
})
Expand All @@ -37,8 +61,7 @@ var _ = Describe("FenceAgentsRemediation Validation", func() {
})
It("should be accepted", func() {
far := getTestFAR(validAgentName)
_, err := far.ValidateUpdate(oldFAR)
Expect(err).ToNot(HaveOccurred())
Expect(far.ValidateUpdate(oldFAR)).Error().NotTo(HaveOccurred())
})
})

Expand All @@ -48,21 +71,53 @@ var _ = Describe("FenceAgentsRemediation Validation", func() {
})
It("should be rejected", func() {
far := getTestFAR(invalidAgentName)
_, err := far.ValidateUpdate(oldFAR)
Expect(err).To(HaveOccurred())
Expect(err.Error()).To(ContainSubstring("unsupported fence agent: %s", invalidAgentName))
Expect(far.ValidateUpdate(oldFAR)).Error().To(MatchError(ContainSubstring("unsupported fence agent: %s", invalidAgentName)))
})
})

Context("with OutOfServiceTaint strategy", func() {
var outOfServiceStrategy *FenceAgentsRemediation
var resourceDeletionStrategy *FenceAgentsRemediation

BeforeEach(func() {
orgValue := isOutOfServiceTaintSupported
DeferCleanup(func() { isOutOfServiceTaintSupported = orgValue })

outOfServiceStrategy = getFAR(validAgentName, OutOfServiceTaintRemediationStrategy)
resourceDeletionStrategy = getFAR(validAgentName, ResourceDeletionRemediationStrategy)
})
When("out of service taint is supported", func() {
BeforeEach(func() {
isOutOfServiceTaintSupported = true
})
It("should be allowed", func() {
Expect(outOfServiceStrategy.ValidateUpdate(resourceDeletionStrategy)).Error().NotTo(HaveOccurred())
})
})
When("out of service taint is not supported", func() {
BeforeEach(func() {
isOutOfServiceTaintSupported = false
})
It("should be denied", func() {
Expect(outOfServiceStrategy.ValidateUpdate(resourceDeletionStrategy)).Error().To(MatchError(ContainSubstring(outOfServiceTaintUnsupportedMsg)))
})
})
})
})
})

// getTestFAR returns a FenceAgentsRemediation test fixture for the given agent name,
// built by getFAR with the ResourceDeletion remediation strategy.
func getTestFAR(agentName string) *FenceAgentsRemediation {
return getFAR(agentName, ResourceDeletionRemediationStrategy)
}

// getFAR builds a FenceAgentsRemediation test fixture named "test-<agentName>"
// whose spec carries the given fence agent name and remediation strategy.
func getFAR(agentName string, strategy RemediationStrategyType) *FenceAgentsRemediation {
	return &FenceAgentsRemediation{
		ObjectMeta: metav1.ObjectMeta{
			Name: "test-" + agentName,
		},
		Spec: FenceAgentsRemediationSpec{
			// Each field is set exactly once; a repeated key in a struct literal does not compile.
			Agent:               agentName,
			RemediationStrategy: strategy,
		},
	}
}
4 changes: 2 additions & 2 deletions api/v1alpha1/fenceagentsremediationtemplate_webhook.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,13 +62,13 @@ var _ webhook.Validator = &FenceAgentsRemediationTemplate{}
// ValidateCreate implements webhook.Validator so a webhook will be registered for the type.
// It logs the request and validates the FenceAgentsRemediation spec embedded in the
// template via validateFAR, which checks the agent name and then the remediation strategy.
func (farTemplate *FenceAgentsRemediationTemplate) ValidateCreate() (admission.Warnings, error) {
	webhookFARTemplateLog.Info("validate create", "name", farTemplate.Name)
	// Validate the whole embedded spec; an agent-name-only check would let an unsupported strategy through.
	return validateFAR(&farTemplate.Spec.Template.Spec)
}

// ValidateUpdate implements webhook.Validator so a webhook will be registered for the type.
// It logs the request and validates the updated template's embedded FenceAgentsRemediation
// spec via validateFAR, which checks the agent name and then the remediation strategy.
// The previous object (old) is not consulted.
func (farTemplate *FenceAgentsRemediationTemplate) ValidateUpdate(old runtime.Object) (admission.Warnings, error) {
	webhookFARTemplateLog.Info("validate update", "name", farTemplate.Name)
	// Validate the whole embedded spec; an agent-name-only check would let an unsupported strategy through.
	return validateFAR(&farTemplate.Spec.Template.Spec)
}

// ValidateDelete implements webhook.Validator so a webhook will be registered for the type
Expand Down
81 changes: 70 additions & 11 deletions api/v1alpha1/fenceagentsremediationtemplate_webhook_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,43 @@ var _ = Describe("FenceAgentsRemediationTemplate Validation", func() {
When("agent name match format and binary", func() {
It("should be accepted", func() {
farTemplate := getTestFARTemplate(validAgentName)
_, err := farTemplate.ValidateCreate()
Expect(err).ToNot(HaveOccurred())
Expect(farTemplate.ValidateCreate()).Error().NotTo(HaveOccurred())
})
})

When("agent name was not found ", func() {
It("should be rejected", func() {
farTemplate := getTestFARTemplate(invalidAgentName)
_, err := farTemplate.ValidateCreate()
Expect(err).To(HaveOccurred())
Expect(err.Error()).To(ContainSubstring("unsupported fence agent: %s", invalidAgentName))
Expect(farTemplate.ValidateCreate()).Error().To(MatchError(ContainSubstring("unsupported fence agent: %s", invalidAgentName)))
})
})

Context("with OutOfServiceTaint strategy", func() {
var outOfServiceStrategy *FenceAgentsRemediationTemplate

BeforeEach(func() {
orgValue := isOutOfServiceTaintSupported
DeferCleanup(func() { isOutOfServiceTaintSupported = orgValue })

outOfServiceStrategy = getFARTemplate(validAgentName, OutOfServiceTaintRemediationStrategy)
})

When("out of service taint is supported", func() {
BeforeEach(func() {
isOutOfServiceTaintSupported = true
})
It("should be allowed", func() {
Expect(outOfServiceStrategy.ValidateCreate()).Error().NotTo(HaveOccurred())
})
})

When("out of service taint is not supported", func() {
BeforeEach(func() {
isOutOfServiceTaintSupported = false
})
It("should be denied", func() {
Expect(outOfServiceStrategy.ValidateCreate()).Error().To(MatchError(ContainSubstring(outOfServiceTaintUnsupportedMsg)))
})
})
})
})
Expand All @@ -37,8 +63,7 @@ var _ = Describe("FenceAgentsRemediationTemplate Validation", func() {
})
It("should be accepted", func() {
farTemplate := getTestFARTemplate(validAgentName)
_, err := farTemplate.ValidateUpdate(oldFARTemplate)
Expect(err).ToNot(HaveOccurred())
Expect(farTemplate.ValidateUpdate(oldFARTemplate)).Error().NotTo(HaveOccurred())
})
})

Expand All @@ -48,23 +73,57 @@ var _ = Describe("FenceAgentsRemediationTemplate Validation", func() {
})
It("should be rejected", func() {
farTemplate := getTestFARTemplate(invalidAgentName)
_, err := farTemplate.ValidateUpdate(oldFARTemplate)
Expect(err).To(HaveOccurred())
Expect(err.Error()).To(ContainSubstring("unsupported fence agent: %s", invalidAgentName))
Expect(farTemplate.ValidateUpdate(oldFARTemplate)).Error().To(MatchError(ContainSubstring("unsupported fence agent: %s", invalidAgentName)))
})
})

Context("with OutOfServiceTaint strategy", func() {
var outOfServiceStrategy *FenceAgentsRemediationTemplate
var resourceDeletionStrategy *FenceAgentsRemediationTemplate

BeforeEach(func() {
orgValue := isOutOfServiceTaintSupported
DeferCleanup(func() { isOutOfServiceTaintSupported = orgValue })

outOfServiceStrategy = getFARTemplate(validAgentName, OutOfServiceTaintRemediationStrategy)
resourceDeletionStrategy = getFARTemplate(validAgentName, ResourceDeletionRemediationStrategy)
})

When("out of service taint is supported", func() {
BeforeEach(func() {
isOutOfServiceTaintSupported = true
})
It("should be allowed", func() {
Expect(outOfServiceStrategy.ValidateUpdate(resourceDeletionStrategy)).Error().NotTo(HaveOccurred())
})
})

When("out of service taint is not supported", func() {
BeforeEach(func() {
isOutOfServiceTaintSupported = false
})
It("should be denied", func() {
Expect(outOfServiceStrategy.ValidateUpdate(resourceDeletionStrategy)).Error().To(MatchError(ContainSubstring(outOfServiceTaintUnsupportedMsg)))
})
})
})
})
})

// getTestFARTemplate returns a FenceAgentsRemediationTemplate test fixture for the given
// agent name, built by getFARTemplate with the ResourceDeletion remediation strategy.
func getTestFARTemplate(agentName string) *FenceAgentsRemediationTemplate {
return getFARTemplate(agentName, ResourceDeletionRemediationStrategy)
}

func getFARTemplate(agentName string, strategy RemediationStrategyType) *FenceAgentsRemediationTemplate {
return &FenceAgentsRemediationTemplate{
ObjectMeta: metav1.ObjectMeta{
Name: "test-" + agentName + "-template",
},
Spec: FenceAgentsRemediationTemplateSpec{
Template: FenceAgentsRemediationTemplateResource{
Spec: FenceAgentsRemediationSpec{
Agent: agentName,
Agent: agentName,
RemediationStrategy: strategy,
},
},
},
Expand Down
5 changes: 3 additions & 2 deletions api/v1alpha1/webhook_suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,9 @@ import (
// http://onsi.github.io/ginkgo/ to learn more about Ginkgo.

// Shared fixtures for the webhook validation tests.
const (
	// validAgentName is the agent name used by the "should be accepted" cases.
	validAgentName = "fence_ipmilan"
	// invalidAgentName is the agent name used by the "should be rejected" cases.
	invalidAgentName = "fence_ip"
	// outOfServiceTaintUnsupportedMsg is the error text the tests expect when the
	// OutOfServiceTaint strategy is requested while the taint is not supported.
	outOfServiceTaintUnsupportedMsg = "OutOfServiceTaint remediation strategy is not supported at kubernetes version lower than 1.26, please use a different remediation strategy"
)

var (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,18 @@ spec:
description: NodeParameters are passed to the fencing agent according
to the node that is fenced, since they are node specific
type: object
remediationStrategy:
default: ResourceDeletion
description: |-
RemediationStrategy is the remediation method for unhealthy nodes.
Currently, it could be either "OutOfServiceTaint" or "ResourceDeletion".
ResourceDeletion will iterate over all pods related to the unhealthy node and delete them.
OutOfServiceTaint will add the out-of-service taint which is a new well-known taint "node.kubernetes.io/out-of-service"
that enables automatic deletion of pv-attached pods on failed nodes, "out-of-service" taint is only supported on clusters with k8s version 1.26+ or OCP/OKD version 4.13+.
enum:
- ResourceDeletion
- OutOfServiceTaint
type: string
retrycount:
default: 5
description: RetryCount is the number of times the fencing agent will
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,18 @@ spec:
according to the node that is fenced, since they are node
specific
type: object
remediationStrategy:
default: ResourceDeletion
description: |-
RemediationStrategy is the remediation method for unhealthy nodes.
Currently, it could be either "OutOfServiceTaint" or "ResourceDeletion".
ResourceDeletion will iterate over all pods related to the unhealthy node and delete them.
OutOfServiceTaint will add the out-of-service taint which is a new well-known taint "node.kubernetes.io/out-of-service"
that enables automatic deletion of pv-attached pods on failed nodes, "out-of-service" taint is only supported on clusters with k8s version 1.26+ or OCP/OKD version 4.13+.
enum:
- ResourceDeletion
- OutOfServiceTaint
type: string
retrycount:
default: 5
description: RetryCount is the number of times the fencing
Expand Down
Loading

0 comments on commit 7f4a492

Please sign in to comment.