Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Explicit Workload Deletion #63

Merged
merged 4 commits into from
Jul 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ test: test-no-verify verify-unchanged ## Generate and format code, run tests, ge
# --vv: If set, emits with maximal verbosity - includes skipped and pending tests.
test-no-verify: manifests generate go-verify fmt vet fix-imports envtest ginkgo # Generate and format code, and run tests
KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(ENVTEST_DIR)/$(ENVTEST_VERSION) -p path)" \
$(GINKGO) -r --keep-going --require-suite --vv -coverprofile cover.out ./controllers/... ./pkg/...
$(GINKGO) -r --keep-going --require-suite --vv -coverprofile cover.out ./pkg/... ./controllers/...

.PHONY: bundle-run
export BUNDLE_RUN_NAMESPACE ?= openshift-operators
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,15 +42,15 @@ Then, run `operator-sdk run bundle quay.io/medik8s/fence-agents-remediation-oper
FAR is recommended for use with NHC to create a complete solution for unhealthy nodes, since NHC detects unhealthy nodes and creates an external remediation CR, e.g., FAR's CR, for unhealthy nodes.
This automated way is preferable as it gives the responsibility for FAR CRs (creation and deletion) to NHC, even though FAR can also act as a standalone remediator, but at the expense of requiring the administrator to create and delete the CRs.

Either way a user must be familier with fence agent to be used - Knowing it's parameters and any other requirements on the cluster (e.g., fence_ipmilan needs machines that support IPMI).
Either way a user must be familiar with the fence agent to be used - knowing its parameters and any other requirements on the cluster (e.g., fence_ipmilan needs machines that support IPMI).

### FAR with NHC

* Install FAR using one of the above options ([Installation](#installation)).

* Load the yaml manifest of the FAR template (see below).

* Modify NHC CR to use FAR as it's remediator -
* Modify NHC CR to use FAR as its remediator -
This is basically a specific use case of an [external remediation of NHC CR](https://github.com/medik8s/node-healthcheck-operator#external-remediation-resources).
In order to set it up, please make sure that Node Health Check is running, FAR controller exists and then creates the necessary CRs (*FenceAgentsRemediationTemplate* and then *NodeHealthCheck*).

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,15 @@ spec:
- get
- patch
- update
- apiGroups:
- storage.k8s.io
resources:
- volumeattachments
verbs:
- delete
- get
- list
- watch
- apiGroups:
- authentication.k8s.io
resources:
Expand Down
9 changes: 9 additions & 0 deletions config/rbac/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -57,3 +57,12 @@ rules:
- get
- patch
- update
- apiGroups:
- storage.k8s.io
resources:
- volumeattachments
verbs:
- delete
- get
- list
- watch
13 changes: 11 additions & 2 deletions controllers/fenceagentsremediation_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ func (r *FenceAgentsRemediationReconciler) SetupWithManager(mgr ctrl.Manager) er
Complete(r)
}

//+kubebuilder:rbac:groups=storage.k8s.io,resources=volumeattachments,verbs=get;list;watch;delete
//+kubebuilder:rbac:groups=core,resources=pods/exec,verbs=create
//+kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch;update;delete;deletecollection
//+kubebuilder:rbac:groups="",resources=nodes,verbs=get;list;watch;update;delete
Expand Down Expand Up @@ -132,7 +133,7 @@ func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ct
r.Log.Info("Fetch FAR's pod")
pod, err := utils.GetFenceAgentsRemediationPod(r.Client)
if err != nil {
r.Log.Error(err, "Can't find FAR's pod by it's label", "CR's Name", req.Name)
r.Log.Error(err, "Can't find FAR's pod by its label", "CR's Name", req.Name)
return emptyResult, err
}
//TODO: Check that FA is excutable? run cli.IsExecuteable
Expand Down Expand Up @@ -164,6 +165,14 @@ func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ct
r.Log.Error(err, "Fence Agent response wasn't a success message", "CR's Name", req.Name)
return emptyResult, err
}

// Reboot was finished and now we remove workloads (pods and their VA)
r.Log.Info("Manual workload deletion", "Fence Agent", far.Spec.Agent, "Node Name", req.Name)
if err := utils.DeleteResources(ctx, r.Client, req.Name); err != nil {
r.Log.Error(err, "Manual workload deletion has failed", "CR's Name", req.Name)
return emptyResult, err
}

return emptyResult, nil
}

Expand All @@ -188,7 +197,7 @@ func buildFenceAgentParams(far *v1alpha1.FenceAgentsRemediation) ([]string, erro
return nil, err
}
}
// if --action attribute was not selected, then it's default value is reboot
// if --action attribute was not selected, then its default value is reboot
// https://github.com/ClusterLabs/fence-agents/blob/main/lib/fencing.py.py#L103
// Therefore we can safely add the reboot action regardless if it was initially added into the CR
fenceAgentParams = appendParamToSlice(fenceAgentParams, parameterActionName, parameterActionValue)
Expand Down
195 changes: 162 additions & 33 deletions controllers/fenceagentsremediation_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ import (
. "github.com/onsi/gomega"

corev1 "k8s.io/api/core/v1"
storagev1 "k8s.io/api/storage/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
Expand All @@ -38,13 +40,17 @@ import (

const (
dummyNode = "dummy-node"
node01 = "worker-0"
workerNode = "worker-0"
fenceAgentIPMI = "fence_ipmilan"
farPodName = "far-pod"
testPodName = "far-pod-test-1"
vaName1 = "va-test-1"
vaName2 = "va-test-2"
)

var (
faPodLabels = map[string]string{"app.kubernetes.io/name": "fence-agents-remediation-operator"}
fenceAgentsPod *corev1.Pod
faPodLabels = map[string]string{"app.kubernetes.io/name": "fence-agents-remediation-operator"}
log = ctrl.Log.WithName("controllers-unit-test")
)

var _ = Describe("FAR Controller", func() {
Expand Down Expand Up @@ -77,13 +83,13 @@ var _ = Describe("FAR Controller", func() {
}

// default FenceAgentsRemediation CR
underTestFAR := getFenceAgentsRemediation(node01, fenceAgentIPMI, testShareParam, testNodeParam)
underTestFAR := getFenceAgentsRemediation(workerNode, fenceAgentIPMI, testShareParam, testNodeParam)

Context("Functionality", func() {
Context("buildFenceAgentParams", func() {
When("FAR include different action than reboot", func() {
It("should succeed with a warning", func() {
invalidValTestFAR := getFenceAgentsRemediation(node01, fenceAgentIPMI, invalidShareParam, testNodeParam)
invalidValTestFAR := getFenceAgentsRemediation(workerNode, fenceAgentIPMI, invalidShareParam, testNodeParam)
invalidShareString, err := buildFenceAgentParams(invalidValTestFAR)
Expect(err).NotTo(HaveOccurred())
validShareString, err := buildFenceAgentParams(underTestFAR)
Expand All @@ -102,69 +108,84 @@ var _ = Describe("FAR Controller", func() {
})
When("FAR CR's name does match a node name", func() {
It("should succeed", func() {
underTestFAR.ObjectMeta.Name = node01
underTestFAR.ObjectMeta.Name = workerNode
Expect(buildFenceAgentParams(underTestFAR)).Error().NotTo(HaveOccurred())
})
})
})
})
Context("Reconcile", func() {
nodeKey := client.ObjectKey{Name: node01}
farNamespacedName := client.ObjectKey{Name: node01, Namespace: defaultNamespace}
nodeKey := client.ObjectKey{Name: workerNode}
farNamespacedName := client.ObjectKey{Name: workerNode, Namespace: defaultNamespace}
farNoExecuteTaint := utils.CreateFARNoExecuteTaint()
//Scenarios
resourceDeletionWasTriggered := true // corresponds to testVADeletion bool value
BeforeEach(func() {
fenceAgentsPod = buildFarPod()
// Create, Update status (for GetFenceAgentsRemediationPod), and DeferCleanUp the fenceAgentsPod
Expect(k8sClient.Create(context.Background(), fenceAgentsPod)).To(Succeed())
fenceAgentsPod.Status.Phase = corev1.PodRunning
Expect(k8sClient.Status().Update(context.Background(), fenceAgentsPod)).To(Succeed())
DeferCleanup(k8sClient.Delete, context.Background(), fenceAgentsPod)
// Create two VAs and two pods, and at the end clean them up with DeferCleanup
va1 := createVA(vaName1, workerNode)
va2 := createVA(vaName2, workerNode)
testPod := createRunningPod("far-test-1", testPodName, workerNode)
DeferCleanup(cleanupTestedResources, va1, va2, testPod)
farPod := createRunningPod("far-manager-test", farPodName, "")
DeferCleanup(k8sClient.Delete, context.Background(), farPod)
})
JustBeforeEach(func() {
// DeferCleanUp and Create node, and FAR CR
// Create node, and FAR CR, and at the end clean them up with DeferCleanup
Expect(k8sClient.Create(context.Background(), node)).To(Succeed())
DeferCleanup(k8sClient.Delete, context.Background(), node)
Expect(k8sClient.Create(context.Background(), underTestFAR)).To(Succeed())
DeferCleanup(k8sClient.Delete, context.Background(), underTestFAR)
})

// TODO: add more scenarios?
When("creating valid FAR CR", func() {
BeforeEach(func() {
node = utils.GetNode("", node01)
node = utils.GetNode("", workerNode)
})
It("should have finalizer and taint", func() {
It("should have finalizer, taint, while the two VAs and one pod will be deleted", func() {
By("Searching for remediation taint")
Eventually(func() bool {
Expect(k8sClient.Get(context.Background(), nodeKey, node)).To(Succeed())
Expect(k8sClient.Get(context.Background(), farNamespacedName, underTestFAR)).To(Succeed())
res, _ := cliCommandsEquality(underTestFAR)
return utils.TaintExists(node.Spec.Taints, &farNoExecuteTaint) && res
}, 100*time.Millisecond, 10*time.Millisecond).Should(BeTrue(), "taint should be added, and command format is correct")
// If taint was added, then defenintly the finzlier was added as well

// If taint was added, then definitely the finalizer was added as well
By("Having a finalizer if we have a remediation taint")
Expect(controllerutil.ContainsFinalizer(underTestFAR, v1alpha1.FARFinalizer)).To(BeTrue())

By("Not having any VAs nor the test pod")
testVADeletion(vaName1, resourceDeletionWasTriggered)
testVADeletion(vaName2, resourceDeletionWasTriggered)
testPodDeletion(testPodName, resourceDeletionWasTriggered)
})
})
When("creating invalid FAR CR Name", func() {
BeforeEach(func() {
node = utils.GetNode("", node01)
node = utils.GetNode("", workerNode)
underTestFAR = getFenceAgentsRemediation(dummyNode, fenceAgentIPMI, testShareParam, testNodeParam)
})
It("should not have a finalizer nor taint", func() {
It("should not have a finalizer nor taint, while the two VAs and one pod will remain", func() {
By("Not finding a matching node to FAR CR's name")
nodeKey.Name = dummyNode
nodeKey.Name = underTestFAR.Name
Expect(k8sClient.Get(context.Background(), nodeKey, node)).To(Not(Succeed()))

By("Not having finalizer")
farNamespacedName.Name = dummyNode
farNamespacedName.Name = underTestFAR.Name
Eventually(func() bool {
Expect(k8sClient.Get(context.Background(), farNamespacedName, underTestFAR)).To(Succeed())
return controllerutil.ContainsFinalizer(underTestFAR, v1alpha1.FARFinalizer)
}, 100*time.Millisecond, 10*time.Millisecond).Should(BeFalse(), "finalizer shouldn't be added")

// If finalizer is missing, then a taint shouldn't be existed
By("Not having remediation taint")
Expect(utils.TaintExists(node.Spec.Taints, &farNoExecuteTaint)).To(BeFalse())

By("Still having all the VAs and one test pod")
resourceDeletionWasTriggered = false
testVADeletion(vaName1, resourceDeletionWasTriggered)
testVADeletion(vaName2, resourceDeletionWasTriggered)
testPodDeletion(testPodName, resourceDeletionWasTriggered)
})
})
})
Expand All @@ -182,18 +203,74 @@ func getFenceAgentsRemediation(nodeName, agent string, sharedparameters map[v1al
}
}

// buildFarPod builds a dummy pod with FAR label and namespace
func buildFarPod() *corev1.Pod {
fenceAgentsPod := &corev1.Pod{}
fenceAgentsPod.Labels = faPodLabels
fenceAgentsPod.Name = "mock-fence-agents"
fenceAgentsPod.Namespace = defaultNamespace
// buildPod builds a dummy pod
func buildPod(containerName, podName, nodeName string) *corev1.Pod {
pod := &corev1.Pod{}
pod.Name = podName
if podName == farPodName {
// only when we build FAR pod then we add its label
pod.Labels = faPodLabels
} else {
// testedPod should reside in unhealthy node
pod.Spec.NodeName = nodeName
}
pod.Namespace = defaultNamespace
container := corev1.Container{
Name: "foo",
Name: containerName,
Image: "foo",
}
fenceAgentsPod.Spec.Containers = []corev1.Container{container}
return fenceAgentsPod
pod.Spec.Containers = []corev1.Container{container}
return pod
}

// createRunningPod builds a new pod, creates it in the cluster, and then
// updates its status phase to Running so it is visible to the controller.
func createRunningPod(containerName, podName, nodeName string) *corev1.Pod {
	runningPod := buildPod(containerName, podName, nodeName)
	Expect(k8sClient.Create(context.Background(), runningPod)).To(Succeed())
	runningPod.Status.Phase = corev1.PodRunning
	Expect(k8sClient.Status().Update(context.Background(), runningPod)).To(Succeed())
	return runningPod
}

// createVA creates a new volume attachment bound to the unhealthy node and
// returns its object.
func createVA(vaName, unhealthyNodeName string) *storagev1.VolumeAttachment {
	pvName := "foo"
	va := &storagev1.VolumeAttachment{
		ObjectMeta: metav1.ObjectMeta{
			Name:      vaName,
			Namespace: defaultNamespace,
		},
		Spec: storagev1.VolumeAttachmentSpec{
			Attacher: "foo",
			Source: storagev1.VolumeAttachmentSource{
				PersistentVolumeName: &pvName,
			},
			NodeName: unhealthyNodeName,
		},
	}
	ExpectWithOffset(1, k8sClient.Create(context.Background(), va)).To(Succeed())
	return va
}

// cleanupTestedResources fetches the resources that were created for the test,
// and deletes any that still exist so the next test starts from a clean state.
func cleanupTestedResources(va1, va2 *storagev1.VolumeAttachment, pod *corev1.Pod) {
	// clean the test volume attachments if they still exist
	for _, va := range []*storagev1.VolumeAttachment{va1, va2} {
		vaTest := &storagev1.VolumeAttachment{}
		if err := k8sClient.Get(context.Background(), client.ObjectKeyFromObject(va), vaTest); err == nil {
			log.Info("Cleanup: clean volume attachment", "va name", vaTest.Name)
			Expect(k8sClient.Delete(context.Background(), vaTest)).To(Succeed())
		}
	}
	// clean the test pod if it still exists
	podTest := &corev1.Pod{}
	if err := k8sClient.Get(context.Background(), client.ObjectKeyFromObject(pod), podTest); err == nil {
		log.Info("Cleanup: clean pod", "pod name", podTest.Name)
		Expect(k8sClient.Delete(context.Background(), podTest)).To(Succeed())
	}
}

// isEqualStringLists return true if two string lists share the same values
Expand All @@ -217,6 +294,58 @@ func cliCommandsEquality(far *v1alpha1.FenceAgentsRemediation) (bool, error) {
return isEqualStringLists(mocksExecuter.command, expectedCommand), nil
}

// testVADeletion checks the fate of the named volume attachment:
// when resourceDeletionWasTriggered is true it eventually becomes NotFound
// (deleted by the remediation), otherwise it consistently remains present.
func testVADeletion(vaName string, resourceDeletionWasTriggered bool) {
	vaKey := client.ObjectKey{
		Namespace: defaultNamespace,
		Name:      vaName,
	}
	// vaNotFound reports whether the volume attachment is gone from the cluster
	vaNotFound := func() bool {
		va := &storagev1.VolumeAttachment{}
		err := k8sClient.Get(context.Background(), vaKey, va)
		return apierrors.IsNotFound(err)
	}
	if resourceDeletionWasTriggered {
		EventuallyWithOffset(1, vaNotFound, 5*time.Second, 250*time.Millisecond).Should(BeTrue())
		log.Info("Volume attachment no longer exists", "va", vaName)
	} else {
		ConsistentlyWithOffset(1, vaNotFound, 5*time.Second, 250*time.Millisecond).Should(BeFalse())
		log.Info("Volume attachment still exists", "va", vaName)
	}
}

// testPodDeletion checks the fate of the named pod:
// when resourceDeletionWasTriggered is true it eventually becomes NotFound
// (deleted by the remediation), otherwise it consistently remains present.
func testPodDeletion(podName string, resourceDeletionWasTriggered bool) {
	podKey := client.ObjectKey{
		Namespace: defaultNamespace,
		Name:      podName,
	}
	// podNotFound reports whether the pod is gone from the cluster
	podNotFound := func() bool {
		pod := &corev1.Pod{}
		err := k8sClient.Get(context.Background(), podKey, pod)
		return apierrors.IsNotFound(err)
	}
	if resourceDeletionWasTriggered {
		EventuallyWithOffset(1, podNotFound, 5*time.Second, 250*time.Millisecond).Should(BeTrue())
		log.Info("Pod no longer exists", "pod", podName)
	} else {
		ConsistentlyWithOffset(1, podNotFound, 5*time.Second, 250*time.Millisecond).Should(BeFalse())
		log.Info("Pod still exists", "pod", podName)
	}
}

// Implements Execute function to mock/test Execute of FenceAgentsRemediationReconciler
type mockExecuter struct {
command []string
Expand All @@ -234,5 +363,5 @@ func newMockExecuter() *mockExecuter {
// Execute mocks FenceAgentsRemediationReconciler's Execute: it records the
// issued command for later inspection and returns a successful fence-agent
// response (with a trailing newline, matching real agent output).
func (m *mockExecuter) Execute(_ *corev1.Pod, command []string) (stdout string, stderr string, err error) {
	m.command = command
	m.mockLog.Info("Executed command has been stored", "command", m.command)
	return SuccessFAResponse + "\n", "", nil
}
Loading