diff --git a/Makefile b/Makefile index c6892c17..a2ca00d8 100644 --- a/Makefile +++ b/Makefile @@ -184,7 +184,7 @@ test: test-no-verify verify-unchanged ## Generate and format code, run tests, ge # --vv: If set, emits with maximal verbosity - includes skipped and pending tests. test-no-verify: manifests generate go-verify fmt vet fix-imports envtest ginkgo # Generate and format code, and run tests KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(ENVTEST_DIR)/$(ENVTEST_VERSION) -p path)" \ - $(GINKGO) -r --keep-going --require-suite --vv -coverprofile cover.out ./controllers/... ./pkg/... + $(GINKGO) -r --keep-going --require-suite --vv -coverprofile cover.out ./pkg/... ./controllers/... .PHONY: bundle-run export BUNDLE_RUN_NAMESPACE ?= openshift-operators diff --git a/README.md b/README.md index 8de5634d..a5d93759 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ Then, run `operator-sdk run bundle quay.io/medik8s/fence-agents-remediation-oper FAR is recommended for using with NHC to create a complete solution for unhealty nodes, since NHC detects unhelthy nodes and creates an extrenal remediation CR, e.g., FAR's CR, for unhealthy nodes. This automated way is preferable as it gives the responsibily on FAR CRs (creation and deletion) to NHC, even though FAR can also act as standalone remediator, but it with expense from the administrator to create and delete CRs. -Either way a user must be familier with fence agent to be used - Knowing it's parameters and any other requirements on the cluster (e.g., fence_ipmilan needs machines that support IPMI). +Either way a user must be familier with fence agent to be used - Knowing its parameters and any other requirements on the cluster (e.g., fence_ipmilan needs machines that support IPMI). ### FAR with NHC @@ -50,7 +50,7 @@ Either way a user must be familier with fence agent to be used - Knowing it's pa * Load the yaml manifest of the FAR template (see below). -* Modify NHC CR to use FAR as it's remediator - +* Modify NHC CR to use FAR as its remediator - This is basically a specific use case of an [external remediation of NHC CR](https://github.com/medik8s/node-healthcheck-operator#external-remediation-resources). In order to set it up, please make sure that Node Health Check is running, FAR controller exists and then creates the necessary CRs (*FenceAgentsRemediationTemplate* and then *NodeHealthCheck*). 
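Reviewer note: the hunks that follow (CSV, RBAC role, controller, and the new pkg/utils/resources.go) all serve one behavior change — once the fence agent reports success, the controller deletes the fenced node's pods and VolumeAttachments so stateful workloads can be rescheduled on a healthy node. Below is a minimal sketch of that intent only, not the actual implementation (which is the DeleteResources helper further down in this diff); the package and function names here (sketch, cleanupFencedNode) are illustrative.

```go
package sketch

import (
	"context"

	corev1 "k8s.io/api/core/v1"
	storagev1 "k8s.io/api/storage/v1"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// cleanupFencedNode sketches what the new DeleteResources helper is meant to do:
// remove the fenced node's pods (selected via the spec.nodeName field selector)
// and delete the cluster-scoped VolumeAttachments still bound to that node.
func cleanupFencedNode(ctx context.Context, c client.Client, nodeName, namespace string) error {
	// Pods: delete everything scheduled on the fenced node in the given namespace.
	if err := c.DeleteAllOf(ctx, &corev1.Pod{},
		client.InNamespace(namespace),
		client.MatchingFields{"spec.nodeName": nodeName},
	); err != nil {
		return err
	}

	// VolumeAttachments are cluster-scoped: list them and delete the ones
	// whose Spec.NodeName points at the fenced node.
	vaList := &storagev1.VolumeAttachmentList{}
	if err := c.List(ctx, vaList); err != nil {
		return err
	}
	for i := range vaList.Items {
		if vaList.Items[i].Spec.NodeName == nodeName {
			if err := c.Delete(ctx, &vaList.Items[i]); err != nil {
				return err
			}
		}
	}
	return nil
}
```

This is also why the ClusterRole below gains get/list/watch/delete on storage.k8s.io volumeattachments: without that rule, the VA cleanup step would be rejected by the API server.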
diff --git a/bundle/manifests/fence-agents-remediation.clusterserviceversion.yaml b/bundle/manifests/fence-agents-remediation.clusterserviceversion.yaml index 25f486c6..b602965e 100644 --- a/bundle/manifests/fence-agents-remediation.clusterserviceversion.yaml +++ b/bundle/manifests/fence-agents-remediation.clusterserviceversion.yaml @@ -173,6 +173,15 @@ spec: - get - patch - update + - apiGroups: + - storage.k8s.io + resources: + - volumeattachments + verbs: + - delete + - get + - list + - watch - apiGroups: - authentication.k8s.io resources: diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index a12355c7..9bbcf628 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -57,3 +57,12 @@ rules: - get - patch - update +- apiGroups: + - storage.k8s.io + resources: + - volumeattachments + verbs: + - delete + - get + - list + - watch diff --git a/controllers/fenceagentsremediation_controller.go b/controllers/fenceagentsremediation_controller.go index d8d6538a..c4faea8c 100644 --- a/controllers/fenceagentsremediation_controller.go +++ b/controllers/fenceagentsremediation_controller.go @@ -57,6 +57,7 @@ func (r *FenceAgentsRemediationReconciler) SetupWithManager(mgr ctrl.Manager) er Complete(r) } +//+kubebuilder:rbac:groups=storage.k8s.io,resources=volumeattachments,verbs=get;list;watch;delete //+kubebuilder:rbac:groups=core,resources=pods/exec,verbs=create //+kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch;update;delete;deletecollection //+kubebuilder:rbac:groups="",resources=nodes,verbs=get;list;watch;update;delete @@ -132,7 +133,7 @@ func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ct r.Log.Info("Fetch FAR's pod") pod, err := utils.GetFenceAgentsRemediationPod(r.Client) if err != nil { - r.Log.Error(err, "Can't find FAR's pod by it's label", "CR's Name", req.Name) + r.Log.Error(err, "Can't find FAR's pod by its label", "CR's Name", req.Name) return emptyResult, err } //TODO: Check that FA is excutable? run cli.IsExecuteable @@ -164,6 +165,14 @@ func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ct r.Log.Error(err, "Fence Agent response wasn't a success message", "CR's Name", req.Name) return emptyResult, err } + + // Reboot was finished and now we remove workloads (pods and their VA) + r.Log.Info("Manual workload deletion", "Fence Agent", far.Spec.Agent, "Node Name", req.Name) + if err := utils.DeleteResources(ctx, r.Client, req.Name); err != nil { + r.Log.Error(err, "Manual workload deletion has failed", "CR's Name", req.Name) + return emptyResult, err + } + return emptyResult, nil } @@ -188,7 +197,7 @@ func buildFenceAgentParams(far *v1alpha1.FenceAgentsRemediation) ([]string, erro return nil, err } } - // if --action attribute was not selected, then it's default value is reboot + // if --action attribute was not selected, then its default value is reboot // https://github.com/ClusterLabs/fence-agents/blob/main/lib/fencing.py.py#L103 // Therefore we can safely add the reboot action regardless if it was initially added into the CR fenceAgentParams = appendParamToSlice(fenceAgentParams, parameterActionName, parameterActionValue) diff --git a/controllers/fenceagentsremediation_controller_test.go b/controllers/fenceagentsremediation_controller_test.go index bbb5026c..43387093 100644 --- a/controllers/fenceagentsremediation_controller_test.go +++ b/controllers/fenceagentsremediation_controller_test.go @@ -27,6 +27,8 @@ import ( . 
"github.com/onsi/gomega" corev1 "k8s.io/api/core/v1" + storagev1 "k8s.io/api/storage/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" @@ -38,13 +40,17 @@ import ( const ( dummyNode = "dummy-node" - node01 = "worker-0" + workerNode = "worker-0" fenceAgentIPMI = "fence_ipmilan" + farPodName = "far-pod" + testPodName = "far-pod-test-1" + vaName1 = "va-test-1" + vaName2 = "va-test-2" ) var ( - faPodLabels = map[string]string{"app.kubernetes.io/name": "fence-agents-remediation-operator"} - fenceAgentsPod *corev1.Pod + faPodLabels = map[string]string{"app.kubernetes.io/name": "fence-agents-remediation-operator"} + log = ctrl.Log.WithName("controllers-unit-test") ) var _ = Describe("FAR Controller", func() { @@ -77,13 +83,13 @@ var _ = Describe("FAR Controller", func() { } // default FenceAgentsRemediation CR - underTestFAR := getFenceAgentsRemediation(node01, fenceAgentIPMI, testShareParam, testNodeParam) + underTestFAR := getFenceAgentsRemediation(workerNode, fenceAgentIPMI, testShareParam, testNodeParam) Context("Functionality", func() { Context("buildFenceAgentParams", func() { When("FAR include different action than reboot", func() { It("should succeed with a warning", func() { - invalidValTestFAR := getFenceAgentsRemediation(node01, fenceAgentIPMI, invalidShareParam, testNodeParam) + invalidValTestFAR := getFenceAgentsRemediation(workerNode, fenceAgentIPMI, invalidShareParam, testNodeParam) invalidShareString, err := buildFenceAgentParams(invalidValTestFAR) Expect(err).NotTo(HaveOccurred()) validShareString, err := buildFenceAgentParams(underTestFAR) @@ -102,38 +108,40 @@ var _ = Describe("FAR Controller", func() { }) When("FAR CR's name does match a node name", func() { It("should succeed", func() { - underTestFAR.ObjectMeta.Name = node01 + underTestFAR.ObjectMeta.Name = workerNode Expect(buildFenceAgentParams(underTestFAR)).Error().NotTo(HaveOccurred()) }) }) }) }) Context("Reconcile", func() { - nodeKey := client.ObjectKey{Name: node01} - farNamespacedName := client.ObjectKey{Name: node01, Namespace: defaultNamespace} + nodeKey := client.ObjectKey{Name: workerNode} + farNamespacedName := client.ObjectKey{Name: workerNode, Namespace: defaultNamespace} farNoExecuteTaint := utils.CreateFARNoExecuteTaint() - //Scenarios + resourceDeletionWasTriggered := true // corresponds to testVADeletion bool value BeforeEach(func() { - fenceAgentsPod = buildFarPod() - // Create, Update status (for GetFenceAgentsRemediationPod), and DeferCleanUp the fenceAgentsPod - Expect(k8sClient.Create(context.Background(), fenceAgentsPod)).To(Succeed()) - fenceAgentsPod.Status.Phase = corev1.PodRunning - Expect(k8sClient.Status().Update(context.Background(), fenceAgentsPod)).To(Succeed()) - DeferCleanup(k8sClient.Delete, context.Background(), fenceAgentsPod) + // Create two VAs and two pods, and at the end clean them up with DeferCleanup + va1 := createVA(vaName1, workerNode) + va2 := createVA(vaName2, workerNode) + testPod := createRunningPod("far-test-1", testPodName, workerNode) + DeferCleanup(cleanupTestedResources, va1, va2, testPod) + farPod := createRunningPod("far-manager-test", farPodName, "") + DeferCleanup(k8sClient.Delete, context.Background(), farPod) }) JustBeforeEach(func() { - // DeferCleanUp and Create node, and FAR CR + // Create node, and FAR CR, and at the end clean them up with DeferCleanup Expect(k8sClient.Create(context.Background(), node)).To(Succeed()) 
DeferCleanup(k8sClient.Delete, context.Background(), node) Expect(k8sClient.Create(context.Background(), underTestFAR)).To(Succeed()) DeferCleanup(k8sClient.Delete, context.Background(), underTestFAR) }) + // TODO: add more scenarios? When("creating valid FAR CR", func() { BeforeEach(func() { - node = utils.GetNode("", node01) + node = utils.GetNode("", workerNode) }) - It("should have finalizer and taint", func() { + It("should have a finalizer and taint, while the two VAs and one pod will be deleted", func() { By("Searching for remediation taint") Eventually(func() bool { Expect(k8sClient.Get(context.Background(), nodeKey, node)).To(Succeed()) @@ -141,30 +149,43 @@ var _ = Describe("FAR Controller", func() { res, _ := cliCommandsEquality(underTestFAR) return utils.TaintExists(node.Spec.Taints, &farNoExecuteTaint) && res }, 100*time.Millisecond, 10*time.Millisecond).Should(BeTrue(), "taint should be added, and command format is correct") - // If taint was added, then defenintly the finzlier was added as well + + // If taint was added, then definitely the finalizer was added as well By("Having a finalizer if we have a remediation taint") Expect(controllerutil.ContainsFinalizer(underTestFAR, v1alpha1.FARFinalizer)).To(BeTrue()) + By("Not having any VAs nor the test pod") + testVADeletion(vaName1, resourceDeletionWasTriggered) + testVADeletion(vaName2, resourceDeletionWasTriggered) + testPodDeletion(testPodName, resourceDeletionWasTriggered) }) }) When("creating invalid FAR CR Name", func() { BeforeEach(func() { - node = utils.GetNode("", node01) + node = utils.GetNode("", workerNode) underTestFAR = getFenceAgentsRemediation(dummyNode, fenceAgentIPMI, testShareParam, testNodeParam) }) - It("should not have a finalizer nor taint", func() { + It("should not have a finalizer nor taint, while the two VAs and one pod will remain", func() { By("Not finding a matching node to FAR CR's name") - nodeKey.Name = dummyNode + nodeKey.Name = underTestFAR.Name Expect(k8sClient.Get(context.Background(), nodeKey, node)).To(Not(Succeed())) + By("Not having finalizer") - farNamespacedName.Name = dummyNode + farNamespacedName.Name = underTestFAR.Name Eventually(func() bool { Expect(k8sClient.Get(context.Background(), farNamespacedName, underTestFAR)).To(Succeed()) return controllerutil.ContainsFinalizer(underTestFAR, v1alpha1.FARFinalizer) }, 100*time.Millisecond, 10*time.Millisecond).Should(BeFalse(), "finalizer shouldn't be added") + // If finalizer is missing, then a taint shouldn't exist By("Not having remediation taint") Expect(utils.TaintExists(node.Spec.Taints, &farNoExecuteTaint)).To(BeFalse()) + + By("Still having all the VAs and one test pod") + resourceDeletionWasTriggered = false + testVADeletion(vaName1, resourceDeletionWasTriggered) + testVADeletion(vaName2, resourceDeletionWasTriggered) + testPodDeletion(testPodName, resourceDeletionWasTriggered) }) }) }) @@ -182,18 +203,74 @@ func getFenceAgentsRemediation(nodeName, agent string, sharedparameters map[v1al } } -// buildFarPod builds a dummy pod with FAR label and namespace -func buildFarPod() *corev1.Pod { - fenceAgentsPod := &corev1.Pod{} - fenceAgentsPod.Labels = faPodLabels - fenceAgentsPod.Name = "mock-fence-agents" - fenceAgentsPod.Namespace = defaultNamespace +// buildPod builds a dummy pod +func buildPod(containerName, podName, nodeName string) *corev1.Pod { + pod := &corev1.Pod{} + pod.Name = podName + if podName == farPodName { + // only when we build the FAR pod do we add its label + pod.Labels = faPodLabels + } else { + // testedPod 
should reside on the unhealthy node + pod.Spec.NodeName = nodeName + } + pod.Namespace = defaultNamespace container := corev1.Container{ - Name: "foo", + Name: containerName, Image: "foo", } - fenceAgentsPod.Spec.Containers = []corev1.Container{container} - return fenceAgentsPod + pod.Spec.Containers = []corev1.Container{container} + return pod +} + +// createRunningPod builds a new pod, creates it, and sets its status to running +func createRunningPod(containerName, podName, nodeName string) *corev1.Pod { + pod := buildPod(containerName, podName, nodeName) + Expect(k8sClient.Create(context.Background(), pod)).To(Succeed()) + pod.Status.Phase = corev1.PodRunning + Expect(k8sClient.Status().Update(context.Background(), pod)).To(Succeed()) + return pod +} + +// createVA creates a new volume attachment and returns its object +func createVA(vaName, unhealthyNodeName string) *storagev1.VolumeAttachment { + va := &storagev1.VolumeAttachment{ + ObjectMeta: metav1.ObjectMeta{ + Name: vaName, + Namespace: defaultNamespace, + }, + Spec: storagev1.VolumeAttachmentSpec{ + Attacher: "foo", + Source: storagev1.VolumeAttachmentSource{}, + NodeName: unhealthyNodeName, + }, + } + foo := "foo" + va.Spec.Source.PersistentVolumeName = &foo + ExpectWithOffset(1, k8sClient.Create(context.Background(), va)).To(Succeed()) + return va +} + +// cleanupTestedResources fetches all the resources that we have created for the test +// and, if they still exist at the end of the test, cleans them up for the next test +func cleanupTestedResources(va1, va2 *storagev1.VolumeAttachment, pod *corev1.Pod) { + // clean test volume attachments if they exist + vaTest := &storagev1.VolumeAttachment{} + if err := k8sClient.Get(context.Background(), client.ObjectKeyFromObject(va1), vaTest); err == nil { + log.Info("Cleanup: clean volume attachment", "va name", vaTest.Name) + Expect(k8sClient.Delete(context.Background(), vaTest)).To(Succeed()) + } + if err := k8sClient.Get(context.Background(), client.ObjectKeyFromObject(va2), vaTest); err == nil { + log.Info("Cleanup: clean volume attachment", "va name", vaTest.Name) + Expect(k8sClient.Delete(context.Background(), vaTest)).To(Succeed()) + + } + // clean test pod if it exists + podTest := &corev1.Pod{} + if err := k8sClient.Get(context.Background(), client.ObjectKeyFromObject(pod), podTest); err == nil { + log.Info("Cleanup: clean pod", "pod name", podTest.Name) + Expect(k8sClient.Delete(context.Background(), podTest)).To(Succeed()) + } } // isEqualStringLists return true if two string lists share the same values @@ -217,6 +294,58 @@ func cliCommandsEquality(far *v1alpha1.FenceAgentsRemediation) (bool, error) { return isEqualStringLists(mocksExecuter.command, expectedCommand), nil } +// testVADeletion tests whether the volume attachment no longer exists for a successful FAR CR, +// and otherwise consistently checks that the volume attachment still exists and was not deleted +func testVADeletion(vaName string, resourceDeletionWasTriggered bool) { + vaKey := client.ObjectKey{ + Namespace: defaultNamespace, + Name: vaName, + } + if resourceDeletionWasTriggered { + EventuallyWithOffset(1, func() bool { + va := &storagev1.VolumeAttachment{} + err := k8sClient.Get(context.Background(), vaKey, va) + return apierrors.IsNotFound(err) + + }, 5*time.Second, 250*time.Millisecond).Should(BeTrue()) + log.Info("Volume attachment no longer exists", "va", vaName) + } else { + ConsistentlyWithOffset(1, func() bool { + va := &storagev1.VolumeAttachment{} + err := k8sClient.Get(context.Background(), vaKey, va) + return 
apierrors.IsNotFound(err) + + }, 5*time.Second, 250*time.Millisecond).Should(BeFalse()) + log.Info("Volume attachment still exists", "va", vaName) + } +} + +// testPodDeletion tests whether the pod no longer exists for a successful FAR CR, +// and otherwise consistently checks that the pod still exists and was not deleted +func testPodDeletion(podName string, resourceDeletionWasTriggered bool) { + podKey := client.ObjectKey{ + Namespace: defaultNamespace, + Name: podName, + } + if resourceDeletionWasTriggered { + EventuallyWithOffset(1, func() bool { + pod := &corev1.Pod{} + err := k8sClient.Get(context.Background(), podKey, pod) + return apierrors.IsNotFound(err) + + }, 5*time.Second, 250*time.Millisecond).Should(BeTrue()) + log.Info("Pod no longer exists", "pod", podName) + } else { + ConsistentlyWithOffset(1, func() bool { + pod := &corev1.Pod{} + err := k8sClient.Get(context.Background(), podKey, pod) + return apierrors.IsNotFound(err) + + }, 5*time.Second, 250*time.Millisecond).Should(BeFalse()) + log.Info("Pod still exists", "pod", podName) + } +} + // Implements Execute function to mock/test Execute of FenceAgentsRemediationReconciler type mockExecuter struct { command []string @@ -234,5 +363,5 @@ func newMockExecuter() *mockExecuter { func (m *mockExecuter) Execute(_ *corev1.Pod, command []string) (stdout string, stderr string, err error) { m.command = command m.mockLog.Info("Executed command has been stored", "command", m.command) - return SuccessFAResponse, "", nil + return SuccessFAResponse + "\n", "", nil } diff --git a/pkg/utils/resources.go b/pkg/utils/resources.go new file mode 100644 index 00000000..2ac8f8a6 --- /dev/null +++ b/pkg/utils/resources.go @@ -0,0 +1,74 @@ +package utils + +// Inspired by SNR - https://github.com/medik8s/self-node-remediation/blob/main/controllers/selfnoderemediation_controller.go#L283-L346 +import ( + "context" + + corev1 "k8s.io/api/core/v1" + storagev1 "k8s.io/api/storage/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/fields" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +var ( + log = ctrl.Log.WithName("utils-resource") +) + +func DeleteResources(ctx context.Context, r client.Client, nodeName string) error { + zero := int64(0) + backgroundDeletePolicy := metav1.DeletePropagationBackground + + deleteOptions := &client.DeleteAllOfOptions{ + ListOptions: client.ListOptions{ + FieldSelector: fields.SelectorFromSet(fields.Set{"spec.nodeName": nodeName}), + Namespace: "", + Limit: 0, + }, + DeleteOptions: client.DeleteOptions{ + GracePeriodSeconds: &zero, + PropagationPolicy: &backgroundDeletePolicy, + }, + } + + namespaces := corev1.NamespaceList{} + if err := r.List(ctx, &namespaces); err != nil { + log.Error(err, "failed to list namespaces") + return err + } + + log.Info("starting to delete node resources", "node name", nodeName) + + pod := &corev1.Pod{} + for _, ns := range namespaces.Items { + deleteOptions.Namespace = ns.Name + err := r.DeleteAllOf(ctx, pod, deleteOptions) + if err != nil { + log.Error(err, "failed to delete pods of unhealthy node", "namespace", ns.Name) + return err + } + } + + volumeAttachments := &storagev1.VolumeAttachmentList{} + if err := r.List(ctx, volumeAttachments); err != nil { + log.Error(err, "failed to get volumeAttachments list") + return err + } + forceDeleteOption := &client.DeleteOptions{ + GracePeriodSeconds: &zero, + } + for _, va := range volumeAttachments.Items { + if va.Spec.NodeName == nodeName { + err := r.Delete(ctx, &va, forceDeleteOption) + if err != 
nil { + log.Error(err, "failed to delete volumeAttachment", "name", va.Name) + return err + } + } + } + + log.Info("done deleting node resources", "node name", nodeName) + + return nil +} diff --git a/test/e2e/far_e2e_test.go b/test/e2e/far_e2e_test.go index e53c2114..dac55b36 100644 --- a/test/e2e/far_e2e_test.go +++ b/test/e2e/far_e2e_test.go @@ -11,6 +11,7 @@ import ( . "github.com/onsi/gomega" corev1 "k8s.io/api/core/v1" + storagev1 "k8s.io/api/storage/v1" apiErrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" @@ -32,12 +33,17 @@ const ( nodeIdentifierPrefixAWS = "--plug" nodeIdentifierPrefixIPMI = "--ipport" containerName = "manager" + testVolumeAttachment = "test-va" + testContainerName = "test-container" + testPodName = "test-pod" //TODO: try to minimize timeout // eventually parameters - timeoutLogs = 3 * time.Minute - timeoutReboot = 6 * time.Minute // fencing with fence_aws should be completed within 6 minutes - pollInterval = 10 * time.Second + timeoutLogs = 3 * time.Minute + timeoutReboot = 6 * time.Minute // fencing with fence_aws should be completed within 6 minutes + timeoutDeletion = 10 * time.Second // this timeout is used after all the other steps have been successful + pollDeletion = 250 * time.Millisecond + pollInterval = 10 * time.Second ) var previousNodeName string @@ -75,9 +81,12 @@ var _ = Describe("FAR E2e", func() { Context("stress cluster", func() { var ( - nodeName string - nodeBootTimeBefore time.Time - err error + err error + testNodeName string + va *storagev1.VolumeAttachment + pod *corev1.Pod + creationTimePod, nodeBootTimeBefore time.Time + far *v1alpha1.FenceAgentsRemediation ) BeforeEach(func() { nodes := &corev1.NodeList{} @@ -89,25 +98,37 @@ var _ = Describe("FAR E2e", func() { Fail("No worker nodes found in the cluster") } - nodeName = randomizeWorkerNode(nodes) - previousNodeName = nodeName - nodeNameParam := v1alpha1.NodeName(nodeName) + testNodeName = randomizeWorkerNode(nodes) + previousNodeName = testNodeName + nodeNameParam := v1alpha1.NodeName(testNodeName) parameterName := v1alpha1.ParameterName(nodeIdentifierPrefix) testNodeID := testNodeParam[parameterName][nodeNameParam] - log.Info("Testing Node", "Node name", nodeName, "Node ID", testNodeID) + log.Info("Testing Node", "Node name", testNodeName, "Node ID", testNodeID) // save the node's boot time prior to the fence agent call - nodeBootTimeBefore, err = e2eUtils.GetBootTime(clientSet, nodeName, testNsName, log) + nodeBootTimeBefore, err = e2eUtils.GetBootTime(clientSet, testNodeName, testNsName, log) Expect(err).ToNot(HaveOccurred(), "failed to get boot time of the node") - far := createFAR(nodeName, fenceAgent, testShareParam, testNodeParam) + + // create the tested pod and save its creation time; + // it will be deleted by the FAR CR + pod = e2eUtils.GetPod(testNodeName, testContainerName) + pod.Name = testPodName + pod.Namespace = testNsName + Expect(k8sClient.Create(context.Background(), pod)).To(Succeed()) + log.Info("Tested pod has been created", "pod", testPodName) + creationTimePod = metav1.Now().Time + va = createVA(testNodeName) + DeferCleanup(cleanupTestedResources, va, pod) + + far = createFAR(testNodeName, fenceAgent, testShareParam, testNodeParam) DeferCleanup(deleteFAR, far) }) When("running FAR to reboot two nodes", func() { It("should successfully remediate the first node", func() { - checkRemediation(nodeName, nodeBootTimeBefore) + checkRemediation(testNodeName, nodeBootTimeBefore, creationTimePod, va, pod) }) 
It("should successfully remediate the second node", func() { - checkRemediation(nodeName, nodeBootTimeBefore) + checkRemediation(testNodeName, nodeBootTimeBefore, creationTimePod, va, pod) }) }) }) @@ -213,6 +234,28 @@ func randomizeWorkerNode(nodes *corev1.NodeList) string { return nodeName } +// createVA creates dummy volume attachment for testing the resource deletion +func createVA(nodeName string) *storagev1.VolumeAttachment { + pv := "test-pv" + va := &storagev1.VolumeAttachment{ + ObjectMeta: metav1.ObjectMeta{ + Name: testVolumeAttachment, + Namespace: testNsName, + }, + Spec: storagev1.VolumeAttachmentSpec{ + Attacher: pv, + Source: storagev1.VolumeAttachmentSource{ + PersistentVolumeName: &pv, + }, + NodeName: nodeName, + }, + } + + ExpectWithOffset(1, k8sClient.Create(context.Background(), va)).To(Succeed()) + log.Info("Volume attachment has been created", "va", va.Name) + return va +} + // createFAR assigns the input to FenceAgentsRemediation object, creates CR, and returns the CR object func createFAR(nodeName string, agent string, sharedParameters map[v1alpha1.ParameterName]string, nodeParameters map[v1alpha1.ParameterName]map[v1alpha1.NodeName]string) *v1alpha1.FenceAgentsRemediation { far := &v1alpha1.FenceAgentsRemediation{ @@ -238,6 +281,21 @@ func deleteFAR(far *v1alpha1.FenceAgentsRemediation) { }, 2*time.Minute, 10*time.Second).ShouldNot(HaveOccurred(), "failed to delete far") } +// cleanupTestedResources deletes an old pod and old va if it was not deleted from FAR CR +func cleanupTestedResources(va *storagev1.VolumeAttachment, pod *corev1.Pod) { + newVa := &storagev1.VolumeAttachment{} + if err := k8sClient.Get(context.Background(), client.ObjectKeyFromObject(va), newVa); err == nil { + Expect(k8sClient.Delete(context.Background(), newVa)).To(Succeed()) + log.Info("cleanup: Volume attachment has not been deleted by remediation", "va name", va.Name) + } + + newPod := &corev1.Pod{} + if err := k8sClient.Get(context.Background(), client.ObjectKeyFromObject(pod), newPod); err == nil { + Expect(k8sClient.Delete(context.Background(), newPod)).To(Succeed()) + log.Info("cleanup: Pod has not been deleted by remediation", "pod name", pod.Name) + } +} + // wasFarTaintAdded checks whether the FAR taint was added to the tested node func wasFarTaintAdded(nodeName string) { farTaint := utils.CreateFARNoExecuteTaint() @@ -251,7 +309,7 @@ func wasFarTaintAdded(nodeName string) { log.Info("FAR taint was added", "node name", node.Name, "taint key", farTaint.Key, "taint effect", farTaint.Effect) } -// checkFarLogs gets the FAR pod and checks whether it's logs have logString +// checkFarLogs gets the FAR pod and checks whether its logs have logString func checkFarLogs(logString string) { EventuallyWithOffset(1, func() string { pod, err := utils.GetFenceAgentsRemediationPod(k8sClient) @@ -291,8 +349,29 @@ func wasNodeRebooted(nodeName string, nodeBootTimeBefore time.Time) { log.Info("successful reboot", "node", nodeName, "offset between last boot", nodeBootTimeAfter.Sub(nodeBootTimeBefore), "new boot time", nodeBootTimeAfter) } +// checkVaDeleted verifies if the va has already been deleted due to resource deletion +func checkVaDeleted(va *storagev1.VolumeAttachment) { + EventuallyWithOffset(1, func() bool { + newVa := &storagev1.VolumeAttachment{} + err := k8sClient.Get(context.Background(), client.ObjectKeyFromObject(va), newVa) + return apiErrors.IsNotFound(err) + + }, timeoutDeletion, pollDeletion).Should(BeTrue()) + log.Info("Volume Attachment has already been deleted", "va name", 
va.Name) +} + +// checkPodDeleted verifies if the pod has already been deleted due to resource deletion +func checkPodDeleted(pod *corev1.Pod) { + ConsistentlyWithOffset(1, func() bool { + newPod := &corev1.Pod{} + err := k8sClient.Get(context.Background(), client.ObjectKeyFromObject(pod), newPod) + return apiErrors.IsNotFound(err) + }, timeoutDeletion, pollDeletion).Should(BeTrue()) + log.Info("Pod has already been deleted", "pod name", pod.Name) +} + // checkRemediation verify whether the node was remediated -func checkRemediation(nodeName string, nodeBootTimeBefore time.Time) { +func checkRemediation(nodeName string, nodeBootTimeBefore time.Time, oldPodCreationTime time.Time, va *storagev1.VolumeAttachment, pod *corev1.Pod) { By("Check if FAR NoExecute taint was added") wasFarTaintAdded(nodeName) @@ -304,4 +383,10 @@ func checkRemediation(nodeName string, nodeBootTimeBefore time.Time) { By("Getting new node's boot time") wasNodeRebooted(nodeName, nodeBootTimeBefore) + + By("checking if old VA has been deleted") + checkVaDeleted(va) + + By("checking if old pod has been deleted") + checkPodDeleted(pod) } diff --git a/test/e2e/utils/cluster.go b/test/e2e/utils/cluster.go index 7c675f6b..07098066 100644 --- a/test/e2e/utils/cluster.go +++ b/test/e2e/utils/cluster.go @@ -23,7 +23,7 @@ const ( machinesNamespace = "openshift-machine-api" ) -// GetClusterInfo fetch the cluster's infrastructure object to identify it's type +// GetClusterInfo fetches the cluster's infrastructure object to identify its type func GetClusterInfo(config configclient.Interface) (*configv1.Infrastructure, error) { // oc get Infrastructure.config.openshift.io/cluster -o jsonpath='{.metadata.name}' // oc get Infrastructure.config.openshift.io/cluster -o jsonpath='{.spec.platformSpec.type}' @@ -35,8 +35,8 @@ func GetClusterInfo(config configclient.Interface) (*configv1.Infrastructure, er return clusterInfra, nil } -// GetSecretData searches for the platform's secret, and then returns it's decoded two data values. -// E.g. on AWS it would be the Access Key and it's ID, but on BMH with fence_impilan it would be useranme and password +// GetSecretData searches for the platform's secret, and then returns its two decoded data values. +// E.g. 
on AWS it would be the Access Key and its ID, but on BMH with fence_ipmilan it would be username and password func GetSecretData(clientSet *kubernetes.Clientset, secretName, secretNamespace, secretData1, secretData2 string) (string, string, error) { // oc get secrets -n openshift-machine-api aws-cloud-credentials -o jsonpath='{.data.aws_access_key_id}' | base64 -d // oc get secrets -n openshift-machine-api aws-cloud-credentials -o jsonpath='{.data.aws_secret_access_key}' | base64 -d diff --git a/test/e2e/utils/command.go b/test/e2e/utils/command.go index 491b1890..ae16ab38 100644 --- a/test/e2e/utils/command.go +++ b/test/e2e/utils/command.go @@ -24,6 +24,10 @@ import ( "github.com/medik8s/fence-agents-remediation/api/v1alpha1" ) +const ( + containerTestName = "test-command" +) + // GetBootTime gets the boot time of the given node by running a pod on it executing uptime command func GetBootTime(c *kubernetes.Clientset, nodeName string, ns string, log logr.Logger) (time.Time, error) { emptyTime := time.Time{} @@ -44,7 +48,7 @@ func GetBootTime(c *kubernetes.Clientset, nodeName string, ns string, log logr.L func RunCommandInCluster(c *kubernetes.Clientset, nodeName string, ns string, command string, log logr.Logger) (string, error) { // create a pod and wait that it's running - pod := getPod(nodeName) + pod := GetPod(nodeName, containerTestName) pod, err := c.CoreV1().Pods(ns).Create(context.Background(), pod, metav1.CreateOptions{}) if err != nil { return "", err @@ -144,7 +148,7 @@ func waitForCondition(c *kubernetes.Clientset, pod *corev1.Pod, conditionType co }) } -func getPod(nodeName string) *corev1.Pod { +func GetPod(nodeName, containerName string) *corev1.Pod { return &corev1.Pod{ ObjectMeta: metav1.ObjectMeta{ GenerateName: "far-test-", @@ -161,7 +165,7 @@ func getPod(nodeName string) *corev1.Pod { RestartPolicy: corev1.RestartPolicyNever, Containers: []corev1.Container{ { - Name: "test", + Name: containerName, Image: "registry.access.redhat.com/ubi8/ubi-minimal", SecurityContext: &corev1.SecurityContext{ Privileged: pointer.Bool(true),