From b5837bde1ca7eb6e6e28c894f4dbb0c234cf8c2b Mon Sep 17 00:00:00 2001 From: razo7 Date: Sun, 9 Jul 2023 18:07:05 +0300 Subject: [PATCH 1/4] Test second reboot for different node Stress the cluster with one FAR CR after another, and watch if they succeed --- test/e2e/far_e2e_test.go | 140 ++++++++++++++++++++++++--------------- 1 file changed, 86 insertions(+), 54 deletions(-) diff --git a/test/e2e/far_e2e_test.go b/test/e2e/far_e2e_test.go index 377b9198..635445d9 100644 --- a/test/e2e/far_e2e_test.go +++ b/test/e2e/far_e2e_test.go @@ -41,26 +41,50 @@ const ( var _ = Describe("FAR E2e", func() { var ( - far *v1alpha1.FenceAgentsRemediation - fenceAgent string - clusterPlatform *configv1.Infrastructure - err error + fenceAgent, nodeIdentifierPrefix string + testShareParam map[v1alpha1.ParameterName]string + testNodeParam map[v1alpha1.ParameterName]map[v1alpha1.NodeName]string + nodeIndex int + secondRun bool ) BeforeEach(func() { - clusterPlatform, err = e2eUtils.GetClusterInfo(configClient) + // create FAR CR spec based on OCP platformn + clusterPlatform, err := e2eUtils.GetClusterInfo(configClient) if err != nil { Fail("can't identify the cluster platform") } fmt.Printf("\ncluster name: %s and PlatformType: %s \n", string(clusterPlatform.Name), string(clusterPlatform.Status.PlatformStatus.Type)) + + switch clusterPlatform.Status.PlatformStatus.Type { + case configv1.AWSPlatformType: + fenceAgent = fenceAgentAWS + nodeIdentifierPrefix = nodeIdentifierPrefixAWS + By("running fence_aws") + case configv1.BareMetalPlatformType: + fenceAgent = fenceAgentIPMI + nodeIdentifierPrefix = nodeIdentifierPrefixIPMI + By("running fence_ipmilan") + default: + Skip("FAR haven't been tested on this kind of cluster (non AWS or BareMetal)") + } + + testShareParam, err = buildSharedParameters(clusterPlatform, fenceAgentAction) + if err != nil { + Fail("can't get shared information") + } + testNodeParam, err = buildNodeParameters(clusterPlatform.Status.PlatformStatus.Type) + if err 
!= nil { + Fail("can't get node information") + } + // run FA on the first worker node + nodeIndex = 0 }) - Context("fence agent - fence_aws or fence_ipmilan", func() { + Context("stress cluster", func() { var ( - nodeBootTimeBefore time.Time - errBoot error - testNodeName string - nodeIdentifierPrefix string - testNodeID string + nodeName, errString string + nodeBootTimeBefore time.Time + err error ) BeforeEach(func() { nodes := &corev1.NodeList{} @@ -71,61 +95,57 @@ var _ = Describe("FAR E2e", func() { if len(nodes.Items) < 1 { Fail("there are no worker nodes in the cluster") } - //TODO: Randomize the node selection & verify valid index - // run FA on the first worker node - nodeObj := nodes.Items[0] - testNodeName = nodeObj.Name - - switch clusterPlatform.Status.PlatformStatus.Type { - case configv1.AWSPlatformType: - fenceAgent = fenceAgentAWS - nodeIdentifierPrefix = nodeIdentifierPrefixAWS - By("running fence_aws") - case configv1.BareMetalPlatformType: - fenceAgent = fenceAgentIPMI - nodeIdentifierPrefix = nodeIdentifierPrefixIPMI - By("running fence_ipmilan") - default: - Skip("FAR haven't been tested on this kind of cluster (non AWS or BareMetal)") + if secondRun { + nodeIndex++ } - - testShareParam, err := buildSharedParameters(clusterPlatform, fenceAgentAction) - if err != nil { - Fail("can't get shared information") + nodeName, errString = getNodeName(nodeIndex) + if errString != "" { + if nodeIndex <= 0 { + Fail(errString) + } + Skip(errString) } - testNodeParam, err := buildNodeParameters(clusterPlatform.Status.PlatformStatus.Type) - if err != nil { - Fail("can't get node information") - } - nodeName := v1alpha1.NodeName(testNodeName) + nodeNameParam := v1alpha1.NodeName(nodeName) parameterName := v1alpha1.ParameterName(nodeIdentifierPrefix) - testNodeID = testNodeParam[parameterName][nodeName] - log.Info("Testing Node", "Node name", testNodeName, "Node ID", testNodeID) + testNodeID := testNodeParam[parameterName][nodeNameParam] + log.Info("Testing 
Node", "Node name", nodeName, "Node ID", testNodeID) // save the node's boot time prior to the fence agent call - nodeBootTimeBefore, errBoot = e2eUtils.GetBootTime(clientSet, testNodeName, testNsName, log) - Expect(errBoot).ToNot(HaveOccurred(), "failed to get boot time of the node") - - far = createFAR(testNodeName, fenceAgent, testShareParam, testNodeParam) + nodeBootTimeBefore, err = e2eUtils.GetBootTime(clientSet, nodeName, testNsName, log) + Expect(err).ToNot(HaveOccurred(), "failed to get boot time of the node") + far := createFAR(nodeName, fenceAgent, testShareParam, testNodeParam) DeferCleanup(deleteFAR, far) }) + When("running FAR to reboot two nodes", func() { + It("should successfully remediate the first node", func() { + remediateNode(nodeName, succeesRebootMessage, nodeBootTimeBefore) + // next run create CR for the next worker node + secondRun = true + }) + It("should successfully remediate the second node", func() { + remediateNode(nodeName, succeesRebootMessage, nodeBootTimeBefore) - When("running FAR to reboot node ", func() { - It("should execute the fence agent cli command", func() { - By("checking the CR has been created") - testFarCR := &v1alpha1.FenceAgentsRemediation{} - Expect(k8sClient.Get(context.Background(), client.ObjectKeyFromObject(far), testFarCR)).To(Succeed(), "failed to get FAR CR") - - By("checking the command has been executed successfully") - checkFarLogs(succeesRebootMessage) - - By("checking the node's boot time after running the FA") - wasNodeRebooted(testNodeName, nodeBootTimeBefore) }) }) }) }) +// getNodeName returns the node's name based on valid index, otherwise it returns an error +func getNodeName(index int) (string, string) { + nodes := &corev1.NodeList{} + selector := labels.NewSelector() + requirement, _ := labels.NewRequirement(utils.WorkerLabelName, selection.Exists, []string{}) + selector = selector.Add(*requirement) + Expect(k8sClient.List(context.Background(), nodes, &client.ListOptions{LabelSelector: 
selector})).ToNot(HaveOccurred()) + if index < 0 { + return "", "nodeIndex is invalid - smaller than zero" + } + if index >= len(nodes.Items) { + return "", fmt.Sprintf("nodeIndex is invalid - there are not enough available worker nodes for nodeIndex %d", index) + } + return nodes.Items[index].Name, "" +} + // createFAR assigns the input to FenceAgentsRemediation object, creates CR, and returns the CR object func createFAR(nodeName string, agent string, sharedParameters map[v1alpha1.ParameterName]string, nodeParameters map[v1alpha1.ParameterName]map[v1alpha1.NodeName]string) *v1alpha1.FenceAgentsRemediation { far := &v1alpha1.FenceAgentsRemediation{ @@ -249,7 +269,7 @@ func checkFarLogs(logString string) { logs, err := e2eUtils.GetLogs(clientSet, pod, containerName) if err != nil { if apiErrors.IsNotFound(err) { - // If FAR pod was running in testNodeName, then after reboot it was recreated in another node, and with a new name. + // If FAR pod was running in nodeObj, then after reboot it was recreated in another node, and with a new name. // Thus the "old" pod's name prior to this eventually won't link to a running pod, since it was already evicted by the reboot log.Error(err, "failed to get logs. FAR pod might have been recreated due to rebooting the node it was resided. 
Might try again", "pod", pod.Name) return "" @@ -277,3 +297,15 @@ func wasNodeRebooted(nodeName string, nodeBootTimeBefore time.Time) { log.Info("successful reboot", "node", nodeName, "offset between last boot", nodeBootTimeAfter.Sub(nodeBootTimeBefore), "new boot time", nodeBootTimeAfter) } + +// remediateNode run three functions to verify whether the node was remediated +func remediateNode(nodeName, logString string, nodeBootTimeBefore time.Time) { + By("Executing the FA command and receive success response") + // TODO: When reboot is running only once and it is running on FAR node, then FAR pod will + // be recreated on a new node and since the FA command won't be exuected again, then the log + // won't include any success message + checkFarLogs(succeesRebootMessage) + + By("Getting new node's boot time") + wasNodeRebooted(nodeName, nodeBootTimeBefore) +} From e2658601add0e4897f0546be84dfe6741abc682a Mon Sep 17 00:00:00 2001 From: razo7 Date: Wed, 19 Jul 2023 14:29:55 +0300 Subject: [PATCH 2/4] Test the addition of FAR taint Add wasFarTaintAdded to check if the node has FAR taint prior to checking the pod's logs and the node's boot time --- pkg/utils/nodes.go | 6 +++--- pkg/utils/taints.go | 4 ++-- test/e2e/far_e2e_test.go | 28 ++++++++++++++++++++++------ 3 files changed, 27 insertions(+), 11 deletions(-) diff --git a/pkg/utils/nodes.go b/pkg/utils/nodes.go index c663d58a..b09c38f3 100644 --- a/pkg/utils/nodes.go +++ b/pkg/utils/nodes.go @@ -11,8 +11,8 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" ) -// getNodeWithName returns a node with a name nodeName, or an error if it can't be found -func getNodeWithName(r client.Reader, nodeName string) (*corev1.Node, error) { +// GetNodeWithName returns a node with a name nodeName, or an error if it can't be found +func GetNodeWithName(r client.Reader, nodeName string) (*corev1.Node, error) { node := &corev1.Node{} key := client.ObjectKey{Name: nodeName} if err := r.Get(context.TODO(), key, node); err != nil { 
@@ -23,7 +23,7 @@ func getNodeWithName(r client.Reader, nodeName string) (*corev1.Node, error) { // IsNodeNameValid returns an error if nodeName doesn't match any node name int the cluster, otherwise a nil func IsNodeNameValid(r client.Reader, nodeName string) (bool, error) { - _, err := getNodeWithName(r, nodeName) + _, err := GetNodeWithName(r, nodeName) if err != nil { if apiErrors.IsNotFound(err) { // In case of notFound API error we don't return error, since it is valid result diff --git a/pkg/utils/taints.go b/pkg/utils/taints.go index 3818d9d2..81484e97 100644 --- a/pkg/utils/taints.go +++ b/pkg/utils/taints.go @@ -54,7 +54,7 @@ func CreateFARNoExecuteTaint() corev1.Taint { // AppendTaint appends new taint to the taint list when it is not present, and returns error if it fails in the process func AppendTaint(r client.Client, nodeName string) error { // find node by name - node, err := getNodeWithName(r, nodeName) + node, err := GetNodeWithName(r, nodeName) if err != nil { return err } @@ -81,7 +81,7 @@ func AppendTaint(r client.Client, nodeName string) error { // RemoveTaint removes taint from the taint list when it is existed, and returns error if it fails in the process func RemoveTaint(r client.Client, nodeName string) error { // find node by name - node, err := getNodeWithName(r, nodeName) + node, err := GetNodeWithName(r, nodeName) if err != nil { return err } diff --git a/test/e2e/far_e2e_test.go b/test/e2e/far_e2e_test.go index 635445d9..ed6e7015 100644 --- a/test/e2e/far_e2e_test.go +++ b/test/e2e/far_e2e_test.go @@ -118,12 +118,12 @@ var _ = Describe("FAR E2e", func() { }) When("running FAR to reboot two nodes", func() { It("should successfully remediate the first node", func() { - remediateNode(nodeName, succeesRebootMessage, nodeBootTimeBefore) + checkRemediation(nodeName, succeesRebootMessage, nodeBootTimeBefore) // next run create CR for the next worker node secondRun = true }) It("should successfully remediate the second node", func() { - 
remediateNode(nodeName, succeesRebootMessage, nodeBootTimeBefore) + checkRemediation(nodeName, succeesRebootMessage, nodeBootTimeBefore) }) }) @@ -134,7 +134,7 @@ var _ = Describe("FAR E2e", func() { func getNodeName(index int) (string, string) { nodes := &corev1.NodeList{} selector := labels.NewSelector() - requirement, _ := labels.NewRequirement(utils.WorkerLabelName, selection.Exists, []string{}) + requirement, _ := labels.NewRequirement(medik8sLabels.WorkerRole, selection.Exists, []string{}) selector = selector.Add(*requirement) Expect(k8sClient.List(context.Background(), nodes, &client.ListOptions{LabelSelector: selector})).ToNot(HaveOccurred()) if index < 0 { @@ -258,6 +258,19 @@ func buildNodeParameters(clusterPlatformType configv1.PlatformType) (map[v1alpha return testNodeParam, nil } +// wasFarTaintAdded checks whether the FAR taint was added to the tested node +func wasFarTaintAdded(nodeName string) { + farTaint := utils.CreateFARNoExecuteTaint() + var node *corev1.Node + Eventually(func() bool { + var err error + node, err = utils.GetNodeWithName(k8sClient, nodeName) + Expect(err).ToNot(HaveOccurred()) + return utils.TaintExists(node.Spec.Taints, &farTaint) + }, 1*time.Second, "200ms").Should(BeTrue()) + log.Info("FAR taint was added", "node name", node.Name, "taint key", farTaint.Key, "taint effect", farTaint.Effect) +} + // checkFarLogs gets the FAR pod and checks whether it's logs have logString func checkFarLogs(logString string) { EventuallyWithOffset(1, func() string { @@ -298,9 +311,12 @@ func wasNodeRebooted(nodeName string, nodeBootTimeBefore time.Time) { log.Info("successful reboot", "node", nodeName, "offset between last boot", nodeBootTimeAfter.Sub(nodeBootTimeBefore), "new boot time", nodeBootTimeAfter) } -// remediateNode run three functions to verify whether the node was remediated -func remediateNode(nodeName, logString string, nodeBootTimeBefore time.Time) { - By("Executing the FA command and receive success response") +// 
checkRemediation verify whether the node was remediated +func checkRemediation(nodeName, logString string, nodeBootTimeBefore time.Time) { + By("Check if FAR NoExecute taint was added") + wasFarTaintAdded(nodeName) + + By("Check if the response of the FA was a success") // TODO: When reboot is running only once and it is running on FAR node, then FAR pod will // be recreated on a new node and since the FA command won't be exuected again, then the log // won't include any success message From 79fe6c141f111fb0e7147be5e75e442ee9009d2a Mon Sep 17 00:00:00 2001 From: razo7 Date: Wed, 19 Jul 2023 15:55:40 +0300 Subject: [PATCH 3/4] Randomize tested worker node We randomize using the current time as seed and we store the old node name so we won't create new CR for the same node on the second test --- test/e2e/far_e2e_test.go | 121 +++++++++++++++++---------------------- 1 file changed, 51 insertions(+), 70 deletions(-) diff --git a/test/e2e/far_e2e_test.go b/test/e2e/far_e2e_test.go index ed6e7015..0880d851 100644 --- a/test/e2e/far_e2e_test.go +++ b/test/e2e/far_e2e_test.go @@ -3,6 +3,7 @@ package e2e import ( "context" "fmt" + "math/rand" "time" medik8sLabels "github.com/medik8s/common/pkg/labels" @@ -39,20 +40,18 @@ const ( pollInterval = 10 * time.Second ) +var previousNodeName string + var _ = Describe("FAR E2e", func() { var ( fenceAgent, nodeIdentifierPrefix string testShareParam map[v1alpha1.ParameterName]string testNodeParam map[v1alpha1.ParameterName]map[v1alpha1.NodeName]string - nodeIndex int - secondRun bool ) BeforeEach(func() { // create FAR CR spec based on OCP platformn clusterPlatform, err := e2eUtils.GetClusterInfo(configClient) - if err != nil { - Fail("can't identify the cluster platform") - } + Expect(err).ToNot(HaveOccurred(), "can't identify the cluster platform") fmt.Printf("\ncluster name: %s and PlatformType: %s \n", string(clusterPlatform.Name), string(clusterPlatform.Status.PlatformStatus.Type)) switch 
clusterPlatform.Status.PlatformStatus.Type { @@ -69,22 +68,16 @@ var _ = Describe("FAR E2e", func() { } testShareParam, err = buildSharedParameters(clusterPlatform, fenceAgentAction) - if err != nil { - Fail("can't get shared information") - } + Expect(err).ToNot(HaveOccurred(), "can't get shared information") testNodeParam, err = buildNodeParameters(clusterPlatform.Status.PlatformStatus.Type) - if err != nil { - Fail("can't get node information") - } - // run FA on the first worker node - nodeIndex = 0 + Expect(err).ToNot(HaveOccurred(), "can't get node information") }) Context("stress cluster", func() { var ( - nodeName, errString string - nodeBootTimeBefore time.Time - err error + nodeName string + nodeBootTimeBefore time.Time + err error ) BeforeEach(func() { nodes := &corev1.NodeList{} @@ -93,18 +86,11 @@ var _ = Describe("FAR E2e", func() { selector = selector.Add(*requirement) Expect(k8sClient.List(context.Background(), nodes, &client.ListOptions{LabelSelector: selector})).ToNot(HaveOccurred()) if len(nodes.Items) < 1 { - Fail("there are no worker nodes in the cluster") - } - if secondRun { - nodeIndex++ - } - nodeName, errString = getNodeName(nodeIndex) - if errString != "" { - if nodeIndex <= 0 { - Fail(errString) - } - Skip(errString) + Fail("No worker nodes found in the cluster") } + + nodeName = randomizeWorkerNode(nodes) + previousNodeName = nodeName nodeNameParam := v1alpha1.NodeName(nodeName) parameterName := v1alpha1.ParameterName(nodeIdentifierPrefix) testNodeID := testNodeParam[parameterName][nodeNameParam] @@ -119,8 +105,6 @@ var _ = Describe("FAR E2e", func() { When("running FAR to reboot two nodes", func() { It("should successfully remediate the first node", func() { checkRemediation(nodeName, succeesRebootMessage, nodeBootTimeBefore) - // next run create CR for the next worker node - secondRun = true }) It("should successfully remediate the second node", func() { checkRemediation(nodeName, succeesRebootMessage, nodeBootTimeBefore) @@ -130,47 
+114,6 @@ var _ = Describe("FAR E2e", func() { }) }) -// getNodeName returns the node's name based on valid index, otherwise it returns an error -func getNodeName(index int) (string, string) { - nodes := &corev1.NodeList{} - selector := labels.NewSelector() - requirement, _ := labels.NewRequirement(medik8sLabels.WorkerRole, selection.Exists, []string{}) - selector = selector.Add(*requirement) - Expect(k8sClient.List(context.Background(), nodes, &client.ListOptions{LabelSelector: selector})).ToNot(HaveOccurred()) - if index < 0 { - return "", "nodeIndex is invalid - smaller than zero" - } - if index >= len(nodes.Items) { - return "", fmt.Sprintf("nodeIndex is invalid - there are not enough available worker nodes for nodeIndex %d", index) - } - return nodes.Items[index].Name, "" -} - -// createFAR assigns the input to FenceAgentsRemediation object, creates CR, and returns the CR object -func createFAR(nodeName string, agent string, sharedParameters map[v1alpha1.ParameterName]string, nodeParameters map[v1alpha1.ParameterName]map[v1alpha1.NodeName]string) *v1alpha1.FenceAgentsRemediation { - far := &v1alpha1.FenceAgentsRemediation{ - ObjectMeta: metav1.ObjectMeta{Name: nodeName, Namespace: operatorNsName}, - Spec: v1alpha1.FenceAgentsRemediationSpec{ - Agent: agent, - SharedParameters: sharedParameters, - NodeParameters: nodeParameters, - }, - } - ExpectWithOffset(1, k8sClient.Create(context.Background(), far)).ToNot(HaveOccurred()) - return far -} - -// deleteFAR deletes the CR with offset -func deleteFAR(far *v1alpha1.FenceAgentsRemediation) { - EventuallyWithOffset(1, func() error { - err := k8sClient.Delete(context.Background(), far) - if apiErrors.IsNotFound(err) { - return nil - } - return err - }, 2*time.Minute, 10*time.Second).ShouldNot(HaveOccurred(), "failed to delete far") -} - // buildSharedParameters returns a map key-value of shared parameters based on cluster platform type if it finds the credentials, otherwise an error func 
buildSharedParameters(clusterPlatform *configv1.Infrastructure, action string) (map[v1alpha1.ParameterName]string, error) { const ( @@ -258,6 +201,44 @@ func buildNodeParameters(clusterPlatformType configv1.PlatformType) (map[v1alpha return testNodeParam, nil } +// randomizeWorkerNode returns a worker node name which is different than the previous one +// (on the first call it will allways return new node) +func randomizeWorkerNode(nodes *corev1.NodeList) string { + nodeName := previousNodeName + for previousNodeName == nodeName { + // Generate a random seed based on the current time + rand.New(rand.NewSource(time.Now().UnixNano())) + // Randomly select a worker node + nodeName = nodes.Items[rand.Intn(len(nodes.Items))].Name + } + return nodeName +} + +// createFAR assigns the input to FenceAgentsRemediation object, creates CR, and returns the CR object +func createFAR(nodeName string, agent string, sharedParameters map[v1alpha1.ParameterName]string, nodeParameters map[v1alpha1.ParameterName]map[v1alpha1.NodeName]string) *v1alpha1.FenceAgentsRemediation { + far := &v1alpha1.FenceAgentsRemediation{ + ObjectMeta: metav1.ObjectMeta{Name: nodeName, Namespace: operatorNsName}, + Spec: v1alpha1.FenceAgentsRemediationSpec{ + Agent: agent, + SharedParameters: sharedParameters, + NodeParameters: nodeParameters, + }, + } + ExpectWithOffset(1, k8sClient.Create(context.Background(), far)).ToNot(HaveOccurred()) + return far +} + +// deleteFAR deletes the CR with offset +func deleteFAR(far *v1alpha1.FenceAgentsRemediation) { + EventuallyWithOffset(1, func() error { + err := k8sClient.Delete(context.Background(), far) + if apiErrors.IsNotFound(err) { + return nil + } + return err + }, 2*time.Minute, 10*time.Second).ShouldNot(HaveOccurred(), "failed to delete far") +} + // wasFarTaintAdded checks whether the FAR taint was added to the tested node func wasFarTaintAdded(nodeName string) { farTaint := utils.CreateFARNoExecuteTaint() From fd36520ae2d0c4a16cd6b13b30d85bb41e8906d4 Mon 
Sep 17 00:00:00 2001 From: razo7 Date: Tue, 25 Jul 2023 14:29:57 +0300 Subject: [PATCH 4/4] Use random seed for node selection in E2E The random number was not being used to select the worker node --- test/e2e/far_e2e_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/e2e/far_e2e_test.go b/test/e2e/far_e2e_test.go index 0880d851..c5344dfa 100644 --- a/test/e2e/far_e2e_test.go +++ b/test/e2e/far_e2e_test.go @@ -207,9 +207,9 @@ func randomizeWorkerNode(nodes *corev1.NodeList) string { nodeName := previousNodeName for previousNodeName == nodeName { // Generate a random seed based on the current time - rand.New(rand.NewSource(time.Now().UnixNano())) + r := rand.New(rand.NewSource(time.Now().UnixNano())) // Randomly select a worker node - nodeName = nodes.Items[rand.Intn(len(nodes.Items))].Name + nodeName = nodes.Items[r.Intn(len(nodes.Items))].Name } return nodeName }