Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Two Remediations and Taint Addition for E2E #62

Merged
merged 4 commits into from
Jul 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions pkg/utils/nodes.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ import (
"sigs.k8s.io/controller-runtime/pkg/client"
)

// getNodeWithName returns a node with a name nodeName, or an error if it can't be found
func getNodeWithName(r client.Reader, nodeName string) (*corev1.Node, error) {
// GetNodeWithName returns a node with a name nodeName, or an error if it can't be found
func GetNodeWithName(r client.Reader, nodeName string) (*corev1.Node, error) {
node := &corev1.Node{}
key := client.ObjectKey{Name: nodeName}
if err := r.Get(context.TODO(), key, node); err != nil {
Expand All @@ -23,7 +23,7 @@ func getNodeWithName(r client.Reader, nodeName string) (*corev1.Node, error) {

// IsNodeNameValid returns an error if nodeName doesn't match any node name in the cluster, otherwise a nil
func IsNodeNameValid(r client.Reader, nodeName string) (bool, error) {
_, err := getNodeWithName(r, nodeName)
_, err := GetNodeWithName(r, nodeName)
if err != nil {
if apiErrors.IsNotFound(err) {
// In case of notFound API error we don't return error, since it is valid result
Expand Down
4 changes: 2 additions & 2 deletions pkg/utils/taints.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ func CreateFARNoExecuteTaint() corev1.Taint {
// AppendTaint appends new taint to the taint list when it is not present, and returns error if it fails in the process
func AppendTaint(r client.Client, nodeName string) error {
// find node by name
node, err := getNodeWithName(r, nodeName)
node, err := GetNodeWithName(r, nodeName)
if err != nil {
return err
}
Expand All @@ -81,7 +81,7 @@ func AppendTaint(r client.Client, nodeName string) error {
// RemoveTaint removes taint from the taint list when it exists, and returns error if it fails in the process
func RemoveTaint(r client.Client, nodeName string) error {
// find node by name
node, err := getNodeWithName(r, nodeName)
node, err := GetNodeWithName(r, nodeName)
if err != nil {
return err
}
Expand Down
197 changes: 113 additions & 84 deletions test/e2e/far_e2e_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package e2e
import (
"context"
"fmt"
"math/rand"
"time"

medik8sLabels "github.com/medik8s/common/pkg/labels"
Expand Down Expand Up @@ -39,28 +40,44 @@ const (
pollInterval = 10 * time.Second
)

var previousNodeName string

var _ = Describe("FAR E2e", func() {
var (
far *v1alpha1.FenceAgentsRemediation
fenceAgent string
clusterPlatform *configv1.Infrastructure
err error
fenceAgent, nodeIdentifierPrefix string
testShareParam map[v1alpha1.ParameterName]string
testNodeParam map[v1alpha1.ParameterName]map[v1alpha1.NodeName]string
)
BeforeEach(func() {
clusterPlatform, err = e2eUtils.GetClusterInfo(configClient)
if err != nil {
Fail("can't identify the cluster platform")
}
// create FAR CR spec based on OCP platform
clusterPlatform, err := e2eUtils.GetClusterInfo(configClient)
Expect(err).ToNot(HaveOccurred(), "can't identify the cluster platform")
fmt.Printf("\ncluster name: %s and PlatformType: %s \n", string(clusterPlatform.Name), string(clusterPlatform.Status.PlatformStatus.Type))

switch clusterPlatform.Status.PlatformStatus.Type {
case configv1.AWSPlatformType:
fenceAgent = fenceAgentAWS
nodeIdentifierPrefix = nodeIdentifierPrefixAWS
By("running fence_aws")
case configv1.BareMetalPlatformType:
fenceAgent = fenceAgentIPMI
nodeIdentifierPrefix = nodeIdentifierPrefixIPMI
By("running fence_ipmilan")
default:
Skip("FAR haven't been tested on this kind of cluster (non AWS or BareMetal)")
}

testShareParam, err = buildSharedParameters(clusterPlatform, fenceAgentAction)
Expect(err).ToNot(HaveOccurred(), "can't get shared information")
testNodeParam, err = buildNodeParameters(clusterPlatform.Status.PlatformStatus.Type)
Expect(err).ToNot(HaveOccurred(), "can't get node information")
})

Context("fence agent - fence_aws or fence_ipmilan", func() {
Context("stress cluster", func() {
var (
nodeBootTimeBefore time.Time
errBoot error
testNodeName string
nodeIdentifierPrefix string
testNodeID string
nodeName string
nodeBootTimeBefore time.Time
err error
)
BeforeEach(func() {
nodes := &corev1.NodeList{}
Expand All @@ -69,88 +86,34 @@ var _ = Describe("FAR E2e", func() {
selector = selector.Add(*requirement)
Expect(k8sClient.List(context.Background(), nodes, &client.ListOptions{LabelSelector: selector})).ToNot(HaveOccurred())
if len(nodes.Items) < 1 {
Fail("there are no worker nodes in the cluster")
}
//TODO: Randomize the node selection & verify valid index
// run FA on the first worker node
nodeObj := nodes.Items[0]
testNodeName = nodeObj.Name

switch clusterPlatform.Status.PlatformStatus.Type {
case configv1.AWSPlatformType:
fenceAgent = fenceAgentAWS
nodeIdentifierPrefix = nodeIdentifierPrefixAWS
By("running fence_aws")
case configv1.BareMetalPlatformType:
fenceAgent = fenceAgentIPMI
nodeIdentifierPrefix = nodeIdentifierPrefixIPMI
By("running fence_ipmilan")
default:
Skip("FAR haven't been tested on this kind of cluster (non AWS or BareMetal)")
Fail("No worker nodes found in the cluster")
}

testShareParam, err := buildSharedParameters(clusterPlatform, fenceAgentAction)
if err != nil {
Fail("can't get shared information")
}
testNodeParam, err := buildNodeParameters(clusterPlatform.Status.PlatformStatus.Type)
if err != nil {
Fail("can't get node information")
}
nodeName := v1alpha1.NodeName(testNodeName)
nodeName = randomizeWorkerNode(nodes)
previousNodeName = nodeName
nodeNameParam := v1alpha1.NodeName(nodeName)
parameterName := v1alpha1.ParameterName(nodeIdentifierPrefix)
testNodeID = testNodeParam[parameterName][nodeName]
log.Info("Testing Node", "Node name", testNodeName, "Node ID", testNodeID)
testNodeID := testNodeParam[parameterName][nodeNameParam]
log.Info("Testing Node", "Node name", nodeName, "Node ID", testNodeID)

// save the node's boot time prior to the fence agent call
nodeBootTimeBefore, errBoot = e2eUtils.GetBootTime(clientSet, testNodeName, testNsName, log)
Expect(errBoot).ToNot(HaveOccurred(), "failed to get boot time of the node")

far = createFAR(testNodeName, fenceAgent, testShareParam, testNodeParam)
nodeBootTimeBefore, err = e2eUtils.GetBootTime(clientSet, nodeName, testNsName, log)
Expect(err).ToNot(HaveOccurred(), "failed to get boot time of the node")
far := createFAR(nodeName, fenceAgent, testShareParam, testNodeParam)
DeferCleanup(deleteFAR, far)
})
When("running FAR to reboot two nodes", func() {
It("should successfully remediate the first node", func() {
checkRemediation(nodeName, succeesRebootMessage, nodeBootTimeBefore)
})
It("should successfully remediate the second node", func() {
checkRemediation(nodeName, succeesRebootMessage, nodeBootTimeBefore)

When("running FAR to reboot node ", func() {
It("should execute the fence agent cli command", func() {
By("checking the CR has been created")
testFarCR := &v1alpha1.FenceAgentsRemediation{}
Expect(k8sClient.Get(context.Background(), client.ObjectKeyFromObject(far), testFarCR)).To(Succeed(), "failed to get FAR CR")

By("checking the command has been executed successfully")
checkFarLogs(succeesRebootMessage)

By("checking the node's boot time after running the FA")
wasNodeRebooted(testNodeName, nodeBootTimeBefore)
})
})
})
})

// createFAR builds a FenceAgentsRemediation CR named after nodeName in the operator namespace,
// using the given fence agent and its shared/node parameters, creates it in the cluster
// (asserting that the creation succeeded), and returns the created object
func createFAR(nodeName string, agent string, sharedParameters map[v1alpha1.ParameterName]string, nodeParameters map[v1alpha1.ParameterName]map[v1alpha1.NodeName]string) *v1alpha1.FenceAgentsRemediation {
	farSpec := v1alpha1.FenceAgentsRemediationSpec{
		Agent:            agent,
		SharedParameters: sharedParameters,
		NodeParameters:   nodeParameters,
	}
	far := &v1alpha1.FenceAgentsRemediation{
		ObjectMeta: metav1.ObjectMeta{Name: nodeName, Namespace: operatorNsName},
		Spec:       farSpec,
	}
	createErr := k8sClient.Create(context.Background(), far)
	ExpectWithOffset(1, createErr).ToNot(HaveOccurred())
	return far
}

// deleteFAR keeps trying to delete the given CR (every 10 seconds, for up to 2 minutes) until the
// deletion succeeds or the CR is reported as not found; assertion failures are reported with offset 1
func deleteFAR(far *v1alpha1.FenceAgentsRemediation) {
	tryDelete := func() error {
		// a CR that is already gone counts as a successful deletion
		if err := k8sClient.Delete(context.Background(), far); err != nil && !apiErrors.IsNotFound(err) {
			return err
		}
		return nil
	}
	EventuallyWithOffset(1, tryDelete, 2*time.Minute, 10*time.Second).ShouldNot(HaveOccurred(), "failed to delete far")
}

// buildSharedParameters returns a map key-value of shared parameters based on cluster platform type if it finds the credentials, otherwise an error
func buildSharedParameters(clusterPlatform *configv1.Infrastructure, action string) (map[v1alpha1.ParameterName]string, error) {
const (
Expand Down Expand Up @@ -238,6 +201,57 @@ func buildNodeParameters(clusterPlatformType configv1.PlatformType) (map[v1alpha
return testNodeParam, nil
}

// randomizeWorkerNode returns the name of a randomly selected node from nodes whose name is
// different than previousNodeName (on the first call previousNodeName is empty, so any node
// can be selected).
// When no such node exists the function does not loop forever: if every listed node is named
// previousNodeName (e.g. a cluster with a single worker node) that name is returned again, and
// for an empty list an empty string is returned.
func randomizeWorkerNode(nodes *corev1.NodeList) string {
	if len(nodes.Items) == 0 {
		// rand.Intn panics for a non-positive argument, so bail out early
		return ""
	}

	// collect the nodes that were not used by the previous selection
	candidates := make([]string, 0, len(nodes.Items))
	for i := range nodes.Items {
		if name := nodes.Items[i].Name; name != previousNodeName {
			candidates = append(candidates, name)
		}
	}
	if len(candidates) == 0 {
		// all the listed nodes carry the previous name, thus reusing it is the only option
		return nodes.Items[0].Name
	}

	// Generate a random seed based on the current time (once, not per attempt)
	r := rand.New(rand.NewSource(time.Now().UnixNano()))
	// Randomly select a worker node among the valid candidates
	return candidates[r.Intn(len(candidates))]
}

// createFAR builds a FenceAgentsRemediation CR named after nodeName in the operator namespace,
// using the given fence agent and its shared/node parameters, creates it in the cluster
// (asserting that the creation succeeded), and returns the created object
func createFAR(nodeName string, agent string, sharedParameters map[v1alpha1.ParameterName]string, nodeParameters map[v1alpha1.ParameterName]map[v1alpha1.NodeName]string) *v1alpha1.FenceAgentsRemediation {
	farSpec := v1alpha1.FenceAgentsRemediationSpec{
		Agent:            agent,
		SharedParameters: sharedParameters,
		NodeParameters:   nodeParameters,
	}
	far := &v1alpha1.FenceAgentsRemediation{
		ObjectMeta: metav1.ObjectMeta{Name: nodeName, Namespace: operatorNsName},
		Spec:       farSpec,
	}
	createErr := k8sClient.Create(context.Background(), far)
	ExpectWithOffset(1, createErr).ToNot(HaveOccurred())
	return far
}

// deleteFAR keeps trying to delete the given CR (every 10 seconds, for up to 2 minutes) until the
// deletion succeeds or the CR is reported as not found; assertion failures are reported with offset 1
func deleteFAR(far *v1alpha1.FenceAgentsRemediation) {
	tryDelete := func() error {
		// a CR that is already gone counts as a successful deletion
		if err := k8sClient.Delete(context.Background(), far); err != nil && !apiErrors.IsNotFound(err) {
			return err
		}
		return nil
	}
	EventuallyWithOffset(1, tryDelete, 2*time.Minute, 10*time.Second).ShouldNot(HaveOccurred(), "failed to delete far")
}

// wasFarTaintAdded checks whether the FAR taint was added to the tested node.
// It polls the node named nodeName every 200ms for up to 1 second and fails the test (reported
// with offset 1, i.e. at the caller) if the FAR NoExecute taint does not show up in time.
func wasFarTaintAdded(nodeName string) {
	farTaint := utils.CreateFARNoExecuteTaint()
	var node *corev1.Node
	EventuallyWithOffset(1, func() bool {
		var err error
		node, err = utils.GetNodeWithName(k8sClient, nodeName)
		if err != nil {
			// Asserting here would abort the spec on the first failed read, so a transient
			// API error is logged and treated as "not yet" to let the polling try again
			log.Error(err, "failed to get the node while waiting for FAR taint. Might try again", "node name", nodeName)
			return false
		}
		return utils.TaintExists(node.Spec.Taints, &farTaint)
	}, 1*time.Second, "200ms").Should(BeTrue(), "FAR taint was not added to the node")
	log.Info("FAR taint was added", "node name", node.Name, "taint key", farTaint.Key, "taint effect", farTaint.Effect)
}

// checkFarLogs gets the FAR pod and checks whether its logs contain logString
func checkFarLogs(logString string) {
EventuallyWithOffset(1, func() string {
Expand All @@ -249,7 +263,7 @@ func checkFarLogs(logString string) {
logs, err := e2eUtils.GetLogs(clientSet, pod, containerName)
if err != nil {
if apiErrors.IsNotFound(err) {
// If FAR pod was running in testNodeName, then after reboot it was recreated in another node, and with a new name.
// If FAR pod was running in nodeObj, then after reboot it was recreated in another node, and with a new name.
// Thus the "old" pod's name prior to this eventually won't link to a running pod, since it was already evicted by the reboot
log.Error(err, "failed to get logs. FAR pod might have been recreated due to rebooting the node it was resided. Might try again", "pod", pod.Name)
return ""
Expand Down Expand Up @@ -277,3 +291,18 @@ func wasNodeRebooted(nodeName string, nodeBootTimeBefore time.Time) {

log.Info("successful reboot", "node", nodeName, "offset between last boot", nodeBootTimeAfter.Sub(nodeBootTimeBefore), "new boot time", nodeBootTimeAfter)
}

// checkRemediation verify whether the node was remediated
func checkRemediation(nodeName, logString string, nodeBootTimeBefore time.Time) {
By("Check if FAR NoExecute taint was added")
wasFarTaintAdded(nodeName)

By("Check if the response of the FA was a success")
// TODO: When reboot is running only once and it is running on FAR node, then FAR pod will
razo7 marked this conversation as resolved.
Show resolved Hide resolved
// be recreated on a new node and since the FA command won't be exuected again, then the log
// won't include any success message
checkFarLogs(succeesRebootMessage)

By("Getting new node's boot time")
wasNodeRebooted(nodeName, nodeBootTimeBefore)
}