Skip to content

Commit

Permalink
Add RanDU soft reboot test case with helper functions
Browse files Browse the repository at this point in the history
Add a test case which soft reboots the cluster nodes via systemctl
reboot, waits for the node to recover and validates that the test
applications start without issues.
  • Loading branch information
mcornea committed Oct 5, 2023
1 parent 2ead379 commit c5f11e8
Show file tree
Hide file tree
Showing 5 changed files with 263 additions and 0 deletions.
18 changes: 18 additions & 0 deletions tests/internal/reboot/reboot.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
package reboot

import (
"github.com/openshift-kni/eco-gosystem/tests/internal/cmd"
)

// SoftRebootNode triggers a soft reboot of the node called nodeName by
// handing the command "chroot /rootfs systemctl reboot" to cmd.ExecCmd.
// The command output is discarded; any error reported by cmd.ExecCmd is
// returned unchanged.
func SoftRebootNode(nodeName string) error {
	rebootCmd := []string{"chroot", "/rootfs", "systemctl", "reboot"}

	if _, err := cmd.ExecCmd(rebootCmd, nodeName); err != nil {
		return err
	}

	return nil
}
77 changes: 77 additions & 0 deletions tests/internal/sriov/list.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
package sriov

import (
"encoding/json"

"github.com/golang/glog"
"github.com/openshift-kni/eco-goinfra/pkg/clients"
"github.com/openshift-kni/eco-goinfra/pkg/sriov"
metaV1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// NetworkInfo structure to store pod network information.
// It is the element type ExtractNetworkNames decodes the pod network status
// JSON into; only the "name" key of each entry is captured.
type NetworkInfo struct {
// Name is the value of the entry's "name" key.
Name string `json:"name"`
}

// ListNetworksByDeviceType collects the names of the sriov networks whose
// resource name equals the resource name of a sriov policy configured with
// the requested deviceType. Policies and networks are both listed from the
// "openshift-sriov-network-operator" namespace.
//
// A nil slice is returned when nothing matches. When either listing fails the
// failure is logged at verbosity 100 and the error is returned as-is.
func ListNetworksByDeviceType(
	apiClient *clients.Settings,
	deviceType string,
) ([]string, error) {
	const operatorNamespace = "openshift-sriov-network-operator"

	listOpts := metaV1.ListOptions{}

	policies, err := sriov.ListPolicy(apiClient, operatorNamespace, listOpts)
	if err != nil {
		glog.V(100).Infof("Failed to list sriov policies in namespace: %s", operatorNamespace)

		return nil, err
	}

	networks, err := sriov.List(apiClient, operatorNamespace, listOpts)
	if err != nil {
		glog.V(100).Infof("Failed to list sriov networks in namespace: %s", operatorNamespace)

		return nil, err
	}

	var matched []string

	for _, policy := range policies {
		// Only policies using the requested device type are of interest.
		if policy.Definition.Spec.DeviceType != deviceType {
			continue
		}

		for _, network := range networks {
			if network.Definition.Spec.ResourceName != policy.Definition.Spec.ResourceName {
				continue
			}

			matched = append(matched, network.Definition.Name)
		}
	}

	return matched, nil
}

// ExtractNetworkNames returns the names of the networks found in a pod's
// network status annotation, leaving out the "ovn-kubernetes" network.
//
// jsonData is decoded as a JSON array of objects carrying a "name" key.
// An empty jsonData (what reading a missing annotation from the annotations
// map yields) results in an empty list and no error. Any other input that
// cannot be decoded returns the json.Unmarshal error.
func ExtractNetworkNames(jsonData string) ([]string, error) {
	// A pod without the annotation has no additional networks to report;
	// do not turn that into a JSON syntax error.
	if jsonData == "" {
		return nil, nil
	}

	var networkInfo []NetworkInfo

	// Unmarshal the JSON data into the networkInfo slice.
	if err := json.Unmarshal([]byte(jsonData), &networkInfo); err != nil {
		return nil, err
	}

	// Collect every network name except the ovn-kubernetes one.
	var networkNames []string

	for _, info := range networkInfo {
		if info.Name != "ovn-kubernetes" {
			networkNames = append(networkNames, info.Name)
		}
	}

	return networkNames, nil
}
1 change: 1 addition & 0 deletions tests/ran-du/internal/randuconfig/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ type RanDuConfig struct {
CreateMethod string `yaml:"create_method" envconfig:"ECO_RANDU_TESTWORKLOAD_CREATE_METHOD"`
CreateShellCmd string `yaml:"create_shell_cmd" envconfig:"ECO_RANDU_TESTWORKLOAD_CREATE_SHELLCMD"`
} `yaml:"randu_test_workload"`
SoftRebootIterations string `yaml:"soft_reboot_iterations" envconfig:"ECO_RANDU_SOFT_REBOOT_ITERATIONS"`
}

// NewRanDuConfig returns instance of RanDuConfig config type.
Expand Down
1 change: 1 addition & 0 deletions tests/ran-du/internal/randuconfig/default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ randu_test_workload:
namespace: 'test'
create_method: 'shell'
create_shell_cmd: '/opt/vdu-workload-emulator/add_test-deployments.sh'
soft_reboot_iterations: '5'
166 changes: 166 additions & 0 deletions tests/ran-du/tests/soft-reboot.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
package ran_du_system_test

import (
"fmt"
"strconv"
"strings"
"time"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
"github.com/openshift-kni/eco-goinfra/pkg/deployment"
"github.com/openshift-kni/eco-goinfra/pkg/namespace"
"github.com/openshift-kni/eco-goinfra/pkg/nodes"
"github.com/openshift-kni/eco-goinfra/pkg/pod"
"github.com/openshift-kni/eco-goinfra/pkg/polarion"
"github.com/openshift-kni/eco-gosystem/tests/internal/await"
"github.com/openshift-kni/eco-gosystem/tests/internal/reboot"
"github.com/openshift-kni/eco-gosystem/tests/internal/shell"
"github.com/openshift-kni/eco-gosystem/tests/internal/sriov"
. "github.com/openshift-kni/eco-gosystem/tests/ran-du/internal/randuinittools"
"github.com/openshift-kni/eco-gosystem/tests/ran-du/internal/randuparams"
"github.com/openshift-kni/eco-gosystem/tests/ran-du/internal/randutestworkload"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// SoftReboot suite.
//
// BeforeAll prepares the test workload namespace and waits for its
// deployments and statefulsets to become ready. The single spec then soft
// reboots every listed node, repeating the whole pass for the configured
// number of iterations, and after each node reboot verifies that:
//   - the node became unreachable,
//   - the openshift-apiserver deployment is Available again,
//   - the workload deployments and statefulsets are ready again,
//   - pods attached to vfio-pci sriov networks list at least as many
//     entries under /dev/vfio as they have such attachments.
//
// AfterAll cleans the workload namespace.
var _ = Describe(
	"SoftReboot",
	Ordered,
	ContinueOnFailure,
	Label("SoftReboot"), func() {
		BeforeAll(func() {
			By("Preparing workload")

			// Start from a clean namespace when a previous run left objects behind.
			if namespace.NewBuilder(APIClient, RanDuTestConfig.TestWorkload.Namespace).Exists() {
				err := randutestworkload.CleanNameSpace(randuparams.DefaultTimeout, RanDuTestConfig.TestWorkload.Namespace)
				Expect(err).ToNot(HaveOccurred(), "Failed to clean workload test namespace objects")
			}

			if RanDuTestConfig.TestWorkload.CreateMethod == "shell" {
				By("Launching workload using shell method")
				_, err := shell.ExecuteCmd(RanDuTestConfig.TestWorkload.CreateShellCmd)
				Expect(err).ToNot(HaveOccurred(), "Failed to launch workload")
			}

			By("Waiting for deployment replicas to become ready")
			_, err := await.WaitUntilAllDeploymentsReady(APIClient, RanDuTestConfig.TestWorkload.Namespace,
				randuparams.DefaultTimeout)
			Expect(err).ToNot(HaveOccurred(), "error while waiting for deployment to become ready")

			By("Waiting for statefulset replicas to become ready")
			_, err = await.WaitUntilAllStatefulSetsReady(APIClient, RanDuTestConfig.TestWorkload.Namespace,
				randuparams.DefaultTimeout)
			Expect(err).ToNot(HaveOccurred(), "error while waiting for statefulsets to become ready")
		})
		It("Soft reboot nodes", polarion.ID("42738"), Label("SoftReboot"), func() {
			By("Retrieve nodes list")
			nodeList, err := nodes.List(
				APIClient,
				metav1.ListOptions{},
			)
			Expect(err).ToNot(HaveOccurred(), "Error listing nodes.")

			By("Pull openshift-apiserver deployment spec")
			deploy, err := deployment.Pull(APIClient, "apiserver", "openshift-apiserver")
			Expect(err).ToNot(HaveOccurred(), "error while pulling openshift apiserver deployment")

			// An unparsable iteration count must fail the spec: carrying on with
			// the zero value would skip every reboot and report a false pass.
			rebootIterations, err := strconv.Atoi(RanDuTestConfig.SoftRebootIterations)
			Expect(err).ToNot(HaveOccurred(),
				"invalid soft reboot iterations value: %q", RanDuTestConfig.SoftRebootIterations)

			for r := 0; r < rebootIterations; r++ {
				By("Soft rebooting cluster")
				fmt.Printf("Soft reboot iteration no. %d\n", r)

				for _, node := range nodeList {
					By("Reboot node")
					fmt.Printf("Reboot node %s\n", node.Definition.Name)
					err = reboot.SoftRebootNode(node.Definition.Name)
					Expect(err).ToNot(HaveOccurred(), "Error rebooting the nodes.")

					By("Wait for node to become unreachable")
					fmt.Printf("Wait for node %s to become unreachable\n", node.Definition.Name)
					err = await.WaitUntilNodeIsUnreachable(node.Definition.Name, 3*time.Minute)
					Expect(err).ToNot(HaveOccurred(), "Node is still reachable: %s", err)

					By("Wait for the openshift apiserver deployment to be available")
					err = deploy.WaitUntilCondition("Available", 5*time.Minute)
					Expect(err).ToNot(HaveOccurred(), "openshift apiserver deployment has not recovered in time after reboot")

					By("Wait for two more minutes for the cluster resources to reconciliate their state")
					time.Sleep(2 * time.Minute)

					By("Remove any pods in UnexpectedAdmissionError state")
					failedPodsOptions := metav1.ListOptions{
						FieldSelector: "status.phase=Failed",
					}
					podsList, err := pod.List(APIClient, RanDuTestConfig.TestWorkload.Namespace, failedPodsOptions)
					Expect(err).ToNot(HaveOccurred(), "could not retrieve pod list")

					for _, failedPod := range podsList {
						if failedPod.Definition.Status.Reason == "UnexpectedAdmissionError" {
							_, err := failedPod.DeleteAndWait(60 * time.Second)
							Expect(err).ToNot(HaveOccurred(), "could not delete pod in UnexpectedAdmissionError state")
						}
					}

					By("Waiting for deployment replicas to become ready")
					_, err = await.WaitUntilAllDeploymentsReady(APIClient, RanDuTestConfig.TestWorkload.Namespace,
						randuparams.DefaultTimeout)
					Expect(err).ToNot(HaveOccurred(), "error while waiting for deployment to become ready")

					By("Waiting for statefulset replicas to become ready")
					_, err = await.WaitUntilAllStatefulSetsReady(APIClient, RanDuTestConfig.TestWorkload.Namespace,
						randuparams.DefaultTimeout)
					Expect(err).ToNot(HaveOccurred(), "error while waiting for statefulsets to become ready")

					By("Retrieve pod list")
					podsList, err = pod.List(APIClient, RanDuTestConfig.TestWorkload.Namespace, metav1.ListOptions{})
					Expect(err).ToNot(HaveOccurred(), "could not retrieve pod list")

					By("Retrieve sriov networks with vfio-pci driver")
					vfioNetworks, err := sriov.ListNetworksByDeviceType(APIClient, "vfio-pci")
					Expect(err).ToNot(HaveOccurred(), "error when retrieving sriov network using vfio-pci driver")

					By("Assert devices under /dev/vfio on pod are equal or more to the pods vfio-pci network attachments\n")
					// The loop variable is deliberately not named "pod" so that it
					// does not shadow the imported pod package.
					for _, workloadPod := range podsList {
						networkNames, err := sriov.ExtractNetworkNames(
							workloadPod.Object.Annotations["k8s.v1.cni.cncf.io/network-status"])
						Expect(err).ToNot(HaveOccurred(), "error when retrieving pod network attachments")

						// Count the pod's attachments that reference a vfio-pci
						// network, matched as "<pod namespace>/<network name>".
						podvfioDevices := 0

						for _, vfioNet := range vfioNetworks {
							for _, podNet := range networkNames {
								if strings.Contains(podNet, workloadPod.Definition.Namespace+"/"+vfioNet) {
									podvfioDevices++
								}
							}
						}

						if podvfioDevices > 0 {
							fmt.Printf("Check /dev/vfio on pod %s\n", workloadPod.Definition.Name)
							lscmd := []string{"ls", "--color=never", "/dev/vfio"}
							lsOutput, err := workloadPod.ExecCommand(lscmd)
							Expect(err).ToNot(HaveOccurred(), "error when executing command on pod")

							// retry in case the command exec returns an empty string
							if len(lsOutput.String()) == 0 {
								lsOutput, err = workloadPod.ExecCommand(lscmd)
								Expect(err).ToNot(HaveOccurred(), "error when executing command on pod")
							}

							// Strip every "vfio" substring before splitting, so an entry
							// named exactly "vfio" is not counted (presumably the
							// /dev/vfio/vfio control node - confirm).
							vfioDevls := strings.Fields(strings.ReplaceAll(lsOutput.String(), "vfio", ""))
							Expect(len(vfioDevls)).To(BeNumerically(">=", podvfioDevices),
								"error: vfio devices inside pod( %s ) do not match pod %s attachments:",
								lsOutput.String(), workloadPod.Definition.Name)
						}
					}
				}
			}
		})
		AfterAll(func() {
			By("Cleaning up test workload resources")
			err := randutestworkload.CleanNameSpace(randuparams.DefaultTimeout, RanDuTestConfig.TestWorkload.Namespace)
			Expect(err).ToNot(HaveOccurred(), "Failed to clean workload test namespace objects")
		})
	})

0 comments on commit c5f11e8

Please sign in to comment.