Skip to content

Commit

Permalink
Several fixes:
Browse files Browse the repository at this point in the history
- fix agent reconciler to respect SNR owned by Machines
- fix peer server SNR check
- align name / owner checks into own file
- reduce noisy logging

Signed-off-by: Marc Sluiter <[email protected]>
  • Loading branch information
slintes committed Jul 9, 2024
1 parent 3bdc9b3 commit 3dc4a7c
Show file tree
Hide file tree
Showing 9 changed files with 145 additions and 118 deletions.
96 changes: 96 additions & 0 deletions controllers/owner_and_name.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
package controllers

import (
"context"
"errors"

"github.com/go-logr/logr"
commonAnnotations "github.com/medik8s/common/pkg/annotations"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"sigs.k8s.io/controller-runtime/pkg/client"

"github.com/openshift/api/machine/v1beta1"

"github.com/medik8s/self-node-remediation/api/v1alpha1"
)

// GetNodeName gets the node name:
// - if owned by NHC, or as fallback, from annotation or CR name
// - if owned by a Machine, from the Machine's node reference
func GetNodeName(ctx context.Context, c client.Client, snr *v1alpha1.SelfNodeRemediation, log logr.Logger) (string, error) {
// NHC has priority, so check it first: in case the SNR is owned by NHC, get the node name from annotation or CR name
if ownedByNHC, _ := IsOwnedByNHC(snr); ownedByNHC {
return getNodeNameDirect(snr), nil
}
// in case the SNR is owned by a Machine, we need to check the Machine's nodeRef
if ownedByMachine, ref := IsOwnedByMachine(snr); ownedByMachine {
return getNodeNameFromMachine(ctx, c, ref, snr.GetNamespace(), log)
}
// fallback: annotation or name
return getNodeNameDirect(snr), nil
}

func getNodeNameDirect(snr *v1alpha1.SelfNodeRemediation) string {
nodeName, isNodeNameAnnotationExist := snr.GetAnnotations()[commonAnnotations.NodeNameAnnotation]
if isNodeNameAnnotationExist {
return nodeName
}
return snr.GetName()
}

// IsOwnedByNHC checks if the SNR CR is owned by a NodeHealthCheck CR.
func IsOwnedByNHC(snr *v1alpha1.SelfNodeRemediation) (bool, *metav1.OwnerReference) {
for _, ownerRef := range snr.OwnerReferences {
if ownerRef.Kind == "NodeHealthCheck" {
return true, &ownerRef
}
}
return false, nil
}

// IsOwnedByMachine checks if the SNR CR is owned by a Machine CR.
func IsOwnedByMachine(snr *v1alpha1.SelfNodeRemediation) (bool, *metav1.OwnerReference) {
for _, ownerRef := range snr.OwnerReferences {
if ownerRef.Kind == "Machine" {
return true, &ownerRef
}
}
return false, nil
}

// IsSNRMatching checks if the SNR CR is matching the node or machine name,
// and additionally returns the node name for the SNR in case machineName is empty
func IsSNRMatching(ctx context.Context, c client.Client, snr *v1alpha1.SelfNodeRemediation, nodeName string, machineName string, log logr.Logger) (bool, string, error) {
if isOwnedByMachine, ref := IsOwnedByMachine(snr); isOwnedByMachine && machineName == ref.Name {
return true, "", nil
}
snrNodeName, err := GetNodeName(ctx, c, snr, log)
if err != nil {
log.Error(err, "failed to get node name from machine")
return false, "", err
}
return snrNodeName == nodeName, snrNodeName, nil
}

func getNodeNameFromMachine(ctx context.Context, c client.Client, ref *metav1.OwnerReference, ns string, log logr.Logger) (string, error) {
machine := &v1beta1.Machine{}
machineKey := client.ObjectKey{
Name: ref.Name,
Namespace: ns,
}

if err := c.Get(ctx, machineKey, machine); err != nil {
log.Error(err, "failed to get machine from SelfNodeRemediation CR owner ref",
"machine name", machineKey.Name, "namespace", machineKey.Namespace)
return "", err
}

if machine.Status.NodeRef == nil {
err := errors.New("nodeRef is nil")
log.Error(err, "failed to retrieve node from the unhealthy machine")
return "", err
}

return machine.Status.NodeRef.Name, nil
}
90 changes: 14 additions & 76 deletions controllers/selfnoderemediation_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ import (
"time"

"github.com/go-logr/logr"
commonAnnotations "github.com/medik8s/common/pkg/annotations"
"github.com/medik8s/common/pkg/events"
"github.com/medik8s/common/pkg/resources"
"github.com/pkg/errors"
Expand All @@ -39,8 +38,6 @@ import (
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"

"github.com/openshift/api/machine/v1beta1"

"github.com/medik8s/self-node-remediation/api/v1alpha1"
"github.com/medik8s/self-node-remediation/pkg/reboot"
"github.com/medik8s/self-node-remediation/pkg/utils"
Expand Down Expand Up @@ -172,8 +169,12 @@ func (r *SelfNodeRemediationReconciler) ReconcileAgent(ctx context.Context, req
return ctrl.Result{}, err
}

targetNodeName := getNodeName(snr)
if targetNodeName != r.MyNodeName {
snrMatches, targetNodeName, err := IsSNRMatching(ctx, r.Client, snr, r.MyNodeName, "", r.logger)
if err != nil {
r.logger.Error(err, "failed to check if SNR matches our node")
return ctrl.Result{}, err
}
if !snrMatches {
r.logger.Info("agent pod skipping remediation because node belongs to a different agent", "Agent node name", r.MyNodeName, "Remediated node name", targetNodeName)
return ctrl.Result{}, nil
}
Expand All @@ -183,7 +184,7 @@ func (r *SelfNodeRemediationReconciler) ReconcileAgent(ctx context.Context, req
switch phase {
case preRebootCompletedPhase:
r.logger.Info("node reboot not completed yet, start rebooting")
node, err := r.getNodeFromSnr(snr)
node, err := r.getNodeFromSnr(ctx, snr)
if err != nil {
r.logger.Info("didn't find node, eventing might be incomplete", "node name", targetNodeName)
}
Expand Down Expand Up @@ -254,7 +255,7 @@ func (r *SelfNodeRemediationReconciler) ReconcileManager(ctx context.Context, re
result := ctrl.Result{}
var err error

node, err := r.getNodeFromSnr(snr)
node, err := r.getNodeFromSnr(ctx, snr)
if err != nil {
if apiErrors.IsNotFound(err) {
r.logger.Info("couldn't find node matching remediation", "remediation name", snr.Name)
Expand Down Expand Up @@ -688,65 +689,20 @@ func (r *SelfNodeRemediationReconciler) setTimeAssumedRebooted(ctx context.Conte
}

// getNodeFromSnr returns the unhealthy node reported in the given snr
func (r *SelfNodeRemediationReconciler) getNodeFromSnr(snr *v1alpha1.SelfNodeRemediation) (*v1.Node, error) {
// SNR could be created by either machine based controller (e.g. MHC) or
// by a node based controller (e.g. NHC).
// In case snr is created with machine owner reference if NHC isn't it's owner it means
// it was created by a machine based controller (e.g. MHC).
if !IsOwnedByNHC(snr) {
for _, ownerRef := range snr.OwnerReferences {
if ownerRef.Kind == "Machine" {
return r.getNodeFromMachine(ownerRef, snr.Namespace)
}
}
}

// since we didn't find a Machine owner ref, we assume that SNR remediation contains the node's name either in the
// remediation name or in its annotation
node := &v1.Node{}
key := client.ObjectKey{
Name: getNodeName(snr),
Namespace: "",
}

if err := r.Get(context.TODO(), key, node); err != nil {
return nil, err
}

return node, nil
}

func (r *SelfNodeRemediationReconciler) getNodeFromMachine(ref metav1.OwnerReference, ns string) (*v1.Node, error) {
machine := &v1beta1.Machine{}
machineKey := client.ObjectKey{
Name: ref.Name,
Namespace: ns,
}

if err := r.Client.Get(context.Background(), machineKey, machine); err != nil {
r.logger.Error(err, "failed to get machine from SelfNodeRemediation CR owner ref",
"machine name", machineKey.Name, "namespace", machineKey.Namespace)
return nil, err
}

if machine.Status.NodeRef == nil {
err := errors.New("nodeRef is nil")
r.logger.Error(err, "failed to retrieve node from the unhealthy machine")
func (r *SelfNodeRemediationReconciler) getNodeFromSnr(ctx context.Context, snr *v1alpha1.SelfNodeRemediation) (*v1.Node, error) {
nodeName, err := GetNodeName(ctx, r.Client, snr, r.logger)
if err != nil {
return nil, err
}

node := &v1.Node{}
key := client.ObjectKey{
Name: machine.Status.NodeRef.Name,
Namespace: machine.Status.NodeRef.Namespace,
Name: nodeName,
Namespace: "",
}

if err := r.Get(context.Background(), key, node); err != nil {
r.logger.Error(err, "failed to retrieve node from the unhealthy machine",
"node name", node.Name, "machine name", machine.Name)
if err := r.Get(ctx, key, node); err != nil {
return nil, err
}

return node, nil
}

Expand Down Expand Up @@ -950,21 +906,3 @@ func (r *SelfNodeRemediationReconciler) getRuntimeStrategy(snr *v1alpha1.SelfNod
return remediationStrategy

}

func IsOwnedByNHC(snr *v1alpha1.SelfNodeRemediation) bool {
for _, ownerRef := range snr.OwnerReferences {
if ownerRef.Kind == "NodeHealthCheck" {
return true
}
}
return false
}

// getNodeName checks for the node name in SNR CR's annotation. If it does not exist it assumes the node name equals to SNR CR's name and returns it.
func getNodeName(snr *v1alpha1.SelfNodeRemediation) string {
nodeName, isNodeNameAnnotationExist := snr.GetAnnotations()[commonAnnotations.NodeNameAnnotation]
if isNodeNameAnnotationExist {
return nodeName
}
return snr.GetName()
}
Original file line number Diff line number Diff line change
Expand Up @@ -419,7 +419,7 @@ var _ = Describe("SNR Controller", func() {
verifyEvent("Warning", "RemediationCannotStart", "Could not get remediation target Node")
})
})
When("NHC isn set as owner in the remediation", func() {
When("NHC is set as owner in the remediation", func() {
BeforeEach(func() {
snr.OwnerReferences = append(snr.OwnerReferences, metav1.OwnerReference{Name: "nhc", Kind: "NodeHealthCheck", APIVersion: "remediation.medik8s.io/v1alpha1", UID: "12345"})
})
Expand Down Expand Up @@ -625,7 +625,7 @@ func removeUnschedulableTaint() {
func verifyNodeIsUnschedulable() *v1.Node {
By("Verify that node was marked as unschedulable")
node := &v1.Node{}
Eventually(func() (bool, error) {
EventuallyWithOffset(1, func() (bool, error) {
err := k8sClient.Client.Get(context.TODO(), unhealthyNodeNamespacedName, node)
return node.Spec.Unschedulable, err
}, 5*time.Second, 250*time.Millisecond).Should(BeTrue(), "node should be marked as unschedulable")
Expand Down
2 changes: 1 addition & 1 deletion e2e/self_node_remediation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ var _ = Describe("Self Node Remediation E2E", func() {
Describe("Without API connectivity", func() {
var testStartTime *metav1.Time
BeforeEach(func() {
testStartTime = &metav1.Time{time.Now()}
testStartTime = &metav1.Time{Time: time.Now()}
})

Context("Healthy node (no SNR)", func() {
Expand Down
2 changes: 1 addition & 1 deletion main.go
Original file line number Diff line number Diff line change
Expand Up @@ -377,7 +377,7 @@ func initSelfNodeRemediationAgent(mgr manager.Manager) {

setupLog.Info("init grpc server")
// TODO make port configurable?
server, err := peerhealth.NewServer(snrReconciler, mgr.GetConfig(), ctrl.Log.WithName("peerhealth").WithName("server"), peerHealthDefaultPort, certReader)
server, err := peerhealth.NewServer(mgr.GetClient(), mgr.GetAPIReader(), ctrl.Log.WithName("peerhealth").WithName("server"), peerHealthDefaultPort, certReader)
if err != nil {
setupLog.Error(err, "failed to init grpc server")
os.Exit(1)
Expand Down
4 changes: 2 additions & 2 deletions pkg/apicheck/check.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,15 +88,15 @@ func (c *ApiConnectivityCheck) Start(ctx context.Context) error {
}
}
if failure != "" {
c.config.Log.Error(fmt.Errorf(failure), "failed to check api server")
c.config.Log.Info(fmt.Sprintf("failed to check api server: %s", failure))
if isHealthy := c.isConsideredHealthy(); !isHealthy {
// we have a problem on this node
c.config.Log.Error(err, "we are unhealthy, triggering a reboot")
if err := c.config.Rebooter.Reboot(); err != nil {
c.config.Log.Error(err, "failed to trigger reboot")
}
} else {
c.config.Log.Error(err, "peers did not confirm that we are unhealthy, ignoring error")
c.config.Log.Info("peers did not confirm that we are unhealthy, ignoring error")
}
return
} else {
Expand Down
2 changes: 1 addition & 1 deletion pkg/peerhealth/client_server_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ var _ = Describe("Checking health using grpc client and server", func() {
}

By("Creating server")
phServer, err = NewServer(snrReconciler, cfg, ctrl.Log.WithName("peerhealth test").WithName("phServer"), 9000, certReader)
phServer, err = NewServer(k8sClient, reader, ctrl.Log.WithName("peerhealth test").WithName("phServer"), 9000, certReader)
Expect(err).ToNot(HaveOccurred())

By("Starting server")
Expand Down
Loading

0 comments on commit 3dc4a7c

Please sign in to comment.