diff --git a/controllers/owner_and_name.go b/controllers/owner_and_name.go new file mode 100644 index 00000000..eda6eadb --- /dev/null +++ b/controllers/owner_and_name.go @@ -0,0 +1,96 @@ +package controllers + +import ( + "context" + "errors" + + "github.com/go-logr/logr" + commonAnnotations "github.com/medik8s/common/pkg/annotations" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/openshift/api/machine/v1beta1" + + "github.com/medik8s/self-node-remediation/api/v1alpha1" +) + +// GetNodeName gets the node name: +// - if owned by NHC, or as fallback, from annotation or CR name +// - if owned by a Machine, from the Machine's node reference +func GetNodeName(ctx context.Context, c client.Client, snr *v1alpha1.SelfNodeRemediation, log logr.Logger) (string, error) { + // NHC has priority, so check it first: in case the SNR is owned by NHC, get the node name from annotation or CR name + if ownedByNHC, _ := IsOwnedByNHC(snr); ownedByNHC { + return getNodeNameDirect(snr), nil + } + // in case the SNR is owned by a Machine, we need to check the Machine's nodeRef + if ownedByMachine, ref := IsOwnedByMachine(snr); ownedByMachine { + return getNodeNameFromMachine(ctx, c, ref, snr.GetNamespace(), log) + } + // fallback: annotation or name + return getNodeNameDirect(snr), nil +} + +func getNodeNameDirect(snr *v1alpha1.SelfNodeRemediation) string { + nodeName, isNodeNameAnnotationExist := snr.GetAnnotations()[commonAnnotations.NodeNameAnnotation] + if isNodeNameAnnotationExist { + return nodeName + } + return snr.GetName() +} + +// IsOwnedByNHC checks if the SNR CR is owned by a NodeHealthCheck CR. +func IsOwnedByNHC(snr *v1alpha1.SelfNodeRemediation) (bool, *metav1.OwnerReference) { + for _, ownerRef := range snr.OwnerReferences { + if ownerRef.Kind == "NodeHealthCheck" { + return true, &ownerRef + } + } + return false, nil +} + +// IsOwnedByMachine checks if the SNR CR is owned by a Machine CR. +func IsOwnedByMachine(snr *v1alpha1.SelfNodeRemediation) (bool, *metav1.OwnerReference) { + for _, ownerRef := range snr.OwnerReferences { + if ownerRef.Kind == "Machine" { + return true, &ownerRef + } + } + return false, nil +} + +// IsSNRMatching checks if the SNR CR is matching the node or machine name, +// and additionally returns the node name for the SNR in case machineName is empty +func IsSNRMatching(ctx context.Context, c client.Client, snr *v1alpha1.SelfNodeRemediation, nodeName string, machineName string, log logr.Logger) (bool, string, error) { + if isOwnedByMachine, ref := IsOwnedByMachine(snr); isOwnedByMachine && machineName == ref.Name { + return true, "", nil + } + snrNodeName, err := GetNodeName(ctx, c, snr, log) + if err != nil { + log.Error(err, "failed to get node name from machine") + return false, "", err + } + return snrNodeName == nodeName, snrNodeName, nil +} + +func getNodeNameFromMachine(ctx context.Context, c client.Client, ref *metav1.OwnerReference, ns string, log logr.Logger) (string, error) { + machine := &v1beta1.Machine{} + machineKey := client.ObjectKey{ + Name: ref.Name, + Namespace: ns, + } + + if err := c.Get(ctx, machineKey, machine); err != nil { + log.Error(err, "failed to get machine from SelfNodeRemediation CR owner ref", + "machine name", machineKey.Name, "namespace", machineKey.Namespace) + return "", err + } + + if machine.Status.NodeRef == nil { + err := errors.New("nodeRef is nil") + log.Error(err, "failed to retrieve node from the unhealthy machine") + return "", err + } + + return machine.Status.NodeRef.Name, nil +} diff --git a/controllers/selfnoderemediation_controller.go b/controllers/selfnoderemediation_controller.go index b4a57240..4283f0ec 100644 --- a/controllers/selfnoderemediation_controller.go +++ b/controllers/selfnoderemediation_controller.go @@ -22,7 +22,6 @@ import ( "time" "github.com/go-logr/logr" - commonAnnotations "github.com/medik8s/common/pkg/annotations" "github.com/medik8s/common/pkg/events" "github.com/medik8s/common/pkg/resources" "github.com/pkg/errors" @@ -39,8 +38,6 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" - "github.com/openshift/api/machine/v1beta1" - "github.com/medik8s/self-node-remediation/api/v1alpha1" "github.com/medik8s/self-node-remediation/pkg/reboot" "github.com/medik8s/self-node-remediation/pkg/utils" @@ -172,8 +169,12 @@ func (r *SelfNodeRemediationReconciler) ReconcileAgent(ctx context.Context, req return ctrl.Result{}, err } - targetNodeName := getNodeName(snr) - if targetNodeName != r.MyNodeName { + snrMatches, targetNodeName, err := IsSNRMatching(ctx, r.Client, snr, r.MyNodeName, "", r.logger) + if err != nil { + r.logger.Error(err, "failed to check if SNR matches our node") + return ctrl.Result{}, err + } + if !snrMatches { r.logger.Info("agent pod skipping remediation because node belongs to a different agent", "Agent node name", r.MyNodeName, "Remediated node name", targetNodeName) return ctrl.Result{}, nil } @@ -183,7 +184,7 @@ func (r *SelfNodeRemediationReconciler) ReconcileAgent(ctx context.Context, req switch phase { case preRebootCompletedPhase: r.logger.Info("node reboot not completed yet, start rebooting") - node, err := r.getNodeFromSnr(snr) + node, err := r.getNodeFromSnr(ctx, snr) if err != nil { r.logger.Info("didn't find node, eventing might be incomplete", "node name", targetNodeName) } @@ -254,7 +255,7 @@ func (r *SelfNodeRemediationReconciler) ReconcileManager(ctx context.Context, re result := ctrl.Result{} var err error - node, err := r.getNodeFromSnr(snr) + node, err := r.getNodeFromSnr(ctx, snr) if err != nil { if apiErrors.IsNotFound(err) { r.logger.Info("couldn't find node matching remediation", "remediation name", snr.Name) @@ -688,65 +689,20 @@ func (r *SelfNodeRemediationReconciler) setTimeAssumedRebooted(ctx context.Conte } // getNodeFromSnr returns the unhealthy node reported in the given snr -func (r *SelfNodeRemediationReconciler) getNodeFromSnr(snr *v1alpha1.SelfNodeRemediation) (*v1.Node, error) { - // SNR could be created by either machine based controller (e.g. MHC) or - // by a node based controller (e.g. NHC). - // In case snr is created with machine owner reference if NHC isn't it's owner it means - // it was created by a machine based controller (e.g. MHC). - if !IsOwnedByNHC(snr) { - for _, ownerRef := range snr.OwnerReferences { - if ownerRef.Kind == "Machine" { - return r.getNodeFromMachine(ownerRef, snr.Namespace) - } - } - } - - // since we didn't find a Machine owner ref, we assume that SNR remediation contains the node's name either in the - // remediation name or in its annotation - node := &v1.Node{} - key := client.ObjectKey{ - Name: getNodeName(snr), - Namespace: "", - } - - if err := r.Get(context.TODO(), key, node); err != nil { - return nil, err - } - - return node, nil -} - -func (r *SelfNodeRemediationReconciler) getNodeFromMachine(ref metav1.OwnerReference, ns string) (*v1.Node, error) { - machine := &v1beta1.Machine{} - machineKey := client.ObjectKey{ - Name: ref.Name, - Namespace: ns, - } - - if err := r.Client.Get(context.Background(), machineKey, machine); err != nil { - r.logger.Error(err, "failed to get machine from SelfNodeRemediation CR owner ref", - "machine name", machineKey.Name, "namespace", machineKey.Namespace) - return nil, err - } - - if machine.Status.NodeRef == nil { - err := errors.New("nodeRef is nil") - r.logger.Error(err, "failed to retrieve node from the unhealthy machine") +func (r *SelfNodeRemediationReconciler) getNodeFromSnr(ctx context.Context, snr *v1alpha1.SelfNodeRemediation) (*v1.Node, error) { + nodeName, err := GetNodeName(ctx, r.Client, snr, r.logger) + if err != nil { return nil, err } node := &v1.Node{} key := client.ObjectKey{ - Name: machine.Status.NodeRef.Name, - Namespace: machine.Status.NodeRef.Namespace, + Name: nodeName, + Namespace: "", } - - if err := r.Get(context.Background(), key, node); err != nil { - r.logger.Error(err, "failed to retrieve node from the unhealthy machine", - "node name", node.Name, "machine name", machine.Name) + if err := r.Get(ctx, key, node); err != nil { return nil, err } - return node, nil } @@ -950,21 +906,3 @@ func (r *SelfNodeRemediationReconciler) getRuntimeStrategy(snr *v1alpha1.SelfNod return remediationStrategy } - -func IsOwnedByNHC(snr *v1alpha1.SelfNodeRemediation) bool { - for _, ownerRef := range snr.OwnerReferences { - if ownerRef.Kind == "NodeHealthCheck" { - return true - } - } - return false -} - -// getNodeName checks for the node name in SNR CR's annotation. If it does not exist it assumes the node name equals to SNR CR's name and returns it. -func getNodeName(snr *v1alpha1.SelfNodeRemediation) string { - nodeName, isNodeNameAnnotationExist := snr.GetAnnotations()[commonAnnotations.NodeNameAnnotation] - if isNodeNameAnnotationExist { - return nodeName - } - return snr.GetName() -} diff --git a/controllers/tests/controller/selfnoderemediation_controller_test.go b/controllers/tests/controller/selfnoderemediation_controller_test.go index d365491b..fe9a15ca 100644 --- a/controllers/tests/controller/selfnoderemediation_controller_test.go +++ b/controllers/tests/controller/selfnoderemediation_controller_test.go @@ -419,7 +419,7 @@ var _ = Describe("SNR Controller", func() { verifyEvent("Warning", "RemediationCannotStart", "Could not get remediation target Node") }) }) - When("NHC isn set as owner in the remediation", func() { + When("NHC is set as owner in the remediation", func() { BeforeEach(func() { snr.OwnerReferences = append(snr.OwnerReferences, metav1.OwnerReference{Name: "nhc", Kind: "NodeHealthCheck", APIVersion: "remediation.medik8s.io/v1alpha1", UID: "12345"}) }) @@ -625,7 +625,7 @@ func removeUnschedulableTaint() { func verifyNodeIsUnschedulable() *v1.Node { By("Verify that node was marked as unschedulable") node := &v1.Node{} - Eventually(func() (bool, error) { + EventuallyWithOffset(1, func() (bool, error) { err := k8sClient.Client.Get(context.TODO(), unhealthyNodeNamespacedName, node) return node.Spec.Unschedulable, err }, 5*time.Second, 250*time.Millisecond).Should(BeTrue(), "node should be marked as unschedulable") diff --git a/e2e/self_node_remediation_test.go b/e2e/self_node_remediation_test.go index 0ef9dc74..40f3b365 100644 --- a/e2e/self_node_remediation_test.go +++ b/e2e/self_node_remediation_test.go @@ -133,7 +133,7 @@ var _ = Describe("Self Node Remediation E2E", func() { Describe("Without API connectivity", func() { var testStartTime *metav1.Time BeforeEach(func() { - testStartTime = &metav1.Time{time.Now()} + testStartTime = &metav1.Time{Time: time.Now()} }) Context("Healthy node (no SNR)", func() { diff --git a/main.go b/main.go index 3ad723fa..cb0dbb73 100644 --- a/main.go +++ b/main.go @@ -377,7 +377,7 @@ func initSelfNodeRemediationAgent(mgr manager.Manager) { setupLog.Info("init grpc server") // TODO make port configurable? - server, err := peerhealth.NewServer(snrReconciler, mgr.GetConfig(), ctrl.Log.WithName("peerhealth").WithName("server"), peerHealthDefaultPort, certReader) + server, err := peerhealth.NewServer(mgr.GetClient(), mgr.GetAPIReader(), ctrl.Log.WithName("peerhealth").WithName("server"), peerHealthDefaultPort, certReader) if err != nil { setupLog.Error(err, "failed to init grpc server") os.Exit(1) diff --git a/pkg/apicheck/check.go b/pkg/apicheck/check.go index 4fb6275e..f3270a4d 100644 --- a/pkg/apicheck/check.go +++ b/pkg/apicheck/check.go @@ -88,7 +88,7 @@ func (c *ApiConnectivityCheck) Start(ctx context.Context) error { } } if failure != "" { - c.config.Log.Error(fmt.Errorf(failure), "failed to check api server") + c.config.Log.Info(fmt.Sprintf("failed to check api server: %s", failure)) if isHealthy := c.isConsideredHealthy(); !isHealthy { // we have a problem on this node c.config.Log.Error(err, "we are unhealthy, triggering a reboot") @@ -96,7 +96,7 @@ func (c *ApiConnectivityCheck) Start(ctx context.Context) error { c.config.Log.Error(err, "failed to trigger reboot") } } else { - c.config.Log.Error(err, "peers did not confirm that we are unhealthy, ignoring error") + c.config.Log.Info("peers did not confirm that we are unhealthy, ignoring error") } return } else { diff --git a/pkg/peerhealth/client_server_test.go b/pkg/peerhealth/client_server_test.go index 5ddbe7bc..b520c91e 100644 --- a/pkg/peerhealth/client_server_test.go +++ b/pkg/peerhealth/client_server_test.go @@ -48,7 +48,7 @@ var _ = Describe("Checking health using grpc client and server", func() { } By("Creating server") - phServer, err = NewServer(snrReconciler, cfg, ctrl.Log.WithName("peerhealth test").WithName("phServer"), 9000, certReader) + phServer, err = NewServer(k8sClient, reader, ctrl.Log.WithName("peerhealth test").WithName("phServer"), 9000, certReader) Expect(err).ToNot(HaveOccurred()) By("Starting server") diff --git a/pkg/peerhealth/server.go b/pkg/peerhealth/server.go index e02c5bc8..93ae7cb2 100644 --- a/pkg/peerhealth/server.go +++ b/pkg/peerhealth/server.go @@ -4,18 +4,14 @@ import ( "context" "fmt" "net" - "strings" "time" "github.com/go-logr/logr" "google.golang.org/grpc" corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/runtime/schema" - "k8s.io/client-go/dynamic" - "k8s.io/client-go/rest" + "sigs.k8s.io/controller-runtime/pkg/client" selfNodeRemediationApis "github.com/medik8s/self-node-remediation/api" "github.com/medik8s/self-node-remediation/api/v1alpha1" @@ -46,25 +42,18 @@ var ( type Server struct { UnimplementedPeerHealthServer - client dynamic.Interface - snr *controllers.SelfNodeRemediationReconciler + c client.Client + reader client.Reader log logr.Logger certReader certificates.CertStorageReader port int } // NewServer returns a new Server -func NewServer(snr *controllers.SelfNodeRemediationReconciler, conf *rest.Config, log logr.Logger, port int, certReader certificates.CertStorageReader) (*Server, error) { - - // create dynamic client - c, err := dynamic.NewForConfig(conf) - if err != nil { - return nil, err - } - +func NewServer(c client.Client, reader client.Reader, log logr.Logger, port int, certReader certificates.CertStorageReader) (*Server, error) { return &Server{ - client: c, - snr: snr, + c: c, + reader: reader, log: log, certReader: certReader, port: port, @@ -112,7 +101,7 @@ func (s *Server) Start(ctx context.Context) error { } // IsHealthy checks if the given node is healthy -func (s Server) IsHealthy(ctx context.Context, request *HealthRequest) (*HealthResponse, error) { +func (s *Server) IsHealthy(ctx context.Context, request *HealthRequest) (*HealthResponse, error) { s.log.Info("IsHealthy", "node", request.GetNodeName()) nodeName := request.GetNodeName() @@ -123,36 +112,36 @@ func (s Server) IsHealthy(ctx context.Context, request *HealthRequest) (*HealthR apiCtx, cancelFunc := context.WithTimeout(ctx, apiServerTimeout) defer cancelFunc() - //fetch all snrs from all ns + // list snrs from all ns + // don't use cache, because this also tests API server connectivity! snrs := &v1alpha1.SelfNodeRemediationList{} - if err := s.snr.List(apiCtx, snrs); err != nil { - s.log.Error(err, "api error failed to fetch snrs") + if err := s.reader.List(apiCtx, snrs); err != nil { + s.log.Error(err, "api error, failed to list snrs") return toResponse(selfNodeRemediationApis.ApiError) } - //return healthy only if all of snrs are considered healthy for that node - for _, snr := range snrs.Items { - isOwnedByNHC := controllers.IsOwnedByNHC(&snr) - if isOwnedByNHC && strings.HasPrefix(snr.Name, nodeName) { - s.log.Info("IsHealthy OWNED by NHC unhealthy", "snr name", snr.Name, "node", nodeName) - return toResponse(selfNodeRemediationApis.Unhealthy) - - } else if !isOwnedByNHC && snr.Name == request.MachineName { - s.log.Info("IsHealthy NOT OWNED by NHC unhealthy", "snr name", snr.Name, "machine", request.MachineName) + // return healthy only if no snr matches that node + for i := range snrs.Items { + snrMatches, _, err := controllers.IsSNRMatching(ctx, s.c, &snrs.Items[i], nodeName, request.GetMachineName(), s.log) + if err != nil { + s.log.Error(err, "failed to check if SNR matches node") + continue + } + if snrMatches { + s.log.Info("found matching SNR, node is unhealthy", "node", nodeName, "machine", request.MachineName) return toResponse(selfNodeRemediationApis.Unhealthy) } - s.log.Info("IsHealthy continue", "snr name", snr.Name) } - s.log.Info("IsHealthy IS indeed healthy", "node", nodeName, "machine", request.MachineName) + s.log.Info("no matching SNR found, node is considered healthy", "node", nodeName, "machine", request.MachineName) return toResponse(selfNodeRemediationApis.Healthy) } -func (s Server) getNode(ctx context.Context, nodeName string) (*unstructured.Unstructured, error) { +func (s *Server) getNode(ctx context.Context, nodeName string) (*corev1.Node, error) { apiCtx, cancelFunc := context.WithTimeout(ctx, apiServerTimeout) defer cancelFunc() - node, err := s.client.Resource(nodeRes).Namespace("").Get(apiCtx, nodeName, metav1.GetOptions{}) - if err != nil { + node := &corev1.Node{} + if err := s.c.Get(apiCtx, client.ObjectKey{Name: nodeName}, node); err != nil { s.log.Error(err, "api error") return nil, err } diff --git a/pkg/peerhealth/suite_test.go b/pkg/peerhealth/suite_test.go index 1cf3b1e2..c76abf80 100644 --- a/pkg/peerhealth/suite_test.go +++ b/pkg/peerhealth/suite_test.go @@ -32,6 +32,7 @@ const nodeName = "somenode" var cfg *rest.Config var k8sClient client.Client +var reader client.Reader var testEnv *envtest.Environment var snrReconciler *controllers.SelfNodeRemediationReconciler var cancelFunc context.CancelFunc @@ -72,6 +73,9 @@ var _ = BeforeSuite(func() { k8sClient = k8sManager.GetClient() Expect(k8sClient).ToNot(BeNil()) + reader = k8sManager.GetAPIReader() + Expect(reader).ToNot(BeNil()) + // we need a reconciler for getting last SNR namespace snrReconciler = &controllers.SelfNodeRemediationReconciler{ Client: k8sClient,