From 990a2bcb426299dd404c03dbc2a882d291322643 Mon Sep 17 00:00:00 2001 From: Keiichi Kii Date: Thu, 6 Jul 2023 07:37:14 -0400 Subject: [PATCH 1/9] Add log messages to trace OutOfServiceTaint remediation Signed-off-by: Keiichi Kii Signed-off-by: Keiichi Kii --- controllers/selfnoderemediation_controller.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/controllers/selfnoderemediation_controller.go b/controllers/selfnoderemediation_controller.go index 86a3f664..797aba2c 100644 --- a/controllers/selfnoderemediation_controller.go +++ b/controllers/selfnoderemediation_controller.go @@ -871,6 +871,7 @@ func (r *SelfNodeRemediationReconciler) removeOutOfServiceTaint(node *v1.Node) e r.logger.Error(err, "Failed to remove taint from node,", "node name", node.Name, "taint key", OutOfServiceTaint.Key, "taint effect", OutOfServiceTaint.Effect) return err } + r.logger.Info("outofservice taint removed", "new taints", node.Spec.Taints) return nil } @@ -882,6 +883,7 @@ func (r *SelfNodeRemediationReconciler) isResourceDeletionCompleted(node *v1.Nod } for _, pod := range pods.Items { if pod.Spec.NodeName == node.Name && r.isPodTerminating(&pod) { + r.logger.Info("waiting for terminating pod ", "pod name", pod.Name, "phase", pod.Status.Phase) return false } } @@ -892,6 +894,7 @@ func (r *SelfNodeRemediationReconciler) isResourceDeletionCompleted(node *v1.Nod } for _, va := range volumeAttachments.Items { if va.Spec.NodeName == node.Name { + r.logger.Info("waiting for deleting volumeAttachement", "name", va.Name) return false } } From 653f6a5310d85a740e93786c582314632dcceb0c Mon Sep 17 00:00:00 2001 From: Carlo Lobrano Date: Tue, 11 Jul 2023 12:11:12 +0200 Subject: [PATCH 2/9] Permit to pass further options to Makefile test targets Add TEST_OPS variable to permit passing convenient flags while adding or updating tests/features e.g. * -ginkgo.v[v] * -ginkgo.focus Signed-off-by: Carlo Lobrano --- Makefile | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 2f92886f..32cbfd8d 100644 --- a/Makefile +++ b/Makefile @@ -163,11 +163,13 @@ BIN_ASSETS_DIR=$(shell pwd)/bin ENVTEST_ASSETS_DIR = ${BIN_ASSETS_DIR}/setup-envtest ENVTEST = $(shell pwd)/bin/setup-envtest +# Use TEST_OPS to pass further options to `go test` (e.g. -gingo.v and/or -ginkgo.focus) +export TEST_OPS ?= "" .PHONY: test test: envtest manifests generate fmt vet ## Run tests. KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) -p path --bin-dir $(PROJECT_DIR)/testbin)" \ KUBEBUILDER_CONTROLPLANE_STOP_TIMEOUT="60s"\ - go test ./api/... ./controllers/... ./pkg/... -coverprofile cover.out -v + go test ./api/... ./controllers/... ./pkg/... -coverprofile cover.out -v ${TEST_OPS} .PHONY: bundle-run export BUNDLE_RUN_NAMESPACE ?= openshift-operators @@ -328,7 +330,7 @@ protoc-gen-go-grpc: ## Download protoc-gen-go-grpc locally if necessary. e2e-test: # KUBECONFIG must be set to the cluster, and PP needs to be deployed already # count arg makes the test ignoring cached test results - go test ./e2e -ginkgo.v -ginkgo.progress -test.v -timeout 60m -count=1 + go test ./e2e -ginkgo.v -ginkgo.progress -test.v -timeout 60m -count=1 ${TEST_OPS} .PHONY: operator-sdk OPERATOR_SDK_BIN_FOLDER = ./bin/operator-sdk @@ -429,4 +431,4 @@ fix-imports: sort-imports ## Sort imports $(SORT_IMPORTS) -w . .PHONY: full-gen -full-gen: generate manifests vendor tidy bundle fix-imports bundle-reset ## generates all automatically generated content \ No newline at end of file +full-gen: generate manifests vendor tidy bundle fix-imports bundle-reset ## generates all automatically generated content From 275bc6f6d8551adae286d4be4d23f59e801bd070 Mon Sep 17 00:00:00 2001 From: Carlo Lobrano Date: Wed, 5 Jul 2023 14:27:43 +0200 Subject: [PATCH 3/9] Dockerfile improvements - reduce image package size cleaning up temporary files kept for repositories. closes: https://issues.redhat.com/browse/ECOPROJECT-1417 Signed-off-by: Carlo Lobrano --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index b2bac207..7c7d22cf 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,6 @@ # Build the manager binary FROM quay.io/centos/centos:stream8 AS builder -RUN yum install golang -y +RUN yum install golang -y && yum clean all # Ensure correct Go version ENV GO_VERSION=1.20 From 322836d0a84be89f65ce3d748ca8eae9992136dd Mon Sep 17 00:00:00 2001 From: Michael Shitrit Date: Wed, 12 Jul 2023 18:19:10 +0300 Subject: [PATCH 4/9] use sysrq-trigger for software reboot Signed-off-by: Michael Shitrit --- pkg/reboot/rebooter.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/reboot/rebooter.go b/pkg/reboot/rebooter.go index 4d3bccec..067117dd 100644 --- a/pkg/reboot/rebooter.go +++ b/pkg/reboot/rebooter.go @@ -82,7 +82,7 @@ func (r *watchdogRebooter) Reboot() error { func (r *watchdogRebooter) softwareReboot() error { r.log.Info("about to try software reboot") // hostPID: true and privileged:true required to run this - rebootCmd := exec.Command("/usr/bin/nsenter", "-m/proc/1/ns/mnt", "/bin/systemctl", "reboot", "--force", "--force") + rebootCmd := exec.Command("bash", "-c", "echo c > /proc/sysrq-trigger") if err := rebootCmd.Run(); err != nil { r.log.Error(err, "failed to run reboot command") From 968cd10d425f39d8cf9b8588c02d4dc856cc1dc9 Mon Sep 17 00:00:00 2001 From: Carlo Lobrano Date: Wed, 5 Jul 2023 14:34:50 +0200 Subject: [PATCH 5/9] Explicitly set MinVersion of TLS Currently, the default MinVersion value for TLS configuration is used, which is TLS1.0 and considered insecure. Explicitly set the MinVersion to a secure version of TLS. closes: https://issues.redhat.com/browse/ECOPROJECT-1419 Signed-off-by: Carlo Lobrano --- pkg/certificates/credentials.go | 4 ++++ pkg/controlplane/manager.go | 6 +++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/pkg/certificates/credentials.go b/pkg/certificates/credentials.go index 32824e0b..c7ebc946 100644 --- a/pkg/certificates/credentials.go +++ b/pkg/certificates/credentials.go @@ -8,6 +8,8 @@ import ( "google.golang.org/grpc/credentials" ) +const TLSMinVersion = tls.VersionTLS13 + func GetServerCredentialsFromCerts(certReader CertStorageReader) (credentials.TransportCredentials, error) { keyPair, pool, err := prepareCredentials(certReader) @@ -19,6 +21,7 @@ func GetServerCredentialsFromCerts(certReader CertStorageReader) (credentials.Tr Certificates: []tls.Certificate{*keyPair}, ClientAuth: tls.RequireAndVerifyClientCert, ClientCAs: pool, + MinVersion: TLSMinVersion, }), nil } @@ -33,6 +36,7 @@ func GetClientCredentialsFromCerts(certReader CertStorageReader) (credentials.Tr Certificates: []tls.Certificate{*keyPair}, RootCAs: pool, ServerName: fixedCertIP.String(), + MinVersion: TLSMinVersion, }), nil } diff --git a/pkg/controlplane/manager.go b/pkg/controlplane/manager.go index 0f92fed5..ad50b536 100644 --- a/pkg/controlplane/manager.go +++ b/pkg/controlplane/manager.go @@ -16,6 +16,7 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" + "github.com/medik8s/self-node-remediation/pkg/certificates" "github.com/medik8s/self-node-remediation/pkg/peers" "github.com/medik8s/self-node-remediation/pkg/utils" ) @@ -150,7 +151,10 @@ func (manager *Manager) isEndpointAccessible() bool { func (manager *Manager) isKubeletServiceRunning() bool { url := fmt.Sprintf("https://%s:%s/pods", manager.nodeName, kubeletPort) tr := &http.Transport{ - TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, + TLSClientConfig: &tls.Config{ + InsecureSkipVerify: true, + MinVersion: certificates.TLSMinVersion, + }, } httpClient := &http.Client{Transport: tr} From e05bac40b6ab4c8dd3f9d00a18c2baa0059bf1c5 Mon Sep 17 00:00:00 2001 From: Michael Shitrit Date: Sun, 23 Jul 2023 12:30:46 +0300 Subject: [PATCH 6/9] changing from b to c (c includes unneeded kdump) Signed-off-by: Michael Shitrit --- pkg/reboot/rebooter.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/reboot/rebooter.go b/pkg/reboot/rebooter.go index 067117dd..f5215c1e 100644 --- a/pkg/reboot/rebooter.go +++ b/pkg/reboot/rebooter.go @@ -82,7 +82,7 @@ func (r *watchdogRebooter) Reboot() error { func (r *watchdogRebooter) softwareReboot() error { r.log.Info("about to try software reboot") // hostPID: true and privileged:true required to run this - rebootCmd := exec.Command("bash", "-c", "echo c > /proc/sysrq-trigger") + rebootCmd := exec.Command("bash", "-b", "echo c > /proc/sysrq-trigger") if err := rebootCmd.Run(); err != nil { r.log.Error(err, "failed to run reboot command") From 5534bd41fbbb2e76f033159b91202c9c1c10cf9e Mon Sep 17 00:00:00 2001 From: Michael Shitrit Date: Tue, 25 Jul 2023 14:52:04 +0300 Subject: [PATCH 7/9] use `nsenter` to make sure we have access to relevant binaries installed on the host and are required for the reboot action. Signed-off-by: Michael Shitrit --- pkg/reboot/rebooter.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/reboot/rebooter.go b/pkg/reboot/rebooter.go index f5215c1e..49bde2ab 100644 --- a/pkg/reboot/rebooter.go +++ b/pkg/reboot/rebooter.go @@ -82,7 +82,7 @@ func (r *watchdogRebooter) Reboot() error { func (r *watchdogRebooter) softwareReboot() error { r.log.Info("about to try software reboot") // hostPID: true and privileged:true required to run this - rebootCmd := exec.Command("bash", "-b", "echo c > /proc/sysrq-trigger") + rebootCmd := exec.Command("/usr/bin/nsenter", "-m/proc/1/ns/mnt", "bash", "-b", "echo c > /proc/sysrq-trigger") if err := rebootCmd.Run(); err != nil { r.log.Error(err, "failed to run reboot command") From 97c85d5cb66c68ceab1366ea84cb71ddac58c8aa Mon Sep 17 00:00:00 2001 From: Michael Shitrit Date: Wed, 26 Jul 2023 15:01:28 +0300 Subject: [PATCH 8/9] use full bash path, correct how b option is used Signed-off-by: Michael Shitrit --- pkg/reboot/rebooter.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/reboot/rebooter.go b/pkg/reboot/rebooter.go index 49bde2ab..fec3ee17 100644 --- a/pkg/reboot/rebooter.go +++ b/pkg/reboot/rebooter.go @@ -82,7 +82,7 @@ func (r *watchdogRebooter) Reboot() error { func (r *watchdogRebooter) softwareReboot() error { r.log.Info("about to try software reboot") // hostPID: true and privileged:true required to run this - rebootCmd := exec.Command("/usr/bin/nsenter", "-m/proc/1/ns/mnt", "bash", "-b", "echo c > /proc/sysrq-trigger") + rebootCmd := exec.Command("/usr/bin/nsenter", "-m/proc/1/ns/mnt", "/bin/bash", "-c", "echo b > /proc/sysrq-trigger") if err := rebootCmd.Run(); err != nil { r.log.Error(err, "failed to run reboot command") From 5118acbb5b8606cc5efbdcf764392202dfbe784c Mon Sep 17 00:00:00 2001 From: Michael Shitrit Date: Mon, 31 Jul 2023 15:18:52 +0300 Subject: [PATCH 9/9] reduce e2e test flakiness based on cluster resources Signed-off-by: Michael Shitrit --- e2e/self_node_remediation_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/e2e/self_node_remediation_test.go b/e2e/self_node_remediation_test.go index 75504514..76baa281 100644 --- a/e2e/self_node_remediation_test.go +++ b/e2e/self_node_remediation_test.go @@ -424,7 +424,7 @@ func getBootTime(node *v1.Node) (*time.Time, error) { return err } return nil - }, 6*time.Minute, 10*time.Second).ShouldNot(HaveOccurred()) + }, 15*time.Minute, 10*time.Second).ShouldNot(HaveOccurred()) return &bootTime, nil }