From a7df091c60bc5e68d04eee558f20ec24390f21db Mon Sep 17 00:00:00 2001 From: Michael Shitrit Date: Sun, 14 Jul 2024 14:10:00 +0300 Subject: [PATCH 1/2] Adding a retry to set up of OutOfService flag, in order to overcome temporary network issues. Signed-off-by: Michael Shitrit --- main.go | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/main.go b/main.go index cb0dbb73..508e9824 100644 --- a/main.go +++ b/main.go @@ -35,6 +35,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" pkgruntime "k8s.io/apimachinery/pkg/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime" + "k8s.io/apimachinery/pkg/util/wait" clientgoscheme "k8s.io/client-go/kubernetes/scheme" // Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.) @@ -141,7 +142,22 @@ func main() { os.Exit(1) } - if err := utils.InitOutOfServiceTaintFlags(mgr.GetConfig()); err != nil { + interval := 2 * time.Second // retry every 2 seconds + timeout := 10 * time.Second // for a period of 10 seconds + + ctx, cancel := context.WithTimeout(context.Background(), timeout) + defer cancel() + + // Using wait.PollUntilContextTimeout to retry InitOutOfServiceTaintFlags in case there is a temporary network issue. + // Since the last internal error returned by InitOutOfServiceTaintFlags also indicates whether polling succeed or not, there is no need to also keep the context error returned by PollUntilContextTimeout. + _ = wait.PollUntilContextTimeout(ctx, interval, timeout, true, func(ctx context.Context) (bool, error) { + if err = utils.InitOutOfServiceTaintFlags(mgr.GetConfig()); err != nil { + return false, nil // Keep retrying + } + return true, nil // Success + }) + + if err != nil { setupLog.Error(err, "unable to verify out-of-service taint support. 
out-of-service taint isn't supported") } From 5bc18b5ed8ff426822e979dc40962810b1e9cebd Mon Sep 17 00:00:00 2001 From: Michael Shitrit Date: Wed, 17 Jul 2024 15:15:49 +0300 Subject: [PATCH 2/2] extract the retry mechanism into a separate method Signed-off-by: Michael Shitrit --- main.go | 18 +----------------- pkg/utils/taints.go | 23 ++++++++++++++++++++++- 2 files changed, 23 insertions(+), 18 deletions(-) diff --git a/main.go b/main.go index 508e9824..443792a4 100644 --- a/main.go +++ b/main.go @@ -35,7 +35,6 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" pkgruntime "k8s.io/apimachinery/pkg/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime" - "k8s.io/apimachinery/pkg/util/wait" clientgoscheme "k8s.io/client-go/kubernetes/scheme" // Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.) @@ -142,22 +141,7 @@ func main() { os.Exit(1) } - interval := 2 * time.Second // retry every 2 seconds - timeout := 10 * time.Second // for a period of 10 seconds - - ctx, cancel := context.WithTimeout(context.Background(), timeout) - defer cancel() - - // Using wait.PollUntilContextTimeout to retry InitOutOfServiceTaintFlags in case there is a temporary network issue. - // Since the last internal error returned by InitOutOfServiceTaintFlags also indicates whether polling succeed or not, there is no need to also keep the context error returned by PollUntilContextTimeout. - _ = wait.PollUntilContextTimeout(ctx, interval, timeout, true, func(ctx context.Context) (bool, error) { - if err = utils.InitOutOfServiceTaintFlags(mgr.GetConfig()); err != nil { - return false, nil // Keep retrying - } - return true, nil // Success - }) - - if err != nil { + if err := utils.InitOutOfServiceTaintFlagsWithRetry(context.Background(), mgr.GetConfig()); err != nil { setupLog.Error(err, "unable to verify out-of-service taint support. 
out-of-service taint isn't supported") } diff --git a/pkg/utils/taints.go b/pkg/utils/taints.go index 551472b7..2005ad89 100644 --- a/pkg/utils/taints.go +++ b/pkg/utils/taints.go @@ -1,11 +1,14 @@ package utils import ( + "context" "fmt" "regexp" "strconv" + "time" v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/util/wait" "k8s.io/apimachinery/pkg/version" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" @@ -54,7 +57,25 @@ func DeleteTaint(taints []v1.Taint, taintToDelete *v1.Taint) ([]v1.Taint, bool) return newTaints, deleted } -func InitOutOfServiceTaintFlags(config *rest.Config) error { +// InitOutOfServiceTaintFlagsWithRetry tries to initialize the OutOfService flags based on the k8s version; if that fails (potentially due to network issues) it retries every 2 seconds for up to 10 seconds. +func InitOutOfServiceTaintFlagsWithRetry(ctx context.Context, config *rest.Config) error { + + var err error + interval := 2 * time.Second // retry every 2 seconds + timeout := 10 * time.Second // for a period of 10 seconds + + // Since the last internal error returned by initOutOfServiceTaintFlags also indicates whether polling succeeded or not, there is no need to also keep the context error returned by PollUntilContextTimeout. + // Using wait.PollUntilContextTimeout to retry initOutOfServiceTaintFlags in case there is a temporary network issue. + _ = wait.PollUntilContextTimeout(ctx, interval, timeout, true, func(ctx context.Context) (bool, error) { + if err = initOutOfServiceTaintFlags(config); err != nil { + return false, nil // Keep retrying + } + return true, nil // Success + }) + return err +} + +func initOutOfServiceTaintFlags(config *rest.Config) error { if cs, err := kubernetes.NewForConfig(config); err != nil || cs == nil { if cs == nil { err = fmt.Errorf("k8s client set is nil")