From d60d19ec6005ae2687ee1a4fb5f8a6fb2a1a02e4 Mon Sep 17 00:00:00 2001 From: okozachenko1203 Date: Sat, 20 Apr 2024 05:27:40 +1000 Subject: [PATCH 1/9] fix: set certificate expiry day for kcpt --- magnum_cluster_api/driver.py | 4 ++++ magnum_cluster_api/resources.py | 16 ++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/magnum_cluster_api/driver.py b/magnum_cluster_api/driver.py index 13760969..8a48d7ec 100644 --- a/magnum_cluster_api/driver.py +++ b/magnum_cluster_api/driver.py @@ -220,6 +220,7 @@ def update_cluster_status( cluster.status = fields.ClusterStatus.UPDATE_COMPLETE cluster.save() + return if cluster.status == fields.ClusterStatus.DELETE_IN_PROGRESS: if capi_cluster and capi_cluster.exists(): @@ -254,6 +255,9 @@ def update_cluster_status( cluster.status_reason = None cluster.status = fields.ClusterStatus.DELETE_COMPLETE cluster.save() + return + + resources.set_certificate_expiry_days(self.k8s_api) @cluster_lock_wrapper def update_cluster( diff --git a/magnum_cluster_api/resources.py b/magnum_cluster_api/resources.py index df847727..a9b547ec 100644 --- a/magnum_cluster_api/resources.py +++ b/magnum_cluster_api/resources.py @@ -57,6 +57,7 @@ AUTOSCALE_ANNOTATION_MAX = "cluster.x-k8s.io/cluster-api-autoscaler-node-group-max-size" DEFAULT_POD_CIDR = "10.100.0.0/16" +CERTIFICATE_EXPIRY_DAY_FIX_APPLIED = False class ClusterAutoscalerHelmRelease: @@ -2816,3 +2817,18 @@ def get_machine_deployment( if len(mds) == 1: return list(mds)[0] return None + + +def set_certificate_expiry_days( + api: pykube.HTTPClient, +): + global CERTIFICATE_EXPIRY_DAY_FIX_APPLIED + if not CERTIFICATE_EXPIRY_DAY_FIX_APPLIED: + kcpts = objects.KubeadmControlPlaneTemplate.objects( + api, namespace="magnum-system" + ).all() + for kcpt in kcpts: + kcpt.obj["spec"]["template"]["spec"]["rolloutBefore"][ + "certificatesExpiryDays" + ] = 21 + utils.kube_apply_patch(kcpt) From 4f70e8af3ed105e9996b030529c9472fd9fcbd33 Mon Sep 17 00:00:00 2001 From: okozachenko1203 Date: Sat, 20 Apr 2024 05:34:33 +1000 Subject: [PATCH 2/9] set global variable as true --- magnum_cluster_api/resources.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/magnum_cluster_api/resources.py b/magnum_cluster_api/resources.py index a9b547ec..69da8507 100644 --- a/magnum_cluster_api/resources.py +++ b/magnum_cluster_api/resources.py @@ -2832,3 +2832,5 @@ def set_certificate_expiry_days( "certificatesExpiryDays" ] = 21 utils.kube_apply_patch(kcpt) + + CERTIFICATE_EXPIRY_DAY_FIX_APPLIED = True From 62a7bba0d08920521f28bb31880d6eb16216db68 Mon Sep 17 00:00:00 2001 From: Mohammed Naser Date: Sun, 21 Apr 2024 16:34:28 -0400 Subject: [PATCH 3/9] Move to hacks module --- magnum_cluster_api/driver.py | 3 ++- magnum_cluster_api/hacks.py | 47 +++++++++++++++++++++++++++++++++ magnum_cluster_api/resources.py | 17 ------------ 3 files changed, 49 insertions(+), 18 deletions(-) create mode 100644 magnum_cluster_api/hacks.py diff --git a/magnum_cluster_api/driver.py b/magnum_cluster_api/driver.py index 8a48d7ec..c6df7540 100644 --- a/magnum_cluster_api/driver.py +++ b/magnum_cluster_api/driver.py @@ -24,6 +24,7 @@ from magnum_cluster_api import ( clients, exceptions, + hacks, monitor, objects, resources, @@ -257,7 +258,7 @@ def update_cluster_status( cluster.save() return - resources.set_certificate_expiry_days(self.k8s_api) + hacks.set_certificate_expiry_days(self.k8s_api) @cluster_lock_wrapper def update_cluster( diff --git a/magnum_cluster_api/hacks.py b/magnum_cluster_api/hacks.py new file mode 100644 index 00000000..fea9009c --- /dev/null +++ b/magnum_cluster_api/hacks.py @@ -0,0 +1,47 @@ +# Copyright (c) 2024 VEXXHOST, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +""" +The purpose of this module is to provide a list of hacks and workarounds +in place for modifications done to existing Cluster API resources to +address issues in pre-existing clusters without changing their entire +ClusterClass. +""" + +import pykube + +from magnum_cluster_api import objects, utils + +CERTIFICATE_EXPIRY_DAYS_FIX_APPLIED = False + + +def set_certificate_expiry_days( + api: pykube.HTTPClient, +): + global CERTIFICATE_EXPIRY_DAYS_FIX_APPLIED + if not CERTIFICATE_EXPIRY_DAYS_FIX_APPLIED: + kcpts = objects.KubeadmControlPlaneTemplate.objects( + api, namespace="magnum-system" + ).all() + for kcpt in kcpts: + rollout_before = kcpt.obj["spec"]["template"]["spec"].get( + "rolloutBefore", {} + ) + if "certificatesExpiryDays" not in rollout_before: + kcpt.obj["spec"]["template"]["spec"]["rolloutBefore"][ + "certificatesExpiryDays" + ] = 21 + utils.kube_apply_patch(kcpt) + + CERTIFICATE_EXPIRY_DAYS_FIX_APPLIED = True diff --git a/magnum_cluster_api/resources.py b/magnum_cluster_api/resources.py index 69da8507..13447ff8 100644 --- a/magnum_cluster_api/resources.py +++ b/magnum_cluster_api/resources.py @@ -2817,20 +2817,3 @@ def get_machine_deployment( if len(mds) == 1: return list(mds)[0] return None - - -def set_certificate_expiry_days( - api: pykube.HTTPClient, -): - global CERTIFICATE_EXPIRY_DAY_FIX_APPLIED - if not CERTIFICATE_EXPIRY_DAY_FIX_APPLIED: - kcpts = objects.KubeadmControlPlaneTemplate.objects( - api, namespace="magnum-system" - ).all() - for kcpt in kcpts: - kcpt.obj["spec"]["template"]["spec"]["rolloutBefore"][ - "certificatesExpiryDays" - ] = 21 - utils.kube_apply_patch(kcpt) - - CERTIFICATE_EXPIRY_DAY_FIX_APPLIED = True From 5fc90d3ea58871cb3497e8f50e2ac1b6123c0e96 Mon Sep 17 00:00:00 2001 From: Mohammed Naser Date: Sun, 21 Apr 2024 21:44:37 -0400 Subject: [PATCH 4/9] Add functional tests --- magnum_cluster_api/hacks.py | 15 ++ magnum_cluster_api/resources.py | 7 +- .../tests/functional/test_hacks.py | 142 ++++++++++++++++++ 3 files changed, 162 insertions(+), 2 deletions(-) create mode 100644 magnum_cluster_api/tests/functional/test_hacks.py diff --git a/magnum_cluster_api/hacks.py b/magnum_cluster_api/hacks.py index fea9009c..245f8e43 100644 --- a/magnum_cluster_api/hacks.py +++ b/magnum_cluster_api/hacks.py @@ -39,9 +39,24 @@ def set_certificate_expiry_days( "rolloutBefore", {} ) if "certificatesExpiryDays" not in rollout_before: + kcpt.obj["spec"]["template"]["spec"].setdefault("rolloutBefore", {}) kcpt.obj["spec"]["template"]["spec"]["rolloutBefore"][ "certificatesExpiryDays" ] = 21 + + # NOTE(mnaser): Since the KubeadmControlPlaneTemplate is immutable, we need to + # delete the object and re-create it. + kcpt.delete() + del kcpt.obj["metadata"]["uid"] + utils.kube_apply_patch(kcpt) CERTIFICATE_EXPIRY_DAYS_FIX_APPLIED = True + + kcps = objects.KubeadmControlPlane.objects( + api, namespace="magnum-system" + ).all() + for kcp in kcps: + kcp.obj["spec"].setdefault("rolloutBefore", {}) + kcp.obj["spec"]["rolloutBefore"]["certificatesExpiryDays"] = 21 + utils.kube_apply_patch(kcp) diff --git a/magnum_cluster_api/resources.py b/magnum_cluster_api/resources.py index 13447ff8..c69d450b 100644 --- a/magnum_cluster_api/resources.py +++ b/magnum_cluster_api/resources.py @@ -2413,10 +2413,12 @@ def __init__( context: context.RequestContext, api: pykube.HTTPClient, cluster: magnum_objects.Cluster, + cluster_class: str = CLUSTER_CLASS_NAME, ): self.context = context self.api = api self.cluster = cluster + self.cluster_class = cluster_class @property def labels(self) -> dict: @@ -2481,7 +2483,7 @@ def get_object(self) -> objects.Cluster: }, }, "topology": { - "class": CLUSTER_CLASS_NAME, + "class": self.cluster_class, "version": utils.get_kube_tag(self.cluster), "controlPlane": { "metadata": { @@ -2775,6 +2777,7 @@ def apply_cluster_from_magnum_cluster( context: context.RequestContext, api: pykube.HTTPClient, cluster: magnum_objects.Cluster, + cluster_class: str = CLUSTER_CLASS_NAME, skip_auto_scaling_release: bool = False, ) -> None: """ @@ -2784,7 +2787,7 @@ def apply_cluster_from_magnum_cluster( ClusterResourcesConfigMap(context, api, cluster).apply() ClusterResourceSet(api, cluster).apply() - Cluster(context, api, cluster).apply() + Cluster(context, api, cluster, cluster_class).apply() if not skip_auto_scaling_release and utils.get_auto_scaling_enabled(cluster): ClusterAutoscalerHelmRelease(api, cluster).apply() diff --git a/magnum_cluster_api/tests/functional/test_hacks.py b/magnum_cluster_api/tests/functional/test_hacks.py new file mode 100644 index 00000000..e969c48b --- /dev/null +++ b/magnum_cluster_api/tests/functional/test_hacks.py @@ -0,0 +1,142 @@ +# Copyright (c) 2024 VEXXHOST, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import pytest +from tenacity import ( + Retrying, + retry_if_result, + stop_after_delay, + wait_fixed, +) + +from magnum_cluster_api import clients, hacks, objects, resources, utils + + +@pytest.fixture +def kubeadm_control_plane_template_without_certificates_expiry_days(): + api = clients.get_pykube_api() + + kcpf = resources.KubeadmControlPlaneTemplate(api).get_object() + kcpf.obj["metadata"]["name"] = "test-hacks-set-certificate-expiry-days" + + rollout_before = kcpf.obj["spec"]["template"]["spec"].pop("rolloutBefore", {}) + assert "certificatesExpiryDays" in rollout_before + + utils.kube_apply_patch(kcpf) + yield kcpf + kcpf.delete() + + +@pytest.fixture +def cluster_class_without_certificates_expiry_days( + kubeadm_control_plane_template_without_certificates_expiry_days, +): + api = clients.get_pykube_api() + + cc = resources.ClusterClass(api).get_object() + cc.obj["metadata"]["name"] = "test-hacks-set-certificate-expiry-days" + cc.obj["spec"]["controlPlane"]["ref"][ + "name" + ] = kubeadm_control_plane_template_without_certificates_expiry_days.name + + utils.kube_apply_patch(cc) + yield cc + cc.delete() + + +class TestHacks: + @pytest.fixture(autouse=True) + def setup(self, cluster): + self.api = clients.get_pykube_api() + self.cluster = cluster + + def test_set_certificate_expiry_days( + self, context, cluster_class_without_certificates_expiry_days + ): + # Delete the created Cluster resource + resources.Cluster(context, self.api, self.cluster).delete() + + # Use tenacity to wait for cluster to be deleted + for attempt in Retrying( + retry=retry_if_result(lambda result: result is not None), + stop=stop_after_delay(10), + wait=wait_fixed(1), + ): + with attempt: + capi_cluster = resources.Cluster( + context, self.api, self.cluster + ).get_or_none() + if not attempt.retry_state.outcome.failed: + attempt.retry_state.set_result(capi_cluster) + + try: + # Create a new Cluster resource with the updated ClusterClass + resources.Cluster( + context, + self.api, + self.cluster, + cluster_class_without_certificates_expiry_days.name, + ).apply() + + # Wait for the Cluster to be ready + cluster_resource = objects.Cluster.for_magnum_cluster( + self.api, self.cluster + ) + cluster_resource.wait_for_observed_generation_changed( + existing_observed_generation=1 + ) + + # Get the current KCP + kcp = resources.get_kubeadm_control_plane(self.api, self.cluster) + + # Run the hack + hacks.set_certificate_expiry_days(self.api) + + # Check if the KCPTemplate has been updated + kcp_template = objects.KubeadmControlPlaneTemplate.objects( + self.api, namespace="magnum-system" + ).get( + name=cluster_class_without_certificates_expiry_days.obj["spec"][ + "controlPlane" + ]["ref"]["name"] + ) + assert ( + kcp_template.obj["spec"]["template"]["spec"]["rolloutBefore"][ + "certificatesExpiryDays" + ] + == 21 + ) + + # Wait for the KubeadmControlPlane to reconcile + kcp.wait_for_observed_generation_changed() + + # Assert that the hack has been applied + kcp = resources.get_kubeadm_control_plane(self.api, self.cluster) + assert kcp.obj["spec"]["rolloutBefore"]["certificatesExpiryDays"] == 21 + finally: + # Delete the created Cluster resource + resources.Cluster(context, self.api, self.cluster).delete() + + # Use tenacity to wait for cluster to be deleted + for attempt in Retrying( + retry=retry_if_result(lambda result: result is not None), + stop=stop_after_delay(10), + wait=wait_fixed(1), + ): + with attempt: + capi_cluster = resources.Cluster( + context, self.api, self.cluster + ).get_or_none() + if not attempt.retry_state.outcome.failed: + attempt.retry_state.set_result(capi_cluster) From 1b98989d619bcab9a2a4923933a15b42047473db Mon Sep 17 00:00:00 2001 From: okozachenko1203 Date: Tue, 23 Apr 2024 01:53:59 +1000 Subject: [PATCH 5/9] run hacks in health monitor --- magnum_cluster_api/driver.py | 5 ----- magnum_cluster_api/hacks.py | 8 -------- magnum_cluster_api/monitor.py | 4 +++- magnum_cluster_api/tests/functional/test_hacks.py | 7 +------ 4 files changed, 4 insertions(+), 20 deletions(-) diff --git a/magnum_cluster_api/driver.py b/magnum_cluster_api/driver.py index c6df7540..13760969 100644 --- a/magnum_cluster_api/driver.py +++ b/magnum_cluster_api/driver.py @@ -24,7 +24,6 @@ from magnum_cluster_api import ( clients, exceptions, - hacks, monitor, objects, resources, @@ -221,7 +220,6 @@ def update_cluster_status( cluster.status = fields.ClusterStatus.UPDATE_COMPLETE cluster.save() - return if cluster.status == fields.ClusterStatus.DELETE_IN_PROGRESS: if capi_cluster and capi_cluster.exists(): @@ -256,9 +254,6 @@ def update_cluster_status( cluster.status_reason = None cluster.status = fields.ClusterStatus.DELETE_COMPLETE cluster.save() - return - - hacks.set_certificate_expiry_days(self.k8s_api) @cluster_lock_wrapper def update_cluster( diff --git a/magnum_cluster_api/hacks.py b/magnum_cluster_api/hacks.py index 245f8e43..a0b6e683 100644 --- a/magnum_cluster_api/hacks.py +++ b/magnum_cluster_api/hacks.py @@ -52,11 +52,3 @@ def set_certificate_expiry_days( utils.kube_apply_patch(kcpt) CERTIFICATE_EXPIRY_DAYS_FIX_APPLIED = True - - kcps = objects.KubeadmControlPlane.objects( - api, namespace="magnum-system" - ).all() - for kcp in kcps: - kcp.obj["spec"].setdefault("rolloutBefore", {}) - kcp.obj["spec"]["rolloutBefore"]["certificatesExpiryDays"] = 21 - utils.kube_apply_patch(kcp) diff --git a/magnum_cluster_api/monitor.py b/magnum_cluster_api/monitor.py index 54d33bf5..9b19ab2e 100644 --- a/magnum_cluster_api/monitor.py +++ b/magnum_cluster_api/monitor.py @@ -17,7 +17,7 @@ from oslo_log import log as logging from oslo_utils import strutils -from magnum_cluster_api import clients, objects, utils +from magnum_cluster_api import clients, hacks, objects, utils LOG = logging.getLogger(__name__) @@ -78,3 +78,5 @@ def poll_health_status(self): self.data["health_status"] = fields.ClusterHealthStatus.HEALTHY self.poll_nodegroup_replicas() + + hacks.set_certificate_expiry_days(self.k8s_api) diff --git a/magnum_cluster_api/tests/functional/test_hacks.py b/magnum_cluster_api/tests/functional/test_hacks.py index e969c48b..c73453b4 100644 --- a/magnum_cluster_api/tests/functional/test_hacks.py +++ b/magnum_cluster_api/tests/functional/test_hacks.py @@ -13,12 +13,7 @@ # under the License. import pytest -from tenacity import ( - Retrying, - retry_if_result, - stop_after_delay, - wait_fixed, -) +from tenacity import Retrying, retry_if_result, stop_after_delay, wait_fixed from magnum_cluster_api import clients, hacks, objects, resources, utils From e7297c057dc3c47512c534ffd3b4adf3c70a0c7c Mon Sep 17 00:00:00 2001 From: okozachenko1203 Date: Tue, 23 Apr 2024 01:56:58 +1000 Subject: [PATCH 6/9] fix typo --- magnum_cluster_api/monitor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/magnum_cluster_api/monitor.py b/magnum_cluster_api/monitor.py index 9b19ab2e..75a77773 100644 --- a/magnum_cluster_api/monitor.py +++ b/magnum_cluster_api/monitor.py @@ -79,4 +79,4 @@ def poll_health_status(self): self.poll_nodegroup_replicas() - hacks.set_certificate_expiry_days(self.k8s_api) + hacks.set_certificate_expiry_days(k8s_api) From f0ee5a27d527a507f3204557fb81c7afafcb3609 Mon Sep 17 00:00:00 2001 From: okozachenko1203 Date: Thu, 2 May 2024 17:25:38 +1000 Subject: [PATCH 7/9] set lock for hacks --- magnum_cluster_api/monitor.py | 6 +++--- magnum_cluster_api/sync.py | 21 +++++++++++++++++++++ 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/magnum_cluster_api/monitor.py b/magnum_cluster_api/monitor.py index 75a77773..50c097ea 100644 --- a/magnum_cluster_api/monitor.py +++ b/magnum_cluster_api/monitor.py @@ -17,7 +17,7 @@ from oslo_log import log as logging from oslo_utils import strutils -from magnum_cluster_api import clients, hacks, objects, utils +from magnum_cluster_api import clients, hacks, objects, sync, utils LOG = logging.getLogger(__name__) @@ -78,5 +78,5 @@ def poll_health_status(self): self.data["health_status"] = fields.ClusterHealthStatus.HEALTHY self.poll_nodegroup_replicas() - - hacks.set_certificate_expiry_days(k8s_api) + with sync.TaskLock("set_certificate_expiry_days"): + hacks.set_certificate_expiry_days(k8s_api) diff --git a/magnum_cluster_api/sync.py b/magnum_cluster_api/sync.py index c3dc18a1..2ff5c3ae 100644 --- a/magnum_cluster_api/sync.py +++ b/magnum_cluster_api/sync.py @@ -34,3 +34,24 @@ def __init__(self, cluster_id: str, expire: int = DEFAULT_EXPIRE): k8s_namespace="magnum-system", expire=expire, ) + + +class TaskLock(sherlock.KubernetesLock): + """ + A task lock that is used to lock for a certain task across all of + the conductor nodes. + """ + + DEFAULT_EXPIRE: int = 60 + + def __init__(self, task_id: str, expire: int = DEFAULT_EXPIRE): + sherlock.configure( + backend=sherlock.backends.KUBERNETES, + retry_interval=1, + ) + + super().__init__( + lock_name="task-%s" % task_id, + k8s_namespace="magnum-system", + expire=expire, + ) From 4d17fae3e30cfce1bf9597ce5944a199f1bd18ad Mon Sep 17 00:00:00 2001 From: okozachenko1203 Date: Thu, 2 May 2024 19:47:45 +1000 Subject: [PATCH 8/9] use retry logic for kcpt recreation --- magnum_cluster_api/hacks.py | 39 +++++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/magnum_cluster_api/hacks.py b/magnum_cluster_api/hacks.py index a0b6e683..75867d5f 100644 --- a/magnum_cluster_api/hacks.py +++ b/magnum_cluster_api/hacks.py @@ -20,6 +20,7 @@ """ import pykube +from tenacity import Retrying, retry_if_result, stop_after_delay, wait_fixed from magnum_cluster_api import objects, utils @@ -38,17 +39,31 @@ def set_certificate_expiry_days( rollout_before = kcpt.obj["spec"]["template"]["spec"].get( "rolloutBefore", {} ) - if "certificatesExpiryDays" not in rollout_before: - kcpt.obj["spec"]["template"]["spec"].setdefault("rolloutBefore", {}) - kcpt.obj["spec"]["template"]["spec"]["rolloutBefore"][ - "certificatesExpiryDays" - ] = 21 - - # NOTE(mnaser): Since the KubeadmControlPlaneTemplate is immutable, we need to - # delete the object and re-create it. - kcpt.delete() - del kcpt.obj["metadata"]["uid"] - - utils.kube_apply_patch(kcpt) + if "certificatesExpiryDays" in rollout_before: + continue + + # NOTE(mnaser): Since the KubeadmControlPlaneTemplate is immutable, we need to + # delete the object and re-create it. + kcpt.delete() + + del kcpt.obj["metadata"]["uid"] + kcpt.obj["spec"]["template"]["spec"].setdefault("rolloutBefore", {}) + kcpt.obj["spec"]["template"]["spec"]["rolloutBefore"][ + "certificatesExpiryDays" + ] = 21 + + # Use tenacity to wait for kcpt to be created + for attempt in Retrying( + retry=retry_if_result(lambda result: result is None), + stop=stop_after_delay(10), + wait=wait_fixed(1), + ): + with attempt: + utils.kube_apply_patch(kcpt) + new_kcpt = objects.KubeadmControlPlaneTemplate.objects( + api, namespace="magnum-system" + ).get(name=kcpt.obj["metadata"]["name"]) + if not attempt.retry_state.outcome.failed: + attempt.retry_state.set_result(new_kcpt) CERTIFICATE_EXPIRY_DAYS_FIX_APPLIED = True From cc94acb44fa092f750baa0f7e98f0ea67a395c4c Mon Sep 17 00:00:00 2001 From: okozachenko1203 Date: Tue, 10 Sep 2024 20:47:26 +1000 Subject: [PATCH 9/9] add exception for kcpt creation failure --- magnum_cluster_api/hacks.py | 72 ++++++++++++++++++++++++++----------- 1 file changed, 51 insertions(+), 21 deletions(-) diff --git a/magnum_cluster_api/hacks.py b/magnum_cluster_api/hacks.py index 75867d5f..face1161 100644 --- a/magnum_cluster_api/hacks.py +++ b/magnum_cluster_api/hacks.py @@ -23,7 +23,9 @@ from tenacity import Retrying, retry_if_result, stop_after_delay, wait_fixed from magnum_cluster_api import objects, utils +from oslo_log import log as logging +LOG = logging.getLogger(__name__) CERTIFICATE_EXPIRY_DAYS_FIX_APPLIED = False @@ -42,28 +44,56 @@ def set_certificate_expiry_days( if "certificatesExpiryDays" in rollout_before: continue - # NOTE(mnaser): Since the KubeadmControlPlaneTemplate is immutable, we need to - # delete the object and re-create it. - kcpt.delete() + # Backup the original object in case we need to restore it + original_kcpt = kcpt.obj.copy() - del kcpt.obj["metadata"]["uid"] - kcpt.obj["spec"]["template"]["spec"].setdefault("rolloutBefore", {}) - kcpt.obj["spec"]["template"]["spec"]["rolloutBefore"][ - "certificatesExpiryDays" - ] = 21 + try: + # NOTE(mnaser): Since the KubeadmControlPlaneTemplate is immutable, we need to + # delete the object and re-create it. + kcpt.delete() - # Use tenacity to wait for kcpt to be created - for attempt in Retrying( - retry=retry_if_result(lambda result: result is None), - stop=stop_after_delay(10), - wait=wait_fixed(1), - ): - with attempt: - utils.kube_apply_patch(kcpt) - new_kcpt = objects.KubeadmControlPlaneTemplate.objects( - api, namespace="magnum-system" - ).get(name=kcpt.obj["metadata"]["name"]) - if not attempt.retry_state.outcome.failed: - attempt.retry_state.set_result(new_kcpt) + del kcpt.obj["metadata"]["uid"] + kcpt.obj["spec"]["template"]["spec"].setdefault("rolloutBefore", {}) + kcpt.obj["spec"]["template"]["spec"]["rolloutBefore"][ + "certificatesExpiryDays" + ] = 21 + + # Use tenacity to wait for kcpt to be created + for attempt in Retrying( + retry=retry_if_result(lambda result: result is None), + stop=stop_after_delay(10), + wait=wait_fixed(1), + ): + with attempt: + utils.kube_apply_patch(kcpt) + new_kcpt = objects.KubeadmControlPlaneTemplate.objects( + api, namespace="magnum-system" + ).get(name=kcpt.obj["metadata"]["name"]) + if not attempt.retry_state.outcome.failed: + attempt.retry_state.set_result(new_kcpt) + except Exception as e: + LOG.exception( + "Failed to set certificate expiry days for kcpt %s: %s", + kcpt.obj["metadata"]["name"], + str(e), + ) + del original_kcpt["metadata"]["uid"] + # Use tenacity to wait for kcpt to be created + for attempt in Retrying( + retry=retry_if_result(lambda result: result is None), + stop=stop_after_delay(10), + wait=wait_fixed(1), + ): + with attempt: + utils.kube_apply_patch(original_kcpt) + new_kcpt = objects.KubeadmControlPlaneTemplate.objects( + api, namespace="magnum-system" + ).get(name=original_kcpt.obj["metadata"]["name"]) + if not attempt.retry_state.outcome.failed: + attempt.retry_state.set_result(new_kcpt) + LOG.info( + "Recreated kcpt %s with original values", + kcpt.obj["metadata"]["name"], + ) CERTIFICATE_EXPIRY_DAYS_FIX_APPLIED = True