# Copyright (c) 2024 VEXXHOST, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

"""
The purpose of this module is to provide a list of hacks and workarounds
in place for modifications done to existing Cluster API resources to
address issues in pre-existing clusters without changing their entire
ClusterClass.
"""

import copy

import pykube
from oslo_log import log as logging
from tenacity import Retrying, retry_if_result, stop_after_delay, wait_fixed

from magnum_cluster_api import objects, utils

LOG = logging.getLogger(__name__)

# Process-wide flag: once the fix has run in this process it is not attempted
# again.
CERTIFICATE_EXPIRY_DAYS_FIX_APPLIED = False


def _apply_and_wait_for_kcpt(api: pykube.HTTPClient, kcpt) -> None:
    """
    Apply ``kcpt`` and read it back from the ``magnum-system`` namespace.

    The attempt is repeated (1 second apart, for at most 10 seconds) for as
    long as the read-back result is ``None``.  The retry predicate only looks
    at results, so an exception raised inside an attempt is not retried here
    and reaches the caller.
    """
    name = kcpt.obj["metadata"]["name"]
    for attempt in Retrying(
        retry=retry_if_result(lambda result: result is None),
        stop=stop_after_delay(10),
        wait=wait_fixed(1),
    ):
        with attempt:
            utils.kube_apply_patch(kcpt)
            new_kcpt = objects.KubeadmControlPlaneTemplate.objects(
                api, namespace="magnum-system"
            ).get(name=name)
        if not attempt.retry_state.outcome.failed:
            attempt.retry_state.set_result(new_kcpt)


def set_certificate_expiry_days(
    api: pykube.HTTPClient,
):
    """
    Ensure every KubeadmControlPlaneTemplate in ``magnum-system`` sets
    ``spec.template.spec.rolloutBefore.certificatesExpiryDays``.

    Templates that already carry the field are skipped.  Every other template
    is deleted and re-created with the value ``21``; if that fails, the
    template is re-created from a backup of its original content.  The work
    is done at most once per process, guarded by
    ``CERTIFICATE_EXPIRY_DAYS_FIX_APPLIED``.

    :param api: pykube client used to list, delete and re-create templates.
    :raises Exception: whatever the rollback itself raises if restoring the
        original template fails.
    """
    global CERTIFICATE_EXPIRY_DAYS_FIX_APPLIED
    if CERTIFICATE_EXPIRY_DAYS_FIX_APPLIED:
        return

    kcpts = objects.KubeadmControlPlaneTemplate.objects(
        api, namespace="magnum-system"
    ).all()
    for kcpt in kcpts:
        # ``or {}`` also tolerates an explicit ``rolloutBefore: null``.
        rollout_before = (
            kcpt.obj["spec"]["template"]["spec"].get("rolloutBefore") or {}
        )
        if "certificatesExpiryDays" in rollout_before:
            continue

        # Backup the original object in case we need to restore it.  This has
        # to be a deep copy: the nested ``metadata`` and ``spec`` dicts are
        # mutated below, and a shallow copy would share (and so lose) them.
        original_obj = copy.deepcopy(kcpt.obj)

        try:
            # NOTE(mnaser): Since the KubeadmControlPlaneTemplate is immutable,
            #               we need to delete the object and re-create it.
            kcpt.delete()

            # NOTE(review): only ``uid`` is stripped, as in the original code;
            # other server-populated metadata (e.g. resourceVersion) is left
            # in place -- confirm utils.kube_apply_patch tolerates it.
            kcpt.obj["metadata"].pop("uid", None)
            template_spec = kcpt.obj["spec"]["template"]["spec"]
            template_spec["rolloutBefore"] = {
                **rollout_before,
                "certificatesExpiryDays": 21,
            }

            _apply_and_wait_for_kcpt(api, kcpt)
        except Exception as e:
            LOG.exception(
                "Failed to set certificate expiry days for kcpt %s: %s",
                kcpt.obj["metadata"]["name"],
                str(e),
            )

            # Roll back: put the untouched backup onto the resource object so
            # it can be applied like any other template (the backup itself is
            # a plain dict, not a resource object).
            original_obj["metadata"].pop("uid", None)
            kcpt.obj = original_obj
            _apply_and_wait_for_kcpt(api, kcpt)
            LOG.info(
                "Recreated kcpt %s with original values",
                kcpt.obj["metadata"]["name"],
            )

    CERTIFICATE_EXPIRY_DAYS_FIX_APPLIED = True
fields.ClusterHealthStatus.HEALTHY self.poll_nodegroup_replicas() + with sync.TaskLock("set_certificate_expiry_days"): + hacks.set_certificate_expiry_days(k8s_api) diff --git a/magnum_cluster_api/resources.py b/magnum_cluster_api/resources.py index df847727..c69d450b 100644 --- a/magnum_cluster_api/resources.py +++ b/magnum_cluster_api/resources.py @@ -57,6 +57,7 @@ AUTOSCALE_ANNOTATION_MAX = "cluster.x-k8s.io/cluster-api-autoscaler-node-group-max-size" DEFAULT_POD_CIDR = "10.100.0.0/16" +CERTIFICATE_EXPIRY_DAY_FIX_APPLIED = False class ClusterAutoscalerHelmRelease: @@ -2412,10 +2413,12 @@ def __init__( context: context.RequestContext, api: pykube.HTTPClient, cluster: magnum_objects.Cluster, + cluster_class: str = CLUSTER_CLASS_NAME, ): self.context = context self.api = api self.cluster = cluster + self.cluster_class = cluster_class @property def labels(self) -> dict: @@ -2480,7 +2483,7 @@ def get_object(self) -> objects.Cluster: }, }, "topology": { - "class": CLUSTER_CLASS_NAME, + "class": self.cluster_class, "version": utils.get_kube_tag(self.cluster), "controlPlane": { "metadata": { @@ -2774,6 +2777,7 @@ def apply_cluster_from_magnum_cluster( context: context.RequestContext, api: pykube.HTTPClient, cluster: magnum_objects.Cluster, + cluster_class: str = CLUSTER_CLASS_NAME, skip_auto_scaling_release: bool = False, ) -> None: """ @@ -2783,7 +2787,7 @@ def apply_cluster_from_magnum_cluster( ClusterResourcesConfigMap(context, api, cluster).apply() ClusterResourceSet(api, cluster).apply() - Cluster(context, api, cluster).apply() + Cluster(context, api, cluster, cluster_class).apply() if not skip_auto_scaling_release and utils.get_auto_scaling_enabled(cluster): ClusterAutoscalerHelmRelease(api, cluster).apply() diff --git a/magnum_cluster_api/sync.py b/magnum_cluster_api/sync.py index c3dc18a1..2ff5c3ae 100644 --- a/magnum_cluster_api/sync.py +++ b/magnum_cluster_api/sync.py @@ -34,3 +34,24 @@ def __init__(self, cluster_id: str, expire: int = DEFAULT_EXPIRE): 
class TaskLock(sherlock.KubernetesLock):
    """
    Kubernetes-backed lock keyed by a task identifier, used so that a given
    task is locked across all of the conductor nodes.
    """

    # Default expiry handed to the underlying sherlock lock.
    DEFAULT_EXPIRE: int = 60

    def __init__(self, task_id: str, expire: int = DEFAULT_EXPIRE):
        """
        :param task_id: identifier of the task; the lock is named
            ``task-<task_id>``.
        :param expire: expiry value passed through to the sherlock lock.
        """
        # sherlock is configured for the Kubernetes backend each time a lock
        # is instantiated.
        sherlock.configure(
            retry_interval=1,
            backend=sherlock.backends.KUBERNETES,
        )

        lock_name = f"task-{task_id}"
        super().__init__(
            lock_name=lock_name,
            expire=expire,
            k8s_namespace="magnum-system",
        )
import pytest
from tenacity import Retrying, retry_if_result, stop_after_delay, wait_fixed

from magnum_cluster_api import clients, hacks, objects, resources, utils


@pytest.fixture
def kubeadm_control_plane_template_without_certificates_expiry_days():
    """
    Create a KubeadmControlPlaneTemplate with ``rolloutBefore`` removed,
    yield it, and delete it on teardown.
    """
    api = clients.get_pykube_api()

    template = resources.KubeadmControlPlaneTemplate(api).get_object()
    template.obj["metadata"]["name"] = "test-hacks-set-certificate-expiry-days"

    # The generated template is expected to carry the setting; strip it so
    # the hack has something to fix.
    removed = template.obj["spec"]["template"]["spec"].pop("rolloutBefore", {})
    assert "certificatesExpiryDays" in removed

    utils.kube_apply_patch(template)
    yield template
    template.delete()


@pytest.fixture
def cluster_class_without_certificates_expiry_days(
    kubeadm_control_plane_template_without_certificates_expiry_days,
):
    """
    Create a ClusterClass whose control plane references the stripped
    template, yield it, and delete it on teardown.
    """
    api = clients.get_pykube_api()
    stripped_template = (
        kubeadm_control_plane_template_without_certificates_expiry_days
    )

    cluster_class = resources.ClusterClass(api).get_object()
    cluster_class.obj["metadata"]["name"] = "test-hacks-set-certificate-expiry-days"
    control_plane_ref = cluster_class.obj["spec"]["controlPlane"]["ref"]
    control_plane_ref["name"] = stripped_template.name

    utils.kube_apply_patch(cluster_class)
    yield cluster_class
    cluster_class.delete()


class TestHacks:
    @pytest.fixture(autouse=True)
    def setup(self, cluster):
        self.api = clients.get_pykube_api()
        self.cluster = cluster

    def _delete_cluster_and_wait(self, context):
        """
        Delete the Cluster resource, then poll once per second (for at most
        10 seconds) while it is still returned by ``get_or_none``.
        """
        resources.Cluster(context, self.api, self.cluster).delete()

        for attempt in Retrying(
            retry=retry_if_result(lambda result: result is not None),
            stop=stop_after_delay(10),
            wait=wait_fixed(1),
        ):
            with attempt:
                remaining = resources.Cluster(
                    context, self.api, self.cluster
                ).get_or_none()
            if not attempt.retry_state.outcome.failed:
                attempt.retry_state.set_result(remaining)

    def test_set_certificate_expiry_days(
        self, context, cluster_class_without_certificates_expiry_days
    ):
        cluster_class = cluster_class_without_certificates_expiry_days

        # Start from a clean slate: remove the Cluster made by the fixture.
        self._delete_cluster_and_wait(context)

        try:
            # Re-create the Cluster on top of the ClusterClass that lacks the
            # certificate expiry setting.
            resources.Cluster(
                context,
                self.api,
                self.cluster,
                cluster_class.name,
            ).apply()

            # Wait for the Cluster to be ready
            capi_cluster = objects.Cluster.for_magnum_cluster(
                self.api, self.cluster
            )
            capi_cluster.wait_for_observed_generation_changed(
                existing_observed_generation=1
            )

            # Grab the KubeadmControlPlane as it is before the hack runs.
            kcp = resources.get_kubeadm_control_plane(self.api, self.cluster)

            # Run the hack
            hacks.set_certificate_expiry_days(self.api)

            # The template referenced by the ClusterClass must now carry the
            # setting.
            template_name = cluster_class.obj["spec"]["controlPlane"]["ref"]["name"]
            kcp_template = objects.KubeadmControlPlaneTemplate.objects(
                self.api, namespace="magnum-system"
            ).get(name=template_name)
            template_rollout = kcp_template.obj["spec"]["template"]["spec"][
                "rolloutBefore"
            ]
            assert template_rollout["certificatesExpiryDays"] == 21

            # Wait for the KubeadmControlPlane to reconcile
            kcp.wait_for_observed_generation_changed()

            # The reconciled KubeadmControlPlane must carry the setting too.
            kcp = resources.get_kubeadm_control_plane(self.api, self.cluster)
            assert kcp.obj["spec"]["rolloutBefore"]["certificatesExpiryDays"] == 21
        finally:
            # Always clean up the Cluster created above.
            self._delete_cluster_and_wait(context)