Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: set certificate expiry day for kcpt #363

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 99 additions & 0 deletions magnum_cluster_api/hacks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
# Copyright (c) 2024 VEXXHOST, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

"""
This module collects hacks and workarounds that modify existing Cluster API
resources in place, so that issues in pre-existing clusters can be addressed
without changing their entire ClusterClass.
"""

import pykube
from tenacity import Retrying, retry_if_result, stop_after_delay, wait_fixed

from magnum_cluster_api import objects, utils
from oslo_log import log as logging

LOG = logging.getLogger(__name__)
CERTIFICATE_EXPIRY_DAYS_FIX_APPLIED = False


def _apply_kcpt_and_wait(api: pykube.HTTPClient, kcpt) -> None:
    """Apply ``kcpt`` and read it back from the ``magnum-system`` namespace.

    The apply/read-back pair is retried once per second for up to ten
    seconds for as long as the read-back result is ``None``. Exceptions
    raised by the apply or the lookup are not retried and propagate to the
    caller.

    :param api: pykube client used for the read-back lookup.
    :param kcpt: resource object handed to ``utils.kube_apply_patch``; it
        must expose ``.obj["metadata"]["name"]``.
    """
    name = kcpt.obj["metadata"]["name"]
    for attempt in Retrying(
        retry=retry_if_result(lambda result: result is None),
        stop=stop_after_delay(10),
        wait=wait_fixed(1),
    ):
        with attempt:
            utils.kube_apply_patch(kcpt)
            new_kcpt = objects.KubeadmControlPlaneTemplate.objects(
                api, namespace="magnum-system"
            ).get(name=name)
        # Only report a result when the attempt body did not raise;
        # otherwise tenacity re-raises the recorded exception.
        if not attempt.retry_state.outcome.failed:
            attempt.retry_state.set_result(new_kcpt)


def set_certificate_expiry_days(
    api: pykube.HTTPClient,
):
    """Set ``rolloutBefore.certificatesExpiryDays`` on existing templates.

    Every ``KubeadmControlPlaneTemplate`` in the ``magnum-system`` namespace
    that does not already define ``certificatesExpiryDays`` is deleted and
    re-created with the value ``21``. If that fails, the template is
    re-created from a backup of its original definition.

    The work is done at most once per process: after the first complete pass
    the module-level ``CERTIFICATE_EXPIRY_DAYS_FIX_APPLIED`` flag is set and
    later calls return immediately.

    :param api: pykube client used to list, delete and re-create templates.
    """
    global CERTIFICATE_EXPIRY_DAYS_FIX_APPLIED

    # Local import keeps this block self-contained.
    import copy

    if CERTIFICATE_EXPIRY_DAYS_FIX_APPLIED:
        return

    kcpts = objects.KubeadmControlPlaneTemplate.objects(
        api, namespace="magnum-system"
    ).all()
    for kcpt in kcpts:
        rollout_before = kcpt.obj["spec"]["template"]["spec"].get(
            "rolloutBefore", {}
        )
        if "certificatesExpiryDays" in rollout_before:
            continue

        name = kcpt.obj["metadata"]["name"]

        # Backup the original object in case we need to restore it. This has
        # to be a deep copy: the nested "metadata" and "spec" dicts are
        # mutated below, and a shallow copy would share them with the backup.
        original_obj = copy.deepcopy(kcpt.obj)

        try:
            # NOTE(mnaser): Since the KubeadmControlPlaneTemplate is immutable, we need to
            #               delete the object and re-create it.
            kcpt.delete()

            del kcpt.obj["metadata"]["uid"]
            kcpt.obj["spec"]["template"]["spec"].setdefault("rolloutBefore", {})
            kcpt.obj["spec"]["template"]["spec"]["rolloutBefore"][
                "certificatesExpiryDays"
            ] = 21

            _apply_kcpt_and_wait(api, kcpt)
        except Exception as e:
            LOG.exception(
                "Failed to set certificate expiry days for kcpt %s: %s",
                name,
                str(e),
            )
            # Drop the uid of the deleted object before re-creating it, and
            # wrap the backup dict in a resource object so it can be applied
            # the same way as the modified template.
            original_obj["metadata"].pop("uid", None)
            original_kcpt = objects.KubeadmControlPlaneTemplate(api, original_obj)
            _apply_kcpt_and_wait(api, original_kcpt)
            LOG.info(
                "Recreated kcpt %s with original values",
                name,
            )

    CERTIFICATE_EXPIRY_DAYS_FIX_APPLIED = True
4 changes: 3 additions & 1 deletion magnum_cluster_api/monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from oslo_log import log as logging
from oslo_utils import strutils

from magnum_cluster_api import clients, objects, utils
from magnum_cluster_api import clients, hacks, objects, sync, utils

LOG = logging.getLogger(__name__)

Expand Down Expand Up @@ -78,3 +78,5 @@ def poll_health_status(self):
self.data["health_status"] = fields.ClusterHealthStatus.HEALTHY

self.poll_nodegroup_replicas()
with sync.TaskLock("set_certificate_expiry_days"):
hacks.set_certificate_expiry_days(k8s_api)
8 changes: 6 additions & 2 deletions magnum_cluster_api/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
AUTOSCALE_ANNOTATION_MAX = "cluster.x-k8s.io/cluster-api-autoscaler-node-group-max-size"

DEFAULT_POD_CIDR = "10.100.0.0/16"
CERTIFICATE_EXPIRY_DAY_FIX_APPLIED = False


class ClusterAutoscalerHelmRelease:
Expand Down Expand Up @@ -2412,10 +2413,12 @@ def __init__(
context: context.RequestContext,
api: pykube.HTTPClient,
cluster: magnum_objects.Cluster,
cluster_class: str = CLUSTER_CLASS_NAME,
):
self.context = context
self.api = api
self.cluster = cluster
self.cluster_class = cluster_class

@property
def labels(self) -> dict:
Expand Down Expand Up @@ -2480,7 +2483,7 @@ def get_object(self) -> objects.Cluster:
},
},
"topology": {
"class": CLUSTER_CLASS_NAME,
"class": self.cluster_class,
"version": utils.get_kube_tag(self.cluster),
"controlPlane": {
"metadata": {
Expand Down Expand Up @@ -2774,6 +2777,7 @@ def apply_cluster_from_magnum_cluster(
context: context.RequestContext,
api: pykube.HTTPClient,
cluster: magnum_objects.Cluster,
cluster_class: str = CLUSTER_CLASS_NAME,
skip_auto_scaling_release: bool = False,
) -> None:
"""
Expand All @@ -2783,7 +2787,7 @@ def apply_cluster_from_magnum_cluster(

ClusterResourcesConfigMap(context, api, cluster).apply()
ClusterResourceSet(api, cluster).apply()
Cluster(context, api, cluster).apply()
Cluster(context, api, cluster, cluster_class).apply()

if not skip_auto_scaling_release and utils.get_auto_scaling_enabled(cluster):
ClusterAutoscalerHelmRelease(api, cluster).apply()
Expand Down
21 changes: 21 additions & 0 deletions magnum_cluster_api/sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,24 @@ def __init__(self, cluster_id: str, expire: int = DEFAULT_EXPIRE):
k8s_namespace="magnum-system",
expire=expire,
)


class TaskLock(sherlock.KubernetesLock):
    """
    Lock guarding a named task so that it is serialized across all of the
    conductor nodes.

    The lock is created in the ``magnum-system`` namespace under the name
    ``task-<task_id>``.
    """

    # Expiry used when the caller does not supply one.
    DEFAULT_EXPIRE: int = 60

    def __init__(self, task_id: str, expire: int = DEFAULT_EXPIRE):
        """
        :param task_id: identifier of the task; used to build the lock name.
        :param expire: expiry value forwarded to ``sherlock.KubernetesLock``.
        """
        # sherlock is configured for the Kubernetes backend every time a
        # lock is constructed.
        sherlock.configure(
            retry_interval=1,
            backend=sherlock.backends.KUBERNETES,
        )

        lock_name = "task-%s" % task_id
        super().__init__(
            lock_name=lock_name,
            k8s_namespace="magnum-system",
            expire=expire,
        )
137 changes: 137 additions & 0 deletions magnum_cluster_api/tests/functional/test_hacks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
# Copyright (c) 2024 VEXXHOST, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

import pytest
from tenacity import Retrying, retry_if_result, stop_after_delay, wait_fixed

from magnum_cluster_api import clients, hacks, objects, resources, utils


@pytest.fixture
def kubeadm_control_plane_template_without_certificates_expiry_days():
    """
    Create a KubeadmControlPlaneTemplate named
    ``test-hacks-set-certificate-expiry-days`` with ``rolloutBefore`` removed
    from its spec, yield it, and delete it on teardown.
    """
    template = resources.KubeadmControlPlaneTemplate(
        clients.get_pykube_api()
    ).get_object()
    template.obj["metadata"]["name"] = "test-hacks-set-certificate-expiry-days"

    # Strip "rolloutBefore" and check that the generated template did carry
    # "certificatesExpiryDays", so the fixture really removes it.
    template_spec = template.obj["spec"]["template"]["spec"]
    removed_rollout_before = template_spec.pop("rolloutBefore", {})
    assert "certificatesExpiryDays" in removed_rollout_before

    utils.kube_apply_patch(template)
    yield template
    template.delete()


@pytest.fixture
def cluster_class_without_certificates_expiry_days(
    kubeadm_control_plane_template_without_certificates_expiry_days,
):
    """
    Create a ClusterClass named ``test-hacks-set-certificate-expiry-days``
    whose control plane reference points at the template fixture, yield it,
    and delete it on teardown.
    """
    cluster_class = resources.ClusterClass(clients.get_pykube_api()).get_object()
    cluster_class.obj["metadata"]["name"] = "test-hacks-set-certificate-expiry-days"

    # Point the control plane reference at the template lacking
    # "certificatesExpiryDays".
    control_plane_ref = cluster_class.obj["spec"]["controlPlane"]["ref"]
    control_plane_ref["name"] = (
        kubeadm_control_plane_template_without_certificates_expiry_days.name
    )

    utils.kube_apply_patch(cluster_class)
    yield cluster_class
    cluster_class.delete()


class TestHacks:
    """Functional tests for the workarounds in ``magnum_cluster_api.hacks``."""

    @pytest.fixture(autouse=True)
    def setup(self, cluster):
        """Store a pykube client and the ``cluster`` fixture on the test instance."""
        self.api = clients.get_pykube_api()
        self.cluster = cluster

    def _delete_cluster_and_wait(self, context):
        """
        Delete the Cluster resource for ``self.cluster`` and poll once per
        second, for up to ten seconds, until it can no longer be fetched.
        """
        resources.Cluster(context, self.api, self.cluster).delete()

        # Use tenacity to wait for cluster to be deleted
        for attempt in Retrying(
            retry=retry_if_result(lambda result: result is not None),
            stop=stop_after_delay(10),
            wait=wait_fixed(1),
        ):
            with attempt:
                remaining = resources.Cluster(
                    context, self.api, self.cluster
                ).get_or_none()
            if not attempt.retry_state.outcome.failed:
                attempt.retry_state.set_result(remaining)

    def test_set_certificate_expiry_days(
        self, context, cluster_class_without_certificates_expiry_days
    ):
        """
        Re-create the cluster on a ClusterClass whose template lacks
        ``certificatesExpiryDays``, run the hack, and check that both the
        template and the KubeadmControlPlane end up with the value ``21``.
        """
        # Delete the created Cluster resource
        self._delete_cluster_and_wait(context)

        try:
            # Create a new Cluster resource with the updated ClusterClass
            resources.Cluster(
                context,
                self.api,
                self.cluster,
                cluster_class_without_certificates_expiry_days.name,
            ).apply()

            # Wait for the Cluster to be ready
            capi_cluster = objects.Cluster.for_magnum_cluster(self.api, self.cluster)
            capi_cluster.wait_for_observed_generation_changed(
                existing_observed_generation=1
            )

            # Get the current KCP
            kcp_before = resources.get_kubeadm_control_plane(self.api, self.cluster)

            # Run the hack
            hacks.set_certificate_expiry_days(self.api)

            # Check if the KCPTemplate has been updated
            template_name = cluster_class_without_certificates_expiry_days.obj["spec"][
                "controlPlane"
            ]["ref"]["name"]
            kcp_template = objects.KubeadmControlPlaneTemplate.objects(
                self.api, namespace="magnum-system"
            ).get(name=template_name)
            template_rollout = kcp_template.obj["spec"]["template"]["spec"][
                "rolloutBefore"
            ]
            assert template_rollout["certificatesExpiryDays"] == 21

            # Wait for the KubeadmControlPlane to reconcile
            kcp_before.wait_for_observed_generation_changed()

            # Assert that the hack has been applied
            kcp_after = resources.get_kubeadm_control_plane(self.api, self.cluster)
            assert kcp_after.obj["spec"]["rolloutBefore"]["certificatesExpiryDays"] == 21
        finally:
            # Delete the created Cluster resource
            self._delete_cluster_and_wait(context)
Loading