Skip to content

Commit

Permalink
Merge pull request #35 from cynepco3hahue/fix_machine_remediation_con…
Browse files Browse the repository at this point in the history
…troller

Fix machine remediation controller
  • Loading branch information
Artyom Lukianov authored Aug 8, 2019
2 parents 0c637e0 + 81512c5 commit 5a50920
Show file tree
Hide file tree
Showing 12 changed files with 57 additions and 57 deletions.
5 changes: 3 additions & 2 deletions cmd/machine-remediation/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (

"github.com/golang/glog"
bmov1 "github.com/metal3-io/baremetal-operator/pkg/apis/metal3/v1alpha1"

mrv1 "kubevirt.io/machine-remediation-operator/pkg/apis/machineremediation/v1alpha1"
"kubevirt.io/machine-remediation-operator/pkg/baremetal/remediator"
"kubevirt.io/machine-remediation-operator/pkg/controllers"
Expand Down Expand Up @@ -65,9 +66,9 @@ func main() {
glog.Fatal(err)
}

bareMetalRemediator := remediator.NewBareMetalRemediator(mgr)
remediator := remediator.NewBareMetalRemediator(mgr)
addController := func(m manager.Manager, opts manager.Options) error {
return machineremediation.AddWithRemediator(m, bareMetalRemediator, opts)
return machineremediation.AddWithRemediator(m, remediator, opts)
}

// Setup all Controllers
Expand Down
54 changes: 27 additions & 27 deletions pkg/baremetal/remediator/remediator.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,10 @@ func (bmr *BareMetalRemediator) Reboot(ctx context.Context, machineRemediation *
}

// Copy the BareMetalHost object to prevent modification of the original one
newBmh := bmh.DeepCopy()
bmhCopy := bmh.DeepCopy()

// Copy the MachineRemediation object to prevent modification of the original one
newMachineRemediation := machineRemediation.DeepCopy()
mrCopy := machineRemediation.DeepCopy()

now := time.Now()
switch machineRemediation.Status.State {
Expand All @@ -77,30 +77,30 @@ func (bmr *BareMetalRemediator) Reboot(ctx context.Context, machineRemediation *
// it can mean that an user power off the machine by purpose
if !bmh.Spec.Online {
glog.V(4).Infof("Skip the remediation, machine %q has power off state before the remediation action", machine.Name)
newMachineRemediation.Status.State = mrv1.RemediationStateSucceeded
newMachineRemediation.Status.Reason = "Skip the reboot, the machine power off by an user"
newMachineRemediation.Status.EndTime = &metav1.Time{Time: now}
mrCopy.Status.State = mrv1.RemediationStateSucceeded
mrCopy.Status.Reason = "Skip the reboot, the machine power off by an user"
mrCopy.Status.EndTime = &metav1.Time{Time: now}
} else {
// power off the machine
glog.V(4).Infof("Power off machine %q", machine.Name)
newBmh.Spec.Online = false
if err := bmr.client.Update(context.TODO(), newBmh); err != nil {
bmhCopy.Spec.Online = false
if err := bmr.client.Update(context.TODO(), bmhCopy); err != nil {
return err
}

newMachineRemediation.Status.State = mrv1.RemediationStatePowerOff
newMachineRemediation.Status.Reason = "Starts the reboot process"
mrCopy.Status.State = mrv1.RemediationStatePowerOff
mrCopy.Status.Reason = "Starts the reboot process"
}
return bmr.client.Update(context.TODO(), newMachineRemediation)
return bmr.client.Status().Update(context.TODO(), mrCopy)

case mrv1.RemediationStatePowerOff:
// failed the remediation on timeout
if machineRemediation.Status.StartTime.Time.Add(rebootDefaultTimeout * time.Minute).Before(now) {
glog.Errorf("Remediation of machine %q failed on timeout", machine.Name)
newMachineRemediation.Status.State = mrv1.RemediationStateFailed
newMachineRemediation.Status.Reason = "Reboot failed on timeout"
newMachineRemediation.Status.EndTime = &metav1.Time{Time: now}
return bmr.client.Update(context.TODO(), newMachineRemediation)
mrCopy.Status.State = mrv1.RemediationStateFailed
mrCopy.Status.Reason = "Reboot failed on timeout"
mrCopy.Status.EndTime = &metav1.Time{Time: now}
return bmr.client.Status().Update(context.TODO(), mrCopy)
}

// host still has state on, we need to reconcile
Expand All @@ -116,23 +116,23 @@ func (bmr *BareMetalRemediator) Reboot(ctx context.Context, machineRemediation *

// power on the machine
glog.V(4).Infof("Power on machine %q", machine.Name)
newBmh.Spec.Online = true
if err := bmr.client.Update(context.TODO(), newBmh); err != nil {
bmhCopy.Spec.Online = true
if err := bmr.client.Update(context.TODO(), bmhCopy); err != nil {
return err
}

newMachineRemediation.Status.State = mrv1.RemediationStatePowerOn
newMachineRemediation.Status.Reason = "Reboot in progress"
return bmr.client.Update(context.TODO(), newMachineRemediation)
mrCopy.Status.State = mrv1.RemediationStatePowerOn
mrCopy.Status.Reason = "Reboot in progress"
return bmr.client.Status().Update(context.TODO(), mrCopy)

case mrv1.RemediationStatePowerOn:
// failed the remediation on timeout
if machineRemediation.Status.StartTime.Time.Add(rebootDefaultTimeout * time.Minute).Before(now) {
glog.Errorf("Remediation of machine %q failed on timeout", machine.Name)
newMachineRemediation.Status.State = mrv1.RemediationStateFailed
newMachineRemediation.Status.Reason = "Reboot failed on timeout"
newMachineRemediation.Status.EndTime = &metav1.Time{Time: now}
return bmr.client.Update(context.TODO(), newMachineRemediation)
mrCopy.Status.State = mrv1.RemediationStateFailed
mrCopy.Status.Reason = "Reboot failed on timeout"
mrCopy.Status.EndTime = &metav1.Time{Time: now}
return bmr.client.Status().Update(context.TODO(), mrCopy)
}

node, err := getNodeByMachine(bmr.client, machine)
Expand All @@ -149,10 +149,10 @@ func (bmr *BareMetalRemediator) Reboot(ctx context.Context, machineRemediation *
// Node back to Ready under the cluster
if conditions.NodeHasCondition(node, corev1.NodeReady, corev1.ConditionTrue) {
glog.V(4).Infof("Remediation of machine %q succeeded", machine.Name)
newMachineRemediation.Status.State = mrv1.RemediationStateSucceeded
newMachineRemediation.Status.Reason = "Reboot succeeded"
newMachineRemediation.Status.EndTime = &metav1.Time{Time: now}
return bmr.client.Update(context.TODO(), newMachineRemediation)
mrCopy.Status.State = mrv1.RemediationStateSucceeded
mrCopy.Status.Reason = "Reboot succeeded"
mrCopy.Status.EndTime = &metav1.Time{Time: now}
return bmr.client.Status().Update(context.TODO(), mrCopy)
}
return nil

Expand Down
1 change: 0 additions & 1 deletion pkg/controllers/machinedisruptionbudget/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ go_library(
"//vendor/k8s.io/apimachinery/pkg/api/equality:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/api/errors:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/runtime:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/types:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/wait:go_default_library",
"//vendor/k8s.io/client-go/tools/record:go_default_library",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ import (
apiequality "k8s.io/apimachinery/pkg/api/equality"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/tools/record"
Expand Down Expand Up @@ -66,7 +65,6 @@ func Add(mgr manager.Manager, opts manager.Options) error {
func newReconciler(mgr manager.Manager, opts manager.Options) *ReconcileMachineDisruption {
return &ReconcileMachineDisruption{
client: mgr.GetClient(),
scheme: mgr.GetScheme(),
recorder: mgr.GetEventRecorderFor("machine-disruption-controller"),
}
}
Expand Down Expand Up @@ -94,7 +92,6 @@ type ReconcileMachineDisruption struct {
// that reads objects from the cache and writes to the apiserver
client client.Client
recorder record.EventRecorder
scheme *runtime.Scheme
}

// Reconcile reads that state of the cluster for MachineDisruptionBudget and machine objects and makes changes based on labels under
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ func newFakeReconciler(recorder record.EventRecorder, initObjects ...runtime.Obj
return &ReconcileMachineDisruption{
client: fakeClient,
recorder: recorder,
scheme: scheme.Scheme,
}
}

Expand Down
1 change: 0 additions & 1 deletion pkg/controllers/machinehealthcheck/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ go_library(
"//vendor/k8s.io/apimachinery/pkg/api/errors:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/labels:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/runtime:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/types:go_default_library",
"//vendor/k8s.io/client-go/tools/cache:go_default_library",
"//vendor/sigs.k8s.io/cluster-api/pkg/apis/machine/v1beta1:go_default_library",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ import (
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/tools/cache"

Expand All @@ -29,8 +28,8 @@ import (
)

const (
machineAnnotationKey = "machine.openshift.io/machine"
ownerControllerKind = "MachineSet"
machineAnnotationKey = "machine.openshift.io/machine"
ownerControllerKind = "MachineSet"
disableRemediationAnotationKey = "healthchecking.openshift.io/disabled"
)

Expand All @@ -44,7 +43,6 @@ func Add(mgr manager.Manager, opts manager.Options) error {
func newReconciler(mgr manager.Manager, opts manager.Options) reconcile.Reconciler {
return &ReconcileMachineHealthCheck{
client: mgr.GetClient(),
scheme: mgr.GetScheme(),
namespace: opts.Namespace,
}
}
Expand All @@ -66,7 +64,6 @@ type ReconcileMachineHealthCheck struct {
// This client, initialized using mgr.Client() above, is a split client
// that reads objects from the cache and writes to the apiserver
client client.Client
scheme *runtime.Scheme
namespace string
}

Expand Down Expand Up @@ -122,6 +119,11 @@ func (r *ReconcileMachineHealthCheck) Reconcile(request reconcile.Request) (reco
return reconcile.Result{}, err
}

if hasRemediationDisabledAnnotation(*machine) {
glog.Infof("Machine %q has a matching %s annotation set to <true>, remediation is skipped.", machine.Name, disableRemediationAnotationKey)
return reconcile.Result{}, nil
}

// If the current machine matches any existing MachineHealthCheck CRD
allMachineHealthChecks := &mrv1.MachineHealthCheckList{}
err = r.client.List(context.Background(), allMachineHealthChecks)
Expand All @@ -130,11 +132,6 @@ func (r *ReconcileMachineHealthCheck) Reconcile(request reconcile.Request) (reco
return reconcile.Result{}, err
}

if hasRemediationDisabledAnnotation(*machine) {
glog.Infof("Machine %s has a matching %s annotation set to <true>, remediation is skipped.",
machineKey, disableRemediationAnotationKey)
return reconcile.Result{}, nil
}
for _, hc := range allMachineHealthChecks.Items {

if hasMatchingLabels(&hc, machine) {
Expand Down Expand Up @@ -280,11 +277,6 @@ func (r *ReconcileMachineHealthCheck) remediationStrategyReboot(machine *mapiv1.
MachineName: machine.Name,
Type: mrv1.RemediationTypeReboot,
},
Status: mrv1.MachineRemediationStatus{
State: mrv1.RemediationStateStarted,
Reason: "Machine remediation started",
StartTime: &metav1.Time{Time: time.Now()},
},
}

glog.Infof("Machine %s has been unhealthy for too long, creating machine remediation", machine.Name)
Expand Down Expand Up @@ -338,7 +330,6 @@ func hasRemediationDisabledAnnotation(machine mapiv1.Machine) bool {
return skipRemediation == "true"
}


func hasMatchingLabels(machineHealthCheck *mrv1.MachineHealthCheck, machine *mapiv1.Machine) bool {
selector, err := metav1.LabelSelectorAsSelector(&machineHealthCheck.Spec.Selector)
if err != nil {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,6 @@ func newFakeReconciler(initObjects ...runtime.Object) *ReconcileMachineHealthChe
fakeClient := fake.NewFakeClient(initObjects...)
return &ReconcileMachineHealthCheck{
client: fakeClient,
scheme: scheme.Scheme,
namespace: consts.NamespaceOpenshiftMachineAPI,
}
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/controllers/machineremediation/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ go_library(
"//pkg/apis/machineremediation/v1alpha1:go_default_library",
"//vendor/github.com/golang/glog:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/api/errors:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/runtime:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
"//vendor/sigs.k8s.io/controller-runtime/pkg/client:go_default_library",
"//vendor/sigs.k8s.io/controller-runtime/pkg/controller:go_default_library",
"//vendor/sigs.k8s.io/controller-runtime/pkg/handler:go_default_library",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import (
"github.com/golang/glog"

"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/runtime"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

mrv1 "kubevirt.io/machine-remediation-operator/pkg/apis/machineremediation/v1alpha1"

Expand All @@ -27,7 +27,6 @@ type ReconcileMachineRemediation struct {
// that reads objects from the cache and writes to the apiserver
client client.Client
remediator Remediator
scheme *runtime.Scheme
namespace string
}

Expand All @@ -44,7 +43,6 @@ func AddWithRemediator(mgr manager.Manager, remediator Remediator, opts manager.
func newReconciler(mgr manager.Manager, remediator Remediator, opts manager.Options) (reconcile.Reconciler, error) {
return &ReconcileMachineRemediation{
client: mgr.GetClient(),
scheme: mgr.GetScheme(),
remediator: remediator,
namespace: opts.Namespace,
}, nil
Expand Down Expand Up @@ -88,6 +86,19 @@ func (r *ReconcileMachineRemediation) Reconcile(request reconcile.Request) (reco
return reconcile.Result{}, nil
}

if mr.Status.State == "" {
mrCopy := mr.DeepCopy()
mrCopy.Status = mrv1.MachineRemediationStatus{
State: mrv1.RemediationStateStarted,
Reason: "Machine remediation started",
StartTime: &metav1.Time{Time: time.Now()},
}
if err := r.client.Status().Update(context.TODO(), mrCopy); err != nil {
glog.Errorf("failed to update MR %q status: %v", mr.Name, err)
return reconcile.Result{}, err
}
}

switch mr.Spec.Type {
case mrv1.RemediationTypeReboot:
glog.V(4).Infof("Run remediation reboot acion for MachineRemediation %s", mr.Name)
Expand Down
4 changes: 4 additions & 0 deletions pkg/operator/components/rbac.go
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,7 @@ var (
Verbs: []string{
"get",
"list",
"watch",
},
},
{
Expand All @@ -180,6 +181,7 @@ var (
},
Resources: []string{
"machineremediations",
"machineremediations/status",
},
Verbs: []string{
"delete",
Expand All @@ -200,6 +202,7 @@ var (
"delete",
"get",
"list",
"watch",
},
},
{
Expand All @@ -213,6 +216,7 @@ var (
"get",
"list",
"update",
"watch",
},
},
{
Expand Down
2 changes: 1 addition & 1 deletion pkg/operator/operator.go
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ func (r *ReconcileMachineRemediationOperator) createOrUpdateComponents(mro *mrv1
ImageRepository: mro.Spec.ImageRegistry,
PullPolicy: mro.Spec.ImagePullPolicy,
OperatorVersion: r.operatorVersion,
Verbosity: "2",
Verbosity: "4",
}
if err := r.createOrUpdateDeployment(deployData); err != nil {
return err
Expand Down

0 comments on commit 5a50920

Please sign in to comment.