diff --git a/cluster-autoscaler/clusterstate/clusterstate_test.go b/cluster-autoscaler/clusterstate/clusterstate_test.go index 7b4fb3d6c736..ac11a1cfa5c1 100644 --- a/cluster-autoscaler/clusterstate/clusterstate_test.go +++ b/cluster-autoscaler/clusterstate/clusterstate_test.go @@ -176,6 +176,77 @@ func TestHasNodeGroupStartedScaleUp(t *testing.T) { } } +// TestRecalculateStateAfterNodeGroupSizeChanged checks that Recalculate updates state correctly after +// some node group size changed. We verify that acceptable ranges are updated accordingly +// and that the UpcomingNodes reflect the node group size change (important for recalculating state after +// deleting scale-up nodes that failed to create). +func TestRecalculateStateAfterNodeGroupSizeChanged(t *testing.T) { + ngName := "ng1" + testCases := []struct { + name string + acceptableRange AcceptableRange + readiness Readiness + newTarget int + scaleUpRequest *ScaleUpRequest + wantAcceptableRange AcceptableRange + wantUpcoming int + }{ + { + name: "failed scale up by 3 nodes", + acceptableRange: AcceptableRange{MinNodes: 1, CurrentTarget: 4, MaxNodes: 4}, + readiness: Readiness{Ready: make([]string, 1)}, + newTarget: 1, + wantAcceptableRange: AcceptableRange{MinNodes: 1, CurrentTarget: 1, MaxNodes: 1}, + wantUpcoming: 0, + }, { + name: "partially failed scale up", + acceptableRange: AcceptableRange{MinNodes: 5, CurrentTarget: 7, MaxNodes: 8}, + readiness: Readiness{Ready: make([]string, 5)}, + newTarget: 6, + wantAcceptableRange: AcceptableRange{MinNodes: 5, CurrentTarget: 6, MaxNodes: 6}, + scaleUpRequest: &ScaleUpRequest{Increase: 1}, + wantUpcoming: 1, + }, { + name: "scale up ongoing, no change", + acceptableRange: AcceptableRange{MinNodes: 1, CurrentTarget: 4, MaxNodes: 4}, + readiness: Readiness{Ready: make([]string, 1)}, + newTarget: 4, + wantAcceptableRange: AcceptableRange{MinNodes: 1, CurrentTarget: 4, MaxNodes: 4}, + scaleUpRequest: &ScaleUpRequest{Increase: 3}, + wantUpcoming: 3, + }, { + name: "no scale up, no change", + acceptableRange: AcceptableRange{MinNodes: 4, CurrentTarget: 4, MaxNodes: 4}, + readiness: Readiness{Ready: make([]string, 4)}, + newTarget: 4, + wantAcceptableRange: AcceptableRange{MinNodes: 4, CurrentTarget: 4, MaxNodes: 4}, + wantUpcoming: 0, + }, + } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + provider := testprovider.NewTestCloudProvider(nil, nil) + provider.AddNodeGroup(ngName, 0, 1000, tc.newTarget) + + fakeLogRecorder, _ := utils.NewStatusMapRecorder(&fake.Clientset{}, "kube-system", kube_record.NewFakeRecorder(5), false, "my-cool-configmap") + clusterState := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{}, fakeLogRecorder, + newBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(config.NodeGroupAutoscalingOptions{})) + clusterState.acceptableRanges = map[string]AcceptableRange{ngName: tc.acceptableRange} + clusterState.perNodeGroupReadiness = map[string]Readiness{ngName: tc.readiness} + if tc.scaleUpRequest != nil { + clusterState.scaleUpRequests = map[string]*ScaleUpRequest{ngName: tc.scaleUpRequest} + } + + clusterState.Recalculate() + assert.Equal(t, tc.wantAcceptableRange, clusterState.acceptableRanges[ngName]) + upcomingCounts, _ := clusterState.GetUpcomingNodes() + if upcoming, found := upcomingCounts[ngName]; found { + assert.Equal(t, tc.wantUpcoming, upcoming, "Unexpected upcoming nodes count, want: %d got: %d", tc.wantUpcoming, upcomingCounts[ngName]) + } + }) + } +} + func TestOKOneUnreadyNode(t *testing.T) { now := time.Now() diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go index 8003f12490fe..027ec59630b3 100644 --- a/cluster-autoscaler/core/static_autoscaler.go +++ b/cluster-autoscaler/core/static_autoscaler.go @@ -448,10 +448,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr return nil } - if deletedNodes := a.deleteCreatedNodesWithErrors(); deletedNodes { - klog.V(0).Infof("Some nodes that failed to create were removed, skipping iteration") - return nil - } + a.deleteCreatedNodesWithErrors() // Check if there has been a constant difference between the number of nodes in k8s and // the number of nodes on the cloud provider side. @@ -821,7 +818,7 @@ func toNodes(unregisteredNodes []clusterstate.UnregisteredNode) []*apiv1.Node { return nodes } -func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() bool { +func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() { // We always schedule deleting of incoming errornous nodes // TODO[lukaszos] Consider adding logic to not retry delete every loop iteration nodeGroups := a.nodeGroupsById() @@ -848,7 +845,10 @@ func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() bool { } } - return deletedAny + if deletedAny { + klog.V(0).Infof("Some nodes that failed to create were removed, recalculating cluster state.") + a.clusterStateRegistry.Recalculate() + } } // overrideNodesToDeleteForZeroOrMax returns a list of nodes to delete, taking into account that diff --git a/cluster-autoscaler/core/static_autoscaler_test.go b/cluster-autoscaler/core/static_autoscaler_test.go index cecf660982b9..b49406577a5d 100644 --- a/cluster-autoscaler/core/static_autoscaler_test.go +++ b/cluster-autoscaler/core/static_autoscaler_test.go @@ -1709,8 +1709,11 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) { clusterState.UpdateNodes([]*apiv1.Node{}, nil, now) // delete nodes with create errors - removedNodes := autoscaler.deleteCreatedNodesWithErrors() - assert.True(t, removedNodes) + autoscaler.deleteCreatedNodesWithErrors() + + // nodes should be deleted + expectedDeleteCalls := 1 + nodeGroupA.AssertNumberOfCalls(t, "DeleteNodes", expectedDeleteCalls) // check delete was called on correct nodes nodeGroupA.AssertCalled(t, "DeleteNodes", mock.MatchedBy( @@ -1734,10 +1737,12 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) { clusterState.UpdateNodes([]*apiv1.Node{}, nil, now) // delete nodes with create errors - removedNodes = autoscaler.deleteCreatedNodesWithErrors() - assert.True(t, removedNodes) + autoscaler.deleteCreatedNodesWithErrors() // nodes should be deleted again + expectedDeleteCalls += 1 + nodeGroupA.AssertNumberOfCalls(t, "DeleteNodes", expectedDeleteCalls) + nodeGroupA.AssertCalled(t, "DeleteNodes", mock.MatchedBy( func(nodes []*apiv1.Node) bool { if len(nodes) != 4 { @@ -1798,11 +1803,10 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) { clusterState.UpdateNodes([]*apiv1.Node{}, nil, now) // delete nodes with create errors - removedNodes = autoscaler.deleteCreatedNodesWithErrors() - assert.False(t, removedNodes) + autoscaler.deleteCreatedNodesWithErrors() - // we expect no more Delete Nodes - nodeGroupA.AssertNumberOfCalls(t, "DeleteNodes", 2) + // we expect no more Delete Nodes, don't increase expectedDeleteCalls + nodeGroupA.AssertNumberOfCalls(t, "DeleteNodes", expectedDeleteCalls) // failed node not included by NodeGroupForNode nodeGroupC := &mockprovider.NodeGroup{} @@ -1840,8 +1844,7 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) { clusterState.UpdateNodes([]*apiv1.Node{}, nil, time.Now()) // No nodes are deleted when failed nodes don't have matching node groups - removedNodes = autoscaler.deleteCreatedNodesWithErrors() - assert.False(t, removedNodes) + autoscaler.deleteCreatedNodesWithErrors() nodeGroupC.AssertNumberOfCalls(t, "DeleteNodes", 0) nodeGroupAtomic := &mockprovider.NodeGroup{} @@ -1896,8 +1899,7 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) { clusterState.UpdateNodes([]*apiv1.Node{}, nil, now) // delete nodes with create errors - removedNodes = autoscaler.deleteCreatedNodesWithErrors() - assert.True(t, removedNodes) + autoscaler.deleteCreatedNodesWithErrors() nodeGroupAtomic.AssertCalled(t, "DeleteNodes", mock.MatchedBy( func(nodes []*apiv1.Node) bool { @@ -1957,8 +1959,7 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) { clusterState.UpdateNodes([]*apiv1.Node{}, nil, now) // delete nodes with create errors - removedNodes = autoscaler.deleteCreatedNodesWithErrors() - assert.False(t, removedNodes) + autoscaler.deleteCreatedNodesWithErrors() nodeGroupError.AssertNumberOfCalls(t, "DeleteNodes", 0) }