Merge pull request kubernetes#7087 from bskiba/stockout-handling
Faster handling of failed scale ups
k8s-ci-robot authored Aug 13, 2024
2 parents a7ee2b2 + 939123c commit aec7b75
Showing 3 changed files with 92 additions and 20 deletions.
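The gist of the change, before the diffs below: previously, whenever nodes that failed to create were deleted, the autoscaler logged a message and skipped the rest of the iteration, so any replacement scale-up had to wait for the next loop. Now the deletion path calls ClusterStateRegistry.Recalculate() and the iteration continues. The following is a minimal, self-contained sketch of that control-flow difference; the runIteration helper and the simplified registry type are invented for illustration and are not the real cluster-autoscaler API.

```go
package main

import "fmt"

// registry is a toy stand-in for ClusterStateRegistry, tracking only a
// target size and the number of nodes still expected to appear ("upcoming").
type registry struct {
	target   int
	ready    int
	upcoming int
}

// Recalculate re-derives upcoming nodes from the current (possibly lowered)
// target, loosely mirroring what the real Recalculate does after failed
// scale-up nodes have been deleted.
func (r *registry) Recalculate() {
	r.upcoming = r.target - r.ready
	if r.upcoming < 0 {
		r.upcoming = 0
	}
}

// runIteration shows the control-flow change in RunOnce.
func runIteration(r *registry, deletedFailedNodes bool) string {
	// Before this PR:
	//   if deletedFailedNodes { return "skipping iteration" }
	// After: recalculate in place and keep going, so a replacement
	// scale-up can be attempted within the same iteration.
	if deletedFailedNodes {
		r.Recalculate()
	}
	return "continuing iteration"
}

func main() {
	// The target was already lowered to 1 when the 3 failed nodes were
	// deleted, but the cached state still claims 3 upcoming nodes.
	r := &registry{target: 1, ready: 1, upcoming: 3}
	fmt.Println(runIteration(r, true), "upcoming:", r.upcoming) // continuing iteration upcoming: 0
}
```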
71 changes: 71 additions & 0 deletions cluster-autoscaler/clusterstate/clusterstate_test.go
@@ -176,6 +176,77 @@ func TestHasNodeGroupStartedScaleUp(t *testing.T) {
}
}

// TestRecalculateStateAfterNodeGroupSizeChanged checks that Recalculate updates state correctly after
// some node group size changed. We verify that acceptable ranges are updated accordingly
// and that the UpcomingNodes reflect the node group size change (important for recalculating state after
// deleting scale-up nodes that failed to create).
func TestRecalculateStateAfterNodeGroupSizeChanged(t *testing.T) {
ngName := "ng1"
testCases := []struct {
name string
acceptableRange AcceptableRange
readiness Readiness
newTarget int
scaleUpRequest *ScaleUpRequest
wantAcceptableRange AcceptableRange
wantUpcoming int
}{
{
name: "failed scale up by 3 nodes",
acceptableRange: AcceptableRange{MinNodes: 1, CurrentTarget: 4, MaxNodes: 4},
readiness: Readiness{Ready: make([]string, 1)},
newTarget: 1,
wantAcceptableRange: AcceptableRange{MinNodes: 1, CurrentTarget: 1, MaxNodes: 1},
wantUpcoming: 0,
}, {
name: "partially failed scale up",
acceptableRange: AcceptableRange{MinNodes: 5, CurrentTarget: 7, MaxNodes: 8},
readiness: Readiness{Ready: make([]string, 5)},
newTarget: 6,
wantAcceptableRange: AcceptableRange{MinNodes: 5, CurrentTarget: 6, MaxNodes: 6},
scaleUpRequest: &ScaleUpRequest{Increase: 1},
wantUpcoming: 1,
}, {
name: "scale up ongoing, no change",
acceptableRange: AcceptableRange{MinNodes: 1, CurrentTarget: 4, MaxNodes: 4},
readiness: Readiness{Ready: make([]string, 1)},
newTarget: 4,
wantAcceptableRange: AcceptableRange{MinNodes: 1, CurrentTarget: 4, MaxNodes: 4},
scaleUpRequest: &ScaleUpRequest{Increase: 3},
wantUpcoming: 3,
}, {
name: "no scale up, no change",
acceptableRange: AcceptableRange{MinNodes: 4, CurrentTarget: 4, MaxNodes: 4},
readiness: Readiness{Ready: make([]string, 4)},
newTarget: 4,
wantAcceptableRange: AcceptableRange{MinNodes: 4, CurrentTarget: 4, MaxNodes: 4},
wantUpcoming: 0,
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
provider := testprovider.NewTestCloudProvider(nil, nil)
provider.AddNodeGroup(ngName, 0, 1000, tc.newTarget)

fakeLogRecorder, _ := utils.NewStatusMapRecorder(&fake.Clientset{}, "kube-system", kube_record.NewFakeRecorder(5), false, "my-cool-configmap")
clusterState := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{}, fakeLogRecorder,
newBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(config.NodeGroupAutoscalingOptions{}))
clusterState.acceptableRanges = map[string]AcceptableRange{ngName: tc.acceptableRange}
clusterState.perNodeGroupReadiness = map[string]Readiness{ngName: tc.readiness}
if tc.scaleUpRequest != nil {
clusterState.scaleUpRequests = map[string]*ScaleUpRequest{ngName: tc.scaleUpRequest}
}

clusterState.Recalculate()
assert.Equal(t, tc.wantAcceptableRange, clusterState.acceptableRanges[ngName])
upcomingCounts, _ := clusterState.GetUpcomingNodes()
if upcoming, found := upcomingCounts[ngName]; found {
assert.Equal(t, tc.wantUpcoming, upcoming, "Unexpected upcoming nodes count, want: %d got: %d", tc.wantUpcoming, upcomingCounts[ngName])
}
})
}
}
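The expectations in the table above follow a simple pattern: after Recalculate, the acceptable range is rebuilt around the new target (with MinNodes lowered by any still-ongoing scale-up increase), and the upcoming count equals that increase, or zero when no scale-up is in flight. Below is a minimal model of that relationship; the field and function names are simplified and the real derivation in clusterstate.go accounts for more (scale-down requests, per-group readiness), so this only mirrors what these four cases exercise.

```go
package main

import "fmt"

// acceptableRange is a simplified stand-in for clusterstate.AcceptableRange.
type acceptableRange struct{ min, target, max int }

// recalc models how the test cases above expect Recalculate to behave:
// the range is rebuilt around newTarget, and the ongoing scale-up increase
// (0 if there is none) is what remains "upcoming".
func recalc(newTarget, ongoingIncrease int) (acceptableRange, int) {
	return acceptableRange{
		min:    newTarget - ongoingIncrease,
		target: newTarget,
		max:    newTarget,
	}, ongoingIncrease
}

func main() {
	fmt.Println(recalc(1, 0)) // "failed scale up by 3 nodes":  {1 1 1} 0
	fmt.Println(recalc(6, 1)) // "partially failed scale up":   {5 6 6} 1
	fmt.Println(recalc(4, 3)) // "scale up ongoing, no change": {1 4 4} 3
	fmt.Println(recalc(4, 0)) // "no scale up, no change":      {4 4 4} 0
}
```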

func TestOKOneUnreadyNode(t *testing.T) {
now := time.Now()

12 changes: 6 additions & 6 deletions cluster-autoscaler/core/static_autoscaler.go
@@ -448,10 +448,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr
return nil
}

-if deletedNodes := a.deleteCreatedNodesWithErrors(); deletedNodes {
-klog.V(0).Infof("Some nodes that failed to create were removed, skipping iteration")
-return nil
-}
+a.deleteCreatedNodesWithErrors()

// Check if there has been a constant difference between the number of nodes in k8s and
// the number of nodes on the cloud provider side.
@@ -821,7 +818,7 @@ func toNodes(unregisteredNodes []clusterstate.UnregisteredNode) []*apiv1.Node {
return nodes
}

-func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() bool {
+func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() {
// We always schedule deleting of incoming erroneous nodes
// TODO[lukaszos] Consider adding logic to not retry delete every loop iteration
nodeGroups := a.nodeGroupsById()
@@ -848,7 +845,10 @@ func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() bool {
}
}

-return deletedAny
+if deletedAny {
+klog.V(0).Infof("Some nodes that failed to create were removed, recalculating cluster state.")
+a.clusterStateRegistry.Recalculate()
+}
}

// overrideNodesToDeleteForZeroOrMax returns a list of nodes to delete, taking into account that
29 changes: 15 additions & 14 deletions cluster-autoscaler/core/static_autoscaler_test.go
Expand Up @@ -1709,8 +1709,11 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) {
clusterState.UpdateNodes([]*apiv1.Node{}, nil, now)

// delete nodes with create errors
-removedNodes := autoscaler.deleteCreatedNodesWithErrors()
-assert.True(t, removedNodes)
+autoscaler.deleteCreatedNodesWithErrors()
+
+// nodes should be deleted
+expectedDeleteCalls := 1
+nodeGroupA.AssertNumberOfCalls(t, "DeleteNodes", expectedDeleteCalls)

// check delete was called on correct nodes
nodeGroupA.AssertCalled(t, "DeleteNodes", mock.MatchedBy(
@@ -1734,10 +1737,12 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) {
clusterState.UpdateNodes([]*apiv1.Node{}, nil, now)

// delete nodes with create errors
-removedNodes = autoscaler.deleteCreatedNodesWithErrors()
-assert.True(t, removedNodes)
+autoscaler.deleteCreatedNodesWithErrors()

+// nodes should be deleted again
+expectedDeleteCalls += 1
+nodeGroupA.AssertNumberOfCalls(t, "DeleteNodes", expectedDeleteCalls)

nodeGroupA.AssertCalled(t, "DeleteNodes", mock.MatchedBy(
func(nodes []*apiv1.Node) bool {
if len(nodes) != 4 {
@@ -1798,11 +1803,10 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) {
clusterState.UpdateNodes([]*apiv1.Node{}, nil, now)

// delete nodes with create errors
-removedNodes = autoscaler.deleteCreatedNodesWithErrors()
-assert.False(t, removedNodes)
+autoscaler.deleteCreatedNodesWithErrors()

-// we expect no more Delete Nodes
-nodeGroupA.AssertNumberOfCalls(t, "DeleteNodes", 2)
+// we expect no more Delete Nodes, don't increase expectedDeleteCalls
+nodeGroupA.AssertNumberOfCalls(t, "DeleteNodes", expectedDeleteCalls)

// failed node not included by NodeGroupForNode
nodeGroupC := &mockprovider.NodeGroup{}
@@ -1840,8 +1844,7 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) {
clusterState.UpdateNodes([]*apiv1.Node{}, nil, time.Now())

// No nodes are deleted when failed nodes don't have matching node groups
-removedNodes = autoscaler.deleteCreatedNodesWithErrors()
-assert.False(t, removedNodes)
+autoscaler.deleteCreatedNodesWithErrors()
nodeGroupC.AssertNumberOfCalls(t, "DeleteNodes", 0)

nodeGroupAtomic := &mockprovider.NodeGroup{}
@@ -1896,8 +1899,7 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) {
clusterState.UpdateNodes([]*apiv1.Node{}, nil, now)

// delete nodes with create errors
-removedNodes = autoscaler.deleteCreatedNodesWithErrors()
-assert.True(t, removedNodes)
+autoscaler.deleteCreatedNodesWithErrors()

nodeGroupAtomic.AssertCalled(t, "DeleteNodes", mock.MatchedBy(
func(nodes []*apiv1.Node) bool {
@@ -1957,8 +1959,7 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) {
clusterState.UpdateNodes([]*apiv1.Node{}, nil, now)

// delete nodes with create errors
-removedNodes = autoscaler.deleteCreatedNodesWithErrors()
-assert.False(t, removedNodes)
+autoscaler.deleteCreatedNodesWithErrors()

nodeGroupError.AssertNumberOfCalls(t, "DeleteNodes", 0)
}
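Since deleteCreatedNodesWithErrors no longer returns whether anything was deleted, the tests above switch from asserting on the return value to counting DeleteNodes invocations on the mocked node group. Here is a small, self-contained example of that testify pattern; the fakeNodeGroup type and the test are invented for illustration and are not part of the cluster-autoscaler code.

```go
package example

import (
	"testing"

	"github.com/stretchr/testify/mock"
)

// fakeNodeGroup is a toy mock illustrating the assertion style the updated
// tests use: count DeleteNodes calls rather than check a returned bool.
type fakeNodeGroup struct{ mock.Mock }

func (f *fakeNodeGroup) DeleteNodes(names []string) error {
	return f.Called(names).Error(0)
}

func TestDeleteIsCountedNotReturned(t *testing.T) {
	ng := &fakeNodeGroup{}
	ng.On("DeleteNodes", mock.Anything).Return(nil)

	// Stands in for the deletion path exercised by the real tests.
	_ = ng.DeleteNodes([]string{"broken-node"})

	ng.AssertNumberOfCalls(t, "DeleteNodes", 1)
}
```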
