Skip to content

Commit 7c6c1ed

Browse files
authored
scheduler: reconciler should constrain placements to count (#26239)
While working on property testing in #26172 we discovered there are scenarios where the reconciler will produce more than the expected number of placements. Testing of those scenarios at the whole-scheduler level shows that this gets handled correctly downstream of the reconciler, but this makes it harder to reason about reconciler behavior. Cap the number of placements in the reconciler. Ref: #26172
1 parent eb47d1c commit 7c6c1ed

File tree

2 files changed

+15
-7
lines changed

2 files changed

+15
-7
lines changed

scheduler/reconciler/reconcile_cluster.go

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -110,15 +110,17 @@ type ReconcileResults struct {
110110
Stop []AllocStopResult
111111

112112
// AttributeUpdates are updates to the allocation that are not from a
113-
// jobspec change.
113+
// jobspec change. map of alloc ID -> *Allocation
114114
AttributeUpdates map[string]*structs.Allocation
115115

116116
// DisconnectUpdates is the set of allocations are on disconnected nodes, but
117117
// have not yet had their ClientStatus set to AllocClientStatusUnknown.
118+
// map of alloc ID -> *Allocation
118119
DisconnectUpdates map[string]*structs.Allocation
119120

120121
// ReconnectUpdates is the set of allocations that have ClientStatus set to
121122
// AllocClientStatusUnknown, but the associated Node has reconnected.
123+
// map of alloc ID -> *Allocation
122124
ReconnectUpdates map[string]*structs.Allocation
123125

124126
// DesiredTGUpdates captures the desired set of changes to make for each
@@ -634,6 +636,12 @@ func (a *AllocReconciler) computeGroup(group string, all allocSet) (*ReconcileRe
634636
result.Deployment = a.jobState.DeploymentCurrent
635637
}
636638

639+
// We can never have more placements than the count
640+
if len(result.Place) > tg.Count {
641+
result.Place = result.Place[:tg.Count]
642+
result.DesiredTGUpdates[tg.Name].Place = uint64(tg.Count)
643+
}
644+
637645
deploymentComplete := a.isDeploymentComplete(group, destructive, inplace,
638646
migrate, rescheduleNow, result.Place, rescheduleLater, requiresCanaries)
639647

@@ -1123,8 +1131,8 @@ func (a *AllocReconciler) computeStop(group *structs.TaskGroup, nameIndex *Alloc
11231131

11241132
// Hot path the nothing to do case
11251133
//
1126-
// Note that this path can result in duplication allocation indexes in a
1127-
// scenario with a destructive job change (ex. image update) happens with
1134+
// Note that this path can result in duplicated allocation indexes in a
1135+
// scenario where a destructive job change (ex. image update) happens with
11281136
// an increased group count. Once the canary is replaced, and we compute
11291137
// the next set of stops, the untainted set equals the new group count,
11301138
// which results is missing one removal. The duplicate alloc index is

scheduler/reconciler/reconcile_cluster_test.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4048,7 +4048,7 @@ func TestReconciler_NewCanaries_CountGreater(t *testing.T) {
40484048

40494049
// Create 3 allocations from the old job
40504050
var allocs []*structs.Allocation
4051-
for i := 0; i < 3; i++ {
4051+
for i := range 3 {
40524052
alloc := mock.Alloc()
40534053
alloc.Job = job
40544054
alloc.JobID = job.ID
@@ -4087,18 +4087,18 @@ func TestReconciler_NewCanaries_CountGreater(t *testing.T) {
40874087
assertResults(t, r, &resultExpectation{
40884088
createDeployment: newD,
40894089
deploymentUpdates: nil,
4090-
place: 7,
4090+
place: 3,
40914091
inplace: 0,
40924092
stop: 0,
40934093
desiredTGUpdates: map[string]*structs.DesiredUpdates{
40944094
job.TaskGroups[0].Name: {
40954095
Canary: 7,
40964096
Ignore: 3,
4097+
Place: 3,
40974098
},
40984099
},
40994100
})
4100-
4101-
assertNamesHaveIndexes(t, intRange(0, 2, 3, 6), placeResultsToNames(r.Place))
4101+
assertNamesHaveIndexes(t, []int{0, 1, 2}, placeResultsToNames(r.Place))
41024102
}
41034103

41044104
// Tests the reconciler creates new canaries when the job changes for multiple

0 commit comments

Comments
 (0)