hatchet/pkg/scheduling/scheduling.go
abelanger5 3d218302ff fix: internal queue items performance and race conditions (#943)

* fix: don't use xmin hack
* fix: assign not append
* refactor: parallel step run updates via hashes
* fix: intermittent double execution of child step runs
* fix: rollback rate limits
* fix: bulk event writes from single buffer
* expose cleanup
* fix: race conditions on failures and cancellations
* change logger defaults to warn and console

2024-10-07 11:16:53 -04:00


package scheduling

import (
	"context"

	"github.com/jackc/pgx/v5/pgtype"

	"github.com/hatchet-dev/hatchet/internal/telemetry"
	"github.com/hatchet-dev/hatchet/pkg/repository"
	"github.com/hatchet-dev/hatchet/pkg/repository/prisma/dbsqlc"
	"github.com/hatchet-dev/hatchet/pkg/repository/prisma/sqlchelpers"
)
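
// QueueItemWithOrder wraps a queue item with its relative position in the queue.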
type QueueItemWithOrder struct {
	*dbsqlc.QueueItem

	Order int
}
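
// Slot identifies a single available execution slot on a worker, along with the
// dispatcher and action it is registered for.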
type Slot struct {
	ID           string
	WorkerId     string
	DispatcherId string
	ActionId     string
}
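
// GeneratePlan greedily assigns queued step runs to available worker slots,
// taking rate limits, worker labels, and timeouts into account. It returns a
// SchedulePlan describing the assignments, along with the unassigned,
// rate-limited, and timed-out step runs, and whether the caller should
// immediately run another scheduling pass.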
func GeneratePlan(
	ctx context.Context,
	slots []*Slot,
	uniqueActionsArr []string,
	queueItems []*QueueItemWithOrder,
	stepRunRateUnits map[string]map[string]int32,
	currRateLimits map[string]*dbsqlc.ListRateLimitsForTenantWithMutateRow,
	workerLabels map[string][]*dbsqlc.GetWorkerLabelsRow,
	stepDesiredLabels map[string][]*dbsqlc.GetDesiredLabelsRow,
) (SchedulePlan, error) {
	_, span := telemetry.NewSpan(ctx, "generate-scheduling-plan")
	defer span.End()

	plan := SchedulePlan{
		StepRunIds:           make([]pgtype.UUID, 0),
		StepRunTimeouts:      make([]string, 0),
		SlotIds:              make([]string, 0),
		WorkerIds:            make([]pgtype.UUID, 0),
		UnassignedStepRunIds: make([]pgtype.UUID, 0),
		RateLimitedStepRuns: RateLimitedResult{
			StepRuns: make([]pgtype.UUID, 0),
			Keys:     make([]string, 0),
		},
		QueuedStepRuns:         make([]repository.QueuedStepRun, 0),
		TimedOutStepRuns:       make([]pgtype.UUID, 0),
		QueuedItems:            make([]int64, 0),
		MinQueuedIds:           make(map[string]int64),
		ShouldContinue:         false,
		RateLimitUnitsConsumed: make(map[string]int32),
		unassignedActions:      make(map[string]struct{}),
	}

	// initialize worker states
	workerManager := NewWorkerStateManager(slots, workerLabels, stepDesiredLabels)

	workers := workerManager.workers

	rateLimits := make(map[string]*RateLimit)

	for key, rl := range currRateLimits {
		rateLimits[key] = NewRateLimit(key, rl)
	}

	// collect the queue counts
	queueCounts := make(map[string]int64)

	for _, qi := range queueItems {
		if _, ok := plan.MinQueuedIds[qi.Queue]; !ok {
			plan.MinQueuedIds[qi.Queue] = 0
		}

		queueCounts[qi.Queue]++
	}

	drainedQueues := make(map[string]struct{})

	// NOTE(abelanger5): this is a version of the assignment problem. There is a more optimal solution, i.e. optimal
	// matching, which can run in polynomial time. This is a naive approach which assigns the earliest-queued steps
	// to the first slots which are seen.
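	// Concretely: queue items are visited in order, and each one takes the first
	// compatible slot found; no attempt is made to maximize the total number of
	// assignments across the batch.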
	for _, qi := range queueItems {
		plan.UpdateMinQueuedIds(qi)

		// if the queue item has timed out, mark it as timed out
		if IsTimedout(qi) {
			plan.HandleTimedOut(qi)
			continue
		}

		stepId := sqlchelpers.UUIDToStr(qi.StepId)

		// if there are no eligible workers with open slots for this step, mark as unassigned
		if !workerManager.HasEligibleWorkers(stepId) {
			plan.HandleNoSlots(qi)
			continue
		}

		stepRunId := sqlchelpers.UUIDToStr(qi.StepRunId)

		rateLimitedOnKey := ""
		rateLimitedUnits := int32(0)

		// check if we're rate limited
		if srRateLimitUnits, ok := stepRunRateUnits[stepRunId]; ok {
			for key, units := range srRateLimitUnits {
				if rateLimit, ok := rateLimits[key]; ok {
					// add the step run id to the rate limit; if this returns false, we're rate limited
					if !rateLimit.AddStepRunId(stepRunId, units) {
						rateLimitedOnKey = key
						rateLimitedUnits = units
					}
				}
			}
		}

		// attempt to assign the queue item to a worker slot
		assigned := false

		if rateLimitedOnKey == "" {
			slot := workerManager.AttemptAssignSlot(qi)

			// if we found a slot, add the assignment to the plan
			if slot != nil {
				plan.AssignQiToSlot(qi, slot)

				queueCounts[qi.Queue]--

				if _, ok := drainedQueues[qi.Queue]; !ok && queueCounts[qi.Queue] == 0 {
					drainedQueues[qi.Queue] = struct{}{}
				}

				assigned = true
			}
		}

		if rateLimitedOnKey != "" {
			// we hit a rate limit, so mark the step run as rate limited
			plan.HandleRateLimited(qi, rateLimitedOnKey, rateLimitedUnits)

			// roll back all rate limits for this step run (this can happen if we've succeeded on one
			// rate limit but failed on another)
			for key := range stepRunRateUnits[stepRunId] {
				if rateLimit, ok := rateLimits[key]; ok {
					rateLimit.Rollback(stepRunId)
				}
			}
		} else if !assigned {
			plan.HandleUnassigned(qi)

			// if we couldn't assign the step run to any worker, roll back its rate limits as well
			for key := range stepRunRateUnits[stepRunId] {
				if rateLimit, ok := rateLimits[key]; ok {
					rateLimit.Rollback(stepRunId)
				}
			}
		}
	}

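	// record the total units consumed for each rate limit key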
	for key, rateLimit := range rateLimits {
		plan.RateLimitUnitsConsumed[key] = rateLimit.UnitsConsumed()
	}

	// check whether any queue has been fully drained (all of its items scheduled). If a drained
	// queue's action can still be assigned to some worker, there may be more work to pull, so we
	// tell the caller to run another scheduling pass immediately.
	if len(drainedQueues) > 0 && len(workers) > 0 {
		for _, qi := range queueItems {
			if _, ok := drainedQueues[qi.Queue]; ok {
				for _, worker := range workers {
					// if any worker can still take this action, set ShouldContinue and return early
					if worker.CanAssign(qi.ActionId.String, nil) {
						plan.ShouldContinue = true
						return plan, nil
					}
				}
			}
		}
	}

	return plan, nil
}
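
For orientation, here is a minimal sketch of how a caller might drive GeneratePlan in a loop, honoring ShouldContinue. The loader functions and applyPlan below are hypothetical stand-ins for the engine's repository calls, not real Hatchet APIs:

func schedulePass(ctx context.Context) error {
	for {
		// hypothetical loaders; in the real engine these values come from the
		// repository layer for a single tenant
		plan, err := GeneratePlan(
			ctx,
			loadSlots(),            // available worker slots
			loadUniqueActions(),    // distinct action ids in the queue
			loadQueueItems(),       // queued step runs, in queue order
			loadStepRunRateUnits(), // rate limit units per step run
			loadRateLimits(),
			loadWorkerLabels(),
			loadDesiredLabels(),
		)
		if err != nil {
			return err
		}

		// persist assignments, timeouts, and rate-limit results (hypothetical)
		if err := applyPlan(ctx, plan); err != nil {
			return err
		}

		// GeneratePlan sets ShouldContinue when a drained queue still has worker
		// capacity for its action, so we immediately run another pass
		if !plan.ShouldContinue {
			return nil
		}
	}
}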