Files
hatchet/pkg/scheduling/v1/pool.go
abelanger5 851fbaf214 feat: reduced cold starts for new workers and queues (#2969)
* feat: reduced cold starts for new workers and queues

* address changes from pr review

* fix: data race

* set logs to debug on the harness

* debug for queue level as well

* debug lines for queuer

* fix: add queue notifier to v0 workflow registration

* revert: lease manager interval

* revert log level changes

* add more debug, revert reverts

* more debug

* add debug to lease manager

* do it, try it

* fix: call upsertQueue as part of workflow version put

* change log level to error again

* pr review changes
2026-02-11 13:12:10 -08:00

276 lines
7.0 KiB
Go

package v1
import (
"context"
"fmt"
"sync"
"time"
"github.com/google/uuid"
"github.com/rs/zerolog"
"github.com/hatchet-dev/hatchet/internal/syncx"
v1 "github.com/hatchet-dev/hatchet/pkg/repository"
"github.com/hatchet-dev/hatchet/pkg/repository/sqlcv1"
)
// sharedConfig bundles the dependencies and tuning parameters that are shared
// by every tenantManager created from a SchedulingPool.
type sharedConfig struct {
// repo is the scheduler persistence layer handed to each tenantManager.
repo v1.SchedulerRepository
// l is the shared structured logger.
l *zerolog.Logger
// singleQueueLimit is a per-queue limit — presumably the max number of
// items processed per scheduling pass for one queue; confirm in the queuer.
singleQueueLimit int
// schedulerConcurrencyRateLimit rate-limits concurrency-strategy
// processing — NOTE(review): exact unit (per second?) not visible here.
schedulerConcurrencyRateLimit int
// schedulerConcurrencyPollingMinInterval / MaxInterval bound the polling
// interval for concurrency strategies — presumably a backoff range;
// verify against the concurrency manager.
schedulerConcurrencyPollingMinInterval time.Duration
schedulerConcurrencyPollingMaxInterval time.Duration
}
// SchedulingPool is responsible for managing a pool of tenantManagers.
type SchedulingPool struct {
// Extensions holds scheduler extensions; it is notified of tenant set
// changes (SetTenants) and cleaned up with the pool.
Extensions *Extensions
// tenants maps tenant ID to its tenantManager (concurrency-safe map).
tenants syncx.Map[uuid.UUID, *tenantManager]
// setMu serializes SetTenants reconciliation; acquired with TryLock so
// overlapping calls are dropped rather than queued.
setMu mutex
// cf is the shared config passed to every tenantManager.
cf *sharedConfig
// resultsCh carries queue results produced by all tenant managers.
resultsCh chan *QueueResults
// concurrencyResultsCh carries concurrency results produced by all
// tenant managers.
concurrencyResultsCh chan *ConcurrencyResults
// optimisticSchedulingEnabled gates the RunOptimisticScheduling* paths.
optimisticSchedulingEnabled bool
// optimisticSemaphore bounds concurrent optimistic scheduling runs; its
// capacity is the optimisticSlots constructor argument.
optimisticSemaphore chan struct{}
}
// NewSchedulingPool constructs a SchedulingPool and returns it together with
// a cleanup function that tears down all tenant managers and extensions.
// optimisticSlots sets the capacity of the semaphore bounding concurrent
// optimistic scheduling runs; the remaining parameters are stored on the
// shared config handed to each tenantManager. The returned error is always
// nil today but kept for interface stability.
func NewSchedulingPool(
	repo v1.SchedulerRepository,
	l *zerolog.Logger,
	singleQueueLimit int,
	schedulerConcurrencyRateLimit int,
	schedulerConcurrencyPollingMinInterval time.Duration,
	schedulerConcurrencyPollingMaxInterval time.Duration,
	optimisticSchedulingEnabled bool,
	optimisticSlots int,
) (*SchedulingPool, func() error, error) {
	pool := &SchedulingPool{
		Extensions: &Extensions{},
		cf: &sharedConfig{
			repo:                                   repo,
			l:                                      l,
			singleQueueLimit:                       singleQueueLimit,
			schedulerConcurrencyRateLimit:          schedulerConcurrencyRateLimit,
			schedulerConcurrencyPollingMinInterval: schedulerConcurrencyPollingMinInterval,
			schedulerConcurrencyPollingMaxInterval: schedulerConcurrencyPollingMaxInterval,
		},
		// buffered so tenant managers don't block on slow consumers
		resultsCh:                   make(chan *QueueResults, 1000),
		concurrencyResultsCh:        make(chan *ConcurrencyResults, 1000),
		setMu:                       newMu(l),
		optimisticSchedulingEnabled: optimisticSchedulingEnabled,
		optimisticSemaphore:         make(chan struct{}, optimisticSlots),
	}

	teardown := func() error {
		pool.cleanup()
		return nil
	}

	return pool, teardown, nil
}
// GetResultsCh returns the channel on which queue results from all tenant
// managers are delivered.
func (p *SchedulingPool) GetResultsCh() chan *QueueResults {
	return p.resultsCh
}
// GetConcurrencyResultsCh returns the channel on which concurrency results
// from all tenant managers are delivered.
func (p *SchedulingPool) GetConcurrencyResultsCh() chan *ConcurrencyResults {
	return p.concurrencyResultsCh
}
// cleanup tears down every tenant manager currently tracked by the pool and
// then cleans up extensions. Failures are logged, not returned.
func (p *SchedulingPool) cleanup() {
	var managers []*tenantManager

	p.tenants.Range(func(_ uuid.UUID, tm *tenantManager) bool {
		managers = append(managers, tm)
		return true
	})

	p.cleanupTenants(managers)

	if err := p.Extensions.Cleanup(); err != nil {
		p.cf.l.Error().Err(err).Msg("failed to cleanup extensions")
	}
}
// SetTenants reconciles the pool against the given authoritative tenant list:
// managers are created for tenants not yet tracked and torn down for tenants
// no longer present. If another reconciliation is already in flight the call
// is dropped (TryLock), so callers must treat it as best-effort.
func (p *SchedulingPool) SetTenants(tenants []*sqlcv1.Tenant) {
	if !p.setMu.TryLock() {
		// another SetTenants is in progress; skip this round
		return
	}
	defer p.setMu.Unlock()

	active := make(map[uuid.UUID]bool, len(tenants))

	for _, tenant := range tenants {
		active[tenant.ID] = true
		p.getTenantManager(tenant.ID, true) // nolint: ineffassign
	}

	// collect managers for tenants that are no longer in the list
	var stale []*tenantManager

	p.tenants.Range(func(id uuid.UUID, tm *tenantManager) bool {
		if !active[id] {
			stale = append(stale, tm)
		}
		return true
	})

	// remove stale managers from the map before cleaning them up
	for _, tm := range stale {
		p.tenants.Delete(tm.tenantId)
	}

	go func() {
		// safe to run asynchronously: the stale managers were already removed
		// from the map, so nothing else holds a pointer to them
		p.cleanupTenants(stale)
	}()

	go p.Extensions.SetTenants(tenants)
}
// cleanupTenants shuts down the given tenant managers concurrently and blocks
// until all of them have finished. Errors are logged rather than returned.
func (p *SchedulingPool) cleanupTenants(toCleanup []*tenantManager) {
	var wg sync.WaitGroup

	for _, tm := range toCleanup {
		wg.Add(1)

		// shadow via parameter so each goroutine gets its own manager
		go func(tm *tenantManager) {
			defer wg.Done()

			if err := tm.Cleanup(); err != nil {
				p.cf.l.Error().Err(err).Msgf("failed to cleanup tenant manager for tenant %s", tm.tenantId.String())
			}
		}(tm)
	}

	wg.Wait()
}
// Replenish triggers a slot replenish on the tenant's manager; it is a no-op
// when the tenant is not tracked by the pool.
func (p *SchedulingPool) Replenish(ctx context.Context, tenantId uuid.UUID) {
	tm := p.getTenantManager(tenantId, false)
	if tm == nil {
		return
	}

	tm.replenish(ctx)
}
// NotifyQueues signals the tenant's manager that the named queues have work;
// it is a no-op when the tenant is not tracked by the pool.
func (p *SchedulingPool) NotifyQueues(ctx context.Context, tenantId uuid.UUID, queueNames []string) {
	tm := p.getTenantManager(tenantId, false)
	if tm == nil {
		return
	}

	tm.queue(ctx, queueNames)
}
// NotifyConcurrency signals the tenant's manager that the given concurrency
// strategies need processing; no-op when the tenant is not tracked.
func (p *SchedulingPool) NotifyConcurrency(ctx context.Context, tenantId uuid.UUID, strategyIds []int64) {
	tm := p.getTenantManager(tenantId, false)
	if tm == nil {
		return
	}

	tm.notifyConcurrency(ctx, strategyIds)
}
// NotifyNewWorker signals the tenant's manager that a new worker registered;
// no-op when the tenant is not tracked by the pool.
func (p *SchedulingPool) NotifyNewWorker(ctx context.Context, tenantId uuid.UUID, workerId uuid.UUID) {
	tm := p.getTenantManager(tenantId, false)
	if tm == nil {
		return
	}

	tm.notifyNewWorker(ctx, workerId)
}
// NotifyNewQueue signals the tenant's manager that a new queue was created;
// no-op when the tenant is not tracked by the pool.
func (p *SchedulingPool) NotifyNewQueue(ctx context.Context, tenantId uuid.UUID, queueName string) {
	tm := p.getTenantManager(tenantId, false)
	if tm == nil {
		return
	}

	tm.notifyNewQueue(ctx, queueName)
}
// NotifyNewConcurrencyStrategy signals the tenant's manager that a new
// concurrency strategy exists; no-op when the tenant is not tracked.
func (p *SchedulingPool) NotifyNewConcurrencyStrategy(ctx context.Context, tenantId uuid.UUID, strategyId int64) {
	tm := p.getTenantManager(tenantId, false)
	if tm == nil {
		return
	}

	tm.notifyNewConcurrencyStrategy(ctx, strategyId)
}
// getTenantManager returns the manager for the given tenant. When the tenant
// is untracked: with storeIfNotFound a new manager is created and stored,
// otherwise nil is returned. NOTE(review): the load-then-store here is not
// atomic; it appears the only storeIfNotFound=true caller (SetTenants) runs
// under setMu — confirm no other callers create managers concurrently.
func (p *SchedulingPool) getTenantManager(tenantId uuid.UUID, storeIfNotFound bool) *tenantManager {
	if tm, ok := p.tenants.Load(tenantId); ok {
		return tm
	}

	if !storeIfNotFound {
		return nil
	}

	tm := newTenantManager(p.cf, tenantId, p.resultsCh, p.concurrencyResultsCh, p.Extensions)
	p.tenants.Store(tenantId, tm)

	return tm
}
// Sentinel errors returned by the optimistic scheduling entry points.
var (
	// ErrTenantNotFound indicates the tenant has no manager in the pool.
	ErrTenantNotFound = fmt.Errorf("tenant not found in pool")

	// ErrNoOptimisticSlots indicates optimistic scheduling is disabled or
	// every semaphore slot is currently in use.
	ErrNoOptimisticSlots = fmt.Errorf("no optimistic slots for scheduling")
)
// RunOptimisticScheduling attempts to schedule the given workflow triggers on
// the tenant's manager without waiting for the normal queue loop. It returns
// ErrNoOptimisticSlots when optimistic scheduling is disabled or all slots
// are busy, and ErrTenantNotFound when the tenant has no manager in the pool.
func (p *SchedulingPool) RunOptimisticScheduling(ctx context.Context, tenantId uuid.UUID, opts []*v1.WorkflowNameTriggerOpts, localWorkerIds map[uuid.UUID]struct{}) (map[uuid.UUID][]*AssignedItemWithTask, []*v1.V1TaskWithPayload, []*v1.DAGWithData, error) {
	if !p.optimisticSchedulingEnabled {
		return nil, nil, nil, ErrNoOptimisticSlots
	}

	// non-blocking semaphore acquire: bail out rather than wait for a slot
	select {
	case p.optimisticSemaphore <- struct{}{}:
	default:
		return nil, nil, nil, ErrNoOptimisticSlots
	}
	// release the slot on all return paths below
	defer func() { <-p.optimisticSemaphore }()

	tm := p.getTenantManager(tenantId, false)
	if tm == nil {
		return nil, nil, nil, ErrTenantNotFound
	}

	return tm.runOptimisticScheduling(ctx, opts, localWorkerIds)
}
// RunOptimisticSchedulingFromEvents attempts to schedule workflows triggered
// by the given events on the tenant's manager without waiting for the normal
// queue loop. It returns ErrNoOptimisticSlots when optimistic scheduling is
// disabled or all slots are busy, and ErrTenantNotFound when the tenant has
// no manager in the pool.
func (p *SchedulingPool) RunOptimisticSchedulingFromEvents(ctx context.Context, tenantId uuid.UUID, opts []v1.EventTriggerOpts, localWorkerIds map[uuid.UUID]struct{}) (map[uuid.UUID][]*AssignedItemWithTask, *v1.TriggerFromEventsResult, error) {
	if !p.optimisticSchedulingEnabled {
		return nil, nil, ErrNoOptimisticSlots
	}

	// non-blocking semaphore acquire: bail out rather than wait for a slot
	select {
	case p.optimisticSemaphore <- struct{}{}:
	default:
		return nil, nil, ErrNoOptimisticSlots
	}
	// release the slot on all return paths below
	defer func() { <-p.optimisticSemaphore }()

	tm := p.getTenantManager(tenantId, false)
	if tm == nil {
		return nil, nil, ErrTenantNotFound
	}

	return tm.runOptimisticSchedulingFromEvents(ctx, opts, localWorkerIds)
}