chore: improve throughput, remove deadlocks (#949)

* add otel to pub

* temporarily remove tenant id exchange

* fix: increase internal queue throughput

* fix: remove potential deadlocking

* rollback hash factor multiplier

* fix: batch update issues

* fix: rm unneeded locks

* move disable tenant pubsub to an env var

---------

Co-authored-by: gabriel ruttner <gabriel.ruttner@gmail.com>
This commit is contained in:
abelanger5
2024-10-10 08:54:34 -04:00
committed by GitHub
parent 089b2b5e33
commit 95558138a4
8 changed files with 96 additions and 69 deletions
+27 -10
View File
@@ -49,6 +49,8 @@ type MessageQueueImpl struct {
ready bool
disableTenantExchangePubs bool
// lru cache for tenant ids
tenantIdCache *lru.Cache[string, bool]
}
@@ -60,16 +62,18 @@ func (t *MessageQueueImpl) IsReady() bool {
type MessageQueueImplOpt func(*MessageQueueImplOpts)
type MessageQueueImplOpts struct {
l *zerolog.Logger
url string
qos int
l *zerolog.Logger
url string
qos int
disableTenantExchangePubs bool
}
func defaultMessageQueueImplOpts() *MessageQueueImplOpts {
l := logger.NewDefaultLogger("rabbitmq")
return &MessageQueueImplOpts{
l: &l,
l: &l,
disableTenantExchangePubs: false,
}
}
@@ -91,6 +95,12 @@ func WithQos(qos int) MessageQueueImplOpt {
}
}
func WithDisableTenantExchangePubs(disable bool) MessageQueueImplOpt {
return func(opts *MessageQueueImplOpts) {
opts.disableTenantExchangePubs = disable
}
}
// New creates a new MessageQueueImpl.
func New(fs ...MessageQueueImplOpt) (func() error, *MessageQueueImpl) {
ctx, cancel := context.WithCancel(context.Background())
@@ -105,11 +115,12 @@ func New(fs ...MessageQueueImplOpt) (func() error, *MessageQueueImpl) {
opts.l = &newLogger
t := &MessageQueueImpl{
ctx: ctx,
identity: identity(),
l: opts.l,
qos: opts.qos,
configFs: fs,
ctx: ctx,
identity: identity(),
l: opts.l,
qos: opts.qos,
configFs: fs,
disableTenantExchangePubs: opts.disableTenantExchangePubs,
}
constructor := func(context.Context) (*amqp.Connection, error) {
@@ -196,6 +207,9 @@ func (t *MessageQueueImpl) SetQOS(prefetchCount int) {
// AddMessage adds a msg to the queue.
func (t *MessageQueueImpl) AddMessage(ctx context.Context, q msgqueue.Queue, msg *msgqueue.Message) error {
ctx, span := telemetry.NewSpan(ctx, "add-message")
defer span.End()
// inject otel carrier into the message
if msg.OtelCarrier == nil {
msg.OtelCarrier = telemetry.GetCarrier(ctx)
@@ -339,6 +353,9 @@ func (t *MessageQueueImpl) startPublishing() func() error {
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
ctx, span := telemetry.NewSpanWithCarrier(ctx, "publish-message", msg.OtelCarrier)
defer span.End()
t.l.Debug().Msgf("publishing msg %s to queue %s", msg.ID, msg.q.Name())
pubMsg := amqp.Publishing{
@@ -358,7 +375,7 @@ func (t *MessageQueueImpl) startPublishing() func() error {
}
// if this is a tenant msg, publish to the tenant exchange
if msg.TenantID() != "" {
if !t.disableTenantExchangePubs && msg.TenantID() != "" {
// determine if the tenant exchange exists
if _, ok := t.tenantIdCache.Get(msg.TenantID()); !ok {
// register the tenant exchange
+5 -15
View File
@@ -10,12 +10,11 @@ import (
)
type OperationPool struct {
ops sync.Map
timeout time.Duration
description string
method OpMethod
ql *zerolog.Logger
setTenantsMu sync.RWMutex
ops sync.Map
timeout time.Duration
description string
method OpMethod
ql *zerolog.Logger
}
func NewOperationPool(ql *zerolog.Logger, timeout time.Duration, description string, method OpMethod) *OperationPool {
@@ -28,9 +27,6 @@ func NewOperationPool(ql *zerolog.Logger, timeout time.Duration, description str
}
func (p *OperationPool) SetTenants(tenants []*dbsqlc.Tenant) {
p.setTenantsMu.Lock()
defer p.setTenantsMu.Unlock()
tenantMap := make(map[string]bool)
for _, t := range tenants {
@@ -48,16 +44,10 @@ func (p *OperationPool) SetTenants(tenants []*dbsqlc.Tenant) {
}
func (p *OperationPool) RunOrContinue(id string) {
p.setTenantsMu.RLock()
defer p.setTenantsMu.RUnlock()
p.GetOperation(id).RunOrContinue(p.ql)
}
func (p *OperationPool) GetOperation(id string) *SerialOperation {
p.setTenantsMu.RLock()
defer p.setTenantsMu.RUnlock()
op, ok := p.ops.Load(id)
if !ok {
@@ -872,7 +872,7 @@ func (ec *JobsControllerImpl) handleStepRunStarted(ctx context.Context, task *ms
return fmt.Errorf("could not parse started at: %w", err)
}
err = ec.repo.StepRun().StepRunStarted(ctx, metadata.TenantId, payload.StepRunId, startedAt)
err = ec.repo.StepRun().StepRunStarted(ctx, metadata.TenantId, payload.WorkflowRunId, payload.StepRunId, startedAt)
if err != nil {
return fmt.Errorf("could not update step run: %w", err)
@@ -913,7 +913,7 @@ func (ec *JobsControllerImpl) handleStepRunFinished(ctx context.Context, task *m
stepOutput = []byte(payload.StepOutputData)
}
err = ec.repo.StepRun().StepRunSucceeded(ctx, metadata.TenantId, payload.StepRunId, finishedAt, stepOutput)
err = ec.repo.StepRun().StepRunSucceeded(ctx, metadata.TenantId, payload.WorkflowRunId, payload.StepRunId, finishedAt, stepOutput)
if err != nil {
return fmt.Errorf("could not update step run: %w", err)
@@ -1014,7 +1014,7 @@ func (ec *JobsControllerImpl) failStepRun(ctx context.Context, tenantId, stepRun
}
// fail step run
err = ec.repo.StepRun().StepRunFailed(ctx, tenantId, stepRunId, failedAt, errorReason, int(oldStepRun.SRRetryCount))
err = ec.repo.StepRun().StepRunFailed(ctx, tenantId, sqlchelpers.UUIDToStr(oldStepRun.WorkflowRunId), stepRunId, failedAt, errorReason, int(oldStepRun.SRRetryCount))
if err != nil {
return fmt.Errorf("could not fail step run: %w", err)
@@ -1129,7 +1129,7 @@ func (ec *JobsControllerImpl) cancelStepRun(ctx context.Context, tenantId, stepR
return fmt.Errorf("could not get step run: %w", err)
}
err = ec.repo.StepRun().StepRunCancelled(ctx, tenantId, stepRunId, now, reason)
err = ec.repo.StepRun().StepRunCancelled(ctx, tenantId, sqlchelpers.UUIDToStr(oldStepRun.WorkflowRunId), stepRunId, now, reason)
if err != nil {
return fmt.Errorf("could not cancel step run: %w", err)
+1 -1
View File
@@ -1056,7 +1056,7 @@ func (s *DispatcherImpl) handleStepRunStarted(inputCtx context.Context, request
return nil, err
}
err = s.repo.StepRun().StepRunStarted(ctx, tenantId, request.StepRunId, startedAt)
err = s.repo.StepRun().StepRunStarted(ctx, tenantId, sqlchelpers.UUIDToStr(sr.WorkflowRunId), request.StepRunId, startedAt)
if err != nil {
return nil, fmt.Errorf("could not mark step run started: %w", err)
+1
View File
@@ -241,6 +241,7 @@ func GetServerConfigFromConfigfile(dc *database.Config, cf *server.ServerConfigF
rabbitmq.WithURL(cf.MessageQueue.RabbitMQ.URL),
rabbitmq.WithLogger(&l),
rabbitmq.WithQos(cf.MessageQueue.RabbitMQ.Qos),
rabbitmq.WithDisableTenantExchangePubs(cf.Runtime.DisableTenantPubs),
)
ing, err = ingestor.NewIngestor(
+12
View File
@@ -110,6 +110,12 @@ type ConfigFileRuntime struct {
// FlushItemsThreshold is the number of items to hold in memory until flushing to the database
FlushItemsThreshold int `mapstructure:"flushItemsThreshold" json:"flushItemsThreshold,omitempty" default:"100"`
// How many buckets to hash into for parallelizing updates
UpdateHashFactor int `mapstructure:"updateHashFactor" json:"updateHashFactor,omitempty" default:"100"`
// How many concurrent updates to allow
UpdateConcurrentFactor int `mapstructure:"updateConcurrentFactor" json:"updateConcurrentFactor,omitempty" default:"10"`
// Allow new tenants to be created
AllowSignup bool `mapstructure:"allowSignup" json:"allowSignup,omitempty" default:"true"`
@@ -121,6 +127,9 @@ type ConfigFileRuntime struct {
// Allow passwords to be changed
AllowChangePassword bool `mapstructure:"allowChangePassword" json:"allowChangePassword,omitempty" default:"true"`
// DisableTenantPubs controls whether tenant pubsub is disabled
DisableTenantPubs bool `mapstructure:"disableTenantPubs" json:"disableTenantPubs,omitempty"`
}
type SecurityCheckConfigFile struct {
@@ -409,6 +418,7 @@ func BindAllEnv(v *viper.Viper) {
_ = v.BindEnv("runtime.allowInvites", "SERVER_ALLOW_INVITES")
_ = v.BindEnv("runtime.allowCreateTenant", "SERVER_ALLOW_CREATE_TENANT")
_ = v.BindEnv("runtime.allowChangePassword", "SERVER_ALLOW_CHANGE_PASSWORD")
_ = v.BindEnv("runtime.disableTenantPubs", "SERVER_DISABLE_TENANT_PUBS")
// security check options
_ = v.BindEnv("securityCheck.enabled", "SERVER_SECURITY_CHECK_ENABLED")
@@ -493,6 +503,8 @@ func BindAllEnv(v *viper.Viper) {
_ = v.BindEnv("runtime.singleQueueLimit", "SERVER_SINGLE_QUEUE_LIMIT")
_ = v.BindEnv("runtime.flushPeriodMilliseconds", "SERVER_FLUSH_PERIOD_MILLISECONDS")
_ = v.BindEnv("runtime.flushItemsThreshold", "SERVER_FLUSH_ITEMS_THRESHOLD")
_ = v.BindEnv("runtime.updateHashFactor", "SERVER_UPDATE_HASH_FACTOR")
_ = v.BindEnv("runtime.updateConcurrentFactor", "SERVER_UPDATE_CONCURRENT_FACTOR")
// tls options
_ = v.BindEnv("tls.tlsStrategy", "SERVER_TLS_STRATEGY")
+42 -35
View File
@@ -293,8 +293,8 @@ func NewStepRunEngineRepository(pool *pgxpool.Pool, v validator.Validator, l *ze
queries: queries,
cf: cf,
cachedStepIdHasRateLimit: rlCache,
updateConcurrentFactor: 10,
maxHashFactor: 100,
updateConcurrentFactor: cf.UpdateConcurrentFactor,
maxHashFactor: cf.UpdateHashFactor,
}
err := s.startBuffers()
@@ -1928,10 +1928,10 @@ func (s *stepRunEngineRepository) ProcessStepRunUpdatesV2(ctx context.Context, q
pgTenantId := sqlchelpers.UUIDFromStr(tenantId)
limit := 100
limit := 100 * s.updateConcurrentFactor
if s.cf.SingleQueueLimit != 0 {
limit = s.cf.SingleQueueLimit * 4 // we call update step run 4x
limit = s.cf.SingleQueueLimit * s.updateConcurrentFactor
}
tx, commit, rollback, err := prepareTx(ctx, s.pool, s.l, 25000)
@@ -2225,7 +2225,7 @@ func (s *stepRunEngineRepository) processStepRunUpdatesV2(
ctx context.Context,
qlp *zerolog.Logger,
tenantId string,
tx dbsqlc.DBTX,
outerTx dbsqlc.DBTX,
data []updateStepRunQueueData,
) (succeededStepRuns []*dbsqlc.GetStepRunForEngineRow, completedWorkflowRuns []*dbsqlc.ResolveWorkflowRunStatusRow, err error) {
// startedAt := time.Now().UTC()
@@ -2246,6 +2246,7 @@ func (s *stepRunEngineRepository) processStepRunUpdatesV2(
}
}
var wrMu sync.Mutex
var eg errgroup.Group
for _, batch := range batches {
@@ -2266,9 +2267,11 @@ func (s *stepRunEngineRepository) processStepRunUpdatesV2(
failParams := dbsqlc.BulkFailStepRunParams{}
cancelParams := dbsqlc.BulkCancelStepRunParams{}
finishParams := dbsqlc.BulkFinishStepRunParams{}
batchStepRunIds := []pgtype.UUID{}
for _, item := range batch {
stepRunId := sqlchelpers.UUIDFromStr(item.StepRunId)
batchStepRunIds = append(batchStepRunIds, stepRunId)
switch dbsqlc.StepRunStatus(*item.Status) {
case dbsqlc.StepRunStatusRUNNING:
@@ -2322,12 +2325,35 @@ func (s *stepRunEngineRepository) processStepRunUpdatesV2(
}
}
// update the job runs and workflow runs as well
jobRunIds, err := s.queries.ResolveJobRunStatus(ctx, tx, dbsqlc.ResolveJobRunStatusParams{
Steprunids: batchStepRunIds,
Tenantid: pgTenantId,
})
if err != nil {
return fmt.Errorf("could not resolve job run status: %w", err)
}
innerCompletedWorkflowRuns, err := s.queries.ResolveWorkflowRunStatus(ctx, tx, dbsqlc.ResolveWorkflowRunStatusParams{
Jobrunids: jobRunIds,
Tenantid: pgTenantId,
})
if err != nil {
return fmt.Errorf("could not resolve workflow run status: %w", err)
}
err = commit(ctx)
if err != nil {
return fmt.Errorf("could not commit transaction: %w", err)
}
wrMu.Lock()
completedWorkflowRuns = append(completedWorkflowRuns, innerCompletedWorkflowRuns...)
wrMu.Unlock()
return nil
})
}
@@ -2338,26 +2364,7 @@ func (s *stepRunEngineRepository) processStepRunUpdatesV2(
return nil, nil, fmt.Errorf("could not process step run updates v2: %w", err)
}
// update the job runs and workflow runs as well
jobRunIds, err := s.queries.ResolveJobRunStatus(ctx, tx, dbsqlc.ResolveJobRunStatusParams{
Steprunids: stepRunIds,
Tenantid: pgTenantId,
})
if err != nil {
return nil, nil, fmt.Errorf("could not resolve job run status: %w", err)
}
completedWorkflowRuns, err = s.queries.ResolveWorkflowRunStatus(ctx, tx, dbsqlc.ResolveWorkflowRunStatusParams{
Jobrunids: jobRunIds,
Tenantid: pgTenantId,
})
if err != nil {
return nil, nil, fmt.Errorf("could not resolve workflow run status: %w", err)
}
succeededStepRuns, err = s.queries.GetStepRunForEngine(ctx, tx, dbsqlc.GetStepRunForEngineParams{
succeededStepRuns, err = s.queries.GetStepRunForEngine(ctx, outerTx, dbsqlc.GetStepRunForEngineParams{
Ids: completedStepRunIds,
TenantId: pgTenantId,
})
@@ -2507,14 +2514,14 @@ func (s *stepRunEngineRepository) CleanupInternalQueueItems(ctx context.Context,
return nil
}
func (s *stepRunEngineRepository) StepRunStarted(ctx context.Context, tenantId, stepRunId string, startedAt time.Time) error {
func (s *stepRunEngineRepository) StepRunStarted(ctx context.Context, tenantId, workflowRunId, stepRunId string, startedAt time.Time) error {
ctx, span := telemetry.NewSpan(ctx, "step-run-started-db")
defer span.End()
running := string(dbsqlc.StepRunStatusRUNNING)
data := &updateStepRunQueueData{
Hash: hashToBucket(sqlchelpers.UUIDFromStr(stepRunId), s.maxHashFactor),
Hash: hashToBucket(sqlchelpers.UUIDFromStr(workflowRunId), s.maxHashFactor),
StepRunId: stepRunId,
TenantId: tenantId,
StartedAt: &startedAt,
@@ -2530,7 +2537,7 @@ func (s *stepRunEngineRepository) StepRunStarted(ctx context.Context, tenantId,
return nil
}
func (s *stepRunEngineRepository) StepRunSucceeded(ctx context.Context, tenantId, stepRunId string, finishedAt time.Time, output []byte) error {
func (s *stepRunEngineRepository) StepRunSucceeded(ctx context.Context, tenantId, workflowRunId, stepRunId string, finishedAt time.Time, output []byte) error {
ctx, span := telemetry.NewSpan(ctx, "step-run-started-db")
defer span.End()
@@ -2544,7 +2551,7 @@ func (s *stepRunEngineRepository) StepRunSucceeded(ctx context.Context, tenantId
finished := string(dbsqlc.StepRunStatusSUCCEEDED)
data := &updateStepRunQueueData{
Hash: hashToBucket(sqlchelpers.UUIDFromStr(stepRunId), s.maxHashFactor),
Hash: hashToBucket(sqlchelpers.UUIDFromStr(workflowRunId), s.maxHashFactor),
StepRunId: stepRunId,
TenantId: tenantId,
FinishedAt: &finishedAt,
@@ -2599,7 +2606,7 @@ func (s *stepRunEngineRepository) StepRunSucceeded(ctx context.Context, tenantId
return nil
}
func (s *stepRunEngineRepository) StepRunCancelled(ctx context.Context, tenantId, stepRunId string, cancelledAt time.Time, cancelledReason string) error {
func (s *stepRunEngineRepository) StepRunCancelled(ctx context.Context, tenantId, workflowRunId, stepRunId string, cancelledAt time.Time, cancelledReason string) error {
ctx, span := telemetry.NewSpan(ctx, "step-run-cancelled-db")
defer span.End()
@@ -2613,7 +2620,7 @@ func (s *stepRunEngineRepository) StepRunCancelled(ctx context.Context, tenantId
cancelled := string(dbsqlc.StepRunStatusCANCELLED)
data := &updateStepRunQueueData{
Hash: hashToBucket(sqlchelpers.UUIDFromStr(stepRunId), s.maxHashFactor),
Hash: hashToBucket(sqlchelpers.UUIDFromStr(workflowRunId), s.maxHashFactor),
StepRunId: stepRunId,
TenantId: tenantId,
CancelledAt: &cancelledAt,
@@ -2644,7 +2651,7 @@ func (s *stepRunEngineRepository) StepRunCancelled(ctx context.Context, tenantId
reason := "PREVIOUS_STEP_CANCELLED"
_, err := s.bulkStatusBuffer.BuffItem(tenantId, &updateStepRunQueueData{
Hash: hashToBucket(laterStepRun.ID, s.maxHashFactor),
Hash: hashToBucket(sqlchelpers.UUIDFromStr(workflowRunId), s.maxHashFactor),
StepRunId: laterStepRunId,
TenantId: tenantId,
CancelledAt: &cancelledAt,
@@ -2660,7 +2667,7 @@ func (s *stepRunEngineRepository) StepRunCancelled(ctx context.Context, tenantId
return innerErr
}
func (s *stepRunEngineRepository) StepRunFailed(ctx context.Context, tenantId, stepRunId string, failedAt time.Time, errStr string, retryCount int) error {
func (s *stepRunEngineRepository) StepRunFailed(ctx context.Context, tenantId, workflowRunId, stepRunId string, failedAt time.Time, errStr string, retryCount int) error {
ctx, span := telemetry.NewSpan(ctx, "step-run-failed-db")
defer span.End()
@@ -2674,7 +2681,7 @@ func (s *stepRunEngineRepository) StepRunFailed(ctx context.Context, tenantId, s
failed := string(dbsqlc.StepRunStatusFAILED)
data := &updateStepRunQueueData{
Hash: hashToBucket(sqlchelpers.UUIDFromStr(stepRunId), s.maxHashFactor),
Hash: hashToBucket(sqlchelpers.UUIDFromStr(workflowRunId), s.maxHashFactor),
StepRunId: stepRunId,
TenantId: tenantId,
RetryCount: retryCount,
@@ -2711,7 +2718,7 @@ func (s *stepRunEngineRepository) StepRunFailed(ctx context.Context, tenantId, s
}
_, err := s.bulkStatusBuffer.BuffItem(tenantId, &updateStepRunQueueData{
Hash: hashToBucket(laterStepRun.ID, s.maxHashFactor),
Hash: hashToBucket(sqlchelpers.UUIDFromStr(workflowRunId), s.maxHashFactor),
StepRunId: laterStepRunId,
TenantId: tenantId,
CancelledAt: &failedAt,
+4 -4
View File
@@ -185,13 +185,13 @@ type StepRunEngineRepository interface {
ListStepRunsToTimeout(ctx context.Context, tenantId string) (bool, []*dbsqlc.GetStepRunForEngineRow, error)
StepRunStarted(ctx context.Context, tenantId, stepRunId string, startedAt time.Time) error
StepRunStarted(ctx context.Context, tenantId, workflowRunId, stepRunId string, startedAt time.Time) error
StepRunSucceeded(ctx context.Context, tenantId, stepRunId string, finishedAt time.Time, output []byte) error
StepRunSucceeded(ctx context.Context, tenantId, workflowRunId, stepRunId string, finishedAt time.Time, output []byte) error
StepRunCancelled(ctx context.Context, tenantId, stepRunId string, cancelledAt time.Time, cancelledReason string) error
StepRunCancelled(ctx context.Context, tenantId, workflowRunId, stepRunId string, cancelledAt time.Time, cancelledReason string) error
StepRunFailed(ctx context.Context, tenantId, stepRunId string, failedAt time.Time, errStr string, retryCount int) error
StepRunFailed(ctx context.Context, tenantId, workflowRunId, stepRunId string, failedAt time.Time, errStr string, retryCount int) error
ReplayStepRun(ctx context.Context, tenantId, stepRunId string, input []byte) (*dbsqlc.GetStepRunForEngineRow, error)