mirror of
https://github.com/hatchet-dev/hatchet.git
synced 2026-05-08 02:49:17 -05:00
chore: improve throughput, remove deadlocks (#949)
* add otel to pub * temporarily remove tenant id exchange * fix: increase internal queue throughput * fix: remove potential deadlocking * rollback hash factor multiplier * fix: batch update issues * fix: rm unneeded locks * move disable tenant pubsub to an env var --------- Co-authored-by: gabriel ruttner <gabriel.ruttner@gmail.com>
This commit is contained in:
@@ -49,6 +49,8 @@ type MessageQueueImpl struct {
|
||||
|
||||
ready bool
|
||||
|
||||
disableTenantExchangePubs bool
|
||||
|
||||
// lru cache for tenant ids
|
||||
tenantIdCache *lru.Cache[string, bool]
|
||||
}
|
||||
@@ -60,16 +62,18 @@ func (t *MessageQueueImpl) IsReady() bool {
|
||||
type MessageQueueImplOpt func(*MessageQueueImplOpts)
|
||||
|
||||
type MessageQueueImplOpts struct {
|
||||
l *zerolog.Logger
|
||||
url string
|
||||
qos int
|
||||
l *zerolog.Logger
|
||||
url string
|
||||
qos int
|
||||
disableTenantExchangePubs bool
|
||||
}
|
||||
|
||||
func defaultMessageQueueImplOpts() *MessageQueueImplOpts {
|
||||
l := logger.NewDefaultLogger("rabbitmq")
|
||||
|
||||
return &MessageQueueImplOpts{
|
||||
l: &l,
|
||||
l: &l,
|
||||
disableTenantExchangePubs: false,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -91,6 +95,12 @@ func WithQos(qos int) MessageQueueImplOpt {
|
||||
}
|
||||
}
|
||||
|
||||
func WithDisableTenantExchangePubs(disable bool) MessageQueueImplOpt {
|
||||
return func(opts *MessageQueueImplOpts) {
|
||||
opts.disableTenantExchangePubs = disable
|
||||
}
|
||||
}
|
||||
|
||||
// New creates a new MessageQueueImpl.
|
||||
func New(fs ...MessageQueueImplOpt) (func() error, *MessageQueueImpl) {
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
@@ -105,11 +115,12 @@ func New(fs ...MessageQueueImplOpt) (func() error, *MessageQueueImpl) {
|
||||
opts.l = &newLogger
|
||||
|
||||
t := &MessageQueueImpl{
|
||||
ctx: ctx,
|
||||
identity: identity(),
|
||||
l: opts.l,
|
||||
qos: opts.qos,
|
||||
configFs: fs,
|
||||
ctx: ctx,
|
||||
identity: identity(),
|
||||
l: opts.l,
|
||||
qos: opts.qos,
|
||||
configFs: fs,
|
||||
disableTenantExchangePubs: opts.disableTenantExchangePubs,
|
||||
}
|
||||
|
||||
constructor := func(context.Context) (*amqp.Connection, error) {
|
||||
@@ -196,6 +207,9 @@ func (t *MessageQueueImpl) SetQOS(prefetchCount int) {
|
||||
|
||||
// AddMessage adds a msg to the queue.
|
||||
func (t *MessageQueueImpl) AddMessage(ctx context.Context, q msgqueue.Queue, msg *msgqueue.Message) error {
|
||||
ctx, span := telemetry.NewSpan(ctx, "add-message")
|
||||
defer span.End()
|
||||
|
||||
// inject otel carrier into the message
|
||||
if msg.OtelCarrier == nil {
|
||||
msg.OtelCarrier = telemetry.GetCarrier(ctx)
|
||||
@@ -339,6 +353,9 @@ func (t *MessageQueueImpl) startPublishing() func() error {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
|
||||
ctx, span := telemetry.NewSpanWithCarrier(ctx, "publish-message", msg.OtelCarrier)
|
||||
defer span.End()
|
||||
|
||||
t.l.Debug().Msgf("publishing msg %s to queue %s", msg.ID, msg.q.Name())
|
||||
|
||||
pubMsg := amqp.Publishing{
|
||||
@@ -358,7 +375,7 @@ func (t *MessageQueueImpl) startPublishing() func() error {
|
||||
}
|
||||
|
||||
// if this is a tenant msg, publish to the tenant exchange
|
||||
if msg.TenantID() != "" {
|
||||
if !t.disableTenantExchangePubs && msg.TenantID() != "" {
|
||||
// determine if the tenant exchange exists
|
||||
if _, ok := t.tenantIdCache.Get(msg.TenantID()); !ok {
|
||||
// register the tenant exchange
|
||||
|
||||
@@ -10,12 +10,11 @@ import (
|
||||
)
|
||||
|
||||
type OperationPool struct {
|
||||
ops sync.Map
|
||||
timeout time.Duration
|
||||
description string
|
||||
method OpMethod
|
||||
ql *zerolog.Logger
|
||||
setTenantsMu sync.RWMutex
|
||||
ops sync.Map
|
||||
timeout time.Duration
|
||||
description string
|
||||
method OpMethod
|
||||
ql *zerolog.Logger
|
||||
}
|
||||
|
||||
func NewOperationPool(ql *zerolog.Logger, timeout time.Duration, description string, method OpMethod) *OperationPool {
|
||||
@@ -28,9 +27,6 @@ func NewOperationPool(ql *zerolog.Logger, timeout time.Duration, description str
|
||||
}
|
||||
|
||||
func (p *OperationPool) SetTenants(tenants []*dbsqlc.Tenant) {
|
||||
p.setTenantsMu.Lock()
|
||||
defer p.setTenantsMu.Unlock()
|
||||
|
||||
tenantMap := make(map[string]bool)
|
||||
|
||||
for _, t := range tenants {
|
||||
@@ -48,16 +44,10 @@ func (p *OperationPool) SetTenants(tenants []*dbsqlc.Tenant) {
|
||||
}
|
||||
|
||||
func (p *OperationPool) RunOrContinue(id string) {
|
||||
p.setTenantsMu.RLock()
|
||||
defer p.setTenantsMu.RUnlock()
|
||||
|
||||
p.GetOperation(id).RunOrContinue(p.ql)
|
||||
}
|
||||
|
||||
func (p *OperationPool) GetOperation(id string) *SerialOperation {
|
||||
p.setTenantsMu.RLock()
|
||||
defer p.setTenantsMu.RUnlock()
|
||||
|
||||
op, ok := p.ops.Load(id)
|
||||
|
||||
if !ok {
|
||||
|
||||
@@ -872,7 +872,7 @@ func (ec *JobsControllerImpl) handleStepRunStarted(ctx context.Context, task *ms
|
||||
return fmt.Errorf("could not parse started at: %w", err)
|
||||
}
|
||||
|
||||
err = ec.repo.StepRun().StepRunStarted(ctx, metadata.TenantId, payload.StepRunId, startedAt)
|
||||
err = ec.repo.StepRun().StepRunStarted(ctx, metadata.TenantId, payload.WorkflowRunId, payload.StepRunId, startedAt)
|
||||
|
||||
if err != nil {
|
||||
return fmt.Errorf("could not update step run: %w", err)
|
||||
@@ -913,7 +913,7 @@ func (ec *JobsControllerImpl) handleStepRunFinished(ctx context.Context, task *m
|
||||
stepOutput = []byte(payload.StepOutputData)
|
||||
}
|
||||
|
||||
err = ec.repo.StepRun().StepRunSucceeded(ctx, metadata.TenantId, payload.StepRunId, finishedAt, stepOutput)
|
||||
err = ec.repo.StepRun().StepRunSucceeded(ctx, metadata.TenantId, payload.WorkflowRunId, payload.StepRunId, finishedAt, stepOutput)
|
||||
|
||||
if err != nil {
|
||||
return fmt.Errorf("could not update step run: %w", err)
|
||||
@@ -1014,7 +1014,7 @@ func (ec *JobsControllerImpl) failStepRun(ctx context.Context, tenantId, stepRun
|
||||
}
|
||||
|
||||
// fail step run
|
||||
err = ec.repo.StepRun().StepRunFailed(ctx, tenantId, stepRunId, failedAt, errorReason, int(oldStepRun.SRRetryCount))
|
||||
err = ec.repo.StepRun().StepRunFailed(ctx, tenantId, sqlchelpers.UUIDToStr(oldStepRun.WorkflowRunId), stepRunId, failedAt, errorReason, int(oldStepRun.SRRetryCount))
|
||||
|
||||
if err != nil {
|
||||
return fmt.Errorf("could not fail step run: %w", err)
|
||||
@@ -1129,7 +1129,7 @@ func (ec *JobsControllerImpl) cancelStepRun(ctx context.Context, tenantId, stepR
|
||||
return fmt.Errorf("could not get step run: %w", err)
|
||||
}
|
||||
|
||||
err = ec.repo.StepRun().StepRunCancelled(ctx, tenantId, stepRunId, now, reason)
|
||||
err = ec.repo.StepRun().StepRunCancelled(ctx, tenantId, sqlchelpers.UUIDToStr(oldStepRun.WorkflowRunId), stepRunId, now, reason)
|
||||
|
||||
if err != nil {
|
||||
return fmt.Errorf("could not cancel step run: %w", err)
|
||||
|
||||
@@ -1056,7 +1056,7 @@ func (s *DispatcherImpl) handleStepRunStarted(inputCtx context.Context, request
|
||||
return nil, err
|
||||
}
|
||||
|
||||
err = s.repo.StepRun().StepRunStarted(ctx, tenantId, request.StepRunId, startedAt)
|
||||
err = s.repo.StepRun().StepRunStarted(ctx, tenantId, sqlchelpers.UUIDToStr(sr.WorkflowRunId), request.StepRunId, startedAt)
|
||||
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("could not mark step run started: %w", err)
|
||||
|
||||
@@ -241,6 +241,7 @@ func GetServerConfigFromConfigfile(dc *database.Config, cf *server.ServerConfigF
|
||||
rabbitmq.WithURL(cf.MessageQueue.RabbitMQ.URL),
|
||||
rabbitmq.WithLogger(&l),
|
||||
rabbitmq.WithQos(cf.MessageQueue.RabbitMQ.Qos),
|
||||
rabbitmq.WithDisableTenantExchangePubs(cf.Runtime.DisableTenantPubs),
|
||||
)
|
||||
|
||||
ing, err = ingestor.NewIngestor(
|
||||
|
||||
@@ -110,6 +110,12 @@ type ConfigFileRuntime struct {
|
||||
// FlushItemsThreshold is the number of items to hold in memory until flushing to the database
|
||||
FlushItemsThreshold int `mapstructure:"flushItemsThreshold" json:"flushItemsThreshold,omitempty" default:"100"`
|
||||
|
||||
// How many buckets to hash into for parallelizing updates
|
||||
UpdateHashFactor int `mapstructure:"updateHashFactor" json:"updateHashFactor,omitempty" default:"100"`
|
||||
|
||||
// How many concurrent updates to allow
|
||||
UpdateConcurrentFactor int `mapstructure:"updateConcurrentFactor" json:"updateConcurrentFactor,omitempty" default:"10"`
|
||||
|
||||
// Allow new tenants to be created
|
||||
AllowSignup bool `mapstructure:"allowSignup" json:"allowSignup,omitempty" default:"true"`
|
||||
|
||||
@@ -121,6 +127,9 @@ type ConfigFileRuntime struct {
|
||||
|
||||
// Allow passwords to be changed
|
||||
AllowChangePassword bool `mapstructure:"allowChangePassword" json:"allowChangePassword,omitempty" default:"true"`
|
||||
|
||||
// DisableTenantPubs controls whether tenant pubsub is disabled
|
||||
DisableTenantPubs bool `mapstructure:"disableTenantPubs" json:"disableTenantPubs,omitempty"`
|
||||
}
|
||||
|
||||
type SecurityCheckConfigFile struct {
|
||||
@@ -409,6 +418,7 @@ func BindAllEnv(v *viper.Viper) {
|
||||
_ = v.BindEnv("runtime.allowInvites", "SERVER_ALLOW_INVITES")
|
||||
_ = v.BindEnv("runtime.allowCreateTenant", "SERVER_ALLOW_CREATE_TENANT")
|
||||
_ = v.BindEnv("runtime.allowChangePassword", "SERVER_ALLOW_CHANGE_PASSWORD")
|
||||
_ = v.BindEnv("runtime.disableTenantPubs", "SERVER_DISABLE_TENANT_PUBS")
|
||||
|
||||
// security check options
|
||||
_ = v.BindEnv("securityCheck.enabled", "SERVER_SECURITY_CHECK_ENABLED")
|
||||
@@ -493,6 +503,8 @@ func BindAllEnv(v *viper.Viper) {
|
||||
_ = v.BindEnv("runtime.singleQueueLimit", "SERVER_SINGLE_QUEUE_LIMIT")
|
||||
_ = v.BindEnv("runtime.flushPeriodMilliseconds", "SERVER_FLUSH_PERIOD_MILLISECONDS")
|
||||
_ = v.BindEnv("runtime.flushItemsThreshold", "SERVER_FLUSH_ITEMS_THRESHOLD")
|
||||
_ = v.BindEnv("runtime.updateHashFactor", "SERVER_UPDATE_HASH_FACTOR")
|
||||
_ = v.BindEnv("runtime.updateConcurrentFactor", "SERVER_UPDATE_CONCURRENT_FACTOR")
|
||||
|
||||
// tls options
|
||||
_ = v.BindEnv("tls.tlsStrategy", "SERVER_TLS_STRATEGY")
|
||||
|
||||
@@ -293,8 +293,8 @@ func NewStepRunEngineRepository(pool *pgxpool.Pool, v validator.Validator, l *ze
|
||||
queries: queries,
|
||||
cf: cf,
|
||||
cachedStepIdHasRateLimit: rlCache,
|
||||
updateConcurrentFactor: 10,
|
||||
maxHashFactor: 100,
|
||||
updateConcurrentFactor: cf.UpdateConcurrentFactor,
|
||||
maxHashFactor: cf.UpdateHashFactor,
|
||||
}
|
||||
|
||||
err := s.startBuffers()
|
||||
@@ -1928,10 +1928,10 @@ func (s *stepRunEngineRepository) ProcessStepRunUpdatesV2(ctx context.Context, q
|
||||
|
||||
pgTenantId := sqlchelpers.UUIDFromStr(tenantId)
|
||||
|
||||
limit := 100
|
||||
limit := 100 * s.updateConcurrentFactor
|
||||
|
||||
if s.cf.SingleQueueLimit != 0 {
|
||||
limit = s.cf.SingleQueueLimit * 4 // we call update step run 4x
|
||||
limit = s.cf.SingleQueueLimit * s.updateConcurrentFactor
|
||||
}
|
||||
|
||||
tx, commit, rollback, err := prepareTx(ctx, s.pool, s.l, 25000)
|
||||
@@ -2225,7 +2225,7 @@ func (s *stepRunEngineRepository) processStepRunUpdatesV2(
|
||||
ctx context.Context,
|
||||
qlp *zerolog.Logger,
|
||||
tenantId string,
|
||||
tx dbsqlc.DBTX,
|
||||
outerTx dbsqlc.DBTX,
|
||||
data []updateStepRunQueueData,
|
||||
) (succeededStepRuns []*dbsqlc.GetStepRunForEngineRow, completedWorkflowRuns []*dbsqlc.ResolveWorkflowRunStatusRow, err error) {
|
||||
// startedAt := time.Now().UTC()
|
||||
@@ -2246,6 +2246,7 @@ func (s *stepRunEngineRepository) processStepRunUpdatesV2(
|
||||
}
|
||||
}
|
||||
|
||||
var wrMu sync.Mutex
|
||||
var eg errgroup.Group
|
||||
|
||||
for _, batch := range batches {
|
||||
@@ -2266,9 +2267,11 @@ func (s *stepRunEngineRepository) processStepRunUpdatesV2(
|
||||
failParams := dbsqlc.BulkFailStepRunParams{}
|
||||
cancelParams := dbsqlc.BulkCancelStepRunParams{}
|
||||
finishParams := dbsqlc.BulkFinishStepRunParams{}
|
||||
batchStepRunIds := []pgtype.UUID{}
|
||||
|
||||
for _, item := range batch {
|
||||
stepRunId := sqlchelpers.UUIDFromStr(item.StepRunId)
|
||||
batchStepRunIds = append(batchStepRunIds, stepRunId)
|
||||
|
||||
switch dbsqlc.StepRunStatus(*item.Status) {
|
||||
case dbsqlc.StepRunStatusRUNNING:
|
||||
@@ -2322,12 +2325,35 @@ func (s *stepRunEngineRepository) processStepRunUpdatesV2(
|
||||
}
|
||||
}
|
||||
|
||||
// update the job runs and workflow runs as well
|
||||
jobRunIds, err := s.queries.ResolveJobRunStatus(ctx, tx, dbsqlc.ResolveJobRunStatusParams{
|
||||
Steprunids: batchStepRunIds,
|
||||
Tenantid: pgTenantId,
|
||||
})
|
||||
|
||||
if err != nil {
|
||||
return fmt.Errorf("could not resolve job run status: %w", err)
|
||||
}
|
||||
|
||||
innerCompletedWorkflowRuns, err := s.queries.ResolveWorkflowRunStatus(ctx, tx, dbsqlc.ResolveWorkflowRunStatusParams{
|
||||
Jobrunids: jobRunIds,
|
||||
Tenantid: pgTenantId,
|
||||
})
|
||||
|
||||
if err != nil {
|
||||
return fmt.Errorf("could not resolve workflow run status: %w", err)
|
||||
}
|
||||
|
||||
err = commit(ctx)
|
||||
|
||||
if err != nil {
|
||||
return fmt.Errorf("could not commit transaction: %w", err)
|
||||
}
|
||||
|
||||
wrMu.Lock()
|
||||
completedWorkflowRuns = append(completedWorkflowRuns, innerCompletedWorkflowRuns...)
|
||||
wrMu.Unlock()
|
||||
|
||||
return nil
|
||||
})
|
||||
}
|
||||
@@ -2338,26 +2364,7 @@ func (s *stepRunEngineRepository) processStepRunUpdatesV2(
|
||||
return nil, nil, fmt.Errorf("could not process step run updates v2: %w", err)
|
||||
}
|
||||
|
||||
// update the job runs and workflow runs as well
|
||||
jobRunIds, err := s.queries.ResolveJobRunStatus(ctx, tx, dbsqlc.ResolveJobRunStatusParams{
|
||||
Steprunids: stepRunIds,
|
||||
Tenantid: pgTenantId,
|
||||
})
|
||||
|
||||
if err != nil {
|
||||
return nil, nil, fmt.Errorf("could not resolve job run status: %w", err)
|
||||
}
|
||||
|
||||
completedWorkflowRuns, err = s.queries.ResolveWorkflowRunStatus(ctx, tx, dbsqlc.ResolveWorkflowRunStatusParams{
|
||||
Jobrunids: jobRunIds,
|
||||
Tenantid: pgTenantId,
|
||||
})
|
||||
|
||||
if err != nil {
|
||||
return nil, nil, fmt.Errorf("could not resolve workflow run status: %w", err)
|
||||
}
|
||||
|
||||
succeededStepRuns, err = s.queries.GetStepRunForEngine(ctx, tx, dbsqlc.GetStepRunForEngineParams{
|
||||
succeededStepRuns, err = s.queries.GetStepRunForEngine(ctx, outerTx, dbsqlc.GetStepRunForEngineParams{
|
||||
Ids: completedStepRunIds,
|
||||
TenantId: pgTenantId,
|
||||
})
|
||||
@@ -2507,14 +2514,14 @@ func (s *stepRunEngineRepository) CleanupInternalQueueItems(ctx context.Context,
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *stepRunEngineRepository) StepRunStarted(ctx context.Context, tenantId, stepRunId string, startedAt time.Time) error {
|
||||
func (s *stepRunEngineRepository) StepRunStarted(ctx context.Context, tenantId, workflowRunId, stepRunId string, startedAt time.Time) error {
|
||||
ctx, span := telemetry.NewSpan(ctx, "step-run-started-db")
|
||||
defer span.End()
|
||||
|
||||
running := string(dbsqlc.StepRunStatusRUNNING)
|
||||
|
||||
data := &updateStepRunQueueData{
|
||||
Hash: hashToBucket(sqlchelpers.UUIDFromStr(stepRunId), s.maxHashFactor),
|
||||
Hash: hashToBucket(sqlchelpers.UUIDFromStr(workflowRunId), s.maxHashFactor),
|
||||
StepRunId: stepRunId,
|
||||
TenantId: tenantId,
|
||||
StartedAt: &startedAt,
|
||||
@@ -2530,7 +2537,7 @@ func (s *stepRunEngineRepository) StepRunStarted(ctx context.Context, tenantId,
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *stepRunEngineRepository) StepRunSucceeded(ctx context.Context, tenantId, stepRunId string, finishedAt time.Time, output []byte) error {
|
||||
func (s *stepRunEngineRepository) StepRunSucceeded(ctx context.Context, tenantId, workflowRunId, stepRunId string, finishedAt time.Time, output []byte) error {
|
||||
ctx, span := telemetry.NewSpan(ctx, "step-run-started-db")
|
||||
defer span.End()
|
||||
|
||||
@@ -2544,7 +2551,7 @@ func (s *stepRunEngineRepository) StepRunSucceeded(ctx context.Context, tenantId
|
||||
finished := string(dbsqlc.StepRunStatusSUCCEEDED)
|
||||
|
||||
data := &updateStepRunQueueData{
|
||||
Hash: hashToBucket(sqlchelpers.UUIDFromStr(stepRunId), s.maxHashFactor),
|
||||
Hash: hashToBucket(sqlchelpers.UUIDFromStr(workflowRunId), s.maxHashFactor),
|
||||
StepRunId: stepRunId,
|
||||
TenantId: tenantId,
|
||||
FinishedAt: &finishedAt,
|
||||
@@ -2599,7 +2606,7 @@ func (s *stepRunEngineRepository) StepRunSucceeded(ctx context.Context, tenantId
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *stepRunEngineRepository) StepRunCancelled(ctx context.Context, tenantId, stepRunId string, cancelledAt time.Time, cancelledReason string) error {
|
||||
func (s *stepRunEngineRepository) StepRunCancelled(ctx context.Context, tenantId, workflowRunId, stepRunId string, cancelledAt time.Time, cancelledReason string) error {
|
||||
ctx, span := telemetry.NewSpan(ctx, "step-run-cancelled-db")
|
||||
defer span.End()
|
||||
|
||||
@@ -2613,7 +2620,7 @@ func (s *stepRunEngineRepository) StepRunCancelled(ctx context.Context, tenantId
|
||||
cancelled := string(dbsqlc.StepRunStatusCANCELLED)
|
||||
|
||||
data := &updateStepRunQueueData{
|
||||
Hash: hashToBucket(sqlchelpers.UUIDFromStr(stepRunId), s.maxHashFactor),
|
||||
Hash: hashToBucket(sqlchelpers.UUIDFromStr(workflowRunId), s.maxHashFactor),
|
||||
StepRunId: stepRunId,
|
||||
TenantId: tenantId,
|
||||
CancelledAt: &cancelledAt,
|
||||
@@ -2644,7 +2651,7 @@ func (s *stepRunEngineRepository) StepRunCancelled(ctx context.Context, tenantId
|
||||
reason := "PREVIOUS_STEP_CANCELLED"
|
||||
|
||||
_, err := s.bulkStatusBuffer.BuffItem(tenantId, &updateStepRunQueueData{
|
||||
Hash: hashToBucket(laterStepRun.ID, s.maxHashFactor),
|
||||
Hash: hashToBucket(sqlchelpers.UUIDFromStr(workflowRunId), s.maxHashFactor),
|
||||
StepRunId: laterStepRunId,
|
||||
TenantId: tenantId,
|
||||
CancelledAt: &cancelledAt,
|
||||
@@ -2660,7 +2667,7 @@ func (s *stepRunEngineRepository) StepRunCancelled(ctx context.Context, tenantId
|
||||
return innerErr
|
||||
}
|
||||
|
||||
func (s *stepRunEngineRepository) StepRunFailed(ctx context.Context, tenantId, stepRunId string, failedAt time.Time, errStr string, retryCount int) error {
|
||||
func (s *stepRunEngineRepository) StepRunFailed(ctx context.Context, tenantId, workflowRunId, stepRunId string, failedAt time.Time, errStr string, retryCount int) error {
|
||||
ctx, span := telemetry.NewSpan(ctx, "step-run-failed-db")
|
||||
defer span.End()
|
||||
|
||||
@@ -2674,7 +2681,7 @@ func (s *stepRunEngineRepository) StepRunFailed(ctx context.Context, tenantId, s
|
||||
failed := string(dbsqlc.StepRunStatusFAILED)
|
||||
|
||||
data := &updateStepRunQueueData{
|
||||
Hash: hashToBucket(sqlchelpers.UUIDFromStr(stepRunId), s.maxHashFactor),
|
||||
Hash: hashToBucket(sqlchelpers.UUIDFromStr(workflowRunId), s.maxHashFactor),
|
||||
StepRunId: stepRunId,
|
||||
TenantId: tenantId,
|
||||
RetryCount: retryCount,
|
||||
@@ -2711,7 +2718,7 @@ func (s *stepRunEngineRepository) StepRunFailed(ctx context.Context, tenantId, s
|
||||
}
|
||||
|
||||
_, err := s.bulkStatusBuffer.BuffItem(tenantId, &updateStepRunQueueData{
|
||||
Hash: hashToBucket(laterStepRun.ID, s.maxHashFactor),
|
||||
Hash: hashToBucket(sqlchelpers.UUIDFromStr(workflowRunId), s.maxHashFactor),
|
||||
StepRunId: laterStepRunId,
|
||||
TenantId: tenantId,
|
||||
CancelledAt: &failedAt,
|
||||
|
||||
@@ -185,13 +185,13 @@ type StepRunEngineRepository interface {
|
||||
|
||||
ListStepRunsToTimeout(ctx context.Context, tenantId string) (bool, []*dbsqlc.GetStepRunForEngineRow, error)
|
||||
|
||||
StepRunStarted(ctx context.Context, tenantId, stepRunId string, startedAt time.Time) error
|
||||
StepRunStarted(ctx context.Context, tenantId, workflowRunId, stepRunId string, startedAt time.Time) error
|
||||
|
||||
StepRunSucceeded(ctx context.Context, tenantId, stepRunId string, finishedAt time.Time, output []byte) error
|
||||
StepRunSucceeded(ctx context.Context, tenantId, workflowRunId, stepRunId string, finishedAt time.Time, output []byte) error
|
||||
|
||||
StepRunCancelled(ctx context.Context, tenantId, stepRunId string, cancelledAt time.Time, cancelledReason string) error
|
||||
StepRunCancelled(ctx context.Context, tenantId, workflowRunId, stepRunId string, cancelledAt time.Time, cancelledReason string) error
|
||||
|
||||
StepRunFailed(ctx context.Context, tenantId, stepRunId string, failedAt time.Time, errStr string, retryCount int) error
|
||||
StepRunFailed(ctx context.Context, tenantId, workflowRunId, stepRunId string, failedAt time.Time, errStr string, retryCount int) error
|
||||
|
||||
ReplayStepRun(ctx context.Context, tenantId, stepRunId string, input []byte) (*dbsqlc.GetStepRunForEngineRow, error)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user