feat: configurable internal retry (#1049)

* feat: configurable internal retry

* fix: bump default to 3
This commit is contained in:
Gabe Ruttner
2024-11-15 06:19:24 -08:00
committed by GitHub
parent f22c49652c
commit 4eaa9e7fd9
4 changed files with 19 additions and 7 deletions
+4
View File
@@ -139,6 +139,9 @@ type ConfigFileRuntime struct {
// DisableTenantPubs controls whether tenant pubsub is disabled
DisableTenantPubs bool `mapstructure:"disableTenantPubs" json:"disableTenantPubs,omitempty"`
// MaxInternalRetryCount is the maximum number of internal retries before a step run is considered failed (default: 3)
MaxInternalRetryCount int32 `mapstructure:"maxInternalRetryCount" json:"maxInternalRetryCount,omitempty" default:"3"`
// WaitForFlush is the time to wait for the buffer to flush; it is used to exert some back pressure on writers
WaitForFlush time.Duration `mapstructure:"waitForFlush" json:"waitForFlush,omitempty" default:"1ms"`
@@ -456,6 +459,7 @@ func BindAllEnv(v *viper.Viper) {
_ = v.BindEnv("runtime.allowChangePassword", "SERVER_ALLOW_CHANGE_PASSWORD")
_ = v.BindEnv("runtime.bufferCreateWorkflowRuns", "SERVER_BUFFER_CREATE_WORKFLOW_RUNS")
_ = v.BindEnv("runtime.disableTenantPubs", "SERVER_DISABLE_TENANT_PUBS")
_ = v.BindEnv("runtime.maxInternalRetryCount", "SERVER_MAX_INTERNAL_RETRY_COUNT")
// security check options
_ = v.BindEnv("securityCheck.enabled", "SERVER_SECURITY_CHECK_ENABLED")
+2 -2
View File
@@ -671,7 +671,7 @@ step_runs_to_reassign AS (
FROM
step_runs_on_inactive_workers
WHERE
"internalRetryCount" < 1
"internalRetryCount" < @maxInternalRetryCount::int
),
step_runs_to_fail AS (
SELECT
@@ -679,7 +679,7 @@ step_runs_to_fail AS (
FROM
step_runs_on_inactive_workers
WHERE
"internalRetryCount" >= 1
"internalRetryCount" >= @maxInternalRetryCount::int
),
deleted_sqis AS (
DELETE FROM
@@ -2098,7 +2098,7 @@ step_runs_to_reassign AS (
FROM
step_runs_on_inactive_workers
WHERE
"internalRetryCount" < 1
"internalRetryCount" < $2::int
),
step_runs_to_fail AS (
SELECT
@@ -2106,7 +2106,7 @@ step_runs_to_fail AS (
FROM
step_runs_on_inactive_workers
WHERE
"internalRetryCount" >= 1
"internalRetryCount" >= $2::int
),
deleted_sqis AS (
DELETE FROM
@@ -2182,6 +2182,11 @@ FROM
step_runs_to_fail srs2
`
// ListStepRunsToReassignParams holds the arguments for the
// listStepRunsToReassign query. The fields are passed to the query
// positionally, in declaration order (Tenantid first, then
// Maxinternalretrycount).
type ListStepRunsToReassignParams struct {
// Tenantid is the first positional query argument.
Tenantid pgtype.UUID `json:"tenantid"`
// Maxinternalretrycount is the second positional argument ($2). The query
// compares it against "internalRetryCount": step runs on inactive workers
// with a count below this value are selected for reassignment, and those
// at or above it are selected to be failed.
Maxinternalretrycount int32 `json:"maxinternalretrycount"`
}
type ListStepRunsToReassignRow struct {
ID pgtype.UUID `json:"id"`
WorkerId pgtype.UUID `json:"workerId"`
@@ -2189,8 +2194,8 @@ type ListStepRunsToReassignRow struct {
Operation string `json:"operation"`
}
func (q *Queries) ListStepRunsToReassign(ctx context.Context, db DBTX, tenantid pgtype.UUID) ([]*ListStepRunsToReassignRow, error) {
rows, err := db.Query(ctx, listStepRunsToReassign, tenantid)
func (q *Queries) ListStepRunsToReassign(ctx context.Context, db DBTX, arg ListStepRunsToReassignParams) ([]*ListStepRunsToReassignRow, error) {
rows, err := db.Query(ctx, listStepRunsToReassign, arg.Tenantid, arg.Maxinternalretrycount)
if err != nil {
return nil, err
}
+4 -1
View File
@@ -650,7 +650,10 @@ func (s *stepRunEngineRepository) ListStepRunsToReassign(ctx context.Context, te
defer rollback()
// get the step run and make sure it's still in pending
results, err := s.queries.ListStepRunsToReassign(ctx, tx, pgTenantId)
results, err := s.queries.ListStepRunsToReassign(ctx, tx, dbsqlc.ListStepRunsToReassignParams{
Maxinternalretrycount: s.cf.MaxInternalRetryCount,
Tenantid: pgTenantId,
})
if err != nil {
return nil, nil, err