Fix: Payload fallbacks, WAL conflict handling, WAL eviction (#2372)

* fix: improve error handling

* fix: add default operation

* fix: dont write operation

* fix: refactor offload to always evict

* fix: err check

* fix: err
This commit is contained in:
matt
2025-10-03 14:50:46 -04:00
committed by GitHub
parent 2a08cbf77b
commit dfc5074057
7 changed files with 32 additions and 30 deletions
@@ -0,0 +1,9 @@
-- +goose Up
-- +goose StatementBegin
ALTER TABLE v1_payload_wal ALTER COLUMN operation SET DEFAULT 'CREATE';
-- +goose StatementEnd
-- +goose Down
-- +goose StatementBegin
ALTER TABLE v1_payload_wal ALTER COLUMN operation DROP DEFAULT;
-- +goose StatementEnd
@@ -254,7 +254,7 @@ func (d *DispatcherImpl) handleTaskBulkAssignedTask(ctx context.Context, msg *ms
TenantId: task.TenantID,
}]
if input == nil || !ok {
if !ok {
// If the input wasn't found in the payload store,
// fall back to the input stored on the task itself.
d.l.Error().Msgf("handleTaskBulkAssignedTask-1: task %s has empty payload, falling back to input", task.ExternalID.String())
@@ -311,7 +311,7 @@ func (d *DispatcherImpl) handleTaskBulkAssignedTask(ctx context.Context, msg *ms
TenantId: task.TenantID,
}]
if input == nil || !ok {
if !ok {
// If the input wasn't found in the payload store,
// fall back to the input stored on the task itself.
d.l.Error().Msgf("handleTaskBulkAssignedTask-2: task %s has empty payload, falling back to input", task.ExternalID.String())
+11 -8
View File
@@ -9,6 +9,7 @@ import (
"github.com/hatchet-dev/hatchet/internal/telemetry"
"github.com/hatchet-dev/hatchet/pkg/repository/postgres/sqlchelpers"
"github.com/hatchet-dev/hatchet/pkg/repository/v1/sqlcv1"
"github.com/jackc/pgx/v5"
"github.com/jackc/pgx/v5/pgtype"
"github.com/jackc/pgx/v5/pgxpool"
"github.com/rs/zerolog"
@@ -102,7 +103,6 @@ func (p *payloadStoreRepositoryImpl) Store(ctx context.Context, tx sqlcv1.DBTX,
payloadTypes := make([]string, 0, len(payloads))
inlineContents := make([][]byte, 0, len(payloads))
offloadAts := make([]pgtype.Timestamptz, 0, len(payloads))
operations := make([]string, 0, len(payloads))
tenantIds := make([]pgtype.UUID, 0, len(payloads))
locations := make([]string, 0, len(payloads))
@@ -134,7 +134,6 @@ func (p *payloadStoreRepositoryImpl) Store(ctx context.Context, tx sqlcv1.DBTX,
tenantIds = append(tenantIds, tenantId)
locations = append(locations, string(sqlcv1.V1PayloadLocationINLINE))
inlineContents = append(inlineContents, payload.Payload)
operations = append(operations, string(sqlcv1.V1PayloadWalOperationCREATE))
if p.externalStoreEnabled {
offloadAts = append(offloadAts, pgtype.Timestamptz{Time: payload.InsertedAt.Time.Add(*p.inlineStoreTTL), Valid: true})
@@ -164,7 +163,6 @@ func (p *payloadStoreRepositoryImpl) Store(ctx context.Context, tx sqlcv1.DBTX,
Payloadinsertedats: taskInsertedAts,
Payloadtypes: payloadTypes,
Offloadats: offloadAts,
Operations: operations,
})
if err != nil {
@@ -185,7 +183,7 @@ func (p *payloadStoreRepositoryImpl) Retrieve(ctx context.Context, opts Retrieve
payload, ok := payloadMap[opts]
if !ok {
return nil, nil
return nil, pgx.ErrNoRows
}
return payload, nil
@@ -366,7 +364,7 @@ func (p *payloadStoreRepositoryImpl) ProcessPayloadWAL(ctx context.Context, part
span.SetAttributes(attrs...)
for opts, payload := range payloads {
for _, opts := range retrieveOpts {
offloadAt, ok := retrieveOptsToOffloadAt[opts]
if !ok {
@@ -378,7 +376,7 @@ func (p *payloadStoreRepositoryImpl) ProcessPayloadWAL(ctx context.Context, part
Id: opts.Id,
InsertedAt: opts.InsertedAt,
Type: opts.Type,
Payload: payload,
Payload: payloads[opts],
TenantId: opts.TenantId.String(),
},
OffloadAt: offloadAt.Time,
@@ -406,7 +404,7 @@ func (p *payloadStoreRepositoryImpl) ProcessPayloadWAL(ctx context.Context, part
tenantIds := make([]pgtype.UUID, 0, len(retrieveOptsToStoredKey))
externalLocationKeys := make([]string, 0, len(retrieveOptsToStoredKey))
for opt := range retrieveOptsToStoredKey {
for _, opt := range retrieveOpts {
offloadAt, exists := retrieveOptsToOffloadAt[opt]
if !exists {
@@ -420,7 +418,12 @@ func (p *payloadStoreRepositoryImpl) ProcessPayloadWAL(ctx context.Context, part
key, ok := retrieveOptsToStoredKey[opt]
if !ok {
return false, fmt.Errorf("external location key not found for opts: %+v", opt)
// important: if there's no key here, it's likely because the payloads table did not contain the payload
// this is okay - it can happen if e.g. a payload partition is dropped before the WAL is processed (not a great situation, but not catastrophic)
// if this happens, we log an error and set the key to `""` which will allow it to be evicted from the WAL. it'll never cause
// an update in the payloads table because there won't be a matching row
p.l.Error().Int64("id", opt.Id).Time("insertedAt", opt.InsertedAt.Time).Msg("external location key not found for opts")
key = ""
}
externalLocationKeys = append(externalLocationKeys, string(key))
+3 -7
View File
@@ -72,7 +72,6 @@ WITH inputs AS (
UNNEST(@payloadInsertedAts::TIMESTAMPTZ[]) AS payload_inserted_at,
UNNEST(CAST(@payloadTypes::TEXT[] AS v1_payload_type[])) AS payload_type,
UNNEST(@offloadAts::TIMESTAMPTZ[]) AS offload_at,
UNNEST(CAST(@operations::TEXT[] AS v1_payload_wal_operation[])) AS operation,
UNNEST(@tenantIds::UUID[]) AS tenant_id
)
@@ -81,20 +80,17 @@ INSERT INTO v1_payload_wal (
offload_at,
payload_id,
payload_inserted_at,
payload_type,
operation
payload_type
)
SELECT
i.tenant_id,
i.offload_at,
i.payload_id,
i.payload_inserted_at,
i.payload_type,
i.operation
i.payload_type
FROM
inputs i
ON CONFLICT (offload_at, tenant_id, payload_id, payload_inserted_at, payload_type) DO UPDATE
SET operation = EXCLUDED.operation
ON CONFLICT DO NOTHING
;
-- name: PollPayloadWALForRecordsToOffload :many
+4 -10
View File
@@ -217,8 +217,7 @@ WITH inputs AS (
UNNEST($2::TIMESTAMPTZ[]) AS payload_inserted_at,
UNNEST(CAST($3::TEXT[] AS v1_payload_type[])) AS payload_type,
UNNEST($4::TIMESTAMPTZ[]) AS offload_at,
UNNEST(CAST($5::TEXT[] AS v1_payload_wal_operation[])) AS operation,
UNNEST($6::UUID[]) AS tenant_id
UNNEST($5::UUID[]) AS tenant_id
)
INSERT INTO v1_payload_wal (
@@ -226,20 +225,17 @@ INSERT INTO v1_payload_wal (
offload_at,
payload_id,
payload_inserted_at,
payload_type,
operation
payload_type
)
SELECT
i.tenant_id,
i.offload_at,
i.payload_id,
i.payload_inserted_at,
i.payload_type,
i.operation
i.payload_type
FROM
inputs i
ON CONFLICT (offload_at, tenant_id, payload_id, payload_inserted_at, payload_type) DO UPDATE
SET operation = EXCLUDED.operation
ON CONFLICT DO NOTHING
`
type WritePayloadWALParams struct {
@@ -247,7 +243,6 @@ type WritePayloadWALParams struct {
Payloadinsertedats []pgtype.Timestamptz `json:"payloadinsertedats"`
Payloadtypes []string `json:"payloadtypes"`
Offloadats []pgtype.Timestamptz `json:"offloadats"`
Operations []string `json:"operations"`
Tenantids []pgtype.UUID `json:"tenantids"`
}
@@ -257,7 +252,6 @@ func (q *Queries) WritePayloadWAL(ctx context.Context, db DBTX, arg WritePayload
arg.Payloadinsertedats,
arg.Payloadtypes,
arg.Offloadats,
arg.Operations,
arg.Tenantids,
)
return err
+2 -2
View File
@@ -2850,11 +2850,11 @@ func (r *TaskRepositoryImpl) ReplayTasks(ctx context.Context, tenantId string, t
TenantId: sqlchelpers.UUIDFromStr(tenantId),
})
if err != nil {
if err != nil && err != pgx.ErrNoRows {
return nil, fmt.Errorf("failed to retrieve task input: %w", err)
}
if input == nil {
if errors.Is(err, pgx.ErrNoRows) {
// If the input wasn't found in the payload store,
// fall back to the input stored on the task itself.
r.l.Error().Msgf("ReplayTasks: task %s has empty payload, falling back to input", task.ExternalID.String())
+1 -1
View File
@@ -1658,7 +1658,7 @@ CREATE TABLE v1_payload_wal (
payload_id BIGINT NOT NULL,
payload_inserted_at TIMESTAMPTZ NOT NULL,
payload_type v1_payload_type NOT NULL,
operation v1_payload_wal_operation NOT NULL,
operation v1_payload_wal_operation NOT NULL DEFAULT 'CREATE',
PRIMARY KEY (offload_at, tenant_id, payload_id, payload_inserted_at, payload_type),
CONSTRAINT "v1_payload_wal_payload" FOREIGN KEY (payload_id, payload_inserted_at, payload_type, tenant_id) REFERENCES v1_payload (id, inserted_at, type, tenant_id) ON DELETE CASCADE