From c6e154fd0399fa1ad57e3c11dd3487dbe6679375 Mon Sep 17 00:00:00 2001
From: matt
Date: Mon, 20 Oct 2025 09:09:49 -0400
Subject: [PATCH] Feat: OLAP Payloads (#2410)

* feat: olap payloads table
* feat: olap queue messages for payload puts
* feat: wire up writes on task write
* driveby: add + ignore psql-connect
* fix: down migration
* fix: use external id for pk
* fix: insert query
* fix: more external ids
* fix: bit more cleanup
* feat: dags
* fix: the rest of the refs
* fix: placeholder uuid
* fix: write external ids
* feat: wire up messages over the queue
* fix: panic
* Revert "fix: panic"
  This reverts commit c0adccf2eaba8dd34d17acca301eab896e5ae389.
* Revert "feat: wire up messages over the queue"
  This reverts commit 36f425f3c141751b5a988312b391c71a4cfe77d0.
* fix: rm unused method
* fix: rm more
* fix: rm cruft
* feat: wire up failures
* feat: start wiring up completed events
* fix: more wiring
* fix: finish wiring up completed event payloads
* fix: lint
* feat: start wiring up external ids in the core
* feat: olap pub
* fix: add returning
* fix: wiring
* debug: log lines for pubs
* fix: external id writes
* Revert "debug: log lines for pubs"
  This reverts commit fe430840bd62c164d24f0d40144047db4cfdecae.
* fix: rm sample
* debug: rm pub buffer param
* Revert "debug: rm pub buffer param"
  This reverts commit b42a5cacbbedcdf04ca01b58757852e26bd6eb42.
* debug: stuck queries
* debug: more logs
* debug: yet more logs
* fix: rename BulkRetrieve -> Retrieve
* chore: lint
* fix: naming
* fix: conn leak in putpayloads
* fix: revert debug
* Revert "debug: more logs"
  This reverts commit 95da7de64f5d764a8f524f5ec177e684f2657f74.
* Revert "debug: stuck queries"
  This reverts commit 8fda64adc46a9d65c8d947a4c86b1ac2630e6e46.
* feat: improve getters, olap getter
* fix: key type
* feat: first pass at pulling olap payloads from the payload store
* fix: start fixing bugs
* fix: start reworking `includePayloads` param
* fix: include payloads wiring
* feat: analyze for payloads
* fix: simplify writes more + write event payloads
* feat: read out event payloads
* feat: env vars for dual writes
* refactor: clean up task prop drilling a bit
* feat: add include payloads params to python for tests
* fix: tx commit
* fix: dual writes
* fix: not null constraint
* fix: one more
* debug: logging
* fix: more debugging, tweak function sig
* fix: function sig
* fix: refs
* debug: more logging
* debug: more logging
* debug: fix condition
* debug: overwrite properly
* fix: revert debug
* fix: rm more drilling
* fix: comments
* fix: partitioning jobs
* chore: ver
* fix: bug, docs
* hack: dummy id and inserted at for payload offloads
* fix: bug
* fix: no need to handle offloads for task event data
* hack: jitter + current ts
* fix: short circuit
* fix: offload payloads in a tx
* fix: uncomment sampling
* fix: don't offload if external store is disabled
* chore: gen sqlc
* fix: migration
* fix: start reworking types
* fix: couple more
* fix: rm unused code
* fix: drill includePayloads down again
* fix: silence annoying error in some cases
* fix: always store payloads
* debug: use workflow run id for input
* fix: improve logging
* debug: logging on retrieve
* debug: task input
* fix: use correct field
* debug: write even null payloads to limit errors
* debug: hide error lines
* fix: quieting more errors
* fix: duplicate example names, remove print lines
* debug: add logging for olap event writes
* hack: immediate event offloads and cutovers
* fix: rm log line
* fix: import
* fix: short circuit events
* fix: duped names
---
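This change threads a new `include_payloads` flag from the REST list endpoints
(`list.go` below) down to the OLAP repository, and exposes it on the Python runs
client (`sdks/python/hatchet_sdk/features/runs.py`, documented in `runs.mdx`).
A minimal usage sketch, assuming the standard `Hatchet` client and a
`runs.list(...)` method whose parameters match the runs.mdx table in this diff:

    from hatchet_sdk import Hatchet

    hatchet = Hatchet()

    # List views can skip input/output payloads to keep responses small;
    # per the docs below, the flag defaults to True when omitted.
    runs = hatchet.runs.list(include_payloads=False)

Single-run getters continue to hydrate payloads, now read back from the new
`v1_payloads_olap` table (or the external store via `external_location_key`),
with a fallback to the task row when no OLAP payload exists.
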
.gitignore | 3 + .../handlers/v1/tasks/list_by_dag_id.go | 2 +- .../server/handlers/v1/workflow-runs/get.go | 2 + .../server/handlers/v1/workflow-runs/list.go | 45 +- api/v1/server/oas/transformers/event.go | 2 +- api/v1/server/oas/transformers/v1/events.go | 6 +- api/v1/server/oas/transformers/v1/tasks.go | 39 +- .../oas/transformers/v1/workflow_runs.go | 9 +- .../migrations/20251015164344_v1_0_51.sql | 41 + .../concurrency_workflow_level/worker.py | 2 +- examples/python/cron/programatic-async.py | 2 +- examples/python/cron/programatic-sync.py | 2 +- .../v1/workflow-runs-v1/hooks/use-runs.tsx | 1 + frontend/docs/pages/sdks/python/client.mdx | 12 +- frontend/docs/pages/sdks/python/context.mdx | 22 +- .../sdks/python/feature-clients/filters.mdx | 22 +- .../sdks/python/feature-clients/runs.mdx | 2 + frontend/docs/pages/sdks/python/runnables.mdx | 46 +- .../controllers/v1/olap/controller.go | 97 ++- .../v1/task/process_payload_wal.go | 2 +- internal/services/dispatcher/dispatcher_v1.go | 2 +- pkg/config/loader/loader.go | 1 + pkg/config/server/server.go | 2 + pkg/repository/v1/ids.go | 2 +- pkg/repository/v1/match.go | 4 +- pkg/repository/v1/olap.go | 716 ++++++++++++++++-- pkg/repository/v1/olappayload.go | 27 + pkg/repository/v1/payloadstore.go | 99 ++- pkg/repository/v1/sqlcv1/copyfrom.go | 3 +- pkg/repository/v1/sqlcv1/models.go | 54 ++ pkg/repository/v1/sqlcv1/olap.sql | 118 ++- pkg/repository/v1/sqlcv1/olap.sql.go | 354 +++++++-- pkg/repository/v1/sqlcv1/payload-store.sql | 31 +- pkg/repository/v1/sqlcv1/payload-store.sql.go | 117 +-- pkg/repository/v1/task.go | 11 +- pkg/repository/v1/trigger.go | 4 +- sdks/python/CHANGELOG.md | 6 + .../concurrency_workflow_level/worker.py | 2 +- .../python/examples/cron/programatic-async.py | 2 +- sdks/python/examples/cron/programatic-sync.py | 2 +- sdks/python/hatchet_sdk/features/runs.py | 12 + sql/schema/v1-core.sql | 3 + sql/schema/v1-olap.sql | 22 + 43 files changed, 1597 insertions(+), 356 deletions(-) create mode 100644 cmd/hatchet-migrate/migrate/migrations/20251015164344_v1_0_51.sql create mode 100644 pkg/repository/v1/olappayload.go diff --git a/.gitignore b/.gitignore index 897b22e6d..f67f587f1 100644 --- a/.gitignore +++ b/.gitignore @@ -99,3 +99,6 @@ openapitools.json # Generated docs content frontend/app/src/next/lib/docs/generated/ frontend/docs/lib/generated/ + +# Scripts +hack/dev/psql-connect.sh diff --git a/api/v1/server/handlers/v1/tasks/list_by_dag_id.go b/api/v1/server/handlers/v1/tasks/list_by_dag_id.go index 25c920e17..1942cfa2e 100644 --- a/api/v1/server/handlers/v1/tasks/list_by_dag_id.go +++ b/api/v1/server/handlers/v1/tasks/list_by_dag_id.go @@ -23,7 +23,7 @@ func (t *TasksService) V1DagListTasks(ctx echo.Context, request gen.V1DagListTas ctx.Request().Context(), tenantId, pguuids, - true, + false, ) if err != nil { diff --git a/api/v1/server/handlers/v1/workflow-runs/get.go b/api/v1/server/handlers/v1/workflow-runs/get.go index 57e1c2f26..df9dfb260 100644 --- a/api/v1/server/handlers/v1/workflow-runs/get.go +++ b/api/v1/server/handlers/v1/workflow-runs/get.go @@ -61,6 +61,7 @@ func (t *V1WorkflowRunsService) getWorkflowRunDetails( ctx, tenantId, taskMetadata, + true, ) if err != nil { @@ -68,6 +69,7 @@ func (t *V1WorkflowRunsService) getWorkflowRunDetails( } stepIdToTaskExternalId := make(map[pgtype.UUID]pgtype.UUID) + for _, task := range tasks { stepIdToTaskExternalId[task.StepID] = task.ExternalID } diff --git a/api/v1/server/handlers/v1/workflow-runs/list.go b/api/v1/server/handlers/v1/workflow-runs/list.go index 
3ac192e8f..52bae3760 100644 --- a/api/v1/server/handlers/v1/workflow-runs/list.go +++ b/api/v1/server/handlers/v1/workflow-runs/list.go @@ -57,12 +57,18 @@ func (t *V1WorkflowRunsService) WithDags(ctx context.Context, request gen.V1Work workflowIds = *request.Params.WorkflowIds } + includePayloads := false + if request.Params.IncludePayloads != nil { + includePayloads = *request.Params.IncludePayloads + } + opts := v1.ListWorkflowRunOpts{ - CreatedAfter: since, - Statuses: statuses, - WorkflowIds: workflowIds, - Limit: limit, - Offset: offset, + CreatedAfter: since, + Statuses: statuses, + WorkflowIds: workflowIds, + Limit: limit, + Offset: offset, + IncludePayloads: includePayloads, } additionalMetadataFilters := make(map[string]interface{}) @@ -93,13 +99,6 @@ func (t *V1WorkflowRunsService) WithDags(ctx context.Context, request gen.V1Work opts.TriggeringEventExternalId = &id } - includePayloads := true - if request.Params.IncludePayloads != nil { - includePayloads = *request.Params.IncludePayloads - } - - opts.IncludePayloads = includePayloads - dags, total, err := t.config.V1.OLAP().ListWorkflowRuns( ctx, tenantId, @@ -146,17 +145,13 @@ func (t *V1WorkflowRunsService) WithDags(ctx context.Context, request gen.V1Work } taskIdToWorkflowName := make(map[int64]string) - - for _, task := range tasks { - if name, ok := workflowNames[task.WorkflowID]; ok { - taskIdToWorkflowName[task.ID] = name - } - } - taskIdToActionId := make(map[int64]string) for _, task := range tasks { taskIdToActionId[task.ID] = task.ActionID + if name, ok := workflowNames[task.WorkflowID]; ok { + taskIdToWorkflowName[task.ID] = name + } } parsedTasks := transformers.TaskRunDataRowToWorkflowRunsMany(tasks, taskIdToWorkflowName, total, limit, offset) @@ -221,6 +216,11 @@ func (t *V1WorkflowRunsService) OnlyTasks(ctx context.Context, request gen.V1Wor workflowIds = *request.Params.WorkflowIds } + includePayloads := false + if request.Params.IncludePayloads != nil { + includePayloads = *request.Params.IncludePayloads + } + opts := v1.ListTaskRunOpts{ CreatedAfter: since, Statuses: statuses, @@ -228,7 +228,7 @@ func (t *V1WorkflowRunsService) OnlyTasks(ctx context.Context, request gen.V1Wor Limit: limit, Offset: offset, WorkerId: request.Params.WorkerId, - IncludePayloads: true, + IncludePayloads: includePayloads, } additionalMetadataFilters := make(map[string]interface{}) @@ -252,10 +252,6 @@ func (t *V1WorkflowRunsService) OnlyTasks(ctx context.Context, request gen.V1Wor opts.TriggeringEventExternalId = request.Params.TriggeringEventExternalId } - if request.Params.IncludePayloads != nil { - opts.IncludePayloads = *request.Params.IncludePayloads - } - tasks, total, err := t.config.V1.OLAP().ListTasks( ctx, tenantId, @@ -282,6 +278,7 @@ func (t *V1WorkflowRunsService) OnlyTasks(ctx context.Context, request gen.V1Wor } taskIdToWorkflowName := make(map[int64]string) + for _, task := range tasks { if name, ok := workflowIdToName[task.WorkflowID]; ok { taskIdToWorkflowName[task.ID] = name diff --git a/api/v1/server/oas/transformers/event.go b/api/v1/server/oas/transformers/event.go index 44980b4cc..fe27c876b 100644 --- a/api/v1/server/oas/transformers/event.go +++ b/api/v1/server/oas/transformers/event.go @@ -61,7 +61,7 @@ func ToEventFromSQLC(eventRow *dbsqlc.ListEventsRow) (*gen.Event, error) { return res, nil } -func ToEventFromSQLCV1(event *v1.ListEventsRow) (*gen.Event, error) { +func ToEventFromSQLCV1(event *v1.EventWithPayload) (*gen.Event, error) { var metadata map[string]interface{} if event.EventAdditionalMetadata != 
nil { diff --git a/api/v1/server/oas/transformers/v1/events.go b/api/v1/server/oas/transformers/v1/events.go index 5e2d7c63c..db7351501 100644 --- a/api/v1/server/oas/transformers/v1/events.go +++ b/api/v1/server/oas/transformers/v1/events.go @@ -49,7 +49,7 @@ func parseTriggeredRuns(triggeredRuns []byte) ([]gen.V1EventTriggeredRun, error) return result, nil } -func ToV1EventList(events []*v1.ListEventsRow, limit, offset, total int64) gen.V1EventList { +func ToV1EventList(events []*v1.EventWithPayload, limit, offset, total int64) gen.V1EventList { rows := make([]gen.V1Event, len(events)) numPages := int64(math.Ceil(float64(total) / float64(limit))) @@ -70,7 +70,9 @@ func ToV1EventList(events []*v1.ListEventsRow, limit, offset, total int64) gen.V for i, row := range events { additionalMetadata := jsonToMap(row.EventAdditionalMetadata) - payload := jsonToMap(row.EventPayload) + + payload := jsonToMap(row.Payload) + triggeredRuns, err := parseTriggeredRuns(row.TriggeredRuns) if err != nil { diff --git a/api/v1/server/oas/transformers/v1/tasks.go b/api/v1/server/oas/transformers/v1/tasks.go index 393e97fb1..826dc11bb 100644 --- a/api/v1/server/oas/transformers/v1/tasks.go +++ b/api/v1/server/oas/transformers/v1/tasks.go @@ -22,7 +22,7 @@ func jsonToMap(jsonBytes []byte) map[string]interface{} { return result } -func ToTaskSummary(task *sqlcv1.PopulateTaskRunDataRow) gen.V1TaskSummary { +func ToTaskSummary(task *v1.TaskWithPayloads) gen.V1TaskSummary { workflowVersionID := uuid.MustParse(sqlchelpers.UUIDToStr(task.WorkflowVersionID)) additionalMetadata := jsonToMap(task.AdditionalMetadata) @@ -56,8 +56,8 @@ func ToTaskSummary(task *sqlcv1.PopulateTaskRunDataRow) gen.V1TaskSummary { CreatedAt: task.InsertedAt.Time, UpdatedAt: task.InsertedAt.Time, }, - Input: jsonToMap(task.Input), - Output: jsonToMap(task.Output), + Input: jsonToMap(task.InputPayload), + Output: jsonToMap(task.OutputPayload), Type: gen.V1WorkflowTypeTASK, DisplayName: task.DisplayName, Duration: durationPtr, @@ -81,7 +81,7 @@ func ToTaskSummary(task *sqlcv1.PopulateTaskRunDataRow) gen.V1TaskSummary { } func ToTaskSummaryRows( - tasks []*sqlcv1.PopulateTaskRunDataRow, + tasks []*v1.TaskWithPayloads, ) []gen.V1TaskSummary { toReturn := make([]gen.V1TaskSummary, len(tasks)) @@ -93,13 +93,14 @@ func ToTaskSummaryRows( } func ToDagChildren( - tasks []*sqlcv1.PopulateTaskRunDataRow, + tasks []*v1.TaskWithPayloads, taskIdToDagExternalId map[int64]uuid.UUID, ) []gen.V1DagChildren { dagIdToTasks := make(map[uuid.UUID][]gen.V1TaskSummary) for _, task := range tasks { dagId := taskIdToDagExternalId[task.ID] + dagIdToTasks[dagId] = append(dagIdToTasks[dagId], ToTaskSummary(task)) } @@ -119,7 +120,7 @@ func ToDagChildren( } func ToTaskSummaryMany( - tasks []*sqlcv1.PopulateTaskRunDataRow, + tasks []*v1.TaskWithPayloads, total int, limit, offset int64, ) gen.V1TaskSummaryList { toReturn := ToTaskSummaryRows(tasks) @@ -175,13 +176,13 @@ func ToTaskRunEventMany( } func ToWorkflowRunTaskRunEventsMany( - events []*sqlcv1.ListTaskEventsForWorkflowRunRow, + events []*v1.TaskEventWithPayloads, ) gen.V1TaskEventList { toReturn := make([]gen.V1TaskEvent, len(events)) for i, event := range events { workerId := uuid.MustParse(sqlchelpers.UUIDToStr(event.WorkerID)) - output := string(event.Output) + output := string(event.OutputPayload) taskExternalId := uuid.MustParse(sqlchelpers.UUIDToStr(event.TaskExternalID)) retryCount := int(event.RetryCount) @@ -238,7 +239,7 @@ func ToTaskRunMetrics(metrics *[]v1.TaskRunMetric) gen.V1TaskRunMetrics { return 
toReturn } -func ToTask(taskWithData *sqlcv1.PopulateSingleTaskRunDataRow, workflowRunExternalId pgtype.UUID, workflowVersion *dbsqlc.GetWorkflowVersionByIdRow) gen.V1TaskSummary { +func ToTask(taskWithData *v1.TaskWithPayloads, workflowRunExternalId pgtype.UUID, workflowVersion *dbsqlc.GetWorkflowVersionByIdRow) gen.V1TaskSummary { workflowVersionID := uuid.MustParse(sqlchelpers.UUIDToStr(taskWithData.WorkflowVersionID)) additionalMetadata := jsonToMap(taskWithData.AdditionalMetadata) @@ -263,11 +264,11 @@ func ToTask(taskWithData *sqlcv1.PopulateSingleTaskRunDataRow, workflowRunExtern output := make(map[string]interface{}) - if taskWithData.Output != nil { - output = jsonToMap(taskWithData.Output) + if len(taskWithData.OutputPayload) > 0 { + output = jsonToMap(taskWithData.OutputPayload) } - input := jsonToMap(taskWithData.Input) + input := jsonToMap(taskWithData.InputPayload) stepId := uuid.MustParse(sqlchelpers.UUIDToStr(taskWithData.StepID)) @@ -308,11 +309,11 @@ func ToTask(taskWithData *sqlcv1.PopulateSingleTaskRunDataRow, workflowRunExtern Input: input, TenantId: uuid.MustParse(sqlchelpers.UUIDToStr(taskWithData.TenantID)), WorkflowId: uuid.MustParse(sqlchelpers.UUIDToStr(taskWithData.WorkflowID)), - ErrorMessage: &taskWithData.ErrorMessage.String, + ErrorMessage: &taskWithData.ErrorMessage, WorkflowRunExternalId: uuid.MustParse(sqlchelpers.UUIDToStr(workflowRunExternalId)), TaskExternalId: uuid.MustParse(sqlchelpers.UUIDToStr(taskWithData.ExternalID)), Type: gen.V1WorkflowTypeTASK, - NumSpawnedChildren: int(taskWithData.SpawnedChildren.Int64), + NumSpawnedChildren: int(taskWithData.NumSpawnedChildren), StepId: &stepId, ActionId: &taskWithData.ActionID, WorkflowVersionId: &workflowVersionID, @@ -324,10 +325,10 @@ func ToTask(taskWithData *sqlcv1.PopulateSingleTaskRunDataRow, workflowRunExtern } func ToWorkflowRunDetails( - taskRunEvents []*sqlcv1.ListTaskEventsForWorkflowRunRow, + taskRunEvents []*v1.TaskEventWithPayloads, workflowRun *v1.WorkflowRunData, shape []*dbsqlc.GetWorkflowRunShapeRow, - tasks []*sqlcv1.PopulateTaskRunDataRow, + tasks []*v1.TaskWithPayloads, stepIdToTaskExternalId map[pgtype.UUID]pgtype.UUID, workflowVersion *dbsqlc.GetWorkflowVersionByIdRow, ) (gen.V1WorkflowRunDetails, error) { @@ -337,8 +338,8 @@ func ToWorkflowRunDetails( output := make(map[string]interface{}) - if workflowRun.Output != nil { - output = jsonToMap(*workflowRun.Output) + if len(workflowRun.Output) > 0 { + output = jsonToMap(workflowRun.Output) } additionalMetadata := jsonToMap(workflowRun.AdditionalMetadata) @@ -394,7 +395,7 @@ func ToWorkflowRunDetails( for i, event := range taskRunEvents { workerId := uuid.MustParse(sqlchelpers.UUIDToStr(event.WorkerID)) - output := string(event.Output) + output := string(event.OutputPayload) retryCount := int(event.RetryCount) attempt := retryCount + 1 diff --git a/api/v1/server/oas/transformers/v1/workflow_runs.go b/api/v1/server/oas/transformers/v1/workflow_runs.go index d5dc2f224..e5c02949d 100644 --- a/api/v1/server/oas/transformers/v1/workflow_runs.go +++ b/api/v1/server/oas/transformers/v1/workflow_runs.go @@ -39,8 +39,8 @@ func WorkflowRunDataToV1TaskSummary(task *v1.WorkflowRunData, workflowIdsToNames var output map[string]interface{} - if task.Output != nil { - output = jsonToMap(*task.Output) + if len(task.Output) > 0 { + output = jsonToMap(task.Output) } workflowVersionId := uuid.MustParse(sqlchelpers.UUIDToStr(task.WorkflowVersionId)) @@ -146,7 +146,7 @@ func ToWorkflowRunMany( } } -func PopulateTaskRunDataRowToV1TaskSummary(task 
*sqlcv1.PopulateTaskRunDataRow, workflowName *string) gen.V1TaskSummary { +func PopulateTaskRunDataRowToV1TaskSummary(task *v1.TaskWithPayloads, workflowName *string) gen.V1TaskSummary { workflowVersionID := uuid.MustParse(sqlchelpers.UUIDToStr(task.WorkflowVersionID)) additionalMetadata := jsonToMap(task.AdditionalMetadata) @@ -171,6 +171,7 @@ func PopulateTaskRunDataRowToV1TaskSummary(task *sqlcv1.PopulateTaskRunDataRow, input := jsonToMap(task.Input) output := jsonToMap(task.Output) + stepId := uuid.MustParse(sqlchelpers.UUIDToStr(task.StepID)) retryCount := int(task.RetryCount) @@ -209,7 +210,7 @@ func PopulateTaskRunDataRowToV1TaskSummary(task *sqlcv1.PopulateTaskRunDataRow, } func TaskRunDataRowToWorkflowRunsMany( - tasks []*sqlcv1.PopulateTaskRunDataRow, + tasks []*v1.TaskWithPayloads, taskIdToWorkflowName map[int64]string, total int, limit, offset int64, ) gen.V1TaskSummaryList { diff --git a/cmd/hatchet-migrate/migrate/migrations/20251015164344_v1_0_51.sql b/cmd/hatchet-migrate/migrate/migrations/20251015164344_v1_0_51.sql new file mode 100644 index 000000000..6b654bf67 --- /dev/null +++ b/cmd/hatchet-migrate/migrate/migrations/20251015164344_v1_0_51.sql @@ -0,0 +1,41 @@ +-- +goose Up +-- +goose StatementBegin +CREATE TYPE v1_payload_location_olap AS ENUM ('INLINE', 'EXTERNAL'); + +CREATE TABLE v1_payloads_olap ( + tenant_id UUID NOT NULL, + external_id UUID NOT NULL, + + location v1_payload_location_olap NOT NULL, + external_location_key TEXT, + inline_content JSONB, + + inserted_at TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP, + + PRIMARY KEY (tenant_id, external_id, inserted_at), + + CHECK ( + location = 'INLINE' + OR + (location = 'EXTERNAL' AND inline_content IS NULL AND external_location_key IS NOT NULL) + ) +) PARTITION BY RANGE(inserted_at); + +SELECT create_v1_range_partition('v1_payloads_olap'::TEXT, NOW()::DATE); +SELECT create_v1_range_partition('v1_payloads_olap'::TEXT, (NOW() + INTERVAL '1 day')::DATE); +SELECT create_v1_range_partition('v1_payloads_olap'::TEXT, (NOW() + INTERVAL '2 day')::DATE); + +ALTER TABLE v1_task_events_olap ADD COLUMN external_id UUID; +ALTER TABLE v1_payload ADD COLUMN external_id UUID; +-- +goose StatementEnd + +-- +goose Down +-- +goose StatementBegin +ALTER TABLE v1_task_events_olap DROP COLUMN external_id; +ALTER TABLE v1_payload DROP COLUMN external_id; + +DROP TABLE v1_payloads_olap; + +DROP TYPE v1_payload_location_olap; +-- +goose StatementEnd diff --git a/examples/python/concurrency_workflow_level/worker.py b/examples/python/concurrency_workflow_level/worker.py index 22b389a64..cbee7c930 100644 --- a/examples/python/concurrency_workflow_level/worker.py +++ b/examples/python/concurrency_workflow_level/worker.py @@ -23,7 +23,7 @@ class WorkflowInput(BaseModel): concurrency_workflow_level_workflow = hatchet.workflow( - name="ConcurrencyWorkflowManyKeys", + name="ConcurrencyWorkflowLevel", input_validator=WorkflowInput, concurrency=[ ConcurrencyExpression( diff --git a/examples/python/cron/programatic-async.py b/examples/python/cron/programatic-async.py index 4c6d1203f..e35fcb1f7 100644 --- a/examples/python/cron/programatic-async.py +++ b/examples/python/cron/programatic-async.py @@ -11,7 +11,7 @@ class DynamicCronInput(BaseModel): async def create_cron() -> None: dynamic_cron_workflow = hatchet.workflow( - name="CronWorkflow", input_validator=DynamicCronInput + name="DynamicCronWorkflow", input_validator=DynamicCronInput ) # > Create diff --git 
a/examples/python/cron/programatic-sync.py b/examples/python/cron/programatic-sync.py index b912bcbb4..ab699a705 100644 --- a/examples/python/cron/programatic-sync.py +++ b/examples/python/cron/programatic-sync.py @@ -10,7 +10,7 @@ class DynamicCronInput(BaseModel): dynamic_cron_workflow = hatchet.workflow( - name="CronWorkflow", input_validator=DynamicCronInput + name="DynamicCronWorkflow", input_validator=DynamicCronInput ) # > Create diff --git a/frontend/app/src/pages/main/v1/workflow-runs-v1/hooks/use-runs.tsx b/frontend/app/src/pages/main/v1/workflow-runs-v1/hooks/use-runs.tsx index 4743279c7..6478cc357 100644 --- a/frontend/app/src/pages/main/v1/workflow-runs-v1/hooks/use-runs.tsx +++ b/frontend/app/src/pages/main/v1/workflow-runs-v1/hooks/use-runs.tsx @@ -67,6 +67,7 @@ export const useRuns = ({ worker_id: workerId, only_tasks: onlyTasks, triggering_event_external_id: triggeringEventExternalId, + include_payloads: false, }), placeholderData: (prev) => prev, refetchInterval: diff --git a/frontend/docs/pages/sdks/python/client.mdx b/frontend/docs/pages/sdks/python/client.mdx index febb40568..c0cf794e0 100644 --- a/frontend/docs/pages/sdks/python/client.mdx +++ b/frontend/docs/pages/sdks/python/client.mdx @@ -142,9 +142,9 @@ Parameters: Returns: -| Type | Description | -| -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------- | -| `Callable[[Callable[[EmptyModel, Context], R \| CoroutineLike[R]]], Standalone[EmptyModel, R]] \| Callable[[Callable[[TWorkflowInput, Context], R \| CoroutineLike[R]]], Standalone[TWorkflowInput, R]]` | A decorator which creates a `Standalone` task object. | +| Type | Description | +| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ----------------------------------------------------- | +| `Callable[[Callable[Concatenate[EmptyModel, Context, P], R \| CoroutineLike[R]]], Standalone[EmptyModel, R]] \| Callable[[Callable[Concatenate[TWorkflowInput, Context, P], R \| CoroutineLike[R]]], Standalone[TWorkflowInput, R]]` | A decorator which creates a `Standalone` task object. | #### `durable_task` @@ -174,6 +174,6 @@ Parameters: Returns: -| Type | Description | -| ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------- | -| `Callable[[Callable[[EmptyModel, DurableContext], R \| CoroutineLike[R]]], Standalone[EmptyModel, R]] \| Callable[[Callable[[TWorkflowInput, DurableContext], R \| CoroutineLike[R]]], Standalone[TWorkflowInput, R]]` | A decorator which creates a `Standalone` task object. 
| +| Type | Description | +| -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------- | +| `Callable[[Callable[Concatenate[EmptyModel, DurableContext, P], R \| CoroutineLike[R]]], Standalone[EmptyModel, R]] \| Callable[[Callable[Concatenate[TWorkflowInput, DurableContext, P], R \| CoroutineLike[R]]], Standalone[TWorkflowInput, R]]` | A decorator which creates a `Standalone` task object. | diff --git a/frontend/docs/pages/sdks/python/context.mdx b/frontend/docs/pages/sdks/python/context.mdx index 9b161b919..dd45e6865 100644 --- a/frontend/docs/pages/sdks/python/context.mdx +++ b/frontend/docs/pages/sdks/python/context.mdx @@ -4,8 +4,8 @@ The Hatchet Context class provides helper methods and useful data to tasks at ru There are two types of context classes you'll encounter: -- `Context`: The standard context for regular tasks with methods for logging, task output retrieval, cancellation, and more. -- `DurableContext`: An extended context for durable tasks that includes additional methods for durable execution like `aio_wait_for` and `aio_sleep_for`. +- `Context` - The standard context for regular tasks with methods for logging, task output retrieval, cancellation, and more. +- `DurableContext` - An extended context for durable tasks that includes additional methods for durable execution like `aio_wait_for` and `aio_sleep_for`. ## Context @@ -22,7 +22,7 @@ There are two types of context classes you'll encounter: | `release_slot` | Manually release the slot for the current step run to free up a slot on the worker. Note that this is an advanced feature and should be used with caution. | | `put_stream` | Put a stream event to the Hatchet API. This will send the data to the Hatchet API and return immediately. You can then subscribe to the stream from a separate consumer. | | `refresh_timeout` | Refresh the timeout for the current task run. You can read about refreshing timeouts in [the docs](../../home/timeouts#refreshing-timeouts). | -| `fetch_task_run_error` | A helper intended to be used in an on-failure step to retrieve the error that occurred in a specific upstream task run. | +| `fetch_task_run_error` | **DEPRECATED**: Use `get_task_run_error` instead. | ### Attributes @@ -88,9 +88,9 @@ The additional metadata sent with the current task run. Returns: -| Type | Description | -| --------------------------------- | --------------------------------------------------------------------------------------------------- | -| `JSONSerializableMapping \| None` | The additional metadata sent with the current task run, or None if no additional metadata was sent. | +| Type | Description | +| ------------------------- | --------------------------------------------------------------------------------------------------- | +| `JSONSerializableMapping` | The additional metadata sent with the current task run, or None if no additional metadata was sent. | #### `parent_workflow_run_id` @@ -273,6 +273,8 @@ Returns: #### `fetch_task_run_error` +**DEPRECATED**: Use `get_task_run_error` instead. + A helper intended to be used in an on-failure step to retrieve the error that occurred in a specific upstream task run. Parameters: @@ -306,10 +308,10 @@ Durably wait for either a sleep or an event. 
Parameters: -| Name | Type | Description | Default | -| ------------- | -------------------------------------- | -------------------------------------------------------------------------------------------- | ---------- | -| `signal_key` | `str` | The key to use for the durable event. This is used to identify the event in the Hatchet API. | _required_ | -| `*conditions` | `SleepCondition \| UserEventCondition` | The conditions to wait for. Can be a SleepCondition or UserEventCondition. | `()` | +| Name | Type | Description | Default | +| ------------- | ------------------------------------------------- | -------------------------------------------------------------------------------------------- | ---------- | +| `signal_key` | `str` | The key to use for the durable event. This is used to identify the event in the Hatchet API. | _required_ | +| `*conditions` | `SleepCondition \| UserEventCondition \| OrGroup` | The conditions to wait for. Can be a SleepCondition or UserEventCondition. | `()` | Returns: diff --git a/frontend/docs/pages/sdks/python/feature-clients/filters.mdx b/frontend/docs/pages/sdks/python/feature-clients/filters.mdx index f5eac72cc..bf941d7bd 100644 --- a/frontend/docs/pages/sdks/python/feature-clients/filters.mdx +++ b/frontend/docs/pages/sdks/python/feature-clients/filters.mdx @@ -78,11 +78,12 @@ List filters for a given tenant. Parameters: -| Name | Type | Description | Default | -| ------------------------- | ------------------------------- | --------------------------------------------------------------------------------------------------------------------- | ------- | -| `limit` | `int \| None` | The maximum number of filters to return. | `None` | -| `offset` | `int \| None` | The number of filters to skip before starting to collect the result set. | `None` | -| `workflow_id_scope_pairs` | `list[tuple[str, str]] \| None` | A list of tuples containing workflow IDs and scopes to filter by. The workflow id is first, then the scope is second. | `None` | +| Name | Type | Description | Default | +| -------------- | ------------------- | ------------------------------------------------------------------------ | ------- | +| `limit` | `int \| None` | The maximum number of filters to return. | `None` | +| `offset` | `int \| None` | The number of filters to skip before starting to collect the result set. | `None` | +| `workflow_ids` | `list[str] \| None` | A list of workflow IDs to filter by. | `None` | +| `scopes` | `list[str] \| None` | A list of scopes to filter by. | `None` | Returns: @@ -164,11 +165,12 @@ List filters for a given tenant. Parameters: -| Name | Type | Description | Default | -| ------------------------- | ------------------------------- | --------------------------------------------------------------------------------------------------------------------- | ------- | -| `limit` | `int \| None` | The maximum number of filters to return. | `None` | -| `offset` | `int \| None` | The number of filters to skip before starting to collect the result set. | `None` | -| `workflow_id_scope_pairs` | `list[tuple[str, str]] \| None` | A list of tuples containing workflow IDs and scopes to filter by. The workflow id is first, then the scope is second. | `None` | +| Name | Type | Description | Default | +| -------------- | ------------------- | ------------------------------------------------------------------------ | ------- | +| `limit` | `int \| None` | The maximum number of filters to return. 
| `None` | +| `offset` | `int \| None` | The number of filters to skip before starting to collect the result set. | `None` | +| `workflow_ids` | `list[str] \| None` | A list of workflow IDs to filter by. | `None` | +| `scopes` | `list[str] \| None` | A list of scopes to filter by. | `None` | Returns: diff --git a/frontend/docs/pages/sdks/python/feature-clients/runs.mdx b/frontend/docs/pages/sdks/python/feature-clients/runs.mdx index 032d05206..3bb4a67cd 100644 --- a/frontend/docs/pages/sdks/python/feature-clients/runs.mdx +++ b/frontend/docs/pages/sdks/python/feature-clients/runs.mdx @@ -120,6 +120,7 @@ Parameters: | `worker_id` | `str \| None` | The worker ID to filter task runs by. | `None` | | `parent_task_external_id` | `str \| None` | The parent task external ID to filter task runs by. | `None` | | `triggering_event_external_id` | `str \| None` | The event id that triggered the task run. | `None` | +| `include_payloads` | `bool` | Whether to include payloads in the response. | `True` | Returns: @@ -146,6 +147,7 @@ Parameters: | `worker_id` | `str \| None` | The worker ID to filter task runs by. | `None` | | `parent_task_external_id` | `str \| None` | The parent task external ID to filter task runs by. | `None` | | `triggering_event_external_id` | `str \| None` | The event id that triggered the task run. | `None` | +| `include_payloads` | `bool` | Whether to include payloads in the response. | `True` | Returns: diff --git a/frontend/docs/pages/sdks/python/runnables.mdx b/frontend/docs/pages/sdks/python/runnables.mdx index 7d1ce8d32..ece9c6158 100644 --- a/frontend/docs/pages/sdks/python/runnables.mdx +++ b/frontend/docs/pages/sdks/python/runnables.mdx @@ -2,7 +2,7 @@ `Runnables` in the Hatchet SDK are things that can be run, namely tasks and workflows. The two main types of runnables you'll encounter are: -- `Workflow`, which lets you define tasks and call all of the run, schedule, etc. methods. +- `Workflow`, which lets you define tasks and call all of the run, schedule, etc. methods - `Standalone`, which is a single task that's returned by `hatchet.task` and can be run, scheduled, etc. ## Workflow @@ -188,7 +188,7 @@ Parameters: | `name` | `str \| None` | The name of the on-success task. If not specified, defaults to the name of the function being wrapped by the `on_success_task` decorator. | `None` | | `schedule_timeout` | `Duration` | The maximum time to wait for the task to be scheduled. The run will be canceled if the task does not begin within this time. | `timedelta(minutes=5)` | | `execution_timeout` | `Duration` | The maximum time to wait for the task to complete. The run will be canceled if the task does not complete within this time. | `timedelta(seconds=60)` | -| `retries` | `int` | The number of times to retry the on-success task before failing. | `0` | +| `retries` | `int` | The number of times to retry the on-success task before failing | `0` | | `rate_limits` | `list[RateLimit] \| None` | A list of rate limit configurations for the on-success task. | `None` | | `backoff_factor` | `float \| None` | The backoff factor for controlling exponential backoff in retries. | `None` | | `backoff_max_seconds` | `int \| None` | The maximum number of seconds to allow retries with exponential backoff to continue. 
| `None` | @@ -196,9 +196,9 @@ Parameters: Returns: -| Type | Description | -| --------------------------------------------------------------------------------------------------------------- | ------------------------------------------ | -| `Callable[[Callable[Concatenate[TWorkflowInput, Context, P], R \| CoroutineLike[R]]], Task[TWorkflowInput, R]]` | A decorator which creates a `Task` object. | +| Type | Description | +| --------------------------------------------------------------------------------------------------------------- | ---------------------------------------- | +| `Callable[[Callable[Concatenate[TWorkflowInput, Context, P], R \| CoroutineLike[R]]], Task[TWorkflowInput, R]]` | A decorator which creates a Task object. | #### `run` @@ -728,9 +728,9 @@ Parameters: Returns: -| Type | Description | -| ------------------------------------- | ---------------------------------------- | -| `list[R] \| list[R \| BaseException]` | A list of results for each workflow run. | +| Type | Description | +| -------- | ------------------------- | ---------------------------------------- | +| `list[R] | list[R \| BaseException]` | A list of results for each workflow run. | #### `run_many_no_wait` @@ -1014,13 +1014,14 @@ Mimic the execution of a task. This method is intended to be used to unit test t Parameters: -| Name | Type | Description | Default | -| --------------------- | -------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | -| `input` | `TWorkflowInput \| None` | The input to the task. | `None` | -| `additional_metadata` | `JSONSerializableMapping \| None` | Additional metadata to attach to the task. | `None` | -| `parent_outputs` | `dict[str, JSONSerializableMapping] \| None` | Outputs from parent tasks, if any. This is useful for mimicking DAG functionality. For instance, if you have a task `step_2` that has a `parent` which is `step_1`, you can pass `parent_outputs={"step_1": {"result": "Hello, world!"}}` to `step_2.mock_run()` to be able to access `ctx.task_output(step_1)` in `step_2`. | `None` | -| `retry_count` | `int` | The number of times the task has been retried. | `0` | -| `lifespan` | `Any` | The lifespan to be used in the task, which is useful if one was set on the worker. This will allow you to access `ctx.lifespan` inside of your task. | `None` | +| Name | Type | Description | Default | +| --------------------- | -------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | +| `input` | `TWorkflowInput \| None` | The input to the task. | `None` | +| `additional_metadata` | `JSONSerializableMapping \| None` | Additional metadata to attach to the task. | `None` | +| `parent_outputs` | `dict[str, JSONSerializableMapping] \| None` | Outputs from parent tasks, if any. This is useful for mimicking DAG functionality. 
For instance, if you have a task `step_2` that has a `parent` which is `step_1`, you can pass `parent_outputs={"step_1": {"result": "Hello, world!"}}` to `step_2.mock_run()` to be able to access `ctx.task_output(step_1)` in `step_2`. | `None` | +| `retry_count` | `int` | The number of times the task has been retried. | `0` | +| `lifespan` | `Any` | The lifespan to be used in the task, which is useful if one was set on the worker. This will allow you to access `ctx.lifespan` inside of your task. | `None` | +| `dependencies` | `dict[str, Any] \| None` | Dependencies to be injected into the task. This is useful for tasks that have dependencies defined using `Depends`. **IMPORTANT**: You must pass the dependencies _directly_, **not** the `Depends` objects themselves. For example, if you have a task that has a dependency `config: Annotated[str, Depends(get_config)]`, you should pass `dependencies={"config": "config_value"}` to `aio_mock_run`. | `None` | Returns: @@ -1034,13 +1035,14 @@ Mimic the execution of a task. This method is intended to be used to unit test t Parameters: -| Name | Type | Description | Default | -| --------------------- | -------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | -| `input` | `TWorkflowInput \| None` | The input to the task. | `None` | -| `additional_metadata` | `JSONSerializableMapping \| None` | Additional metadata to attach to the task. | `None` | -| `parent_outputs` | `dict[str, JSONSerializableMapping] \| None` | Outputs from parent tasks, if any. This is useful for mimicking DAG functionality. For instance, if you have a task `step_2` that has a `parent` which is `step_1`, you can pass `parent_outputs={"step_1": {"result": "Hello, world!"}}` to `step_2.mock_run()` to be able to access `ctx.task_output(step_1)` in `step_2`. | `None` | -| `retry_count` | `int` | The number of times the task has been retried. | `0` | -| `lifespan` | `Any` | The lifespan to be used in the task, which is useful if one was set on the worker. This will allow you to access `ctx.lifespan` inside of your task. | `None` | +| Name | Type | Description | Default | +| --------------------- | -------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | +| `input` | `TWorkflowInput \| None` | The input to the task. | `None` | +| `additional_metadata` | `JSONSerializableMapping \| None` | Additional metadata to attach to the task. | `None` | +| `parent_outputs` | `dict[str, JSONSerializableMapping] \| None` | Outputs from parent tasks, if any. This is useful for mimicking DAG functionality. For instance, if you have a task `step_2` that has a `parent` which is `step_1`, you can pass `parent_outputs={"step_1": {"result": "Hello, world!"}}` to `step_2.mock_run()` to be able to access `ctx.task_output(step_1)` in `step_2`. 
| `None` | +| `retry_count` | `int` | The number of times the task has been retried. | `0` | +| `lifespan` | `Any` | The lifespan to be used in the task, which is useful if one was set on the worker. This will allow you to access `ctx.lifespan` inside of your task. | `None` | +| `dependencies` | `dict[str, Any] \| None` | Dependencies to be injected into the task. This is useful for tasks that have dependencies defined using `Depends`. **IMPORTANT**: You must pass the dependencies _directly_, **not** the `Depends` objects themselves. For example, if you have a task that has a dependency `config: Annotated[str, Depends(get_config)]`, you should pass `dependencies={"config": "config_value"}` to `aio_mock_run`. | `None` | Returns: diff --git a/internal/services/controllers/v1/olap/controller.go b/internal/services/controllers/v1/olap/controller.go index 75c32c95b..44ee55898 100644 --- a/internal/services/controllers/v1/olap/controller.go +++ b/internal/services/controllers/v1/olap/controller.go @@ -5,10 +5,12 @@ import ( "errors" "fmt" "hash/fnv" + "math/rand" "sync" "time" "github.com/go-co-op/gocron/v2" + "github.com/google/uuid" "github.com/jackc/pgx/v5/pgtype" "github.com/rs/zerolog" @@ -364,11 +366,32 @@ func (tc *OLAPControllerImpl) handleBufferedMsgs(tenantId, msgId string, payload return tc.handleFailedWebhookValidation(context.Background(), tenantId, payloads) case "cel-evaluation-failure": return tc.handleCelEvaluationFailure(context.Background(), tenantId, payloads) + case "offload-payload": + return tc.handlePayloadOffload(context.Background(), tenantId, payloads) } return fmt.Errorf("unknown message id: %s", msgId) } +func (tc *OLAPControllerImpl) handlePayloadOffload(ctx context.Context, tenantId string, payloads [][]byte) error { + offloads := make([]v1.OffloadPayloadOpts, 0) + + msgs := msgqueue.JSONConvert[v1.OLAPPayloadsToOffload](payloads) + + for _, msg := range msgs { + for _, payload := range msg.Payloads { + if !tc.sample(payload.ExternalId.String()) { + tc.l.Debug().Msgf("skipping payload offload external id %s", payload.ExternalId) + continue + } + + offloads = append(offloads, v1.OffloadPayloadOpts(payload)) + } + } + + return tc.repo.OLAP().OffloadPayloads(ctx, tenantId, offloads) +} + func (tc *OLAPControllerImpl) handleCelEvaluationFailure(ctx context.Context, tenantId string, payloads [][]byte) error { failures := make([]v1.CELEvaluationFailure, 0) @@ -537,6 +560,7 @@ func (tc *OLAPControllerImpl) handleCreateMonitoringEvent(ctx context.Context, t eventPayloads := make([]string, 0) eventMessages := make([]string, 0) timestamps := make([]pgtype.Timestamptz, 0) + eventExternalIds := make([]pgtype.UUID, 0) for _, msg := range msgs { taskMeta := taskIdsToMetas[msg.TaskId] @@ -559,6 +583,7 @@ func (tc *OLAPControllerImpl) handleCreateMonitoringEvent(ctx context.Context, t eventPayloads = append(eventPayloads, msg.EventPayload) eventMessages = append(eventMessages, msg.EventMessage) timestamps = append(timestamps, sqlchelpers.TimestamptzFromTime(msg.EventTimestamp)) + eventExternalIds = append(eventExternalIds, sqlchelpers.UUIDFromStr(uuid.New().String())) if msg.WorkerId != nil { workerIds = append(workerIds, *msg.WorkerId) @@ -630,6 +655,7 @@ func (tc *OLAPControllerImpl) handleCreateMonitoringEvent(ctx context.Context, t RetryCount: retryCounts[i], WorkerID: workerId, AdditionalEventMessage: sqlchelpers.TextFromStr(eventMessages[i]), + ExternalID: eventExternalIds[i], } switch eventTypes[i] { @@ -646,7 +672,76 @@ func (tc *OLAPControllerImpl) 
handleCreateMonitoringEvent(ctx context.Context, t opts = append(opts, event) } - return tc.repo.OLAP().CreateTaskEvents(ctx, tenantId, opts) + err = tc.repo.OLAP().CreateTaskEvents(ctx, tenantId, opts) + + if err != nil { + return err + } + + if !tc.repo.OLAP().PayloadStore().ExternalStoreEnabled() { + return nil + } + + offloadToExternalOpts := make([]v1.OffloadToExternalStoreOpts, 0) + idInsertedAtToExternalId := make(map[v1.IdInsertedAt]pgtype.UUID) + + for _, opt := range opts { + // generating a dummy id + inserted at to use for creating the external keys for the task events + // we do this since we don't have the id + inserted at of the events themselves on the opts, and we don't + // actually need those for anything once the keys are created. + dummyId := rand.Int63() + // randomly jitter the inserted at time by +/- 300ms to make collisions virtually impossible + dummyInsertedAt := time.Now().Add(time.Duration(rand.Intn(2*300+1)-300) * time.Millisecond) + + idInsertedAtToExternalId[v1.IdInsertedAt{ + ID: dummyId, + InsertedAt: sqlchelpers.TimestamptzFromTime(dummyInsertedAt), + }] = opt.ExternalID + + offloadToExternalOpts = append(offloadToExternalOpts, v1.OffloadToExternalStoreOpts{ + StorePayloadOpts: &v1.StorePayloadOpts{ + Id: dummyId, + InsertedAt: sqlchelpers.TimestamptzFromTime(dummyInsertedAt), + ExternalId: opt.ExternalID, + Type: sqlcv1.V1PayloadTypeTASKEVENTDATA, + Payload: opt.Output, + TenantId: tenantId, + }, + OffloadAt: time.Now(), + }) + } + + if len(offloadToExternalOpts) == 0 { + return nil + } + + retrieveOptsToKey, err := tc.repo.OLAP().PayloadStore().ExternalStore().Store(ctx, offloadToExternalOpts...) + + if err != nil { + return err + } + + offloadOpts := make([]v1.OffloadPayloadOpts, 0) + + for opt, key := range retrieveOptsToKey { + externalId := idInsertedAtToExternalId[v1.IdInsertedAt{ + ID: opt.Id, + InsertedAt: opt.InsertedAt, + }] + + offloadOpts = append(offloadOpts, v1.OffloadPayloadOpts{ + ExternalId: externalId, + ExternalLocationKey: string(key), + }) + } + + err = tc.repo.OLAP().OffloadPayloads(ctx, tenantId, offloadOpts) + + if err != nil { + return err + } + + return nil } func (tc *OLAPControllerImpl) handleFailedWebhookValidation(ctx context.Context, tenantId string, payloads [][]byte) error { diff --git a/internal/services/controllers/v1/task/process_payload_wal.go b/internal/services/controllers/v1/task/process_payload_wal.go index 8570833ed..c742fffd5 100644 --- a/internal/services/controllers/v1/task/process_payload_wal.go +++ b/internal/services/controllers/v1/task/process_payload_wal.go @@ -19,7 +19,7 @@ func (tc *TasksControllerImpl) runProcessPayloadWAL(ctx context.Context) func() } func (tc *TasksControllerImpl) processPayloadWAL(ctx context.Context, partitionNumber int64) (bool, error) { - return tc.repov1.Payloads().ProcessPayloadWAL(ctx, partitionNumber) + return tc.repov1.Payloads().ProcessPayloadWAL(ctx, partitionNumber, tc.pubBuffer) } func (tc *TasksControllerImpl) runProcessPayloadExternalCutovers(ctx context.Context) func() { diff --git a/internal/services/dispatcher/dispatcher_v1.go b/internal/services/dispatcher/dispatcher_v1.go index eaa597afa..ceb885fc3 100644 --- a/internal/services/dispatcher/dispatcher_v1.go +++ b/internal/services/dispatcher/dispatcher_v1.go @@ -237,7 +237,7 @@ func (d *DispatcherImpl) handleTaskBulkAssignedTask(ctx context.Context, msg *ms } } - inputs, err := d.repov1.Payloads().BulkRetrieve(ctx, retrievePayloadOpts...) + inputs, err := d.repov1.Payloads().Retrieve(ctx, retrievePayloadOpts...) 
if err != nil { d.l.Error().Err(err).Msgf("could not bulk retrieve inputs for %d tasks", len(bulkDatas)) diff --git a/pkg/config/loader/loader.go b/pkg/config/loader/loader.go index 4dd751336..764299b2e 100644 --- a/pkg/config/loader/loader.go +++ b/pkg/config/loader/loader.go @@ -292,6 +292,7 @@ func (c *ConfigLoader) InitDataLayer() (res *database.Layer, err error) { payloadStoreOpts := repov1.PayloadStoreRepositoryOpts{ EnablePayloadDualWrites: scf.PayloadStore.EnablePayloadDualWrites, EnableTaskEventPayloadDualWrites: scf.PayloadStore.EnableTaskEventPayloadDualWrites, + EnableOLAPPayloadDualWrites: scf.PayloadStore.EnableOLAPPayloadDualWrites, EnableDagDataPayloadDualWrites: scf.PayloadStore.EnableDagDataPayloadDualWrites, WALPollLimit: scf.PayloadStore.WALPollLimit, WALProcessInterval: scf.PayloadStore.WALProcessInterval, diff --git a/pkg/config/server/server.go b/pkg/config/server/server.go index af1ec8f7a..a10c33131 100644 --- a/pkg/config/server/server.go +++ b/pkg/config/server/server.go @@ -616,6 +616,7 @@ type PayloadStoreConfig struct { EnablePayloadDualWrites bool `mapstructure:"enablePayloadDualWrites" json:"enablePayloadDualWrites,omitempty" default:"true"` EnableTaskEventPayloadDualWrites bool `mapstructure:"enableTaskEventPayloadDualWrites" json:"enableTaskEventPayloadDualWrites,omitempty" default:"true"` EnableDagDataPayloadDualWrites bool `mapstructure:"enableDagDataPayloadDualWrites" json:"enableDagDataPayloadDualWrites,omitempty" default:"true"` + EnableOLAPPayloadDualWrites bool `mapstructure:"enableOLAPPayloadDualWrites" json:"enableOLAPPayloadDualWrites,omitempty" default:"true"` WALPollLimit int `mapstructure:"walPollLimit" json:"walPollLimit,omitempty" default:"1000"` WALProcessInterval time.Duration `mapstructure:"walProcessInterval" json:"walProcessInterval,omitempty" default:"15s"` ExternalCutoverProcessInterval time.Duration `mapstructure:"externalCutoverProcessInterval" json:"externalCutoverProcessInterval,omitempty" default:"15s"` @@ -887,6 +888,7 @@ func BindAllEnv(v *viper.Viper) { _ = v.BindEnv("payloadStore.enablePayloadDualWrites", "SERVER_PAYLOAD_STORE_ENABLE_PAYLOAD_DUAL_WRITES") _ = v.BindEnv("payloadStore.enableTaskEventPayloadDualWrites", "SERVER_PAYLOAD_STORE_ENABLE_TASK_EVENT_PAYLOAD_DUAL_WRITES") _ = v.BindEnv("payloadStore.enableDagDataPayloadDualWrites", "SERVER_PAYLOAD_STORE_ENABLE_DAG_DATA_PAYLOAD_DUAL_WRITES") + _ = v.BindEnv("payloadStore.enableOLAPPayloadDualWrites", "SERVER_PAYLOAD_STORE_ENABLE_OLAP_PAYLOAD_DUAL_WRITES") _ = v.BindEnv("payloadStore.walPollLimit", "SERVER_PAYLOAD_STORE_WAL_POLL_LIMIT") _ = v.BindEnv("payloadStore.walProcessInterval", "SERVER_PAYLOAD_STORE_WAL_PROCESS_INTERVAL") _ = v.BindEnv("payloadStore.externalCutoverProcessInterval", "SERVER_PAYLOAD_STORE_EXTERNAL_CUTOVER_PROCESS_INTERVAL") diff --git a/pkg/repository/v1/ids.go b/pkg/repository/v1/ids.go index eec247245..9e351000b 100644 --- a/pkg/repository/v1/ids.go +++ b/pkg/repository/v1/ids.go @@ -180,7 +180,7 @@ func (s *sharedRepository) generateExternalIdsForChildWorkflows(ctx context.Cont } } - payloads, err := s.payloadStore.BulkRetrieve(ctx, retrievePayloadOpts...) + payloads, err := s.payloadStore.Retrieve(ctx, retrievePayloadOpts...) 
if err != nil { return err diff --git a/pkg/repository/v1/match.go b/pkg/repository/v1/match.go index 9a7f9d679..d3a1692a4 100644 --- a/pkg/repository/v1/match.go +++ b/pkg/repository/v1/match.go @@ -262,6 +262,7 @@ func (m *MatchRepositoryImpl) ProcessInternalEventMatches(ctx context.Context, t storePayloadOpts[i] = StorePayloadOpts{ Id: task.ID, InsertedAt: task.InsertedAt, + ExternalId: task.ExternalID, Type: sqlcv1.V1PayloadTypeTASKINPUT, Payload: task.Payload, TenantId: task.TenantID.String(), @@ -304,6 +305,7 @@ func (m *MatchRepositoryImpl) ProcessUserEventMatches(ctx context.Context, tenan storePayloadOpts[i] = StorePayloadOpts{ Id: task.ID, InsertedAt: task.InsertedAt, + ExternalId: task.ExternalID, Type: sqlcv1.V1PayloadTypeTASKINPUT, Payload: task.Payload, TenantId: task.TenantID.String(), @@ -493,7 +495,7 @@ func (m *sharedRepository) processEventMatches(ctx context.Context, tx sqlcv1.DB } } - payloads, err := m.payloadStore.BulkRetrieve(ctx, retrievePayloadOpts...) + payloads, err := m.payloadStore.Retrieve(ctx, retrievePayloadOpts...) if err != nil { return nil, fmt.Errorf("failed to retrieve dag input payloads: %w", err) diff --git a/pkg/repository/v1/olap.go b/pkg/repository/v1/olap.go index 500e6b4b2..7646554fe 100644 --- a/pkg/repository/v1/olap.go +++ b/pkg/repository/v1/olap.go @@ -101,7 +101,7 @@ type WorkflowRunData struct { Input []byte `json:"input"` InsertedAt pgtype.Timestamptz `json:"inserted_at"` Kind sqlcv1.V1RunKind `json:"kind"` - Output *[]byte `json:"output,omitempty"` + Output []byte `json:"output,omitempty"` ParentTaskExternalId *pgtype.UUID `json:"parent_task_external_id,omitempty"` ReadableStatus sqlcv1.V1ReadableStatusOlap `json:"readable_status"` StepId *pgtype.UUID `json:"step_id,omitempty"` @@ -203,18 +203,30 @@ type UpdateDAGStatusRow struct { WorkflowId pgtype.UUID } +type TaskWithPayloads struct { + *sqlcv1.PopulateTaskRunDataRow + InputPayload []byte + OutputPayload []byte + NumSpawnedChildren int64 +} + +type TaskEventWithPayloads struct { + *sqlcv1.ListTaskEventsForWorkflowRunRow + OutputPayload []byte +} + type OLAPRepository interface { UpdateTablePartitions(ctx context.Context) error SetReadReplicaPool(pool *pgxpool.Pool) ReadTaskRun(ctx context.Context, taskExternalId string) (*sqlcv1.V1TasksOlap, error) ReadWorkflowRun(ctx context.Context, workflowRunExternalId pgtype.UUID) (*V1WorkflowRunPopulator, error) - ReadTaskRunData(ctx context.Context, tenantId pgtype.UUID, taskId int64, taskInsertedAt pgtype.Timestamptz, retryCount *int) (*sqlcv1.PopulateSingleTaskRunDataRow, pgtype.UUID, error) + ReadTaskRunData(ctx context.Context, tenantId pgtype.UUID, taskId int64, taskInsertedAt pgtype.Timestamptz, retryCount *int) (*TaskWithPayloads, pgtype.UUID, error) - ListTasks(ctx context.Context, tenantId string, opts ListTaskRunOpts) ([]*sqlcv1.PopulateTaskRunDataRow, int, error) + ListTasks(ctx context.Context, tenantId string, opts ListTaskRunOpts) ([]*TaskWithPayloads, int, error) ListWorkflowRuns(ctx context.Context, tenantId string, opts ListWorkflowRunOpts) ([]*WorkflowRunData, int, error) ListTaskRunEvents(ctx context.Context, tenantId string, taskId int64, taskInsertedAt pgtype.Timestamptz, limit, offset int64) ([]*sqlcv1.ListTaskEventsRow, error) - ListTaskRunEventsByWorkflowRunId(ctx context.Context, tenantId string, workflowRunId pgtype.UUID) ([]*sqlcv1.ListTaskEventsForWorkflowRunRow, error) + ListTaskRunEventsByWorkflowRunId(ctx context.Context, tenantId string, workflowRunId pgtype.UUID) ([]*TaskEventWithPayloads, error) 
ListWorkflowRunDisplayNames(ctx context.Context, tenantId pgtype.UUID, externalIds []pgtype.UUID) ([]*sqlcv1.ListWorkflowRunDisplayNamesRow, error) ReadTaskRunMetrics(ctx context.Context, tenantId string, opts ReadTaskRunMetricsOpts) ([]TaskRunMetric, error) CreateTasks(ctx context.Context, tenantId string, tasks []*V1TaskWithPayload) error @@ -224,8 +236,8 @@ type OLAPRepository interface { UpdateTaskStatuses(ctx context.Context, tenantIds []string) (bool, []UpdateTaskStatusRow, error) UpdateDAGStatuses(ctx context.Context, tenantIds []string) (bool, []UpdateDAGStatusRow, error) ReadDAG(ctx context.Context, dagExternalId string) (*sqlcv1.V1DagsOlap, error) - ListTasksByDAGId(ctx context.Context, tenantId string, dagIds []pgtype.UUID, includePayloads bool) ([]*sqlcv1.PopulateTaskRunDataRow, map[int64]uuid.UUID, error) - ListTasksByIdAndInsertedAt(ctx context.Context, tenantId string, taskMetadata []TaskMetadata) ([]*sqlcv1.PopulateTaskRunDataRow, error) + ListTasksByDAGId(ctx context.Context, tenantId string, dagIds []pgtype.UUID, includePayloads bool) ([]*TaskWithPayloads, map[int64]uuid.UUID, error) + ListTasksByIdAndInsertedAt(ctx context.Context, tenantId string, taskMetadata []TaskMetadata, includePayloads bool) ([]*TaskWithPayloads, error) // ListTasksByExternalIds returns a list of tasks based on their external ids or the external id of their parent DAG. // In the case of a DAG, we flatten the result into the list of tasks which belong to that DAG. @@ -233,7 +245,7 @@ type OLAPRepository interface { GetTaskTimings(ctx context.Context, tenantId string, workflowRunId pgtype.UUID, depth int32) ([]*sqlcv1.PopulateTaskRunDataRow, map[string]int32, error) BulkCreateEventsAndTriggers(ctx context.Context, events sqlcv1.BulkCreateEventsParams, triggers []EventTriggersFromExternalId) error - ListEvents(ctx context.Context, opts sqlcv1.ListEventsParams) ([]*ListEventsRow, *int64, error) + ListEvents(ctx context.Context, opts sqlcv1.ListEventsParams) ([]*EventWithPayload, *int64, error) GetEvent(ctx context.Context, externalId string) (*sqlcv1.V1EventsOlap, error) ListEventKeys(ctx context.Context, tenantId string) ([]string, error) @@ -242,8 +254,14 @@ type OLAPRepository interface { CreateIncomingWebhookValidationFailureLogs(ctx context.Context, tenantId string, opts []CreateIncomingWebhookFailureLogOpts) error StoreCELEvaluationFailures(ctx context.Context, tenantId string, failures []CELEvaluationFailure) error + PutPayloads(ctx context.Context, tx sqlcv1.DBTX, tenantId string, payloads []StoreOLAPPayloadOpts) error + ReadPayload(ctx context.Context, tenantId string, externalId pgtype.UUID) ([]byte, error) + ReadPayloads(ctx context.Context, tenantId string, externalIds ...pgtype.UUID) (map[pgtype.UUID][]byte, error) AnalyzeOLAPTables(ctx context.Context) error + OffloadPayloads(ctx context.Context, tenantId string, payloads []OffloadPayloadOpts) error + + PayloadStore() PayloadStoreRepository ListWorkflowRunExternalIds(ctx context.Context, tenantId string, opts ListWorkflowRunOpts) ([]pgtype.UUID, error) } @@ -382,6 +400,10 @@ func (r *OLAPRepositoryImpl) SetReadReplicaPool(pool *pgxpool.Pool) { r.readPool = pool } +func (r *OLAPRepositoryImpl) PayloadStore() PayloadStoreRepository { + return r.payloadStore +} + func StringToReadableStatus(status string) ReadableTaskStatus { switch status { case "QUEUED": @@ -463,6 +485,12 @@ func (r *OLAPRepositoryImpl) ReadWorkflowRun(ctx context.Context, workflowRunExt return nil, err } + inputPayload, err := r.ReadPayload(ctx, row.TenantID.String(), 
row.ExternalID) + + if err != nil { + return nil, err + } + return &V1WorkflowRunPopulator{ WorkflowRun: &WorkflowRunData{ TenantID: row.TenantID, @@ -478,14 +506,14 @@ func (r *OLAPRepositoryImpl) ReadWorkflowRun(ctx context.Context, workflowRunExt FinishedAt: row.FinishedAt, ErrorMessage: row.ErrorMessage.String, WorkflowVersionId: row.WorkflowVersionID, - Input: row.Input, + Input: inputPayload, ParentTaskExternalId: &row.ParentTaskExternalID, }, TaskMetadata: taskMetadata, }, nil } -func (r *OLAPRepositoryImpl) ReadTaskRunData(ctx context.Context, tenantId pgtype.UUID, taskId int64, taskInsertedAt pgtype.Timestamptz, retryCount *int) (*sqlcv1.PopulateSingleTaskRunDataRow, pgtype.UUID, error) { +func (r *OLAPRepositoryImpl) ReadTaskRunData(ctx context.Context, tenantId pgtype.UUID, taskId int64, taskInsertedAt pgtype.Timestamptz, retryCount *int) (*TaskWithPayloads, pgtype.UUID, error) { emptyUUID := pgtype.UUID{} params := sqlcv1.PopulateSingleTaskRunDataParams{ @@ -504,7 +532,7 @@ func (r *OLAPRepositoryImpl) ReadTaskRunData(ctx context.Context, tenantId pgtyp return nil, emptyUUID, err } - workflowRunId := pgtype.UUID{} + var workflowRunId pgtype.UUID if taskRun.DagID.Valid { dagId := taskRun.DagID.Int64 @@ -522,10 +550,60 @@ func (r *OLAPRepositoryImpl) ReadTaskRunData(ctx context.Context, tenantId pgtyp workflowRunId = taskRun.ExternalID } - return taskRun, workflowRunId, nil + payloads, err := r.ReadPayloads(ctx, tenantId.String(), workflowRunId, taskRun.OutputEventExternalID) + + if err != nil { + return nil, emptyUUID, err + } + + input, exists := payloads[workflowRunId] + + if !exists { + r.l.Error().Msgf("ReadTaskRunData: task with external_id %s and inserted_at %s has empty payload, falling back to input", workflowRunId, taskRun.InsertedAt.Time) + input = taskRun.Input + } + + output, exists := payloads[taskRun.OutputEventExternalID] + + if !exists { + r.l.Error().Msgf("ReadTaskRunData: task with external_id %s and inserted_at %s has empty payload, falling back to output (lookup key: %s)", taskRun.ExternalID, taskRun.InsertedAt.Time, taskRun.OutputEventExternalID) + output = taskRun.Output + } + + return &TaskWithPayloads{ + &sqlcv1.PopulateTaskRunDataRow{ + TenantID: taskRun.TenantID, + ID: taskRun.ID, + InsertedAt: taskRun.InsertedAt, + ExternalID: taskRun.ExternalID, + Queue: taskRun.Queue, + ActionID: taskRun.ActionID, + StepID: taskRun.StepID, + WorkflowID: taskRun.WorkflowID, + WorkflowVersionID: taskRun.WorkflowVersionID, + ScheduleTimeout: taskRun.ScheduleTimeout, + StepTimeout: taskRun.StepTimeout, + Priority: taskRun.Priority, + Sticky: taskRun.Sticky, + DisplayName: taskRun.DisplayName, + AdditionalMetadata: taskRun.AdditionalMetadata, + ParentTaskExternalID: taskRun.ParentTaskExternalID, + Status: taskRun.Status, + WorkflowRunID: workflowRunId, + FinishedAt: taskRun.FinishedAt, + StartedAt: taskRun.StartedAt, + QueuedAt: taskRun.QueuedAt, + ErrorMessage: taskRun.ErrorMessage.String, + RetryCount: taskRun.RetryCount, + OutputEventExternalID: taskRun.OutputEventExternalID, + }, + input, + output, + taskRun.SpawnedChildren.Int64, + }, workflowRunId, nil } -func (r *OLAPRepositoryImpl) ListTasks(ctx context.Context, tenantId string, opts ListTaskRunOpts) ([]*sqlcv1.PopulateTaskRunDataRow, int, error) { +func (r *OLAPRepositoryImpl) ListTasks(ctx context.Context, tenantId string, opts ListTaskRunOpts) ([]*TaskWithPayloads, int, error) { ctx, span := telemetry.NewSpan(ctx, "list-tasks-olap") defer span.End() @@ -629,6 +707,51 @@ func (r *OLAPRepositoryImpl) 
ListTasks(ctx context.Context, tenantId string, opt return nil, 0, err } + payloads := make(map[pgtype.UUID][]byte) + + if opts.IncludePayloads { + externalIds := make([]pgtype.UUID, 0) + for _, task := range tasksWithData { + externalIds = append(externalIds, task.ExternalID) + externalIds = append(externalIds, task.OutputEventExternalID) + } + + payloads, err = r.ReadPayloads(ctx, tenantId, externalIds...) + + if err != nil { + return nil, 0, err + } + } + + result := make([]*TaskWithPayloads, 0, len(tasksWithData)) + + for _, task := range tasksWithData { + input, exists := payloads[task.ExternalID] + + if !exists { + if opts.IncludePayloads && task.ExternalID.Valid { + r.l.Error().Msgf("ListTasks: task with external_id %s and inserted_at %s has empty payload, falling back to input", task.ExternalID, task.InsertedAt.Time) + } + input = task.Input + } + + output, exists := payloads[task.OutputEventExternalID] + + if !exists { + if opts.IncludePayloads && task.OutputEventExternalID.Valid { + r.l.Error().Msgf("ListTasks: task with external_id %s and inserted_at %s has empty payload, falling back to output (lookup key: %s)", task.ExternalID, task.InsertedAt.Time, task.OutputEventExternalID) + } + output = task.Output + } + + result = append(result, &TaskWithPayloads{ + task, + input, + output, + int64(0), + }) + } + count, err := r.queries.CountTasks(ctx, tx, countParams) if err != nil { @@ -639,10 +762,10 @@ func (r *OLAPRepositoryImpl) ListTasks(ctx context.Context, tenantId string, opt return nil, 0, err } - return tasksWithData, int(count), nil + return result, int(count), nil } -func (r *OLAPRepositoryImpl) ListTasksByDAGId(ctx context.Context, tenantId string, dagids []pgtype.UUID, includePayloads bool) ([]*sqlcv1.PopulateTaskRunDataRow, map[int64]uuid.UUID, error) { +func (r *OLAPRepositoryImpl) ListTasksByDAGId(ctx context.Context, tenantId string, dagids []pgtype.UUID, includePayloads bool) ([]*TaskWithPayloads, map[int64]uuid.UUID, error) { ctx, span := telemetry.NewSpan(ctx, "list-tasks-by-dag-id-olap") defer span.End() @@ -680,14 +803,60 @@ func (r *OLAPRepositoryImpl) ListTasksByDAGId(ctx context.Context, tenantId stri return nil, taskIdToDagExternalId, err } + payloads := make(map[pgtype.UUID][]byte) + + if includePayloads { + externalIds := make([]pgtype.UUID, 0) + for _, task := range tasksWithData { + externalIds = append(externalIds, task.ExternalID) + externalIds = append(externalIds, task.OutputEventExternalID) + } + + payloads, err = r.ReadPayloads(ctx, tenantId, externalIds...) 
+ + if err != nil { + return nil, taskIdToDagExternalId, err + } + } + + result := make([]*TaskWithPayloads, 0, len(tasksWithData)) + + for _, task := range tasksWithData { + input, exists := payloads[task.ExternalID] + + if !exists { + if includePayloads && task.ExternalID.Valid { + r.l.Error().Msgf("ListTasksByDAGId: task with external_id %s and inserted_at %s has empty payload, falling back to input", task.ExternalID, task.InsertedAt.Time) + } + input = task.Input + } + + output, exists := payloads[task.OutputEventExternalID] + + if !exists { + if includePayloads && task.OutputEventExternalID.Valid { + r.l.Error().Msgf("ListTasksByDAGId: task with external_id %s and inserted_at %s has empty payload, falling back to output (lookup key: %s)", task.ExternalID, task.InsertedAt.Time, task.OutputEventExternalID) + } + + output = task.Output + } + + result = append(result, &TaskWithPayloads{ + task, + input, + output, + int64(0), + }) + } + if err := commit(ctx); err != nil { return nil, taskIdToDagExternalId, err } - return tasksWithData, taskIdToDagExternalId, nil + return result, taskIdToDagExternalId, nil } -func (r *OLAPRepositoryImpl) ListTasksByIdAndInsertedAt(ctx context.Context, tenantId string, taskMetadata []TaskMetadata) ([]*sqlcv1.PopulateTaskRunDataRow, error) { +func (r *OLAPRepositoryImpl) ListTasksByIdAndInsertedAt(ctx context.Context, tenantId string, taskMetadata []TaskMetadata, includePayloads bool) ([]*TaskWithPayloads, error) { ctx, span := telemetry.NewSpan(ctx, "list-tasks-by-id-and-inserted-at-olap") defer span.End() @@ -708,17 +877,62 @@ func (r *OLAPRepositoryImpl) ListTasksByIdAndInsertedAt(ctx context.Context, ten }) } - tasksWithData, err := r.populateTaskRunData(ctx, tx, tenantId, idsInsertedAts, true) + tasksWithData, err := r.populateTaskRunData(ctx, tx, tenantId, idsInsertedAts, includePayloads) if err != nil { return nil, err } + payloads := make(map[pgtype.UUID][]byte) + + if includePayloads { + externalIds := make([]pgtype.UUID, 0) + for _, task := range tasksWithData { + externalIds = append(externalIds, task.ExternalID) + externalIds = append(externalIds, task.OutputEventExternalID) + } + + payloads, err = r.ReadPayloads(ctx, tenantId, externalIds...) 
+ + if err != nil { + return nil, err + } + } + + result := make([]*TaskWithPayloads, 0, len(tasksWithData)) + + for _, task := range tasksWithData { + input, exists := payloads[task.ExternalID] + + if !exists { + if includePayloads && task.ExternalID.Valid { + r.l.Error().Msgf("ListTasksByIdAndInsertedAt-1: task with external_id %s and inserted_at %s has empty payload, falling back to input", task.ExternalID, task.InsertedAt.Time) + } + input = task.Input + } + + output, exists := payloads[task.OutputEventExternalID] + + if !exists { + if includePayloads && task.OutputEventExternalID.Valid { + r.l.Error().Msgf("ListTasksByIdAndInsertedAt-2: task with external_id %s and inserted_at %s has empty payload, falling back to output (lookup key: %s)", task.ExternalID, task.InsertedAt.Time, task.OutputEventExternalID) + } + output = task.Output + } + + result = append(result, &TaskWithPayloads{ + task, + input, + output, + int64(0), + }) + } + if err := commit(ctx); err != nil { return nil, err } - return tasksWithData, nil + return result, nil } func (r *OLAPRepositoryImpl) ListWorkflowRuns(ctx context.Context, tenantId string, opts ListWorkflowRunOpts) ([]*WorkflowRunData, int, error) { @@ -808,6 +1022,7 @@ func (r *OLAPRepositoryImpl) ListWorkflowRuns(ctx context.Context, tenantId stri dagIdsInsertedAts := make([]IdInsertedAt, 0, len(workflowRunIds)) idsInsertedAts := make([]IdInsertedAt, 0, len(workflowRunIds)) + externalIdsForPayloads := make([]pgtype.UUID, 0) for _, row := range workflowRunIds { if row.Kind == sqlcv1.V1RunKindDAG { @@ -843,6 +1058,8 @@ func (r *OLAPRepositoryImpl) ListWorkflowRuns(ctx context.Context, tenantId stri } dagsToPopulated[sqlchelpers.UUIDToStr(dag.ExternalID)] = dag + externalIdsForPayloads = append(externalIdsForPayloads, dag.ExternalID) + externalIdsForPayloads = append(externalIdsForPayloads, dag.OutputEventExternalID) } count, err := r.queries.CountWorkflowRuns(ctx, tx, countParams) @@ -863,12 +1080,25 @@ func (r *OLAPRepositoryImpl) ListWorkflowRuns(ctx context.Context, tenantId stri for _, task := range populatedTasks { externalId := sqlchelpers.UUIDToStr(task.ExternalID) tasksToPopulated[externalId] = task + + externalIdsForPayloads = append(externalIdsForPayloads, task.ExternalID) + externalIdsForPayloads = append(externalIdsForPayloads, task.OutputEventExternalID) } if err := commit(ctx); err != nil { return nil, 0, err } + externalIdToPayload := make(map[pgtype.UUID][]byte) + + if opts.IncludePayloads { + externalIdToPayload, err = r.ReadPayloads(ctx, tenantId, externalIdsForPayloads...) 
+ + if err != nil { + return nil, 0, err + } + } + res := make([]*WorkflowRunData, 0) for _, row := range workflowRunIds { @@ -882,6 +1112,23 @@ func (r *OLAPRepositoryImpl) ListWorkflowRuns(ctx context.Context, tenantId stri continue } + outputPayload, exists := externalIdToPayload[dag.OutputEventExternalID] + + if !exists { + if opts.IncludePayloads && dag.OutputEventExternalID.Valid { + r.l.Error().Msgf("ListWorkflowRuns-1: dag with external_id %s and inserted_at %s has empty payload, falling back to output", dag.ExternalID, dag.InsertedAt.Time) + } + outputPayload = dag.Output + } + + inputPayload, exists := externalIdToPayload[dag.ExternalID] + if !exists { + if opts.IncludePayloads && dag.ExternalID.Valid { + r.l.Error().Msgf("ListWorkflowRuns-2: dag with external_id %s and inserted_at %s has empty payload, falling back to input", dag.ExternalID, dag.InsertedAt.Time) + } + inputPayload = dag.Input + } + // TODO !IMPORTANT: verify this is correct retryCount := int(dag.RetryCount) @@ -902,8 +1149,8 @@ func (r *OLAPRepositoryImpl) ListWorkflowRuns(ctx context.Context, tenantId stri TaskExternalId: nil, TaskId: nil, TaskInsertedAt: nil, - Output: &dag.Output, - Input: dag.Input, + Output: outputPayload, + Input: inputPayload, ParentTaskExternalId: &dag.ParentTaskExternalID, RetryCount: &retryCount, }) @@ -917,6 +1164,24 @@ func (r *OLAPRepositoryImpl) ListWorkflowRuns(ctx context.Context, tenantId stri retryCount := int(task.RetryCount) + outputPayload, exists := externalIdToPayload[task.OutputEventExternalID] + + if !exists { + if opts.IncludePayloads && task.OutputEventExternalID.Valid { + r.l.Error().Msgf("ListWorkflowRuns-3: task with external_id %s and inserted_at %s has empty payload, falling back to output", task.ExternalID, task.InsertedAt.Time) + } + outputPayload = task.Output + } + + inputPayload, exists := externalIdToPayload[task.ExternalID] + + if !exists { + if opts.IncludePayloads && task.ExternalID.Valid { + r.l.Error().Msgf("ListWorkflowRuns-4: task with external_id %s and inserted_at %s has empty payload, falling back to input", task.ExternalID, task.InsertedAt.Time) + } + inputPayload = task.Input + } + res = append(res, &WorkflowRunData{ TenantID: task.TenantID, InsertedAt: task.InsertedAt, @@ -934,8 +1199,8 @@ func (r *OLAPRepositoryImpl) ListWorkflowRuns(ctx context.Context, tenantId stri TaskExternalId: &task.ExternalID, TaskId: &task.ID, TaskInsertedAt: &task.InsertedAt, - Output: &task.Output, - Input: task.Input, + Output: outputPayload, + Input: inputPayload, StepId: &task.StepID, RetryCount: &retryCount, }) @@ -1028,7 +1293,7 @@ func (r *OLAPRepositoryImpl) ListTaskRunEvents(ctx context.Context, tenantId str return rows, nil } -func (r *OLAPRepositoryImpl) ListTaskRunEventsByWorkflowRunId(ctx context.Context, tenantId string, workflowRunId pgtype.UUID) ([]*sqlcv1.ListTaskEventsForWorkflowRunRow, error) { +func (r *OLAPRepositoryImpl) ListTaskRunEventsByWorkflowRunId(ctx context.Context, tenantId string, workflowRunId pgtype.UUID) ([]*TaskEventWithPayloads, error) { rows, err := r.queries.ListTaskEventsForWorkflowRun(ctx, r.readPool, sqlcv1.ListTaskEventsForWorkflowRunParams{ Tenantid: sqlchelpers.UUIDFromStr(tenantId), Workflowrunid: workflowRunId, @@ -1038,7 +1303,34 @@ func (r *OLAPRepositoryImpl) ListTaskRunEventsByWorkflowRunId(ctx context.Contex return nil, err } - return rows, nil + externalIds := make([]pgtype.UUID, len(rows)) + + for i, row := range rows { + externalIds[i] = row.EventExternalID + } + + payloads, err := r.ReadPayloads(ctx, tenantId, 
externalIds...) + + if err != nil { + return nil, err + } + + taskEventWithPayloads := make([]*TaskEventWithPayloads, 0, len(rows)) + + for _, row := range rows { + payload, exists := payloads[row.EventExternalID] + if !exists { + r.l.Error().Msgf("ListTaskRunEventsByWorkflowRunId: event with external_id %s and task_inserted_at %s has empty payload, falling back to output", row.EventExternalID, row.TaskInsertedAt.Time) + payload = row.Output + } + + taskEventWithPayloads = append(taskEventWithPayloads, &TaskEventWithPayloads{ + row, + payload, + }) + } + + return taskEventWithPayloads, nil } func (r *OLAPRepositoryImpl) ReadTaskRunMetrics(ctx context.Context, tenantId string, opts ReadTaskRunMetricsOpts) ([]TaskRunMetric, error) { @@ -1139,6 +1431,7 @@ func (r *OLAPRepositoryImpl) writeTaskEventBatch(ctx context.Context, tenantId s // skip any events which have a corresponding event already eventsToWrite := make([]sqlcv1.CreateTaskEventsOLAPParams, 0) tmpEventsToWrite := make([]sqlcv1.CreateTaskEventsOLAPTmpParams, 0) + payloadsToWrite := make([]StoreOLAPPayloadOpts, 0) for _, event := range events { key := getCacheKey(event) @@ -1156,6 +1449,18 @@ func (r *OLAPRepositoryImpl) writeTaskEventBatch(ctx context.Context, tenantId s WorkerID: event.WorkerID, }) } + + if event.ExternalID.Valid { + payloadsToWrite = append(payloadsToWrite, StoreOLAPPayloadOpts{ + ExternalId: event.ExternalID, + InsertedAt: event.TaskInsertedAt, + Payload: event.Output, + }) + } + + if !r.payloadStore.OLAPDualWritesEnabled() { + event.Output = []byte("{}") + } } if len(eventsToWrite) == 0 { @@ -1182,6 +1487,12 @@ func (r *OLAPRepositoryImpl) writeTaskEventBatch(ctx context.Context, tenantId s return err } + err = r.PutPayloads(ctx, tx, tenantId, payloadsToWrite) + + if err != nil { + return err + } + if err := commit(ctx); err != nil { return err } @@ -1369,6 +1680,7 @@ func (r *OLAPRepositoryImpl) UpdateDAGStatuses(ctx context.Context, tenantIds [] func (r *OLAPRepositoryImpl) writeTaskBatch(ctx context.Context, tenantId string, tasks []*V1TaskWithPayload) error { params := make([]sqlcv1.CreateTasksOLAPParams, 0) + putPayloadOpts := make([]StoreOLAPPayloadOpts, 0) for _, task := range tasks { payload := task.Payload @@ -1380,6 +1692,12 @@ func (r *OLAPRepositoryImpl) writeTaskBatch(ctx context.Context, tenantId string payload = task.Input } + // todo: remove this when we remove dual writes + payloadToWriteToTask := payload + if !r.payloadStore.OLAPDualWritesEnabled() { + payloadToWriteToTask = []byte("{}") + } + params = append(params, sqlcv1.CreateTasksOLAPParams{ TenantID: task.TenantID, ID: task.ID, @@ -1401,7 +1719,13 @@ func (r *OLAPRepositoryImpl) writeTaskBatch(ctx context.Context, tenantId string DagInsertedAt: task.DagInsertedAt, ParentTaskExternalID: task.ParentTaskExternalID, WorkflowRunID: task.WorkflowRunID, - Input: payload, + Input: payloadToWriteToTask, + }) + + putPayloadOpts = append(putPayloadOpts, StoreOLAPPayloadOpts{ + ExternalId: task.ExternalID, + InsertedAt: task.InsertedAt, + Payload: payload, }) } @@ -1416,6 +1740,12 @@ func (r *OLAPRepositoryImpl) writeTaskBatch(ctx context.Context, tenantId string return err } + err = r.PutPayloads(ctx, tx, tenantId, putPayloadOpts) + + if err != nil { + return err + } + if err := commit(ctx); err != nil { return err } @@ -1425,6 +1755,7 @@ func (r *OLAPRepositoryImpl) writeTaskBatch(ctx context.Context, tenantId string func (r *OLAPRepositoryImpl) writeDAGBatch(ctx context.Context, tenantId string, dags []*DAGWithData) error { params := 
make([]sqlcv1.CreateDAGsOLAPParams, 0) + putPayloadOpts := make([]StoreOLAPPayloadOpts, 0) for _, dag := range dags { var parentTaskExternalID = pgtype.UUID{} @@ -1432,6 +1763,12 @@ func (r *OLAPRepositoryImpl) writeDAGBatch(ctx context.Context, tenantId string, parentTaskExternalID = *dag.ParentTaskExternalID } + // todo: remove this when we remove dual writes + input := dag.Input + if !r.payloadStore.OLAPDualWritesEnabled() { + input = []byte("{}") + } + params = append(params, sqlcv1.CreateDAGsOLAPParams{ TenantID: dag.TenantID, ID: dag.ID, @@ -1443,7 +1780,13 @@ func (r *OLAPRepositoryImpl) writeDAGBatch(ctx context.Context, tenantId string, AdditionalMetadata: dag.AdditionalMetadata, ParentTaskExternalID: parentTaskExternalID, TotalTasks: int32(dag.TotalTasks), // nolint: gosec - Input: dag.Input, + Input: input, + }) + + putPayloadOpts = append(putPayloadOpts, StoreOLAPPayloadOpts{ + ExternalId: dag.ExternalID, + InsertedAt: dag.InsertedAt, + Payload: dag.Input, }) } @@ -1458,6 +1801,12 @@ func (r *OLAPRepositoryImpl) writeDAGBatch(ctx context.Context, tenantId string, return err } + err = r.PutPayloads(ctx, tx, tenantId, putPayloadOpts) + + if err != nil { + return err + } + if err := commit(ctx); err != nil { return err } @@ -1627,12 +1976,48 @@ func (r *OLAPRepositoryImpl) BulkCreateEventsAndTriggers(ctx context.Context, ev defer rollback() - insertedEvents, err := r.queries.BulkCreateEvents(ctx, tx, events) + // todo: remove this when we remove dual writes + eventsToInsert := events + + if !r.payloadStore.OLAPDualWritesEnabled() { + payloads := make([][]byte, len(eventsToInsert.Payloads)) + + for i := range eventsToInsert.Payloads { + payloads[i] = []byte("{}") + } + + eventsToInsert.Payloads = payloads + } + + insertedEvents, err := r.queries.BulkCreateEvents(ctx, tx, eventsToInsert) if err != nil { return fmt.Errorf("error creating events: %v", err) } + tenantIdToPutPayloadOpts := make(map[string][]StoreOLAPPayloadOpts) + + for _, event := range insertedEvents { + if event == nil { + continue + } + + tenantIdToPutPayloadOpts[event.TenantID.String()] = append(tenantIdToPutPayloadOpts[event.TenantID.String()], StoreOLAPPayloadOpts{ + ExternalId: event.ExternalID, + InsertedAt: event.SeenAt, + Payload: event.Payload, + }) + + } + + for tenantId, putPayloadOpts := range tenantIdToPutPayloadOpts { + err = r.PutPayloads(ctx, tx, tenantId, putPayloadOpts) + + if err != nil { + return fmt.Errorf("error putting event payloads: %v", err) + } + } + eventExternalIdToId := make(map[pgtype.UUID]int64) for _, event := range insertedEvents { @@ -1667,6 +2052,70 @@ func (r *OLAPRepositoryImpl) BulkCreateEventsAndTriggers(ctx context.Context, ev return fmt.Errorf("error committing transaction: %v", err) } + if !r.payloadStore.ExternalStoreEnabled() { + return nil + } + + offloadToExternalOpts := make([]OffloadToExternalStoreOpts, 0) + idInsertedAtToExternalId := make(map[IdInsertedAt]pgtype.UUID) + + for _, event := range insertedEvents { + id := event.ID + insertedAt := event.SeenAt + idInsertedAtToExternalId[IdInsertedAt{ + ID: id, + InsertedAt: insertedAt, + }] = event.ExternalID + + offloadToExternalOpts = append(offloadToExternalOpts, OffloadToExternalStoreOpts{ + StorePayloadOpts: &StorePayloadOpts{ + Id: event.ID, + InsertedAt: event.SeenAt, + ExternalId: event.ExternalID, + Type: sqlcv1.V1PayloadTypeTASKINPUT, + Payload: event.Payload, + TenantId: event.TenantID.String(), + }, + OffloadAt: time.Now(), + }) + } + + if len(offloadToExternalOpts) == 0 { + return nil + } + + 
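+ // The events have already been committed above, so the external-store writes below run outside
+ // that transaction; the location keys returned by Store are then used to flip the corresponding
+ // v1_payloads_olap rows from INLINE to EXTERNAL via OffloadPayloads.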
retrieveOptsToKey, err := r.PayloadStore().ExternalStore().Store(ctx, offloadToExternalOpts...) + + if err != nil { + return err + } + + tenantIdToOffloadOpts := make(map[string][]OffloadPayloadOpts) + + for opt, key := range retrieveOptsToKey { + externalId := idInsertedAtToExternalId[IdInsertedAt{ + ID: opt.Id, + InsertedAt: opt.InsertedAt, + }] + + tenantIdToOffloadOpts[opt.TenantId.String()] = append(tenantIdToOffloadOpts[opt.TenantId.String()], OffloadPayloadOpts{ + ExternalId: externalId, + ExternalLocationKey: string(key), + }) + } + + for tenantId, opts := range tenantIdToOffloadOpts { + err = r.OffloadPayloads(ctx, tenantId, opts) + + if err != nil { + return fmt.Errorf("error offloading payloads: %v", err) + } + } + return nil } @@ -1692,7 +2141,12 @@ type ListEventsRow struct { TriggeringWebhookName *string `json:"triggering_webhook_name,omitempty"` } -func (r *OLAPRepositoryImpl) ListEvents(ctx context.Context, opts sqlcv1.ListEventsParams) ([]*ListEventsRow, *int64, error) { +type EventWithPayload struct { + *ListEventsRow + Payload []byte `json:"payload"` +} + +func (r *OLAPRepositoryImpl) ListEvents(ctx context.Context, opts sqlcv1.ListEventsParams) ([]*EventWithPayload, *int64, error) { events, err := r.queries.ListEvents(ctx, r.readPool, opts) if err != nil { @@ -1736,52 +2190,71 @@ func (r *OLAPRepositoryImpl) ListEvents(ctx context.Context, opts sqlcv1.ListEve externalIdToEventData[data.ExternalID] = append(externalIdToEventData[data.ExternalID], data) } - result := make([]*ListEventsRow, 0) + externalIdToPayload, err := r.ReadPayloads(ctx, opts.Tenantid.String(), eventExternalIds...) + + if err != nil { + return nil, nil, fmt.Errorf("error reading event payloads: %v", err) + } + + result := make([]*EventWithPayload, 0) for _, event := range events { - data, exists := externalIdToEventData[event.ExternalID] + payload, exists := externalIdToPayload[event.ExternalID] + + if !exists { + r.l.Error().Msgf("ListEvents: payload for event %s not found", sqlchelpers.UUIDToStr(event.ExternalID)) + payload = event.Payload + } + var triggeringWebhookName *string if event.TriggeringWebhookName.Valid { triggeringWebhookName = &event.TriggeringWebhookName.String } - if !exists || len(data) == 0 { + data, exists := externalIdToEventData[event.ExternalID] - result = append(result, &ListEventsRow{ - TenantID: event.TenantID, - EventID: event.ID, - EventExternalID: event.ExternalID, - EventSeenAt: event.SeenAt, - EventKey: event.Key, - EventPayload: event.Payload, - EventAdditionalMetadata: event.AdditionalMetadata, - EventScope: event.Scope.String, - QueuedCount: 0, - RunningCount: 0, - CompletedCount: 0, - CancelledCount: 0, - FailedCount: 0, - TriggeringWebhookName: triggeringWebhookName, - }) - } else { - for _, d := range data { - result = append(result, &ListEventsRow{ TenantID: event.TenantID, EventID: event.ID, EventExternalID: event.ExternalID, EventSeenAt: event.SeenAt, EventKey: event.Key, - EventPayload: event.Payload, + EventPayload: payload, EventAdditionalMetadata: event.AdditionalMetadata, EventScope: event.Scope.String, - QueuedCount: d.QueuedCount, - RunningCount: d.RunningCount, - CompletedCount: d.CompletedCount, - CancelledCount: d.CancelledCount, - FailedCount: d.FailedCount, - TriggeredRuns: d.TriggeredRuns, + QueuedCount: 0, + RunningCount: 0, + CompletedCount: 0, + CancelledCount: 0, + FailedCount: 0,
TriggeringWebhookName: triggeringWebhookName, + }, + Payload: payload, + }) + } else { + for _, d := range data { + result = append(result, &EventWithPayload{ + ListEventsRow: &ListEventsRow{ + TenantID: event.TenantID, + EventID: event.ID, + EventExternalID: event.ExternalID, + EventSeenAt: event.SeenAt, + EventKey: event.Key, + EventPayload: payload, + EventAdditionalMetadata: event.AdditionalMetadata, + EventScope: event.Scope.String, + QueuedCount: d.QueuedCount, + RunningCount: d.RunningCount, + CompletedCount: d.CompletedCount, + CancelledCount: d.CancelledCount, + FailedCount: d.FailedCount, + TriggeredRuns: d.TriggeredRuns, + TriggeringWebhookName: triggeringWebhookName, + }, + Payload: payload, }) } } @@ -1892,6 +2365,127 @@ func (r *OLAPRepositoryImpl) StoreCELEvaluationFailures(ctx context.Context, ten }) } +type OffloadPayloadOpts struct { + ExternalId pgtype.UUID + ExternalLocationKey string +} + +func (r *OLAPRepositoryImpl) PutPayloads(ctx context.Context, tx sqlcv1.DBTX, tenantId string, putPayloadOpts []StoreOLAPPayloadOpts) error { + insertedAts := make([]pgtype.Timestamptz, len(putPayloadOpts)) + tenantIds := make([]pgtype.UUID, len(putPayloadOpts)) + externalIds := make([]pgtype.UUID, len(putPayloadOpts)) + payloads := make([][]byte, len(putPayloadOpts)) + locations := make([]string, len(putPayloadOpts)) + + for i, opt := range putPayloadOpts { + externalIds[i] = opt.ExternalId + insertedAts[i] = opt.InsertedAt + tenantIds[i] = sqlchelpers.UUIDFromStr(tenantId) + payloads[i] = opt.Payload + locations[i] = string(sqlcv1.V1PayloadLocationOlapINLINE) + } + + return r.queries.PutPayloads(ctx, tx, sqlcv1.PutPayloadsParams{ + Externalids: externalIds, + Insertedats: insertedAts, + Tenantids: tenantIds, + Payloads: payloads, + Locations: locations, + }) +} + +func (r *OLAPRepositoryImpl) ReadPayload(ctx context.Context, tenantId string, externalId pgtype.UUID) ([]byte, error) { + payloads, err := r.ReadPayloads(ctx, tenantId, externalId) + + if err != nil { + return nil, err + } + + payload, exists := payloads[externalId] + + if !exists { + r.l.Debug().Msgf("payload for external ID %s not found", sqlchelpers.UUIDToStr(externalId)) + } + + return payload, nil +} + +func (r *OLAPRepositoryImpl) ReadPayloads(ctx context.Context, tenantId string, externalIds ...pgtype.UUID) (map[pgtype.UUID][]byte, error) { + payloads, err := r.queries.ReadPayloadsOLAP(ctx, r.readPool, sqlcv1.ReadPayloadsOLAPParams{ + Tenantid: sqlchelpers.UUIDFromStr(tenantId), + Externalids: externalIds, + }) + + if err != nil { + return nil, err + } + + externalIdToPayload := make(map[pgtype.UUID][]byte) + externalIdToExternalKey := make(map[pgtype.UUID]ExternalPayloadLocationKey) + externalKeys := make([]ExternalPayloadLocationKey, 0) + + for _, payload := range payloads { + if payload.Location == sqlcv1.V1PayloadLocationOlapINLINE { + externalIdToPayload[payload.ExternalID] = payload.InlineContent + } else { + key := ExternalPayloadLocationKey(payload.ExternalLocationKey.String) + + externalIdToExternalKey[payload.ExternalID] = key + externalKeys = append(externalKeys, key) + } + } + + if len(externalKeys) > 0 && r.payloadStore.ExternalStoreEnabled() { + keyToPayload, err := r.payloadStore.RetrieveFromExternal(ctx, externalKeys...) 
+ + if err != nil { + return nil, err + } + + for externalId, externalKey := range externalIdToExternalKey { + externalIdToPayload[externalId] = keyToPayload[externalKey] + } + } + + return externalIdToPayload, nil +} + +func (r *OLAPRepositoryImpl) OffloadPayloads(ctx context.Context, tenantId string, payloads []OffloadPayloadOpts) error { + tx, commit, rollback, err := sqlchelpers.PrepareTx(ctx, r.pool, r.l, 5000) + + if err != nil { + return fmt.Errorf("error beginning transaction: %v", err) + } + + defer rollback() + + tenantIds := make([]pgtype.UUID, len(payloads)) + externalIds := make([]pgtype.UUID, len(payloads)) + externalLocationKeys := make([]string, len(payloads)) + + for i, opt := range payloads { + externalIds[i] = opt.ExternalId + tenantIds[i] = sqlchelpers.UUIDFromStr(tenantId) + externalLocationKeys[i] = opt.ExternalLocationKey + } + + err = r.queries.OffloadPayloads(ctx, tx, sqlcv1.OffloadPayloadsParams{ + Externalids: externalIds, + Tenantids: tenantIds, + Externallocationkeys: externalLocationKeys, + }) + + if err != nil { + return fmt.Errorf("error offloading payloads: %v", err) + } + + if err := commit(ctx); err != nil { + return fmt.Errorf("error committing transaction: %v", err) + } + + return nil +} + func (r *OLAPRepositoryImpl) AnalyzeOLAPTables(ctx context.Context) error { const timeout = 1000 * 60 * 60 // 60 minute timeout tx, commit, rollback, err := sqlchelpers.PrepareTx(ctx, r.pool, r.l, timeout) @@ -1937,6 +2531,12 @@ func (r *OLAPRepositoryImpl) AnalyzeOLAPTables(ctx context.Context) error { return fmt.Errorf("error analyzing v1_dag_to_task_olap: %v", err) } + err = r.queries.AnalyzeV1PayloadsOLAP(ctx, tx) + + if err != nil { + return fmt.Errorf("error analyzing v1_payloads_olap: %v", err) + } + if err := commit(ctx); err != nil { return fmt.Errorf("error committing transaction: %v", err) } @@ -1976,10 +2576,10 @@ func (r *OLAPRepositoryImpl) populateTaskRunData(ctx context.Context, tx pgx.Tx, for idInsertedAt := range uniqueTaskIdInsertedAts { taskData, err := r.queries.PopulateTaskRunData(ctx, tx, sqlcv1.PopulateTaskRunDataParams{ - Includepayloads: includePayloads, Taskid: idInsertedAt.ID, Taskinsertedat: idInsertedAt.InsertedAt, Tenantid: sqlchelpers.UUIDFromStr(tenantId), + Includepayloads: includePayloads, }) if err != nil && !errors.Is(err, pgx.ErrNoRows) { diff --git a/pkg/repository/v1/olappayload.go b/pkg/repository/v1/olappayload.go new file mode 100644 index 000000000..c1ad93a7a --- /dev/null +++ b/pkg/repository/v1/olappayload.go @@ -0,0 +1,27 @@ +package v1 + +import ( + msgqueue "github.com/hatchet-dev/hatchet/internal/msgqueue/v1" + "github.com/jackc/pgx/v5/pgtype" +) + +type OLAPPayloadToOffload struct { + ExternalId pgtype.UUID + ExternalLocationKey string +} + +type OLAPPayloadsToOffload struct { + Payloads []OLAPPayloadToOffload +} + +func OLAPPayloadOffloadMessage(tenantId string, payloads []OLAPPayloadToOffload) (*msgqueue.Message, error) { + return msgqueue.NewTenantMessage( + tenantId, + "offload-payload", + false, + true, + OLAPPayloadsToOffload{ + Payloads: payloads, + }, + ) +} diff --git a/pkg/repository/v1/payloadstore.go b/pkg/repository/v1/payloadstore.go index 298c186ab..f85953479 100644 --- a/pkg/repository/v1/payloadstore.go +++ b/pkg/repository/v1/payloadstore.go @@ -6,6 +6,7 @@ import ( "sort" "time" + msgqueue "github.com/hatchet-dev/hatchet/internal/msgqueue/v1" "github.com/hatchet-dev/hatchet/pkg/repository/postgres/sqlchelpers" "github.com/hatchet-dev/hatchet/pkg/repository/v1/sqlcv1" 
"github.com/hatchet-dev/hatchet/pkg/telemetry" @@ -18,11 +19,18 @@ import ( type StorePayloadOpts struct { Id int64 InsertedAt pgtype.Timestamptz + ExternalId pgtype.UUID Type sqlcv1.V1PayloadType Payload []byte TenantId string } +type StoreOLAPPayloadOpts struct { + ExternalId pgtype.UUID + InsertedAt pgtype.Timestamptz + Payload []byte +} + type OffloadToExternalStoreOpts struct { *StorePayloadOpts OffloadAt time.Time @@ -38,28 +46,27 @@ type RetrievePayloadOpts struct { type PayloadLocation string type ExternalPayloadLocationKey string -type BulkRetrievePayloadOpts struct { - Keys []ExternalPayloadLocationKey - TenantId string -} - type ExternalStore interface { Store(ctx context.Context, payloads ...OffloadToExternalStoreOpts) (map[RetrievePayloadOpts]ExternalPayloadLocationKey, error) - BulkRetrieve(ctx context.Context, opts ...BulkRetrievePayloadOpts) (map[ExternalPayloadLocationKey][]byte, error) + Retrieve(ctx context.Context, keys ...ExternalPayloadLocationKey) (map[ExternalPayloadLocationKey][]byte, error) } type PayloadStoreRepository interface { Store(ctx context.Context, tx sqlcv1.DBTX, payloads ...StorePayloadOpts) error - BulkRetrieve(ctx context.Context, opts ...RetrievePayloadOpts) (map[RetrievePayloadOpts][]byte, error) - ProcessPayloadWAL(ctx context.Context, partitionNumber int64) (bool, error) + Retrieve(ctx context.Context, opts ...RetrievePayloadOpts) (map[RetrievePayloadOpts][]byte, error) + RetrieveFromExternal(ctx context.Context, keys ...ExternalPayloadLocationKey) (map[ExternalPayloadLocationKey][]byte, error) + ProcessPayloadWAL(ctx context.Context, partitionNumber int64, pubBuffer *msgqueue.MQPubBuffer) (bool, error) ProcessPayloadExternalCutovers(ctx context.Context, partitionNumber int64) (bool, error) OverwriteExternalStore(store ExternalStore, inlineStoreTTL time.Duration) DualWritesEnabled() bool TaskEventDualWritesEnabled() bool DagDataDualWritesEnabled() bool + OLAPDualWritesEnabled() bool WALPollLimit() int WALProcessInterval() time.Duration ExternalCutoverProcessInterval() time.Duration + ExternalStoreEnabled() bool + ExternalStore() ExternalStore } type payloadStoreRepositoryImpl struct { @@ -72,6 +79,7 @@ type payloadStoreRepositoryImpl struct { enablePayloadDualWrites bool enableTaskEventPayloadDualWrites bool enableDagDataPayloadDualWrites bool + enableOLAPPayloadDualWrites bool walPollLimit int walProcessInterval time.Duration externalCutoverProcessInterval time.Duration @@ -81,6 +89,7 @@ type PayloadStoreRepositoryOpts struct { EnablePayloadDualWrites bool EnableTaskEventPayloadDualWrites bool EnableDagDataPayloadDualWrites bool + EnableOLAPPayloadDualWrites bool WALPollLimit int WALProcessInterval time.Duration ExternalCutoverProcessInterval time.Duration @@ -103,6 +112,7 @@ func NewPayloadStoreRepository( enablePayloadDualWrites: opts.EnablePayloadDualWrites, enableTaskEventPayloadDualWrites: opts.EnableTaskEventPayloadDualWrites, enableDagDataPayloadDualWrites: opts.EnableDagDataPayloadDualWrites, + enableOLAPPayloadDualWrites: opts.EnableOLAPPayloadDualWrites, walPollLimit: opts.WALPollLimit, walProcessInterval: opts.WALProcessInterval, externalCutoverProcessInterval: opts.ExternalCutoverProcessInterval, @@ -124,6 +134,7 @@ func (p *payloadStoreRepositoryImpl) Store(ctx context.Context, tx sqlcv1.DBTX, offloadAts := make([]pgtype.Timestamptz, 0, len(payloads)) tenantIds := make([]pgtype.UUID, 0, len(payloads)) locations := make([]string, 0, len(payloads)) + externalIds := make([]pgtype.UUID, 0, len(payloads)) seenPayloadUniqueKeys := 
make(map[PayloadUniqueKey]struct{}) @@ -153,6 +164,7 @@ func (p *payloadStoreRepositoryImpl) Store(ctx context.Context, tx sqlcv1.DBTX, tenantIds = append(tenantIds, tenantId) locations = append(locations, string(sqlcv1.V1PayloadLocationINLINE)) inlineContents = append(inlineContents, payload.Payload) + externalIds = append(externalIds, payload.ExternalId) if p.externalStoreEnabled { offloadAts = append(offloadAts, pgtype.Timestamptz{Time: payload.InsertedAt.Time.Add(*p.inlineStoreTTL), Valid: true}) @@ -168,6 +180,7 @@ func (p *payloadStoreRepositoryImpl) Store(ctx context.Context, tx sqlcv1.DBTX, Locations: locations, Tenantids: tenantIds, Inlinecontents: inlineContents, + Externalids: externalIds, }) if err != nil { @@ -192,11 +205,19 @@ func (p *payloadStoreRepositoryImpl) Store(ctx context.Context, tx sqlcv1.DBTX, return err } -func (p *payloadStoreRepositoryImpl) BulkRetrieve(ctx context.Context, opts ...RetrievePayloadOpts) (map[RetrievePayloadOpts][]byte, error) { - return p.bulkRetrieve(ctx, p.pool, opts...) +func (p *payloadStoreRepositoryImpl) Retrieve(ctx context.Context, opts ...RetrievePayloadOpts) (map[RetrievePayloadOpts][]byte, error) { + return p.retrieve(ctx, p.pool, opts...) } -func (p *payloadStoreRepositoryImpl) bulkRetrieve(ctx context.Context, tx sqlcv1.DBTX, opts ...RetrievePayloadOpts) (map[RetrievePayloadOpts][]byte, error) { +func (p *payloadStoreRepositoryImpl) RetrieveFromExternal(ctx context.Context, keys ...ExternalPayloadLocationKey) (map[ExternalPayloadLocationKey][]byte, error) { + if !p.externalStoreEnabled { + return nil, fmt.Errorf("external store not enabled") + } + + return p.externalStore.Retrieve(ctx, keys...) +} + +func (p *payloadStoreRepositoryImpl) retrieve(ctx context.Context, tx sqlcv1.DBTX, opts ...RetrievePayloadOpts) (map[RetrievePayloadOpts][]byte, error) { if len(opts) == 0 { return make(map[RetrievePayloadOpts][]byte), nil } @@ -227,7 +248,7 @@ func (p *payloadStoreRepositoryImpl) bulkRetrieve(ctx context.Context, tx sqlcv1 optsToPayload := make(map[RetrievePayloadOpts][]byte) externalKeysToOpts := make(map[ExternalPayloadLocationKey]RetrievePayloadOpts) - retrievePayloadOpts := make([]BulkRetrievePayloadOpts, 0) + externalKeys := make([]ExternalPayloadLocationKey, 0) for _, payload := range payloads { if payload == nil { @@ -244,17 +265,14 @@ func (p *payloadStoreRepositoryImpl) bulkRetrieve(ctx context.Context, tx sqlcv1 if payload.Location == sqlcv1.V1PayloadLocationEXTERNAL { key := ExternalPayloadLocationKey(payload.ExternalLocationKey.String) externalKeysToOpts[key] = opts - retrievePayloadOpts = append(retrievePayloadOpts, BulkRetrievePayloadOpts{ - Keys: []ExternalPayloadLocationKey{key}, - TenantId: opts.TenantId.String(), - }) + externalKeys = append(externalKeys, key) } else { optsToPayload[opts] = payload.InlineContent } } - if len(retrievePayloadOpts) > 0 { - externalData, err := p.externalStore.BulkRetrieve(ctx, retrievePayloadOpts...) + if len(externalKeys) > 0 { + externalData, err := p.RetrieveFromExternal(ctx, externalKeys...) if err != nil { return nil, fmt.Errorf("failed to retrieve external payloads: %w", err) } @@ -283,7 +301,7 @@ func (p *payloadStoreRepositoryImpl) offloadToExternal(ctx context.Context, payl return p.externalStore.Store(ctx, payloads...) 
} -func (p *payloadStoreRepositoryImpl) ProcessPayloadWAL(ctx context.Context, partitionNumber int64) (bool, error) { +func (p *payloadStoreRepositoryImpl) ProcessPayloadWAL(ctx context.Context, partitionNumber int64, pubBuffer *msgqueue.MQPubBuffer) (bool, error) { // no need to process the WAL if external store is not enabled if !p.externalStoreEnabled { return false, nil @@ -340,7 +358,7 @@ func (p *payloadStoreRepositoryImpl) ProcessPayloadWAL(ctx context.Context, part retrieveOptsToOffloadAt[opts] = record.OffloadAt } - payloads, err := p.bulkRetrieve(ctx, tx, retrieveOpts...) + payloads, err := p.retrieve(ctx, tx, retrieveOpts...) if err != nil { return false, err @@ -441,7 +459,7 @@ func (p *payloadStoreRepositoryImpl) ProcessPayloadWAL(ctx context.Context, part return false, fmt.Errorf("failed to prepare transaction for offloading: %w", err) } - err = p.queries.SetPayloadExternalKeys(ctx, tx, sqlcv1.SetPayloadExternalKeysParams{ + updatedPayloads, err := p.queries.SetPayloadExternalKeys(ctx, tx, sqlcv1.SetPayloadExternalKeysParams{ Ids: ids, Insertedats: insertedAts, Payloadtypes: types, @@ -454,10 +472,35 @@ func (p *payloadStoreRepositoryImpl) ProcessPayloadWAL(ctx context.Context, part return false, err } + tenantIdToPayloads := make(map[string][]OLAPPayloadToOffload) + + for _, updatedPayload := range updatedPayloads { + if updatedPayload == nil || updatedPayload.Type == sqlcv1.V1PayloadTypeTASKEVENTDATA { + continue + } + + tenantIdToPayloads[updatedPayload.TenantID.String()] = append(tenantIdToPayloads[updatedPayload.TenantID.String()], OLAPPayloadToOffload{ + ExternalId: updatedPayload.ExternalID, + ExternalLocationKey: updatedPayload.ExternalLocationKey.String, + }) + } + if err := commit(ctx); err != nil { return false, err } + // todo: make this transactionally safe + // there's no application-level risk here because the worst case if + // we miss an event is we don't mark the payload as external and there's a bit + // of disk bloat, but it'd be good to not need to worry about that + for tenantId, payloads := range tenantIdToPayloads { + msg, err := OLAPPayloadOffloadMessage(tenantId, payloads) + if err != nil { + return false, fmt.Errorf("failed to create OLAP payload offload message: %w", err) + } + pubBuffer.Pub(ctx, msgqueue.OLAP_QUEUE, msg, false) + } + return hasMoreWALRecords, nil } @@ -528,6 +571,10 @@ func (p *payloadStoreRepositoryImpl) DagDataDualWritesEnabled() bool { return p.enableDagDataPayloadDualWrites } +func (p *payloadStoreRepositoryImpl) OLAPDualWritesEnabled() bool { + return p.enableOLAPPayloadDualWrites +} + func (p *payloadStoreRepositoryImpl) WALPollLimit() int { return p.walPollLimit } @@ -540,12 +587,20 @@ func (p *payloadStoreRepositoryImpl) ExternalCutoverProcessInterval() time.Durat return p.externalCutoverProcessInterval } +func (p *payloadStoreRepositoryImpl) ExternalStoreEnabled() bool { + return p.externalStoreEnabled +} + +func (p *payloadStoreRepositoryImpl) ExternalStore() ExternalStore { + return p.externalStore +} + type NoOpExternalStore struct{} func (n *NoOpExternalStore) Store(ctx context.Context, payloads ...OffloadToExternalStoreOpts) (map[RetrievePayloadOpts]ExternalPayloadLocationKey, error) { return nil, fmt.Errorf("external store disabled") } -func (n *NoOpExternalStore) BulkRetrieve(ctx context.Context, opts ...BulkRetrievePayloadOpts) (map[ExternalPayloadLocationKey][]byte, error) { +func (n *NoOpExternalStore) Retrieve(ctx context.Context, keys ...ExternalPayloadLocationKey) (map[ExternalPayloadLocationKey][]byte, 
error) { return nil, fmt.Errorf("external store disabled") } diff --git a/pkg/repository/v1/sqlcv1/copyfrom.go b/pkg/repository/v1/sqlcv1/copyfrom.go index 0981ada20..4c6cef7c3 100644 --- a/pkg/repository/v1/sqlcv1/copyfrom.go +++ b/pkg/repository/v1/sqlcv1/copyfrom.go @@ -197,6 +197,7 @@ func (r iteratorForCreateTaskEventsOLAP) Values() ([]interface{}, error) { r.rows[0].WorkerID, r.rows[0].AdditionalEventData, r.rows[0].AdditionalEventMessage, + r.rows[0].ExternalID, }, nil } @@ -205,7 +206,7 @@ func (r iteratorForCreateTaskEventsOLAP) Err() error { } func (q *Queries) CreateTaskEventsOLAP(ctx context.Context, db DBTX, arg []CreateTaskEventsOLAPParams) (int64, error) { - return db.CopyFrom(ctx, []string{"v1_task_events_olap"}, []string{"tenant_id", "task_id", "task_inserted_at", "event_type", "workflow_id", "event_timestamp", "readable_status", "retry_count", "error_message", "output", "worker_id", "additional__event_data", "additional__event_message"}, &iteratorForCreateTaskEventsOLAP{rows: arg}) + return db.CopyFrom(ctx, []string{"v1_task_events_olap"}, []string{"tenant_id", "task_id", "task_inserted_at", "event_type", "workflow_id", "event_timestamp", "readable_status", "retry_count", "error_message", "output", "worker_id", "additional__event_data", "additional__event_message", "external_id"}, &iteratorForCreateTaskEventsOLAP{rows: arg}) } // iteratorForCreateTaskEventsOLAPTmp implements pgx.CopyFromSource. diff --git a/pkg/repository/v1/sqlcv1/models.go b/pkg/repository/v1/sqlcv1/models.go index 52da6c818..e194467ba 100644 --- a/pkg/repository/v1/sqlcv1/models.go +++ b/pkg/repository/v1/sqlcv1/models.go @@ -1436,6 +1436,48 @@ func (ns NullV1PayloadLocation) Value() (driver.Value, error) { return string(ns.V1PayloadLocation), nil } +type V1PayloadLocationOlap string + +const ( + V1PayloadLocationOlapINLINE V1PayloadLocationOlap = "INLINE" + V1PayloadLocationOlapEXTERNAL V1PayloadLocationOlap = "EXTERNAL" +) + +func (e *V1PayloadLocationOlap) Scan(src interface{}) error { + switch s := src.(type) { + case []byte: + *e = V1PayloadLocationOlap(s) + case string: + *e = V1PayloadLocationOlap(s) + default: + return fmt.Errorf("unsupported scan type for V1PayloadLocationOlap: %T", src) + } + return nil +} + +type NullV1PayloadLocationOlap struct { + V1PayloadLocationOlap V1PayloadLocationOlap `json:"v1_payload_location_olap"` + Valid bool `json:"valid"` // Valid is true if V1PayloadLocationOlap is not NULL +} + +// Scan implements the Scanner interface. +func (ns *NullV1PayloadLocationOlap) Scan(value interface{}) error { + if value == nil { + ns.V1PayloadLocationOlap, ns.Valid = "", false + return nil + } + ns.Valid = true + return ns.V1PayloadLocationOlap.Scan(value) +} + +// Value implements the driver Valuer interface. 
+func (ns NullV1PayloadLocationOlap) Value() (driver.Value, error) { + if !ns.Valid { + return nil, nil + } + return string(ns.V1PayloadLocationOlap), nil +} + type V1PayloadType string const ( @@ -3079,6 +3121,7 @@ type V1Payload struct { TenantID pgtype.UUID `json:"tenant_id"` ID int64 `json:"id"` InsertedAt pgtype.Timestamptz `json:"inserted_at"` + ExternalID pgtype.UUID `json:"external_id"` Type V1PayloadType `json:"type"` Location V1PayloadLocation `json:"location"` ExternalLocationKey pgtype.Text `json:"external_location_key"` @@ -3103,6 +3146,16 @@ type V1PayloadWal struct { Operation V1PayloadWalOperation `json:"operation"` } +type V1PayloadsOlap struct { + TenantID pgtype.UUID `json:"tenant_id"` + ExternalID pgtype.UUID `json:"external_id"` + Location V1PayloadLocationOlap `json:"location"` + ExternalLocationKey pgtype.Text `json:"external_location_key"` + InlineContent []byte `json:"inline_content"` + InsertedAt pgtype.Timestamptz `json:"inserted_at"` + UpdatedAt pgtype.Timestamptz `json:"updated_at"` +} + type V1Queue struct { TenantID pgtype.UUID `json:"tenant_id"` Name string `json:"name"` @@ -3262,6 +3315,7 @@ type V1TaskEventsOlap struct { TenantID pgtype.UUID `json:"tenant_id"` ID int64 `json:"id"` InsertedAt pgtype.Timestamptz `json:"inserted_at"` + ExternalID pgtype.UUID `json:"external_id"` TaskID int64 `json:"task_id"` TaskInsertedAt pgtype.Timestamptz `json:"task_inserted_at"` EventType V1EventTypeOlap `json:"event_type"` diff --git a/pkg/repository/v1/sqlcv1/olap.sql b/pkg/repository/v1/sqlcv1/olap.sql index 75ee2bce3..d56a89235 100644 --- a/pkg/repository/v1/sqlcv1/olap.sql +++ b/pkg/repository/v1/sqlcv1/olap.sql @@ -13,7 +13,8 @@ SELECT create_v1_range_partition('v1_event_to_run_olap'::text, @date::date), create_v1_weekly_range_partition('v1_event_lookup_table_olap'::text, @date::date), create_v1_range_partition('v1_incoming_webhook_validation_failures_olap'::text, @date::date), - create_v1_range_partition('v1_cel_evaluation_failures_olap'::text, @date::date) + create_v1_range_partition('v1_cel_evaluation_failures_olap'::text, @date::date), + create_v1_range_partition('v1_payloads_olap'::text, @date::date) ; -- name: AnalyzeV1RunsOLAP :exec @@ -28,6 +29,9 @@ ANALYZE v1_dags_olap; -- name: AnalyzeV1DAGToTaskOLAP :exec ANALYZE v1_dag_to_task_olap; +-- name: AnalyzeV1PayloadsOLAP :exec +ANALYZE v1_payloads_olap; + -- name: ListOLAPPartitionsBeforeDate :many WITH task_partitions AS ( SELECT 'v1_tasks_olap' AS parent_table, p::text as partition_name FROM get_v1_partitions_before_date('v1_tasks_olap'::text, @date::date) AS p @@ -45,6 +49,8 @@ WITH task_partitions AS ( SELECT 'v1_incoming_webhook_validation_failures_olap' AS parent_table, p::TEXT AS partition_name FROM get_v1_partitions_before_date('v1_incoming_webhook_validation_failures_olap', @date::date) AS p ), cel_evaluation_failures_partitions AS ( SELECT 'v1_cel_evaluation_failures_olap' AS parent_table, p::TEXT AS partition_name FROM get_v1_partitions_before_date('v1_cel_evaluation_failures_olap', @date::date) AS p +), payloads_partitions AS ( + SELECT 'v1_payloads_olap' AS parent_table, p::TEXT AS partition_name FROM get_v1_partitions_before_date('v1_payloads_olap', @date::date) AS p ), candidates AS ( SELECT * @@ -99,6 +105,13 @@ WITH task_partitions AS ( * FROM cel_evaluation_failures_partitions + + UNION ALL + + SELECT + * + FROM + payloads_partitions ) SELECT * @@ -106,7 +119,9 @@ FROM candidates WHERE CASE WHEN @shouldPartitionEventsTables::BOOLEAN THEN TRUE - ELSE parent_table NOT IN ('v1_events_olap', 
'v1_event_to_run_olap', 'v1_cel_evaluation_failures_olap', 'v1_incoming_webhook_validation_failures_olap') + -- this is a list of all of the tables which are hypertables in timescale, so we should not manually drop their + -- partitions if @shouldPartitionEventsTables is false + ELSE parent_table NOT IN ('v1_events_olap', 'v1_event_to_run_olap', 'v1_cel_evaluation_failures_olap', 'v1_incoming_webhook_validation_failures_olap', 'v1_payloads_olap') END ; @@ -217,7 +232,8 @@ INSERT INTO v1_task_events_olap ( output, worker_id, additional__event_data, - additional__event_message + additional__event_message, + external_id ) VALUES ( $1, $2, @@ -231,7 +247,8 @@ INSERT INTO v1_task_events_olap ( $10, $11, $12, - $13 + $13, + $14 ); -- name: ReadTaskByExternalID :one @@ -248,6 +265,7 @@ WITH lookup_task AS ( SELECT t.*, e.output, + e.external_id AS event_external_id, e.error_message FROM v1_tasks_olap t @@ -336,6 +354,7 @@ SELECT t.readable_status, t.error_message, t.output, + t.external_id AS event_external_id, t.worker_id, t.additional__event_data, t.additional__event_message @@ -386,6 +405,7 @@ SELECT t.readable_status, t.error_message, t.output, + t.external_id AS event_external_id, t.worker_id, t.additional__event_data, t.additional__event_message, @@ -439,8 +459,16 @@ WITH selected_retry_count AS ( relevant_events WHERE event_type = 'STARTED' +), queued_at AS ( + SELECT + MAX(event_timestamp) AS queued_at + FROM + relevant_events + WHERE + event_type = 'QUEUED' ), task_output AS ( SELECT + external_id, output FROM relevant_events @@ -483,7 +511,9 @@ SELECT st.readable_status::v1_readable_status_olap as status, f.finished_at::timestamptz as finished_at, s.started_at::timestamptz as started_at, - o.output::jsonb as output, + q.queued_at::timestamptz as queued_at, + o.external_id::UUID AS output_event_external_id, + o.output as output, e.error_message as error_message, sc.spawned_children, (SELECT retry_count FROM selected_retry_count) as retry_count @@ -493,6 +523,8 @@ LEFT JOIN finished_at f ON true LEFT JOIN started_at s ON true +LEFT JOIN + queued_at q ON true LEFT JOIN task_output o ON true LEFT JOIN @@ -511,6 +543,7 @@ WITH metadata AS ( MAX(event_timestamp) FILTER (WHERE event_type = 'STARTED')::TIMESTAMPTZ AS started_at, MAX(event_timestamp) FILTER (WHERE readable_status = ANY(ARRAY['COMPLETED', 'FAILED', 'CANCELLED']::v1_readable_status_olap[]))::TIMESTAMPTZ AS finished_at, MAX(output::TEXT) FILTER (WHERE readable_status = 'COMPLETED')::JSONB AS output, + MAX(external_id::TEXT) FILTER (WHERE readable_status = 'COMPLETED')::UUID AS output_event_external_id, MAX(error_message) FILTER (WHERE readable_status = 'FAILED')::TEXT AS error_message FROM v1_task_events_olap @@ -562,7 +595,8 @@ SELECT CASE WHEN @includePayloads::BOOLEAN THEN m.output::JSONB ELSE '{}'::JSONB - END::JSONB AS output + END::JSONB AS output, + m.output_event_external_id::UUID AS output_event_external_id FROM v1_tasks_olap t, metadata m WHERE @@ -983,6 +1017,7 @@ WITH run AS ( MAX(e.event_timestamp) FILTER (WHERE e.readable_status IN ('COMPLETED', 'CANCELLED', 'FAILED'))::timestamptz AS finished_at, MAX(e.error_message) FILTER (WHERE e.readable_status = 'FAILED') AS error_message, MAX(e.output::TEXT) FILTER (WHERE e.event_type = 'FINISHED')::JSONB AS output, + MAX(e.external_id::TEXT) FILTER (WHERE e.event_type = 'FINISHED')::UUID AS output_event_external_id, MAX(e.retry_count) AS max_retry_count FROM relevant_events e WHERE e.retry_count = ( @@ -999,6 +1034,7 @@ SELECT -- hack to force this to string since sqlc can't 
figure out that this should be pgtype.Text COALESCE(m.error_message, '')::TEXT AS error_message, m.output::JSONB AS output, + m.output_event_external_id::UUID AS output_event_external_id, COALESCE(m.max_retry_count, 0)::int as retry_count FROM run r, metadata m ; @@ -1583,6 +1619,76 @@ SELECT @tenantId::UUID, source, error FROM inputs ; +-- name: PutPayloads :exec +WITH inputs AS ( + SELECT + UNNEST(@externalIds::UUID[]) AS external_id, + UNNEST(@insertedAts::TIMESTAMPTZ[]) AS inserted_at, + UNNEST(@payloads::JSONB[]) AS payload, + UNNEST(@tenantIds::UUID[]) AS tenant_id, + UNNEST(CAST(@locations::TEXT[] AS v1_payload_location_olap[])) AS location +) + +INSERT INTO v1_payloads_olap ( + tenant_id, + external_id, + inserted_at, + location, + external_location_key, + inline_content +) + +SELECT + i.tenant_id, + i.external_id, + i.inserted_at, + i.location, + CASE + WHEN i.location = 'EXTERNAL' THEN i.payload + ELSE NULL + END, + CASE + WHEN i.location = 'INLINE' THEN i.payload + ELSE NULL + END AS inline_content +FROM inputs i +ON CONFLICT (tenant_id, external_id, inserted_at) DO UPDATE +SET + location = EXCLUDED.location, + external_location_key = EXCLUDED.external_location_key, + inline_content = EXCLUDED.inline_content, + updated_at = NOW() +; + +-- name: OffloadPayloads :exec +WITH inputs AS ( + SELECT + UNNEST(@externalIds::UUID[]) AS external_id, + UNNEST(@tenantIds::UUID[]) AS tenant_id, + UNNEST(@externalLocationKeys::TEXT[]) AS external_location_key +) + +UPDATE v1_payloads_olap +SET + location = 'EXTERNAL', + external_location_key = i.external_location_key, + inline_content = NULL, + updated_at = NOW() +FROM inputs i +WHERE + (v1_payloads_olap.tenant_id, v1_payloads_olap.external_id) = (i.tenant_id, i.external_id) + AND v1_payloads_olap.location = 'INLINE' + AND v1_payloads_olap.external_location_key IS NULL +; + +-- name: ReadPayloadsOLAP :many +SELECT * +FROM v1_payloads_olap +WHERE + tenant_id = @tenantId::UUID + AND external_id = ANY(@externalIds::UUID[]) +; + -- name: ListWorkflowRunExternalIds :many SELECT external_id FROM v1_runs_olap diff --git a/pkg/repository/v1/sqlcv1/olap.sql.go b/pkg/repository/v1/sqlcv1/olap.sql.go index de7bff2b6..063ff3618 100644 --- a/pkg/repository/v1/sqlcv1/olap.sql.go +++ b/pkg/repository/v1/sqlcv1/olap.sql.go @@ -29,6 +29,15 @@ func (q *Queries) AnalyzeV1DAGsOLAP(ctx context.Context, db DBTX) error { return err } +const analyzeV1PayloadsOLAP = `-- name: AnalyzeV1PayloadsOLAP :exec +ANALYZE v1_payloads_olap +` + +func (q *Queries) AnalyzeV1PayloadsOLAP(ctx context.Context, db DBTX) error { + _, err := db.Exec(ctx, analyzeV1PayloadsOLAP) + return err +} + const analyzeV1RunsOLAP = `-- name: AnalyzeV1RunsOLAP :exec ANALYZE v1_runs_olap ` @@ -191,7 +200,8 @@ SELECT create_v1_range_partition('v1_event_to_run_olap'::text, $1::date), create_v1_weekly_range_partition('v1_event_lookup_table_olap'::text, $1::date), create_v1_range_partition('v1_incoming_webhook_validation_failures_olap'::text, $1::date), - create_v1_range_partition('v1_cel_evaluation_failures_olap'::text, $1::date) + create_v1_range_partition('v1_cel_evaluation_failures_olap'::text, $1::date), + create_v1_range_partition('v1_payloads_olap'::text, $1::date) ` func (q *Queries) CreateOLAPEventPartitions(ctx context.Context, db DBTX, date pgtype.Date) error { @@ -232,6 +242,7 @@ type CreateTaskEventsOLAPParams struct { WorkerID pgtype.UUID `json:"worker_id"` AdditionalEventData pgtype.Text `json:"additional__event_data"` AdditionalEventMessage pgtype.Text `json:"additional__event_message"` + 
ExternalID pgtype.UUID `json:"external_id"` } type CreateTaskEventsOLAPTmpParams struct { @@ -1028,6 +1039,8 @@ WITH task_partitions AS ( SELECT 'v1_incoming_webhook_validation_failures_olap' AS parent_table, p::TEXT AS partition_name FROM get_v1_partitions_before_date('v1_incoming_webhook_validation_failures_olap', $2::date) AS p ), cel_evaluation_failures_partitions AS ( SELECT 'v1_cel_evaluation_failures_olap' AS parent_table, p::TEXT AS partition_name FROM get_v1_partitions_before_date('v1_cel_evaluation_failures_olap', $2::date) AS p +), payloads_partitions AS ( + SELECT 'v1_payloads_olap' AS parent_table, p::TEXT AS partition_name FROM get_v1_partitions_before_date('v1_payloads_olap', $2::date) AS p ), candidates AS ( SELECT parent_table, partition_name @@ -1082,6 +1095,13 @@ WITH task_partitions AS ( parent_table, partition_name FROM cel_evaluation_failures_partitions + + UNION ALL + + SELECT + parent_table, partition_name + FROM + payloads_partitions ) SELECT parent_table, partition_name @@ -1089,7 +1109,9 @@ FROM candidates WHERE CASE WHEN $1::BOOLEAN THEN TRUE - ELSE parent_table NOT IN ('v1_events_olap', 'v1_event_to_run_olap', 'v1_cel_evaluation_failures_olap', 'v1_incoming_webhook_validation_failures_olap') + -- this is a list of all of the tables which are hypertables in timescale, so we should not manually drop their + -- partitions if @shouldPartitionEventsTables is false + ELSE parent_table NOT IN ('v1_events_olap', 'v1_event_to_run_olap', 'v1_cel_evaluation_failures_olap', 'v1_incoming_webhook_validation_failures_olap', 'v1_payloads_olap') END ` @@ -1156,6 +1178,7 @@ SELECT t.readable_status, t.error_message, t.output, + t.external_id AS event_external_id, t.worker_id, t.additional__event_data, t.additional__event_message @@ -1188,6 +1211,7 @@ type ListTaskEventsRow struct { ReadableStatus V1ReadableStatusOlap `json:"readable_status"` ErrorMessage pgtype.Text `json:"error_message"` Output []byte `json:"output"` + EventExternalID pgtype.UUID `json:"event_external_id"` WorkerID pgtype.UUID `json:"worker_id"` AdditionalEventData pgtype.Text `json:"additional__event_data"` AdditionalEventMessage pgtype.Text `json:"additional__event_message"` @@ -1216,6 +1240,7 @@ func (q *Queries) ListTaskEvents(ctx context.Context, db DBTX, arg ListTaskEvent &i.ReadableStatus, &i.ErrorMessage, &i.Output, + &i.EventExternalID, &i.WorkerID, &i.AdditionalEventData, &i.AdditionalEventMessage, @@ -1269,6 +1294,7 @@ SELECT t.readable_status, t.error_message, t.output, + t.external_id AS event_external_id, t.worker_id, t.additional__event_data, t.additional__event_message, @@ -1304,6 +1330,7 @@ type ListTaskEventsForWorkflowRunRow struct { ReadableStatus V1ReadableStatusOlap `json:"readable_status"` ErrorMessage pgtype.Text `json:"error_message"` Output []byte `json:"output"` + EventExternalID pgtype.UUID `json:"event_external_id"` WorkerID pgtype.UUID `json:"worker_id"` AdditionalEventData pgtype.Text `json:"additional__event_data"` AdditionalEventMessage pgtype.Text `json:"additional__event_message"` @@ -1334,6 +1361,7 @@ func (q *Queries) ListTaskEventsForWorkflowRun(ctx context.Context, db DBTX, arg &i.ReadableStatus, &i.ErrorMessage, &i.Output, + &i.EventExternalID, &i.WorkerID, &i.AdditionalEventData, &i.AdditionalEventMessage, @@ -1560,6 +1588,38 @@ func (q *Queries) ListWorkflowRunExternalIds(ctx context.Context, db DBTX, arg L return items, nil } +const offloadPayloads = `-- name: OffloadPayloads :exec +WITH inputs AS ( + SELECT + UNNEST($1::UUID[]) AS external_id, + UNNEST($2::UUID[]) AS 
tenant_id, + UNNEST($3::TEXT[]) AS external_location_key +) + +UPDATE v1_payloads_olap +SET + location = 'EXTERNAL', + external_location_key = i.external_location_key, + inline_content = NULL, + updated_at = NOW() +FROM inputs i +WHERE + (v1_payloads_olap.tenant_id, v1_payloads_olap.external_id) = (i.tenant_id, i.external_id) + AND v1_payloads_olap.location = 'INLINE' + AND v1_payloads_olap.external_location_key IS NULL +` + +type OffloadPayloadsParams struct { + Externalids []pgtype.UUID `json:"externalids"` + Tenantids []pgtype.UUID `json:"tenantids"` + Externallocationkeys []string `json:"externallocationkeys"` +} + +func (q *Queries) OffloadPayloads(ctx context.Context, db DBTX, arg OffloadPayloadsParams) error { + _, err := db.Exec(ctx, offloadPayloads, arg.Externalids, arg.Tenantids, arg.Externallocationkeys) + return err +} + const populateDAGMetadata = `-- name: PopulateDAGMetadata :one WITH run AS ( SELECT @@ -1590,7 +1650,7 @@ WITH run AS ( AND r.tenant_id = $4::UUID AND r.kind = 'DAG' ), relevant_events AS ( - SELECT e.tenant_id, e.id, e.inserted_at, e.task_id, e.task_inserted_at, e.event_type, e.workflow_id, e.event_timestamp, e.readable_status, e.retry_count, e.error_message, e.output, e.worker_id, e.additional__event_data, e.additional__event_message + SELECT e.tenant_id, e.id, e.inserted_at, e.external_id, e.task_id, e.task_inserted_at, e.event_type, e.workflow_id, e.event_timestamp, e.readable_status, e.retry_count, e.error_message, e.output, e.worker_id, e.additional__event_data, e.additional__event_message FROM run r JOIN v1_dag_to_task_olap dt ON (r.dag_id, r.inserted_at) = (dt.dag_id, dt.dag_inserted_at) JOIN v1_task_events_olap e ON (e.task_id, e.task_inserted_at) = (dt.task_id, dt.task_inserted_at) @@ -1601,6 +1661,7 @@ WITH run AS ( MAX(e.event_timestamp) FILTER (WHERE e.readable_status IN ('COMPLETED', 'CANCELLED', 'FAILED'))::timestamptz AS finished_at, MAX(e.error_message) FILTER (WHERE e.readable_status = 'FAILED') AS error_message, MAX(e.output::TEXT) FILTER (WHERE e.event_type = 'FINISHED')::JSONB AS output, + MAX(e.external_id::TEXT) FILTER (WHERE e.event_type = 'FINISHED')::UUID AS output_event_external_id, MAX(e.retry_count) AS max_retry_count FROM relevant_events e WHERE e.retry_count = ( @@ -1617,6 +1678,7 @@ SELECT -- hack to force this to string since sqlc can't figure out that this should be pgtype.Text COALESCE(m.error_message, '')::TEXT AS error_message, m.output::JSONB AS output, + m.output_event_external_id::UUID AS output_event_external_id, COALESCE(m.max_retry_count, 0)::int as retry_count FROM run r, metadata m ` @@ -1629,25 +1691,26 @@ type PopulateDAGMetadataParams struct { } type PopulateDAGMetadataRow struct { - DagID int64 `json:"dag_id"` - RunID int64 `json:"run_id"` - TenantID pgtype.UUID `json:"tenant_id"` - InsertedAt pgtype.Timestamptz `json:"inserted_at"` - ExternalID pgtype.UUID `json:"external_id"` - ReadableStatus V1ReadableStatusOlap `json:"readable_status"` - Kind V1RunKind `json:"kind"` - WorkflowID pgtype.UUID `json:"workflow_id"` - DisplayName string `json:"display_name"` - Input []byte `json:"input"` - AdditionalMetadata []byte `json:"additional_metadata"` - WorkflowVersionID pgtype.UUID `json:"workflow_version_id"` - ParentTaskExternalID pgtype.UUID `json:"parent_task_external_id"` - CreatedAt pgtype.Timestamptz `json:"created_at"` - StartedAt pgtype.Timestamptz `json:"started_at"` - FinishedAt pgtype.Timestamptz `json:"finished_at"` - ErrorMessage string `json:"error_message"` - Output []byte `json:"output"` - RetryCount 
int32 `json:"retry_count"` + DagID int64 `json:"dag_id"` + RunID int64 `json:"run_id"` + TenantID pgtype.UUID `json:"tenant_id"` + InsertedAt pgtype.Timestamptz `json:"inserted_at"` + ExternalID pgtype.UUID `json:"external_id"` + ReadableStatus V1ReadableStatusOlap `json:"readable_status"` + Kind V1RunKind `json:"kind"` + WorkflowID pgtype.UUID `json:"workflow_id"` + DisplayName string `json:"display_name"` + Input []byte `json:"input"` + AdditionalMetadata []byte `json:"additional_metadata"` + WorkflowVersionID pgtype.UUID `json:"workflow_version_id"` + ParentTaskExternalID pgtype.UUID `json:"parent_task_external_id"` + CreatedAt pgtype.Timestamptz `json:"created_at"` + StartedAt pgtype.Timestamptz `json:"started_at"` + FinishedAt pgtype.Timestamptz `json:"finished_at"` + ErrorMessage string `json:"error_message"` + Output []byte `json:"output"` + OutputEventExternalID pgtype.UUID `json:"output_event_external_id"` + RetryCount int32 `json:"retry_count"` } func (q *Queries) PopulateDAGMetadata(ctx context.Context, db DBTX, arg PopulateDAGMetadataParams) (*PopulateDAGMetadataRow, error) { @@ -1677,6 +1740,7 @@ func (q *Queries) PopulateDAGMetadata(ctx context.Context, db DBTX, arg Populate &i.FinishedAt, &i.ErrorMessage, &i.Output, + &i.OutputEventExternalID, &i.RetryCount, ) return &i, err @@ -1760,7 +1824,7 @@ WITH selected_retry_count AS ( LIMIT 1 ), relevant_events AS ( SELECT - tenant_id, id, inserted_at, task_id, task_inserted_at, event_type, workflow_id, event_timestamp, readable_status, retry_count, error_message, output, worker_id, additional__event_data, additional__event_message + tenant_id, id, inserted_at, external_id, task_id, task_inserted_at, event_type, workflow_id, event_timestamp, readable_status, retry_count, error_message, output, worker_id, additional__event_data, additional__event_message FROM v1_task_events_olap WHERE @@ -1782,8 +1846,16 @@ WITH selected_retry_count AS ( relevant_events WHERE event_type = 'STARTED' +), queued_at AS ( + SELECT + MAX(event_timestamp) AS queued_at + FROM + relevant_events + WHERE + event_type = 'QUEUED' ), task_output AS ( SELECT + external_id, output FROM relevant_events @@ -1826,7 +1898,9 @@ SELECT st.readable_status::v1_readable_status_olap as status, f.finished_at::timestamptz as finished_at, s.started_at::timestamptz as started_at, - o.output::jsonb as output, + q.queued_at::timestamptz as queued_at, + o.external_id::UUID AS output_event_external_id, + o.output as output, e.error_message as error_message, sc.spawned_children, (SELECT retry_count FROM selected_retry_count) as retry_count @@ -1836,6 +1910,8 @@ LEFT JOIN finished_at f ON true LEFT JOIN started_at s ON true +LEFT JOIN + queued_at q ON true LEFT JOIN task_output o ON true LEFT JOIN @@ -1856,37 +1932,39 @@ type PopulateSingleTaskRunDataParams struct { } type PopulateSingleTaskRunDataRow struct { - TenantID pgtype.UUID `json:"tenant_id"` - ID int64 `json:"id"` - InsertedAt pgtype.Timestamptz `json:"inserted_at"` - ExternalID pgtype.UUID `json:"external_id"` - Queue string `json:"queue"` - ActionID string `json:"action_id"` - StepID pgtype.UUID `json:"step_id"` - WorkflowID pgtype.UUID `json:"workflow_id"` - WorkflowVersionID pgtype.UUID `json:"workflow_version_id"` - WorkflowRunID pgtype.UUID `json:"workflow_run_id"` - ScheduleTimeout string `json:"schedule_timeout"` - StepTimeout pgtype.Text `json:"step_timeout"` - Priority pgtype.Int4 `json:"priority"` - Sticky V1StickyStrategyOlap `json:"sticky"` - DesiredWorkerID pgtype.UUID `json:"desired_worker_id"` - DisplayName 
string `json:"display_name"` - Input []byte `json:"input"` - AdditionalMetadata []byte `json:"additional_metadata"` - ReadableStatus V1ReadableStatusOlap `json:"readable_status"` - LatestRetryCount int32 `json:"latest_retry_count"` - LatestWorkerID pgtype.UUID `json:"latest_worker_id"` - DagID pgtype.Int8 `json:"dag_id"` - DagInsertedAt pgtype.Timestamptz `json:"dag_inserted_at"` - ParentTaskExternalID pgtype.UUID `json:"parent_task_external_id"` - Status V1ReadableStatusOlap `json:"status"` - FinishedAt pgtype.Timestamptz `json:"finished_at"` - StartedAt pgtype.Timestamptz `json:"started_at"` - Output []byte `json:"output"` - ErrorMessage pgtype.Text `json:"error_message"` - SpawnedChildren pgtype.Int8 `json:"spawned_children"` - RetryCount int32 `json:"retry_count"` + TenantID pgtype.UUID `json:"tenant_id"` + ID int64 `json:"id"` + InsertedAt pgtype.Timestamptz `json:"inserted_at"` + ExternalID pgtype.UUID `json:"external_id"` + Queue string `json:"queue"` + ActionID string `json:"action_id"` + StepID pgtype.UUID `json:"step_id"` + WorkflowID pgtype.UUID `json:"workflow_id"` + WorkflowVersionID pgtype.UUID `json:"workflow_version_id"` + WorkflowRunID pgtype.UUID `json:"workflow_run_id"` + ScheduleTimeout string `json:"schedule_timeout"` + StepTimeout pgtype.Text `json:"step_timeout"` + Priority pgtype.Int4 `json:"priority"` + Sticky V1StickyStrategyOlap `json:"sticky"` + DesiredWorkerID pgtype.UUID `json:"desired_worker_id"` + DisplayName string `json:"display_name"` + Input []byte `json:"input"` + AdditionalMetadata []byte `json:"additional_metadata"` + ReadableStatus V1ReadableStatusOlap `json:"readable_status"` + LatestRetryCount int32 `json:"latest_retry_count"` + LatestWorkerID pgtype.UUID `json:"latest_worker_id"` + DagID pgtype.Int8 `json:"dag_id"` + DagInsertedAt pgtype.Timestamptz `json:"dag_inserted_at"` + ParentTaskExternalID pgtype.UUID `json:"parent_task_external_id"` + Status V1ReadableStatusOlap `json:"status"` + FinishedAt pgtype.Timestamptz `json:"finished_at"` + StartedAt pgtype.Timestamptz `json:"started_at"` + QueuedAt pgtype.Timestamptz `json:"queued_at"` + OutputEventExternalID pgtype.UUID `json:"output_event_external_id"` + Output []byte `json:"output"` + ErrorMessage pgtype.Text `json:"error_message"` + SpawnedChildren pgtype.Int8 `json:"spawned_children"` + RetryCount int32 `json:"retry_count"` } func (q *Queries) PopulateSingleTaskRunData(ctx context.Context, db DBTX, arg PopulateSingleTaskRunDataParams) (*PopulateSingleTaskRunDataRow, error) { @@ -1925,6 +2003,8 @@ func (q *Queries) PopulateSingleTaskRunData(ctx context.Context, db DBTX, arg Po &i.Status, &i.FinishedAt, &i.StartedAt, + &i.QueuedAt, + &i.OutputEventExternalID, &i.Output, &i.ErrorMessage, &i.SpawnedChildren, @@ -1940,6 +2020,7 @@ WITH metadata AS ( MAX(event_timestamp) FILTER (WHERE event_type = 'STARTED')::TIMESTAMPTZ AS started_at, MAX(event_timestamp) FILTER (WHERE readable_status = ANY(ARRAY['COMPLETED', 'FAILED', 'CANCELLED']::v1_readable_status_olap[]))::TIMESTAMPTZ AS finished_at, MAX(output::TEXT) FILTER (WHERE readable_status = 'COMPLETED')::JSONB AS output, + MAX(external_id::TEXT) FILTER (WHERE readable_status = 'COMPLETED')::UUID AS output_event_external_id, MAX(error_message) FILTER (WHERE readable_status = 'FAILED')::TEXT AS error_message FROM v1_task_events_olap @@ -1991,7 +2072,8 @@ SELECT CASE WHEN $1::BOOLEAN THEN m.output::JSONB ELSE '{}'::JSONB - END::JSONB AS output + END::JSONB AS output, + m.output_event_external_id::UUID AS output_event_external_id FROM v1_tasks_olap t, 
metadata m WHERE @@ -2009,31 +2091,32 @@ type PopulateTaskRunDataParams struct { } type PopulateTaskRunDataRow struct { - TenantID pgtype.UUID `json:"tenant_id"` - ID int64 `json:"id"` - InsertedAt pgtype.Timestamptz `json:"inserted_at"` - ExternalID pgtype.UUID `json:"external_id"` - Queue string `json:"queue"` - ActionID string `json:"action_id"` - StepID pgtype.UUID `json:"step_id"` - WorkflowID pgtype.UUID `json:"workflow_id"` - WorkflowVersionID pgtype.UUID `json:"workflow_version_id"` - ScheduleTimeout string `json:"schedule_timeout"` - StepTimeout pgtype.Text `json:"step_timeout"` - Priority pgtype.Int4 `json:"priority"` - Sticky V1StickyStrategyOlap `json:"sticky"` - DisplayName string `json:"display_name"` - AdditionalMetadata []byte `json:"additional_metadata"` - ParentTaskExternalID pgtype.UUID `json:"parent_task_external_id"` - Input []byte `json:"input"` - Status V1ReadableStatusOlap `json:"status"` - WorkflowRunID pgtype.UUID `json:"workflow_run_id"` - FinishedAt pgtype.Timestamptz `json:"finished_at"` - StartedAt pgtype.Timestamptz `json:"started_at"` - QueuedAt pgtype.Timestamptz `json:"queued_at"` - ErrorMessage string `json:"error_message"` - RetryCount int32 `json:"retry_count"` - Output []byte `json:"output"` + TenantID pgtype.UUID `json:"tenant_id"` + ID int64 `json:"id"` + InsertedAt pgtype.Timestamptz `json:"inserted_at"` + ExternalID pgtype.UUID `json:"external_id"` + Queue string `json:"queue"` + ActionID string `json:"action_id"` + StepID pgtype.UUID `json:"step_id"` + WorkflowID pgtype.UUID `json:"workflow_id"` + WorkflowVersionID pgtype.UUID `json:"workflow_version_id"` + ScheduleTimeout string `json:"schedule_timeout"` + StepTimeout pgtype.Text `json:"step_timeout"` + Priority pgtype.Int4 `json:"priority"` + Sticky V1StickyStrategyOlap `json:"sticky"` + DisplayName string `json:"display_name"` + AdditionalMetadata []byte `json:"additional_metadata"` + ParentTaskExternalID pgtype.UUID `json:"parent_task_external_id"` + Input []byte `json:"input"` + Status V1ReadableStatusOlap `json:"status"` + WorkflowRunID pgtype.UUID `json:"workflow_run_id"` + FinishedAt pgtype.Timestamptz `json:"finished_at"` + StartedAt pgtype.Timestamptz `json:"started_at"` + QueuedAt pgtype.Timestamptz `json:"queued_at"` + ErrorMessage string `json:"error_message"` + RetryCount int32 `json:"retry_count"` + Output []byte `json:"output"` + OutputEventExternalID pgtype.UUID `json:"output_event_external_id"` } func (q *Queries) PopulateTaskRunData(ctx context.Context, db DBTX, arg PopulateTaskRunDataParams) (*PopulateTaskRunDataRow, error) { @@ -2070,10 +2153,71 @@ func (q *Queries) PopulateTaskRunData(ctx context.Context, db DBTX, arg Populate &i.ErrorMessage, &i.RetryCount, &i.Output, + &i.OutputEventExternalID, ) return &i, err } +const putPayloads = `-- name: PutPayloads :exec +WITH inputs AS ( + SELECT + UNNEST($1::UUID[]) AS external_id, + UNNEST($2::TIMESTAMPTZ[]) AS inserted_at, + UNNEST($3::JSONB[]) AS payload, + UNNEST($4::UUID[]) AS tenant_id, + UNNEST(CAST($5::TEXT[] AS v1_payload_location_olap[])) AS location +) + +INSERT INTO v1_payloads_olap ( + tenant_id, + external_id, + inserted_at, + location, + external_location_key, + inline_content +) + +SELECT + i.tenant_id, + i.external_id, + i.inserted_at, + i.location, + CASE + WHEN i.location = 'EXTERNAL' THEN i.payload + ELSE NULL + END, + CASE + WHEN i.location = 'INLINE' THEN i.payload + ELSE NULL + END AS inline_content +FROM inputs i +ON CONFLICT (tenant_id, external_id, inserted_at) DO UPDATE +SET + location = 
EXCLUDED.location, + external_location_key = EXCLUDED.external_location_key, + inline_content = EXCLUDED.inline_content, + updated_at = NOW() +` + +type PutPayloadsParams struct { + Externalids []pgtype.UUID `json:"externalids"` + Insertedats []pgtype.Timestamptz `json:"insertedats"` + Payloads [][]byte `json:"payloads"` + Tenantids []pgtype.UUID `json:"tenantids"` + Locations []string `json:"locations"` +} + +func (q *Queries) PutPayloads(ctx context.Context, db DBTX, arg PutPayloadsParams) error { + _, err := db.Exec(ctx, putPayloads, + arg.Externalids, + arg.Insertedats, + arg.Payloads, + arg.Tenantids, + arg.Locations, + ) + return err +} + const readDAGByExternalID = `-- name: ReadDAGByExternalID :one WITH lookup_task AS ( SELECT @@ -2113,6 +2257,47 @@ func (q *Queries) ReadDAGByExternalID(ctx context.Context, db DBTX, externalid p return &i, err } +const readPayloadsOLAP = `-- name: ReadPayloadsOLAP :many +SELECT tenant_id, external_id, location, external_location_key, inline_content, inserted_at, updated_at +FROM v1_payloads_olap +WHERE + tenant_id = $1::UUID + AND external_id = ANY($2::UUID[]) +` + +type ReadPayloadsOLAPParams struct { + Tenantid pgtype.UUID `json:"tenantid"` + Externalids []pgtype.UUID `json:"externalids"` +} + +func (q *Queries) ReadPayloadsOLAP(ctx context.Context, db DBTX, arg ReadPayloadsOLAPParams) ([]*V1PayloadsOlap, error) { + rows, err := db.Query(ctx, readPayloadsOLAP, arg.Tenantid, arg.Externalids) + if err != nil { + return nil, err + } + defer rows.Close() + var items []*V1PayloadsOlap + for rows.Next() { + var i V1PayloadsOlap + if err := rows.Scan( + &i.TenantID, + &i.ExternalID, + &i.Location, + &i.ExternalLocationKey, + &i.InlineContent, + &i.InsertedAt, + &i.UpdatedAt, + ); err != nil { + return nil, err + } + items = append(items, &i) + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil +} + const readTaskByExternalID = `-- name: ReadTaskByExternalID :one WITH lookup_task AS ( SELECT @@ -2127,6 +2312,7 @@ WITH lookup_task AS ( SELECT t.tenant_id, t.id, t.inserted_at, t.external_id, t.queue, t.action_id, t.step_id, t.workflow_id, t.workflow_version_id, t.workflow_run_id, t.schedule_timeout, t.step_timeout, t.priority, t.sticky, t.desired_worker_id, t.display_name, t.input, t.additional_metadata, t.readable_status, t.latest_retry_count, t.latest_worker_id, t.dag_id, t.dag_inserted_at, t.parent_task_external_id, e.output, + e.external_id AS event_external_id, e.error_message FROM v1_tasks_olap t @@ -2162,6 +2348,7 @@ type ReadTaskByExternalIDRow struct { DagInsertedAt pgtype.Timestamptz `json:"dag_inserted_at"` ParentTaskExternalID pgtype.UUID `json:"parent_task_external_id"` Output []byte `json:"output"` + EventExternalID pgtype.UUID `json:"event_external_id"` ErrorMessage pgtype.Text `json:"error_message"` } @@ -2194,6 +2381,7 @@ func (q *Queries) ReadTaskByExternalID(ctx context.Context, db DBTX, externalid &i.DagInsertedAt, &i.ParentTaskExternalID, &i.Output, + &i.EventExternalID, &i.ErrorMessage, ) return &i, err @@ -2254,7 +2442,7 @@ WITH runs AS ( AND lt.task_id IS NOT NULL ), relevant_events AS ( SELECT - e.tenant_id, e.id, e.inserted_at, e.task_id, e.task_inserted_at, e.event_type, e.workflow_id, e.event_timestamp, e.readable_status, e.retry_count, e.error_message, e.output, e.worker_id, e.additional__event_data, e.additional__event_message + e.tenant_id, e.id, e.inserted_at, e.external_id, e.task_id, e.task_inserted_at, e.event_type, e.workflow_id, e.event_timestamp, e.readable_status, e.retry_count, 
e.error_message, e.output, e.worker_id, e.additional__event_data, e.additional__event_message FROM runs r JOIN v1_dag_to_task_olap dt ON r.dag_id = dt.dag_id AND r.inserted_at = dt.dag_inserted_at JOIN v1_task_events_olap e ON (e.task_id, e.task_inserted_at) = (dt.task_id, dt.task_inserted_at) @@ -2263,7 +2451,7 @@ WITH runs AS ( UNION ALL SELECT - e.tenant_id, e.id, e.inserted_at, e.task_id, e.task_inserted_at, e.event_type, e.workflow_id, e.event_timestamp, e.readable_status, e.retry_count, e.error_message, e.output, e.worker_id, e.additional__event_data, e.additional__event_message + e.tenant_id, e.id, e.inserted_at, e.external_id, e.task_id, e.task_inserted_at, e.event_type, e.workflow_id, e.event_timestamp, e.readable_status, e.retry_count, e.error_message, e.output, e.worker_id, e.additional__event_data, e.additional__event_message FROM runs r JOIN v1_task_events_olap e ON e.task_id = r.task_id AND e.task_inserted_at = r.inserted_at WHERE r.task_id IS NOT NULL diff --git a/pkg/repository/v1/sqlcv1/payload-store.sql b/pkg/repository/v1/sqlcv1/payload-store.sql index ca99b845a..d11781ecd 100644 --- a/pkg/repository/v1/sqlcv1/payload-store.sql +++ b/pkg/repository/v1/sqlcv1/payload-store.sql @@ -1,13 +1,3 @@ --- name: ReadPayload :one -SELECT * -FROM v1_payload -WHERE - tenant_id = @tenantId::UUID - AND type = @type::v1_payload_type - AND id = @id::BIGINT - AND inserted_at = @insertedAt::TIMESTAMPTZ -; - -- name: ReadPayloads :many WITH inputs AS ( SELECT @@ -30,6 +20,7 @@ WITH inputs AS ( SELECT DISTINCT UNNEST(@ids::BIGINT[]) AS id, UNNEST(@insertedAts::TIMESTAMPTZ[]) AS inserted_at, + UNNEST(@externalIds::UUID[]) AS external_id, UNNEST(CAST(@types::TEXT[] AS v1_payload_type[])) AS type, UNNEST(CAST(@locations::TEXT[] AS v1_payload_location[])) AS location, UNNEST(@externalLocationKeys::TEXT[]) AS external_location_key, @@ -41,6 +32,7 @@ INSERT INTO v1_payload ( tenant_id, id, inserted_at, + external_id, type, location, external_location_key, @@ -50,6 +42,7 @@ SELECT i.tenant_id, i.id, i.inserted_at, + i.external_id, i.type, i.location, CASE WHEN i.external_location_key = '' OR i.location != 'EXTERNAL' THEN NULL ELSE i.external_location_key END, @@ -110,7 +103,7 @@ LIMIT @pollLimit::INT FOR UPDATE SKIP LOCKED ; --- name: SetPayloadExternalKeys :exec +-- name: SetPayloadExternalKeys :many WITH inputs AS ( SELECT UNNEST(@ids::BIGINT[]) AS id, @@ -129,6 +122,7 @@ WITH inputs AS ( v1_payload.id = i.id AND v1_payload.inserted_at = i.inserted_at AND v1_payload.tenant_id = i.tenant_id + RETURNING v1_payload.* ), cutover_queue_items AS ( INSERT INTO v1_payload_cutover_queue_item ( tenant_id, @@ -146,14 +140,17 @@ WITH inputs AS ( FROM inputs i ON CONFLICT DO NOTHING +), deletions AS ( + DELETE FROM v1_payload_wal + WHERE + (offload_at, payload_id, payload_inserted_at, payload_type, tenant_id) IN ( + SELECT offload_at, id, inserted_at, type, tenant_id + FROM inputs + ) ) -DELETE FROM v1_payload_wal -WHERE - (offload_at, payload_id, payload_inserted_at, payload_type, tenant_id) IN ( - SELECT offload_at, id, inserted_at, type, tenant_id - FROM inputs - ) +SELECT * +FROM payload_updates ; diff --git a/pkg/repository/v1/sqlcv1/payload-store.sql.go b/pkg/repository/v1/sqlcv1/payload-store.sql.go index 8b7eeb97a..30497c320 100644 --- a/pkg/repository/v1/sqlcv1/payload-store.sql.go +++ b/pkg/repository/v1/sqlcv1/payload-store.sql.go @@ -123,44 +123,6 @@ func (q *Queries) PollPayloadWALForRecordsToReplicate(ctx context.Context, db DB return items, nil } -const readPayload = `-- name: ReadPayload :one 
-SELECT tenant_id, id, inserted_at, type, location, external_location_key, inline_content, updated_at -FROM v1_payload -WHERE - tenant_id = $1::UUID - AND type = $2::v1_payload_type - AND id = $3::BIGINT - AND inserted_at = $4::TIMESTAMPTZ -` - -type ReadPayloadParams struct { - Tenantid pgtype.UUID `json:"tenantid"` - Type V1PayloadType `json:"type"` - ID int64 `json:"id"` - Insertedat pgtype.Timestamptz `json:"insertedat"` -} - -func (q *Queries) ReadPayload(ctx context.Context, db DBTX, arg ReadPayloadParams) (*V1Payload, error) { - row := db.QueryRow(ctx, readPayload, - arg.Tenantid, - arg.Type, - arg.ID, - arg.Insertedat, - ) - var i V1Payload - err := row.Scan( - &i.TenantID, - &i.ID, - &i.InsertedAt, - &i.Type, - &i.Location, - &i.ExternalLocationKey, - &i.InlineContent, - &i.UpdatedAt, - ) - return &i, err -} - const readPayloads = `-- name: ReadPayloads :many WITH inputs AS ( SELECT @@ -170,7 +132,7 @@ WITH inputs AS ( UNNEST(CAST($4::TEXT[] AS v1_payload_type[])) AS type ) -SELECT tenant_id, id, inserted_at, type, location, external_location_key, inline_content, updated_at +SELECT tenant_id, id, inserted_at, external_id, type, location, external_location_key, inline_content, updated_at FROM v1_payload WHERE (tenant_id, id, inserted_at, type) IN ( SELECT tenant_id, id, inserted_at, type @@ -203,6 +165,7 @@ func (q *Queries) ReadPayloads(ctx context.Context, db DBTX, arg ReadPayloadsPar &i.TenantID, &i.ID, &i.InsertedAt, + &i.ExternalID, &i.Type, &i.Location, &i.ExternalLocationKey, @@ -219,7 +182,7 @@ func (q *Queries) ReadPayloads(ctx context.Context, db DBTX, arg ReadPayloadsPar return items, nil } -const setPayloadExternalKeys = `-- name: SetPayloadExternalKeys :exec +const setPayloadExternalKeys = `-- name: SetPayloadExternalKeys :many WITH inputs AS ( SELECT UNNEST($1::BIGINT[]) AS id, @@ -238,6 +201,7 @@ WITH inputs AS ( v1_payload.id = i.id AND v1_payload.inserted_at = i.inserted_at AND v1_payload.tenant_id = i.tenant_id + RETURNING v1_payload.tenant_id, v1_payload.id, v1_payload.inserted_at, v1_payload.external_id, v1_payload.type, v1_payload.location, v1_payload.external_location_key, v1_payload.inline_content, v1_payload.updated_at ), cutover_queue_items AS ( INSERT INTO v1_payload_cutover_queue_item ( tenant_id, @@ -255,14 +219,17 @@ WITH inputs AS ( FROM inputs i ON CONFLICT DO NOTHING +), deletions AS ( + DELETE FROM v1_payload_wal + WHERE + (offload_at, payload_id, payload_inserted_at, payload_type, tenant_id) IN ( + SELECT offload_at, id, inserted_at, type, tenant_id + FROM inputs + ) ) -DELETE FROM v1_payload_wal -WHERE - (offload_at, payload_id, payload_inserted_at, payload_type, tenant_id) IN ( - SELECT offload_at, id, inserted_at, type, tenant_id - FROM inputs - ) +SELECT tenant_id, id, inserted_at, external_id, type, location, external_location_key, inline_content, updated_at +FROM payload_updates ` type SetPayloadExternalKeysParams struct { @@ -274,8 +241,20 @@ type SetPayloadExternalKeysParams struct { Tenantids []pgtype.UUID `json:"tenantids"` } -func (q *Queries) SetPayloadExternalKeys(ctx context.Context, db DBTX, arg SetPayloadExternalKeysParams) error { - _, err := db.Exec(ctx, setPayloadExternalKeys, +type SetPayloadExternalKeysRow struct { + TenantID pgtype.UUID `json:"tenant_id"` + ID int64 `json:"id"` + InsertedAt pgtype.Timestamptz `json:"inserted_at"` + ExternalID pgtype.UUID `json:"external_id"` + Type V1PayloadType `json:"type"` + Location V1PayloadLocation `json:"location"` + ExternalLocationKey pgtype.Text `json:"external_location_key"` + 
InlineContent []byte `json:"inline_content"` + UpdatedAt pgtype.Timestamptz `json:"updated_at"` +} + +func (q *Queries) SetPayloadExternalKeys(ctx context.Context, db DBTX, arg SetPayloadExternalKeysParams) ([]*SetPayloadExternalKeysRow, error) { + rows, err := db.Query(ctx, setPayloadExternalKeys, arg.Ids, arg.Insertedats, arg.Payloadtypes, @@ -283,7 +262,32 @@ func (q *Queries) SetPayloadExternalKeys(ctx context.Context, db DBTX, arg SetPa arg.Externallocationkeys, arg.Tenantids, ) - return err + if err != nil { + return nil, err + } + defer rows.Close() + var items []*SetPayloadExternalKeysRow + for rows.Next() { + var i SetPayloadExternalKeysRow + if err := rows.Scan( + &i.TenantID, + &i.ID, + &i.InsertedAt, + &i.ExternalID, + &i.Type, + &i.Location, + &i.ExternalLocationKey, + &i.InlineContent, + &i.UpdatedAt, + ); err != nil { + return nil, err + } + items = append(items, &i) + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil } const writePayloadWAL = `-- name: WritePayloadWAL :exec @@ -338,17 +342,19 @@ WITH inputs AS ( SELECT DISTINCT UNNEST($1::BIGINT[]) AS id, UNNEST($2::TIMESTAMPTZ[]) AS inserted_at, - UNNEST(CAST($3::TEXT[] AS v1_payload_type[])) AS type, - UNNEST(CAST($4::TEXT[] AS v1_payload_location[])) AS location, - UNNEST($5::TEXT[]) AS external_location_key, - UNNEST($6::JSONB[]) AS inline_content, - UNNEST($7::UUID[]) AS tenant_id + UNNEST($3::UUID[]) AS external_id, + UNNEST(CAST($4::TEXT[] AS v1_payload_type[])) AS type, + UNNEST(CAST($5::TEXT[] AS v1_payload_location[])) AS location, + UNNEST($6::TEXT[]) AS external_location_key, + UNNEST($7::JSONB[]) AS inline_content, + UNNEST($8::UUID[]) AS tenant_id ) INSERT INTO v1_payload ( tenant_id, id, inserted_at, + external_id, type, location, external_location_key, @@ -358,6 +364,7 @@ SELECT i.tenant_id, i.id, i.inserted_at, + i.external_id, i.type, i.location, CASE WHEN i.external_location_key = '' OR i.location != 'EXTERNAL' THEN NULL ELSE i.external_location_key END, @@ -375,6 +382,7 @@ DO UPDATE SET type WritePayloadsParams struct { Ids []int64 `json:"ids"` Insertedats []pgtype.Timestamptz `json:"insertedats"` + Externalids []pgtype.UUID `json:"externalids"` Types []string `json:"types"` Locations []string `json:"locations"` Externallocationkeys []string `json:"externallocationkeys"` @@ -386,6 +394,7 @@ func (q *Queries) WritePayloads(ctx context.Context, db DBTX, arg WritePayloadsP _, err := db.Exec(ctx, writePayloads, arg.Ids, arg.Insertedats, + arg.Externalids, arg.Types, arg.Locations, arg.Externallocationkeys, diff --git a/pkg/repository/v1/task.go b/pkg/repository/v1/task.go index 2e630ee11..9e9f99053 100644 --- a/pkg/repository/v1/task.go +++ b/pkg/repository/v1/task.go @@ -1106,7 +1106,7 @@ func (r *TaskRepositoryImpl) listTaskOutputEvents(ctx context.Context, tx sqlcv1 matchedEventToRetrieveOpts[event] = opt } - payloads, err := r.payloadStore.BulkRetrieve(ctx, retrieveOpts...) + payloads, err := r.payloadStore.Retrieve(ctx, retrieveOpts...) 
if err != nil { return nil, err @@ -1381,6 +1381,7 @@ func (r *TaskRepositoryImpl) ProcessDurableSleeps(ctx context.Context, tenantId Id: task.ID, InsertedAt: task.InsertedAt, Type: sqlcv1.V1PayloadTypeTASKINPUT, + ExternalId: task.ExternalID, Payload: task.Payload, TenantId: task.TenantID.String(), } @@ -2358,6 +2359,7 @@ func (r *sharedRepository) replayTasks( Id: taskIds[i], InsertedAt: taskInsertedAts[i], Type: sqlcv1.V1PayloadTypeTASKINPUT, + ExternalId: sqlchelpers.UUIDFromStr(task.ExternalId), Payload: input, TenantId: tenantId, } @@ -2699,6 +2701,7 @@ func (r *sharedRepository) createTaskEvents( storePayloadOpts[i] = StorePayloadOpts{ Id: taskEvent.ID, InsertedAt: taskEvent.InsertedAt, + ExternalId: taskEvent.ExternalID, Type: sqlcv1.V1PayloadTypeTASKEVENTDATA, Payload: data, TenantId: tenantId, @@ -2869,7 +2872,7 @@ func (r *TaskRepositoryImpl) ReplayTasks(ctx context.Context, tenantId string, t } } - payloads, err := r.payloadStore.BulkRetrieve(ctx, retrieveOpts...) + payloads, err := r.payloadStore.Retrieve(ctx, retrieveOpts...) if err != nil { return nil, fmt.Errorf("failed to bulk retrieve task inputs: %w", err) @@ -3477,7 +3480,7 @@ func (r *TaskRepositoryImpl) ListTaskParentOutputs(ctx context.Context, tenantId retrieveOptToPayload[opt] = outputTask.Output } - payloads, err := r.payloadStore.BulkRetrieve(ctx, retrieveOpts...) + payloads, err := r.payloadStore.Retrieve(ctx, retrieveOpts...) if err != nil { return nil, fmt.Errorf("failed to retrieve task output payloads: %w", err) @@ -3553,7 +3556,7 @@ func (r *TaskRepositoryImpl) ListSignalCompletedEvents(ctx context.Context, tena retrieveOpts[i] = retrieveOpt } - payloads, err := r.payloadStore.BulkRetrieve(ctx, retrieveOpts...) + payloads, err := r.payloadStore.Retrieve(ctx, retrieveOpts...) if err != nil { return nil, fmt.Errorf("failed to retrieve task event payloads: %w", err) diff --git a/pkg/repository/v1/trigger.go b/pkg/repository/v1/trigger.go index 4e4ac0136..713746d32 100644 --- a/pkg/repository/v1/trigger.go +++ b/pkg/repository/v1/trigger.go @@ -1257,6 +1257,7 @@ func (r *TriggerRepositoryImpl) triggerWorkflows(ctx context.Context, tenantId s storePayloadOpts = append(storePayloadOpts, StorePayloadOpts{ Id: task.ID, InsertedAt: task.InsertedAt, + ExternalId: task.ExternalID, Type: sqlcv1.V1PayloadTypeTASKINPUT, Payload: task.Payload, TenantId: tenantId, @@ -1267,6 +1268,7 @@ func (r *TriggerRepositoryImpl) triggerWorkflows(ctx context.Context, tenantId s storePayloadOpts = append(storePayloadOpts, StorePayloadOpts{ Id: dag.ID, InsertedAt: dag.InsertedAt, + ExternalId: dag.ExternalID, Type: sqlcv1.V1PayloadTypeDAGINPUT, Payload: dag.Input, TenantId: tenantId, @@ -1510,7 +1512,7 @@ func (r *TriggerRepositoryImpl) registerChildWorkflows( } } - payloads, err := r.payloadStore.BulkRetrieve(ctx, retrievePayloadOpts...) + payloads, err := r.payloadStore.Retrieve(ctx, retrievePayloadOpts...) if err != nil { return nil, fmt.Errorf("failed to retrieve payloads for signal created events: %w", err) diff --git a/sdks/python/CHANGELOG.md b/sdks/python/CHANGELOG.md index 551a7f4e0..4f282d44d 100644 --- a/sdks/python/CHANGELOG.md +++ b/sdks/python/CHANGELOG.md @@ -5,6 +5,12 @@ All notable changes to Hatchet's Python SDK will be documented in this changelog The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+## [1.20.2] - 2025-10-15 + +### Added + +- Adds an `include_payloads` parameter to the `list` methods on the runs client (defaults to true, so no change in behavior). + ## [1.20.1] - 2025-10-14 ### Added diff --git a/sdks/python/examples/concurrency_workflow_level/worker.py b/sdks/python/examples/concurrency_workflow_level/worker.py index fb63e6e1a..a1d368608 100644 --- a/sdks/python/examples/concurrency_workflow_level/worker.py +++ b/sdks/python/examples/concurrency_workflow_level/worker.py @@ -23,7 +23,7 @@ class WorkflowInput(BaseModel): concurrency_workflow_level_workflow = hatchet.workflow( - name="ConcurrencyWorkflowManyKeys", + name="ConcurrencyWorkflowLevel", input_validator=WorkflowInput, concurrency=[ ConcurrencyExpression( diff --git a/sdks/python/examples/cron/programatic-async.py b/sdks/python/examples/cron/programatic-async.py index 3985be8aa..6f50b3762 100644 --- a/sdks/python/examples/cron/programatic-async.py +++ b/sdks/python/examples/cron/programatic-async.py @@ -11,7 +11,7 @@ class DynamicCronInput(BaseModel): async def create_cron() -> None: dynamic_cron_workflow = hatchet.workflow( - name="CronWorkflow", input_validator=DynamicCronInput + name="DynamicCronWorkflow", input_validator=DynamicCronInput ) # > Create diff --git a/sdks/python/examples/cron/programatic-sync.py b/sdks/python/examples/cron/programatic-sync.py index 0c5e12b16..4e51b6cbe 100644 --- a/sdks/python/examples/cron/programatic-sync.py +++ b/sdks/python/examples/cron/programatic-sync.py @@ -10,7 +10,7 @@ class DynamicCronInput(BaseModel): dynamic_cron_workflow = hatchet.workflow( - name="CronWorkflow", input_validator=DynamicCronInput + name="DynamicCronWorkflow", input_validator=DynamicCronInput ) # > Create diff --git a/sdks/python/hatchet_sdk/features/runs.py b/sdks/python/hatchet_sdk/features/runs.py index aae80501d..d98f8bd8f 100644 --- a/sdks/python/hatchet_sdk/features/runs.py +++ b/sdks/python/hatchet_sdk/features/runs.py @@ -405,6 +405,7 @@ class RunsClient(BaseRestClient): worker_id: str | None = None, parent_task_external_id: str | None = None, triggering_event_external_id: str | None = None, + include_payloads: bool = True, ) -> list[V1TaskSummary]: """ List task runs according to a set of filters, paginating through days @@ -420,6 +421,7 @@ class RunsClient(BaseRestClient): :param worker_id: The worker ID to filter task runs by. :param parent_task_external_id: The parent task external ID to filter task runs by. :param triggering_event_external_id: The event id that triggered the task run.
+ :param include_payloads: Whether to include payloads in the response. :return: A list of task runs matching the specified filters. """ @@ -517,6 +522,7 @@ class RunsClient(BaseRestClient): worker_id=worker_id, parent_task_external_id=parent_task_external_id, triggering_event_external_id=triggering_event_external_id, + include_payloads=include_payloads, ) for s, u in date_ranges ] @@ -550,6 +556,7 @@ class RunsClient(BaseRestClient): worker_id: str | None = None, parent_task_external_id: str | None = None, triggering_event_external_id: str | None = None, + include_payloads: bool = True, ) -> V1TaskSummaryList: """ List task runs according to a set of filters. @@ -565,6 +572,7 @@ class RunsClient(BaseRestClient): :param worker_id: The worker ID to filter task runs by. :param parent_task_external_id: The parent task external ID to filter task runs by. :param triggering_event_external_id: The event id that triggered the task run. + :param include_payloads: Whether to include payloads in the response. :return: A list of task runs matching the specified filters. """ @@ -581,6 +589,7 @@ class RunsClient(BaseRestClient): worker_id=worker_id, parent_task_external_id=parent_task_external_id, triggering_event_external_id=triggering_event_external_id, + include_payloads=include_payloads, ) @retry @@ -597,6 +606,7 @@ class RunsClient(BaseRestClient): worker_id: str | None = None, parent_task_external_id: str | None = None, triggering_event_external_id: str | None = None, + include_payloads: bool = True, ) -> V1TaskSummaryList: """ List task runs according to a set of filters. @@ -612,6 +622,7 @@ class RunsClient(BaseRestClient): :param worker_id: The worker ID to filter task runs by. :param parent_task_external_id: The parent task external ID to filter task runs by. :param triggering_event_external_id: The event id that triggered the task run. + :param include_payloads: Whether to include payloads in the response. :return: A list of task runs matching the specified filters. 
""" @@ -643,6 +654,7 @@ class RunsClient(BaseRestClient): worker_id=worker_id, parent_task_external_id=parent_task_external_id, triggering_event_external_id=triggering_event_external_id, + include_payloads=include_payloads, ) def create( diff --git a/sql/schema/v1-core.sql b/sql/schema/v1-core.sql index a20bb1921..31d53afb9 100644 --- a/sql/schema/v1-core.sql +++ b/sql/schema/v1-core.sql @@ -1641,12 +1641,15 @@ CREATE TABLE v1_durable_sleep ( ); CREATE TYPE v1_payload_type AS ENUM ('TASK_INPUT', 'DAG_INPUT', 'TASK_OUTPUT', 'TASK_EVENT_DATA'); + +-- IMPORTANT: Keep these values in sync with `v1_payload_type_olap` in the OLAP db CREATE TYPE v1_payload_location AS ENUM ('INLINE', 'EXTERNAL'); CREATE TABLE v1_payload ( tenant_id UUID NOT NULL, id BIGINT NOT NULL, inserted_at TIMESTAMPTZ NOT NULL, + external_id UUID, type v1_payload_type NOT NULL, location v1_payload_location NOT NULL, external_location_key TEXT, diff --git a/sql/schema/v1-olap.sql b/sql/schema/v1-olap.sql index 2b6fb7fae..600d3799a 100644 --- a/sql/schema/v1-olap.sql +++ b/sql/schema/v1-olap.sql @@ -318,6 +318,7 @@ CREATE TABLE v1_task_events_olap ( tenant_id UUID NOT NULL, id bigint GENERATED ALWAYS AS IDENTITY, inserted_at TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP, + external_id UUID, task_id BIGINT NOT NULL, task_inserted_at TIMESTAMPTZ NOT NULL, event_type v1_event_type_olap NOT NULL, @@ -354,6 +355,27 @@ CREATE TABLE v1_incoming_webhook_validation_failures_olap ( CREATE INDEX v1_incoming_webhook_validation_failures_olap_tenant_id_incoming_webhook_name_idx ON v1_incoming_webhook_validation_failures_olap (tenant_id, incoming_webhook_name); +-- IMPORTANT: Keep these values in sync with `v1_payload_type` in the core db +CREATE TYPE v1_payload_location_olap AS ENUM ('INLINE', 'EXTERNAL'); + +CREATE TABLE v1_payloads_olap ( + tenant_id UUID NOT NULL, + external_id UUID NOT NULL, + + location v1_payload_location_olap NOT NULL, + external_location_key TEXT, + inline_content JSONB, + + inserted_at TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP, + + PRIMARY KEY (tenant_id, external_id, inserted_at), + CHECK ( + location = 'INLINE' + OR + (location = 'EXTERNAL' AND inline_content IS NULL AND external_location_key IS NOT NULL) + ) +) PARTITION BY RANGE(inserted_at); -- this is a hash-partitioned table on the dag_id, so that we can process batches of events in parallel -- without needing to place conflicting locks on dags.