Mirror of https://github.com/hatchet-dev/hatchet.git (synced 2026-02-14 04:09:11 -06:00)

Add telemetry around task statuses in controller (#2090)

* add telemetry around task statuses in controller
* fixes
* more fixes
This commit is contained in:
@@ -8,6 +8,7 @@ import (
 
 	"github.com/google/uuid"
 	"github.com/rs/zerolog"
+	"go.opentelemetry.io/otel/codes"
 	"golang.org/x/sync/errgroup"
 
 	"github.com/hatchet-dev/hatchet/internal/cache"
@@ -107,7 +108,18 @@ func (p *PostgresMessageQueue) SetQOS(prefetchCount int) {
 }
 
 func (p *PostgresMessageQueue) SendMessage(ctx context.Context, queue msgqueue.Queue, task *msgqueue.Message) error {
-	return p.addMessage(ctx, queue, task)
+	ctx, span := telemetry.NewSpan(ctx, "PostgresMessageQueue.SendMessage")
+	defer span.End()
+
+	err := p.addMessage(ctx, queue, task)
+
+	if err != nil {
+		span.RecordError(err)
+		span.SetStatus(codes.Error, "error adding message")
+		return err
+	}
+
+	return nil
 }
 
 func (p *PostgresMessageQueue) addMessage(ctx context.Context, queue msgqueue.Queue, task *msgqueue.Message) error {
|
||||
@@ -12,6 +12,7 @@ import (
 	lru "github.com/hashicorp/golang-lru/v2"
 	amqp "github.com/rabbitmq/amqp091-go"
 	"github.com/rs/zerolog"
+	"go.opentelemetry.io/otel/codes"
 
 	msgqueue "github.com/hatchet-dev/hatchet/internal/msgqueue/v1"
 	"github.com/hatchet-dev/hatchet/internal/queueutils"
@@ -172,7 +173,18 @@ func (t *MessageQueueImpl) SetQOS(prefetchCount int) {
 }
 
 func (t *MessageQueueImpl) SendMessage(ctx context.Context, q msgqueue.Queue, msg *msgqueue.Message) error {
-	return t.pubMessage(ctx, q, msg)
+	ctx, span := telemetry.NewSpan(ctx, "MessageQueueImpl.SendMessage")
+	defer span.End()
+
+	err := t.pubMessage(ctx, q, msg)
+
+	if err != nil {
+		span.RecordError(err)
+		span.SetStatus(codes.Error, "error publishing message")
+		return err
+	}
+
+	return nil
 }
 
 func (t *MessageQueueImpl) pubMessage(ctx context.Context, q msgqueue.Queue, msg *msgqueue.Message) error {
|
||||
@@ -408,6 +408,9 @@ func (tc *TasksControllerImpl) handleBufferedMsgs(tenantId, msgId string, payloa
 }
 
 func (tc *TasksControllerImpl) handleTaskCompleted(ctx context.Context, tenantId string, payloads [][]byte) error {
+	ctx, span := telemetry.NewSpan(ctx, "TasksControllerImpl.handleTaskCompleted")
+	defer span.End()
+
 	opts := make([]v1.CompleteTaskOpts, 0)
 	idsToData := make(map[int64][]byte)
 
@@ -429,6 +432,9 @@ func (tc *TasksControllerImpl) handleTaskCompleted(ctx context.Context, tenantId
 	res, err := tc.repov1.Tasks().CompleteTasks(ctx, tenantId, opts)
 
 	if err != nil {
+		err = fmt.Errorf("could not complete tasks: %w", err)
+		span.RecordError(err)
+		span.SetStatus(codes.Error, "could not complete tasks")
 		return err
 	}
 
@@ -444,6 +450,9 @@ func (tc *TasksControllerImpl) handleTaskCompleted(ctx context.Context, tenantId
 }
 
 func (tc *TasksControllerImpl) handleTaskFailed(ctx context.Context, tenantId string, payloads [][]byte) error {
+	ctx, span := telemetry.NewSpan(ctx, "TasksControllerImpl.handleTaskFailed")
+	defer span.End()
+
 	opts := make([]v1.FailTaskOpts, 0)
 
 	msgs := msgqueue.JSONConvert[tasktypes.FailedTaskPayload](payloads)
@@ -479,13 +488,19 @@ func (tc *TasksControllerImpl) handleTaskFailed(ctx context.Context, tenantId st
 
 		if err != nil {
 			tc.l.Error().Err(err).Msg("could not create monitoring event message")
+			err = fmt.Errorf("could not create monitoring event message: %w", err)
+			span.RecordError(err)
+			span.SetStatus(codes.Error, "could not create monitoring event message")
 			continue
 		}
 
 		err = tc.pubBuffer.Pub(ctx, msgqueue.OLAP_QUEUE, olapMsg, false)
 
 		if err != nil {
-			tc.l.Error().Err(err).Msg("could not create monitoring event message")
+			tc.l.Error().Err(err).Msg("could not publish monitoring event message")
+			err = fmt.Errorf("could not publish monitoring event message: %w", err)
+			span.RecordError(err)
+			span.SetStatus(codes.Error, "could not publish monitoring event message")
 			continue
 		}
 	}
@@ -493,13 +508,28 @@ func (tc *TasksControllerImpl) handleTaskFailed(ctx context.Context, tenantId st
 	res, err := tc.repov1.Tasks().FailTasks(ctx, tenantId, opts)
 
 	if err != nil {
+		err = fmt.Errorf("could not fail tasks: %w", err)
+		span.RecordError(err)
+		span.SetStatus(codes.Error, "could not fail tasks")
 		return err
 	}
 
-	return tc.processFailTasksResponse(ctx, tenantId, res)
+	err = tc.processFailTasksResponse(ctx, tenantId, res)
+
+	if err != nil {
+		err = fmt.Errorf("could not process fail tasks response: %w", err)
+		span.RecordError(err)
+		span.SetStatus(codes.Error, "could not process fail tasks response")
+		return err
+	}
+
+	return nil
 }
 
 func (tc *TasksControllerImpl) processFailTasksResponse(ctx context.Context, tenantId string, res *v1.FailTasksResponse) error {
+	ctx, span := telemetry.NewSpan(ctx, "TasksControllerImpl.processFailTasksResponse")
+	defer span.End()
+
 	retriedTaskIds := make(map[int64]struct{})
 
 	for _, task := range res.RetriedTasks {
@@ -527,6 +557,9 @@ func (tc *TasksControllerImpl) processFailTasksResponse(ctx context.Context, ten
 	err := tc.sendInternalEvents(ctx, tenantId, internalEventsWithoutRetries)
 
 	if err != nil {
+		err = fmt.Errorf("could not send internal events: %w", err)
+		span.RecordError(err)
+		span.SetStatus(codes.Error, "could not send internal events")
 		return err
 	}
 
@@ -538,7 +571,10 @@ func (tc *TasksControllerImpl) processFailTasksResponse(ctx context.Context, ten
 			err = tc.pubRetryEvent(ctx, tenantId, task)
 
 			if err != nil {
-				outerErr = multierror.Append(outerErr, fmt.Errorf("could not publish retry event: %w", err))
+				err = fmt.Errorf("could not publish retry event: %w", err)
+				span.RecordError(err)
+				span.SetStatus(codes.Error, "could not publish retry event")
+				outerErr = multierror.Append(outerErr, err)
 			}
 		}
 	}
@@ -547,6 +583,9 @@ func (tc *TasksControllerImpl) processFailTasksResponse(ctx context.Context, ten
 }
 
 func (tc *TasksControllerImpl) handleTaskCancelled(ctx context.Context, tenantId string, payloads [][]byte) error {
+	ctx, span := telemetry.NewSpan(ctx, "TasksControllerImpl.handleTaskCancelled")
+	defer span.End()
+
 	opts := make([]v1.TaskIdInsertedAtRetryCount, 0)
 
 	msgs := msgqueue.JSONConvert[tasktypes.CancelledTaskPayload](payloads)
@@ -565,6 +604,9 @@ func (tc *TasksControllerImpl) handleTaskCancelled(ctx context.Context, tenantId
 	res, err := tc.repov1.Tasks().CancelTasks(ctx, tenantId, opts)
 
 	if err != nil {
+		err = fmt.Errorf("could not cancel tasks: %w", err)
+		span.RecordError(err)
+		span.SetStatus(codes.Error, "could not cancel tasks")
 		return err
 	}
 
@@ -584,7 +626,10 @@ func (tc *TasksControllerImpl) handleTaskCancelled(ctx context.Context, tenantId
 	err = tc.sendTaskCancellationsToDispatcher(ctx, tenantId, tasksToSendToDispatcher)
 
 	if err != nil {
-		return fmt.Errorf("could not send task cancellations to dispatcher: %w", err)
+		err = fmt.Errorf("could not send task cancellations to dispatcher: %w", err)
+		span.RecordError(err)
+		span.SetStatus(codes.Error, "could not send task cancellations to dispatcher")
+		return err
 	}
 
 	tc.notifyQueuesOnCompletion(ctx, tenantId, res.ReleasedTasks)
@@ -593,6 +638,9 @@ func (tc *TasksControllerImpl) handleTaskCancelled(ctx context.Context, tenantId
 	err = tc.sendInternalEvents(ctx, tenantId, res.InternalEvents)
 
 	if err != nil {
+		err = fmt.Errorf("could not send internal events: %w", err)
+		span.RecordError(err)
+		span.SetStatus(codes.Error, "could not send internal events")
 		return err
 	}
 
@@ -613,7 +661,11 @@ func (tc *TasksControllerImpl) handleTaskCancelled(ctx context.Context, tenantId
 		)
 
 		if err != nil {
-			outerErr = multierror.Append(outerErr, fmt.Errorf("could not create monitoring event message: %w", err))
+			tc.l.Error().Err(err).Msg("could not create monitoring event message")
+			err = fmt.Errorf("could not create monitoring event message: %w", err)
+			span.RecordError(err)
+			span.SetStatus(codes.Error, "could not create monitoring event message")
+			outerErr = multierror.Append(outerErr, err)
 			continue
 		}
 
@@ -625,7 +677,11 @@ func (tc *TasksControllerImpl) handleTaskCancelled(ctx context.Context, tenantId
 		)
 
 		if err != nil {
-			outerErr = multierror.Append(outerErr, fmt.Errorf("could not publish monitoring event message: %w", err))
+			tc.l.Error().Err(err).Msg("could not publish monitoring event message")
+			err = fmt.Errorf("could not publish monitoring event message: %w", err)
+			span.RecordError(err)
+			span.SetStatus(codes.Error, "could not publish monitoring event message")
+			outerErr = multierror.Append(outerErr, err)
 		}
 	}
 
@@ -1053,6 +1109,9 @@ func (tc *TasksControllerImpl) handleProcessTaskTrigger(ctx context.Context, ten
 }
 
 func (tc *TasksControllerImpl) sendInternalEvents(ctx context.Context, tenantId string, events []v1.InternalTaskEvent) error {
+	ctx, span := telemetry.NewSpan(ctx, "TasksControllerImpl.sendInternalEvents")
+	defer span.End()
+
 	if len(events) == 0 {
 		return nil
 	}
@@ -1060,7 +1119,10 @@ func (tc *TasksControllerImpl) sendInternalEvents(ctx context.Context, tenantId
 	msg, err := tasktypes.NewInternalEventMessage(tenantId, time.Now(), events...)
 
 	if err != nil {
-		return fmt.Errorf("could not create internal event message: %w", err)
+		err = fmt.Errorf("could not create internal event message: %w", err)
+		span.RecordError(err)
+		span.SetStatus(codes.Error, "could not create internal event message")
+		return err
 	}
 
 	return tc.mq.SendMessage(
|
||||
Reference in New Issue
Block a user