Add telemetry around task statuses in controller (#2090)

* add telemetry around task statuses in controller

* fixes

* more fixes
This commit is contained in:
Mohammed Nafees
2025-08-06 14:41:54 +02:00
committed by GitHub
parent 67e8e89f95
commit 89e6d00a8f
3 changed files with 95 additions and 9 deletions

View File

@@ -8,6 +8,7 @@ import (
"github.com/google/uuid"
"github.com/rs/zerolog"
"go.opentelemetry.io/otel/codes"
"golang.org/x/sync/errgroup"
"github.com/hatchet-dev/hatchet/internal/cache"
@@ -107,7 +108,18 @@ func (p *PostgresMessageQueue) SetQOS(prefetchCount int) {
}
func (p *PostgresMessageQueue) SendMessage(ctx context.Context, queue msgqueue.Queue, task *msgqueue.Message) error {
return p.addMessage(ctx, queue, task)
ctx, span := telemetry.NewSpan(ctx, "PostgresMessageQueue.SendMessage")
defer span.End()
err := p.addMessage(ctx, queue, task)
if err != nil {
span.RecordError(err)
span.SetStatus(codes.Error, "error adding message")
return err
}
return nil
}
func (p *PostgresMessageQueue) addMessage(ctx context.Context, queue msgqueue.Queue, task *msgqueue.Message) error {

View File

@@ -12,6 +12,7 @@ import (
lru "github.com/hashicorp/golang-lru/v2"
amqp "github.com/rabbitmq/amqp091-go"
"github.com/rs/zerolog"
"go.opentelemetry.io/otel/codes"
msgqueue "github.com/hatchet-dev/hatchet/internal/msgqueue/v1"
"github.com/hatchet-dev/hatchet/internal/queueutils"
@@ -172,7 +173,18 @@ func (t *MessageQueueImpl) SetQOS(prefetchCount int) {
}
func (t *MessageQueueImpl) SendMessage(ctx context.Context, q msgqueue.Queue, msg *msgqueue.Message) error {
return t.pubMessage(ctx, q, msg)
ctx, span := telemetry.NewSpan(ctx, "MessageQueueImpl.SendMessage")
defer span.End()
err := t.pubMessage(ctx, q, msg)
if err != nil {
span.RecordError(err)
span.SetStatus(codes.Error, "error publishing message")
return err
}
return nil
}
func (t *MessageQueueImpl) pubMessage(ctx context.Context, q msgqueue.Queue, msg *msgqueue.Message) error {

View File

@@ -408,6 +408,9 @@ func (tc *TasksControllerImpl) handleBufferedMsgs(tenantId, msgId string, payloa
}
func (tc *TasksControllerImpl) handleTaskCompleted(ctx context.Context, tenantId string, payloads [][]byte) error {
ctx, span := telemetry.NewSpan(ctx, "TasksControllerImpl.handleTaskCompleted")
defer span.End()
opts := make([]v1.CompleteTaskOpts, 0)
idsToData := make(map[int64][]byte)
@@ -429,6 +432,9 @@ func (tc *TasksControllerImpl) handleTaskCompleted(ctx context.Context, tenantId
res, err := tc.repov1.Tasks().CompleteTasks(ctx, tenantId, opts)
if err != nil {
err = fmt.Errorf("could not complete tasks: %w", err)
span.RecordError(err)
span.SetStatus(codes.Error, "could not complete tasks")
return err
}
@@ -444,6 +450,9 @@ func (tc *TasksControllerImpl) handleTaskCompleted(ctx context.Context, tenantId
}
func (tc *TasksControllerImpl) handleTaskFailed(ctx context.Context, tenantId string, payloads [][]byte) error {
ctx, span := telemetry.NewSpan(ctx, "TasksControllerImpl.handleTaskFailed")
defer span.End()
opts := make([]v1.FailTaskOpts, 0)
msgs := msgqueue.JSONConvert[tasktypes.FailedTaskPayload](payloads)
@@ -479,13 +488,19 @@ func (tc *TasksControllerImpl) handleTaskFailed(ctx context.Context, tenantId st
if err != nil {
tc.l.Error().Err(err).Msg("could not create monitoring event message")
err = fmt.Errorf("could not create monitoring event message: %w", err)
span.RecordError(err)
span.SetStatus(codes.Error, "could not create monitoring event message")
continue
}
err = tc.pubBuffer.Pub(ctx, msgqueue.OLAP_QUEUE, olapMsg, false)
if err != nil {
tc.l.Error().Err(err).Msg("could not create monitoring event message")
tc.l.Error().Err(err).Msg("could not publish monitoring event message")
err = fmt.Errorf("could not publish monitoring event message: %w", err)
span.RecordError(err)
span.SetStatus(codes.Error, "could not publish monitoring event message")
continue
}
}
@@ -493,13 +508,28 @@ func (tc *TasksControllerImpl) handleTaskFailed(ctx context.Context, tenantId st
res, err := tc.repov1.Tasks().FailTasks(ctx, tenantId, opts)
if err != nil {
err = fmt.Errorf("could not fail tasks: %w", err)
span.RecordError(err)
span.SetStatus(codes.Error, "could not fail tasks")
return err
}
return tc.processFailTasksResponse(ctx, tenantId, res)
err = tc.processFailTasksResponse(ctx, tenantId, res)
if err != nil {
err = fmt.Errorf("could not process fail tasks response: %w", err)
span.RecordError(err)
span.SetStatus(codes.Error, "could not process fail tasks response")
return err
}
return nil
}
func (tc *TasksControllerImpl) processFailTasksResponse(ctx context.Context, tenantId string, res *v1.FailTasksResponse) error {
ctx, span := telemetry.NewSpan(ctx, "TasksControllerImpl.processFailTasksResponse")
defer span.End()
retriedTaskIds := make(map[int64]struct{})
for _, task := range res.RetriedTasks {
@@ -527,6 +557,9 @@ func (tc *TasksControllerImpl) processFailTasksResponse(ctx context.Context, ten
err := tc.sendInternalEvents(ctx, tenantId, internalEventsWithoutRetries)
if err != nil {
err = fmt.Errorf("could not send internal events: %w", err)
span.RecordError(err)
span.SetStatus(codes.Error, "could not send internal events")
return err
}
@@ -538,7 +571,10 @@ func (tc *TasksControllerImpl) processFailTasksResponse(ctx context.Context, ten
err = tc.pubRetryEvent(ctx, tenantId, task)
if err != nil {
outerErr = multierror.Append(outerErr, fmt.Errorf("could not publish retry event: %w", err))
err = fmt.Errorf("could not publish retry event: %w", err)
span.RecordError(err)
span.SetStatus(codes.Error, "could not publish retry event")
outerErr = multierror.Append(outerErr, err)
}
}
}
@@ -547,6 +583,9 @@ func (tc *TasksControllerImpl) processFailTasksResponse(ctx context.Context, ten
}
func (tc *TasksControllerImpl) handleTaskCancelled(ctx context.Context, tenantId string, payloads [][]byte) error {
ctx, span := telemetry.NewSpan(ctx, "TasksControllerImpl.handleTaskCancelled")
defer span.End()
opts := make([]v1.TaskIdInsertedAtRetryCount, 0)
msgs := msgqueue.JSONConvert[tasktypes.CancelledTaskPayload](payloads)
@@ -565,6 +604,9 @@ func (tc *TasksControllerImpl) handleTaskCancelled(ctx context.Context, tenantId
res, err := tc.repov1.Tasks().CancelTasks(ctx, tenantId, opts)
if err != nil {
err = fmt.Errorf("could not cancel tasks: %w", err)
span.RecordError(err)
span.SetStatus(codes.Error, "could not cancel tasks")
return err
}
@@ -584,7 +626,10 @@ func (tc *TasksControllerImpl) handleTaskCancelled(ctx context.Context, tenantId
err = tc.sendTaskCancellationsToDispatcher(ctx, tenantId, tasksToSendToDispatcher)
if err != nil {
return fmt.Errorf("could not send task cancellations to dispatcher: %w", err)
err = fmt.Errorf("could not send task cancellations to dispatcher: %w", err)
span.RecordError(err)
span.SetStatus(codes.Error, "could not send task cancellations to dispatcher")
return err
}
tc.notifyQueuesOnCompletion(ctx, tenantId, res.ReleasedTasks)
@@ -593,6 +638,9 @@ func (tc *TasksControllerImpl) handleTaskCancelled(ctx context.Context, tenantId
err = tc.sendInternalEvents(ctx, tenantId, res.InternalEvents)
if err != nil {
err = fmt.Errorf("could not send internal events: %w", err)
span.RecordError(err)
span.SetStatus(codes.Error, "could not send internal events")
return err
}
@@ -613,7 +661,11 @@ func (tc *TasksControllerImpl) handleTaskCancelled(ctx context.Context, tenantId
)
if err != nil {
outerErr = multierror.Append(outerErr, fmt.Errorf("could not create monitoring event message: %w", err))
tc.l.Error().Err(err).Msg("could not create monitoring event message")
err = fmt.Errorf("could not create monitoring event message: %w", err)
span.RecordError(err)
span.SetStatus(codes.Error, "could not create monitoring event message")
outerErr = multierror.Append(outerErr, err)
continue
}
@@ -625,7 +677,11 @@ func (tc *TasksControllerImpl) handleTaskCancelled(ctx context.Context, tenantId
)
if err != nil {
outerErr = multierror.Append(outerErr, fmt.Errorf("could not publish monitoring event message: %w", err))
tc.l.Error().Err(err).Msg("could not publish monitoring event message")
err = fmt.Errorf("could not publish monitoring event message: %w", err)
span.RecordError(err)
span.SetStatus(codes.Error, "could not publish monitoring event message")
outerErr = multierror.Append(outerErr, err)
}
}
@@ -1053,6 +1109,9 @@ func (tc *TasksControllerImpl) handleProcessTaskTrigger(ctx context.Context, ten
}
func (tc *TasksControllerImpl) sendInternalEvents(ctx context.Context, tenantId string, events []v1.InternalTaskEvent) error {
ctx, span := telemetry.NewSpan(ctx, "TasksControllerImpl.sendInternalEvents")
defer span.End()
if len(events) == 0 {
return nil
}
@@ -1060,7 +1119,10 @@ func (tc *TasksControllerImpl) sendInternalEvents(ctx context.Context, tenantId
msg, err := tasktypes.NewInternalEventMessage(tenantId, time.Now(), events...)
if err != nil {
return fmt.Errorf("could not create internal event message: %w", err)
err = fmt.Errorf("could not create internal event message: %w", err)
span.RecordError(err)
span.SetStatus(codes.Error, "could not create internal event message")
return err
}
return tc.mq.SendMessage(