Files
hatchet/api/v1/server/handlers/monitoring/probe.go
T
2024-12-18 09:00:54 -05:00

318 lines
7.8 KiB
Go

package monitoring
import (
"context"
"fmt"
"net/http"
"slices"
"strings"
"time"
"github.com/google/uuid"
"github.com/labstack/echo/v4"
"golang.org/x/exp/rand"
"github.com/hatchet-dev/hatchet/api/v1/server/oas/gen"
"github.com/hatchet-dev/hatchet/pkg/client"
clientconfig "github.com/hatchet-dev/hatchet/pkg/config/client"
"github.com/hatchet-dev/hatchet/pkg/config/shared"
"github.com/hatchet-dev/hatchet/pkg/repository/prisma/db"
"github.com/hatchet-dev/hatchet/pkg/repository/prisma/dbsqlc"
"github.com/hatchet-dev/hatchet/pkg/repository/prisma/sqlchelpers"
"github.com/hatchet-dev/hatchet/pkg/worker"
)
func (m *MonitoringService) MonitoringPostRunProbe(ctx echo.Context, request gen.MonitoringPostRunProbeRequestObject) (gen.MonitoringPostRunProbeResponseObject, error) {
if !m.enabled {
m.l.Error().Msg("monitoring is not enabled")
return gen.MonitoringPostRunProbe403JSONResponse{}, nil
}
tenant := ctx.Get("tenant").(*db.TenantModel)
if !slices.Contains[[]string](m.permittedTenants, tenant.ID) {
err := fmt.Errorf("tenant is not a monitoring tenant for this instance")
if len(m.permittedTenants) > 0 {
m.l.Error().Err(err).Msgf("monitoring tenants are %v", m.permittedTenants)
} else {
m.l.Error().Err(err).Msg("no monitoring tenants are configured")
}
return gen.MonitoringPostRunProbe403JSONResponse{}, nil
}
token, err := getBearerTokenFromRequest(ctx.Request())
if err != nil {
return nil, err
}
events := make(chan string, 50)
stepChan := make(chan string, 50)
errorChan := make(chan error, 50)
cancellableContext, cancel := context.WithTimeout(ctx.Request().Context(), m.probeTimeout)
defer cancel()
cf := clientconfig.ClientConfigFile{
Token: token,
TenantId: tenant.ID,
Namespace: randomNamespace(),
TLS: clientconfig.ClientTLSConfigFile{
Base: shared.TLSConfigFile{
TLSRootCAFile: m.tlsRootCAFile,
},
},
}
cleanup, err := m.run(cancellableContext, cf, m.workflowName, events, stepChan, errorChan)
if err != nil {
m.l.Error().Msgf("error running probe: %s", err)
return nil, err
}
defer cleanup()
// Stream events are not necessarily received in order so we don't distinguish between them
messages := []string{"This is a stream event", "This is a stream event"}
messageIndex := 0
stepMessages := []string{"step-one", "step-two"}
for {
select {
case <-cancellableContext.Done():
if cancellableContext.Err() == context.DeadlineExceeded {
m.l.Error().Msg("timed out waiting for probe to complete")
return nil, fmt.Errorf("timed out waiting for probe to complete")
}
case err := <-errorChan:
m.l.Error().Msgf("error during probe: %s", err)
return nil, err
case e := <-events:
if !strings.HasPrefix(e, messages[messageIndex]) {
return nil, fmt.Errorf("expected message %s, to start with %s", messages[messageIndex], e)
}
messageIndex++
if messageIndex == len(messages) {
for i := range stepMessages {
stepMessage := <-stepChan
if stepMessage != stepMessages[i] {
return nil, fmt.Errorf("probe did not complete successfully - step messages failed")
}
}
return nil, nil
}
}
}
}
type probeEvent struct {
UniqueStreamId string
}
type stepOneOutput struct {
Message string `json:"message"`
UniqueStreamId string
}
func (m *MonitoringService) run(ctx context.Context, cf clientconfig.ClientConfigFile, workflowName string, events chan<- string, stepChan chan<- string, errors chan<- error) (func(), error) {
c, err := client.NewFromConfigFile(
&cf, client.WithLogLevel(m.l.GetLevel().String()),
)
if err != nil {
return nil, fmt.Errorf("error creating client: %w", err)
}
w, err := worker.NewWorker(
worker.WithClient(
c,
), worker.WithLogLevel(m.l.GetLevel().String()),
)
if err != nil {
return nil, fmt.Errorf("error creating worker: %w", err)
}
streamKey := "streamKey"
streamValue := fmt.Sprintf("stream-event-%d", rand.Intn(100)+1)
var wfrId string
err = w.RegisterWorkflow(
&worker.WorkflowJob{
On: worker.Events(m.eventName),
Name: workflowName,
Description: "This is part of the monitoring system for testing the readiness of this Hatchet installation.",
Steps: []*worker.WorkflowStep{
worker.Fn(func(ctx worker.HatchetContext) (result *stepOneOutput, err error) {
input := &probeEvent{}
err = ctx.WorkflowInput(input)
if err != nil {
return nil, err
}
wfrId = ctx.WorkflowRunId()
if input.UniqueStreamId == "" {
return nil, fmt.Errorf("uniqueStreamId is required")
}
if input.UniqueStreamId != streamValue {
return nil, fmt.Errorf("uniqueStreamId does not match stream value")
}
ctx.StreamEvent([]byte("This is a stream event for step-one"))
stepChan <- "step-one"
return &stepOneOutput{
Message: "This is a message for step-one",
UniqueStreamId: streamValue,
}, nil
},
).SetName("step-one"),
worker.Fn(func(ctx worker.HatchetContext) (result *stepOneOutput, err error) {
input := &probeEvent{}
err = ctx.StepOutput("step-one", input)
if err != nil {
return nil, err
}
if input.UniqueStreamId == "" {
return nil, fmt.Errorf("uniqueStreamId is required")
}
if input.UniqueStreamId != streamValue {
return nil, fmt.Errorf("uniqueStreamId does not match stream value")
}
ctx.StreamEvent([]byte("This is a stream event for step-two"))
stepChan <- "step-two"
return &stepOneOutput{
Message: "This is a message for step-two",
UniqueStreamId: streamValue,
}, nil
}).SetName("step-two").AddParents("step-one"),
},
},
)
if err != nil {
return nil, fmt.Errorf("error registering workflow: %w", err)
}
go func() {
err = c.Subscribe().StreamByAdditionalMetadata(ctx, streamKey, streamValue, func(event client.StreamEvent) error {
events <- string(event.Message)
return nil
})
if err != nil {
errors <- fmt.Errorf("error subscribing to stream: %w", err)
}
}()
go func() {
testEvent := probeEvent{
UniqueStreamId: streamValue,
}
err := c.Event().Push(
ctx,
m.eventName,
testEvent,
client.WithEventMetadata(map[string]string{
streamKey: streamValue,
},
),
)
if err != nil {
errors <- fmt.Errorf("error pushing event: %w", err)
}
}()
cleanupWorker, err := w.Start()
if err != nil {
return nil, fmt.Errorf("error starting worker: %w", err)
}
cleanup := func() {
err := cleanupWorker()
if err != nil {
m.l.Error().Msgf("error cleaning up worker: %s", err)
}
defer func() {
if wfrId == "" {
m.l.Warn().Msg("workflow run id was never set for probe")
return
}
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
for i := 0; i < 10; i++ {
wrfRow, err := m.config.APIRepository.WorkflowRun().GetWorkflowRunById(ctx, cf.TenantId, wfrId)
if err != nil {
m.l.Error().Msgf("error getting workflow run: %s", err)
}
if wrfRow.Status != dbsqlc.WorkflowRunStatusRUNNING {
workflowId := sqlchelpers.UUIDToStr(wrfRow.Workflow.ID)
_, err = m.config.APIRepository.Workflow().DeleteWorkflow(ctx, cf.TenantId, workflowId)
if err != nil {
m.l.Error().Msgf("error deleting workflow: %s", err)
} else {
m.l.Info().Msgf("deleted workflow %s", workflowId)
return
}
}
time.Sleep(200 * time.Millisecond)
}
m.l.Error().Msg("could not clean up workflow after 10 attempts")
}()
}
return cleanup, nil
}
func getBearerTokenFromRequest(r *http.Request) (string, error) {
reqToken := r.Header.Get("Authorization")
splitToken := strings.Split(reqToken, "Bearer")
if len(splitToken) != 2 {
return "", fmt.Errorf("invalid token")
}
reqToken = strings.TrimSpace(splitToken[1])
return reqToken, nil
}
func randomNamespace() string {
return "ns_" + uuid.New().String()[0:8]
}