Feat: Hatchet Metrics Monitoring, I (#2480)

* feat: queries + task methods for oldest running task and oldest task

* feat: worker slot and sdk metrics

* feat: wal metrics

* repository stub

* feat: add meter provider thingy

* pg queries

* fix: add task

* feat: repo methods for worker metrics

* feat: active workers query, fix where clauses

* fix: aliasing

* fix: sql, cleanup

* chore: cast

* feat: olap queries

* feat: olap queries

* feat: finish wiring up olap status update metrics

* chore: lint

* chore: lint

* fix: dupes, other code review comments

* send metrics to OTel collector

* last autovac

* flag

* logging updates

* address PR comments

---------

Co-authored-by: gabriel ruttner <gabriel.ruttner@gmail.com>
Co-authored-by: Mohammed Nafees <hello@mnafees.me>
This commit is contained in:
matt
2025-12-22 14:34:02 -05:00
committed by GitHub
parent a4e7584c18
commit fdc075ec6f
33 changed files with 2402 additions and 60 deletions
+236
View File
@@ -0,0 +1,236 @@
package telemetry
import (
"context"
"fmt"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/metric"
)
// MetricsRecorder provides a centralized way to record OTel metrics.
//
// It bundles the meter obtained in NewMetricsRecorder with one gauge
// instrument per metric. Each Record* method writes a single observation to
// exactly one of these gauges.
type MetricsRecorder struct {
// meter is the OTel meter from which every gauge below is created.
meter metric.Meter
// Database health metrics
// dbBloatGauge backs "hatchet.db.bloat.count".
dbBloatGauge metric.Int64Gauge
// dbBloatPercentGauge backs "hatchet.db.bloat.dead_tuple_percent".
dbBloatPercentGauge metric.Float64Gauge
// dbLongRunningQueriesGauge backs "hatchet.db.long_running_queries.count".
dbLongRunningQueriesGauge metric.Int64Gauge
// dbQueryCacheHitRatioGauge backs "hatchet.db.query_cache.hit_ratio".
dbQueryCacheHitRatioGauge metric.Float64Gauge
// dbLongRunningVacuumGauge backs "hatchet.db.long_running_vacuum.count".
dbLongRunningVacuumGauge metric.Int64Gauge
// dbLastAutovacuumSecondsSinceGauge backs "hatchet.db.last_autovacuum.seconds_since".
dbLastAutovacuumSecondsSinceGauge metric.Float64Gauge
// OLAP metrics
// olapTempTableSizeDAGGauge backs "hatchet.olap.temp_table_size.dag_status_updates".
olapTempTableSizeDAGGauge metric.Int64Gauge
// olapTempTableSizeTaskGauge backs "hatchet.olap.temp_table_size.task_status_updates".
olapTempTableSizeTaskGauge metric.Int64Gauge
// yesterdayRunCountGauge backs "hatchet.olap.yesterday_run_count".
yesterdayRunCountGauge metric.Int64Gauge
// Worker metrics
// activeSlotsGauge backs "hatchet.workers.active_slots".
activeSlotsGauge metric.Int64Gauge
// activeWorkersGauge backs "hatchet.workers.active_count".
activeWorkersGauge metric.Int64Gauge
// activeSDKsGauge backs "hatchet.workers.active_sdks".
activeSDKsGauge metric.Int64Gauge
}
// NewMetricsRecorder obtains the "hatchet.run/metrics" meter from the global
// OTel API and registers every gauge instrument used by MetricsRecorder.
//
// Instruments are created one at a time; the first registration failure
// aborts construction and is returned wrapped, with a nil recorder. The ctx
// argument is accepted for API symmetry but is not currently used.
func NewMetricsRecorder(ctx context.Context) (*MetricsRecorder, error) {
	var err error

	rec := &MetricsRecorder{meter: otel.Meter("hatchet.run/metrics")}
	mtr := rec.meter

	// Database health metrics
	if rec.dbBloatGauge, err = mtr.Int64Gauge(
		"hatchet.db.bloat.count",
		metric.WithDescription("Number of bloated tables detected in the database"),
	); err != nil {
		return nil, fmt.Errorf("failed to create db bloat gauge: %w", err)
	}

	if rec.dbBloatPercentGauge, err = mtr.Float64Gauge(
		"hatchet.db.bloat.dead_tuple_percent",
		metric.WithDescription("Percentage of dead tuples per table"),
	); err != nil {
		return nil, fmt.Errorf("failed to create db bloat percent gauge: %w", err)
	}

	if rec.dbLongRunningQueriesGauge, err = mtr.Int64Gauge(
		"hatchet.db.long_running_queries.count",
		metric.WithDescription("Number of long-running queries detected in the database"),
	); err != nil {
		return nil, fmt.Errorf("failed to create long running queries gauge: %w", err)
	}

	if rec.dbQueryCacheHitRatioGauge, err = mtr.Float64Gauge(
		"hatchet.db.query_cache.hit_ratio",
		metric.WithDescription("Query cache hit ratio percentage for tables"),
	); err != nil {
		return nil, fmt.Errorf("failed to create query cache hit ratio gauge: %w", err)
	}

	if rec.dbLongRunningVacuumGauge, err = mtr.Int64Gauge(
		"hatchet.db.long_running_vacuum.count",
		metric.WithDescription("Number of long-running vacuum operations detected in the database"),
	); err != nil {
		return nil, fmt.Errorf("failed to create long running vacuum gauge: %w", err)
	}

	if rec.dbLastAutovacuumSecondsSinceGauge, err = mtr.Float64Gauge(
		"hatchet.db.last_autovacuum.seconds_since",
		metric.WithDescription("Seconds since last autovacuum for partitioned tables"),
	); err != nil {
		return nil, fmt.Errorf("failed to create last autovacuum gauge: %w", err)
	}

	// OLAP metrics (instance-wide)
	if rec.olapTempTableSizeDAGGauge, err = mtr.Int64Gauge(
		"hatchet.olap.temp_table_size.dag_status_updates",
		metric.WithDescription("Size of temporary table for DAG status updates (instance-wide)"),
	); err != nil {
		return nil, fmt.Errorf("failed to create OLAP DAG temp table size gauge: %w", err)
	}

	if rec.olapTempTableSizeTaskGauge, err = mtr.Int64Gauge(
		"hatchet.olap.temp_table_size.task_status_updates",
		metric.WithDescription("Size of temporary table for task status updates (instance-wide)"),
	); err != nil {
		return nil, fmt.Errorf("failed to create OLAP task temp table size gauge: %w", err)
	}

	if rec.yesterdayRunCountGauge, err = mtr.Int64Gauge(
		"hatchet.olap.yesterday_run_count",
		metric.WithDescription("Number of workflow runs from yesterday by status (instance-wide)"),
	); err != nil {
		return nil, fmt.Errorf("failed to create yesterday run count gauge: %w", err)
	}

	// Worker metrics
	if rec.activeSlotsGauge, err = mtr.Int64Gauge(
		"hatchet.workers.active_slots",
		metric.WithDescription("Number of active worker slots per tenant"),
	); err != nil {
		return nil, fmt.Errorf("failed to create active slots gauge: %w", err)
	}

	if rec.activeWorkersGauge, err = mtr.Int64Gauge(
		"hatchet.workers.active_count",
		metric.WithDescription("Number of active workers per tenant"),
	); err != nil {
		return nil, fmt.Errorf("failed to create active workers gauge: %w", err)
	}

	if rec.activeSDKsGauge, err = mtr.Int64Gauge(
		"hatchet.workers.active_sdks",
		metric.WithDescription("Number of active SDKs per tenant and SDK version"),
	); err != nil {
		return nil, fmt.Errorf("failed to create active SDKs gauge: %w", err)
	}

	return rec, nil
}
// RecordDBBloat reports how many bloated tables were detected, tagging the
// observation with a "health_status" attribute.
func (m *MetricsRecorder) RecordDBBloat(ctx context.Context, count int64, healthStatus string) {
	statusAttr := attribute.String("health_status", healthStatus)
	m.dbBloatGauge.Record(ctx, count, metric.WithAttributes(statusAttr))
}
// RecordDBBloatPercent reports the dead tuple percentage of one table,
// identified by the "table_name" attribute.
func (m *MetricsRecorder) RecordDBBloatPercent(ctx context.Context, tableName string, deadPercent float64) {
	tableAttr := attribute.String("table_name", tableName)
	m.dbBloatPercentGauge.Record(ctx, deadPercent, metric.WithAttributes(tableAttr))
}
// RecordDBLongRunningQueries reports the current number of long-running
// queries. The observation carries no attributes.
func (m *MetricsRecorder) RecordDBLongRunningQueries(ctx context.Context, count int64) {
	gauge := m.dbLongRunningQueriesGauge
	gauge.Record(ctx, count)
}
// RecordDBQueryCacheHitRatio reports the query cache hit ratio of one table,
// identified by the "table_name" attribute.
func (m *MetricsRecorder) RecordDBQueryCacheHitRatio(ctx context.Context, tableName string, hitRatio float64) {
	tableAttr := attribute.String("table_name", tableName)
	m.dbQueryCacheHitRatioGauge.Record(ctx, hitRatio, metric.WithAttributes(tableAttr))
}
// RecordDBLongRunningVacuum reports how many long-running vacuum operations
// were detected, tagging the observation with a "health_status" attribute.
func (m *MetricsRecorder) RecordDBLongRunningVacuum(ctx context.Context, count int64, healthStatus string) {
	statusAttr := attribute.String("health_status", healthStatus)
	m.dbLongRunningVacuumGauge.Record(ctx, count, metric.WithAttributes(statusAttr))
}
// RecordDBLastAutovacuumSecondsSince reports the seconds elapsed since the
// last autovacuum of one table, identified by the "table_name" attribute.
func (m *MetricsRecorder) RecordDBLastAutovacuumSecondsSince(ctx context.Context, tableName string, seconds float64) {
	tableAttr := attribute.String("table_name", tableName)
	m.dbLastAutovacuumSecondsSinceGauge.Record(ctx, seconds, metric.WithAttributes(tableAttr))
}
// RecordOLAPTempTableSizeDAG reports the instance-wide size of the OLAP DAG
// status updates temp table. The observation carries no attributes.
func (m *MetricsRecorder) RecordOLAPTempTableSizeDAG(ctx context.Context, size int64) {
	gauge := m.olapTempTableSizeDAGGauge
	gauge.Record(ctx, size)
}
// RecordOLAPTempTableSizeTask reports the instance-wide size of the OLAP task
// status updates temp table. The observation carries no attributes.
func (m *MetricsRecorder) RecordOLAPTempTableSizeTask(ctx context.Context, size int64) {
	gauge := m.olapTempTableSizeTaskGauge
	gauge.Record(ctx, size)
}
// RecordYesterdayRunCount reports the instance-wide number of workflow runs
// from yesterday for one run status, carried in the "status" attribute.
func (m *MetricsRecorder) RecordYesterdayRunCount(ctx context.Context, status string, count int64) {
	statusAttr := attribute.String("status", status)
	m.yesterdayRunCountGauge.Record(ctx, count, metric.WithAttributes(statusAttr))
}
// RecordActiveSlots reports the number of active worker slots for one tenant,
// identified by the "tenant_id" attribute.
func (m *MetricsRecorder) RecordActiveSlots(ctx context.Context, tenantId string, count int64) {
	tenantAttr := attribute.String("tenant_id", tenantId)
	m.activeSlotsGauge.Record(ctx, count, metric.WithAttributes(tenantAttr))
}
// RecordActiveWorkers reports the number of active workers for one tenant,
// identified by the "tenant_id" attribute.
func (m *MetricsRecorder) RecordActiveWorkers(ctx context.Context, tenantId string, count int64) {
	tenantAttr := attribute.String("tenant_id", tenantId)
	m.activeWorkersGauge.Record(ctx, count, metric.WithAttributes(tenantAttr))
}
// RecordActiveSDKs reports the number of active SDKs for one tenant and one
// SDK flavour. The tenant goes into "tenant_id"; the fields of sdk go into
// the "sdk_language", "sdk_version", "sdk_os" and "sdk_language_version"
// attributes.
func (m *MetricsRecorder) RecordActiveSDKs(ctx context.Context, tenantId string, sdk SDKInfo, count int64) {
	sdkAttrs := []attribute.KeyValue{
		attribute.String("tenant_id", tenantId),
		attribute.String("sdk_language", sdk.Language),
		attribute.String("sdk_version", sdk.SdkVersion),
		attribute.String("sdk_os", sdk.OperatingSystem),
		attribute.String("sdk_language_version", sdk.LanguageVersion),
	}

	m.activeSDKsGauge.Record(ctx, count, metric.WithAttributes(sdkAttrs...))
}
// SDKInfo contains information about an SDK.
//
// RecordActiveSDKs attaches each field to the active SDKs gauge as a string
// attribute; the attribute key is noted on each field.
type SDKInfo struct {
// OperatingSystem is reported as the "sdk_os" attribute.
OperatingSystem string
// Language is reported as the "sdk_language" attribute.
Language string
// LanguageVersion is reported as the "sdk_language_version" attribute.
LanguageVersion string
// SdkVersion is reported as the "sdk_version" attribute.
SdkVersion string
}
+85
View File
@@ -11,9 +11,11 @@ import (
"github.com/google/uuid"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc"
"go.opentelemetry.io/otel/exporters/otlp/otlptrace"
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
"go.opentelemetry.io/otel/propagation"
"go.opentelemetry.io/otel/sdk/metric"
"go.opentelemetry.io/otel/sdk/resource"
sdktrace "go.opentelemetry.io/otel/sdk/trace"
"go.opentelemetry.io/otel/trace"
@@ -106,6 +108,89 @@ func InitTracer(opts *TracerOpts) (func(context.Context) error, error) {
return exporter.Shutdown, nil
}
// InitMeter installs a global OTel meter provider that exports metrics over
// OTLP/gRPC to opts.CollectorURL every 3 seconds.
//
// When opts.CollectorURL is empty nothing is installed and the returned
// shutdown function is a no-op. Otherwise the exporter uses TLS unless
// opts.Insecure is set, sends opts.CollectorAuth in the "Authorization"
// header, and the resource is tagged with the service name plus the
// Kubernetes pod name and namespace when K8S_POD_NAME / K8S_POD_NAMESPACE
// are set in the environment.
//
// The returned function shuts the meter provider down and should be called
// once during process shutdown. An error is returned if the exporter or the
// resource cannot be created.
func InitMeter(opts *TracerOpts) (func(context.Context) error, error) {
	if opts.CollectorURL == "" {
		// no-op
		return func(context.Context) error {
			return nil
		}, nil
	}

	var secureOption otlpmetricgrpc.Option

	if opts.Insecure {
		secureOption = otlpmetricgrpc.WithInsecure()
	} else {
		secureOption = otlpmetricgrpc.WithTLSCredentials(credentials.NewClientTLSFromCert(nil, ""))
	}

	exporter, err := otlpmetricgrpc.New(
		context.Background(),
		secureOption,
		otlpmetricgrpc.WithEndpoint(opts.CollectorURL),
		otlpmetricgrpc.WithHeaders(map[string]string{
			"Authorization": opts.CollectorAuth,
		}),
	)

	if err != nil {
		return nil, fmt.Errorf("failed to create exporter: %w", err)
	}

	resourceAttrs := []attribute.KeyValue{
		attribute.String("service.name", opts.ServiceName),
		attribute.String("library.language", "go"),
	}

	// Add Kubernetes pod information if available
	if podName := os.Getenv("K8S_POD_NAME"); podName != "" {
		resourceAttrs = append(resourceAttrs, attribute.String("k8s.pod.name", podName))
	}

	if podNamespace := os.Getenv("K8S_POD_NAMESPACE"); podNamespace != "" {
		resourceAttrs = append(resourceAttrs, attribute.String("k8s.namespace.name", podNamespace))
	}

	resources, err := resource.New(
		context.Background(),
		resource.WithAttributes(resourceAttrs...),
	)

	if err != nil {
		// The exporter is not yet owned by a meter provider, so release it here
		// instead of leaking it. The shutdown error is deliberately dropped:
		// the resource error is the one the caller needs to see.
		_ = exporter.Shutdown(context.Background())

		return nil, fmt.Errorf("failed to set resources: %w", err)
	}

	meterProvider := metric.NewMeterProvider(
		metric.WithReader(
			metric.NewPeriodicReader(
				exporter,
				metric.WithInterval(3*time.Second),
			),
		),
		metric.WithResource(resources),
	)

	otel.SetMeterProvider(
		meterProvider,
	)

	return func(ctx context.Context) error {
		// Shutting down the provider shuts down its periodic reader, which in
		// turn flushes and shuts down the exporter. The exporter must not be
		// shut down again here: a second Shutdown call on the OTLP exporter
		// reports an error, which would make every clean shutdown look failed.
		if err := meterProvider.Shutdown(ctx); err != nil {
			return fmt.Errorf("failed to shutdown meter provider: %w", err)
		}

		return nil
	}, nil
}
func NewSpan(ctx context.Context, name string) (context.Context, trace.Span) {
ctx, span := otel.Tracer("").Start(ctx, prefixSpanKey(name))
return ctx, span