fix: RED metrics

2026-04-26 05:58:27 -05:00 · 2023-12-15 23:28:16 +01:00
parent 8fb5864ca2
commit 06c18f3f65
5 changed files with 89 additions and 17 deletions
@@ -0,0 +1,11 @@
+Bugfix: Fix RED metrics on the metrics endpoint
+
+We connected some metrics to the metrics endpoint to support the RED method for monitoring microservices.
+
+- Request Rate: The number of requests per second. The total count of requests is available under `ocis_proxy_requests_total`.
+- Error Rate: The number of failed requests per second. The total count of failed requests is available under `ocis_proxy_errors_total`.
+- Duration: The amount of time each request takes. The duration of all requests is available under `ocis_proxy_duration_seconds`. This is a histogram metric, so it also provides information about the distribution of request durations.
+
+The metrics are available at `PROXY_DEBUG_ADDR/metrics` in a Prometheus-compatible format and may be secured by `PROXY_DEBUG_TOKEN`.
+
+https://github.com/owncloud/ocis/pull/7994
@@ -116,3 +116,35 @@ When using the ocis IDP service instead of an external IDP:
 -   Use the environment variable `OCIS_URL` to define how ocis can be accessed, mandatory use `https` as protocol for the URL.
 -   If no reverse proxy is set up, the `PROXY_TLS` environment variable **must** be set to `true` because the embedded `libreConnect` shipped with the IDP service has a hard check if the connection is on TLS and uses the HTTPS protocol. If this mismatches, an error will be logged and no connection from the client can be established.
 -   `PROXY_TLS` **can** be set to `false` if a reverse proxy is used and the https connection is terminated at the reverse proxy. When setting to `false`, the communication between the reverse proxy and ocis is not secured. If set to `true`, you must provide certificates.
+
+## Metrics
+
+The proxy service in ocis can expose metrics in the prometheus format. The metrics are exposed on the `/metrics` endpoint. There are two ways to run the ocis proxy service, and the chosen mode affects the number of metrics exposed.
+
+### 1) Single Process Mode
+In the single process mode, all ocis services are running inside a single process. This is the default mode when using the `ocis server` command to start the services. In this mode, the proxy service exposes metrics about the proxy service itself and about the ocis services it is proxying. This is due to the nature of the prometheus registry which is a singleton. The metrics exposed by the proxy service itself are prefixed with `ocis_proxy_` and the metrics exposed by other ocis services are prefixed with `ocis_<service-name>_`.
+
+### 2) Standalone Mode
+In this mode, the proxy service only exposes its own metrics. The metrics of the other ocis services are exposed on their own metrics endpoints.
+
+### Available Metrics
+The following metrics are exposed by the proxy service:
+
+| Metric Name                      | Description                                                                                                                                                                                                                   | Labels                                |
+|----------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------|
+| `ocis_proxy_requests_total`      | [Counter](https://prometheus.io/docs/tutorials/understanding_metric_types/#counter) metric which reports the total number of HTTP requests.                                                                                   | `method`: HTTP method of the request  |
+| `ocis_proxy_errors_total`        | [Counter](https://prometheus.io/docs/tutorials/understanding_metric_types/#counter) metric which reports the total number of HTTP requests which have failed. That counts all response codes >= 500                           | `method`: HTTP method of the request  |
+| `ocis_proxy_duration_seconds`    | [Histogram](https://prometheus.io/docs/tutorials/understanding_metric_types/#histogram) of the time (in seconds) each request took. A histogram metric uses buckets to count the number of events that fall into each bucket. | `method`: HTTP method of the request  |
+| `ocis_proxy_build_info{versions}` | A metric with a constant `1` value labeled by `versions`, exposing the version of the ocis proxy service.                                                                                                                   | `versions`: Build version of the proxy |
+
+### Prometheus Configuration
+The following is an example prometheus configuration for the single process mode. It assumes that the debug endpoint of the proxy service is configured to bind on all interfaces (`PROXY_DEBUG_ADDR=0.0.0.0:9205`) and that the proxy is reachable via the `ocis` service name (typically in docker-compose). Prometheus uses `/metrics` as its default scrape path, so no `metrics_path` needs to be configured; the endpoint is scraped every 15 seconds.
+
+```yaml
+global:
+  scrape_interval: 15s
+scrape_configs:
+  - job_name: ocis_proxy
+    static_configs:
+    - targets: ["ocis:9205"]
+```
@@ -129,7 +129,7 @@ func Server(cfg *config.Config) *cli.Command {
 			}

 			{
-				middlewares := loadMiddlewares(ctx, logger, cfg, userInfoCache, traceProvider)
+				middlewares := loadMiddlewares(ctx, logger, cfg, userInfoCache, traceProvider, *m)
 				server, err := proxyHTTP.Server(
 					proxyHTTP.Handler(lh.handler()),
 					proxyHTTP.Logger(logger),
@@ -269,7 +269,7 @@ func (h *StaticRouteHandler) backchannelLogout(w http.ResponseWriter, r *http.Re
 	render.JSON(w, r, nil)
 }

-func loadMiddlewares(ctx context.Context, logger log.Logger, cfg *config.Config, userInfoCache microstore.Store, traceProvider trace.TracerProvider) alice.Chain {
+func loadMiddlewares(ctx context.Context, logger log.Logger, cfg *config.Config, userInfoCache microstore.Store, traceProvider trace.TracerProvider, metrics metrics.Metrics) alice.Chain {
 	rolesClient := settingssvc.NewRoleService("com.owncloud.api.settings", cfg.GrpcClient)
 	policiesProviderClient := policiessvc.NewPoliciesProviderService("com.owncloud.api.policies", cfg.GrpcClient)
 	gatewaySelector, err := pool.GatewaySelector(cfg.Reva.Address, append(cfg.Reva.GetRevaOptions(), pool.WithRegistry(registry.GetRegistry()))...)
@@ -381,6 +381,7 @@ func loadMiddlewares(ctx context.Context, logger log.Logger, cfg *config.Config,
 		),
 		middleware.Tracer(traceProvider),
 		pkgmiddleware.TraceContext,
+		middleware.Instrumenter(metrics),
 		chimiddleware.RealIP,
 		chimiddleware.RequestID,
 		middleware.AccessLog(logger),
@@ -14,8 +14,8 @@ var (

 // Metrics defines the available metrics of this service.
 type Metrics struct {
-	Counter   *prometheus.CounterVec
-	Latency   *prometheus.SummaryVec
+	Requests  *prometheus.CounterVec
+	Errors    *prometheus.CounterVec
 	Duration  *prometheus.HistogramVec
 	BuildInfo *prometheus.GaugeVec
 }
@@ -23,24 +23,24 @@ type Metrics struct {
 // New initializes the available metrics.
 func New() *Metrics {
 	m := &Metrics{
-		Counter: prometheus.NewCounterVec(prometheus.CounterOpts{
+		Requests: prometheus.NewCounterVec(prometheus.CounterOpts{
 			Namespace: Namespace,
 			Subsystem: Subsystem,
-			Name:      "proxy_total",
-			Help:      "How many proxy requests processed",
-		}, []string{}),
-		Latency: prometheus.NewSummaryVec(prometheus.SummaryOpts{
+			Name:      "requests_total",
+			Help:      "How many requests processed in total",
+		}, []string{"method"}),
+		Errors: prometheus.NewCounterVec(prometheus.CounterOpts{
 			Namespace: Namespace,
 			Subsystem: Subsystem,
-			Name:      "proxy_latency_microseconds",
-			Help:      "proxy request latencies in microseconds",
-		}, []string{}),
+			Name:      "errors_total",
+			Help:      "How many requests run into errors",
+		}, []string{"method"}),
 		Duration: prometheus.NewHistogramVec(prometheus.HistogramOpts{
 			Namespace: Namespace,
 			Subsystem: Subsystem,
-			Name:      "proxy_duration_seconds",
-			Help:      "proxy method request time in seconds",
-		}, []string{}),
+			Name:      "duration_seconds",
+			Help:      "request duration in seconds",
+		}, []string{"method"}),
 		BuildInfo: prometheus.NewGaugeVec(prometheus.GaugeOpts{
 			Namespace: Namespace,
 			Subsystem: Subsystem,
@@ -49,8 +49,8 @@ func New() *Metrics {
 		}, []string{"versions"}),
 	}

-	_ = prometheus.Register(m.Counter)
-	_ = prometheus.Register(m.Latency)
+	_ = prometheus.Register(m.Requests)
+	_ = prometheus.Register(m.Errors)
 	_ = prometheus.Register(m.Duration)
 	_ = prometheus.Register(m.BuildInfo)
 	return m
@@ -0,0 +1,28 @@
+package middleware
+
+import (
+	"net/http"
+	"time"
+
+	"github.com/go-chi/chi/v5/middleware"
+	"github.com/owncloud/ocis/v2/services/proxy/pkg/metrics"
+	"github.com/prometheus/client_golang/prometheus"
+)
+
+// Instrumenter returns a middleware that, labeled by HTTP method, increments m.Requests before calling the next handler, then observes the elapsed time in seconds in m.Duration and increments m.Errors when the response status is >= 500.
+func Instrumenter(m metrics.Metrics) func(next http.Handler) http.Handler {
+	return func(next http.Handler) http.Handler {
+		return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			start := time.Now()
+			ww := middleware.NewWrapResponseWriter(w, r.ProtoMajor)
+			m.Requests.With(prometheus.Labels{"method": r.Method}).Inc()
+
+			next.ServeHTTP(ww, r)
+
+			m.Duration.With(prometheus.Labels{"method": r.Method}).Observe(float64(time.Since(start).Seconds()))
+			if ww.Status() >= 500 {
+				m.Errors.With(prometheus.Labels{"method": r.Method}).Inc()
+			}
+		})
+	}
+}