From af5a3d3b7a667e0c5418e6ef48721cc87564d41d Mon Sep 17 00:00:00 2001 From: Sebastian Jeltsch Date: Fri, 3 Oct 2025 16:15:19 +0200 Subject: [PATCH] PoC: Minimal open-telemetry tracing and metrics integration behind "otel" feature flag. --- Cargo.lock | 274 ++++++++++++++++++++++++-- crates/core/Cargo.toml | 4 + crates/core/src/server/mod.rs | 98 +++++---- crates/core/tests/integration_test.rs | 71 +++++-- crates/core/tests/tls_test.rs | 3 +- 5 files changed, 380 insertions(+), 70 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3afd1558..fa06e57e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -694,6 +694,25 @@ dependencies = [ "url", ] +[[package]] +name = "axum-tracing-opentelemetry" +version = "0.32.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8ca2639ea8e778162fa734ddd5d77cb2fdf29ec39adb80192693834b6639ea6" +dependencies = [ + "axum 0.8.7", + "futures-core", + "futures-util", + "http", + "opentelemetry 0.31.0", + "opentelemetry-semantic-conventions 0.31.0", + "pin-project-lite", + "tower 0.5.2", + "tracing", + "tracing-opentelemetry", + "tracing-opentelemetry-instrumentation-sdk", +] + [[package]] name = "az" version = "1.2.1" @@ -1224,9 +1243,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.51" +version = "4.5.52" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c26d721170e0295f191a69bd9a1f93efcdb0aff38684b61ab5750468972e5f5" +checksum = "aa8120877db0e5c011242f96806ce3c94e0737ab8108532a76a3300a01db2ab8" dependencies = [ "clap_builder", "clap_derive", @@ -1234,9 +1253,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.51" +version = "4.5.52" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75835f0c7bf681bfd05abe44e965760fea999a5286c6eb2d59883634fd02011a" +checksum = "02576b399397b659c26064fbc92a75fede9d18ffd5f80ca1cd74ddab167016e1" dependencies = [ "anstream", "anstyle", @@ -2239,11 +2258,11 @@ dependencies = [ "hyper-util", "log", "once_cell", - "opentelemetry", - "opentelemetry-http", - "opentelemetry-otlp", - "opentelemetry-semantic-conventions", - "opentelemetry_sdk", + "opentelemetry 0.27.1", + "opentelemetry-http 0.27.0", + "opentelemetry-otlp 0.27.0", + "opentelemetry-semantic-conventions 0.27.0", + "opentelemetry_sdk 0.27.1", "pin-project", "serde", "thiserror 2.0.17", @@ -3747,6 +3766,23 @@ dependencies = [ "cfb", ] +[[package]] +name = "init-tracing-opentelemetry" +version = "0.32.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3405b8beafca487bde01d88e05e6646c4e5c6a3c9830e7b937290ea8f7f5e5d9" +dependencies = [ + "opentelemetry 0.31.0", + "opentelemetry-otlp 0.31.0", + "opentelemetry-semantic-conventions 0.31.0", + "opentelemetry-stdout", + "opentelemetry_sdk 0.31.0", + "thiserror 2.0.17", + "tracing", + "tracing-opentelemetry", + "tracing-subscriber", +] + [[package]] name = "inout" version = "0.1.4" @@ -4193,6 +4229,15 @@ dependencies = [ "libc", ] +[[package]] +name = "matchers" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1525a2a28c7f4fa0fc98bb91ae755d1e2d1505079e05539e35bc876b5d65ae9" +dependencies = [ + "regex-automata", +] + [[package]] name = "matchit" version = "0.7.3" @@ -4478,6 +4523,15 @@ dependencies = [ "winapi", ] +[[package]] +name = "nu-ansi-term" +version = "0.50.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" +dependencies = [ + "windows-sys 0.61.2", +] + [[package]] name = "num" version = "0.4.3" @@ -4761,6 +4815,19 @@ dependencies = [ "tracing", ] +[[package]] +name = "opentelemetry" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b84bcd6ae87133e903af7ef497404dda70c60d0ea14895fc8a5e6722754fc2a0" +dependencies = [ + "futures-core", + "futures-sink", + "js-sys", + "pin-project-lite", + "thiserror 2.0.17", +] + [[package]] name = "opentelemetry-http" version = "0.27.0" @@ -4770,7 +4837,20 @@ dependencies = [ "async-trait", "bytes", "http", - "opentelemetry", + "opentelemetry 0.27.1", +] + +[[package]] +name = "opentelemetry-http" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7a6d09a73194e6b66df7c8f1b680f156d916a1a942abf2de06823dd02b7855d" +dependencies = [ + "async-trait", + "bytes", + "http", + "opentelemetry 0.31.0", + "reqwest", ] [[package]] @@ -4782,18 +4862,36 @@ dependencies = [ "async-trait", "futures-core", "http", - "opentelemetry", - "opentelemetry-http", - "opentelemetry-proto", - "opentelemetry_sdk", + "opentelemetry 0.27.1", + "opentelemetry-http 0.27.0", + "opentelemetry-proto 0.27.0", + "opentelemetry_sdk 0.27.1", "prost 0.13.5", "serde_json", "thiserror 1.0.69", "tokio", - "tonic", + "tonic 0.12.3", "tracing", ] +[[package]] +name = "opentelemetry-otlp" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a2366db2dca4d2ad033cad11e6ee42844fd727007af5ad04a1730f4cb8163bf" +dependencies = [ + "http", + "opentelemetry 0.31.0", + "opentelemetry-http 0.31.0", + "opentelemetry-proto 0.31.0", + "opentelemetry_sdk 0.31.0", + "prost 0.14.1", + "reqwest", + "thiserror 2.0.17", + "tokio", + "tonic 0.14.2", +] + [[package]] name = "opentelemetry-proto" version = "0.27.0" @@ -4801,11 +4899,24 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a6e05acbfada5ec79023c85368af14abd0b307c015e9064d249b2a950ef459a6" dependencies = [ "hex", - "opentelemetry", - "opentelemetry_sdk", + "opentelemetry 0.27.1", + "opentelemetry_sdk 0.27.1", "prost 0.13.5", "serde", - "tonic", + "tonic 0.12.3", +] + +[[package]] +name = "opentelemetry-proto" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7175df06de5eaee9909d4805a3d07e28bb752c34cab57fa9cff549da596b30f" +dependencies = [ + "opentelemetry 0.31.0", + "opentelemetry_sdk 0.31.0", + "prost 0.14.1", + "tonic 0.14.2", + "tonic-prost", ] [[package]] @@ -4814,6 +4925,23 @@ version = "0.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bc1b6902ff63b32ef6c489e8048c5e253e2e4a803ea3ea7e783914536eb15c52" +[[package]] +name = "opentelemetry-semantic-conventions" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e62e29dfe041afb8ed2a6c9737ab57db4907285d999ef8ad3a59092a36bdc846" + +[[package]] +name = "opentelemetry-stdout" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc8887887e169414f637b18751487cce4e095be787d23fad13c454e2fb1b3811" +dependencies = [ + "chrono", + "opentelemetry 0.31.0", + "opentelemetry_sdk 0.31.0", +] + [[package]] name = "opentelemetry_sdk" version = "0.27.1" @@ -4825,7 +4953,7 @@ dependencies = [ "futures-executor", "futures-util", "glob", - "opentelemetry", + "opentelemetry 0.27.1", "percent-encoding", "rand 0.8.5", "serde_json", @@ -4835,6 +4963,23 @@ dependencies = [ "tracing", ] +[[package]] +name = "opentelemetry_sdk" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e14ae4f5991976fd48df6d843de219ca6d31b01daaab2dad5af2badeded372bd" +dependencies = [ + "futures-channel", + "futures-executor", + "futures-util", + "opentelemetry 0.31.0", + "percent-encoding", + "rand 0.9.2", + "thiserror 2.0.17", + "tokio", + "tokio-stream", +] + [[package]] name = "ordered-float" version = "2.10.1" @@ -7699,6 +7844,43 @@ dependencies = [ "tracing", ] +[[package]] +name = "tonic" +version = "0.14.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb7613188ce9f7df5bfe185db26c5814347d110db17920415cf2fbcad85e7203" +dependencies = [ + "async-trait", + "base64", + "bytes", + "http", + "http-body", + "http-body-util", + "hyper", + "hyper-timeout", + "hyper-util", + "percent-encoding", + "pin-project", + "sync_wrapper", + "tokio", + "tokio-stream", + "tower 0.5.2", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tonic-prost" +version = "0.14.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "66bd50ad6ce1252d87ef024b3d64fe4c3cf54a86fb9ef4c631fdd0ded7aeaa67" +dependencies = [ + "bytes", + "prost 0.14.1", + "tonic 0.14.2", +] + [[package]] name = "tower" version = "0.4.13" @@ -7727,9 +7909,12 @@ checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9" dependencies = [ "futures-core", "futures-util", + "indexmap 2.12.0", "pin-project-lite", + "slab", "sync_wrapper", "tokio", + "tokio-util", "tower-layer", "tower-service", "tracing", @@ -7823,6 +8008,49 @@ dependencies = [ "valuable", ] +[[package]] +name = "tracing-log" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3" +dependencies = [ + "log", + "once_cell", + "tracing-core", +] + +[[package]] +name = "tracing-opentelemetry" +version = "0.32.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e6e5658463dd88089aba75c7791e1d3120633b1bfde22478b28f625a9bb1b8e" +dependencies = [ + "js-sys", + "opentelemetry 0.31.0", + "opentelemetry_sdk 0.31.0", + "rustversion", + "smallvec", + "thiserror 2.0.17", + "tracing", + "tracing-core", + "tracing-log", + "tracing-subscriber", + "web-time", +] + +[[package]] +name = "tracing-opentelemetry-instrumentation-sdk" +version = "0.32.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4aea2e5bee9df9d098d951baeccb84e614f12cbc74d01d99fb3ac99a55bf7bf7" +dependencies = [ + "http", + "opentelemetry 0.31.0", + "opentelemetry-semantic-conventions 0.31.0", + "tracing", + "tracing-opentelemetry", +] + [[package]] name = "tracing-serde" version = "0.2.0" @@ -7839,11 +8067,16 @@ version = "0.3.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2054a14f5307d601f88daf0553e1cbf472acc4f2c51afab632431cdcd72124d5" dependencies = [ + "matchers", + "nu-ansi-term", + "once_cell", + "regex-automata", "serde", "serde_json", "sharded-slab", "smallvec", "thread_local", + "tracing", "tracing-core", "tracing-serde", ] @@ -7860,6 +8093,7 @@ dependencies = [ "async-trait", "axum 0.8.7", "axum-test", + "axum-tracing-opentelemetry", "base64", "bytes", "chrono", @@ -7876,6 +8110,7 @@ dependencies = [ "hyper-util", "indexmap 2.12.0", "indoc", + "init-tracing-opentelemetry", "itertools 0.14.0", "jsonschema", "jsonwebtoken", @@ -7916,6 +8151,7 @@ dependencies = [ "tower-http", "tower-service", "tracing", + "tracing-opentelemetry-instrumentation-sdk", "tracing-subscriber", "trailbase-assets", "trailbase-build", diff --git a/crates/core/Cargo.toml b/crates/core/Cargo.toml index 7d6cae3c..95e33812 100644 --- a/crates/core/Cargo.toml +++ b/crates/core/Cargo.toml @@ -27,6 +27,7 @@ harness = false default = ["wasm"] v8 = ["dep:trailbase-js"] wasm = ["dep:trailbase-wasm-runtime-host"] +otel = ["dep:axum-tracing-opentelemetry", "dep:init-tracing-opentelemetry"] [dependencies] aes-gcm-siv = "0.11.1" @@ -36,6 +37,7 @@ async-channel = "2.3.1" async-trait = "0.1.80" axum = { workspace = true } base64 = { workspace = true } +axum-tracing-opentelemetry = { version = "0.32.0", optional = true } bytes = { version = "1.8.0", features = ["serde"] } chrono = "^0.4.38" client-ip = "0.1.1" @@ -49,6 +51,7 @@ hyper = "1.6.0" hyper-util = "0.1.7" indexmap = "2.11.4" indoc = "2.0.5" +init-tracing-opentelemetry = { version = "0.32.0", features = ["tracing_subscriber_ext", "metrics"], optional = true } itertools = "0.14.0" jsonschema = { version = "0.35.0", default-features = false } jsonwebtoken = { version = "^10.2.0", default-features = false, features = ["use_pem", "rust_crypto"] } @@ -85,6 +88,7 @@ tower-cookies = "0.11.0" tower-http = { version = "^0.6.0", default-features = false, features = ["cors", "trace", "fs", "limit"] } tower-service = { version = "0.3.3", default-features = false } tracing = { workspace = true } +tracing-opentelemetry-instrumentation-sdk = "0.32.0" tracing-subscriber = { workspace = true } trailbase-assets = { workspace = true } trailbase-build = { workspace = true } diff --git a/crates/core/src/server/mod.rs b/crates/core/src/server/mod.rs index dd734f51..abaf962a 100644 --- a/crates/core/src/server/mod.rs +++ b/crates/core/src/server/mod.rs @@ -40,7 +40,7 @@ pub use init::{InitArgs, InitError, init_app_state}; /// A set of options to configure serving behaviors. Changing any of these options /// requires a server restart, which makes them a natural fit for being exposed as command line /// arguments. -#[derive(Debug, Default)] +#[derive(Clone, Debug, Default)] pub struct ServerOptions { /// Optional path to static assets that will be served at the HTTP root. pub data_dir: DataDir, @@ -84,7 +84,7 @@ pub struct ServerOptions { /// TLS certificate path. pub tls_cert: Option>, /// TLS key path. - pub tls_key: Option>, + pub tls_key: Option>>, } pub struct Server { @@ -141,37 +141,7 @@ impl Server { }) .await?; - // Initialize tracing subscribers/layers. - // - // A few notes in case initialization below panics. The `log` and `tracing` crates/systems are - // mostly independent. Both like to be initialized only once given their global nature. There - // is a `.try_init()`, which has not effect when already initialized. - // - // Here we specifically only initialize `tracing`, since we critically rely on the - // `SqliteLogLayer`. We leave `log` initialization to the program level. - // - // The current setup prevents users from initializing tracing themselves. This is only relevant - // for the frameworks-use-case. If we wanted to allow it, we could check that if already - // initialized, the "logging::SqliteLogLayer" is present. - // - // If the `tracing_subscriber` crate is built with the default feature `tracing-log`, - // initializing `tracing` will also initialize the `log` crate. So this approach will only - // work if built w/o `tracing-log`. Otherwise, initializing `log` before will lead to a panic - // here. We do *not* want to use a `.try_init()` here, otherwise may silently miss - // `SqliteLogLayer`. - // - // Response log events are emitted at the INFO level, see `logging.rs` - let filter_layer = filter::Targets::new() - .with_default(filter::LevelFilter::OFF) - .with_target(crate::logging::EVENT_TARGET, crate::logging::LEVEL); - - tracing_subscriber::Registry::default() - .with(filter_layer) - .with(logging::SqliteLogLayer::new( - &state, - /* log-to-stdout= */ opts.log_responses, - )) - .init(); + Self::build_tracing(&state, opts.log_responses).init(); if new_data_dir { on_first_init(state.clone()) @@ -209,6 +179,61 @@ impl Server { }) } + fn build_tracing( + state: &AppState, + log_responses: bool, + ) -> impl tracing_subscriber::layer::SubscriberExt { + // Initialize tracing subscribers/layers. + // + // A few notes in case initialization below panics. The `log` and `tracing` crates/systems are + // mostly independent. Both like to be initialized only once given their global nature. There + // is a `.try_init()`, which has not effect when already initialized. + // + // Here we specifically only initialize `tracing`, since we critically rely on the + // `SqliteLogLayer`. We leave `log` initialization to the program level. + // + // The current setup prevents users from initializing tracing themselves. This is only relevant + // for the frameworks-use-case. If we wanted to allow it, we could check that if already + // initialized, the "logging::SqliteLogLayer" is present. + // + // If the `tracing_subscriber` crate is built with the default feature `tracing-log`, + // initializing `tracing` will also initialize the `log` crate. So this approach will only + // work if built w/o `tracing-log`. Otherwise, initializing `log` before will lead to a panic + // here. We do *not* want to use a `.try_init()` here, otherwise may silently miss + // `SqliteLogLayer`. + // + // Response log events are emitted at the INFO level, see `logging.rs` + #[cfg(not(feature = "otel"))] + let subscriber = tracing_subscriber::Registry::default(); + + #[cfg(feature = "otel")] + let subscriber = { + let (subscriber, otel_guard) = + init_tracing_opentelemetry::tracing_subscriber_ext::regiter_otel_layers( + tracing_subscriber::Registry::default(), + ) + .expect("startup"); + + // TODO: We have to keep this alive. Let's find something better than a singleton. + use std::sync::OnceLock; + static SINGLETON: OnceLock = OnceLock::new(); + SINGLETON.get_or_init(move || init_tracing_opentelemetry::Guard::global(Some(otel_guard))); + + subscriber + }; + + let filter_layer = filter::Targets::new() + .with_default(filter::LevelFilter::OFF) + .with_target(crate::logging::EVENT_TARGET, crate::logging::LEVEL); + + return subscriber + .with(filter_layer) + .with(logging::SqliteLogLayer::new( + state, + /* log-to-stdout= */ log_responses, + )); + } + pub async fn serve(self) -> Result<(), Box> { // Install a HUP/hangup signal handler to reload config. #[cfg(unix)] @@ -409,11 +434,16 @@ impl Server { ); } - fn wrap_with_default_layers( + pub fn wrap_with_default_layers( state: &AppState, opts: &ServerOptions, router: Router, ) -> Router<()> { + #[cfg(feature = "otel")] + let router = router + .layer(axum_tracing_opentelemetry::middleware::OtelInResponseLayer) + .layer(axum_tracing_opentelemetry::middleware::OtelAxumLayer::default()); + return router .layer(CookieManagerLayer::new()) .layer(build_cors(opts)) diff --git a/crates/core/tests/integration_test.rs b/crates/core/tests/integration_test.rs index d3b2fdbe..bb747c4d 100644 --- a/crates/core/tests/integration_test.rs +++ b/crates/core/tests/integration_test.rs @@ -3,7 +3,6 @@ use axum::http::StatusCode; use axum_test::TestServer; use axum_test::multipart::MultipartForm; use tower_cookies::Cookie; -use tracing_subscriber::prelude::*; use trailbase_sqlite::params; use trailbase::AppState; @@ -34,13 +33,7 @@ fn integration_tests() { async fn test_record_apis() { let data_dir = temp_dir::TempDir::new().unwrap(); - - let Server { - state, - main_router, - admin_router, - tls, - } = Server::init(ServerOptions { + let options = ServerOptions { data_dir: DataDir(data_dir.path().to_path_buf()), address: "localhost:4041".to_string(), admin_address: None, @@ -49,9 +42,14 @@ async fn test_record_apis() { cors_allowed_origins: vec![], runtime_threads: None, ..Default::default() - }) - .await - .unwrap(); + }; + + let Server { + state, + main_router, + admin_router, + tls, + } = Server::init(options.clone()).await.unwrap(); assert!(admin_router.is_none()); assert!(tls.is_none()); @@ -93,13 +91,54 @@ async fn test_record_apis() { add_user_to_room(conn, user_x, room).await.unwrap(); - // Set up logging: declares **where** tracing is being logged to, e.g. stderr, file, sqlite. - tracing_subscriber::Registry::default() - .with(trailbase::logging::SqliteLogLayer::new(&state, false)) - .set_default(); + #[allow(unused_mut)] + let (_address, mut router) = main_router; + + #[cfg(feature = "otel")] + { + #[tracing::instrument] + async fn trace_id() -> impl axum::response::IntoResponse { + // Publish OTEL metrics: + // https://docs.rs/tracing-opentelemetry/latest/tracing_opentelemetry/struct.MetricsLayer.html + // + // To publish a new metric, add a key-value pair to your tracing::Event that contains + // following prefixes: + // + // monotonic_counter. (non-negative numbers): Used when the counter should only ever + // increase counter.: Used when the counter can go up or down + // histogram.: Used to report arbitrary values that are likely to be statistically + // meaningful gauge.: Used to report instantaneous values that can go up or down + tracing::info!(monotonic_counter.index = 1); + + return axum::Json(serde_json::json!({ + "id": tracing_opentelemetry_instrumentation_sdk::find_current_trace_id().unwrap(), + })); + } + + // NOTE: the wrapping is needed to have the OTEL layers touch the route. + router = router.merge(Server::wrap_with_default_layers( + &state, + &options, + axum::Router::new().route("/trace", axum::routing::get(trace_id)), + )); + } { - let server = TestServer::new(main_router.1).unwrap(); + let server = TestServer::new(router).unwrap(); + + #[cfg(feature = "otel")] + { + let trace_response = server.get("/trace").await; + assert_eq!(trace_response.status_code(), StatusCode::OK); + + #[derive(serde::Deserialize)] + struct TraceResponse { + id: String, + } + + let TraceResponse { id } = trace_response.json(); + assert_ne!(id, ""); + } { // User X can post to a JSON message. diff --git a/crates/core/tests/tls_test.rs b/crates/core/tests/tls_test.rs index 21761c6c..63650c1c 100644 --- a/crates/core/tests/tls_test.rs +++ b/crates/core/tests/tls_test.rs @@ -1,4 +1,5 @@ use rcgen::{CertifiedKey, generate_simple_self_signed}; +use std::sync::Arc; use tokio_rustls::rustls::pki_types::{PrivateKeyDer, pem::PemObject}; use tracing::*; use trailbase::{DataDir, Server, ServerOptions}; @@ -32,7 +33,7 @@ fn test_https_serving() { dev: false, cors_allowed_origins: vec![], runtime_threads: None, - tls_key: Some(tls_key), + tls_key: Some(Arc::new(tls_key)), tls_cert: Some(cert.der().clone()), ..Default::default() })