Files
TimeTracker/app/telemetry/service.py
T
Dries Peeters 4007ee2ca8 feat(observability): add OpenTelemetry traces, OTLP metrics, and log correlation
Introduce a centralized OTel layer (app/telemetry/otel_setup.py) that reuses
the existing OTLP endpoint and token, exports traces and metrics over OTLP/HTTP,
and instruments Flask plus SQLAlchemy. Manual OTLP log export remains for base
and product analytics; log records now include trace_id, span_id, and
event_category where tracing is active.

Business spans and product metrics cover invoices, timers, reports, auth,
webhook delivery, and scheduled jobs. RED-style HTTP metrics are recorded in
after_request alongside existing Prometheus counters. ENABLE_TRACING and
ENABLE_METRICS default on when credentials exist; graceful no-op when they do not.

Privacy is preserved: user_id appears on traces only when detailed analytics is
opted in; metrics never carry user_id; _remove_pii behavior for analytics is
unchanged. Responses inject traceparent when tracing is enabled for future
browser correlation.

Tests: test_otel_integration.py and per-test reset_for_testing() in conftest
so each app factory can reinitialize OTel.
2026-03-28 17:32:18 +01:00

334 lines
12 KiB
Python

"""
Consent-aware telemetry service backed by Grafana Cloud OTLP.
- Base telemetry is always-on and anonymous per installation.
- Detailed analytics is sent only when the user opted in.
"""
import json
import logging
import os
import platform
import base64
import time
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional
from urllib import request
from urllib.parse import urlparse
# Module-level logger used for telemetry diagnostics and failure warnings.
logger = logging.getLogger(__name__)

# Field names of the anonymous base-telemetry payload. They match the keys
# produced by `_build_base_telemetry_payload`.
BASE_SCHEMA_KEYS = frozenset(
    (
        # installation identity
        "install_id",
        "telemetry_fingerprint",
        # software / host description
        "app_version",
        "platform",
        "os_version",
        "architecture",
        "locale",
        "timezone",
        # lifecycle timestamps
        "first_seen_at",
        "last_seen_at",
        "heartbeat_at",
        # distribution details
        "release_channel",
        "deployment_type",
    )
)
def is_detailed_analytics_enabled() -> bool:
    """Report whether detailed analytics may be sent.

    Delegates to ``app.utils.telemetry.is_telemetry_enabled``; the import is
    performed at call time rather than at module import.
    """
    from app.utils.telemetry import is_telemetry_enabled

    enabled = is_telemetry_enabled()
    return enabled
def _build_base_telemetry_payload(event_kind: str) -> Dict[str, Any]:
    """Assemble the anonymous base-telemetry payload for this installation.

    Args:
        event_kind: ``"first_seen"`` forces ``first_seen_at`` to the current
            time. Any other value keeps the first-seen timestamp stored in the
            installation config, falling back to the current time when none
            is stored.

    Returns:
        A dict with the installation identifiers, host/platform description,
        ISO-8601 UTC timestamps, release channel and deployment type.
    """
    from app.config.analytics_defaults import get_analytics_config
    from app.utils.installation import get_installation_config
    from app.utils.telemetry import get_telemetry_fingerprint

    config = get_analytics_config()
    inst = get_installation_config()
    now = datetime.now(timezone.utc).isoformat()
    first_seen = inst.get_base_first_seen_sent_at() or now

    # Truncate only a real locale value (e.g. "en_US.UTF-8" -> "en_US").
    # Slicing the "unknown" fallback together with it reported "unkno".
    raw_locale = os.getenv("LANG") or os.getenv("LC_ALL") or ""
    locale_code = raw_locale[:5] or "unknown"

    payload = {
        "install_id": inst.get_install_id(),
        "telemetry_fingerprint": get_telemetry_fingerprint(),
        "app_version": config.get("app_version", "unknown"),
        "platform": platform.system(),
        "os_version": platform.release(),
        "architecture": platform.machine(),
        "locale": locale_code,
        "timezone": os.getenv("TZ", "UTC"),
        "first_seen_at": first_seen,
        "last_seen_at": now,
        "heartbeat_at": now,
        "release_channel": os.getenv("RELEASE_CHANNEL", "default"),
        # The presence of /.dockerenv is used as the container marker.
        "deployment_type": "docker" if os.path.exists("/.dockerenv") else "native",
    }
    if event_kind == "first_seen":
        payload["first_seen_at"] = now
    return payload
def _otlp_enabled() -> bool:
    """Return True when both an OTLP endpoint and an OTLP token are configured.

    Each setting is read from the analytics config first; the matching
    ``OTEL_EXPORTER_OTLP_*`` environment variable is used when the config
    value is missing or empty.
    """
    from app.config.analytics_defaults import get_analytics_config

    settings = get_analytics_config()

    otlp_endpoint = settings.get("otel_exporter_otlp_endpoint")
    if not otlp_endpoint:
        otlp_endpoint = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "")

    otlp_token = settings.get("otel_exporter_otlp_token")
    if not otlp_token:
        otlp_token = os.getenv("OTEL_EXPORTER_OTLP_TOKEN", "")

    if not otlp_endpoint:
        return False
    return bool(otlp_token)
def _build_otlp_auth_header(token: str) -> str:
    """
    Turn the single configured token into an ``Authorization`` header value.

    Recognised inputs (surrounding whitespace is trimmed first):
    - "Basic <base64>"         -> returned as given (prefix match is case-insensitive)
    - "<instance_id>:<token>"  -> base64-encoded and prefixed with "Basic "
    - anything else            -> treated as a ready base64 payload, prefixed with "Basic "
    """
    credential = (token or "").strip()

    already_prefixed = credential.lower().startswith("basic ")
    if already_prefixed:
        return credential

    if ":" not in credential:
        return f"Basic {credential}"

    b64_credential = base64.b64encode(credential.encode("utf-8")).decode("ascii")
    return f"Basic {b64_credential}"
def _telemetry_debug_logging_enabled() -> bool:
    """Return True when ``OTEL_DEBUG_LOGGING`` holds a truthy flag.

    Accepted values (case-insensitive, surrounding whitespace ignored) are
    ``1``, ``true``, ``yes`` and ``on``. Unset or any other value is False.
    """
    raw_flag = os.getenv("OTEL_DEBUG_LOGGING", "false") or ""
    normalized = raw_flag.strip().lower()
    return normalized in {"1", "true", "yes", "on"}
def _remove_pii(properties: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of ``properties`` without well-known PII keys.

    Keys are compared case-insensitively against a fixed deny-list. Only
    exact key names are matched; values are never inspected. The input dict
    is not modified.

    Args:
        properties: Event attributes to filter.

    Returns:
        A new dict holding every entry whose key is not on the deny-list.
    """
    pii_keys = {"email", "username", "ip", "ip_address", "full_name", "name", "password", "token"}
    # str() keeps non-string keys (ints, enums, ...) from raising on .lower();
    # such keys are retained under their original key object.
    return {k: v for k, v in properties.items() if str(k).lower() not in pii_keys}
def event_category_for_event_name(event_name: str) -> str:
    """Derive the event category from an event name.

    - names starting with "$" (product analytics screen events) -> "analytics"
    - dotted names -> the text before the first dot
    - everything else -> "general"
    """
    if event_name.startswith("$"):
        return "analytics"
    prefix, dot, _rest = event_name.partition(".")
    return prefix if dot else "general"
def _otlp_correlation_attributes(event_name: str) -> List[Dict[str, Any]]:
    """Build the correlation attributes attached to an OTLP log record (no PII).

    The list always contains ``event_category``. ``trace_id`` and ``span_id``
    are appended only when OTel tracing reports itself active and the trace
    context supplies a non-empty value for them.
    """
    category = event_category_for_event_name(event_name)
    attributes: List[Dict[str, Any]] = [
        {"key": "event_category", "value": {"stringValue": category}},
    ]
    try:
        from app.telemetry.otel_setup import get_trace_context_for_logs, is_otel_tracing_active

        if is_otel_tracing_active():
            trace_ctx = get_trace_context_for_logs()
            # Read both ids before appending anything, then keep the non-empty ones.
            id_pairs = [(attr_key, trace_ctx.get(attr_key)) for attr_key in ("trace_id", "span_id")]
            attributes.extend(
                {"key": attr_key, "value": {"stringValue": attr_value}}
                for attr_key, attr_value in id_pairs
                if attr_value
            )
    except Exception:
        # Best effort: any failure here leaves the record without trace ids
        # rather than blocking the event.
        pass
    return attributes
def _to_otlp_any_value(value: Any) -> Dict[str, Any]:
    """Convert a Python value into an OTLP/JSON ``AnyValue`` object.

    Mapping:
    - ``bool``  -> ``{"boolValue": value}`` (checked before ``int`` because
      ``bool`` is an ``int`` subclass)
    - ``int``   -> ``{"intValue": "<decimal string>"}``
    - ``float`` -> ``{"doubleValue": value}``; NaN and the infinities are
      emitted as the strings ``"NaN"``, ``"Infinity"`` and ``"-Infinity"``
    - anything else -> ``{"stringValue": str(value)}``

    Args:
        value: Attribute value to encode.

    Returns:
        A single-key dict that is safe to pass to ``json.dumps``.
    """
    import math

    if isinstance(value, bool):
        return {"boolValue": value}
    if isinstance(value, int):
        return {"intValue": str(value)}
    if isinstance(value, float):
        if math.isfinite(value):
            return {"doubleValue": value}
        # json.dumps would write bare NaN/Infinity tokens, which are not valid
        # JSON; the proto3 JSON mapping represents them as strings instead.
        if math.isnan(value):
            return {"doubleValue": "NaN"}
        return {"doubleValue": "Infinity" if value > 0 else "-Infinity"}
    return {"stringValue": str(value)}
def _build_otlp_logs_payload(
    event_name: str,
    identity: str,
    detailed: bool,
    safe_props: Dict[str, Any],
    service_version: str,
) -> Dict[str, Any]:
    """Build an OTLP/HTTP JSON logs request body holding one log record.

    Args:
        event_name: Used as the record body and as the ``event_name`` attribute.
        identity: Stored (stringified) in the ``identity`` attribute.
        detailed: Stored as the boolean ``detailed`` attribute.
        safe_props: Extra attributes appended after the fixed and correlation
            attributes; keys are stringified, values are encoded with
            ``_to_otlp_any_value``.
        service_version: Reported as ``service.version``; falsy values become
            ``"unknown"``.

    Returns:
        A dict with a single ``resourceLogs`` entry containing one scope
        (``timetracker.telemetry``) and one ``INFO`` log record timestamped
        with the current time.
    """
    # time.time_ns() yields integer nanoseconds directly; multiplying the float
    # from time.time() by 1e9 rounds away the low-order digits.
    now_nanos = str(time.time_ns())

    resource_attributes = [
        {"key": "service.name", "value": {"stringValue": "timetracker"}},
        {"key": "service.version", "value": {"stringValue": str(service_version or "unknown")}},
        {"key": "deployment.environment", "value": {"stringValue": os.getenv("FLASK_ENV", "production")}},
    ]

    record_attributes = [
        {"key": "event_name", "value": {"stringValue": event_name}},
        {"key": "identity", "value": {"stringValue": str(identity)}},
        {"key": "detailed", "value": {"boolValue": bool(detailed)}},
    ]
    record_attributes.extend(_otlp_correlation_attributes(event_name))
    record_attributes.extend(
        {"key": str(key), "value": _to_otlp_any_value(value)} for key, value in safe_props.items()
    )

    log_record = {
        "timeUnixNano": now_nanos,
        "severityText": "INFO",
        "body": {"stringValue": event_name},
        "attributes": record_attributes,
    }
    return {
        "resourceLogs": [
            {
                "resource": {"attributes": resource_attributes},
                "scopeLogs": [
                    {
                        "scope": {"name": "timetracker.telemetry"},
                        "logRecords": [log_record],
                    }
                ],
            }
        ]
    }
def _send_otlp_event(event_name: str, identity: str, properties: Dict[str, Any], detailed: bool) -> bool:
    """POST a single telemetry event to the configured OTLP logs endpoint.

    The endpoint and token come from the analytics config, falling back to
    ``OTEL_EXPORTER_OTLP_ENDPOINT`` / ``OTEL_EXPORTER_OTLP_TOKEN``. The logs
    path ``/v1/logs`` is appended unless the endpoint already ends with it.

    Args:
        event_name: Name of the event; also used in diagnostic log lines.
        identity: Identifier recorded with the event (stringified).
        properties: Extra event attributes. When ``detailed`` is true they are
            filtered through ``_remove_pii`` first; otherwise they are sent
            as given.
        detailed: Whether this is a detailed-analytics event.

    Returns:
        True when the HTTP request completed without raising. False when the
        OTLP config is incomplete, the endpoint is not a parsable http(s)
        URL, or the request failed. URL and transport errors are logged and
        reported as False rather than raised.
    """
    from app.config.analytics_defaults import get_analytics_config

    config = get_analytics_config()
    endpoint = config.get("otel_exporter_otlp_endpoint") or os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "")
    token = config.get("otel_exporter_otlp_token") or os.getenv("OTEL_EXPORTER_OTLP_TOKEN", "")
    # Read the debug flag once instead of re-reading the environment per log line.
    debug_logging = _telemetry_debug_logging_enabled()

    if not endpoint or not token:
        if debug_logging:
            logger.info(
                "telemetry.skip event=%s reason=missing_otlp_config endpoint_set=%s token_set=%s",
                event_name,
                bool(endpoint),
                bool(token),
            )
        return False

    # Support OTEL-style base endpoint by auto-targeting logs path.
    endpoint = endpoint.rstrip("/")
    if not endpoint.endswith("/v1/logs"):
        endpoint = f"{endpoint}/v1/logs"

    # Reject anything that is not plain HTTP(S) up front: urlopen also handles
    # other schemes (e.g. file:), and a scheme-less value would make
    # Request() raise ValueError into the caller.
    try:
        parsed = urlparse(endpoint)
    except ValueError as exc:
        logger.warning("telemetry.fail event=%s error=%s", event_name, exc)
        return False
    if parsed.scheme not in ("http", "https"):
        logger.warning(
            "telemetry.fail event=%s error=unsupported_endpoint_scheme scheme=%s",
            event_name,
            parsed.scheme or "none",
        )
        return False

    safe_props = _remove_pii(properties) if detailed else properties
    payload = _build_otlp_logs_payload(
        event_name=event_name,
        identity=str(identity),
        detailed=detailed,
        safe_props=safe_props,
        service_version=str(config.get("app_version", "unknown")),
    )
    body = json.dumps(payload).encode("utf-8")
    headers = {
        "Content-Type": "application/json",
        "Authorization": _build_otlp_auth_header(token),
    }

    if debug_logging:
        # Only the auth *mode* and the identity length are logged, never the
        # token or the identity itself.
        auth_mode = "basic_from_colon" if ":" in token and not token.lower().startswith("basic ") else "basic_direct"
        logger.info(
            "telemetry.send event=%s detailed=%s endpoint=%s://%s%s auth_mode=%s identity_len=%s props_count=%s",
            event_name,
            detailed,
            parsed.scheme,
            parsed.netloc,
            parsed.path,
            auth_mode,
            len(str(identity)),
            len(safe_props),
        )

    try:
        # Building the request inside the try keeps URL validation errors on
        # the same "log and return False" path as transport errors.
        req = request.Request(
            endpoint,
            data=body,
            method="POST",
            headers=headers,
        )
        with request.urlopen(req, timeout=5) as response:
            if debug_logging:
                logger.info("telemetry.ok event=%s status=%s", event_name, getattr(response, "status", "unknown"))
            return True
    except Exception as exc:
        logger.warning("telemetry.fail event=%s error=%s", event_name, exc)
        return False
def send_base_telemetry(payload: Dict[str, Any]) -> bool:
    """Send one base-telemetry payload, identified by its ``install_id``.

    The optional ``_event`` entry selects the event name (default
    ``base_telemetry.heartbeat``) and is not forwarded as a property.
    Returns False without sending when ``install_id`` is missing or empty;
    otherwise returns the result of the OTLP send.
    """
    install_id = payload.get("install_id")
    if install_id:
        properties = dict(payload)
        event_name = properties.pop("_event", "base_telemetry.heartbeat")
        return _send_otlp_event(
            event_name=event_name,
            identity=str(install_id),
            properties=properties,
            detailed=False,
        )
    return False
def send_base_first_seen() -> bool:
    """Send the ``base_telemetry.first_seen`` event unless one was already recorded.

    Returns True only when the event was sent successfully; in that case the
    sent ``first_seen_at`` timestamp is stored in the installation config.
    Returns False when a timestamp is already stored or when sending fails.
    """
    from app.utils.installation import get_installation_config

    installation = get_installation_config()
    already_sent = installation.get_base_first_seen_sent_at()
    if already_sent:
        return False

    first_seen_payload = _build_base_telemetry_payload("first_seen")
    first_seen_payload["_event"] = "base_telemetry.first_seen"
    first_seen_payload["first_seen_at"] = datetime.now(timezone.utc).isoformat()

    delivered = send_base_telemetry(first_seen_payload)
    if not delivered:
        return False

    # Persist only after a successful send so a failed attempt leaves nothing stored.
    installation.set_base_first_seen_sent_at(first_seen_payload["first_seen_at"])
    return True
def send_base_heartbeat() -> bool:
    """Build and send a ``base_telemetry.heartbeat`` event.

    Returns the result of ``send_base_telemetry`` for the built payload.
    """
    heartbeat = {
        **_build_base_telemetry_payload("heartbeat"),
        "_event": "base_telemetry.heartbeat",
    }
    return send_base_telemetry(heartbeat)
def identify_user(user_id: Any, properties: Optional[Dict[str, Any]] = None) -> None:
    """Send an ``analytics.identify`` event for ``user_id``.

    Does nothing unless detailed analytics is enabled. ``properties`` defaults
    to an empty dict; the event is sent as a detailed event, so PII keys are
    filtered by the sender. The send result is discarded.
    """
    if is_detailed_analytics_enabled():
        _send_otlp_event(
            "analytics.identify",
            str(user_id),
            properties or {},
            detailed=True,
        )
def send_analytics_event(user_id: Any, event_name: str, properties: Optional[Dict[str, Any]] = None) -> None:
    """Send a detailed analytics event enriched with install and request context.

    Does nothing unless detailed analytics is enabled. The caller's
    ``properties`` are copied, then extended with installation identifiers,
    environment, app version and deployment method. When a Flask request is
    available, URL and user-agent fields are added as well. The send result
    is discarded.
    """
    if not is_detailed_analytics_enabled():
        return

    from app.config.analytics_defaults import get_analytics_config
    from app.utils.installation import get_installation_config
    from app.utils.telemetry import get_telemetry_fingerprint

    analytics_config = get_analytics_config()

    # Work on a copy so the caller's dict is never mutated.
    enriched: Dict[str, Any] = dict(properties or {})
    enriched.update(
        {
            "install_id": get_installation_config().get_install_id(),
            "telemetry_fingerprint": get_telemetry_fingerprint(),
            "environment": os.getenv("FLASK_ENV", "production"),
            "app_version": analytics_config.get("app_version"),
            "deployment_method": "docker" if os.path.exists("/.dockerenv") else "native",
        }
    )

    try:
        from flask import request as flask_request

        if flask_request:
            # NOTE(review): request.url presumably includes the query string -
            # confirm nothing sensitive can appear there before it is exported.
            enriched["current_url"] = flask_request.url
            enriched["host"] = flask_request.host
            enriched["pathname"] = flask_request.path
            enriched["browser"] = getattr(flask_request.user_agent, "browser", None)
            ua_platform = getattr(flask_request.user_agent, "platform", None)
            if ua_platform in ["android", "iphone"]:
                enriched["device_type"] = "mobile"
            else:
                enriched["device_type"] = "desktop"
            enriched["os"] = ua_platform
    except Exception:
        # Request context is optional enrichment; fields set before a failure
        # are kept and the event is still sent.
        pass

    _send_otlp_event(event_name=event_name, identity=str(user_id), properties=enriched, detailed=True)