test_grpc_upstream_404_maps_to_404

seniorswe
2025-10-12 06:59:00 -04:00
parent 665885562e
commit f5d0d34993
121 changed files with 13625 additions and 7337 deletions
scripts/capture_perf_stats.py (+167)
@@ -0,0 +1,167 @@
#!/usr/bin/env python3
"""
Capture CPU and event-loop lag statistics for a running Doorman process.
Writes a JSON file (perf-stats.json) alongside k6 results so compare_perf.py
can print these figures in the diff report.
Note: loop lag is measured by this monitor's own asyncio loop as an
approximation of scheduler pressure on the host. It does not instrument the
server's internal event loop directly, but the two track each other because
the processes share the host.
"""
from __future__ import annotations
import argparse
import asyncio
import json
import os
import signal
import statistics
import sys
import time
from pathlib import Path
try:
import psutil # type: ignore
except Exception:
psutil = None # type: ignore
def parse_args() -> argparse.Namespace:
ap = argparse.ArgumentParser()
ap.add_argument("--pid", type=int, help="PID of the target process")
ap.add_argument("--pidfile", type=str, default="backend-services/doorman.pid",
help="Path to PID file (used if --pid not provided)")
ap.add_argument("--output", type=str, default="load-tests/perf-stats.json",
help="Output JSON path")
ap.add_argument("--cpu-interval", type=float, default=0.5,
help="CPU sampling interval seconds")
ap.add_argument("--lag-interval", type=float, default=0.05,
help="Loop lag sampling interval seconds")
ap.add_argument("--timeout", type=float, default=0.0,
help="Optional timeout seconds; 0 = until process exits or SIGTERM")
return ap.parse_args()
def read_pid(pid: int | None, pidfile: str) -> int | None:
if pid:
return pid
try:
with open(pidfile, "r") as f:
return int(f.read().strip())
except Exception:
return None
async def sample_cpu(proc: "psutil.Process", interval: float, stop: asyncio.Event, samples: list[float]):
    # Prime cpu_percent(): psutil's first call always returns 0.0, so take one
    # throwaway reading before sampling for real.
try:
proc.cpu_percent(None)
except Exception:
pass
while not stop.is_set():
try:
val = await asyncio.to_thread(proc.cpu_percent, interval)
samples.append(float(val))
except Exception:
await asyncio.sleep(interval)
continue
async def sample_loop_lag(interval: float, stop: asyncio.Event, lags_ms: list[float]):
    # Ask the loop to wake us after `interval`; any extra time beyond that is
    # scheduling delay (event-loop lag).
    next_ts = time.perf_counter() + interval
    while not stop.is_set():
        await asyncio.sleep(max(0.0, next_ts - time.perf_counter()))
        now = time.perf_counter()
        lag_ms = max(0.0, (now - next_ts) * 1000.0)
        lags_ms.append(lag_ms)
        # Re-anchor to `now` so a single long stall is recorded once instead of
        # compounding into every subsequent sample.
        next_ts = now + interval
def percentile(values: list[float], p: float) -> float:
if not values:
return 0.0
values = sorted(values)
k = int(max(0, min(len(values) - 1, round((p / 100.0) * (len(values) - 1)))))
return float(values[k])
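# Example: percentile([1.0, 2.0, 10.0, 100.0], 95) == 100.0, since
# round(0.95 * (4 - 1)) == 3 selects the last sorted value (nearest-rank style).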
async def main() -> int:
if psutil is None:
print("psutil is not installed; CPU stats unavailable", file=sys.stderr)
return 1
args = parse_args()
pid = read_pid(args.pid, args.pidfile)
if not pid:
print(f"No PID found (pidfile: {args.pidfile}). Is the server running?", file=sys.stderr)
return 2
try:
proc = psutil.Process(pid)
except Exception as e:
print(f"Failed to attach to PID {pid}: {e}", file=sys.stderr)
return 3
stop = asyncio.Event()
def _handle_sig(*_):
stop.set()
for s in (signal.SIGINT, signal.SIGTERM):
try:
signal.signal(s, _handle_sig)
except Exception:
pass
cpu_samples: list[float] = []
lag_samples_ms: list[float] = []
tasks = [
asyncio.create_task(sample_cpu(proc, args.cpu_interval, stop, cpu_samples)),
asyncio.create_task(sample_loop_lag(args.lag_interval, stop, lag_samples_ms)),
]
start = time.time()
try:
while not stop.is_set():
# Exit if target process is gone
if not proc.is_running():
break
if args.timeout > 0 and (time.time() - start) >= args.timeout:
break
await asyncio.sleep(0.2)
finally:
stop.set()
for t in tasks:
try:
await asyncio.wait_for(t, timeout=2.0)
except Exception:
pass
out = {
"cpu_percent_avg": round(statistics.fmean(cpu_samples), 2) if cpu_samples else 0.0,
"cpu_percent_p95": round(percentile(cpu_samples, 95), 2) if cpu_samples else 0.0,
"cpu_samples": len(cpu_samples),
"loop_lag_ms_p95": round(percentile(lag_samples_ms, 95), 2) if lag_samples_ms else 0.0,
"loop_lag_samples": len(lag_samples_ms),
}
try:
out_path = Path(args.output)
out_path.parent.mkdir(parents=True, exist_ok=True)
with out_path.open("w", encoding="utf-8") as f:
json.dump(out, f, indent=2)
print(f"Wrote perf stats: {out_path}")
except Exception as e:
print(f"Failed to write output: {e}", file=sys.stderr)
return 4
return 0
if __name__ == "__main__":
raise SystemExit(asyncio.run(main()))
scripts/compare_perf.py (+91)
@@ -0,0 +1,91 @@
#!/usr/bin/env python3
import json
import sys
from pathlib import Path
REGRESSION_THRESHOLD = 0.10 # 10%
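# Worked example: with a baseline p95 of 100.00 ms the gate fails once the
# current p95 exceeds 110.00 ms; with a baseline of 500.00 rps it fails once
# throughput drops below 450.00 rps.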
def load_summary(path: Path):
with path.open('r', encoding='utf-8') as f:
return json.load(f)
def extract_metrics(summary: dict):
m = summary.get('metrics', {})
http = m.get('http_req_duration', {}).get('values', {})
http_reqs = m.get('http_reqs', {}).get('values', {})
    # k6's default summaryTrendStats omit p(50) and p(99); fall back to 'med'
    # for the median and report 0.0 for percentiles that were not exported.
    p50 = float(http.get('p(50)', http.get('med', 0.0)))
    p95 = float(http.get('p(95)', 0.0))
    p99 = float(http.get('p(99)', 0.0))
    # Prefer the reported request rate; fall back to 0 if missing
    rps = float(http_reqs.get('rate', 0.0))
return {
'p50': p50,
'p95': p95,
'p99': p99,
'rps': rps,
}
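# Sketch of the summary fragment this expects: the end-of-test data structure
# k6 passes to handleSummary, serialized to JSON (field values illustrative):
#   {"metrics": {"http_req_duration": {"values": {"med": 12.3, "p(95)": 48.7}},
#                "http_reqs": {"values": {"count": 12000, "rate": 400.0}}}}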
def main():
if len(sys.argv) < 3:
print('Usage: compare_perf.py <current_summary.json> <baseline_summary.json>')
sys.exit(2)
current = Path(sys.argv[1])
baseline = Path(sys.argv[2])
if not current.exists():
print(f'Current summary not found: {current}')
sys.exit(2)
if not baseline.exists():
print(f'Baseline summary not found: {baseline}')
sys.exit(2)
cur = load_summary(current)
base = load_summary(baseline)
curm = extract_metrics(cur)
basem = extract_metrics(base)
print('Baseline metrics:')
print(f" p50={basem['p50']:.2f}ms p95={basem['p95']:.2f}ms p99={basem['p99']:.2f}ms rps={basem['rps']:.2f}")
print('Current metrics:')
print(f" p50={curm['p50']:.2f}ms p95={curm['p95']:.2f}ms p99={curm['p99']:.2f}ms rps={curm['rps']:.2f}")
failures = []
    # Latency percentiles must not regress more than +10%
for q in ('p50', 'p95', 'p99'):
base_v = basem[q]
cur_v = curm[q]
if base_v > 0:
allowed = base_v * (1.0 + REGRESSION_THRESHOLD)
if cur_v > allowed:
failures.append(f"{q} regression: {cur_v:.2f}ms > {allowed:.2f}ms (baseline {base_v:.2f}ms)")
# RPS must not drop more than -10%
base_rps = basem['rps']
cur_rps = curm['rps']
if base_rps > 0:
allowed_rps = base_rps * (1.0 - REGRESSION_THRESHOLD)
if cur_rps < allowed_rps:
failures.append(f'RPS regression: {cur_rps:.2f} < {allowed_rps:.2f} (baseline {base_rps:.2f})')
    # Optional: compare CPU/event-loop stats if perf-stats.json files (written
    # by capture_perf_stats.py) sit alongside the summaries.
    try:
        cur_stats = current.parent / 'perf-stats.json'
        base_stats = baseline.parent / 'perf-stats.json'
        if cur_stats.exists() and base_stats.exists():
            cstats = load_summary(cur_stats)
            bstats = load_summary(base_stats)
            # These keys must match what capture_perf_stats.py writes.
            for key in ('cpu_percent_avg', 'cpu_percent_p95', 'loop_lag_ms_p95'):
                if key in cstats and key in bstats:
                    print(f"{key}: baseline={bstats[key]} current={cstats[key]}")
    except Exception:
        pass
if failures:
print('Perf regression detected:')
for f in failures:
print(f'- {f}')
sys.exit(1)
print('Performance within regression thresholds.')
if __name__ == '__main__':
main()
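# Example, with the paths the runner script uses:
#   python3 scripts/compare_perf.py load-tests/k6-summary.json \
#       load-tests/baseline/k6-summary.json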
+55
@@ -0,0 +1,55 @@
#!/usr/bin/env bash
set -euo pipefail
# Runs k6 load-tests/k6-load-test.js, captures CPU/loop-lag stats while running,
# and compares results against a baseline summary via scripts/compare_perf.py.
BASE_URL=${BASE_URL:-http://localhost:8000}
BASELINE_JSON=${BASELINE_JSON:-load-tests/baseline/k6-summary.json}
CURRENT_JSON=${CURRENT_JSON:-load-tests/k6-summary.json}
PERF_JSON=${PERF_JSON:-load-tests/perf-stats.json}
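# Each of the above can be overridden per invocation, e.g. (URL illustrative;
# this script's own path is not shown in the diff):
#   BASE_URL=http://127.0.0.1:9000 PERF_JSON=/tmp/perf-stats.json ./run-perf-check.sh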
echo "Using BASE_URL=${BASE_URL}"
echo "Baseline file: ${BASELINE_JSON}"
if ! command -v k6 >/dev/null 2>&1; then
echo "Error: k6 is not installed. Install from https://k6.io/docs/get-started/installation/" >&2
exit 2
fi
if ! command -v python3 >/dev/null 2>&1; then
echo "Error: python3 not found" >&2
exit 2
fi
# Launch perf capture in the background (optional: if the pidfile is missing,
# the monitor exits quietly and no perf-stats file is produced).
python3 scripts/capture_perf_stats.py --output "${PERF_JSON}" --pidfile backend-services/doorman.pid --timeout 0 \
>/dev/null 2>&1 &
MONITOR_PID=$!
cleanup() {
if kill -0 "${MONITOR_PID}" >/dev/null 2>&1; then
kill "${MONITOR_PID}" >/dev/null 2>&1 || true
wait "${MONITOR_PID}" || true
fi
}
trap cleanup EXIT INT TERM
echo "Running k6 load test..."
K6_CMD=(k6 run load-tests/k6-load-test.js --env BASE_URL="${BASE_URL}")
"${K6_CMD[@]}"
echo
echo "k6 summary written to: ${CURRENT_JSON}"
# Stop the perf monitor now so it flushes ${PERF_JSON} before compare_perf.py
# looks for it; the EXIT trap alone would fire only after the comparison.
cleanup
if [ ! -f "${BASELINE_JSON}" ]; then
echo "Baseline summary not found at ${BASELINE_JSON}" >&2
echo "Create one by copying a known-good run, e.g.:" >&2
echo " mkdir -p \"$(dirname \"${BASELINE_JSON}\")\" && cp '${CURRENT_JSON}' '${BASELINE_JSON}'" >&2
exit 3
fi
echo
echo "Comparing current vs baseline..."
python3 scripts/compare_perf.py "${CURRENT_JSON}" "${BASELINE_JSON}"
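# Exit status: 0 when all metrics stay within thresholds; 1 when compare_perf.py
# detects a regression (its non-zero status becomes this script's, as the final
# command); 2 when k6 or python3 is missing; 3 when no baseline summary exists.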