test_grpc_upstream_404_maps_to_404
@@ -0,0 +1,167 @@
#!/usr/bin/env python3
"""
Capture CPU and event-loop lag statistics for a running Doorman process.

Writes a JSON file (perf-stats.json) alongside k6 results so compare_perf.py
can print these figures in the diff report.

Note: Loop lag is measured by this monitor's own asyncio loop as an
approximation of scheduler pressure on the host. It does not instrument the
server's internal loop directly, but correlates under shared host load.
"""
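
# Example invocation (timeout value illustrative; the flags match parse_args below):
#   python3 scripts/capture_perf_stats.py --pidfile backend-services/doorman.pid \
#       --output load-tests/perf-stats.json --timeout 120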
from __future__ import annotations

import argparse
import asyncio
import json
import os
import signal
import statistics
import sys
import time
from pathlib import Path

try:
    import psutil  # type: ignore
except Exception:
    psutil = None  # type: ignore

def parse_args() -> argparse.Namespace:
    ap = argparse.ArgumentParser()
    ap.add_argument("--pid", type=int, help="PID of the target process")
    ap.add_argument("--pidfile", type=str, default="backend-services/doorman.pid",
                    help="Path to a PID file (used if --pid is not provided)")
    ap.add_argument("--output", type=str, default="load-tests/perf-stats.json",
                    help="Output JSON path")
    ap.add_argument("--cpu-interval", type=float, default=0.5,
                    help="CPU sampling interval in seconds")
    ap.add_argument("--lag-interval", type=float, default=0.05,
                    help="Loop-lag sampling interval in seconds")
    ap.add_argument("--timeout", type=float, default=0.0,
                    help="Optional timeout in seconds; 0 = run until the process exits or SIGTERM")
    return ap.parse_args()

def read_pid(pid: int | None, pidfile: str) -> int | None:
    if pid:
        return pid
    try:
        with open(pidfile, "r") as f:
            return int(f.read().strip())
    except Exception:
        return None

async def sample_cpu(proc: "psutil.Process", interval: float, stop: asyncio.Event, samples: list[float]):
    # Prime the cpu_percent() baseline; psutil's first call always returns 0.0.
    try:
        proc.cpu_percent(None)
    except Exception:
        pass
    while not stop.is_set():
        try:
            # cpu_percent(interval) blocks for the interval, so run it in a worker thread.
            val = await asyncio.to_thread(proc.cpu_percent, interval)
            samples.append(float(val))
        except Exception:
            await asyncio.sleep(interval)
            continue

async def sample_loop_lag(interval: float, stop: asyncio.Event, lags_ms: list[float]):
    # Measure scheduling delay: sleep until the next tick, then record how late we woke up.
    next_ts = time.perf_counter() + interval
    while not stop.is_set():
        await asyncio.sleep(max(0.0, next_ts - time.perf_counter()))
        now = time.perf_counter()
        expected = next_ts
        lag = max(0.0, (now - expected) * 1000.0)  # ms
        lags_ms.append(lag)
        next_ts = expected + interval

def percentile(values: list[float], p: float) -> float:
    # Nearest-rank percentile over a sorted copy (no interpolation).
    if not values:
        return 0.0
    values = sorted(values)
    k = int(max(0, min(len(values) - 1, round((p / 100.0) * (len(values) - 1)))))
    return float(values[k])
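
# For example: percentile([10.0, 20.0, 30.0, 40.0], 95) -> 40.0 (rank = round(0.95 * 3) = 3).
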
async def main() -> int:
    if psutil is None:
        print("psutil is not installed; CPU stats unavailable", file=sys.stderr)
        return 1

    args = parse_args()
    pid = read_pid(args.pid, args.pidfile)
    if not pid:
        print(f"No PID found (pidfile: {args.pidfile}). Is the server running?", file=sys.stderr)
        return 2

    try:
        proc = psutil.Process(pid)
    except Exception as e:
        print(f"Failed to attach to PID {pid}: {e}", file=sys.stderr)
        return 3

    stop = asyncio.Event()

    def _handle_sig(*_):
        stop.set()

    for s in (signal.SIGINT, signal.SIGTERM):
        try:
            signal.signal(s, _handle_sig)
        except Exception:
            pass

    cpu_samples: list[float] = []
    lag_samples_ms: list[float] = []

    tasks = [
        asyncio.create_task(sample_cpu(proc, args.cpu_interval, stop, cpu_samples)),
        asyncio.create_task(sample_loop_lag(args.lag_interval, stop, lag_samples_ms)),
    ]

    start = time.time()
    try:
        while not stop.is_set():
            # Exit if target process is gone
            if not proc.is_running():
                break
            if args.timeout > 0 and (time.time() - start) >= args.timeout:
                break
            await asyncio.sleep(0.2)
    finally:
        stop.set()
        for t in tasks:
            try:
                await asyncio.wait_for(t, timeout=2.0)
            except Exception:
                pass

    out = {
        "cpu_percent_avg": round(statistics.fmean(cpu_samples), 2) if cpu_samples else 0.0,
        "cpu_percent_p95": round(percentile(cpu_samples, 95), 2) if cpu_samples else 0.0,
        "cpu_samples": len(cpu_samples),
        "loop_lag_ms_p95": round(percentile(lag_samples_ms, 95), 2) if lag_samples_ms else 0.0,
        "loop_lag_samples": len(lag_samples_ms),
    }
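    # For illustration, a resulting perf-stats.json might look like (values hypothetical):
    #   {"cpu_percent_avg": 41.3, "cpu_percent_p95": 78.5, "cpu_samples": 120,
    #    "loop_lag_ms_p95": 3.2, "loop_lag_samples": 1200}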

    try:
        out_path = Path(args.output)
        out_path.parent.mkdir(parents=True, exist_ok=True)
        with out_path.open("w", encoding="utf-8") as f:
            json.dump(out, f, indent=2)
        print(f"Wrote perf stats: {out_path}")
    except Exception as e:
        print(f"Failed to write output: {e}", file=sys.stderr)
        return 4

    return 0


if __name__ == "__main__":
    raise SystemExit(asyncio.run(main()))

@@ -0,0 +1,91 @@
#!/usr/bin/env python3
import json
import sys
from pathlib import Path

REGRESSION_THRESHOLD = 0.10  # 10%

def load_summary(path: Path):
    with path.open('r', encoding='utf-8') as f:
        return json.load(f)


def extract_metrics(summary: dict):
    m = summary.get('metrics', {})
    http = m.get('http_req_duration', {}).get('values', {})
    http_reqs = m.get('http_reqs', {}).get('values', {})
    p50 = float(http.get('p(50)', 0.0))
    p95 = float(http.get('p(95)', 0.0))
    p99 = float(http.get('p(99)', 0.0))
    # Prefer provided rate; fallback to 0 if missing
    rps = float(http_reqs.get('rate', 0.0))
    return {
        'p50': p50,
        'p95': p95,
        'p99': p99,
        'rps': rps,
    }
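
# For reference, extract_metrics above assumes the k6 summary JSON is shaped
# roughly like this (values hypothetical):
#   {"metrics": {"http_req_duration": {"values": {"p(50)": 12.1, "p(95)": 48.7, "p(99)": 93.0}},
#                "http_reqs": {"values": {"rate": 812.4}}}}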

def main():
    if len(sys.argv) < 3:
        print('Usage: compare_perf.py <current_summary.json> <baseline_summary.json>')
        sys.exit(2)
    current = Path(sys.argv[1])
    baseline = Path(sys.argv[2])
    if not current.exists():
        print(f'Current summary not found: {current}')
        sys.exit(2)
    if not baseline.exists():
        print(f'Baseline summary not found: {baseline}')
        sys.exit(2)

    cur = load_summary(current)
    base = load_summary(baseline)
    curm = extract_metrics(cur)
    basem = extract_metrics(base)

    print('Baseline metrics:')
    print(f" p50={basem['p50']:.2f}ms p95={basem['p95']:.2f}ms p99={basem['p99']:.2f}ms rps={basem['rps']:.2f}")
    print('Current metrics:')
    print(f" p50={curm['p50']:.2f}ms p95={curm['p95']:.2f}ms p99={curm['p99']:.2f}ms rps={curm['rps']:.2f}")

    failures = []
    # Latency quantiles must not regress by more than +10%
    for q in ('p50', 'p95', 'p99'):
        base_v = basem[q]
        cur_v = curm[q]
        if base_v > 0:
            allowed = base_v * (1.0 + REGRESSION_THRESHOLD)
            if cur_v > allowed:
                failures.append(f"{q} regression: {cur_v:.2f}ms > {allowed:.2f}ms (baseline {base_v:.2f}ms)")

    # RPS must not drop by more than -10%
    base_rps = basem['rps']
    cur_rps = curm['rps']
    if base_rps > 0:
        allowed_rps = base_rps * (1.0 - REGRESSION_THRESHOLD)
        if cur_rps < allowed_rps:
            failures.append(f'RPS regression: {cur_rps:.2f} < {allowed_rps:.2f} (baseline {base_rps:.2f})')
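    # Worked example with REGRESSION_THRESHOLD = 0.10: a baseline p95 of 100.00ms
    # allows up to 110.00ms, and a baseline rate of 500.00 RPS allows a floor of 450.00.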

    # Optional: compare CPU/event-loop utilization if perf-stats.json files are present alongside summaries
    try:
        cur_stats = (current.parent / 'perf-stats.json')
        base_stats = (baseline.parent / 'perf-stats.json')
        if cur_stats.exists() and base_stats.exists():
            cstats = load_summary(cur_stats)
            bstats = load_summary(base_stats)
            # These keys match what capture_perf_stats.py writes.
            for key in ('cpu_percent_avg', 'cpu_percent_p95', 'loop_lag_ms_p95'):
                if key in cstats and key in bstats:
                    print(f"{key}: baseline={bstats[key]} current={cstats[key]}")
    except Exception:
        pass

    if failures:
        print('Perf regression detected:')
        for f in failures:
            print(f'- {f}')
        sys.exit(1)
    print('Performance within regression thresholds.')


if __name__ == '__main__':
    main()
@@ -0,0 +1,55 @@
#!/usr/bin/env bash
set -euo pipefail

# Runs k6 load-tests/k6-load-test.js, captures CPU/loop-lag stats while running,
# and compares results against a baseline summary via scripts/compare_perf.py.
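#
# Example invocation (script path hypothetical; the env vars below are the real knobs):
#   BASE_URL=http://localhost:8000 BASELINE_JSON=load-tests/baseline/k6-summary.json \
#     bash scripts/run-perf-check.sh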

BASE_URL=${BASE_URL:-http://localhost:8000}
BASELINE_JSON=${BASELINE_JSON:-load-tests/baseline/k6-summary.json}
CURRENT_JSON=${CURRENT_JSON:-load-tests/k6-summary.json}
PERF_JSON=${PERF_JSON:-load-tests/perf-stats.json}

echo "Using BASE_URL=${BASE_URL}"
echo "Baseline file: ${BASELINE_JSON}"

if ! command -v k6 >/dev/null 2>&1; then
  echo "Error: k6 is not installed. Install from https://k6.io/docs/get-started/installation/" >&2
  exit 2
fi

if ! command -v python3 >/dev/null 2>&1; then
  echo "Error: python3 not found" >&2
  exit 2
fi

# Launch perf capture in background (optional; it exits gracefully if the pidfile is missing)
python3 scripts/capture_perf_stats.py --output "${PERF_JSON}" --pidfile backend-services/doorman.pid --timeout 0 \
  >/dev/null 2>&1 &
MONITOR_PID=$!

cleanup() {
  if kill -0 "${MONITOR_PID}" >/dev/null 2>&1; then
    kill "${MONITOR_PID}" >/dev/null 2>&1 || true
    wait "${MONITOR_PID}" || true
  fi
}
trap cleanup EXIT INT TERM
echo "Running k6 load test..."
|
||||
K6_CMD=(k6 run load-tests/k6-load-test.js --env BASE_URL="${BASE_URL}")
|
||||
"${K6_CMD[@]}"
|
||||
|
||||
echo
|
||||
echo "k6 summary written to: ${CURRENT_JSON}"
|
||||
|
||||

if [ ! -f "${BASELINE_JSON}" ]; then
  echo "Baseline summary not found at ${BASELINE_JSON}" >&2
  echo "Create one by copying a known-good run, e.g.:" >&2
  echo "  mkdir -p '$(dirname "${BASELINE_JSON}")' && cp '${CURRENT_JSON}' '${BASELINE_JSON}'" >&2
  exit 3
fi

echo
echo "Comparing current vs baseline..."
python3 scripts/compare_perf.py "${CURRENT_JSON}" "${BASELINE_JSON}"