test_grpc_upstream_404_maps_to_404

seniorswe
2025-10-12 06:59:00 -04:00
parent 665885562e
commit f5d0d34993
121 changed files with 13625 additions and 7337 deletions
scripts/capture_perf_stats.py (+167)
@@ -0,0 +1,167 @@
#!/usr/bin/env python3
"""
Capture CPU and event-loop lag statistics for a running Doorman process.
Writes a JSON file (perf-stats.json) alongside k6 results so compare_perf.py
can print these figures in the diff report.
Note: loop lag is measured by this monitor's own asyncio loop as an
approximation of scheduler pressure on the host. It does not instrument the
server's internal event loop directly, but the two track each other because
the processes share the host.
"""
from __future__ import annotations
import argparse
import asyncio
import json
import os
import signal
import statistics
import sys
import time
from pathlib import Path
try:
import psutil # type: ignore
except Exception:
psutil = None # type: ignore
def parse_args() -> argparse.Namespace:
ap = argparse.ArgumentParser()
ap.add_argument("--pid", type=int, help="PID of the target process")
ap.add_argument("--pidfile", type=str, default="backend-services/doorman.pid",
help="Path to PID file (used if --pid not provided)")
ap.add_argument("--output", type=str, default="load-tests/perf-stats.json",
help="Output JSON path")
ap.add_argument("--cpu-interval", type=float, default=0.5,
help="CPU sampling interval seconds")
ap.add_argument("--lag-interval", type=float, default=0.05,
help="Loop lag sampling interval seconds")
ap.add_argument("--timeout", type=float, default=0.0,
help="Optional timeout seconds; 0 = until process exits or SIGTERM")
return ap.parse_args()
def read_pid(pid: int | None, pidfile: str) -> int | None:
if pid:
return pid
try:
with open(pidfile, "r") as f:
return int(f.read().strip())
except Exception:
return None
async def sample_cpu(proc: "psutil.Process", interval: float, stop: asyncio.Event, samples: list[float]):
    # Prime cpu_percent(): psutil's first call always returns 0.0, so take one
    # throwaway reading before sampling for real.
try:
proc.cpu_percent(None)
except Exception:
pass
while not stop.is_set():
try:
val = await asyncio.to_thread(proc.cpu_percent, interval)
samples.append(float(val))
except Exception:
await asyncio.sleep(interval)
continue
async def sample_loop_lag(interval: float, stop: asyncio.Event, lags_ms: list[float]):
    # Ask the loop to wake us after `interval`; any extra time beyond that is
    # scheduling delay (event-loop lag).
    next_ts = time.perf_counter() + interval
    while not stop.is_set():
        await asyncio.sleep(max(0.0, next_ts - time.perf_counter()))
        now = time.perf_counter()
        lag_ms = max(0.0, (now - next_ts) * 1000.0)
        lags_ms.append(lag_ms)
        # Re-anchor to `now` so a single long stall is recorded once instead of
        # compounding into every subsequent sample.
        next_ts = now + interval
def percentile(values: list[float], p: float) -> float:
if not values:
return 0.0
values = sorted(values)
k = int(max(0, min(len(values) - 1, round((p / 100.0) * (len(values) - 1)))))
return float(values[k])
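# Example: percentile([1.0, 2.0, 10.0, 100.0], 95) == 100.0, since
# round(0.95 * (4 - 1)) == 3 selects the last sorted value (nearest-rank style).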
async def main() -> int:
if psutil is None:
print("psutil is not installed; CPU stats unavailable", file=sys.stderr)
return 1
args = parse_args()
pid = read_pid(args.pid, args.pidfile)
if not pid:
print(f"No PID found (pidfile: {args.pidfile}). Is the server running?", file=sys.stderr)
return 2
try:
proc = psutil.Process(pid)
except Exception as e:
print(f"Failed to attach to PID {pid}: {e}", file=sys.stderr)
return 3
stop = asyncio.Event()
def _handle_sig(*_):
stop.set()
for s in (signal.SIGINT, signal.SIGTERM):
try:
signal.signal(s, _handle_sig)
except Exception:
pass
cpu_samples: list[float] = []
lag_samples_ms: list[float] = []
tasks = [
asyncio.create_task(sample_cpu(proc, args.cpu_interval, stop, cpu_samples)),
asyncio.create_task(sample_loop_lag(args.lag_interval, stop, lag_samples_ms)),
]
start = time.time()
try:
while not stop.is_set():
# Exit if target process is gone
if not proc.is_running():
break
if args.timeout > 0 and (time.time() - start) >= args.timeout:
break
await asyncio.sleep(0.2)
finally:
stop.set()
for t in tasks:
try:
await asyncio.wait_for(t, timeout=2.0)
except Exception:
pass
out = {
"cpu_percent_avg": round(statistics.fmean(cpu_samples), 2) if cpu_samples else 0.0,
"cpu_percent_p95": round(percentile(cpu_samples, 95), 2) if cpu_samples else 0.0,
"cpu_samples": len(cpu_samples),
"loop_lag_ms_p95": round(percentile(lag_samples_ms, 95), 2) if lag_samples_ms else 0.0,
"loop_lag_samples": len(lag_samples_ms),
}
try:
out_path = Path(args.output)
out_path.parent.mkdir(parents=True, exist_ok=True)
with out_path.open("w", encoding="utf-8") as f:
json.dump(out, f, indent=2)
print(f"Wrote perf stats: {out_path}")
except Exception as e:
print(f"Failed to write output: {e}", file=sys.stderr)
return 4
return 0
if __name__ == "__main__":
raise SystemExit(asyncio.run(main()))
scripts/compare_perf.py (+91)
@@ -0,0 +1,91 @@
#!/usr/bin/env python3
import json
import sys
from pathlib import Path
REGRESSION_THRESHOLD = 0.10 # 10%
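# Worked example: with a baseline p95 of 100.00 ms the gate fails once the
# current p95 exceeds 110.00 ms; with a baseline of 500.00 rps it fails once
# throughput drops below 450.00 rps.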
def load_summary(path: Path):
with path.open('r', encoding='utf-8') as f:
return json.load(f)
def extract_metrics(summary: dict):
m = summary.get('metrics', {})
http = m.get('http_req_duration', {}).get('values', {})
http_reqs = m.get('http_reqs', {}).get('values', {})
    # k6's default summaryTrendStats omit p(50) and p(99); fall back to 'med'
    # for the median and report 0.0 for percentiles that were not exported.
    p50 = float(http.get('p(50)', http.get('med', 0.0)))
    p95 = float(http.get('p(95)', 0.0))
    p99 = float(http.get('p(99)', 0.0))
    # Prefer the reported request rate; fall back to 0 if missing
    rps = float(http_reqs.get('rate', 0.0))
return {
'p50': p50,
'p95': p95,
'p99': p99,
'rps': rps,
}
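# Sketch of the summary fragment this expects: the end-of-test data structure
# k6 passes to handleSummary, serialized to JSON (field values illustrative):
#   {"metrics": {"http_req_duration": {"values": {"med": 12.3, "p(95)": 48.7}},
#                "http_reqs": {"values": {"count": 12000, "rate": 400.0}}}}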
def main():
if len(sys.argv) < 3:
print('Usage: compare_perf.py <current_summary.json> <baseline_summary.json>')
sys.exit(2)
current = Path(sys.argv[1])
baseline = Path(sys.argv[2])
if not current.exists():
print(f'Current summary not found: {current}')
sys.exit(2)
if not baseline.exists():
print(f'Baseline summary not found: {baseline}')
sys.exit(2)
cur = load_summary(current)
base = load_summary(baseline)
curm = extract_metrics(cur)
basem = extract_metrics(base)
print('Baseline metrics:')
print(f" p50={basem['p50']:.2f}ms p95={basem['p95']:.2f}ms p99={basem['p99']:.2f}ms rps={basem['rps']:.2f}")
print('Current metrics:')
print(f" p50={curm['p50']:.2f}ms p95={curm['p95']:.2f}ms p99={curm['p99']:.2f}ms rps={curm['rps']:.2f}")
failures = []
    # Latency percentiles must not regress more than +10%
for q in ('p50', 'p95', 'p99'):
base_v = basem[q]
cur_v = curm[q]
if base_v > 0:
allowed = base_v * (1.0 + REGRESSION_THRESHOLD)
if cur_v > allowed:
failures.append(f"{q} regression: {cur_v:.2f}ms > {allowed:.2f}ms (baseline {base_v:.2f}ms)")
# RPS must not drop more than -10%
base_rps = basem['rps']
cur_rps = curm['rps']
if base_rps > 0:
allowed_rps = base_rps * (1.0 - REGRESSION_THRESHOLD)
if cur_rps < allowed_rps:
failures.append(f'RPS regression: {cur_rps:.2f} < {allowed_rps:.2f} (baseline {base_rps:.2f})')
    # Optional: compare CPU/event-loop stats if perf-stats.json files (written
    # by capture_perf_stats.py) sit alongside the summaries.
    try:
        cur_stats = current.parent / 'perf-stats.json'
        base_stats = baseline.parent / 'perf-stats.json'
        if cur_stats.exists() and base_stats.exists():
            cstats = load_summary(cur_stats)
            bstats = load_summary(base_stats)
            # These keys must match what capture_perf_stats.py writes.
            for key in ('cpu_percent_avg', 'cpu_percent_p95', 'loop_lag_ms_p95'):
                if key in cstats and key in bstats:
                    print(f"{key}: baseline={bstats[key]} current={cstats[key]}")
    except Exception:
        pass
if failures:
print('Perf regression detected:')
for f in failures:
print(f'- {f}')
sys.exit(1)
print('Performance within regression thresholds.')
if __name__ == '__main__':
main()
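# Example, with the paths the runner script uses:
#   python3 scripts/compare_perf.py load-tests/k6-summary.json \
#       load-tests/baseline/k6-summary.json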
+55
@@ -0,0 +1,55 @@
#!/usr/bin/env bash
set -euo pipefail
# Runs k6 load-tests/k6-load-test.js, captures CPU/loop-lag stats while running,
# and compares results against a baseline summary via scripts/compare_perf.py.
BASE_URL=${BASE_URL:-http://localhost:8000}
BASELINE_JSON=${BASELINE_JSON:-load-tests/baseline/k6-summary.json}
CURRENT_JSON=${CURRENT_JSON:-load-tests/k6-summary.json}
PERF_JSON=${PERF_JSON:-load-tests/perf-stats.json}
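# Each of the above can be overridden per invocation, e.g. (URL illustrative;
# this script's own path is not shown in the diff):
#   BASE_URL=http://127.0.0.1:9000 PERF_JSON=/tmp/perf-stats.json ./run-perf-check.sh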
echo "Using BASE_URL=${BASE_URL}"
echo "Baseline file: ${BASELINE_JSON}"
if ! command -v k6 >/dev/null 2>&1; then
echo "Error: k6 is not installed. Install from https://k6.io/docs/get-started/installation/" >&2
exit 2
fi
if ! command -v python3 >/dev/null 2>&1; then
echo "Error: python3 not found" >&2
exit 2
fi
# Launch perf capture in the background (optional: if the pidfile is missing,
# the monitor exits quietly and no perf-stats file is produced).
python3 scripts/capture_perf_stats.py --output "${PERF_JSON}" --pidfile backend-services/doorman.pid --timeout 0 \
>/dev/null 2>&1 &
MONITOR_PID=$!
cleanup() {
if kill -0 "${MONITOR_PID}" >/dev/null 2>&1; then
kill "${MONITOR_PID}" >/dev/null 2>&1 || true
wait "${MONITOR_PID}" || true
fi
}
trap cleanup EXIT INT TERM
echo "Running k6 load test..."
K6_CMD=(k6 run load-tests/k6-load-test.js --env BASE_URL="${BASE_URL}")
"${K6_CMD[@]}"
echo
echo "k6 summary written to: ${CURRENT_JSON}"
# Stop the perf monitor now so it flushes ${PERF_JSON} before compare_perf.py
# looks for it; the EXIT trap alone would fire only after the comparison.
cleanup
if [ ! -f "${BASELINE_JSON}" ]; then
echo "Baseline summary not found at ${BASELINE_JSON}" >&2
echo "Create one by copying a known-good run, e.g.:" >&2
echo " mkdir -p \"$(dirname \"${BASELINE_JSON}\")\" && cp '${CURRENT_JSON}' '${BASELINE_JSON}'" >&2
exit 3
fi
echo
echo "Comparing current vs baseline..."
python3 scripts/compare_perf.py "${CURRENT_JSON}" "${BASELINE_JSON}"
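# Exit status: 0 when all metrics stay within thresholds; 1 when compare_perf.py
# detects a regression (its non-zero status becomes this script's, as the final
# command); 2 when k6 or python3 is missing; 3 when no baseline summary exists.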