mirror of
https://github.com/apidoorman/doorman.git
synced 2026-02-09 11:07:05 -06:00
157 lines
4.9 KiB
Python
157 lines
4.9 KiB
Python
"""
|
|
Capture CPU and event-loop lag statistics for a running Doorman process.
|
|
|
|
Writes a JSON file (perf-stats.json) alongside k6 results so compare_perf.py
|
|
can print these figures in the diff report.
|
|
|
|
Note: Loop lag is measured by this monitor's own asyncio loop as an
|
|
approximation of scheduler pressure on the host. It does not instrument the
|
|
server's internal loop directly, but correlates under shared host load.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
import argparse
|
|
import asyncio
|
|
import json
|
|
import os
|
|
import signal
|
|
import statistics
|
|
import sys
|
|
import time
|
|
from pathlib import Path
|
|
|
|
try:
|
|
import psutil
|
|
except Exception:
|
|
psutil = None
|
|
|
|
def parse_args() -> argparse.Namespace:
    """Parse the monitor's command-line options from ``sys.argv``.

    Recognised options:
        --pid           PID of the target process (int, no default).
        --pidfile       Path to a PID file, consulted when --pid is absent.
        --output        Path of the JSON file to write.
        --cpu-interval  Seconds between CPU samples.
        --lag-interval  Seconds between loop-lag samples.
        --timeout       Maximum run time in seconds; 0 disables the limit.

    Returns:
        The populated ``argparse.Namespace``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--pid", type=int, help="PID of the target process")

    # (flag, value type, default, help text) for each option with a default.
    # Registration order is kept so the generated --help output is unchanged.
    defaulted_options = (
        ("--pidfile", str, "backend-services/doorman.pid",
         "Path to PID file (used if --pid not provided)"),
        ("--output", str, "load-tests/perf-stats.json",
         "Output JSON path"),
        ("--cpu-interval", float, 0.5,
         "CPU sampling interval seconds"),
        ("--lag-interval", float, 0.05,
         "Loop lag sampling interval seconds"),
        ("--timeout", float, 0.0,
         "Optional timeout seconds; 0 = until process exits or SIGTERM"),
    )
    for flag, value_type, default, help_text in defaulted_options:
        parser.add_argument(flag, type=value_type, default=default, help=help_text)

    return parser.parse_args()
|
|
|
|
def read_pid(pid: int | None, pidfile: str) -> int | None:
    """Resolve the PID of the process to monitor.

    Args:
        pid: Explicit PID; used as-is when truthy (so ``None`` and ``0`` both
            fall through to the PID file).
        pidfile: Path to a text file whose content is the PID, optionally
            surrounded by whitespace.

    Returns:
        The explicit PID, otherwise the integer read from ``pidfile``, or
        ``None`` when the file cannot be read or does not contain an integer.
    """
    if not pid:
        try:
            with open(pidfile, "r") as handle:
                text = handle.read()
            return int(text.strip())
        except Exception:
            # Missing, unreadable, or malformed PID file: report "no PID".
            return None
    return pid
|
|
|
|
async def sample_cpu(proc: "psutil.Process", interval: float, stop: asyncio.Event, samples: list[float]):
    """Append CPU-percent readings for ``proc`` to ``samples`` until ``stop`` is set.

    Args:
        proc: Object exposing ``cpu_percent(interval)`` (a ``psutil.Process``
            in this script).
        interval: Value passed to ``cpu_percent`` for each reading; also the
            back-off sleep, in seconds, after a failed reading.
        stop: Event polled before every reading; once set, the coroutine ends.
        samples: Output list; each successful reading is appended as a float.
    """
    # Priming call; its result is discarded and any failure is ignored.
    # NOTE(review): presumably establishes psutil's baseline for the first
    # real measurement - confirm against psutil docs.
    try:
        proc.cpu_percent(None)
    except Exception:
        pass

    while True:
        if stop.is_set():
            return
        try:
            # Executed in a worker thread so the event loop keeps running
            # while the measurement call is in progress.
            reading = await asyncio.to_thread(proc.cpu_percent, interval)
            samples.append(float(reading))
        except Exception:
            # Failed reading (or unconvertible value): wait one interval,
            # then re-check the stop flag and try again.
            await asyncio.sleep(interval)
|
|
|
|
async def sample_loop_lag(interval: float, stop: asyncio.Event, lags_ms: list[float]):
    """Record how late this event loop wakes up relative to a fixed schedule.

    A wake-up is scheduled every ``interval`` seconds. After each sleep, the
    delay past the scheduled instant is appended to ``lags_ms`` in
    milliseconds, floored at 0.0. Deadlines advance by exactly ``interval``
    regardless of the actual wake-up time, so after a long stall the missed
    deadlines are sampled back-to-back, each with its own lag.

    Args:
        interval: Seconds between scheduled wake-ups.
        stop: Event polled before every sleep; once set, the coroutine ends.
        lags_ms: Output list of lag values in milliseconds.
    """
    clock = time.perf_counter
    deadline = clock() + interval
    while not stop.is_set():
        remaining = deadline - clock()
        # Never pass a negative delay: a deadline already in the past means
        # "yield once and measure immediately".
        await asyncio.sleep(remaining if remaining > 0.0 else 0.0)
        overshoot_ms = (clock() - deadline) * 1000.0
        lags_ms.append(overshoot_ms if overshoot_ms > 0.0 else 0.0)
        deadline += interval
|
|
|
|
def percentile(values: list[float], p: float) -> float:
    """Return the nearest-rank ``p``-th percentile of ``values``.

    The rank is ``round(p / 100 * (n - 1))`` over the sorted data, clamped to
    the valid index range, so ``p <= 0`` yields the minimum and ``p >= 100``
    the maximum. No interpolation is performed and the input list is not
    modified.

    Args:
        values: Samples; may be empty.
        p: Percentile, nominally in the range 0-100.

    Returns:
        The selected sample as a float, or 0.0 when ``values`` is empty.
    """
    if not values:
        return 0.0

    ordered = sorted(values)
    last = len(ordered) - 1
    rank = round((p / 100.0) * last)
    # Clamp out-of-range percentiles onto the first/last element.
    if rank > last:
        rank = last
    if rank < 0:
        rank = 0
    return float(ordered[int(rank)])
|
|
|
|
async def main() -> int:
    """Monitor the target process and write summary statistics as JSON.

    Resolves the target PID (``--pid`` or the PID file), attaches to it with
    psutil, and runs the CPU and loop-lag samplers until the process exits,
    SIGINT/SIGTERM is received, or ``--timeout`` seconds have elapsed. It then
    writes average and p95 CPU percent, p95 loop lag, and sample counts to
    ``--output``.

    Returns:
        Process exit code: 0 on success, 1 if psutil is unavailable, 2 if no
        PID could be determined, 3 if attaching to the PID failed, 4 if the
        output file could not be written.
    """
    if psutil is None:
        print("psutil is not installed; CPU stats unavailable", file=sys.stderr)
        return 1

    args = parse_args()
    pid = read_pid(args.pid, args.pidfile)
    if not pid:
        print(f"No PID found (pidfile: {args.pidfile}). Is the server running?", file=sys.stderr)
        return 2

    try:
        proc = psutil.Process(pid)
    except Exception as e:
        print(f"Failed to attach to PID {pid}: {e}", file=sys.stderr)
        return 3

    stop = asyncio.Event()

    def _handle_sig(*_):
        # Only flag the shutdown; the polling loop below notices it and the
        # collected samples are still written out.
        stop.set()

    for s in (signal.SIGINT, signal.SIGTERM):
        try:
            signal.signal(s, _handle_sig)
        except Exception:
            # Best effort: if a handler cannot be installed, the monitor still
            # stops on process exit or timeout.
            pass

    cpu_samples: list[float] = []
    lag_samples_ms: list[float] = []

    tasks = [
        asyncio.create_task(sample_cpu(proc, args.cpu_interval, stop, cpu_samples)),
        asyncio.create_task(sample_loop_lag(args.lag_interval, stop, lag_samples_ms)),
    ]

    # Elapsed time is measured on the monotonic clock so that wall-clock
    # adjustments (NTP steps, manual changes) cannot end the run early or
    # delay the timeout.
    started = time.monotonic()
    try:
        while not stop.is_set():
            if not proc.is_running():
                break
            if args.timeout > 0 and (time.monotonic() - started) >= args.timeout:
                break
            await asyncio.sleep(0.2)
    finally:
        stop.set()
        # Give each sampler a bounded grace period to observe the stop flag;
        # a sampler that times out or fails must not prevent the report.
        for t in tasks:
            try:
                await asyncio.wait_for(t, timeout=2.0)
            except Exception:
                pass

    out = {
        "cpu_percent_avg": round(statistics.fmean(cpu_samples), 2) if cpu_samples else 0.0,
        "cpu_percent_p95": round(percentile(cpu_samples, 95), 2) if cpu_samples else 0.0,
        "cpu_samples": len(cpu_samples),
        "loop_lag_ms_p95": round(percentile(lag_samples_ms, 95), 2) if lag_samples_ms else 0.0,
        "loop_lag_samples": len(lag_samples_ms),
    }

    try:
        out_path = Path(args.output)
        out_path.parent.mkdir(parents=True, exist_ok=True)
        with out_path.open("w", encoding="utf-8") as f:
            json.dump(out, f, indent=2)
        print(f"Wrote perf stats: {out_path}")
    except Exception as e:
        print(f"Failed to write output: {e}", file=sys.stderr)
        return 4

    return 0
|
|
|
|
if __name__ == "__main__":
    # Run the monitor and use main()'s return value as the exit status.
    exit_code = asyncio.run(main())
    sys.exit(exit_code)
|
|
|