# doorman/scripts/capture_perf_stats.py

"""
Capture CPU and event-loop lag statistics for a running Doorman process.

Writes a JSON file (perf-stats.json) alongside k6 results so compare_perf.py
can print these figures in the diff report.

Note: Loop lag is measured by this monitor's own asyncio loop as an
approximation of scheduler pressure on the host. It does not instrument the
server's internal loop directly, but correlates under shared host load.
"""

from __future__ import annotations
import argparse
import asyncio
import json
import os
import signal
import statistics
import sys
import time
from pathlib import Path

try:
    import psutil
except Exception:
    psutil = None

def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the monitor's command-line options.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is what the script entry point uses.

    Returns:
        Namespace with ``pid``, ``pidfile``, ``output``, ``cpu_interval``,
        ``lag_interval`` and ``timeout``.

    Raises:
        SystemExit: Raised by argparse on invalid input, including sampling
            intervals that are not finite positive numbers.
    """

    def _positive_seconds(text: str) -> float:
        # A zero, negative, NaN or infinite interval would make the sampling
        # loops spin without pausing or fail on every iteration, so reject
        # such values up front with a normal argparse usage error.
        try:
            value = float(text)
        except ValueError:
            raise argparse.ArgumentTypeError(f"invalid float value: {text!r}") from None
        if not (0.0 < value < float("inf")):
            raise argparse.ArgumentTypeError(
                f"must be a finite number of seconds greater than 0, got {text!r}"
            )
        return value

    ap = argparse.ArgumentParser()
    ap.add_argument("--pid", type=int, help="PID of the target process")
    ap.add_argument("--pidfile", type=str, default="backend-services/doorman.pid",
                    help="Path to PID file (used if --pid not provided)")
    ap.add_argument("--output", type=str, default="load-tests/perf-stats.json",
                    help="Output JSON path")
    ap.add_argument("--cpu-interval", type=_positive_seconds, default=0.5,
                    help="CPU sampling interval seconds")
    ap.add_argument("--lag-interval", type=_positive_seconds, default=0.05,
                    help="Loop lag sampling interval seconds")
    # A timeout of 0 (or less) means "no timeout", so it is not range-checked.
    ap.add_argument("--timeout", type=float, default=0.0,
                    help="Optional timeout seconds; 0 = until process exits or SIGTERM")
    return ap.parse_args(argv)

def read_pid(pid: int | None, pidfile: str) -> int | None:
    """Resolve the target PID, preferring an explicit value over the PID file.

    A truthy *pid* is returned unchanged. Otherwise the contents of *pidfile*
    are read, stripped of surrounding whitespace and parsed as an integer.
    Any failure while opening, reading or parsing the file yields ``None``.
    """
    if not pid:
        try:
            with open(pidfile, "r") as handle:
                raw = handle.read()
            return int(raw.strip())
        except Exception:
            # Missing, unreadable or malformed PID file: report "no PID".
            return None
    return pid

async def sample_cpu(proc: "psutil.Process", interval: float, stop: asyncio.Event, samples: list[float]):
    try:
        proc.cpu_percent(None)
    except Exception:
        pass
    while not stop.is_set():
        try:
            val = await asyncio.to_thread(proc.cpu_percent, interval)
            samples.append(float(val))
        except Exception:
            await asyncio.sleep(interval)
            continue

async def sample_loop_lag(interval: float, stop: asyncio.Event, lags_ms: list[float]):
    """Record how late this event loop wakes up relative to a fixed schedule.

    A deadline is advanced by exactly *interval* seconds per iteration, so the
    schedule does not drift with the time spent in the loop body. After each
    sleep, the overshoot past the deadline is appended to *lags_ms* in
    milliseconds, with early wake-ups recorded as 0.0. Runs until *stop* is
    set; the flag is checked once per iteration, before sleeping.
    """
    deadline = time.perf_counter() + interval
    while True:
        if stop.is_set():
            return
        remaining = deadline - time.perf_counter()
        await asyncio.sleep(remaining if remaining > 0.0 else 0.0)
        overshoot_ms = (time.perf_counter() - deadline) * 1000.0
        lags_ms.append(overshoot_ms if overshoot_ms > 0.0 else 0.0)
        # Advance from the previous deadline, not from "now", to stay on schedule.
        deadline = deadline + interval

def percentile(values: list[float], p: float) -> float:
    """Return the *p*-th percentile of *values* by nearest-rank selection.

    The values are sorted and the element at index ``round(p / 100 * (n - 1))``
    is returned as a float, with the index clamped to the valid range. No
    interpolation is performed. An empty input yields 0.0.
    """
    if not values:
        return 0.0
    ordered = sorted(values)
    last = len(ordered) - 1
    rank = round((p / 100.0) * last)
    # Clamp so that p < 0 or p > 100 selects the smallest / largest value.
    if rank > last:
        rank = last
    if rank < 0:
        rank = 0
    return float(ordered[int(rank)])

def _target_alive(proc: "psutil.Process") -> bool:
    """Return True while the target process is running and is not a zombie."""
    if not proc.is_running():
        return False
    # psutil documents that is_running() keeps returning True for a zombie
    # (exited but not yet reaped). Treat that as "exited" so the monitor does
    # not wait forever on a dead target.
    try:
        return proc.status() != psutil.STATUS_ZOMBIE
    except psutil.NoSuchProcess:
        return False
    except psutil.Error:
        # Status could not be inspected (e.g. access denied); fall back to
        # the is_running() answer obtained above.
        return True


def _summarize(cpu_samples: list[float], lag_samples_ms: list[float]) -> dict:
    """Build the JSON-serializable summary written to the output file."""
    return {
        "cpu_percent_avg": round(statistics.fmean(cpu_samples), 2) if cpu_samples else 0.0,
        "cpu_percent_p95": round(percentile(cpu_samples, 95), 2) if cpu_samples else 0.0,
        "cpu_samples": len(cpu_samples),
        "loop_lag_ms_p95": round(percentile(lag_samples_ms, 95), 2) if lag_samples_ms else 0.0,
        "loop_lag_samples": len(lag_samples_ms),
    }


async def main() -> int:
    """Monitor the target process and write the summary JSON file.

    Sampling runs until the target exits (or becomes a zombie), SIGINT or
    SIGTERM is received, or the optional ``--timeout`` elapses.

    Returns:
        Process exit code: 0 on success, 1 if psutil is unavailable, 2 if no
        PID could be determined, 3 if attaching to the PID failed, 4 if the
        output file could not be written.
    """
    if psutil is None:
        print("psutil is not installed; CPU stats unavailable", file=sys.stderr)
        return 1

    args = parse_args()
    pid = read_pid(args.pid, args.pidfile)
    if not pid:
        print(f"No PID found (pidfile: {args.pidfile}). Is the server running?", file=sys.stderr)
        return 2

    try:
        proc = psutil.Process(pid)
    except Exception as e:
        print(f"Failed to attach to PID {pid}: {e}", file=sys.stderr)
        return 3

    stop = asyncio.Event()

    def _handle_sig(*_):
        stop.set()

    for s in (signal.SIGINT, signal.SIGTERM):
        try:
            signal.signal(s, _handle_sig)
        except (ValueError, OSError):
            # Best effort: handlers cannot be installed outside the main
            # thread or for signals the platform rejects.
            pass

    cpu_samples: list[float] = []
    lag_samples_ms: list[float] = []

    tasks = [
        asyncio.create_task(sample_cpu(proc, args.cpu_interval, stop, cpu_samples)),
        asyncio.create_task(sample_loop_lag(args.lag_interval, stop, lag_samples_ms)),
    ]

    # Monotonic clock: the timeout must not be affected by wall-clock jumps.
    started = time.monotonic()
    try:
        while not stop.is_set():
            if not _target_alive(proc):
                break
            if args.timeout > 0 and (time.monotonic() - started) >= args.timeout:
                break
            await asyncio.sleep(0.2)
    finally:
        stop.set()
        for t in tasks:
            try:
                # wait_for cancels a sampler that does not finish in time.
                await asyncio.wait_for(t, timeout=2.0)
            except Exception:
                # A slow or failed sampler must not prevent writing whatever
                # samples were collected.
                pass

    out = _summarize(cpu_samples, lag_samples_ms)

    try:
        out_path = Path(args.output)
        out_path.parent.mkdir(parents=True, exist_ok=True)
        with out_path.open("w", encoding="utf-8") as f:
            json.dump(out, f, indent=2)
        print(f"Wrote perf stats: {out_path}")
    except Exception as e:
        print(f"Failed to write output: {e}", file=sys.stderr)
        return 4

    return 0

if __name__ == "__main__":
    # Run the async entry point and use its return value as the exit status.
    exit_code = asyncio.run(main())
    sys.exit(exit_code)