#!/usr/bin/env python3 """ Enrich RustNet PCAP captures with process information from sidecar JSONL. This script correlates packets in a PCAP file with process information from the accompanying .connections.jsonl file created by RustNet. NOTE: If you captured using PKTAP on macOS (e.g., `--interface pktap,en0`), the process information is already embedded in the PCAP file itself. You can view it directly in Wireshark without using this script. This script is only needed for regular (non-PKTAP) captures. Usage: # Show packets with process info python pcap_enrich.py capture.pcap # Export to annotated PCAPNG (requires editcap from Wireshark) python pcap_enrich.py capture.pcap --output annotated.pcapng # Generate TSV report python pcap_enrich.py capture.pcap --format tsv > report.tsv Requirements: pip install scapy """ import argparse import json import subprocess import sys import tempfile from pathlib import Path try: from scapy.all import rdpcap, IP, TCP, UDP, ICMP except ImportError: print("Error: scapy is required. Install with: pip install scapy", file=sys.stderr) sys.exit(1) def parse_systemtime(st) -> float | None: """Parse a SystemTime serialized as {secs_since_epoch, nanos_since_epoch}.""" if st is None: return None if isinstance(st, dict): secs = st.get("secs_since_epoch", 0) nanos = st.get("nanos_since_epoch", 0) return secs + nanos / 1e9 # Fallback for other formats return None def load_connections(jsonl_path: Path) -> dict: """Load connection-to-process mappings from JSONL file. Returns a dict mapping (proto, local, remote) -> list of connection info dicts. Multiple connections can exist for the same tuple (port reuse over time). """ lookup = {} if not jsonl_path.exists(): print(f"Warning: Sidecar file not found: {jsonl_path}", file=sys.stderr) return lookup with open(jsonl_path) as f: for line_num, line in enumerate(f, 1): line = line.strip() if not line: continue try: c = json.loads(line) proto = c.get("protocol", "").upper() local = c.get("local_addr", "") remote = c.get("remote_addr", "") if proto and local and remote: info = { "pid": c.get("pid"), "process_name": c.get("process_name"), "first_seen": parse_systemtime(c.get("first_seen")), "last_seen": parse_systemtime(c.get("last_seen")), "bytes_sent": c.get("bytes_sent", 0), "bytes_received": c.get("bytes_received", 0), } # Store both directions, as a list to handle port reuse for key in [(proto, local, remote), (proto, remote, local)]: if key not in lookup: lookup[key] = [] lookup[key].append(info) except json.JSONDecodeError as e: print(f"Warning: Invalid JSON at line {line_num}: {e}", file=sys.stderr) return lookup def find_matching_connection(lookup: dict, pkt_tuple: tuple, pkt_time: float, slack: float) -> dict | None: """Find the best matching connection for a packet based on tuple and timestamp. Args: lookup: Connection lookup dict pkt_tuple: (proto, src, dst) tuple from packet pkt_time: Packet timestamp (seconds since epoch) slack: Allowed time slack in seconds Returns: Best matching connection info dict, or None if no match """ connections = lookup.get(pkt_tuple, []) if not connections: return None best_match = None best_score = float('inf') for conn in connections: first_seen = conn.get("first_seen") last_seen = conn.get("last_seen") # If no timestamps, fall back to simple match (first connection wins) if first_seen is None or last_seen is None: if best_match is None: best_match = conn continue # Check if packet falls within connection time range (with slack) if first_seen - slack <= pkt_time <= last_seen + slack: # Score by how close the packet is to the connection's time range # Prefer connections where the packet is well within the range if pkt_time < first_seen: score = first_seen - pkt_time elif pkt_time > last_seen: score = pkt_time - last_seen else: score = 0 # Perfect match (within range) if score < best_score: best_score = score best_match = conn return best_match def get_packet_tuple(pkt) -> tuple: """Extract connection tuple from packet.""" if not pkt.haslayer(IP): return None ip = pkt[IP] src_ip = ip.src dst_ip = ip.dst if pkt.haslayer(TCP): tcp = pkt[TCP] return ("TCP", f"{src_ip}:{tcp.sport}", f"{dst_ip}:{tcp.dport}") elif pkt.haslayer(UDP): udp = pkt[UDP] return ("UDP", f"{src_ip}:{udp.sport}", f"{dst_ip}:{udp.dport}") elif pkt.haslayer(ICMP): return ("ICMP", src_ip, dst_ip) return None def enrich_packets(pcap_path: Path, lookup: dict, slack: float): """Yield enriched packet information.""" packets = rdpcap(str(pcap_path)) for frame_num, pkt in enumerate(packets, 1): pkt_tuple = get_packet_tuple(pkt) pkt_time = float(pkt.time) if not pkt_tuple: yield { "frame": frame_num, "time": pkt_time, "proto": "OTHER", "src": "", "dst": "", "pid": None, "process": None, } continue proto, src, dst = pkt_tuple info = find_matching_connection(lookup, pkt_tuple, pkt_time, slack) or {} yield { "frame": frame_num, "time": pkt_time, "proto": proto, "src": src, "dst": dst, "pid": info.get("pid"), "process": info.get("process_name"), "bytes_sent": info.get("bytes_sent"), "bytes_received": info.get("bytes_received"), } def print_table(packets: list): """Print enriched packets as a formatted table.""" print(f"{'Frame':>6} {'Proto':<5} {'Source':<24} {'Destination':<24} {'PID':>7} {'Process':<20}") print("-" * 95) for p in packets: pid_str = str(p["pid"]) if p["pid"] else "-" proc_str = p["process"] or "-" if len(proc_str) > 20: proc_str = proc_str[:17] + "..." print(f"{p['frame']:>6} {p['proto']:<5} {p['src']:<24} {p['dst']:<24} {pid_str:>7} {proc_str:<20}") def print_tsv(packets: list): """Print enriched packets as TSV.""" print("frame\ttime\tproto\tsrc\tdst\tpid\tprocess") for p in packets: print(f"{p['frame']}\t{p['time']:.6f}\t{p['proto']}\t{p['src']}\t{p['dst']}\t{p['pid'] or ''}\t{p['process'] or ''}") def print_json(packets: list): """Print enriched packets as JSON.""" print(json.dumps(packets, indent=2)) def create_pcapng(pcap_path: Path, packets: list, output_path: Path): """Create annotated PCAPNG using editcap.""" # Check if editcap is available try: subprocess.run(["editcap", "--version"], capture_output=True, check=True) except (subprocess.CalledProcessError, FileNotFoundError): print("Error: editcap not found. Install Wireshark to get editcap.", file=sys.stderr) sys.exit(1) # First convert to pcapng with tempfile.NamedTemporaryFile(suffix=".pcapng", delete=False) as tmp: tmp_path = Path(tmp.name) subprocess.run(["editcap", "-F", "pcapng", str(pcap_path), str(tmp_path)], check=True) # Build annotation commands # editcap -a "frame:comment" format annotations = [] for p in packets: if p["pid"] or p["process"]: comment_parts = [] if p["pid"]: comment_parts.append(f"PID:{p['pid']}") if p["process"]: comment_parts.append(f"Process:{p['process']}") comment = " ".join(comment_parts) annotations.append(f"{p['frame']}:{comment}") if not annotations: print("No process information found to annotate.", file=sys.stderr) # Just copy the pcapng as-is tmp_path.rename(output_path) return # Apply annotations in batches (editcap has command line limits) current_input = tmp_path batch_size = 100 for i in range(0, len(annotations), batch_size): batch = annotations[i:i + batch_size] with tempfile.NamedTemporaryFile(suffix=".pcapng", delete=False) as tmp2: tmp2_path = Path(tmp2.name) cmd = ["editcap"] for ann in batch: cmd.extend(["-a", ann]) cmd.extend([str(current_input), str(tmp2_path)]) subprocess.run(cmd, check=True) if current_input != tmp_path: current_input.unlink() current_input = tmp2_path # Move final result to output current_input.rename(output_path) if tmp_path.exists(): tmp_path.unlink() print(f"Created annotated PCAPNG: {output_path}") print(f"Annotated {len(annotations)} packets with process information.") def count_unique_connections(lookup: dict) -> int: """Count unique connections (accounting for bidirectional storage).""" seen = set() count = 0 for key, conns in lookup.items(): for conn in conns: # Create a unique identifier for each connection conn_id = (key, conn.get("first_seen"), conn.get("pid")) if conn_id not in seen: seen.add(conn_id) count += 1 return count // 2 # Divide by 2 because we store both directions def print_summary(packets: list, lookup: dict): """Print a summary of process information found.""" total = len(packets) with_pid = sum(1 for p in packets if p["pid"]) # Group by process by_process = {} for p in packets: proc = p["process"] or "" if proc not in by_process: by_process[proc] = {"count": 0, "pid": p["pid"]} by_process[proc]["count"] += 1 print(f"\nSummary:") print(f" Total packets: {total}") print(f" Packets with process info: {with_pid} ({100*with_pid/total:.1f}%)") print(f" Unique connections in sidecar: {count_unique_connections(lookup)}") print(f"\nPackets by process:") for proc, info in sorted(by_process.items(), key=lambda x: -x[1]["count"]): pid_str = f" (PID {info['pid']})" if info["pid"] else "" print(f" {proc}{pid_str}: {info['count']} packets") def main(): parser = argparse.ArgumentParser( description="Enrich RustNet PCAP captures with process information.", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: %(prog)s capture.pcap # Show packets with process info %(prog)s capture.pcap --format tsv # Output as TSV %(prog)s capture.pcap --format json # Output as JSON %(prog)s capture.pcap -o annotated.pcapng # Create annotated PCAPNG %(prog)s capture.pcap --summary # Show summary only %(prog)s capture.pcap --slack 5 # Use 5 second slack for timestamp matching """ ) parser.add_argument("pcap", type=Path, help="Path to PCAP file") parser.add_argument("-j", "--jsonl", type=Path, help="Path to sidecar JSONL file (default: .connections.jsonl)") parser.add_argument("-o", "--output", type=Path, help="Output annotated PCAPNG file") parser.add_argument("-f", "--format", choices=["table", "tsv", "json"], default="table", help="Output format (default: table)") parser.add_argument("-s", "--summary", action="store_true", help="Show summary only") parser.add_argument("-l", "--limit", type=int, default=0, help="Limit number of packets to process (0 = no limit)") parser.add_argument("--slack", type=float, default=2.0, help="Timestamp matching slack in seconds (default: 2.0)") args = parser.parse_args() if not args.pcap.exists(): print(f"Error: PCAP file not found: {args.pcap}", file=sys.stderr) sys.exit(1) # Default sidecar path jsonl_path = args.jsonl or Path(f"{args.pcap}.connections.jsonl") # Load connection mappings lookup = load_connections(jsonl_path) if lookup: print(f"Loaded {count_unique_connections(lookup)} connections from {jsonl_path}", file=sys.stderr) # Process packets packets = list(enrich_packets(args.pcap, lookup, args.slack)) if args.limit > 0: packets = packets[:args.limit] if args.summary: print_summary(packets, lookup) return if args.output: create_pcapng(args.pcap, packets, args.output) print_summary(packets, lookup) else: if args.format == "table": print_table(packets) print_summary(packets, lookup) elif args.format == "tsv": print_tsv(packets) elif args.format == "json": print_json(packets) if __name__ == "__main__": main()