From 799d9d3ba74c04e9637231440747e8ad7b963c0b Mon Sep 17 00:00:00 2001
From: Dillon DuPont <ddupont@mit.edu>
Date: Wed, 27 Aug 2025 19:53:26 -0400
Subject: [PATCH] Normalize common LLM output errors

---
 .../agent/callbacks/operator_validator.py     | 90 +++++++++++++++++++
 1 file changed, 90 insertions(+)
 create mode 100644 libs/python/agent/agent/callbacks/operator_validator.py

diff --git a/libs/python/agent/agent/callbacks/operator_validator.py b/libs/python/agent/agent/callbacks/operator_validator.py
new file mode 100644
index 00000000..db19555c
--- /dev/null
+++ b/libs/python/agent/agent/callbacks/operator_validator.py
@@ -0,0 +1,90 @@
+"""
+OperatorValidatorCallback
+
+Ensures agent output actions conform to expected schemas by fixing common issues:
+- click: add default button='left' if missing
+- keypress: wrap keys string into a list
+
+This runs in on_llm_end, which receives the output array (AgentMessage[] as dicts).
+"""
+from __future__ import annotations
+
+from typing import Any, Dict, List
+
+from .base import AsyncCallbackHandler
+
+
+class OperatorValidatorCallback(AsyncCallbackHandler):
+    """Validates and normalizes operator/computer actions in LLM outputs."""
+
+    async def on_llm_end(self, output: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        # Mutate in-place as requested, but still return the list for chaining
+        for item in output or []:
+            if not isinstance(item, dict):
+                continue
+            if item.get("type") != "computer_call":
+                continue
+            action = item.get("action")
+            if not isinstance(action, dict):
+                continue
+            action_type = action.get("type")
+            def _remove_keys(action: Dict[str, Any], keys: List[str]):
+                for key in keys:
+                    if key in action:
+                        del action[key]
+            for mouse_btn in ["left", "right", "wheel", "back", "forward"]:
+                if f"{mouse_btn}_click" in action:
+                    action["type"] = "click"
+                    action["button"] = mouse_btn
+            if action_type == "click":
+                # Add default button if missing
+                if "button" not in action or action.get("button") is None:
+                    action["button"] = "left"
+                if "coordinate" in action:
+                    action["x"] = action["coordinate"][0]
+                    action["y"] = action["coordinate"][1]
+                    del action["coordinate"]
+            if action_type in ["type", "keypress", "screenshot", "wait"]:
+                _remove_keys(action, ["coordinate", "x", "y"])
+            elif action_type == "keypress":
+                keys = action.get("keys")
+                if isinstance(keys, str):
+                    action["keys"] = keys.replace("-", "+").split("+") if len(keys) > 1 else [keys]
+            
+
+        # Second pass: if an assistant message is immediately followed by a computer_call,
+        # replace the assistant message itself with a reasoning message with summary text.
+        if isinstance(output, list):
+            for i, item in enumerate(output):
+                if not isinstance(item, dict):
+                    continue
+                # AssistantMessage shape: { type: 'message', role: 'assistant', content: OutputContent[] }
+                if item.get("type") == "message" and item.get("role") == "assistant":
+                    next_idx = i + 1
+                    if next_idx >= len(output):
+                        continue
+                    next_item = output[next_idx]
+                    if not isinstance(next_item, dict):
+                        continue
+                    if next_item.get("type") != "computer_call":
+                        continue
+                    contents = item.get("content") or []
+                    # Extract text from OutputContent[]
+                    text_parts: List[str] = []
+                    if isinstance(contents, list):
+                        for c in contents:
+                            if isinstance(c, dict) and c.get("type") == "output_text" and isinstance(c.get("text"), str):
+                                text_parts.append(c["text"])
+                    text_content = "\n".join(text_parts).strip()
+                    # Replace assistant message with reasoning message
+                    output[i] = {
+                        "type": "reasoning",
+                        "summary": [
+                            {
+                                "type": "summary_text",
+                                "text": text_content,
+                            }
+                        ],
+                    }
+
+        return output