Files
computer/libs/python/agent/agent/integrations/hud/adapter.py
2025-08-08 12:47:07 -04:00

122 lines
4.0 KiB
Python

"""HUD Adapter for ComputerAgent integration."""
from __future__ import annotations
from typing import Any, ClassVar
from hud.adapters.common import CLA, Adapter
from hud.adapters.common.types import (
CLAButton,
CLAKey,
ClickAction,
CustomAction,
DragAction,
MoveAction,
Point,
PressAction,
ResponseAction,
ScreenshotFetch,
ScrollAction,
TypeAction,
WaitAction,
)
class ComputerAgentAdapter(Adapter):
"""Adapter for ComputerAgent to work with HUD."""
KEY_MAP: ClassVar[dict[str, CLAKey]] = {
"return": "enter",
"arrowup": "up",
"arrowdown": "down",
"arrowleft": "left",
"arrowright": "right",
"cmd": "ctrl",
"super": "win",
"meta": "win",
}
BUTTON_MAP: ClassVar[dict[str, CLAButton]] = {
"wheel": "middle",
"middle": "middle",
}
def __init__(self) -> None:
super().__init__()
# ComputerAgent default dimensions (can be overridden)
self.agent_width = 1024
self.agent_height = 768
def _map_key(self, key: str) -> CLAKey:
"""Map a key to its standardized form."""
return self.KEY_MAP.get(key.lower(), key.lower()) # type: ignore
def convert(self, data: Any) -> CLA:
"""Convert a ComputerAgent action to a HUD action."""
try:
action_type = data.get("type")
if action_type == "click":
x, y = data.get("x", 0), data.get("y", 0)
button = data.get("button", "left")
button = self.BUTTON_MAP.get(button, button)
if button is None:
button = "left"
converted_action = ClickAction(point=Point(x=x, y=y), button=button)
elif action_type == "double_click":
x, y = data.get("x", 0), data.get("y", 0)
converted_action = ClickAction(point=Point(x=x, y=y), button="left", pattern=[100])
elif action_type == "scroll":
x, y = int(data.get("x", 0)), int(data.get("y", 0))
scroll_x = int(data.get("scroll_x", 0))
scroll_y = int(data.get("scroll_y", 0))
converted_action = ScrollAction(
point=Point(x=x, y=y), scroll=Point(x=scroll_x, y=scroll_y)
)
elif action_type == "type":
text = data.get("text", "")
converted_action = TypeAction(text=text, enter_after=False)
elif action_type == "wait":
ms = data.get("ms", 1000)
converted_action = WaitAction(time=ms)
elif action_type == "move":
x, y = data.get("x", 0), data.get("y", 0)
converted_action = MoveAction(point=Point(x=x, y=y))
elif action_type == "keypress":
keys = data.get("keys", [])
if isinstance(keys, str):
keys = [keys]
converted_action = PressAction(keys=[self._map_key(k) for k in keys])
elif action_type == "drag":
path = data.get("path", [])
points = [Point(x=p.get("x", 0), y=p.get("y", 0)) for p in path]
converted_action = DragAction(path=points)
elif action_type == "screenshot":
converted_action = ScreenshotFetch()
elif action_type == "response":
converted_action = ResponseAction(text=data.get("text", ""))
elif action_type == "custom":
converted_action = CustomAction(action=data.get("action", ""))
else:
raise ValueError(f"Unsupported action type: {action_type}")
# Add reasoning and logs if available
converted_action.reasoning = data.get("reasoning", "")
converted_action.logs = data.get("logs", "")
return converted_action
except Exception as e:
raise ValueError(f"Invalid action: {data}. Error: {e!s}") from e