From 1882fb68e557a67f4d242955cf688c6ab1016c50 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Fri, 8 Aug 2025 12:47:07 -0400 Subject: [PATCH] Added HUD integration --- libs/python/agent/agent/computers/base.py | 2 + .../agent/agent/integrations/hud/__init__.py | 7 + .../agent/agent/integrations/hud/adapter.py | 121 +++++++++ .../agent/agent/integrations/hud/agent.py | 231 ++++++++++++++++++ .../integrations/hud/computer_handler.py | 187 ++++++++++++++ 5 files changed, 548 insertions(+) create mode 100644 libs/python/agent/agent/integrations/hud/__init__.py create mode 100644 libs/python/agent/agent/integrations/hud/adapter.py create mode 100644 libs/python/agent/agent/integrations/hud/agent.py create mode 100644 libs/python/agent/agent/integrations/hud/computer_handler.py diff --git a/libs/python/agent/agent/computers/base.py b/libs/python/agent/agent/computers/base.py index 82d54057..7fbcb0f7 100644 --- a/libs/python/agent/agent/computers/base.py +++ b/libs/python/agent/agent/computers/base.py @@ -59,6 +59,8 @@ class AsyncComputerHandler(Protocol): """Get current URL (for browser environments).""" ... + # ==== Anthropic Action Space ==== + async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None: """Left mouse down at coordinates.""" ... 
diff --git a/libs/python/agent/agent/integrations/hud/__init__.py b/libs/python/agent/agent/integrations/hud/__init__.py new file mode 100644 index 00000000..6459048d --- /dev/null +++ b/libs/python/agent/agent/integrations/hud/__init__.py @@ -0,0 +1,7 @@ +"""HUD integration for ComputerAgent.""" + +from .agent import ComputerAgent +from .adapter import ComputerAgentAdapter +from .computer_handler import HUDComputerHandler + +__all__ = ["ComputerAgent", "ComputerAgentAdapter", "HUDComputerHandler"] diff --git a/libs/python/agent/agent/integrations/hud/adapter.py b/libs/python/agent/agent/integrations/hud/adapter.py new file mode 100644 index 00000000..77c8dc7d --- /dev/null +++ b/libs/python/agent/agent/integrations/hud/adapter.py @@ -0,0 +1,121 @@ +"""HUD Adapter for ComputerAgent integration.""" + +from __future__ import annotations + +from typing import Any, ClassVar + +from hud.adapters.common import CLA, Adapter +from hud.adapters.common.types import ( + CLAButton, + CLAKey, + ClickAction, + CustomAction, + DragAction, + MoveAction, + Point, + PressAction, + ResponseAction, + ScreenshotFetch, + ScrollAction, + TypeAction, + WaitAction, +) + + +class ComputerAgentAdapter(Adapter): + """Adapter for ComputerAgent to work with HUD.""" + + KEY_MAP: ClassVar[dict[str, CLAKey]] = { + "return": "enter", + "arrowup": "up", + "arrowdown": "down", + "arrowleft": "left", + "arrowright": "right", + "cmd": "ctrl", + "super": "win", + "meta": "win", + } + + BUTTON_MAP: ClassVar[dict[str, CLAButton]] = { + "wheel": "middle", + "middle": "middle", + } + + def __init__(self) -> None: + super().__init__() + # ComputerAgent default dimensions (can be overridden) + self.agent_width = 1024 + self.agent_height = 768 + + def _map_key(self, key: str) -> CLAKey: + """Map a key to its standardized form.""" + return self.KEY_MAP.get(key.lower(), key.lower()) # type: ignore + + def convert(self, data: Any) -> CLA: + """Convert a ComputerAgent action to a HUD action.""" + try: + 
action_type = data.get("type") + + if action_type == "click": + x, y = data.get("x", 0), data.get("y", 0) + button = data.get("button", "left") + button = self.BUTTON_MAP.get(button, button) + if button is None: + button = "left" + converted_action = ClickAction(point=Point(x=x, y=y), button=button) + + elif action_type == "double_click": + x, y = data.get("x", 0), data.get("y", 0) + converted_action = ClickAction(point=Point(x=x, y=y), button="left", pattern=[100]) + + elif action_type == "scroll": + x, y = int(data.get("x", 0)), int(data.get("y", 0)) + scroll_x = int(data.get("scroll_x", 0)) + scroll_y = int(data.get("scroll_y", 0)) + converted_action = ScrollAction( + point=Point(x=x, y=y), scroll=Point(x=scroll_x, y=scroll_y) + ) + + elif action_type == "type": + text = data.get("text", "") + converted_action = TypeAction(text=text, enter_after=False) + + elif action_type == "wait": + ms = data.get("ms", 1000) + converted_action = WaitAction(time=ms) + + elif action_type == "move": + x, y = data.get("x", 0), data.get("y", 0) + converted_action = MoveAction(point=Point(x=x, y=y)) + + elif action_type == "keypress": + keys = data.get("keys", []) + if isinstance(keys, str): + keys = [keys] + converted_action = PressAction(keys=[self._map_key(k) for k in keys]) + + elif action_type == "drag": + path = data.get("path", []) + points = [Point(x=p.get("x", 0), y=p.get("y", 0)) for p in path] + converted_action = DragAction(path=points) + + elif action_type == "screenshot": + converted_action = ScreenshotFetch() + + elif action_type == "response": + converted_action = ResponseAction(text=data.get("text", "")) + + elif action_type == "custom": + converted_action = CustomAction(action=data.get("action", "")) + + else: + raise ValueError(f"Unsupported action type: {action_type}") + + # Add reasoning and logs if available + converted_action.reasoning = data.get("reasoning", "") + converted_action.logs = data.get("logs", "") + + return converted_action + + except Exception 
class ComputerAgent(Agent[BaseComputerAgent, dict[str, Any]]):
    """
    A ComputerAgent wrapper for HUD integration.

    This agent wraps the base ComputerAgent to work with HUD environments,
    providing the same interface as OperatorAgent but using ComputerAgent internally.
    """

    transfer_gyms: dict[Gym, Gym] = {"qa": "hud-browser"}

    def __init__(
        self,
        model: str = "anthropic/claude-3-5-sonnet-20241022",
        environment: Literal["windows", "mac", "linux", "browser"] = "browser",
        adapter: Optional[Adapter] = None,
        name: Optional[str] = None,
        **kwargs: Any,
    ):
        """
        Initialize the ComputerAgent for HUD.

        Args:
            model: The model string for ComputerAgent (e.g., "anthropic/claude-3-5-sonnet-20241022")
            environment: The environment type (windows, mac, linux, browser)
            adapter: The adapter to use for preprocessing and postprocessing
            name: The name of the agent
            **kwargs: Additional arguments passed to ComputerAgent
        """
        # Create adapter if not provided
        adapter = adapter or ComputerAgentAdapter()

        if name is None:
            name = f"computeragent-{model.split('/')[-1]}"

        # Initialize the base Agent class without client (created below).
        super().__init__(client=None, adapter=adapter, name=name)

        self.model = model
        self.environment = environment
        self.kwargs = kwargs

        # Default dimensions, overridden by the adapter's dimensions when present.
        self.width = 1024
        self.height = 768
        if self.adapter:
            self.width = self.adapter.agent_width
            self.height = self.adapter.agent_height

        # Computer handler that proxies screenshots/actions to the HUD environment.
        self.hud_computer = HUDComputerHandler(
            environment=environment,
            dimensions=(self.width, self.height),
        )

        # Initialize ComputerAgent with the HUD computer handler as its tool.
        self.computer_agent = BaseComputerAgent(
            model=model,
            tools=[self.hud_computer],
            **kwargs,
        )

        # Set the client to the computer_agent for compatibility.
        self.client = self.computer_agent

        # State tracking
        self.conversation_history: List[Dict[str, Any]] = []
        self.initial_prompt: Optional[str] = None

        # System prompt for computer use tasks
        self.base_system_prompt = """
        You are an autonomous computer-using agent. Follow these guidelines:

        1. Be decisive and complete tasks without asking for confirmation unless absolutely necessary.
        2. If you need user confirmation for safety-critical actions, use the formal safety check mechanism.
        3. Do NOT ask questions like "Should I proceed?" or "Would you like me to continue?" - just proceed with the task.
        4. When you find what you're looking for (e.g., a file to upload), proceed with the action directly.
        5. Only stop when the task is fully complete or if you encounter an error that prevents completion.
        6. Trust that the user wants you to complete the entire task they've requested.

        Remember: You have been given permission to complete the requested task autonomously.
        """

    def _latest_reasoning(self) -> str:
        """Return the most recent reasoning summary or assistant text, or ""."""
        for msg in reversed(self.conversation_history):
            if msg.get("type") == "reasoning" and msg.get("summary"):
                return " ".join(
                    s.get("text", "")
                    for s in msg["summary"]
                    if s.get("type") == "summary_text"
                )
            if msg.get("type") == "message" and msg.get("role") == "assistant":
                content = msg.get("content", [])
                # Only list-shaped content carries output_text parts; otherwise
                # keep scanning older messages.
                if isinstance(content, list):
                    return " ".join(
                        c.get("text", "")
                        for c in content
                        if c.get("type") == "output_text"
                    )
        return ""

    def _task_marked_done(self) -> bool:
        """True if the latest assistant message says the task is completed."""
        for msg in reversed(self.conversation_history):
            if msg.get("type") == "message" and msg.get("role") == "assistant":
                content = msg.get("content", [])
                # Guard against non-list content (consistent with _latest_reasoning).
                if isinstance(content, list):
                    return any(
                        c.get("type") == "output_text"
                        and "task completed" in c.get("text", "").lower()
                        for c in content
                    )
                return False
        return False

    def _final_response_text(self, default: str = "Task completed.") -> str:
        """Return the latest assistant output_text, or *default* if none exists."""
        for msg in reversed(self.conversation_history):
            if msg.get("type") == "message" and msg.get("role") == "assistant":
                content = msg.get("content", [])
                if isinstance(content, list):
                    for c in content:
                        if c.get("type") == "output_text":
                            return c.get("text", default)
                return default
        return default

    async def fetch_response(self, observation: Observation) -> tuple[list[dict[str, Any]], bool]:
        """
        Fetch a response from ComputerAgent based on the observation.

        Args:
            observation: The preprocessed observation

        Returns:
            tuple[list[dict[str, Any]], bool]: The list of raw actions and a
            boolean indicating whether the agent believes the task is complete.
        """
        try:
            # Feed the latest screenshot into the computer handler.
            if observation.screenshot:
                self.hud_computer.update_screenshot(observation.screenshot)

            # Capture every action ComputerAgent tries to execute instead of
            # executing it; HUD will replay them via the adapter.
            captured_actions: List[Dict[str, Any]] = []

            async def action_callback(action: Dict[str, Any]) -> None:
                """Callback to capture actions from ComputerAgent."""
                captured_actions.append(action)

            self.hud_computer.set_action_callback(action_callback)

            # Prepare the message for ComputerAgent.
            if not self.conversation_history:
                # First interaction - use the observation text as initial prompt.
                if observation.text:
                    self.initial_prompt = observation.text
                    message = f"{self.base_system_prompt}\n\nTask: {observation.text}"
                else:
                    message = f"{self.base_system_prompt}\n\nPlease analyze the current screen and determine what action to take."
            else:
                # Subsequent interactions - add context about the current state.
                message = "Continue with the task based on the current screen state."
            self.conversation_history.append({"role": "user", "content": message})

            try:
                # ComputerAgent.run returns an async generator of result chunks.
                async for result in self.computer_agent.run(self.conversation_history, stream=False):
                    self.conversation_history += result["output"]

                if captured_actions:
                    # Annotate each captured action with the latest reasoning.
                    reasoning = self._latest_reasoning()
                    for action in captured_actions:
                        action["reasoning"] = reasoning
                        action["logs"] = {"conversation_length": len(self.conversation_history)}
                    return captured_actions, self._task_marked_done()

                # No actions captured: the agent responded with text only, so
                # the task is finished (or declared infeasible).
                response_text = self._final_response_text()
                response_action: Dict[str, Any] = {
                    "type": "response",
                    "text": response_text,
                    "reasoning": response_text,
                    "logs": {"conversation_length": len(self.conversation_history)},
                }
                if "task is infeasible" in response_text.lower():
                    response_action = {"type": "custom", "action": "FAIL"}
                return [response_action], True

            except Exception as e:
                logger.error(f"Error running ComputerAgent: {e}")
                # Surface the failure as a terminal response action.
                error_action = {
                    "type": "response",
                    "text": f"Error occurred: {str(e)}",
                    "reasoning": f"ComputerAgent encountered an error: {str(e)}",
                    "logs": {"error": str(e)},
                }
                return [error_action], True

        except Exception as e:
            logger.error(f"Error in fetch_response: {e}")
            error_action = {
                "type": "response",
                "text": f"Error in agent processing: {str(e)}",
                "reasoning": f"Agent processing error: {str(e)}",
                "logs": {"error": str(e)},
            }
            return [error_action], True
class HUDComputerHandler(AsyncComputerHandler):
    """Computer handler that interfaces with HUD environment.

    Screenshots and actions are delegated to optional callbacks supplied by
    the HUD integration. When no action callback is set, action methods are
    no-ops; when no screenshot callback is set, ``screenshot`` falls back to
    the last cached frame or a blank white frame.
    """

    def __init__(
        self,
        environment: Literal["windows", "mac", "linux", "browser"] = "browser",
        dimensions: tuple[int, int] = (1024, 768),
        screenshot_callback: Optional[Callable] = None,
        action_callback: Optional[Callable] = None,
    ):
        """
        Initialize HUD computer handler.

        Args:
            environment: The environment type for HUD
            dimensions: Screen dimensions as (width, height)
            screenshot_callback: Optional callback to get screenshots from HUD environment
            action_callback: Optional callback to execute actions in HUD environment
        """
        super().__init__()
        self._environment = environment
        self._dimensions = dimensions
        self._screenshot_callback = screenshot_callback
        self._action_callback = action_callback

        # Last screenshot as a base64 PNG string, reused when no callback is set.
        self._last_screenshot: Optional[str] = None

    def set_screenshot_callback(self, callback: Callable) -> None:
        """Set the screenshot callback."""
        self._screenshot_callback = callback

    def set_action_callback(self, callback: Callable) -> None:
        """Set the action callback."""
        self._action_callback = callback

    def update_screenshot(self, screenshot: str) -> None:
        """Update the stored screenshot (base64 string)."""
        self._last_screenshot = screenshot

    async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
        """Get the current environment type."""
        return self._environment  # type: ignore

    async def get_dimensions(self) -> tuple[int, int]:
        """Get screen dimensions as (width, height)."""
        return self._dimensions

    async def screenshot(self) -> str:
        """Take a screenshot and return as base64 string.

        The callback may return a base64 string, a PIL Image, or raw PNG
        bytes; anything else falls through to the cached/blank fallback.
        """
        if self._screenshot_callback:
            screenshot = await self._screenshot_callback()
            if isinstance(screenshot, str):
                self._last_screenshot = screenshot
                return screenshot
            elif isinstance(screenshot, Image.Image):
                # Convert PIL Image to base64.
                buffer = BytesIO()
                screenshot.save(buffer, format="PNG")
                screenshot_b64 = base64.b64encode(buffer.getvalue()).decode()
                self._last_screenshot = screenshot_b64
                return screenshot_b64
            elif isinstance(screenshot, bytes):
                screenshot_b64 = base64.b64encode(screenshot).decode()
                self._last_screenshot = screenshot_b64
                return screenshot_b64

        # Return last screenshot if available, otherwise create a blank one.
        if self._last_screenshot:
            return self._last_screenshot

        # Create a blank white screenshot as fallback.
        blank_image = Image.new('RGB', self._dimensions, color='white')
        buffer = BytesIO()
        blank_image.save(buffer, format="PNG")
        screenshot_b64 = base64.b64encode(buffer.getvalue()).decode()
        self._last_screenshot = screenshot_b64
        return screenshot_b64

    async def click(self, x: int, y: int, button: str = "left") -> None:
        """Click at coordinates with specified button."""
        if self._action_callback:
            await self._action_callback({
                "type": "click",
                "x": x,
                "y": y,
                "button": button
            })

    async def double_click(self, x: int, y: int) -> None:
        """Double click at coordinates."""
        if self._action_callback:
            await self._action_callback({
                "type": "double_click",
                "x": x,
                "y": y
            })

    async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
        """Scroll at coordinates with specified scroll amounts."""
        if self._action_callback:
            await self._action_callback({
                "type": "scroll",
                "x": x,
                "y": y,
                "scroll_x": scroll_x,
                "scroll_y": scroll_y
            })

    async def type(self, text: str) -> None:
        """Type text."""
        if self._action_callback:
            await self._action_callback({
                "type": "type",
                "text": text
            })

    async def wait(self, ms: int = 1000) -> None:
        """Wait for specified milliseconds."""
        if self._action_callback:
            await self._action_callback({
                "type": "wait",
                "ms": ms
            })

    async def move(self, x: int, y: int) -> None:
        """Move cursor to coordinates."""
        if self._action_callback:
            await self._action_callback({
                "type": "move",
                "x": x,
                "y": y
            })

    async def keypress(self, keys: list[str] | str) -> None:
        """Press key combination; a bare string is treated as a single key."""
        if isinstance(keys, str):
            keys = [keys]
        if self._action_callback:
            await self._action_callback({
                "type": "keypress",
                "keys": keys
            })

    async def drag(self, path: list[dict[str, int]]) -> None:
        """Drag along a path of points."""
        if self._action_callback:
            await self._action_callback({
                "type": "drag",
                "path": path
            })

    async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
        """Left mouse down at coordinates."""
        if self._action_callback:
            await self._action_callback({
                "type": "left_mouse_down",
                "x": x,
                "y": y
            })

    async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
        """Left mouse up at coordinates."""
        if self._action_callback:
            await self._action_callback({
                "type": "left_mouse_up",
                "x": x,
                "y": y
            })

    async def get_current_url(self) -> str:
        """Get the current URL (browser environments).

        Returns "" when no callback is configured or when the callback does
        not yield a string — capture-style callbacks (as installed by the HUD
        agent) return None, which must not escape the declared str contract.
        """
        if self._action_callback:
            result = await self._action_callback({
                "type": "get_current_url"
            })
            return result if isinstance(result, str) else ""
        return ""