Added HUD integration

2026-02-28 02:50:17 -06:00 · 2025-08-08 12:47:07 -04:00
parent 5c46ca0c9a
commit 1882fb68e5
5 changed files with 548 additions and 0 deletions
--- a/libs/python/agent/agent/computers/base.py
+++ b/libs/python/agent/agent/computers/base.py
@@ -59,6 +59,8 @@ class AsyncComputerHandler(Protocol):
        """Get current URL (for browser environments)."""
        ...
    
+    # ==== Anthropic Action Space ==== 
+
    async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
        """Left mouse down at coordinates."""
        ...
--- a/libs/python/agent/agent/integrations/hud/__init__.py
+++ b/libs/python/agent/agent/integrations/hud/__init__.py
@@ -0,0 +1,7 @@
+"""HUD integration for ComputerAgent."""
+
+from .agent import ComputerAgent
+from .adapter import ComputerAgentAdapter
+from .computer_handler import HUDComputerHandler
+
+__all__ = ["ComputerAgent", "ComputerAgentAdapter", "HUDComputerHandler"]
--- a/libs/python/agent/agent/integrations/hud/adapter.py
+++ b/libs/python/agent/agent/integrations/hud/adapter.py
@@ -0,0 +1,121 @@
+"""HUD Adapter for ComputerAgent integration."""
+
+from __future__ import annotations
+
+from typing import Any, ClassVar
+
+from hud.adapters.common import CLA, Adapter
+from hud.adapters.common.types import (
+    CLAButton,
+    CLAKey,
+    ClickAction,
+    CustomAction,
+    DragAction,
+    MoveAction,
+    Point,
+    PressAction,
+    ResponseAction,
+    ScreenshotFetch,
+    ScrollAction,
+    TypeAction,
+    WaitAction,
+)
+
+
+class ComputerAgentAdapter(Adapter):
+    """Translate ComputerAgent action dictionaries into HUD CLA actions."""
+
+    # Key-name aliases; keys not listed here are only lower-cased.
+    KEY_MAP: ClassVar[dict[str, CLAKey]] = {
+        "return": "enter",
+        "arrowup": "up",
+        "arrowdown": "down",
+        "arrowleft": "left",
+        "arrowright": "right",
+        "cmd": "ctrl",
+        "super": "win",
+        "meta": "win",
+    }
+
+    # Button aliases; buttons not listed here are passed through unchanged.
+    BUTTON_MAP: ClassVar[dict[str, CLAButton]] = {
+        "wheel": "middle",
+        "middle": "middle",
+    }
+
+    def __init__(self) -> None:
+        super().__init__()
+        # Default screen size assumed for ComputerAgent (can be overridden).
+        self.agent_width, self.agent_height = 1024, 768
+
+    def _map_key(self, key: str) -> CLAKey:
+        """Lower-case ``key`` and replace it with its KEY_MAP alias, if any."""
+        lowered = key.lower()
+        return self.KEY_MAP.get(lowered, lowered)  # type: ignore
+
+    def _build_action(self, data: Any) -> CLA:
+        """Construct the HUD action matching ``data["type"]``.
+
+        Raises:
+            ValueError: If the action type is not one of the supported kinds.
+        """
+        kind = data.get("type")
+
+        if kind == "click":
+            px, py = data.get("x", 0), data.get("y", 0)
+            requested = data.get("button", "left")
+            resolved = self.BUTTON_MAP.get(requested, requested)
+            return ClickAction(
+                point=Point(x=px, y=py),
+                # An explicit ``None`` button falls back to the left button.
+                button="left" if resolved is None else resolved,
+            )
+
+        if kind == "double_click":
+            px, py = data.get("x", 0), data.get("y", 0)
+            return ClickAction(point=Point(x=px, y=py), button="left", pattern=[100])
+
+        if kind == "scroll":
+            px, py = int(data.get("x", 0)), int(data.get("y", 0))
+            delta_x = int(data.get("scroll_x", 0))
+            delta_y = int(data.get("scroll_y", 0))
+            return ScrollAction(
+                point=Point(x=px, y=py), scroll=Point(x=delta_x, y=delta_y)
+            )
+
+        if kind == "type":
+            return TypeAction(text=data.get("text", ""), enter_after=False)
+
+        if kind == "wait":
+            return WaitAction(time=data.get("ms", 1000))
+
+        if kind == "move":
+            px, py = data.get("x", 0), data.get("y", 0)
+            return MoveAction(point=Point(x=px, y=py))
+
+        if kind == "keypress":
+            pressed = data.get("keys", [])
+            # A single key given as a bare string is treated as a one-key combo.
+            key_list = [pressed] if isinstance(pressed, str) else pressed
+            return PressAction(keys=[self._map_key(name) for name in key_list])
+
+        if kind == "drag":
+            waypoints = [
+                Point(x=step.get("x", 0), y=step.get("y", 0))
+                for step in data.get("path", [])
+            ]
+            return DragAction(path=waypoints)
+
+        if kind == "screenshot":
+            return ScreenshotFetch()
+
+        if kind == "response":
+            return ResponseAction(text=data.get("text", ""))
+
+        if kind == "custom":
+            return CustomAction(action=data.get("action", ""))
+
+        raise ValueError(f"Unsupported action type: {kind}")
+
+    def convert(self, data: Any) -> CLA:
+        """Convert a ComputerAgent action to a HUD action.
+
+        Args:
+            data: Action dictionary with a "type" entry plus the fields that
+                action type needs; optional "reasoning" and "logs" entries are
+                copied onto the converted action.
+
+        Returns:
+            The converted HUD action.
+
+        Raises:
+            ValueError: If the action type is unsupported or anything fails
+                while building the action; the original error is chained.
+        """
+        try:
+            action = self._build_action(data)
+
+            # Attach reasoning and logs when the caller supplied them.
+            action.reasoning = data.get("reasoning", "")
+            action.logs = data.get("logs", "")
+            return action
+        except Exception as exc:
+            raise ValueError(f"Invalid action: {data}. Error: {exc!s}") from exc
--- a/libs/python/agent/agent/integrations/hud/agent.py
+++ b/libs/python/agent/agent/integrations/hud/agent.py
@@ -0,0 +1,231 @@
+"""HUD ComputerAgent wrapper for OSWorld benchmarking."""
+
+import logging
+from typing import Any, Literal, Optional, Union, List, Dict
+import asyncio
+
+from agent import ComputerAgent as BaseComputerAgent
+from hud.adapters import Adapter
+from hud.agent.base import Agent
+from hud.utils.common import Observation
+from hud.adapters.common.types import LogType
+from hud.types import Gym
+
+from .adapter import ComputerAgentAdapter
+from .computer_handler import HUDComputerHandler
+
+logger = logging.getLogger(__name__)
+
+
+class ComputerAgent(Agent[BaseComputerAgent, dict[str, Any]]):
+    """
+    A ComputerAgent wrapper for HUD integration.
+
+    This agent wraps the base ComputerAgent to work with HUD environments,
+    providing the same interface as OperatorAgent but using ComputerAgent internally.
+    Actions issued by the wrapped agent are captured through the
+    HUDComputerHandler action callback and returned as raw action dicts.
+    """
+
+    transfer_gyms: dict[Gym, Gym] = {"qa": "hud-browser"}
+
+    def __init__(
+        self,
+        model: str = "anthropic/claude-3-5-sonnet-20241022",
+        environment: Literal["windows", "mac", "linux", "browser"] = "browser",
+        adapter: Optional[Adapter] = None,
+        name: Optional[str] = None,
+        **kwargs: Any,
+    ):
+        """
+        Initialize the ComputerAgent for HUD.
+
+        Args:
+            model: The model string for ComputerAgent (e.g., "anthropic/claude-3-5-sonnet-20241022")
+            environment: The environment type (windows, mac, linux, browser)
+            adapter: The adapter to use for preprocessing and postprocessing
+            name: The name of the agent
+            **kwargs: Additional arguments passed to ComputerAgent
+        """
+        # Create adapter if not provided
+        adapter = adapter or ComputerAgentAdapter()
+
+        if name is None:
+            name = f"computeragent-{model.split('/')[-1]}"
+
+        # Initialize the base Agent class without client (we'll create it later)
+        super().__init__(client=None, adapter=adapter, name=name)
+
+        self.model = model
+        self.environment = environment
+        self.kwargs = kwargs
+
+        # Default dimensions
+        self.width = 1024
+        self.height = 768
+
+        # Update dimensions if adapter is provided
+        if self.adapter:
+            self.width = self.adapter.agent_width
+            self.height = self.adapter.agent_height
+
+        # Create HUD computer handler
+        self.hud_computer = HUDComputerHandler(
+            environment=environment,
+            dimensions=(self.width, self.height)
+        )
+
+        # Initialize ComputerAgent with HUD computer handler
+        self.computer_agent = BaseComputerAgent(
+            model=model,
+            tools=[self.hud_computer],
+            **kwargs
+        )
+
+        # Set the client to the computer_agent for compatibility
+        self.client = self.computer_agent
+
+        # State tracking
+        self.conversation_history: List[Dict[str, Any]] = []
+        self.initial_prompt: Optional[str] = None
+
+        # System prompt for computer use tasks
+        self.base_system_prompt = """
+        You are an autonomous computer-using agent. Follow these guidelines:
+
+        1. Be decisive and complete tasks without asking for confirmation unless absolutely necessary.
+        2. If you need user confirmation for safety-critical actions, use the formal safety check mechanism.
+        3. Do NOT ask questions like "Should I proceed?" or "Would you like me to continue?" - just proceed with the task.
+        4. When you find what you're looking for (e.g., a file to upload), proceed with the action directly.
+        5. Only stop when the task is fully complete or if you encounter an error that prevents completion.
+        6. Trust that the user wants you to complete the entire task they've requested.
+
+        Remember: You have been given permission to complete the requested task autonomously.
+        """
+
+    @staticmethod
+    def _output_text_parts(content: Any) -> List[Dict[str, Any]]:
+        """Return the "output_text" parts of a message's content.
+
+        A plain string is treated as a single output_text part, and anything
+        that is neither a string nor a list yields no parts, so callers never
+        call ``.get`` on a non-dict item.
+        """
+        if isinstance(content, str):
+            return [{"type": "output_text", "text": content}]
+        if not isinstance(content, list):
+            return []
+        return [
+            part for part in content
+            if isinstance(part, dict) and part.get("type") == "output_text"
+        ]
+
+    def _latest_assistant_parts(self) -> List[Dict[str, Any]]:
+        """Return the output_text parts of the most recent assistant message."""
+        for msg in reversed(self.conversation_history):
+            if msg.get("type") == "message" and msg.get("role") == "assistant":
+                return self._output_text_parts(msg.get("content", []))
+        return []
+
+    def _latest_reasoning(self) -> str:
+        """Return reasoning text from the newest reasoning or assistant message.
+
+        Walks the history backwards and stops at whichever comes first: a
+        reasoning item with a non-empty summary, or an assistant message.
+        """
+        for msg in reversed(self.conversation_history):
+            if msg.get("type") == "reasoning" and msg.get("summary"):
+                return " ".join(
+                    s.get("text", "") for s in msg["summary"] if s.get("type") == "summary_text"
+                )
+            if msg.get("type") == "message" and msg.get("role") == "assistant":
+                return " ".join(
+                    part.get("text", "")
+                    for part in self._output_text_parts(msg.get("content", []))
+                )
+        return ""
+
+    def _next_user_message(self, observation: Observation) -> str:
+        """Build the user message for this turn.
+
+        On the first turn the system prompt is prepended and the observation
+        text (if any) is remembered as the initial prompt; later turns use a
+        fixed continuation message.
+        """
+        if self.conversation_history:
+            return "Continue with the task based on the current screen state."
+        if observation.text:
+            self.initial_prompt = observation.text
+            return f"{self.base_system_prompt}\n\nTask: {observation.text}"
+        return f"{self.base_system_prompt}\n\nPlease analyze the current screen and determine what action to take."
+
+    async def fetch_response(self, observation: Observation) -> tuple[list[dict[str, Any]], bool]:
+        """
+        Fetch a response from ComputerAgent based on the observation.
+
+        Args:
+            observation: The preprocessed observation
+
+        Returns:
+            tuple[list[dict[str, Any]], bool]: The raw actions captured during
+                this turn and a boolean indicating if the agent believes the
+                task is complete. Errors are not raised; they are reported as
+                a single "response" action with the boolean set to True.
+        """
+        try:
+            # Update the computer handler with the current screenshot
+            if observation.screenshot:
+                self.hud_computer.update_screenshot(observation.screenshot)
+
+            # Actions issued by ComputerAgent during this turn
+            captured_actions: List[Dict[str, Any]] = []
+
+            async def action_callback(action: Dict[str, Any]) -> None:
+                """Callback to capture actions from ComputerAgent."""
+                captured_actions.append(action)
+
+            self.hud_computer.set_action_callback(action_callback)
+
+            # Prepare the message for ComputerAgent
+            message = self._next_user_message(observation)
+            self.conversation_history.append({"role": "user", "content": message})
+
+            # Run ComputerAgent
+            try:
+                # ComputerAgent.run returns an async generator
+                async for result in self.computer_agent.run(self.conversation_history, stream=False):
+                    # Update conversation history with the output
+                    self.conversation_history += result["output"]
+
+                if captured_actions:
+                    # Add reasoning and logs to each action
+                    reasoning = self._latest_reasoning()
+                    for action in captured_actions:
+                        action["reasoning"] = reasoning
+                        action["logs"] = {"conversation_length": len(self.conversation_history)}
+
+                    # The task counts as done when the latest assistant message says so
+                    done = any(
+                        "task completed" in part.get("text", "").lower()
+                        for part in self._latest_assistant_parts()
+                    )
+                    return captured_actions, done
+
+                # No actions captured, task is likely complete
+                response_text = "Task completed."
+                parts = self._latest_assistant_parts()
+                if parts:
+                    response_text = parts[0].get("text", response_text)
+
+                # An infeasible task is reported as a custom FAIL action
+                if "task is infeasible" in response_text.lower():
+                    return [{"type": "custom", "action": "FAIL"}], True
+
+                response_action = {
+                    "type": "response",
+                    "text": response_text,
+                    "reasoning": response_text,
+                    "logs": {"conversation_length": len(self.conversation_history)}
+                }
+                return [response_action], True
+
+            except Exception as e:
+                # logger.exception keeps the traceback alongside the message
+                logger.exception("Error running ComputerAgent: %s", e)
+                # Return an error response
+                error_action = {
+                    "type": "response",
+                    "text": f"Error occurred: {e}",
+                    "reasoning": f"ComputerAgent encountered an error: {e}",
+                    "logs": {"error": str(e)}
+                }
+                return [error_action], True
+
+        except Exception as e:
+            logger.exception("Error in fetch_response: %s", e)
+            error_action = {
+                "type": "response",
+                "text": f"Error in agent processing: {e}",
+                "reasoning": f"Agent processing error: {e}",
+                "logs": {"error": str(e)}
+            }
+            return [error_action], True
--- a/libs/python/agent/agent/integrations/hud/computer_handler.py
+++ b/libs/python/agent/agent/integrations/hud/computer_handler.py
@@ -0,0 +1,187 @@
+"""HUD Computer Handler for ComputerAgent integration."""
+
+import base64
+from io import BytesIO
+from typing import Literal, Optional, Any, Dict, Callable
+from PIL import Image
+
+from agent.computers import AsyncComputerHandler
+
+
+class HUDComputerHandler(AsyncComputerHandler):
+    """Computer handler that interfaces with HUD environment.
+
+    Every input method forwards a plain action dictionary to the optional
+    async action callback (and does nothing when no callback is set).
+    Screenshots come from the optional async screenshot callback, falling
+    back to the last stored screenshot or a blank image.
+    """
+
+    def __init__(
+        self,
+        environment: Literal["windows", "mac", "linux", "browser"] = "browser",
+        dimensions: tuple[int, int] = (1024, 768),
+        screenshot_callback: Optional[Callable] = None,
+        action_callback: Optional[Callable] = None,
+    ):
+        """
+        Initialize HUD computer handler.
+
+        Args:
+            environment: The environment type for HUD
+            dimensions: Screen dimensions as (width, height)
+            screenshot_callback: Optional callback to get screenshots from HUD environment
+            action_callback: Optional callback to execute actions in HUD environment
+        """
+        super().__init__()
+        self._environment = environment
+        self._dimensions = dimensions
+        self._screenshot_callback = screenshot_callback
+        self._action_callback = action_callback
+
+        # Store the last screenshot for reuse
+        self._last_screenshot: Optional[str] = None
+
+    @staticmethod
+    def _encode_png(image: Image.Image) -> str:
+        """Serialize a PIL image to PNG and return it as a base64 string."""
+        buffer = BytesIO()
+        image.save(buffer, format="PNG")
+        return base64.b64encode(buffer.getvalue()).decode()
+
+    async def _dispatch(self, action: Dict[str, Any]) -> Any:
+        """Await the action callback with ``action`` and return its result.
+
+        Returns None without doing anything when no callback is set.
+        """
+        if self._action_callback:
+            return await self._action_callback(action)
+        return None
+
+    def set_screenshot_callback(self, callback: Callable) -> None:
+        """Set the screenshot callback."""
+        self._screenshot_callback = callback
+
+    def set_action_callback(self, callback: Callable) -> None:
+        """Set the action callback."""
+        self._action_callback = callback
+
+    def update_screenshot(self, screenshot: str) -> None:
+        """Update the stored screenshot (base64 string)."""
+        self._last_screenshot = screenshot
+
+    async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
+        """Get the current environment type."""
+        return self._environment # type: ignore
+
+    async def get_dimensions(self) -> tuple[int, int]:
+        """Get screen dimensions as (width, height)."""
+        return self._dimensions
+
+    async def screenshot(self) -> str:
+        """Take a screenshot and return as base64 string.
+
+        The screenshot callback may return a base64 string, a PIL image, or
+        raw bytes. Any other result is ignored and the last stored screenshot
+        is returned instead, or a blank white image if none is stored.
+        """
+        if self._screenshot_callback:
+            shot = await self._screenshot_callback()
+            encoded: Optional[str] = None
+            if isinstance(shot, str):
+                encoded = shot
+            elif isinstance(shot, Image.Image):
+                encoded = self._encode_png(shot)
+            elif isinstance(shot, bytes):
+                encoded = base64.b64encode(shot).decode()
+            if encoded is not None:
+                self._last_screenshot = encoded
+                return encoded
+
+        # Return last screenshot if available, otherwise create a blank one
+        if self._last_screenshot:
+            return self._last_screenshot
+
+        blank = self._encode_png(Image.new('RGB', self._dimensions, color='white'))
+        self._last_screenshot = blank
+        return blank
+
+    async def click(self, x: int, y: int, button: str = "left") -> None:
+        """Click at coordinates with specified button."""
+        await self._dispatch({"type": "click", "x": x, "y": y, "button": button})
+
+    async def double_click(self, x: int, y: int) -> None:
+        """Double click at coordinates."""
+        await self._dispatch({"type": "double_click", "x": x, "y": y})
+
+    async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
+        """Scroll at coordinates with specified scroll amounts."""
+        await self._dispatch({
+            "type": "scroll",
+            "x": x,
+            "y": y,
+            "scroll_x": scroll_x,
+            "scroll_y": scroll_y,
+        })
+
+    async def type(self, text: str) -> None:
+        """Type text."""
+        await self._dispatch({"type": "type", "text": text})
+
+    async def wait(self, ms: int = 1000) -> None:
+        """Wait for specified milliseconds."""
+        await self._dispatch({"type": "wait", "ms": ms})
+
+    async def move(self, x: int, y: int) -> None:
+        """Move cursor to coordinates."""
+        await self._dispatch({"type": "move", "x": x, "y": y})
+
+    async def keypress(self, keys: list[str] | str) -> None:
+        """Press key combination."""
+        # A single key given as a bare string becomes a one-key combination.
+        if isinstance(keys, str):
+            keys = [keys]
+        await self._dispatch({"type": "keypress", "keys": keys})
+
+    async def drag(self, path: list[dict[str, int]]) -> None:
+        """Drag along a path of points."""
+        await self._dispatch({"type": "drag", "path": path})
+
+    async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
+        """Left mouse down at coordinates."""
+        await self._dispatch({"type": "left_mouse_down", "x": x, "y": y})
+
+    async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
+        """Left mouse up at coordinates."""
+        await self._dispatch({"type": "left_mouse_up", "x": x, "y": y})
+
+    async def get_current_url(self) -> str:
+        """Get the current URL.
+
+        The request is sent through the action callback. A callback that does
+        not return a string (for example one that only records actions and
+        returns None) yields "" so the declared ``str`` return type holds.
+        """
+        url = await self._dispatch({"type": "get_current_url"})
+        return url if isinstance(url, str) else ""