Added HUD integration

This commit is contained in:
Dillon DuPont
2025-08-08 12:47:07 -04:00
parent 5c46ca0c9a
commit 1882fb68e5
5 changed files with 548 additions and 0 deletions

View File

@@ -59,6 +59,8 @@ class AsyncComputerHandler(Protocol):
"""Get current URL (for browser environments)."""
...
# ==== Anthropic Action Space ====
async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
    """Press the left mouse button down at the given coordinates.

    Args:
        x: Optional horizontal coordinate; defaults to None.
        y: Optional vertical coordinate; defaults to None.

    NOTE(review): this stub does not define how implementations should treat
    omitted (None) coordinates -- presumably the current cursor position;
    confirm against the concrete handlers.
    """
    ...

View File

@@ -0,0 +1,7 @@
"""HUD integration for ComputerAgent."""
from .agent import ComputerAgent
from .adapter import ComputerAgentAdapter
from .computer_handler import HUDComputerHandler
__all__ = ["ComputerAgent", "ComputerAgentAdapter", "HUDComputerHandler"]

View File

@@ -0,0 +1,121 @@
"""HUD Adapter for ComputerAgent integration."""
from __future__ import annotations
from typing import Any, ClassVar
from hud.adapters.common import CLA, Adapter
from hud.adapters.common.types import (
CLAButton,
CLAKey,
ClickAction,
CustomAction,
DragAction,
MoveAction,
Point,
PressAction,
ResponseAction,
ScreenshotFetch,
ScrollAction,
TypeAction,
WaitAction,
)
class ComputerAgentAdapter(Adapter):
    """Bridge that turns ComputerAgent action dicts into HUD CLA actions."""

    # Key-name aliases, looked up after the incoming key has been lower-cased.
    KEY_MAP: ClassVar[dict[str, CLAKey]] = {
        "return": "enter",
        "arrowup": "up",
        "arrowdown": "down",
        "arrowleft": "left",
        "arrowright": "right",
        "cmd": "ctrl",
        "super": "win",
        "meta": "win",
    }

    # Mouse-button aliases; names that are not listed pass through untouched.
    BUTTON_MAP: ClassVar[dict[str, CLAButton]] = {
        "wheel": "middle",
        "middle": "middle",
    }

    def __init__(self) -> None:
        """Set up the base adapter and the default agent screen size."""
        super().__init__()
        # ComputerAgent default dimensions (can be overridden)
        self.agent_width, self.agent_height = 1024, 768

    def _map_key(self, key: str) -> CLAKey:
        """Return the alias from KEY_MAP for *key*, or the lower-cased key itself."""
        lowered = key.lower()
        return self.KEY_MAP.get(lowered, lowered)  # type: ignore

    def _build_action(self, kind: Any, data: Any) -> CLA:
        """Construct the CLA object matching *kind* from the fields of *data*.

        Raises:
            ValueError: if *kind* is not one of the supported action types.
        """
        if kind == "click":
            px, py = data.get("x", 0), data.get("y", 0)
            requested = data.get("button", "left")
            resolved = self.BUTTON_MAP.get(requested, requested)
            # An explicit ``"button": None`` falls back to the left button.
            return ClickAction(
                point=Point(x=px, y=py),
                button="left" if resolved is None else resolved,
            )

        if kind == "double_click":
            px, py = data.get("x", 0), data.get("y", 0)
            return ClickAction(point=Point(x=px, y=py), button="left", pattern=[100])

        if kind == "scroll":
            px = int(data.get("x", 0))
            py = int(data.get("y", 0))
            delta_x = int(data.get("scroll_x", 0))
            delta_y = int(data.get("scroll_y", 0))
            return ScrollAction(point=Point(x=px, y=py), scroll=Point(x=delta_x, y=delta_y))

        if kind == "type":
            return TypeAction(text=data.get("text", ""), enter_after=False)

        if kind == "wait":
            return WaitAction(time=data.get("ms", 1000))

        if kind == "move":
            px, py = data.get("x", 0), data.get("y", 0)
            return MoveAction(point=Point(x=px, y=py))

        if kind == "keypress":
            raw_keys = data.get("keys", [])
            # A single key may arrive as a bare string instead of a list.
            key_list = [raw_keys] if isinstance(raw_keys, str) else raw_keys
            return PressAction(keys=[self._map_key(name) for name in key_list])

        if kind == "drag":
            waypoints = [
                Point(x=step.get("x", 0), y=step.get("y", 0))
                for step in data.get("path", [])
            ]
            return DragAction(path=waypoints)

        if kind == "screenshot":
            return ScreenshotFetch()

        if kind == "response":
            return ResponseAction(text=data.get("text", ""))

        if kind == "custom":
            return CustomAction(action=data.get("action", ""))

        raise ValueError(f"Unsupported action type: {kind}")

    def convert(self, data: Any) -> CLA:
        """Convert a ComputerAgent action to a HUD action.

        Args:
            data: Mapping with a ``"type"`` entry plus the fields that type needs;
                optional ``"reasoning"`` and ``"logs"`` entries are copied onto
                the resulting action.

        Returns:
            The HUD action built from *data*.

        Raises:
            ValueError: for any failure, including unsupported action types;
                the original exception is chained as the cause.
        """
        try:
            hud_action = self._build_action(data.get("type"), data)
            # Add reasoning and logs if available
            hud_action.reasoning = data.get("reasoning", "")
            hud_action.logs = data.get("logs", "")
            return hud_action
        except Exception as e:
            raise ValueError(f"Invalid action: {data}. Error: {e!s}") from e

View File

@@ -0,0 +1,231 @@
"""HUD ComputerAgent wrapper for OSWorld benchmarking."""
import logging
from typing import Any, Literal, Optional, Union, List, Dict
import asyncio
from agent import ComputerAgent as BaseComputerAgent
from hud.adapters import Adapter
from hud.agent.base import Agent
from hud.utils.common import Observation
from hud.adapters.common.types import LogType
from hud.types import Gym
from .adapter import ComputerAgentAdapter
from .computer_handler import HUDComputerHandler
logger = logging.getLogger(__name__)
class ComputerAgent(Agent[BaseComputerAgent, dict[str, Any]]):
    """
    A ComputerAgent wrapper for HUD integration.

    This agent wraps the base ComputerAgent to work with HUD environments,
    providing the same interface as OperatorAgent but using ComputerAgent internally.
    """

    transfer_gyms: dict[Gym, Gym] = {"qa": "hud-browser"}

    def __init__(
        self,
        model: str = "anthropic/claude-3-5-sonnet-20241022",
        environment: Literal["windows", "mac", "linux", "browser"] = "browser",
        adapter: Optional[Adapter] = None,
        name: Optional[str] = None,
        **kwargs: Any,
    ):
        """
        Initialize the ComputerAgent for HUD.

        Args:
            model: The model string for ComputerAgent (e.g., "anthropic/claude-3-5-sonnet-20241022")
            environment: The environment type (windows, mac, linux, browser)
            adapter: The adapter to use for preprocessing and postprocessing
            name: The name of the agent
            **kwargs: Additional arguments passed to ComputerAgent
        """
        # Create adapter if not provided
        adapter = adapter or ComputerAgentAdapter()

        if name is None:
            name = f"computeragent-{model.split('/')[-1]}"

        # Initialize the base Agent class without client (we'll create it later)
        super().__init__(client=None, adapter=adapter, name=name)

        self.model = model
        self.environment = environment
        self.kwargs = kwargs

        # Take the screen size from the adapter, falling back to the defaults
        # when a caller-supplied adapter does not expose these attributes.
        self.width = getattr(self.adapter, "agent_width", 1024)
        self.height = getattr(self.adapter, "agent_height", 768)

        # Create HUD computer handler
        self.hud_computer = HUDComputerHandler(
            environment=environment,
            dimensions=(self.width, self.height),
        )

        # Initialize ComputerAgent with HUD computer handler
        self.computer_agent = BaseComputerAgent(
            model=model,
            tools=[self.hud_computer],
            **kwargs,
        )

        # Set the client to the computer_agent for compatibility
        self.client = self.computer_agent

        # State tracking
        self.conversation_history: List[Dict[str, Any]] = []
        self.initial_prompt: Optional[str] = None

        # System prompt for computer use tasks
        self.base_system_prompt = """
You are an autonomous computer-using agent. Follow these guidelines:
1. Be decisive and complete tasks without asking for confirmation unless absolutely necessary.
2. If you need user confirmation for safety-critical actions, use the formal safety check mechanism.
3. Do NOT ask questions like "Should I proceed?" or "Would you like me to continue?" - just proceed with the task.
4. When you find what you're looking for (e.g., a file to upload), proceed with the action directly.
5. Only stop when the task is fully complete or if you encounter an error that prevents completion.
6. Trust that the user wants you to complete the entire task they've requested.
Remember: You have been given permission to complete the requested task autonomously.
"""

    @staticmethod
    def _output_text_items(msg: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Return the ``output_text`` parts of *msg*, tolerating non-list content."""
        content = msg.get("content", [])
        if isinstance(content, str):
            # Plain-string content is treated as a single text part.
            return [{"type": "output_text", "text": content}]
        if not isinstance(content, list):
            return []
        return [
            part
            for part in content
            if isinstance(part, dict) and part.get("type") == "output_text"
        ]

    def _latest_assistant_items(self) -> List[Dict[str, Any]]:
        """Return the text parts of the most recent assistant message, if any."""
        for msg in reversed(self.conversation_history):
            if msg.get("type") == "message" and msg.get("role") == "assistant":
                return self._output_text_items(msg)
        return []

    def _latest_reasoning(self) -> str:
        """Return the newest reasoning summary or assistant text, whichever is later."""
        for msg in reversed(self.conversation_history):
            msg_type = msg.get("type")
            if msg_type == "reasoning" and msg.get("summary"):
                return " ".join(
                    part.get("text", "")
                    for part in msg["summary"]
                    if part.get("type") == "summary_text"
                )
            if msg_type == "message" and msg.get("role") == "assistant":
                return " ".join(
                    item.get("text") or "" for item in self._output_text_items(msg)
                )
        return ""

    async def fetch_response(self, observation: Observation) -> tuple[list[dict[str, Any]], bool]:
        """
        Fetch a response from ComputerAgent based on the observation.

        Args:
            observation: The preprocessed observation

        Returns:
            tuple[list[dict[str, Any]], bool]: The list of raw actions and a
            boolean indicating if the agent believes the task is complete.
            Failures are not raised; they are reported as a single
            ``response`` action with the flag set to True.
        """
        try:
            # Update the computer handler with the current screenshot
            if observation.screenshot:
                self.hud_computer.update_screenshot(observation.screenshot)

            # Actions are not executed here: the handler hands each one to this
            # callback so it can be returned to HUD.
            # NOTE(review): every handler call is captured, including types such
            # as "get_current_url" that ComputerAgentAdapter.convert does not
            # accept -- confirm whether those should be filtered out.
            captured_actions: List[Dict[str, Any]] = []

            async def action_callback(action: Dict[str, Any]) -> None:
                """Callback to capture actions from ComputerAgent."""
                captured_actions.append(action)

            self.hud_computer.set_action_callback(action_callback)

            # Prepare the message for ComputerAgent
            if not self.conversation_history:
                # First interaction - use the observation text as initial prompt
                if observation.text:
                    self.initial_prompt = observation.text
                    message = f"{self.base_system_prompt}\n\nTask: {observation.text}"
                else:
                    message = f"{self.base_system_prompt}\n\nPlease analyze the current screen and determine what action to take."
            else:
                # Subsequent interactions - add context about the current state
                message = "Continue with the task based on the current screen state."
            self.conversation_history.append({"role": "user", "content": message})

            # Run ComputerAgent
            try:
                # ComputerAgent.run returns an async generator; drain it fully
                # before inspecting what was captured.
                async for result in self.computer_agent.run(self.conversation_history, stream=False):
                    # Update conversation history with the output
                    self.conversation_history += result["output"]

                logs = {"conversation_length": len(self.conversation_history)}
                assistant_items = self._latest_assistant_items()

                if captured_actions:
                    # Add reasoning and logs to each action
                    reasoning = self._latest_reasoning()
                    for action in captured_actions:
                        action["reasoning"] = reasoning
                        action["logs"] = dict(logs)

                    # The task counts as done only when the latest assistant
                    # message explicitly says so.
                    done = any(
                        "task completed" in (item.get("text") or "").lower()
                        for item in assistant_items
                    )
                    return captured_actions, done

                # No actions captured, task is likely complete
                response_text = "Task completed."
                if assistant_items:
                    response_text = assistant_items[0].get("text", response_text)

                response_action: Dict[str, Any] = {
                    "type": "response",
                    "text": response_text,
                    "reasoning": response_text,
                    "logs": logs,
                }

                # Check if this indicates task completion or failure
                if "task is infeasible" in (response_text or "").lower():
                    response_action = {"type": "custom", "action": "FAIL"}

                return [response_action], True

            except Exception as e:
                logger.exception("Error running ComputerAgent: %s", e)
                # Return an error response
                error_action = {
                    "type": "response",
                    "text": f"Error occurred: {str(e)}",
                    "reasoning": f"ComputerAgent encountered an error: {str(e)}",
                    "logs": {"error": str(e)},
                }
                return [error_action], True

        except Exception as e:
            logger.exception("Error in fetch_response: %s", e)
            error_action = {
                "type": "response",
                "text": f"Error in agent processing: {str(e)}",
                "reasoning": f"Agent processing error: {str(e)}",
                "logs": {"error": str(e)},
            }
            return [error_action], True

View File

@@ -0,0 +1,187 @@
"""HUD Computer Handler for ComputerAgent integration."""
import base64
from io import BytesIO
from typing import Literal, Optional, Any, Dict, Callable
from PIL import Image
from agent.computers import AsyncComputerHandler
class HUDComputerHandler(AsyncComputerHandler):
    """Computer handler that interfaces with HUD environment.

    Every input method packs its arguments into an action dictionary and
    passes it to the configured action callback; with no callback set the
    call does nothing. Screenshots come from the screenshot callback, the
    last stored screenshot, or a generated blank image, in that order.
    """

    def __init__(
        self,
        environment: Literal["windows", "mac", "linux", "browser"] = "browser",
        dimensions: tuple[int, int] = (1024, 768),
        screenshot_callback: Optional[Callable] = None,
        action_callback: Optional[Callable] = None,
    ):
        """
        Initialize HUD computer handler.

        Args:
            environment: The environment type for HUD
            dimensions: Screen dimensions as (width, height)
            screenshot_callback: Optional callback to get screenshots from HUD environment
            action_callback: Optional callback to execute actions in HUD environment
        """
        super().__init__()
        self._environment = environment
        self._dimensions = dimensions
        self._screenshot_callback = screenshot_callback
        self._action_callback = action_callback

        # Store the last screenshot for reuse
        self._last_screenshot: Optional[str] = None

    def set_screenshot_callback(self, callback: Callable) -> None:
        """Set the screenshot callback."""
        self._screenshot_callback = callback

    def set_action_callback(self, callback: Callable) -> None:
        """Set the action callback."""
        self._action_callback = callback

    def update_screenshot(self, screenshot: str) -> None:
        """Update the stored screenshot (base64 string)."""
        self._last_screenshot = screenshot

    async def _emit(self, action: Dict[str, Any]) -> Any:
        """Pass *action* to the action callback and return its result.

        Returns None without doing anything when no callback is configured.
        """
        if not self._action_callback:
            return None
        return await self._action_callback(action)

    @staticmethod
    def _encode_png(image: Any) -> str:
        """Serialize a PIL image to PNG and return it base64-encoded."""
        buffer = BytesIO()
        image.save(buffer, format="PNG")
        return base64.b64encode(buffer.getvalue()).decode()

    @classmethod
    def _to_base64(cls, screenshot: Any) -> Optional[str]:
        """Normalize a callback result to a base64 string, or None if unsupported."""
        if isinstance(screenshot, str):
            return screenshot
        if isinstance(screenshot, Image.Image):
            return cls._encode_png(screenshot)
        if isinstance(screenshot, (bytes, bytearray)):
            return base64.b64encode(screenshot).decode()
        return None

    async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
        """Get the current environment type."""
        return self._environment  # type: ignore

    async def get_dimensions(self) -> tuple[int, int]:
        """Get screen dimensions as (width, height)."""
        return self._dimensions

    async def screenshot(self) -> str:
        """Take a screenshot and return as base64 string.

        The screenshot callback may return a base64 string, a PIL image, or raw
        bytes. Any other result (or no callback) falls back to the last stored
        screenshot, and finally to a blank white image of the configured size.
        The returned value is remembered as the last screenshot.
        """
        if self._screenshot_callback:
            encoded = self._to_base64(await self._screenshot_callback())
            if encoded is not None:
                self._last_screenshot = encoded
                return encoded

        # Return last screenshot if available, otherwise create a blank one
        if self._last_screenshot:
            return self._last_screenshot

        blank_image = Image.new("RGB", self._dimensions, color="white")
        self._last_screenshot = self._encode_png(blank_image)
        return self._last_screenshot

    async def click(self, x: int, y: int, button: str = "left") -> None:
        """Click at coordinates with specified button."""
        await self._emit({"type": "click", "x": x, "y": y, "button": button})

    async def double_click(self, x: int, y: int) -> None:
        """Double click at coordinates."""
        await self._emit({"type": "double_click", "x": x, "y": y})

    async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
        """Scroll at coordinates with specified scroll amounts."""
        await self._emit({
            "type": "scroll",
            "x": x,
            "y": y,
            "scroll_x": scroll_x,
            "scroll_y": scroll_y,
        })

    async def type(self, text: str) -> None:
        """Type text."""
        await self._emit({"type": "type", "text": text})

    async def wait(self, ms: int = 1000) -> None:
        """Wait for specified milliseconds."""
        await self._emit({"type": "wait", "ms": ms})

    async def move(self, x: int, y: int) -> None:
        """Move cursor to coordinates."""
        await self._emit({"type": "move", "x": x, "y": y})

    async def keypress(self, keys: list[str] | str) -> None:
        """Press key combination; a single key may be given as a bare string."""
        if isinstance(keys, str):
            keys = [keys]
        await self._emit({"type": "keypress", "keys": keys})

    async def drag(self, path: list[dict[str, int]]) -> None:
        """Drag along a path of points."""
        await self._emit({"type": "drag", "path": path})

    async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
        """Left mouse down at coordinates."""
        await self._emit({"type": "left_mouse_down", "x": x, "y": y})

    async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
        """Left mouse up at coordinates."""
        await self._emit({"type": "left_mouse_up", "x": x, "y": y})

    async def get_current_url(self) -> str:
        """Get the current URL.

        Returns the action callback's result when it is a string, and an empty
        string otherwise, so a callback that returns None cannot leak a
        non-string value to the caller.
        """
        result = await self._emit({"type": "get_current_url"})
        return result if isinstance(result, str) else ""