mirror of
https://github.com/trycua/computer.git
synced 2026-01-03 12:00:00 -06:00
Added HUD integration
This commit is contained in:
@@ -59,6 +59,8 @@ class AsyncComputerHandler(Protocol):
|
||||
"""Get current URL (for browser environments)."""
|
||||
...
|
||||
|
||||
# ==== Anthropic Action Space ====
|
||||
|
||||
async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
    """Left mouse down at coordinates.

    Args:
        x: Horizontal coordinate of the press; defaults to ``None``.
        y: Vertical coordinate of the press; defaults to ``None``.

    NOTE(review): presumably ``None`` means "use the current pointer
    position" -- confirm against the concrete handlers.
    """
    ...
|
||||
|
||||
7
libs/python/agent/agent/integrations/hud/__init__.py
Normal file
7
libs/python/agent/agent/integrations/hud/__init__.py
Normal file
@@ -0,0 +1,7 @@
|
||||
"""HUD integration for ComputerAgent."""
|
||||
|
||||
from .agent import ComputerAgent
|
||||
from .adapter import ComputerAgentAdapter
|
||||
from .computer_handler import HUDComputerHandler
|
||||
|
||||
__all__ = ["ComputerAgent", "ComputerAgentAdapter", "HUDComputerHandler"]
|
||||
121
libs/python/agent/agent/integrations/hud/adapter.py
Normal file
121
libs/python/agent/agent/integrations/hud/adapter.py
Normal file
@@ -0,0 +1,121 @@
|
||||
"""HUD Adapter for ComputerAgent integration."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, ClassVar
|
||||
|
||||
from hud.adapters.common import CLA, Adapter
|
||||
from hud.adapters.common.types import (
|
||||
CLAButton,
|
||||
CLAKey,
|
||||
ClickAction,
|
||||
CustomAction,
|
||||
DragAction,
|
||||
MoveAction,
|
||||
Point,
|
||||
PressAction,
|
||||
ResponseAction,
|
||||
ScreenshotFetch,
|
||||
ScrollAction,
|
||||
TypeAction,
|
||||
WaitAction,
|
||||
)
|
||||
|
||||
|
||||
class ComputerAgentAdapter(Adapter):
    """Translate ComputerAgent action dictionaries into HUD CLA actions."""

    # Key-name aliases; lookups are done on the lower-cased key name.
    KEY_MAP: ClassVar[dict[str, CLAKey]] = {
        "return": "enter",
        "arrowup": "up",
        "arrowdown": "down",
        "arrowleft": "left",
        "arrowright": "right",
        "cmd": "ctrl",
        "super": "win",
        "meta": "win",
    }

    # Mouse-button aliases; names not listed here are passed through as-is.
    BUTTON_MAP: ClassVar[dict[str, CLAButton]] = {
        "wheel": "middle",
        "middle": "middle",
    }

    def __init__(self) -> None:
        """Set up the base adapter and the default agent screen size."""
        super().__init__()
        # ComputerAgent default dimensions (can be overridden)
        self.agent_width = 1024
        self.agent_height = 768

    def _map_key(self, key: str) -> CLAKey:
        """Return the alias for *key* from ``KEY_MAP``, or *key* lower-cased."""
        lowered = key.lower()
        return self.KEY_MAP.get(lowered, lowered)  # type: ignore

    @staticmethod
    def _point_of(source: Any) -> Point:
        """Build a ``Point`` from the ``x``/``y`` entries of *source* (default 0)."""
        return Point(x=source.get("x", 0), y=source.get("y", 0))

    def _build_action(self, data: Any) -> CLA:
        """Create the CLA action matching ``data["type"]``.

        Raises:
            ValueError: If the action type is not one of the supported ones.
        """
        kind = data.get("type")

        if kind == "click":
            requested = data.get("button", "left")
            button = self.BUTTON_MAP.get(requested, requested)
            if button is None:
                button = "left"
            return ClickAction(point=self._point_of(data), button=button)

        if kind == "double_click":
            return ClickAction(point=self._point_of(data), button="left", pattern=[100])

        if kind == "scroll":
            at_x, at_y = int(data.get("x", 0)), int(data.get("y", 0))
            delta_x = int(data.get("scroll_x", 0))
            delta_y = int(data.get("scroll_y", 0))
            return ScrollAction(
                point=Point(x=at_x, y=at_y),
                scroll=Point(x=delta_x, y=delta_y),
            )

        if kind == "type":
            return TypeAction(text=data.get("text", ""), enter_after=False)

        if kind == "wait":
            return WaitAction(time=data.get("ms", 1000))

        if kind == "move":
            return MoveAction(point=self._point_of(data))

        if kind == "keypress":
            pressed = data.get("keys", [])
            if isinstance(pressed, str):
                # A single key name is treated as a one-element combination.
                pressed = [pressed]
            return PressAction(keys=[self._map_key(name) for name in pressed])

        if kind == "drag":
            waypoints = [self._point_of(step) for step in data.get("path", [])]
            return DragAction(path=waypoints)

        if kind == "screenshot":
            return ScreenshotFetch()

        if kind == "response":
            return ResponseAction(text=data.get("text", ""))

        if kind == "custom":
            return CustomAction(action=data.get("action", ""))

        raise ValueError(f"Unsupported action type: {kind}")

    def convert(self, data: Any) -> CLA:
        """Convert a ComputerAgent action to a HUD action.

        Args:
            data: Action mapping with a ``type`` entry plus type-specific
                fields; optional ``reasoning`` and ``logs`` entries are
                copied onto the resulting action.

        Returns:
            The HUD action built from *data*.

        Raises:
            ValueError: If the action type is unsupported or any step of the
                conversion fails; the original exception is chained.
        """
        try:
            action = self._build_action(data)
            # Add reasoning and logs if available
            action.reasoning = data.get("reasoning", "")
            action.logs = data.get("logs", "")
            return action
        except Exception as e:
            raise ValueError(f"Invalid action: {data}. Error: {e!s}") from e
|
||||
231
libs/python/agent/agent/integrations/hud/agent.py
Normal file
231
libs/python/agent/agent/integrations/hud/agent.py
Normal file
@@ -0,0 +1,231 @@
|
||||
"""HUD ComputerAgent wrapper for OSWorld benchmarking."""
|
||||
|
||||
import logging
|
||||
from typing import Any, Literal, Optional, Union, List, Dict
|
||||
import asyncio
|
||||
|
||||
from agent import ComputerAgent as BaseComputerAgent
|
||||
from hud.adapters import Adapter
|
||||
from hud.agent.base import Agent
|
||||
from hud.utils.common import Observation
|
||||
from hud.adapters.common.types import LogType
|
||||
from hud.types import Gym
|
||||
|
||||
from .adapter import ComputerAgentAdapter
|
||||
from .computer_handler import HUDComputerHandler
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ComputerAgent(Agent[BaseComputerAgent, dict[str, Any]]):
    """
    A ComputerAgent wrapper for HUD integration.

    This agent wraps the base ComputerAgent to work with HUD environments,
    providing the same interface as OperatorAgent but using ComputerAgent internally.

    The wrapped agent is given a ``HUDComputerHandler`` as its only tool. The
    handler does not perform actions itself: it forwards each one to a callback
    installed by ``fetch_response``, which collects them and returns them as raw
    action dictionaries.
    """

    transfer_gyms: dict[Gym, Gym] = {"qa": "hud-browser"}

    def __init__(
        self,
        model: str = "anthropic/claude-3-5-sonnet-20241022",
        environment: Literal["windows", "mac", "linux", "browser"] = "browser",
        adapter: Optional[Adapter] = None,
        name: Optional[str] = None,
        **kwargs: Any,
    ):
        """
        Initialize the ComputerAgent for HUD.

        Args:
            model: The model string for ComputerAgent (e.g., "anthropic/claude-3-5-sonnet-20241022")
            environment: The environment type (windows, mac, linux, browser)
            adapter: The adapter to use for preprocessing and postprocessing;
                a ``ComputerAgentAdapter`` is created when omitted
            name: The name of the agent; defaults to ``computeragent-<model>``
                using the part of ``model`` after the last ``/``
            **kwargs: Additional arguments passed to ComputerAgent
        """
        # Create adapter if not provided
        adapter = adapter or ComputerAgentAdapter()

        if name is None:
            name = f"computeragent-{model.split('/')[-1]}"

        # Initialize the base Agent class without client (we'll create it later)
        super().__init__(client=None, adapter=adapter, name=name)

        self.model = model
        self.environment = environment
        self.kwargs = kwargs

        # Default dimensions
        self.width = 1024
        self.height = 768

        # Update dimensions if adapter is provided
        if self.adapter:
            self.width = self.adapter.agent_width
            self.height = self.adapter.agent_height

        # Create HUD computer handler
        self.hud_computer = HUDComputerHandler(
            environment=environment,
            dimensions=(self.width, self.height),
        )

        # Initialize ComputerAgent with HUD computer handler
        self.computer_agent = BaseComputerAgent(
            model=model,
            tools=[self.hud_computer],
            **kwargs,
        )

        # Set the client to the computer_agent for compatibility
        self.client = self.computer_agent

        # State tracking
        self.conversation_history: List[Dict[str, Any]] = []
        self.initial_prompt: Optional[str] = None

        # System prompt for computer use tasks
        self.base_system_prompt = """
You are an autonomous computer-using agent. Follow these guidelines:

1. Be decisive and complete tasks without asking for confirmation unless absolutely necessary.
2. If you need user confirmation for safety-critical actions, use the formal safety check mechanism.
3. Do NOT ask questions like "Should I proceed?" or "Would you like me to continue?" - just proceed with the task.
4. When you find what you're looking for (e.g., a file to upload), proceed with the action directly.
5. Only stop when the task is fully complete or if you encounter an error that prevents completion.
6. Trust that the user wants you to complete the entire task they've requested.

Remember: You have been given permission to complete the requested task autonomously.
"""

    @staticmethod
    def _output_texts(msg: Dict[str, Any]) -> List[str]:
        """Return the ``output_text`` strings carried by a message.

        ``content`` is normally a list of typed parts, but a plain string is
        accepted too so that such a message cannot crash the history scans.
        """
        content = msg.get("content", [])
        if isinstance(content, str):
            return [content]
        if not isinstance(content, list):
            return []
        return [
            part["text"]
            for part in content
            if isinstance(part, dict)
            and part.get("type") == "output_text"
            and isinstance(part.get("text"), str)
        ]

    def _latest_assistant_texts(self) -> List[str]:
        """Return the output texts of the most recent assistant message, if any."""
        for msg in reversed(self.conversation_history):
            if msg.get("type") == "message" and msg.get("role") == "assistant":
                return self._output_texts(msg)
        return []

    def _latest_reasoning(self) -> str:
        """Return the newest reasoning summary, or else the newest assistant text.

        The history is scanned backwards and the first reasoning item (with a
        non-empty summary) or assistant message encountered wins.
        """
        for msg in reversed(self.conversation_history):
            if msg.get("type") == "reasoning" and msg.get("summary"):
                return " ".join(
                    s.get("text", "") for s in msg["summary"] if s.get("type") == "summary_text"
                )
            if msg.get("type") == "message" and msg.get("role") == "assistant":
                return " ".join(self._output_texts(msg))
        return ""

    def _next_user_message(self, observation: Observation) -> str:
        """Build the user message for this turn.

        On the first turn the system prompt is prepended and, when the
        observation carries text, that text is stored as ``initial_prompt``.
        """
        if self.conversation_history:
            # Subsequent interactions - add context about the current state
            return "Continue with the task based on the current screen state."

        # First interaction - use the observation text as initial prompt
        if observation.text:
            self.initial_prompt = observation.text
            return f"{self.base_system_prompt}\n\nTask: {observation.text}"
        return f"{self.base_system_prompt}\n\nPlease analyze the current screen and determine what action to take."

    async def fetch_response(self, observation: Observation) -> tuple[list[dict[str, Any]], bool]:
        """
        Fetch a response from ComputerAgent based on the observation.

        Args:
            observation: The preprocessed observation

        Returns:
            tuple[list[dict[str, Any]], bool]: The list of raw actions and a
            boolean indicating if the agent believes the task is complete.
            Errors are not raised; they are logged and reported as a single
            ``response`` action with the flag set to ``True``.
        """
        try:
            # Update the computer handler with the current screenshot
            if observation.screenshot:
                self.hud_computer.update_screenshot(observation.screenshot)

            # Actions issued by ComputerAgent during this call end up here.
            captured_actions: List[Dict[str, Any]] = []

            async def action_callback(action: Dict[str, Any]) -> None:
                """Callback to capture actions from ComputerAgent."""
                # The handler also routes URL queries through this callback;
                # they are not environment actions and the adapter cannot
                # convert them, so they must not be recorded.
                if action.get("type") == "get_current_url":
                    return
                captured_actions.append(action)

            # Set the action callback
            self.hud_computer.set_action_callback(action_callback)

            # Prepare the message for ComputerAgent
            message = self._next_user_message(observation)
            self.conversation_history.append({"role": "user", "content": message})

            # Run ComputerAgent
            try:
                # ComputerAgent.run returns an async generator
                # NOTE(review): the generator is drained before the captured
                # actions are inspected -- confirm that this is the intended
                # stepping behaviour.
                async for result in self.computer_agent.run(self.conversation_history, stream=False):
                    # Update conversation history with the output
                    self.conversation_history += result["output"]

                logs = {"conversation_length": len(self.conversation_history)}

                if captured_actions:
                    # Add reasoning and logs to each action
                    reasoning = self._latest_reasoning()
                    for action in captured_actions:
                        action["reasoning"] = reasoning
                        action["logs"] = {"conversation_length": len(self.conversation_history)}

                    # Task is done when the latest assistant message says so
                    done = any(
                        "task completed" in text.lower()
                        for text in self._latest_assistant_texts()
                    )
                    return captured_actions, done

                # No actions captured, task is likely complete
                texts = self._latest_assistant_texts()
                response_text = texts[0] if texts else "Task completed."

                response_action: Dict[str, Any] = {
                    "type": "response",
                    "text": response_text,
                    "reasoning": response_text,
                    "logs": logs,
                }

                # Check if this indicates task completion or failure
                if "task is infeasible" in response_text.lower():
                    response_action = {"type": "custom", "action": "FAIL"}

                return [response_action], True

            except Exception as e:
                # logger.exception keeps the traceback alongside the message
                logger.exception("Error running ComputerAgent: %s", e)
                # Return an error response
                error_action = {
                    "type": "response",
                    "text": f"Error occurred: {str(e)}",
                    "reasoning": f"ComputerAgent encountered an error: {str(e)}",
                    "logs": {"error": str(e)},
                }
                return [error_action], True

        except Exception as e:
            logger.exception("Error in fetch_response: %s", e)
            error_action = {
                "type": "response",
                "text": f"Error in agent processing: {str(e)}",
                "reasoning": f"Agent processing error: {str(e)}",
                "logs": {"error": str(e)},
            }
            return [error_action], True
|
||||
187
libs/python/agent/agent/integrations/hud/computer_handler.py
Normal file
187
libs/python/agent/agent/integrations/hud/computer_handler.py
Normal file
@@ -0,0 +1,187 @@
|
||||
"""HUD Computer Handler for ComputerAgent integration."""
|
||||
|
||||
import base64
|
||||
from io import BytesIO
|
||||
from typing import Literal, Optional, Any, Dict, Callable
|
||||
from PIL import Image
|
||||
|
||||
from agent.computers import AsyncComputerHandler
|
||||
|
||||
|
||||
class HUDComputerHandler(AsyncComputerHandler):
    """Computer handler that interfaces with HUD environment.

    Input methods (click, type, scroll, ...) do not act on a machine
    themselves: each one describes the action as a dict with a ``type`` key
    and hands it to the action callback. Without a callback they do nothing.
    """

    def __init__(
        self,
        environment: Literal["windows", "mac", "linux", "browser"] = "browser",
        dimensions: tuple[int, int] = (1024, 768),
        screenshot_callback: Optional[Callable] = None,
        action_callback: Optional[Callable] = None,
    ):
        """
        Initialize HUD computer handler.

        Args:
            environment: The environment type for HUD
            dimensions: Screen dimensions as (width, height)
            screenshot_callback: Optional callback to get screenshots from HUD environment;
                it is awaited and may return a base64 string, a PIL image or bytes
            action_callback: Optional callback to execute actions in HUD environment;
                it is awaited with the action dictionary
        """
        super().__init__()
        self._environment = environment
        self._dimensions = dimensions
        self._screenshot_callback = screenshot_callback
        self._action_callback = action_callback

        # Store the last screenshot for reuse
        self._last_screenshot: Optional[str] = None

    def set_screenshot_callback(self, callback: Callable) -> None:
        """Set the screenshot callback."""
        self._screenshot_callback = callback

    def set_action_callback(self, callback: Callable) -> None:
        """Set the action callback."""
        self._action_callback = callback

    def update_screenshot(self, screenshot: str) -> None:
        """Update the stored screenshot (base64 string)."""
        self._last_screenshot = screenshot

    @staticmethod
    def _encode_png(image: Image.Image) -> str:
        """Serialize *image* as PNG and return it base64-encoded."""
        buffer = BytesIO()
        image.save(buffer, format="PNG")
        return base64.b64encode(buffer.getvalue()).decode()

    async def _emit(self, action: Dict[str, Any]) -> Any:
        """Await the action callback with *action*; return ``None`` if unset."""
        if self._action_callback:
            return await self._action_callback(action)
        return None

    async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
        """Get the current environment type."""
        return self._environment  # type: ignore

    async def get_dimensions(self) -> tuple[int, int]:
        """Get screen dimensions as (width, height)."""
        return self._dimensions

    async def screenshot(self) -> str:
        """Take a screenshot and return as base64 string.

        The screenshot callback is tried first. If it is unset or returns an
        unsupported type, the last stored screenshot is returned; if there is
        none, a blank white image of the configured dimensions is generated.
        Whatever is returned is also stored as the last screenshot.
        """
        if self._screenshot_callback:
            shot = await self._screenshot_callback()
            encoded: Optional[str] = None
            if isinstance(shot, str):
                encoded = shot
            elif isinstance(shot, Image.Image):
                # Convert PIL Image to base64
                encoded = self._encode_png(shot)
            elif isinstance(shot, bytes):
                encoded = base64.b64encode(shot).decode()

            if encoded is not None:
                self._last_screenshot = encoded
                return encoded

        # Return last screenshot if available, otherwise create a blank one
        if self._last_screenshot:
            return self._last_screenshot

        # Create a blank screenshot as fallback
        blank_image = Image.new('RGB', self._dimensions, color='white')
        self._last_screenshot = self._encode_png(blank_image)
        return self._last_screenshot

    async def click(self, x: int, y: int, button: str = "left") -> None:
        """Click at coordinates with specified button."""
        await self._emit({"type": "click", "x": x, "y": y, "button": button})

    async def double_click(self, x: int, y: int) -> None:
        """Double click at coordinates."""
        await self._emit({"type": "double_click", "x": x, "y": y})

    async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
        """Scroll at coordinates with specified scroll amounts."""
        await self._emit({
            "type": "scroll",
            "x": x,
            "y": y,
            "scroll_x": scroll_x,
            "scroll_y": scroll_y,
        })

    async def type(self, text: str) -> None:
        """Type text."""
        await self._emit({"type": "type", "text": text})

    async def wait(self, ms: int = 1000) -> None:
        """Wait for specified milliseconds."""
        await self._emit({"type": "wait", "ms": ms})

    async def move(self, x: int, y: int) -> None:
        """Move cursor to coordinates."""
        await self._emit({"type": "move", "x": x, "y": y})

    async def keypress(self, keys: list[str] | str) -> None:
        """Press key combination; a single key name is wrapped in a list."""
        if isinstance(keys, str):
            keys = [keys]
        await self._emit({"type": "keypress", "keys": keys})

    async def drag(self, path: list[dict[str, int]]) -> None:
        """Drag along a path of points."""
        await self._emit({"type": "drag", "path": path})

    async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
        """Left mouse down at coordinates."""
        await self._emit({"type": "left_mouse_down", "x": x, "y": y})

    async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
        """Left mouse up at coordinates."""
        await self._emit({"type": "left_mouse_up", "x": x, "y": y})

    async def get_current_url(self) -> str:
        """Get the current URL.

        The query is sent through the action callback. Callbacks that only
        record actions return nothing, so any non-string result is reported
        as an empty string to honour the declared return type.
        """
        url = await self._emit({"type": "get_current_url"})
        return url if isinstance(url, str) else ""
|
||||
Reference in New Issue
Block a user