From d47ef3f1b66233c6e2781f486af5c048cf2b714e Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Fri, 8 Aug 2025 10:22:10 -0400 Subject: [PATCH] Added protocol for custom computer handlers --- libs/python/agent/agent/agent.py | 26 ++++--- libs/python/agent/agent/computers/__init__.py | 30 ++++++++ libs/python/agent/agent/computers/base.py | 68 +++++++++++++++++++ .../{computer_handler.py => computers/cua.py} | 10 +-- libs/python/agent/agent/types.py | 52 -------------- 5 files changed, 119 insertions(+), 67 deletions(-) create mode 100644 libs/python/agent/agent/computers/__init__.py create mode 100644 libs/python/agent/agent/computers/base.py rename libs/python/agent/agent/{computer_handler.py => computers/cua.py} (95%) diff --git a/libs/python/agent/agent/agent.py b/libs/python/agent/agent/agent.py index 79a4b9a6..48bd7b54 100644 --- a/libs/python/agent/agent/agent.py +++ b/libs/python/agent/agent/agent.py @@ -7,9 +7,8 @@ from typing import Dict, List, Any, Optional, AsyncGenerator, Union, cast, Calla from litellm.responses.utils import Usage -from .types import Messages, Computer, AgentCapability +from .types import Messages, AgentCapability from .decorators import find_agent_config -from .computer_handler import OpenAIComputerHandler, acknowledge_safety_check_callback, check_blocklisted_url import json import litellm import litellm.utils @@ -22,9 +21,13 @@ from .callbacks import ( BudgetManagerCallback, TelemetryCallback, ) +from .computers import ( + ComputerHandler, + make_computer_handler +) def get_json(obj: Any, max_depth: int = 10) -> Any: - def custom_serializer(o: Any, depth: int = 0, seen: Set[int] = None) -> Any: + def custom_serializer(o: Any, depth: int = 0, seen: Optional[Set[int]] = None) -> Any: if seen is None: seen = set() @@ -247,7 +250,7 @@ class ComputerAgent: computer_handler = None for schema in self.tool_schemas: if schema["type"] == "computer": - computer_handler = OpenAIComputerHandler(schema["computer"].interface) + computer_handler = make_computer_handler(schema["computer"]) break self.computer_handler = computer_handler @@ -398,7 +401,7 @@ class ComputerAgent: # AGENT OUTPUT PROCESSING # ============================================================================ - async def _handle_item(self, item: Any, computer: Optional[Computer] = None, ignore_call_ids: Optional[List[str]] = None) -> List[Dict[str, Any]]: + async def _handle_item(self, item: Any, computer: Optional[ComputerHandler] = None, ignore_call_ids: Optional[List[str]] = None) -> List[Dict[str, Any]]: """Handle each item; may cause a computer action + screenshot.""" if ignore_call_ids and item.get("call_id") and item.get("call_id") in ignore_call_ids: return [] @@ -450,10 +453,12 @@ class ComputerAgent: acknowledged_checks = [] for check in pending_checks: check_message = check.get("message", str(check)) - if acknowledge_safety_check_callback(check_message, allow_always=True): # TODO: implement a callback for safety checks - acknowledged_checks.append(check) - else: - raise ValueError(f"Safety check failed: {check_message}") + acknowledged_checks.append(check) + # TODO: implement a callback for safety checks + # if acknowledge_safety_check_callback(check_message, allow_always=True): + # acknowledged_checks.append(check) + # else: + # raise ValueError(f"Safety check failed: {check_message}") # Create call output call_output = { @@ -470,7 +475,8 @@ class ComputerAgent: if await computer.get_environment() == "browser": current_url = await computer.get_current_url() call_output["output"]["current_url"] = current_url - check_blocklisted_url(current_url) + # TODO: implement a callback for URL safety checks + # check_blocklisted_url(current_url) result = [call_output] await self._on_computer_call_end(item, result) diff --git a/libs/python/agent/agent/computers/__init__.py b/libs/python/agent/agent/computers/__init__.py new file mode 100644 index 00000000..e2c4a07a --- /dev/null +++ b/libs/python/agent/agent/computers/__init__.py @@ -0,0 +1,30 @@ +""" +Computer handler factory and interface definitions. + +This module provides a factory function to create computer handlers from different +computer interface types, supporting both the ComputerHandler protocol and the +Computer library interface. +""" + +from .base import ComputerHandler +from .cua import cuaComputerHandler +from computer import Computer + +def make_computer_handler(computer): + """ + Create a computer handler from a computer interface. + + Args: + computer: Either a ComputerHandler instance or a Computer instance + + Returns: + ComputerHandler: A computer handler instance + + Raises: + ValueError: If the computer type is not supported + """ + if isinstance(computer, ComputerHandler): + return computer + if isinstance(computer, Computer): + return cuaComputerHandler(computer) + raise ValueError(f"Unsupported computer type: {type(computer)}") \ No newline at end of file diff --git a/libs/python/agent/agent/computers/base.py b/libs/python/agent/agent/computers/base.py new file mode 100644 index 00000000..161d9fb8 --- /dev/null +++ b/libs/python/agent/agent/computers/base.py @@ -0,0 +1,68 @@ +""" +Base computer interface protocol for agent interactions. +""" + +from typing import Protocol, Literal, List, Dict, Any, Union, Optional, runtime_checkable + + +@runtime_checkable +class ComputerHandler(Protocol): + """Protocol defining the interface for computer interactions.""" + + # ==== Computer-Use-Preview Action Space ==== + + async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]: + """Get the current environment type.""" + ... + + async def get_dimensions(self) -> tuple[int, int]: + """Get screen dimensions as (width, height).""" + ... + + async def screenshot(self) -> str: + """Take a screenshot and return as base64 string.""" + ... + + async def click(self, x: int, y: int, button: str = "left") -> None: + """Click at coordinates with specified button.""" + ... + + async def double_click(self, x: int, y: int) -> None: + """Double click at coordinates.""" + ... + + async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None: + """Scroll at coordinates with specified scroll amounts.""" + ... + + async def type(self, text: str) -> None: + """Type text.""" + ... + + async def wait(self, ms: int = 1000) -> None: + """Wait for specified milliseconds.""" + ... + + async def move(self, x: int, y: int) -> None: + """Move cursor to coordinates.""" + ... + + async def keypress(self, keys: Union[List[str], str]) -> None: + """Press key combination.""" + ... + + async def drag(self, path: List[Dict[str, int]]) -> None: + """Drag along specified path.""" + ... + + async def get_current_url(self) -> str: + """Get current URL (for browser environments).""" + ... + + async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None: + """Left mouse down at coordinates.""" + ... + + async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None: + """Left mouse up at coordinates.""" + ... diff --git a/libs/python/agent/agent/computer_handler.py b/libs/python/agent/agent/computers/cua.py similarity index 95% rename from libs/python/agent/agent/computer_handler.py rename to libs/python/agent/agent/computers/cua.py index 53de49ed..30663116 100644 --- a/libs/python/agent/agent/computer_handler.py +++ b/libs/python/agent/agent/computers/cua.py @@ -4,15 +4,15 @@ Computer handler implementation for OpenAI computer-use-preview protocol. import base64 from typing import Dict, List, Any, Literal, Union, Optional -from .types import Computer +from .base import ComputerHandler +from computer import Computer - -class OpenAIComputerHandler: +class cuaComputerHandler(ComputerHandler): """Computer handler that implements the Computer protocol using the computer interface.""" - def __init__(self, computer_interface): + def __init__(self, cua_computer: Computer): """Initialize with a computer interface (from tool schema).""" - self.interface = computer_interface + self.interface = cua_computer.interface # ==== Computer-Use-Preview Action Space ==== diff --git a/libs/python/agent/agent/types.py b/libs/python/agent/agent/types.py index 881e1c20..c56a9e5c 100644 --- a/libs/python/agent/agent/types.py +++ b/libs/python/agent/agent/types.py @@ -27,55 +27,3 @@ class AgentConfigInfo(BaseModel): def matches_model(self, model: str) -> bool: """Check if this agent config matches the given model""" return bool(re.match(self.models_regex, model)) - -# Computer tool interface -class Computer(Protocol): - """Protocol defining the interface for computer interactions.""" - - async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]: - """Get the current environment type.""" - ... - - async def get_dimensions(self) -> tuple[int, int]: - """Get screen dimensions as (width, height).""" - ... - - async def screenshot(self) -> str: - """Take a screenshot and return as base64 string.""" - ... - - async def click(self, x: int, y: int, button: str = "left") -> None: - """Click at coordinates with specified button.""" - ... - - async def double_click(self, x: int, y: int) -> None: - """Double click at coordinates.""" - ... - - async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None: - """Scroll at coordinates with specified scroll amounts.""" - ... - - async def type(self, text: str) -> None: - """Type text.""" - ... - - async def wait(self, ms: int = 1000) -> None: - """Wait for specified milliseconds.""" - ... - - async def move(self, x: int, y: int) -> None: - """Move cursor to coordinates.""" - ... - - async def keypress(self, keys: List[str]) -> None: - """Press key combination.""" - ... - - async def drag(self, path: List[Dict[str, int]]) -> None: - """Drag along specified path.""" - ... - - async def get_current_url(self) -> str: - """Get current URL (for browser environments).""" - ...