Added protocol for custom computer handlers

This commit is contained in:
Dillon DuPont
2025-08-08 10:22:10 -04:00
parent a78a0e1e58
commit d47ef3f1b6
5 changed files with 119 additions and 67 deletions

View File

@@ -7,9 +7,8 @@ from typing import Dict, List, Any, Optional, AsyncGenerator, Union, cast, Calla
from litellm.responses.utils import Usage
from .types import Messages, Computer, AgentCapability
from .types import Messages, AgentCapability
from .decorators import find_agent_config
from .computer_handler import OpenAIComputerHandler, acknowledge_safety_check_callback, check_blocklisted_url
import json
import litellm
import litellm.utils
@@ -22,9 +21,13 @@ from .callbacks import (
BudgetManagerCallback,
TelemetryCallback,
)
from .computers import (
ComputerHandler,
make_computer_handler
)
def get_json(obj: Any, max_depth: int = 10) -> Any:
def custom_serializer(o: Any, depth: int = 0, seen: Set[int] = None) -> Any:
def custom_serializer(o: Any, depth: int = 0, seen: Optional[Set[int]] = None) -> Any:
if seen is None:
seen = set()
@@ -247,7 +250,7 @@ class ComputerAgent:
computer_handler = None
for schema in self.tool_schemas:
if schema["type"] == "computer":
computer_handler = OpenAIComputerHandler(schema["computer"].interface)
computer_handler = make_computer_handler(schema["computer"])
break
self.computer_handler = computer_handler
@@ -398,7 +401,7 @@ class ComputerAgent:
# AGENT OUTPUT PROCESSING
# ============================================================================
async def _handle_item(self, item: Any, computer: Optional[Computer] = None, ignore_call_ids: Optional[List[str]] = None) -> List[Dict[str, Any]]:
async def _handle_item(self, item: Any, computer: Optional[ComputerHandler] = None, ignore_call_ids: Optional[List[str]] = None) -> List[Dict[str, Any]]:
"""Handle each item; may cause a computer action + screenshot."""
if ignore_call_ids and item.get("call_id") and item.get("call_id") in ignore_call_ids:
return []
@@ -450,10 +453,12 @@ class ComputerAgent:
acknowledged_checks = []
for check in pending_checks:
check_message = check.get("message", str(check))
if acknowledge_safety_check_callback(check_message, allow_always=True): # TODO: implement a callback for safety checks
acknowledged_checks.append(check)
else:
raise ValueError(f"Safety check failed: {check_message}")
acknowledged_checks.append(check)
# TODO: implement a callback for safety checks
# if acknowledge_safety_check_callback(check_message, allow_always=True):
# acknowledged_checks.append(check)
# else:
# raise ValueError(f"Safety check failed: {check_message}")
# Create call output
call_output = {
@@ -470,7 +475,8 @@ class ComputerAgent:
if await computer.get_environment() == "browser":
current_url = await computer.get_current_url()
call_output["output"]["current_url"] = current_url
check_blocklisted_url(current_url)
# TODO: implement a callback for URL safety checks
# check_blocklisted_url(current_url)
result = [call_output]
await self._on_computer_call_end(item, result)

View File

@@ -0,0 +1,30 @@
"""
Computer handler factory and interface definitions.
This module provides a factory function to create computer handlers from different
computer interface types, supporting both the ComputerHandler protocol and the
Computer library interface.
"""
from .base import ComputerHandler
from .cua import cuaComputerHandler
from computer import Computer
def make_computer_handler(computer):
    """
    Normalize a computer-like object into a ComputerHandler.

    Args:
        computer: An object that already satisfies the ComputerHandler
            protocol, or a Computer instance to be wrapped.

    Returns:
        ComputerHandler: ``computer`` itself when it already satisfies the
        protocol, otherwise a ``cuaComputerHandler`` wrapping it.

    Raises:
        ValueError: If ``computer`` is neither a ComputerHandler nor a Computer.
    """
    # The protocol check comes first, so an object that already is a handler
    # is passed through untouched rather than wrapped.
    if isinstance(computer, ComputerHandler):
        return computer

    # Anything that is not a Computer at this point cannot be adapted.
    if not isinstance(computer, Computer):
        raise ValueError(f"Unsupported computer type: {type(computer)}")

    return cuaComputerHandler(computer)

View File

@@ -0,0 +1,68 @@
"""
Base computer interface protocol for agent interactions.
"""
from typing import Protocol, Literal, List, Dict, Any, Union, Optional, runtime_checkable
@runtime_checkable
class ComputerHandler(Protocol):
    """Structural interface that every computer handler must provide.

    The class is decorated with ``runtime_checkable``, so ``isinstance`` can
    be used against it at runtime.
    """

    # ==== Computer-Use-Preview Action Space ====

    async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
        """Return the kind of environment currently in use."""

    async def get_dimensions(self) -> tuple[int, int]:
        """Return the screen size as a ``(width, height)`` pair."""

    async def screenshot(self) -> str:
        """Capture a screenshot and return it as a base64 string."""

    async def click(self, x: int, y: int, button: str = "left") -> None:
        """Click the given button at ``(x, y)``."""

    async def double_click(self, x: int, y: int) -> None:
        """Double-click at ``(x, y)``."""

    async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
        """Scroll at ``(x, y)`` by the amounts ``scroll_x`` and ``scroll_y``."""

    async def type(self, text: str) -> None:
        """Type ``text``."""

    async def wait(self, ms: int = 1000) -> None:
        """Wait for ``ms`` milliseconds."""

    async def move(self, x: int, y: int) -> None:
        """Move the cursor to ``(x, y)``."""

    async def keypress(self, keys: Union[List[str], str]) -> None:
        """Press a key combination, given as one key or a list of keys."""

    async def drag(self, path: List[Dict[str, int]]) -> None:
        """Drag along the points listed in ``path``."""

    async def get_current_url(self) -> str:
        """Return the current URL (for browser environments)."""

    async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
        """Press the left mouse button down at the given coordinates."""

    async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
        """Release the left mouse button at the given coordinates."""

View File

@@ -4,15 +4,15 @@ Computer handler implementation for OpenAI computer-use-preview protocol.
import base64
from typing import Dict, List, Any, Literal, Union, Optional
from .types import Computer
from .base import ComputerHandler
from computer import Computer
class OpenAIComputerHandler:
class cuaComputerHandler(ComputerHandler):
"""Computer handler that implements the Computer protocol using the computer interface."""
def __init__(self, computer_interface):
def __init__(self, cua_computer: Computer):
"""Initialize with a computer interface (from tool schema)."""
self.interface = computer_interface
self.interface = cua_computer.interface
# ==== Computer-Use-Preview Action Space ====

View File

@@ -27,55 +27,3 @@ class AgentConfigInfo(BaseModel):
def matches_model(self, model: str) -> bool:
    """Tell whether `model` matches this config's `models_regex`.

    `re.match` is used, so the pattern is anchored at the start of `model`.
    """
    hit = re.match(self.models_regex, model)
    return hit is not None
# Computer tool interface
class Computer(Protocol):
    """Structural interface describing how a computer can be driven."""

    async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
        """Return the kind of environment currently in use."""

    async def get_dimensions(self) -> tuple[int, int]:
        """Return the screen size as a ``(width, height)`` pair."""

    async def screenshot(self) -> str:
        """Capture a screenshot and return it as a base64 string."""

    async def click(self, x: int, y: int, button: str = "left") -> None:
        """Click the given button at ``(x, y)``."""

    async def double_click(self, x: int, y: int) -> None:
        """Double-click at ``(x, y)``."""

    async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
        """Scroll at ``(x, y)`` by the amounts ``scroll_x`` and ``scroll_y``."""

    async def type(self, text: str) -> None:
        """Type ``text``."""

    async def wait(self, ms: int = 1000) -> None:
        """Wait for ``ms`` milliseconds."""

    async def move(self, x: int, y: int) -> None:
        """Move the cursor to ``(x, y)``."""

    async def keypress(self, keys: List[str]) -> None:
        """Press the key combination given by ``keys``."""

    async def drag(self, path: List[Dict[str, int]]) -> None:
        """Drag along the points listed in ``path``."""

    async def get_current_url(self) -> str:
        """Return the current URL (for browser environments)."""