mirror of
https://github.com/trycua/computer.git
synced 2026-01-03 20:10:04 -06:00
Added protocol for custom computer handlers
This commit is contained in:
@@ -7,9 +7,8 @@ from typing import Dict, List, Any, Optional, AsyncGenerator, Union, cast, Calla
|
||||
|
||||
from litellm.responses.utils import Usage
|
||||
|
||||
from .types import Messages, Computer, AgentCapability
|
||||
from .types import Messages, AgentCapability
|
||||
from .decorators import find_agent_config
|
||||
from .computer_handler import OpenAIComputerHandler, acknowledge_safety_check_callback, check_blocklisted_url
|
||||
import json
|
||||
import litellm
|
||||
import litellm.utils
|
||||
@@ -22,9 +21,13 @@ from .callbacks import (
|
||||
BudgetManagerCallback,
|
||||
TelemetryCallback,
|
||||
)
|
||||
from .computers import (
|
||||
ComputerHandler,
|
||||
make_computer_handler
|
||||
)
|
||||
|
||||
def get_json(obj: Any, max_depth: int = 10) -> Any:
|
||||
def custom_serializer(o: Any, depth: int = 0, seen: Set[int] = None) -> Any:
|
||||
def custom_serializer(o: Any, depth: int = 0, seen: Optional[Set[int]] = None) -> Any:
|
||||
if seen is None:
|
||||
seen = set()
|
||||
|
||||
@@ -247,7 +250,7 @@ class ComputerAgent:
|
||||
computer_handler = None
|
||||
for schema in self.tool_schemas:
|
||||
if schema["type"] == "computer":
|
||||
computer_handler = OpenAIComputerHandler(schema["computer"].interface)
|
||||
computer_handler = make_computer_handler(schema["computer"])
|
||||
break
|
||||
self.computer_handler = computer_handler
|
||||
|
||||
@@ -398,7 +401,7 @@ class ComputerAgent:
|
||||
# AGENT OUTPUT PROCESSING
|
||||
# ============================================================================
|
||||
|
||||
async def _handle_item(self, item: Any, computer: Optional[Computer] = None, ignore_call_ids: Optional[List[str]] = None) -> List[Dict[str, Any]]:
|
||||
async def _handle_item(self, item: Any, computer: Optional[ComputerHandler] = None, ignore_call_ids: Optional[List[str]] = None) -> List[Dict[str, Any]]:
|
||||
"""Handle each item; may cause a computer action + screenshot."""
|
||||
if ignore_call_ids and item.get("call_id") and item.get("call_id") in ignore_call_ids:
|
||||
return []
|
||||
@@ -450,10 +453,12 @@ class ComputerAgent:
|
||||
acknowledged_checks = []
|
||||
for check in pending_checks:
|
||||
check_message = check.get("message", str(check))
|
||||
if acknowledge_safety_check_callback(check_message, allow_always=True): # TODO: implement a callback for safety checks
|
||||
acknowledged_checks.append(check)
|
||||
else:
|
||||
raise ValueError(f"Safety check failed: {check_message}")
|
||||
acknowledged_checks.append(check)
|
||||
# TODO: implement a callback for safety checks
|
||||
# if acknowledge_safety_check_callback(check_message, allow_always=True):
|
||||
# acknowledged_checks.append(check)
|
||||
# else:
|
||||
# raise ValueError(f"Safety check failed: {check_message}")
|
||||
|
||||
# Create call output
|
||||
call_output = {
|
||||
@@ -470,7 +475,8 @@ class ComputerAgent:
|
||||
if await computer.get_environment() == "browser":
|
||||
current_url = await computer.get_current_url()
|
||||
call_output["output"]["current_url"] = current_url
|
||||
check_blocklisted_url(current_url)
|
||||
# TODO: implement a callback for URL safety checks
|
||||
# check_blocklisted_url(current_url)
|
||||
|
||||
result = [call_output]
|
||||
await self._on_computer_call_end(item, result)
|
||||
|
||||
30
libs/python/agent/agent/computers/__init__.py
Normal file
30
libs/python/agent/agent/computers/__init__.py
Normal file
@@ -0,0 +1,30 @@
|
||||
"""
|
||||
Computer handler factory and interface definitions.
|
||||
|
||||
This module provides a factory function to create computer handlers from different
|
||||
computer interface types, supporting both the ComputerHandler protocol and the
|
||||
Computer library interface.
|
||||
"""
|
||||
|
||||
from .base import ComputerHandler
|
||||
from .cua import cuaComputerHandler
|
||||
from computer import Computer
|
||||
|
||||
def make_computer_handler(computer):
    """
    Resolve ``computer`` into an object usable as a computer handler.

    Args:
        computer: An object that already satisfies the ``ComputerHandler``
            protocol, or a ``Computer`` instance from the ``computer`` package.

    Returns:
        ComputerHandler: ``computer`` itself when it already satisfies the
        protocol, otherwise a ``cuaComputerHandler`` wrapping it.

    Raises:
        ValueError: If ``computer`` is neither of the supported kinds.
    """
    # The protocol check runs first, so an object that is already a handler
    # is handed back untouched rather than being wrapped.
    if isinstance(computer, ComputerHandler):
        handler = computer
    elif isinstance(computer, Computer):
        handler = cuaComputerHandler(computer)
    else:
        raise ValueError(f"Unsupported computer type: {type(computer)}")
    return handler
|
||||
68
libs/python/agent/agent/computers/base.py
Normal file
68
libs/python/agent/agent/computers/base.py
Normal file
@@ -0,0 +1,68 @@
|
||||
"""
|
||||
Base computer interface protocol for agent interactions.
|
||||
"""
|
||||
|
||||
from typing import Protocol, Literal, List, Dict, Any, Union, Optional, runtime_checkable
|
||||
|
||||
|
||||
@runtime_checkable
class ComputerHandler(Protocol):
    """Structural interface for objects the agent drives as a computer.

    All members are ``async`` stubs. The protocol is ``runtime_checkable``,
    so ``isinstance(obj, ComputerHandler)`` can be used to test whether an
    object provides these methods.
    """

    # ==== Computer-Use-Preview Action Space ====

    async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
        """Return the kind of environment being controlled."""
        ...

    async def get_dimensions(self) -> tuple[int, int]:
        """Return the screen size as a ``(width, height)`` pair."""
        ...

    async def screenshot(self) -> str:
        """Capture the screen and return it as a base64 string."""
        ...

    async def click(self, x: int, y: int, button: str = "left") -> None:
        """Click ``button`` (default ``"left"``) at ``(x, y)``."""
        ...

    async def double_click(self, x: int, y: int) -> None:
        """Double-click at ``(x, y)``."""
        ...

    async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
        """Scroll at ``(x, y)`` by the amounts ``scroll_x`` and ``scroll_y``."""
        ...

    async def type(self, text: str) -> None:
        """Type ``text``."""
        ...

    async def wait(self, ms: int = 1000) -> None:
        """Wait for ``ms`` milliseconds (default 1000)."""
        ...

    async def move(self, x: int, y: int) -> None:
        """Move the cursor to ``(x, y)``."""
        ...

    async def keypress(self, keys: Union[List[str], str]) -> None:
        """Press a key combination, given as one string or a list of strings."""
        ...

    async def drag(self, path: List[Dict[str, int]]) -> None:
        """Drag along ``path``."""
        ...

    async def get_current_url(self) -> str:
        """Return the current URL (for browser environments)."""
        ...

    async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
        """Left mouse button down; ``x`` and ``y`` are optional coordinates."""
        ...

    async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
        """Left mouse button up; ``x`` and ``y`` are optional coordinates."""
        ...
|
||||
@@ -4,15 +4,15 @@ Computer handler implementation for OpenAI computer-use-preview protocol.
|
||||
|
||||
import base64
|
||||
from typing import Dict, List, Any, Literal, Union, Optional
|
||||
from .types import Computer
|
||||
from .base import ComputerHandler
|
||||
from computer import Computer
|
||||
|
||||
|
||||
class OpenAIComputerHandler:
|
||||
class cuaComputerHandler(ComputerHandler):
|
||||
"""Computer handler that implements the Computer protocol using the computer interface."""
|
||||
|
||||
def __init__(self, computer_interface):
|
||||
def __init__(self, cua_computer: Computer):
|
||||
"""Initialize with a computer interface (from tool schema)."""
|
||||
self.interface = computer_interface
|
||||
self.interface = cua_computer.interface
|
||||
|
||||
# ==== Computer-Use-Preview Action Space ====
|
||||
|
||||
@@ -27,55 +27,3 @@ class AgentConfigInfo(BaseModel):
|
||||
def matches_model(self, model: str) -> bool:
|
||||
"""Check if this agent config matches the given model"""
|
||||
return bool(re.match(self.models_regex, model))
|
||||
|
||||
# Computer tool interface
|
||||
class Computer(Protocol):
    """Structural interface for computer interactions.

    Every member is an ``async`` stub describing one operation.
    """

    async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
        """Return the kind of environment being controlled."""
        ...

    async def get_dimensions(self) -> tuple[int, int]:
        """Return the screen size as a ``(width, height)`` pair."""
        ...

    async def screenshot(self) -> str:
        """Capture the screen and return it as a base64 string."""
        ...

    async def click(self, x: int, y: int, button: str = "left") -> None:
        """Click ``button`` (default ``"left"``) at ``(x, y)``."""
        ...

    async def double_click(self, x: int, y: int) -> None:
        """Double-click at ``(x, y)``."""
        ...

    async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
        """Scroll at ``(x, y)`` by the amounts ``scroll_x`` and ``scroll_y``."""
        ...

    async def type(self, text: str) -> None:
        """Type ``text``."""
        ...

    async def wait(self, ms: int = 1000) -> None:
        """Wait for ``ms`` milliseconds (default 1000)."""
        ...

    async def move(self, x: int, y: int) -> None:
        """Move the cursor to ``(x, y)``."""
        ...

    async def keypress(self, keys: List[str]) -> None:
        """Press the key combination given by ``keys``."""
        ...

    async def drag(self, path: List[Dict[str, int]]) -> None:
        """Drag along ``path``."""
        ...

    async def get_current_url(self) -> str:
        """Return the current URL (for browser environments)."""
        ...
|
||||
|
||||
Reference in New Issue
Block a user