Merge pull request #333 from trycua/fix/passthrough-tool-errors

[Agent] Implement left_mouse_down, left_mouse_up, and tool errors
This commit is contained in:
ddupont
2025-08-07 16:49:46 -04:00
committed by GitHub
4 changed files with 662 additions and 510 deletions

View File

@@ -94,14 +94,14 @@ def print_action(action_type: str, details: Dict[str, Any], total_cost: float):
# Format action details
args_str = ""
if action_type == "click" and "x" in details and "y" in details:
args_str = f"({details['x']}, {details['y']})"
args_str = f"_{details['button']}({details['x']}, {details['y']})"
elif action_type == "type" and "text" in details:
text = details["text"]
if len(text) > 50:
text = text[:47] + "..."
args_str = f'"{text}"'
elif action_type == "key" and "key" in details:
args_str = f"'{details['key']}'"
args_str = f'("{text}")'
elif action_type == "key" and "text" in details:
args_str = f"('{details['text']}')"
elif action_type == "scroll" and "x" in details and "y" in details:
args_str = f"({details['x']}, {details['y']})"

View File

@@ -3,7 +3,7 @@ Computer handler implementation for OpenAI computer-use-preview protocol.
"""
import base64
from typing import Dict, List, Any, Literal, Union
from typing import Dict, List, Any, Literal, Union, Optional
from .types import Computer
@@ -14,11 +14,13 @@ class OpenAIComputerHandler:
"""Initialize with a computer interface (from tool schema)."""
self.interface = computer_interface
# ==== Computer-Use-Preview Action Space ====
async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
    """Report the environment type this handler targets.

    Returns:
        Always the literal "windows" for now; no detection of the actual
        environment is performed.
    """
    # Fixed placeholder until real environment detection is implemented.
    environment: Literal["windows", "mac", "linux", "browser"] = "windows"
    return environment
async def get_dimensions(self) -> tuple[int, int]:
"""Get screen dimensions as (width, height)."""
screen_size = await self.interface.get_screen_size()
@@ -94,6 +96,14 @@ class OpenAIComputerHandler:
# For now, return empty string
return ""
# ==== Anthropic Computer Action Space ====
async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
    """Press the left mouse button via the wrapped computer interface.

    Args:
        x: Horizontal coordinate forwarded to the interface, or None.
        y: Vertical coordinate forwarded to the interface, or None.

    The arguments are passed through unchanged; how the interface treats
    None coordinates is decided by the interface, not here.
    """
    interface = self.interface
    await interface.mouse_down(x, y, button="left")
async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
    """Release the left mouse button via the wrapped computer interface.

    Args:
        x: Horizontal coordinate forwarded to the interface, or None.
        y: Vertical coordinate forwarded to the interface, or None.

    The arguments are passed through unchanged; how the interface treats
    None coordinates is decided by the interface, not here.
    """
    interface = self.interface
    await interface.mouse_up(x, y, button="left")
def acknowledge_safety_check_callback(message: str, allow_always: bool = False) -> bool:
"""Safety check callback for user acknowledgment."""

File diff suppressed because it is too large Load Diff

View File

@@ -206,6 +206,51 @@ def make_wait_item(call_id: Optional[str] = None) -> ResponseComputerToolCallPar
type="computer_call"
)
# Extra Anthropic computer calls
def make_left_mouse_down_item(x: Optional[int] = None, y: Optional[int] = None, call_id: Optional[str] = None) -> Dict[str, Any]:
    """Build a completed ``computer_call`` item for a ``left_mouse_down`` action.

    Args:
        x: X coordinate stored in the action (may be None).
        y: Y coordinate stored in the action (may be None).
        call_id: Call identifier to use; a falsy value (None or "") is
            replaced with a fresh ``random_id()``.

    Returns:
        A dict with a newly generated "id", the resolved "call_id", the
        action payload, no pending safety checks, and status "completed".
    """
    action = {
        "type": "left_mouse_down",
        "x": x,
        "y": y,
    }
    return {
        "id": random_id(),
        "call_id": call_id or random_id(),
        "action": action,
        "pending_safety_checks": [],
        "status": "completed",
        "type": "computer_call",
    }
def make_left_mouse_up_item(x: Optional[int] = None, y: Optional[int] = None, call_id: Optional[str] = None) -> Dict[str, Any]:
    """Build a completed ``computer_call`` item for a ``left_mouse_up`` action.

    Args:
        x: X coordinate stored in the action (may be None).
        y: Y coordinate stored in the action (may be None).
        call_id: Call identifier to use; a falsy value (None or "") is
            replaced with a fresh ``random_id()``.

    Returns:
        A dict with a newly generated "id", the resolved "call_id", the
        action payload, no pending safety checks, and status "completed".
    """
    action = {
        "type": "left_mouse_up",
        "x": x,
        "y": y,
    }
    return {
        "id": random_id(),
        "call_id": call_id or random_id(),
        "action": action,
        "pending_safety_checks": [],
        "status": "completed",
        "type": "computer_call",
    }
def make_failed_tool_call_items(tool_name: str, tool_kwargs: Dict[str, Any], error_message: str, call_id: Optional[str] = None) -> List[Dict[str, Any]]:
    """Build the item pair that records a tool call which failed.

    Args:
        tool_name: Name stored on the ``function_call`` item.
        tool_kwargs: Arguments of the call; serialized with ``json.dumps``.
        error_message: Error text, emitted as ``{"error": ...}`` JSON in
            the output item.
        call_id: Identifier linking the two items; a falsy value (None or
            "") is replaced with a fresh ``random_id()``.

    Returns:
        A two-element list: the ``function_call`` item followed by its
        ``function_call_output`` item, both carrying the same "call_id".
    """
    # Resolve once so the call and its output reference the same id.
    shared_call_id = call_id or random_id()
    function_call = {
        "type": "function_call",
        "id": random_id(),
        "call_id": shared_call_id,
        "name": tool_name,
        "arguments": json.dumps(tool_kwargs),
    }
    function_call_output = {
        "type": "function_call_output",
        "call_id": shared_call_id,
        "output": json.dumps({"error": error_message}),
    }
    return [function_call, function_call_output]
# Conversion functions between element descriptions and coordinates
def convert_computer_calls_desc2xy(responses_items: List[Dict[str, Any]], desc2xy: Dict[str, tuple]) -> List[Dict[str, Any]]: