From 4b0b07240ccb4657342f0a171c39265f63adbeae Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Thu, 7 Aug 2025 16:47:34 -0400 Subject: [PATCH] Made coordinates optional for left_mouse_up/down --- libs/python/agent/agent/computer_handler.py | 6 +-- libs/python/agent/agent/loops/anthropic.py | 44 +++++++++++++++------ libs/python/agent/agent/responses.py | 4 +- 3 files changed, 37 insertions(+), 17 deletions(-) diff --git a/libs/python/agent/agent/computer_handler.py b/libs/python/agent/agent/computer_handler.py index ae8a02e2..53de49ed 100644 --- a/libs/python/agent/agent/computer_handler.py +++ b/libs/python/agent/agent/computer_handler.py @@ -3,7 +3,7 @@ Computer handler implementation for OpenAI computer-use-preview protocol. """ import base64 -from typing import Dict, List, Any, Literal, Union +from typing import Dict, List, Any, Literal, Union, Optional from .types import Computer @@ -97,11 +97,11 @@ class OpenAIComputerHandler: return "" # ==== Anthropic Computer Action Space ==== - async def left_mouse_down(self, x: int, y: int) -> None: + async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None: """Left mouse down at coordinates.""" await self.interface.mouse_down(x, y, button="left") - async def left_mouse_up(self, x: int, y: int) -> None: + async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None: """Left mouse up at coordinates.""" await self.interface.mouse_up(x, y, button="left") diff --git a/libs/python/agent/agent/loops/anthropic.py b/libs/python/agent/agent/loops/anthropic.py index 8dcc5733..50fbd24e 100644 --- a/libs/python/agent/agent/loops/anthropic.py +++ b/libs/python/agent/agent/loops/anthropic.py @@ -568,6 +568,26 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[ "action": "screenshot" } }) + elif action_type == "left_mouse_down": + tool_use_content.append({ + "type": "tool_use", + "id": call_id, + "name": "computer", + "input": { + "action": "left_mouse_down", + "coordinate": [action.get("x", None), action.get("y", None)] + } + }) + elif action_type == "left_mouse_up": + tool_use_content.append({ + "type": "tool_use", + "id": call_id, + "name": "computer", + "input": { + "action": "left_mouse_up", + "coordinate": [action.get("x", None), action.get("y", None)] + } + }) # Convert tool_use_content to OpenAI tool_calls format openai_tool_calls = [] @@ -762,10 +782,10 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any] # "y": coordinate[1] if len(coordinate) > 1 else 0 # } # }) - coordinate = tool_input.get("coordinate", [0, 0]) + coordinate = tool_input.get("coordinate", [None, None]) responses_items.append(make_left_mouse_down_item( - x=coordinate[0] if len(coordinate) > 0 else 0, - y=coordinate[1] if len(coordinate) > 1 else 0, + x=coordinate[0] if len(coordinate) > 0 else None, + y=coordinate[1] if len(coordinate) > 1 else None, call_id=call_id )) elif action_type == "left_mouse_up": @@ -780,10 +800,10 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any] # "y": coordinate[1] if len(coordinate) > 1 else 0 # } # }) - coordinate = tool_input.get("coordinate", [0, 0]) + coordinate = tool_input.get("coordinate", [None, None]) responses_items.append(make_left_mouse_up_item( - x=coordinate[0] if len(coordinate) > 0 else 0, - y=coordinate[1] if len(coordinate) > 1 else 0, + x=coordinate[0] if len(coordinate) > 0 else None, + y=coordinate[1] if len(coordinate) > 1 else None, call_id=call_id )) elif action_type == "hold_key": @@ -1189,10 +1209,10 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any] # "y": 280 # } # } - coordinate = args.get("coordinate", [0, 0]) + coordinate = args.get("coordinate", [None, None]) responses_items.append(make_left_mouse_down_item( - x=coordinate[0] if len(coordinate) > 0 else 0, - y=coordinate[1] if len(coordinate) > 1 else 0, + x=coordinate[0] if len(coordinate) > 0 else None, + y=coordinate[1] if len(coordinate) > 1 else None, call_id=call_id )) elif action_type == "left_mouse_up": @@ -1220,10 +1240,10 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any] # "y": 300 # } # } - coordinate = args.get("coordinate", [0, 0]) + coordinate = args.get("coordinate", [None, None]) responses_items.append(make_left_mouse_up_item( - x=coordinate[0] if len(coordinate) > 0 else 0, - y=coordinate[1] if len(coordinate) > 1 else 0, + x=coordinate[0] if len(coordinate) > 0 else None, + y=coordinate[1] if len(coordinate) > 1 else None, call_id=call_id )) elif action_type == "hold_key": diff --git a/libs/python/agent/agent/responses.py b/libs/python/agent/agent/responses.py index 925eca3b..fb034a70 100644 --- a/libs/python/agent/agent/responses.py +++ b/libs/python/agent/agent/responses.py @@ -207,7 +207,7 @@ def make_wait_item(call_id: Optional[str] = None) -> ResponseComputerToolCallPar ) # Extra anthropic computer calls -def make_left_mouse_down_item(x: int, y: int, call_id: Optional[str] = None) -> Dict[str, Any]: +def make_left_mouse_down_item(x: Optional[int] = None, y: Optional[int] = None, call_id: Optional[str] = None) -> Dict[str, Any]: return { "id": random_id(), "call_id": call_id if call_id else random_id(), @@ -221,7 +221,7 @@ def make_left_mouse_down_item(x: int, y: int, call_id: Optional[str] = None) -> "type": "computer_call" } -def make_left_mouse_up_item(x: int, y: int, call_id: Optional[str] = None) -> Dict[str, Any]: +def make_left_mouse_up_item(x: Optional[int] = None, y: Optional[int] = None, call_id: Optional[str] = None) -> Dict[str, Any]: return { "id": random_id(), "call_id": call_id if call_id else random_id(),