diff --git a/libs/python/agent/agent/cli.py b/libs/python/agent/agent/cli.py index 215c791c..4d17ca15 100644 --- a/libs/python/agent/agent/cli.py +++ b/libs/python/agent/agent/cli.py @@ -94,14 +94,14 @@ def print_action(action_type: str, details: Dict[str, Any], total_cost: float): # Format action details args_str = "" if action_type == "click" and "x" in details and "y" in details: - args_str = f"({details['x']}, {details['y']})" + args_str = f"_{details['button']}({details['x']}, {details['y']})" elif action_type == "type" and "text" in details: text = details["text"] if len(text) > 50: text = text[:47] + "..." - args_str = f'"{text}"' - elif action_type == "key" and "key" in details: - args_str = f"'{details['key']}'" + args_str = f'("{text}")' + elif action_type == "key" and "text" in details: + args_str = f"('{details['text']}')" elif action_type == "scroll" and "x" in details and "y" in details: args_str = f"({details['x']}, {details['y']})" diff --git a/libs/python/agent/agent/computer_handler.py b/libs/python/agent/agent/computer_handler.py index fb42295d..53de49ed 100644 --- a/libs/python/agent/agent/computer_handler.py +++ b/libs/python/agent/agent/computer_handler.py @@ -3,7 +3,7 @@ Computer handler implementation for OpenAI computer-use-preview protocol. """ import base64 -from typing import Dict, List, Any, Literal, Union +from typing import Dict, List, Any, Literal, Union, Optional from .types import Computer @@ -14,11 +14,13 @@ class OpenAIComputerHandler: """Initialize with a computer interface (from tool schema).""" self.interface = computer_interface + # ==== Computer-Use-Preview Action Space ==== + async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]: """Get the current environment type.""" # For now, return a default - this could be enhanced to detect actual environment return "windows" - + async def get_dimensions(self) -> tuple[int, int]: """Get screen dimensions as (width, height).""" screen_size = await self.interface.get_screen_size() @@ -94,6 +96,14 @@ class OpenAIComputerHandler: # For now, return empty string return "" + # ==== Anthropic Computer Action Space ==== + async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None: + """Left mouse down at coordinates.""" + await self.interface.mouse_down(x, y, button="left") + + async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None: + """Left mouse up at coordinates.""" + await self.interface.mouse_up(x, y, button="left") def acknowledge_safety_check_callback(message: str, allow_always: bool = False) -> bool: """Safety check callback for user acknowledgment.""" diff --git a/libs/python/agent/agent/loops/anthropic.py b/libs/python/agent/agent/loops/anthropic.py index 599d1dc4..50fbd24e 100644 --- a/libs/python/agent/agent/loops/anthropic.py +++ b/libs/python/agent/agent/loops/anthropic.py @@ -23,7 +23,10 @@ from ..responses import ( make_type_item, make_wait_item, make_input_image_item, - make_screenshot_item + make_screenshot_item, + make_failed_tool_call_items, + make_left_mouse_down_item, + make_left_mouse_up_item ) # Model version mapping to tool version and beta flag @@ -115,7 +118,8 @@ async def _prepare_tools_for_anthropic(tool_schemas: List[Dict[str, Any]], model def _convert_responses_items_to_completion_messages(messages: Messages) -> List[Dict[str, Any]]: """Convert responses_items message format to liteLLM completion format.""" completion_messages = [] - + call_id_to_fn_name = {} + for message in messages: msg_type = message.get("type") role = message.get("role") @@ -193,6 +197,43 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[ "content": reasoning_text }) + elif msg_type == "function_call": + fn_name = message.get("name") + fn_args = message.get("arguments", "{}") + call_id = message.get("call_id", "call_1") + call_id_to_fn_name[call_id] = fn_name + openai_tool_calls = [{ + "id": call_id, + "type": "function", + "function": { + "name": fn_name, + "arguments": fn_args + } + }] # If the last completion message is an assistant message, extend the tool_calls + if completion_messages and completion_messages[-1].get("role") == "assistant": + if "tool_calls" not in completion_messages[-1]: + completion_messages[-1]["tool_calls"] = [] + completion_messages[-1]["tool_calls"].extend(openai_tool_calls) + else: + # Create new assistant message with tool calls + completion_messages.append({ + "role": "assistant", + "content": None, + "tool_calls": openai_tool_calls + }) + + elif msg_type == "function_call_output": + call_id = message.get("call_id", "call_1") + fn_output = message.get("output", "") + fn_name = call_id_to_fn_name.get(call_id, "computer") + + completion_messages.append({ + "role": "function", + "name": fn_name, + "tool_call_id": call_id, + "content": str(fn_output) + }) + elif msg_type == "computer_call": # Computer call becomes tool use in assistant message action = message.get("action", {}) @@ -527,6 +568,26 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[ "action": "screenshot" } }) + elif action_type == "left_mouse_down": + tool_use_content.append({ + "type": "tool_use", + "id": call_id, + "name": "computer", + "input": { + "action": "left_mouse_down", + "coordinate": [action.get("x", None), action.get("y", None)] + } + }) + elif action_type == "left_mouse_up": + tool_use_content.append({ + "type": "tool_use", + "id": call_id, + "name": "computer", + "input": { + "action": "left_mouse_up", + "coordinate": [action.get("x", None), action.get("y", None)] + } + }) # Convert tool_use_content to OpenAI tool_calls format openai_tool_calls = [] @@ -611,45 +672,350 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any] # Action reference: # https://docs.anthropic.com/en/docs/agents-and-tools/tool-use/computer-use-tool#available-actions + try: + # Basic actions (all versions) + if action_type == "screenshot": + responses_items.append(make_screenshot_item(call_id=call_id)) + elif action_type in ["click", "left_click"]: + coordinate = tool_input.get("coordinate", [0, 0]) + responses_items.append(make_click_item( + x=coordinate[0] if len(coordinate) > 0 else 0, + y=coordinate[1] if len(coordinate) > 1 else 0, + call_id=call_id + )) + elif action_type in ["type", "type_text"]: + responses_items.append(make_type_item( + text=tool_input.get("text", ""), + call_id=call_id + )) + elif action_type in ["key", "keypress", "hotkey"]: + responses_items.append(make_keypress_item( + keys=tool_input.get("text", "").replace("+", "-").split("-"), + call_id=call_id + )) + elif action_type in ["mouse_move", "move_cursor", "move"]: + # Mouse move - create a custom action item + coordinate = tool_input.get("coordinate", [0, 0]) + responses_items.append( + make_move_item( + x=coordinate[0] if len(coordinate) > 0 else 0, + y=coordinate[1] if len(coordinate) > 1 else 0, + call_id=call_id + ) + ) + + # Enhanced actions (computer_20250124) Available in Claude 4 and Claude Sonnet 3.7 + elif action_type == "scroll": + coordinate = tool_input.get("coordinate", [0, 0]) + scroll_amount = tool_input.get("scroll_amount", 3) + scroll_x = scroll_amount if tool_input.get("scroll_direction", "down") == "right" else \ + -scroll_amount if tool_input.get("scroll_direction", "down") == "left" else 0 + scroll_y = scroll_amount if tool_input.get("scroll_direction", "down") == "down" else \ + -scroll_amount if tool_input.get("scroll_direction", "down") == "up" else 0 + responses_items.append(make_scroll_item( + x=coordinate[0] if len(coordinate) > 0 else 0, + y=coordinate[1] if len(coordinate) > 1 else 0, + scroll_x=scroll_x, + scroll_y=scroll_y, + call_id=call_id + )) + elif action_type in ["left_click_drag", "drag"]: + start_coord = tool_input.get("start_coordinate", [0, 0]) + end_coord = tool_input.get("end_coordinate", [0, 0]) + responses_items.append(make_drag_item( + path=[ + { + "x": start_coord[0] if len(start_coord) > 0 else 0, + "y": start_coord[1] if len(start_coord) > 1 else 0 + }, + { + "x": end_coord[0] if len(end_coord) > 0 else 0, + "y": end_coord[1] if len(end_coord) > 1 else 0 + } + ], + call_id=call_id + )) + elif action_type == "right_click": + coordinate = tool_input.get("coordinate", [0, 0]) + responses_items.append(make_click_item( + x=coordinate[0] if len(coordinate) > 0 else 0, + y=coordinate[1] if len(coordinate) > 1 else 0, + button="right", + call_id=call_id + )) + elif action_type == "middle_click": + coordinate = tool_input.get("coordinate", [0, 0]) + responses_items.append(make_click_item( + x=coordinate[0] if len(coordinate) > 0 else 0, + y=coordinate[1] if len(coordinate) > 1 else 0, + button="wheel", + call_id=call_id + )) + elif action_type == "double_click": + coordinate = tool_input.get("coordinate", [0, 0]) + responses_items.append(make_double_click_item( + x=coordinate[0] if len(coordinate) > 0 else 0, + y=coordinate[1] if len(coordinate) > 1 else 0, + call_id=call_id + )) + elif action_type == "triple_click": + # coordinate = tool_input.get("coordinate", [0, 0]) + # responses_items.append({ + # "type": "computer_call", + # "call_id": call_id, + # "action": { + # "type": "triple_click", + # "x": coordinate[0] if len(coordinate) > 0 else 0, + # "y": coordinate[1] if len(coordinate) > 1 else 0 + # } + # }) + raise NotImplementedError("triple_click") + elif action_type == "left_mouse_down": + # coordinate = tool_input.get("coordinate", [0, 0]) + # responses_items.append({ + # "type": "computer_call", + # "call_id": call_id, + # "action": { + # "type": "mouse_down", + # "button": "left", + # "x": coordinate[0] if len(coordinate) > 0 else 0, + # "y": coordinate[1] if len(coordinate) > 1 else 0 + # } + # }) + coordinate = tool_input.get("coordinate", [None, None]) + responses_items.append(make_left_mouse_down_item( + x=coordinate[0] if len(coordinate) > 0 else None, + y=coordinate[1] if len(coordinate) > 1 else None, + call_id=call_id + )) + elif action_type == "left_mouse_up": + # coordinate = tool_input.get("coordinate", [0, 0]) + # responses_items.append({ + # "type": "computer_call", + # "call_id": call_id, + # "action": { + # "type": "mouse_up", + # "button": "left", + # "x": coordinate[0] if len(coordinate) > 0 else 0, + # "y": coordinate[1] if len(coordinate) > 1 else 0 + # } + # }) + coordinate = tool_input.get("coordinate", [None, None]) + responses_items.append(make_left_mouse_up_item( + x=coordinate[0] if len(coordinate) > 0 else None, + y=coordinate[1] if len(coordinate) > 1 else None, + call_id=call_id + )) + elif action_type == "hold_key": + # responses_items.append({ + # "type": "computer_call", + # "call_id": call_id, + # "action": { + # "type": "key_hold", + # "key": tool_input.get("key", "") + # } + # }) + raise NotImplementedError("hold_key") + elif action_type == "wait": + responses_items.append(make_wait_item( + call_id=call_id + )) + else: + raise ValueError(f"Unknown action type: {action_type}") + except Exception as e: + responses_items.extend(make_failed_tool_call_items( + tool_name="computer", + tool_kwargs=tool_input, + error_message=repr(e), + call_id=call_id + )) + + # Handle tool calls (alternative format) + if hasattr(message, 'tool_calls') and message.tool_calls: + for tool_call in message.tool_calls: + if tool_call.function.name == "computer": + try: + try: + args = json.loads(tool_call.function.arguments) + action_type = args.get("action") + call_id = tool_call.id + # Basic actions (all versions) if action_type == "screenshot": - responses_items.append(make_screenshot_item(call_id=call_id)) + # Input: + # { + # "function": { + # "name": "computer", + # "arguments": json.dumps({ + # "action": "screenshot" + # }) + # }, + # "id": "call_1", + # "type": "function" + # } + + # Output: + # { + # "type": "computer_call", + # "call_id": "call_1", + # "action": { + # "type": "screenshot" + # } + # } + responses_items.append(make_screenshot_item( + call_id=call_id + )) elif action_type in ["click", "left_click"]: - coordinate = tool_input.get("coordinate", [0, 0]) + # Input: + # { + # "function": { + # "name": "computer", + # "arguments": json.dumps({ + # "action": "click", + # "coordinate": [100, 200] + # }) + # }, + # "id": "call_1", + # "type": "function" + # } + + # Output: + # { + # "type": "computer_call", + # "call_id": "call_1", + # "action": { + # "type": "click", + # "x": 100, + # "y": 200 + # } + # } + coordinate = args.get("coordinate", [0, 0]) responses_items.append(make_click_item( x=coordinate[0] if len(coordinate) > 0 else 0, y=coordinate[1] if len(coordinate) > 1 else 0, call_id=call_id )) elif action_type in ["type", "type_text"]: + # Input: + # { + # "function": { + # "name": "computer", + # "arguments": json.dumps({ + # "action": "type", + # "text": "Hello World" + # }) + # }, + # "id": "call_1", + # "type": "function" + # } + + # Output: + # { + # "type": "computer_call", + # "call_id": "call_1", + # "action": { + # "type": "type", + # "text": "Hello World" + # } + # } responses_items.append(make_type_item( - text=tool_input.get("text", ""), + text=args.get("text", ""), call_id=call_id )) elif action_type in ["key", "keypress", "hotkey"]: + # Input: + # { + # "function": { + # "name": "computer", + # "arguments": json.dumps({ + # "action": "key", + # "text": "ctrl+c" + # }) + # }, + # "id": "call_1", + # "type": "function" + # } + + # Output: + # { + # "type": "computer_call", + # "call_id": "call_1", + # "action": { + # "type": "keypress", + # "keys": ["ctrl", "c"] + # } + # } responses_items.append(make_keypress_item( - keys=tool_input.get("text", "").replace("+", "-").split("-"), + keys=args.get("text", "").replace("+", "-").split("-"), call_id=call_id )) elif action_type in ["mouse_move", "move_cursor", "move"]: - # Mouse move - create a custom action item - coordinate = tool_input.get("coordinate", [0, 0]) - responses_items.append( - make_move_item( - x=coordinate[0] if len(coordinate) > 0 else 0, - y=coordinate[1] if len(coordinate) > 1 else 0, - call_id=call_id - ) - ) + # Input: + # { + # "function": { + # "name": "computer", + # "arguments": json.dumps({ + # "action": "mouse_move", + # "coordinate": [150, 250] + # }) + # }, + # "id": "call_1", + # "type": "function" + # } + + # Output: + # { + # "type": "computer_call", + # "call_id": "call_1", + # "action": { + # "type": "mouse_move", + # "x": 150, + # "y": 250 + # } + # } + coordinate = args.get("coordinate", [0, 0]) + responses_items.append(make_move_item( + x=coordinate[0] if len(coordinate) > 0 else 0, + y=coordinate[1] if len(coordinate) > 1 else 0, + call_id=call_id + )) # Enhanced actions (computer_20250124) Available in Claude 4 and Claude Sonnet 3.7 elif action_type == "scroll": - coordinate = tool_input.get("coordinate", [0, 0]) - scroll_amount = tool_input.get("scroll_amount", 3) - scroll_x = scroll_amount if tool_input.get("scroll_direction", "down") == "right" else \ - -scroll_amount if tool_input.get("scroll_direction", "down") == "left" else 0 - scroll_y = scroll_amount if tool_input.get("scroll_direction", "down") == "down" else \ - -scroll_amount if tool_input.get("scroll_direction", "down") == "up" else 0 + # Input: + # { + # "function": { + # "name": "computer", + # "arguments": json.dumps({ + # "action": "scroll", + # "coordinate": [300, 400], + # "scroll_direction": "down", + # "scroll_amount": 5 + # }) + # }, + # "id": "call_1", + # "type": "function" + # } + + # Output: + # { + # "type": "computer_call", + # "call_id": "call_1", + # "action": { + # "type": "scroll", + # "x": 300, + # "y": 400, + # "scroll_x": 0, + # "scroll_y": -5 + # } + # } + coordinate = args.get("coordinate", [0, 0]) + direction = args.get("scroll_direction", "down") + amount = args.get("scroll_amount", 3) + scroll_x = amount if direction == "left" else \ + -amount if direction == "right" else 0 + scroll_y = amount if direction == "up" else \ + -amount if direction == "down" else 0 responses_items.append(make_scroll_item( x=coordinate[0] if len(coordinate) > 0 else 0, y=coordinate[1] if len(coordinate) > 1 else 0, @@ -658,8 +1024,34 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any] call_id=call_id )) elif action_type in ["left_click_drag", "drag"]: - start_coord = tool_input.get("start_coordinate", [0, 0]) - end_coord = tool_input.get("end_coordinate", [0, 0]) + # Input: + # { + # "function": { + # "name": "computer", + # "arguments": json.dumps({ + # "action": "left_click_drag", + # "start_coordinate": [100, 150], + # "end_coordinate": [200, 250] + # }) + # }, + # "id": "call_1", + # "type": "function" + # } + + # Output: + # { + # "type": "computer_call", + # "call_id": "call_1", + # "action": { + # "type": "drag", + # "path": [ + # {"x": 100, "y": 150}, + # {"x": 200, "y": 250} + # ] + # } + # } + start_coord = args.get("start_coordinate", [0, 0]) + end_coord = args.get("end_coordinate", [0, 0]) responses_items.append(make_drag_item( path=[ { @@ -674,7 +1066,31 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any] call_id=call_id )) elif action_type == "right_click": - coordinate = tool_input.get("coordinate", [0, 0]) + # Input: + # { + # "function": { + # "name": "computer", + # "arguments": json.dumps({ + # "action": "right_click", + # "coordinate": [120, 180] + # }) + # }, + # "id": "call_1", + # "type": "function" + # } + + # Output: + # { + # "type": "computer_call", + # "call_id": "call_1", + # "action": { + # "type": "click", + # "x": 120, + # "y": 180, + # "button": "right" + # } + # } + coordinate = args.get("coordinate", [0, 0]) responses_items.append(make_click_item( x=coordinate[0] if len(coordinate) > 0 else 0, y=coordinate[1] if len(coordinate) > 1 else 0, @@ -682,7 +1098,31 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any] call_id=call_id )) elif action_type == "middle_click": - coordinate = tool_input.get("coordinate", [0, 0]) + # Input: + # { + # "function": { + # "name": "computer", + # "arguments": json.dumps({ + # "action": "middle_click", + # "coordinate": [140, 220] + # }) + # }, + # "id": "call_1", + # "type": "function" + # } + + # Output: + # { + # "type": "computer_call", + # "call_id": "call_1", + # "action": { + # "type": "click", + # "x": 140, + # "y": 220, + # "button": "wheel" + # } + # } + coordinate = args.get("coordinate", [0, 0]) responses_items.append(make_click_item( x=coordinate[0] if len(coordinate) > 0 else 0, y=coordinate[1] if len(coordinate) > 1 else 0, @@ -690,518 +1130,175 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any] call_id=call_id )) elif action_type == "double_click": - coordinate = tool_input.get("coordinate", [0, 0]) + # Input: + # { + # "function": { + # "name": "computer", + # "arguments": json.dumps({ + # "action": "double_click", + # "coordinate": [160, 240] + # }) + # }, + # "id": "call_1", + # "type": "function" + # } + + # Output: + # { + # "type": "computer_call", + # "call_id": "call_1", + # "action": { + # "type": "double_click", + # "x": 160, + # "y": 240 + # } + # } + coordinate = args.get("coordinate", [0, 0]) responses_items.append(make_double_click_item( x=coordinate[0] if len(coordinate) > 0 else 0, y=coordinate[1] if len(coordinate) > 1 else 0, call_id=call_id )) elif action_type == "triple_click": - # coordinate = tool_input.get("coordinate", [0, 0]) - # responses_items.append({ + # Input: + # { + # "function": { + # "name": "computer", + # "arguments": json.dumps({ + # "action": "triple_click", + # "coordinate": [180, 260] + # }) + # }, + # "id": "call_1", + # "type": "function" + # } + + # Output: + # { # "type": "computer_call", - # "call_id": call_id, + # "call_id": "call_1", # "action": { # "type": "triple_click", - # "x": coordinate[0] if len(coordinate) > 0 else 0, - # "y": coordinate[1] if len(coordinate) > 1 else 0 + # "x": 180, + # "y": 260 # } - # }) + # } raise NotImplementedError("triple_click") elif action_type == "left_mouse_down": - # coordinate = tool_input.get("coordinate", [0, 0]) - # responses_items.append({ + # Input: + # { + # "function": { + # "name": "computer", + # "arguments": json.dumps({ + # "action": "left_mouse_down", + # "coordinate": [200, 280] + # }) + # }, + # "id": "call_1", + # "type": "function" + # } + + # Output: + # { # "type": "computer_call", - # "call_id": call_id, + # "call_id": "call_1", # "action": { # "type": "mouse_down", # "button": "left", - # "x": coordinate[0] if len(coordinate) > 0 else 0, - # "y": coordinate[1] if len(coordinate) > 1 else 0 + # "x": 200, + # "y": 280 # } - # }) - raise NotImplementedError("left_mouse_down") + # } + coordinate = args.get("coordinate", [None, None]) + responses_items.append(make_left_mouse_down_item( + x=coordinate[0] if len(coordinate) > 0 else None, + y=coordinate[1] if len(coordinate) > 1 else None, + call_id=call_id + )) elif action_type == "left_mouse_up": - # coordinate = tool_input.get("coordinate", [0, 0]) - # responses_items.append({ + # Input: + # { + # "function": { + # "name": "computer", + # "arguments": json.dumps({ + # "action": "left_mouse_up", + # "coordinate": [220, 300] + # }) + # }, + # "id": "call_1", + # "type": "function" + # } + + # Output: + # { # "type": "computer_call", - # "call_id": call_id, + # "call_id": "call_1", # "action": { # "type": "mouse_up", # "button": "left", - # "x": coordinate[0] if len(coordinate) > 0 else 0, - # "y": coordinate[1] if len(coordinate) > 1 else 0 + # "x": 220, + # "y": 300 # } - # }) - raise NotImplementedError("left_mouse_up") + # } + coordinate = args.get("coordinate", [None, None]) + responses_items.append(make_left_mouse_up_item( + x=coordinate[0] if len(coordinate) > 0 else None, + y=coordinate[1] if len(coordinate) > 1 else None, + call_id=call_id + )) elif action_type == "hold_key": - # responses_items.append({ + # Input: + # { + # "function": { + # "name": "computer", + # "arguments": json.dumps({ + # "action": "hold_key", + # "key": "shift" + # }) + # }, + # "id": "call_1", + # "type": "function" + # } + + # Output: + # { # "type": "computer_call", - # "call_id": call_id, + # "call_id": "call_1", # "action": { # "type": "key_hold", - # "key": tool_input.get("key", "") + # "key": "shift" # } - # }) + # } raise NotImplementedError("hold_key") elif action_type == "wait": + # Input: + # { + # "function": { + # "name": "computer", + # "arguments": json.dumps({ + # "action": "wait" + # }) + # }, + # "id": "call_1", + # "type": "function" + # } + + # Output: + # { + # "type": "computer_call", + # "call_id": "call_1", + # "action": { + # "type": "wait" + # } + # } responses_items.append(make_wait_item( call_id=call_id )) - else: - raise ValueError(f"Unknown action type: {action_type}") - - # Handle tool calls (alternative format) - if hasattr(message, 'tool_calls') and message.tool_calls: - for tool_call in message.tool_calls: - if tool_call.function.name == "computer": - try: - args = json.loads(tool_call.function.arguments) - action_type = args.get("action") - call_id = tool_call.id - - # Basic actions (all versions) - if action_type == "screenshot": - # Input: - # { - # "function": { - # "name": "computer", - # "arguments": json.dumps({ - # "action": "screenshot" - # }) - # }, - # "id": "call_1", - # "type": "function" - # } - - # Output: - # { - # "type": "computer_call", - # "call_id": "call_1", - # "action": { - # "type": "screenshot" - # } - # } - responses_items.append(make_screenshot_item( - call_id=call_id - )) - elif action_type in ["click", "left_click"]: - # Input: - # { - # "function": { - # "name": "computer", - # "arguments": json.dumps({ - # "action": "click", - # "coordinate": [100, 200] - # }) - # }, - # "id": "call_1", - # "type": "function" - # } - - # Output: - # { - # "type": "computer_call", - # "call_id": "call_1", - # "action": { - # "type": "click", - # "x": 100, - # "y": 200 - # } - # } - coordinate = args.get("coordinate", [0, 0]) - responses_items.append(make_click_item( - x=coordinate[0] if len(coordinate) > 0 else 0, - y=coordinate[1] if len(coordinate) > 1 else 0, - call_id=call_id - )) - elif action_type in ["type", "type_text"]: - # Input: - # { - # "function": { - # "name": "computer", - # "arguments": json.dumps({ - # "action": "type", - # "text": "Hello World" - # }) - # }, - # "id": "call_1", - # "type": "function" - # } - - # Output: - # { - # "type": "computer_call", - # "call_id": "call_1", - # "action": { - # "type": "type", - # "text": "Hello World" - # } - # } - responses_items.append(make_type_item( - text=args.get("text", ""), - call_id=call_id - )) - elif action_type in ["key", "keypress", "hotkey"]: - # Input: - # { - # "function": { - # "name": "computer", - # "arguments": json.dumps({ - # "action": "key", - # "text": "ctrl+c" - # }) - # }, - # "id": "call_1", - # "type": "function" - # } - - # Output: - # { - # "type": "computer_call", - # "call_id": "call_1", - # "action": { - # "type": "keypress", - # "keys": ["ctrl", "c"] - # } - # } - responses_items.append(make_keypress_item( - keys=args.get("text", "").replace("+", "-").split("-"), - call_id=call_id - )) - elif action_type in ["mouse_move", "move_cursor", "move"]: - # Input: - # { - # "function": { - # "name": "computer", - # "arguments": json.dumps({ - # "action": "mouse_move", - # "coordinate": [150, 250] - # }) - # }, - # "id": "call_1", - # "type": "function" - # } - - # Output: - # { - # "type": "computer_call", - # "call_id": "call_1", - # "action": { - # "type": "mouse_move", - # "x": 150, - # "y": 250 - # } - # } - coordinate = args.get("coordinate", [0, 0]) - responses_items.append(make_move_item( - x=coordinate[0] if len(coordinate) > 0 else 0, - y=coordinate[1] if len(coordinate) > 1 else 0, - call_id=call_id - )) - - # Enhanced actions (computer_20250124) Available in Claude 4 and Claude Sonnet 3.7 - elif action_type == "scroll": - # Input: - # { - # "function": { - # "name": "computer", - # "arguments": json.dumps({ - # "action": "scroll", - # "coordinate": [300, 400], - # "scroll_direction": "down", - # "scroll_amount": 5 - # }) - # }, - # "id": "call_1", - # "type": "function" - # } - - # Output: - # { - # "type": "computer_call", - # "call_id": "call_1", - # "action": { - # "type": "scroll", - # "x": 300, - # "y": 400, - # "scroll_x": 0, - # "scroll_y": -5 - # } - # } - coordinate = args.get("coordinate", [0, 0]) - direction = args.get("scroll_direction", "down") - amount = args.get("scroll_amount", 3) - scroll_x = amount if direction == "left" else \ - -amount if direction == "right" else 0 - scroll_y = amount if direction == "up" else \ - -amount if direction == "down" else 0 - responses_items.append(make_scroll_item( - x=coordinate[0] if len(coordinate) > 0 else 0, - y=coordinate[1] if len(coordinate) > 1 else 0, - scroll_x=scroll_x, - scroll_y=scroll_y, - call_id=call_id - )) - elif action_type in ["left_click_drag", "drag"]: - # Input: - # { - # "function": { - # "name": "computer", - # "arguments": json.dumps({ - # "action": "left_click_drag", - # "start_coordinate": [100, 150], - # "end_coordinate": [200, 250] - # }) - # }, - # "id": "call_1", - # "type": "function" - # } - - # Output: - # { - # "type": "computer_call", - # "call_id": "call_1", - # "action": { - # "type": "drag", - # "path": [ - # {"x": 100, "y": 150}, - # {"x": 200, "y": 250} - # ] - # } - # } - start_coord = args.get("start_coordinate", [0, 0]) - end_coord = args.get("end_coordinate", [0, 0]) - responses_items.append(make_drag_item( - path=[ - { - "x": start_coord[0] if len(start_coord) > 0 else 0, - "y": start_coord[1] if len(start_coord) > 1 else 0 - }, - { - "x": end_coord[0] if len(end_coord) > 0 else 0, - "y": end_coord[1] if len(end_coord) > 1 else 0 - } - ], - call_id=call_id - )) - elif action_type == "right_click": - # Input: - # { - # "function": { - # "name": "computer", - # "arguments": json.dumps({ - # "action": "right_click", - # "coordinate": [120, 180] - # }) - # }, - # "id": "call_1", - # "type": "function" - # } - - # Output: - # { - # "type": "computer_call", - # "call_id": "call_1", - # "action": { - # "type": "click", - # "x": 120, - # "y": 180, - # "button": "right" - # } - # } - coordinate = args.get("coordinate", [0, 0]) - responses_items.append(make_click_item( - x=coordinate[0] if len(coordinate) > 0 else 0, - y=coordinate[1] if len(coordinate) > 1 else 0, - button="right", - call_id=call_id - )) - elif action_type == "middle_click": - # Input: - # { - # "function": { - # "name": "computer", - # "arguments": json.dumps({ - # "action": "middle_click", - # "coordinate": [140, 220] - # }) - # }, - # "id": "call_1", - # "type": "function" - # } - - # Output: - # { - # "type": "computer_call", - # "call_id": "call_1", - # "action": { - # "type": "click", - # "x": 140, - # "y": 220, - # "button": "wheel" - # } - # } - coordinate = args.get("coordinate", [0, 0]) - responses_items.append(make_click_item( - x=coordinate[0] if len(coordinate) > 0 else 0, - y=coordinate[1] if len(coordinate) > 1 else 0, - button="wheel", - call_id=call_id - )) - elif action_type == "double_click": - # Input: - # { - # "function": { - # "name": "computer", - # "arguments": json.dumps({ - # "action": "double_click", - # "coordinate": [160, 240] - # }) - # }, - # "id": "call_1", - # "type": "function" - # } - - # Output: - # { - # "type": "computer_call", - # "call_id": "call_1", - # "action": { - # "type": "double_click", - # "x": 160, - # "y": 240 - # } - # } - coordinate = args.get("coordinate", [0, 0]) - responses_items.append(make_double_click_item( - x=coordinate[0] if len(coordinate) > 0 else 0, - y=coordinate[1] if len(coordinate) > 1 else 0, - call_id=call_id - )) - elif action_type == "triple_click": - # Input: - # { - # "function": { - # "name": "computer", - # "arguments": json.dumps({ - # "action": "triple_click", - # "coordinate": [180, 260] - # }) - # }, - # "id": "call_1", - # "type": "function" - # } - - # Output: - # { - # "type": "computer_call", - # "call_id": "call_1", - # "action": { - # "type": "triple_click", - # "x": 180, - # "y": 260 - # } - # } - raise NotImplementedError("triple_click") - elif action_type == "left_mouse_down": - # Input: - # { - # "function": { - # "name": "computer", - # "arguments": json.dumps({ - # "action": "left_mouse_down", - # "coordinate": [200, 280] - # }) - # }, - # "id": "call_1", - # "type": "function" - # } - - # Output: - # { - # "type": "computer_call", - # "call_id": "call_1", - # "action": { - # "type": "mouse_down", - # "button": "left", - # "x": 200, - # "y": 280 - # } - # } - raise NotImplementedError("left_mouse_down") - elif action_type == "left_mouse_up": - # Input: - # { - # "function": { - # "name": "computer", - # "arguments": json.dumps({ - # "action": "left_mouse_up", - # "coordinate": [220, 300] - # }) - # }, - # "id": "call_1", - # "type": "function" - # } - - # Output: - # { - # "type": "computer_call", - # "call_id": "call_1", - # "action": { - # "type": "mouse_up", - # "button": "left", - # "x": 220, - # "y": 300 - # } - # } - raise NotImplementedError("left_mouse_up") - elif action_type == "hold_key": - # Input: - # { - # "function": { - # "name": "computer", - # "arguments": json.dumps({ - # "action": "hold_key", - # "key": "shift" - # }) - # }, - # "id": "call_1", - # "type": "function" - # } - - # Output: - # { - # "type": "computer_call", - # "call_id": "call_1", - # "action": { - # "type": "key_hold", - # "key": "shift" - # } - # } - raise NotImplementedError("hold_key") - elif action_type == "wait": - # Input: - # { - # "function": { - # "name": "computer", - # "arguments": json.dumps({ - # "action": "wait" - # }) - # }, - # "id": "call_1", - # "type": "function" - # } - - # Output: - # { - # "type": "computer_call", - # "call_id": "call_1", - # "action": { - # "type": "wait" - # } - # } - responses_items.append(make_wait_item( + except Exception as e: + responses_items.extend(make_failed_tool_call_items( + tool_name="computer", + tool_kwargs=args, + error_message=repr(e), call_id=call_id )) except json.JSONDecodeError: diff --git a/libs/python/agent/agent/responses.py b/libs/python/agent/agent/responses.py index 52993750..fb034a70 100644 --- a/libs/python/agent/agent/responses.py +++ b/libs/python/agent/agent/responses.py @@ -206,6 +206,51 @@ def make_wait_item(call_id: Optional[str] = None) -> ResponseComputerToolCallPar type="computer_call" ) +# Extra anthropic computer calls +def make_left_mouse_down_item(x: Optional[int] = None, y: Optional[int] = None, call_id: Optional[str] = None) -> Dict[str, Any]: + return { + "id": random_id(), + "call_id": call_id if call_id else random_id(), + "action": { + "type": "left_mouse_down", + "x": x, + "y": y + }, + "pending_safety_checks": [], + "status": "completed", + "type": "computer_call" + } + +def make_left_mouse_up_item(x: Optional[int] = None, y: Optional[int] = None, call_id: Optional[str] = None) -> Dict[str, Any]: + return { + "id": random_id(), + "call_id": call_id if call_id else random_id(), + "action": { + "type": "left_mouse_up", + "x": x, + "y": y + }, + "pending_safety_checks": [], + "status": "completed", + "type": "computer_call" + } + +def make_failed_tool_call_items(tool_name: str, tool_kwargs: Dict[str, Any], error_message: str, call_id: Optional[str] = None) -> List[Dict[str, Any]]: + call_id = call_id if call_id else random_id() + return [ + { + "type": "function_call", + "id": random_id(), + "call_id": call_id, + "name": tool_name, + "arguments": json.dumps(tool_kwargs), + }, + { + "type": "function_call_output", + "call_id": call_id, + "output": json.dumps({"error": error_message}), + } + ] # Conversion functions between element descriptions and coordinates def convert_computer_calls_desc2xy(responses_items: List[Dict[str, Any]], desc2xy: Dict[str, tuple]) -> List[Dict[str, Any]]: