Files
computer/libs/python/agent/agent/responses.py
r33drichards 7d3d826672 fix linting and typechecking github action (#631)
* Sort imports alphabetically in agent loops __init__.py

Fix isort check failure by alphabetically sorting both the import list
and __all__ list to match the expected order.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

* run on all prs

* Fix black formatting issues in responses.py and provider.py

Apply black formatting to fix lint check failures:
- Reformat conditional expression in responses.py for better readability
- Break long assert line in provider.py to comply with line length limits

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

* Fix prettier formatting issues in markdown and TypeScript files

Apply prettier formatting to blog posts, documentation, and CLI code:
- Format blog markdown files for consistent styling
- Format TypeScript CLI source files
- Format documentation MDX files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

---------

Co-authored-by: Claude <noreply@anthropic.com>
2025-12-01 16:31:20 -05:00

801 lines
31 KiB
Python

"""
Functions for making various Responses API items from different types of responses.
Based on the OpenAI spec for Responses API items.
"""
import base64
import json
import uuid
from typing import Any, Dict, List, Literal, Optional, Union
from openai.types.responses.easy_input_message_param import EasyInputMessageParam
from openai.types.responses.response_computer_tool_call_param import (
ActionClick,
ActionDoubleClick,
ActionDrag,
ActionDragPath,
ActionKeypress,
ActionMove,
ActionScreenshot,
ActionScroll,
)
from openai.types.responses.response_computer_tool_call_param import (
ActionType as ActionTypeAction,
)
from openai.types.responses.response_computer_tool_call_param import (
ActionWait,
PendingSafetyCheck,
ResponseComputerToolCallParam,
)
from openai.types.responses.response_function_tool_call_param import (
ResponseFunctionToolCallParam,
)
from openai.types.responses.response_input_image_param import ResponseInputImageParam
from openai.types.responses.response_output_message_param import (
ResponseOutputMessageParam,
)
from openai.types.responses.response_output_text_param import ResponseOutputTextParam
from openai.types.responses.response_reasoning_item_param import (
ResponseReasoningItemParam,
Summary,
)
def random_id():
    """Return a freshly generated random (version 4) UUID as a string."""
    fresh_uuid = uuid.uuid4()
    return str(fresh_uuid)
# User message items
def make_input_image_item(image_data: Union[str, bytes]) -> EasyInputMessageParam:
    """Build a user message item that carries a single PNG image.

    Args:
        image_data: Raw image bytes, which are base64-encoded here, or a string
            that is placed into the data URL unchanged (presumably already
            base64-encoded -- confirm with callers).

    Returns:
        A user-role ``message`` item whose content is one ``input_image`` part
        holding a ``data:image/png;base64,...`` URL.
    """
    if isinstance(image_data, bytes):
        encoded = base64.b64encode(image_data).decode("utf-8")
    else:
        encoded = image_data
    image_part = ResponseInputImageParam(
        type="input_image",
        image_url=f"data:image/png;base64,{encoded}",
    )  # type: ignore
    return EasyInputMessageParam(content=[image_part], role="user", type="message")
# Text items
def make_reasoning_item(reasoning: str) -> ResponseReasoningItemParam:
    """Build a ``reasoning`` item with a single ``summary_text`` entry.

    Args:
        reasoning: Text stored as the summary of the reasoning item.

    Returns:
        A reasoning item with a freshly generated random ``id``.
    """
    summary_entry = Summary(text=reasoning, type="summary_text")
    return ResponseReasoningItemParam(
        id=random_id(),
        summary=[summary_entry],
        type="reasoning",
    )
def make_output_text_item(content: str) -> ResponseOutputMessageParam:
    """Build a completed assistant ``message`` item holding one text part.

    Args:
        content: Text placed in the single ``output_text`` part.

    Returns:
        An assistant message item with a fresh random ``id`` and an
        ``output_text`` part that has no annotations.
    """
    text_part = ResponseOutputTextParam(text=content, type="output_text", annotations=[])
    return ResponseOutputMessageParam(
        id=random_id(),
        content=[text_part],
        role="assistant",
        status="completed",
        type="message",
    )
# Function call items
def make_function_call_item(
    function_name: str, arguments: Dict[str, Any], call_id: Optional[str] = None
) -> ResponseFunctionToolCallParam:
    """Build a completed ``function_call`` item.

    Args:
        function_name: Name of the function being called.
        arguments: Call arguments; serialized to a JSON string.
        call_id: Call id to reuse; a random one is generated when falsy.

    Returns:
        A function tool call item with a freshly generated random ``id``.
    """
    item_id = random_id()
    resolved_call_id = call_id or random_id()
    serialized_arguments = json.dumps(arguments)
    return ResponseFunctionToolCallParam(
        id=item_id,
        call_id=resolved_call_id,
        name=function_name,
        arguments=serialized_arguments,
        status="completed",
        type="function_call",
    )
# Computer tool call items
def make_click_item(
    x: int,
    y: int,
    button: Literal["left", "right", "wheel", "back", "forward"] = "left",
    call_id: Optional[str] = None,
) -> ResponseComputerToolCallParam:
    """Build a completed ``computer_call`` item describing a mouse click.

    Args:
        x: X coordinate of the click.
        y: Y coordinate of the click.
        button: Mouse button to click with; defaults to ``"left"``.
        call_id: Call id to reuse; a random one is generated when falsy.

    Returns:
        A computer tool call item with a fresh random ``id``, no pending
        safety checks, and a ``click`` action.
    """
    item_id = random_id()
    resolved_call_id = call_id or random_id()
    click_action = ActionClick(button=button, type="click", x=x, y=y)
    return ResponseComputerToolCallParam(
        id=item_id,
        call_id=resolved_call_id,
        action=click_action,
        pending_safety_checks=[],
        status="completed",
        type="computer_call",
    )
def make_double_click_item(
    x: int, y: int, call_id: Optional[str] = None
) -> ResponseComputerToolCallParam:
    """Build a completed ``computer_call`` item describing a double click.

    Args:
        x: X coordinate of the double click.
        y: Y coordinate of the double click.
        call_id: Call id to reuse; a random one is generated when falsy.

    Returns:
        A computer tool call item with a fresh random ``id``, no pending
        safety checks, and a ``double_click`` action.
    """
    item_id = random_id()
    resolved_call_id = call_id or random_id()
    double_click_action = ActionDoubleClick(type="double_click", x=x, y=y)
    return ResponseComputerToolCallParam(
        id=item_id,
        call_id=resolved_call_id,
        action=double_click_action,
        pending_safety_checks=[],
        status="completed",
        type="computer_call",
    )
def make_drag_item(
    path: List[Dict[str, int]], call_id: Optional[str] = None
) -> ResponseComputerToolCallParam:
    """Build a completed ``computer_call`` item describing a drag gesture.

    Args:
        path: Points of the drag; each entry must provide ``"x"`` and ``"y"``.
        call_id: Call id to reuse; a random one is generated when falsy.

    Returns:
        A computer tool call item with a fresh random ``id``, no pending
        safety checks, and a ``drag`` action over the given path.
    """
    drag_path = []
    for point in path:
        drag_path.append(ActionDragPath(x=point["x"], y=point["y"]))
    item_id = random_id()
    resolved_call_id = call_id or random_id()
    drag_action = ActionDrag(path=drag_path, type="drag")
    return ResponseComputerToolCallParam(
        id=item_id,
        call_id=resolved_call_id,
        action=drag_action,
        pending_safety_checks=[],
        status="completed",
        type="computer_call",
    )
def make_keypress_item(
    keys: List[str], call_id: Optional[str] = None
) -> ResponseComputerToolCallParam:
    """Build a completed ``computer_call`` item describing a key press.

    Args:
        keys: Keys passed through to the ``keypress`` action.
        call_id: Call id to reuse; a random one is generated when falsy.

    Returns:
        A computer tool call item with a fresh random ``id``, no pending
        safety checks, and a ``keypress`` action.
    """
    item_id = random_id()
    resolved_call_id = call_id or random_id()
    keypress_action = ActionKeypress(keys=keys, type="keypress")
    return ResponseComputerToolCallParam(
        id=item_id,
        call_id=resolved_call_id,
        action=keypress_action,
        pending_safety_checks=[],
        status="completed",
        type="computer_call",
    )
def make_move_item(x: int, y: int, call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
    """Build a completed ``computer_call`` item describing a pointer move.

    Args:
        x: X coordinate to move to.
        y: Y coordinate to move to.
        call_id: Call id to reuse; a random one is generated when falsy.

    Returns:
        A computer tool call item with a fresh random ``id``, no pending
        safety checks, and a ``move`` action.
    """
    item_id = random_id()
    resolved_call_id = call_id or random_id()
    move_action = ActionMove(type="move", x=x, y=y)
    return ResponseComputerToolCallParam(
        id=item_id,
        call_id=resolved_call_id,
        action=move_action,
        pending_safety_checks=[],
        status="completed",
        type="computer_call",
    )
def make_screenshot_item(call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
    """Build a completed ``computer_call`` item requesting a screenshot.

    Args:
        call_id: Call id to reuse; a random one is generated when falsy.

    Returns:
        A computer tool call item with a fresh random ``id``, no pending
        safety checks, and a ``screenshot`` action.
    """
    item_id = random_id()
    resolved_call_id = call_id or random_id()
    screenshot_action = ActionScreenshot(type="screenshot")
    return ResponseComputerToolCallParam(
        id=item_id,
        call_id=resolved_call_id,
        action=screenshot_action,
        pending_safety_checks=[],
        status="completed",
        type="computer_call",
    )
def make_scroll_item(
    x: int, y: int, scroll_x: int, scroll_y: int, call_id: Optional[str] = None
) -> ResponseComputerToolCallParam:
    """Build a completed ``computer_call`` item describing a scroll.

    Args:
        x: X coordinate passed to the scroll action.
        y: Y coordinate passed to the scroll action.
        scroll_x: Horizontal scroll amount.
        scroll_y: Vertical scroll amount.
        call_id: Call id to reuse; a random one is generated when falsy.

    Returns:
        A computer tool call item with a fresh random ``id``, no pending
        safety checks, and a ``scroll`` action.
    """
    item_id = random_id()
    resolved_call_id = call_id or random_id()
    scroll_action = ActionScroll(
        scroll_x=scroll_x, scroll_y=scroll_y, type="scroll", x=x, y=y
    )
    return ResponseComputerToolCallParam(
        id=item_id,
        call_id=resolved_call_id,
        action=scroll_action,
        pending_safety_checks=[],
        status="completed",
        type="computer_call",
    )
def make_type_item(text: str, call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
    """Build a completed ``computer_call`` item that types text.

    Args:
        text: Text passed to the ``type`` action.
        call_id: Call id to reuse; a random one is generated when falsy.

    Returns:
        A computer tool call item with a fresh random ``id``, no pending
        safety checks, and a ``type`` action.
    """
    item_id = random_id()
    resolved_call_id = call_id or random_id()
    type_action = ActionTypeAction(text=text, type="type")
    return ResponseComputerToolCallParam(
        id=item_id,
        call_id=resolved_call_id,
        action=type_action,
        pending_safety_checks=[],
        status="completed",
        type="computer_call",
    )
def make_wait_item(call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
    """Build a completed ``computer_call`` item with a ``wait`` action.

    Args:
        call_id: Call id to reuse; a random one is generated when falsy.

    Returns:
        A computer tool call item with a fresh random ``id`` and no pending
        safety checks.
    """
    item_id = random_id()
    resolved_call_id = call_id or random_id()
    wait_action = ActionWait(type="wait")
    return ResponseComputerToolCallParam(
        id=item_id,
        call_id=resolved_call_id,
        action=wait_action,
        pending_safety_checks=[],
        status="completed",
        type="computer_call",
    )
# Extra anthropic computer calls
def make_left_mouse_down_item(
    x: Optional[int] = None, y: Optional[int] = None, call_id: Optional[str] = None
) -> Dict[str, Any]:
    """Build a completed ``computer_call`` dict with a ``left_mouse_down`` action.

    Returned as a plain dict rather than one of the typed OpenAI params used
    by the other computer-call builders.

    Args:
        x: Optional X coordinate stored in the action (``None`` if omitted).
        y: Optional Y coordinate stored in the action (``None`` if omitted).
        call_id: Call id to reuse; a random one is generated when falsy.

    Returns:
        A ``computer_call`` dict with a fresh random ``id`` and no pending
        safety checks.
    """
    item_id = random_id()
    resolved_call_id = call_id or random_id()
    mouse_down_action = {"type": "left_mouse_down", "x": x, "y": y}
    return {
        "id": item_id,
        "call_id": resolved_call_id,
        "action": mouse_down_action,
        "pending_safety_checks": [],
        "status": "completed",
        "type": "computer_call",
    }
def make_left_mouse_up_item(
    x: Optional[int] = None, y: Optional[int] = None, call_id: Optional[str] = None
) -> Dict[str, Any]:
    """Build a completed ``computer_call`` dict with a ``left_mouse_up`` action.

    Returned as a plain dict rather than one of the typed OpenAI params used
    by the other computer-call builders.

    Args:
        x: Optional X coordinate stored in the action (``None`` if omitted).
        y: Optional Y coordinate stored in the action (``None`` if omitted).
        call_id: Call id to reuse; a random one is generated when falsy.

    Returns:
        A ``computer_call`` dict with a fresh random ``id`` and no pending
        safety checks.
    """
    item_id = random_id()
    resolved_call_id = call_id or random_id()
    mouse_up_action = {"type": "left_mouse_up", "x": x, "y": y}
    return {
        "id": item_id,
        "call_id": resolved_call_id,
        "action": mouse_up_action,
        "pending_safety_checks": [],
        "status": "completed",
        "type": "computer_call",
    }
def make_failed_tool_call_items(
    tool_name: str, tool_kwargs: Dict[str, Any], error_message: str, call_id: Optional[str] = None
) -> List[Dict[str, Any]]:
    """Build a ``function_call`` item paired with its error output.

    Args:
        tool_name: Name recorded on the function call.
        tool_kwargs: Arguments of the call; serialized to a JSON string.
        error_message: Message stored under ``"error"`` in the JSON output.
        call_id: Call id shared by both items; random when falsy.

    Returns:
        A two-element list: the ``function_call`` item followed by a
        ``function_call_output`` item whose output is ``{"error": ...}`` as JSON.
    """
    shared_call_id = call_id or random_id()
    function_call = {
        "type": "function_call",
        "id": random_id(),
        "call_id": shared_call_id,
        "name": tool_name,
        "arguments": json.dumps(tool_kwargs),
    }
    error_output = {
        "type": "function_call_output",
        "call_id": shared_call_id,
        "output": json.dumps({"error": error_message}),
    }
    return [function_call, error_output]
def make_tool_error_item(error_message: str, call_id: Optional[str] = None) -> Dict[str, Any]:
    """Build a ``function_call_output`` item that reports an error.

    Args:
        error_message: Message stored under ``"error"`` in the JSON output.
        call_id: Call id the output belongs to; random when falsy.

    Returns:
        A ``function_call_output`` dict whose ``output`` is the JSON string
        ``{"error": error_message}``.
    """
    resolved_call_id = call_id or random_id()
    error_payload = json.dumps({"error": error_message})
    return {
        "type": "function_call_output",
        "call_id": resolved_call_id,
        "output": error_payload,
    }
def replace_failed_computer_calls_with_function_calls(
    messages: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """Rewrite failed ``computer_call`` items as ``function_call`` items.

    A computer call is treated as failed when some ``function_call_output``
    item carries the same (truthy) ``call_id``. This is needed because
    ``computer_call_output`` items do not support text output, so an error
    has to be reported through the function-call pair instead.

    Args:
        messages: List of message items to process. The list itself is not
            modified; a shallow copy is returned.

    Returns:
        A new list in which every failed computer call is replaced by a
        ``function_call`` named ``"computer"`` whose arguments are the
        JSON-serialized action. All other items are the original objects.
    """
    updated = messages.copy()

    # call_ids that are answered by a function_call_output (falsy ids are ignored)
    failed_call_ids = {
        entry.get("call_id")
        for entry in updated
        if entry.get("type") == "function_call_output" and entry.get("call_id")
    }

    for index, entry in enumerate(updated):
        if entry.get("type") != "computer_call":
            continue
        if entry.get("call_id") not in failed_call_ids:
            continue
        action = entry.get("action", {})
        updated[index] = {
            "type": "function_call",
            "id": entry.get("id", random_id()),
            "call_id": entry.get("call_id"),
            "name": "computer",
            "arguments": json.dumps(action),
        }
    return updated
# Conversion functions between element descriptions and coordinates
def convert_computer_calls_desc2xy(
    responses_items: List[Dict[str, Any]], desc2xy: Dict[str, tuple]
) -> List[Dict[str, Any]]:
    """Replace element descriptions in computer calls with x,y coordinates.

    Only ``computer_call`` items that have an ``action`` are rewritten; they
    are shallow-copied so the input items are left untouched. Every other
    item is passed through as the same object. Descriptions missing from
    ``desc2xy`` are left in place.

    Args:
        responses_items: Response items, some of which may be computer calls
            whose action uses ``element_description`` (or, for drags,
            ``start_element_description`` / ``end_element_description``).
        desc2xy: Mapping from element description to an ``(x, y)`` pair.

    Returns:
        A new list where resolved descriptions are replaced by ``x``/``y``
        (single element) or by a two-point ``path`` (drag).
    """
    converted_items = []
    for item in responses_items:
        if item.get("type") != "computer_call" or "action" not in item:
            converted_items.append(item)
            continue

        action = item["action"].copy()
        if "element_description" in action:
            description = action["element_description"]
            if description in desc2xy:
                action["x"], action["y"] = desc2xy[description]
                del action["element_description"]
        elif "start_element_description" in action and "end_element_description" in action:
            # Drag: both endpoints must resolve, otherwise nothing is changed.
            start_desc = action["start_element_description"]
            end_desc = action["end_element_description"]
            if start_desc in desc2xy and end_desc in desc2xy:
                start_x, start_y = desc2xy[start_desc]
                end_x, end_y = desc2xy[end_desc]
                action["path"] = [{"x": start_x, "y": start_y}, {"x": end_x, "y": end_y}]
                del action["start_element_description"]
                del action["end_element_description"]

        rewritten = item.copy()
        rewritten["action"] = action
        converted_items.append(rewritten)
    return converted_items
def convert_computer_calls_xy2desc(
    responses_items: List[Dict[str, Any]], desc2xy: Dict[str, tuple]
) -> List[Dict[str, Any]]:
    """Replace x,y coordinates in computer calls with element descriptions.

    Only ``computer_call`` items that have an ``action`` are rewritten; they
    are shallow-copied so the input items are left untouched. Every other
    item is passed through as the same object. Coordinates with no matching
    description are left in place.

    Args:
        responses_items: Response items, some of which may be computer calls
            whose action uses ``x``/``y`` or a two-point ``path``.
        desc2xy: Mapping from element description to an ``(x, y)`` pair. The
            pair may be a tuple or a list (e.g. after a JSON round trip). If
            several descriptions share coordinates, the last one wins.

    Returns:
        A new list where matched coordinates are replaced by
        ``element_description`` (single point) or by
        ``start_element_description`` / ``end_element_description`` (drag).
    """
    # Reverse mapping from coordinates to descriptions. Lists are unhashable,
    # so list coordinates are normalized to tuples before being used as keys.
    xy2desc = {
        (tuple(coords) if isinstance(coords, list) else coords): desc
        for desc, coords in desc2xy.items()
    }

    converted_items = []
    for item in responses_items:
        if item.get("type") != "computer_call" or "action" not in item:
            converted_items.append(item)
            continue

        action = item["action"].copy()
        if "x" in action and "y" in action:
            coords = (action["x"], action["y"])
            if coords in xy2desc:
                action["element_description"] = xy2desc[coords]
                del action["x"]
                del action["y"]
        elif "path" in action and isinstance(action["path"], list) and len(action["path"]) == 2:
            # Drag: only a path of exactly two fully specified points is converted.
            start_point, end_point = action["path"]
            if (
                "x" in start_point
                and "y" in start_point
                and "x" in end_point
                and "y" in end_point
            ):
                start_coords = (start_point["x"], start_point["y"])
                end_coords = (end_point["x"], end_point["y"])
                if start_coords in xy2desc and end_coords in xy2desc:
                    action["start_element_description"] = xy2desc[start_coords]
                    action["end_element_description"] = xy2desc[end_coords]
                    del action["path"]

        converted_item = item.copy()
        converted_item["action"] = action
        converted_items.append(converted_item)
    return converted_items
def get_all_element_descriptions(responses_items: List[Dict[str, Any]]) -> List[str]:
    """Collect the unique element descriptions used by computer calls.

    Looks at the ``action`` of every ``computer_call`` item and gathers the
    values of ``element_description``, ``start_element_description`` and
    ``end_element_description`` when present.

    Args:
        responses_items: List of response items containing computer calls.

    Returns:
        Unique element descriptions in first-seen order, so the result is
        stable from run to run.
    """
    description_keys = (
        "element_description",
        "start_element_description",
        "end_element_description",
    )
    # A dict keeps insertion order while de-duplicating; a set would make the
    # order depend on per-process string hashing.
    seen: Dict[str, None] = {}
    for item in responses_items:
        if item.get("type") != "computer_call" or "action" not in item:
            continue
        action = item["action"]
        for key in description_keys:
            if key in action:
                seen.setdefault(action[key], None)
    return list(seen)
# Conversion functions between responses_items and completion messages formats
def _append_assistant_tool_call(
    completion_messages: List[Dict[str, Any]], call_id: Any, name: Any, arguments: Any
) -> None:
    """Attach a tool call to the trailing assistant message, creating one if needed."""
    if not completion_messages or completion_messages[-1]["role"] != "assistant":
        completion_messages.append({"role": "assistant", "content": "", "tool_calls": []})
    assistant_message = completion_messages[-1]
    if "tool_calls" not in assistant_message:
        assistant_message["tool_calls"] = []
    assistant_message["tool_calls"].append(
        {
            "id": call_id,
            "type": "function",
            "function": {"name": name, "arguments": arguments},
        }
    )


def convert_responses_items_to_completion_messages(
    messages: List[Dict[str, Any]],
    allow_images_in_tool_results: bool = True,
    send_multiple_user_images_per_parallel_tool_results: bool = False,
) -> List[Dict[str, Any]]:
    """Convert responses_items message format to liteLLM completion format.

    Conversion rules:
      * User messages keep string content as-is; list content is mapped part
        by part (``input_image`` -> ``image_url``, ``input_text``/``text`` ->
        ``text``; other part types are skipped).
      * Assistant messages become one assistant message whose content is the
        newline-joined ``output_text``/``text`` parts. Non-empty plain string
        content is forwarded unchanged.
      * ``reasoning`` items become an assistant message built from their
        ``summary_text`` entries.
      * ``function_call`` / ``computer_call`` items are attached as tool calls
        to the trailing assistant message (a new one is created if the last
        message is not from the assistant). Computer calls are named
        ``"computer"`` and carry the JSON-serialized action as arguments.
      * ``function_call_output`` / ``computer_call_output`` items become
        ``tool`` messages; image outputs are handled according to the flags
        below, anything else is stringified.

    Args:
        messages: List of responses_items format messages.
        allow_images_in_tool_results: If True, include images in tool role messages.
            If False, send tool message + separate user message with image.
        send_multiple_user_images_per_parallel_tool_results: Only relevant when
            ``allow_images_in_tool_results`` is False. If False, the user image
            message is omitted whenever the next item is also a
            ``computer_call_output``. If True, every image result gets its own
            user image message.

    Returns:
        The list of completion-format messages.
    """
    screenshot_placeholder = "[Execution completed. See screenshot below]"
    completion_messages: List[Dict[str, Any]] = []

    for i, message in enumerate(messages):
        msg_type = message.get("type")
        role = message.get("role")

        # User messages (both with and without explicit type)
        if role == "user" or msg_type == "user":
            content = message.get("content", "")
            if isinstance(content, list):
                completion_content = []
                for part in content:
                    part_type = part.get("type")
                    if part_type == "input_image":
                        completion_content.append(
                            {"type": "image_url", "image_url": {"url": part.get("image_url")}}
                        )
                    elif part_type in ("input_text", "text"):
                        completion_content.append({"type": "text", "text": part.get("text")})
                completion_messages.append({"role": "user", "content": completion_content})
            elif isinstance(content, str):
                completion_messages.append({"role": "user", "content": content})

        # Assistant messages
        elif role == "assistant" or msg_type == "message":
            content = message.get("content", [])
            if isinstance(content, list):
                text_parts = [
                    part.get("text", "")
                    for part in content
                    if part.get("type") in ("output_text", "text")
                ]
                if text_parts:
                    completion_messages.append(
                        {"role": "assistant", "content": "\n".join(text_parts)}
                    )
            elif isinstance(content, str) and content:
                # Plain string content would otherwise be dropped from the
                # conversation; the user branch accepts it, so do the same here.
                completion_messages.append({"role": "assistant", "content": content})

        # Reasoning items (converted to an assistant message)
        elif msg_type == "reasoning":
            text_parts = [
                entry.get("text", "")
                for entry in message.get("summary", [])
                if entry.get("type") == "summary_text"
            ]
            if text_parts:
                completion_messages.append({"role": "assistant", "content": "\n".join(text_parts)})

        # Function calls
        elif msg_type == "function_call":
            _append_assistant_tool_call(
                completion_messages,
                message.get("call_id"),
                message.get("name"),
                message.get("arguments"),
            )

        # Computer calls
        elif msg_type == "computer_call":
            _append_assistant_tool_call(
                completion_messages,
                message.get("call_id"),
                "computer",
                json.dumps(message.get("action", {})),
            )

        # Function/computer call outputs
        elif msg_type in ("function_call_output", "computer_call_output"):
            output = message.get("output")
            call_id = message.get("call_id")
            is_image = isinstance(output, dict) and output.get("type") == "input_image"

            if not is_image:
                # Text (or any non-image) output as tool response
                completion_messages.append(
                    {"role": "tool", "tool_call_id": call_id, "content": str(output)}
                )
            elif allow_images_in_tool_results:
                # Image output as tool response (may not work with all APIs)
                completion_messages.append(
                    {
                        "role": "tool",
                        "tool_call_id": call_id,
                        "content": [
                            {"type": "image_url", "image_url": {"url": output.get("image_url")}}
                        ],
                    }
                )
            else:
                # Tool message + separate user message with image (OpenAI compatible)
                next_type = messages[i + 1].get("type") if i + 1 < len(messages) else None
                is_next_message_image_result = next_type == "computer_call_output"
                completion_messages.append(
                    {
                        "role": "tool",
                        "tool_call_id": call_id,
                        "content": screenshot_placeholder,
                    }
                )
                if (
                    send_multiple_user_images_per_parallel_tool_results
                    or not is_next_message_image_result
                ):
                    completion_messages.append(
                        {
                            "role": "user",
                            "content": [
                                {
                                    "type": "image_url",
                                    "image_url": {"url": output.get("image_url")},
                                }
                            ],
                        }
                    )

    return completion_messages
def convert_completion_messages_to_responses_items(
    completion_messages: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """Convert completion messages format to responses_items message format.

    Conversion rules:
      * Assistant messages with non-empty string content become an assistant
        ``message`` item with one ``output_text`` part.
      * Tool calls of type ``function`` named ``"computer"`` become
        ``computer_call`` items when their arguments parse to a JSON object
        (an ``"action"`` key is renamed to ``"type"``). All other function
        tool calls, including ``computer`` calls with unusable arguments,
        become ``function_call`` items.
      * Tool messages become ``computer_call_output`` or
        ``function_call_output`` items. The placeholder text
        ``"[Execution completed. See screenshot below]"`` followed by a user
        message containing an image is folded into a single image output, and
        that user message is skipped.
      * User messages are converted part by part (``image_url`` ->
        ``input_image``, ``text`` -> ``input_text``); string content is kept.

    Note: because the tool-call check comes before the ``elif`` branches, a
    message that carries ``tool_calls`` is never treated as a tool or user
    message.

    Args:
        completion_messages: Messages in chat-completion format.

    Returns:
        The list of responses_items.
    """
    screenshot_placeholder = "[Execution completed. See screenshot below]"
    responses_items: List[Dict[str, Any]] = []
    skip_next = False

    for i, message in enumerate(completion_messages):
        if skip_next:
            skip_next = False
            continue

        role = message.get("role")
        content = message.get("content")
        tool_calls = message.get("tool_calls", [])

        # Assistant messages with text content
        if role == "assistant" and content and isinstance(content, str):
            responses_items.append(
                {
                    "type": "message",
                    "role": "assistant",
                    "content": [{"type": "output_text", "text": content}],
                }
            )

        # Tool calls
        if tool_calls:
            for tool_call in tool_calls:
                if tool_call.get("type") != "function":
                    continue
                function = tool_call.get("function", {})
                function_name = function.get("name")
                raw_arguments = function.get("arguments", "{}")

                action = None
                if function_name == "computer":
                    try:
                        parsed_arguments = json.loads(raw_arguments)
                    except (json.JSONDecodeError, TypeError):
                        # Invalid JSON, or arguments that are not a string at
                        # all (e.g. None): use the function_call fallback.
                        parsed_arguments = None
                    # Only a JSON object can serve as an action.
                    if isinstance(parsed_arguments, dict):
                        action = parsed_arguments

                if action is not None:
                    # Change key from "action" -> "type"
                    if action.get("action"):
                        action["type"] = action.pop("action")
                    responses_items.append(
                        {
                            "type": "computer_call",
                            "call_id": tool_call.get("id"),
                            "action": action,
                            "status": "completed",
                        }
                    )
                else:
                    # Regular function call, or fallback for an unusable computer call
                    responses_items.append(
                        {
                            "type": "function_call",
                            "call_id": tool_call.get("id"),
                            "name": function_name,
                            "arguments": raw_arguments,
                            "status": "completed",
                        }
                    )

        # Tool messages (function/computer call outputs)
        elif role == "tool" and content:
            tool_call_id = message.get("tool_call_id")
            if isinstance(content, str):
                if content == screenshot_placeholder:
                    # Look ahead for a following user message that carries the image.
                    image_part = None
                    next_idx = i + 1
                    if (
                        next_idx < len(completion_messages)
                        and completion_messages[next_idx].get("role") == "user"
                        and isinstance(completion_messages[next_idx].get("content"), list)
                    ):
                        image_part = next(
                            (
                                part
                                for part in completion_messages[next_idx]["content"]
                                if part.get("type") == "image_url"
                            ),
                            None,
                        )
                    if image_part is not None:
                        responses_items.append(
                            {
                                "type": "computer_call_output",
                                "call_id": tool_call_id,
                                "output": {
                                    "type": "input_image",
                                    "image_url": image_part.get("image_url", {}).get("url"),
                                },
                            }
                        )
                        # Skip the next user message since we processed it
                        skip_next = True
                    else:
                        # No image to pair with (no user message follows, or
                        # it has no image part): keep the output as plain
                        # text so the call still has a result.
                        responses_items.append(
                            {
                                "type": "computer_call_output",
                                "call_id": tool_call_id,
                                "output": content,
                            }
                        )
                else:
                    try:
                        parsed_content = json.loads(content)
                    except json.JSONDecodeError:
                        parsed_content = None
                    if not isinstance(parsed_content, dict):
                        # Plain text output. JSON scalars and arrays ("42",
                        # "true", "[1]") are treated as text too, because
                        # they have no "type" field to inspect.
                        responses_items.append(
                            {
                                "type": "function_call_output",
                                "call_id": tool_call_id,
                                "output": content,
                            }
                        )
                    elif parsed_content.get("type") == "input_image":
                        responses_items.append(
                            {
                                "type": "computer_call_output",
                                "call_id": tool_call_id,
                                "output": parsed_content,
                            }
                        )
                    else:
                        responses_items.append(
                            {
                                "type": "computer_call_output",
                                "call_id": tool_call_id,
                                "output": content,
                            }
                        )
            elif isinstance(content, list):
                # Structured content (e.g., images)
                for part in content:
                    if part.get("type") == "image_url":
                        responses_items.append(
                            {
                                "type": "computer_call_output",
                                "call_id": tool_call_id,
                                "output": {
                                    "type": "input_image",
                                    "image_url": part.get("image_url", {}).get("url"),
                                },
                            }
                        )
                    elif part.get("type") == "text":
                        responses_items.append(
                            {
                                "type": "function_call_output",
                                "call_id": tool_call_id,
                                "output": part.get("text"),
                            }
                        )

        # Actual user messages
        elif role == "user" and content:
            if isinstance(content, list):
                # Structured user content (e.g., text + images)
                user_content = []
                for part in content:
                    if part.get("type") == "image_url":
                        user_content.append(
                            {
                                "type": "input_image",
                                "image_url": part.get("image_url", {}).get("url"),
                            }
                        )
                    elif part.get("type") == "text":
                        user_content.append({"type": "input_text", "text": part.get("text")})
                if user_content:
                    responses_items.append(
                        {"role": "user", "type": "message", "content": user_content}
                    )
            elif isinstance(content, str):
                # Simple text user message
                responses_items.append({"role": "user", "content": content})

    return responses_items