mirror of
https://github.com/trycua/computer.git
synced 2026-01-04 20:40:15 -06:00
* Sort imports alphabetically in agent loops __init__.py Fix isort check failure by alphabetically sorting both the import list and __all__ list to match the expected order. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com> * run on all prs * Fix black formatting issues in responses.py and provider.py Apply black formatting to fix lint check failures: - Reformat conditional expression in responses.py for better readability - Break long assert line in provider.py to comply with line length limits 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com> * Fix prettier formatting issues in markdown and TypeScript files Apply prettier formatting to blog posts, documentation, and CLI code: - Format blog markdown files for consistent styling - Format TypeScript CLI source files - Format documentation MDX files 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com> --------- Co-authored-by: Claude <noreply@anthropic.com>
801 lines
31 KiB
Python
801 lines
31 KiB
Python
"""
|
|
Functions for making various Responses API items from different types of responses.
|
|
Based on the OpenAI spec for Responses API items.
|
|
"""
|
|
|
|
import base64
|
|
import json
|
|
import uuid
|
|
from typing import Any, Dict, List, Literal, Optional, Union
|
|
|
|
from openai.types.responses.easy_input_message_param import EasyInputMessageParam
|
|
from openai.types.responses.response_computer_tool_call_param import (
|
|
ActionClick,
|
|
ActionDoubleClick,
|
|
ActionDrag,
|
|
ActionDragPath,
|
|
ActionKeypress,
|
|
ActionMove,
|
|
ActionScreenshot,
|
|
ActionScroll,
|
|
)
|
|
from openai.types.responses.response_computer_tool_call_param import (
|
|
ActionType as ActionTypeAction,
|
|
)
|
|
from openai.types.responses.response_computer_tool_call_param import (
|
|
ActionWait,
|
|
PendingSafetyCheck,
|
|
ResponseComputerToolCallParam,
|
|
)
|
|
from openai.types.responses.response_function_tool_call_param import (
|
|
ResponseFunctionToolCallParam,
|
|
)
|
|
from openai.types.responses.response_input_image_param import ResponseInputImageParam
|
|
from openai.types.responses.response_output_message_param import (
|
|
ResponseOutputMessageParam,
|
|
)
|
|
from openai.types.responses.response_output_text_param import ResponseOutputTextParam
|
|
from openai.types.responses.response_reasoning_item_param import (
|
|
ResponseReasoningItemParam,
|
|
Summary,
|
|
)
|
|
|
|
|
|
def random_id():
    """Return a newly generated UUID4, rendered as a string."""
    fresh_uuid = uuid.uuid4()
    return str(fresh_uuid)
|
|
|
|
|
|
# User message items
|
|
def make_input_image_item(image_data: Union[str, bytes]) -> EasyInputMessageParam:
    """Wrap an image in a user message holding a single ``input_image`` part.

    Args:
        image_data: One of
            * ``bytes`` - base64-encoded here and embedded in a
              ``data:image/png;base64,`` URL;
            * a ``str`` that already starts with ``data:``, ``http://`` or
              ``https://`` - used unchanged;
            * any other ``str`` - treated as an already base64-encoded
              payload and given the ``data:image/png;base64,`` prefix.

    Returns:
        A ``message`` item with role ``user`` whose content is one
        ``input_image`` entry.
    """
    if isinstance(image_data, bytes):
        encoded = base64.b64encode(image_data).decode("utf-8")
        image_url = f"data:image/png;base64,{encoded}"
    elif isinstance(image_data, str) and image_data.startswith(
        ("data:", "http://", "https://")
    ):
        # Already a complete URL: adding the data-URL prefix again would
        # corrupt it. Base64 text never contains ":", so a bare payload
        # cannot be mistaken for a URL here.
        image_url = image_data
    else:
        image_url = f"data:image/png;base64,{image_data}"

    return EasyInputMessageParam(
        content=[
            ResponseInputImageParam(
                type="input_image",
                image_url=image_url,
            )  # type: ignore
        ],
        role="user",
        type="message",
    )
|
|
|
|
|
|
# Text items
|
|
def make_reasoning_item(reasoning: str) -> ResponseReasoningItemParam:
    """Build a ``reasoning`` item carrying ``reasoning`` as its only summary.

    Args:
        reasoning: Text stored in a single ``summary_text`` summary entry.

    Returns:
        A reasoning item with a randomly generated ``id``.
    """
    summary_entry = Summary(text=reasoning, type="summary_text")
    return ResponseReasoningItemParam(
        id=random_id(),
        summary=[summary_entry],
        type="reasoning",
    )
|
|
|
|
|
|
def make_output_text_item(content: str) -> ResponseOutputMessageParam:
    """Build a completed assistant ``message`` item containing one text part.

    Args:
        content: Text placed in a single ``output_text`` part with no annotations.

    Returns:
        An assistant message item with a randomly generated ``id``.
    """
    text_part = ResponseOutputTextParam(text=content, type="output_text", annotations=[])
    return ResponseOutputMessageParam(
        id=random_id(),
        content=[text_part],
        role="assistant",
        status="completed",
        type="message",
    )
|
|
|
|
|
|
# Function call items
|
|
def make_function_call_item(
    function_name: str, arguments: Dict[str, Any], call_id: Optional[str] = None
) -> ResponseFunctionToolCallParam:
    """Build a completed ``function_call`` item.

    Args:
        function_name: Stored as the item's ``name``.
        arguments: Serialised with ``json.dumps`` into the item's ``arguments``.
        call_id: Call identifier to reuse; a random one is generated when falsy.

    Returns:
        A function call item with a randomly generated ``id``.
    """
    serialized_arguments = json.dumps(arguments)
    return ResponseFunctionToolCallParam(
        id=random_id(),
        call_id=call_id or random_id(),
        name=function_name,
        arguments=serialized_arguments,
        status="completed",
        type="function_call",
    )
|
|
|
|
|
|
# Computer tool call items
|
|
def make_click_item(
    x: int,
    y: int,
    button: Literal["left", "right", "wheel", "back", "forward"] = "left",
    call_id: Optional[str] = None,
) -> ResponseComputerToolCallParam:
    """Build a completed ``computer_call`` item whose action is a ``click``.

    Args:
        x: Stored as the action's ``x``.
        y: Stored as the action's ``y``.
        button: Stored as the action's ``button``.
        call_id: Call identifier to reuse; a random one is generated when falsy.

    Returns:
        A computer call item with a random ``id`` and no pending safety checks.
    """
    click_action = ActionClick(button=button, type="click", x=x, y=y)
    return ResponseComputerToolCallParam(
        id=random_id(),
        call_id=call_id or random_id(),
        action=click_action,
        pending_safety_checks=[],
        status="completed",
        type="computer_call",
    )
|
|
|
|
|
|
def make_double_click_item(
    x: int, y: int, call_id: Optional[str] = None
) -> ResponseComputerToolCallParam:
    """Build a completed ``computer_call`` item whose action is a ``double_click``.

    Args:
        x: Stored as the action's ``x``.
        y: Stored as the action's ``y``.
        call_id: Call identifier to reuse; a random one is generated when falsy.

    Returns:
        A computer call item with a random ``id`` and no pending safety checks.
    """
    double_click_action = ActionDoubleClick(type="double_click", x=x, y=y)
    return ResponseComputerToolCallParam(
        id=random_id(),
        call_id=call_id or random_id(),
        action=double_click_action,
        pending_safety_checks=[],
        status="completed",
        type="computer_call",
    )
|
|
|
|
|
|
def make_drag_item(
    path: List[Dict[str, int]], call_id: Optional[str] = None
) -> ResponseComputerToolCallParam:
    """Build a completed ``computer_call`` item whose action is a ``drag``.

    Args:
        path: Points of the drag; each entry must provide ``"x"`` and ``"y"`` keys.
        call_id: Call identifier to reuse; a random one is generated when falsy.

    Returns:
        A computer call item with a random ``id`` and no pending safety checks.
    """
    waypoints = []
    for point in path:
        waypoints.append(ActionDragPath(x=point["x"], y=point["y"]))

    drag_action = ActionDrag(path=waypoints, type="drag")
    return ResponseComputerToolCallParam(
        id=random_id(),
        call_id=call_id or random_id(),
        action=drag_action,
        pending_safety_checks=[],
        status="completed",
        type="computer_call",
    )
|
|
|
|
|
|
def make_keypress_item(
    keys: List[str], call_id: Optional[str] = None
) -> ResponseComputerToolCallParam:
    """Build a completed ``computer_call`` item whose action is a ``keypress``.

    Args:
        keys: Stored as the action's ``keys``.
        call_id: Call identifier to reuse; a random one is generated when falsy.

    Returns:
        A computer call item with a random ``id`` and no pending safety checks.
    """
    keypress_action = ActionKeypress(keys=keys, type="keypress")
    return ResponseComputerToolCallParam(
        id=random_id(),
        call_id=call_id or random_id(),
        action=keypress_action,
        pending_safety_checks=[],
        status="completed",
        type="computer_call",
    )
|
|
|
|
|
|
def make_move_item(x: int, y: int, call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
    """Build a completed ``computer_call`` item whose action is a ``move``.

    Args:
        x: Stored as the action's ``x``.
        y: Stored as the action's ``y``.
        call_id: Call identifier to reuse; a random one is generated when falsy.

    Returns:
        A computer call item with a random ``id`` and no pending safety checks.
    """
    move_action = ActionMove(type="move", x=x, y=y)
    return ResponseComputerToolCallParam(
        id=random_id(),
        call_id=call_id or random_id(),
        action=move_action,
        pending_safety_checks=[],
        status="completed",
        type="computer_call",
    )
|
|
|
|
|
|
def make_screenshot_item(call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
    """Build a completed ``computer_call`` item whose action is a ``screenshot``.

    Args:
        call_id: Call identifier to reuse; a random one is generated when falsy.

    Returns:
        A computer call item with a random ``id`` and no pending safety checks.
    """
    screenshot_action = ActionScreenshot(type="screenshot")
    return ResponseComputerToolCallParam(
        id=random_id(),
        call_id=call_id or random_id(),
        action=screenshot_action,
        pending_safety_checks=[],
        status="completed",
        type="computer_call",
    )
|
|
|
|
|
|
def make_scroll_item(
    x: int, y: int, scroll_x: int, scroll_y: int, call_id: Optional[str] = None
) -> ResponseComputerToolCallParam:
    """Build a completed ``computer_call`` item whose action is a ``scroll``.

    Args:
        x: Stored as the action's ``x``.
        y: Stored as the action's ``y``.
        scroll_x: Stored as the action's ``scroll_x``.
        scroll_y: Stored as the action's ``scroll_y``.
        call_id: Call identifier to reuse; a random one is generated when falsy.

    Returns:
        A computer call item with a random ``id`` and no pending safety checks.
    """
    scroll_action = ActionScroll(
        scroll_x=scroll_x,
        scroll_y=scroll_y,
        type="scroll",
        x=x,
        y=y,
    )
    return ResponseComputerToolCallParam(
        id=random_id(),
        call_id=call_id or random_id(),
        action=scroll_action,
        pending_safety_checks=[],
        status="completed",
        type="computer_call",
    )
|
|
|
|
|
|
def make_type_item(text: str, call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
    """Build a completed ``computer_call`` item whose action is a ``type``.

    Args:
        text: Stored as the action's ``text``.
        call_id: Call identifier to reuse; a random one is generated when falsy.

    Returns:
        A computer call item with a random ``id`` and no pending safety checks.
    """
    type_action = ActionTypeAction(text=text, type="type")
    return ResponseComputerToolCallParam(
        id=random_id(),
        call_id=call_id or random_id(),
        action=type_action,
        pending_safety_checks=[],
        status="completed",
        type="computer_call",
    )
|
|
|
|
|
|
def make_wait_item(call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
    """Build a completed ``computer_call`` item whose action is a ``wait``.

    Args:
        call_id: Call identifier to reuse; a random one is generated when falsy.

    Returns:
        A computer call item with a random ``id`` and no pending safety checks.
    """
    wait_action = ActionWait(type="wait")
    return ResponseComputerToolCallParam(
        id=random_id(),
        call_id=call_id or random_id(),
        action=wait_action,
        pending_safety_checks=[],
        status="completed",
        type="computer_call",
    )
|
|
|
|
|
|
# Extra anthropic computer calls
|
|
def make_left_mouse_down_item(
    x: Optional[int] = None, y: Optional[int] = None, call_id: Optional[str] = None
) -> Dict[str, Any]:
    """Build a completed ``computer_call`` dict with a ``left_mouse_down`` action.

    Args:
        x: Stored as the action's ``x`` (may be ``None``).
        y: Stored as the action's ``y`` (may be ``None``).
        call_id: Call identifier to reuse; a random one is generated when falsy.

    Returns:
        A plain dict with a random ``id`` and no pending safety checks.
    """
    item: Dict[str, Any] = {"id": random_id()}
    item["call_id"] = call_id or random_id()
    item["action"] = {"type": "left_mouse_down", "x": x, "y": y}
    item["pending_safety_checks"] = []
    item["status"] = "completed"
    item["type"] = "computer_call"
    return item
|
|
|
|
|
|
def make_left_mouse_up_item(
    x: Optional[int] = None, y: Optional[int] = None, call_id: Optional[str] = None
) -> Dict[str, Any]:
    """Build a completed ``computer_call`` dict with a ``left_mouse_up`` action.

    Args:
        x: Stored as the action's ``x`` (may be ``None``).
        y: Stored as the action's ``y`` (may be ``None``).
        call_id: Call identifier to reuse; a random one is generated when falsy.

    Returns:
        A plain dict with a random ``id`` and no pending safety checks.
    """
    item: Dict[str, Any] = {"id": random_id()}
    item["call_id"] = call_id or random_id()
    item["action"] = {"type": "left_mouse_up", "x": x, "y": y}
    item["pending_safety_checks"] = []
    item["status"] = "completed"
    item["type"] = "computer_call"
    return item
|
|
|
|
|
|
def make_failed_tool_call_items(
    tool_name: str, tool_kwargs: Dict[str, Any], error_message: str, call_id: Optional[str] = None
) -> List[Dict[str, Any]]:
    """Build a ``function_call`` item and its matching error output item.

    Args:
        tool_name: Stored as the call's ``name``.
        tool_kwargs: Serialised with ``json.dumps`` into the call's ``arguments``.
        error_message: Wrapped as ``{"error": error_message}`` and JSON-encoded
            into the output item's ``output``.
        call_id: Identifier shared by both items; a random one is generated when falsy.

    Returns:
        A two-element list: the ``function_call`` item, then the
        ``function_call_output`` item carrying the same ``call_id``.
    """
    shared_call_id = call_id or random_id()

    call_item = {
        "type": "function_call",
        "id": random_id(),
        "call_id": shared_call_id,
        "name": tool_name,
        "arguments": json.dumps(tool_kwargs),
    }
    output_item = {
        "type": "function_call_output",
        "call_id": shared_call_id,
        "output": json.dumps({"error": error_message}),
    }
    return [call_item, output_item]
|
|
|
|
|
|
def make_tool_error_item(error_message: str, call_id: Optional[str] = None) -> Dict[str, Any]:
    """Build a ``function_call_output`` item that reports an error.

    Args:
        error_message: Wrapped as ``{"error": error_message}`` and JSON-encoded
            into the item's ``output``.
        call_id: Call identifier to reuse; a random one is generated when falsy.

    Returns:
        The ``function_call_output`` dict.
    """
    if not call_id:
        call_id = random_id()

    error_payload = json.dumps({"error": error_message})
    return {
        "type": "function_call_output",
        "call_id": call_id,
        "output": error_payload,
    }
|
|
|
|
|
|
def replace_failed_computer_calls_with_function_calls(
    messages: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """
    Swap computer_call items for function_call items when they failed.

    A computer_call counts as failed when a function_call_output item with the
    same call_id exists anywhere in ``messages``. This is done because
    computer_call_output items do not support text output.

    Args:
        messages: List of message items to process

    Returns:
        A new list; the input list is not modified. Items that are not
        replaced are the same dict objects as in the input.
    """
    # call_ids that have a (truthy) function_call_output attached to them
    failed_call_ids = {
        item.get("call_id")
        for item in messages
        if item.get("type") == "function_call_output" and item.get("call_id")
    }

    rewritten = []
    for item in messages:
        is_failed_computer_call = (
            item.get("type") == "computer_call" and item.get("call_id") in failed_call_ids
        )
        if not is_failed_computer_call:
            rewritten.append(item)
            continue

        # Re-express the computer action as the arguments of a "computer" function call
        rewritten.append(
            {
                "type": "function_call",
                "id": item.get("id", random_id()),
                "call_id": item.get("call_id"),
                "name": "computer",
                "arguments": json.dumps(item.get("action", {})),
            }
        )

    return rewritten
|
|
|
|
|
|
# Conversion functions between element descriptions and coordinates
|
|
def convert_computer_calls_desc2xy(
    responses_items: List[Dict[str, Any]], desc2xy: Dict[str, tuple]
) -> List[Dict[str, Any]]:
    """
    Convert computer calls from element descriptions to x,y coordinates.

    Only ``computer_call`` items that have an ``action`` are rewritten; every
    other item is passed through as the same object. An action is left
    unchanged when a description it refers to is missing from ``desc2xy``.

    Args:
        responses_items: List of response items containing computer calls with element_description
        desc2xy: Dictionary mapping element descriptions to (x, y) coordinate tuples

    Returns:
        List of response items with element_description replaced by x,y coordinates
    """
    result = []

    for original in responses_items:
        if original.get("type") != "computer_call" or "action" not in original:
            result.append(original)
            continue

        # Work on shallow copies so the caller's item and action stay untouched
        new_action = original["action"].copy()

        if "element_description" in new_action:
            # Single target element
            description = new_action["element_description"]
            if description in desc2xy:
                x, y = desc2xy[description]
                new_action["x"] = x
                new_action["y"] = y
                new_action.pop("element_description")
        elif "start_element_description" in new_action and "end_element_description" in new_action:
            # Drag: both endpoints must be known before anything is replaced
            start_key = new_action["start_element_description"]
            end_key = new_action["end_element_description"]
            if start_key in desc2xy and end_key in desc2xy:
                start_x, start_y = desc2xy[start_key]
                end_x, end_y = desc2xy[end_key]
                new_action["path"] = [{"x": start_x, "y": start_y}, {"x": end_x, "y": end_y}]
                new_action.pop("start_element_description")
                new_action.pop("end_element_description")

        new_item = original.copy()
        new_item["action"] = new_action
        result.append(new_item)

    return result
|
|
|
|
|
|
def convert_computer_calls_xy2desc(
    responses_items: List[Dict[str, Any]], desc2xy: Dict[str, tuple]
) -> List[Dict[str, Any]]:
    """
    Convert computer calls from x,y coordinates to element descriptions.

    Only ``computer_call`` items that have an ``action`` are rewritten; every
    other item is passed through as the same object. An action is left
    unchanged when its coordinates have no entry in ``desc2xy``. For two-point
    ``path`` actions both endpoints must be known for the path to be replaced.

    Args:
        responses_items: List of response items containing computer calls with x,y coordinates
        desc2xy: Dictionary mapping element descriptions to (x, y) coordinates.
            Coordinates may be tuples or lists (e.g. after a JSON round trip).
            If several descriptions share the same coordinates, the last one
            in iteration order wins.

    Returns:
        List of response items with x,y coordinates replaced by element_description
    """

    def _hashable(coords: Any) -> Any:
        # Lists are unhashable and would make the reverse mapping raise
        # TypeError; a list [x, y] is looked up as the tuple (x, y).
        return tuple(coords) if isinstance(coords, list) else coords

    # Create reverse mapping from coordinates to descriptions
    xy2desc = {_hashable(coords): desc for desc, coords in desc2xy.items()}

    converted_items = []

    for item in responses_items:
        if item.get("type") != "computer_call" or "action" not in item:
            converted_items.append(item)
            continue

        # Shallow copy so the caller's action dict is not mutated
        action = item["action"].copy()

        if "x" in action and "y" in action:
            # Handle single x,y coordinates
            coords = (action["x"], action["y"])
            if coords in xy2desc:
                action["element_description"] = xy2desc[coords]
                del action["x"]
                del action["y"]
        elif "path" in action and isinstance(action["path"], list) and len(action["path"]) == 2:
            # Handle path for drag operations (exactly start and end point)
            start_point, end_point = action["path"]
            if all("x" in point and "y" in point for point in (start_point, end_point)):
                start_coords = (start_point["x"], start_point["y"])
                end_coords = (end_point["x"], end_point["y"])
                if start_coords in xy2desc and end_coords in xy2desc:
                    action["start_element_description"] = xy2desc[start_coords]
                    action["end_element_description"] = xy2desc[end_coords]
                    del action["path"]

        converted_item = item.copy()
        converted_item["action"] = action
        converted_items.append(converted_item)

    return converted_items
|
|
|
|
|
|
def get_all_element_descriptions(responses_items: List[Dict[str, Any]]) -> List[str]:
    """
    Extract all element descriptions from computer calls in responses items.

    Looks at the ``element_description``, ``start_element_description`` and
    ``end_element_description`` keys of each ``computer_call`` item's action.

    Args:
        responses_items: List of response items containing computer calls

    Returns:
        List of unique element descriptions found in computer calls, in the
        order they were first encountered (so the result is reproducible
        between runs).
    """
    description_keys = (
        "element_description",
        "start_element_description",
        "end_element_description",
    )

    # A dict is used as an insertion-ordered set: a plain set would return the
    # descriptions in an arbitrary order that can differ between runs.
    seen: Dict[str, None] = {}

    for item in responses_items:
        if item.get("type") != "computer_call" or "action" not in item:
            continue
        action = item["action"]
        for key in description_keys:
            if key in action:
                seen.setdefault(action[key], None)

    return list(seen)
|
|
|
|
|
|
# Conversion functions between responses_items and completion messages formats
|
|
def _user_content_to_completion_parts(content: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Translate a user content list (images/text blocks) into completion content parts."""
    parts: List[Dict[str, Any]] = []
    for item in content:
        item_type = item.get("type")
        if item_type == "input_image":
            parts.append({"type": "image_url", "image_url": {"url": item.get("image_url")}})
        elif item_type in ("input_text", "text"):
            parts.append({"type": "text", "text": item.get("text")})
    return parts


def _append_tool_call(
    completion_messages: List[Dict[str, Any]], call_id: Any, name: Any, arguments: Any
) -> None:
    """Attach a tool call to the trailing assistant message, creating one if needed."""
    if not completion_messages or completion_messages[-1]["role"] != "assistant":
        completion_messages.append({"role": "assistant", "content": "", "tool_calls": []})

    completion_messages[-1].setdefault("tool_calls", []).append(
        {
            "id": call_id,
            "type": "function",
            "function": {"name": name, "arguments": arguments},
        }
    )


def convert_responses_items_to_completion_messages(
    messages: List[Dict[str, Any]],
    allow_images_in_tool_results: bool = True,
    send_multiple_user_images_per_parallel_tool_results: bool = False,
) -> List[Dict[str, Any]]:
    """Convert responses_items message format to liteLLM completion format.

    Handled item kinds:
      * user messages (string or list content) -> ``user`` messages;
      * assistant messages -> ``assistant`` messages (text parts of list
        content joined with newlines, or the string content itself);
      * ``reasoning`` items -> ``assistant`` messages built from their
        ``summary_text`` entries;
      * ``function_call`` / ``computer_call`` -> tool calls attached to the
        trailing assistant message (computer calls use the name ``computer``
        and the JSON-encoded action as arguments);
      * ``function_call_output`` / ``computer_call_output`` -> ``tool`` messages.

    Items matching none of these are skipped.

    Args:
        messages: List of responses_items format messages
        allow_images_in_tool_results: If True, include images in tool role messages.
            If False, send tool message + separate user message with image.
        send_multiple_user_images_per_parallel_tool_results: If True, send multiple
            user images in parallel tool results. If False, an image result that is
            immediately followed by another ``computer_call_output`` only produces
            the tool message, without the separate user image message.

    Returns:
        The list of completion-format messages.
    """
    screenshot_placeholder = "[Execution completed. See screenshot below]"
    completion_messages: List[Dict[str, Any]] = []

    for i, message in enumerate(messages):
        msg_type = message.get("type")
        role = message.get("role")

        # Handle user messages (both with and without explicit type)
        if role == "user" or msg_type == "user":
            content = message.get("content", "")
            if isinstance(content, list):
                completion_messages.append(
                    {"role": "user", "content": _user_content_to_completion_parts(content)}
                )
            elif isinstance(content, str):
                completion_messages.append({"role": "user", "content": content})

        # Handle assistant messages
        elif role == "assistant" or msg_type == "message":
            content = message.get("content", [])
            if isinstance(content, list):
                text_parts = [
                    item.get("text", "")
                    for item in content
                    if item.get("type") in ("output_text", "text")
                ]
                if text_parts:
                    completion_messages.append(
                        {"role": "assistant", "content": "\n".join(text_parts)}
                    )
            elif isinstance(content, str) and content and role == "assistant":
                # Plain-string assistant content used to be dropped silently.
                # The explicit role check keeps other roles that reach this
                # branch through type == "message" (e.g. system) from being
                # re-labelled as assistant text.
                completion_messages.append({"role": "assistant", "content": content})

        # Handle reasoning items (convert to assistant message)
        elif msg_type == "reasoning":
            text_parts = [
                item.get("text", "")
                for item in message.get("summary", [])
                if item.get("type") == "summary_text"
            ]
            if text_parts:
                completion_messages.append({"role": "assistant", "content": "\n".join(text_parts)})

        # Handle function calls
        elif msg_type == "function_call":
            _append_tool_call(
                completion_messages,
                message.get("call_id"),
                message.get("name"),
                message.get("arguments"),
            )

        # Handle computer calls
        elif msg_type == "computer_call":
            _append_tool_call(
                completion_messages,
                message.get("call_id"),
                "computer",
                json.dumps(message.get("action", {})),
            )

        # Handle function/computer call outputs
        elif msg_type in ("function_call_output", "computer_call_output"):
            output = message.get("output")
            call_id = message.get("call_id")

            if not (isinstance(output, dict) and output.get("type") == "input_image"):
                # Handle text output as tool response
                completion_messages.append(
                    {"role": "tool", "tool_call_id": call_id, "content": str(output)}
                )
                continue

            image_part = {"type": "image_url", "image_url": {"url": output.get("image_url")}}

            if allow_images_in_tool_results:
                # Handle image output as tool response (may not work with all APIs)
                completion_messages.append(
                    {"role": "tool", "tool_call_id": call_id, "content": [image_part]}
                )
                continue

            # Send tool message + separate user message with image (OpenAI compatible)
            completion_messages.append(
                {"role": "tool", "tool_call_id": call_id, "content": screenshot_placeholder}
            )

            # Determine if the next message is also an image-producing tool output
            next_type = messages[i + 1].get("type") if i + 1 < len(messages) else None
            is_next_message_image_result = next_type == "computer_call_output"

            if send_multiple_user_images_per_parallel_tool_results or not is_next_message_image_result:
                completion_messages.append({"role": "user", "content": [image_part]})

    return completion_messages
|
|
|
|
|
|
def _tool_call_to_responses_item(tool_call: Dict[str, Any]) -> Dict[str, Any]:
    """Turn one completion-format function tool call into a computer_call or function_call item."""
    function = tool_call.get("function", {})
    function_name = function.get("name")
    raw_arguments = function.get("arguments", "{}")

    if function_name == "computer":
        try:
            action = json.loads(raw_arguments)
        except json.JSONDecodeError:
            action = None
        # Only a JSON object can be a computer action; anything else (invalid
        # JSON, or a JSON list/number/null) falls back to a function call.
        if isinstance(action, dict):
            # Change key from "action" -> "type"
            if action.get("action"):
                action["type"] = action.pop("action")
            return {
                "type": "computer_call",
                "call_id": tool_call.get("id"),
                "action": action,
                "status": "completed",
            }

    return {
        "type": "function_call",
        "call_id": tool_call.get("id"),
        "name": function_name,
        "arguments": raw_arguments,
        "status": "completed",
    }


def _image_call_output(call_id: Any, image_part: Dict[str, Any]) -> Dict[str, Any]:
    """Build a computer_call_output item from a completion ``image_url`` content part."""
    return {
        "type": "computer_call_output",
        "call_id": call_id,
        "output": {
            "type": "input_image",
            "image_url": image_part.get("image_url", {}).get("url"),
        },
    }


def convert_completion_messages_to_responses_items(
    completion_messages: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """Convert completion messages format to responses_items message format.

    Conversion rules:
      * assistant string content -> assistant ``message`` item with one
        ``output_text`` part;
      * function tool calls named ``computer`` with JSON-object arguments ->
        ``computer_call`` items (an ``action`` key is renamed to ``type``);
        all other function tool calls -> ``function_call`` items;
      * tool messages -> ``computer_call_output`` / ``function_call_output``
        items. The text ``[Execution completed. See screenshot below]``
        followed by a user message containing an image is merged into one
        image ``computer_call_output`` and the user message is consumed;
      * user messages -> user items (list content is mapped to
        ``input_image`` / ``input_text`` parts).

    Args:
        completion_messages: Messages in completion format.

    Returns:
        The list of responses_items.
    """
    screenshot_placeholder = "[Execution completed. See screenshot below]"
    responses_items: List[Dict[str, Any]] = []
    skip_next = False

    for i, message in enumerate(completion_messages):
        if skip_next:
            # This user message was already folded into the preceding tool output
            skip_next = False
            continue

        role = message.get("role")
        content = message.get("content")
        tool_calls = message.get("tool_calls", [])

        # Handle assistant messages with text content
        if role == "assistant" and content and isinstance(content, str):
            responses_items.append(
                {
                    "type": "message",
                    "role": "assistant",
                    "content": [{"type": "output_text", "text": content}],
                }
            )

        # Handle tool calls
        if tool_calls:
            for tool_call in tool_calls:
                if tool_call.get("type") == "function":
                    responses_items.append(_tool_call_to_responses_item(tool_call))

        # Handle tool messages (function/computer call outputs)
        elif role == "tool" and content:
            tool_call_id = message.get("tool_call_id")

            if isinstance(content, str):
                if content == screenshot_placeholder:
                    # Look ahead for a user message whose list content holds the image
                    image_part = None
                    if i + 1 < len(completion_messages):
                        follow_up = completion_messages[i + 1]
                        if follow_up.get("role") == "user" and isinstance(
                            follow_up.get("content"), list
                        ):
                            image_part = next(
                                (
                                    part
                                    for part in follow_up["content"]
                                    if part.get("type") == "image_url"
                                ),
                                None,
                            )

                    if image_part is not None:
                        responses_items.append(_image_call_output(tool_call_id, image_part))
                        # Skip the next user message since we processed it
                        skip_next = True
                    else:
                        # No matching user image: keep the output as regular
                        # text so the tool call is never left without a result.
                        responses_items.append(
                            {
                                "type": "computer_call_output",
                                "call_id": tool_call_id,
                                "output": content,
                            }
                        )
                else:
                    try:
                        parsed_content = json.loads(content)
                    except json.JSONDecodeError:
                        parsed_content = None

                    if isinstance(parsed_content, dict):
                        # Structured output: an input_image object is passed on
                        # parsed, any other object is passed on as the raw string.
                        is_image = parsed_content.get("type") == "input_image"
                        responses_items.append(
                            {
                                "type": "computer_call_output",
                                "call_id": tool_call_id,
                                "output": parsed_content if is_image else content,
                            }
                        )
                    else:
                        # Plain text output (including text that happens to be
                        # a JSON number, string, list or null, which previously
                        # raised AttributeError on ``.get``).
                        responses_items.append(
                            {
                                "type": "function_call_output",
                                "call_id": tool_call_id,
                                "output": content,
                            }
                        )

            elif isinstance(content, list):
                # Handle structured content (e.g., images)
                for item in content:
                    item_type = item.get("type")
                    if item_type == "image_url":
                        responses_items.append(_image_call_output(tool_call_id, item))
                    elif item_type == "text":
                        responses_items.append(
                            {
                                "type": "function_call_output",
                                "call_id": tool_call_id,
                                "output": item.get("text"),
                            }
                        )

        # Handle actual user messages
        elif role == "user" and content:
            if isinstance(content, list):
                # Handle structured user content (e.g., text + images)
                user_content = []
                for item in content:
                    item_type = item.get("type")
                    if item_type == "image_url":
                        user_content.append(
                            {
                                "type": "input_image",
                                "image_url": item.get("image_url", {}).get("url"),
                            }
                        )
                    elif item_type == "text":
                        user_content.append({"type": "input_text", "text": item.get("text")})

                if user_content:
                    responses_items.append(
                        {"role": "user", "type": "message", "content": user_content}
                    )
            elif isinstance(content, str):
                # Handle simple text user message
                responses_items.append({"role": "user", "content": content})

    return responses_items
|