diff --git a/docs/content/docs/agent-sdk/chat-history.mdx b/docs/content/docs/agent-sdk/chat-history.mdx index 83435a70..e7041c3b 100644 --- a/docs/content/docs/agent-sdk/chat-history.mdx +++ b/docs/content/docs/agent-sdk/chat-history.mdx @@ -75,13 +75,7 @@ messages = [ ## Message Types -- **user**: User input messages -- **computer_call**: Computer actions (click, type, keypress, etc.) -- **computer_call_output**: Results from computer actions (usually screenshots) -- **function_call**: Function calls (e.g., `computer.call`) -- **function_call_output**: Results from function calls -- **reasoning**: Agent's internal reasoning and planning -- **message**: Agent text responses +See the complete schema in [Message Format](./message-format). ### Memory Management diff --git a/docs/content/docs/agent-sdk/message-format.mdx b/docs/content/docs/agent-sdk/message-format.mdx new file mode 100644 index 00000000..ac329d4d --- /dev/null +++ b/docs/content/docs/agent-sdk/message-format.mdx @@ -0,0 +1,201 @@ +--- +title: Message Format +--- + +This page documents the Python message and response schema used by the Agent SDK. +It mirrors the structure shown in Chat History and provides precise type definitions you can target in your own code. + +All examples below use Python type hints with `TypedDict` and `Literal` from the standard `typing` module. + +## Response + +The agent produces response chunks through an async generator; each chunk is an object with `output` and `usage` fields. + +```python +from typing import List, TypedDict + +class Usage(TypedDict, total=False): + prompt_tokens: int + completion_tokens: int + total_tokens: int + response_cost: float # USD cost if available + +class AgentResponse(TypedDict): + output: List["AgentMessage"] + usage: Usage +``` + +## Messages + +Agent messages represent the state of the conversation and the agent's actions. 
+ +```python +from typing import List, Literal, Optional, TypedDict, Union + +# Union of all message variants +AgentMessage = Union[ + "UserMessage", + "AssistantMessage", + "ReasoningMessage", + "ComputerCallMessage", + "ComputerCallOutputMessage", + "FunctionCallMessage", + "FunctionCallOutputMessage", +] + +# Input message (role: user/system/developer) +class UserMessage(TypedDict, total=False): + type: Literal["message"] # optional for user input + role: Literal["user", "system", "developer"] + content: Union[str, List["InputContent"]] + +# Output message (assistant text) +class AssistantMessage(TypedDict): + type: Literal["message"] + role: Literal["assistant"] + content: List["OutputContent"] + +# Output reasoning/thinking message +class ReasoningMessage(TypedDict): + type: Literal["reasoning"] + summary: List["SummaryContent"] + +# Output computer action call (agent intends to act) +class ComputerCallMessage(TypedDict): + type: Literal["computer_call"] + call_id: str + status: Literal["completed", "failed", "pending"] + action: "ComputerAction" + +# Output computer action result (always a screenshot) +class ComputerCallOutputMessage(TypedDict): + type: Literal["computer_call_output"] + call_id: str + output: "ComputerResultContent" + +# Output function call (agent calls a Python tool) +class FunctionCallMessage(TypedDict): + type: Literal["function_call"] + call_id: str + status: Literal["completed", "failed", "pending"] + name: str + arguments: str # JSON-serialized kwargs + +# Output function call result (text) +class FunctionCallOutputMessage(TypedDict): + type: Literal["function_call_output"] + call_id: str + output: str +``` + +## Message Content + +These content items appear inside the `content`, `summary`, and `output` fields of the message types above. 
+ +```python +# Input content kinds +class InputContent(TypedDict): + type: Literal["input_image", "input_text"] + text: Optional[str] + image_url: Optional[str] # e.g., data URL + +# Assistant output content +class OutputContent(TypedDict): + type: Literal["output_text"] + text: str + +# Reasoning/summary output content +class SummaryContent(TypedDict): + type: Literal["summary_text"] + text: str + +# Computer call outputs (screenshots) +class ComputerResultContent(TypedDict): + type: Literal["computer_screenshot", "input_image"] + image_url: str # data URL (e.g., "data:image/png;base64,....") +``` + +## Actions + +Computer actions represent concrete operations the agent will perform on the computer. + +Two broad families exist depending on the provider: OpenAI-style and Anthropic-style. + +```python +# Union of all supported computer actions +ComputerAction = Union[ + "ClickAction", + "DoubleClickAction", + "DragAction", + "KeyPressAction", + "MoveAction", + "ScreenshotAction", + "ScrollAction", + "TypeAction", + "WaitAction", + # Anthropic variants + "LeftMouseDownAction", + "LeftMouseUpAction", +] + +# OpenAI Computer Actions +class ClickAction(TypedDict): + type: Literal["click"] + button: Literal["left", "right", "wheel", "back", "forward"] + x: int + y: int + +class DoubleClickAction(TypedDict, total=False): + type: Literal["double_click"] + button: Literal["left", "right", "wheel", "back", "forward"] + x: int + y: int + +class DragAction(TypedDict, total=False): + type: Literal["drag"] + button: Literal["left", "right", "wheel", "back", "forward"] + path: List[tuple[int, int]] # [(x1, y1), (x2, y2), ...] 
+ +class KeyPressAction(TypedDict): + type: Literal["keypress"] + keys: List[str] # e.g., ["ctrl", "a"] + +class MoveAction(TypedDict): + type: Literal["move"] + x: int + y: int + +class ScreenshotAction(TypedDict): + type: Literal["screenshot"] + +class ScrollAction(TypedDict): + type: Literal["scroll"] + scroll_x: int + scroll_y: int + x: int + y: int + +class TypeAction(TypedDict): + type: Literal["type"] + text: str + +class WaitAction(TypedDict): + type: Literal["wait"] + +# Anthropic Computer Actions +class LeftMouseDownAction(TypedDict): + type: Literal["left_mouse_down"] + x: int + y: int + +class LeftMouseUpAction(TypedDict): + type: Literal["left_mouse_up"] + x: int + y: int +``` + +## Notes + +- The agent runtime may add provider-specific fields when available (e.g., usage cost). Unknown fields should be ignored for forward compatibility. +- Computer action outputs are screenshots as data URLs. For security and storage, some serializers may redact or omit large fields in persisted metadata. +- The message flow typically alternates between reasoning, actions, screenshots, and concluding assistant text. See [Chat History](./chat-history) for a step-by-step example. diff --git a/docs/content/docs/agent-sdk/meta.json b/docs/content/docs/agent-sdk/meta.json index b745ce58..1083fc25 100644 --- a/docs/content/docs/agent-sdk/meta.json +++ b/docs/content/docs/agent-sdk/meta.json @@ -6,6 +6,7 @@ "supported-agents", "supported-model-providers", "chat-history", + "message-format", "customizing-computeragent", "callbacks", "custom-tools",