Added message format documentation

2026-02-17 11:58:59 -06:00 · 2025-09-09 11:08:19 -04:00
parent 665e65cb85
commit bae97a6cb7
3 changed files with 203 additions and 7 deletions
--- a/docs/content/docs/agent-sdk/chat-history.mdx
+++ b/docs/content/docs/agent-sdk/chat-history.mdx
@@ -75,13 +75,7 @@ messages = [

 ## Message Types

- **user**: User input messages
- **computer_call**: Computer actions (click, type, keypress, etc.)
- **computer_call_output**: Results from computer actions (usually screenshots)
- **function_call**: Function calls (e.g., `computer.call`)
- **function_call_output**: Results from function calls
- **reasoning**: Agent's internal reasoning and planning
- **message**: Agent text responses
+See the complete schema in [Message Format](./message-format).

 ### Memory Management

--- a/docs/content/docs/agent-sdk/message-format.mdx
+++ b/docs/content/docs/agent-sdk/message-format.mdx
@@ -0,0 +1,201 @@
+---
+title: Message Format
+---
+
+This page documents the Python message and response schema used by the Agent SDK.
+It mirrors the structure shown in Chat History and provides precise type definitions you can target in your own code.
+
+All examples below use Python type hints with `TypedDict` and `Literal` from the standard `typing` module.
+
+## Response
+
+The agent yields response chunks from an async generator; each chunk is an object with `output` and `usage` fields.
+
+```python
+from typing import List, TypedDict
+
+class Usage(TypedDict, total=False):
+    prompt_tokens: int
+    completion_tokens: int
+    total_tokens: int
+    response_cost: float  # USD cost if available
+
+class AgentResponse(TypedDict):
+    output: List["AgentMessage"]
+    usage: Usage
+```
+
+## Messages
+
+Agent messages represent the state of the conversation and the agent's actions.
+
+```python
+from typing import List, Literal, Optional, TypedDict, Union
+
+# Union of all message variants
+AgentMessage = Union[
+    "UserMessage",
+    "AssistantMessage",
+    "ReasoningMessage",
+    "ComputerCallMessage",
+    "ComputerCallOutputMessage",
+    "FunctionCallMessage",
+    "FunctionCallOutputMessage",
+]
+
+# Input message (role: user/system/developer)
+class UserMessage(TypedDict, total=False):
+    type: Literal["message"]  # optional for user input
+    role: Literal["user", "system", "developer"]
+    content: Union[str, List["InputContent"]]
+
+# Output message (assistant text)
+class AssistantMessage(TypedDict):
+    type: Literal["message"]
+    role: Literal["assistant"]
+    content: List["OutputContent"]
+
+# Output reasoning/thinking message
+class ReasoningMessage(TypedDict):
+    type: Literal["reasoning"]
+    summary: List["SummaryContent"]
+
+# Output computer action call (agent intends to act)
+class ComputerCallMessage(TypedDict):
+    type: Literal["computer_call"]
+    call_id: str
+    status: Literal["completed", "failed", "pending"]
+    action: "ComputerAction"
+
+# Output computer action result (always a screenshot)
+class ComputerCallOutputMessage(TypedDict):
+    type: Literal["computer_call_output"]
+    call_id: str
+    output: "ComputerResultContent"
+
+# Output function call (agent calls a Python tool)
+class FunctionCallMessage(TypedDict):
+    type: Literal["function_call"]
+    call_id: str
+    status: Literal["completed", "failed", "pending"]
+    name: str
+    arguments: str  # JSON-serialized kwargs
+
+# Output function call result (text)
+class FunctionCallOutputMessage(TypedDict):
+    type: Literal["function_call_output"]
+    call_id: str
+    output: str
+```
+
+## Message Content
+
+These content items appear inside the `content`, `summary`, and `output` fields of the message types above.
+
+```python
+# Input content kinds
+class InputContent(TypedDict):
+    type: Literal["input_image", "input_text"]
+    text: Optional[str]
+    image_url: Optional[str]  # e.g., data URL
+
+# Assistant output content
+class OutputContent(TypedDict):
+    type: Literal["output_text"]
+    text: str
+
+# Reasoning/summary output content
+class SummaryContent(TypedDict):
+    type: Literal["summary_text"]
+    text: str
+
+# Computer call outputs (screenshots)
+class ComputerResultContent(TypedDict):
+    type: Literal["computer_screenshot", "input_image"]
+    image_url: str  # data URL (e.g., "data:image/png;base64,....")
+```
+
+## Actions
+
+Computer actions represent concrete operations the agent will perform on the computer.
+
+Two broad families exist depending on the provider: OpenAI-style and Anthropic-style.
+
+```python
+# Union of all supported computer actions
+ComputerAction = Union[
+    "ClickAction",
+    "DoubleClickAction",
+    "DragAction",
+    "KeyPressAction",
+    "MoveAction",
+    "ScreenshotAction",
+    "ScrollAction",
+    "TypeAction",
+    "WaitAction",
+    # Anthropic variants
+    "LeftMouseDownAction",
+    "LeftMouseUpAction",
+]
+
+# OpenAI Computer Actions
+class ClickAction(TypedDict):
+    type: Literal["click"]
+    button: Literal["left", "right", "wheel", "back", "forward"]
+    x: int
+    y: int
+
+class DoubleClickAction(TypedDict, total=False):
+    type: Literal["double_click"]
+    button: Literal["left", "right", "wheel", "back", "forward"]
+    x: int
+    y: int
+
+class DragAction(TypedDict, total=False):
+    type: Literal["drag"]
+    button: Literal["left", "right", "wheel", "back", "forward"]
+    path: List[tuple[int, int]]  # [(x1, y1), (x2, y2), ...]
+
+class KeyPressAction(TypedDict):
+    type: Literal["keypress"]
+    keys: List[str]  # e.g., ["ctrl", "a"]
+
+class MoveAction(TypedDict):
+    type: Literal["move"]
+    x: int
+    y: int
+
+class ScreenshotAction(TypedDict):
+    type: Literal["screenshot"]
+
+class ScrollAction(TypedDict):
+    type: Literal["scroll"]
+    scroll_x: int
+    scroll_y: int
+    x: int
+    y: int
+
+class TypeAction(TypedDict):
+    type: Literal["type"]
+    text: str
+
+class WaitAction(TypedDict):
+    type: Literal["wait"]
+
+# Anthropic Computer Actions
+class LeftMouseDownAction(TypedDict):
+    type: Literal["left_mouse_down"]
+    x: int
+    y: int
+
+class LeftMouseUpAction(TypedDict):
+    type: Literal["left_mouse_up"]
+    x: int
+    y: int
+```
+
+## Notes
+
+- The agent runtime may add provider-specific fields when available (e.g., usage cost). Unknown fields should be ignored for forward compatibility.
+- Computer action outputs are screenshots encoded as data URLs. For security and storage reasons, some serializers may redact or omit large fields in persisted metadata.
+- The message flow typically cycles through reasoning, actions, and screenshots before ending with assistant text. See [Chat History](./chat-history) for a step-by-step example.
--- a/docs/content/docs/agent-sdk/meta.json
+++ b/docs/content/docs/agent-sdk/meta.json
@@ -6,6 +6,7 @@
        "supported-agents",
 		"supported-model-providers",
 		"chat-history",
+		"message-format",
 		"customizing-computeragent",
 		"callbacks",
        "custom-tools",