mirror of
https://github.com/trycua/computer.git
synced 2026-02-17 11:58:59 -06:00
Added message format documentation
This commit is contained in:
@@ -75,13 +75,7 @@ messages = [
|
||||
|
||||
## Message Types
|
||||
|
||||
- **user**: User input messages
|
||||
- **computer_call**: Computer actions (click, type, keypress, etc.)
|
||||
- **computer_call_output**: Results from computer actions (usually screenshots)
|
||||
- **function_call**: Function calls (e.g., `computer.call`)
|
||||
- **function_call_output**: Results from function calls
|
||||
- **reasoning**: Agent's internal reasoning and planning
|
||||
- **message**: Agent text responses
|
||||
See the complete schema in [Message Format](./message-format).
|
||||
|
||||
### Memory Management
|
||||
|
||||
|
||||
201
docs/content/docs/agent-sdk/message-format.mdx
Normal file
201
docs/content/docs/agent-sdk/message-format.mdx
Normal file
@@ -0,0 +1,201 @@
|
||||
---
|
||||
title: Message Format
|
||||
---
|
||||
|
||||
This page documents the Python message and response schema used by the Agent SDK.
|
||||
It mirrors the structure shown in [Chat History](./chat-history) and provides precise type definitions you can target in your own code.
|
||||
|
||||
All examples below use Python type hints with `TypedDict` and `Literal` from the standard `typing` module.
|
||||
|
||||
## Response
|
||||
|
||||
The agent produces an async generator of response chunks; each chunk is an object with `output` and `usage` fields.
|
||||
|
||||
```python
|
||||
from typing import List, TypedDict
|
||||
|
||||
class Usage(TypedDict, total=False):
|
||||
prompt_tokens: int
|
||||
completion_tokens: int
|
||||
total_tokens: int
|
||||
response_cost: float # USD cost if available
|
||||
|
||||
class AgentResponse(TypedDict):
|
||||
output: List["AgentMessage"]
|
||||
usage: Usage
|
||||
```
|
||||
|
||||
## Messages
|
||||
|
||||
Agent messages represent the state of the conversation and the agent's actions.
|
||||
|
||||
```python
|
||||
from typing import List, Literal, Optional, TypedDict, Union
|
||||
|
||||
# Union of all message variants
|
||||
AgentMessage = Union[
|
||||
"UserMessage",
|
||||
"AssistantMessage",
|
||||
"ReasoningMessage",
|
||||
"ComputerCallMessage",
|
||||
"ComputerCallOutputMessage",
|
||||
"FunctionCallMessage",
|
||||
"FunctionCallOutputMessage",
|
||||
]
|
||||
|
||||
# Input message (role: user/system/developer)
|
||||
class UserMessage(TypedDict, total=False):
|
||||
type: Literal["message"] # optional for user input
|
||||
role: Literal["user", "system", "developer"]
|
||||
content: Union[str, List["InputContent"]]
|
||||
|
||||
# Output message (assistant text)
|
||||
class AssistantMessage(TypedDict):
|
||||
type: Literal["message"]
|
||||
role: Literal["assistant"]
|
||||
content: List["OutputContent"]
|
||||
|
||||
# Output reasoning/thinking message
|
||||
class ReasoningMessage(TypedDict):
|
||||
type: Literal["reasoning"]
|
||||
summary: List["SummaryContent"]
|
||||
|
||||
# Output computer action call (agent intends to act)
|
||||
class ComputerCallMessage(TypedDict):
|
||||
type: Literal["computer_call"]
|
||||
call_id: str
|
||||
status: Literal["completed", "failed", "pending"]
|
||||
action: "ComputerAction"
|
||||
|
||||
# Output computer action result (always a screenshot)
|
||||
class ComputerCallOutputMessage(TypedDict):
|
||||
type: Literal["computer_call_output"]
|
||||
call_id: str
|
||||
output: "ComputerResultContent"
|
||||
|
||||
# Output function call (agent calls a Python tool)
|
||||
class FunctionCallMessage(TypedDict):
|
||||
type: Literal["function_call"]
|
||||
call_id: str
|
||||
status: Literal["completed", "failed", "pending"]
|
||||
name: str
|
||||
arguments: str # JSON-serialized kwargs
|
||||
|
||||
# Output function call result (text)
|
||||
class FunctionCallOutputMessage(TypedDict):
|
||||
type: Literal["function_call_output"]
|
||||
call_id: str
|
||||
output: str
|
||||
```
|
||||
|
||||
## Message Content
|
||||
|
||||
These content items appear inside the `content`, `summary`, and `output` fields of the message types above.
|
||||
|
||||
```python
|
||||
# Input content kinds
|
||||
class InputContent(TypedDict):
|
||||
type: Literal["input_image", "input_text"]
|
||||
text: Optional[str]
|
||||
image_url: Optional[str] # e.g., data URL
|
||||
|
||||
# Assistant output content
|
||||
class OutputContent(TypedDict):
|
||||
type: Literal["output_text"]
|
||||
text: str
|
||||
|
||||
# Reasoning/summary output content
|
||||
class SummaryContent(TypedDict):
|
||||
type: Literal["summary_text"]
|
||||
text: str
|
||||
|
||||
# Computer call outputs (screenshots)
|
||||
class ComputerResultContent(TypedDict):
|
||||
type: Literal["computer_screenshot", "input_image"]
|
||||
image_url: str # data URL (e.g., "data:image/png;base64,....")
|
||||
```
|
||||
|
||||
## Actions
|
||||
|
||||
Computer actions represent concrete operations the agent will perform on the computer.
|
||||
|
||||
Two broad families exist depending on the provider: OpenAI-style and Anthropic-style.
|
||||
|
||||
```python
|
||||
# Union of all supported computer actions
|
||||
ComputerAction = Union[
|
||||
"ClickAction",
|
||||
"DoubleClickAction",
|
||||
"DragAction",
|
||||
"KeyPressAction",
|
||||
"MoveAction",
|
||||
"ScreenshotAction",
|
||||
"ScrollAction",
|
||||
"TypeAction",
|
||||
"WaitAction",
|
||||
# Anthropic variants
|
||||
"LeftMouseDownAction",
|
||||
"LeftMouseUpAction",
|
||||
]
|
||||
|
||||
# OpenAI Computer Actions
|
||||
class ClickAction(TypedDict):
|
||||
type: Literal["click"]
|
||||
button: Literal["left", "right", "wheel", "back", "forward"]
|
||||
x: int
|
||||
y: int
|
||||
|
||||
class DoubleClickAction(TypedDict, total=False):
|
||||
type: Literal["double_click"]
|
||||
button: Literal["left", "right", "wheel", "back", "forward"]
|
||||
x: int
|
||||
y: int
|
||||
|
||||
class DragAction(TypedDict, total=False):
|
||||
type: Literal["drag"]
|
||||
button: Literal["left", "right", "wheel", "back", "forward"]
|
||||
path: List[tuple[int, int]] # [(x1, y1), (x2, y2), ...]
|
||||
|
||||
class KeyPressAction(TypedDict):
|
||||
type: Literal["keypress"]
|
||||
keys: List[str] # e.g., ["ctrl", "a"]
|
||||
|
||||
class MoveAction(TypedDict):
|
||||
type: Literal["move"]
|
||||
x: int
|
||||
y: int
|
||||
|
||||
class ScreenshotAction(TypedDict):
|
||||
type: Literal["screenshot"]
|
||||
|
||||
class ScrollAction(TypedDict):
|
||||
type: Literal["scroll"]
|
||||
scroll_x: int
|
||||
scroll_y: int
|
||||
x: int
|
||||
y: int
|
||||
|
||||
class TypeAction(TypedDict):
|
||||
type: Literal["type"]
|
||||
text: str
|
||||
|
||||
class WaitAction(TypedDict):
|
||||
type: Literal["wait"]
|
||||
|
||||
# Anthropic Computer Actions
|
||||
class LeftMouseDownAction(TypedDict):
|
||||
type: Literal["left_mouse_down"]
|
||||
x: int
|
||||
y: int
|
||||
|
||||
class LeftMouseUpAction(TypedDict):
|
||||
type: Literal["left_mouse_up"]
|
||||
x: int
|
||||
y: int
|
||||
```
|
||||
|
||||
## Notes
|
||||
|
||||
- The agent runtime may add provider-specific fields when available (e.g., usage cost). Unknown fields should be ignored for forward compatibility.
|
||||
- Computer action outputs are screenshots encoded as data URLs. For security and storage reasons, some serializers may redact or omit large fields in persisted metadata.
|
||||
- The message flow typically alternates between reasoning, actions, screenshots, and concluding assistant text. See [Chat History](./chat-history) for a step-by-step example.
|
||||
@@ -6,6 +6,7 @@
|
||||
"supported-agents",
|
||||
"supported-model-providers",
|
||||
"chat-history",
|
||||
"message-format",
|
||||
"customizing-computeragent",
|
||||
"callbacks",
|
||||
"custom-tools",
|
||||
|
||||
Reference in New Issue
Block a user