From 17d670962970a1d1774daaec029ebf92f1f9235e Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Tue, 9 Sep 2025 10:55:57 -0400 Subject: [PATCH 1/6] added simple guide for customizing computeragent --- .../agent-sdk/customizing-computeragent.mdx | 121 ++++++++++++++++++ docs/content/docs/agent-sdk/meta.json | 1 + libs/python/agent/agent/agent.py | 10 +- libs/python/agent/agent/callbacks/__init__.py | 2 + .../agent/agent/integrations/hud/__init__.py | 15 ++- 5 files changed, 145 insertions(+), 4 deletions(-) create mode 100644 docs/content/docs/agent-sdk/customizing-computeragent.mdx diff --git a/docs/content/docs/agent-sdk/customizing-computeragent.mdx b/docs/content/docs/agent-sdk/customizing-computeragent.mdx new file mode 100644 index 00000000..462bb5f9 --- /dev/null +++ b/docs/content/docs/agent-sdk/customizing-computeragent.mdx @@ -0,0 +1,121 @@ +--- +title: Customizing Your ComputerAgent +--- + +The `ComputerAgent` interface provides an easy proxy to any computer-using model configuration, and it is a powerful framework for extending and building your own agentic systems. + +This guide shows four proven ways to increase capabilities and success rate: + +- 1 — Simple: Prompt engineering +- 2 — Easy: Tools +- 3 — Intermediate: Callbacks +- 4 — Expert: Custom `@register_agent` + +For a hands-on walkthrough, see the companion notebook: [notebooks/customizing_computeragent.ipynb](../../../notebooks/customizing_computeragent.ipynb). + +## 1) Simple: Prompt engineering + +Provide guiding instructions to shape behavior. `ComputerAgent` accepts an optional `instructions: str | None` which acts like a system-style preface. Internally, this uses a callback that pre-pends a user message before each LLM call. + +```python +from agent.agent import ComputerAgent + +agent = ComputerAgent( + model="openai/computer-use-preview", + tools=[computer], + instructions=( + "You are a meticulous software operator. Prefer safe, deterministic actions. " + "Always confirm via on-screen text before proceeding." + ), +) +``` + +## 2) Easy: Tools + +Expose deterministic capabilities as tools (Python functions or custom computer handlers). The agent will call them when appropriate. + +```python +def calculate_percentage(numerator: float, denominator: float) -> str: + """Calculate percentage as a string. + + Args: + numerator: Numerator value + denominator: Denominator value + Returns: + A formatted percentage string (e.g., '75.00%'). + """ + if denominator == 0: + return "0.00%" + return f"{(numerator/denominator)*100:.2f}%" + +agent = ComputerAgent( + model="openai/computer-use-preview", + tools=[computer, calculate_percentage], +) +``` + +- See `docs/agent-sdk/custom-tools` for authoring function tools. +- See `docs/agent-sdk/custom-computer-handlers` for building full computer interfaces. + +## 3) Intermediate: Callbacks + +Callbacks provide lifecycle hooks to preprocess messages, postprocess outputs, record trajectories, manage costs, and more. + +```python +from agent.callbacks import ImageRetentionCallback, TrajectorySaverCallback, BudgetManagerCallback + +agent = ComputerAgent( + model="anthropic/claude-3-5-sonnet-20241022", + tools=[computer], + callbacks=[ + ImageRetentionCallback(only_n_most_recent_images=3), + TrajectorySaverCallback("./trajectories"), + BudgetManagerCallback(max_budget=10.0, raise_error=True), + ], +) +``` + +- Browse implementations in `libs/python/agent/agent/loops/`. 
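+
+The `instructions` option from section 1 is itself implemented as one of these callbacks. If you prefer to manage the callback list explicitly (for example, to control ordering relative to other callbacks), you can add `PromptInstructionsCallback` yourself. A minimal sketch, reusing the `computer` tool from the examples above:
+
+```python
+from agent.agent import ComputerAgent
+from agent.callbacks import PromptInstructionsCallback, BudgetManagerCallback
+
+agent = ComputerAgent(
+    model="openai/computer-use-preview",
+    tools=[computer],
+    callbacks=[
+        # Equivalent to passing instructions=... to ComputerAgent
+        PromptInstructionsCallback("Prefer safe, deterministic actions."),
+        BudgetManagerCallback(max_budget=5.0, raise_error=True),
+    ],
+)
+```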
+ +## 4) Expert: Custom `@register_agent` + +Build your own agent configuration class to control prompting, message shaping, and tool handling. This is the most flexible option for specialized domains. + +- Register your own `model=...` loop using `@register_agent` +- Browse implementations in `libs/python/agent/agent/loops/`. +- Implement `predict_step()` (and optionally `predict_click()`) and return the standardized output schema. + +```python +from agent.decorators import register_agent + +@register_agent(models=r".*my-special-model.*", priority=10) +class MyCustomAgentConfig: + async def predict_step(self, messages, model, tools, **kwargs): + # 1) Format messages for your provider + # 2) Call provider + # 3) Convert responses to the agent output schema + return {"output": [], "usage": {}} + + async def predict_click(self, model, image_b64, instruction): + # Optional: click-only capability + return None + + def get_capabilities(self): + return ["step"] +``` + +## HUD integration (optional) + +When using the HUD evaluation integration (`agent/integrations/hud/`), you can pass `instructions`, `tools`, and `callbacks` directly + +```python +from agent.integrations.hud import run_single_task + +await run_single_task( + dataset="username/dataset-name", + model="openai/computer-use-preview", + instructions="Operate carefully. Always verify on-screen text before actions.", + # tools=[your_custom_function], + # callbacks=[YourCustomCallback()], +) +``` \ No newline at end of file diff --git a/docs/content/docs/agent-sdk/meta.json b/docs/content/docs/agent-sdk/meta.json index 07bf7199..b745ce58 100644 --- a/docs/content/docs/agent-sdk/meta.json +++ b/docs/content/docs/agent-sdk/meta.json @@ -6,6 +6,7 @@ "supported-agents", "supported-model-providers", "chat-history", + "customizing-computeragent", "callbacks", "custom-tools", "custom-computer-handlers", diff --git a/libs/python/agent/agent/agent.py b/libs/python/agent/agent/agent.py index b796866d..feb0363b 100644 --- a/libs/python/agent/agent/agent.py +++ b/libs/python/agent/agent/agent.py @@ -31,7 +31,8 @@ from .callbacks import ( TrajectorySaverCallback, BudgetManagerCallback, TelemetryCallback, - OperatorNormalizerCallback + OperatorNormalizerCallback, + PromptInstructionsCallback, ) from .computers import ( AsyncComputerHandler, @@ -162,6 +163,7 @@ class ComputerAgent: custom_loop: Optional[Callable] = None, only_n_most_recent_images: Optional[int] = None, callbacks: Optional[List[Any]] = None, + instructions: Optional[str] = None, verbosity: Optional[int] = None, trajectory_dir: Optional[str | Path | dict] = None, max_retries: Optional[int] = 3, @@ -180,6 +182,7 @@ class ComputerAgent: custom_loop: Custom agent loop function to use instead of auto-selection only_n_most_recent_images: If set, only keep the N most recent images in message history. Adds ImageRetentionCallback automatically. callbacks: List of AsyncCallbackHandler instances for preprocessing/postprocessing + instructions: Optional system instructions to be passed to the model verbosity: Logging level (logging.DEBUG, logging.INFO, etc.). If set, adds LoggingCallback automatically trajectory_dir: If set, saves trajectory data (screenshots, responses) to this directory. Adds TrajectorySaverCallback automatically. 
max_retries: Maximum number of retries for failed API calls @@ -198,6 +201,7 @@ class ComputerAgent: self.custom_loop = custom_loop self.only_n_most_recent_images = only_n_most_recent_images self.callbacks = callbacks or [] + self.instructions = instructions self.verbosity = verbosity self.trajectory_dir = trajectory_dir self.max_retries = max_retries @@ -211,6 +215,10 @@ class ComputerAgent: # Prepend operator normalizer callback self.callbacks.insert(0, OperatorNormalizerCallback()) + # Add prompt instructions callback if provided + if self.instructions: + self.callbacks.append(PromptInstructionsCallback(self.instructions)) + # Add telemetry callback if telemetry_enabled is set if self.telemetry_enabled: if isinstance(self.telemetry_enabled, bool): diff --git a/libs/python/agent/agent/callbacks/__init__.py b/libs/python/agent/agent/callbacks/__init__.py index e0befcc7..eca40173 100644 --- a/libs/python/agent/agent/callbacks/__init__.py +++ b/libs/python/agent/agent/callbacks/__init__.py @@ -9,6 +9,7 @@ from .trajectory_saver import TrajectorySaverCallback from .budget_manager import BudgetManagerCallback from .telemetry import TelemetryCallback from .operator_validator import OperatorNormalizerCallback +from .prompt_instructions import PromptInstructionsCallback __all__ = [ "AsyncCallbackHandler", @@ -18,4 +19,5 @@ __all__ = [ "BudgetManagerCallback", "TelemetryCallback", "OperatorNormalizerCallback", + "PromptInstructionsCallback", ] diff --git a/libs/python/agent/agent/integrations/hud/__init__.py b/libs/python/agent/agent/integrations/hud/__init__.py index 0da87bfa..b0d06041 100644 --- a/libs/python/agent/agent/integrations/hud/__init__.py +++ b/libs/python/agent/agent/integrations/hud/__init__.py @@ -20,6 +20,7 @@ from hud import trace from agent.agent import ComputerAgent as BaseComputerAgent from .proxy import FakeAsyncOpenAI +from agent.callbacks import PromptInstructionsCallback # --------------------------------------------------------------------------- @@ -47,6 +48,7 @@ class ProxyOperatorAgent(OperatorAgent): custom_loop: Any | None = None, only_n_most_recent_images: int | None = None, callbacks: list[Any] | None = None, + instructions: str | None = None, verbosity: int | None = None, max_retries: int | None = 3, screenshot_delay: float | int = 0.5, @@ -68,12 +70,17 @@ class ProxyOperatorAgent(OperatorAgent): if tools: agent_tools.extend(tools) + # Build callbacks, injecting prompt instructions if provided + agent_callbacks = list(callbacks or []) + if instructions: + agent_callbacks.append(PromptInstructionsCallback(instructions)) + computer_agent = BaseComputerAgent( model=model, tools=agent_tools, custom_loop=custom_loop, only_n_most_recent_images=only_n_most_recent_images, - callbacks=callbacks, + callbacks=agent_callbacks, verbosity=verbosity, trajectory_dir=trajectory_dir, max_retries=max_retries, @@ -96,7 +103,6 @@ class ProxyOperatorAgent(OperatorAgent): # Single-task runner # --------------------------------------------------------------------------- - async def run_single_task( dataset: str | Dataset | list[dict[str, Any]], *, @@ -108,6 +114,7 @@ async def run_single_task( custom_loop: Any | None = None, only_n_most_recent_images: int | None = None, callbacks: list[Any] | None = None, + instructions: str | None = None, verbosity: int | None = None, trajectory_dir: str | dict | None = None, max_retries: int | None = 3, @@ -140,6 +147,7 @@ async def run_single_task( custom_loop=custom_loop, only_n_most_recent_images=only_n_most_recent_images, callbacks=callbacks, + 
instructions=instructions, verbosity=verbosity, trajectory_dir=trajectory_dir, max_retries=max_retries, @@ -157,7 +165,6 @@ async def run_single_task( # Full-dataset runner # --------------------------------------------------------------------------- - async def run_full_dataset( dataset: str | Dataset | list[dict[str, Any]], *, @@ -173,6 +180,7 @@ async def run_full_dataset( custom_loop: Any | None = None, only_n_most_recent_images: int | None = 5, callbacks: list[Any] | None = None, + instructions: str | None = None, verbosity: int | None = None, max_retries: int | None = 3, screenshot_delay: float | int = 0.5, @@ -207,6 +215,7 @@ async def run_full_dataset( "custom_loop": custom_loop, "only_n_most_recent_images": only_n_most_recent_images, "callbacks": callbacks, + "instructions": instructions, "verbosity": verbosity, "max_retries": max_retries, "screenshot_delay": screenshot_delay, From f270af30e1cae760335dc197f84f1175f3f44911 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Tue, 9 Sep 2025 10:57:16 -0400 Subject: [PATCH 2/6] added notebook --- notebooks/customizing_computeragent.ipynb | 194 ++++++++++++++++++++++ 1 file changed, 194 insertions(+) create mode 100644 notebooks/customizing_computeragent.ipynb diff --git a/notebooks/customizing_computeragent.ipynb b/notebooks/customizing_computeragent.ipynb new file mode 100644 index 00000000..b0234d24 --- /dev/null +++ b/notebooks/customizing_computeragent.ipynb @@ -0,0 +1,194 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Customizing Your ComputerAgent\n\n", + "This notebook demonstrates four practical ways to increase the capabilities and success rate of your `ComputerAgent` in the Agent SDK:\n\n", + "1. Simple: Prompt engineering (via optional `instructions`)\n", + "2. Easy: Tools (function tools and custom computer tools)\n", + "3. Intermediate: Callbacks\n", + "4. Expert: Custom `@register_agent` loops\n\n", + "> Tip: The same patterns work in scripts and services — the notebook just makes it easy to iterate." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup\n\n", + "We'll import `ComputerAgent`, a simple computer shim, and some utilities." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "from agent.agent import ComputerAgent\n", + "from agent.callbacks import PromptInstructionsCallback, LoggingCallback\n", + "\n", + "# A very small computer shim for demo purposes (for full computer handlers, see docs)\n", + "class DummyComputer:\n", + " async def screenshot(self):\n", + " # Return a 1x1 transparent PNG as base64 string (placeholder)\n", + " import base64\n", + " png_bytes = base64.b64decode(\"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8Xw8AAr8B9k2m0oYAAAAASUVORK5CYII=\")\n", + " return base64.b64encode(png_bytes).decode()\n", + "\n", + " async def click(self, x: int, y: int):\n", + " pass\n", + "\n", + " async def type(self, text: str):\n", + " pass\n", + "\n", + "computer = DummyComputer()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1) Simple: Prompt engineering\n\n", + "You can guide your agent with system-like `instructions`.\n\n", + "Under the hood, `ComputerAgent(instructions=...)` adds a `PromptInstructionsCallback` that prepends a user message before each LLM call.\n\n", + "This mirrors the recommended snippet in code:\n\n", + "```python\n", + "effective_input = full_input\n", + "if instructions:\n", + " effective_input = [{\"role\": \"user\", \"content\": instructions}] + full_input\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "instructions = (\n", + " \"You are a meticulous software operator. Prefer safe, deterministic actions. \"\n", + " \"Always confirm via on-screen text before proceeding.\"\n", + ")\n", + "agent = ComputerAgent(\n", + " model=\"openai/computer-use-preview\",\n", + " tools=[computer],\n", + " instructions=instructions,\n", + " callbacks=[LoggingCallback(level=logging.INFO)],\n", + ")\n", + "messages = [\n", + " {\"role\": \"user\", \"content\": \"Open the settings and turn on dark mode.\"}\n", + "]\n", + "\n", + "# In notebooks, you may want to consume the async generator\n", + "import asyncio\n", + "async def run_once():\n", + " async for chunk in agent.run(messages):\n", + " # Print any assistant text outputs\n", + " for item in chunk.get(\"output\", []):\n", + " if item.get(\"type\") == \"message\":\n", + " for c in item.get(\"content\", []):\n", + " if c.get(\"text\"):\n", + " print(c.get(\"text\"))\n", + "\n", + "await run_once()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2) Easy: Tools\n\n", + "Add function tools to expose deterministic capabilities. Tools are auto-extracted to schemas and callable by the agent." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def calculate_percentage(numerator: float, denominator: float) -> str:\n", + " \"\"\"Calculate a percentage string.\n", + "\n", + " Args:\n", + " numerator: Numerator value\n", + " denominator: Denominator value\n", + " Returns:\n", + " A formatted percentage string (e.g., '75.00%').\n", + " \"\"\"\n", + " if denominator == 0:\n", + " return \"0.00%\"\n", + " return f\"{(numerator/denominator)*100:.2f}%\"\n", + "\n", + "agent_with_tool = ComputerAgent(\n", + " model=\"openai/computer-use-preview\",\n", + " tools=[computer, calculate_percentage],\n", + " instructions=\"When doing math, prefer the `calculate_percentage` tool when relevant.\",\n", + ")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3) Intermediate: Callbacks\n\n", + "Callbacks offer lifecycle hooks. For example, limit recent images or record trajectories." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from agent.callbacks import ImageRetentionCallback, TrajectorySaverCallback\n", + "\n", + "agent_with_callbacks = ComputerAgent(\n", + " model=\"anthropic/claude-3-5-sonnet-20241022\",\n", + " tools=[computer],\n", + " callbacks=[\n", + " ImageRetentionCallback(only_n_most_recent_images=3),\n", + " TrajectorySaverCallback(\"./trajectories\"),\n", + " ],\n", + ")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4) Expert: Custom `@register_agent`\n\n", + "Register custom agent configs that implement `predict_step` (and optionally `predict_click`). This gives you full control over prompting, message shaping, and tool wiring.\n\n", + "See: `libs/python/agent/agent/loops/` for concrete examples." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Next steps\n\n", + "- Start with `instructions` for fast wins.\n", + "- Add function tools for determinism and reliability.\n", + "- Use callbacks to manage cost, logs, and safety.\n", + "- Build custom loops for specialized domains." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From b21c66894641a783db546fb3018817afc829ec84 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Tue, 9 Sep 2025 10:58:37 -0400 Subject: [PATCH 3/6] added notebook --- docs/content/docs/agent-sdk/customizing-computeragent.mdx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/content/docs/agent-sdk/customizing-computeragent.mdx b/docs/content/docs/agent-sdk/customizing-computeragent.mdx index 462bb5f9..d94f4ec0 100644 --- a/docs/content/docs/agent-sdk/customizing-computeragent.mdx +++ b/docs/content/docs/agent-sdk/customizing-computeragent.mdx @@ -2,6 +2,8 @@ title: Customizing Your ComputerAgent --- +A corresponding Jupyter Notebook is available for this documentation. + The `ComputerAgent` interface provides an easy proxy to any computer-using model configuration, and it is a powerful framework for extending and building your own agentic systems. 
This guide shows four proven ways to increase capabilities and success rate: From 665e65cb856a5515c04471dde336ce27f6ba48a2 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Tue, 9 Sep 2025 11:00:52 -0400 Subject: [PATCH 4/6] Replaced computer shim with Docker computer --- notebooks/customizing_computeragent.ipynb | 65 +++++++++++++---------- 1 file changed, 36 insertions(+), 29 deletions(-) diff --git a/notebooks/customizing_computeragent.ipynb b/notebooks/customizing_computeragent.ipynb index b0234d24..56f0beb9 100644 --- a/notebooks/customizing_computeragent.ipynb +++ b/notebooks/customizing_computeragent.ipynb @@ -4,12 +4,15 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Customizing Your ComputerAgent\n\n", - "This notebook demonstrates four practical ways to increase the capabilities and success rate of your `ComputerAgent` in the Agent SDK:\n\n", + "# Customizing Your ComputerAgent\n", + "\n", + "This notebook demonstrates four practical ways to increase the capabilities and success rate of your `ComputerAgent` in the Agent SDK:\n", + "\n", "1. Simple: Prompt engineering (via optional `instructions`)\n", "2. Easy: Tools (function tools and custom computer tools)\n", "3. Intermediate: Callbacks\n", - "4. Expert: Custom `@register_agent` loops\n\n", + "4. Expert: Custom `@register_agent` loops\n", + "\n", "> Tip: The same patterns work in scripts and services — the notebook just makes it easy to iterate." ] }, @@ -17,8 +20,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Setup\n\n", - "We'll import `ComputerAgent`, a simple computer shim, and some utilities." + "## Setup\n", + "\n", + "We'll import `ComputerAgent`, a simple Docker-based computer, and some utilities." ] }, { @@ -29,33 +33,31 @@ "source": [ "import logging\n", "from agent.agent import ComputerAgent\n", - "from agent.callbacks import PromptInstructionsCallback, LoggingCallback\n", + "from agent.callbacks import LoggingCallback\n", + "from computer import Computer\n", "\n", - "# A very small computer shim for demo purposes (for full computer handlers, see docs)\n", - "class DummyComputer:\n", - " async def screenshot(self):\n", - " # Return a 1x1 transparent PNG as base64 string (placeholder)\n", - " import base64\n", - " png_bytes = base64.b64decode(\"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8Xw8AAr8B9k2m0oYAAAAASUVORK5CYII=\")\n", - " return base64.b64encode(png_bytes).decode()\n", + "computer = Computer(\n", + " os_type=\"linux\",\n", + " provider_type=\"docker\",\n", + " image=\"trycua/cua-ubuntu:latest\",\n", + " name=\"my-cua-container\"\n", + ")\n", "\n", - " async def click(self, x: int, y: int):\n", - " pass\n", - "\n", - " async def type(self, text: str):\n", - " pass\n", - "\n", - "computer = DummyComputer()\n" + "await computer.run() # Launch & connect to Docker container" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 1) Simple: Prompt engineering\n\n", - "You can guide your agent with system-like `instructions`.\n\n", - "Under the hood, `ComputerAgent(instructions=...)` adds a `PromptInstructionsCallback` that prepends a user message before each LLM call.\n\n", - "This mirrors the recommended snippet in code:\n\n", + "## 1) Simple: Prompt engineering\n", + "\n", + "You can guide your agent with system-like `instructions`.\n", + "\n", + "Under the hood, `ComputerAgent(instructions=...)` adds a `PromptInstructionsCallback` that prepends a user message before each LLM call.\n", + "\n", + "This mirrors the recommended snippet in code:\n", + "\n", 
"```python\n", "effective_input = full_input\n", "if instructions:\n", @@ -101,7 +103,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 2) Easy: Tools\n\n", + "## 2) Easy: Tools\n", + "\n", "Add function tools to expose deterministic capabilities. Tools are auto-extracted to schemas and callable by the agent." ] }, @@ -135,7 +138,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 3) Intermediate: Callbacks\n\n", + "## 3) Intermediate: Callbacks\n", + "\n", "Callbacks offer lifecycle hooks. For example, limit recent images or record trajectories." ] }, @@ -161,8 +165,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 4) Expert: Custom `@register_agent`\n\n", - "Register custom agent configs that implement `predict_step` (and optionally `predict_click`). This gives you full control over prompting, message shaping, and tool wiring.\n\n", + "## 4) Expert: Custom `@register_agent`\n", + "\n", + "Register custom agent configs that implement `predict_step` (and optionally `predict_click`). This gives you full control over prompting, message shaping, and tool wiring.\n", + "\n", "See: `libs/python/agent/agent/loops/` for concrete examples." ] }, @@ -170,7 +176,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Next steps\n\n", + "## Next steps\n", + "\n", "- Start with `instructions` for fast wins.\n", "- Add function tools for determinism and reliability.\n", "- Use callbacks to manage cost, logs, and safety.\n", From bae97a6cb7760e20562cfb2b02cbf548f70a13f3 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Tue, 9 Sep 2025 11:08:19 -0400 Subject: [PATCH 5/6] Added message format documentation --- docs/content/docs/agent-sdk/chat-history.mdx | 8 +- .../content/docs/agent-sdk/message-format.mdx | 201 ++++++++++++++++++ docs/content/docs/agent-sdk/meta.json | 1 + 3 files changed, 203 insertions(+), 7 deletions(-) create mode 100644 docs/content/docs/agent-sdk/message-format.mdx diff --git a/docs/content/docs/agent-sdk/chat-history.mdx b/docs/content/docs/agent-sdk/chat-history.mdx index 83435a70..e7041c3b 100644 --- a/docs/content/docs/agent-sdk/chat-history.mdx +++ b/docs/content/docs/agent-sdk/chat-history.mdx @@ -75,13 +75,7 @@ messages = [ ## Message Types -- **user**: User input messages -- **computer_call**: Computer actions (click, type, keypress, etc.) -- **computer_call_output**: Results from computer actions (usually screenshots) -- **function_call**: Function calls (e.g., `computer.call`) -- **function_call_output**: Results from function calls -- **reasoning**: Agent's internal reasoning and planning -- **message**: Agent text responses +See the complete schema in [Message Format](./message-format). ### Memory Management diff --git a/docs/content/docs/agent-sdk/message-format.mdx b/docs/content/docs/agent-sdk/message-format.mdx new file mode 100644 index 00000000..ac329d4d --- /dev/null +++ b/docs/content/docs/agent-sdk/message-format.mdx @@ -0,0 +1,201 @@ +--- +title: Message Format +--- + +This page documents the Python message and response schema used by the Agent SDK. +It mirrors the structure shown in Chat History and provides precise type definitions you can target in your own code. + +All examples below use Python type hints with `TypedDict` and `Literal` from the standard `typing` module. + +## Response + +The agent yields response chunks as an async generator of objects with `output` and `usage`. 
+ +```python +from typing import List, TypedDict + +class Usage(TypedDict, total=False): + prompt_tokens: int + completion_tokens: int + total_tokens: int + response_cost: float # USD cost if available + +class AgentResponse(TypedDict): + output: List["AgentMessage"] + usage: Usage +``` + +## Messages + +Agent messages represent the state of the conversation and the agent's actions. + +```python +from typing import List, Literal, Optional, TypedDict, Union + +# Union of all message variants +AgentMessage = Union[ + "UserMessage", + "AssistantMessage", + "ReasoningMessage", + "ComputerCallMessage", + "ComputerCallOutputMessage", + "FunctionCallMessage", + "FunctionCallOutputMessage", +] + +# Input message (role: user/system/developer) +class UserMessage(TypedDict, total=False): + type: Literal["message"] # optional for user input + role: Literal["user", "system", "developer"] + content: Union[str, List["InputContent"]] + +# Output message (assistant text) +class AssistantMessage(TypedDict): + type: Literal["message"] + role: Literal["assistant"] + content: List["OutputContent"] + +# Output reasoning/thinking message +class ReasoningMessage(TypedDict): + type: Literal["reasoning"] + summary: List["SummaryContent"] + +# Output computer action call (agent intends to act) +class ComputerCallMessage(TypedDict): + type: Literal["computer_call"] + call_id: str + status: Literal["completed", "failed", "pending"] + action: "ComputerAction" + +# Output computer action result (always a screenshot) +class ComputerCallOutputMessage(TypedDict): + type: Literal["computer_call_output"] + call_id: str + output: "ComputerResultContent" + +# Output function call (agent calls a Python tool) +class FunctionCallMessage(TypedDict): + type: Literal["function_call"] + call_id: str + status: Literal["completed", "failed", "pending"] + name: str + arguments: str # JSON-serialized kwargs + +# Output function call result (text) +class FunctionCallOutputMessage(TypedDict): + type: Literal["function_call_output"] + call_id: str + output: str +``` + +## Message Content + +These content items appear inside `content` arrays for the message types above. + +```python +# Input content kinds +class InputContent(TypedDict): + type: Literal["input_image", "input_text"] + text: Optional[str] + image_url: Optional[str] # e.g., data URL + +# Assistant output content +class OutputContent(TypedDict): + type: Literal["output_text"] + text: str + +# Reasoning/summary output content +class SummaryContent(TypedDict): + type: Literal["summary_text"] + text: str + +# Computer call outputs (screenshots) +class ComputerResultContent(TypedDict): + type: Literal["computer_screenshot", "input_image"] + image_url: str # data URL (e.g., "data:image/png;base64,....") +``` + +## Actions + +Computer actions represent concrete operations the agent will perform on the computer. + +Two broad families exist depending on the provider: OpenAI-style and Anthropic-style. 
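+
+As a concrete example, a left click at (100, 200) arrives inside a `computer_call` message shaped like the dictionary below (the `call_id` value is illustrative); the full set of action types is given by the `ComputerAction` union that follows.
+
+```python
+computer_call = {
+    "type": "computer_call",
+    "call_id": "call_abc123",  # illustrative id
+    "status": "completed",
+    "action": {"type": "click", "button": "left", "x": 100, "y": 200},
+}
+```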
+ +```python +# Union of all supported computer actions +ComputerAction = Union[ + "ClickAction", + "DoubleClickAction", + "DragAction", + "KeyPressAction", + "MoveAction", + "ScreenshotAction", + "ScrollAction", + "TypeAction", + "WaitAction", + # Anthropic variants + "LeftMouseDownAction", + "LeftMouseUpAction", +] + +# OpenAI Computer Actions +class ClickAction(TypedDict): + type: Literal["click"] + button: Literal["left", "right", "wheel", "back", "forward"] + x: int + y: int + +class DoubleClickAction(TypedDict, total=False): + type: Literal["double_click"] + button: Literal["left", "right", "wheel", "back", "forward"] + x: int + y: int + +class DragAction(TypedDict, total=False): + type: Literal["drag"] + button: Literal["left", "right", "wheel", "back", "forward"] + path: List[tuple[int, int]] # [(x1, y1), (x2, y2), ...] + +class KeyPressAction(TypedDict): + type: Literal["keypress"] + keys: List[str] # e.g., ["ctrl", "a"] + +class MoveAction(TypedDict): + type: Literal["move"] + x: int + y: int + +class ScreenshotAction(TypedDict): + type: Literal["screenshot"] + +class ScrollAction(TypedDict): + type: Literal["scroll"] + scroll_x: int + scroll_y: int + x: int + y: int + +class TypeAction(TypedDict): + type: Literal["type"] + text: str + +class WaitAction(TypedDict): + type: Literal["wait"] + +# Anthropic Computer Actions +class LeftMouseDownAction(TypedDict): + type: Literal["left_mouse_down"] + x: int + y: int + +class LeftMouseUpAction(TypedDict): + type: Literal["left_mouse_up"] + x: int + y: int +``` + +## Notes + +- The agent runtime may add provider-specific fields when available (e.g., usage cost). Unknown fields should be ignored for forward compatibility. +- Computer action outputs are screenshots as data URLs. For security and storage, some serializers may redact or omit large fields in persisted metadata. +- The message flow typically alternates between reasoning, actions, screenshots, and concluding assistant text. See [Chat History](./chat-history) for a step-by-step example. diff --git a/docs/content/docs/agent-sdk/meta.json b/docs/content/docs/agent-sdk/meta.json index b745ce58..1083fc25 100644 --- a/docs/content/docs/agent-sdk/meta.json +++ b/docs/content/docs/agent-sdk/meta.json @@ -6,6 +6,7 @@ "supported-agents", "supported-model-providers", "chat-history", + "message-format", "customizing-computeragent", "callbacks", "custom-tools", From ae6d35ffa557455d093165612c3d4c77dccf1330 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Tue, 9 Sep 2025 11:23:13 -0400 Subject: [PATCH 6/6] Fixed broken link --- docs/content/docs/agent-sdk/customizing-computeragent.mdx | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/content/docs/agent-sdk/customizing-computeragent.mdx b/docs/content/docs/agent-sdk/customizing-computeragent.mdx index d94f4ec0..dac0d35f 100644 --- a/docs/content/docs/agent-sdk/customizing-computeragent.mdx +++ b/docs/content/docs/agent-sdk/customizing-computeragent.mdx @@ -13,8 +13,6 @@ This guide shows four proven ways to increase capabilities and success rate: - 3 — Intermediate: Callbacks - 4 — Expert: Custom `@register_agent` -For a hands-on walkthrough, see the companion notebook: [notebooks/customizing_computeragent.ipynb](../../../notebooks/customizing_computeragent.ipynb). - ## 1) Simple: Prompt engineering Provide guiding instructions to shape behavior. `ComputerAgent` accepts an optional `instructions: str | None` which acts like a system-style preface. Internally, this uses a callback that pre-pends a user message before each LLM call.