From 17d670962970a1d1774daaec029ebf92f1f9235e Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Tue, 9 Sep 2025 10:55:57 -0400 Subject: [PATCH 1/6] added simple guide for customizing computeragent --- .../agent-sdk/customizing-computeragent.mdx | 121 ++++++++++++++++++ docs/content/docs/agent-sdk/meta.json | 1 + libs/python/agent/agent/agent.py | 10 +- libs/python/agent/agent/callbacks/__init__.py | 2 + .../agent/agent/integrations/hud/__init__.py | 15 ++- 5 files changed, 145 insertions(+), 4 deletions(-) create mode 100644 docs/content/docs/agent-sdk/customizing-computeragent.mdx diff --git a/docs/content/docs/agent-sdk/customizing-computeragent.mdx b/docs/content/docs/agent-sdk/customizing-computeragent.mdx new file mode 100644 index 00000000..462bb5f9 --- /dev/null +++ b/docs/content/docs/agent-sdk/customizing-computeragent.mdx @@ -0,0 +1,121 @@ +--- +title: Customizing Your ComputerAgent +--- + +The `ComputerAgent` interface provides an easy proxy to any computer-using model configuration, and it is a powerful framework for extending and building your own agentic systems. + +This guide shows four proven ways to increase capabilities and success rate: + +- 1 — Simple: Prompt engineering +- 2 — Easy: Tools +- 3 — Intermediate: Callbacks +- 4 — Expert: Custom `@register_agent` + +For a hands-on walkthrough, see the companion notebook: [notebooks/customizing_computeragent.ipynb](../../../notebooks/customizing_computeragent.ipynb). + +## 1) Simple: Prompt engineering + +Provide guiding instructions to shape behavior. `ComputerAgent` accepts an optional `instructions: str | None` which acts like a system-style preface. Internally, this uses a callback that pre-pends a user message before each LLM call. + +```python +from agent.agent import ComputerAgent + +agent = ComputerAgent( + model="openai/computer-use-preview", + tools=[computer], + instructions=( + "You are a meticulous software operator. Prefer safe, deterministic actions. " + "Always confirm via on-screen text before proceeding." + ), +) +``` + +## 2) Easy: Tools + +Expose deterministic capabilities as tools (Python functions or custom computer handlers). The agent will call them when appropriate. + +```python +def calculate_percentage(numerator: float, denominator: float) -> str: + """Calculate percentage as a string. + + Args: + numerator: Numerator value + denominator: Denominator value + Returns: + A formatted percentage string (e.g., '75.00%'). + """ + if denominator == 0: + return "0.00%" + return f"{(numerator/denominator)*100:.2f}%" + +agent = ComputerAgent( + model="openai/computer-use-preview", + tools=[computer, calculate_percentage], +) +``` + +- See `docs/agent-sdk/custom-tools` for authoring function tools. +- See `docs/agent-sdk/custom-computer-handlers` for building full computer interfaces. + +## 3) Intermediate: Callbacks + +Callbacks provide lifecycle hooks to preprocess messages, postprocess outputs, record trajectories, manage costs, and more. + +```python +from agent.callbacks import ImageRetentionCallback, TrajectorySaverCallback, BudgetManagerCallback + +agent = ComputerAgent( + model="anthropic/claude-3-5-sonnet-20241022", + tools=[computer], + callbacks=[ + ImageRetentionCallback(only_n_most_recent_images=3), + TrajectorySaverCallback("./trajectories"), + BudgetManagerCallback(max_budget=10.0, raise_error=True), + ], +) +``` + +- Browse implementations in `libs/python/agent/agent/loops/`. 
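+
+The `instructions` option from section 1 is itself implemented as one of these callbacks. If you prefer to manage the callback list explicitly (for example, to control ordering relative to other callbacks), you can add `PromptInstructionsCallback` yourself. A minimal sketch, reusing the `computer` tool from the examples above:
+
+```python
+from agent.agent import ComputerAgent
+from agent.callbacks import PromptInstructionsCallback, BudgetManagerCallback
+
+agent = ComputerAgent(
+    model="openai/computer-use-preview",
+    tools=[computer],
+    callbacks=[
+        # Equivalent to passing instructions=... to ComputerAgent
+        PromptInstructionsCallback("Prefer safe, deterministic actions."),
+        BudgetManagerCallback(max_budget=5.0, raise_error=True),
+    ],
+)
+```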
+ +## 4) Expert: Custom `@register_agent` + +Build your own agent configuration class to control prompting, message shaping, and tool handling. This is the most flexible option for specialized domains. + +- Register your own `model=...` loop using `@register_agent` +- Browse implementations in `libs/python/agent/agent/loops/`. +- Implement `predict_step()` (and optionally `predict_click()`) and return the standardized output schema. + +```python +from agent.decorators import register_agent + +@register_agent(models=r".*my-special-model.*", priority=10) +class MyCustomAgentConfig: + async def predict_step(self, messages, model, tools, **kwargs): + # 1) Format messages for your provider + # 2) Call provider + # 3) Convert responses to the agent output schema + return {"output": [], "usage": {}} + + async def predict_click(self, model, image_b64, instruction): + # Optional: click-only capability + return None + + def get_capabilities(self): + return ["step"] +``` + +## HUD integration (optional) + +When using the HUD evaluation integration (`agent/integrations/hud/`), you can pass `instructions`, `tools`, and `callbacks` directly + +```python +from agent.integrations.hud import run_single_task + +await run_single_task( + dataset="username/dataset-name", + model="openai/computer-use-preview", + instructions="Operate carefully. Always verify on-screen text before actions.", + # tools=[your_custom_function], + # callbacks=[YourCustomCallback()], +) +``` \ No newline at end of file diff --git a/docs/content/docs/agent-sdk/meta.json b/docs/content/docs/agent-sdk/meta.json index 07bf7199..b745ce58 100644 --- a/docs/content/docs/agent-sdk/meta.json +++ b/docs/content/docs/agent-sdk/meta.json @@ -6,6 +6,7 @@ "supported-agents", "supported-model-providers", "chat-history", + "customizing-computeragent", "callbacks", "custom-tools", "custom-computer-handlers", diff --git a/libs/python/agent/agent/agent.py b/libs/python/agent/agent/agent.py index b796866d..feb0363b 100644 --- a/libs/python/agent/agent/agent.py +++ b/libs/python/agent/agent/agent.py @@ -31,7 +31,8 @@ from .callbacks import ( TrajectorySaverCallback, BudgetManagerCallback, TelemetryCallback, - OperatorNormalizerCallback + OperatorNormalizerCallback, + PromptInstructionsCallback, ) from .computers import ( AsyncComputerHandler, @@ -162,6 +163,7 @@ class ComputerAgent: custom_loop: Optional[Callable] = None, only_n_most_recent_images: Optional[int] = None, callbacks: Optional[List[Any]] = None, + instructions: Optional[str] = None, verbosity: Optional[int] = None, trajectory_dir: Optional[str | Path | dict] = None, max_retries: Optional[int] = 3, @@ -180,6 +182,7 @@ class ComputerAgent: custom_loop: Custom agent loop function to use instead of auto-selection only_n_most_recent_images: If set, only keep the N most recent images in message history. Adds ImageRetentionCallback automatically. callbacks: List of AsyncCallbackHandler instances for preprocessing/postprocessing + instructions: Optional system instructions to be passed to the model verbosity: Logging level (logging.DEBUG, logging.INFO, etc.). If set, adds LoggingCallback automatically trajectory_dir: If set, saves trajectory data (screenshots, responses) to this directory. Adds TrajectorySaverCallback automatically. 
max_retries: Maximum number of retries for failed API calls @@ -198,6 +201,7 @@ class ComputerAgent: self.custom_loop = custom_loop self.only_n_most_recent_images = only_n_most_recent_images self.callbacks = callbacks or [] + self.instructions = instructions self.verbosity = verbosity self.trajectory_dir = trajectory_dir self.max_retries = max_retries @@ -211,6 +215,10 @@ class ComputerAgent: # Prepend operator normalizer callback self.callbacks.insert(0, OperatorNormalizerCallback()) + # Add prompt instructions callback if provided + if self.instructions: + self.callbacks.append(PromptInstructionsCallback(self.instructions)) + # Add telemetry callback if telemetry_enabled is set if self.telemetry_enabled: if isinstance(self.telemetry_enabled, bool): diff --git a/libs/python/agent/agent/callbacks/__init__.py b/libs/python/agent/agent/callbacks/__init__.py index e0befcc7..eca40173 100644 --- a/libs/python/agent/agent/callbacks/__init__.py +++ b/libs/python/agent/agent/callbacks/__init__.py @@ -9,6 +9,7 @@ from .trajectory_saver import TrajectorySaverCallback from .budget_manager import BudgetManagerCallback from .telemetry import TelemetryCallback from .operator_validator import OperatorNormalizerCallback +from .prompt_instructions import PromptInstructionsCallback __all__ = [ "AsyncCallbackHandler", @@ -18,4 +19,5 @@ __all__ = [ "BudgetManagerCallback", "TelemetryCallback", "OperatorNormalizerCallback", + "PromptInstructionsCallback", ] diff --git a/libs/python/agent/agent/integrations/hud/__init__.py b/libs/python/agent/agent/integrations/hud/__init__.py index 0da87bfa..b0d06041 100644 --- a/libs/python/agent/agent/integrations/hud/__init__.py +++ b/libs/python/agent/agent/integrations/hud/__init__.py @@ -20,6 +20,7 @@ from hud import trace from agent.agent import ComputerAgent as BaseComputerAgent from .proxy import FakeAsyncOpenAI +from agent.callbacks import PromptInstructionsCallback # --------------------------------------------------------------------------- @@ -47,6 +48,7 @@ class ProxyOperatorAgent(OperatorAgent): custom_loop: Any | None = None, only_n_most_recent_images: int | None = None, callbacks: list[Any] | None = None, + instructions: str | None = None, verbosity: int | None = None, max_retries: int | None = 3, screenshot_delay: float | int = 0.5, @@ -68,12 +70,17 @@ class ProxyOperatorAgent(OperatorAgent): if tools: agent_tools.extend(tools) + # Build callbacks, injecting prompt instructions if provided + agent_callbacks = list(callbacks or []) + if instructions: + agent_callbacks.append(PromptInstructionsCallback(instructions)) + computer_agent = BaseComputerAgent( model=model, tools=agent_tools, custom_loop=custom_loop, only_n_most_recent_images=only_n_most_recent_images, - callbacks=callbacks, + callbacks=agent_callbacks, verbosity=verbosity, trajectory_dir=trajectory_dir, max_retries=max_retries, @@ -96,7 +103,6 @@ class ProxyOperatorAgent(OperatorAgent): # Single-task runner # --------------------------------------------------------------------------- - async def run_single_task( dataset: str | Dataset | list[dict[str, Any]], *, @@ -108,6 +114,7 @@ async def run_single_task( custom_loop: Any | None = None, only_n_most_recent_images: int | None = None, callbacks: list[Any] | None = None, + instructions: str | None = None, verbosity: int | None = None, trajectory_dir: str | dict | None = None, max_retries: int | None = 3, @@ -140,6 +147,7 @@ async def run_single_task( custom_loop=custom_loop, only_n_most_recent_images=only_n_most_recent_images, callbacks=callbacks, + 
instructions=instructions, verbosity=verbosity, trajectory_dir=trajectory_dir, max_retries=max_retries, @@ -157,7 +165,6 @@ async def run_single_task( # Full-dataset runner # --------------------------------------------------------------------------- - async def run_full_dataset( dataset: str | Dataset | list[dict[str, Any]], *, @@ -173,6 +180,7 @@ async def run_full_dataset( custom_loop: Any | None = None, only_n_most_recent_images: int | None = 5, callbacks: list[Any] | None = None, + instructions: str | None = None, verbosity: int | None = None, max_retries: int | None = 3, screenshot_delay: float | int = 0.5, @@ -207,6 +215,7 @@ async def run_full_dataset( "custom_loop": custom_loop, "only_n_most_recent_images": only_n_most_recent_images, "callbacks": callbacks, + "instructions": instructions, "verbosity": verbosity, "max_retries": max_retries, "screenshot_delay": screenshot_delay, From f270af30e1cae760335dc197f84f1175f3f44911 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Tue, 9 Sep 2025 10:57:16 -0400 Subject: [PATCH 2/6] added notebook --- notebooks/customizing_computeragent.ipynb | 194 ++++++++++++++++++++++ 1 file changed, 194 insertions(+) create mode 100644 notebooks/customizing_computeragent.ipynb diff --git a/notebooks/customizing_computeragent.ipynb b/notebooks/customizing_computeragent.ipynb new file mode 100644 index 00000000..b0234d24 --- /dev/null +++ b/notebooks/customizing_computeragent.ipynb @@ -0,0 +1,194 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Customizing Your ComputerAgent\n\n", + "This notebook demonstrates four practical ways to increase the capabilities and success rate of your `ComputerAgent` in the Agent SDK:\n\n", + "1. Simple: Prompt engineering (via optional `instructions`)\n", + "2. Easy: Tools (function tools and custom computer tools)\n", + "3. Intermediate: Callbacks\n", + "4. Expert: Custom `@register_agent` loops\n\n", + "> Tip: The same patterns work in scripts and services — the notebook just makes it easy to iterate." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup\n\n", + "We'll import `ComputerAgent`, a simple computer shim, and some utilities." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "from agent.agent import ComputerAgent\n", + "from agent.callbacks import PromptInstructionsCallback, LoggingCallback\n", + "\n", + "# A very small computer shim for demo purposes (for full computer handlers, see docs)\n", + "class DummyComputer:\n", + " async def screenshot(self):\n", + " # Return a 1x1 transparent PNG as base64 string (placeholder)\n", + " import base64\n", + " png_bytes = base64.b64decode(\"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8Xw8AAr8B9k2m0oYAAAAASUVORK5CYII=\")\n", + " return base64.b64encode(png_bytes).decode()\n", + "\n", + " async def click(self, x: int, y: int):\n", + " pass\n", + "\n", + " async def type(self, text: str):\n", + " pass\n", + "\n", + "computer = DummyComputer()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1) Simple: Prompt engineering\n\n", + "You can guide your agent with system-like `instructions`.\n\n", + "Under the hood, `ComputerAgent(instructions=...)` adds a `PromptInstructionsCallback` that prepends a user message before each LLM call.\n\n", + "This mirrors the recommended snippet in code:\n\n", + "```python\n", + "effective_input = full_input\n", + "if instructions:\n", + " effective_input = [{\"role\": \"user\", \"content\": instructions}] + full_input\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "instructions = (\n", + " \"You are a meticulous software operator. Prefer safe, deterministic actions. \"\n", + " \"Always confirm via on-screen text before proceeding.\"\n", + ")\n", + "agent = ComputerAgent(\n", + " model=\"openai/computer-use-preview\",\n", + " tools=[computer],\n", + " instructions=instructions,\n", + " callbacks=[LoggingCallback(level=logging.INFO)],\n", + ")\n", + "messages = [\n", + " {\"role\": \"user\", \"content\": \"Open the settings and turn on dark mode.\"}\n", + "]\n", + "\n", + "# In notebooks, you may want to consume the async generator\n", + "import asyncio\n", + "async def run_once():\n", + " async for chunk in agent.run(messages):\n", + " # Print any assistant text outputs\n", + " for item in chunk.get(\"output\", []):\n", + " if item.get(\"type\") == \"message\":\n", + " for c in item.get(\"content\", []):\n", + " if c.get(\"text\"):\n", + " print(c.get(\"text\"))\n", + "\n", + "await run_once()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2) Easy: Tools\n\n", + "Add function tools to expose deterministic capabilities. Tools are auto-extracted to schemas and callable by the agent." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def calculate_percentage(numerator: float, denominator: float) -> str:\n", + " \"\"\"Calculate a percentage string.\n", + "\n", + " Args:\n", + " numerator: Numerator value\n", + " denominator: Denominator value\n", + " Returns:\n", + " A formatted percentage string (e.g., '75.00%').\n", + " \"\"\"\n", + " if denominator == 0:\n", + " return \"0.00%\"\n", + " return f\"{(numerator/denominator)*100:.2f}%\"\n", + "\n", + "agent_with_tool = ComputerAgent(\n", + " model=\"openai/computer-use-preview\",\n", + " tools=[computer, calculate_percentage],\n", + " instructions=\"When doing math, prefer the `calculate_percentage` tool when relevant.\",\n", + ")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3) Intermediate: Callbacks\n\n", + "Callbacks offer lifecycle hooks. For example, limit recent images or record trajectories." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from agent.callbacks import ImageRetentionCallback, TrajectorySaverCallback\n", + "\n", + "agent_with_callbacks = ComputerAgent(\n", + " model=\"anthropic/claude-3-5-sonnet-20241022\",\n", + " tools=[computer],\n", + " callbacks=[\n", + " ImageRetentionCallback(only_n_most_recent_images=3),\n", + " TrajectorySaverCallback(\"./trajectories\"),\n", + " ],\n", + ")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4) Expert: Custom `@register_agent`\n\n", + "Register custom agent configs that implement `predict_step` (and optionally `predict_click`). This gives you full control over prompting, message shaping, and tool wiring.\n\n", + "See: `libs/python/agent/agent/loops/` for concrete examples." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Next steps\n\n", + "- Start with `instructions` for fast wins.\n", + "- Add function tools for determinism and reliability.\n", + "- Use callbacks to manage cost, logs, and safety.\n", + "- Build custom loops for specialized domains." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From b21c66894641a783db546fb3018817afc829ec84 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Tue, 9 Sep 2025 10:58:37 -0400 Subject: [PATCH 3/6] added notebook --- docs/content/docs/agent-sdk/customizing-computeragent.mdx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/content/docs/agent-sdk/customizing-computeragent.mdx b/docs/content/docs/agent-sdk/customizing-computeragent.mdx index 462bb5f9..d94f4ec0 100644 --- a/docs/content/docs/agent-sdk/customizing-computeragent.mdx +++ b/docs/content/docs/agent-sdk/customizing-computeragent.mdx @@ -2,6 +2,8 @@ title: Customizing Your ComputerAgent --- +A corresponding Jupyter Notebook is available for this documentation. + The `ComputerAgent` interface provides an easy proxy to any computer-using model configuration, and it is a powerful framework for extending and building your own agentic systems. 
This guide shows four proven ways to increase capabilities and success rate: From 665e65cb856a5515c04471dde336ce27f6ba48a2 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Tue, 9 Sep 2025 11:00:52 -0400 Subject: [PATCH 4/6] Replaced computer shim with Docker computer --- notebooks/customizing_computeragent.ipynb | 65 +++++++++++++---------- 1 file changed, 36 insertions(+), 29 deletions(-) diff --git a/notebooks/customizing_computeragent.ipynb b/notebooks/customizing_computeragent.ipynb index b0234d24..56f0beb9 100644 --- a/notebooks/customizing_computeragent.ipynb +++ b/notebooks/customizing_computeragent.ipynb @@ -4,12 +4,15 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Customizing Your ComputerAgent\n\n", - "This notebook demonstrates four practical ways to increase the capabilities and success rate of your `ComputerAgent` in the Agent SDK:\n\n", + "# Customizing Your ComputerAgent\n", + "\n", + "This notebook demonstrates four practical ways to increase the capabilities and success rate of your `ComputerAgent` in the Agent SDK:\n", + "\n", "1. Simple: Prompt engineering (via optional `instructions`)\n", "2. Easy: Tools (function tools and custom computer tools)\n", "3. Intermediate: Callbacks\n", - "4. Expert: Custom `@register_agent` loops\n\n", + "4. Expert: Custom `@register_agent` loops\n", + "\n", "> Tip: The same patterns work in scripts and services — the notebook just makes it easy to iterate." ] }, @@ -17,8 +20,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Setup\n\n", - "We'll import `ComputerAgent`, a simple computer shim, and some utilities." + "## Setup\n", + "\n", + "We'll import `ComputerAgent`, a simple Docker-based computer, and some utilities." ] }, { @@ -29,33 +33,31 @@ "source": [ "import logging\n", "from agent.agent import ComputerAgent\n", - "from agent.callbacks import PromptInstructionsCallback, LoggingCallback\n", + "from agent.callbacks import LoggingCallback\n", + "from computer import Computer\n", "\n", - "# A very small computer shim for demo purposes (for full computer handlers, see docs)\n", - "class DummyComputer:\n", - " async def screenshot(self):\n", - " # Return a 1x1 transparent PNG as base64 string (placeholder)\n", - " import base64\n", - " png_bytes = base64.b64decode(\"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8Xw8AAr8B9k2m0oYAAAAASUVORK5CYII=\")\n", - " return base64.b64encode(png_bytes).decode()\n", + "computer = Computer(\n", + " os_type=\"linux\",\n", + " provider_type=\"docker\",\n", + " image=\"trycua/cua-ubuntu:latest\",\n", + " name=\"my-cua-container\"\n", + ")\n", "\n", - " async def click(self, x: int, y: int):\n", - " pass\n", - "\n", - " async def type(self, text: str):\n", - " pass\n", - "\n", - "computer = DummyComputer()\n" + "await computer.run() # Launch & connect to Docker container" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 1) Simple: Prompt engineering\n\n", - "You can guide your agent with system-like `instructions`.\n\n", - "Under the hood, `ComputerAgent(instructions=...)` adds a `PromptInstructionsCallback` that prepends a user message before each LLM call.\n\n", - "This mirrors the recommended snippet in code:\n\n", + "## 1) Simple: Prompt engineering\n", + "\n", + "You can guide your agent with system-like `instructions`.\n", + "\n", + "Under the hood, `ComputerAgent(instructions=...)` adds a `PromptInstructionsCallback` that prepends a user message before each LLM call.\n", + "\n", + "This mirrors the recommended snippet in code:\n", + "\n", 
"```python\n", "effective_input = full_input\n", "if instructions:\n", @@ -101,7 +103,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 2) Easy: Tools\n\n", + "## 2) Easy: Tools\n", + "\n", "Add function tools to expose deterministic capabilities. Tools are auto-extracted to schemas and callable by the agent." ] }, @@ -135,7 +138,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 3) Intermediate: Callbacks\n\n", + "## 3) Intermediate: Callbacks\n", + "\n", "Callbacks offer lifecycle hooks. For example, limit recent images or record trajectories." ] }, @@ -161,8 +165,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 4) Expert: Custom `@register_agent`\n\n", - "Register custom agent configs that implement `predict_step` (and optionally `predict_click`). This gives you full control over prompting, message shaping, and tool wiring.\n\n", + "## 4) Expert: Custom `@register_agent`\n", + "\n", + "Register custom agent configs that implement `predict_step` (and optionally `predict_click`). This gives you full control over prompting, message shaping, and tool wiring.\n", + "\n", "See: `libs/python/agent/agent/loops/` for concrete examples." ] }, @@ -170,7 +176,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Next steps\n\n", + "## Next steps\n", + "\n", "- Start with `instructions` for fast wins.\n", "- Add function tools for determinism and reliability.\n", "- Use callbacks to manage cost, logs, and safety.\n", From bae97a6cb7760e20562cfb2b02cbf548f70a13f3 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Tue, 9 Sep 2025 11:08:19 -0400 Subject: [PATCH 5/6] Added message format documentation --- docs/content/docs/agent-sdk/chat-history.mdx | 8 +- .../content/docs/agent-sdk/message-format.mdx | 201 ++++++++++++++++++ docs/content/docs/agent-sdk/meta.json | 1 + 3 files changed, 203 insertions(+), 7 deletions(-) create mode 100644 docs/content/docs/agent-sdk/message-format.mdx diff --git a/docs/content/docs/agent-sdk/chat-history.mdx b/docs/content/docs/agent-sdk/chat-history.mdx index 83435a70..e7041c3b 100644 --- a/docs/content/docs/agent-sdk/chat-history.mdx +++ b/docs/content/docs/agent-sdk/chat-history.mdx @@ -75,13 +75,7 @@ messages = [ ## Message Types -- **user**: User input messages -- **computer_call**: Computer actions (click, type, keypress, etc.) -- **computer_call_output**: Results from computer actions (usually screenshots) -- **function_call**: Function calls (e.g., `computer.call`) -- **function_call_output**: Results from function calls -- **reasoning**: Agent's internal reasoning and planning -- **message**: Agent text responses +See the complete schema in [Message Format](./message-format). ### Memory Management diff --git a/docs/content/docs/agent-sdk/message-format.mdx b/docs/content/docs/agent-sdk/message-format.mdx new file mode 100644 index 00000000..ac329d4d --- /dev/null +++ b/docs/content/docs/agent-sdk/message-format.mdx @@ -0,0 +1,201 @@ +--- +title: Message Format +--- + +This page documents the Python message and response schema used by the Agent SDK. +It mirrors the structure shown in Chat History and provides precise type definitions you can target in your own code. + +All examples below use Python type hints with `TypedDict` and `Literal` from the standard `typing` module. + +## Response + +The agent yields response chunks as an async generator of objects with `output` and `usage`. 
+ +```python +from typing import List, TypedDict + +class Usage(TypedDict, total=False): + prompt_tokens: int + completion_tokens: int + total_tokens: int + response_cost: float # USD cost if available + +class AgentResponse(TypedDict): + output: List["AgentMessage"] + usage: Usage +``` + +## Messages + +Agent messages represent the state of the conversation and the agent's actions. + +```python +from typing import List, Literal, Optional, TypedDict, Union + +# Union of all message variants +AgentMessage = Union[ + "UserMessage", + "AssistantMessage", + "ReasoningMessage", + "ComputerCallMessage", + "ComputerCallOutputMessage", + "FunctionCallMessage", + "FunctionCallOutputMessage", +] + +# Input message (role: user/system/developer) +class UserMessage(TypedDict, total=False): + type: Literal["message"] # optional for user input + role: Literal["user", "system", "developer"] + content: Union[str, List["InputContent"]] + +# Output message (assistant text) +class AssistantMessage(TypedDict): + type: Literal["message"] + role: Literal["assistant"] + content: List["OutputContent"] + +# Output reasoning/thinking message +class ReasoningMessage(TypedDict): + type: Literal["reasoning"] + summary: List["SummaryContent"] + +# Output computer action call (agent intends to act) +class ComputerCallMessage(TypedDict): + type: Literal["computer_call"] + call_id: str + status: Literal["completed", "failed", "pending"] + action: "ComputerAction" + +# Output computer action result (always a screenshot) +class ComputerCallOutputMessage(TypedDict): + type: Literal["computer_call_output"] + call_id: str + output: "ComputerResultContent" + +# Output function call (agent calls a Python tool) +class FunctionCallMessage(TypedDict): + type: Literal["function_call"] + call_id: str + status: Literal["completed", "failed", "pending"] + name: str + arguments: str # JSON-serialized kwargs + +# Output function call result (text) +class FunctionCallOutputMessage(TypedDict): + type: Literal["function_call_output"] + call_id: str + output: str +``` + +## Message Content + +These content items appear inside `content` arrays for the message types above. + +```python +# Input content kinds +class InputContent(TypedDict): + type: Literal["input_image", "input_text"] + text: Optional[str] + image_url: Optional[str] # e.g., data URL + +# Assistant output content +class OutputContent(TypedDict): + type: Literal["output_text"] + text: str + +# Reasoning/summary output content +class SummaryContent(TypedDict): + type: Literal["summary_text"] + text: str + +# Computer call outputs (screenshots) +class ComputerResultContent(TypedDict): + type: Literal["computer_screenshot", "input_image"] + image_url: str # data URL (e.g., "data:image/png;base64,....") +``` + +## Actions + +Computer actions represent concrete operations the agent will perform on the computer. + +Two broad families exist depending on the provider: OpenAI-style and Anthropic-style. 
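+
+As a concrete example, a left click at (100, 200) arrives inside a `computer_call` message shaped like the dictionary below (the `call_id` value is illustrative); the full set of action types is given by the `ComputerAction` union that follows.
+
+```python
+computer_call = {
+    "type": "computer_call",
+    "call_id": "call_abc123",  # illustrative id
+    "status": "completed",
+    "action": {"type": "click", "button": "left", "x": 100, "y": 200},
+}
+```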
+ +```python +# Union of all supported computer actions +ComputerAction = Union[ + "ClickAction", + "DoubleClickAction", + "DragAction", + "KeyPressAction", + "MoveAction", + "ScreenshotAction", + "ScrollAction", + "TypeAction", + "WaitAction", + # Anthropic variants + "LeftMouseDownAction", + "LeftMouseUpAction", +] + +# OpenAI Computer Actions +class ClickAction(TypedDict): + type: Literal["click"] + button: Literal["left", "right", "wheel", "back", "forward"] + x: int + y: int + +class DoubleClickAction(TypedDict, total=False): + type: Literal["double_click"] + button: Literal["left", "right", "wheel", "back", "forward"] + x: int + y: int + +class DragAction(TypedDict, total=False): + type: Literal["drag"] + button: Literal["left", "right", "wheel", "back", "forward"] + path: List[tuple[int, int]] # [(x1, y1), (x2, y2), ...] + +class KeyPressAction(TypedDict): + type: Literal["keypress"] + keys: List[str] # e.g., ["ctrl", "a"] + +class MoveAction(TypedDict): + type: Literal["move"] + x: int + y: int + +class ScreenshotAction(TypedDict): + type: Literal["screenshot"] + +class ScrollAction(TypedDict): + type: Literal["scroll"] + scroll_x: int + scroll_y: int + x: int + y: int + +class TypeAction(TypedDict): + type: Literal["type"] + text: str + +class WaitAction(TypedDict): + type: Literal["wait"] + +# Anthropic Computer Actions +class LeftMouseDownAction(TypedDict): + type: Literal["left_mouse_down"] + x: int + y: int + +class LeftMouseUpAction(TypedDict): + type: Literal["left_mouse_up"] + x: int + y: int +``` + +## Notes + +- The agent runtime may add provider-specific fields when available (e.g., usage cost). Unknown fields should be ignored for forward compatibility. +- Computer action outputs are screenshots as data URLs. For security and storage, some serializers may redact or omit large fields in persisted metadata. +- The message flow typically alternates between reasoning, actions, screenshots, and concluding assistant text. See [Chat History](./chat-history) for a step-by-step example. diff --git a/docs/content/docs/agent-sdk/meta.json b/docs/content/docs/agent-sdk/meta.json index b745ce58..1083fc25 100644 --- a/docs/content/docs/agent-sdk/meta.json +++ b/docs/content/docs/agent-sdk/meta.json @@ -6,6 +6,7 @@ "supported-agents", "supported-model-providers", "chat-history", + "message-format", "customizing-computeragent", "callbacks", "custom-tools", From ae6d35ffa557455d093165612c3d4c77dccf1330 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Tue, 9 Sep 2025 11:23:13 -0400 Subject: [PATCH 6/6] Fixed broken link --- docs/content/docs/agent-sdk/customizing-computeragent.mdx | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/content/docs/agent-sdk/customizing-computeragent.mdx b/docs/content/docs/agent-sdk/customizing-computeragent.mdx index d94f4ec0..dac0d35f 100644 --- a/docs/content/docs/agent-sdk/customizing-computeragent.mdx +++ b/docs/content/docs/agent-sdk/customizing-computeragent.mdx @@ -13,8 +13,6 @@ This guide shows four proven ways to increase capabilities and success rate: - 3 — Intermediate: Callbacks - 4 — Expert: Custom `@register_agent` -For a hands-on walkthrough, see the companion notebook: [notebooks/customizing_computeragent.ipynb](../../../notebooks/customizing_computeragent.ipynb). - ## 1) Simple: Prompt engineering Provide guiding instructions to shape behavior. `ComputerAgent` accepts an optional `instructions: str | None` which acts like a system-style preface. Internally, this uses a callback that pre-pends a user message before each LLM call.