added simple guide for customizing computeragent

This commit is contained in:
Dillon DuPont
2025-09-09 10:55:57 -04:00
parent c38cab2776
commit 17d6709629
5 changed files with 145 additions and 4 deletions

View File

@@ -0,0 +1,121 @@
---
title: Customizing Your ComputerAgent
---
The `ComputerAgent` interface provides an easy proxy to any computer-using model configuration, and it is a powerful framework for extending and building your own agentic systems.
This guide shows four proven ways to increase capabilities and success rate:
- 1 — Simple: Prompt engineering
- 2 — Easy: Tools
- 3 — Intermediate: Callbacks
- 4 — Expert: Custom `@register_agent`
For a hands-on walkthrough, see the companion notebook: [notebooks/customizing_computeragent.ipynb](../../../notebooks/customizing_computeragent.ipynb).
## 1) Simple: Prompt engineering
Provide guiding instructions to shape behavior. `ComputerAgent` accepts an optional `instructions: str | None` which acts like a system-style preface. Internally, this uses a callback that pre-pends a user message before each LLM call.
```python
from agent.agent import ComputerAgent
agent = ComputerAgent(
model="openai/computer-use-preview",
tools=[computer],
instructions=(
"You are a meticulous software operator. Prefer safe, deterministic actions. "
"Always confirm via on-screen text before proceeding."
),
)
```
## 2) Easy: Tools
Expose deterministic capabilities as tools (Python functions or custom computer handlers). The agent will call them when appropriate.
```python
def calculate_percentage(numerator: float, denominator: float) -> str:
"""Calculate percentage as a string.
Args:
numerator: Numerator value
denominator: Denominator value
Returns:
A formatted percentage string (e.g., '75.00%').
"""
if denominator == 0:
return "0.00%"
return f"{(numerator/denominator)*100:.2f}%"
agent = ComputerAgent(
model="openai/computer-use-preview",
tools=[computer, calculate_percentage],
)
```
- See `docs/agent-sdk/custom-tools` for authoring function tools.
- See `docs/agent-sdk/custom-computer-handlers` for building full computer interfaces.
## 3) Intermediate: Callbacks
Callbacks provide lifecycle hooks to preprocess messages, postprocess outputs, record trajectories, manage costs, and more.
```python
from agent.callbacks import ImageRetentionCallback, TrajectorySaverCallback, BudgetManagerCallback
agent = ComputerAgent(
model="anthropic/claude-3-5-sonnet-20241022",
tools=[computer],
callbacks=[
ImageRetentionCallback(only_n_most_recent_images=3),
TrajectorySaverCallback("./trajectories"),
BudgetManagerCallback(max_budget=10.0, raise_error=True),
],
)
```
- Browse implementations in `libs/python/agent/agent/loops/`.
## 4) Expert: Custom `@register_agent`
Build your own agent configuration class to control prompting, message shaping, and tool handling. This is the most flexible option for specialized domains.
- Register your own `model=...` loop using `@register_agent`
- Browse implementations in `libs/python/agent/agent/loops/`.
- Implement `predict_step()` (and optionally `predict_click()`) and return the standardized output schema.
```python
from agent.decorators import register_agent
@register_agent(models=r".*my-special-model.*", priority=10)
class MyCustomAgentConfig:
async def predict_step(self, messages, model, tools, **kwargs):
# 1) Format messages for your provider
# 2) Call provider
# 3) Convert responses to the agent output schema
return {"output": [], "usage": {}}
async def predict_click(self, model, image_b64, instruction):
# Optional: click-only capability
return None
def get_capabilities(self):
return ["step"]
```
## HUD integration (optional)
When using the HUD evaluation integration (`agent/integrations/hud/`), you can pass `instructions`, `tools`, and `callbacks` directly
```python
from agent.integrations.hud import run_single_task
await run_single_task(
dataset="username/dataset-name",
model="openai/computer-use-preview",
instructions="Operate carefully. Always verify on-screen text before actions.",
# tools=[your_custom_function],
# callbacks=[YourCustomCallback()],
)
```

View File

@@ -6,6 +6,7 @@
"supported-agents",
"supported-model-providers",
"chat-history",
"customizing-computeragent",
"callbacks",
"custom-tools",
"custom-computer-handlers",

View File

@@ -31,7 +31,8 @@ from .callbacks import (
TrajectorySaverCallback,
BudgetManagerCallback,
TelemetryCallback,
OperatorNormalizerCallback
OperatorNormalizerCallback,
PromptInstructionsCallback,
)
from .computers import (
AsyncComputerHandler,
@@ -162,6 +163,7 @@ class ComputerAgent:
custom_loop: Optional[Callable] = None,
only_n_most_recent_images: Optional[int] = None,
callbacks: Optional[List[Any]] = None,
instructions: Optional[str] = None,
verbosity: Optional[int] = None,
trajectory_dir: Optional[str | Path | dict] = None,
max_retries: Optional[int] = 3,
@@ -180,6 +182,7 @@ class ComputerAgent:
custom_loop: Custom agent loop function to use instead of auto-selection
only_n_most_recent_images: If set, only keep the N most recent images in message history. Adds ImageRetentionCallback automatically.
callbacks: List of AsyncCallbackHandler instances for preprocessing/postprocessing
instructions: Optional system instructions to be passed to the model
verbosity: Logging level (logging.DEBUG, logging.INFO, etc.). If set, adds LoggingCallback automatically
trajectory_dir: If set, saves trajectory data (screenshots, responses) to this directory. Adds TrajectorySaverCallback automatically.
max_retries: Maximum number of retries for failed API calls
@@ -198,6 +201,7 @@ class ComputerAgent:
self.custom_loop = custom_loop
self.only_n_most_recent_images = only_n_most_recent_images
self.callbacks = callbacks or []
self.instructions = instructions
self.verbosity = verbosity
self.trajectory_dir = trajectory_dir
self.max_retries = max_retries
@@ -211,6 +215,10 @@ class ComputerAgent:
# Prepend operator normalizer callback
self.callbacks.insert(0, OperatorNormalizerCallback())
# Add prompt instructions callback if provided
if self.instructions:
self.callbacks.append(PromptInstructionsCallback(self.instructions))
# Add telemetry callback if telemetry_enabled is set
if self.telemetry_enabled:
if isinstance(self.telemetry_enabled, bool):

View File

@@ -9,6 +9,7 @@ from .trajectory_saver import TrajectorySaverCallback
from .budget_manager import BudgetManagerCallback
from .telemetry import TelemetryCallback
from .operator_validator import OperatorNormalizerCallback
from .prompt_instructions import PromptInstructionsCallback
__all__ = [
"AsyncCallbackHandler",
@@ -18,4 +19,5 @@ __all__ = [
"BudgetManagerCallback",
"TelemetryCallback",
"OperatorNormalizerCallback",
"PromptInstructionsCallback",
]

View File

@@ -20,6 +20,7 @@ from hud import trace
from agent.agent import ComputerAgent as BaseComputerAgent
from .proxy import FakeAsyncOpenAI
from agent.callbacks import PromptInstructionsCallback
# ---------------------------------------------------------------------------
@@ -47,6 +48,7 @@ class ProxyOperatorAgent(OperatorAgent):
custom_loop: Any | None = None,
only_n_most_recent_images: int | None = None,
callbacks: list[Any] | None = None,
instructions: str | None = None,
verbosity: int | None = None,
max_retries: int | None = 3,
screenshot_delay: float | int = 0.5,
@@ -68,12 +70,17 @@ class ProxyOperatorAgent(OperatorAgent):
if tools:
agent_tools.extend(tools)
# Build callbacks, injecting prompt instructions if provided
agent_callbacks = list(callbacks or [])
if instructions:
agent_callbacks.append(PromptInstructionsCallback(instructions))
computer_agent = BaseComputerAgent(
model=model,
tools=agent_tools,
custom_loop=custom_loop,
only_n_most_recent_images=only_n_most_recent_images,
callbacks=callbacks,
callbacks=agent_callbacks,
verbosity=verbosity,
trajectory_dir=trajectory_dir,
max_retries=max_retries,
@@ -96,7 +103,6 @@ class ProxyOperatorAgent(OperatorAgent):
# Single-task runner
# ---------------------------------------------------------------------------
async def run_single_task(
dataset: str | Dataset | list[dict[str, Any]],
*,
@@ -108,6 +114,7 @@ async def run_single_task(
custom_loop: Any | None = None,
only_n_most_recent_images: int | None = None,
callbacks: list[Any] | None = None,
instructions: str | None = None,
verbosity: int | None = None,
trajectory_dir: str | dict | None = None,
max_retries: int | None = 3,
@@ -140,6 +147,7 @@ async def run_single_task(
custom_loop=custom_loop,
only_n_most_recent_images=only_n_most_recent_images,
callbacks=callbacks,
instructions=instructions,
verbosity=verbosity,
trajectory_dir=trajectory_dir,
max_retries=max_retries,
@@ -157,7 +165,6 @@ async def run_single_task(
# Full-dataset runner
# ---------------------------------------------------------------------------
async def run_full_dataset(
dataset: str | Dataset | list[dict[str, Any]],
*,
@@ -173,6 +180,7 @@ async def run_full_dataset(
custom_loop: Any | None = None,
only_n_most_recent_images: int | None = 5,
callbacks: list[Any] | None = None,
instructions: str | None = None,
verbosity: int | None = None,
max_retries: int | None = 3,
screenshot_delay: float | int = 0.5,
@@ -207,6 +215,7 @@ async def run_full_dataset(
"custom_loop": custom_loop,
"only_n_most_recent_images": only_n_most_recent_images,
"callbacks": callbacks,
"instructions": instructions,
"verbosity": verbosity,
"max_retries": max_retries,
"screenshot_delay": screenshot_delay,