mirror of
https://github.com/trycua/computer.git
synced 2026-02-19 12:59:34 -06:00
added simple guide for customizing computeragent
This commit is contained in:
121
docs/content/docs/agent-sdk/customizing-computeragent.mdx
Normal file
121
docs/content/docs/agent-sdk/customizing-computeragent.mdx
Normal file
@@ -0,0 +1,121 @@
|
||||
---
|
||||
title: Customizing Your ComputerAgent
|
||||
---
|
||||
|
||||
The `ComputerAgent` interface provides an easy proxy to any computer-using model configuration, and it is a powerful framework for extending and building your own agentic systems.
|
||||
|
||||
This guide shows four proven ways to increase capabilities and success rate:
|
||||
|
||||
- 1 — Simple: Prompt engineering
|
||||
- 2 — Easy: Tools
|
||||
- 3 — Intermediate: Callbacks
|
||||
- 4 — Expert: Custom `@register_agent`
|
||||
|
||||
For a hands-on walkthrough, see the companion notebook: [notebooks/customizing_computeragent.ipynb](../../../notebooks/customizing_computeragent.ipynb).
|
||||
|
||||
## 1) Simple: Prompt engineering
|
||||
|
||||
Provide guiding instructions to shape behavior. `ComputerAgent` accepts an optional `instructions: str | None` argument, which acts like a system-style preface. Internally, this uses a callback that prepends a user message before each LLM call.
|
||||
|
||||
```python
|
||||
from agent.agent import ComputerAgent
|
||||
|
||||
agent = ComputerAgent(
|
||||
model="openai/computer-use-preview",
|
||||
tools=[computer],
|
||||
instructions=(
|
||||
"You are a meticulous software operator. Prefer safe, deterministic actions. "
|
||||
"Always confirm via on-screen text before proceeding."
|
||||
),
|
||||
)
|
||||
```
|
||||
|
||||
## 2) Easy: Tools
|
||||
|
||||
Expose deterministic capabilities as tools (Python functions or custom computer handlers). The agent will call them when appropriate.
|
||||
|
||||
```python
|
||||
def calculate_percentage(numerator: float, denominator: float) -> str:
|
||||
"""Calculate percentage as a string.
|
||||
|
||||
Args:
|
||||
numerator: Numerator value
|
||||
denominator: Denominator value
|
||||
Returns:
|
||||
A formatted percentage string (e.g., '75.00%').
|
||||
"""
|
||||
if denominator == 0:
|
||||
return "0.00%"
|
||||
return f"{(numerator/denominator)*100:.2f}%"
|
||||
|
||||
agent = ComputerAgent(
|
||||
model="openai/computer-use-preview",
|
||||
tools=[computer, calculate_percentage],
|
||||
)
|
||||
```
|
||||
|
||||
- See `docs/agent-sdk/custom-tools` for authoring function tools.
|
||||
- See `docs/agent-sdk/custom-computer-handlers` for building full computer interfaces.
|
||||
|
||||
## 3) Intermediate: Callbacks
|
||||
|
||||
Callbacks provide lifecycle hooks to preprocess messages, postprocess outputs, record trajectories, manage costs, and more.
|
||||
|
||||
```python
|
||||
from agent.callbacks import ImageRetentionCallback, TrajectorySaverCallback, BudgetManagerCallback
|
||||
|
||||
agent = ComputerAgent(
|
||||
model="anthropic/claude-3-5-sonnet-20241022",
|
||||
tools=[computer],
|
||||
callbacks=[
|
||||
ImageRetentionCallback(only_n_most_recent_images=3),
|
||||
TrajectorySaverCallback("./trajectories"),
|
||||
BudgetManagerCallback(max_budget=10.0, raise_error=True),
|
||||
],
|
||||
)
|
||||
```
|
||||
|
||||
- Browse implementations in `libs/python/agent/agent/callbacks/`.
|
||||
|
||||
## 4) Expert: Custom `@register_agent`
|
||||
|
||||
Build your own agent configuration class to control prompting, message shaping, and tool handling. This is the most flexible option for specialized domains.
|
||||
|
||||
- Register your own `model=...` loop using `@register_agent`.
|
||||
- Browse implementations in `libs/python/agent/agent/loops/`.
|
||||
- Implement `predict_step()` (and optionally `predict_click()`) and return the standardized output schema.
|
||||
|
||||
```python
|
||||
from agent.decorators import register_agent
|
||||
|
||||
@register_agent(models=r".*my-special-model.*", priority=10)
|
||||
class MyCustomAgentConfig:
|
||||
async def predict_step(self, messages, model, tools, **kwargs):
|
||||
# 1) Format messages for your provider
|
||||
# 2) Call provider
|
||||
# 3) Convert responses to the agent output schema
|
||||
return {"output": [], "usage": {}}
|
||||
|
||||
async def predict_click(self, model, image_b64, instruction):
|
||||
# Optional: click-only capability
|
||||
return None
|
||||
|
||||
def get_capabilities(self):
|
||||
return ["step"]
|
||||
```
|
||||
|
||||
## HUD integration (optional)
|
||||
|
||||
When using the HUD evaluation integration (`agent/integrations/hud/`), you can pass `instructions`, `tools`, and `callbacks` directly:
|
||||
|
||||
```python
|
||||
from agent.integrations.hud import run_single_task
|
||||
|
||||
await run_single_task(
|
||||
dataset="username/dataset-name",
|
||||
model="openai/computer-use-preview",
|
||||
instructions="Operate carefully. Always verify on-screen text before actions.",
|
||||
# tools=[your_custom_function],
|
||||
# callbacks=[YourCustomCallback()],
|
||||
)
|
||||
```
|
||||
@@ -6,6 +6,7 @@
|
||||
"supported-agents",
|
||||
"supported-model-providers",
|
||||
"chat-history",
|
||||
"customizing-computeragent",
|
||||
"callbacks",
|
||||
"custom-tools",
|
||||
"custom-computer-handlers",
|
||||
|
||||
@@ -31,7 +31,8 @@ from .callbacks import (
|
||||
TrajectorySaverCallback,
|
||||
BudgetManagerCallback,
|
||||
TelemetryCallback,
|
||||
OperatorNormalizerCallback
|
||||
OperatorNormalizerCallback,
|
||||
PromptInstructionsCallback,
|
||||
)
|
||||
from .computers import (
|
||||
AsyncComputerHandler,
|
||||
@@ -162,6 +163,7 @@ class ComputerAgent:
|
||||
custom_loop: Optional[Callable] = None,
|
||||
only_n_most_recent_images: Optional[int] = None,
|
||||
callbacks: Optional[List[Any]] = None,
|
||||
instructions: Optional[str] = None,
|
||||
verbosity: Optional[int] = None,
|
||||
trajectory_dir: Optional[str | Path | dict] = None,
|
||||
max_retries: Optional[int] = 3,
|
||||
@@ -180,6 +182,7 @@ class ComputerAgent:
|
||||
custom_loop: Custom agent loop function to use instead of auto-selection
|
||||
only_n_most_recent_images: If set, only keep the N most recent images in message history. Adds ImageRetentionCallback automatically.
|
||||
callbacks: List of AsyncCallbackHandler instances for preprocessing/postprocessing
|
||||
instructions: Optional system instructions to be passed to the model
|
||||
verbosity: Logging level (logging.DEBUG, logging.INFO, etc.). If set, adds LoggingCallback automatically
|
||||
trajectory_dir: If set, saves trajectory data (screenshots, responses) to this directory. Adds TrajectorySaverCallback automatically.
|
||||
max_retries: Maximum number of retries for failed API calls
|
||||
@@ -198,6 +201,7 @@ class ComputerAgent:
|
||||
self.custom_loop = custom_loop
|
||||
self.only_n_most_recent_images = only_n_most_recent_images
|
||||
self.callbacks = callbacks or []
|
||||
self.instructions = instructions
|
||||
self.verbosity = verbosity
|
||||
self.trajectory_dir = trajectory_dir
|
||||
self.max_retries = max_retries
|
||||
@@ -211,6 +215,10 @@ class ComputerAgent:
|
||||
# Prepend operator normalizer callback
|
||||
self.callbacks.insert(0, OperatorNormalizerCallback())
|
||||
|
||||
# Add prompt instructions callback if provided
|
||||
if self.instructions:
|
||||
self.callbacks.append(PromptInstructionsCallback(self.instructions))
|
||||
|
||||
# Add telemetry callback if telemetry_enabled is set
|
||||
if self.telemetry_enabled:
|
||||
if isinstance(self.telemetry_enabled, bool):
|
||||
|
||||
@@ -9,6 +9,7 @@ from .trajectory_saver import TrajectorySaverCallback
|
||||
from .budget_manager import BudgetManagerCallback
|
||||
from .telemetry import TelemetryCallback
|
||||
from .operator_validator import OperatorNormalizerCallback
|
||||
from .prompt_instructions import PromptInstructionsCallback
|
||||
|
||||
__all__ = [
|
||||
"AsyncCallbackHandler",
|
||||
@@ -18,4 +19,5 @@ __all__ = [
|
||||
"BudgetManagerCallback",
|
||||
"TelemetryCallback",
|
||||
"OperatorNormalizerCallback",
|
||||
"PromptInstructionsCallback",
|
||||
]
|
||||
|
||||
@@ -20,6 +20,7 @@ from hud import trace
|
||||
|
||||
from agent.agent import ComputerAgent as BaseComputerAgent
|
||||
from .proxy import FakeAsyncOpenAI
|
||||
from agent.callbacks import PromptInstructionsCallback
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -47,6 +48,7 @@ class ProxyOperatorAgent(OperatorAgent):
|
||||
custom_loop: Any | None = None,
|
||||
only_n_most_recent_images: int | None = None,
|
||||
callbacks: list[Any] | None = None,
|
||||
instructions: str | None = None,
|
||||
verbosity: int | None = None,
|
||||
max_retries: int | None = 3,
|
||||
screenshot_delay: float | int = 0.5,
|
||||
@@ -68,12 +70,17 @@ class ProxyOperatorAgent(OperatorAgent):
|
||||
if tools:
|
||||
agent_tools.extend(tools)
|
||||
|
||||
# Build callbacks, injecting prompt instructions if provided
|
||||
agent_callbacks = list(callbacks or [])
|
||||
if instructions:
|
||||
agent_callbacks.append(PromptInstructionsCallback(instructions))
|
||||
|
||||
computer_agent = BaseComputerAgent(
|
||||
model=model,
|
||||
tools=agent_tools,
|
||||
custom_loop=custom_loop,
|
||||
only_n_most_recent_images=only_n_most_recent_images,
|
||||
callbacks=callbacks,
|
||||
callbacks=agent_callbacks,
|
||||
verbosity=verbosity,
|
||||
trajectory_dir=trajectory_dir,
|
||||
max_retries=max_retries,
|
||||
@@ -96,7 +103,6 @@ class ProxyOperatorAgent(OperatorAgent):
|
||||
# Single-task runner
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
async def run_single_task(
|
||||
dataset: str | Dataset | list[dict[str, Any]],
|
||||
*,
|
||||
@@ -108,6 +114,7 @@ async def run_single_task(
|
||||
custom_loop: Any | None = None,
|
||||
only_n_most_recent_images: int | None = None,
|
||||
callbacks: list[Any] | None = None,
|
||||
instructions: str | None = None,
|
||||
verbosity: int | None = None,
|
||||
trajectory_dir: str | dict | None = None,
|
||||
max_retries: int | None = 3,
|
||||
@@ -140,6 +147,7 @@ async def run_single_task(
|
||||
custom_loop=custom_loop,
|
||||
only_n_most_recent_images=only_n_most_recent_images,
|
||||
callbacks=callbacks,
|
||||
instructions=instructions,
|
||||
verbosity=verbosity,
|
||||
trajectory_dir=trajectory_dir,
|
||||
max_retries=max_retries,
|
||||
@@ -157,7 +165,6 @@ async def run_single_task(
|
||||
# Full-dataset runner
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
async def run_full_dataset(
|
||||
dataset: str | Dataset | list[dict[str, Any]],
|
||||
*,
|
||||
@@ -173,6 +180,7 @@ async def run_full_dataset(
|
||||
custom_loop: Any | None = None,
|
||||
only_n_most_recent_images: int | None = 5,
|
||||
callbacks: list[Any] | None = None,
|
||||
instructions: str | None = None,
|
||||
verbosity: int | None = None,
|
||||
max_retries: int | None = 3,
|
||||
screenshot_delay: float | int = 0.5,
|
||||
@@ -207,6 +215,7 @@ async def run_full_dataset(
|
||||
"custom_loop": custom_loop,
|
||||
"only_n_most_recent_images": only_n_most_recent_images,
|
||||
"callbacks": callbacks,
|
||||
"instructions": instructions,
|
||||
"verbosity": verbosity,
|
||||
"max_retries": max_retries,
|
||||
"screenshot_delay": screenshot_delay,
|
||||
|
||||
Reference in New Issue
Block a user