added simple guide for customizing computeragent

2026-02-19 12:59:34 -06:00 · 2025-09-09 10:55:57 -04:00
parent c38cab2776
commit 17d6709629
5 changed files with 145 additions and 4 deletions
--- a/docs/content/docs/agent-sdk/customizing-computeragent.mdx
+++ b/docs/content/docs/agent-sdk/customizing-computeragent.mdx
@@ -0,0 +1,121 @@
+---
+title: Customizing Your ComputerAgent
+---
+
+The `ComputerAgent` interface provides an easy proxy to any computer-using model configuration, and it is a powerful framework for extending and building your own agentic systems.
+
+This guide shows four proven ways to increase capabilities and success rate:
+
+- 1 — Simple: Prompt engineering
+- 2 — Easy: Tools
+- 3 — Intermediate: Callbacks
+- 4 — Expert: Custom `@register_agent`
+
+For a hands-on walkthrough, see the companion notebook: [notebooks/customizing_computeragent.ipynb](../../../notebooks/customizing_computeragent.ipynb).
+
+## 1) Simple: Prompt engineering
+
+Provide guiding instructions to shape behavior. `ComputerAgent` accepts an optional `instructions: str | None`, which acts like a system-style preface. Internally, this uses a callback that prepends a user message before each LLM call.
+
+```python
+from agent.agent import ComputerAgent
+
+agent = ComputerAgent(
+    model="openai/computer-use-preview",
+    tools=[computer],
+    instructions=(
+        "You are a meticulous software operator. Prefer safe, deterministic actions. "
+        "Always confirm via on-screen text before proceeding."
+    ),
+)
+```
+
+## 2) Easy: Tools
+
+Expose deterministic capabilities as tools (Python functions or custom computer handlers). The agent will call them when appropriate.
+
+```python
+def calculate_percentage(numerator: float, denominator: float) -> str:
+    """Calculate percentage as a string.
+
+    Args:
+        numerator: Numerator value
+        denominator: Denominator value
+    Returns:
+        A formatted percentage string (e.g., '75.00%').
+    """
+    if denominator == 0:
+        return "0.00%"
+    return f"{(numerator/denominator)*100:.2f}%"
+
+agent = ComputerAgent(
+    model="openai/computer-use-preview",
+    tools=[computer, calculate_percentage],
+)
+```
+
+- See `docs/agent-sdk/custom-tools` for authoring function tools.
+- See `docs/agent-sdk/custom-computer-handlers` for building full computer interfaces.
+
+## 3) Intermediate: Callbacks
+
+Callbacks provide lifecycle hooks to preprocess messages, postprocess outputs, record trajectories, manage costs, and more.
+
+```python
+from agent.callbacks import ImageRetentionCallback, TrajectorySaverCallback, BudgetManagerCallback
+
+agent = ComputerAgent(
+    model="anthropic/claude-3-5-sonnet-20241022",
+    tools=[computer],
+    callbacks=[
+        ImageRetentionCallback(only_n_most_recent_images=3),
+        TrajectorySaverCallback("./trajectories"),
+        BudgetManagerCallback(max_budget=10.0, raise_error=True),
+    ],
+)
+```
+
+- Browse implementations in `libs/python/agent/agent/callbacks/`.
+
+## 4) Expert: Custom `@register_agent`
+
+Build your own agent configuration class to control prompting, message shaping, and tool handling. This is the most flexible option for specialized domains.
+
+- Register your own `model=...` loop using `@register_agent`.
+- Browse implementations in `libs/python/agent/agent/loops/`.
+- Implement `predict_step()` (and optionally `predict_click()`) and return the standardized output schema.
+
+```python
+from agent.decorators import register_agent
+
+@register_agent(models=r".*my-special-model.*", priority=10)
+class MyCustomAgentConfig:
+    async def predict_step(self, messages, model, tools, **kwargs):
+        # 1) Format messages for your provider
+        # 2) Call provider
+        # 3) Convert responses to the agent output schema
+        return {"output": [], "usage": {}}
+
+    async def predict_click(self, model, image_b64, instruction):
+        # Optional: click-only capability
+        return None
+
+    def get_capabilities(self):
+        return ["step"]
+```
+
+## HUD integration (optional)
+
+When using the HUD evaluation integration (`agent/integrations/hud/`), you can pass `instructions`, `tools`, and `callbacks` directly:
+
+```python
+from agent.integrations.hud import run_single_task
+
+await run_single_task(
+    dataset="username/dataset-name",
+    model="openai/computer-use-preview",
+    instructions="Operate carefully. Always verify on-screen text before actions.",
+    # tools=[your_custom_function],
+    # callbacks=[YourCustomCallback()],
+)
+```
--- a/docs/content/docs/agent-sdk/meta.json
+++ b/docs/content/docs/agent-sdk/meta.json
@@ -6,6 +6,7 @@
        "supported-agents",
 		"supported-model-providers",
 		"chat-history",
+		"customizing-computeragent",
 		"callbacks",
        "custom-tools",
 		"custom-computer-handlers",
--- a/libs/python/agent/agent/agent.py
+++ b/libs/python/agent/agent/agent.py
@@ -31,7 +31,8 @@ from .callbacks import (
    TrajectorySaverCallback, 
    BudgetManagerCallback,
    TelemetryCallback,
-    OperatorNormalizerCallback
+    OperatorNormalizerCallback,
+    PromptInstructionsCallback,
 )
 from .computers import (
    AsyncComputerHandler,
@@ -162,6 +163,7 @@ class ComputerAgent:
        custom_loop: Optional[Callable] = None,
        only_n_most_recent_images: Optional[int] = None,
        callbacks: Optional[List[Any]] = None,
+        instructions: Optional[str] = None,
        verbosity: Optional[int] = None,
        trajectory_dir: Optional[str | Path | dict] = None,
        max_retries: Optional[int] = 3,
@@ -180,6 +182,7 @@ class ComputerAgent:
            custom_loop: Custom agent loop function to use instead of auto-selection
            only_n_most_recent_images: If set, only keep the N most recent images in message history. Adds ImageRetentionCallback automatically.
            callbacks: List of AsyncCallbackHandler instances for preprocessing/postprocessing
+            instructions: Optional system instructions to be passed to the model
            verbosity: Logging level (logging.DEBUG, logging.INFO, etc.). If set, adds LoggingCallback automatically
            trajectory_dir: If set, saves trajectory data (screenshots, responses) to this directory. Adds TrajectorySaverCallback automatically.
            max_retries: Maximum number of retries for failed API calls
@@ -198,6 +201,7 @@ class ComputerAgent:
        self.custom_loop = custom_loop
        self.only_n_most_recent_images = only_n_most_recent_images
        self.callbacks = callbacks or []
+        self.instructions = instructions
        self.verbosity = verbosity
        self.trajectory_dir = trajectory_dir
        self.max_retries = max_retries
@@ -211,6 +215,10 @@ class ComputerAgent:
        # Prepend operator normalizer callback
        self.callbacks.insert(0, OperatorNormalizerCallback())

+        # Add prompt instructions callback if provided
+        if self.instructions:
+            self.callbacks.append(PromptInstructionsCallback(self.instructions))
+
        # Add telemetry callback if telemetry_enabled is set
        if self.telemetry_enabled:
            if isinstance(self.telemetry_enabled, bool):
--- a/libs/python/agent/agent/callbacks/init.py
+++ b/libs/python/agent/agent/callbacks/init.py
@@ -9,6 +9,7 @@ from .trajectory_saver import TrajectorySaverCallback
 from .budget_manager import BudgetManagerCallback
 from .telemetry import TelemetryCallback
 from .operator_validator import OperatorNormalizerCallback
+from .prompt_instructions import PromptInstructionsCallback

 __all__ = [
    "AsyncCallbackHandler",
@@ -18,4 +19,5 @@ __all__ = [
    "BudgetManagerCallback",
    "TelemetryCallback",
    "OperatorNormalizerCallback",
+    "PromptInstructionsCallback",
 ]
--- a/libs/python/agent/agent/integrations/hud/init.py
+++ b/libs/python/agent/agent/integrations/hud/init.py
@@ -20,6 +20,7 @@ from hud import trace

 from agent.agent import ComputerAgent as BaseComputerAgent
 from .proxy import FakeAsyncOpenAI
+from agent.callbacks import PromptInstructionsCallback


 # ---------------------------------------------------------------------------
@@ -47,6 +48,7 @@ class ProxyOperatorAgent(OperatorAgent):
        custom_loop: Any | None = None,
        only_n_most_recent_images: int | None = None,
        callbacks: list[Any] | None = None,
+        instructions: str | None = None,
        verbosity: int | None = None,
        max_retries: int | None = 3,
        screenshot_delay: float | int = 0.5,
@@ -68,12 +70,17 @@ class ProxyOperatorAgent(OperatorAgent):
        if tools:
            agent_tools.extend(tools)

+        # Build callbacks, injecting prompt instructions if provided
+        agent_callbacks = list(callbacks or [])
+        if instructions:
+            agent_callbacks.append(PromptInstructionsCallback(instructions))
+
        computer_agent = BaseComputerAgent(
            model=model,
            tools=agent_tools,
            custom_loop=custom_loop,
            only_n_most_recent_images=only_n_most_recent_images,
-            callbacks=callbacks,
+            callbacks=agent_callbacks,
            verbosity=verbosity,
            trajectory_dir=trajectory_dir,
            max_retries=max_retries,
@@ -96,7 +103,6 @@ class ProxyOperatorAgent(OperatorAgent):
 # Single-task runner
 # ---------------------------------------------------------------------------

-
 async def run_single_task(
    dataset: str | Dataset | list[dict[str, Any]],
    *,
@@ -108,6 +114,7 @@ async def run_single_task(
    custom_loop: Any | None = None,
    only_n_most_recent_images: int | None = None,
    callbacks: list[Any] | None = None,
+    instructions: str | None = None,
    verbosity: int | None = None,
    trajectory_dir: str | dict | None = None,
    max_retries: int | None = 3,
@@ -140,6 +147,7 @@ async def run_single_task(
            custom_loop=custom_loop,
            only_n_most_recent_images=only_n_most_recent_images,
            callbacks=callbacks,
+            instructions=instructions,
            verbosity=verbosity,
            trajectory_dir=trajectory_dir,
            max_retries=max_retries,
@@ -157,7 +165,6 @@ async def run_single_task(
 # Full-dataset runner
 # ---------------------------------------------------------------------------

-
 async def run_full_dataset(
    dataset: str | Dataset | list[dict[str, Any]],
    *,
@@ -173,6 +180,7 @@ async def run_full_dataset(
    custom_loop: Any | None = None,
    only_n_most_recent_images: int | None = 5,
    callbacks: list[Any] | None = None,
+    instructions: str | None = None,
    verbosity: int | None = None,
    max_retries: int | None = 3,
    screenshot_delay: float | int = 0.5,
@@ -207,6 +215,7 @@ async def run_full_dataset(
            "custom_loop": custom_loop,
            "only_n_most_recent_images": only_n_most_recent_images,
            "callbacks": callbacks,
+            "instructions": instructions,
            "verbosity": verbosity,
            "max_retries": max_retries,
            "screenshot_delay": screenshot_delay,