diff --git a/docs/content/docs/agent-sdk/customizing-computeragent.mdx b/docs/content/docs/agent-sdk/customizing-computeragent.mdx new file mode 100644 index 00000000..462bb5f9 --- /dev/null +++ b/docs/content/docs/agent-sdk/customizing-computeragent.mdx @@ -0,0 +1,121 @@ +--- +title: Customizing Your ComputerAgent +--- + +The `ComputerAgent` interface provides an easy proxy to any computer-using model configuration, and it is a powerful framework for extending and building your own agentic systems. + +This guide shows four proven ways to increase capabilities and success rate: + +- 1 — Simple: Prompt engineering +- 2 — Easy: Tools +- 3 — Intermediate: Callbacks +- 4 — Expert: Custom `@register_agent` + +For a hands-on walkthrough, see the companion notebook: [notebooks/customizing_computeragent.ipynb](../../../notebooks/customizing_computeragent.ipynb). + +## 1) Simple: Prompt engineering + +Provide guiding instructions to shape behavior. `ComputerAgent` accepts an optional `instructions: str | None`, which acts like a system-style preface. Internally, this uses a callback that prepends a user message before each LLM call. + +```python +from agent.agent import ComputerAgent + +agent = ComputerAgent( + model="openai/computer-use-preview", + tools=[computer], + instructions=( + "You are a meticulous software operator. Prefer safe, deterministic actions. " + "Always confirm via on-screen text before proceeding." + ), +) +``` + +## 2) Easy: Tools + +Expose deterministic capabilities as tools (Python functions or custom computer handlers). The agent will call them when appropriate. + +```python +def calculate_percentage(numerator: float, denominator: float) -> str: + """Calculate percentage as a string. + + Args: + numerator: Numerator value + denominator: Denominator value + Returns: + A formatted percentage string (e.g., '75.00%'). 
+ """ + if denominator == 0: + return "0.00%" + return f"{(numerator/denominator)*100:.2f}%" + +agent = ComputerAgent( + model="openai/computer-use-preview", + tools=[computer, calculate_percentage], +) +``` + +- See `docs/agent-sdk/custom-tools` for authoring function tools. +- See `docs/agent-sdk/custom-computer-handlers` for building full computer interfaces. + +## 3) Intermediate: Callbacks + +Callbacks provide lifecycle hooks to preprocess messages, postprocess outputs, record trajectories, manage costs, and more. + +```python +from agent.callbacks import ImageRetentionCallback, TrajectorySaverCallback, BudgetManagerCallback + +agent = ComputerAgent( + model="anthropic/claude-3-5-sonnet-20241022", + tools=[computer], + callbacks=[ + ImageRetentionCallback(only_n_most_recent_images=3), + TrajectorySaverCallback("./trajectories"), + BudgetManagerCallback(max_budget=10.0, raise_error=True), + ], +) +``` + +- Browse implementations in `libs/python/agent/agent/callbacks/`. + +## 4) Expert: Custom `@register_agent` + +Build your own agent configuration class to control prompting, message shaping, and tool handling. This is the most flexible option for specialized domains. + +- Register your own `model=...` loop using `@register_agent`. +- Browse implementations in `libs/python/agent/agent/loops/`. +- Implement `predict_step()` (and optionally `predict_click()`) and return the standardized output schema. 
+ +```python +from agent.decorators import register_agent + +@register_agent(models=r".*my-special-model.*", priority=10) +class MyCustomAgentConfig: + async def predict_step(self, messages, model, tools, **kwargs): + # 1) Format messages for your provider + # 2) Call provider + # 3) Convert responses to the agent output schema + return {"output": [], "usage": {}} + + async def predict_click(self, model, image_b64, instruction): + # Optional: click-only capability + return None + + def get_capabilities(self): + return ["step"] +``` + +## HUD integration (optional) + +When using the HUD evaluation integration (`agent/integrations/hud/`), you can pass `instructions`, `tools`, and `callbacks` directly: + +```python +from agent.integrations.hud import run_single_task + +await run_single_task( + dataset="username/dataset-name", + model="openai/computer-use-preview", + instructions="Operate carefully. Always verify on-screen text before actions.", + # tools=[your_custom_function], + # callbacks=[YourCustomCallback()], +) +``` \ No newline at end of file diff --git a/docs/content/docs/agent-sdk/meta.json b/docs/content/docs/agent-sdk/meta.json index 07bf7199..b745ce58 100644 --- a/docs/content/docs/agent-sdk/meta.json +++ b/docs/content/docs/agent-sdk/meta.json @@ -6,6 +6,7 @@ "supported-agents", "supported-model-providers", "chat-history", + "customizing-computeragent", "callbacks", "custom-tools", "custom-computer-handlers", diff --git a/libs/python/agent/agent/agent.py b/libs/python/agent/agent/agent.py index b796866d..feb0363b 100644 --- a/libs/python/agent/agent/agent.py +++ b/libs/python/agent/agent/agent.py @@ -31,7 +31,8 @@ from .callbacks import ( TrajectorySaverCallback, BudgetManagerCallback, TelemetryCallback, - OperatorNormalizerCallback + OperatorNormalizerCallback, + PromptInstructionsCallback, ) from .computers import ( AsyncComputerHandler, @@ -162,6 +163,7 @@ class ComputerAgent: custom_loop: Optional[Callable] = None, only_n_most_recent_images: 
Optional[int] = None, callbacks: Optional[List[Any]] = None, + instructions: Optional[str] = None, verbosity: Optional[int] = None, trajectory_dir: Optional[str | Path | dict] = None, max_retries: Optional[int] = 3, @@ -180,6 +182,7 @@ class ComputerAgent: custom_loop: Custom agent loop function to use instead of auto-selection only_n_most_recent_images: If set, only keep the N most recent images in message history. Adds ImageRetentionCallback automatically. callbacks: List of AsyncCallbackHandler instances for preprocessing/postprocessing + instructions: Optional system instructions to be passed to the model verbosity: Logging level (logging.DEBUG, logging.INFO, etc.). If set, adds LoggingCallback automatically trajectory_dir: If set, saves trajectory data (screenshots, responses) to this directory. Adds TrajectorySaverCallback automatically. max_retries: Maximum number of retries for failed API calls @@ -198,6 +201,7 @@ class ComputerAgent: self.custom_loop = custom_loop self.only_n_most_recent_images = only_n_most_recent_images self.callbacks = callbacks or [] + self.instructions = instructions self.verbosity = verbosity self.trajectory_dir = trajectory_dir self.max_retries = max_retries @@ -211,6 +215,10 @@ class ComputerAgent: # Prepend operator normalizer callback self.callbacks.insert(0, OperatorNormalizerCallback()) + # Add prompt instructions callback if provided + if self.instructions: + self.callbacks.append(PromptInstructionsCallback(self.instructions)) + # Add telemetry callback if telemetry_enabled is set if self.telemetry_enabled: if isinstance(self.telemetry_enabled, bool): diff --git a/libs/python/agent/agent/callbacks/__init__.py b/libs/python/agent/agent/callbacks/__init__.py index e0befcc7..eca40173 100644 --- a/libs/python/agent/agent/callbacks/__init__.py +++ b/libs/python/agent/agent/callbacks/__init__.py @@ -9,6 +9,7 @@ from .trajectory_saver import TrajectorySaverCallback from .budget_manager import BudgetManagerCallback from .telemetry 
import TelemetryCallback from .operator_validator import OperatorNormalizerCallback +from .prompt_instructions import PromptInstructionsCallback __all__ = [ "AsyncCallbackHandler", @@ -18,4 +19,5 @@ __all__ = [ "BudgetManagerCallback", "TelemetryCallback", "OperatorNormalizerCallback", + "PromptInstructionsCallback", ] diff --git a/libs/python/agent/agent/integrations/hud/__init__.py b/libs/python/agent/agent/integrations/hud/__init__.py index 0da87bfa..b0d06041 100644 --- a/libs/python/agent/agent/integrations/hud/__init__.py +++ b/libs/python/agent/agent/integrations/hud/__init__.py @@ -20,6 +20,7 @@ from hud import trace from agent.agent import ComputerAgent as BaseComputerAgent from .proxy import FakeAsyncOpenAI +from agent.callbacks import PromptInstructionsCallback # --------------------------------------------------------------------------- @@ -47,6 +48,7 @@ class ProxyOperatorAgent(OperatorAgent): custom_loop: Any | None = None, only_n_most_recent_images: int | None = None, callbacks: list[Any] | None = None, + instructions: str | None = None, verbosity: int | None = None, max_retries: int | None = 3, screenshot_delay: float | int = 0.5, @@ -68,12 +70,17 @@ class ProxyOperatorAgent(OperatorAgent): if tools: agent_tools.extend(tools) + # Build callbacks, injecting prompt instructions if provided + agent_callbacks = list(callbacks or []) + if instructions: + agent_callbacks.append(PromptInstructionsCallback(instructions)) + computer_agent = BaseComputerAgent( model=model, tools=agent_tools, custom_loop=custom_loop, only_n_most_recent_images=only_n_most_recent_images, - callbacks=callbacks, + callbacks=agent_callbacks, verbosity=verbosity, trajectory_dir=trajectory_dir, max_retries=max_retries, @@ -96,7 +103,6 @@ class ProxyOperatorAgent(OperatorAgent): # Single-task runner # --------------------------------------------------------------------------- - async def run_single_task( dataset: str | Dataset | list[dict[str, Any]], *, @@ -108,6 +114,7 @@ async def 
run_single_task( custom_loop: Any | None = None, only_n_most_recent_images: int | None = None, callbacks: list[Any] | None = None, + instructions: str | None = None, verbosity: int | None = None, trajectory_dir: str | dict | None = None, max_retries: int | None = 3, @@ -140,6 +147,7 @@ async def run_single_task( custom_loop=custom_loop, only_n_most_recent_images=only_n_most_recent_images, callbacks=callbacks, + instructions=instructions, verbosity=verbosity, trajectory_dir=trajectory_dir, max_retries=max_retries, @@ -157,7 +165,6 @@ async def run_single_task( # Full-dataset runner # --------------------------------------------------------------------------- - async def run_full_dataset( dataset: str | Dataset | list[dict[str, Any]], *, @@ -173,6 +180,7 @@ async def run_full_dataset( custom_loop: Any | None = None, only_n_most_recent_images: int | None = 5, callbacks: list[Any] | None = None, + instructions: str | None = None, verbosity: int | None = None, max_retries: int | None = 3, screenshot_delay: float | int = 0.5, @@ -207,6 +215,7 @@ async def run_full_dataset( "custom_loop": custom_loop, "only_n_most_recent_images": only_n_most_recent_images, "callbacks": callbacks, + "instructions": instructions, "verbosity": verbosity, "max_retries": max_retries, "screenshot_delay": screenshot_delay,