From b9f307a149370fd3cdabffd8b2f24c3c69248756 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Fri, 8 Aug 2025 12:17:35 -0400 Subject: [PATCH] Added HUD integration --- .../docs/agent-sdk/benchmarks/meta.json | 3 +- .../agent-sdk/benchmarks/osworld-verified.mdx | 89 +++++++++++++++++++ .../docs/agent-sdk/integrations/hud.mdx | 43 +++++++++ .../docs/agent-sdk/integrations/meta.json | 4 + docs/content/docs/agent-sdk/meta.json | 3 +- 5 files changed, 140 insertions(+), 2 deletions(-) create mode 100644 docs/content/docs/agent-sdk/benchmarks/osworld-verified.mdx create mode 100644 docs/content/docs/agent-sdk/integrations/hud.mdx create mode 100644 docs/content/docs/agent-sdk/integrations/meta.json diff --git a/docs/content/docs/agent-sdk/benchmarks/meta.json b/docs/content/docs/agent-sdk/benchmarks/meta.json index aa49a156..3573a892 100644 --- a/docs/content/docs/agent-sdk/benchmarks/meta.json +++ b/docs/content/docs/agent-sdk/benchmarks/meta.json @@ -3,6 +3,7 @@ "introduction", "screenspot-v2", "screenspot-pro", - "interactive" + "interactive", + "osworld-verified" ] } \ No newline at end of file diff --git a/docs/content/docs/agent-sdk/benchmarks/osworld-verified.mdx b/docs/content/docs/agent-sdk/benchmarks/osworld-verified.mdx new file mode 100644 index 00000000..16e1ee2c --- /dev/null +++ b/docs/content/docs/agent-sdk/benchmarks/osworld-verified.mdx @@ -0,0 +1,89 @@ +--- +title: OSWorld-Verified +description: Benchmark ComputerAgent on OSWorld tasks using HUD +--- + +OSWorld-Verified is a curated subset of OSWorld tasks that can be run using the HUD framework. Use ComputerAgent with HUD to benchmark on these tasks. 
+ +## Setup + +```bash +pip install hud-python==0.2.10 +``` + +Set environment variables: +```bash +export HUD_API_KEY="your_hud_key" +export ANTHROPIC_API_KEY="your_anthropic_key" # For Claude +export OPENAI_API_KEY="your_openai_key" # For OpenAI +``` + +## Quick Start + +```python +import asyncio +from hud import gym, load_taskset +from agent.integrations.hud import ComputerAgent + +async def run_osworld(): + # Load taskset + taskset = await load_taskset("OSWorld-Verified") + test = taskset[144] # Example task + + # Create environment (~2.5 min startup) + env = await gym.make(test) + + # Create agent + agent = ComputerAgent( + model="anthropic/claude-3-5-sonnet-20241022", # any ComputerAgent model string + environment="linux", + max_iterations=8 + ) + + # Run benchmark + obs, _ = await env.reset() + for i in range(agent.max_iterations): + action, done = await agent.predict(obs) + obs, reward, terminated, info = await env.step(action) + if done or terminated: + break + + # Evaluate results + result = await env.evaluate() + await env.close() + + return result + +# Run benchmark +result = asyncio.run(run_osworld()) +print(f"Success: {result.get('success', False)}") +``` + +## Parallel Execution + +Run all tasks in parallel using `run_job`: + +```python +from hud import load_taskset, run_job +from agent.integrations.hud import ComputerAgent +import logging + +logging.basicConfig(level=logging.INFO) + +# Load full taskset (the `await` calls below must run inside an async function or a notebook) +taskset = await load_taskset("OSWorld-Verified") + +# Run parallel job +job = await run_job( + ComputerAgent, + taskset, + "osworld-computeragent", + max_steps_per_task=8, + max_concurrent_tasks=20, + auto_reply_question=True, + agent_kwargs={"model": "anthropic/claude-3-5-sonnet-20241022"} +) + +# Get analytics +analytics = await job.get_analytics() +``` diff --git a/docs/content/docs/agent-sdk/integrations/hud.mdx b/docs/content/docs/agent-sdk/integrations/hud.mdx new file mode 100644 index 00000000..786e45b5 --- /dev/null +++ 
b/docs/content/docs/agent-sdk/integrations/hud.mdx @@ -0,0 +1,43 @@ +--- +title: HUD Evals +description: Use ComputerAgent with HUD for benchmarking and evaluation +--- + +The HUD integration allows you to use ComputerAgent with the [HUD benchmarking framework](https://www.hud.so/), providing the same interface as existing HUD agents while leveraging ComputerAgent's capabilities. + +## Installation + +```bash +pip install "cua-agent[hud]" +## or install hud-python directly +# pip install hud-python==0.2.10 +``` + +## Usage + +```python +from agent.integrations.hud import ComputerAgent + +# Create agent with any ComputerAgent model +agent = ComputerAgent( + model="anthropic/claude-3-5-sonnet-20241022", # or any model string + environment="linux" +) + +# Use exactly like other HUD agents +action, done = await agent.predict(observation) +``` + +## Environment Variables + +Set these environment variables: + +- `HUD_API_KEY` - Your HUD API key +- `ANTHROPIC_API_KEY` - For Claude models +- `OPENAI_API_KEY` - For OpenAI models + +## Example Benchmarks + +1. [OSWorld-Verified](/agent-sdk/benchmarks/osworld-verified) - Benchmark on OSWorld tasks with parallel execution + +See the [HUD docs](https://docs.hud.so/environment-creation) for more eval environments. \ No newline at end of file diff --git a/docs/content/docs/agent-sdk/integrations/meta.json b/docs/content/docs/agent-sdk/integrations/meta.json new file mode 100644 index 00000000..7b7ebb81 --- /dev/null +++ b/docs/content/docs/agent-sdk/integrations/meta.json @@ -0,0 +1,4 @@ +{ + "title": "Integrations", + "pages": ["hud"] +} diff --git a/docs/content/docs/agent-sdk/meta.json b/docs/content/docs/agent-sdk/meta.json index 4907fe13..5db33148 100644 --- a/docs/content/docs/agent-sdk/meta.json +++ b/docs/content/docs/agent-sdk/meta.json @@ -12,6 +12,7 @@ "prompt-caching", "usage-tracking", "benchmarks", - "migration-guide" + "migration-guide", + "integrations" ] }