From b9f307a149370fd3cdabffd8b2f24c3c69248756 Mon Sep 17 00:00:00 2001
From: Dillon DuPont <ddupont@mit.edu>
Date: Fri, 8 Aug 2025 12:17:35 -0400
Subject: [PATCH] Added HUD integration

---
 .../docs/agent-sdk/benchmarks/meta.json       |  3 +-
 .../agent-sdk/benchmarks/osworld-verified.mdx | 89 +++++++++++++++++++
 .../docs/agent-sdk/integrations/hud.mdx       | 43 +++++++++
 .../docs/agent-sdk/integrations/meta.json     |  4 +
 docs/content/docs/agent-sdk/meta.json         |  3 +-
 5 files changed, 140 insertions(+), 2 deletions(-)
 create mode 100644 docs/content/docs/agent-sdk/benchmarks/osworld-verified.mdx
 create mode 100644 docs/content/docs/agent-sdk/integrations/hud.mdx
 create mode 100644 docs/content/docs/agent-sdk/integrations/meta.json

diff --git a/docs/content/docs/agent-sdk/benchmarks/meta.json b/docs/content/docs/agent-sdk/benchmarks/meta.json
index aa49a156..3573a892 100644
--- a/docs/content/docs/agent-sdk/benchmarks/meta.json
+++ b/docs/content/docs/agent-sdk/benchmarks/meta.json
@@ -3,6 +3,7 @@
         "introduction",
         "screenspot-v2",
         "screenspot-pro",
-        "interactive"
+        "interactive",
+        "osworld-verified"
     ]
 }
\ No newline at end of file
diff --git a/docs/content/docs/agent-sdk/benchmarks/osworld-verified.mdx b/docs/content/docs/agent-sdk/benchmarks/osworld-verified.mdx
new file mode 100644
index 00000000..16e1ee2c
--- /dev/null
+++ b/docs/content/docs/agent-sdk/benchmarks/osworld-verified.mdx
@@ -0,0 +1,89 @@
+---
+title: OSWorld-Verified
+description: Benchmark ComputerAgent on OSWorld tasks using HUD
+---
+
+OSWorld-Verified is a curated subset of OSWorld tasks that can be run using the HUD framework. Use ComputerAgent with HUD to benchmark on these tasks.
+
+## Setup
+
+```bash
+pip install hud-python==0.2.10
+```
+
+Set environment variables:
+```bash
+export HUD_API_KEY="your_hud_key"
+export ANTHROPIC_API_KEY="your_anthropic_key"  # For Claude
+export OPENAI_API_KEY="your_openai_key"        # For OpenAI
+```
+
+## Quick Start
+
+```python
+import asyncio
+from hud import gym, load_taskset
+from agent.integrations.hud import ComputerAgent
+
+async def run_osworld():
+    # Load taskset
+    taskset = await load_taskset("OSWorld-Verified")
+    test = taskset[144]  # Example task
+    
+    # Create environment (~2.5 min startup)
+    env = await gym.make(test)
+    
+    # Create agent
+    agent = ComputerAgent(
+        model="anthropic/claude-3-5-sonnet-20241022", # any ComputerAgent model string
+        environment="linux",
+        max_iterations=8
+    )
+    
+    # Run benchmark
+    obs, _ = await env.reset()
+    for i in range(agent.max_iterations):
+        action, done = await agent.predict(obs)
+        obs, reward, terminated, info = await env.step(action)
+        if done or terminated:
+            break
+    
+    # Evaluate results
+    result = await env.evaluate()
+    await env.close()
+    
+    return result
+
+# Run benchmark
+result = asyncio.run(run_osworld())
+print(f"Success: {result.get('success', False)}")
+```
+
+## Parallel Execution
+
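+Run the full taskset concurrently (up to `max_concurrent_tasks` at a time) using `run_job`. The snippet uses top-level `await`, so run it in a notebook or wrap it in an `async` function: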
+
+```python
+from hud import load_taskset, run_job
+from agent.integrations.hud import ComputerAgent
+import logging
+
+logging.basicConfig(level=logging.INFO)
+
+# Load full taskset
+taskset = await load_taskset("OSWorld-Verified")
+
+# Run parallel job
+job = await run_job(
+    ComputerAgent,
+    taskset,
+    "osworld-computeragent",
+    max_steps_per_task=8,
+    max_concurrent_tasks=20,
+    auto_reply_question=True,
+    agent_kwargs={"model": "anthropic/claude-3-5-sonnet-20241022"}
+)
+
+# Get analytics
+analytics = await job.get_analytics()
+```
diff --git a/docs/content/docs/agent-sdk/integrations/hud.mdx b/docs/content/docs/agent-sdk/integrations/hud.mdx
new file mode 100644
index 00000000..786e45b5
--- /dev/null
+++ b/docs/content/docs/agent-sdk/integrations/hud.mdx
@@ -0,0 +1,43 @@
+---
+title: HUD Evals
+description: Use ComputerAgent with HUD for benchmarking and evaluation
+---
+
+The HUD integration allows you to use ComputerAgent with the [HUD benchmarking framework](https://www.hud.so/), providing the same interface as existing HUD agents while leveraging ComputerAgent's capabilities.
+
+## Installation
+
+```bash
+pip install "cua-agent[hud]"
+# or install hud-python directly
+# pip install hud-python==0.2.10
+```
+
+## Usage
+
+```python
+from agent.integrations.hud import ComputerAgent
+
+# Create agent with any ComputerAgent model
+agent = ComputerAgent(
+    model="anthropic/claude-3-5-sonnet-20241022",  # or any model string
+    environment="linux"
+)
+
+# Use exactly like other HUD agents
+action, done = await agent.predict(observation)
+```
+
+## Environment Variables
+
+Set these environment variables:
+
+- `HUD_API_KEY` - Your HUD API key
+- `ANTHROPIC_API_KEY` - For Claude models
+- `OPENAI_API_KEY` - For OpenAI models
+
+## Example Benchmarks
+
+1. [OSWorld-Verified](/agent-sdk/benchmarks/osworld-verified) - Benchmark on OSWorld tasks with parallel execution
+
+See the [HUD docs](https://docs.hud.so/environment-creation) for more eval environments.
\ No newline at end of file
diff --git a/docs/content/docs/agent-sdk/integrations/meta.json b/docs/content/docs/agent-sdk/integrations/meta.json
new file mode 100644
index 00000000..7b7ebb81
--- /dev/null
+++ b/docs/content/docs/agent-sdk/integrations/meta.json
@@ -0,0 +1,4 @@
+{
+  "title": "Integrations",
+  "pages": ["hud"]
+}
diff --git a/docs/content/docs/agent-sdk/meta.json b/docs/content/docs/agent-sdk/meta.json
index 4907fe13..5db33148 100644
--- a/docs/content/docs/agent-sdk/meta.json
+++ b/docs/content/docs/agent-sdk/meta.json
@@ -12,6 +12,7 @@
         "prompt-caching",
 		"usage-tracking",
 		"benchmarks",
-        "migration-guide"
+        "migration-guide",
+		"integrations"
 	]
 }