updated docs

This commit is contained in:
Dillon DuPont
2025-08-27 17:44:04 -04:00
parent 38026a43f0
commit 8dee77bf68
2 changed files with 23 additions and 25 deletions

View File

@@ -10,37 +10,35 @@ The HUD integration allows you to use ComputerAgent with the [HUD benchmarking f
```bash
pip install "cua-agent[hud]"
# Or install hud-python directly:
# pip install hud-python==0.2.10
# pip install hud-python==0.4.12
```
## Usage
```python
from agent.integrations.hud import run_job
from hud import load_taskset
from hud.taskset import TaskSet
import logging
# Quick single-task smoke test
from agent.integrations.hud import run_single_task
# Load taskset
taskset = await load_taskset("OSWorld-Verified")
taskset = TaskSet(tasks=taskset[:10]) # limit to 10 tasks instead of all 370
# Run benchmark job
job = await run_job(
model="openai/computer-use-preview",
# model="anthropic/claude-3-5-sonnet-20241022",
# model="huggingface-local/HelloKKMe/GTA1-7B+openai/gpt-5",
task_or_taskset=taskset,
job_name="test-computeragent-job",
max_concurrent_tasks=5,
# add any extra ComputerAgent kwargs:
verbosity=logging.INFO, # Enable logging
# trajectory_dir=".." # Save trajectories locally
await run_single_task(
dataset="hud-evals/OSWorld-Verified-XLang", # or another HUD dataset
model="openai/computer-use-preview+openai/gpt-5-nano", # any supported model string
task_id=155, # e.g., reopen last closed tab
)
# Get the results programmatically, or view them at https://app.hud.so
print(await job.get_analytics())
print(f"View results at: https://app.hud.so/jobs/{job.id}")
# Run a small split of OSWorld-Verified in parallel
from agent.integrations.hud import run_full_dataset
results = await run_full_dataset(
dataset="hud-evals/OSWorld-Verified-XLang", # can also pass a Dataset or list[dict]
model="openai/computer-use-preview",
split="train[:3]", # try a few tasks to start
max_concurrent=20, # tune to your infra
max_steps=50 # safety cap per task
)
# Environment variables required:
# - HUD_API_KEY (HUD access)
# - OPENAI_API_KEY or ANTHROPIC_API_KEY depending on your chosen model(s)
```
**Available Benchmarks:**