mirror of
https://github.com/trycua/computer.git
synced 2026-01-05 21:09:58 -06:00
updated docs
This commit is contained in:
@@ -10,37 +10,35 @@ The HUD integration allows you to use ComputerAgent with the [HUD benchmarking f
|
||||
```bash
|
||||
pip install "cua-agent[hud]"
|
||||
# Or install hud-python directly:
|
||||
# pip install hud-python==0.2.10
|
||||
# pip install hud-python==0.4.12
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
```python
|
||||
from agent.integrations.hud import run_job
|
||||
from hud import load_taskset
|
||||
from hud.taskset import TaskSet
|
||||
import logging
|
||||
# Quick single-task smoke test
|
||||
from agent.integrations.hud import run_single_task
|
||||
|
||||
# Load taskset
|
||||
taskset = await load_taskset("OSWorld-Verified")
|
||||
taskset = TaskSet(tasks=taskset[:10]) # limit to 10 tasks instead of all 370
|
||||
|
||||
# Run benchmark job
|
||||
job = await run_job(
|
||||
model="openai/computer-use-preview",
|
||||
# model="anthropic/claude-3-5-sonnet-20241022",
|
||||
# model="huggingface-local/HelloKKMe/GTA1-7B+openai/gpt-5",
|
||||
task_or_taskset=taskset,
|
||||
job_name="test-computeragent-job",
|
||||
max_concurrent_tasks=5,
|
||||
# add any extra ComputerAgent kwargs:
|
||||
verbosity=logging.INFO, # Enable logging
|
||||
# trajectory_dir=".." # Save trajectories locally
|
||||
await run_single_task(
|
||||
dataset="hud-evals/OSWorld-Verified-XLang", # or another HUD dataset
|
||||
model="openai/computer-use-preview+openai/gpt-5-nano", # any supported model string
|
||||
task_id=155, # e.g., reopen last closed tab
|
||||
)
|
||||
|
||||
# Print the job analytics, or view the results at app.hud.so
|
||||
print(await job.get_analytics())
|
||||
print(f"View results at: https://app.hud.so/jobs/{job.id}")
|
||||
# Run a small split of OSWorld-Verified in parallel
|
||||
from agent.integrations.hud import run_full_dataset
|
||||
|
||||
results = await run_full_dataset(
|
||||
dataset="hud-evals/OSWorld-Verified-XLang", # can also pass a Dataset or list[dict]
|
||||
model="openai/computer-use-preview",
|
||||
split="train[:3]", # try a few tasks to start
|
||||
max_concurrent=20, # tune to your infra
|
||||
max_steps=50 # safety cap per task
|
||||
)
|
||||
|
||||
# Environment variables required:
|
||||
# - HUD_API_KEY (HUD access)
|
||||
# - OPENAI_API_KEY or ANTHROPIC_API_KEY depending on your chosen model(s)
|
||||
```
|
||||
|
||||
**Available Benchmarks:**
|
||||
|
||||
Reference in New Issue
Block a user