updated docs

2026-03-13 13:19:06 -05:00 · 2025-08-27 17:44:04 -04:00
parent 38026a43f0
commit 8dee77bf68
2 changed files with 23 additions and 25 deletions
--- a/docs/content/docs/agent-sdk/integrations/hud.mdx
+++ b/docs/content/docs/agent-sdk/integrations/hud.mdx
@@ -10,37 +10,35 @@ The HUD integration allows you to use ComputerAgent with the [HUD benchmarking f
 ```bash
 pip install "cua-agent[hud]"
 ## or install hud-python directly
-# pip install hud-python==0.2.10
+# pip install hud-python==0.4.12
 ```

 ## Usage

 ```python
-from agent.integrations.hud import run_job
-from hud import load_taskset
-from hud.taskset import TaskSet
-import logging
+# Quick single-task smoke test (uses top-level `await`: run in a notebook, `python -m asyncio`, or inside an async function)
+from agent.integrations.hud import run_single_task

-# Load taskset
-taskset = await load_taskset("OSWorld-Verified")
-taskset = TaskSet(tasks=taskset[:10]) # limit to 10 tasks instead of all 370
-
-# Run benchmark job
-job = await run_job(
-    model="openai/computer-use-preview",
-    # model="anthropic/claude-3-5-sonnet-20241022",
-    # model="huggingface-local/HelloKKMe/GTA1-7B+openai/gpt-5",
-    task_or_taskset=taskset,
-    job_name="test-computeragent-job",
-    max_concurrent_tasks=5,
-    # add any extra ComputerAgent kwargs:
-    verbosity=logging.INFO,  # Enable logging
-    # trajectory_dir=".."       # Save trajectories locally
+await run_single_task(
+    dataset="hud-evals/OSWorld-Verified-XLang",   # or another HUD dataset
+    model="openai/computer-use-preview+openai/gpt-5-nano",  # any supported model string
+    task_id=155,  # e.g., reopen last closed tab
 )

-# Get results OR view them at app.hud.so
-print(await job.get_analytics())
-print(f"View results at: https://app.hud.so/jobs/{job.id}")
+# Run a small split of OSWorld-Verified in parallel
+from agent.integrations.hud import run_full_dataset
+
+results = await run_full_dataset(
+    dataset="hud-evals/OSWorld-Verified-XLang",   # can also pass a Dataset or list[dict]
+    model="openai/computer-use-preview",
+    split="train[:3]",           # try a few tasks to start
+    max_concurrent=20,            # tune to your infrastructure
+    max_steps=50                  # safety cap per task
+)
+
+# Required environment variables:
+# - HUD_API_KEY (HUD access)
+# - OPENAI_API_KEY and/or ANTHROPIC_API_KEY, depending on the provider(s) in your model string
 ```

 **Available Benchmarks:**