From 8dee77bf6897d446238045e68dea5e5b245a625a Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Wed, 27 Aug 2025 17:44:04 -0400 Subject: [PATCH] updated docs --- .../docs/agent-sdk/integrations/hud.mdx | 44 +++++++++---------- libs/python/agent/pyproject.toml | 4 +- 2 files changed, 23 insertions(+), 25 deletions(-) diff --git a/docs/content/docs/agent-sdk/integrations/hud.mdx b/docs/content/docs/agent-sdk/integrations/hud.mdx index b517121e..cee5f77f 100644 --- a/docs/content/docs/agent-sdk/integrations/hud.mdx +++ b/docs/content/docs/agent-sdk/integrations/hud.mdx @@ -10,37 +10,35 @@ The HUD integration allows you to use ComputerAgent with the [HUD benchmarking f ```bash pip install "cua-agent[hud]" ## or install hud-python directly -# pip install hud-python==0.2.10 +# pip install hud-python==0.4.12 ``` ## Usage ```python -from agent.integrations.hud import run_job -from hud import load_taskset -from hud.taskset import TaskSet -import logging +# Quick single-task smoke test +from agent.integrations.hud import run_single_task -# Load taskset -taskset = await load_taskset("OSWorld-Verified") -taskset = TaskSet(tasks=taskset[:10]) # limit to 10 tasks instead of all 370 - -# Run benchmark job -job = await run_job( - model="openai/computer-use-preview", - # model="anthropic/claude-3-5-sonnet-20241022", - # model="huggingface-local/HelloKKMe/GTA1-7B+openai/gpt-5", - task_or_taskset=taskset, - job_name="test-computeragent-job", - max_concurrent_tasks=5, - # add any extra ComputerAgent kwargs: - verbosity=logging.INFO, # Enable logging - # trajectory_dir=".." # Save trajectories locally +await run_single_task( + dataset="hud-evals/OSWorld-Verified-XLang", # or another HUD dataset + model="openai/computer-use-preview+openai/gpt-5-nano", # any supported model string + task_id=155, # e.g., reopen last closed tab ) -# Get results OR view them at app.hud.so -print(await job.get_analytics()) -print(f"View results at: https://app.hud.so/jobs/{job.id}") +# Run a small split of OSWorld-Verified in parallel +from agent.integrations.hud import run_full_dataset + +results = await run_full_dataset( + dataset="hud-evals/OSWorld-Verified-XLang", # can also pass a Dataset or list[dict] + model="openai/computer-use-preview", + split="train[:3]", # try a few tasks to start + max_concurrent=20, # tune to your infra + max_steps=50 # safety cap per task +) + +# Environment variables required: +# - HUD_API_KEY (HUD access) +# - OPENAI_API_KEY or ANTHROPIC_API_KEY depending on your chosen model(s) ``` **Available Benchmarks:** diff --git a/libs/python/agent/pyproject.toml b/libs/python/agent/pyproject.toml index 4dd27062..4be2f6b6 100644 --- a/libs/python/agent/pyproject.toml +++ b/libs/python/agent/pyproject.toml @@ -55,7 +55,7 @@ cli = [ "yaspin>=3.1.0", ] hud = [ - "hud-python==0.2.10", + "hud-python>=0.4.12,<0.5.0", ] all = [ # omni requirements @@ -72,7 +72,7 @@ all = [ # cli requirements "yaspin>=3.1.0", # hud requirements - "hud-python==0.2.10", + "hud-python>=0.4.12,<0.5.0", ] [tool.uv]