diff --git a/docs/content/docs/agent-sdk/integrations/hud.mdx b/docs/content/docs/agent-sdk/integrations/hud.mdx
index 3ad35878..35236746 100644
--- a/docs/content/docs/agent-sdk/integrations/hud.mdx
+++ b/docs/content/docs/agent-sdk/integrations/hud.mdx
@@ -38,7 +38,7 @@ You can run a single task from a HUD dataset for quick verification.
 from agent.integrations.hud import run_single_task
 
 await run_single_task(
-    dataset="hud-evals/OSWorld-Verified-XLang",  # or another HUD dataset
+    dataset="hud-evals/OSWorld-Verified",  # or another HUD dataset
     model="openai/computer-use-preview+openai/gpt-5-nano",  # any supported model string
     task_id=155,  # e.g., reopen last closed tab
 )
@@ -59,7 +59,7 @@ To benchmark your agent at scale, you can run an entire dataset (or a subset) in
 from agent.integrations.hud import run_full_dataset
 
 results = await run_full_dataset(
-    dataset="hud-evals/OSWorld-Verified-XLang",  # can also pass a Dataset or list[dict]
+    dataset="hud-evals/OSWorld-Verified",  # can also pass a Dataset or list[dict]
     model="openai/computer-use-preview",
     split="train[:3]",  # try a few tasks to start
     max_concurrent=20,  # tune to your infra
@@ -83,7 +83,7 @@ results = await run_full_dataset(
 Both single-task and full-dataset runs share a common set of configuration options. These let you fine-tune how the evaluation runs.
 
 - `dataset` (`str` | `Dataset` | `list[dict]`): **Required**
-  HUD dataset name (e.g. `"hud-evals/OSWorld-Verified-XLang"`), a loaded `Dataset`, or a list of tasks.
+  HUD dataset name (e.g. `"hud-evals/OSWorld-Verified"`), a loaded `Dataset`, or a list of tasks.
 - `model` (`str`): Default: `"computer-use-preview"`
   Model string, e.g. `"openai/computer-use-preview+openai/gpt-5-nano"`. Supports composition with `+` (planning + grounding).
 - `allowed_tools` (`list[str]`): Default: `["openai_computer"]`
diff --git a/notebooks/eval_osworld.ipynb b/notebooks/eval_osworld.ipynb
index 5b76904a..1bc58e48 100644
--- a/notebooks/eval_osworld.ipynb
+++ b/notebooks/eval_osworld.ipynb
@@ -68,10 +68,10 @@
    "source": [
     "from agent.integrations.hud import run_single_task\n",
     "\n",
-    "# Quick single-task smoke test on OSWorld-Verified-XLang\n",
-    "# You can swap \"hud-evals/OSWorld-Verified-XLang\" -> \"hud-evals/SheetBench-V2\" to test SheetBench.\n",
+    "# Quick single-task smoke test on OSWorld-Verified\n",
+    "# You can swap \"hud-evals/OSWorld-Verified\" -> \"hud-evals/SheetBench-V2\" to test SheetBench.\n",
     "await run_single_task(\n",
-    "    dataset=\"hud-evals/OSWorld-Verified-XLang\",\n",
+    "    dataset=\"hud-evals/OSWorld-Verified\",\n",
     "    model=\"openai/computer-use-preview+openai/gpt-5-nano\",  # or any supported model string\n",
     "    task_id=155  # open last tab task (easy)\n",
     ")"
@@ -97,7 +97,7 @@
    "job_name = f\"osworld-test-{str(uuid.uuid4())[:4]}\"\n",
    "\n",
    "results = await run_full_dataset(\n",
-   "    dataset=\"hud-evals/OSWorld-Verified-XLang\",  # You can also pass a Dataset or a list[dict]\n",
+   "    dataset=\"hud-evals/OSWorld-Verified\",  # You can also pass a Dataset or a list[dict]\n",
    "    job_name=job_name,  # Optional; defaults to a timestamp for custom datasets\n",
    "    model=\"openai/computer-use-preview\",  # Or any supported model string\n",
    "    max_concurrent=20,  # Tune to your infra\n",
@@ -138,7 +138,7 @@
    "    job_name = f\"osworld {job_uuid} {model}\"\n",
    "\n",
    "    results = await run_full_dataset(\n",
-   "        dataset=\"hud-evals/OSWorld-Verified-XLang\",\n",
+   "        dataset=\"hud-evals/OSWorld-Verified\",\n",
    "        job_name=job_name, \n",
    "        model=model,\n",
    "        max_concurrent=20, \n",