Merge pull request #392 from trycua/fix/rename-osworld-dataset

Change HUD dataset name from `OSWorld-Verified-XLang` to `OSWorld-Verified`
James Murdza authored on 2025-09-03 11:26:19 -04:00, committed by GitHub
2 changed files with 8 additions and 8 deletions
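
For anyone consuming these docs, the only downstream change is the dataset identifier passed to the HUD helpers. A minimal sketch of the updated single-task call, using only values that already appear in this diff (the old identifier is kept in a comment for reference):

```python
import asyncio

from agent.integrations.hud import run_single_task

async def main() -> None:
    # The dataset slug drops the "-XLang" suffix; all other arguments are unchanged.
    await run_single_task(
        dataset="hud-evals/OSWorld-Verified",  # was "hud-evals/OSWorld-Verified-XLang"
        model="openai/computer-use-preview+openai/gpt-5-nano",
        task_id=155,  # e.g., reopen last closed tab
    )

asyncio.run(main())
```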

View File

@@ -38,7 +38,7 @@ You can run a single task from a HUD dataset for quick verification.
from agent.integrations.hud import run_single_task
await run_single_task(
dataset="hud-evals/OSWorld-Verified-XLang", # or another HUD dataset
dataset="hud-evals/OSWorld-Verified", # or another HUD dataset
model="openai/computer-use-preview+openai/gpt-5-nano", # any supported model string
task_id=155, # e.g., reopen last closed tab
)
@@ -59,7 +59,7 @@ To benchmark your agent at scale, you can run an entire dataset (or a subset) in
from agent.integrations.hud import run_full_dataset
results = await run_full_dataset(
dataset="hud-evals/OSWorld-Verified-XLang", # can also pass a Dataset or list[dict]
dataset="hud-evals/OSWorld-Verified", # can also pass a Dataset or list[dict]
model="openai/computer-use-preview",
split="train[:3]", # try a few tasks to start
max_concurrent=20, # tune to your infra
@@ -83,7 +83,7 @@ results = await run_full_dataset(
Both single-task and full-dataset runs share a common set of configuration options. These let you fine-tune how the evaluation runs.
- `dataset` (`str` | `Dataset` | `list[dict]`): **Required**
-HUD dataset name (e.g. `"hud-evals/OSWorld-Verified-XLang"`), a loaded `Dataset`, or a list of tasks.
+HUD dataset name (e.g. `"hud-evals/OSWorld-Verified"`), a loaded `Dataset`, or a list of tasks.
- `model` (`str`): Default: `"computer-use-preview"`
Model string, e.g. `"openai/computer-use-preview+openai/gpt-5-nano"`. Supports composition with `+` (planning + grounding).
- `allowed_tools` (`list[str]`): Default: `["openai_computer"]`
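
A minimal sketch tying the shared options above together in one `run_full_dataset` call; the values are illustrative, and `allowed_tools` simply restates the documented default:

```python
from agent.integrations.hud import run_full_dataset

# Illustrative values; option names mirror the shared configuration list above.
results = await run_full_dataset(
    dataset="hud-evals/OSWorld-Verified",                   # str | Dataset | list[dict]
    model="openai/computer-use-preview+openai/gpt-5-nano",  # "+" composes planning + grounding
    allowed_tools=["openai_computer"],                      # documented default
    split="train[:3]",                                      # start with a small slice
    max_concurrent=20,                                      # tune to your infra
)
```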

View File

@@ -68,10 +68,10 @@
"source": [
"from agent.integrations.hud import run_single_task\n",
"\n",
"# Quick single-task smoke test on OSWorld-Verified-XLang\n",
"# You can swap \"hud-evals/OSWorld-Verified-XLang\" -> \"hud-evals/SheetBench-V2\" to test SheetBench.\n",
"# Quick single-task smoke test on OSWorld-Verified\n",
"# You can swap \"hud-evals/OSWorld-Verified\" -> \"hud-evals/SheetBench-V2\" to test SheetBench.\n",
"await run_single_task(\n",
" dataset=\"hud-evals/OSWorld-Verified-XLang\",\n",
" dataset=\"hud-evals/OSWorld-Verified\",\n",
" model=\"openai/computer-use-preview+openai/gpt-5-nano\", # or any supported model string\n",
" task_id=155 # open last tab task (easy)\n",
")"
@@ -97,7 +97,7 @@
"job_name = f\"osworld-test-{str(uuid.uuid4())[:4]}\"\n",
"\n",
"results = await run_full_dataset(\n",
" dataset=\"hud-evals/OSWorld-Verified-XLang\", # You can also pass a Dataset or a list[dict]\n",
" dataset=\"hud-evals/OSWorld-Verified\", # You can also pass a Dataset or a list[dict]\n",
" job_name=job_name, # Optional; defaults to a timestamp for custom datasets\n",
" model=\"openai/computer-use-preview\", # Or any supported model string\n",
" max_concurrent=20, # Tune to your infra\n",
@@ -138,7 +138,7 @@
" job_name = f\"osworld {job_uuid} {model}\"\n",
"\n",
" results = await run_full_dataset(\n",
" dataset=\"hud-evals/OSWorld-Verified-XLang\",\n",
" dataset=\"hud-evals/OSWorld-Verified\",\n",
" job_name=job_name, \n",
" model=model,\n",
" max_concurrent=20, \n",