Mirror of https://github.com/trycua/computer.git (synced 2026-01-08 06:20:00 -06:00)
Merge pull request #392 from trycua/fix/rename-osworld-dataset
Change HUD dataset name from `OSWorld-Verified-XLang` to `OSWorld-Verified`
@@ -38,7 +38,7 @@ You can run a single task from a HUD dataset for quick verification.
 from agent.integrations.hud import run_single_task
 
 await run_single_task(
-    dataset="hud-evals/OSWorld-Verified-XLang", # or another HUD dataset
+    dataset="hud-evals/OSWorld-Verified", # or another HUD dataset
     model="openai/computer-use-preview+openai/gpt-5-nano", # any supported model string
     task_id=155, # e.g., reopen last closed tab
 )
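For readers applying this rename, the post-change call runs as a self-contained script roughly like the sketch below (a minimal sketch: it assumes the `agent` package from this repo is installed and HUD credentials are configured; only the `run_single_task` arguments come from the diff itself).

import asyncio

from agent.integrations.hud import run_single_task


async def main() -> None:
    # Arguments mirror the "+" side of the hunk above; the asyncio
    # wrapper is ours, since the docs snippet uses top-level await.
    await run_single_task(
        dataset="hud-evals/OSWorld-Verified",  # renamed from OSWorld-Verified-XLang
        model="openai/computer-use-preview+openai/gpt-5-nano",
        task_id=155,  # e.g., reopen last closed tab
    )


if __name__ == "__main__":
    asyncio.run(main())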
@@ -59,7 +59,7 @@ To benchmark your agent at scale, you can run an entire dataset (or a subset) in
 from agent.integrations.hud import run_full_dataset
 
 results = await run_full_dataset(
-    dataset="hud-evals/OSWorld-Verified-XLang", # can also pass a Dataset or list[dict]
+    dataset="hud-evals/OSWorld-Verified", # can also pass a Dataset or list[dict]
     model="openai/computer-use-preview",
     split="train[:3]", # try a few tasks to start
     max_concurrent=20, # tune to your infra
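Likewise, the full-dataset form after the rename, as a runnable sketch (same assumptions as above; the `split` and `max_concurrent` values are copied from the hunk, and printing the results is illustrative only).

import asyncio

from agent.integrations.hud import run_full_dataset


async def main() -> None:
    # Mirrors the "+" side of the hunk above, wrapped for asyncio.
    results = await run_full_dataset(
        dataset="hud-evals/OSWorld-Verified",  # can also pass a Dataset or list[dict]
        model="openai/computer-use-preview",
        split="train[:3]",  # try a few tasks to start
        max_concurrent=20,  # tune to your infra
    )
    print(results)


if __name__ == "__main__":
    asyncio.run(main())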
@@ -83,7 +83,7 @@ results = await run_full_dataset(
 Both single-task and full-dataset runs share a common set of configuration options. These let you fine-tune how the evaluation runs.
 
 - `dataset` (`str` | `Dataset` | `list[dict]`): **Required**
-  HUD dataset name (e.g. `"hud-evals/OSWorld-Verified-XLang"`), a loaded `Dataset`, or a list of tasks.
+  HUD dataset name (e.g. `"hud-evals/OSWorld-Verified"`), a loaded `Dataset`, or a list of tasks.
 - `model` (`str`): Default: `"computer-use-preview"`
   Model string, e.g. `"openai/computer-use-preview+openai/gpt-5-nano"`. Supports composition with `+` (planning + grounding).
 - `allowed_tools` (`list[str]`): Default: `["openai_computer"]`
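Since the `dataset` option also accepts a loaded `Dataset`, a minimal sketch of that form may help (hedged: it assumes the renamed dataset is hosted on the Hugging Face Hub under `hud-evals/OSWorld-Verified`, which the name format implies but this diff does not state).

import asyncio

from datasets import load_dataset  # Hugging Face datasets library

from agent.integrations.hud import run_full_dataset


async def main() -> None:
    # Assumption: the HUD dataset name doubles as a Hub dataset id.
    ds = load_dataset("hud-evals/OSWorld-Verified", split="train[:3]")
    results = await run_full_dataset(
        dataset=ds,                    # a loaded Dataset is accepted per the docs hunk
        model="computer-use-preview",  # the documented default model string
        max_concurrent=20,
    )
    print(results)


if __name__ == "__main__":
    asyncio.run(main())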
@@ -68,10 +68,10 @@
 "source": [
 "from agent.integrations.hud import run_single_task\n",
 "\n",
-"# Quick single-task smoke test on OSWorld-Verified-XLang\n",
-"# You can swap \"hud-evals/OSWorld-Verified-XLang\" -> \"hud-evals/SheetBench-V2\" to test SheetBench.\n",
+"# Quick single-task smoke test on OSWorld-Verified\n",
+"# You can swap \"hud-evals/OSWorld-Verified\" -> \"hud-evals/SheetBench-V2\" to test SheetBench.\n",
 "await run_single_task(\n",
-"    dataset=\"hud-evals/OSWorld-Verified-XLang\",\n",
+"    dataset=\"hud-evals/OSWorld-Verified\",\n",
 "    model=\"openai/computer-use-preview+openai/gpt-5-nano\", # or any supported model string\n",
 "    task_id=155 # open last tab task (easy)\n",
 ")"
@@ -97,7 +97,7 @@
 "job_name = f\"osworld-test-{str(uuid.uuid4())[:4]}\"\n",
 "\n",
 "results = await run_full_dataset(\n",
-"    dataset=\"hud-evals/OSWorld-Verified-XLang\", # You can also pass a Dataset or a list[dict]\n",
+"    dataset=\"hud-evals/OSWorld-Verified\", # You can also pass a Dataset or a list[dict]\n",
 "    job_name=job_name, # Optional; defaults to a timestamp for custom datasets\n",
 "    model=\"openai/computer-use-preview\", # Or any supported model string\n",
 "    max_concurrent=20, # Tune to your infra\n",
@@ -138,7 +138,7 @@
 " job_name = f\"osworld {job_uuid} {model}\"\n",
 "\n",
 " results = await run_full_dataset(\n",
-" dataset=\"hud-evals/OSWorld-Verified-XLang\",\n",
+" dataset=\"hud-evals/OSWorld-Verified\",\n",
 " job_name=job_name, \n",
 " model=model,\n",
 " max_concurrent=20, \n",