Reuse agent configuration for HUD evaluation

This commit is contained in:
James Murdza
2025-09-12 14:49:54 -04:00
parent 48e42d2334
commit ea1caea73c

View File

@@ -124,14 +124,16 @@
" verbosity=logging.INFO\n",
")\n",
"\n",
"agent_config = {\n",
" \"model\": \"openai/computer-use-preview\",\n",
" \"tools\": [computer],\n",
" \"trajectory_dir\": str(Path(\"trajectories\")),\n",
" \"only_n_most_recent_images\": 3,\n",
" \"verbosity\": logging.INFO\n",
"}\n",
"\n",
"# Create agent\n",
"agent = ComputerAgent(\n",
" model=\"openai/computer-use-preview\",\n",
" tools=[computer],\n",
" trajectory_dir=str(Path(\"trajectories\")),\n",
" only_n_most_recent_images=3,\n",
" verbosity=logging.INFO\n",
")"
"agent = ComputerAgent(**agent_config)"
]
},
{
@@ -195,7 +197,7 @@
"results = await run_full_dataset(\n",
" dataset=\"ddupont/OSWorld-Tiny-Public\", # You can also pass a Dataset or a list[dict]\n",
" job_name=job_name, # Optional; defaults to a timestamp for custom datasets\n",
" model=\"openai/computer-use-preview\", # Or any supported model string\n",
" **agent_config,\n",
" max_concurrent=20, # Tune to your infra\n",
" max_steps=50, # Safety cap per task\n",
" #split=\"train[:5]\" # Limit to just 5 tasks\n",