mirror of
https://github.com/trycua/computer.git
synced 2026-02-22 14:29:26 -06:00
Improved trajectory saving
This commit is contained in:
@@ -76,7 +76,7 @@
|
||||
"\u001b[90m╔═════════════════════════════════════════════════════════════════╗\u001b[0m\n",
|
||||
"\u001b[90m║\u001b[0m 🚀 See your agent live at: \u001b[90m║\u001b[0m\n",
|
||||
"\u001b[90m╟─────────────────────────────────────────────────────────────────╢\u001b[0m\n",
|
||||
"\u001b[90m║\u001b[0m \u001b[1m\u001b[33mhttps://app.hud.so/trace/426ed182-564d-4b12-b950-c551caeeb8a8\u001b[0m \u001b[90m║\u001b[0m\n",
|
||||
"\u001b[90m║\u001b[0m \u001b[1m\u001b[33mhttps://app.hud.so/trace/cbe6f71b-f520-4630-9f27-778647070327\u001b[0m \u001b[90m║\u001b[0m\n",
|
||||
"\u001b[90m╚═════════════════════════════════════════════════════════════════╝\u001b[0m\n",
|
||||
"\n",
|
||||
"Running: Can you make my computer bring back the last tab I shut down?\n"
|
||||
@@ -86,8 +86,15 @@
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"2025-08-27 13:36:03,660 - agent.ComputerAgent - INFO - LLM processing started with 2 messages\n",
|
||||
"2025-08-27 13:36:21,971 - agent.ComputerAgent - INFO - LLM processing started with 6 messages\n",
|
||||
"2025-08-27 16:17:53,047 - agent.ComputerAgent - INFO - LLM processing started with 2 messages\n",
|
||||
"2025-08-27 16:18:02,697 - agent.ComputerAgent - INFO - LLM processing started with 5 messages\n",
|
||||
"2025-08-27 16:18:15,887 - agent.ComputerAgent - INFO - LLM processing started with 7 messages\n",
|
||||
"2025-08-27 16:18:28,541 - agent.ComputerAgent - INFO - LLM processing started with 9 messages\n",
|
||||
"2025-08-27 16:18:42,176 - agent.ComputerAgent - INFO - LLM processing started with 11 messages\n",
|
||||
"2025-08-27 16:18:55,937 - agent.ComputerAgent - INFO - LLM processing started with 13 messages\n",
|
||||
"2025-08-27 16:19:11,654 - agent.ComputerAgent - INFO - LLM processing started with 15 messages\n",
|
||||
"2025-08-27 16:19:23,839 - agent.ComputerAgent - INFO - LLM processing started with 17 messages\n",
|
||||
"2025-08-27 16:19:39,065 - agent.ComputerAgent - INFO - LLM processing started with 19 messages\n",
|
||||
"Tool execution failed: Tool evaluate has an output schema but did not return structured content\n",
|
||||
"Evaluation phase failed: [MCPToolResult(meta=None, content=[TextContent(type='text', text='Tool evaluate has an output schema but did not return structured content', annotations=None, meta=None)], structuredContent=None, isError=True)]\n"
|
||||
]
|
||||
@@ -98,7 +105,7 @@
|
||||
"text": [
|
||||
"✅ Reward: 0.0\n",
|
||||
"\n",
|
||||
"\u001b[92m✓ Trace complete!\u001b[0m \u001b[2mView at:\u001b[0m \u001b[1m\u001b[33mhttps://app.hud.so/trace/426ed182-564d-4b12-b950-c551caeeb8a8\u001b[0m\n",
|
||||
"\u001b[92m✓ Trace complete!\u001b[0m \u001b[2mView at:\u001b[0m \u001b[1m\u001b[33mhttps://app.hud.so/trace/cbe6f71b-f520-4630-9f27-778647070327\u001b[0m\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
@@ -110,7 +117,7 @@
|
||||
"# You can swap \"hud-evals/OSWorld-Verified-XLang\" -> \"hud-evals/SheetBench-V2\" to test SheetBench.\n",
|
||||
"await run_single_task(\n",
|
||||
" dataset=\"hud-evals/OSWorld-Verified-XLang\",\n",
|
||||
" model=\"openai/computer-use-preview\", # or any supported model string\n",
|
||||
" model=\"openai/computer-use-preview+openai/gpt-5-nano\", # or any supported model string\n",
|
||||
" task_id=155 # open last tab task (easy)\n",
|
||||
")"
|
||||
]
|
||||
@@ -148,6 +155,48 @@
|
||||
"print(f\"Total results: {len(results)}\")\n",
|
||||
"pprint(results[:3]) # preview"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Benchmark Composed Agents"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import uuid\n",
|
||||
"from agent.integrations.hud import run_full_dataset\n",
|
||||
"\n",
|
||||
"models_to_test = [\n",
|
||||
" \"openai/computer-use-preview+anthropic/claude-opus-4-1-20250805\",\n",
|
||||
" \"anthropic/claude-opus-4-1-20250805+openai/computer-use-preview\",\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"for model in models_to_test:\n",
|
||||
" # Full dataset evaluation (runs via HUD's run_dataset under the hood)\n",
|
||||
" job_uuid = str(uuid.uuid4())[:6]\n",
|
||||
" job_name = f\"osworld {job_uuid} {model}\"\n",
|
||||
"\n",
|
||||
" results = await run_full_dataset(\n",
|
||||
" dataset=\"hud-evals/OSWorld-Verified-XLang\",\n",
|
||||
" job_name=job_name, \n",
|
||||
" model=model,\n",
|
||||
" max_concurrent=20, \n",
|
||||
" max_steps=75,\n",
|
||||
" trajectory_dir=f\"trajectories/osworld_{job_uuid}\"\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" # results is a list from hud.datasets.run_dataset; inspect/aggregate as needed\n",
|
||||
" print(f\"Job: {job_name}\")\n",
|
||||
" print(f\"Total results: {len(results)}\")\n",
|
||||
" pprint(results[:3]) # preview"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
||||
Reference in New Issue
Block a user