diff --git a/libs/python/agent/agent/callbacks/trajectory_saver.py b/libs/python/agent/agent/callbacks/trajectory_saver.py index 805b535d..53e4c189 100644 --- a/libs/python/agent/agent/callbacks/trajectory_saver.py +++ b/libs/python/agent/agent/callbacks/trajectory_saver.py @@ -94,6 +94,10 @@ class TrajectorySaverCallback(AsyncCallbackHandler): # format: turn_000/0000_name.json artifact_filename = f"{self.current_artifact:04d}_{name}" artifact_path = turn_dir / f"{artifact_filename}.json" + # add created_at + if isinstance(artifact, dict): + artifact = artifact.copy() + artifact["created_at"] = str(uuid.uuid1().time) with open(artifact_path, "w") as f: json.dump(sanitize_image_urls(artifact), f, indent=2) self.current_artifact += 1 @@ -171,7 +175,7 @@ class TrajectorySaverCallback(AsyncCallbackHandler): "status": "completed", "completed_at": str(uuid.uuid1().time), "total_usage": self.total_usage, - "new_items": sanitize_image_urls(new_items), + "new_items": new_items, "total_turns": self.current_turn }) diff --git a/libs/python/agent/agent/integrations/hud/__init__.py b/libs/python/agent/agent/integrations/hud/__init__.py index d7fc3a20..64c91fb6 100644 --- a/libs/python/agent/agent/integrations/hud/__init__.py +++ b/libs/python/agent/agent/integrations/hud/__init__.py @@ -41,6 +41,7 @@ class ProxyOperatorAgent(OperatorAgent): *, model: str | None = None, allowed_tools: list[str] | None = None, + trajectory_dir: str | None = None, **kwargs: Any, ) -> None: model = model or "computer-use-preview" @@ -54,8 +55,7 @@ class ProxyOperatorAgent(OperatorAgent): computer_agent = BaseComputerAgent( model=model, tools=[computer_shim], - verbosity=20, - trajectory_dir='trajectories' + trajectory_dir=trajectory_dir ) model_client = FakeAsyncOpenAI(computer_agent) @@ -115,6 +115,7 @@ async def run_full_dataset( max_concurrent: int = 30, max_steps: int = 50, split: str = "train", + trajectory_dir: str | None = None, ) -> list[Any]: """Run evaluation across the entire dataset using hud.datasets.run_dataset.""" @@ -134,7 +135,7 @@ async def run_full_dataset( name=job_name, dataset=dataset, agent_class=ProxyOperatorAgent, - agent_config={"model": model, "allowed_tools": allowed_tools}, + agent_config={"model": model, "allowed_tools": allowed_tools, "trajectory_dir": trajectory_dir}, max_concurrent=max_concurrent, metadata={"dataset": dataset_name}, max_steps=max_steps, diff --git a/notebooks/eval_osworld.ipynb b/notebooks/eval_osworld.ipynb index afbb1716..adb5d22b 100644 --- a/notebooks/eval_osworld.ipynb +++ b/notebooks/eval_osworld.ipynb @@ -76,7 +76,7 @@ "\u001b[90m╔═════════════════════════════════════════════════════════════════╗\u001b[0m\n", "\u001b[90m║\u001b[0m 🚀 See your agent live at: \u001b[90m║\u001b[0m\n", "\u001b[90m╟─────────────────────────────────────────────────────────────────╢\u001b[0m\n", - "\u001b[90m║\u001b[0m \u001b[1m\u001b[33mhttps://app.hud.so/trace/426ed182-564d-4b12-b950-c551caeeb8a8\u001b[0m \u001b[90m║\u001b[0m\n", + "\u001b[90m║\u001b[0m \u001b[1m\u001b[33mhttps://app.hud.so/trace/cbe6f71b-f520-4630-9f27-778647070327\u001b[0m \u001b[90m║\u001b[0m\n", "\u001b[90m╚═════════════════════════════════════════════════════════════════╝\u001b[0m\n", "\n", "Running: Can you make my computer bring back the last tab I shut down?\n" @@ -86,8 +86,15 @@ "name": "stderr", "output_type": "stream", "text": [ - "2025-08-27 13:36:03,660 - agent.ComputerAgent - INFO - LLM processing started with 2 messages\n", - "2025-08-27 13:36:21,971 - agent.ComputerAgent - INFO - LLM processing started with 6 messages\n", + "2025-08-27 16:17:53,047 - agent.ComputerAgent - INFO - LLM processing started with 2 messages\n", + "2025-08-27 16:18:02,697 - agent.ComputerAgent - INFO - LLM processing started with 5 messages\n", + "2025-08-27 16:18:15,887 - agent.ComputerAgent - INFO - LLM processing started with 7 messages\n", + "2025-08-27 16:18:28,541 - agent.ComputerAgent - INFO - LLM processing started with 9 messages\n", + "2025-08-27 16:18:42,176 - agent.ComputerAgent - INFO - LLM processing started with 11 messages\n", + "2025-08-27 16:18:55,937 - agent.ComputerAgent - INFO - LLM processing started with 13 messages\n", + "2025-08-27 16:19:11,654 - agent.ComputerAgent - INFO - LLM processing started with 15 messages\n", + "2025-08-27 16:19:23,839 - agent.ComputerAgent - INFO - LLM processing started with 17 messages\n", + "2025-08-27 16:19:39,065 - agent.ComputerAgent - INFO - LLM processing started with 19 messages\n", "Tool execution failed: Tool evaluate has an output schema but did not return structured content\n", "Evaluation phase failed: [MCPToolResult(meta=None, content=[TextContent(type='text', text='Tool evaluate has an output schema but did not return structured content', annotations=None, meta=None)], structuredContent=None, isError=True)]\n" ] @@ -98,7 +105,7 @@ "text": [ "✅ Reward: 0.0\n", "\n", - "\u001b[92m✓ Trace complete!\u001b[0m \u001b[2mView at:\u001b[0m \u001b[1m\u001b[33mhttps://app.hud.so/trace/426ed182-564d-4b12-b950-c551caeeb8a8\u001b[0m\n", + "\u001b[92m✓ Trace complete!\u001b[0m \u001b[2mView at:\u001b[0m \u001b[1m\u001b[33mhttps://app.hud.so/trace/cbe6f71b-f520-4630-9f27-778647070327\u001b[0m\n", "\n" ] } @@ -110,7 +117,7 @@ "# You can swap \"hud-evals/OSWorld-Verified-XLang\" -> \"hud-evals/SheetBench-V2\" to test SheetBench.\n", "await run_single_task(\n", " dataset=\"hud-evals/OSWorld-Verified-XLang\",\n", - " model=\"openai/computer-use-preview\", # or any supported model string\n", + " model=\"openai/computer-use-preview+openai/gpt-5-nano\", # or any supported model string\n", " task_id=155 # open last tab task (easy)\n", ")" ] @@ -148,6 +155,48 @@ "print(f\"Total results: {len(results)}\")\n", "pprint(results[:3]) # preview" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Benchmark Composed Agents" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import uuid\n", + "from agent.integrations.hud import run_full_dataset\n", + "\n", + "models_to_test = [\n", + " \"openai/computer-use-preview+anthropic/claude-opus-4-1-20250805\",\n", + " \"anthropic/claude-opus-4-1-20250805+openai/computer-use-preview\",\n", + "]\n", + "\n", + "\n", + "for model in models_to_test:\n", + " # Full dataset evaluation (runs via HUD's run_dataset under the hood)\n", + " job_uuid = str(uuid.uuid4())[:6]\n", + " job_name = f\"osworld {job_uuid} {model}\"\n", + "\n", + " results = await run_full_dataset(\n", + " dataset=\"hud-evals/OSWorld-Verified-XLang\",\n", + " job_name=job_name, \n", + " model=model,\n", + " max_concurrent=20, \n", + " max_steps=75,\n", + " trajectory_dir=f\"trajectories/osworld_{job_uuid}\"\n", + " )\n", + "\n", + " # results is a list from hud.datasets.run_dataset; inspect/aggregate as needed\n", + " print(f\"Job: {job_name}\")\n", + " print(f\"Total results: {len(results)}\")\n", + " pprint(results[:3]) # preview" + ] } ], "metadata": {