diff --git a/libs/python/agent/agent/callbacks/trajectory_saver.py b/libs/python/agent/agent/callbacks/trajectory_saver.py
index 805b535d..53e4c189 100644
--- a/libs/python/agent/agent/callbacks/trajectory_saver.py
+++ b/libs/python/agent/agent/callbacks/trajectory_saver.py
@@ -94,6 +94,10 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
             # format: turn_000/0000_name.json
             artifact_filename = f"{self.current_artifact:04d}_{name}"
             artifact_path = turn_dir / f"{artifact_filename}.json"
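+            # Stamp dict artifacts with a created_at value (the uuid1 timestamp field); work on a shallow copy so the caller's dict is not mutated.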
+            if isinstance(artifact, dict):
+                artifact = artifact.copy()
+                artifact["created_at"] = str(uuid.uuid1().time)
             with open(artifact_path, "w") as f:
                 json.dump(sanitize_image_urls(artifact), f, indent=2)
         self.current_artifact += 1
@@ -171,7 +175,7 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
             "status": "completed",
             "completed_at": str(uuid.uuid1().time),
             "total_usage": self.total_usage,
-            "new_items": sanitize_image_urls(new_items),
+            "new_items": new_items,
             "total_turns": self.current_turn
         })
         
diff --git a/libs/python/agent/agent/integrations/hud/__init__.py b/libs/python/agent/agent/integrations/hud/__init__.py
index d7fc3a20..64c91fb6 100644
--- a/libs/python/agent/agent/integrations/hud/__init__.py
+++ b/libs/python/agent/agent/integrations/hud/__init__.py
@@ -41,6 +41,7 @@ class ProxyOperatorAgent(OperatorAgent):
         *,
         model: str | None = None,
         allowed_tools: list[str] | None = None,
+        trajectory_dir: str | None = None,
         **kwargs: Any,
     ) -> None:
         model = model or "computer-use-preview"
@@ -54,8 +55,7 @@ class ProxyOperatorAgent(OperatorAgent):
         computer_agent = BaseComputerAgent(
             model=model, 
             tools=[computer_shim], 
-            verbosity=20, 
-            trajectory_dir='trajectories'
+            trajectory_dir=trajectory_dir
         )
         model_client = FakeAsyncOpenAI(computer_agent)
 
@@ -115,6 +115,7 @@ async def run_full_dataset(
     max_concurrent: int = 30,
     max_steps: int = 50,
     split: str = "train",
+    trajectory_dir: str | None = None,
 ) -> list[Any]:
     """Run evaluation across the entire dataset using hud.datasets.run_dataset."""
 
@@ -134,7 +135,7 @@ async def run_full_dataset(
         name=job_name,
         dataset=dataset,
         agent_class=ProxyOperatorAgent,
-        agent_config={"model": model, "allowed_tools": allowed_tools},
+        agent_config={"model": model, "allowed_tools": allowed_tools, "trajectory_dir": trajectory_dir},
         max_concurrent=max_concurrent,
         metadata={"dataset": dataset_name},
         max_steps=max_steps,
diff --git a/notebooks/eval_osworld.ipynb b/notebooks/eval_osworld.ipynb
index afbb1716..adb5d22b 100644
--- a/notebooks/eval_osworld.ipynb
+++ b/notebooks/eval_osworld.ipynb
@@ -76,7 +76,7 @@
       "\u001b[90m╔═════════════════════════════════════════════════════════════════╗\u001b[0m\n",
       "\u001b[90m║\u001b[0m                    🚀 See your agent live at:                   \u001b[90m║\u001b[0m\n",
       "\u001b[90m╟─────────────────────────────────────────────────────────────────╢\u001b[0m\n",
-      "\u001b[90m║\u001b[0m  \u001b[1m\u001b[33mhttps://app.hud.so/trace/426ed182-564d-4b12-b950-c551caeeb8a8\u001b[0m  \u001b[90m║\u001b[0m\n",
+      "\u001b[90m║\u001b[0m  \u001b[1m\u001b[33mhttps://app.hud.so/trace/cbe6f71b-f520-4630-9f27-778647070327\u001b[0m  \u001b[90m║\u001b[0m\n",
       "\u001b[90m╚═════════════════════════════════════════════════════════════════╝\u001b[0m\n",
       "\n",
       "Running: Can you make my computer bring back the last tab I shut down?\n"
@@ -86,8 +86,15 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "2025-08-27 13:36:03,660 - agent.ComputerAgent - INFO - LLM processing started with 2 messages\n",
-      "2025-08-27 13:36:21,971 - agent.ComputerAgent - INFO - LLM processing started with 6 messages\n",
+      "2025-08-27 16:17:53,047 - agent.ComputerAgent - INFO - LLM processing started with 2 messages\n",
+      "2025-08-27 16:18:02,697 - agent.ComputerAgent - INFO - LLM processing started with 5 messages\n",
+      "2025-08-27 16:18:15,887 - agent.ComputerAgent - INFO - LLM processing started with 7 messages\n",
+      "2025-08-27 16:18:28,541 - agent.ComputerAgent - INFO - LLM processing started with 9 messages\n",
+      "2025-08-27 16:18:42,176 - agent.ComputerAgent - INFO - LLM processing started with 11 messages\n",
+      "2025-08-27 16:18:55,937 - agent.ComputerAgent - INFO - LLM processing started with 13 messages\n",
+      "2025-08-27 16:19:11,654 - agent.ComputerAgent - INFO - LLM processing started with 15 messages\n",
+      "2025-08-27 16:19:23,839 - agent.ComputerAgent - INFO - LLM processing started with 17 messages\n",
+      "2025-08-27 16:19:39,065 - agent.ComputerAgent - INFO - LLM processing started with 19 messages\n",
       "Tool execution failed: Tool evaluate has an output schema but did not return structured content\n",
       "Evaluation phase failed: [MCPToolResult(meta=None, content=[TextContent(type='text', text='Tool evaluate has an output schema but did not return structured content', annotations=None, meta=None)], structuredContent=None, isError=True)]\n"
      ]
@@ -98,7 +105,7 @@
      "text": [
       "✅ Reward: 0.0\n",
       "\n",
-      "\u001b[92m✓ Trace complete!\u001b[0m \u001b[2mView at:\u001b[0m \u001b[1m\u001b[33mhttps://app.hud.so/trace/426ed182-564d-4b12-b950-c551caeeb8a8\u001b[0m\n",
+      "\u001b[92m✓ Trace complete!\u001b[0m \u001b[2mView at:\u001b[0m \u001b[1m\u001b[33mhttps://app.hud.so/trace/cbe6f71b-f520-4630-9f27-778647070327\u001b[0m\n",
       "\n"
      ]
     }
@@ -110,7 +117,7 @@
     "# You can swap \"hud-evals/OSWorld-Verified-XLang\" -> \"hud-evals/SheetBench-V2\" to test SheetBench.\n",
     "await run_single_task(\n",
     "    dataset=\"hud-evals/OSWorld-Verified-XLang\",\n",
-    "    model=\"openai/computer-use-preview\",  # or any supported model string\n",
+    "    model=\"openai/computer-use-preview+openai/gpt-5-nano\",  # or any supported model string\n",
     "    task_id=155 # open last tab task (easy)\n",
     ")"
    ]
@@ -148,6 +155,48 @@
     "print(f\"Total results: {len(results)}\")\n",
     "pprint(results[:3])  # preview"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Benchmark Composed Agents"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import uuid\n",
+    "from agent.integrations.hud import run_full_dataset\n",
+    "\n",
+    "models_to_test = [\n",
+    "    \"openai/computer-use-preview+anthropic/claude-opus-4-1-20250805\",\n",
+    "    \"anthropic/claude-opus-4-1-20250805+openai/computer-use-preview\",\n",
+    "]\n",
+    "\n",
+    "\n",
+    "for model in models_to_test:\n",
+    "    # Full dataset evaluation (runs via HUD's run_dataset under the hood)\n",
+    "    job_uuid = str(uuid.uuid4())[:6]\n",
+    "    job_name = f\"osworld {job_uuid} {model}\"\n",
+    "\n",
+    "    results = await run_full_dataset(\n",
+    "        dataset=\"hud-evals/OSWorld-Verified-XLang\",\n",
+    "        job_name=job_name,                 \n",
+    "        model=model,\n",
+    "        max_concurrent=20,                   \n",
+    "        max_steps=75,\n",
+    "        trajectory_dir=f\"trajectories/osworld_{job_uuid}\"\n",
+    "    )\n",
+    "\n",
+    "    # results is a list from hud.datasets.run_dataset; inspect/aggregate as needed\n",
+    "    print(f\"Job: {job_name}\")\n",
+    "    print(f\"Total results: {len(results)}\")\n",
+    "    pprint(results[:3])  # preview"
+   ]
   }
  ],
  "metadata": {