diff --git a/libs/python/agent/agent/agent.py b/libs/python/agent/agent/agent.py
index fb9a45d9..eacc5fdd 100644
--- a/libs/python/agent/agent/agent.py
+++ b/libs/python/agent/agent/agent.py
@@ -29,7 +29,7 @@ from .callbacks import (
     TrajectorySaverCallback, 
     BudgetManagerCallback,
     TelemetryCallback,
-    OperatorValidatorCallback
+    OperatorNormalizerCallback
 )
 from .computers import (
     AsyncComputerHandler,
@@ -202,8 +202,8 @@ class ComputerAgent:
 
         # == Add built-in callbacks ==
 
-        # Prepend operator validator callback
-        self.callbacks.insert(0, OperatorValidatorCallback())
+        # Prepend operator normalizer callback
+        self.callbacks.insert(0, OperatorNormalizerCallback())
 
         # Add telemetry callback if telemetry_enabled is set
         if self.telemetry_enabled:
diff --git a/libs/python/agent/agent/callbacks/__init__.py b/libs/python/agent/agent/callbacks/__init__.py
index 35ab0263..e0befcc7 100644
--- a/libs/python/agent/agent/callbacks/__init__.py
+++ b/libs/python/agent/agent/callbacks/__init__.py
@@ -8,7 +8,7 @@ from .logging import LoggingCallback
 from .trajectory_saver import TrajectorySaverCallback
 from .budget_manager import BudgetManagerCallback
 from .telemetry import TelemetryCallback
-from .operator_validator import OperatorValidatorCallback
+from .operator_validator import OperatorNormalizerCallback
 
 __all__ = [
     "AsyncCallbackHandler",
@@ -17,5 +17,5 @@ __all__ = [
     "TrajectorySaverCallback",
     "BudgetManagerCallback",
     "TelemetryCallback",
-    "OperatorValidatorCallback",
+    "OperatorNormalizerCallback",
 ]
diff --git a/libs/python/agent/agent/callbacks/operator_validator.py b/libs/python/agent/agent/callbacks/operator_validator.py
index db19555c..2a1160c9 100644
--- a/libs/python/agent/agent/callbacks/operator_validator.py
+++ b/libs/python/agent/agent/callbacks/operator_validator.py
@@ -4,6 +4,7 @@ OperatorValidatorCallback
 Ensures agent output actions conform to expected schemas by fixing common issues:
 - click: add default button='left' if missing
 - keypress: wrap keys string into a list
+- other schema fixes applied in on_llm_end (see the method body)
 
 This runs in on_llm_end, which receives the output array (AgentMessage[] as dicts).
 """
@@ -14,14 +15,12 @@ from typing import Any, Dict, List
 from .base import AsyncCallbackHandler
 
 
-class OperatorValidatorCallback(AsyncCallbackHandler):
-    """Validates and normalizes operator/computer actions in LLM outputs."""
+class OperatorNormalizerCallback(AsyncCallbackHandler):
+    """Normalizes common hallucinations and syntax errors in computer calls produced by the LLM."""
 
     async def on_llm_end(self, output: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
         # Mutate in-place as requested, but still return the list for chaining
         for item in output or []:
-            if not isinstance(item, dict):
-                continue
             if item.get("type") != "computer_call":
                 continue
             action = item.get("action")
@@ -56,8 +55,6 @@ class OperatorValidatorCallback(AsyncCallbackHandler):
         # replace the assistant message itself with a reasoning message with summary text.
         if isinstance(output, list):
             for i, item in enumerate(output):
-                if not isinstance(item, dict):
-                    continue
                 # AssistantMessage shape: { type: 'message', role: 'assistant', content: OutputContent[] }
                 if item.get("type") == "message" and item.get("role") == "assistant":
                     next_idx = i + 1
diff --git a/libs/python/agent/agent/integrations/hud/__init__.py b/libs/python/agent/agent/integrations/hud/__init__.py
index 64c91fb6..c73532c7 100644
--- a/libs/python/agent/agent/integrations/hud/__init__.py
+++ b/libs/python/agent/agent/integrations/hud/__init__.py
@@ -42,6 +42,17 @@ class ProxyOperatorAgent(OperatorAgent):
         model: str | None = None,
         allowed_tools: list[str] | None = None,
         trajectory_dir: str | None = None,
+        # === ComputerAgent kwargs ===
+        tools: list[Any] | None = None,
+        custom_loop: Any | None = None,
+        only_n_most_recent_images: int | None = None,
+        callbacks: list[Any] | None = None,
+        verbosity: int | None = None,
+        max_retries: int | None = 3,
+        screenshot_delay: float | int = 0.5,
+        use_prompt_caching: bool | None = False,
+        max_trajectory_budget: float | dict | None = None,
+        telemetry_enabled: bool | None = True,
         **kwargs: Any,
     ) -> None:
         model = model or "computer-use-preview"
@@ -52,10 +63,24 @@ class ProxyOperatorAgent(OperatorAgent):
             'environment': 'linux',
             'dimensions': (computer_settings.OPENAI_COMPUTER_WIDTH, computer_settings.OPENAI_COMPUTER_HEIGHT)
         }
+        # Build the tool list: computer_shim always comes first, followed by any caller-supplied tools
+        agent_tools: list[Any] = [computer_shim]
+        if tools:
+            agent_tools.extend(tools)
+
         computer_agent = BaseComputerAgent(
-            model=model, 
-            tools=[computer_shim], 
-            trajectory_dir=trajectory_dir
+            model=model,
+            tools=agent_tools,
+            custom_loop=custom_loop,
+            only_n_most_recent_images=only_n_most_recent_images,
+            callbacks=callbacks,
+            verbosity=verbosity,
+            trajectory_dir=trajectory_dir,
+            max_retries=max_retries,
+            screenshot_delay=screenshot_delay,
+            use_prompt_caching=use_prompt_caching,
+            max_trajectory_budget=max_trajectory_budget,
+            telemetry_enabled=telemetry_enabled,
         )
         model_client = FakeAsyncOpenAI(computer_agent)
 
@@ -78,6 +103,18 @@ async def run_single_task(
     task_id: int = 0,
     model: str | None = None,
     allowed_tools: list[str] | None = None,
+    # === ComputerAgent kwargs ===
+    tools: list[Any] | None = None,
+    custom_loop: Any | None = None,
+    only_n_most_recent_images: int | None = None,
+    callbacks: list[Any] | None = None,
+    verbosity: int | None = None,
+    trajectory_dir: str | None = None,
+    max_retries: int | None = 3,
+    screenshot_delay: float | int = 0.5,
+    use_prompt_caching: bool | None = False,
+    max_trajectory_budget: float | dict | None = None,
+    telemetry_enabled: bool | None = True,
 ) -> None:
     """Load one task from the dataset and execute it with Operator+CUA proxy."""
 
@@ -95,7 +132,22 @@ async def run_single_task(
     with trace(name=task_prompt):
         task = Task(**sample_task)  # type: ignore[arg-type]
 
-        agent = ProxyOperatorAgent(model=model, allowed_tools=allowed_tools)
+        agent = ProxyOperatorAgent(
+            model=model,
+            allowed_tools=allowed_tools,
+            # === ComputerAgent kwargs passthrough ===
+            tools=tools,
+            custom_loop=custom_loop,
+            only_n_most_recent_images=only_n_most_recent_images,
+            callbacks=callbacks,
+            verbosity=verbosity,
+            trajectory_dir=trajectory_dir,
+            max_retries=max_retries,
+            screenshot_delay=screenshot_delay,
+            use_prompt_caching=use_prompt_caching,
+            max_trajectory_budget=max_trajectory_budget,
+            telemetry_enabled=telemetry_enabled,
+        )
         print(f"Running: {task_prompt}")
         result = await agent.run(task, max_steps=10)
         print(f"✅ Reward: {getattr(result, 'reward')}")
@@ -116,6 +168,17 @@ async def run_full_dataset(
     max_steps: int = 50,
     split: str = "train",
     trajectory_dir: str | None = None,
+    # === ComputerAgent kwargs ===
+    tools: list[Any] | None = None,
+    custom_loop: Any | None = None,
+    only_n_most_recent_images: int | None = 5,
+    callbacks: list[Any] | None = None,
+    verbosity: int | None = None,
+    max_retries: int | None = 3,
+    screenshot_delay: float | int = 0.5,
+    use_prompt_caching: bool | None = False,
+    max_trajectory_budget: float | dict | None = None,
+    telemetry_enabled: bool | None = True,
 ) -> list[Any]:
     """Run evaluation across the entire dataset using hud.datasets.run_dataset."""
 
@@ -135,7 +198,22 @@ async def run_full_dataset(
         name=job_name,
         dataset=dataset,
         agent_class=ProxyOperatorAgent,
-        agent_config={"model": model, "allowed_tools": allowed_tools, "trajectory_dir": trajectory_dir},
+        agent_config={
+            "model": model,
+            "allowed_tools": allowed_tools,
+            "trajectory_dir": trajectory_dir,
+            # === ComputerAgent kwargs passthrough ===
+            "tools": tools,
+            "custom_loop": custom_loop,
+            "only_n_most_recent_images": only_n_most_recent_images,
+            "callbacks": callbacks,
+            "verbosity": verbosity,
+            "max_retries": max_retries,
+            "screenshot_delay": screenshot_delay,
+            "use_prompt_caching": use_prompt_caching,
+            "max_trajectory_budget": max_trajectory_budget,
+            "telemetry_enabled": telemetry_enabled,
+        },
         max_concurrent=max_concurrent,
         metadata={"dataset": dataset_name},
         max_steps=max_steps,
diff --git a/notebooks/eval_osworld.ipynb b/notebooks/eval_osworld.ipynb
index adb5d22b..3111bbb7 100644
--- a/notebooks/eval_osworld.ipynb
+++ b/notebooks/eval_osworld.ipynb
@@ -167,16 +167,340 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\u001b[90m╔══════════════════════════════════════════════════════════════════════════════════════════════════╗\u001b[0m\n",
+      "\u001b[90m║\u001b[0m  🚀 Job 'osworld b4db80 openai/computer-use-preview+anthropic/claude-opus-4-1-20250805' started: \u001b[90m║\u001b[0m\n",
+      "\u001b[90m╟──────────────────────────────────────────────────────────────────────────────────────────────────╢\u001b[0m\n",
+      "\u001b[90m║\u001b[0m                   \u001b[1m\u001b[33mhttps://app.hud.so/jobs/0d10d7d0-2c86-4a5d-a36b-2c28719773cf\u001b[0m                   \u001b[90m║\u001b[0m\n",
+      "\u001b[90m╚══════════════════════════════════════════════════════════════════════════════════════════════════╝\u001b[0m\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Step failed: litellm.InternalServerError: InternalServerError: OpenAIException - {\n",
+      "  \"error\": {\n",
+      "    \"message\": \"An error occurred while processing your request. You can retry your request, or contact us through our help center at help.openai.com if the error persists. Please include the request ID req_97cc086a1b58a101f7db3ea88323a12f in your message.\",\n",
+      "    \"type\": \"server_error\",\n",
+      "    \"param\": null,\n",
+      "    \"code\": \"server_error\"\n",
+      "  }\n",
+      "}\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Step failed: 22 validation errors for ResponseComputerToolCall\n",
+      "action.ActionClick.button\n",
+      "  Field required [type=missing, input_value={'keys': 'Tab', 'type': 'keypress'}, input_type=dict]\n",
+      "    For further information visit https://errors.pydantic.dev/2.11/v/missing\n",
+      "action.ActionClick.type\n",
+      "  Input should be 'click' [type=literal_error, input_value='keypress', input_type=str]\n",
+      "    For further information visit https://errors.pydantic.dev/2.11/v/literal_error\n",
+      "action.ActionClick.x\n",
+      "  Field required [type=missing, input_value={'keys': 'Tab', 'type': 'keypress'}, input_type=dict]\n",
+      "    For further information visit https://errors.pydantic.dev/2.11/v/missing\n",
+      "action.ActionClick.y\n",
+      "  Field required [type=missing, input_value={'keys': 'Tab', 'type': 'keypress'}, input_type=dict]\n",
+      "    For further information visit https://errors.pydantic.dev/2.11/v/missing\n",
+      "action.ActionDoubleClick.type\n",
+      "  Input should be 'double_click' [type=literal_error, input_value='keypress', input_type=str]\n",
+      "    For further information visit https://errors.pydantic.dev/2.11/v/literal_error\n",
+      "action.ActionDoubleClick.x\n",
+      "  Field required [type=missing, input_value={'keys': 'Tab', 'type': 'keypress'}, input_type=dict]\n",
+      "    For further information visit https://errors.pydantic.dev/2.11/v/missing\n",
+      "action.ActionDoubleClick.y\n",
+      "  Field required [type=missing, input_value={'keys': 'Tab', 'type': 'keypress'}, input_type=dict]\n",
+      "    For further information visit https://errors.pydantic.dev/2.11/v/missing\n",
+      "action.ActionDrag.path\n",
+      "  Field required [type=missing, input_value={'keys': 'Tab', 'type': 'keypress'}, input_type=dict]\n",
+      "    For further information visit https://errors.pydantic.dev/2.11/v/missing\n",
+      "action.ActionDrag.type\n",
+      "  Input should be 'drag' [type=literal_error, input_value='keypress', input_type=str]\n",
+      "    For further information visit https://errors.pydantic.dev/2.11/v/literal_error\n",
+      "action.ActionKeypress.keys\n",
+      "  Input should be a valid list [type=list_type, input_value='Tab', input_type=str]\n",
+      "    For further information visit https://errors.pydantic.dev/2.11/v/list_type\n",
+      "action.ActionMove.type\n",
+      "  Input should be 'move' [type=literal_error, input_value='keypress', input_type=str]\n",
+      "    For further information visit https://errors.pydantic.dev/2.11/v/literal_error\n",
+      "action.ActionMove.x\n",
+      "  Field required [type=missing, input_value={'keys': 'Tab', 'type': 'keypress'}, input_type=dict]\n",
+      "    For further information visit https://errors.pydantic.dev/2.11/v/missing\n",
+      "action.ActionMove.y\n",
+      "  Field required [type=missing, input_value={'keys': 'Tab', 'type': 'keypress'}, input_type=dict]\n",
+      "    For further information visit https://errors.pydantic.dev/2.11/v/missing\n",
+      "action.ActionScreenshot.type\n",
+      "  Input should be 'screenshot' [type=literal_error, input_value='keypress', input_type=str]\n",
+      "    For further information visit https://errors.pydantic.dev/2.11/v/literal_error\n",
+      "action.ActionScroll.scroll_x\n",
+      "  Field required [type=missing, input_value={'keys': 'Tab', 'type': 'keypress'}, input_type=dict]\n",
+      "    For further information visit https://errors.pydantic.dev/2.11/v/missing\n",
+      "action.ActionScroll.scroll_y\n",
+      "  Field required [type=missing, input_value={'keys': 'Tab', 'type': 'keypress'}, input_type=dict]\n",
+      "    For further information visit https://errors.pydantic.dev/2.11/v/missing\n",
+      "action.ActionScroll.type\n",
+      "  Input should be 'scroll' [type=literal_error, input_value='keypress', input_type=str]\n",
+      "    For further information visit https://errors.pydantic.dev/2.11/v/literal_error\n",
+      "action.ActionScroll.x\n",
+      "  Field required [type=missing, input_value={'keys': 'Tab', 'type': 'keypress'}, input_type=dict]\n",
+      "    For further information visit https://errors.pydantic.dev/2.11/v/missing\n",
+      "action.ActionScroll.y\n",
+      "  Field required [type=missing, input_value={'keys': 'Tab', 'type': 'keypress'}, input_type=dict]\n",
+      "    For further information visit https://errors.pydantic.dev/2.11/v/missing\n",
+      "action.ActionType.text\n",
+      "  Field required [type=missing, input_value={'keys': 'Tab', 'type': 'keypress'}, input_type=dict]\n",
+      "    For further information visit https://errors.pydantic.dev/2.11/v/missing\n",
+      "action.ActionType.type\n",
+      "  Input should be 'type' [type=literal_error, input_value='keypress', input_type=str]\n",
+      "    For further information visit https://errors.pydantic.dev/2.11/v/literal_error\n",
+      "action.ActionWait.type\n",
+      "  Input should be 'wait' [type=literal_error, input_value='keypress', input_type=str]\n",
+      "    For further information visit https://errors.pydantic.dev/2.11/v/literal_error\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Step failed: litellm.InternalServerError: InternalServerError: OpenAIException - {\n",
+      "  \"error\": {\n",
+      "    \"message\": \"An error occurred while processing your request. You can retry your request, or contact us through our help center at help.openai.com if the error persists. Please include the request ID req_f7a393a984e7e85dc5845aef8a9471e4 in your message.\",\n",
+      "    \"type\": \"model_error\",\n",
+      "    \"param\": null,\n",
+      "    \"code\": null\n",
+      "  }\n",
+      "}\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Client is not running, cannot disconnect\n",
+      "Client is not running, cannot disconnect\n",
+      "Client is not running, cannot disconnect\n",
+      "Client is not running, cannot disconnect\n",
+      "Client is not running, cannot disconnect\n",
+      "Client is not running, cannot disconnect\n",
+      "Client is not running, cannot disconnect\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n",
+      "Error parsing JSON response\n",
+      "Traceback (most recent call last):\n",
+      "  File \"c:\\Users\\dillo\\miniconda3\\envs\\cua\\Lib\\site-packages\\mcp\\client\\streamable_http.py\", line 310, in _handle_json_response\n",
+      "    await read_stream_writer.send(session_message)\n",
+      "  File \"c:\\Users\\dillo\\miniconda3\\envs\\cua\\Lib\\site-packages\\anyio\\streams\\memory.py\", line 242, in send\n",
+      "    self.send_nowait(item)\n",
+      "  File \"c:\\Users\\dillo\\miniconda3\\envs\\cua\\Lib\\site-packages\\anyio\\streams\\memory.py\", line 211, in send_nowait\n",
+      "    raise ClosedResourceError\n",
+      "anyio.ClosedResourceError\n",
+      "Request handler error: \n",
+      "Failed to close auto-created client: \n",
+      "Error parsing JSON response\n",
+      "Traceback (most recent call last):\n",
+      "  File \"c:\\Users\\dillo\\miniconda3\\envs\\cua\\Lib\\site-packages\\mcp\\client\\streamable_http.py\", line 310, in _handle_json_response\n",
+      "    await read_stream_writer.send(session_message)\n",
+      "  File \"c:\\Users\\dillo\\miniconda3\\envs\\cua\\Lib\\site-packages\\anyio\\streams\\memory.py\", line 242, in send\n",
+      "    self.send_nowait(item)\n",
+      "  File \"c:\\Users\\dillo\\miniconda3\\envs\\cua\\Lib\\site-packages\\anyio\\streams\\memory.py\", line 211, in send_nowait\n",
+      "    raise ClosedResourceError\n",
+      "anyio.ClosedResourceError\n",
+      "Request handler error: \n",
+      "Failed to close auto-created client: \n",
+      "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n"
+     ]
+    },
+    {
+     "ename": "CancelledError",
+     "evalue": "",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mCancelledError\u001b[0m                            Traceback (most recent call last)",
+      "Cell \u001b[1;32mIn[7], line 14\u001b[0m\n\u001b[0;32m     11\u001b[0m job_uuid \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mstr\u001b[39m(uuid\u001b[38;5;241m.\u001b[39muuid4())[:\u001b[38;5;241m6\u001b[39m]\n\u001b[0;32m     12\u001b[0m job_name \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mosworld \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mjob_uuid\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmodel\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m---> 14\u001b[0m results \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m run_full_dataset(\n\u001b[0;32m     15\u001b[0m     dataset\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhud-evals/OSWorld-Verified-XLang\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m     16\u001b[0m     job_name\u001b[38;5;241m=\u001b[39mjob_name,                 \n\u001b[0;32m     17\u001b[0m     model\u001b[38;5;241m=\u001b[39mmodel,\n\u001b[0;32m     18\u001b[0m     max_concurrent\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m30\u001b[39m,                   \n\u001b[0;32m     19\u001b[0m     max_steps\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m75\u001b[39m,\n\u001b[0;32m     20\u001b[0m     trajectory_dir\u001b[38;5;241m=\u001b[39m\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtrajectories/osworld_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mjob_uuid\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m     21\u001b[0m )\n\u001b[0;32m     23\u001b[0m \u001b[38;5;66;03m# results is a list from hud.datasets.run_dataset; inspect/aggregate as needed\u001b[39;00m\n\u001b[0;32m     24\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mJob: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mjob_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n",
+      "File \u001b[1;32mF:\\Projects\\cua\\cua-clean\\libs\\python\\agent\\agent\\integrations\\hud\\__init__.py:134\u001b[0m, in \u001b[0;36mrun_full_dataset\u001b[1;34m(dataset, job_name, model, allowed_tools, max_concurrent, max_steps, split, trajectory_dir)\u001b[0m\n\u001b[0;32m    131\u001b[0m     job_name \u001b[38;5;241m=\u001b[39m job_name \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mEvaluation \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtime\u001b[38;5;241m.\u001b[39mstrftime(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124mH:\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124mM \u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124mY-\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124mm-\u001b[39m\u001b[38;5;132;01m%d\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m    133\u001b[0m \u001b[38;5;66;03m# Execute evaluation\u001b[39;00m\n\u001b[1;32m--> 134\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m run_dataset(\n\u001b[0;32m    135\u001b[0m     name\u001b[38;5;241m=\u001b[39mjob_name,\n\u001b[0;32m    136\u001b[0m     dataset\u001b[38;5;241m=\u001b[39mdataset,\n\u001b[0;32m    137\u001b[0m     agent_class\u001b[38;5;241m=\u001b[39mProxyOperatorAgent,\n\u001b[0;32m    138\u001b[0m     agent_config\u001b[38;5;241m=\u001b[39m{\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmodel\u001b[39m\u001b[38;5;124m\"\u001b[39m: model, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mallowed_tools\u001b[39m\u001b[38;5;124m\"\u001b[39m: allowed_tools, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtrajectory_dir\u001b[39m\u001b[38;5;124m\"\u001b[39m: trajectory_dir},\n\u001b[0;32m    139\u001b[0m     max_concurrent\u001b[38;5;241m=\u001b[39mmax_concurrent,\n\u001b[0;32m    140\u001b[0m     
metadata\u001b[38;5;241m=\u001b[39m{\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdataset\u001b[39m\u001b[38;5;124m\"\u001b[39m: dataset_name},\n\u001b[0;32m    141\u001b[0m     max_steps\u001b[38;5;241m=\u001b[39mmax_steps,\n\u001b[0;32m    142\u001b[0m     auto_respond\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[0;32m    143\u001b[0m )\n",
+      "File \u001b[1;32mc:\\Users\\dillo\\miniconda3\\envs\\cua\\Lib\\site-packages\\hud\\datasets.py:262\u001b[0m, in \u001b[0;36mrun_dataset\u001b[1;34m(name, dataset, agent_class, agent_config, max_concurrent, metadata, max_steps, split, auto_respond, custom_system_prompt)\u001b[0m\n\u001b[0;32m    259\u001b[0m                 results[index] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m agent\u001b[38;5;241m.\u001b[39mrun(task, max_steps\u001b[38;5;241m=\u001b[39mmax_steps)\n\u001b[0;32m    261\u001b[0m     \u001b[38;5;66;03m# Execute all tasks\u001b[39;00m\n\u001b[1;32m--> 262\u001b[0m     \u001b[38;5;28;01mawait\u001b[39;00m asyncio\u001b[38;5;241m.\u001b[39mgather(\n\u001b[0;32m    263\u001b[0m         \u001b[38;5;241m*\u001b[39m[_worker(i, task, max_steps\u001b[38;5;241m=\u001b[39mmax_steps) \u001b[38;5;28;01mfor\u001b[39;00m i, task \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(dataset)],\n\u001b[0;32m    264\u001b[0m         return_exceptions\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m,  \u001b[38;5;66;03m# Don't fail entire batch on one error\u001b[39;00m\n\u001b[0;32m    265\u001b[0m     )\n\u001b[0;32m    267\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m results\n",
+      "File \u001b[1;32mc:\\Users\\dillo\\miniconda3\\envs\\cua\\Lib\\site-packages\\hud\\datasets.py:246\u001b[0m, in \u001b[0;36mrun_dataset.<locals>._worker\u001b[1;34m(index, task_dict, max_steps)\u001b[0m\n\u001b[0;32m    245\u001b[0m \u001b[38;5;28;01masync\u001b[39;00m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21m_worker\u001b[39m(index: \u001b[38;5;28mint\u001b[39m, task_dict: Any, max_steps: \u001b[38;5;28mint\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m40\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m--> 246\u001b[0m     \u001b[38;5;28;01masync\u001b[39;00m \u001b[38;5;28;01mwith\u001b[39;00m sem:\n\u001b[0;32m    247\u001b[0m         \u001b[38;5;66;03m# Create trace for this task\u001b[39;00m\n\u001b[0;32m    248\u001b[0m         task_name \u001b[38;5;241m=\u001b[39m task_dict\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mprompt\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mTask \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mindex\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m    249\u001b[0m         \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msystem_prompt\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m task_dict:\n",
+      "File \u001b[1;32mc:\\Users\\dillo\\miniconda3\\envs\\cua\\Lib\\asyncio\\locks.py:14\u001b[0m, in \u001b[0;36m_ContextManagerMixin.__aenter__\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m     13\u001b[0m \u001b[38;5;28;01masync\u001b[39;00m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21m__aenter__\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m---> 14\u001b[0m     \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39macquire()\n\u001b[0;32m     15\u001b[0m     \u001b[38;5;66;03m# We have no use for the \"as ...\"  clause in the with\u001b[39;00m\n\u001b[0;32m     16\u001b[0m     \u001b[38;5;66;03m# statement for locks.\u001b[39;00m\n\u001b[0;32m     17\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n",
+      "File \u001b[1;32mc:\\Users\\dillo\\miniconda3\\envs\\cua\\Lib\\asyncio\\locks.py:386\u001b[0m, in \u001b[0;36mSemaphore.acquire\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m    384\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m    385\u001b[0m     \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m--> 386\u001b[0m         \u001b[38;5;28;01mawait\u001b[39;00m fut\n\u001b[0;32m    387\u001b[0m     \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[0;32m    388\u001b[0m         \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_waiters\u001b[38;5;241m.\u001b[39mremove(fut)\n",
+      "\u001b[1;31mCancelledError\u001b[0m: "
+     ]
+    }
+   ],
    "source": [
     "import uuid\n",
     "from agent.integrations.hud import run_full_dataset\n",
     "\n",
     "models_to_test = [\n",
-    "    \"openai/computer-use-preview+anthropic/claude-opus-4-1-20250805\",\n",
-    "    \"anthropic/claude-opus-4-1-20250805+openai/computer-use-preview\",\n",
+    "    \"openai/computer-use-preview+anthropic/claude-opus-4-20250514\",\n",
     "]\n",
-    "\n",
+    " \n",
     "\n",
     "for model in models_to_test:\n",
     "    # Full dataset evaluation (runs via HUD's run_dataset under the hood)\n",
@@ -189,13 +513,13 @@
     "        model=model,\n",
     "        max_concurrent=20,                   \n",
     "        max_steps=75,\n",
-    "        trajectory_dir=f\"trajectories/osworld_{job_uuid}\"\n",
+    "        trajectory_dir=f\"trajectories/osworld_{job_uuid}\",\n",
+    "        only_n_most_recent_images=3\n",
     "    )\n",
     "\n",
     "    # results is a list from hud.datasets.run_dataset; inspect/aggregate as needed\n",
     "    print(f\"Job: {job_name}\")\n",
-    "    print(f\"Total results: {len(results)}\")\n",
-    "    pprint(results[:3])  # preview"
+    "    print(f\"Total results: {len(results)}\")"
    ]
   }
  ],