diff --git a/libs/python/agent/agent/agent.py b/libs/python/agent/agent/agent.py index fb9a45d9..eacc5fdd 100644 --- a/libs/python/agent/agent/agent.py +++ b/libs/python/agent/agent/agent.py @@ -29,7 +29,7 @@ from .callbacks import ( TrajectorySaverCallback, BudgetManagerCallback, TelemetryCallback, - OperatorValidatorCallback + OperatorNormalizerCallback ) from .computers import ( AsyncComputerHandler, @@ -202,8 +202,8 @@ class ComputerAgent: # == Add built-in callbacks == - # Prepend operator validator callback - self.callbacks.insert(0, OperatorValidatorCallback()) + # Prepend operator normalizer callback + self.callbacks.insert(0, OperatorNormalizerCallback()) # Add telemetry callback if telemetry_enabled is set if self.telemetry_enabled: diff --git a/libs/python/agent/agent/callbacks/__init__.py b/libs/python/agent/agent/callbacks/__init__.py index 35ab0263..e0befcc7 100644 --- a/libs/python/agent/agent/callbacks/__init__.py +++ b/libs/python/agent/agent/callbacks/__init__.py @@ -8,7 +8,7 @@ from .logging import LoggingCallback from .trajectory_saver import TrajectorySaverCallback from .budget_manager import BudgetManagerCallback from .telemetry import TelemetryCallback -from .operator_validator import OperatorValidatorCallback +from .operator_validator import OperatorNormalizerCallback __all__ = [ "AsyncCallbackHandler", @@ -17,5 +17,5 @@ __all__ = [ "TrajectorySaverCallback", "BudgetManagerCallback", "TelemetryCallback", - "OperatorValidatorCallback", + "OperatorNormalizerCallback", ] diff --git a/libs/python/agent/agent/callbacks/operator_validator.py b/libs/python/agent/agent/callbacks/operator_validator.py index db19555c..2a1160c9 100644 --- a/libs/python/agent/agent/callbacks/operator_validator.py +++ b/libs/python/agent/agent/callbacks/operator_validator.py @@ -4,6 +4,7 @@ OperatorValidatorCallback Ensures agent output actions conform to expected schemas by fixing common issues: - click: add default button='left' if missing - keypress: wrap keys string into a list +- etc. This runs in on_llm_end, which receives the output array (AgentMessage[] as dicts). """ @@ -14,14 +15,12 @@ from typing import Any, Dict, List from .base import AsyncCallbackHandler -class OperatorValidatorCallback(AsyncCallbackHandler): - """Validates and normalizes operator/computer actions in LLM outputs.""" +class OperatorNormalizerCallback(AsyncCallbackHandler): + """Normalizes common computer call hallucinations / errors in computer call syntax.""" async def on_llm_end(self, output: List[Dict[str, Any]]) -> List[Dict[str, Any]]: # Mutate in-place as requested, but still return the list for chaining for item in output or []: - if not isinstance(item, dict): - continue if item.get("type") != "computer_call": continue action = item.get("action") @@ -56,8 +55,6 @@ class OperatorValidatorCallback(AsyncCallbackHandler): # replace the assistant message itself with a reasoning message with summary text. if isinstance(output, list): for i, item in enumerate(output): - if not isinstance(item, dict): - continue # AssistantMessage shape: { type: 'message', role: 'assistant', content: OutputContent[] } if item.get("type") == "message" and item.get("role") == "assistant": next_idx = i + 1 diff --git a/libs/python/agent/agent/integrations/hud/__init__.py b/libs/python/agent/agent/integrations/hud/__init__.py index 64c91fb6..c73532c7 100644 --- a/libs/python/agent/agent/integrations/hud/__init__.py +++ b/libs/python/agent/agent/integrations/hud/__init__.py @@ -42,6 +42,17 @@ class ProxyOperatorAgent(OperatorAgent): model: str | None = None, allowed_tools: list[str] | None = None, trajectory_dir: str | None = None, + # === ComputerAgent kwargs === + tools: list[Any] | None = None, + custom_loop: Any | None = None, + only_n_most_recent_images: int | None = None, + callbacks: list[Any] | None = None, + verbosity: int | None = None, + max_retries: int | None = 3, + screenshot_delay: float | int = 0.5, + use_prompt_caching: bool | None = False, + max_trajectory_budget: float | dict | None = None, + telemetry_enabled: bool | None = True, **kwargs: Any, ) -> None: model = model or "computer-use-preview" @@ -52,10 +63,24 @@ class ProxyOperatorAgent(OperatorAgent): 'environment': 'linux', 'dimensions': (computer_settings.OPENAI_COMPUTER_WIDTH, computer_settings.OPENAI_COMPUTER_HEIGHT) } + # Build tools ensuring the computer_shim is included + agent_tools: list[Any] = [computer_shim] + if tools: + agent_tools.extend(tools) + computer_agent = BaseComputerAgent( - model=model, - tools=[computer_shim], - trajectory_dir=trajectory_dir + model=model, + tools=agent_tools, + custom_loop=custom_loop, + only_n_most_recent_images=only_n_most_recent_images, + callbacks=callbacks, + verbosity=verbosity, + trajectory_dir=trajectory_dir, + max_retries=max_retries, + screenshot_delay=screenshot_delay, + use_prompt_caching=use_prompt_caching, + max_trajectory_budget=max_trajectory_budget, + telemetry_enabled=telemetry_enabled, ) model_client = FakeAsyncOpenAI(computer_agent) @@ -78,6 +103,18 @@ async def run_single_task( task_id: int = 0, model: str | None = None, allowed_tools: list[str] | None = None, + # === ComputerAgent kwargs === + tools: list[Any] | None = None, + custom_loop: Any | None = None, + only_n_most_recent_images: int | None = None, + callbacks: list[Any] | None = None, + verbosity: int | None = None, + trajectory_dir: str | None = None, + max_retries: int | None = 3, + screenshot_delay: float | int = 0.5, + use_prompt_caching: bool | None = False, + max_trajectory_budget: float | dict | None = None, + telemetry_enabled: bool | None = True, ) -> None: """Load one task from the dataset and execute it with Operator+CUA proxy.""" @@ -95,7 +132,22 @@ async def run_single_task( with trace(name=task_prompt): task = Task(**sample_task) # type: ignore[arg-type] - agent = ProxyOperatorAgent(model=model, allowed_tools=allowed_tools) + agent = ProxyOperatorAgent( + model=model, + allowed_tools=allowed_tools, + # === ComputerAgent kwargs passthrough === + tools=tools, + custom_loop=custom_loop, + only_n_most_recent_images=only_n_most_recent_images, + callbacks=callbacks, + verbosity=verbosity, + trajectory_dir=trajectory_dir, + max_retries=max_retries, + screenshot_delay=screenshot_delay, + use_prompt_caching=use_prompt_caching, + max_trajectory_budget=max_trajectory_budget, + telemetry_enabled=telemetry_enabled, + ) print(f"Running: {task_prompt}") result = await agent.run(task, max_steps=10) print(f"✅ Reward: {getattr(result, 'reward')}") @@ -116,6 +168,17 @@ async def run_full_dataset( max_steps: int = 50, split: str = "train", trajectory_dir: str | None = None, + # === ComputerAgent kwargs === + tools: list[Any] | None = None, + custom_loop: Any | None = None, + only_n_most_recent_images: int | None = 5, + callbacks: list[Any] | None = None, + verbosity: int | None = None, + max_retries: int | None = 3, + screenshot_delay: float | int = 0.5, + use_prompt_caching: bool | None = False, + max_trajectory_budget: float | dict | None = None, + telemetry_enabled: bool | None = True, ) -> list[Any]: """Run evaluation across the entire dataset using hud.datasets.run_dataset.""" @@ -135,7 +198,22 @@ async def run_full_dataset( name=job_name, dataset=dataset, agent_class=ProxyOperatorAgent, - agent_config={"model": model, "allowed_tools": allowed_tools, "trajectory_dir": trajectory_dir}, + agent_config={ + "model": model, + "allowed_tools": allowed_tools, + "trajectory_dir": trajectory_dir, + # === ComputerAgent kwargs passthrough === + "tools": tools, + "custom_loop": custom_loop, + "only_n_most_recent_images": only_n_most_recent_images, + "callbacks": callbacks, + "verbosity": verbosity, + "max_retries": max_retries, + "screenshot_delay": screenshot_delay, + "use_prompt_caching": use_prompt_caching, + "max_trajectory_budget": max_trajectory_budget, + "telemetry_enabled": telemetry_enabled, + }, max_concurrent=max_concurrent, metadata={"dataset": dataset_name}, max_steps=max_steps, diff --git a/notebooks/eval_osworld.ipynb b/notebooks/eval_osworld.ipynb index adb5d22b..3111bbb7 100644 --- a/notebooks/eval_osworld.ipynb +++ b/notebooks/eval_osworld.ipynb @@ -167,16 +167,340 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\u001b[90m╔══════════════════════════════════════════════════════════════════════════════════════════════════╗\u001b[0m\n", + "\u001b[90m║\u001b[0m 🚀 Job 'osworld b4db80 openai/computer-use-preview+anthropic/claude-opus-4-1-20250805' started: \u001b[90m║\u001b[0m\n", + "\u001b[90m╟──────────────────────────────────────────────────────────────────────────────────────────────────╢\u001b[0m\n", + "\u001b[90m║\u001b[0m \u001b[1m\u001b[33mhttps://app.hud.so/jobs/0d10d7d0-2c86-4a5d-a36b-2c28719773cf\u001b[0m \u001b[90m║\u001b[0m\n", + "\u001b[90m╚══════════════════════════════════════════════════════════════════════════════════════════════════╝\u001b[0m\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Step failed: litellm.InternalServerError: InternalServerError: OpenAIException - {\n", + " \"error\": {\n", + " \"message\": \"An error occurred while processing your request. You can retry your request, or contact us through our help center at help.openai.com if the error persists. Please include the request ID req_97cc086a1b58a101f7db3ea88323a12f in your message.\",\n", + " \"type\": \"server_error\",\n", + " \"param\": null,\n", + " \"code\": \"server_error\"\n", + " }\n", + "}\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Step failed: 22 validation errors for ResponseComputerToolCall\n", + "action.ActionClick.button\n", + " Field required [type=missing, input_value={'keys': 'Tab', 'type': 'keypress'}, input_type=dict]\n", + " For further information visit https://errors.pydantic.dev/2.11/v/missing\n", + "action.ActionClick.type\n", + " Input should be 'click' [type=literal_error, input_value='keypress', input_type=str]\n", + " For further information visit https://errors.pydantic.dev/2.11/v/literal_error\n", + "action.ActionClick.x\n", + " Field required [type=missing, input_value={'keys': 'Tab', 'type': 'keypress'}, input_type=dict]\n", + " For further information visit https://errors.pydantic.dev/2.11/v/missing\n", + "action.ActionClick.y\n", + " Field required [type=missing, input_value={'keys': 'Tab', 'type': 'keypress'}, input_type=dict]\n", + " For further information visit https://errors.pydantic.dev/2.11/v/missing\n", + "action.ActionDoubleClick.type\n", + " Input should be 'double_click' [type=literal_error, input_value='keypress', input_type=str]\n", + " For further information visit https://errors.pydantic.dev/2.11/v/literal_error\n", + "action.ActionDoubleClick.x\n", + " Field required [type=missing, input_value={'keys': 'Tab', 'type': 'keypress'}, input_type=dict]\n", + " For further information visit https://errors.pydantic.dev/2.11/v/missing\n", + "action.ActionDoubleClick.y\n", + " Field required [type=missing, input_value={'keys': 'Tab', 'type': 'keypress'}, input_type=dict]\n", + " For further information visit https://errors.pydantic.dev/2.11/v/missing\n", + "action.ActionDrag.path\n", + " Field required [type=missing, input_value={'keys': 'Tab', 'type': 'keypress'}, input_type=dict]\n", + " For further information visit https://errors.pydantic.dev/2.11/v/missing\n", + "action.ActionDrag.type\n", + " Input should be 'drag' [type=literal_error, input_value='keypress', input_type=str]\n", + " For further information visit https://errors.pydantic.dev/2.11/v/literal_error\n", + "action.ActionKeypress.keys\n", + " Input should be a valid list [type=list_type, input_value='Tab', input_type=str]\n", + " For further information visit https://errors.pydantic.dev/2.11/v/list_type\n", + "action.ActionMove.type\n", + " Input should be 'move' [type=literal_error, input_value='keypress', input_type=str]\n", + " For further information visit https://errors.pydantic.dev/2.11/v/literal_error\n", + "action.ActionMove.x\n", + " Field required [type=missing, input_value={'keys': 'Tab', 'type': 'keypress'}, input_type=dict]\n", + " For further information visit https://errors.pydantic.dev/2.11/v/missing\n", + "action.ActionMove.y\n", + " Field required [type=missing, input_value={'keys': 'Tab', 'type': 'keypress'}, input_type=dict]\n", + " For further information visit https://errors.pydantic.dev/2.11/v/missing\n", + "action.ActionScreenshot.type\n", + " Input should be 'screenshot' [type=literal_error, input_value='keypress', input_type=str]\n", + " For further information visit https://errors.pydantic.dev/2.11/v/literal_error\n", + "action.ActionScroll.scroll_x\n", + " Field required [type=missing, input_value={'keys': 'Tab', 'type': 'keypress'}, input_type=dict]\n", + " For further information visit https://errors.pydantic.dev/2.11/v/missing\n", + "action.ActionScroll.scroll_y\n", + " Field required [type=missing, input_value={'keys': 'Tab', 'type': 'keypress'}, input_type=dict]\n", + " For further information visit https://errors.pydantic.dev/2.11/v/missing\n", + "action.ActionScroll.type\n", + " Input should be 'scroll' [type=literal_error, input_value='keypress', input_type=str]\n", + " For further information visit https://errors.pydantic.dev/2.11/v/literal_error\n", + "action.ActionScroll.x\n", + " Field required [type=missing, input_value={'keys': 'Tab', 'type': 'keypress'}, input_type=dict]\n", + " For further information visit https://errors.pydantic.dev/2.11/v/missing\n", + "action.ActionScroll.y\n", + " Field required [type=missing, input_value={'keys': 'Tab', 'type': 'keypress'}, input_type=dict]\n", + " For further information visit https://errors.pydantic.dev/2.11/v/missing\n", + "action.ActionType.text\n", + " Field required [type=missing, input_value={'keys': 'Tab', 'type': 'keypress'}, input_type=dict]\n", + " For further information visit https://errors.pydantic.dev/2.11/v/missing\n", + "action.ActionType.type\n", + " Input should be 'type' [type=literal_error, input_value='keypress', input_type=str]\n", + " For further information visit https://errors.pydantic.dev/2.11/v/literal_error\n", + "action.ActionWait.type\n", + " Input should be 'wait' [type=literal_error, input_value='keypress', input_type=str]\n", + " For further information visit https://errors.pydantic.dev/2.11/v/literal_error\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Step failed: litellm.InternalServerError: InternalServerError: OpenAIException - {\n", + " \"error\": {\n", + " \"message\": \"An error occurred while processing your request. You can retry your request, or contact us through our help center at help.openai.com if the error persists. Please include the request ID req_f7a393a984e7e85dc5845aef8a9471e4 in your message.\",\n", + " \"type\": \"model_error\",\n", + " \"param\": null,\n", + " \"code\": null\n", + " }\n", + "}\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Step failed: litellm.InternalServerError: AnthropicError - {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"},\"request_id\":null}\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Client is not running, cannot disconnect\n", + "Client is not running, cannot disconnect\n", + "Client is not running, cannot disconnect\n", + "Client is not running, cannot disconnect\n", + "Client is not running, cannot disconnect\n", + "Client is not running, cannot disconnect\n", + "Client is not running, cannot disconnect\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n", + "Error parsing JSON response\n", + "Traceback (most recent call last):\n", + " File \"c:\\Users\\dillo\\miniconda3\\envs\\cua\\Lib\\site-packages\\mcp\\client\\streamable_http.py\", line 310, in _handle_json_response\n", + " await read_stream_writer.send(session_message)\n", + " File \"c:\\Users\\dillo\\miniconda3\\envs\\cua\\Lib\\site-packages\\anyio\\streams\\memory.py\", line 242, in send\n", + " self.send_nowait(item)\n", + " File \"c:\\Users\\dillo\\miniconda3\\envs\\cua\\Lib\\site-packages\\anyio\\streams\\memory.py\", line 211, in send_nowait\n", + " raise ClosedResourceError\n", + "anyio.ClosedResourceError\n", + "Request handler error: \n", + "Failed to close auto-created client: \n", + "Error parsing JSON response\n", + "Traceback (most recent call last):\n", + " File \"c:\\Users\\dillo\\miniconda3\\envs\\cua\\Lib\\site-packages\\mcp\\client\\streamable_http.py\", line 310, in _handle_json_response\n", + " await read_stream_writer.send(session_message)\n", + " File \"c:\\Users\\dillo\\miniconda3\\envs\\cua\\Lib\\site-packages\\anyio\\streams\\memory.py\", line 242, in send\n", + " self.send_nowait(item)\n", + " File \"c:\\Users\\dillo\\miniconda3\\envs\\cua\\Lib\\site-packages\\anyio\\streams\\memory.py\", line 211, in send_nowait\n", + " raise ClosedResourceError\n", + "anyio.ClosedResourceError\n", + "Request handler error: \n", + "Failed to close auto-created client: \n", + "Tool evaluate has an output schema but did not return structured content. Continuing without structured content validation.\n" + ] + }, + { + "ename": "CancelledError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mCancelledError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[7], line 14\u001b[0m\n\u001b[0;32m 11\u001b[0m job_uuid \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mstr\u001b[39m(uuid\u001b[38;5;241m.\u001b[39muuid4())[:\u001b[38;5;241m6\u001b[39m]\n\u001b[0;32m 12\u001b[0m job_name \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mosworld \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mjob_uuid\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmodel\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m---> 14\u001b[0m results \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m run_full_dataset(\n\u001b[0;32m 15\u001b[0m dataset\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhud-evals/OSWorld-Verified-XLang\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 16\u001b[0m job_name\u001b[38;5;241m=\u001b[39mjob_name, \n\u001b[0;32m 17\u001b[0m model\u001b[38;5;241m=\u001b[39mmodel,\n\u001b[0;32m 18\u001b[0m max_concurrent\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m30\u001b[39m, \n\u001b[0;32m 19\u001b[0m max_steps\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m75\u001b[39m,\n\u001b[0;32m 20\u001b[0m trajectory_dir\u001b[38;5;241m=\u001b[39m\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtrajectories/osworld_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mjob_uuid\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 21\u001b[0m )\n\u001b[0;32m 23\u001b[0m \u001b[38;5;66;03m# results is a list from hud.datasets.run_dataset; inspect/aggregate as needed\u001b[39;00m\n\u001b[0;32m 24\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mJob: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mjob_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[1;32mF:\\Projects\\cua\\cua-clean\\libs\\python\\agent\\agent\\integrations\\hud\\__init__.py:134\u001b[0m, in \u001b[0;36mrun_full_dataset\u001b[1;34m(dataset, job_name, model, allowed_tools, max_concurrent, max_steps, split, trajectory_dir)\u001b[0m\n\u001b[0;32m 131\u001b[0m job_name \u001b[38;5;241m=\u001b[39m job_name \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mEvaluation \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtime\u001b[38;5;241m.\u001b[39mstrftime(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124mH:\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124mM \u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124mY-\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124mm-\u001b[39m\u001b[38;5;132;01m%d\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 133\u001b[0m \u001b[38;5;66;03m# Execute evaluation\u001b[39;00m\n\u001b[1;32m--> 134\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m run_dataset(\n\u001b[0;32m 135\u001b[0m name\u001b[38;5;241m=\u001b[39mjob_name,\n\u001b[0;32m 136\u001b[0m dataset\u001b[38;5;241m=\u001b[39mdataset,\n\u001b[0;32m 137\u001b[0m agent_class\u001b[38;5;241m=\u001b[39mProxyOperatorAgent,\n\u001b[0;32m 138\u001b[0m agent_config\u001b[38;5;241m=\u001b[39m{\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmodel\u001b[39m\u001b[38;5;124m\"\u001b[39m: model, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mallowed_tools\u001b[39m\u001b[38;5;124m\"\u001b[39m: allowed_tools, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtrajectory_dir\u001b[39m\u001b[38;5;124m\"\u001b[39m: trajectory_dir},\n\u001b[0;32m 139\u001b[0m max_concurrent\u001b[38;5;241m=\u001b[39mmax_concurrent,\n\u001b[0;32m 140\u001b[0m metadata\u001b[38;5;241m=\u001b[39m{\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdataset\u001b[39m\u001b[38;5;124m\"\u001b[39m: dataset_name},\n\u001b[0;32m 141\u001b[0m max_steps\u001b[38;5;241m=\u001b[39mmax_steps,\n\u001b[0;32m 142\u001b[0m auto_respond\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[0;32m 143\u001b[0m )\n", + "File \u001b[1;32mc:\\Users\\dillo\\miniconda3\\envs\\cua\\Lib\\site-packages\\hud\\datasets.py:262\u001b[0m, in \u001b[0;36mrun_dataset\u001b[1;34m(name, dataset, agent_class, agent_config, max_concurrent, metadata, max_steps, split, auto_respond, custom_system_prompt)\u001b[0m\n\u001b[0;32m 259\u001b[0m results[index] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m agent\u001b[38;5;241m.\u001b[39mrun(task, max_steps\u001b[38;5;241m=\u001b[39mmax_steps)\n\u001b[0;32m 261\u001b[0m \u001b[38;5;66;03m# Execute all tasks\u001b[39;00m\n\u001b[1;32m--> 262\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m asyncio\u001b[38;5;241m.\u001b[39mgather(\n\u001b[0;32m 263\u001b[0m \u001b[38;5;241m*\u001b[39m[_worker(i, task, max_steps\u001b[38;5;241m=\u001b[39mmax_steps) \u001b[38;5;28;01mfor\u001b[39;00m i, task \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(dataset)],\n\u001b[0;32m 264\u001b[0m return_exceptions\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, \u001b[38;5;66;03m# Don't fail entire batch on one error\u001b[39;00m\n\u001b[0;32m 265\u001b[0m )\n\u001b[0;32m 267\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m results\n", + "File \u001b[1;32mc:\\Users\\dillo\\miniconda3\\envs\\cua\\Lib\\site-packages\\hud\\datasets.py:246\u001b[0m, in \u001b[0;36mrun_dataset.._worker\u001b[1;34m(index, task_dict, max_steps)\u001b[0m\n\u001b[0;32m 245\u001b[0m \u001b[38;5;28;01masync\u001b[39;00m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21m_worker\u001b[39m(index: \u001b[38;5;28mint\u001b[39m, task_dict: Any, max_steps: \u001b[38;5;28mint\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m40\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m--> 246\u001b[0m \u001b[38;5;28;01masync\u001b[39;00m \u001b[38;5;28;01mwith\u001b[39;00m sem:\n\u001b[0;32m 247\u001b[0m \u001b[38;5;66;03m# Create trace for this task\u001b[39;00m\n\u001b[0;32m 248\u001b[0m task_name \u001b[38;5;241m=\u001b[39m task_dict\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mprompt\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mTask \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mindex\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 249\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msystem_prompt\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m task_dict:\n", + "File \u001b[1;32mc:\\Users\\dillo\\miniconda3\\envs\\cua\\Lib\\asyncio\\locks.py:14\u001b[0m, in \u001b[0;36m_ContextManagerMixin.__aenter__\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 13\u001b[0m \u001b[38;5;28;01masync\u001b[39;00m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21m__aenter__\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m---> 14\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39macquire()\n\u001b[0;32m 15\u001b[0m \u001b[38;5;66;03m# We have no use for the \"as ...\" clause in the with\u001b[39;00m\n\u001b[0;32m 16\u001b[0m \u001b[38;5;66;03m# statement for locks.\u001b[39;00m\n\u001b[0;32m 17\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n", + "File \u001b[1;32mc:\\Users\\dillo\\miniconda3\\envs\\cua\\Lib\\asyncio\\locks.py:386\u001b[0m, in \u001b[0;36mSemaphore.acquire\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 384\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 385\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m--> 386\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m fut\n\u001b[0;32m 387\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[0;32m 388\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_waiters\u001b[38;5;241m.\u001b[39mremove(fut)\n", + "\u001b[1;31mCancelledError\u001b[0m: " + ] + } + ], "source": [ "import uuid\n", "from agent.integrations.hud import run_full_dataset\n", "\n", "models_to_test = [\n", - " \"openai/computer-use-preview+anthropic/claude-opus-4-1-20250805\",\n", - " \"anthropic/claude-opus-4-1-20250805+openai/computer-use-preview\",\n", + " \"openai/computer-use-preview+anthropic/claude-opus-4-20250514\",\n", "]\n", - "\n", + " \n", "\n", "for model in models_to_test:\n", " # Full dataset evaluation (runs via HUD's run_dataset under the hood)\n", @@ -189,13 +513,13 @@ " model=model,\n", " max_concurrent=20, \n", " max_steps=75,\n", - " trajectory_dir=f\"trajectories/osworld_{job_uuid}\"\n", + " trajectory_dir=f\"trajectories/osworld_{job_uuid}\",\n", + " only_n_most_recent_images=3\n", " )\n", "\n", " # results is a list from hud.datasets.run_dataset; inspect/aggregate as needed\n", " print(f\"Job: {job_name}\")\n", - " print(f\"Total results: {len(results)}\")\n", - " pprint(results[:3]) # preview" + " print(f\"Total results: {len(results)}\")" ] } ],