diff --git a/libs/python/agent/agent/integrations/hud/agent.py b/libs/python/agent/agent/integrations/hud/agent.py index 4155d736..b30d0b6d 100644 --- a/libs/python/agent/agent/integrations/hud/agent.py +++ b/libs/python/agent/agent/integrations/hud/agent.py @@ -122,9 +122,17 @@ class FakeAsyncOpenAI: prev_blocks = [self.blocks_cache[b_id] for b_id in prev_block_ids] full_input = _to_plain_dict_list(prev_blocks + input) + # Pre-pend instructions message + effective_input = full_input + if instructions: + effective_input = [{ + "role": "user", + "content": instructions, + }] + full_input + # Run a single iteration of the ComputerAgent agent_result: Optional[Dict[str, Any]] = None - async for result in self.agent.run(full_input): # type: ignore[arg-type] + async for result in self.agent.run(effective_input): # type: ignore[arg-type] agent_result = result break assert agent_result is not None, "Agent failed to produce result" diff --git a/notebooks/eval_osworld.ipynb b/notebooks/eval_osworld.ipynb index 6d521eff..afbb1716 100644 --- a/notebooks/eval_osworld.ipynb +++ b/notebooks/eval_osworld.ipynb @@ -57,9 +57,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\dillo\\miniconda3\\envs\\cua\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + }, { "name": "stdout", "output_type": "stream", @@ -68,18 +76,20 @@ "\u001b[90m╔═════════════════════════════════════════════════════════════════╗\u001b[0m\n", "\u001b[90m║\u001b[0m 🚀 See your agent live at: \u001b[90m║\u001b[0m\n", "\u001b[90m╟─────────────────────────────────────────────────────────────────╢\u001b[0m\n", - "\u001b[90m║\u001b[0m \u001b[1m\u001b[33mhttps://app.hud.so/trace/7fa015b7-75b9-45df-accf-66b1d8895fe9\u001b[0m \u001b[90m║\u001b[0m\n", + "\u001b[90m║\u001b[0m \u001b[1m\u001b[33mhttps://app.hud.so/trace/426ed182-564d-4b12-b950-c551caeeb8a8\u001b[0m \u001b[90m║\u001b[0m\n", "\u001b[90m╚═════════════════════════════════════════════════════════════════╝\u001b[0m\n", "\n", - "Running: Calculate from the RawData tab the z-scores from the mean close price for each row. Return, starting in ANSWER!A1 and descending to ANSWER!A5, the 5 dates with the greatest absolute value of standard deviations from the mean\n" + "Running: Can you make my computer bring back the last tab I shut down?\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2025-08-27 13:22:49,324 - agent.ComputerAgent - INFO - LLM processing started with 1 messages\n", - "Step failed: 'dict' object has no attribute 'id'\n" + "2025-08-27 13:36:03,660 - agent.ComputerAgent - INFO - LLM processing started with 2 messages\n", + "2025-08-27 13:36:21,971 - agent.ComputerAgent - INFO - LLM processing started with 6 messages\n", + "Tool execution failed: Tool evaluate has an output schema but did not return structured content\n", + "Evaluation phase failed: [MCPToolResult(meta=None, content=[TextContent(type='text', text='Tool evaluate has an output schema but did not return structured content', annotations=None, meta=None)], structuredContent=None, isError=True)]\n" ] }, { @@ -88,7 +98,7 @@ "text": [ "✅ Reward: 0.0\n", "\n", - "\u001b[92m✓ Trace complete!\u001b[0m \u001b[2mView at:\u001b[0m \u001b[1m\u001b[33mhttps://app.hud.so/trace/7fa015b7-75b9-45df-accf-66b1d8895fe9\u001b[0m\n", + "\u001b[92m✓ Trace complete!\u001b[0m \u001b[2mView at:\u001b[0m \u001b[1m\u001b[33mhttps://app.hud.so/trace/426ed182-564d-4b12-b950-c551caeeb8a8\u001b[0m\n", "\n" ] }