diff --git a/notebooks/eval_osworld.ipynb b/notebooks/eval_osworld.ipynb index beccbb1e..6d521eff 100644 --- a/notebooks/eval_osworld.ipynb +++ b/notebooks/eval_osworld.ipynb @@ -57,17 +57,9 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "c:\\Users\\dillo\\miniconda3\\envs\\cua\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - }, { "name": "stdout", "output_type": "stream", @@ -76,7 +68,7 @@ "\u001b[90m╔═════════════════════════════════════════════════════════════════╗\u001b[0m\n", "\u001b[90m║\u001b[0m 🚀 See your agent live at: \u001b[90m║\u001b[0m\n", "\u001b[90m╟─────────────────────────────────────────────────────────────────╢\u001b[0m\n", - "\u001b[90m║\u001b[0m \u001b[1m\u001b[33mhttps://app.hud.so/trace/60b2956b-8520-4225-93d3-22ea2ff0253b\u001b[0m \u001b[90m║\u001b[0m\n", + "\u001b[90m║\u001b[0m \u001b[1m\u001b[33mhttps://app.hud.so/trace/7fa015b7-75b9-45df-accf-66b1d8895fe9\u001b[0m \u001b[90m║\u001b[0m\n", "\u001b[90m╚═════════════════════════════════════════════════════════════════╝\u001b[0m\n", "\n", "Running: Calculate from the RawData tab the z-scores from the mean close price for each row. Return, starting in ANSWER!A1 and descending to ANSWER!A5, the 5 dates with the greatest absolute value of standard deviations from the mean\n" @@ -86,7 +78,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "2025-08-27 13:10:52,780 - agent.ComputerAgent - INFO - LLM processing started with 1 messages\n", + "2025-08-27 13:22:49,324 - agent.ComputerAgent - INFO - LLM processing started with 1 messages\n", "Step failed: 'dict' object has no attribute 'id'\n" ] }, @@ -96,7 +88,7 @@ "text": [ "✅ Reward: 0.0\n", "\n", - "\u001b[92m✓ Trace complete!\u001b[0m \u001b[2mView at:\u001b[0m \u001b[1m\u001b[33mhttps://app.hud.so/trace/60b2956b-8520-4225-93d3-22ea2ff0253b\u001b[0m\n", + "\u001b[92m✓ Trace complete!\u001b[0m \u001b[2mView at:\u001b[0m \u001b[1m\u001b[33mhttps://app.hud.so/trace/7fa015b7-75b9-45df-accf-66b1d8895fe9\u001b[0m\n", "\n" ] } @@ -107,8 +99,9 @@ "# Quick single-task smoke test on OSWorld-Verified-XLang\n", "# You can swap \"hud-evals/OSWorld-Verified-XLang\" -> \"hud-evals/SheetBench-V2\" to test SheetBench.\n", "await run_single_task(\n", - " dataset=\"hud-evals/SheetBench-50\",\n", + " dataset=\"hud-evals/OSWorld-Verified-XLang\",\n", " model=\"openai/computer-use-preview\", # or any supported model string\n", + " task_id=155 # open last tab task (easy)\n", ")" ] },