added simple task id

2026-01-05 21:09:58 -06:00 · 2025-08-27 13:28:24 -04:00
parent 61a442da56
commit 3c502354a8
1 changed files with 6 additions and 13 deletions
--- a/notebooks/eval_osworld.ipynb
+++ b/notebooks/eval_osworld.ipynb
@@ -57,17 +57,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
   "metadata": {},
   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "c:\\Users\\dillo\\miniconda3\\envs\\cua\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
-      "  from .autonotebook import tqdm as notebook_tqdm\n"
-     ]
-    },
    {
     "name": "stdout",
     "output_type": "stream",
@@ -76,7 +68,7 @@
      "\u001b[90m╔═════════════════════════════════════════════════════════════════╗\u001b[0m\n",
      "\u001b[90m║\u001b[0m                    🚀 See your agent live at:                   \u001b[90m║\u001b[0m\n",
      "\u001b[90m╟─────────────────────────────────────────────────────────────────╢\u001b[0m\n",
-      "\u001b[90m║\u001b[0m  \u001b[1m\u001b[33mhttps://app.hud.so/trace/60b2956b-8520-4225-93d3-22ea2ff0253b\u001b[0m  \u001b[90m║\u001b[0m\n",
+      "\u001b[90m║\u001b[0m  \u001b[1m\u001b[33mhttps://app.hud.so/trace/7fa015b7-75b9-45df-accf-66b1d8895fe9\u001b[0m  \u001b[90m║\u001b[0m\n",
      "\u001b[90m╚═════════════════════════════════════════════════════════════════╝\u001b[0m\n",
      "\n",
      "Running: Calculate from the RawData tab the z-scores from the mean close price for each row. Return, starting in ANSWER!A1 and descending to ANSWER!A5, the 5 dates with the greatest absolute value of standard deviations from the mean\n"
@@ -86,7 +78,7 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-      "2025-08-27 13:10:52,780 - agent.ComputerAgent - INFO - LLM processing started with 1 messages\n",
+      "2025-08-27 13:22:49,324 - agent.ComputerAgent - INFO - LLM processing started with 1 messages\n",
      "Step failed: 'dict' object has no attribute 'id'\n"
     ]
    },
@@ -96,7 +88,7 @@
     "text": [
      "✅ Reward: 0.0\n",
      "\n",
-      "\u001b[92m✓ Trace complete!\u001b[0m \u001b[2mView at:\u001b[0m \u001b[1m\u001b[33mhttps://app.hud.so/trace/60b2956b-8520-4225-93d3-22ea2ff0253b\u001b[0m\n",
+      "\u001b[92m✓ Trace complete!\u001b[0m \u001b[2mView at:\u001b[0m \u001b[1m\u001b[33mhttps://app.hud.so/trace/7fa015b7-75b9-45df-accf-66b1d8895fe9\u001b[0m\n",
      "\n"
     ]
    }
@@ -107,8 +99,9 @@
    "# Quick single-task smoke test on OSWorld-Verified-XLang\n",
    "# You can swap \"hud-evals/OSWorld-Verified-XLang\" -> \"hud-evals/SheetBench-V2\" to test SheetBench.\n",
    "await run_single_task(\n",
-    "    dataset=\"hud-evals/SheetBench-50\",\n",
+    "    dataset=\"hud-evals/OSWorld-Verified-XLang\",\n",
    "    model=\"openai/computer-use-preview\",  # or any supported model string\n",
+    "    task_id=155  # the \"open last tab\" task (easy)\n"
    ")"
   ]
  },