{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ComputerAgent HUD Integration for OSWorld\n",
"\n",
"This notebook demonstrates how to use the ComputerAgent with HUD for OSWorld benchmarking.\n",
"The ComputerAgent integration provides the same interface as OperatorAgent but works with both Claude and OpenAI models."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# # Install dependencies if needed\n",
"# !uv venv \n",
"# !source .venv/bin/activate\n",
"# !uv sync"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Required environment variables:\n",
"# - HUD_API_KEY (for HUD access)\n",
"# - ANTHROPIC_API_KEY (for Claude models)\n",
"# - OPENAI_API_KEY (for OpenAI models)\n",
"\n",
"from pprint import pprint"
]
},
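{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional: sanity-check that the required API keys are set before launching any tasks.\n",
"import os\n",
"\n",
"for key in (\"HUD_API_KEY\", \"ANTHROPIC_API_KEY\", \"OPENAI_API_KEY\"):\n",
"    print(f\"{key}: {'set' if os.environ.get(key) else 'MISSING'}\")"
]
},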
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Quick single-task smoke test on OSWorld-Verified\n",
"\n",
"The ComputerAgent integration can use Claude, OpenAI, UI-TARS, or composed models just like the original ComputerAgent:"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\dillo\\miniconda3\\envs\\cua\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\u001b[90m╔═════════════════════════════════════════════════════════════════╗\u001b[0m\n",
"\u001b[90m║\u001b[0m 🚀 See your agent live at: \u001b[90m║\u001b[0m\n",
"\u001b[90m╟─────────────────────────────────────────────────────────────────╢\u001b[0m\n",
"\u001b[90m║\u001b[0m \u001b[1m\u001b[33mhttps://app.hud.so/trace/cbe6f71b-f520-4630-9f27-778647070327\u001b[0m \u001b[90m║\u001b[0m\n",
"\u001b[90m╚═════════════════════════════════════════════════════════════════╝\u001b[0m\n",
"\n",
"Running: Can you make my computer bring back the last tab I shut down?\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2025-08-27 16:17:53,047 - agent.ComputerAgent - INFO - LLM processing started with 2 messages\n",
"2025-08-27 16:18:02,697 - agent.ComputerAgent - INFO - LLM processing started with 5 messages\n",
"2025-08-27 16:18:15,887 - agent.ComputerAgent - INFO - LLM processing started with 7 messages\n",
"2025-08-27 16:18:28,541 - agent.ComputerAgent - INFO - LLM processing started with 9 messages\n",
"2025-08-27 16:18:42,176 - agent.ComputerAgent - INFO - LLM processing started with 11 messages\n",
"2025-08-27 16:18:55,937 - agent.ComputerAgent - INFO - LLM processing started with 13 messages\n",
"2025-08-27 16:19:11,654 - agent.ComputerAgent - INFO - LLM processing started with 15 messages\n",
"2025-08-27 16:19:23,839 - agent.ComputerAgent - INFO - LLM processing started with 17 messages\n",
"2025-08-27 16:19:39,065 - agent.ComputerAgent - INFO - LLM processing started with 19 messages\n",
"Tool execution failed: Tool evaluate has an output schema but did not return structured content\n",
"Evaluation phase failed: [MCPToolResult(meta=None, content=[TextContent(type='text', text='Tool evaluate has an output schema but did not return structured content', annotations=None, meta=None)], structuredContent=None, isError=True)]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"✅ Reward: 0.0\n",
"\n",
"\u001b[92m✓ Trace complete!\u001b[0m \u001b[2mView at:\u001b[0m \u001b[1m\u001b[33mhttps://app.hud.so/trace/cbe6f71b-f520-4630-9f27-778647070327\u001b[0m\n",
"\n"
]
}
],
"source": [
"from agent.integrations.hud import run_single_task\n",
"\n",
"# Quick single-task smoke test on OSWorld-Verified-XLang\n",
"# You can swap \"hud-evals/OSWorld-Verified-XLang\" -> \"hud-evals/SheetBench-V2\" to test SheetBench.\n",
"await run_single_task(\n",
" dataset=\"hud-evals/OSWorld-Verified-XLang\",\n",
" model=\"openai/computer-use-preview+openai/gpt-5-nano\", # or any supported model string\n",
" task_id=155 # open last tab task (easy)\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Run OSWorld-Verified in parallel"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import uuid\n",
"from agent.integrations.hud import run_full_dataset\n",
"\n",
"# Full dataset evaluation (runs via HUD's run_dataset under the hood)\n",
"job_name = f\"osworld-test-{str(uuid.uuid4())[:4]}\"\n",
"\n",
"results = await run_full_dataset(\n",
" dataset=\"hud-evals/OSWorld-Verified-XLang\", # You can also pass a Dataset or a list[dict]\n",
" job_name=job_name, # Optional; defaults to a timestamp for custom datasets\n",
" model=\"openai/computer-use-preview\", # Or any supported model string\n",
" max_concurrent=20, # Tune to your infra\n",
" max_steps=50, # Safety cap per task\n",
" split=\"train[:3]\" # Limit to just 3 tasks\n",
")\n",
"\n",
"# results is a list from hud.datasets.run_dataset; inspect/aggregate as needed\n",
"print(f\"Job: {job_name}\")\n",
"print(f\"Total results: {len(results)}\")\n",
"pprint(results[:3]) # preview"
]
},
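{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Rough aggregation sketch: the exact result schema is defined by hud.datasets.run_dataset,\n",
"# so the `reward` attribute below is an assumption -- inspect results[0] and adjust if needed.\n",
"rewards = [getattr(r, \"reward\", None) for r in results]\n",
"scored = [r for r in rewards if r is not None]\n",
"if scored:\n",
"    print(f\"Mean reward over {len(scored)} scored tasks: {sum(scored) / len(scored):.3f}\")\n",
"else:\n",
"    print(\"No `reward` attribute found; inspect results[0] for the actual schema.\")"
]
},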
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Benchmark Composed Agents"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import uuid\n",
"from agent.integrations.hud import run_full_dataset\n",
"\n",
"models_to_test = [\n",
" \"openai/computer-use-preview+anthropic/claude-opus-4-1-20250805\",\n",
" \"anthropic/claude-opus-4-1-20250805+openai/computer-use-preview\",\n",
"]\n",
"\n",
"\n",
"for model in models_to_test:\n",
" # Full dataset evaluation (runs via HUD's run_dataset under the hood)\n",
" job_uuid = str(uuid.uuid4())[:6]\n",
" job_name = f\"osworld {job_uuid} {model}\"\n",
"\n",
" results = await run_full_dataset(\n",
" dataset=\"hud-evals/OSWorld-Verified-XLang\",\n",
" job_name=job_name, \n",
" model=model,\n",
" max_concurrent=20, \n",
" max_steps=75,\n",
" trajectory_dir=f\"trajectories/osworld_{job_uuid}\"\n",
" )\n",
"\n",
" # results is a list from hud.datasets.run_dataset; inspect/aggregate as needed\n",
" print(f\"Job: {job_name}\")\n",
" print(f\"Total results: {len(results)}\")\n",
" pprint(results[:3]) # preview"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "cua",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.11"
}
},
"nbformat": 4,
"nbformat_minor": 4
}