mirror of
https://github.com/trycua/computer.git
synced 2026-01-05 12:59:58 -06:00
181 lines
5.1 KiB
Plaintext
181 lines
5.1 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# ComputerAgent HUD Integration for OSWorld\n",
|
|
"\n",
|
|
"This notebook demonstrates how to use the ComputerAgent with HUD for OSWorld benchmarking.\n",
|
|
"The ComputerAgent integration provides the same interface as OperatorAgent but works with both Claude and OpenAI models."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
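"# Install dependencies if needed. Run these in a terminal: `!` commands execute in a throwaway subshell, so `!source .venv/bin/activate` would not affect the kernel.\n",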
|
|
"# !uv venv\n",
|
|
"# !source .venv/bin/activate\n",
|
|
"# !uv sync"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"%load_ext autoreload\n",
|
|
"%autoreload 2"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from dotenv import load_dotenv\n",
|
|
"import os\n",
|
|
"\n",
|
|
"# Load environment variables from ../.env\n",
|
|
"load_dotenv(dotenv_path=\"../.env\")\n",
|
|
"\n",
|
|
"# Required environment variables:\n",
|
|
"# - HUD_API_KEY (for HUD access)\n",
|
|
"# - ANTHROPIC_API_KEY (for Claude models)\n",
|
|
"# - OPENAI_API_KEY (for OpenAI models)\n",
|
|
"assert os.getenv(\"HUD_API_KEY\") is not None\n",
|
|
"assert os.getenv(\"ANTHROPIC_API_KEY\") is not None or os.getenv(\"OPENAI_API_KEY\") is not None\n",
|
|
"\n",
|
|
"from pprint import pprint"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Quick single-task smoke test on OSWorld-Verified\n",
|
|
"\n",
|
|
"The ComputerAgent integration can use Claude, OpenAI, UI-TARS, or composed models, just like the original ComputerAgent:"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from agent.integrations.hud import run_single_task\n",
|
|
"\n",
|
|
"# Quick single-task smoke test on OSWorld-Verified\n",
|
|
"# You can swap \"hud-evals/OSWorld-Verified\" -> \"hud-evals/SheetBench-V2\" to test SheetBench.\n",
|
|
"await run_single_task(\n",
|
|
" dataset=\"hud-evals/OSWorld-Verified\",\n",
|
|
" model=\"openai/computer-use-preview+openai/gpt-5\", # or any supported model string\n",
|
|
" task_id=155, # open last tab task (easy)\n",
|
|
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Run OSWorld-Verified in parallel"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import uuid\n",
|
|
"from agent.integrations.hud import run_full_dataset\n",
|
|
"\n",
|
|
"# Full dataset evaluation (runs via HUD's run_dataset under the hood)\n",
|
|
"job_name = f\"osworld-test-{str(uuid.uuid4())[:4]}\"\n",
|
|
"\n",
|
|
"results = await run_full_dataset(\n",
|
|
" dataset=\"hud-evals/OSWorld-Verified\", # You can also pass a Dataset or a list[dict]\n",
|
|
" job_name=job_name, # Optional; defaults to a timestamp for custom datasets\n",
|
|
" model=\"openai/computer-use-preview\", # Or any supported model string\n",
|
|
" max_concurrent=20, # Tune to your infra\n",
|
|
" max_steps=50, # Safety cap per task\n",
|
|
" split=\"train[:3]\", # Limit to just 3 tasks\n",
|
|
")\n",
|
|
"\n",
|
|
"# results is a list from hud.datasets.run_dataset; inspect/aggregate as needed\n",
|
|
"print(f\"Job: {job_name}\")\n",
|
|
"print(f\"Total results: {len(results)}\")\n",
|
|
"pprint(results[:3]) # preview"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Benchmark Composed Agents"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import uuid\n",
|
|
"from agent.integrations.hud import run_full_dataset\n",
|
|
"\n",
|
|
"models_to_test = [\n",
|
|
" \"openai/computer-use-preview+anthropic/claude-opus-4-20250514\",\n",
|
|
"]\n",
|
|
"\n",
|
|
"\n",
|
|
"for model in models_to_test:\n",
|
|
" # Full dataset evaluation (runs via HUD's run_dataset under the hood)\n",
|
|
" job_uuid = str(uuid.uuid4())[:6]\n",
|
|
" job_name = f\"osworld {job_uuid} {model}\"\n",
|
|
"\n",
|
|
" results = await run_full_dataset(\n",
|
|
" dataset=\"hud-evals/OSWorld-Verified\",\n",
|
|
" job_name=job_name,\n",
|
|
" model=model,\n",
|
|
" max_concurrent=20,\n",
|
|
" max_steps=75,\n",
|
|
" trajectory_dir=f\"trajectories/osworld_{job_uuid}\",\n",
|
|
" only_n_most_recent_images=3,\n",
|
|
" )\n",
|
|
"\n",
|
|
" # results is a list from hud.datasets.run_dataset; inspect/aggregate as needed\n",
|
|
" print(f\"Job: {job_name}\")\n",
|
|
" print(f\"Total results: {len(results)}\")"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": ".venv",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.13.6"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 4
|
|
}
|