Add example notebook

This commit is contained in:
Dillon DuPont
2025-08-08 13:14:56 -04:00
parent 1882fb68e5
commit f819c578b7
+320
View File
@@ -0,0 +1,320 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ComputerAgent HUD Integration for OSWorld\n",
"\n",
"This notebook demonstrates how to use the ComputerAgent with HUD for OSWorld benchmarking.\n",
"The ComputerAgent integration provides the same interface as HUD's ClaudeAgent and OperatorAgent, and a single class works with both Claude and OpenAI models."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Install dependencies if needed.\n",
"# The commands below are intentionally left commented out.\n",
"# NOTE(review): each `!` line runs in its own shell process, so\n",
"# `!source .venv/bin/activate` would not activate the virtual environment for\n",
"# this kernel; presumably these are meant to be run from a terminal before\n",
"# Jupyter is started - TODO confirm.\n",
"# !uv venv \n",
"# !source .venv/bin/activate\n",
"# !uv sync"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# Required environment variables:\n",
"# - HUD_API_KEY (for HUD access)\n",
"# - ANTHROPIC_API_KEY (for Claude models)\n",
"# - OPENAI_API_KEY (for OpenAI models)\n",
"\n",
"import asyncio\n",
"from pprint import pprint\n",
"\n",
"from hud import gym, load_taskset\n",
"from hud.agent import ClaudeAgent, OperatorAgent"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/dillondupont/cua-clean/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"# Import the HUD-integrated ComputerAgent\n",
"from agent.integrations.hud import ComputerAgent"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total tasks in OSWorld: 367\n",
"Task prompt: Make the background color of slide 2 same as the color of its title.\n"
]
}
],
"source": [
"# Index of the demo task inside the taskset; change it to try a different task.\n",
"TASK_INDEX = 144\n",
"\n",
"# Load the OSWorld-Verified taskset and report how many tasks it contains\n",
"taskset = await load_taskset(\"OSWorld-Verified\")\n",
"print(f\"Total tasks in OSWorld: {len(taskset)}\")\n",
"\n",
"# Select the task used by the rest of the notebook and show its prompt\n",
"test = taskset[TASK_INDEX]\n",
"print(f\"Task prompt: {test.prompt}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create the environment for the selected task (takes ~2.5 minutes to start).\n",
"# gym.make raises GymMakeException if the environment cannot be created;\n",
"# in that case re-run this cell before continuing with the cells below.\n",
"env = await gym.make(test)\n",
"print(\"Environment ready!\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test with Claude Model\n",
"\n",
"The ComputerAgent can use Claude models just like the original ClaudeAgent:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Instantiate the HUD-integrated ComputerAgent backed by a Claude model.\n",
"# The environment is set to \"linux\" because OSWorld typically uses Linux.\n",
"claude_agent = ComputerAgent(model=\"anthropic/claude-3-5-sonnet-20241022\", environment=\"linux\")\n",
"\n",
"# Show the agent's name to confirm it was constructed\n",
"print(f\"Created Claude agent: {claude_agent.name}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Upper bound on agent/environment interactions for this demo run\n",
"MAX_STEPS = 8\n",
"\n",
"# Reset the environment to obtain the initial observation\n",
"obs, _ = await env.reset()\n",
"print(\"Initial observation complete\")\n",
"\n",
"# Agent loop with Claude: predict an action, apply it, stop once finished\n",
"for step in range(1, MAX_STEPS + 1):\n",
"    print(f\"========= Step {step} ==========\")\n",
"\n",
"    try:\n",
"        action, done = await claude_agent.predict(obs)\n",
"        print(f\"Agent's action: {action}\")\n",
"\n",
"        obs, reward, terminated, info = await env.step(action)\n",
"    except Exception as e:\n",
"        # Include the exception type: str(e) can be empty for some exceptions\n",
"        print(f\"Error in step {step}: {type(e).__name__}: {e}\")\n",
"        break\n",
"\n",
"    if done or terminated:\n",
"        print(f\"Task completed after {step} steps\")\n",
"        break\n",
"else:\n",
"    # Only reached when the loop was never broken out of (no completion, no error)\n",
"    print(f\"Step limit ({MAX_STEPS}) reached without the task completing\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test with OpenAI Model\n",
"\n",
"The same ComputerAgent can also use OpenAI models:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create ComputerAgent with OpenAI.\n",
"# No env.reset() here: the agent-loop cell that follows resets the environment\n",
"# itself to obtain the initial observation, so resetting in this cell as well\n",
"# would just repeat the same call back-to-back.\n",
"openai_agent = ComputerAgent(\n",
"    model=\"openai/computer-use-preview\",\n",
"    environment=\"linux\",\n",
")\n",
"\n",
"print(f\"Created OpenAI agent: {openai_agent.name}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Upper bound on agent/environment interactions for this demo run\n",
"MAX_STEPS = 8\n",
"\n",
"# Reset the environment to obtain the initial observation\n",
"obs, _ = await env.reset()\n",
"print(\"Initial observation complete\")\n",
"\n",
"# Agent loop with OpenAI: predict an action, apply it, stop once finished\n",
"for step in range(1, MAX_STEPS + 1):\n",
"    print(f\"========= Step {step} ==========\")\n",
"\n",
"    try:\n",
"        action, done = await openai_agent.predict(obs)\n",
"        print(f\"Agent's action: {action}\")\n",
"\n",
"        obs, reward, terminated, info = await env.step(action)\n",
"    except Exception as e:\n",
"        # Include the exception type: str(e) can be empty for some exceptions\n",
"        print(f\"Error in step {step}: {type(e).__name__}: {e}\")\n",
"        break\n",
"\n",
"    if done or terminated:\n",
"        print(f\"Task completed after {step} steps\")\n",
"        break\n",
"else:\n",
"    # Only reached when the loop was never broken out of (no completion, no error)\n",
"    print(f\"Step limit ({MAX_STEPS}) reached without the task completing\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Evaluate Results"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Evaluate environment state and pretty-print whatever env.evaluate() returns.\n",
"# NOTE(review): this runs once, after both agent loops, and the OpenAI loop\n",
"# resets the environment first - so presumably only the most recent run is\n",
"# reflected in the result; confirm if a per-agent score is wanted.\n",
"result = await env.evaluate()\n",
"print(\"=== Final Evaluation ===\")\n",
"pprint(result)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Clean up: close the environment created by gym.make earlier in the notebook.\n",
"# After this cell, `env` should be treated as unusable; create a new one with\n",
"# gym.make before re-running the agent cells.\n",
"await env.close()\n",
"print(\"Environment closed\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Comparison with Original Agents\n",
"\n",
"The ComputerAgent provides the same interface as ClaudeAgent and OperatorAgent:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Compare with original HUD agents.\n",
"# ClaudeAgent and OperatorAgent come from hud.agent and are imported in the\n",
"# imports cell at the top of the notebook, alongside the other dependencies.\n",
"\n",
"# Original agents\n",
"original_claude = ClaudeAgent()\n",
"original_operator = OperatorAgent(environment=\"linux\")\n",
"\n",
"# ComputerAgent versions (one per model provider)\n",
"computer_claude = ComputerAgent(model=\"anthropic/claude-3-5-sonnet-20241022\", environment=\"linux\")\n",
"computer_openai = ComputerAgent(model=\"openai/computer-use-preview\", environment=\"linux\")\n",
"\n",
"# Print each agent's name so the two families can be compared side by side\n",
"print(\"Original agents:\")\n",
"print(f\"  ClaudeAgent: {original_claude.name}\")\n",
"print(f\"  OperatorAgent: {original_operator.name}\")\n",
"print(\"\\nComputerAgent versions:\")\n",
"print(f\"  ComputerAgent (Claude): {computer_claude.name}\")\n",
"print(f\"  ComputerAgent (OpenAI): {computer_openai.name}\")\n",
"\n",
"print(\"\\nAll agents have the same interface and can be used interchangeably!\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.11"
}
},
"nbformat": 4,
"nbformat_minor": 4
}