mirror of
https://github.com/trycua/computer.git
synced 2026-04-29 19:52:35 -05:00
Add example notebook
This commit is contained in:
@@ -0,0 +1,320 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# ComputerAgent HUD Integration for OSWorld\n",
|
||||
"\n",
|
||||
"This notebook demonstrates how to use the ComputerAgent with HUD for OSWorld benchmarking.\n",
|
||||
"The ComputerAgent integration provides the same interface as HUD's ClaudeAgent and OperatorAgent, but a single agent class works with both Claude and OpenAI models."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
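"# Install dependencies if needed (uncomment the lines below to run them).\n",
"# Note: each `!` command runs in its own subshell, so `!source .venv/bin/activate`\n",
"# does not change the environment of this kernel; launch Jupyter from the activated venv instead.\n",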
|
||||
"# !uv venv \n",
|
||||
"# !source .venv/bin/activate\n",
|
||||
"# !uv sync"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Required environment variables:\n",
|
||||
"# - HUD_API_KEY (for HUD access)\n",
|
||||
"# - ANTHROPIC_API_KEY (for Claude models)\n",
|
||||
"# - OPENAI_API_KEY (for OpenAI models)\n",
|
||||
"\n",
|
||||
"from hud import gym, load_taskset\n",
|
||||
"from pprint import pprint\n",
|
||||
"import asyncio"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/Users/dillondupont/cua-clean/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
||||
" from .autonotebook import tqdm as notebook_tqdm\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Import the HUD-integrated ComputerAgent\n",
|
||||
"from agent.integrations.hud import ComputerAgent"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Total tasks in OSWorld: 367\n",
|
||||
"Task prompt: Make the background color of slide 2 same as the color of its title.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Load OSWorld taskset\n",
|
||||
"taskset = await load_taskset(\"OSWorld-Verified\")\n",
|
||||
"print(f\"Total tasks in OSWorld: {len(taskset)}\")\n",
|
||||
"\n",
|
||||
"# Select a test task\n",
|
||||
"test = taskset[144]\n",
|
||||
"print(f\"Task prompt: {test.prompt}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[ERROR] 2025-08-08 12:42:12,634 | hud.exceptions | HTTP error from HUD SDK: Request failed: Environment is in error state, cannot invoke functions | URL: https://orchestration.hud.so/hud-gym/api/v2/environments/525ea26c-096d-41bc-b968-54c62a7f1b9d/invoke | Status: 400 | Response: {\"detail\":\"Environment is in error state, cannot invoke functions\"}\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"ename": "GymMakeException",
|
||||
"evalue": "Failed to create environment | Data: {'gym_name': 'OSWorld-Ubuntu', 'environment_prompt': None, 'exception': 'Request failed: Environment is in error state, cannot invoke functions | Status: 400 | Response Text: {\"detail\":\"Environment is in error state, cannot invoke functions\"} | Response JSON: {\\'detail\\': \\'Environment is in error state, cannot invoke functions\\'} | Headers: {\\'content-length\\': \\'67\\', \\'content-type\\': \\'application/json\\', \\'date\\': \\'Fri, 08 Aug 2025 16:42:11 GMT\\', \\'server\\': \\'railway-edge\\', \\'x-railway-edge\\': \\'railway/us-east4\\', \\'x-railway-request-id\\': \\'cH9FJpMKQIGTcIome6l53A\\'}'}",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
||||
"\u001b[31mHudRequestError\u001b[39m Traceback (most recent call last)",
|
||||
"\u001b[36mFile \u001b[39m\u001b[32m~/cua-clean/.venv/lib/python3.12/site-packages/hud/gym.py:135\u001b[39m, in \u001b[36mmake\u001b[39m\u001b[34m(env_src, job, job_id, metadata)\u001b[39m\n\u001b[32m 134\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m task:\n\u001b[32m--> \u001b[39m\u001b[32m135\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m environment._setup()\n\u001b[32m 136\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m environment\n",
|
||||
"\u001b[36mFile \u001b[39m\u001b[32m~/cua-clean/.venv/lib/python3.12/site-packages/hud/env/environment.py:84\u001b[39m, in \u001b[36mEnvironment._setup\u001b[39m\u001b[34m(self, config)\u001b[39m\n\u001b[32m 83\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(\u001b[38;5;28mself\u001b[39m.client, RemoteClient):\n\u001b[32m---> \u001b[39m\u001b[32m84\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m.get_urls()\n\u001b[32m 86\u001b[39m setup_request = SetupRequest()\n",
|
||||
"\u001b[36mFile \u001b[39m\u001b[32m~/cua-clean/.venv/lib/python3.12/site-packages/hud/env/environment.py:221\u001b[39m, in \u001b[36mEnvironment.get_urls\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 216\u001b[39m \u001b[38;5;250m\u001b[39m\u001b[33;03m\"\"\"Get URLs for the environment.\u001b[39;00m\n\u001b[32m 217\u001b[39m \n\u001b[32m 218\u001b[39m \u001b[33;03mReturns:\u001b[39;00m\n\u001b[32m 219\u001b[39m \u001b[33;03m dict: Dictionary of URLs for accessing the environment\u001b[39;00m\n\u001b[32m 220\u001b[39m \u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m221\u001b[39m data, _, _ = \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m.client.invoke(FunctionConfig(function=\u001b[33m\"\u001b[39m\u001b[33mget_urls\u001b[39m\u001b[33m\"\u001b[39m, args=[]))\n\u001b[32m 223\u001b[39m \u001b[38;5;28mself\u001b[39m.url = data.get(\u001b[33m\"\u001b[39m\u001b[33murl\u001b[39m\u001b[33m\"\u001b[39m)\n",
|
||||
"\u001b[36mFile \u001b[39m\u001b[32m~/cua-clean/.venv/lib/python3.12/site-packages/hud/env/remote_client.py:184\u001b[39m, in \u001b[36mRemoteClient.invoke\u001b[39m\u001b[34m(self, config)\u001b[39m\n\u001b[32m 181\u001b[39m \u001b[38;5;250m\u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 182\u001b[39m \u001b[33;03mInvoke a function in the environment.\u001b[39;00m\n\u001b[32m 183\u001b[39m \u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m184\u001b[39m data = \u001b[38;5;28;01mawait\u001b[39;00m make_request(\n\u001b[32m 185\u001b[39m method=\u001b[33m\"\u001b[39m\u001b[33mPOST\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 186\u001b[39m url=\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00msettings.base_url\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m/v2/environments/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m.env_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m/invoke\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 187\u001b[39m json=config.model_dump(),\n\u001b[32m 188\u001b[39m api_key=settings.api_key,\n\u001b[32m 189\u001b[39m )\n\u001b[32m 191\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m data[\u001b[33m\"\u001b[39m\u001b[33mresult\u001b[39m\u001b[33m\"\u001b[39m], b64decode(data[\u001b[33m\"\u001b[39m\u001b[33mstdout\u001b[39m\u001b[33m\"\u001b[39m]), b64decode(data[\u001b[33m\"\u001b[39m\u001b[33mstderr\u001b[39m\u001b[33m\"\u001b[39m])\n",
|
||||
"\u001b[36mFile \u001b[39m\u001b[32m~/cua-clean/.venv/lib/python3.12/site-packages/hud/server/requests.py:135\u001b[39m, in \u001b[36mmake_request\u001b[39m\u001b[34m(method, url, json, api_key, max_retries, retry_delay, client)\u001b[39m\n\u001b[32m 134\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m httpx.HTTPStatusError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[32m--> \u001b[39m\u001b[32m135\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m HudRequestError.from_httpx_error(e) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 136\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m httpx.RequestError \u001b[38;5;28;01mas\u001b[39;00m e:\n",
|
||||
"\u001b[31mHudRequestError\u001b[39m: Request failed: Environment is in error state, cannot invoke functions | Status: 400 | Response Text: {\"detail\":\"Environment is in error state, cannot invoke functions\"} | Response JSON: {'detail': 'Environment is in error state, cannot invoke functions'} | Headers: {'content-length': '67', 'content-type': 'application/json', 'date': 'Fri, 08 Aug 2025 16:42:11 GMT', 'server': 'railway-edge', 'x-railway-edge': 'railway/us-east4', 'x-railway-request-id': 'cH9FJpMKQIGTcIome6l53A'}",
|
||||
"\nThe above exception was the direct cause of the following exception:\n",
|
||||
"\u001b[31mGymMakeException\u001b[39m Traceback (most recent call last)",
|
||||
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[4]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;66;03m# Create environment (takes ~2.5 minutes to start)\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m env = \u001b[38;5;28;01mawait\u001b[39;00m gym.make(test)\n\u001b[32m 3\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m\"\u001b[39m\u001b[33mEnvironment ready!\u001b[39m\u001b[33m\"\u001b[39m)\n",
|
||||
"\u001b[36mFile \u001b[39m\u001b[32m~/cua-clean/.venv/lib/python3.12/site-packages/hud/gym.py:139\u001b[39m, in \u001b[36mmake\u001b[39m\u001b[34m(env_src, job, job_id, metadata)\u001b[39m\n\u001b[32m 137\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[32m 138\u001b[39m build_data[\u001b[33m\"\u001b[39m\u001b[33mexception\u001b[39m\u001b[33m\"\u001b[39m] = \u001b[38;5;28mstr\u001b[39m(e)\n\u001b[32m--> \u001b[39m\u001b[32m139\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m GymMakeException(\u001b[33m\"\u001b[39m\u001b[33mFailed to create environment\u001b[39m\u001b[33m\"\u001b[39m, build_data) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01me\u001b[39;00m\n",
|
||||
"\u001b[31mGymMakeException\u001b[39m: Failed to create environment | Data: {'gym_name': 'OSWorld-Ubuntu', 'environment_prompt': None, 'exception': 'Request failed: Environment is in error state, cannot invoke functions | Status: 400 | Response Text: {\"detail\":\"Environment is in error state, cannot invoke functions\"} | Response JSON: {\\'detail\\': \\'Environment is in error state, cannot invoke functions\\'} | Headers: {\\'content-length\\': \\'67\\', \\'content-type\\': \\'application/json\\', \\'date\\': \\'Fri, 08 Aug 2025 16:42:11 GMT\\', \\'server\\': \\'railway-edge\\', \\'x-railway-edge\\': \\'railway/us-east4\\', \\'x-railway-request-id\\': \\'cH9FJpMKQIGTcIome6l53A\\'}'}"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Create environment (takes ~2.5 minutes to start)\n",
|
||||
"env = await gym.make(test)\n",
|
||||
"print(\"Environment ready!\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Test with Claude Model\n",
|
||||
"\n",
|
||||
"The ComputerAgent can use Claude models just like the original ClaudeAgent:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Create ComputerAgent with Claude\n",
|
||||
"claude_agent = ComputerAgent(\n",
|
||||
" model=\"anthropic/claude-3-5-sonnet-20241022\",\n",
|
||||
" environment=\"linux\", # OSWorld typically uses Linux\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print(f\"Created Claude agent: {claude_agent.name}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Initial observation\n",
|
||||
"obs, _ = await env.reset()\n",
|
||||
"print(\"Initial observation complete\")\n",
|
||||
"\n",
|
||||
"# Agent loop with Claude\n",
|
||||
"for i in range(8):\n",
|
||||
" print(f\"========= Step {i + 1} ==========\")\n",
|
||||
" \n",
|
||||
" try:\n",
|
||||
" action, done = await claude_agent.predict(obs)\n",
|
||||
" print(f\"Agent's action: {action}\")\n",
|
||||
"\n",
|
||||
" obs, reward, terminated, info = await env.step(action)\n",
|
||||
"\n",
|
||||
" if done or terminated:\n",
|
||||
" print(f\"Task completed after {i + 1} steps\")\n",
|
||||
" break\n",
|
||||
" \n",
|
||||
" except Exception as e:\n",
|
||||
" print(f\"Error in step {i + 1}: {e}\")\n",
|
||||
" break"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Test with OpenAI Model\n",
|
||||
"\n",
|
||||
"The same ComputerAgent can also use OpenAI models:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Reset environment for OpenAI test\n",
|
||||
"await env.reset()\n",
|
||||
"\n",
|
||||
"# Create ComputerAgent with OpenAI\n",
|
||||
"openai_agent = ComputerAgent(\n",
|
||||
" model=\"openai/computer-use-preview\",\n",
|
||||
" environment=\"linux\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print(f\"Created OpenAI agent: {openai_agent.name}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Initial observation\n",
|
||||
"obs, _ = await env.reset()\n",
|
||||
"print(\"Initial observation complete\")\n",
|
||||
"\n",
|
||||
"# Agent loop with OpenAI\n",
|
||||
"for i in range(8):\n",
|
||||
" print(f\"========= Step {i + 1} ==========\")\n",
|
||||
" \n",
|
||||
" try:\n",
|
||||
" action, done = await openai_agent.predict(obs)\n",
|
||||
" print(f\"Agent's action: {action}\")\n",
|
||||
"\n",
|
||||
" obs, reward, terminated, info = await env.step(action)\n",
|
||||
"\n",
|
||||
" if done or terminated:\n",
|
||||
" print(f\"Task completed after {i + 1} steps\")\n",
|
||||
" break\n",
|
||||
" \n",
|
||||
" except Exception as e:\n",
|
||||
" print(f\"Error in step {i + 1}: {e}\")\n",
|
||||
" break"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
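"## Evaluate Results\n",
"\n",
"`env.evaluate()` evaluates the current environment state. Because the environment is reset before the OpenAI run, the result below reflects the OpenAI agent's attempt only, not the earlier Claude run."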
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Evaluate environment state\n",
|
||||
"result = await env.evaluate()\n",
|
||||
"print(\"=== Final Evaluation ===\")\n",
|
||||
"pprint(result)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Clean up\n",
|
||||
"await env.close()\n",
|
||||
"print(\"Environment closed\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Comparison with Original Agents\n",
|
||||
"\n",
|
||||
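"The ComputerAgent provides the same interface as ClaudeAgent and OperatorAgent. The cell below constructs the original HUD agents and their ComputerAgent equivalents and prints each agent's name:"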
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Compare with original HUD agents\n",
|
||||
"from hud.agent import ClaudeAgent, OperatorAgent\n",
|
||||
"\n",
|
||||
"# Original agents\n",
|
||||
"original_claude = ClaudeAgent()\n",
|
||||
"original_operator = OperatorAgent(environment=\"linux\")\n",
|
||||
"\n",
|
||||
"# ComputerAgent versions\n",
|
||||
"computer_claude = ComputerAgent(model=\"anthropic/claude-3-5-sonnet-20241022\", environment=\"linux\")\n",
|
||||
"computer_openai = ComputerAgent(model=\"openai/computer-use-preview\", environment=\"linux\")\n",
|
||||
"\n",
|
||||
"print(\"Original agents:\")\n",
|
||||
"print(f\" ClaudeAgent: {original_claude.name}\")\n",
|
||||
"print(f\" OperatorAgent: {original_operator.name}\")\n",
|
||||
"print(\"\\nComputerAgent versions:\")\n",
|
||||
"print(f\" ComputerAgent (Claude): {computer_claude.name}\")\n",
|
||||
"print(f\" ComputerAgent (OpenAI): {computer_openai.name}\")\n",
|
||||
"\n",
|
||||
"print(\"\\nAll agents have the same interface and can be used interchangeably!\")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": ".venv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.11"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
Reference in New Issue
Block a user