limited tasks in notebook

This commit is contained in:
Dillon DuPont
2025-08-08 18:26:44 -04:00
parent ae128a2ae4
commit 5495529462
3 changed files with 162 additions and 107 deletions
@@ -65,17 +65,20 @@ Run all tasks in parallel using `run_job`:
```python
from agent.integrations.hud import run_job
from hud import load_taskset
import logging
# Load taskset
taskset = await load_taskset("SheetBench-V2")
taskset = await load_taskset("OSWorld-Verified")
taskset = taskset[:10] # limit to the first 10 tasks instead of the full taskset (~370 tasks)
# Run benchmark job
job = await run_job(
model="anthropic/claude-3-5-sonnet-20241022",
model="openai/computer-use-preview",
task_or_taskset=taskset,
job_name="test-computeragent-job",
# Any extra ComputerAgent kwargs:
# verbosity=logging.INFO, # Enable logging
max_concurrent_tasks=5,
# add any extra ComputerAgent kwargs:
verbosity=logging.INFO, # Enable logging
# trajectory_dir=".." # Save trajectories locally
)
@@ -21,6 +21,7 @@ from hud import load_taskset
# Load taskset
taskset = await load_taskset("OSWorld-Verified")
taskset = taskset[:10] # limit to the first 10 tasks instead of the full taskset (~370 tasks)
# Run benchmark job
job = await run_job(
+154 -103
View File
@@ -27,6 +27,16 @@
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Required environment variables:\n",
"# - HUD_API_KEY (for HUD access)\n",
@@ -40,7 +50,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 3,
"metadata": {},
"outputs": [
{
@@ -59,7 +69,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 5,
"metadata": {},
"outputs": [
{
@@ -67,7 +77,7 @@
"output_type": "stream",
"text": [
"Total tasks in OSWorld: 367\n",
"Task prompt: Make the background color of slide 2 same as the color of its title.\n"
"Task prompt: Can you make my computer bring back the last tab I shut down?\n"
]
}
],
@@ -77,7 +87,7 @@
"print(f\"Total tasks in OSWorld: {len(taskset)}\")\n",
"\n",
"# Select a test task\n",
"test = taskset[144]\n",
"test = taskset[148]\n",
"print(f\"Task prompt: {test.prompt}\")"
]
},
@@ -85,32 +95,47 @@
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total tasks in SheetBench: 50\n",
"Task prompt: Given the Input data, determine the ticker with the greatest correlation between volume and next day price change.\n",
"- in ANSWER tab put the Ticker in A1 and the correlation in B1\n",
" - use CORREL to determine correlation\n",
"- be sure to first sort the date by ticker z to a and then date ascending before calculating nextdaypricechange %\n",
"Correlation should be rounded to 2 decimal points\n"
]
}
],
"source": [
"# Load SheetBench taskset\n",
"taskset = await load_taskset(\"SheetBench-V2\")\n",
"print(f\"Total tasks in SheetBench: {len(taskset)}\")\n",
"\n",
"# Select a test task\n",
"test = taskset[0]\n",
"print(f\"Task prompt: {test.prompt}\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[ERROR] 2025-08-08 12:42:12,634 | hud.exceptions | HTTP error from HUD SDK: Request failed: Environment is in error state, cannot invoke functions | URL: https://orchestration.hud.so/hud-gym/api/v2/environments/525ea26c-096d-41bc-b968-54c62a7f1b9d/invoke | Status: 400 | Response: {\"detail\":\"Environment is in error state, cannot invoke functions\"}\n"
"[INFO] 2025-08-08 15:16:46,133 | hud.environment | View the live trace at https://app.hud.so/trace/662fd59f-5a8d-4205-9b88-32c00d0feab0\n"
]
},
{
"ename": "GymMakeException",
"evalue": "Failed to create environment | Data: {'gym_name': 'OSWorld-Ubuntu', 'environment_prompt': None, 'exception': 'Request failed: Environment is in error state, cannot invoke functions | Status: 400 | Response Text: {\"detail\":\"Environment is in error state, cannot invoke functions\"} | Response JSON: {\\'detail\\': \\'Environment is in error state, cannot invoke functions\\'} | Headers: {\\'content-length\\': \\'67\\', \\'content-type\\': \\'application/json\\', \\'date\\': \\'Fri, 08 Aug 2025 16:42:11 GMT\\', \\'server\\': \\'railway-edge\\', \\'x-railway-edge\\': \\'railway/us-east4\\', \\'x-railway-request-id\\': \\'cH9FJpMKQIGTcIome6l53A\\'}'}",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mHudRequestError\u001b[39m Traceback (most recent call last)",
"\u001b[36mFile \u001b[39m\u001b[32m~/cua-clean/.venv/lib/python3.12/site-packages/hud/gym.py:135\u001b[39m, in \u001b[36mmake\u001b[39m\u001b[34m(env_src, job, job_id, metadata)\u001b[39m\n\u001b[32m 134\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m task:\n\u001b[32m--> \u001b[39m\u001b[32m135\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m environment._setup()\n\u001b[32m 136\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m environment\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/cua-clean/.venv/lib/python3.12/site-packages/hud/env/environment.py:84\u001b[39m, in \u001b[36mEnvironment._setup\u001b[39m\u001b[34m(self, config)\u001b[39m\n\u001b[32m 83\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(\u001b[38;5;28mself\u001b[39m.client, RemoteClient):\n\u001b[32m---> \u001b[39m\u001b[32m84\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m.get_urls()\n\u001b[32m 86\u001b[39m setup_request = SetupRequest()\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/cua-clean/.venv/lib/python3.12/site-packages/hud/env/environment.py:221\u001b[39m, in \u001b[36mEnvironment.get_urls\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 216\u001b[39m \u001b[38;5;250m\u001b[39m\u001b[33;03m\"\"\"Get URLs for the environment.\u001b[39;00m\n\u001b[32m 217\u001b[39m \n\u001b[32m 218\u001b[39m \u001b[33;03mReturns:\u001b[39;00m\n\u001b[32m 219\u001b[39m \u001b[33;03m dict: Dictionary of URLs for accessing the environment\u001b[39;00m\n\u001b[32m 220\u001b[39m \u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m221\u001b[39m data, _, _ = \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m.client.invoke(FunctionConfig(function=\u001b[33m\"\u001b[39m\u001b[33mget_urls\u001b[39m\u001b[33m\"\u001b[39m, args=[]))\n\u001b[32m 223\u001b[39m \u001b[38;5;28mself\u001b[39m.url = data.get(\u001b[33m\"\u001b[39m\u001b[33murl\u001b[39m\u001b[33m\"\u001b[39m)\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/cua-clean/.venv/lib/python3.12/site-packages/hud/env/remote_client.py:184\u001b[39m, in \u001b[36mRemoteClient.invoke\u001b[39m\u001b[34m(self, config)\u001b[39m\n\u001b[32m 181\u001b[39m \u001b[38;5;250m\u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 182\u001b[39m \u001b[33;03mInvoke a function in the environment.\u001b[39;00m\n\u001b[32m 183\u001b[39m \u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m184\u001b[39m data = \u001b[38;5;28;01mawait\u001b[39;00m make_request(\n\u001b[32m 185\u001b[39m method=\u001b[33m\"\u001b[39m\u001b[33mPOST\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 186\u001b[39m url=\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00msettings.base_url\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m/v2/environments/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m.env_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m/invoke\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 187\u001b[39m json=config.model_dump(),\n\u001b[32m 188\u001b[39m api_key=settings.api_key,\n\u001b[32m 189\u001b[39m )\n\u001b[32m 191\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m data[\u001b[33m\"\u001b[39m\u001b[33mresult\u001b[39m\u001b[33m\"\u001b[39m], b64decode(data[\u001b[33m\"\u001b[39m\u001b[33mstdout\u001b[39m\u001b[33m\"\u001b[39m]), b64decode(data[\u001b[33m\"\u001b[39m\u001b[33mstderr\u001b[39m\u001b[33m\"\u001b[39m])\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/cua-clean/.venv/lib/python3.12/site-packages/hud/server/requests.py:135\u001b[39m, in \u001b[36mmake_request\u001b[39m\u001b[34m(method, url, json, api_key, max_retries, retry_delay, client)\u001b[39m\n\u001b[32m 134\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m httpx.HTTPStatusError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[32m--> \u001b[39m\u001b[32m135\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m HudRequestError.from_httpx_error(e) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 136\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m httpx.RequestError \u001b[38;5;28;01mas\u001b[39;00m e:\n",
"\u001b[31mHudRequestError\u001b[39m: Request failed: Environment is in error state, cannot invoke functions | Status: 400 | Response Text: {\"detail\":\"Environment is in error state, cannot invoke functions\"} | Response JSON: {'detail': 'Environment is in error state, cannot invoke functions'} | Headers: {'content-length': '67', 'content-type': 'application/json', 'date': 'Fri, 08 Aug 2025 16:42:11 GMT', 'server': 'railway-edge', 'x-railway-edge': 'railway/us-east4', 'x-railway-request-id': 'cH9FJpMKQIGTcIome6l53A'}",
"\nThe above exception was the direct cause of the following exception:\n",
"\u001b[31mGymMakeException\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[4]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;66;03m# Create environment (takes ~2.5 minutes to start)\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m env = \u001b[38;5;28;01mawait\u001b[39;00m gym.make(test)\n\u001b[32m 3\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m\"\u001b[39m\u001b[33mEnvironment ready!\u001b[39m\u001b[33m\"\u001b[39m)\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/cua-clean/.venv/lib/python3.12/site-packages/hud/gym.py:139\u001b[39m, in \u001b[36mmake\u001b[39m\u001b[34m(env_src, job, job_id, metadata)\u001b[39m\n\u001b[32m 137\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[32m 138\u001b[39m build_data[\u001b[33m\"\u001b[39m\u001b[33mexception\u001b[39m\u001b[33m\"\u001b[39m] = \u001b[38;5;28mstr\u001b[39m(e)\n\u001b[32m--> \u001b[39m\u001b[32m139\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m GymMakeException(\u001b[33m\"\u001b[39m\u001b[33mFailed to create environment\u001b[39m\u001b[33m\"\u001b[39m, build_data) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01me\u001b[39;00m\n",
"\u001b[31mGymMakeException\u001b[39m: Failed to create environment | Data: {'gym_name': 'OSWorld-Ubuntu', 'environment_prompt': None, 'exception': 'Request failed: Environment is in error state, cannot invoke functions | Status: 400 | Response Text: {\"detail\":\"Environment is in error state, cannot invoke functions\"} | Response JSON: {\\'detail\\': \\'Environment is in error state, cannot invoke functions\\'} | Headers: {\\'content-length\\': \\'67\\', \\'content-type\\': \\'application/json\\', \\'date\\': \\'Fri, 08 Aug 2025 16:42:11 GMT\\', \\'server\\': \\'railway-edge\\', \\'x-railway-edge\\': \\'railway/us-east4\\', \\'x-railway-request-id\\': \\'cH9FJpMKQIGTcIome6l53A\\'}'}"
"name": "stdout",
"output_type": "stream",
"text": [
"Environment ready!\n"
]
}
],
@@ -120,6 +145,45 @@
"print(\"Environment ready!\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" <div style=\"width: 960px; height: 540px; overflow: hidden;\">\n",
" <div style=\"transform: scale(0.5); transform-origin: top left;\">\n",
" <iframe src=\"https://live.anchorbrowser.io?sessionId=7486a5f7-d7eb-458e-b1b1-a11852e0e217\" width=\"1920\" height=\"1080\" style=\"border: 1px solid #ddd;\">\n",
" </iframe>\n",
" </div>\n",
" </div>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'\\n <div style=\"width: 960px; height: 540px; overflow: hidden;\">\\n <div style=\"transform: scale(0.5); transform-origin: top left;\">\\n <iframe src=\"https://live.anchorbrowser.io?sessionId=7486a5f7-d7eb-458e-b1b1-a11852e0e217\" width=\"1920\" height=\"1080\" style=\"border: 1px solid #ddd;\">\\n </iframe>\\n </div>\\n </div>\\n '"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"await env.stream() # vnc"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -131,14 +195,24 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 7,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Created Claude agent: computeragent-claude-3-5-sonnet-20241022\n"
]
}
],
"source": [
"import logging\n",
"# Create ComputerAgent with Claude\n",
"claude_agent = ComputerAgent(\n",
" model=\"anthropic/claude-3-5-sonnet-20241022\",\n",
" environment=\"linux\", # OSWorld typically uses Linux\n",
" verbosity=logging.INFO,\n",
")\n",
"\n",
"print(f\"Created Claude agent: {claude_agent.name}\")"
@@ -146,9 +220,33 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 8,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Initial observation complete\n",
"========= Step 1 ==========\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2025-08-08 15:17:04,030 - agent.ComputerAgent - INFO - LLM processing started with 1 messages\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Agent's action: [ResponseAction(type='response', reasoning='I\\'ll help you complete this task step by step, but I notice that I don\\'t have any input data or access to Excel through the available functions. The only function I have access to is the \"computer\" function which allows for basic desktop interaction.\\n\\nTo properly assist you, I would need:\\n1. The actual input data you want to analyze\\n2. Access to Excel or another spreadsheet tool to perform the calculations\\n\\nCould you please provide the input data and confirm if there\\'s a specific way to access Excel or the data file on this system?\\n\\nOnce provided, I can help calculate correlations between volume and next day price changes, sort the data as specified, and format the results according to your requirements.', logs={'conversation_length': 2}, text='I\\'ll help you complete this task step by step, but I notice that I don\\'t have any input data or access to Excel through the available functions. The only function I have access to is the \"computer\" function which allows for basic desktop interaction.\\n\\nTo properly assist you, I would need:\\n1. The actual input data you want to analyze\\n2. Access to Excel or another spreadsheet tool to perform the calculations\\n\\nCould you please provide the input data and confirm if there\\'s a specific way to access Excel or the data file on this system?\\n\\nOnce provided, I can help calculate correlations between volume and next day price changes, sort the data as specified, and format the results according to your requirements.')]\n",
"Task completed after 1 steps\n"
]
}
],
"source": [
"# Initial observation\n",
"obs, _ = await env.reset()\n",
@@ -173,62 +271,6 @@
" break"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test with OpenAI Model\n",
"\n",
"The same ComputerAgent can also use OpenAI models:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Reset environment for OpenAI test\n",
"await env.reset()\n",
"\n",
"# Create ComputerAgent with OpenAI\n",
"openai_agent = ComputerAgent(\n",
" model=\"openai/computer-use-preview\",\n",
" environment=\"linux\",\n",
")\n",
"\n",
"print(f\"Created OpenAI agent: {openai_agent.name}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Initial observation\n",
"obs, _ = await env.reset()\n",
"print(\"Initial observation complete\")\n",
"\n",
"# Agent loop with OpenAI\n",
"for i in range(8):\n",
" print(f\"========= Step {i + 1} ==========\")\n",
" \n",
" try:\n",
" action, done = await openai_agent.predict(obs)\n",
" print(f\"Agent's action: {action}\")\n",
"\n",
" obs, reward, terminated, info = await env.step(action)\n",
"\n",
" if done or terminated:\n",
" print(f\"Task completed after {i + 1} steps\")\n",
" break\n",
" \n",
" except Exception as e:\n",
" print(f\"Error in step {i + 1}: {e}\")\n",
" break"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -250,9 +292,17 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 9,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Environment closed\n"
]
}
],
"source": [
"# Clean up\n",
"await env.close()\n",
@@ -263,9 +313,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Comparison with Original Agents\n",
"\n",
"The ComputerAgent provides the same interface as ClaudeAgent and OperatorAgent:"
"## Run OSWorld-Verified in parallel"
]
},
{
@@ -274,25 +322,28 @@
"metadata": {},
"outputs": [],
"source": [
"# Compare with original HUD agents\n",
"from hud.agent import ClaudeAgent, OperatorAgent\n",
"from agent.integrations.hud import run_job\n",
"from hud import load_taskset\n",
"import logging\n",
"\n",
"# Original agents\n",
"original_claude = ClaudeAgent()\n",
"original_operator = OperatorAgent(environment=\"linux\")\n",
"# Load taskset\n",
"taskset = await load_taskset(\"OSWorld-Verified\")\n",
"taskset = taskset[:10] # limit to 10 tasks instead of all 370\n",
"\n",
"# ComputerAgent versions\n",
"computer_claude = ComputerAgent(model=\"anthropic/claude-3-5-sonnet-20241022\", environment=\"linux\")\n",
"computer_openai = ComputerAgent(model=\"openai/computer-use-preview\", environment=\"linux\")\n",
"# Run benchmark job\n",
"job = await run_job(\n",
" model=\"openai/computer-use-preview\",\n",
" task_or_taskset=taskset,\n",
" job_name=\"test-computeragent-job\",\n",
" max_concurrent_tasks=5,\n",
" # add any extra ComputerAgent kwargs:\n",
" verbosity=logging.INFO, # Enable logging\n",
" # trajectory_dir=\"..\" # Save trajectories locally\n",
")\n",
"\n",
"print(\"Original agents:\")\n",
"print(f\" ClaudeAgent: {original_claude.name}\")\n",
"print(f\" OperatorAgent: {original_operator.name}\")\n",
"print(\"\\nComputerAgent versions:\")\n",
"print(f\" ComputerAgent (Claude): {computer_claude.name}\")\n",
"print(f\" ComputerAgent (OpenAI): {computer_openai.name}\")\n",
"\n",
"print(\"\\nAll agents have the same interface and can be used interchangeably!\")"
"# Get results OR view them at app.hud.so\n",
"print(await job.get_analytics())\n",
"print(f\"View results at: https://app.hud.so/jobs/{job.id}\")"
]
}
],