added run_job

This commit is contained in:
Dillon DuPont
2025-08-08 18:15:56 -04:00
parent f819c578b7
commit 8f15c21df9
4 changed files with 144 additions and 90 deletions

View File

@@ -63,26 +63,23 @@ print(f"Success: {result.get('success', False)}")
Run all tasks in parallel using `run_job`:
```python
from hud import run_job
from agent.integrations.hud import ComputerAgent
import logging
from agent.integrations.hud import run_job
from hud import load_taskset
logging.basicConfig(level=logging.INFO)
# Load taskset
taskset = await load_taskset("SheetBench-V2")
# Load full taskset
taskset = await load_taskset("OSWorld-Verified")
# Run parallel job
# Run benchmark job
job = await run_job(
ComputerAgent,
taskset,
"osworld-computeragent",
max_steps_per_task=100,
max_concurrent_tasks=20,
auto_reply_question=True,
agent_kwargs={"model": "anthropic/claude-3-5-sonnet-20241022"}
model="anthropic/claude-3-5-sonnet-20241022",
task_or_taskset=taskset,
job_name="test-computeragent-job",
# Any extra ComputerAgent kwargs:
# verbosity=logging.INFO, # Enable logging
# trajectory_dir=".." # Save trajectories locally
)
# Get analytics
analytics = await job.get_analytics()
```
# Get results OR view them at app.hud.so
print(await job.get_analytics())
print(f"View results at: https://app.hud.so/jobs/{job.id}")
```

View File

@@ -16,28 +16,28 @@ pip install "cua-agent[hud]"
## Usage
```python
from agent.integrations.hud import ComputerAgent
from agent.integrations.hud import run_job
from hud import load_taskset
# Create agent with any ComputerAgent model
agent = ComputerAgent(
model="anthropic/claude-3-5-sonnet-20241022", # or any model string
environment="linux"
# Load taskset
taskset = await load_taskset("OSWorld-Verified")
# Run benchmark job
job = await run_job(
model="anthropic/claude-3-5-sonnet-20241022",
task_or_taskset=taskset,
job_name="test-computeragent-job",
# Any extra ComputerAgent kwargs:
# verbosity=logging.INFO, # Enable logging
# trajectory_dir=".." # Save trajectories locally
)
# Use exactly like other HUD agents
action, done = await agent.predict(observation)
# Get results OR view them at app.hud.so
print(await job.get_analytics())
print(f"View results at: https://app.hud.so/jobs/{job.id}")
```
## Environment Variables
Set these environment variables:
- `HUD_API_KEY` - Your HUD API key
- `ANTHROPIC_API_KEY` - For Claude models
- `OPENAI_API_KEY` - For OpenAI models
## Example Benchmarks
1. [OSWorld-Verified](/agent-sdk/benchmarks/osworld-verified) - Benchmark on OSWorld tasks with parallel execution
**Available Benchmarks:**
1. [OSWorld-Verified](/agent-sdk/benchmarks/osworld-verified) - Benchmark on OSWorld tasks
See the [HUD docs](https://docs.hud.so/environment-creation) for more eval environments.

View File

@@ -1,7 +1,39 @@
"""HUD integration for ComputerAgent."""
from typing import Any, Optional, Dict
from hud import run_job as hud_run_job
from .agent import ComputerAgent
from .adapter import ComputerAgentAdapter
from .computer_handler import HUDComputerHandler
__all__ = ["ComputerAgent", "ComputerAgentAdapter", "HUDComputerHandler"]
async def run_job(
    model: str,
    task_or_taskset: Any,
    job_name: str,
    job_kwargs: Optional[Dict[str, Any]] = None,
    **agent_kwargs: Any
) -> Any:
    """
    Launch a HUD job whose agent class is ComputerAgent, configured with ``model``.

    Args:
        model: Model string handed to ComputerAgent (e.g., "anthropic/claude-3-5-sonnet-20241022")
        task_or_taskset: Task or TaskSet forwarded unchanged to HUD's ``run_job``
        job_name: Name for the job
        job_kwargs: Optional extra keyword arguments forwarded to HUD's ``run_job``;
            ``None`` (the default) or an empty dict forwards nothing extra
        **agent_kwargs: Additional kwargs passed to ComputerAgent alongside ``model``

    Returns:
        The value returned by HUD's ``run_job`` (the job instance)
    """
    # "model" cannot appear in agent_kwargs (it is a named parameter), so this
    # merge never overwrites it.
    computer_agent_options = {"model": model, **agent_kwargs}

    # Treat a missing job_kwargs as "no extra job options".
    extra_job_options = job_kwargs or {}

    return await hud_run_job(
        agent_cls=ComputerAgent,
        agent_kwargs=computer_agent_options,
        task_or_taskset=task_or_taskset,
        job_name=job_name,
        **extra_job_options,
    )
__all__ = ["ComputerAgent", "ComputerAgentAdapter", "HUDComputerHandler", "run_job"]

View File

@@ -16,6 +16,19 @@ from .computer_handler import HUDComputerHandler
logger = logging.getLogger(__name__)
# Default system prompt for computer-use tasks; ComputerAgent stores it as
# `self.base_system_prompt`. Guideline 7 matters to the control flow:
# `fetch_response` decides a task is done by searching the assistant's output
# text for the case-insensitive substring "task completed", so changing that
# wording here requires changing the check as well.
BASE_SYSTEM_PROMPT = """
You are an autonomous computer-using agent. Follow these guidelines:
1. Be decisive and complete tasks without asking for confirmation unless absolutely necessary.
2. If you need user confirmation for safety-critical actions, use the formal safety check mechanism.
3. Do NOT ask questions like "Should I proceed?" or "Would you like me to continue?" - just proceed with the task.
4. When you find what you're looking for (e.g., a file to upload), proceed with the action directly.
5. Only stop when the task is fully complete or if you encounter an error that prevents completion.
6. Trust that the user wants you to complete the entire task they've requested.
7. You must say "Task completed" when the task is complete.
Remember: You have been given permission to complete the requested task autonomously.
""".strip()
class ComputerAgent(Agent[BaseComputerAgent, dict[str, Any]]):
"""
@@ -88,25 +101,16 @@ class ComputerAgent(Agent[BaseComputerAgent, dict[str, Any]]):
self.initial_prompt: Optional[str] = None
# System prompt for computer use tasks
self.base_system_prompt = """
You are an autonomous computer-using agent. Follow these guidelines:
1. Be decisive and complete tasks without asking for confirmation unless absolutely necessary.
2. If you need user confirmation for safety-critical actions, use the formal safety check mechanism.
3. Do NOT ask questions like "Should I proceed?" or "Would you like me to continue?" - just proceed with the task.
4. When you find what you're looking for (e.g., a file to upload), proceed with the action directly.
5. Only stop when the task is fully complete or if you encounter an error that prevents completion.
6. Trust that the user wants you to complete the entire task they've requested.
Remember: You have been given permission to complete the requested task autonomously.
"""
self.base_system_prompt = BASE_SYSTEM_PROMPT
async def fetch_response(self, observation: Observation) -> tuple[list[dict[str, Any]], bool]:
"""
Fetch a response from ComputerAgent based on the observation.
Args:
observation: The preprocessed observation
observation: The preprocessed observation, attributes:
screenshot: Base64 encoded PNG string of the screen
text: Text observation, if available
Returns:
tuple[list[dict[str, Any]], bool, list[LogType] | None]: A tuple containing the list of raw actions,
@@ -140,9 +144,39 @@ class ComputerAgent(Agent[BaseComputerAgent, dict[str, Any]]):
self.conversation_history.append({"role": "user", "content": message})
else:
# Subsequent interactions - add context about the current state
message = "Continue with the task based on the current screen state."
self.conversation_history.append({"role": "user", "content": message})
# Subsequent interactions - check if last action was computer_call
# If so, add computer_call_output with screenshot instead of user message
last_computer_calls = []
for msg in reversed(self.conversation_history):
if msg.get("type") == "computer_call" and msg.get("status") == "completed":
call_id = msg.get("call_id")
if call_id:
# Check if this call_id already has a computer_call_output
has_output = any(
m.get("type") == "computer_call_output" and m.get("call_id") == call_id
for m in self.conversation_history
)
if not has_output:
last_computer_calls.append(call_id)
elif msg.get("role") == "user":
# Stop at the last user message
break
if last_computer_calls and observation.screenshot:
# Add computer_call_output for each unresponded computer_call
for call_id in reversed(last_computer_calls): # Maintain order
self.conversation_history.append({
"type": "computer_call_output",
"call_id": call_id,
"output": {
"type": "input_image",
"image_url": f"data:image/png;base64,{observation.screenshot}"
}
})
else:
# No computer_call found, add regular user message
message = "Continue with the task based on the current screen state."
self.conversation_history.append({"role": "user", "content": message})
# Run ComputerAgent
try:
@@ -150,7 +184,8 @@ class ComputerAgent(Agent[BaseComputerAgent, dict[str, Any]]):
async for result in self.computer_agent.run(self.conversation_history, stream=False):
# Update conversation history with the output
self.conversation_history += result["output"]
break
# Check if we captured any actions
if captured_actions:
# Extract reasoning from the conversation history
@@ -171,44 +206,34 @@ class ComputerAgent(Agent[BaseComputerAgent, dict[str, Any]]):
action["reasoning"] = reasoning
action["logs"] = {"conversation_length": len(self.conversation_history)}
# Check if task is done by looking for assistant message indicating completion
done = False
for msg in reversed(self.conversation_history):
if msg.get("type") == "message" and msg.get("role") == "assistant":
content = msg.get("content", [])
for c in content:
if c.get("type") == "output_text" and "task completed" in c.get("text", "").lower():
done = True
break
break
return captured_actions, False
return captured_actions, done
else:
# No actions captured, task is likely complete
response_text = "Task completed."
for msg in reversed(self.conversation_history):
if msg.get("type") == "message" and msg.get("role") == "assistant":
content = msg.get("content", [])
for c in content:
if c.get("type") == "output_text":
response_text = c.get("text", response_text)
break
break
response_action = {
"type": "response",
"text": response_text,
"reasoning": response_text,
"logs": {"conversation_length": len(self.conversation_history)}
}
# Check if this indicates task completion or failure
# Check if the last message is "Task completed"
response_text = ""
for msg in reversed(self.conversation_history):
if msg.get("type") == "message" and msg.get("role") == "assistant":
content = msg.get("content", [])
for c in content:
if c.get("type") == "output_text":
response_text = c.get("text", response_text)
break
break
done = "task completed" in response_text.lower()
response_action = {
"type": "response",
"text": response_text,
"reasoning": response_text,
"logs": {"conversation_length": len(self.conversation_history)}
}
# Check if this indicates task completion or failure
if "task is infeasible" in response_text.lower():
response_action = {"type": "custom", "action": "FAIL"}
done = True
if "task is infeasible" in response_text.lower():
response_action = {"type": "custom", "action": "FAIL"}
return [response_action], done
return [response_action], done
except Exception as e:
logger.error(f"Error running ComputerAgent: {e}")
# Return an error response