mirror of
https://github.com/trycua/computer.git
synced 2026-01-04 04:19:57 -06:00
added run_job
This commit is contained in:
@@ -63,26 +63,23 @@ print(f"Success: {result.get('success', False)}")
|
||||
Run all tasks in parallel using `run_job`:
|
||||
|
||||
```python
|
||||
from hud import run_job
|
||||
from agent.integrations.hud import ComputerAgent
|
||||
import logging
|
||||
from agent.integrations.hud import run_job
|
||||
from hud import load_taskset
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
# Load taskset
|
||||
taskset = await load_taskset("SheetBench-V2")
|
||||
|
||||
# Load full taskset
|
||||
taskset = await load_taskset("OSWorld-Verified")
|
||||
|
||||
# Run parallel job
|
||||
# Run benchmark job
|
||||
job = await run_job(
|
||||
ComputerAgent,
|
||||
taskset,
|
||||
"osworld-computeragent",
|
||||
max_steps_per_task=100,
|
||||
max_concurrent_tasks=20,
|
||||
auto_reply_question=True,
|
||||
agent_kwargs={"model": "anthropic/claude-3-5-sonnet-20241022"}
|
||||
model="anthropic/claude-3-5-sonnet-20241022",
|
||||
task_or_taskset=taskset,
|
||||
job_name="test-computeragent-job",
|
||||
# Any extra ComputerAgent kwargs:
|
||||
# verbosity=logging.INFO, # Enable logging
|
||||
# trajectory_dir=".." # Save trajectories locally
|
||||
)
|
||||
|
||||
# Get analytics
|
||||
analytics = await job.get_analytics()
|
||||
```
|
||||
# Get results OR view them at app.hud.so
|
||||
print(await job.get_analytics())
|
||||
print(f"View results at: https://app.hud.so/jobs/{job.id}")
|
||||
```
|
||||
@@ -16,28 +16,28 @@ pip install "cua-agent[hud]"
|
||||
## Usage
|
||||
|
||||
```python
|
||||
from agent.integrations.hud import ComputerAgent
|
||||
from agent.integrations.hud import run_job
|
||||
from hud import load_taskset
|
||||
|
||||
# Create agent with any ComputerAgent model
|
||||
agent = ComputerAgent(
|
||||
model="anthropic/claude-3-5-sonnet-20241022", # or any model string
|
||||
environment="linux"
|
||||
# Load taskset
|
||||
taskset = await load_taskset("OSWorld-Verified")
|
||||
|
||||
# Run benchmark job
|
||||
job = await run_job(
|
||||
model="anthropic/claude-3-5-sonnet-20241022",
|
||||
task_or_taskset=taskset,
|
||||
job_name="test-computeragent-job",
|
||||
# Any extra ComputerAgent kwargs:
|
||||
# verbosity=logging.INFO, # Enable logging
|
||||
# trajectory_dir=".." # Save trajectories locally
|
||||
)
|
||||
|
||||
# Use exactly like other HUD agents
|
||||
action, done = await agent.predict(observation)
|
||||
# Get results OR view them at app.hud.so
|
||||
print(await job.get_analytics())
|
||||
print(f"View results at: https://app.hud.so/jobs/{job.id}")
|
||||
```
|
||||
|
||||
## Environment Variables
|
||||
|
||||
Set these environment variables:
|
||||
|
||||
- `HUD_API_KEY` - Your HUD API key
|
||||
- `ANTHROPIC_API_KEY` - For Claude models
|
||||
- `OPENAI_API_KEY` - For OpenAI models
|
||||
|
||||
## Example Benchmarks
|
||||
|
||||
1. [OSWorld-Verified](/agent-sdk/benchmarks/osworld-verified) - Benchmark on OSWorld tasks with parallel execution
|
||||
**Available Benchmarks:**
|
||||
1. [OSWorld-Verified](/agent-sdk/benchmarks/osworld-verified) - Benchmark on OSWorld tasks
|
||||
|
||||
See the [HUD docs](https://docs.hud.so/environment-creation) for more eval environments.
|
||||
@@ -1,7 +1,39 @@
|
||||
"""HUD integration for ComputerAgent."""
|
||||
|
||||
from typing import Any, Optional, Dict
|
||||
from hud import run_job as hud_run_job
|
||||
|
||||
from .agent import ComputerAgent
|
||||
from .adapter import ComputerAgentAdapter
|
||||
from .computer_handler import HUDComputerHandler
|
||||
|
||||
__all__ = ["ComputerAgent", "ComputerAgentAdapter", "HUDComputerHandler"]
|
||||
|
||||
async def run_job(
    model: str,
    task_or_taskset: Any,
    job_name: str,
    job_kwargs: Optional[Dict[str, Any]] = None,
    **agent_kwargs: Any
) -> Any:
    """
    Launch a HUD job whose agent is a ComputerAgent driven by ``model``.

    This is a thin convenience wrapper around ``hud.run_job`` that fixes the
    agent class to ``ComputerAgent`` and folds ``model`` into the agent's
    constructor keyword arguments.

    Args:
        model: Model string for ComputerAgent
            (e.g., "anthropic/claude-3-5-sonnet-20241022").
        task_or_taskset: Task or TaskSet to run; forwarded unchanged.
        job_name: Name for the job; forwarded unchanged.
        job_kwargs: Optional mapping of extra keyword arguments expanded into
            the underlying ``hud.run_job`` call. ``None`` (the default) or an
            empty mapping adds nothing.
        **agent_kwargs: Additional kwargs to pass to ComputerAgent, merged
            with ``model`` into a single ``agent_kwargs`` dict.

    Returns:
        Whatever the awaited ``hud.run_job`` call returns (the HUD job).
    """
    # ``model`` is a named parameter, so it can never also appear in
    # ``agent_kwargs``; the merge below therefore never overwrites it.
    combined_agent_kwargs = {"model": model, **agent_kwargs}

    # Treat a missing/empty ``job_kwargs`` as "no extra job options".
    extra_job_options = job_kwargs or {}

    job = await hud_run_job(
        agent_cls=ComputerAgent,
        agent_kwargs=combined_agent_kwargs,
        task_or_taskset=task_or_taskset,
        job_name=job_name,
        **extra_job_options
    )
    return job
|
||||
|
||||
|
||||
__all__ = ["ComputerAgent", "ComputerAgentAdapter", "HUDComputerHandler", "run_job"]
|
||||
@@ -16,6 +16,19 @@ from .computer_handler import HUDComputerHandler
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Default system prompt for computer-use tasks. ComputerAgent assigns this
# constant to ``self.base_system_prompt`` during initialisation.
# Guideline 7 asks the model to say "Task completed"; fetch_response decides
# whether a task is done by looking for the case-insensitive substring
# "task completed" in the assistant's output text, so keep the two in sync.
# Surrounding whitespace is removed with ``.strip()``.
BASE_SYSTEM_PROMPT = """
|
||||
You are an autonomous computer-using agent. Follow these guidelines:
|
||||
|
||||
1. Be decisive and complete tasks without asking for confirmation unless absolutely necessary.
|
||||
2. If you need user confirmation for safety-critical actions, use the formal safety check mechanism.
|
||||
3. Do NOT ask questions like "Should I proceed?" or "Would you like me to continue?" - just proceed with the task.
|
||||
4. When you find what you're looking for (e.g., a file to upload), proceed with the action directly.
|
||||
5. Only stop when the task is fully complete or if you encounter an error that prevents completion.
|
||||
6. Trust that the user wants you to complete the entire task they've requested.
|
||||
7. You must say "Task completed" when the task is complete.
|
||||
|
||||
Remember: You have been given permission to complete the requested task autonomously.
|
||||
""".strip()
|
||||
|
||||
class ComputerAgent(Agent[BaseComputerAgent, dict[str, Any]]):
|
||||
"""
|
||||
@@ -88,25 +101,16 @@ class ComputerAgent(Agent[BaseComputerAgent, dict[str, Any]]):
|
||||
self.initial_prompt: Optional[str] = None
|
||||
|
||||
# System prompt for computer use tasks
|
||||
self.base_system_prompt = """
|
||||
You are an autonomous computer-using agent. Follow these guidelines:
|
||||
|
||||
1. Be decisive and complete tasks without asking for confirmation unless absolutely necessary.
|
||||
2. If you need user confirmation for safety-critical actions, use the formal safety check mechanism.
|
||||
3. Do NOT ask questions like "Should I proceed?" or "Would you like me to continue?" - just proceed with the task.
|
||||
4. When you find what you're looking for (e.g., a file to upload), proceed with the action directly.
|
||||
5. Only stop when the task is fully complete or if you encounter an error that prevents completion.
|
||||
6. Trust that the user wants you to complete the entire task they've requested.
|
||||
|
||||
Remember: You have been given permission to complete the requested task autonomously.
|
||||
"""
|
||||
self.base_system_prompt = BASE_SYSTEM_PROMPT
|
||||
|
||||
async def fetch_response(self, observation: Observation) -> tuple[list[dict[str, Any]], bool]:
|
||||
"""
|
||||
Fetch a response from ComputerAgent based on the observation.
|
||||
|
||||
Args:
|
||||
observation: The preprocessed observation
|
||||
observation: The preprocessed observation, attributes:
|
||||
screenshot: Base64 encoded PNG string of the screen
|
||||
text: Text observation, if available
|
||||
|
||||
Returns:
|
||||
tuple[list[dict[str, Any]], bool, list[LogType] | None]: A tuple containing the list of raw actions,
|
||||
@@ -140,9 +144,39 @@ class ComputerAgent(Agent[BaseComputerAgent, dict[str, Any]]):
|
||||
|
||||
self.conversation_history.append({"role": "user", "content": message})
|
||||
else:
|
||||
# Subsequent interactions - add context about the current state
|
||||
message = "Continue with the task based on the current screen state."
|
||||
self.conversation_history.append({"role": "user", "content": message})
|
||||
# Subsequent interactions - check if last action was computer_call
|
||||
# If so, add computer_call_output with screenshot instead of user message
|
||||
last_computer_calls = []
|
||||
for msg in reversed(self.conversation_history):
|
||||
if msg.get("type") == "computer_call" and msg.get("status") == "completed":
|
||||
call_id = msg.get("call_id")
|
||||
if call_id:
|
||||
# Check if this call_id already has a computer_call_output
|
||||
has_output = any(
|
||||
m.get("type") == "computer_call_output" and m.get("call_id") == call_id
|
||||
for m in self.conversation_history
|
||||
)
|
||||
if not has_output:
|
||||
last_computer_calls.append(call_id)
|
||||
elif msg.get("role") == "user":
|
||||
# Stop at the last user message
|
||||
break
|
||||
|
||||
if last_computer_calls and observation.screenshot:
|
||||
# Add computer_call_output for each unresponded computer_call
|
||||
for call_id in reversed(last_computer_calls): # Maintain order
|
||||
self.conversation_history.append({
|
||||
"type": "computer_call_output",
|
||||
"call_id": call_id,
|
||||
"output": {
|
||||
"type": "input_image",
|
||||
"image_url": f"data:image/png;base64,{observation.screenshot}"
|
||||
}
|
||||
})
|
||||
else:
|
||||
# No computer_call found, add regular user message
|
||||
message = "Continue with the task based on the current screen state."
|
||||
self.conversation_history.append({"role": "user", "content": message})
|
||||
|
||||
# Run ComputerAgent
|
||||
try:
|
||||
@@ -150,7 +184,8 @@ class ComputerAgent(Agent[BaseComputerAgent, dict[str, Any]]):
|
||||
async for result in self.computer_agent.run(self.conversation_history, stream=False):
|
||||
# Update conversation history with the output
|
||||
self.conversation_history += result["output"]
|
||||
|
||||
break
|
||||
|
||||
# Check if we captured any actions
|
||||
if captured_actions:
|
||||
# Extract reasoning from the conversation history
|
||||
@@ -171,44 +206,34 @@ class ComputerAgent(Agent[BaseComputerAgent, dict[str, Any]]):
|
||||
action["reasoning"] = reasoning
|
||||
action["logs"] = {"conversation_length": len(self.conversation_history)}
|
||||
|
||||
# Check if task is done by looking for assistant message indicating completion
|
||||
done = False
|
||||
for msg in reversed(self.conversation_history):
|
||||
if msg.get("type") == "message" and msg.get("role") == "assistant":
|
||||
content = msg.get("content", [])
|
||||
for c in content:
|
||||
if c.get("type") == "output_text" and "task completed" in c.get("text", "").lower():
|
||||
done = True
|
||||
break
|
||||
break
|
||||
return captured_actions, False
|
||||
|
||||
return captured_actions, done
|
||||
else:
|
||||
# No actions captured, task is likely complete
|
||||
response_text = "Task completed."
|
||||
for msg in reversed(self.conversation_history):
|
||||
if msg.get("type") == "message" and msg.get("role") == "assistant":
|
||||
content = msg.get("content", [])
|
||||
for c in content:
|
||||
if c.get("type") == "output_text":
|
||||
response_text = c.get("text", response_text)
|
||||
break
|
||||
break
|
||||
|
||||
response_action = {
|
||||
"type": "response",
|
||||
"text": response_text,
|
||||
"reasoning": response_text,
|
||||
"logs": {"conversation_length": len(self.conversation_history)}
|
||||
}
|
||||
|
||||
# Check if this indicates task completion or failure
|
||||
# Check if the last message is "Task completed"
|
||||
response_text = ""
|
||||
for msg in reversed(self.conversation_history):
|
||||
if msg.get("type") == "message" and msg.get("role") == "assistant":
|
||||
content = msg.get("content", [])
|
||||
for c in content:
|
||||
if c.get("type") == "output_text":
|
||||
response_text = c.get("text", response_text)
|
||||
break
|
||||
break
|
||||
|
||||
done = "task completed" in response_text.lower()
|
||||
|
||||
response_action = {
|
||||
"type": "response",
|
||||
"text": response_text,
|
||||
"reasoning": response_text,
|
||||
"logs": {"conversation_length": len(self.conversation_history)}
|
||||
}
|
||||
|
||||
# Check if this indicates task completion or failure
|
||||
if "task is infeasible" in response_text.lower():
|
||||
response_action = {"type": "custom", "action": "FAIL"}
|
||||
done = True
|
||||
if "task is infeasible" in response_text.lower():
|
||||
response_action = {"type": "custom", "action": "FAIL"}
|
||||
|
||||
return [response_action], done
|
||||
|
||||
|
||||
return [response_action], done
|
||||
except Exception as e:
|
||||
logger.error(f"Error running ComputerAgent: {e}")
|
||||
# Return an error response
|
||||
|
||||
Reference in New Issue
Block a user